|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0075566750629723,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005037783375314861,
      "grad_norm": 0.3518223464488983,
      "learning_rate": 2e-05,
      "loss": 2.0757,
      "step": 1
    },
    {
      "epoch": 0.010075566750629723,
      "grad_norm": 0.4766307771205902,
      "learning_rate": 4e-05,
      "loss": 2.3191,
      "step": 2
    },
    {
      "epoch": 0.015113350125944584,
      "grad_norm": 0.36295855045318604,
      "learning_rate": 6e-05,
      "loss": 1.9787,
      "step": 3
    },
    {
      "epoch": 0.020151133501259445,
      "grad_norm": 0.307400107383728,
      "learning_rate": 8e-05,
      "loss": 1.8764,
      "step": 4
    },
    {
      "epoch": 0.02518891687657431,
      "grad_norm": 0.3723292350769043,
      "learning_rate": 0.0001,
      "loss": 2.0876,
      "step": 5
    },
    {
      "epoch": 0.030226700251889168,
      "grad_norm": 0.37307557463645935,
      "learning_rate": 0.00012,
      "loss": 1.8658,
      "step": 6
    },
    {
      "epoch": 0.03526448362720403,
      "grad_norm": 0.435854434967041,
      "learning_rate": 0.00014,
      "loss": 2.0115,
      "step": 7
    },
    {
      "epoch": 0.04030226700251889,
      "grad_norm": 0.6088196039199829,
      "learning_rate": 0.00016,
      "loss": 2.184,
      "step": 8
    },
    {
      "epoch": 0.04534005037783375,
      "grad_norm": 0.32109102606773376,
      "learning_rate": 0.00018,
      "loss": 1.7607,
      "step": 9
    },
    {
      "epoch": 0.05037783375314862,
      "grad_norm": 0.36787062883377075,
      "learning_rate": 0.0002,
      "loss": 1.5996,
      "step": 10
    },
    {
      "epoch": 0.055415617128463476,
      "grad_norm": 0.3893103003501892,
      "learning_rate": 0.00019894736842105264,
      "loss": 1.6031,
      "step": 11
    },
    {
      "epoch": 0.060453400503778336,
      "grad_norm": 0.5323490500450134,
      "learning_rate": 0.00019789473684210526,
      "loss": 1.5834,
      "step": 12
    },
    {
      "epoch": 0.0654911838790932,
      "grad_norm": 0.773453950881958,
      "learning_rate": 0.0001968421052631579,
      "loss": 1.574,
      "step": 13
    },
    {
      "epoch": 0.07052896725440806,
      "grad_norm": 0.5621123313903809,
      "learning_rate": 0.00019578947368421054,
      "loss": 1.2272,
      "step": 14
    },
    {
      "epoch": 0.07556675062972293,
      "grad_norm": 0.7536848187446594,
      "learning_rate": 0.00019473684210526317,
      "loss": 1.2776,
      "step": 15
    },
    {
      "epoch": 0.08060453400503778,
      "grad_norm": 0.7452588677406311,
      "learning_rate": 0.0001936842105263158,
      "loss": 1.1667,
      "step": 16
    },
    {
      "epoch": 0.08564231738035265,
      "grad_norm": 0.7769160270690918,
      "learning_rate": 0.00019263157894736842,
      "loss": 1.0503,
      "step": 17
    },
    {
      "epoch": 0.0906801007556675,
      "grad_norm": 0.781049907207489,
      "learning_rate": 0.00019157894736842104,
      "loss": 1.0204,
      "step": 18
    },
    {
      "epoch": 0.09571788413098237,
      "grad_norm": 0.753268837928772,
      "learning_rate": 0.0001905263157894737,
      "loss": 1.1032,
      "step": 19
    },
    {
      "epoch": 0.10075566750629723,
      "grad_norm": 0.8446284532546997,
      "learning_rate": 0.00018947368421052632,
      "loss": 0.7637,
      "step": 20
    },
    {
      "epoch": 0.10579345088161209,
      "grad_norm": 3.604651927947998,
      "learning_rate": 0.00018842105263157898,
      "loss": 0.9087,
      "step": 21
    },
    {
      "epoch": 0.11083123425692695,
      "grad_norm": 0.9568591713905334,
      "learning_rate": 0.0001873684210526316,
      "loss": 0.851,
      "step": 22
    },
    {
      "epoch": 0.11586901763224182,
      "grad_norm": 1.1816169023513794,
      "learning_rate": 0.00018631578947368423,
      "loss": 0.5488,
      "step": 23
    },
    {
      "epoch": 0.12090680100755667,
      "grad_norm": 0.9833171963691711,
      "learning_rate": 0.00018526315789473685,
      "loss": 0.7534,
      "step": 24
    },
    {
      "epoch": 0.12594458438287154,
      "grad_norm": 1.3493332862854004,
      "learning_rate": 0.00018421052631578948,
      "loss": 0.593,
      "step": 25
    },
    {
      "epoch": 0.1309823677581864,
      "grad_norm": 0.7266058921813965,
      "learning_rate": 0.0001831578947368421,
      "loss": 0.574,
      "step": 26
    },
    {
      "epoch": 0.13602015113350127,
      "grad_norm": 0.7107939720153809,
      "learning_rate": 0.00018210526315789476,
      "loss": 0.2413,
      "step": 27
    },
    {
      "epoch": 0.14105793450881612,
      "grad_norm": 0.7920565605163574,
      "learning_rate": 0.00018105263157894739,
      "loss": 0.4325,
      "step": 28
    },
    {
      "epoch": 0.14609571788413098,
      "grad_norm": 0.9443809390068054,
      "learning_rate": 0.00018,
      "loss": 0.4232,
      "step": 29
    },
    {
      "epoch": 0.15113350125944586,
      "grad_norm": 0.5944586992263794,
      "learning_rate": 0.00017894736842105264,
      "loss": 0.3594,
      "step": 30
    },
    {
      "epoch": 0.1561712846347607,
      "grad_norm": 0.5987979173660278,
      "learning_rate": 0.00017789473684210526,
      "loss": 0.3781,
      "step": 31
    },
    {
      "epoch": 0.16120906801007556,
      "grad_norm": 0.6795870661735535,
      "learning_rate": 0.0001768421052631579,
      "loss": 0.5617,
      "step": 32
    },
    {
      "epoch": 0.16624685138539042,
      "grad_norm": 0.8870770931243896,
      "learning_rate": 0.00017578947368421052,
      "loss": 0.5301,
      "step": 33
    },
    {
      "epoch": 0.1712846347607053,
      "grad_norm": 0.8014320731163025,
      "learning_rate": 0.00017473684210526317,
      "loss": 0.4682,
      "step": 34
    },
    {
      "epoch": 0.17632241813602015,
      "grad_norm": 0.6445375680923462,
      "learning_rate": 0.0001736842105263158,
      "loss": 0.3388,
      "step": 35
    },
    {
      "epoch": 0.181360201511335,
      "grad_norm": 0.6574522852897644,
      "learning_rate": 0.00017263157894736842,
      "loss": 0.5185,
      "step": 36
    },
    {
      "epoch": 0.18639798488664988,
      "grad_norm": 0.5030068159103394,
      "learning_rate": 0.00017157894736842107,
      "loss": 0.2316,
      "step": 37
    },
    {
      "epoch": 0.19143576826196473,
      "grad_norm": 0.773057758808136,
      "learning_rate": 0.0001705263157894737,
      "loss": 0.3112,
      "step": 38
    },
    {
      "epoch": 0.1964735516372796,
      "grad_norm": 0.6356672048568726,
      "learning_rate": 0.00016947368421052633,
      "loss": 0.268,
      "step": 39
    },
    {
      "epoch": 0.20151133501259447,
      "grad_norm": 0.5076678991317749,
      "learning_rate": 0.00016842105263157895,
      "loss": 0.495,
      "step": 40
    },
    {
      "epoch": 0.20654911838790932,
      "grad_norm": 0.6496306657791138,
      "learning_rate": 0.0001673684210526316,
      "loss": 0.1888,
      "step": 41
    },
    {
      "epoch": 0.21158690176322417,
      "grad_norm": 1.2327401638031006,
      "learning_rate": 0.00016631578947368423,
      "loss": 0.3528,
      "step": 42
    },
    {
      "epoch": 0.21662468513853905,
      "grad_norm": 0.6327301263809204,
      "learning_rate": 0.00016526315789473686,
      "loss": 0.4691,
      "step": 43
    },
    {
      "epoch": 0.2216624685138539,
      "grad_norm": 0.864808976650238,
      "learning_rate": 0.00016421052631578948,
      "loss": 0.6131,
      "step": 44
    },
    {
      "epoch": 0.22670025188916876,
      "grad_norm": 0.6248288154602051,
      "learning_rate": 0.0001631578947368421,
      "loss": 0.3539,
      "step": 45
    },
    {
      "epoch": 0.23173803526448364,
      "grad_norm": 0.3140046298503876,
      "learning_rate": 0.00016210526315789473,
      "loss": 0.2844,
      "step": 46
    },
    {
      "epoch": 0.2367758186397985,
      "grad_norm": 0.7335410714149475,
      "learning_rate": 0.00016105263157894736,
      "loss": 0.63,
      "step": 47
    },
    {
      "epoch": 0.24181360201511334,
      "grad_norm": 0.5439632534980774,
      "learning_rate": 0.00016,
      "loss": 0.1825,
      "step": 48
    },
    {
      "epoch": 0.24685138539042822,
      "grad_norm": 0.6237998008728027,
      "learning_rate": 0.00015894736842105264,
      "loss": 0.3802,
      "step": 49
    },
    {
      "epoch": 0.2518891687657431,
      "grad_norm": 0.36704376339912415,
      "learning_rate": 0.00015789473684210527,
      "loss": 0.0994,
      "step": 50
    },
    {
      "epoch": 0.25692695214105793,
      "grad_norm": 0.4603656530380249,
      "learning_rate": 0.0001568421052631579,
      "loss": 0.2473,
      "step": 51
    },
    {
      "epoch": 0.2619647355163728,
      "grad_norm": 0.3848342299461365,
      "learning_rate": 0.00015578947368421052,
      "loss": 0.1398,
      "step": 52
    },
    {
      "epoch": 0.26700251889168763,
      "grad_norm": 0.6655743718147278,
      "learning_rate": 0.00015473684210526317,
      "loss": 0.4164,
      "step": 53
    },
    {
      "epoch": 0.27204030226700254,
      "grad_norm": 0.7751195430755615,
      "learning_rate": 0.0001536842105263158,
      "loss": 0.3564,
      "step": 54
    },
    {
      "epoch": 0.2770780856423174,
      "grad_norm": 0.6190752387046814,
      "learning_rate": 0.00015263157894736845,
      "loss": 0.3459,
      "step": 55
    },
    {
      "epoch": 0.28211586901763225,
      "grad_norm": 0.6057133674621582,
      "learning_rate": 0.00015157894736842108,
      "loss": 0.356,
      "step": 56
    },
    {
      "epoch": 0.2871536523929471,
      "grad_norm": 1.0536139011383057,
      "learning_rate": 0.0001505263157894737,
      "loss": 0.2324,
      "step": 57
    },
    {
      "epoch": 0.29219143576826195,
      "grad_norm": 0.7755753397941589,
      "learning_rate": 0.00014947368421052633,
      "loss": 0.4299,
      "step": 58
    },
    {
      "epoch": 0.2972292191435768,
      "grad_norm": 0.6408481597900391,
      "learning_rate": 0.00014842105263157895,
      "loss": 0.2747,
      "step": 59
    },
    {
      "epoch": 0.3022670025188917,
      "grad_norm": 0.6968539357185364,
      "learning_rate": 0.00014736842105263158,
      "loss": 0.1145,
      "step": 60
    },
    {
      "epoch": 0.30730478589420657,
      "grad_norm": 0.49692606925964355,
      "learning_rate": 0.00014631578947368423,
      "loss": 0.2619,
      "step": 61
    },
    {
      "epoch": 0.3123425692695214,
      "grad_norm": 0.3295464515686035,
      "learning_rate": 0.00014526315789473686,
      "loss": 0.0819,
      "step": 62
    },
    {
      "epoch": 0.31738035264483627,
      "grad_norm": 0.37510472536087036,
      "learning_rate": 0.00014421052631578948,
      "loss": 0.1576,
      "step": 63
    },
    {
      "epoch": 0.3224181360201511,
      "grad_norm": 0.5883765816688538,
      "learning_rate": 0.0001431578947368421,
      "loss": 0.1572,
      "step": 64
    },
    {
      "epoch": 0.327455919395466,
      "grad_norm": 0.3986837565898895,
      "learning_rate": 0.00014210526315789474,
      "loss": 0.3157,
      "step": 65
    },
    {
      "epoch": 0.33249370277078083,
      "grad_norm": 1.0316702127456665,
      "learning_rate": 0.00014105263157894736,
      "loss": 0.2099,
      "step": 66
    },
    {
      "epoch": 0.33753148614609574,
      "grad_norm": 0.6176291704177856,
      "learning_rate": 0.00014,
      "loss": 0.1361,
      "step": 67
    },
    {
      "epoch": 0.3425692695214106,
      "grad_norm": 0.37712040543556213,
      "learning_rate": 0.00013894736842105264,
      "loss": 0.2508,
      "step": 68
    },
    {
      "epoch": 0.34760705289672544,
      "grad_norm": 0.3802729845046997,
      "learning_rate": 0.00013789473684210527,
      "loss": 0.1122,
      "step": 69
    },
    {
      "epoch": 0.3526448362720403,
      "grad_norm": 0.481282502412796,
      "learning_rate": 0.0001368421052631579,
      "loss": 0.2649,
      "step": 70
    },
    {
      "epoch": 0.35768261964735515,
      "grad_norm": 0.5663158893585205,
      "learning_rate": 0.00013578947368421055,
      "loss": 0.1305,
      "step": 71
    },
    {
      "epoch": 0.36272040302267,
      "grad_norm": 0.5812940001487732,
      "learning_rate": 0.00013473684210526317,
      "loss": 0.3085,
      "step": 72
    },
    {
      "epoch": 0.3677581863979849,
      "grad_norm": 0.4372013211250305,
      "learning_rate": 0.0001336842105263158,
      "loss": 0.1936,
      "step": 73
    },
    {
      "epoch": 0.37279596977329976,
      "grad_norm": 0.20600414276123047,
      "learning_rate": 0.00013263157894736842,
      "loss": 0.0684,
      "step": 74
    },
    {
      "epoch": 0.3778337531486146,
      "grad_norm": 0.38698187470436096,
      "learning_rate": 0.00013157894736842108,
      "loss": 0.1676,
      "step": 75
    },
    {
      "epoch": 0.38287153652392947,
      "grad_norm": 0.38309377431869507,
      "learning_rate": 0.0001305263157894737,
      "loss": 0.1936,
      "step": 76
    },
    {
      "epoch": 0.3879093198992443,
      "grad_norm": 0.3905870318412781,
      "learning_rate": 0.00012947368421052633,
      "loss": 0.1431,
      "step": 77
    },
    {
      "epoch": 0.3929471032745592,
      "grad_norm": 0.43961116671562195,
      "learning_rate": 0.00012842105263157895,
      "loss": 0.2374,
      "step": 78
    },
    {
      "epoch": 0.3979848866498741,
      "grad_norm": 1.39609694480896,
      "learning_rate": 0.00012736842105263158,
      "loss": 0.2106,
      "step": 79
    },
    {
      "epoch": 0.40302267002518893,
      "grad_norm": 0.3844812214374542,
      "learning_rate": 0.0001263157894736842,
      "loss": 0.0977,
      "step": 80
    },
    {
      "epoch": 0.4080604534005038,
      "grad_norm": 0.23563021421432495,
      "learning_rate": 0.00012526315789473683,
      "loss": 0.0693,
      "step": 81
    },
    {
      "epoch": 0.41309823677581864,
      "grad_norm": 0.3840598464012146,
      "learning_rate": 0.00012421052631578949,
      "loss": 0.17,
      "step": 82
    },
    {
      "epoch": 0.4181360201511335,
      "grad_norm": 0.4412382245063782,
      "learning_rate": 0.0001231578947368421,
      "loss": 0.148,
      "step": 83
    },
    {
      "epoch": 0.42317380352644834,
      "grad_norm": 0.27377572655677795,
      "learning_rate": 0.00012210526315789474,
      "loss": 0.073,
      "step": 84
    },
    {
      "epoch": 0.4282115869017632,
      "grad_norm": 0.2707730829715729,
      "learning_rate": 0.00012105263157894738,
      "loss": 0.1278,
      "step": 85
    },
    {
      "epoch": 0.4332493702770781,
      "grad_norm": 0.32431530952453613,
      "learning_rate": 0.00012,
      "loss": 0.1274,
      "step": 86
    },
    {
      "epoch": 0.43828715365239296,
      "grad_norm": 0.2874161899089813,
      "learning_rate": 0.00011894736842105263,
      "loss": 0.1262,
      "step": 87
    },
    {
      "epoch": 0.4433249370277078,
      "grad_norm": 0.3821753263473511,
      "learning_rate": 0.00011789473684210525,
      "loss": 0.1399,
      "step": 88
    },
    {
      "epoch": 0.44836272040302266,
      "grad_norm": 0.3060382306575775,
      "learning_rate": 0.00011684210526315791,
      "loss": 0.1567,
      "step": 89
    },
    {
      "epoch": 0.4534005037783375,
      "grad_norm": 0.26529163122177124,
      "learning_rate": 0.00011578947368421053,
      "loss": 0.0986,
      "step": 90
    },
    {
      "epoch": 0.45843828715365237,
      "grad_norm": 0.44558581709861755,
      "learning_rate": 0.00011473684210526316,
      "loss": 0.2247,
      "step": 91
    },
    {
      "epoch": 0.4634760705289673,
      "grad_norm": 0.33645686507225037,
      "learning_rate": 0.0001136842105263158,
      "loss": 0.181,
      "step": 92
    },
    {
      "epoch": 0.46851385390428213,
      "grad_norm": 0.14126701653003693,
      "learning_rate": 0.00011263157894736843,
      "loss": 0.0624,
      "step": 93
    },
    {
      "epoch": 0.473551637279597,
      "grad_norm": 0.31580325961112976,
      "learning_rate": 0.00011157894736842105,
      "loss": 0.1199,
      "step": 94
    },
    {
      "epoch": 0.47858942065491183,
      "grad_norm": 0.40185511112213135,
      "learning_rate": 0.0001105263157894737,
      "loss": 0.1006,
      "step": 95
    },
    {
      "epoch": 0.4836272040302267,
      "grad_norm": 0.3965336084365845,
      "learning_rate": 0.00010947368421052633,
      "loss": 0.1521,
      "step": 96
    },
    {
      "epoch": 0.48866498740554154,
      "grad_norm": 0.5877782702445984,
      "learning_rate": 0.00010842105263157896,
      "loss": 0.2581,
      "step": 97
    },
    {
      "epoch": 0.49370277078085645,
      "grad_norm": 0.5336182117462158,
      "learning_rate": 0.00010736842105263158,
      "loss": 0.2348,
      "step": 98
    },
    {
      "epoch": 0.4987405541561713,
      "grad_norm": 0.19688697159290314,
      "learning_rate": 0.00010631578947368421,
      "loss": 0.0784,
      "step": 99
    },
    {
      "epoch": 0.5037783375314862,
      "grad_norm": 0.2962247133255005,
      "learning_rate": 0.00010526315789473685,
      "loss": 0.1655,
      "step": 100
    },
    {
      "epoch": 0.5088161209068011,
      "grad_norm": 0.1825219988822937,
      "learning_rate": 0.00010421052631578947,
      "loss": 0.0822,
      "step": 101
    },
    {
      "epoch": 0.5138539042821159,
      "grad_norm": 0.6996546387672424,
      "learning_rate": 0.00010315789473684211,
      "loss": 0.2294,
      "step": 102
    },
    {
      "epoch": 0.5188916876574308,
      "grad_norm": 0.39225417375564575,
      "learning_rate": 0.00010210526315789475,
      "loss": 0.1107,
      "step": 103
    },
    {
      "epoch": 0.5239294710327456,
      "grad_norm": 0.19891326129436493,
      "learning_rate": 0.00010105263157894738,
      "loss": 0.0728,
      "step": 104
    },
    {
      "epoch": 0.5289672544080605,
      "grad_norm": 0.6324350833892822,
      "learning_rate": 0.0001,
      "loss": 0.3342,
      "step": 105
    },
    {
      "epoch": 0.5340050377833753,
      "grad_norm": 0.4613906741142273,
      "learning_rate": 9.894736842105263e-05,
      "loss": 0.2443,
      "step": 106
    },
    {
      "epoch": 0.5390428211586902,
      "grad_norm": 0.18734556436538696,
      "learning_rate": 9.789473684210527e-05,
      "loss": 0.0727,
      "step": 107
    },
    {
      "epoch": 0.5440806045340051,
      "grad_norm": 0.339933305978775,
      "learning_rate": 9.68421052631579e-05,
      "loss": 0.2469,
      "step": 108
    },
    {
      "epoch": 0.5491183879093199,
      "grad_norm": 0.48570263385772705,
      "learning_rate": 9.578947368421052e-05,
      "loss": 0.1371,
      "step": 109
    },
    {
      "epoch": 0.5541561712846348,
      "grad_norm": 0.30081725120544434,
      "learning_rate": 9.473684210526316e-05,
      "loss": 0.0915,
      "step": 110
    },
    {
      "epoch": 0.5591939546599496,
      "grad_norm": 0.26690733432769775,
      "learning_rate": 9.36842105263158e-05,
      "loss": 0.1134,
      "step": 111
    },
    {
      "epoch": 0.5642317380352645,
      "grad_norm": 0.4141266345977783,
      "learning_rate": 9.263157894736843e-05,
      "loss": 0.1727,
      "step": 112
    },
    {
      "epoch": 0.5692695214105793,
      "grad_norm": 0.33365899324417114,
      "learning_rate": 9.157894736842105e-05,
      "loss": 0.1533,
      "step": 113
    },
    {
      "epoch": 0.5743073047858942,
      "grad_norm": 0.36651313304901123,
      "learning_rate": 9.052631578947369e-05,
      "loss": 0.1598,
      "step": 114
    },
    {
      "epoch": 0.5793450881612091,
      "grad_norm": 0.39123257994651794,
      "learning_rate": 8.947368421052632e-05,
      "loss": 0.1852,
      "step": 115
    },
    {
      "epoch": 0.5843828715365239,
      "grad_norm": 1.0382803678512573,
      "learning_rate": 8.842105263157894e-05,
      "loss": 0.3205,
      "step": 116
    },
    {
      "epoch": 0.5894206549118388,
      "grad_norm": 0.13224704563617706,
      "learning_rate": 8.736842105263158e-05,
      "loss": 0.0522,
      "step": 117
    },
    {
      "epoch": 0.5944584382871536,
      "grad_norm": 0.561414897441864,
      "learning_rate": 8.631578947368421e-05,
      "loss": 0.1471,
      "step": 118
    },
    {
      "epoch": 0.5994962216624685,
      "grad_norm": 0.40099918842315674,
      "learning_rate": 8.526315789473685e-05,
      "loss": 0.194,
      "step": 119
    },
    {
      "epoch": 0.6045340050377834,
      "grad_norm": 0.4563421607017517,
      "learning_rate": 8.421052631578948e-05,
      "loss": 0.1486,
      "step": 120
    },
    {
      "epoch": 0.6095717884130982,
      "grad_norm": 0.48477980494499207,
      "learning_rate": 8.315789473684212e-05,
      "loss": 0.1673,
      "step": 121
    },
    {
      "epoch": 0.6146095717884131,
      "grad_norm": 0.22391052544116974,
      "learning_rate": 8.210526315789474e-05,
      "loss": 0.0837,
      "step": 122
    },
    {
      "epoch": 0.6196473551637279,
      "grad_norm": 0.3828778564929962,
      "learning_rate": 8.105263157894737e-05,
      "loss": 0.1297,
      "step": 123
    },
    {
      "epoch": 0.6246851385390428,
      "grad_norm": 0.3207460045814514,
      "learning_rate": 8e-05,
      "loss": 0.1185,
      "step": 124
    },
    {
      "epoch": 0.6297229219143576,
      "grad_norm": 0.37645190954208374,
      "learning_rate": 7.894736842105263e-05,
      "loss": 0.1658,
      "step": 125
    },
    {
      "epoch": 0.6347607052896725,
      "grad_norm": 0.3294924199581146,
      "learning_rate": 7.789473684210526e-05,
      "loss": 0.1122,
      "step": 126
    },
    {
      "epoch": 0.6397984886649875,
      "grad_norm": 0.3875572979450226,
      "learning_rate": 7.68421052631579e-05,
      "loss": 0.2184,
      "step": 127
    },
    {
      "epoch": 0.6448362720403022,
      "grad_norm": 0.4404910206794739,
      "learning_rate": 7.578947368421054e-05,
      "loss": 0.1977,
      "step": 128
    },
    {
      "epoch": 0.6498740554156172,
      "grad_norm": 0.10779596865177155,
      "learning_rate": 7.473684210526316e-05,
      "loss": 0.0367,
      "step": 129
    },
    {
      "epoch": 0.654911838790932,
      "grad_norm": 0.16290880739688873,
      "learning_rate": 7.368421052631579e-05,
      "loss": 0.0449,
      "step": 130
    },
    {
      "epoch": 0.6599496221662469,
      "grad_norm": 0.314554899930954,
      "learning_rate": 7.263157894736843e-05,
      "loss": 0.0759,
      "step": 131
    },
    {
      "epoch": 0.6649874055415617,
      "grad_norm": 0.3773807883262634,
      "learning_rate": 7.157894736842105e-05,
      "loss": 0.1595,
      "step": 132
    },
    {
      "epoch": 0.6700251889168766,
      "grad_norm": 0.3229176998138428,
      "learning_rate": 7.052631578947368e-05,
      "loss": 0.152,
      "step": 133
    },
    {
      "epoch": 0.6750629722921915,
      "grad_norm": 0.41587066650390625,
      "learning_rate": 6.947368421052632e-05,
      "loss": 0.0599,
      "step": 134
    },
    {
      "epoch": 0.6801007556675063,
      "grad_norm": 0.2982085943222046,
      "learning_rate": 6.842105263157895e-05,
      "loss": 0.104,
      "step": 135
    },
    {
      "epoch": 0.6851385390428212,
      "grad_norm": 0.5094094276428223,
      "learning_rate": 6.736842105263159e-05,
      "loss": 0.1886,
      "step": 136
    },
    {
      "epoch": 0.690176322418136,
      "grad_norm": 0.3865503668785095,
      "learning_rate": 6.631578947368421e-05,
      "loss": 0.337,
      "step": 137
    },
    {
      "epoch": 0.6952141057934509,
      "grad_norm": 0.4261918365955353,
      "learning_rate": 6.526315789473685e-05,
      "loss": 0.1312,
      "step": 138
    },
    {
      "epoch": 0.7002518891687658,
      "grad_norm": 0.31689754128456116,
      "learning_rate": 6.421052631578948e-05,
      "loss": 0.1656,
      "step": 139
    },
    {
      "epoch": 0.7052896725440806,
      "grad_norm": 0.46111634373664856,
      "learning_rate": 6.31578947368421e-05,
      "loss": 0.1724,
      "step": 140
    },
    {
      "epoch": 0.7103274559193955,
      "grad_norm": 0.20221607387065887,
      "learning_rate": 6.210526315789474e-05,
      "loss": 0.0835,
      "step": 141
    },
    {
      "epoch": 0.7153652392947103,
      "grad_norm": 0.3247107267379761,
      "learning_rate": 6.105263157894737e-05,
      "loss": 0.1468,
      "step": 142
    },
    {
      "epoch": 0.7204030226700252,
      "grad_norm": 0.2768368124961853,
      "learning_rate": 6e-05,
      "loss": 0.1175,
      "step": 143
    },
    {
      "epoch": 0.72544080604534,
      "grad_norm": 0.27166804671287537,
      "learning_rate": 5.894736842105263e-05,
      "loss": 0.0795,
      "step": 144
    },
    {
      "epoch": 0.7304785894206549,
      "grad_norm": 0.10148835927248001,
      "learning_rate": 5.789473684210527e-05,
      "loss": 0.0391,
      "step": 145
    },
    {
      "epoch": 0.7355163727959698,
      "grad_norm": 0.2717987596988678,
      "learning_rate": 5.68421052631579e-05,
      "loss": 0.0882,
      "step": 146
    },
    {
      "epoch": 0.7405541561712846,
      "grad_norm": 0.18068474531173706,
      "learning_rate": 5.5789473684210526e-05,
      "loss": 0.0565,
      "step": 147
    },
    {
      "epoch": 0.7455919395465995,
      "grad_norm": 0.4956297278404236,
      "learning_rate": 5.4736842105263165e-05,
      "loss": 0.1371,
      "step": 148
    },
    {
      "epoch": 0.7506297229219143,
      "grad_norm": 0.5892757177352905,
      "learning_rate": 5.368421052631579e-05,
      "loss": 0.1155,
      "step": 149
    },
    {
      "epoch": 0.7556675062972292,
      "grad_norm": 0.23781464993953705,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 0.1411,
      "step": 150
    },
    {
      "epoch": 0.760705289672544,
      "grad_norm": 0.26974135637283325,
      "learning_rate": 5.157894736842106e-05,
      "loss": 0.0933,
      "step": 151
    },
    {
      "epoch": 0.7657430730478589,
      "grad_norm": 0.11715666949748993,
      "learning_rate": 5.052631578947369e-05,
      "loss": 0.0466,
      "step": 152
    },
    {
      "epoch": 0.7707808564231738,
      "grad_norm": 0.21726714074611664,
      "learning_rate": 4.9473684210526315e-05,
      "loss": 0.0699,
      "step": 153
    },
    {
      "epoch": 0.7758186397984886,
      "grad_norm": 0.2556859254837036,
      "learning_rate": 4.842105263157895e-05,
      "loss": 0.0981,
      "step": 154
    },
    {
      "epoch": 0.7808564231738035,
      "grad_norm": 0.12423796951770782,
      "learning_rate": 4.736842105263158e-05,
      "loss": 0.0455,
      "step": 155
    },
    {
      "epoch": 0.7858942065491183,
      "grad_norm": 0.20723406970500946,
      "learning_rate": 4.6315789473684214e-05,
      "loss": 0.088,
      "step": 156
    },
    {
      "epoch": 0.7909319899244333,
      "grad_norm": 0.18069583177566528,
      "learning_rate": 4.5263157894736846e-05,
      "loss": 0.0684,
      "step": 157
    },
    {
      "epoch": 0.7959697732997482,
      "grad_norm": 0.19325508177280426,
      "learning_rate": 4.421052631578947e-05,
      "loss": 0.0833,
      "step": 158
    },
    {
      "epoch": 0.801007556675063,
      "grad_norm": 0.10966863483190536,
      "learning_rate": 4.3157894736842105e-05,
      "loss": 0.0445,
      "step": 159
    },
    {
      "epoch": 0.8060453400503779,
      "grad_norm": 0.16270394623279572,
      "learning_rate": 4.210526315789474e-05,
      "loss": 0.0656,
      "step": 160
    },
    {
      "epoch": 0.8110831234256927,
      "grad_norm": 0.13497555255889893,
      "learning_rate": 4.105263157894737e-05,
      "loss": 0.0471,
      "step": 161
    },
    {
      "epoch": 0.8161209068010076,
      "grad_norm": 0.4956112504005432,
      "learning_rate": 4e-05,
      "loss": 0.1575,
      "step": 162
    },
    {
      "epoch": 0.8211586901763224,
      "grad_norm": 0.20514702796936035,
      "learning_rate": 3.894736842105263e-05,
      "loss": 0.0584,
      "step": 163
    },
    {
      "epoch": 0.8261964735516373,
      "grad_norm": 0.25432294607162476,
      "learning_rate": 3.789473684210527e-05,
      "loss": 0.1155,
      "step": 164
    },
    {
      "epoch": 0.8312342569269522,
      "grad_norm": 0.1398286372423172,
      "learning_rate": 3.6842105263157895e-05,
      "loss": 0.0557,
      "step": 165
    },
    {
      "epoch": 0.836272040302267,
      "grad_norm": 0.41116705536842346,
      "learning_rate": 3.578947368421053e-05,
      "loss": 0.1305,
      "step": 166
    },
    {
      "epoch": 0.8413098236775819,
      "grad_norm": 0.2796281576156616,
      "learning_rate": 3.473684210526316e-05,
      "loss": 0.0721,
      "step": 167
    },
    {
      "epoch": 0.8463476070528967,
      "grad_norm": 0.10607501864433289,
      "learning_rate": 3.368421052631579e-05,
      "loss": 0.0425,
      "step": 168
    },
    {
      "epoch": 0.8513853904282116,
      "grad_norm": 0.462009996175766,
      "learning_rate": 3.2631578947368426e-05,
      "loss": 0.0876,
      "step": 169
    },
    {
      "epoch": 0.8564231738035264,
      "grad_norm": 0.3923139274120331,
      "learning_rate": 3.157894736842105e-05,
      "loss": 0.0865,
      "step": 170
    },
    {
      "epoch": 0.8614609571788413,
      "grad_norm": 0.23710176348686218,
      "learning_rate": 3.0526315789473684e-05,
      "loss": 0.1687,
      "step": 171
    },
    {
      "epoch": 0.8664987405541562,
      "grad_norm": 0.5242993235588074,
      "learning_rate": 2.9473684210526314e-05,
      "loss": 0.1478,
      "step": 172
    },
    {
      "epoch": 0.871536523929471,
      "grad_norm": 0.12361127138137817,
      "learning_rate": 2.842105263157895e-05,
      "loss": 0.0555,
      "step": 173
    },
    {
      "epoch": 0.8765743073047859,
      "grad_norm": 0.31850466132164,
      "learning_rate": 2.7368421052631583e-05,
      "loss": 0.1228,
      "step": 174
    },
    {
      "epoch": 0.8816120906801007,
      "grad_norm": 0.23594436049461365,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 0.0663,
      "step": 175
    },
    {
      "epoch": 0.8866498740554156,
      "grad_norm": 0.3258284330368042,
      "learning_rate": 2.5263157894736845e-05,
      "loss": 0.2063,
      "step": 176
    },
    {
      "epoch": 0.8916876574307305,
      "grad_norm": 0.3880285918712616,
      "learning_rate": 2.4210526315789474e-05,
      "loss": 0.1116,
      "step": 177
    },
    {
      "epoch": 0.8967254408060453,
      "grad_norm": 0.165984645485878,
      "learning_rate": 2.3157894736842107e-05,
      "loss": 0.0751,
      "step": 178
    },
    {
      "epoch": 0.9017632241813602,
      "grad_norm": 0.47083160281181335,
      "learning_rate": 2.2105263157894736e-05,
      "loss": 0.1682,
      "step": 179
    },
    {
      "epoch": 0.906801007556675,
      "grad_norm": 0.2563510239124298,
      "learning_rate": 2.105263157894737e-05,
      "loss": 0.0924,
      "step": 180
    },
    {
      "epoch": 0.9118387909319899,
      "grad_norm": 0.2977278232574463,
      "learning_rate": 2e-05,
      "loss": 0.107,
      "step": 181
    },
    {
      "epoch": 0.9168765743073047,
      "grad_norm": 0.16103056073188782,
      "learning_rate": 1.8947368421052634e-05,
      "loss": 0.0648,
      "step": 182
    },
    {
      "epoch": 0.9219143576826196,
      "grad_norm": 0.14750486612319946,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 0.0613,
      "step": 183
    },
    {
      "epoch": 0.9269521410579346,
      "grad_norm": 0.4347628951072693,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 0.1277,
      "step": 184
    },
    {
      "epoch": 0.9319899244332494,
      "grad_norm": 0.2826248109340668,
      "learning_rate": 1.5789473684210526e-05,
      "loss": 0.1377,
      "step": 185
    },
    {
      "epoch": 0.9370277078085643,
      "grad_norm": 0.29777997732162476,
      "learning_rate": 1.4736842105263157e-05,
      "loss": 0.0928,
      "step": 186
    },
    {
      "epoch": 0.9420654911838791,
      "grad_norm": 0.26641038060188293,
      "learning_rate": 1.3684210526315791e-05,
      "loss": 0.1496,
      "step": 187
    },
    {
      "epoch": 0.947103274559194,
      "grad_norm": 0.34339556097984314,
      "learning_rate": 1.2631578947368422e-05,
      "loss": 0.2262,
      "step": 188
    },
    {
      "epoch": 0.9521410579345088,
      "grad_norm": 0.13338394463062286,
      "learning_rate": 1.1578947368421053e-05,
      "loss": 0.0446,
      "step": 189
    },
    {
      "epoch": 0.9571788413098237,
      "grad_norm": 0.3551732301712036,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 0.0587,
      "step": 190
    },
    {
      "epoch": 0.9622166246851386,
      "grad_norm": 0.20613446831703186,
      "learning_rate": 9.473684210526317e-06,
      "loss": 0.1182,
      "step": 191
    },
    {
      "epoch": 0.9672544080604534,
      "grad_norm": 0.15159928798675537,
      "learning_rate": 8.421052631578948e-06,
      "loss": 0.0549,
      "step": 192
    },
    {
      "epoch": 0.9722921914357683,
      "grad_norm": 0.17634347081184387,
      "learning_rate": 7.3684210526315784e-06,
      "loss": 0.0599,
      "step": 193
    },
    {
      "epoch": 0.9773299748110831,
      "grad_norm": 0.40340152382850647,
      "learning_rate": 6.315789473684211e-06,
      "loss": 0.1586,
      "step": 194
    },
    {
      "epoch": 0.982367758186398,
      "grad_norm": 0.18197712302207947,
      "learning_rate": 5.263157894736842e-06,
      "loss": 0.0772,
      "step": 195
    },
    {
      "epoch": 0.9874055415617129,
      "grad_norm": 0.19643299281597137,
      "learning_rate": 4.210526315789474e-06,
      "loss": 0.1048,
      "step": 196
    },
    {
      "epoch": 0.9924433249370277,
      "grad_norm": 0.3259267508983612,
      "learning_rate": 3.1578947368421056e-06,
      "loss": 0.1996,
      "step": 197
    },
    {
      "epoch": 0.9974811083123426,
      "grad_norm": 0.2846713066101074,
      "learning_rate": 2.105263157894737e-06,
      "loss": 0.1113,
      "step": 198
    },
    {
      "epoch": 1.0025188916876575,
      "grad_norm": 0.06972451508045197,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 0.0277,
      "step": 199
    },
    {
      "epoch": 1.0075566750629723,
      "grad_norm": 0.11068534851074219,
      "learning_rate": 0.0,
      "loss": 0.03,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.65920283789312e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}