|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 373, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.512355255212158, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.3837, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.601068071501158, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.4305, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.2300010534534525, |
|
"learning_rate": 5e-05, |
|
"loss": 1.266, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.1453814538418174, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.1698, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.5014459760080596, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.1328, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.8330435913332193, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1005, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4286351070611591, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 0.978, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.310573576099708, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.9391, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.1949914966282076, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.9192, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.2833854369614637, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.8458, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.3466532516140282, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.9498, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.0265129854654136, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8738, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9754563625922208, |
|
"learning_rate": 0.0001999962133723217, |
|
"loss": 0.8873, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9666096955411021, |
|
"learning_rate": 0.00019998485377605772, |
|
"loss": 0.8805, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.0373117093470032, |
|
"learning_rate": 0.00019996592207149934, |
|
"loss": 0.8797, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9925813153709726, |
|
"learning_rate": 0.00019993941969239282, |
|
"loss": 0.8048, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.986495951671587, |
|
"learning_rate": 0.0001999053486458311, |
|
"loss": 0.7769, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4142116862108192, |
|
"learning_rate": 0.00019986371151210145, |
|
"loss": 0.7946, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.0439568009598768, |
|
"learning_rate": 0.00019981451144449042, |
|
"loss": 0.8041, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 8.131551306091136, |
|
"learning_rate": 0.0001997577521690447, |
|
"loss": 0.8122, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8360699607020335, |
|
"learning_rate": 0.00019969343798428914, |
|
"loss": 0.8185, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8466183671401063, |
|
"learning_rate": 0.00019962157376090124, |
|
"loss": 0.7552, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.9001204614115436, |
|
"learning_rate": 0.00019954216494134217, |
|
"loss": 0.7959, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.843763156462474, |
|
"learning_rate": 0.00019945521753944451, |
|
"loss": 0.8043, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7862883144042041, |
|
"learning_rate": 0.0001993607381399571, |
|
"loss": 0.792, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8743527199399872, |
|
"learning_rate": 0.00019925873389804613, |
|
"loss": 0.7685, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8981661887195656, |
|
"learning_rate": 0.00019914921253875328, |
|
"loss": 0.7858, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.938979054115086, |
|
"learning_rate": 0.00019903218235641076, |
|
"loss": 0.7859, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8854235003056622, |
|
"learning_rate": 0.00019890765221401314, |
|
"loss": 0.7487, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8115643107641081, |
|
"learning_rate": 0.00019877563154254612, |
|
"loss": 0.7344, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9184079760749247, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 0.8187, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8142820954054314, |
|
"learning_rate": 0.0001984891591719738, |
|
"loss": 0.7795, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8203580213935835, |
|
"learning_rate": 0.00019833472916815263, |
|
"loss": 0.8192, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7571847385129802, |
|
"learning_rate": 0.00019817285202418733, |
|
"loss": 0.7864, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8085847745812597, |
|
"learning_rate": 0.00019800353999944732, |
|
"loss": 0.7653, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7450297134757211, |
|
"learning_rate": 0.0001978268059163646, |
|
"loss": 0.7016, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7648175358121383, |
|
"learning_rate": 0.0001976426631594626, |
|
"loss": 0.7651, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.9073006141723517, |
|
"learning_rate": 0.0001974511256743425, |
|
"loss": 0.7839, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8632251284659535, |
|
"learning_rate": 0.0001972522079666272, |
|
"loss": 0.8221, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8464617036052766, |
|
"learning_rate": 0.00019704592510086258, |
|
"loss": 0.7659, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 7.869410992708508, |
|
"learning_rate": 0.00019683229269937686, |
|
"loss": 0.793, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6910263819695035, |
|
"learning_rate": 0.00019661132694109737, |
|
"loss": 0.7476, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7823617079640587, |
|
"learning_rate": 0.00019638304456032514, |
|
"loss": 0.7543, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.8986022109125568, |
|
"learning_rate": 0.0001961474628454679, |
|
"loss": 0.7501, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.8605053572792184, |
|
"learning_rate": 0.00019590459963773042, |
|
"loss": 0.8088, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7869120724203714, |
|
"learning_rate": 0.00019565447332976362, |
|
"loss": 0.7774, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8300328955398425, |
|
"learning_rate": 0.0001953971028642715, |
|
"loss": 0.7881, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.701180165130191, |
|
"learning_rate": 0.00019513250773257665, |
|
"loss": 0.7878, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.9378185605974173, |
|
"learning_rate": 0.00019486070797314402, |
|
"loss": 0.7515, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.9816239126099178, |
|
"learning_rate": 0.00019458172417006347, |
|
"loss": 0.7393, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.0146355146158235, |
|
"learning_rate": 0.00019429557745149082, |
|
"loss": 0.7174, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.9489373557296571, |
|
"learning_rate": 0.00019400228948804774, |
|
"loss": 0.8367, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.4938666982243114, |
|
"learning_rate": 0.00019370188249118067, |
|
"loss": 0.8304, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7895857808322408, |
|
"learning_rate": 0.00019339437921147854, |
|
"loss": 0.7571, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.0350953755957781, |
|
"learning_rate": 0.00019307980293694997, |
|
"loss": 0.7984, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.9217385632598274, |
|
"learning_rate": 0.00019275817749125955, |
|
"loss": 0.7453, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7391980939802598, |
|
"learning_rate": 0.00019242952723192355, |
|
"loss": 0.6975, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7501773078010896, |
|
"learning_rate": 0.00019209387704846535, |
|
"loss": 0.7518, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7573007042638589, |
|
"learning_rate": 0.0001917512523605304, |
|
"loss": 0.7379, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8140744826191785, |
|
"learning_rate": 0.0001914016791159613, |
|
"loss": 0.7044, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8450442031858358, |
|
"learning_rate": 0.00019104518378883253, |
|
"loss": 0.763, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7175341911600421, |
|
"learning_rate": 0.00019068179337744547, |
|
"loss": 0.688, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7728305894940893, |
|
"learning_rate": 0.00019031153540228398, |
|
"loss": 0.743, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8901123426372263, |
|
"learning_rate": 0.00018993443790392994, |
|
"loss": 0.7268, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8493239266624959, |
|
"learning_rate": 0.0001895505294409399, |
|
"loss": 0.761, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7812710594507711, |
|
"learning_rate": 0.0001891598390876821, |
|
"loss": 0.7707, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.0596730633980673, |
|
"learning_rate": 0.00018876239643213455, |
|
"loss": 0.7288, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.7573454798105217, |
|
"learning_rate": 0.00018835823157364458, |
|
"loss": 0.7391, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.0031748089998647, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 0.7195, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 7.439237965595793, |
|
"learning_rate": 0.0001875298581883559, |
|
"loss": 0.7838, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.844581374152523, |
|
"learning_rate": 0.000187105712396389, |
|
"loss": 0.6861, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.678850258225293, |
|
"learning_rate": 0.00018667496986639207, |
|
"loss": 0.7423, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7753613912910128, |
|
"learning_rate": 0.00018623766321959688, |
|
"loss": 0.7545, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.8548883098140322, |
|
"learning_rate": 0.00018579382557435247, |
|
"loss": 0.7697, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.8184215312663371, |
|
"learning_rate": 0.00018534349054361707, |
|
"loss": 0.725, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7700953831367962, |
|
"learning_rate": 0.00018488669223241258, |
|
"loss": 0.7443, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.8152820586755163, |
|
"learning_rate": 0.00018442346523524146, |
|
"loss": 0.7442, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7745035443923122, |
|
"learning_rate": 0.00018395384463346722, |
|
"loss": 0.7462, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7277596367063084, |
|
"learning_rate": 0.00018347786599265712, |
|
"loss": 0.7176, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7876787713429247, |
|
"learning_rate": 0.00018299556535988915, |
|
"loss": 0.7013, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8438646538243175, |
|
"learning_rate": 0.0001825069792610218, |
|
"loss": 0.7209, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6802198137748939, |
|
"learning_rate": 0.00018201214469792793, |
|
"loss": 0.7562, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.835891816407763, |
|
"learning_rate": 0.00018151109914569266, |
|
"loss": 0.7503, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.744844585452191, |
|
"learning_rate": 0.00018100388054977508, |
|
"loss": 0.7329, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7364974652166336, |
|
"learning_rate": 0.00018049052732313465, |
|
"loss": 0.7198, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8292878106466043, |
|
"learning_rate": 0.00017997107834332216, |
|
"loss": 0.6571, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7148804485605235, |
|
"learning_rate": 0.00017944557294953528, |
|
"loss": 0.6983, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7140828522762399, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 0.7508, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7717708896492551, |
|
"learning_rate": 0.00017837655256715355, |
|
"loss": 0.7264, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7427007083892259, |
|
"learning_rate": 0.00017783311853820206, |
|
"loss": 0.6694, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7560632004948113, |
|
"learning_rate": 0.00017728379000843164, |
|
"loss": 0.6721, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6487067048836795, |
|
"learning_rate": 0.00017672860857989464, |
|
"loss": 0.6811, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.638380563962605, |
|
"learning_rate": 0.00017616761629789824, |
|
"loss": 0.7018, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7305532339795056, |
|
"learning_rate": 0.00017560085564782057, |
|
"loss": 0.7208, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7121274962251023, |
|
"learning_rate": 0.0001750283695518929, |
|
"loss": 0.7132, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7819223698571628, |
|
"learning_rate": 0.00017445020136594907, |
|
"loss": 0.6793, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.731314756139953, |
|
"learning_rate": 0.00017386639487614232, |
|
"loss": 0.6422, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7414278250964547, |
|
"learning_rate": 0.00017327699429562884, |
|
"loss": 0.7249, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6557522740071186, |
|
"learning_rate": 0.00017268204426121967, |
|
"loss": 0.7039, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7140545115742526, |
|
"learning_rate": 0.0001720815898300002, |
|
"loss": 0.7219, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.809113184222831, |
|
"learning_rate": 0.00017147567647591777, |
|
"loss": 0.7726, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7041999961741995, |
|
"learning_rate": 0.0001708643500863379, |
|
"loss": 0.7389, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.018002266507155, |
|
"learning_rate": 0.00017024765695856922, |
|
"loss": 0.6771, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7382080435909644, |
|
"learning_rate": 0.000169625643796357, |
|
"loss": 0.7513, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7763816161023751, |
|
"learning_rate": 0.0001689983577063464, |
|
"loss": 0.7229, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6472383525714234, |
|
"learning_rate": 0.00016836584619451476, |
|
"loss": 0.6477, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7937093362264419, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 0.7871, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7152698878521233, |
|
"learning_rate": 0.0001670853389043432, |
|
"loss": 0.6848, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.693763900764871, |
|
"learning_rate": 0.00016643744010209018, |
|
"loss": 0.7481, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7760662028837724, |
|
"learning_rate": 0.00016578450982284584, |
|
"loss": 0.7181, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.9732534367753803, |
|
"learning_rate": 0.0001651265975146875, |
|
"loss": 0.7059, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.721390970519964, |
|
"learning_rate": 0.00016446375300299424, |
|
"loss": 0.6929, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7892594293960745, |
|
"learning_rate": 0.00016379602648667363, |
|
"loss": 0.6969, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7144379882173059, |
|
"learning_rate": 0.00016312346853435976, |
|
"loss": 0.71, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.692091441113428, |
|
"learning_rate": 0.00016244613008058387, |
|
"loss": 0.7081, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7073830863522493, |
|
"learning_rate": 0.0001617640624219166, |
|
"loss": 0.6998, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6939706731832529, |
|
"learning_rate": 0.0001610773172130835, |
|
"loss": 0.7275, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6515914750845688, |
|
"learning_rate": 0.00016038594646305285, |
|
"loss": 0.7305, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6867489914120396, |
|
"learning_rate": 0.00015969000253109706, |
|
"loss": 0.6677, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7789785545676167, |
|
"learning_rate": 0.0001589895381228272, |
|
"loss": 0.6983, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6722067334974933, |
|
"learning_rate": 0.00015828460628620157, |
|
"loss": 0.6615, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.65540008670773, |
|
"learning_rate": 0.0001575752604075083, |
|
"loss": 0.6925, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7493774732322246, |
|
"learning_rate": 0.0001568615542073221, |
|
"loss": 0.7029, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.742091567316259, |
|
"learning_rate": 0.00015614354173643604, |
|
"loss": 0.6703, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6675199697978297, |
|
"learning_rate": 0.00015542127737176798, |
|
"loss": 0.6877, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8397591916647437, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 0.7347, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.704256604633287, |
|
"learning_rate": 0.00015396421207464908, |
|
"loss": 0.7225, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6307506132251888, |
|
"learning_rate": 0.0001532295214894739, |
|
"loss": 0.6939, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.650289475715352, |
|
"learning_rate": 0.00015249079969671114, |
|
"loss": 0.6572, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6508582023204799, |
|
"learning_rate": 0.00015174810264164865, |
|
"loss": 0.7119, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6690206745714716, |
|
"learning_rate": 0.0001510014865706309, |
|
"loss": 0.6948, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6408558123355553, |
|
"learning_rate": 0.00015025100802679942, |
|
"loss": 0.6977, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7091883584562263, |
|
"learning_rate": 0.0001494967238458108, |
|
"loss": 0.746, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7284758613046809, |
|
"learning_rate": 0.00014873869115153223, |
|
"loss": 0.7454, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7318871271330087, |
|
"learning_rate": 0.0001479769673517152, |
|
"loss": 0.7204, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6220632369248427, |
|
"learning_rate": 0.00014721161013364829, |
|
"loss": 0.6871, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6312761822751318, |
|
"learning_rate": 0.00014644267745978797, |
|
"loss": 0.7352, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.629060072944782, |
|
"learning_rate": 0.00014567022756336917, |
|
"loss": 0.6922, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6482601288258285, |
|
"learning_rate": 0.00014489431894399498, |
|
"loss": 0.6718, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7021326867479869, |
|
"learning_rate": 0.0001441150103632066, |
|
"loss": 0.7384, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6275332412079758, |
|
"learning_rate": 0.0001433323608400328, |
|
"loss": 0.6894, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6213116953061292, |
|
"learning_rate": 0.00014254642964652052, |
|
"loss": 0.6642, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6265488468753385, |
|
"learning_rate": 0.00014175727630324597, |
|
"loss": 0.6722, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6354231838763917, |
|
"learning_rate": 0.000140964960574807, |
|
"loss": 0.7121, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.719702694761498, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 0.6799, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6940261755213799, |
|
"learning_rate": 0.0001393710822137604, |
|
"loss": 0.6515, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6940905781567512, |
|
"learning_rate": 0.00013856964028963116, |
|
"loss": 0.7139, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6185000019239019, |
|
"learning_rate": 0.00013776527738815263, |
|
"loss": 0.6843, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6294907306228986, |
|
"learning_rate": 0.00013695805442578136, |
|
"loss": 0.7154, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.602922685838362, |
|
"learning_rate": 0.00013614803253557357, |
|
"loss": 0.685, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.676522392009219, |
|
"learning_rate": 0.00013533527306255547, |
|
"loss": 0.6851, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.611104819984954, |
|
"learning_rate": 0.00013451983755907737, |
|
"loss": 0.6992, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6796636946794475, |
|
"learning_rate": 0.00013370178778015224, |
|
"loss": 0.7065, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6046664764536965, |
|
"learning_rate": 0.00013288118567877875, |
|
"loss": 0.6627, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7122396015442166, |
|
"learning_rate": 0.00013205809340124952, |
|
"loss": 0.736, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.644051018087198, |
|
"learning_rate": 0.00013123257328244453, |
|
"loss": 0.7115, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6560336305912403, |
|
"learning_rate": 0.00013040468784111044, |
|
"loss": 0.6887, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.611680866224159, |
|
"learning_rate": 0.0001295744997751257, |
|
"loss": 0.6667, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7406926056500003, |
|
"learning_rate": 0.00012874207195675262, |
|
"loss": 0.7699, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6977955799983129, |
|
"learning_rate": 0.0001279074674278754, |
|
"loss": 0.7077, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6372845332477916, |
|
"learning_rate": 0.0001270707493952263, |
|
"loss": 0.6686, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6492145705292489, |
|
"learning_rate": 0.00012623198122559863, |
|
"loss": 0.7046, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6681970186049779, |
|
"learning_rate": 0.00012539122644104756, |
|
"loss": 0.7536, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6311121728953317, |
|
"learning_rate": 0.00012454854871407994, |
|
"loss": 0.7268, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6171650489076799, |
|
"learning_rate": 0.00012370401186283185, |
|
"loss": 0.6731, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6139846134340758, |
|
"learning_rate": 0.00012285767984623563, |
|
"loss": 0.6648, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6727015510207016, |
|
"learning_rate": 0.00012200961675917604, |
|
"loss": 0.7182, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6093536573435098, |
|
"learning_rate": 0.00012115988682763627, |
|
"loss": 0.7114, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6831844462735769, |
|
"learning_rate": 0.00012030855440383386, |
|
"loss": 0.6701, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6158465960782087, |
|
"learning_rate": 0.00011945568396134721, |
|
"loss": 0.646, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.609767308811365, |
|
"learning_rate": 0.0001186013400902328, |
|
"loss": 0.6274, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5469825613650058, |
|
"learning_rate": 0.00011774558749213357, |
|
"loss": 0.6817, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6747015764561682, |
|
"learning_rate": 0.00011688849097537904, |
|
"loss": 0.691, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6382223494379158, |
|
"learning_rate": 0.00011603011545007707, |
|
"loss": 0.7, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6124637055623531, |
|
"learning_rate": 0.00011517052592319811, |
|
"loss": 0.6687, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6350222892172688, |
|
"learning_rate": 0.00011430978749365203, |
|
"loss": 0.6679, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6030636477769307, |
|
"learning_rate": 0.00011344796534735804, |
|
"loss": 0.642, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6399897114215611, |
|
"learning_rate": 0.00011258512475230807, |
|
"loss": 0.6743, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6493391217471788, |
|
"learning_rate": 0.00011172133105362358, |
|
"loss": 0.6282, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6279394364552422, |
|
"learning_rate": 0.00011085664966860727, |
|
"loss": 0.7069, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6238259325357338, |
|
"learning_rate": 0.00010999114608178837, |
|
"loss": 0.6586, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7124124134964612, |
|
"learning_rate": 0.00010912488583996363, |
|
"loss": 0.6666, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5628912814858279, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 0.661, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6905577214693723, |
|
"learning_rate": 0.00010739035786003239, |
|
"loss": 0.7154, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6530459282353105, |
|
"learning_rate": 0.00010652222148215905, |
|
"loss": 0.681, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.569548383478722, |
|
"learning_rate": 0.00010565359115979791, |
|
"loss": 0.6746, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5714999167791848, |
|
"learning_rate": 0.00010478453267654147, |
|
"loss": 0.6769, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7076381300646487, |
|
"learning_rate": 0.00010391511184840774, |
|
"loss": 0.6724, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5540842570197104, |
|
"learning_rate": 0.00010304539451885629, |
|
"loss": 0.6209, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6415952358813346, |
|
"learning_rate": 0.00010217544655380129, |
|
"loss": 0.6615, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6393147605518732, |
|
"learning_rate": 0.00010130533383662362, |
|
"loss": 0.6385, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6289522798672027, |
|
"learning_rate": 0.00010043512226318124, |
|
"loss": 0.6857, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6734225585184999, |
|
"learning_rate": 9.956487773681879e-05, |
|
"loss": 0.6244, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6164997133604642, |
|
"learning_rate": 9.869466616337642e-05, |
|
"loss": 0.6462, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6511338335869312, |
|
"learning_rate": 9.78245534461987e-05, |
|
"loss": 0.7006, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6377607002835902, |
|
"learning_rate": 9.695460548114373e-05, |
|
"loss": 0.6989, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5936010824613069, |
|
"learning_rate": 9.608488815159227e-05, |
|
"loss": 0.6407, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.643239848981791, |
|
"learning_rate": 9.521546732345858e-05, |
|
"loss": 0.693, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5653914084405384, |
|
"learning_rate": 9.43464088402021e-05, |
|
"loss": 0.6677, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.5724255909192363, |
|
"learning_rate": 9.347777851784096e-05, |
|
"loss": 0.6372, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.5659784186045361, |
|
"learning_rate": 9.260964213996762e-05, |
|
"loss": 0.7082, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6531108684279694, |
|
"learning_rate": 9.174206545276677e-05, |
|
"loss": 0.6176, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6422142052940009, |
|
"learning_rate": 9.087511416003635e-05, |
|
"loss": 0.6754, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6537245766903678, |
|
"learning_rate": 9.000885391821164e-05, |
|
"loss": 0.6484, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6396400269252728, |
|
"learning_rate": 8.914335033139274e-05, |
|
"loss": 0.6652, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5559545083417303, |
|
"learning_rate": 8.827866894637643e-05, |
|
"loss": 0.6791, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5995648170220891, |
|
"learning_rate": 8.741487524769199e-05, |
|
"loss": 0.6461, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5928913948160235, |
|
"learning_rate": 8.655203465264197e-05, |
|
"loss": 0.6631, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5570723055144339, |
|
"learning_rate": 8.5690212506348e-05, |
|
"loss": 0.664, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5932902194668669, |
|
"learning_rate": 8.482947407680193e-05, |
|
"loss": 0.6523, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.637557903417406, |
|
"learning_rate": 8.396988454992295e-05, |
|
"loss": 0.5927, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.588469166277496, |
|
"learning_rate": 8.311150902462095e-05, |
|
"loss": 0.6708, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7093924699893847, |
|
"learning_rate": 8.225441250786644e-05, |
|
"loss": 0.6862, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6644277543653607, |
|
"learning_rate": 8.139865990976722e-05, |
|
"loss": 0.6397, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5692386084383936, |
|
"learning_rate": 8.054431603865283e-05, |
|
"loss": 0.6501, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6346519607295912, |
|
"learning_rate": 7.969144559616613e-05, |
|
"loss": 0.6625, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6264859907511606, |
|
"learning_rate": 7.884011317236376e-05, |
|
"loss": 0.643, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.623282323523227, |
|
"learning_rate": 7.7990383240824e-05, |
|
"loss": 0.6844, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.592564998468818, |
|
"learning_rate": 7.714232015376441e-05, |
|
"loss": 0.6411, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.613168941035148, |
|
"learning_rate": 7.629598813716817e-05, |
|
"loss": 0.6556, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6112236475732046, |
|
"learning_rate": 7.54514512859201e-05, |
|
"loss": 0.6714, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6020471154492149, |
|
"learning_rate": 7.460877355895248e-05, |
|
"loss": 0.6621, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6620676419938459, |
|
"learning_rate": 7.376801877440143e-05, |
|
"loss": 0.6259, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6141172748711491, |
|
"learning_rate": 7.292925060477367e-05, |
|
"loss": 0.6335, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5491409778708467, |
|
"learning_rate": 7.20925325721246e-05, |
|
"loss": 0.6264, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6462891221310414, |
|
"learning_rate": 7.12579280432474e-05, |
|
"loss": 0.6721, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6064044918933936, |
|
"learning_rate": 7.04255002248743e-05, |
|
"loss": 0.6356, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5553383645621945, |
|
"learning_rate": 6.959531215888961e-05, |
|
"loss": 0.6569, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5637312706044894, |
|
"learning_rate": 6.876742671755547e-05, |
|
"loss": 0.676, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.597407585296278, |
|
"learning_rate": 6.794190659875052e-05, |
|
"loss": 0.6223, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6117957902349217, |
|
"learning_rate": 6.711881432122128e-05, |
|
"loss": 0.5797, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5636620516655729, |
|
"learning_rate": 6.62982122198478e-05, |
|
"loss": 0.6347, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6093740585052143, |
|
"learning_rate": 6.548016244092264e-05, |
|
"loss": 0.6364, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5832448160710062, |
|
"learning_rate": 6.466472693744454e-05, |
|
"loss": 0.6249, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6212348267309529, |
|
"learning_rate": 6.385196746442644e-05, |
|
"loss": 0.6373, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6041857720598327, |
|
"learning_rate": 6.304194557421866e-05, |
|
"loss": 0.6367, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6228926839065734, |
|
"learning_rate": 6.223472261184737e-05, |
|
"loss": 0.6272, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.592678196931043, |
|
"learning_rate": 6.143035971036885e-05, |
|
"loss": 0.6405, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6188393302850655, |
|
"learning_rate": 6.0628917786239615e-05, |
|
"loss": 0.6562, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6521252480679709, |
|
"learning_rate": 5.983045753470308e-05, |
|
"loss": 0.6177, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6015226341824674, |
|
"learning_rate": 5.9035039425192996e-05, |
|
"loss": 0.6543, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5522056983476296, |
|
"learning_rate": 5.824272369675403e-05, |
|
"loss": 0.6412, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6565986603056718, |
|
"learning_rate": 5.74535703534795e-05, |
|
"loss": 0.608, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6243504360364475, |
|
"learning_rate": 5.666763915996725e-05, |
|
"loss": 0.6187, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.5710246396677061, |
|
"learning_rate": 5.588498963679338e-05, |
|
"loss": 0.6566, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6357843375059528, |
|
"learning_rate": 5.5105681056004996e-05, |
|
"loss": 0.6552, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6476235313255283, |
|
"learning_rate": 5.432977243663089e-05, |
|
"loss": 0.6433, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6420823226435337, |
|
"learning_rate": 5.355732254021205e-05, |
|
"loss": 0.6555, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5834281412304384, |
|
"learning_rate": 5.278838986635175e-05, |
|
"loss": 0.6457, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.603427697793282, |
|
"learning_rate": 5.202303264828482e-05, |
|
"loss": 0.6338, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5503026867657649, |
|
"learning_rate": 5.1261308848467806e-05, |
|
"loss": 0.6211, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6140538857825613, |
|
"learning_rate": 5.0503276154189205e-05, |
|
"loss": 0.6133, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5651106677229555, |
|
"learning_rate": 4.974899197320059e-05, |
|
"loss": 0.6085, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5806960834836444, |
|
"learning_rate": 4.899851342936913e-05, |
|
"loss": 0.6326, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6248344723288919, |
|
"learning_rate": 4.825189735835137e-05, |
|
"loss": 0.5868, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5547428746495593, |
|
"learning_rate": 4.750920030328889e-05, |
|
"loss": 0.6694, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6481730854953198, |
|
"learning_rate": 4.677047851052615e-05, |
|
"loss": 0.5942, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6287265723350564, |
|
"learning_rate": 4.6035787925350916e-05, |
|
"loss": 0.6555, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5975999751699451, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 0.5761, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6370150437871046, |
|
"learning_rate": 4.457872262823202e-05, |
|
"loss": 0.68, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5854134285960169, |
|
"learning_rate": 4.385645826356401e-05, |
|
"loss": 0.6401, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5939120027050839, |
|
"learning_rate": 4.313844579267793e-05, |
|
"loss": 0.6449, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6171451891512717, |
|
"learning_rate": 4.242473959249173e-05, |
|
"loss": 0.6585, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5375088192193765, |
|
"learning_rate": 4.1715393713798464e-05, |
|
"loss": 0.6159, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5570000723429187, |
|
"learning_rate": 4.1010461877172836e-05, |
|
"loss": 0.6302, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6991127608006316, |
|
"learning_rate": 4.030999746890295e-05, |
|
"loss": 0.633, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6208588931148632, |
|
"learning_rate": 3.961405353694716e-05, |
|
"loss": 0.6392, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5168283033651194, |
|
"learning_rate": 3.892268278691651e-05, |
|
"loss": 0.6427, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.583521526288499, |
|
"learning_rate": 3.8235937578083424e-05, |
|
"loss": 0.5509, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5431854753646991, |
|
"learning_rate": 3.755386991941618e-05, |
|
"loss": 0.6798, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5501260666199879, |
|
"learning_rate": 3.687653146564025e-05, |
|
"loss": 0.5858, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5343821946856416, |
|
"learning_rate": 3.6203973513326395e-05, |
|
"loss": 0.6327, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5536890084608536, |
|
"learning_rate": 3.553624699700578e-05, |
|
"loss": 0.5837, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5340609690647654, |
|
"learning_rate": 3.4873402485312544e-05, |
|
"loss": 0.6238, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.684928593101013, |
|
"learning_rate": 3.4215490177154173e-05, |
|
"loss": 0.6159, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6111300960968657, |
|
"learning_rate": 3.356255989790984e-05, |
|
"loss": 0.6057, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5583521629303174, |
|
"learning_rate": 3.2914661095656805e-05, |
|
"loss": 0.6339, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5796480790048567, |
|
"learning_rate": 3.227184283742591e-05, |
|
"loss": 0.6285, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6052550118253326, |
|
"learning_rate": 3.1634153805485243e-05, |
|
"loss": 0.6373, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5993042127057324, |
|
"learning_rate": 3.100164229365361e-05, |
|
"loss": 0.5693, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6482643299850712, |
|
"learning_rate": 3.0374356203643005e-05, |
|
"loss": 0.6157, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5279593828065655, |
|
"learning_rate": 2.9752343041430798e-05, |
|
"loss": 0.5828, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.578989064972009, |
|
"learning_rate": 2.9135649913662087e-05, |
|
"loss": 0.6142, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6613465262466375, |
|
"learning_rate": 2.8524323524082243e-05, |
|
"loss": 0.5862, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5593586714719698, |
|
"learning_rate": 2.7918410169999822e-05, |
|
"loss": 0.6186, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6009401050358155, |
|
"learning_rate": 2.7317955738780333e-05, |
|
"loss": 0.5771, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6239256426233234, |
|
"learning_rate": 2.672300570437116e-05, |
|
"loss": 0.5703, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5565646944070531, |
|
"learning_rate": 2.6133605123857708e-05, |
|
"loss": 0.6224, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6682481662923421, |
|
"learning_rate": 2.5549798634050936e-05, |
|
"loss": 0.63, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5811684620171189, |
|
"learning_rate": 2.4971630448107163e-05, |
|
"loss": 0.6351, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6113567840839303, |
|
"learning_rate": 2.4399144352179483e-05, |
|
"loss": 0.5717, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5287785841858424, |
|
"learning_rate": 2.3832383702101747e-05, |
|
"loss": 0.5764, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.581644293338218, |
|
"learning_rate": 2.327139142010538e-05, |
|
"loss": 0.6452, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5719089122205556, |
|
"learning_rate": 2.271620999156837e-05, |
|
"loss": 0.5994, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6003439148215275, |
|
"learning_rate": 2.216688146179795e-05, |
|
"loss": 0.6685, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5523434831489719, |
|
"learning_rate": 2.1623447432846466e-05, |
|
"loss": 0.5729, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5800800946514322, |
|
"learning_rate": 2.1085949060360654e-05, |
|
"loss": 0.598, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5987900895932757, |
|
"learning_rate": 2.055442705046474e-05, |
|
"loss": 0.529, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5380314863841026, |
|
"learning_rate": 2.0028921656677855e-05, |
|
"loss": 0.5847, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6154729554684041, |
|
"learning_rate": 1.950947267686536e-05, |
|
"loss": 0.5655, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5653169274060361, |
|
"learning_rate": 1.8996119450224935e-05, |
|
"loss": 0.6026, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.6096171765238086, |
|
"learning_rate": 1.8488900854307366e-05, |
|
"loss": 0.6392, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5821164151718903, |
|
"learning_rate": 1.79878553020721e-05, |
|
"loss": 0.6021, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6002726973121734, |
|
"learning_rate": 1.7493020738978206e-05, |
|
"loss": 0.6432, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5407410721681246, |
|
"learning_rate": 1.7004434640110854e-05, |
|
"loss": 0.5675, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5951204193957552, |
|
"learning_rate": 1.6522134007342893e-05, |
|
"loss": 0.6271, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6003726456718518, |
|
"learning_rate": 1.60461553665328e-05, |
|
"loss": 0.6051, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6020973087534124, |
|
"learning_rate": 1.557653476475852e-05, |
|
"loss": 0.5899, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5500181352884663, |
|
"learning_rate": 1.5113307767587448e-05, |
|
"loss": 0.6425, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6418488919418228, |
|
"learning_rate": 1.4656509456382928e-05, |
|
"loss": 0.6296, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.54497743056703, |
|
"learning_rate": 1.4206174425647556e-05, |
|
"loss": 0.5507, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.6216914003629271, |
|
"learning_rate": 1.3762336780403163e-05, |
|
"loss": 0.6547, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.6114537267371907, |
|
"learning_rate": 1.332503013360794e-05, |
|
"loss": 0.5881, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5787707524225494, |
|
"learning_rate": 1.2894287603611032e-05, |
|
"loss": 0.6015, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5727228874213498, |
|
"learning_rate": 1.247014181164412e-05, |
|
"loss": 0.6315, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5365520051065367, |
|
"learning_rate": 1.2052624879351104e-05, |
|
"loss": 0.5849, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5882130601500214, |
|
"learning_rate": 1.1641768426355427e-05, |
|
"loss": 0.5638, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5605719123191992, |
|
"learning_rate": 1.123760356786545e-05, |
|
"loss": 0.5813, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.540475552941427, |
|
"learning_rate": 1.0840160912317943e-05, |
|
"loss": 0.5988, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5836793137801853, |
|
"learning_rate": 1.0449470559060126e-05, |
|
"loss": 0.6359, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5415961274854277, |
|
"learning_rate": 1.0065562096070069e-05, |
|
"loss": 0.6206, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5586612112915813, |
|
"learning_rate": 9.68846459771604e-06, |
|
"loss": 0.5986, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6076513228987033, |
|
"learning_rate": 9.318206622554549e-06, |
|
"loss": 0.5981, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5929614655401766, |
|
"learning_rate": 8.954816211167482e-06, |
|
"loss": 0.5746, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5680926959791349, |
|
"learning_rate": 8.59832088403868e-06, |
|
"loss": 0.6537, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6220839217304087, |
|
"learning_rate": 8.24874763946959e-06, |
|
"loss": 0.5998, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.554602037671773, |
|
"learning_rate": 7.906122951534677e-06, |
|
"loss": 0.5864, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5935221816502435, |
|
"learning_rate": 7.570472768076464e-06, |
|
"loss": 0.556, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5727608314538538, |
|
"learning_rate": 7.241822508740448e-06, |
|
"loss": 0.6276, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6164522880456009, |
|
"learning_rate": 6.920197063050038e-06, |
|
"loss": 0.5819, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5718452829860532, |
|
"learning_rate": 6.605620788521472e-06, |
|
"loss": 0.6139, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.569531129249351, |
|
"learning_rate": 6.2981175088193564e-06, |
|
"loss": 0.5936, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5844212361264539, |
|
"learning_rate": 5.997710511952259e-06, |
|
"loss": 0.6065, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6109564839739653, |
|
"learning_rate": 5.70442254850918e-06, |
|
"loss": 0.648, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6491048595264175, |
|
"learning_rate": 5.418275829936537e-06, |
|
"loss": 0.555, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6204717817131216, |
|
"learning_rate": 5.139292026855991e-06, |
|
"loss": 0.6479, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6360339795207006, |
|
"learning_rate": 4.867492267423379e-06, |
|
"loss": 0.5502, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5500198353842222, |
|
"learning_rate": 4.602897135728513e-06, |
|
"loss": 0.6044, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5577787929024242, |
|
"learning_rate": 4.3455266702364e-06, |
|
"loss": 0.6021, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5620959438033024, |
|
"learning_rate": 4.095400362269597e-06, |
|
"loss": 0.6364, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6455883188430549, |
|
"learning_rate": 3.852537154532121e-06, |
|
"loss": 0.5935, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5463076296085635, |
|
"learning_rate": 3.616955439674863e-06, |
|
"loss": 0.5898, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5664602810118163, |
|
"learning_rate": 3.388673058902647e-06, |
|
"loss": 0.5727, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5844823066126078, |
|
"learning_rate": 3.167707300623135e-06, |
|
"loss": 0.5734, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5688042679406337, |
|
"learning_rate": 2.9540748991374268e-06, |
|
"loss": 0.5911, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5817718407371287, |
|
"learning_rate": 2.7477920333728203e-06, |
|
"loss": 0.583, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6549728437210761, |
|
"learning_rate": 2.548874325657502e-06, |
|
"loss": 0.6076, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5825225050052231, |
|
"learning_rate": 2.3573368405374052e-06, |
|
"loss": 0.5811, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5577099309283072, |
|
"learning_rate": 2.1731940836354103e-06, |
|
"loss": 0.6496, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6215824384460507, |
|
"learning_rate": 1.996460000552702e-06, |
|
"loss": 0.5724, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5177119689387795, |
|
"learning_rate": 1.827147975812693e-06, |
|
"loss": 0.6236, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6230359593868082, |
|
"learning_rate": 1.6652708318473763e-06, |
|
"loss": 0.6383, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5622331543379322, |
|
"learning_rate": 1.5108408280262276e-06, |
|
"loss": 0.594, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6107181340780342, |
|
"learning_rate": 1.3638696597277679e-06, |
|
"loss": 0.6463, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6031539978263786, |
|
"learning_rate": 1.2243684574538838e-06, |
|
"loss": 0.6587, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5761269685398007, |
|
"learning_rate": 1.092347785986858e-06, |
|
"loss": 0.6137, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5520617664080628, |
|
"learning_rate": 9.678176435892417e-07, |
|
"loss": 0.5566, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6171032667293939, |
|
"learning_rate": 8.507874612467382e-07, |
|
"loss": 0.6151, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6143485738921709, |
|
"learning_rate": 7.412661019538858e-07, |
|
"loss": 0.5945, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5659045654192223, |
|
"learning_rate": 6.392618600429057e-07, |
|
"loss": 0.5939, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6008704187809593, |
|
"learning_rate": 5.447824605555041e-07, |
|
"loss": 0.593, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6456664206486963, |
|
"learning_rate": 4.578350586578628e-07, |
|
"loss": 0.6243, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6134762242372662, |
|
"learning_rate": 3.7842623909875033e-07, |
|
"loss": 0.5784, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5477747018780752, |
|
"learning_rate": 3.0656201571085394e-07, |
|
"loss": 0.593, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5075895950713205, |
|
"learning_rate": 2.422478309553222e-07, |
|
"loss": 0.5505, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.587289160097588, |
|
"learning_rate": 1.854885555095942e-07, |
|
"loss": 0.6621, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.6392065048372381, |
|
"learning_rate": 1.3628848789853933e-07, |
|
"loss": 0.6049, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5961350765030653, |
|
"learning_rate": 9.465135416891757e-08, |
|
"loss": 0.6227, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5699523340921686, |
|
"learning_rate": 6.058030760718315e-08, |
|
"loss": 0.5973, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5616341300318727, |
|
"learning_rate": 3.4077928500686475e-08, |
|
"loss": 0.6024, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5919424339524653, |
|
"learning_rate": 1.5146223942297254e-08, |
|
"loss": 0.6247, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5459942448953277, |
|
"learning_rate": 3.7866276783149464e-09, |
|
"loss": 0.6021, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5739335118144996, |
|
"learning_rate": 0.0, |
|
"loss": 0.5901, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 373, |
|
"total_flos": 52213203664896.0, |
|
"train_loss": 0.6854520184424863, |
|
"train_runtime": 4288.0633, |
|
"train_samples_per_second": 5.561, |
|
"train_steps_per_second": 0.087 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 373, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"total_flos": 52213203664896.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|