|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.4934383202099737,
  "eval_steps": 500,
  "global_step": 285,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005249343832020997,
      "grad_norm": 1.1348930782232016,
      "learning_rate": 1.5000000000000002e-07,
      "loss": 1.1087,
      "step": 1
    },
    {
      "epoch": 0.010498687664041995,
      "grad_norm": 1.123696373079589,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 1.1356,
      "step": 2
    },
    {
      "epoch": 0.015748031496062992,
      "grad_norm": 1.0989081863562118,
      "learning_rate": 4.5e-07,
      "loss": 1.1158,
      "step": 3
    },
    {
      "epoch": 0.02099737532808399,
      "grad_norm": 1.0628548113414964,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.0986,
      "step": 4
    },
    {
      "epoch": 0.026246719160104987,
      "grad_norm": 1.0629069543612368,
      "learning_rate": 7.5e-07,
      "loss": 1.0727,
      "step": 5
    },
    {
      "epoch": 0.031496062992125984,
      "grad_norm": 1.1219311917213644,
      "learning_rate": 9e-07,
      "loss": 1.1513,
      "step": 6
    },
    {
      "epoch": 0.03674540682414698,
      "grad_norm": 1.068318638334139,
      "learning_rate": 1.05e-06,
      "loss": 1.0978,
      "step": 7
    },
    {
      "epoch": 0.04199475065616798,
      "grad_norm": 1.0335025624008565,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.0932,
      "step": 8
    },
    {
      "epoch": 0.047244094488188976,
      "grad_norm": 0.9514112971268772,
      "learning_rate": 1.35e-06,
      "loss": 1.1046,
      "step": 9
    },
    {
      "epoch": 0.05249343832020997,
      "grad_norm": 0.8944230714776324,
      "learning_rate": 1.5e-06,
      "loss": 1.0638,
      "step": 10
    },
    {
      "epoch": 0.05774278215223097,
      "grad_norm": 0.8720343077794245,
      "learning_rate": 1.65e-06,
      "loss": 1.1132,
      "step": 11
    },
    {
      "epoch": 0.06299212598425197,
      "grad_norm": 0.7519518665820406,
      "learning_rate": 1.8e-06,
      "loss": 1.0788,
      "step": 12
    },
    {
      "epoch": 0.06824146981627296,
      "grad_norm": 0.7768466543241798,
      "learning_rate": 1.95e-06,
      "loss": 1.0795,
      "step": 13
    },
    {
      "epoch": 0.07349081364829396,
      "grad_norm": 0.7109922479048013,
      "learning_rate": 2.1e-06,
      "loss": 1.1012,
      "step": 14
    },
    {
      "epoch": 0.07874015748031496,
      "grad_norm": 0.6312078880187205,
      "learning_rate": 2.25e-06,
      "loss": 1.0851,
      "step": 15
    },
    {
      "epoch": 0.08398950131233596,
      "grad_norm": 0.5514473048370377,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.1041,
      "step": 16
    },
    {
      "epoch": 0.08923884514435695,
      "grad_norm": 0.6271281070432462,
      "learning_rate": 2.55e-06,
      "loss": 1.0855,
      "step": 17
    },
    {
      "epoch": 0.09448818897637795,
      "grad_norm": 0.7059888078645049,
      "learning_rate": 2.7e-06,
      "loss": 1.0473,
      "step": 18
    },
    {
      "epoch": 0.09973753280839895,
      "grad_norm": 0.7226157330393405,
      "learning_rate": 2.85e-06,
      "loss": 1.0665,
      "step": 19
    },
    {
      "epoch": 0.10498687664041995,
      "grad_norm": 0.7244742832208652,
      "learning_rate": 3e-06,
      "loss": 1.0604,
      "step": 20
    },
    {
      "epoch": 0.11023622047244094,
      "grad_norm": 0.7088251146482789,
      "learning_rate": 3.1500000000000003e-06,
      "loss": 1.0516,
      "step": 21
    },
    {
      "epoch": 0.11548556430446194,
      "grad_norm": 0.5987242362229293,
      "learning_rate": 3.3e-06,
      "loss": 1.084,
      "step": 22
    },
    {
      "epoch": 0.12073490813648294,
      "grad_norm": 0.5730637810768702,
      "learning_rate": 3.45e-06,
      "loss": 1.0621,
      "step": 23
    },
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 0.5894968443138215,
      "learning_rate": 3.6e-06,
      "loss": 1.0797,
      "step": 24
    },
    {
      "epoch": 0.13123359580052493,
      "grad_norm": 0.5798124303184627,
      "learning_rate": 3.75e-06,
      "loss": 1.0035,
      "step": 25
    },
    {
      "epoch": 0.13648293963254593,
      "grad_norm": 0.643205751513686,
      "learning_rate": 3.9e-06,
      "loss": 1.0455,
      "step": 26
    },
    {
      "epoch": 0.14173228346456693,
      "grad_norm": 0.5621970774702022,
      "learning_rate": 4.05e-06,
      "loss": 1.0576,
      "step": 27
    },
    {
      "epoch": 0.14698162729658792,
      "grad_norm": 0.5506084571895594,
      "learning_rate": 4.2e-06,
      "loss": 1.0298,
      "step": 28
    },
    {
      "epoch": 0.15223097112860892,
      "grad_norm": 0.48741149421912777,
      "learning_rate": 4.35e-06,
      "loss": 1.0018,
      "step": 29
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 0.46403007703544275,
      "learning_rate": 4.5e-06,
      "loss": 0.9872,
      "step": 30
    },
    {
      "epoch": 0.16272965879265092,
      "grad_norm": 0.4754381818573106,
      "learning_rate": 4.65e-06,
      "loss": 1.0271,
      "step": 31
    },
    {
      "epoch": 0.1679790026246719,
      "grad_norm": 0.9362850890979981,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.0437,
      "step": 32
    },
    {
      "epoch": 0.1732283464566929,
      "grad_norm": 0.47391181595772164,
      "learning_rate": 4.95e-06,
      "loss": 1.0437,
      "step": 33
    },
    {
      "epoch": 0.1784776902887139,
      "grad_norm": 0.5276920454851337,
      "learning_rate": 5.1e-06,
      "loss": 1.0557,
      "step": 34
    },
    {
      "epoch": 0.1837270341207349,
      "grad_norm": 0.4616075133913133,
      "learning_rate": 5.2500000000000006e-06,
      "loss": 1.0465,
      "step": 35
    },
    {
      "epoch": 0.1889763779527559,
      "grad_norm": 0.4555174555636226,
      "learning_rate": 5.4e-06,
      "loss": 1.0588,
      "step": 36
    },
    {
      "epoch": 0.1942257217847769,
      "grad_norm": 0.5071864534648831,
      "learning_rate": 5.55e-06,
      "loss": 1.044,
      "step": 37
    },
    {
      "epoch": 0.1994750656167979,
      "grad_norm": 0.4851367263882934,
      "learning_rate": 5.7e-06,
      "loss": 1.0464,
      "step": 38
    },
    {
      "epoch": 0.2047244094488189,
      "grad_norm": 0.44188022228811896,
      "learning_rate": 5.85e-06,
      "loss": 1.0182,
      "step": 39
    },
    {
      "epoch": 0.2099737532808399,
      "grad_norm": 0.43420740120454643,
      "learning_rate": 6e-06,
      "loss": 1.0188,
      "step": 40
    },
    {
      "epoch": 0.2152230971128609,
      "grad_norm": 0.4291543441241407,
      "learning_rate": 5.9998719351101036e-06,
      "loss": 1.0245,
      "step": 41
    },
    {
      "epoch": 0.2204724409448819,
      "grad_norm": 0.43326370236005163,
      "learning_rate": 5.999487751374158e-06,
      "loss": 1.0238,
      "step": 42
    },
    {
      "epoch": 0.22572178477690288,
      "grad_norm": 0.427571644972227,
      "learning_rate": 5.998847481592462e-06,
      "loss": 1.0311,
      "step": 43
    },
    {
      "epoch": 0.23097112860892388,
      "grad_norm": 0.4215063088273006,
      "learning_rate": 5.997951180429069e-06,
      "loss": 0.9925,
      "step": 44
    },
    {
      "epoch": 0.23622047244094488,
      "grad_norm": 0.4206536914503675,
      "learning_rate": 5.996798924407118e-06,
      "loss": 1.003,
      "step": 45
    },
    {
      "epoch": 0.24146981627296588,
      "grad_norm": 0.40910969064965136,
      "learning_rate": 5.995390811902302e-06,
      "loss": 0.9949,
      "step": 46
    },
    {
      "epoch": 0.24671916010498687,
      "grad_norm": 0.4165775049327623,
      "learning_rate": 5.993726963134471e-06,
      "loss": 0.9734,
      "step": 47
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 0.3832235501001726,
      "learning_rate": 5.9918075201573645e-06,
      "loss": 0.9485,
      "step": 48
    },
    {
      "epoch": 0.2572178477690289,
      "grad_norm": 0.37002495168808525,
      "learning_rate": 5.9896326468464835e-06,
      "loss": 0.9358,
      "step": 49
    },
    {
      "epoch": 0.26246719160104987,
      "grad_norm": 0.44836853406053057,
      "learning_rate": 5.987202528885104e-06,
      "loss": 0.9982,
      "step": 50
    },
    {
      "epoch": 0.2677165354330709,
      "grad_norm": 0.4080608606117312,
      "learning_rate": 5.984517373748417e-06,
      "loss": 1.0129,
      "step": 51
    },
    {
      "epoch": 0.27296587926509186,
      "grad_norm": 0.4001550595702573,
      "learning_rate": 5.981577410685822e-06,
      "loss": 0.9788,
      "step": 52
    },
    {
      "epoch": 0.2782152230971129,
      "grad_norm": 0.41021488877460305,
      "learning_rate": 5.978382890701347e-06,
      "loss": 1.0262,
      "step": 53
    },
    {
      "epoch": 0.28346456692913385,
      "grad_norm": 0.39997016380492506,
      "learning_rate": 5.9749340865322284e-06,
      "loss": 1.0275,
      "step": 54
    },
    {
      "epoch": 0.2887139107611549,
      "grad_norm": 0.3839823787027912,
      "learning_rate": 5.971231292625615e-06,
      "loss": 0.9374,
      "step": 55
    },
    {
      "epoch": 0.29396325459317585,
      "grad_norm": 0.4125068495663659,
      "learning_rate": 5.967274825113438e-06,
      "loss": 0.9954,
      "step": 56
    },
    {
      "epoch": 0.2992125984251969,
      "grad_norm": 0.3908377197765856,
      "learning_rate": 5.963065021785414e-06,
      "loss": 0.9671,
      "step": 57
    },
    {
      "epoch": 0.30446194225721784,
      "grad_norm": 0.3850488592862481,
      "learning_rate": 5.958602242060207e-06,
      "loss": 0.9657,
      "step": 58
    },
    {
      "epoch": 0.30971128608923887,
      "grad_norm": 0.3877990366088493,
      "learning_rate": 5.95388686695475e-06,
      "loss": 0.9678,
      "step": 59
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.40470471194287355,
      "learning_rate": 5.948919299051706e-06,
      "loss": 1.0149,
      "step": 60
    },
    {
      "epoch": 0.32020997375328086,
      "grad_norm": 0.42889495063392963,
      "learning_rate": 5.943699962465096e-06,
      "loss": 1.033,
      "step": 61
    },
    {
      "epoch": 0.32545931758530183,
      "grad_norm": 0.39164358737100274,
      "learning_rate": 5.9382293028040985e-06,
      "loss": 0.9761,
      "step": 62
    },
    {
      "epoch": 0.33070866141732286,
      "grad_norm": 0.3869342590567232,
      "learning_rate": 5.9325077871349975e-06,
      "loss": 0.9982,
      "step": 63
    },
    {
      "epoch": 0.3359580052493438,
      "grad_norm": 0.39264627926569035,
      "learning_rate": 5.9265359039413105e-06,
      "loss": 0.9667,
      "step": 64
    },
    {
      "epoch": 0.34120734908136485,
      "grad_norm": 0.3887717698297268,
      "learning_rate": 5.920314163082079e-06,
      "loss": 0.9806,
      "step": 65
    },
    {
      "epoch": 0.3464566929133858,
      "grad_norm": 0.40896336915084297,
      "learning_rate": 5.913843095748342e-06,
      "loss": 1.0135,
      "step": 66
    },
    {
      "epoch": 0.35170603674540685,
      "grad_norm": 0.3610209560875707,
      "learning_rate": 5.907123254417783e-06,
      "loss": 0.956,
      "step": 67
    },
    {
      "epoch": 0.3569553805774278,
      "grad_norm": 0.38154744815823505,
      "learning_rate": 5.9001552128075625e-06,
      "loss": 1.0045,
      "step": 68
    },
    {
      "epoch": 0.36220472440944884,
      "grad_norm": 0.4094826396119445,
      "learning_rate": 5.892939565825335e-06,
      "loss": 1.0069,
      "step": 69
    },
    {
      "epoch": 0.3674540682414698,
      "grad_norm": 0.39129138622932325,
      "learning_rate": 5.885476929518457e-06,
      "loss": 0.9525,
      "step": 70
    },
    {
      "epoch": 0.37270341207349084,
      "grad_norm": 0.3712890701175899,
      "learning_rate": 5.8777679410213956e-06,
      "loss": 0.9792,
      "step": 71
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 0.4086264062600148,
      "learning_rate": 5.869813258501323e-06,
      "loss": 0.9926,
      "step": 72
    },
    {
      "epoch": 0.38320209973753283,
      "grad_norm": 0.368975878599487,
      "learning_rate": 5.861613561101934e-06,
      "loss": 0.9643,
      "step": 73
    },
    {
      "epoch": 0.3884514435695538,
      "grad_norm": 0.36792811629461203,
      "learning_rate": 5.853169548885461e-06,
      "loss": 0.9867,
      "step": 74
    },
    {
      "epoch": 0.3937007874015748,
      "grad_norm": 0.3566251893981936,
      "learning_rate": 5.844481942772898e-06,
      "loss": 1.0069,
      "step": 75
    },
    {
      "epoch": 0.3989501312335958,
      "grad_norm": 0.4578529359685586,
      "learning_rate": 5.835551484482459e-06,
      "loss": 1.0173,
      "step": 76
    },
    {
      "epoch": 0.4041994750656168,
      "grad_norm": 0.3935925285922137,
      "learning_rate": 5.826378936466249e-06,
      "loss": 0.9743,
      "step": 77
    },
    {
      "epoch": 0.4094488188976378,
      "grad_norm": 0.4109939217838428,
      "learning_rate": 5.81696508184517e-06,
      "loss": 0.9866,
      "step": 78
    },
    {
      "epoch": 0.4146981627296588,
      "grad_norm": 0.3839870332489822,
      "learning_rate": 5.807310724342058e-06,
      "loss": 0.9516,
      "step": 79
    },
    {
      "epoch": 0.4199475065616798,
      "grad_norm": 0.3774576797883406,
      "learning_rate": 5.797416688213067e-06,
      "loss": 0.9895,
      "step": 80
    },
    {
      "epoch": 0.4251968503937008,
      "grad_norm": 0.3817468964498129,
      "learning_rate": 5.787283818177297e-06,
      "loss": 0.9632,
      "step": 81
    },
    {
      "epoch": 0.4304461942257218,
      "grad_norm": 0.60843002346461,
      "learning_rate": 5.776912979344669e-06,
      "loss": 1.0166,
      "step": 82
    },
    {
      "epoch": 0.4356955380577428,
      "grad_norm": 0.3858713700245362,
      "learning_rate": 5.766305057142073e-06,
      "loss": 0.9976,
      "step": 83
    },
    {
      "epoch": 0.4409448818897638,
      "grad_norm": 0.3724153436541016,
      "learning_rate": 5.755460957237769e-06,
      "loss": 0.9645,
      "step": 84
    },
    {
      "epoch": 0.4461942257217848,
      "grad_norm": 0.38201105695018567,
      "learning_rate": 5.744381605464064e-06,
      "loss": 0.9899,
      "step": 85
    },
    {
      "epoch": 0.45144356955380577,
      "grad_norm": 0.38383930861007165,
      "learning_rate": 5.7330679477382655e-06,
      "loss": 0.9919,
      "step": 86
    },
    {
      "epoch": 0.4566929133858268,
      "grad_norm": 0.4078870418259581,
      "learning_rate": 5.7215209499819296e-06,
      "loss": 0.9797,
      "step": 87
    },
    {
      "epoch": 0.46194225721784776,
      "grad_norm": 0.38463767466523974,
      "learning_rate": 5.709741598038387e-06,
      "loss": 0.9597,
      "step": 88
    },
    {
      "epoch": 0.4671916010498688,
      "grad_norm": 0.36309855116472584,
      "learning_rate": 5.697730897588577e-06,
      "loss": 0.9737,
      "step": 89
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 0.4106701446638758,
      "learning_rate": 5.685489874065187e-06,
      "loss": 0.9683,
      "step": 90
    },
    {
      "epoch": 0.4776902887139108,
      "grad_norm": 0.37110409255145443,
      "learning_rate": 5.673019572565103e-06,
      "loss": 1.0418,
      "step": 91
    },
    {
      "epoch": 0.48293963254593175,
      "grad_norm": 0.3558357783330656,
      "learning_rate": 5.660321057760186e-06,
      "loss": 1.0055,
      "step": 92
    },
    {
      "epoch": 0.4881889763779528,
      "grad_norm": 0.40499489938404787,
      "learning_rate": 5.6473954138063674e-06,
      "loss": 1.0113,
      "step": 93
    },
    {
      "epoch": 0.49343832020997375,
      "grad_norm": 0.39428526462199764,
      "learning_rate": 5.634243744251094e-06,
      "loss": 0.9875,
      "step": 94
    },
    {
      "epoch": 0.49868766404199477,
      "grad_norm": 0.3711741011240413,
      "learning_rate": 5.620867171939109e-06,
      "loss": 0.9749,
      "step": 95
    },
    {
      "epoch": 0.5039370078740157,
      "grad_norm": 0.3961340085644134,
      "learning_rate": 5.607266838916585e-06,
      "loss": 0.982,
      "step": 96
    },
    {
      "epoch": 0.5091863517060368,
      "grad_norm": 0.3784646685814138,
      "learning_rate": 5.593443906333624e-06,
      "loss": 0.9957,
      "step": 97
    },
    {
      "epoch": 0.5144356955380578,
      "grad_norm": 0.3750460397069026,
      "learning_rate": 5.579399554345118e-06,
      "loss": 0.9755,
      "step": 98
    },
    {
      "epoch": 0.5196850393700787,
      "grad_norm": 0.3746718538274792,
      "learning_rate": 5.565134982009994e-06,
      "loss": 0.9736,
      "step": 99
    },
    {
      "epoch": 0.5249343832020997,
      "grad_norm": 0.38418890409196027,
      "learning_rate": 5.550651407188843e-06,
      "loss": 0.9506,
      "step": 100
    },
    {
      "epoch": 0.5301837270341208,
      "grad_norm": 0.422976375435725,
      "learning_rate": 5.535950066439941e-06,
      "loss": 1.0141,
      "step": 101
    },
    {
      "epoch": 0.5354330708661418,
      "grad_norm": 0.38354451243133536,
      "learning_rate": 5.521032214913679e-06,
      "loss": 0.9618,
      "step": 102
    },
    {
      "epoch": 0.5406824146981627,
      "grad_norm": 0.38257660011773076,
      "learning_rate": 5.505899126245397e-06,
      "loss": 0.939,
      "step": 103
    },
    {
      "epoch": 0.5459317585301837,
      "grad_norm": 0.3768438915225408,
      "learning_rate": 5.490552092446652e-06,
      "loss": 0.9675,
      "step": 104
    },
    {
      "epoch": 0.5511811023622047,
      "grad_norm": 0.3749655286727107,
      "learning_rate": 5.474992423794907e-06,
      "loss": 0.9592,
      "step": 105
    },
    {
      "epoch": 0.5564304461942258,
      "grad_norm": 0.38461916993489687,
      "learning_rate": 5.459221448721664e-06,
      "loss": 0.9623,
      "step": 106
    },
    {
      "epoch": 0.5616797900262467,
      "grad_norm": 0.35648642966931204,
      "learning_rate": 5.443240513699045e-06,
      "loss": 0.985,
      "step": 107
    },
    {
      "epoch": 0.5669291338582677,
      "grad_norm": 0.4051560712719681,
      "learning_rate": 5.427050983124842e-06,
      "loss": 0.9407,
      "step": 108
    },
    {
      "epoch": 0.5721784776902887,
      "grad_norm": 0.3769879713701903,
      "learning_rate": 5.410654239206021e-06,
      "loss": 0.968,
      "step": 109
    },
    {
      "epoch": 0.5774278215223098,
      "grad_norm": 0.3746822083724367,
      "learning_rate": 5.394051681840719e-06,
      "loss": 0.9497,
      "step": 110
    },
    {
      "epoch": 0.5826771653543307,
      "grad_norm": 0.3987231911136733,
      "learning_rate": 5.3772447284987216e-06,
      "loss": 0.961,
      "step": 111
    },
    {
      "epoch": 0.5879265091863517,
      "grad_norm": 0.37848222525971176,
      "learning_rate": 5.36023481410045e-06,
      "loss": 0.9707,
      "step": 112
    },
    {
      "epoch": 0.5931758530183727,
      "grad_norm": 0.3794904855253974,
      "learning_rate": 5.343023390894446e-06,
      "loss": 0.9714,
      "step": 113
    },
    {
      "epoch": 0.5984251968503937,
      "grad_norm": 0.37452267525256994,
      "learning_rate": 5.325611928333389e-06,
      "loss": 0.9406,
      "step": 114
    },
    {
      "epoch": 0.6036745406824147,
      "grad_norm": 0.39474437059829304,
      "learning_rate": 5.308001912948637e-06,
      "loss": 0.9626,
      "step": 115
    },
    {
      "epoch": 0.6089238845144357,
      "grad_norm": 0.4023921986663554,
      "learning_rate": 5.290194848223309e-06,
      "loss": 0.9889,
      "step": 116
    },
    {
      "epoch": 0.6141732283464567,
      "grad_norm": 0.39963771712171875,
      "learning_rate": 5.272192254463929e-06,
      "loss": 0.9639,
      "step": 117
    },
    {
      "epoch": 0.6194225721784777,
      "grad_norm": 0.3893586064595733,
      "learning_rate": 5.2539956686706205e-06,
      "loss": 0.9469,
      "step": 118
    },
    {
      "epoch": 0.6246719160104987,
      "grad_norm": 0.4651495625439333,
      "learning_rate": 5.2356066444058875e-06,
      "loss": 0.9658,
      "step": 119
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 0.39599728107932586,
      "learning_rate": 5.217026751661978e-06,
      "loss": 1.0137,
      "step": 120
    },
    {
      "epoch": 0.6351706036745407,
      "grad_norm": 0.406988761369817,
      "learning_rate": 5.198257576726835e-06,
      "loss": 0.9306,
      "step": 121
    },
    {
      "epoch": 0.6404199475065617,
      "grad_norm": 0.3611939094322339,
      "learning_rate": 5.179300722048673e-06,
      "loss": 0.9462,
      "step": 122
    },
    {
      "epoch": 0.6456692913385826,
      "grad_norm": 0.3809841775392484,
      "learning_rate": 5.1601578060991645e-06,
      "loss": 0.953,
      "step": 123
    },
    {
      "epoch": 0.6509186351706037,
      "grad_norm": 0.46022843064705843,
      "learning_rate": 5.1408304632352575e-06,
      "loss": 0.9422,
      "step": 124
    },
    {
      "epoch": 0.6561679790026247,
      "grad_norm": 0.3979704646560941,
      "learning_rate": 5.1213203435596425e-06,
      "loss": 0.9751,
      "step": 125
    },
    {
      "epoch": 0.6614173228346457,
      "grad_norm": 0.39388496260457084,
      "learning_rate": 5.101629112779873e-06,
      "loss": 0.9722,
      "step": 126
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.3899148438115094,
      "learning_rate": 5.08175845206615e-06,
      "loss": 0.9652,
      "step": 127
    },
    {
      "epoch": 0.6719160104986877,
      "grad_norm": 0.37391882787694275,
      "learning_rate": 5.061710057907788e-06,
      "loss": 0.9621,
      "step": 128
    },
    {
      "epoch": 0.6771653543307087,
      "grad_norm": 0.39500875865406576,
      "learning_rate": 5.041485641968385e-06,
      "loss": 0.9899,
      "step": 129
    },
    {
      "epoch": 0.6824146981627297,
      "grad_norm": 0.37540362490802714,
      "learning_rate": 5.021086930939672e-06,
      "loss": 0.9472,
      "step": 130
    },
    {
      "epoch": 0.6876640419947506,
      "grad_norm": 0.3940788728379769,
      "learning_rate": 5.000515666394105e-06,
      "loss": 0.9479,
      "step": 131
    },
    {
      "epoch": 0.6929133858267716,
      "grad_norm": 0.3919125365655477,
      "learning_rate": 4.979773604636169e-06,
      "loss": 0.9624,
      "step": 132
    },
    {
      "epoch": 0.6981627296587927,
      "grad_norm": 0.3804552314744538,
      "learning_rate": 4.958862516552433e-06,
      "loss": 0.9806,
      "step": 133
    },
    {
      "epoch": 0.7034120734908137,
      "grad_norm": 0.3674434286105591,
      "learning_rate": 4.937784187460362e-06,
      "loss": 0.9511,
      "step": 134
    },
    {
      "epoch": 0.7086614173228346,
      "grad_norm": 0.4109777494732396,
      "learning_rate": 4.916540416955884e-06,
      "loss": 0.9943,
      "step": 135
    },
    {
      "epoch": 0.7139107611548556,
      "grad_norm": 0.40231567788837497,
      "learning_rate": 4.895133018759753e-06,
      "loss": 0.9798,
      "step": 136
    },
    {
      "epoch": 0.7191601049868767,
      "grad_norm": 0.3721834479908975,
      "learning_rate": 4.873563820562698e-06,
      "loss": 0.9504,
      "step": 137
    },
    {
      "epoch": 0.7244094488188977,
      "grad_norm": 0.36127526200518306,
      "learning_rate": 4.851834663869379e-06,
      "loss": 0.9517,
      "step": 138
    },
    {
      "epoch": 0.7296587926509186,
      "grad_norm": 0.3513827139135777,
      "learning_rate": 4.82994740384117e-06,
      "loss": 0.9835,
      "step": 139
    },
    {
      "epoch": 0.7349081364829396,
      "grad_norm": 0.36760728272750326,
      "learning_rate": 4.80790390913777e-06,
      "loss": 0.9503,
      "step": 140
    },
    {
      "epoch": 0.7401574803149606,
      "grad_norm": 0.36275280721999276,
      "learning_rate": 4.785706061757656e-06,
      "loss": 0.9743,
      "step": 141
    },
    {
      "epoch": 0.7454068241469817,
      "grad_norm": 0.3733380512329921,
      "learning_rate": 4.763355756877419e-06,
      "loss": 0.9384,
      "step": 142
    },
    {
      "epoch": 0.7506561679790026,
      "grad_norm": 0.3801691027568987,
      "learning_rate": 4.740854902689947e-06,
      "loss": 0.9296,
      "step": 143
    },
    {
      "epoch": 0.7559055118110236,
      "grad_norm": 0.39053906811778566,
      "learning_rate": 4.718205420241516e-06,
      "loss": 0.9488,
      "step": 144
    },
    {
      "epoch": 0.7611548556430446,
      "grad_norm": 0.3923993707534958,
      "learning_rate": 4.695409243267776e-06,
      "loss": 0.9383,
      "step": 145
    },
    {
      "epoch": 0.7664041994750657,
      "grad_norm": 0.364792552828712,
      "learning_rate": 4.672468318028657e-06,
      "loss": 0.9193,
      "step": 146
    },
    {
      "epoch": 0.7716535433070866,
      "grad_norm": 0.35070825551906964,
      "learning_rate": 4.649384603142202e-06,
      "loss": 0.9164,
      "step": 147
    },
    {
      "epoch": 0.7769028871391076,
      "grad_norm": 0.37099778180795795,
      "learning_rate": 4.626160069417348e-06,
      "loss": 0.9425,
      "step": 148
    },
    {
      "epoch": 0.7821522309711286,
      "grad_norm": 0.36954118968922517,
      "learning_rate": 4.602796699685665e-06,
      "loss": 0.9265,
      "step": 149
    },
    {
      "epoch": 0.7874015748031497,
      "grad_norm": 0.4076466706382121,
      "learning_rate": 4.579296488632067e-06,
      "loss": 1.0133,
      "step": 150
    },
    {
      "epoch": 0.7926509186351706,
      "grad_norm": 0.4015334925568992,
      "learning_rate": 4.5556614426245165e-06,
      "loss": 0.9486,
      "step": 151
    },
    {
      "epoch": 0.7979002624671916,
      "grad_norm": 0.39628644809730684,
      "learning_rate": 4.5318935795427206e-06,
      "loss": 0.9605,
      "step": 152
    },
    {
      "epoch": 0.8031496062992126,
      "grad_norm": 0.36792154742540445,
      "learning_rate": 4.507994928605862e-06,
      "loss": 0.9287,
      "step": 153
    },
    {
      "epoch": 0.8083989501312336,
      "grad_norm": 0.3887839296706913,
      "learning_rate": 4.483967530199337e-06,
      "loss": 0.951,
      "step": 154
    },
    {
      "epoch": 0.8136482939632546,
      "grad_norm": 0.36716852968968616,
      "learning_rate": 4.459813435700569e-06,
      "loss": 0.9702,
      "step": 155
    },
    {
      "epoch": 0.8188976377952756,
      "grad_norm": 0.3533521076976156,
      "learning_rate": 4.4355347073038595e-06,
      "loss": 0.9612,
      "step": 156
    },
    {
      "epoch": 0.8241469816272966,
      "grad_norm": 0.3499649930079787,
      "learning_rate": 4.411133417844328e-06,
      "loss": 0.9599,
      "step": 157
    },
    {
      "epoch": 0.8293963254593176,
      "grad_norm": 0.38582146832565867,
      "learning_rate": 4.38661165062094e-06,
      "loss": 0.9894,
      "step": 158
    },
    {
      "epoch": 0.8346456692913385,
      "grad_norm": 0.39040836855795735,
      "learning_rate": 4.36197149921864e-06,
      "loss": 0.9747,
      "step": 159
    },
    {
      "epoch": 0.8398950131233596,
      "grad_norm": 0.3798580758700489,
      "learning_rate": 4.3372150673296155e-06,
      "loss": 0.9654,
      "step": 160
    },
    {
      "epoch": 0.8451443569553806,
      "grad_norm": 0.3764456540061034,
      "learning_rate": 4.3123444685736795e-06,
      "loss": 0.9823,
      "step": 161
    },
    {
      "epoch": 0.8503937007874016,
      "grad_norm": 0.3771195417830333,
      "learning_rate": 4.287361826317827e-06,
      "loss": 0.9456,
      "step": 162
    },
    {
      "epoch": 0.8556430446194225,
      "grad_norm": 0.37650137746409273,
      "learning_rate": 4.262269273494946e-06,
      "loss": 1.0022,
      "step": 163
    },
    {
      "epoch": 0.8608923884514436,
      "grad_norm": 0.38148353077474145,
      "learning_rate": 4.237068952421711e-06,
      "loss": 0.964,
      "step": 164
    },
    {
      "epoch": 0.8661417322834646,
      "grad_norm": 0.3982519128695332,
      "learning_rate": 4.2117630146156845e-06,
      "loss": 0.9673,
      "step": 165
    },
    {
      "epoch": 0.8713910761154856,
      "grad_norm": 0.36000775624632003,
      "learning_rate": 4.186353620611627e-06,
      "loss": 0.9359,
      "step": 166
    },
    {
      "epoch": 0.8766404199475065,
      "grad_norm": 0.36850454735662447,
      "learning_rate": 4.160842939777036e-06,
      "loss": 0.9422,
      "step": 167
    },
    {
      "epoch": 0.8818897637795275,
      "grad_norm": 0.37804115639757085,
      "learning_rate": 4.135233150126931e-06,
      "loss": 0.9454,
      "step": 168
    },
    {
      "epoch": 0.8871391076115486,
      "grad_norm": 0.3689383402086321,
      "learning_rate": 4.109526438137908e-06,
      "loss": 0.9455,
      "step": 169
    },
    {
      "epoch": 0.8923884514435696,
      "grad_norm": 0.46527154775209717,
      "learning_rate": 4.08372499856146e-06,
      "loss": 0.9386,
      "step": 170
    },
    {
      "epoch": 0.8976377952755905,
      "grad_norm": 0.45653306710128705,
      "learning_rate": 4.0578310342365975e-06,
      "loss": 0.9616,
      "step": 171
    },
    {
      "epoch": 0.9028871391076115,
      "grad_norm": 0.3773630567359451,
      "learning_rate": 4.031846755901785e-06,
      "loss": 0.9285,
      "step": 172
    },
    {
      "epoch": 0.9081364829396326,
      "grad_norm": 0.3644595191521506,
      "learning_rate": 4.005774382006182e-06,
      "loss": 0.9663,
      "step": 173
    },
    {
      "epoch": 0.9133858267716536,
      "grad_norm": 0.3539767481135477,
      "learning_rate": 3.97961613852025e-06,
      "loss": 0.9564,
      "step": 174
    },
    {
      "epoch": 0.9186351706036745,
      "grad_norm": 0.3819676152776953,
      "learning_rate": 3.953374258745705e-06,
      "loss": 0.9607,
      "step": 175
    },
    {
      "epoch": 0.9238845144356955,
      "grad_norm": 0.38397675786726637,
      "learning_rate": 3.927050983124842e-06,
      "loss": 0.9539,
      "step": 176
    },
    {
      "epoch": 0.9291338582677166,
      "grad_norm": 0.3979084367711538,
      "learning_rate": 3.900648559049258e-06,
      "loss": 0.9505,
      "step": 177
    },
    {
      "epoch": 0.9343832020997376,
      "grad_norm": 0.3756154385935223,
      "learning_rate": 3.874169240667974e-06,
      "loss": 0.9519,
      "step": 178
    },
    {
      "epoch": 0.9396325459317585,
      "grad_norm": 0.40551973597201274,
      "learning_rate": 3.847615288694985e-06,
      "loss": 0.9727,
      "step": 179
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.4149625851710124,
      "learning_rate": 3.820988970216249e-06,
      "loss": 0.9464,
      "step": 180
    },
    {
      "epoch": 0.9501312335958005,
      "grad_norm": 0.35739115830542967,
      "learning_rate": 3.7942925584961272e-06,
      "loss": 0.9427,
      "step": 181
    },
    {
      "epoch": 0.9553805774278216,
      "grad_norm": 0.3759540038847051,
      "learning_rate": 3.767528332783307e-06,
      "loss": 0.9679,
      "step": 182
    },
    {
      "epoch": 0.9606299212598425,
      "grad_norm": 0.3525867658299593,
      "learning_rate": 3.740698578116199e-06,
      "loss": 0.9183,
      "step": 183
    },
    {
      "epoch": 0.9658792650918635,
      "grad_norm": 0.3557123352774738,
      "learning_rate": 3.7138055851278564e-06,
      "loss": 0.9383,
      "step": 184
    },
    {
      "epoch": 0.9711286089238845,
      "grad_norm": 0.3623514252763418,
      "learning_rate": 3.6868516498504025e-06,
      "loss": 0.9246,
      "step": 185
    },
    {
      "epoch": 0.9763779527559056,
      "grad_norm": 0.38495496418054853,
      "learning_rate": 3.6598390735190066e-06,
      "loss": 0.9612,
      "step": 186
    },
    {
      "epoch": 0.9816272965879265,
      "grad_norm": 0.3648599004428126,
      "learning_rate": 3.63277016237541e-06,
      "loss": 0.9293,
      "step": 187
    },
    {
      "epoch": 0.9868766404199475,
      "grad_norm": 0.38871547084803876,
      "learning_rate": 3.6056472274710305e-06,
      "loss": 0.9973,
      "step": 188
    },
    {
      "epoch": 0.9921259842519685,
      "grad_norm": 0.38590844403642666,
      "learning_rate": 3.578472584469651e-06,
      "loss": 0.9457,
      "step": 189
    },
    {
      "epoch": 0.9973753280839895,
      "grad_norm": 0.3872507088649178,
      "learning_rate": 3.5512485534497116e-06,
      "loss": 0.9462,
      "step": 190
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.3872507088649178,
      "learning_rate": 3.523977458706237e-06,
      "loss": 0.9693,
      "step": 191
    },
    {
      "epoch": 1.005249343832021,
      "grad_norm": 0.6232728744646114,
      "learning_rate": 3.49666162855239e-06,
      "loss": 0.887,
      "step": 192
    },
    {
      "epoch": 1.010498687664042,
      "grad_norm": 0.4149641950734625,
      "learning_rate": 3.469303395120693e-06,
      "loss": 0.8826,
      "step": 193
    },
    {
      "epoch": 1.015748031496063,
      "grad_norm": 0.37273340109017755,
      "learning_rate": 3.441905094163913e-06,
      "loss": 0.8893,
      "step": 194
    },
    {
      "epoch": 1.020997375328084,
      "grad_norm": 0.4113832689982837,
      "learning_rate": 3.414469064855647e-06,
      "loss": 0.9205,
      "step": 195
    },
    {
      "epoch": 1.026246719160105,
      "grad_norm": 0.49485155842511663,
      "learning_rate": 3.3869976495906104e-06,
      "loss": 0.9074,
      "step": 196
    },
    {
      "epoch": 1.031496062992126,
      "grad_norm": 0.3736781934252868,
      "learning_rate": 3.3594931937846498e-06,
      "loss": 0.8966,
      "step": 197
    },
    {
      "epoch": 1.036745406824147,
      "grad_norm": 0.3758650059773124,
      "learning_rate": 3.3319580456745023e-06,
      "loss": 0.8759,
      "step": 198
    },
    {
      "epoch": 1.041994750656168,
      "grad_norm": 0.4056031624712629,
      "learning_rate": 3.3043945561173092e-06,
      "loss": 0.8788,
      "step": 199
    },
    {
      "epoch": 1.047244094488189,
      "grad_norm": 0.36344982085137467,
      "learning_rate": 3.2768050783899063e-06,
      "loss": 0.873,
      "step": 200
    },
    {
      "epoch": 1.05249343832021,
      "grad_norm": 0.3760103676246,
      "learning_rate": 3.249191967987912e-06,
      "loss": 0.899,
      "step": 201
    },
    {
      "epoch": 1.057742782152231,
      "grad_norm": 0.39433477834527153,
      "learning_rate": 3.221557582424622e-06,
      "loss": 0.9019,
      "step": 202
    },
    {
      "epoch": 1.0629921259842519,
      "grad_norm": 0.3595753440791428,
      "learning_rate": 3.1939042810297328e-06,
      "loss": 0.8781,
      "step": 203
    },
    {
      "epoch": 1.068241469816273,
      "grad_norm": 0.3743448170598354,
      "learning_rate": 3.16623442474791e-06,
      "loss": 0.8689,
      "step": 204
    },
    {
      "epoch": 1.073490813648294,
      "grad_norm": 0.3618551186966609,
      "learning_rate": 3.138550375937219e-06,
      "loss": 0.9094,
      "step": 205
    },
    {
      "epoch": 1.078740157480315,
      "grad_norm": 0.36577516842050983,
      "learning_rate": 3.1108544981674356e-06,
      "loss": 0.8668,
      "step": 206
    },
    {
      "epoch": 1.083989501312336,
      "grad_norm": 0.3985134455319658,
      "learning_rate": 3.0831491560182495e-06,
      "loss": 0.9016,
      "step": 207
    },
    {
      "epoch": 1.0892388451443569,
      "grad_norm": 0.37808489525197075,
      "learning_rate": 3.0554367148773897e-06,
      "loss": 0.895,
      "step": 208
    },
    {
      "epoch": 1.094488188976378,
      "grad_norm": 0.4112784941005797,
      "learning_rate": 3.027719540738673e-06,
      "loss": 0.859,
      "step": 209
    },
    {
      "epoch": 1.099737532808399,
      "grad_norm": 0.3830296759827936,
      "learning_rate": 3e-06,
      "loss": 0.8569,
      "step": 210
    },
    {
      "epoch": 1.10498687664042,
      "grad_norm": 0.3930755503999148,
      "learning_rate": 2.972280459261328e-06,
      "loss": 0.8774,
      "step": 211
    },
    {
      "epoch": 1.110236220472441,
      "grad_norm": 0.36738851637178116,
      "learning_rate": 2.944563285122611e-06,
      "loss": 0.9086,
      "step": 212
    },
    {
      "epoch": 1.1154855643044619,
      "grad_norm": 0.3897160841039193,
      "learning_rate": 2.9168508439817515e-06,
      "loss": 0.889,
      "step": 213
    },
    {
      "epoch": 1.120734908136483,
      "grad_norm": 0.39858146379374537,
      "learning_rate": 2.889145501832566e-06,
      "loss": 0.8964,
      "step": 214
    },
    {
      "epoch": 1.125984251968504,
      "grad_norm": 0.3739395525411432,
      "learning_rate": 2.861449624062782e-06,
      "loss": 0.8884,
      "step": 215
    },
    {
      "epoch": 1.1312335958005248,
      "grad_norm": 0.3755768464864809,
      "learning_rate": 2.83376557525209e-06,
      "loss": 0.851,
      "step": 216
    },
    {
      "epoch": 1.136482939632546,
      "grad_norm": 0.38260315757882735,
      "learning_rate": 2.8060957189702674e-06,
      "loss": 0.9152,
      "step": 217
    },
    {
      "epoch": 1.141732283464567,
      "grad_norm": 0.4205379839527009,
      "learning_rate": 2.7784424175753784e-06,
      "loss": 0.8683,
      "step": 218
    },
    {
      "epoch": 1.1469816272965878,
      "grad_norm": 0.38325260941818995,
      "learning_rate": 2.7508080320120888e-06,
      "loss": 0.8943,
      "step": 219
    },
    {
      "epoch": 1.152230971128609,
      "grad_norm": 0.3763198826603672,
      "learning_rate": 2.7231949216100943e-06,
      "loss": 0.8676,
      "step": 220
    },
    {
      "epoch": 1.1574803149606299,
      "grad_norm": 0.3767162287387105,
      "learning_rate": 2.6956054438826918e-06,
      "loss": 0.8482,
      "step": 221
    },
    {
      "epoch": 1.162729658792651,
      "grad_norm": 0.3486273740901837,
      "learning_rate": 2.668041954325498e-06,
      "loss": 0.8879,
      "step": 222
    },
    {
      "epoch": 1.167979002624672,
      "grad_norm": 0.39084218665366566,
      "learning_rate": 2.640506806215351e-06,
      "loss": 0.8679,
      "step": 223
    },
    {
      "epoch": 1.1732283464566928,
      "grad_norm": 0.3538552501730603,
      "learning_rate": 2.613002350409391e-06,
      "loss": 0.8871,
      "step": 224
    },
    {
      "epoch": 1.178477690288714,
      "grad_norm": 0.36544200913577,
      "learning_rate": 2.585530935144354e-06,
      "loss": 0.8616,
      "step": 225
    },
    {
      "epoch": 1.1837270341207349,
      "grad_norm": 0.3985990462573467,
      "learning_rate": 2.558094905836087e-06,
      "loss": 0.8917,
      "step": 226
    },
    {
      "epoch": 1.188976377952756,
      "grad_norm": 0.42608518999556655,
      "learning_rate": 2.5306966048793067e-06,
      "loss": 0.8817,
      "step": 227
    },
    {
      "epoch": 1.194225721784777,
      "grad_norm": 0.37952769789031354,
      "learning_rate": 2.5033383714476097e-06,
      "loss": 0.8985,
      "step": 228
    },
    {
      "epoch": 1.1994750656167978,
      "grad_norm": 0.40804864076806885,
      "learning_rate": 2.4760225412937633e-06,
      "loss": 0.9073,
      "step": 229
    },
    {
      "epoch": 1.204724409448819,
      "grad_norm": 0.4167713152946991,
      "learning_rate": 2.4487514465502885e-06,
      "loss": 0.8566,
      "step": 230
    },
    {
      "epoch": 1.20997375328084,
      "grad_norm": 0.4022153540631621,
      "learning_rate": 2.42152741553035e-06,
      "loss": 0.8713,
      "step": 231
    },
    {
      "epoch": 1.2152230971128608,
      "grad_norm": 0.4222065137992956,
      "learning_rate": 2.39435277252897e-06,
      "loss": 0.9035,
      "step": 232
    },
    {
      "epoch": 1.220472440944882,
      "grad_norm": 0.3666365807384159,
      "learning_rate": 2.3672298376245908e-06,
      "loss": 0.8637,
      "step": 233
    },
    {
      "epoch": 1.2257217847769029,
      "grad_norm": 0.3976853335036615,
      "learning_rate": 2.3401609264809953e-06,
      "loss": 0.9398,
      "step": 234
    },
    {
      "epoch": 1.2309711286089238,
      "grad_norm": 0.37956934109451046,
      "learning_rate": 2.3131483501495985e-06,
      "loss": 0.8353,
      "step": 235
    },
    {
      "epoch": 1.236220472440945,
      "grad_norm": 0.33722056538083744,
      "learning_rate": 2.2861944148721446e-06,
      "loss": 0.8786,
      "step": 236
    },
    {
      "epoch": 1.2414698162729658,
      "grad_norm": 0.49777382093647954,
      "learning_rate": 2.2593014218838e-06,
      "loss": 0.8834,
      "step": 237
    },
    {
      "epoch": 1.246719160104987,
      "grad_norm": 0.35315516410389436,
      "learning_rate": 2.232471667216693e-06,
      "loss": 0.8442,
      "step": 238
    },
    {
      "epoch": 1.2519685039370079,
      "grad_norm": 0.3816124424363711,
      "learning_rate": 2.2057074415038725e-06,
      "loss": 0.8573,
      "step": 239
    },
    {
      "epoch": 1.257217847769029,
      "grad_norm": 0.36319142999803095,
      "learning_rate": 2.1790110297837514e-06,
      "loss": 0.8481,
      "step": 240
    },
    {
      "epoch": 1.26246719160105,
      "grad_norm": 0.34672889281207053,
      "learning_rate": 2.152384711305015e-06,
      "loss": 0.8623,
      "step": 241
    },
    {
      "epoch": 1.2677165354330708,
      "grad_norm": 0.37448151544392105,
      "learning_rate": 2.1258307593320262e-06,
      "loss": 0.8751,
      "step": 242
    },
    {
      "epoch": 1.272965879265092,
      "grad_norm": 0.37082567424502005,
      "learning_rate": 2.099351440950742e-06,
      "loss": 0.8914,
      "step": 243
    },
    {
      "epoch": 1.2782152230971129,
      "grad_norm": 0.39074992783073415,
      "learning_rate": 2.072949016875158e-06,
      "loss": 0.9222,
      "step": 244
    },
    {
      "epoch": 1.2834645669291338,
      "grad_norm": 0.4150437401629804,
      "learning_rate": 2.046625741254295e-06,
      "loss": 0.9475,
      "step": 245
    },
    {
      "epoch": 1.288713910761155,
      "grad_norm": 0.4504166670407193,
      "learning_rate": 2.0203838614797505e-06,
      "loss": 0.9026,
      "step": 246
    },
    {
      "epoch": 1.2939632545931758,
      "grad_norm": 0.38345958484903814,
      "learning_rate": 1.994225617993819e-06,
      "loss": 0.9074,
      "step": 247
    },
    {
      "epoch": 1.2992125984251968,
      "grad_norm": 0.37086048031752866,
      "learning_rate": 1.9681532440982154e-06,
      "loss": 0.8755,
      "step": 248
    },
    {
      "epoch": 1.304461942257218,
      "grad_norm": 0.3775524407980251,
      "learning_rate": 1.942168965763402e-06,
      "loss": 0.8986,
      "step": 249
    },
    {
      "epoch": 1.3097112860892388,
      "grad_norm": 0.364796377340789,
      "learning_rate": 1.916275001438541e-06,
      "loss": 0.867,
      "step": 250
    },
    {
      "epoch": 1.3149606299212597,
      "grad_norm": 0.3705604843330414,
      "learning_rate": 1.8904735618620928e-06,
      "loss": 0.8875,
      "step": 251
    },
    {
      "epoch": 1.3202099737532809,
      "grad_norm": 0.3847344001283667,
      "learning_rate": 1.8647668498730693e-06,
      "loss": 0.8678,
      "step": 252
    },
    {
      "epoch": 1.3254593175853018,
      "grad_norm": 0.3507183610862785,
      "learning_rate": 1.8391570602229647e-06,
      "loss": 0.8895,
      "step": 253
    },
    {
      "epoch": 1.330708661417323,
      "grad_norm": 0.34464955572346173,
      "learning_rate": 1.8136463793883725e-06,
      "loss": 0.9112,
      "step": 254
    },
    {
      "epoch": 1.3359580052493438,
      "grad_norm": 0.3804540728076062,
      "learning_rate": 1.7882369853843155e-06,
      "loss": 0.8818,
      "step": 255
    },
    {
      "epoch": 1.341207349081365,
      "grad_norm": 0.38671544491057547,
      "learning_rate": 1.76293104757829e-06,
      "loss": 0.8712,
      "step": 256
    },
    {
      "epoch": 1.3464566929133859,
      "grad_norm": 0.35028636565033566,
      "learning_rate": 1.7377307265050559e-06,
      "loss": 0.8795,
      "step": 257
    },
    {
      "epoch": 1.3517060367454068,
      "grad_norm": 0.3596694021401425,
      "learning_rate": 1.7126381736821732e-06,
      "loss": 0.8791,
      "step": 258
    },
    {
      "epoch": 1.356955380577428,
      "grad_norm": 0.3833574983214166,
      "learning_rate": 1.6876555314263213e-06,
      "loss": 0.9108,
      "step": 259
    },
    {
      "epoch": 1.3622047244094488,
      "grad_norm": 0.3701840047085969,
      "learning_rate": 1.6627849326703855e-06,
      "loss": 0.8695,
      "step": 260
    },
    {
      "epoch": 1.3674540682414698,
      "grad_norm": 0.36098816535443995,
      "learning_rate": 1.6380285007813598e-06,
      "loss": 0.876,
      "step": 261
    },
    {
      "epoch": 1.372703412073491,
      "grad_norm": 0.3900890284585014,
      "learning_rate": 1.6133883493790609e-06,
      "loss": 0.8498,
      "step": 262
    },
    {
      "epoch": 1.3779527559055118,
      "grad_norm": 0.34906551126755136,
      "learning_rate": 1.5888665821556724e-06,
      "loss": 0.8513,
      "step": 263
    },
    {
      "epoch": 1.3832020997375327,
      "grad_norm": 0.3753732283477496,
      "learning_rate": 1.5644652926961407e-06,
      "loss": 0.8714,
      "step": 264
    },
    {
      "epoch": 1.3884514435695539,
      "grad_norm": 0.34748864593560347,
      "learning_rate": 1.5401865642994315e-06,
      "loss": 0.9124,
      "step": 265
    },
    {
      "epoch": 1.3937007874015748,
      "grad_norm": 0.36698053817770165,
      "learning_rate": 1.5160324698006642e-06,
      "loss": 0.8814,
      "step": 266
    },
    {
      "epoch": 1.3989501312335957,
      "grad_norm": 0.4000964153653425,
      "learning_rate": 1.4920050713941398e-06,
      "loss": 0.9082,
      "step": 267
    },
    {
      "epoch": 1.4041994750656168,
      "grad_norm": 0.3985391177875817,
      "learning_rate": 1.4681064204572798e-06,
      "loss": 0.8749,
      "step": 268
    },
    {
      "epoch": 1.4094488188976377,
      "grad_norm": 0.3578122677174226,
      "learning_rate": 1.4443385573754837e-06,
      "loss": 0.8608,
      "step": 269
    },
    {
      "epoch": 1.4146981627296589,
      "grad_norm": 0.3576093239254431,
      "learning_rate": 1.4207035113679322e-06,
      "loss": 0.8798,
      "step": 270
    },
    {
      "epoch": 1.4199475065616798,
      "grad_norm": 0.35299639204379674,
      "learning_rate": 1.3972033003143348e-06,
      "loss": 0.8972,
      "step": 271
    },
    {
      "epoch": 1.425196850393701,
      "grad_norm": 0.3937775289907907,
      "learning_rate": 1.3738399305826516e-06,
      "loss": 0.8736,
      "step": 272
    },
    {
      "epoch": 1.4304461942257218,
      "grad_norm": 0.3691998032129419,
      "learning_rate": 1.3506153968577983e-06,
      "loss": 0.8667,
      "step": 273
    },
    {
      "epoch": 1.4356955380577427,
      "grad_norm": 0.35764876894907843,
      "learning_rate": 1.3275316819713435e-06,
      "loss": 0.882,
      "step": 274
    },
    {
      "epoch": 1.4409448818897639,
      "grad_norm": 0.3859579688778526,
      "learning_rate": 1.3045907567322243e-06,
      "loss": 0.844,
      "step": 275
    },
    {
      "epoch": 1.4461942257217848,
      "grad_norm": 0.3736621084680505,
      "learning_rate": 1.2817945797584844e-06,
      "loss": 0.8525,
      "step": 276
    },
    {
      "epoch": 1.4514435695538057,
      "grad_norm": 0.36602372507940695,
      "learning_rate": 1.2591450973100532e-06,
      "loss": 0.8577,
      "step": 277
    },
    {
      "epoch": 1.4566929133858268,
      "grad_norm": 0.37926054124030645,
      "learning_rate": 1.236644243122581e-06,
      "loss": 0.8837,
      "step": 278
    },
    {
      "epoch": 1.4619422572178478,
      "grad_norm": 0.3680022216795608,
      "learning_rate": 1.214293938242344e-06,
      "loss": 0.8984,
      "step": 279
    },
    {
      "epoch": 1.4671916010498687,
      "grad_norm": 0.37824901927870175,
      "learning_rate": 1.1920960908622313e-06,
      "loss": 0.8745,
      "step": 280
    },
    {
      "epoch": 1.4724409448818898,
      "grad_norm": 0.3489273490529577,
      "learning_rate": 1.17005259615883e-06,
      "loss": 0.8628,
      "step": 281
    },
    {
      "epoch": 1.4776902887139107,
      "grad_norm": 0.3735770062938505,
      "learning_rate": 1.1481653361306215e-06,
      "loss": 0.8619,
      "step": 282
    },
    {
      "epoch": 1.4829396325459316,
      "grad_norm": 0.3458041443504503,
      "learning_rate": 1.1264361794373032e-06,
      "loss": 0.8761,
      "step": 283
    },
    {
      "epoch": 1.4881889763779528,
      "grad_norm": 0.35998420937846626,
      "learning_rate": 1.104866981240248e-06,
      "loss": 0.8844,
      "step": 284
    },
    {
      "epoch": 1.4934383202099737,
      "grad_norm": 0.4029178073367971,
      "learning_rate": 1.0834595830441168e-06,
      "loss": 0.8511,
      "step": 285
    }
  ],
  "logging_steps": 1,
  "max_steps": 380,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 95,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.000110270806753e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|