llama-3.1-8b-fsdp-magpie-reasoning-v1-20k-math-verifiable-verificationnly-epoch5 / trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.967509025270758,
  "eval_steps": 18,
  "global_step": 345,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01444043321299639,
      "grad_norm": 3.5763113498687744,
      "learning_rate": 5.000000000000001e-07,
      "loss": 0.865,
      "step": 1
    },
    {
      "epoch": 0.01444043321299639,
      "eval_loss": 0.8391121625900269,
      "eval_runtime": 35.414,
      "eval_samples_per_second": 18.128,
      "eval_steps_per_second": 2.287,
      "step": 1
    },
    {
      "epoch": 0.02888086642599278,
      "grad_norm": 3.5320334434509277,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.8262,
      "step": 2
    },
    {
      "epoch": 0.04332129963898917,
      "grad_norm": 3.329249858856201,
      "learning_rate": 1.5e-06,
      "loss": 0.833,
      "step": 3
    },
    {
      "epoch": 0.05776173285198556,
      "grad_norm": 3.478191375732422,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.865,
      "step": 4
    },
    {
      "epoch": 0.07220216606498195,
      "grad_norm": 2.8155710697174072,
      "learning_rate": 2.5e-06,
      "loss": 0.8168,
      "step": 5
    },
    {
      "epoch": 0.08664259927797834,
      "grad_norm": 2.3139870166778564,
      "learning_rate": 3e-06,
      "loss": 0.77,
      "step": 6
    },
    {
      "epoch": 0.10108303249097472,
      "grad_norm": 2.0602500438690186,
      "learning_rate": 3.5e-06,
      "loss": 0.7738,
      "step": 7
    },
    {
      "epoch": 0.11552346570397112,
      "grad_norm": 2.700531244277954,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.7574,
      "step": 8
    },
    {
      "epoch": 0.1299638989169675,
      "grad_norm": 2.2553508281707764,
      "learning_rate": 4.5e-06,
      "loss": 0.7405,
      "step": 9
    },
    {
      "epoch": 0.1444043321299639,
      "grad_norm": 2.9637646675109863,
      "learning_rate": 5e-06,
      "loss": 0.7045,
      "step": 10
    },
    {
      "epoch": 0.1588447653429603,
      "grad_norm": 2.4607481956481934,
      "learning_rate": 5.500000000000001e-06,
      "loss": 0.7066,
      "step": 11
    },
    {
      "epoch": 0.17328519855595667,
      "grad_norm": 1.875809669494629,
      "learning_rate": 6e-06,
      "loss": 0.6924,
      "step": 12
    },
    {
      "epoch": 0.18772563176895307,
      "grad_norm": 1.5592328310012817,
      "learning_rate": 6.5000000000000004e-06,
      "loss": 0.6667,
      "step": 13
    },
    {
      "epoch": 0.20216606498194944,
      "grad_norm": 1.9023362398147583,
      "learning_rate": 7e-06,
      "loss": 0.7267,
      "step": 14
    },
    {
      "epoch": 0.21660649819494585,
      "grad_norm": 1.5530339479446411,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.6943,
      "step": 15
    },
    {
      "epoch": 0.23104693140794225,
      "grad_norm": 1.0332263708114624,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.7069,
      "step": 16
    },
    {
      "epoch": 0.24548736462093862,
      "grad_norm": 1.181794285774231,
      "learning_rate": 8.5e-06,
      "loss": 0.6616,
      "step": 17
    },
    {
      "epoch": 0.259927797833935,
      "grad_norm": 1.0393184423446655,
      "learning_rate": 9e-06,
      "loss": 0.6493,
      "step": 18
    },
    {
      "epoch": 0.259927797833935,
      "eval_loss": 0.6557387709617615,
      "eval_runtime": 33.1067,
      "eval_samples_per_second": 19.392,
      "eval_steps_per_second": 2.447,
      "step": 18
    },
    {
      "epoch": 0.2743682310469314,
      "grad_norm": 0.9353659749031067,
      "learning_rate": 9.5e-06,
      "loss": 0.6798,
      "step": 19
    },
    {
      "epoch": 0.2888086642599278,
      "grad_norm": 1.0315933227539062,
      "learning_rate": 1e-05,
      "loss": 0.6541,
      "step": 20
    },
    {
      "epoch": 0.30324909747292417,
      "grad_norm": 0.8723016977310181,
      "learning_rate": 9.999766401714795e-06,
      "loss": 0.6554,
      "step": 21
    },
    {
      "epoch": 0.3176895306859206,
      "grad_norm": 0.8054132461547852,
      "learning_rate": 9.999065628686439e-06,
      "loss": 0.642,
      "step": 22
    },
    {
      "epoch": 0.33212996389891697,
      "grad_norm": 0.7567704319953918,
      "learning_rate": 9.997897746394684e-06,
      "loss": 0.6257,
      "step": 23
    },
    {
      "epoch": 0.34657039711191334,
      "grad_norm": 0.8518706560134888,
      "learning_rate": 9.996262863965651e-06,
      "loss": 0.6279,
      "step": 24
    },
    {
      "epoch": 0.36101083032490977,
      "grad_norm": 0.7092042565345764,
      "learning_rate": 9.994161134161635e-06,
      "loss": 0.6486,
      "step": 25
    },
    {
      "epoch": 0.37545126353790614,
      "grad_norm": 0.8764254450798035,
      "learning_rate": 9.991592753366822e-06,
      "loss": 0.6469,
      "step": 26
    },
    {
      "epoch": 0.3898916967509025,
      "grad_norm": 0.7938660383224487,
      "learning_rate": 9.988557961568956e-06,
      "loss": 0.641,
      "step": 27
    },
    {
      "epoch": 0.4043321299638989,
      "grad_norm": 1.8984333276748657,
      "learning_rate": 9.985057042336898e-06,
      "loss": 0.6165,
      "step": 28
    },
    {
      "epoch": 0.4187725631768953,
      "grad_norm": 0.8867112398147583,
      "learning_rate": 9.981090322794145e-06,
      "loss": 0.6312,
      "step": 29
    },
    {
      "epoch": 0.4332129963898917,
      "grad_norm": 0.7045503854751587,
      "learning_rate": 9.976658173588244e-06,
      "loss": 0.626,
      "step": 30
    },
    {
      "epoch": 0.44765342960288806,
      "grad_norm": 0.7188596129417419,
      "learning_rate": 9.97176100885618e-06,
      "loss": 0.6136,
      "step": 31
    },
    {
      "epoch": 0.4620938628158845,
      "grad_norm": 0.7122887372970581,
      "learning_rate": 9.966399286185666e-06,
      "loss": 0.6296,
      "step": 32
    },
    {
      "epoch": 0.47653429602888087,
      "grad_norm": 0.6428160071372986,
      "learning_rate": 9.960573506572391e-06,
      "loss": 0.5887,
      "step": 33
    },
    {
      "epoch": 0.49097472924187724,
      "grad_norm": 0.6762687563896179,
      "learning_rate": 9.954284214373204e-06,
      "loss": 0.6112,
      "step": 34
    },
    {
      "epoch": 0.5054151624548736,
      "grad_norm": 0.6077902317047119,
      "learning_rate": 9.947531997255256e-06,
      "loss": 0.629,
      "step": 35
    },
    {
      "epoch": 0.51985559566787,
      "grad_norm": 0.6718001365661621,
      "learning_rate": 9.940317486141084e-06,
      "loss": 0.6239,
      "step": 36
    },
    {
      "epoch": 0.51985559566787,
      "eval_loss": 0.6234476566314697,
      "eval_runtime": 34.0064,
      "eval_samples_per_second": 18.879,
      "eval_steps_per_second": 2.382,
      "step": 36
    },
    {
      "epoch": 0.5342960288808665,
      "grad_norm": 0.6401779055595398,
      "learning_rate": 9.932641355149655e-06,
      "loss": 0.6241,
      "step": 37
    },
    {
      "epoch": 0.5487364620938628,
      "grad_norm": 0.7026526927947998,
      "learning_rate": 9.924504321533387e-06,
      "loss": 0.635,
      "step": 38
    },
    {
      "epoch": 0.5631768953068592,
      "grad_norm": 0.6177261471748352,
      "learning_rate": 9.915907145611117e-06,
      "loss": 0.6189,
      "step": 39
    },
    {
      "epoch": 0.5776173285198556,
      "grad_norm": 0.601033627986908,
      "learning_rate": 9.906850630697068e-06,
      "loss": 0.574,
      "step": 40
    },
    {
      "epoch": 0.592057761732852,
      "grad_norm": 0.6070717573165894,
      "learning_rate": 9.89733562302578e-06,
      "loss": 0.5855,
      "step": 41
    },
    {
      "epoch": 0.6064981949458483,
      "grad_norm": 0.6149937510490417,
      "learning_rate": 9.887363011673046e-06,
      "loss": 0.6494,
      "step": 42
    },
    {
      "epoch": 0.6209386281588448,
      "grad_norm": 0.6047448515892029,
      "learning_rate": 9.876933728472826e-06,
      "loss": 0.6008,
      "step": 43
    },
    {
      "epoch": 0.6353790613718412,
      "grad_norm": 0.6176997423171997,
      "learning_rate": 9.866048747930194e-06,
      "loss": 0.6201,
      "step": 44
    },
    {
      "epoch": 0.6498194945848376,
      "grad_norm": 0.5998260378837585,
      "learning_rate": 9.854709087130261e-06,
      "loss": 0.611,
      "step": 45
    },
    {
      "epoch": 0.6642599277978339,
      "grad_norm": 0.6084288954734802,
      "learning_rate": 9.842915805643156e-06,
      "loss": 0.6,
      "step": 46
    },
    {
      "epoch": 0.6787003610108303,
      "grad_norm": 0.574383556842804,
      "learning_rate": 9.830670005425012e-06,
      "loss": 0.5901,
      "step": 47
    },
    {
      "epoch": 0.6931407942238267,
      "grad_norm": 0.5806477665901184,
      "learning_rate": 9.817972830715003e-06,
      "loss": 0.5867,
      "step": 48
    },
    {
      "epoch": 0.7075812274368231,
      "grad_norm": 0.6218814849853516,
      "learning_rate": 9.804825467928423e-06,
      "loss": 0.6199,
      "step": 49
    },
    {
      "epoch": 0.7220216606498195,
      "grad_norm": 0.6401635408401489,
      "learning_rate": 9.791229145545832e-06,
      "loss": 0.6128,
      "step": 50
    },
    {
      "epoch": 0.7364620938628159,
      "grad_norm": 0.6286787986755371,
      "learning_rate": 9.777185133998268e-06,
      "loss": 0.6212,
      "step": 51
    },
    {
      "epoch": 0.7509025270758123,
      "grad_norm": 0.5961940288543701,
      "learning_rate": 9.76269474554854e-06,
      "loss": 0.6079,
      "step": 52
    },
    {
      "epoch": 0.7653429602888087,
      "grad_norm": 0.5932030081748962,
      "learning_rate": 9.747759334168602e-06,
      "loss": 0.6224,
      "step": 53
    },
    {
      "epoch": 0.779783393501805,
      "grad_norm": 0.6036499738693237,
      "learning_rate": 9.73238029541305e-06,
      "loss": 0.611,
      "step": 54
    },
    {
      "epoch": 0.779783393501805,
      "eval_loss": 0.6119250655174255,
      "eval_runtime": 33.2208,
      "eval_samples_per_second": 19.325,
      "eval_steps_per_second": 2.438,
      "step": 54
    },
    {
      "epoch": 0.7942238267148014,
      "grad_norm": 0.6094003915786743,
      "learning_rate": 9.716559066288716e-06,
      "loss": 0.6127,
      "step": 55
    },
    {
      "epoch": 0.8086642599277978,
      "grad_norm": 0.6151455044746399,
      "learning_rate": 9.7002971251204e-06,
      "loss": 0.6075,
      "step": 56
    },
    {
      "epoch": 0.8231046931407943,
      "grad_norm": 0.59092116355896,
      "learning_rate": 9.683595991412725e-06,
      "loss": 0.5975,
      "step": 57
    },
    {
      "epoch": 0.8375451263537906,
      "grad_norm": 0.6535966396331787,
      "learning_rate": 9.666457225708175e-06,
      "loss": 0.5856,
      "step": 58
    },
    {
      "epoch": 0.851985559566787,
      "grad_norm": 0.6049758791923523,
      "learning_rate": 9.648882429441258e-06,
      "loss": 0.5877,
      "step": 59
    },
    {
      "epoch": 0.8664259927797834,
      "grad_norm": 0.6271098256111145,
      "learning_rate": 9.630873244788884e-06,
      "loss": 0.5895,
      "step": 60
    },
    {
      "epoch": 0.8808664259927798,
      "grad_norm": 0.6239963173866272,
      "learning_rate": 9.612431354516912e-06,
      "loss": 0.5971,
      "step": 61
    },
    {
      "epoch": 0.8953068592057761,
      "grad_norm": 0.6575446724891663,
      "learning_rate": 9.593558481822923e-06,
      "loss": 0.6224,
      "step": 62
    },
    {
      "epoch": 0.9097472924187726,
      "grad_norm": 0.6333041787147522,
      "learning_rate": 9.574256390175192e-06,
      "loss": 0.619,
      "step": 63
    },
    {
      "epoch": 0.924187725631769,
      "grad_norm": 0.6591416597366333,
      "learning_rate": 9.554526883147926e-06,
      "loss": 0.6211,
      "step": 64
    },
    {
      "epoch": 0.9386281588447654,
      "grad_norm": 0.5847511291503906,
      "learning_rate": 9.534371804252727e-06,
      "loss": 0.612,
      "step": 65
    },
    {
      "epoch": 0.9530685920577617,
      "grad_norm": 0.6347054243087769,
      "learning_rate": 9.513793036766345e-06,
      "loss": 0.6117,
      "step": 66
    },
    {
      "epoch": 0.9675090252707581,
      "grad_norm": 0.634484589099884,
      "learning_rate": 9.492792503554695e-06,
      "loss": 0.6211,
      "step": 67
    },
    {
      "epoch": 0.9819494584837545,
      "grad_norm": 0.6583951711654663,
      "learning_rate": 9.4713721668932e-06,
      "loss": 0.6184,
      "step": 68
    },
    {
      "epoch": 0.9963898916967509,
      "grad_norm": 0.598239004611969,
      "learning_rate": 9.44953402828342e-06,
      "loss": 0.6229,
      "step": 69
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9969831705093384,
      "learning_rate": 9.427280128266049e-06,
      "loss": 0.5986,
      "step": 70
    },
    {
      "epoch": 1.0144404332129964,
      "grad_norm": 0.9471399188041687,
      "learning_rate": 9.404612546230244e-06,
      "loss": 0.5527,
      "step": 71
    },
    {
      "epoch": 1.0288808664259927,
      "grad_norm": 0.726841926574707,
      "learning_rate": 9.381533400219319e-06,
      "loss": 0.5703,
      "step": 72
    },
    {
      "epoch": 1.0288808664259927,
      "eval_loss": 0.613798201084137,
      "eval_runtime": 33.4351,
      "eval_samples_per_second": 19.201,
      "eval_steps_per_second": 2.423,
      "step": 72
    },
    {
      "epoch": 1.0433212996389891,
      "grad_norm": 0.7898038625717163,
      "learning_rate": 9.358044846732848e-06,
      "loss": 0.5478,
      "step": 73
    },
    {
      "epoch": 1.0577617328519855,
      "grad_norm": 0.7952622175216675,
      "learning_rate": 9.334149080525154e-06,
      "loss": 0.5393,
      "step": 74
    },
    {
      "epoch": 1.0722021660649819,
      "grad_norm": 0.8256924748420715,
      "learning_rate": 9.309848334400247e-06,
      "loss": 0.5613,
      "step": 75
    },
    {
      "epoch": 1.0866425992779782,
      "grad_norm": 0.8076434135437012,
      "learning_rate": 9.285144879003173e-06,
      "loss": 0.5346,
      "step": 76
    },
    {
      "epoch": 1.1010830324909748,
      "grad_norm": 0.7525290250778198,
      "learning_rate": 9.26004102260786e-06,
      "loss": 0.556,
      "step": 77
    },
    {
      "epoch": 1.1155234657039712,
      "grad_norm": 0.7560561895370483,
      "learning_rate": 9.23453911090143e-06,
      "loss": 0.5454,
      "step": 78
    },
    {
      "epoch": 1.1299638989169676,
      "grad_norm": 0.8224523663520813,
      "learning_rate": 9.208641526765024e-06,
      "loss": 0.5243,
      "step": 79
    },
    {
      "epoch": 1.144404332129964,
      "grad_norm": 0.7244045734405518,
      "learning_rate": 9.182350690051134e-06,
      "loss": 0.5678,
      "step": 80
    },
    {
      "epoch": 1.1588447653429603,
      "grad_norm": 0.7751045823097229,
      "learning_rate": 9.155669057357515e-06,
      "loss": 0.5389,
      "step": 81
    },
    {
      "epoch": 1.1732851985559567,
      "grad_norm": 0.7371921539306641,
      "learning_rate": 9.12859912179762e-06,
      "loss": 0.5243,
      "step": 82
    },
    {
      "epoch": 1.187725631768953,
      "grad_norm": 0.7369418740272522,
      "learning_rate": 9.101143412767665e-06,
      "loss": 0.5342,
      "step": 83
    },
    {
      "epoch": 1.2021660649819494,
      "grad_norm": 0.6683140397071838,
      "learning_rate": 9.073304495710267e-06,
      "loss": 0.5284,
      "step": 84
    },
    {
      "epoch": 1.2166064981949458,
      "grad_norm": 0.7613602876663208,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.5063,
      "step": 85
    },
    {
      "epoch": 1.2310469314079422,
      "grad_norm": 0.7036657929420471,
      "learning_rate": 9.016487478074032e-06,
      "loss": 0.5281,
      "step": 86
    },
    {
      "epoch": 1.2454873646209386,
      "grad_norm": 0.7911130785942078,
      "learning_rate": 8.987514686438353e-06,
      "loss": 0.5421,
      "step": 87
    },
    {
      "epoch": 1.259927797833935,
      "grad_norm": 0.7307482957839966,
      "learning_rate": 8.95816930416548e-06,
      "loss": 0.5298,
      "step": 88
    },
    {
      "epoch": 1.2743682310469313,
      "grad_norm": 0.6484747529029846,
      "learning_rate": 8.928454073267801e-06,
      "loss": 0.5451,
      "step": 89
    },
    {
      "epoch": 1.288808664259928,
      "grad_norm": 0.678794264793396,
      "learning_rate": 8.898371770316113e-06,
      "loss": 0.542,
      "step": 90
    },
    {
      "epoch": 1.288808664259928,
      "eval_loss": 0.6132499575614929,
      "eval_runtime": 32.641,
      "eval_samples_per_second": 19.669,
      "eval_steps_per_second": 2.482,
      "step": 90
    },
    {
      "epoch": 1.303249097472924,
      "grad_norm": 0.7240316867828369,
      "learning_rate": 8.867925206180166e-06,
      "loss": 0.534,
      "step": 91
    },
    {
      "epoch": 1.3176895306859207,
      "grad_norm": 0.7762596011161804,
      "learning_rate": 8.837117225766033e-06,
      "loss": 0.5271,
      "step": 92
    },
    {
      "epoch": 1.332129963898917,
      "grad_norm": 0.695136547088623,
      "learning_rate": 8.805950707750268e-06,
      "loss": 0.5422,
      "step": 93
    },
    {
      "epoch": 1.3465703971119134,
      "grad_norm": 0.7882359623908997,
      "learning_rate": 8.774428564310939e-06,
      "loss": 0.5172,
      "step": 94
    },
    {
      "epoch": 1.3610108303249098,
      "grad_norm": 0.7503855228424072,
      "learning_rate": 8.742553740855507e-06,
      "loss": 0.5203,
      "step": 95
    },
    {
      "epoch": 1.3754512635379061,
      "grad_norm": 0.7731848359107971,
      "learning_rate": 8.710329215745612e-06,
      "loss": 0.5378,
      "step": 96
    },
    {
      "epoch": 1.3898916967509025,
      "grad_norm": 0.7920417189598083,
      "learning_rate": 8.677758000018777e-06,
      "loss": 0.5433,
      "step": 97
    },
    {
      "epoch": 1.404332129963899,
      "grad_norm": 0.9389582276344299,
      "learning_rate": 8.644843137107058e-06,
      "loss": 0.5542,
      "step": 98
    },
    {
      "epoch": 1.4187725631768953,
      "grad_norm": 0.6249178051948547,
      "learning_rate": 8.61158770255267e-06,
      "loss": 0.5218,
      "step": 99
    },
    {
      "epoch": 1.4332129963898916,
      "grad_norm": 0.8020306825637817,
      "learning_rate": 8.577994803720605e-06,
      "loss": 0.535,
      "step": 100
    },
    {
      "epoch": 1.447653429602888,
      "grad_norm": 0.7037068009376526,
      "learning_rate": 8.544067579508292e-06,
      "loss": 0.5257,
      "step": 101
    },
    {
      "epoch": 1.4620938628158844,
      "grad_norm": 0.752699077129364,
      "learning_rate": 8.509809200052286e-06,
      "loss": 0.5494,
      "step": 102
    },
    {
      "epoch": 1.476534296028881,
      "grad_norm": 0.7139776349067688,
      "learning_rate": 8.475222866432065e-06,
      "loss": 0.5384,
      "step": 103
    },
    {
      "epoch": 1.4909747292418771,
      "grad_norm": 0.6958439350128174,
      "learning_rate": 8.440311810370921e-06,
      "loss": 0.5371,
      "step": 104
    },
    {
      "epoch": 1.5054151624548737,
      "grad_norm": 0.6605455279350281,
      "learning_rate": 8.405079293933986e-06,
      "loss": 0.5355,
      "step": 105
    },
    {
      "epoch": 1.5198555956678699,
      "grad_norm": 0.7509002685546875,
      "learning_rate": 8.36952860922343e-06,
      "loss": 0.5441,
      "step": 106
    },
    {
      "epoch": 1.5342960288808665,
      "grad_norm": 0.6421518325805664,
      "learning_rate": 8.333663078070845e-06,
      "loss": 0.5286,
      "step": 107
    },
    {
      "epoch": 1.5487364620938628,
      "grad_norm": 0.6712743043899536,
      "learning_rate": 8.297486051726864e-06,
      "loss": 0.534,
      "step": 108
    },
    {
      "epoch": 1.5487364620938628,
      "eval_loss": 0.6092959642410278,
      "eval_runtime": 32.5515,
      "eval_samples_per_second": 19.723,
      "eval_steps_per_second": 2.488,
      "step": 108
    },
    {
      "epoch": 1.5631768953068592,
      "grad_norm": 0.6272268891334534,
      "learning_rate": 8.26100091054801e-06,
      "loss": 0.5425,
      "step": 109
    },
    {
      "epoch": 1.5776173285198556,
      "grad_norm": 0.6285799741744995,
      "learning_rate": 8.224211063680854e-06,
      "loss": 0.5199,
      "step": 110
    },
    {
      "epoch": 1.592057761732852,
      "grad_norm": 0.5956050753593445,
      "learning_rate": 8.18711994874345e-06,
      "loss": 0.5464,
      "step": 111
    },
    {
      "epoch": 1.6064981949458483,
      "grad_norm": 0.6121729612350464,
      "learning_rate": 8.149731031504136e-06,
      "loss": 0.5207,
      "step": 112
    },
    {
      "epoch": 1.6209386281588447,
      "grad_norm": 0.5839301943778992,
      "learning_rate": 8.112047805557693e-06,
      "loss": 0.5177,
      "step": 113
    },
    {
      "epoch": 1.6353790613718413,
      "grad_norm": 0.6148134469985962,
      "learning_rate": 8.074073791998907e-06,
      "loss": 0.5325,
      "step": 114
    },
    {
      "epoch": 1.6498194945848375,
      "grad_norm": 0.6761514544487,
      "learning_rate": 8.035812539093557e-06,
      "loss": 0.5691,
      "step": 115
    },
    {
      "epoch": 1.664259927797834,
      "grad_norm": 0.5942521691322327,
      "learning_rate": 7.997267621946871e-06,
      "loss": 0.5471,
      "step": 116
    },
    {
      "epoch": 1.6787003610108302,
      "grad_norm": 0.6566746234893799,
      "learning_rate": 7.958442642169469e-06,
      "loss": 0.5333,
      "step": 117
    },
    {
      "epoch": 1.6931407942238268,
      "grad_norm": 0.6552228927612305,
      "learning_rate": 7.919341227540828e-06,
      "loss": 0.5473,
      "step": 118
    },
    {
      "epoch": 1.707581227436823,
      "grad_norm": 0.5929325819015503,
      "learning_rate": 7.879967031670313e-06,
      "loss": 0.5189,
      "step": 119
    },
    {
      "epoch": 1.7220216606498195,
      "grad_norm": 0.6215195059776306,
      "learning_rate": 7.84032373365578e-06,
      "loss": 0.5332,
      "step": 120
    },
    {
      "epoch": 1.736462093862816,
      "grad_norm": 0.6209510564804077,
      "learning_rate": 7.800415037739802e-06,
      "loss": 0.5198,
      "step": 121
    },
    {
      "epoch": 1.7509025270758123,
      "grad_norm": 0.6743574142456055,
      "learning_rate": 7.760244672963548e-06,
      "loss": 0.521,
      "step": 122
    },
    {
      "epoch": 1.7653429602888087,
      "grad_norm": 0.6415581107139587,
      "learning_rate": 7.719816392818354e-06,
      "loss": 0.5387,
      "step": 123
    },
    {
      "epoch": 1.779783393501805,
      "grad_norm": 0.6156140565872192,
      "learning_rate": 7.679133974894984e-06,
      "loss": 0.5545,
      "step": 124
    },
    {
      "epoch": 1.7942238267148014,
      "grad_norm": 0.6508094668388367,
      "learning_rate": 7.638201220530664e-06,
      "loss": 0.5187,
      "step": 125
    },
    {
      "epoch": 1.8086642599277978,
      "grad_norm": 0.6122965812683105,
      "learning_rate": 7.597021954453887e-06,
      "loss": 0.5547,
      "step": 126
    },
    {
      "epoch": 1.8086642599277978,
      "eval_loss": 0.6062851548194885,
      "eval_runtime": 33.097,
      "eval_samples_per_second": 19.398,
      "eval_steps_per_second": 2.447,
      "step": 126
    },
    {
      "epoch": 1.8231046931407944,
      "grad_norm": 0.6912717819213867,
      "learning_rate": 7.555600024427028e-06,
      "loss": 0.5406,
      "step": 127
    },
    {
      "epoch": 1.8375451263537905,
      "grad_norm": 0.5661697387695312,
      "learning_rate": 7.513939300886816e-06,
      "loss": 0.5387,
      "step": 128
    },
    {
      "epoch": 1.8519855595667871,
      "grad_norm": 0.6387366652488708,
      "learning_rate": 7.472043676582685e-06,
      "loss": 0.5472,
      "step": 129
    },
    {
      "epoch": 1.8664259927797833,
      "grad_norm": 0.6369442939758301,
      "learning_rate": 7.42991706621303e-06,
      "loss": 0.5419,
      "step": 130
    },
    {
      "epoch": 1.8808664259927799,
      "grad_norm": 0.5893256664276123,
      "learning_rate": 7.387563406059433e-06,
      "loss": 0.5406,
      "step": 131
    },
    {
      "epoch": 1.895306859205776,
      "grad_norm": 0.6452357769012451,
      "learning_rate": 7.344986653618844e-06,
      "loss": 0.5123,
      "step": 132
    },
    {
      "epoch": 1.9097472924187726,
      "grad_norm": 0.6396621465682983,
      "learning_rate": 7.302190787233808e-06,
      "loss": 0.5151,
      "step": 133
    },
    {
      "epoch": 1.924187725631769,
      "grad_norm": 0.6289088129997253,
      "learning_rate": 7.259179805720726e-06,
      "loss": 0.5503,
      "step": 134
    },
    {
      "epoch": 1.9386281588447654,
      "grad_norm": 0.587646484375,
      "learning_rate": 7.215957727996208e-06,
      "loss": 0.5164,
      "step": 135
    },
    {
      "epoch": 1.9530685920577617,
      "grad_norm": 0.6339498162269592,
      "learning_rate": 7.17252859270155e-06,
      "loss": 0.5316,
      "step": 136
    },
    {
      "epoch": 1.967509025270758,
      "grad_norm": 0.6126948595046997,
      "learning_rate": 7.128896457825364e-06,
      "loss": 0.5397,
      "step": 137
    },
    {
      "epoch": 1.9819494584837545,
      "grad_norm": 0.5835312604904175,
      "learning_rate": 7.085065400324407e-06,
      "loss": 0.5161,
      "step": 138
    },
    {
      "epoch": 1.9963898916967509,
      "grad_norm": 0.619816780090332,
      "learning_rate": 7.041039515742626e-06,
      "loss": 0.5228,
      "step": 139
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.1341562271118164,
      "learning_rate": 6.9968229178284775e-06,
      "loss": 0.5874,
      "step": 140
    },
    {
      "epoch": 2.0144404332129966,
      "grad_norm": 1.2002171277999878,
      "learning_rate": 6.952419738150546e-06,
      "loss": 0.4675,
      "step": 141
    },
    {
      "epoch": 2.0288808664259927,
      "grad_norm": 0.8529412746429443,
      "learning_rate": 6.9078341257114765e-06,
      "loss": 0.4614,
      "step": 142
    },
    {
      "epoch": 2.0433212996389893,
      "grad_norm": 0.7977300882339478,
      "learning_rate": 6.863070246560319e-06,
      "loss": 0.4506,
      "step": 143
    },
    {
      "epoch": 2.0577617328519855,
      "grad_norm": 1.0669164657592773,
      "learning_rate": 6.818132283403236e-06,
      "loss": 0.4592,
      "step": 144
    },
    {
      "epoch": 2.0577617328519855,
      "eval_loss": 0.63676917552948,
      "eval_runtime": 33.7014,
      "eval_samples_per_second": 19.05,
      "eval_steps_per_second": 2.403,
      "step": 144
    },
    {
      "epoch": 2.072202166064982,
      "grad_norm": 0.9640716910362244,
      "learning_rate": 6.773024435212678e-06,
      "loss": 0.4684,
      "step": 145
    },
    {
      "epoch": 2.0866425992779782,
      "grad_norm": 0.890157163143158,
      "learning_rate": 6.7277509168350445e-06,
      "loss": 0.4403,
      "step": 146
    },
    {
      "epoch": 2.101083032490975,
      "grad_norm": 0.9896436929702759,
      "learning_rate": 6.6823159585968355e-06,
      "loss": 0.4572,
      "step": 147
    },
    {
      "epoch": 2.115523465703971,
      "grad_norm": 0.8642748594284058,
      "learning_rate": 6.636723805909384e-06,
      "loss": 0.456,
      "step": 148
    },
    {
      "epoch": 2.1299638989169676,
      "grad_norm": 0.9155515432357788,
      "learning_rate": 6.590978718872166e-06,
      "loss": 0.4465,
      "step": 149
    },
    {
      "epoch": 2.1444043321299637,
      "grad_norm": 0.8667652606964111,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.4421,
      "step": 150
    },
    {
      "epoch": 2.1588447653429603,
      "grad_norm": 0.889424204826355,
      "learning_rate": 6.499046853197338e-06,
      "loss": 0.4632,
      "step": 151
    },
    {
      "epoch": 2.1732851985559565,
      "grad_norm": 0.7416921257972717,
      "learning_rate": 6.452868664610197e-06,
      "loss": 0.4515,
      "step": 152
    },
    {
      "epoch": 2.187725631768953,
      "grad_norm": 0.7553042769432068,
      "learning_rate": 6.406554720971583e-06,
      "loss": 0.429,
      "step": 153
    },
    {
      "epoch": 2.2021660649819497,
      "grad_norm": 0.8044371604919434,
      "learning_rate": 6.3601093498246215e-06,
      "loss": 0.4522,
      "step": 154
    },
    {
      "epoch": 2.216606498194946,
      "grad_norm": 0.7472015619277954,
      "learning_rate": 6.313536890992935e-06,
      "loss": 0.4473,
      "step": 155
    },
    {
      "epoch": 2.2310469314079424,
      "grad_norm": 0.7978557348251343,
      "learning_rate": 6.266841696175132e-06,
      "loss": 0.4457,
      "step": 156
    },
    {
      "epoch": 2.2454873646209386,
      "grad_norm": 0.7715721130371094,
      "learning_rate": 6.220028128538188e-06,
      "loss": 0.4404,
      "step": 157
    },
    {
      "epoch": 2.259927797833935,
      "grad_norm": 0.7645951509475708,
      "learning_rate": 6.173100562309751e-06,
      "loss": 0.4567,
      "step": 158
    },
    {
      "epoch": 2.2743682310469313,
      "grad_norm": 0.883103609085083,
      "learning_rate": 6.1260633823694224e-06,
      "loss": 0.4569,
      "step": 159
    },
    {
      "epoch": 2.288808664259928,
      "grad_norm": 0.695549488067627,
      "learning_rate": 6.078920983839032e-06,
      "loss": 0.4531,
      "step": 160
    },
    {
      "epoch": 2.303249097472924,
      "grad_norm": 0.7921261191368103,
      "learning_rate": 6.031677771671962e-06,
      "loss": 0.454,
      "step": 161
    },
    {
      "epoch": 2.3176895306859207,
      "grad_norm": 0.7378138303756714,
      "learning_rate": 5.984338160241552e-06,
      "loss": 0.4454,
      "step": 162
    },
    {
      "epoch": 2.3176895306859207,
      "eval_loss": 0.6355770826339722,
      "eval_runtime": 32.8781,
      "eval_samples_per_second": 19.527,
      "eval_steps_per_second": 2.464,
      "step": 162
    },
    {
      "epoch": 2.332129963898917,
      "grad_norm": 0.7265376448631287,
      "learning_rate": 5.936906572928625e-06,
      "loss": 0.4538,
      "step": 163
    },
    {
      "epoch": 2.3465703971119134,
      "grad_norm": 0.7353127002716064,
      "learning_rate": 5.889387441708162e-06,
      "loss": 0.4663,
      "step": 164
    },
    {
      "epoch": 2.3610108303249095,
      "grad_norm": 0.7641818523406982,
      "learning_rate": 5.841785206735192e-06,
      "loss": 0.4362,
      "step": 165
    },
    {
      "epoch": 2.375451263537906,
      "grad_norm": 0.7559286952018738,
      "learning_rate": 5.794104315929904e-06,
      "loss": 0.4422,
      "step": 166
    },
    {
      "epoch": 2.3898916967509027,
      "grad_norm": 0.7505171895027161,
      "learning_rate": 5.746349224562021e-06,
      "loss": 0.4728,
      "step": 167
    },
    {
      "epoch": 2.404332129963899,
      "grad_norm": 0.773835301399231,
      "learning_rate": 5.698524394834531e-06,
      "loss": 0.4555,
      "step": 168
    },
    {
      "epoch": 2.4187725631768955,
      "grad_norm": 0.6625051498413086,
      "learning_rate": 5.650634295466717e-06,
      "loss": 0.4443,
      "step": 169
    },
    {
      "epoch": 2.4332129963898916,
      "grad_norm": 0.7382647395133972,
      "learning_rate": 5.6026834012766155e-06,
      "loss": 0.4505,
      "step": 170
    },
    {
      "epoch": 2.4476534296028882,
      "grad_norm": 0.7342945337295532,
      "learning_rate": 5.554676192762891e-06,
      "loss": 0.462,
      "step": 171
    },
    {
      "epoch": 2.4620938628158844,
      "grad_norm": 0.69790118932724,
      "learning_rate": 5.506617155686177e-06,
      "loss": 0.4483,
      "step": 172
    },
    {
      "epoch": 2.476534296028881,
      "grad_norm": 0.7470245957374573,
      "learning_rate": 5.458510780649932e-06,
      "loss": 0.4432,
      "step": 173
    },
    {
      "epoch": 2.490974729241877,
      "grad_norm": 0.6948450803756714,
      "learning_rate": 5.4103615626808426e-06,
      "loss": 0.4543,
      "step": 174
    },
    {
      "epoch": 2.5054151624548737,
      "grad_norm": 0.6710882186889648,
      "learning_rate": 5.362174000808813e-06,
      "loss": 0.4442,
      "step": 175
    },
    {
      "epoch": 2.51985559566787,
      "grad_norm": 0.6929926872253418,
      "learning_rate": 5.3139525976465675e-06,
      "loss": 0.4448,
      "step": 176
    },
    {
      "epoch": 2.5342960288808665,
      "grad_norm": 0.68537837266922,
      "learning_rate": 5.265701858968944e-06,
      "loss": 0.4558,
      "step": 177
    },
    {
      "epoch": 2.5487364620938626,
      "grad_norm": 0.6701018214225769,
      "learning_rate": 5.217426293291869e-06,
      "loss": 0.4509,
      "step": 178
    },
    {
      "epoch": 2.563176895306859,
      "grad_norm": 0.6754850745201111,
      "learning_rate": 5.169130411451083e-06,
      "loss": 0.42,
      "step": 179
    },
    {
      "epoch": 2.577617328519856,
      "grad_norm": 0.6687860488891602,
      "learning_rate": 5.120818726180662e-06,
      "loss": 0.4483,
      "step": 180
    },
    {
      "epoch": 2.577617328519856,
      "eval_loss": 0.6319621801376343,
      "eval_runtime": 32.6147,
      "eval_samples_per_second": 19.684,
      "eval_steps_per_second": 2.484,
      "step": 180
    },
    {
      "epoch": 2.592057761732852,
      "grad_norm": 0.6524101495742798,
      "learning_rate": 5.072495751691338e-06,
      "loss": 0.4223,
      "step": 181
    },
    {
      "epoch": 2.606498194945848,
      "grad_norm": 0.6371822953224182,
      "learning_rate": 5.024166003248703e-06,
      "loss": 0.4472,
      "step": 182
    },
    {
      "epoch": 2.6209386281588447,
      "grad_norm": 0.6347170472145081,
      "learning_rate": 4.9758339967512995e-06,
      "loss": 0.4339,
      "step": 183
    },
    {
      "epoch": 2.6353790613718413,
      "grad_norm": 0.647972822189331,
      "learning_rate": 4.927504248308663e-06,
      "loss": 0.4296,
      "step": 184
    },
    {
      "epoch": 2.6498194945848375,
      "grad_norm": 0.6498137712478638,
      "learning_rate": 4.87918127381934e-06,
      "loss": 0.4324,
      "step": 185
    },
    {
      "epoch": 2.664259927797834,
      "grad_norm": 0.6369062066078186,
      "learning_rate": 4.830869588548918e-06,
      "loss": 0.4494,
      "step": 186
    },
    {
      "epoch": 2.67870036101083,
      "grad_norm": 0.6868234276771545,
      "learning_rate": 4.782573706708133e-06,
      "loss": 0.4591,
      "step": 187
    },
    {
      "epoch": 2.693140794223827,
      "grad_norm": 0.6149920225143433,
      "learning_rate": 4.734298141031057e-06,
      "loss": 0.4478,
      "step": 188
    },
    {
      "epoch": 2.707581227436823,
      "grad_norm": 0.6293914318084717,
      "learning_rate": 4.686047402353433e-06,
      "loss": 0.4418,
      "step": 189
    },
    {
      "epoch": 2.7220216606498195,
      "grad_norm": 0.7016595602035522,
      "learning_rate": 4.637825999191189e-06,
      "loss": 0.447,
      "step": 190
    },
    {
      "epoch": 2.7364620938628157,
      "grad_norm": 0.6719167232513428,
      "learning_rate": 4.589638437319157e-06,
      "loss": 0.4494,
      "step": 191
    },
    {
      "epoch": 2.7509025270758123,
      "grad_norm": 0.6407426595687866,
      "learning_rate": 4.541489219350069e-06,
      "loss": 0.4524,
      "step": 192
    },
    {
      "epoch": 2.765342960288809,
      "grad_norm": 0.696622371673584,
      "learning_rate": 4.493382844313826e-06,
      "loss": 0.4567,
      "step": 193
    },
    {
      "epoch": 2.779783393501805,
      "grad_norm": 0.6366904377937317,
      "learning_rate": 4.445323807237112e-06,
      "loss": 0.467,
      "step": 194
    },
    {
      "epoch": 2.794223826714801,
      "grad_norm": 0.6364036798477173,
      "learning_rate": 4.397316598723385e-06,
      "loss": 0.448,
      "step": 195
    },
    {
      "epoch": 2.808664259927798,
      "grad_norm": 0.6690941452980042,
      "learning_rate": 4.349365704533285e-06,
      "loss": 0.4491,
      "step": 196
    },
    {
      "epoch": 2.8231046931407944,
      "grad_norm": 0.6544636487960815,
      "learning_rate": 4.301475605165471e-06,
      "loss": 0.4715,
      "step": 197
    },
    {
      "epoch": 2.8375451263537905,
      "grad_norm": 0.6416365504264832,
      "learning_rate": 4.25365077543798e-06,
      "loss": 0.4644,
      "step": 198
    },
    {
      "epoch": 2.8375451263537905,
      "eval_loss": 0.6322494149208069,
      "eval_runtime": 32.6203,
      "eval_samples_per_second": 19.681,
      "eval_steps_per_second": 2.483,
      "step": 198
    },
    {
      "epoch": 2.851985559566787,
      "grad_norm": 0.6493151187896729,
      "learning_rate": 4.205895684070099e-06,
      "loss": 0.4533,
      "step": 199
    },
    {
      "epoch": 2.8664259927797833,
      "grad_norm": 0.6717944145202637,
      "learning_rate": 4.158214793264808e-06,
      "loss": 0.4689,
      "step": 200
    },
    {
      "epoch": 2.88086642599278,
      "grad_norm": 0.635381281375885,
      "learning_rate": 4.1106125582918385e-06,
      "loss": 0.4512,
      "step": 201
    },
    {
      "epoch": 2.895306859205776,
      "grad_norm": 0.6783753633499146,
      "learning_rate": 4.063093427071376e-06,
      "loss": 0.434,
      "step": 202
    },
    {
      "epoch": 2.9097472924187726,
      "grad_norm": 0.6600508689880371,
      "learning_rate": 4.01566183975845e-06,
      "loss": 0.4497,
      "step": 203
    },
    {
      "epoch": 2.9241877256317688,
      "grad_norm": 0.6617497801780701,
      "learning_rate": 3.968322228328041e-06,
      "loss": 0.453,
      "step": 204
    },
    {
      "epoch": 2.9386281588447654,
      "grad_norm": 0.6339384317398071,
      "learning_rate": 3.92107901616097e-06,
      "loss": 0.4517,
      "step": 205
    },
    {
      "epoch": 2.953068592057762,
      "grad_norm": 0.7040077447891235,
      "learning_rate": 3.873936617630578e-06,
      "loss": 0.4699,
      "step": 206
    },
    {
      "epoch": 2.967509025270758,
      "grad_norm": 0.6730422377586365,
      "learning_rate": 3.82689943769025e-06,
      "loss": 0.4375,
      "step": 207
    },
    {
      "epoch": 2.9819494584837543,
      "grad_norm": 0.6471662521362305,
      "learning_rate": 3.779971871461813e-06,
      "loss": 0.4615,
      "step": 208
    },
    {
      "epoch": 2.996389891696751,
      "grad_norm": 0.6612151861190796,
      "learning_rate": 3.7331583038248688e-06,
      "loss": 0.4489,
      "step": 209
    },
    {
      "epoch": 3.0144404332129966,
      "grad_norm": 1.2435836791992188,
      "learning_rate": 3.6864631090070656e-06,
      "loss": 0.3921,
      "step": 210
    },
    {
      "epoch": 3.0288808664259927,
      "grad_norm": 1.0806032419204712,
      "learning_rate": 3.639890650175379e-06,
      "loss": 0.368,
      "step": 211
    },
    {
      "epoch": 3.0433212996389893,
      "grad_norm": 0.8179118037223816,
      "learning_rate": 3.593445279028418e-06,
      "loss": 0.3847,
      "step": 212
    },
    {
      "epoch": 3.0577617328519855,
      "grad_norm": 1.0836800336837769,
      "learning_rate": 3.5471313353898056e-06,
      "loss": 0.3885,
      "step": 213
    },
    {
      "epoch": 3.072202166064982,
      "grad_norm": 1.4152075052261353,
      "learning_rate": 3.5009531468026646e-06,
      "loss": 0.3764,
      "step": 214
    },
    {
      "epoch": 3.0866425992779782,
      "grad_norm": 1.26155686378479,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.3786,
      "step": 215
    },
    {
      "epoch": 3.101083032490975,
      "grad_norm": 0.9656947255134583,
      "learning_rate": 3.409021281127835e-06,
      "loss": 0.3837,
      "step": 216
    },
    {
      "epoch": 3.101083032490975,
      "eval_loss": 0.6669259071350098,
      "eval_runtime": 32.2367,
      "eval_samples_per_second": 19.915,
      "eval_steps_per_second": 2.513,
      "step": 216
    },
    {
      "epoch": 3.115523465703971,
      "grad_norm": 1.0473723411560059,
      "learning_rate": 3.3632761940906167e-06,
      "loss": 0.3779,
      "step": 217
    },
    {
      "epoch": 3.1299638989169676,
      "grad_norm": 1.022290587425232,
      "learning_rate": 3.3176840414031653e-06,
      "loss": 0.3825,
      "step": 218
    },
    {
      "epoch": 3.1444043321299637,
      "grad_norm": 0.869454562664032,
      "learning_rate": 3.2722490831649568e-06,
      "loss": 0.366,
      "step": 219
    },
    {
      "epoch": 3.1588447653429603,
      "grad_norm": 0.8059322834014893,
      "learning_rate": 3.226975564787322e-06,
      "loss": 0.3625,
      "step": 220
    },
    {
      "epoch": 3.1732851985559565,
      "grad_norm": 0.8983060717582703,
      "learning_rate": 3.181867716596765e-06,
      "loss": 0.3684,
      "step": 221
    },
    {
      "epoch": 3.187725631768953,
      "grad_norm": 0.9604324102401733,
      "learning_rate": 3.1369297534396823e-06,
      "loss": 0.3705,
      "step": 222
    },
    {
      "epoch": 3.2021660649819497,
      "grad_norm": 0.8212615251541138,
      "learning_rate": 3.092165874288525e-06,
      "loss": 0.3503,
      "step": 223
    },
    {
      "epoch": 3.216606498194946,
      "grad_norm": 0.8354775905609131,
      "learning_rate": 3.0475802618494564e-06,
      "loss": 0.3634,
      "step": 224
    },
    {
      "epoch": 3.2310469314079424,
      "grad_norm": 0.7785860896110535,
      "learning_rate": 3.0031770821715233e-06,
      "loss": 0.3739,
      "step": 225
    },
    {
      "epoch": 3.2454873646209386,
      "grad_norm": 0.8317592144012451,
      "learning_rate": 2.9589604842573762e-06,
      "loss": 0.3582,
      "step": 226
    },
    {
      "epoch": 3.259927797833935,
      "grad_norm": 0.8261600136756897,
      "learning_rate": 2.914934599675594e-06,
      "loss": 0.3508,
      "step": 227
    },
    {
      "epoch": 3.2743682310469313,
      "grad_norm": 0.742267906665802,
      "learning_rate": 2.871103542174637e-06,
      "loss": 0.3723,
      "step": 228
    },
    {
      "epoch": 3.288808664259928,
      "grad_norm": 0.7474186420440674,
      "learning_rate": 2.827471407298451e-06,
      "loss": 0.382,
      "step": 229
    },
    {
      "epoch": 3.303249097472924,
      "grad_norm": 0.7781416177749634,
      "learning_rate": 2.7840422720037943e-06,
      "loss": 0.3699,
      "step": 230
    },
    {
      "epoch": 3.3176895306859207,
      "grad_norm": 0.7829609513282776,
      "learning_rate": 2.7408201942792755e-06,
      "loss": 0.368,
      "step": 231
    },
    {
      "epoch": 3.332129963898917,
      "grad_norm": 0.7839555740356445,
      "learning_rate": 2.697809212766195e-06,
      "loss": 0.378,
      "step": 232
    },
    {
      "epoch": 3.3465703971119134,
      "grad_norm": 0.7255096435546875,
      "learning_rate": 2.655013346381158e-06,
      "loss": 0.3495,
      "step": 233
    },
    {
      "epoch": 3.3610108303249095,
      "grad_norm": 0.7900465726852417,
      "learning_rate": 2.612436593940568e-06,
      "loss": 0.3809,
      "step": 234
    },
    {
      "epoch": 3.3610108303249095,
      "eval_loss": 0.6795952320098877,
      "eval_runtime": 32.8352,
      "eval_samples_per_second": 19.552,
      "eval_steps_per_second": 2.467,
      "step": 234
    },
    {
      "epoch": 3.375451263537906,
      "grad_norm": 0.7024810910224915,
      "learning_rate": 2.57008293378697e-06,
      "loss": 0.3493,
      "step": 235
    },
    {
      "epoch": 3.3898916967509027,
      "grad_norm": 0.7617738842964172,
      "learning_rate": 2.5279563234173177e-06,
      "loss": 0.3726,
      "step": 236
    },
    {
      "epoch": 3.404332129963899,
      "grad_norm": 0.7388402223587036,
      "learning_rate": 2.4860606991131857e-06,
      "loss": 0.3614,
      "step": 237
    },
    {
      "epoch": 3.4187725631768955,
      "grad_norm": 0.7406207919120789,
      "learning_rate": 2.444399975572974e-06,
      "loss": 0.3712,
      "step": 238
    },
    {
      "epoch": 3.4332129963898916,
      "grad_norm": 0.810865581035614,
      "learning_rate": 2.402978045546114e-06,
      "loss": 0.3916,
      "step": 239
    },
    {
      "epoch": 3.4476534296028882,
      "grad_norm": 0.7760987877845764,
      "learning_rate": 2.3617987794693358e-06,
      "loss": 0.3824,
      "step": 240
    },
    {
      "epoch": 3.4620938628158844,
      "grad_norm": 0.743865966796875,
      "learning_rate": 2.320866025105016e-06,
      "loss": 0.3822,
      "step": 241
    },
    {
      "epoch": 3.476534296028881,
      "grad_norm": 0.718098521232605,
      "learning_rate": 2.2801836071816476e-06,
      "loss": 0.3674,
      "step": 242
    },
    {
      "epoch": 3.490974729241877,
      "grad_norm": 0.703536331653595,
      "learning_rate": 2.2397553270364546e-06,
      "loss": 0.3491,
      "step": 243
    },
    {
      "epoch": 3.5054151624548737,
      "grad_norm": 0.7709066271781921,
      "learning_rate": 2.1995849622602017e-06,
      "loss": 0.3735,
      "step": 244
    },
    {
      "epoch": 3.51985559566787,
      "grad_norm": 0.7621421217918396,
      "learning_rate": 2.159676266344222e-06,
      "loss": 0.3655,
      "step": 245
    },
    {
      "epoch": 3.5342960288808665,
      "grad_norm": 0.7338027954101562,
      "learning_rate": 2.120032968329687e-06,
      "loss": 0.3894,
      "step": 246
    },
    {
      "epoch": 3.5487364620938626,
      "grad_norm": 0.764658510684967,
      "learning_rate": 2.0806587724591725e-06,
      "loss": 0.3669,
      "step": 247
    },
    {
      "epoch": 3.563176895306859,
      "grad_norm": 0.7732599377632141,
      "learning_rate": 2.0415573578305343e-06,
      "loss": 0.3704,
      "step": 248
    },
    {
      "epoch": 3.577617328519856,
      "grad_norm": 0.7785531878471375,
      "learning_rate": 2.0027323780531312e-06,
      "loss": 0.3832,
      "step": 249
    },
    {
      "epoch": 3.592057761732852,
      "grad_norm": 0.7383318543434143,
      "learning_rate": 1.9641874609064443e-06,
      "loss": 0.3516,
      "step": 250
    },
    {
      "epoch": 3.606498194945848,
      "grad_norm": 0.7191964983940125,
      "learning_rate": 1.9259262080010938e-06,
      "loss": 0.3647,
      "step": 251
    },
    {
      "epoch": 3.6209386281588447,
      "grad_norm": 0.688936710357666,
      "learning_rate": 1.887952194442309e-06,
      "loss": 0.3646,
      "step": 252
    },
    {
      "epoch": 3.6209386281588447,
      "eval_loss": 0.6788443326950073,
      "eval_runtime": 32.2809,
      "eval_samples_per_second": 19.888,
      "eval_steps_per_second": 2.509,
      "step": 252
    },
    {
      "epoch": 3.6353790613718413,
      "grad_norm": 0.7906298637390137,
      "learning_rate": 1.8502689684958664e-06,
      "loss": 0.3798,
      "step": 253
    },
    {
      "epoch": 3.6498194945848375,
      "grad_norm": 0.7622320055961609,
      "learning_rate": 1.8128800512565514e-06,
      "loss": 0.3624,
      "step": 254
    },
    {
      "epoch": 3.664259927797834,
      "grad_norm": 0.7161827683448792,
      "learning_rate": 1.7757889363191484e-06,
      "loss": 0.3709,
      "step": 255
    },
    {
      "epoch": 3.67870036101083,
      "grad_norm": 0.7306489944458008,
      "learning_rate": 1.738999089451991e-06,
      "loss": 0.3748,
      "step": 256
    },
    {
      "epoch": 3.693140794223827,
      "grad_norm": 0.746320903301239,
      "learning_rate": 1.7025139482731385e-06,
      "loss": 0.3755,
      "step": 257
    },
    {
      "epoch": 3.707581227436823,
      "grad_norm": 0.7224844694137573,
      "learning_rate": 1.6663369219291558e-06,
      "loss": 0.3566,
      "step": 258
    },
    {
      "epoch": 3.7220216606498195,
      "grad_norm": 0.7270101308822632,
      "learning_rate": 1.6304713907765713e-06,
      "loss": 0.3778,
      "step": 259
    },
    {
      "epoch": 3.7364620938628157,
      "grad_norm": 0.7161392569541931,
      "learning_rate": 1.5949207060660138e-06,
      "loss": 0.379,
      "step": 260
    },
    {
      "epoch": 3.7509025270758123,
      "grad_norm": 0.7086169123649597,
      "learning_rate": 1.55968818962908e-06,
      "loss": 0.3688,
      "step": 261
    },
    {
      "epoch": 3.765342960288809,
      "grad_norm": 0.7060995697975159,
      "learning_rate": 1.5247771335679372e-06,
      "loss": 0.3826,
      "step": 262
    },
    {
      "epoch": 3.779783393501805,
      "grad_norm": 0.6812713146209717,
      "learning_rate": 1.4901907999477167e-06,
      "loss": 0.3616,
      "step": 263
    },
    {
      "epoch": 3.794223826714801,
      "grad_norm": 0.6879968643188477,
      "learning_rate": 1.4559324204917102e-06,
      "loss": 0.3748,
      "step": 264
    },
    {
      "epoch": 3.808664259927798,
      "grad_norm": 0.7127858400344849,
      "learning_rate": 1.4220051962793952e-06,
      "loss": 0.3792,
      "step": 265
    },
    {
      "epoch": 3.8231046931407944,
      "grad_norm": 0.7238702774047852,
      "learning_rate": 1.3884122974473307e-06,
      "loss": 0.3675,
      "step": 266
    },
    {
      "epoch": 3.8375451263537905,
      "grad_norm": 0.7225318551063538,
      "learning_rate": 1.3551568628929434e-06,
      "loss": 0.3832,
      "step": 267
    },
    {
      "epoch": 3.851985559566787,
      "grad_norm": 0.7232125401496887,
      "learning_rate": 1.3222419999812248e-06,
      "loss": 0.3887,
      "step": 268
    },
    {
      "epoch": 3.8664259927797833,
      "grad_norm": 0.7618275284767151,
      "learning_rate": 1.2896707842543898e-06,
      "loss": 0.3837,
      "step": 269
    },
    {
      "epoch": 3.88086642599278,
      "grad_norm": 0.7329801321029663,
      "learning_rate": 1.257446259144494e-06,
      "loss": 0.3886,
      "step": 270
    },
    {
      "epoch": 3.88086642599278,
      "eval_loss": 0.6797041296958923,
      "eval_runtime": 33.1748,
      "eval_samples_per_second": 19.352,
      "eval_steps_per_second": 2.442,
      "step": 270
    },
    {
      "epoch": 3.895306859205776,
      "grad_norm": 0.717837929725647,
      "learning_rate": 1.225571435689062e-06,
      "loss": 0.3648,
      "step": 271
    },
    {
      "epoch": 3.9097472924187726,
      "grad_norm": 0.7133547067642212,
      "learning_rate": 1.1940492922497337e-06,
      "loss": 0.3713,
      "step": 272
    },
    {
      "epoch": 3.9241877256317688,
      "grad_norm": 0.7317136526107788,
      "learning_rate": 1.1628827742339688e-06,
      "loss": 0.3735,
      "step": 273
    },
    {
      "epoch": 3.9386281588447654,
      "grad_norm": 0.7250531911849976,
      "learning_rate": 1.1320747938198356e-06,
      "loss": 0.4011,
      "step": 274
    },
    {
      "epoch": 3.953068592057762,
      "grad_norm": 0.7316252589225769,
      "learning_rate": 1.1016282296838887e-06,
      "loss": 0.3912,
      "step": 275
    },
    {
      "epoch": 3.967509025270758,
      "grad_norm": 0.7132654190063477,
      "learning_rate": 1.0715459267321998e-06,
      "loss": 0.362,
      "step": 276
    },
    {
      "epoch": 3.9819494584837543,
      "grad_norm": 0.7502458691596985,
      "learning_rate": 1.0418306958345214e-06,
      "loss": 0.3629,
      "step": 277
    },
    {
      "epoch": 3.996389891696751,
      "grad_norm": 0.7093997001647949,
      "learning_rate": 1.0124853135616475e-06,
      "loss": 0.3937,
      "step": 278
    },
    {
      "epoch": 4.014440433212997,
      "grad_norm": 1.1966173648834229,
      "learning_rate": 9.835125219259694e-07,
      "loss": 0.3186,
      "step": 279
    },
    {
      "epoch": 4.028880866425993,
      "grad_norm": 1.1906567811965942,
      "learning_rate": 9.549150281252633e-07,
      "loss": 0.3267,
      "step": 280
    },
    {
      "epoch": 4.043321299638989,
      "grad_norm": 0.9876328706741333,
      "learning_rate": 9.266955042897357e-07,
      "loss": 0.3445,
      "step": 281
    },
    {
      "epoch": 4.0577617328519855,
      "grad_norm": 0.8555173873901367,
      "learning_rate": 8.988565872323362e-07,
      "loss": 0.3231,
      "step": 282
    },
    {
      "epoch": 4.072202166064982,
      "grad_norm": 0.7850072383880615,
      "learning_rate": 8.714008782023797e-07,
      "loss": 0.3405,
      "step": 283
    },
    {
      "epoch": 4.086642599277979,
      "grad_norm": 0.9164103865623474,
      "learning_rate": 8.443309426424862e-07,
      "loss": 0.315,
      "step": 284
    },
    {
      "epoch": 4.101083032490974,
      "grad_norm": 1.1176913976669312,
      "learning_rate": 8.176493099488664e-07,
      "loss": 0.3354,
      "step": 285
    },
    {
      "epoch": 4.115523465703971,
      "grad_norm": 1.1691384315490723,
      "learning_rate": 7.913584732349788e-07,
      "loss": 0.3398,
      "step": 286
    },
    {
      "epoch": 4.129963898916968,
      "grad_norm": 1.0531779527664185,
      "learning_rate": 7.654608890985709e-07,
      "loss": 0.3309,
      "step": 287
    },
    {
      "epoch": 4.144404332129964,
      "grad_norm": 1.0438635349273682,
      "learning_rate": 7.399589773921412e-07,
      "loss": 0.3324,
      "step": 288
    },
    {
      "epoch": 4.144404332129964,
      "eval_loss": 0.7206189632415771,
      "eval_runtime": 32.2481,
      "eval_samples_per_second": 19.908,
      "eval_steps_per_second": 2.512,
      "step": 288
    },
    {
      "epoch": 4.15884476534296,
      "grad_norm": 0.8594384789466858,
      "learning_rate": 7.148551209968279e-07,
      "loss": 0.3138,
      "step": 289
    },
    {
      "epoch": 4.1732851985559565,
      "grad_norm": 0.8748891949653625,
      "learning_rate": 6.901516655997536e-07,
      "loss": 0.3291,
      "step": 290
    },
    {
      "epoch": 4.187725631768953,
      "grad_norm": 0.8275448679924011,
      "learning_rate": 6.658509194748463e-07,
      "loss": 0.3208,
      "step": 291
    },
    {
      "epoch": 4.20216606498195,
      "grad_norm": 0.7752466201782227,
      "learning_rate": 6.419551532671542e-07,
      "loss": 0.3107,
      "step": 292
    },
    {
      "epoch": 4.216606498194946,
      "grad_norm": 0.8111825585365295,
      "learning_rate": 6.184665997806832e-07,
      "loss": 0.3228,
      "step": 293
    },
    {
      "epoch": 4.231046931407942,
      "grad_norm": 0.8536714911460876,
      "learning_rate": 5.953874537697573e-07,
      "loss": 0.3243,
      "step": 294
    },
    {
      "epoch": 4.245487364620939,
      "grad_norm": 0.7821819186210632,
      "learning_rate": 5.727198717339511e-07,
      "loss": 0.3229,
      "step": 295
    },
    {
      "epoch": 4.259927797833935,
      "grad_norm": 0.8092971444129944,
      "learning_rate": 5.504659717165812e-07,
      "loss": 0.3172,
      "step": 296
    },
    {
      "epoch": 4.274368231046932,
      "grad_norm": 0.7584081292152405,
      "learning_rate": 5.286278331068018e-07,
      "loss": 0.3169,
      "step": 297
    },
    {
      "epoch": 4.2888086642599275,
      "grad_norm": 0.7885962128639221,
      "learning_rate": 5.072074964453055e-07,
      "loss": 0.332,
      "step": 298
    },
    {
      "epoch": 4.303249097472924,
      "grad_norm": 0.7851223349571228,
      "learning_rate": 4.862069632336558e-07,
      "loss": 0.3334,
      "step": 299
    },
    {
      "epoch": 4.317689530685921,
      "grad_norm": 0.8225854635238647,
      "learning_rate": 4.6562819574727304e-07,
      "loss": 0.3221,
      "step": 300
    },
    {
      "epoch": 4.332129963898917,
      "grad_norm": 0.8033653497695923,
      "learning_rate": 4.454731168520754e-07,
      "loss": 0.3191,
      "step": 301
    },
    {
      "epoch": 4.346570397111913,
      "grad_norm": 0.8068827986717224,
      "learning_rate": 4.257436098248091e-07,
      "loss": 0.3348,
      "step": 302
    },
    {
      "epoch": 4.3610108303249095,
      "grad_norm": 0.8063839077949524,
      "learning_rate": 4.064415181770787e-07,
      "loss": 0.3307,
      "step": 303
    },
    {
      "epoch": 4.375451263537906,
      "grad_norm": 0.8209130167961121,
      "learning_rate": 3.875686454830885e-07,
      "loss": 0.3416,
      "step": 304
    },
    {
      "epoch": 4.389891696750903,
      "grad_norm": 0.7289718985557556,
      "learning_rate": 3.691267552111183e-07,
      "loss": 0.3165,
      "step": 305
    },
    {
      "epoch": 4.404332129963899,
      "grad_norm": 0.7588977217674255,
      "learning_rate": 3.511175705587433e-07,
      "loss": 0.3297,
      "step": 306
    },
    {
      "epoch": 4.404332129963899,
      "eval_loss": 0.7203958034515381,
      "eval_runtime": 32.8539,
      "eval_samples_per_second": 19.541,
      "eval_steps_per_second": 2.465,
      "step": 306
    },
    {
      "epoch": 4.418772563176895,
      "grad_norm": 0.7277893424034119,
      "learning_rate": 3.3354277429182626e-07,
      "loss": 0.3184,
      "step": 307
    },
    {
      "epoch": 4.433212996389892,
      "grad_norm": 0.7536954283714294,
      "learning_rate": 3.164040085872755e-07,
      "loss": 0.3334,
      "step": 308
    },
    {
      "epoch": 4.447653429602888,
      "grad_norm": 0.7596468925476074,
      "learning_rate": 2.997028748796016e-07,
      "loss": 0.3253,
      "step": 309
    },
    {
      "epoch": 4.462093862815885,
      "grad_norm": 0.7149029970169067,
      "learning_rate": 2.834409337112842e-07,
      "loss": 0.3337,
      "step": 310
    },
    {
      "epoch": 4.4765342960288805,
      "grad_norm": 0.8042013645172119,
      "learning_rate": 2.676197045869511e-07,
      "loss": 0.2985,
      "step": 311
    },
    {
      "epoch": 4.490974729241877,
      "grad_norm": 0.7098730802536011,
      "learning_rate": 2.522406658313997e-07,
      "loss": 0.3305,
      "step": 312
    },
    {
      "epoch": 4.505415162454874,
      "grad_norm": 0.7298141121864319,
      "learning_rate": 2.3730525445146146e-07,
      "loss": 0.3142,
      "step": 313
    },
    {
      "epoch": 4.51985559566787,
      "grad_norm": 0.7250217199325562,
      "learning_rate": 2.2281486600173207e-07,
      "loss": 0.3276,
      "step": 314
    },
    {
      "epoch": 4.534296028880867,
      "grad_norm": 0.7107163071632385,
      "learning_rate": 2.0877085445416889e-07,
      "loss": 0.3112,
      "step": 315
    },
    {
      "epoch": 4.548736462093863,
      "grad_norm": 0.7627288103103638,
      "learning_rate": 1.9517453207157865e-07,
      "loss": 0.3353,
      "step": 316
    },
    {
      "epoch": 4.563176895306859,
      "grad_norm": 0.707574188709259,
      "learning_rate": 1.8202716928499842e-07,
      "loss": 0.3148,
      "step": 317
    },
    {
      "epoch": 4.577617328519856,
      "grad_norm": 0.7250429391860962,
      "learning_rate": 1.6932999457498823e-07,
      "loss": 0.3286,
      "step": 318
    },
    {
      "epoch": 4.5920577617328515,
      "grad_norm": 0.7276959419250488,
      "learning_rate": 1.5708419435684463e-07,
      "loss": 0.32,
      "step": 319
    },
    {
      "epoch": 4.606498194945848,
      "grad_norm": 0.7046621441841125,
      "learning_rate": 1.4529091286973994e-07,
      "loss": 0.3247,
      "step": 320
    },
    {
      "epoch": 4.620938628158845,
      "grad_norm": 0.7249467372894287,
      "learning_rate": 1.3395125206980774e-07,
      "loss": 0.3268,
      "step": 321
    },
    {
      "epoch": 4.635379061371841,
      "grad_norm": 0.7257922291755676,
      "learning_rate": 1.230662715271741e-07,
      "loss": 0.3195,
      "step": 322
    },
    {
      "epoch": 4.649819494584838,
      "grad_norm": 0.7479093074798584,
      "learning_rate": 1.1263698832695513e-07,
      "loss": 0.3463,
      "step": 323
    },
    {
      "epoch": 4.664259927797834,
      "grad_norm": 0.7323216199874878,
      "learning_rate": 1.0266437697422026e-07,
      "loss": 0.338,
      "step": 324
    },
    {
      "epoch": 4.664259927797834,
      "eval_loss": 0.7205774784088135,
      "eval_runtime": 32.268,
      "eval_samples_per_second": 19.896,
      "eval_steps_per_second": 2.51,
      "step": 324
    },
    {
      "epoch": 4.67870036101083,
      "grad_norm": 0.7437517642974854,
      "learning_rate": 9.314936930293283e-08,
      "loss": 0.31,
      "step": 325
    },
    {
      "epoch": 4.693140794223827,
      "grad_norm": 0.7366902232170105,
      "learning_rate": 8.40928543888836e-08,
      "loss": 0.3344,
      "step": 326
    },
    {
      "epoch": 4.707581227436823,
      "grad_norm": 0.7316016554832458,
      "learning_rate": 7.549567846661388e-08,
      "loss": 0.3204,
      "step": 327
    },
    {
      "epoch": 4.722021660649819,
      "grad_norm": 0.7517760396003723,
      "learning_rate": 6.735864485034493e-08,
      "loss": 0.3358,
      "step": 328
    },
    {
      "epoch": 4.736462093862816,
      "grad_norm": 0.7169381976127625,
      "learning_rate": 5.968251385891744e-08,
      "loss": 0.3166,
      "step": 329
    },
    {
      "epoch": 4.750902527075812,
      "grad_norm": 0.7038097381591797,
      "learning_rate": 5.246800274474439e-08,
      "loss": 0.3104,
      "step": 330
    },
    {
      "epoch": 4.765342960288809,
      "grad_norm": 0.7148619294166565,
      "learning_rate": 4.571578562679757e-08,
      "loss": 0.3137,
      "step": 331
    },
    {
      "epoch": 4.7797833935018055,
      "grad_norm": 0.7279098033905029,
      "learning_rate": 3.9426493427611177e-08,
      "loss": 0.3223,
      "step": 332
    },
    {
      "epoch": 4.794223826714801,
      "grad_norm": 0.7073100209236145,
      "learning_rate": 3.360071381433516e-08,
      "loss": 0.3277,
      "step": 333
    },
    {
      "epoch": 4.808664259927798,
      "grad_norm": 0.74690181016922,
      "learning_rate": 2.823899114382078e-08,
      "loss": 0.3293,
      "step": 334
    },
    {
      "epoch": 4.823104693140794,
      "grad_norm": 0.7142198085784912,
      "learning_rate": 2.3341826411756863e-08,
      "loss": 0.3189,
      "step": 335
    },
    {
      "epoch": 4.837545126353791,
      "grad_norm": 0.734983503818512,
      "learning_rate": 1.8909677205856682e-08,
      "loss": 0.3098,
      "step": 336
    },
    {
      "epoch": 4.851985559566787,
      "grad_norm": 0.744941234588623,
      "learning_rate": 1.494295766310161e-08,
      "loss": 0.3259,
      "step": 337
    },
    {
      "epoch": 4.866425992779783,
      "grad_norm": 0.7279512286186218,
      "learning_rate": 1.1442038431044856e-08,
      "loss": 0.331,
      "step": 338
    },
    {
      "epoch": 4.88086642599278,
      "grad_norm": 0.7453312873840332,
      "learning_rate": 8.407246633178601e-09,
      "loss": 0.3394,
      "step": 339
    },
    {
      "epoch": 4.8953068592057765,
      "grad_norm": 0.7087642550468445,
      "learning_rate": 5.838865838366792e-09,
      "loss": 0.3178,
      "step": 340
    },
    {
      "epoch": 4.909747292418773,
      "grad_norm": 0.7283005714416504,
      "learning_rate": 3.737136034349109e-09,
      "loss": 0.331,
      "step": 341
    },
    {
      "epoch": 4.924187725631769,
      "grad_norm": 0.6744409203529358,
      "learning_rate": 2.102253605316684e-09,
      "loss": 0.3052,
      "step": 342
    },
    {
      "epoch": 4.924187725631769,
      "eval_loss": 0.7205825448036194,
      "eval_runtime": 32.4625,
      "eval_samples_per_second": 19.777,
      "eval_steps_per_second": 2.495,
      "step": 342
    },
    {
      "epoch": 4.938628158844765,
      "grad_norm": 0.7398666143417358,
      "learning_rate": 9.343713135623323e-10,
      "loss": 0.3297,
      "step": 343
    },
    {
      "epoch": 4.953068592057762,
      "grad_norm": 0.7399023771286011,
      "learning_rate": 2.335982852064156e-10,
      "loss": 0.3273,
      "step": 344
    },
    {
      "epoch": 4.967509025270758,
      "grad_norm": 0.7517241835594177,
      "learning_rate": 0.0,
      "loss": 0.3359,
      "step": 345
    }
  ],
  "logging_steps": 1,
  "max_steps": 345,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 69,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.423224727839703e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
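
For quick inspection, a minimal sketch (not part of the checkpoint itself) that loads this trainer_state.json and plots the train/eval loss curves recorded in log_history; the file path and the matplotlib dependency are assumptions.

# Minimal sketch: split log_history into train and eval records and
# plot loss against step. Assumes trainer_state.json sits in the
# working directory and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # assumed path
    state = json.load(f)

# Training entries carry a "loss" key; evaluation entries carry "eval_loss".
train = [e for e in state["log_history"] if "loss" in e]
evals = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train], [e["loss"] for e in train], label="train loss")
plt.plot([e["step"] for e in evals], [e["eval_loss"] for e in evals], marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()

On this log, eval_loss bottoms out near 0.606 around step 126 (epoch ~1.8) and climbs to roughly 0.72 by step 342, a divergence from the still-falling train loss that the plot makes easy to see.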