{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.967509025270758,
"eval_steps": 18,
"global_step": 345,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01444043321299639,
"grad_norm": 3.5763113498687744,
"learning_rate": 5.000000000000001e-07,
"loss": 0.865,
"step": 1
},
{
"epoch": 0.01444043321299639,
"eval_loss": 0.8391121625900269,
"eval_runtime": 35.414,
"eval_samples_per_second": 18.128,
"eval_steps_per_second": 2.287,
"step": 1
},
{
"epoch": 0.02888086642599278,
"grad_norm": 3.5320334434509277,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.8262,
"step": 2
},
{
"epoch": 0.04332129963898917,
"grad_norm": 3.329249858856201,
"learning_rate": 1.5e-06,
"loss": 0.833,
"step": 3
},
{
"epoch": 0.05776173285198556,
"grad_norm": 3.478191375732422,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.865,
"step": 4
},
{
"epoch": 0.07220216606498195,
"grad_norm": 2.8155710697174072,
"learning_rate": 2.5e-06,
"loss": 0.8168,
"step": 5
},
{
"epoch": 0.08664259927797834,
"grad_norm": 2.3139870166778564,
"learning_rate": 3e-06,
"loss": 0.77,
"step": 6
},
{
"epoch": 0.10108303249097472,
"grad_norm": 2.0602500438690186,
"learning_rate": 3.5e-06,
"loss": 0.7738,
"step": 7
},
{
"epoch": 0.11552346570397112,
"grad_norm": 2.700531244277954,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7574,
"step": 8
},
{
"epoch": 0.1299638989169675,
"grad_norm": 2.2553508281707764,
"learning_rate": 4.5e-06,
"loss": 0.7405,
"step": 9
},
{
"epoch": 0.1444043321299639,
"grad_norm": 2.9637646675109863,
"learning_rate": 5e-06,
"loss": 0.7045,
"step": 10
},
{
"epoch": 0.1588447653429603,
"grad_norm": 2.4607481956481934,
"learning_rate": 5.500000000000001e-06,
"loss": 0.7066,
"step": 11
},
{
"epoch": 0.17328519855595667,
"grad_norm": 1.875809669494629,
"learning_rate": 6e-06,
"loss": 0.6924,
"step": 12
},
{
"epoch": 0.18772563176895307,
"grad_norm": 1.5592328310012817,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.6667,
"step": 13
},
{
"epoch": 0.20216606498194944,
"grad_norm": 1.9023362398147583,
"learning_rate": 7e-06,
"loss": 0.7267,
"step": 14
},
{
"epoch": 0.21660649819494585,
"grad_norm": 1.5530339479446411,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6943,
"step": 15
},
{
"epoch": 0.23104693140794225,
"grad_norm": 1.0332263708114624,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7069,
"step": 16
},
{
"epoch": 0.24548736462093862,
"grad_norm": 1.181794285774231,
"learning_rate": 8.5e-06,
"loss": 0.6616,
"step": 17
},
{
"epoch": 0.259927797833935,
"grad_norm": 1.0393184423446655,
"learning_rate": 9e-06,
"loss": 0.6493,
"step": 18
},
{
"epoch": 0.259927797833935,
"eval_loss": 0.6557387709617615,
"eval_runtime": 33.1067,
"eval_samples_per_second": 19.392,
"eval_steps_per_second": 2.447,
"step": 18
},
{
"epoch": 0.2743682310469314,
"grad_norm": 0.9353659749031067,
"learning_rate": 9.5e-06,
"loss": 0.6798,
"step": 19
},
{
"epoch": 0.2888086642599278,
"grad_norm": 1.0315933227539062,
"learning_rate": 1e-05,
"loss": 0.6541,
"step": 20
},
{
"epoch": 0.30324909747292417,
"grad_norm": 0.8723016977310181,
"learning_rate": 9.999766401714795e-06,
"loss": 0.6554,
"step": 21
},
{
"epoch": 0.3176895306859206,
"grad_norm": 0.8054132461547852,
"learning_rate": 9.999065628686439e-06,
"loss": 0.642,
"step": 22
},
{
"epoch": 0.33212996389891697,
"grad_norm": 0.7567704319953918,
"learning_rate": 9.997897746394684e-06,
"loss": 0.6257,
"step": 23
},
{
"epoch": 0.34657039711191334,
"grad_norm": 0.8518706560134888,
"learning_rate": 9.996262863965651e-06,
"loss": 0.6279,
"step": 24
},
{
"epoch": 0.36101083032490977,
"grad_norm": 0.7092042565345764,
"learning_rate": 9.994161134161635e-06,
"loss": 0.6486,
"step": 25
},
{
"epoch": 0.37545126353790614,
"grad_norm": 0.8764254450798035,
"learning_rate": 9.991592753366822e-06,
"loss": 0.6469,
"step": 26
},
{
"epoch": 0.3898916967509025,
"grad_norm": 0.7938660383224487,
"learning_rate": 9.988557961568956e-06,
"loss": 0.641,
"step": 27
},
{
"epoch": 0.4043321299638989,
"grad_norm": 1.8984333276748657,
"learning_rate": 9.985057042336898e-06,
"loss": 0.6165,
"step": 28
},
{
"epoch": 0.4187725631768953,
"grad_norm": 0.8867112398147583,
"learning_rate": 9.981090322794145e-06,
"loss": 0.6312,
"step": 29
},
{
"epoch": 0.4332129963898917,
"grad_norm": 0.7045503854751587,
"learning_rate": 9.976658173588244e-06,
"loss": 0.626,
"step": 30
},
{
"epoch": 0.44765342960288806,
"grad_norm": 0.7188596129417419,
"learning_rate": 9.97176100885618e-06,
"loss": 0.6136,
"step": 31
},
{
"epoch": 0.4620938628158845,
"grad_norm": 0.7122887372970581,
"learning_rate": 9.966399286185666e-06,
"loss": 0.6296,
"step": 32
},
{
"epoch": 0.47653429602888087,
"grad_norm": 0.6428160071372986,
"learning_rate": 9.960573506572391e-06,
"loss": 0.5887,
"step": 33
},
{
"epoch": 0.49097472924187724,
"grad_norm": 0.6762687563896179,
"learning_rate": 9.954284214373204e-06,
"loss": 0.6112,
"step": 34
},
{
"epoch": 0.5054151624548736,
"grad_norm": 0.6077902317047119,
"learning_rate": 9.947531997255256e-06,
"loss": 0.629,
"step": 35
},
{
"epoch": 0.51985559566787,
"grad_norm": 0.6718001365661621,
"learning_rate": 9.940317486141084e-06,
"loss": 0.6239,
"step": 36
},
{
"epoch": 0.51985559566787,
"eval_loss": 0.6234476566314697,
"eval_runtime": 34.0064,
"eval_samples_per_second": 18.879,
"eval_steps_per_second": 2.382,
"step": 36
},
{
"epoch": 0.5342960288808665,
"grad_norm": 0.6401779055595398,
"learning_rate": 9.932641355149655e-06,
"loss": 0.6241,
"step": 37
},
{
"epoch": 0.5487364620938628,
"grad_norm": 0.7026526927947998,
"learning_rate": 9.924504321533387e-06,
"loss": 0.635,
"step": 38
},
{
"epoch": 0.5631768953068592,
"grad_norm": 0.6177261471748352,
"learning_rate": 9.915907145611117e-06,
"loss": 0.6189,
"step": 39
},
{
"epoch": 0.5776173285198556,
"grad_norm": 0.601033627986908,
"learning_rate": 9.906850630697068e-06,
"loss": 0.574,
"step": 40
},
{
"epoch": 0.592057761732852,
"grad_norm": 0.6070717573165894,
"learning_rate": 9.89733562302578e-06,
"loss": 0.5855,
"step": 41
},
{
"epoch": 0.6064981949458483,
"grad_norm": 0.6149937510490417,
"learning_rate": 9.887363011673046e-06,
"loss": 0.6494,
"step": 42
},
{
"epoch": 0.6209386281588448,
"grad_norm": 0.6047448515892029,
"learning_rate": 9.876933728472826e-06,
"loss": 0.6008,
"step": 43
},
{
"epoch": 0.6353790613718412,
"grad_norm": 0.6176997423171997,
"learning_rate": 9.866048747930194e-06,
"loss": 0.6201,
"step": 44
},
{
"epoch": 0.6498194945848376,
"grad_norm": 0.5998260378837585,
"learning_rate": 9.854709087130261e-06,
"loss": 0.611,
"step": 45
},
{
"epoch": 0.6642599277978339,
"grad_norm": 0.6084288954734802,
"learning_rate": 9.842915805643156e-06,
"loss": 0.6,
"step": 46
},
{
"epoch": 0.6787003610108303,
"grad_norm": 0.574383556842804,
"learning_rate": 9.830670005425012e-06,
"loss": 0.5901,
"step": 47
},
{
"epoch": 0.6931407942238267,
"grad_norm": 0.5806477665901184,
"learning_rate": 9.817972830715003e-06,
"loss": 0.5867,
"step": 48
},
{
"epoch": 0.7075812274368231,
"grad_norm": 0.6218814849853516,
"learning_rate": 9.804825467928423e-06,
"loss": 0.6199,
"step": 49
},
{
"epoch": 0.7220216606498195,
"grad_norm": 0.6401635408401489,
"learning_rate": 9.791229145545832e-06,
"loss": 0.6128,
"step": 50
},
{
"epoch": 0.7364620938628159,
"grad_norm": 0.6286787986755371,
"learning_rate": 9.777185133998268e-06,
"loss": 0.6212,
"step": 51
},
{
"epoch": 0.7509025270758123,
"grad_norm": 0.5961940288543701,
"learning_rate": 9.76269474554854e-06,
"loss": 0.6079,
"step": 52
},
{
"epoch": 0.7653429602888087,
"grad_norm": 0.5932030081748962,
"learning_rate": 9.747759334168602e-06,
"loss": 0.6224,
"step": 53
},
{
"epoch": 0.779783393501805,
"grad_norm": 0.6036499738693237,
"learning_rate": 9.73238029541305e-06,
"loss": 0.611,
"step": 54
},
{
"epoch": 0.779783393501805,
"eval_loss": 0.6119250655174255,
"eval_runtime": 33.2208,
"eval_samples_per_second": 19.325,
"eval_steps_per_second": 2.438,
"step": 54
},
{
"epoch": 0.7942238267148014,
"grad_norm": 0.6094003915786743,
"learning_rate": 9.716559066288716e-06,
"loss": 0.6127,
"step": 55
},
{
"epoch": 0.8086642599277978,
"grad_norm": 0.6151455044746399,
"learning_rate": 9.7002971251204e-06,
"loss": 0.6075,
"step": 56
},
{
"epoch": 0.8231046931407943,
"grad_norm": 0.59092116355896,
"learning_rate": 9.683595991412725e-06,
"loss": 0.5975,
"step": 57
},
{
"epoch": 0.8375451263537906,
"grad_norm": 0.6535966396331787,
"learning_rate": 9.666457225708175e-06,
"loss": 0.5856,
"step": 58
},
{
"epoch": 0.851985559566787,
"grad_norm": 0.6049758791923523,
"learning_rate": 9.648882429441258e-06,
"loss": 0.5877,
"step": 59
},
{
"epoch": 0.8664259927797834,
"grad_norm": 0.6271098256111145,
"learning_rate": 9.630873244788884e-06,
"loss": 0.5895,
"step": 60
},
{
"epoch": 0.8808664259927798,
"grad_norm": 0.6239963173866272,
"learning_rate": 9.612431354516912e-06,
"loss": 0.5971,
"step": 61
},
{
"epoch": 0.8953068592057761,
"grad_norm": 0.6575446724891663,
"learning_rate": 9.593558481822923e-06,
"loss": 0.6224,
"step": 62
},
{
"epoch": 0.9097472924187726,
"grad_norm": 0.6333041787147522,
"learning_rate": 9.574256390175192e-06,
"loss": 0.619,
"step": 63
},
{
"epoch": 0.924187725631769,
"grad_norm": 0.6591416597366333,
"learning_rate": 9.554526883147926e-06,
"loss": 0.6211,
"step": 64
},
{
"epoch": 0.9386281588447654,
"grad_norm": 0.5847511291503906,
"learning_rate": 9.534371804252727e-06,
"loss": 0.612,
"step": 65
},
{
"epoch": 0.9530685920577617,
"grad_norm": 0.6347054243087769,
"learning_rate": 9.513793036766345e-06,
"loss": 0.6117,
"step": 66
},
{
"epoch": 0.9675090252707581,
"grad_norm": 0.634484589099884,
"learning_rate": 9.492792503554695e-06,
"loss": 0.6211,
"step": 67
},
{
"epoch": 0.9819494584837545,
"grad_norm": 0.6583951711654663,
"learning_rate": 9.4713721668932e-06,
"loss": 0.6184,
"step": 68
},
{
"epoch": 0.9963898916967509,
"grad_norm": 0.598239004611969,
"learning_rate": 9.44953402828342e-06,
"loss": 0.6229,
"step": 69
},
{
"epoch": 1.0,
"grad_norm": 0.9969831705093384,
"learning_rate": 9.427280128266049e-06,
"loss": 0.5986,
"step": 70
},
{
"epoch": 1.0144404332129964,
"grad_norm": 0.9471399188041687,
"learning_rate": 9.404612546230244e-06,
"loss": 0.5527,
"step": 71
},
{
"epoch": 1.0288808664259927,
"grad_norm": 0.726841926574707,
"learning_rate": 9.381533400219319e-06,
"loss": 0.5703,
"step": 72
},
{
"epoch": 1.0288808664259927,
"eval_loss": 0.613798201084137,
"eval_runtime": 33.4351,
"eval_samples_per_second": 19.201,
"eval_steps_per_second": 2.423,
"step": 72
},
{
"epoch": 1.0433212996389891,
"grad_norm": 0.7898038625717163,
"learning_rate": 9.358044846732848e-06,
"loss": 0.5478,
"step": 73
},
{
"epoch": 1.0577617328519855,
"grad_norm": 0.7952622175216675,
"learning_rate": 9.334149080525154e-06,
"loss": 0.5393,
"step": 74
},
{
"epoch": 1.0722021660649819,
"grad_norm": 0.8256924748420715,
"learning_rate": 9.309848334400247e-06,
"loss": 0.5613,
"step": 75
},
{
"epoch": 1.0866425992779782,
"grad_norm": 0.8076434135437012,
"learning_rate": 9.285144879003173e-06,
"loss": 0.5346,
"step": 76
},
{
"epoch": 1.1010830324909748,
"grad_norm": 0.7525290250778198,
"learning_rate": 9.26004102260786e-06,
"loss": 0.556,
"step": 77
},
{
"epoch": 1.1155234657039712,
"grad_norm": 0.7560561895370483,
"learning_rate": 9.23453911090143e-06,
"loss": 0.5454,
"step": 78
},
{
"epoch": 1.1299638989169676,
"grad_norm": 0.8224523663520813,
"learning_rate": 9.208641526765024e-06,
"loss": 0.5243,
"step": 79
},
{
"epoch": 1.144404332129964,
"grad_norm": 0.7244045734405518,
"learning_rate": 9.182350690051134e-06,
"loss": 0.5678,
"step": 80
},
{
"epoch": 1.1588447653429603,
"grad_norm": 0.7751045823097229,
"learning_rate": 9.155669057357515e-06,
"loss": 0.5389,
"step": 81
},
{
"epoch": 1.1732851985559567,
"grad_norm": 0.7371921539306641,
"learning_rate": 9.12859912179762e-06,
"loss": 0.5243,
"step": 82
},
{
"epoch": 1.187725631768953,
"grad_norm": 0.7369418740272522,
"learning_rate": 9.101143412767665e-06,
"loss": 0.5342,
"step": 83
},
{
"epoch": 1.2021660649819494,
"grad_norm": 0.6683140397071838,
"learning_rate": 9.073304495710267e-06,
"loss": 0.5284,
"step": 84
},
{
"epoch": 1.2166064981949458,
"grad_norm": 0.7613602876663208,
"learning_rate": 9.045084971874738e-06,
"loss": 0.5063,
"step": 85
},
{
"epoch": 1.2310469314079422,
"grad_norm": 0.7036657929420471,
"learning_rate": 9.016487478074032e-06,
"loss": 0.5281,
"step": 86
},
{
"epoch": 1.2454873646209386,
"grad_norm": 0.7911130785942078,
"learning_rate": 8.987514686438353e-06,
"loss": 0.5421,
"step": 87
},
{
"epoch": 1.259927797833935,
"grad_norm": 0.7307482957839966,
"learning_rate": 8.95816930416548e-06,
"loss": 0.5298,
"step": 88
},
{
"epoch": 1.2743682310469313,
"grad_norm": 0.6484747529029846,
"learning_rate": 8.928454073267801e-06,
"loss": 0.5451,
"step": 89
},
{
"epoch": 1.288808664259928,
"grad_norm": 0.678794264793396,
"learning_rate": 8.898371770316113e-06,
"loss": 0.542,
"step": 90
},
{
"epoch": 1.288808664259928,
"eval_loss": 0.6132499575614929,
"eval_runtime": 32.641,
"eval_samples_per_second": 19.669,
"eval_steps_per_second": 2.482,
"step": 90
},
{
"epoch": 1.303249097472924,
"grad_norm": 0.7240316867828369,
"learning_rate": 8.867925206180166e-06,
"loss": 0.534,
"step": 91
},
{
"epoch": 1.3176895306859207,
"grad_norm": 0.7762596011161804,
"learning_rate": 8.837117225766033e-06,
"loss": 0.5271,
"step": 92
},
{
"epoch": 1.332129963898917,
"grad_norm": 0.695136547088623,
"learning_rate": 8.805950707750268e-06,
"loss": 0.5422,
"step": 93
},
{
"epoch": 1.3465703971119134,
"grad_norm": 0.7882359623908997,
"learning_rate": 8.774428564310939e-06,
"loss": 0.5172,
"step": 94
},
{
"epoch": 1.3610108303249098,
"grad_norm": 0.7503855228424072,
"learning_rate": 8.742553740855507e-06,
"loss": 0.5203,
"step": 95
},
{
"epoch": 1.3754512635379061,
"grad_norm": 0.7731848359107971,
"learning_rate": 8.710329215745612e-06,
"loss": 0.5378,
"step": 96
},
{
"epoch": 1.3898916967509025,
"grad_norm": 0.7920417189598083,
"learning_rate": 8.677758000018777e-06,
"loss": 0.5433,
"step": 97
},
{
"epoch": 1.404332129963899,
"grad_norm": 0.9389582276344299,
"learning_rate": 8.644843137107058e-06,
"loss": 0.5542,
"step": 98
},
{
"epoch": 1.4187725631768953,
"grad_norm": 0.6249178051948547,
"learning_rate": 8.61158770255267e-06,
"loss": 0.5218,
"step": 99
},
{
"epoch": 1.4332129963898916,
"grad_norm": 0.8020306825637817,
"learning_rate": 8.577994803720605e-06,
"loss": 0.535,
"step": 100
},
{
"epoch": 1.447653429602888,
"grad_norm": 0.7037068009376526,
"learning_rate": 8.544067579508292e-06,
"loss": 0.5257,
"step": 101
},
{
"epoch": 1.4620938628158844,
"grad_norm": 0.752699077129364,
"learning_rate": 8.509809200052286e-06,
"loss": 0.5494,
"step": 102
},
{
"epoch": 1.476534296028881,
"grad_norm": 0.7139776349067688,
"learning_rate": 8.475222866432065e-06,
"loss": 0.5384,
"step": 103
},
{
"epoch": 1.4909747292418771,
"grad_norm": 0.6958439350128174,
"learning_rate": 8.440311810370921e-06,
"loss": 0.5371,
"step": 104
},
{
"epoch": 1.5054151624548737,
"grad_norm": 0.6605455279350281,
"learning_rate": 8.405079293933986e-06,
"loss": 0.5355,
"step": 105
},
{
"epoch": 1.5198555956678699,
"grad_norm": 0.7509002685546875,
"learning_rate": 8.36952860922343e-06,
"loss": 0.5441,
"step": 106
},
{
"epoch": 1.5342960288808665,
"grad_norm": 0.6421518325805664,
"learning_rate": 8.333663078070845e-06,
"loss": 0.5286,
"step": 107
},
{
"epoch": 1.5487364620938628,
"grad_norm": 0.6712743043899536,
"learning_rate": 8.297486051726864e-06,
"loss": 0.534,
"step": 108
},
{
"epoch": 1.5487364620938628,
"eval_loss": 0.6092959642410278,
"eval_runtime": 32.5515,
"eval_samples_per_second": 19.723,
"eval_steps_per_second": 2.488,
"step": 108
},
{
"epoch": 1.5631768953068592,
"grad_norm": 0.6272268891334534,
"learning_rate": 8.26100091054801e-06,
"loss": 0.5425,
"step": 109
},
{
"epoch": 1.5776173285198556,
"grad_norm": 0.6285799741744995,
"learning_rate": 8.224211063680854e-06,
"loss": 0.5199,
"step": 110
},
{
"epoch": 1.592057761732852,
"grad_norm": 0.5956050753593445,
"learning_rate": 8.18711994874345e-06,
"loss": 0.5464,
"step": 111
},
{
"epoch": 1.6064981949458483,
"grad_norm": 0.6121729612350464,
"learning_rate": 8.149731031504136e-06,
"loss": 0.5207,
"step": 112
},
{
"epoch": 1.6209386281588447,
"grad_norm": 0.5839301943778992,
"learning_rate": 8.112047805557693e-06,
"loss": 0.5177,
"step": 113
},
{
"epoch": 1.6353790613718413,
"grad_norm": 0.6148134469985962,
"learning_rate": 8.074073791998907e-06,
"loss": 0.5325,
"step": 114
},
{
"epoch": 1.6498194945848375,
"grad_norm": 0.6761514544487,
"learning_rate": 8.035812539093557e-06,
"loss": 0.5691,
"step": 115
},
{
"epoch": 1.664259927797834,
"grad_norm": 0.5942521691322327,
"learning_rate": 7.997267621946871e-06,
"loss": 0.5471,
"step": 116
},
{
"epoch": 1.6787003610108302,
"grad_norm": 0.6566746234893799,
"learning_rate": 7.958442642169469e-06,
"loss": 0.5333,
"step": 117
},
{
"epoch": 1.6931407942238268,
"grad_norm": 0.6552228927612305,
"learning_rate": 7.919341227540828e-06,
"loss": 0.5473,
"step": 118
},
{
"epoch": 1.707581227436823,
"grad_norm": 0.5929325819015503,
"learning_rate": 7.879967031670313e-06,
"loss": 0.5189,
"step": 119
},
{
"epoch": 1.7220216606498195,
"grad_norm": 0.6215195059776306,
"learning_rate": 7.84032373365578e-06,
"loss": 0.5332,
"step": 120
},
{
"epoch": 1.736462093862816,
"grad_norm": 0.6209510564804077,
"learning_rate": 7.800415037739802e-06,
"loss": 0.5198,
"step": 121
},
{
"epoch": 1.7509025270758123,
"grad_norm": 0.6743574142456055,
"learning_rate": 7.760244672963548e-06,
"loss": 0.521,
"step": 122
},
{
"epoch": 1.7653429602888087,
"grad_norm": 0.6415581107139587,
"learning_rate": 7.719816392818354e-06,
"loss": 0.5387,
"step": 123
},
{
"epoch": 1.779783393501805,
"grad_norm": 0.6156140565872192,
"learning_rate": 7.679133974894984e-06,
"loss": 0.5545,
"step": 124
},
{
"epoch": 1.7942238267148014,
"grad_norm": 0.6508094668388367,
"learning_rate": 7.638201220530664e-06,
"loss": 0.5187,
"step": 125
},
{
"epoch": 1.8086642599277978,
"grad_norm": 0.6122965812683105,
"learning_rate": 7.597021954453887e-06,
"loss": 0.5547,
"step": 126
},
{
"epoch": 1.8086642599277978,
"eval_loss": 0.6062851548194885,
"eval_runtime": 33.097,
"eval_samples_per_second": 19.398,
"eval_steps_per_second": 2.447,
"step": 126
},
{
"epoch": 1.8231046931407944,
"grad_norm": 0.6912717819213867,
"learning_rate": 7.555600024427028e-06,
"loss": 0.5406,
"step": 127
},
{
"epoch": 1.8375451263537905,
"grad_norm": 0.5661697387695312,
"learning_rate": 7.513939300886816e-06,
"loss": 0.5387,
"step": 128
},
{
"epoch": 1.8519855595667871,
"grad_norm": 0.6387366652488708,
"learning_rate": 7.472043676582685e-06,
"loss": 0.5472,
"step": 129
},
{
"epoch": 1.8664259927797833,
"grad_norm": 0.6369442939758301,
"learning_rate": 7.42991706621303e-06,
"loss": 0.5419,
"step": 130
},
{
"epoch": 1.8808664259927799,
"grad_norm": 0.5893256664276123,
"learning_rate": 7.387563406059433e-06,
"loss": 0.5406,
"step": 131
},
{
"epoch": 1.895306859205776,
"grad_norm": 0.6452357769012451,
"learning_rate": 7.344986653618844e-06,
"loss": 0.5123,
"step": 132
},
{
"epoch": 1.9097472924187726,
"grad_norm": 0.6396621465682983,
"learning_rate": 7.302190787233808e-06,
"loss": 0.5151,
"step": 133
},
{
"epoch": 1.924187725631769,
"grad_norm": 0.6289088129997253,
"learning_rate": 7.259179805720726e-06,
"loss": 0.5503,
"step": 134
},
{
"epoch": 1.9386281588447654,
"grad_norm": 0.587646484375,
"learning_rate": 7.215957727996208e-06,
"loss": 0.5164,
"step": 135
},
{
"epoch": 1.9530685920577617,
"grad_norm": 0.6339498162269592,
"learning_rate": 7.17252859270155e-06,
"loss": 0.5316,
"step": 136
},
{
"epoch": 1.967509025270758,
"grad_norm": 0.6126948595046997,
"learning_rate": 7.128896457825364e-06,
"loss": 0.5397,
"step": 137
},
{
"epoch": 1.9819494584837545,
"grad_norm": 0.5835312604904175,
"learning_rate": 7.085065400324407e-06,
"loss": 0.5161,
"step": 138
},
{
"epoch": 1.9963898916967509,
"grad_norm": 0.619816780090332,
"learning_rate": 7.041039515742626e-06,
"loss": 0.5228,
"step": 139
},
{
"epoch": 2.0,
"grad_norm": 1.1341562271118164,
"learning_rate": 6.9968229178284775e-06,
"loss": 0.5874,
"step": 140
},
{
"epoch": 2.0144404332129966,
"grad_norm": 1.2002171277999878,
"learning_rate": 6.952419738150546e-06,
"loss": 0.4675,
"step": 141
},
{
"epoch": 2.0288808664259927,
"grad_norm": 0.8529412746429443,
"learning_rate": 6.9078341257114765e-06,
"loss": 0.4614,
"step": 142
},
{
"epoch": 2.0433212996389893,
"grad_norm": 0.7977300882339478,
"learning_rate": 6.863070246560319e-06,
"loss": 0.4506,
"step": 143
},
{
"epoch": 2.0577617328519855,
"grad_norm": 1.0669164657592773,
"learning_rate": 6.818132283403236e-06,
"loss": 0.4592,
"step": 144
},
{
"epoch": 2.0577617328519855,
"eval_loss": 0.63676917552948,
"eval_runtime": 33.7014,
"eval_samples_per_second": 19.05,
"eval_steps_per_second": 2.403,
"step": 144
},
{
"epoch": 2.072202166064982,
"grad_norm": 0.9640716910362244,
"learning_rate": 6.773024435212678e-06,
"loss": 0.4684,
"step": 145
},
{
"epoch": 2.0866425992779782,
"grad_norm": 0.890157163143158,
"learning_rate": 6.7277509168350445e-06,
"loss": 0.4403,
"step": 146
},
{
"epoch": 2.101083032490975,
"grad_norm": 0.9896436929702759,
"learning_rate": 6.6823159585968355e-06,
"loss": 0.4572,
"step": 147
},
{
"epoch": 2.115523465703971,
"grad_norm": 0.8642748594284058,
"learning_rate": 6.636723805909384e-06,
"loss": 0.456,
"step": 148
},
{
"epoch": 2.1299638989169676,
"grad_norm": 0.9155515432357788,
"learning_rate": 6.590978718872166e-06,
"loss": 0.4465,
"step": 149
},
{
"epoch": 2.1444043321299637,
"grad_norm": 0.8667652606964111,
"learning_rate": 6.545084971874738e-06,
"loss": 0.4421,
"step": 150
},
{
"epoch": 2.1588447653429603,
"grad_norm": 0.889424204826355,
"learning_rate": 6.499046853197338e-06,
"loss": 0.4632,
"step": 151
},
{
"epoch": 2.1732851985559565,
"grad_norm": 0.7416921257972717,
"learning_rate": 6.452868664610197e-06,
"loss": 0.4515,
"step": 152
},
{
"epoch": 2.187725631768953,
"grad_norm": 0.7553042769432068,
"learning_rate": 6.406554720971583e-06,
"loss": 0.429,
"step": 153
},
{
"epoch": 2.2021660649819497,
"grad_norm": 0.8044371604919434,
"learning_rate": 6.3601093498246215e-06,
"loss": 0.4522,
"step": 154
},
{
"epoch": 2.216606498194946,
"grad_norm": 0.7472015619277954,
"learning_rate": 6.313536890992935e-06,
"loss": 0.4473,
"step": 155
},
{
"epoch": 2.2310469314079424,
"grad_norm": 0.7978557348251343,
"learning_rate": 6.266841696175132e-06,
"loss": 0.4457,
"step": 156
},
{
"epoch": 2.2454873646209386,
"grad_norm": 0.7715721130371094,
"learning_rate": 6.220028128538188e-06,
"loss": 0.4404,
"step": 157
},
{
"epoch": 2.259927797833935,
"grad_norm": 0.7645951509475708,
"learning_rate": 6.173100562309751e-06,
"loss": 0.4567,
"step": 158
},
{
"epoch": 2.2743682310469313,
"grad_norm": 0.883103609085083,
"learning_rate": 6.1260633823694224e-06,
"loss": 0.4569,
"step": 159
},
{
"epoch": 2.288808664259928,
"grad_norm": 0.695549488067627,
"learning_rate": 6.078920983839032e-06,
"loss": 0.4531,
"step": 160
},
{
"epoch": 2.303249097472924,
"grad_norm": 0.7921261191368103,
"learning_rate": 6.031677771671962e-06,
"loss": 0.454,
"step": 161
},
{
"epoch": 2.3176895306859207,
"grad_norm": 0.7378138303756714,
"learning_rate": 5.984338160241552e-06,
"loss": 0.4454,
"step": 162
},
{
"epoch": 2.3176895306859207,
"eval_loss": 0.6355770826339722,
"eval_runtime": 32.8781,
"eval_samples_per_second": 19.527,
"eval_steps_per_second": 2.464,
"step": 162
},
{
"epoch": 2.332129963898917,
"grad_norm": 0.7265376448631287,
"learning_rate": 5.936906572928625e-06,
"loss": 0.4538,
"step": 163
},
{
"epoch": 2.3465703971119134,
"grad_norm": 0.7353127002716064,
"learning_rate": 5.889387441708162e-06,
"loss": 0.4663,
"step": 164
},
{
"epoch": 2.3610108303249095,
"grad_norm": 0.7641818523406982,
"learning_rate": 5.841785206735192e-06,
"loss": 0.4362,
"step": 165
},
{
"epoch": 2.375451263537906,
"grad_norm": 0.7559286952018738,
"learning_rate": 5.794104315929904e-06,
"loss": 0.4422,
"step": 166
},
{
"epoch": 2.3898916967509027,
"grad_norm": 0.7505171895027161,
"learning_rate": 5.746349224562021e-06,
"loss": 0.4728,
"step": 167
},
{
"epoch": 2.404332129963899,
"grad_norm": 0.773835301399231,
"learning_rate": 5.698524394834531e-06,
"loss": 0.4555,
"step": 168
},
{
"epoch": 2.4187725631768955,
"grad_norm": 0.6625051498413086,
"learning_rate": 5.650634295466717e-06,
"loss": 0.4443,
"step": 169
},
{
"epoch": 2.4332129963898916,
"grad_norm": 0.7382647395133972,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.4505,
"step": 170
},
{
"epoch": 2.4476534296028882,
"grad_norm": 0.7342945337295532,
"learning_rate": 5.554676192762891e-06,
"loss": 0.462,
"step": 171
},
{
"epoch": 2.4620938628158844,
"grad_norm": 0.69790118932724,
"learning_rate": 5.506617155686177e-06,
"loss": 0.4483,
"step": 172
},
{
"epoch": 2.476534296028881,
"grad_norm": 0.7470245957374573,
"learning_rate": 5.458510780649932e-06,
"loss": 0.4432,
"step": 173
},
{
"epoch": 2.490974729241877,
"grad_norm": 0.6948450803756714,
"learning_rate": 5.4103615626808426e-06,
"loss": 0.4543,
"step": 174
},
{
"epoch": 2.5054151624548737,
"grad_norm": 0.6710882186889648,
"learning_rate": 5.362174000808813e-06,
"loss": 0.4442,
"step": 175
},
{
"epoch": 2.51985559566787,
"grad_norm": 0.6929926872253418,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.4448,
"step": 176
},
{
"epoch": 2.5342960288808665,
"grad_norm": 0.68537837266922,
"learning_rate": 5.265701858968944e-06,
"loss": 0.4558,
"step": 177
},
{
"epoch": 2.5487364620938626,
"grad_norm": 0.6701018214225769,
"learning_rate": 5.217426293291869e-06,
"loss": 0.4509,
"step": 178
},
{
"epoch": 2.563176895306859,
"grad_norm": 0.6754850745201111,
"learning_rate": 5.169130411451083e-06,
"loss": 0.42,
"step": 179
},
{
"epoch": 2.577617328519856,
"grad_norm": 0.6687860488891602,
"learning_rate": 5.120818726180662e-06,
"loss": 0.4483,
"step": 180
},
{
"epoch": 2.577617328519856,
"eval_loss": 0.6319621801376343,
"eval_runtime": 32.6147,
"eval_samples_per_second": 19.684,
"eval_steps_per_second": 2.484,
"step": 180
},
{
"epoch": 2.592057761732852,
"grad_norm": 0.6524101495742798,
"learning_rate": 5.072495751691338e-06,
"loss": 0.4223,
"step": 181
},
{
"epoch": 2.606498194945848,
"grad_norm": 0.6371822953224182,
"learning_rate": 5.024166003248703e-06,
"loss": 0.4472,
"step": 182
},
{
"epoch": 2.6209386281588447,
"grad_norm": 0.6347170472145081,
"learning_rate": 4.9758339967512995e-06,
"loss": 0.4339,
"step": 183
},
{
"epoch": 2.6353790613718413,
"grad_norm": 0.647972822189331,
"learning_rate": 4.927504248308663e-06,
"loss": 0.4296,
"step": 184
},
{
"epoch": 2.6498194945848375,
"grad_norm": 0.6498137712478638,
"learning_rate": 4.87918127381934e-06,
"loss": 0.4324,
"step": 185
},
{
"epoch": 2.664259927797834,
"grad_norm": 0.6369062066078186,
"learning_rate": 4.830869588548918e-06,
"loss": 0.4494,
"step": 186
},
{
"epoch": 2.67870036101083,
"grad_norm": 0.6868234276771545,
"learning_rate": 4.782573706708133e-06,
"loss": 0.4591,
"step": 187
},
{
"epoch": 2.693140794223827,
"grad_norm": 0.6149920225143433,
"learning_rate": 4.734298141031057e-06,
"loss": 0.4478,
"step": 188
},
{
"epoch": 2.707581227436823,
"grad_norm": 0.6293914318084717,
"learning_rate": 4.686047402353433e-06,
"loss": 0.4418,
"step": 189
},
{
"epoch": 2.7220216606498195,
"grad_norm": 0.7016595602035522,
"learning_rate": 4.637825999191189e-06,
"loss": 0.447,
"step": 190
},
{
"epoch": 2.7364620938628157,
"grad_norm": 0.6719167232513428,
"learning_rate": 4.589638437319157e-06,
"loss": 0.4494,
"step": 191
},
{
"epoch": 2.7509025270758123,
"grad_norm": 0.6407426595687866,
"learning_rate": 4.541489219350069e-06,
"loss": 0.4524,
"step": 192
},
{
"epoch": 2.765342960288809,
"grad_norm": 0.696622371673584,
"learning_rate": 4.493382844313826e-06,
"loss": 0.4567,
"step": 193
},
{
"epoch": 2.779783393501805,
"grad_norm": 0.6366904377937317,
"learning_rate": 4.445323807237112e-06,
"loss": 0.467,
"step": 194
},
{
"epoch": 2.794223826714801,
"grad_norm": 0.6364036798477173,
"learning_rate": 4.397316598723385e-06,
"loss": 0.448,
"step": 195
},
{
"epoch": 2.808664259927798,
"grad_norm": 0.6690941452980042,
"learning_rate": 4.349365704533285e-06,
"loss": 0.4491,
"step": 196
},
{
"epoch": 2.8231046931407944,
"grad_norm": 0.6544636487960815,
"learning_rate": 4.301475605165471e-06,
"loss": 0.4715,
"step": 197
},
{
"epoch": 2.8375451263537905,
"grad_norm": 0.6416365504264832,
"learning_rate": 4.25365077543798e-06,
"loss": 0.4644,
"step": 198
},
{
"epoch": 2.8375451263537905,
"eval_loss": 0.6322494149208069,
"eval_runtime": 32.6203,
"eval_samples_per_second": 19.681,
"eval_steps_per_second": 2.483,
"step": 198
},
{
"epoch": 2.851985559566787,
"grad_norm": 0.6493151187896729,
"learning_rate": 4.205895684070099e-06,
"loss": 0.4533,
"step": 199
},
{
"epoch": 2.8664259927797833,
"grad_norm": 0.6717944145202637,
"learning_rate": 4.158214793264808e-06,
"loss": 0.4689,
"step": 200
},
{
"epoch": 2.88086642599278,
"grad_norm": 0.635381281375885,
"learning_rate": 4.1106125582918385e-06,
"loss": 0.4512,
"step": 201
},
{
"epoch": 2.895306859205776,
"grad_norm": 0.6783753633499146,
"learning_rate": 4.063093427071376e-06,
"loss": 0.434,
"step": 202
},
{
"epoch": 2.9097472924187726,
"grad_norm": 0.6600508689880371,
"learning_rate": 4.01566183975845e-06,
"loss": 0.4497,
"step": 203
},
{
"epoch": 2.9241877256317688,
"grad_norm": 0.6617497801780701,
"learning_rate": 3.968322228328041e-06,
"loss": 0.453,
"step": 204
},
{
"epoch": 2.9386281588447654,
"grad_norm": 0.6339384317398071,
"learning_rate": 3.92107901616097e-06,
"loss": 0.4517,
"step": 205
},
{
"epoch": 2.953068592057762,
"grad_norm": 0.7040077447891235,
"learning_rate": 3.873936617630578e-06,
"loss": 0.4699,
"step": 206
},
{
"epoch": 2.967509025270758,
"grad_norm": 0.6730422377586365,
"learning_rate": 3.82689943769025e-06,
"loss": 0.4375,
"step": 207
},
{
"epoch": 2.9819494584837543,
"grad_norm": 0.6471662521362305,
"learning_rate": 3.779971871461813e-06,
"loss": 0.4615,
"step": 208
},
{
"epoch": 2.996389891696751,
"grad_norm": 0.6612151861190796,
"learning_rate": 3.7331583038248688e-06,
"loss": 0.4489,
"step": 209
},
{
"epoch": 3.0144404332129966,
"grad_norm": 1.2435836791992188,
"learning_rate": 3.6864631090070656e-06,
"loss": 0.3921,
"step": 210
},
{
"epoch": 3.0288808664259927,
"grad_norm": 1.0806032419204712,
"learning_rate": 3.639890650175379e-06,
"loss": 0.368,
"step": 211
},
{
"epoch": 3.0433212996389893,
"grad_norm": 0.8179118037223816,
"learning_rate": 3.593445279028418e-06,
"loss": 0.3847,
"step": 212
},
{
"epoch": 3.0577617328519855,
"grad_norm": 1.0836800336837769,
"learning_rate": 3.5471313353898056e-06,
"loss": 0.3885,
"step": 213
},
{
"epoch": 3.072202166064982,
"grad_norm": 1.4152075052261353,
"learning_rate": 3.5009531468026646e-06,
"loss": 0.3764,
"step": 214
},
{
"epoch": 3.0866425992779782,
"grad_norm": 1.26155686378479,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.3786,
"step": 215
},
{
"epoch": 3.101083032490975,
"grad_norm": 0.9656947255134583,
"learning_rate": 3.409021281127835e-06,
"loss": 0.3837,
"step": 216
},
{
"epoch": 3.101083032490975,
"eval_loss": 0.6669259071350098,
"eval_runtime": 32.2367,
"eval_samples_per_second": 19.915,
"eval_steps_per_second": 2.513,
"step": 216
},
{
"epoch": 3.115523465703971,
"grad_norm": 1.0473723411560059,
"learning_rate": 3.3632761940906167e-06,
"loss": 0.3779,
"step": 217
},
{
"epoch": 3.1299638989169676,
"grad_norm": 1.022290587425232,
"learning_rate": 3.3176840414031653e-06,
"loss": 0.3825,
"step": 218
},
{
"epoch": 3.1444043321299637,
"grad_norm": 0.869454562664032,
"learning_rate": 3.2722490831649568e-06,
"loss": 0.366,
"step": 219
},
{
"epoch": 3.1588447653429603,
"grad_norm": 0.8059322834014893,
"learning_rate": 3.226975564787322e-06,
"loss": 0.3625,
"step": 220
},
{
"epoch": 3.1732851985559565,
"grad_norm": 0.8983060717582703,
"learning_rate": 3.181867716596765e-06,
"loss": 0.3684,
"step": 221
},
{
"epoch": 3.187725631768953,
"grad_norm": 0.9604324102401733,
"learning_rate": 3.1369297534396823e-06,
"loss": 0.3705,
"step": 222
},
{
"epoch": 3.2021660649819497,
"grad_norm": 0.8212615251541138,
"learning_rate": 3.092165874288525e-06,
"loss": 0.3503,
"step": 223
},
{
"epoch": 3.216606498194946,
"grad_norm": 0.8354775905609131,
"learning_rate": 3.0475802618494564e-06,
"loss": 0.3634,
"step": 224
},
{
"epoch": 3.2310469314079424,
"grad_norm": 0.7785860896110535,
"learning_rate": 3.0031770821715233e-06,
"loss": 0.3739,
"step": 225
},
{
"epoch": 3.2454873646209386,
"grad_norm": 0.8317592144012451,
"learning_rate": 2.9589604842573762e-06,
"loss": 0.3582,
"step": 226
},
{
"epoch": 3.259927797833935,
"grad_norm": 0.8261600136756897,
"learning_rate": 2.914934599675594e-06,
"loss": 0.3508,
"step": 227
},
{
"epoch": 3.2743682310469313,
"grad_norm": 0.742267906665802,
"learning_rate": 2.871103542174637e-06,
"loss": 0.3723,
"step": 228
},
{
"epoch": 3.288808664259928,
"grad_norm": 0.7474186420440674,
"learning_rate": 2.827471407298451e-06,
"loss": 0.382,
"step": 229
},
{
"epoch": 3.303249097472924,
"grad_norm": 0.7781416177749634,
"learning_rate": 2.7840422720037943e-06,
"loss": 0.3699,
"step": 230
},
{
"epoch": 3.3176895306859207,
"grad_norm": 0.7829609513282776,
"learning_rate": 2.7408201942792755e-06,
"loss": 0.368,
"step": 231
},
{
"epoch": 3.332129963898917,
"grad_norm": 0.7839555740356445,
"learning_rate": 2.697809212766195e-06,
"loss": 0.378,
"step": 232
},
{
"epoch": 3.3465703971119134,
"grad_norm": 0.7255096435546875,
"learning_rate": 2.655013346381158e-06,
"loss": 0.3495,
"step": 233
},
{
"epoch": 3.3610108303249095,
"grad_norm": 0.7900465726852417,
"learning_rate": 2.612436593940568e-06,
"loss": 0.3809,
"step": 234
},
{
"epoch": 3.3610108303249095,
"eval_loss": 0.6795952320098877,
"eval_runtime": 32.8352,
"eval_samples_per_second": 19.552,
"eval_steps_per_second": 2.467,
"step": 234
},
{
"epoch": 3.375451263537906,
"grad_norm": 0.7024810910224915,
"learning_rate": 2.57008293378697e-06,
"loss": 0.3493,
"step": 235
},
{
"epoch": 3.3898916967509027,
"grad_norm": 0.7617738842964172,
"learning_rate": 2.5279563234173177e-06,
"loss": 0.3726,
"step": 236
},
{
"epoch": 3.404332129963899,
"grad_norm": 0.7388402223587036,
"learning_rate": 2.4860606991131857e-06,
"loss": 0.3614,
"step": 237
},
{
"epoch": 3.4187725631768955,
"grad_norm": 0.7406207919120789,
"learning_rate": 2.444399975572974e-06,
"loss": 0.3712,
"step": 238
},
{
"epoch": 3.4332129963898916,
"grad_norm": 0.810865581035614,
"learning_rate": 2.402978045546114e-06,
"loss": 0.3916,
"step": 239
},
{
"epoch": 3.4476534296028882,
"grad_norm": 0.7760987877845764,
"learning_rate": 2.3617987794693358e-06,
"loss": 0.3824,
"step": 240
},
{
"epoch": 3.4620938628158844,
"grad_norm": 0.743865966796875,
"learning_rate": 2.320866025105016e-06,
"loss": 0.3822,
"step": 241
},
{
"epoch": 3.476534296028881,
"grad_norm": 0.718098521232605,
"learning_rate": 2.2801836071816476e-06,
"loss": 0.3674,
"step": 242
},
{
"epoch": 3.490974729241877,
"grad_norm": 0.703536331653595,
"learning_rate": 2.2397553270364546e-06,
"loss": 0.3491,
"step": 243
},
{
"epoch": 3.5054151624548737,
"grad_norm": 0.7709066271781921,
"learning_rate": 2.1995849622602017e-06,
"loss": 0.3735,
"step": 244
},
{
"epoch": 3.51985559566787,
"grad_norm": 0.7621421217918396,
"learning_rate": 2.159676266344222e-06,
"loss": 0.3655,
"step": 245
},
{
"epoch": 3.5342960288808665,
"grad_norm": 0.7338027954101562,
"learning_rate": 2.120032968329687e-06,
"loss": 0.3894,
"step": 246
},
{
"epoch": 3.5487364620938626,
"grad_norm": 0.764658510684967,
"learning_rate": 2.0806587724591725e-06,
"loss": 0.3669,
"step": 247
},
{
"epoch": 3.563176895306859,
"grad_norm": 0.7732599377632141,
"learning_rate": 2.0415573578305343e-06,
"loss": 0.3704,
"step": 248
},
{
"epoch": 3.577617328519856,
"grad_norm": 0.7785531878471375,
"learning_rate": 2.0027323780531312e-06,
"loss": 0.3832,
"step": 249
},
{
"epoch": 3.592057761732852,
"grad_norm": 0.7383318543434143,
"learning_rate": 1.9641874609064443e-06,
"loss": 0.3516,
"step": 250
},
{
"epoch": 3.606498194945848,
"grad_norm": 0.7191964983940125,
"learning_rate": 1.9259262080010938e-06,
"loss": 0.3647,
"step": 251
},
{
"epoch": 3.6209386281588447,
"grad_norm": 0.688936710357666,
"learning_rate": 1.887952194442309e-06,
"loss": 0.3646,
"step": 252
},
{
"epoch": 3.6209386281588447,
"eval_loss": 0.6788443326950073,
"eval_runtime": 32.2809,
"eval_samples_per_second": 19.888,
"eval_steps_per_second": 2.509,
"step": 252
},
{
"epoch": 3.6353790613718413,
"grad_norm": 0.7906298637390137,
"learning_rate": 1.8502689684958664e-06,
"loss": 0.3798,
"step": 253
},
{
"epoch": 3.6498194945848375,
"grad_norm": 0.7622320055961609,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.3624,
"step": 254
},
{
"epoch": 3.664259927797834,
"grad_norm": 0.7161827683448792,
"learning_rate": 1.7757889363191484e-06,
"loss": 0.3709,
"step": 255
},
{
"epoch": 3.67870036101083,
"grad_norm": 0.7306489944458008,
"learning_rate": 1.738999089451991e-06,
"loss": 0.3748,
"step": 256
},
{
"epoch": 3.693140794223827,
"grad_norm": 0.746320903301239,
"learning_rate": 1.7025139482731385e-06,
"loss": 0.3755,
"step": 257
},
{
"epoch": 3.707581227436823,
"grad_norm": 0.7224844694137573,
"learning_rate": 1.6663369219291558e-06,
"loss": 0.3566,
"step": 258
},
{
"epoch": 3.7220216606498195,
"grad_norm": 0.7270101308822632,
"learning_rate": 1.6304713907765713e-06,
"loss": 0.3778,
"step": 259
},
{
"epoch": 3.7364620938628157,
"grad_norm": 0.7161392569541931,
"learning_rate": 1.5949207060660138e-06,
"loss": 0.379,
"step": 260
},
{
"epoch": 3.7509025270758123,
"grad_norm": 0.7086169123649597,
"learning_rate": 1.55968818962908e-06,
"loss": 0.3688,
"step": 261
},
{
"epoch": 3.765342960288809,
"grad_norm": 0.7060995697975159,
"learning_rate": 1.5247771335679372e-06,
"loss": 0.3826,
"step": 262
},
{
"epoch": 3.779783393501805,
"grad_norm": 0.6812713146209717,
"learning_rate": 1.4901907999477167e-06,
"loss": 0.3616,
"step": 263
},
{
"epoch": 3.794223826714801,
"grad_norm": 0.6879968643188477,
"learning_rate": 1.4559324204917102e-06,
"loss": 0.3748,
"step": 264
},
{
"epoch": 3.808664259927798,
"grad_norm": 0.7127858400344849,
"learning_rate": 1.4220051962793952e-06,
"loss": 0.3792,
"step": 265
},
{
"epoch": 3.8231046931407944,
"grad_norm": 0.7238702774047852,
"learning_rate": 1.3884122974473307e-06,
"loss": 0.3675,
"step": 266
},
{
"epoch": 3.8375451263537905,
"grad_norm": 0.7225318551063538,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.3832,
"step": 267
},
{
"epoch": 3.851985559566787,
"grad_norm": 0.7232125401496887,
"learning_rate": 1.3222419999812248e-06,
"loss": 0.3887,
"step": 268
},
{
"epoch": 3.8664259927797833,
"grad_norm": 0.7618275284767151,
"learning_rate": 1.2896707842543898e-06,
"loss": 0.3837,
"step": 269
},
{
"epoch": 3.88086642599278,
"grad_norm": 0.7329801321029663,
"learning_rate": 1.257446259144494e-06,
"loss": 0.3886,
"step": 270
},
{
"epoch": 3.88086642599278,
"eval_loss": 0.6797041296958923,
"eval_runtime": 33.1748,
"eval_samples_per_second": 19.352,
"eval_steps_per_second": 2.442,
"step": 270
},
{
"epoch": 3.895306859205776,
"grad_norm": 0.717837929725647,
"learning_rate": 1.225571435689062e-06,
"loss": 0.3648,
"step": 271
},
{
"epoch": 3.9097472924187726,
"grad_norm": 0.7133547067642212,
"learning_rate": 1.1940492922497337e-06,
"loss": 0.3713,
"step": 272
},
{
"epoch": 3.9241877256317688,
"grad_norm": 0.7317136526107788,
"learning_rate": 1.1628827742339688e-06,
"loss": 0.3735,
"step": 273
},
{
"epoch": 3.9386281588447654,
"grad_norm": 0.7250531911849976,
"learning_rate": 1.1320747938198356e-06,
"loss": 0.4011,
"step": 274
},
{
"epoch": 3.953068592057762,
"grad_norm": 0.7316252589225769,
"learning_rate": 1.1016282296838887e-06,
"loss": 0.3912,
"step": 275
},
{
"epoch": 3.967509025270758,
"grad_norm": 0.7132654190063477,
"learning_rate": 1.0715459267321998e-06,
"loss": 0.362,
"step": 276
},
{
"epoch": 3.9819494584837543,
"grad_norm": 0.7502458691596985,
"learning_rate": 1.0418306958345214e-06,
"loss": 0.3629,
"step": 277
},
{
"epoch": 3.996389891696751,
"grad_norm": 0.7093997001647949,
"learning_rate": 1.0124853135616475e-06,
"loss": 0.3937,
"step": 278
},
{
"epoch": 4.014440433212997,
"grad_norm": 1.1966173648834229,
"learning_rate": 9.835125219259694e-07,
"loss": 0.3186,
"step": 279
},
{
"epoch": 4.028880866425993,
"grad_norm": 1.1906567811965942,
"learning_rate": 9.549150281252633e-07,
"loss": 0.3267,
"step": 280
},
{
"epoch": 4.043321299638989,
"grad_norm": 0.9876328706741333,
"learning_rate": 9.266955042897357e-07,
"loss": 0.3445,
"step": 281
},
{
"epoch": 4.0577617328519855,
"grad_norm": 0.8555173873901367,
"learning_rate": 8.988565872323362e-07,
"loss": 0.3231,
"step": 282
},
{
"epoch": 4.072202166064982,
"grad_norm": 0.7850072383880615,
"learning_rate": 8.714008782023797e-07,
"loss": 0.3405,
"step": 283
},
{
"epoch": 4.086642599277979,
"grad_norm": 0.9164103865623474,
"learning_rate": 8.443309426424862e-07,
"loss": 0.315,
"step": 284
},
{
"epoch": 4.101083032490974,
"grad_norm": 1.1176913976669312,
"learning_rate": 8.176493099488664e-07,
"loss": 0.3354,
"step": 285
},
{
"epoch": 4.115523465703971,
"grad_norm": 1.1691384315490723,
"learning_rate": 7.913584732349788e-07,
"loss": 0.3398,
"step": 286
},
{
"epoch": 4.129963898916968,
"grad_norm": 1.0531779527664185,
"learning_rate": 7.654608890985709e-07,
"loss": 0.3309,
"step": 287
},
{
"epoch": 4.144404332129964,
"grad_norm": 1.0438635349273682,
"learning_rate": 7.399589773921412e-07,
"loss": 0.3324,
"step": 288
},
{
"epoch": 4.144404332129964,
"eval_loss": 0.7206189632415771,
"eval_runtime": 32.2481,
"eval_samples_per_second": 19.908,
"eval_steps_per_second": 2.512,
"step": 288
},
{
"epoch": 4.15884476534296,
"grad_norm": 0.8594384789466858,
"learning_rate": 7.148551209968279e-07,
"loss": 0.3138,
"step": 289
},
{
"epoch": 4.1732851985559565,
"grad_norm": 0.8748891949653625,
"learning_rate": 6.901516655997536e-07,
"loss": 0.3291,
"step": 290
},
{
"epoch": 4.187725631768953,
"grad_norm": 0.8275448679924011,
"learning_rate": 6.658509194748463e-07,
"loss": 0.3208,
"step": 291
},
{
"epoch": 4.20216606498195,
"grad_norm": 0.7752466201782227,
"learning_rate": 6.419551532671542e-07,
"loss": 0.3107,
"step": 292
},
{
"epoch": 4.216606498194946,
"grad_norm": 0.8111825585365295,
"learning_rate": 6.184665997806832e-07,
"loss": 0.3228,
"step": 293
},
{
"epoch": 4.231046931407942,
"grad_norm": 0.8536714911460876,
"learning_rate": 5.953874537697573e-07,
"loss": 0.3243,
"step": 294
},
{
"epoch": 4.245487364620939,
"grad_norm": 0.7821819186210632,
"learning_rate": 5.727198717339511e-07,
"loss": 0.3229,
"step": 295
},
{
"epoch": 4.259927797833935,
"grad_norm": 0.8092971444129944,
"learning_rate": 5.504659717165812e-07,
"loss": 0.3172,
"step": 296
},
{
"epoch": 4.274368231046932,
"grad_norm": 0.7584081292152405,
"learning_rate": 5.286278331068018e-07,
"loss": 0.3169,
"step": 297
},
{
"epoch": 4.2888086642599275,
"grad_norm": 0.7885962128639221,
"learning_rate": 5.072074964453055e-07,
"loss": 0.332,
"step": 298
},
{
"epoch": 4.303249097472924,
"grad_norm": 0.7851223349571228,
"learning_rate": 4.862069632336558e-07,
"loss": 0.3334,
"step": 299
},
{
"epoch": 4.317689530685921,
"grad_norm": 0.8225854635238647,
"learning_rate": 4.6562819574727304e-07,
"loss": 0.3221,
"step": 300
},
{
"epoch": 4.332129963898917,
"grad_norm": 0.8033653497695923,
"learning_rate": 4.454731168520754e-07,
"loss": 0.3191,
"step": 301
},
{
"epoch": 4.346570397111913,
"grad_norm": 0.8068827986717224,
"learning_rate": 4.257436098248091e-07,
"loss": 0.3348,
"step": 302
},
{
"epoch": 4.3610108303249095,
"grad_norm": 0.8063839077949524,
"learning_rate": 4.064415181770787e-07,
"loss": 0.3307,
"step": 303
},
{
"epoch": 4.375451263537906,
"grad_norm": 0.8209130167961121,
"learning_rate": 3.875686454830885e-07,
"loss": 0.3416,
"step": 304
},
{
"epoch": 4.389891696750903,
"grad_norm": 0.7289718985557556,
"learning_rate": 3.691267552111183e-07,
"loss": 0.3165,
"step": 305
},
{
"epoch": 4.404332129963899,
"grad_norm": 0.7588977217674255,
"learning_rate": 3.511175705587433e-07,
"loss": 0.3297,
"step": 306
},
{
"epoch": 4.404332129963899,
"eval_loss": 0.7203958034515381,
"eval_runtime": 32.8539,
"eval_samples_per_second": 19.541,
"eval_steps_per_second": 2.465,
"step": 306
},
{
"epoch": 4.418772563176895,
"grad_norm": 0.7277893424034119,
"learning_rate": 3.3354277429182626e-07,
"loss": 0.3184,
"step": 307
},
{
"epoch": 4.433212996389892,
"grad_norm": 0.7536954283714294,
"learning_rate": 3.164040085872755e-07,
"loss": 0.3334,
"step": 308
},
{
"epoch": 4.447653429602888,
"grad_norm": 0.7596468925476074,
"learning_rate": 2.997028748796016e-07,
"loss": 0.3253,
"step": 309
},
{
"epoch": 4.462093862815885,
"grad_norm": 0.7149029970169067,
"learning_rate": 2.834409337112842e-07,
"loss": 0.3337,
"step": 310
},
{
"epoch": 4.4765342960288805,
"grad_norm": 0.8042013645172119,
"learning_rate": 2.676197045869511e-07,
"loss": 0.2985,
"step": 311
},
{
"epoch": 4.490974729241877,
"grad_norm": 0.7098730802536011,
"learning_rate": 2.522406658313997e-07,
"loss": 0.3305,
"step": 312
},
{
"epoch": 4.505415162454874,
"grad_norm": 0.7298141121864319,
"learning_rate": 2.3730525445146146e-07,
"loss": 0.3142,
"step": 313
},
{
"epoch": 4.51985559566787,
"grad_norm": 0.7250217199325562,
"learning_rate": 2.2281486600173207e-07,
"loss": 0.3276,
"step": 314
},
{
"epoch": 4.534296028880867,
"grad_norm": 0.7107163071632385,
"learning_rate": 2.0877085445416889e-07,
"loss": 0.3112,
"step": 315
},
{
"epoch": 4.548736462093863,
"grad_norm": 0.7627288103103638,
"learning_rate": 1.9517453207157865e-07,
"loss": 0.3353,
"step": 316
},
{
"epoch": 4.563176895306859,
"grad_norm": 0.707574188709259,
"learning_rate": 1.8202716928499842e-07,
"loss": 0.3148,
"step": 317
},
{
"epoch": 4.577617328519856,
"grad_norm": 0.7250429391860962,
"learning_rate": 1.6932999457498823e-07,
"loss": 0.3286,
"step": 318
},
{
"epoch": 4.5920577617328515,
"grad_norm": 0.7276959419250488,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.32,
"step": 319
},
{
"epoch": 4.606498194945848,
"grad_norm": 0.7046621441841125,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.3247,
"step": 320
},
{
"epoch": 4.620938628158845,
"grad_norm": 0.7249467372894287,
"learning_rate": 1.3395125206980774e-07,
"loss": 0.3268,
"step": 321
},
{
"epoch": 4.635379061371841,
"grad_norm": 0.7257922291755676,
"learning_rate": 1.230662715271741e-07,
"loss": 0.3195,
"step": 322
},
{
"epoch": 4.649819494584838,
"grad_norm": 0.7479093074798584,
"learning_rate": 1.1263698832695513e-07,
"loss": 0.3463,
"step": 323
},
{
"epoch": 4.664259927797834,
"grad_norm": 0.7323216199874878,
"learning_rate": 1.0266437697422026e-07,
"loss": 0.338,
"step": 324
},
{
"epoch": 4.664259927797834,
"eval_loss": 0.7205774784088135,
"eval_runtime": 32.268,
"eval_samples_per_second": 19.896,
"eval_steps_per_second": 2.51,
"step": 324
},
{
"epoch": 4.67870036101083,
"grad_norm": 0.7437517642974854,
"learning_rate": 9.314936930293283e-08,
"loss": 0.31,
"step": 325
},
{
"epoch": 4.693140794223827,
"grad_norm": 0.7366902232170105,
"learning_rate": 8.40928543888836e-08,
"loss": 0.3344,
"step": 326
},
{
"epoch": 4.707581227436823,
"grad_norm": 0.7316016554832458,
"learning_rate": 7.549567846661388e-08,
"loss": 0.3204,
"step": 327
},
{
"epoch": 4.722021660649819,
"grad_norm": 0.7517760396003723,
"learning_rate": 6.735864485034493e-08,
"loss": 0.3358,
"step": 328
},
{
"epoch": 4.736462093862816,
"grad_norm": 0.7169381976127625,
"learning_rate": 5.968251385891744e-08,
"loss": 0.3166,
"step": 329
},
{
"epoch": 4.750902527075812,
"grad_norm": 0.7038097381591797,
"learning_rate": 5.246800274474439e-08,
"loss": 0.3104,
"step": 330
},
{
"epoch": 4.765342960288809,
"grad_norm": 0.7148619294166565,
"learning_rate": 4.571578562679757e-08,
"loss": 0.3137,
"step": 331
},
{
"epoch": 4.7797833935018055,
"grad_norm": 0.7279098033905029,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.3223,
"step": 332
},
{
"epoch": 4.794223826714801,
"grad_norm": 0.7073100209236145,
"learning_rate": 3.360071381433516e-08,
"loss": 0.3277,
"step": 333
},
{
"epoch": 4.808664259927798,
"grad_norm": 0.74690181016922,
"learning_rate": 2.823899114382078e-08,
"loss": 0.3293,
"step": 334
},
{
"epoch": 4.823104693140794,
"grad_norm": 0.7142198085784912,
"learning_rate": 2.3341826411756863e-08,
"loss": 0.3189,
"step": 335
},
{
"epoch": 4.837545126353791,
"grad_norm": 0.734983503818512,
"learning_rate": 1.8909677205856682e-08,
"loss": 0.3098,
"step": 336
},
{
"epoch": 4.851985559566787,
"grad_norm": 0.744941234588623,
"learning_rate": 1.494295766310161e-08,
"loss": 0.3259,
"step": 337
},
{
"epoch": 4.866425992779783,
"grad_norm": 0.7279512286186218,
"learning_rate": 1.1442038431044856e-08,
"loss": 0.331,
"step": 338
},
{
"epoch": 4.88086642599278,
"grad_norm": 0.7453312873840332,
"learning_rate": 8.407246633178601e-09,
"loss": 0.3394,
"step": 339
},
{
"epoch": 4.8953068592057765,
"grad_norm": 0.7087642550468445,
"learning_rate": 5.838865838366792e-09,
"loss": 0.3178,
"step": 340
},
{
"epoch": 4.909747292418773,
"grad_norm": 0.7283005714416504,
"learning_rate": 3.737136034349109e-09,
"loss": 0.331,
"step": 341
},
{
"epoch": 4.924187725631769,
"grad_norm": 0.6744409203529358,
"learning_rate": 2.102253605316684e-09,
"loss": 0.3052,
"step": 342
},
{
"epoch": 4.924187725631769,
"eval_loss": 0.7205825448036194,
"eval_runtime": 32.4625,
"eval_samples_per_second": 19.777,
"eval_steps_per_second": 2.495,
"step": 342
},
{
"epoch": 4.938628158844765,
"grad_norm": 0.7398666143417358,
"learning_rate": 9.343713135623323e-10,
"loss": 0.3297,
"step": 343
},
{
"epoch": 4.953068592057762,
"grad_norm": 0.7399023771286011,
"learning_rate": 2.335982852064156e-10,
"loss": 0.3273,
"step": 344
},
{
"epoch": 4.967509025270758,
"grad_norm": 0.7517241835594177,
"learning_rate": 0.0,
"loss": 0.3359,
"step": 345
}
],
"logging_steps": 1,
"max_steps": 345,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 69,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.423224727839703e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}