|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.18488560203374163,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0015407133502811801,
      "grad_norm": 48.20575104273089,
      "learning_rate": 4.615384615384616e-06,
      "loss": 1.3971,
      "step": 10
    },
    {
      "epoch": 0.0030814267005623602,
      "grad_norm": 33.11975747134839,
      "learning_rate": 9.230769230769232e-06,
      "loss": 1.4053,
      "step": 20
    },
    {
      "epoch": 0.004622140050843541,
      "grad_norm": 39.826606299507084,
      "learning_rate": 1.3846153846153847e-05,
      "loss": 1.3472,
      "step": 30
    },
    {
      "epoch": 0.0061628534011247205,
      "grad_norm": 6.773742739301677,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 1.1311,
      "step": 40
    },
    {
      "epoch": 0.007703566751405901,
      "grad_norm": 12.278712041952108,
      "learning_rate": 2.307692307692308e-05,
      "loss": 0.9715,
      "step": 50
    },
    {
      "epoch": 0.009244280101687081,
      "grad_norm": 3.776870082439811,
      "learning_rate": 2.7692307692307694e-05,
      "loss": 0.8937,
      "step": 60
    },
    {
      "epoch": 0.010784993451968261,
      "grad_norm": 6.812660145408396,
      "learning_rate": 2.9999955171465948e-05,
      "loss": 0.8472,
      "step": 70
    },
    {
      "epoch": 0.012325706802249441,
      "grad_norm": 14.1920325479403,
      "learning_rate": 2.9999596544801216e-05,
      "loss": 0.8418,
      "step": 80
    },
    {
      "epoch": 0.013866420152530621,
      "grad_norm": 4.760185564415112,
      "learning_rate": 2.999887930004599e-05,
      "loss": 0.8275,
      "step": 90
    },
    {
      "epoch": 0.015407133502811803,
      "grad_norm": 5.64069685533569,
      "learning_rate": 2.9997803454348518e-05,
      "loss": 0.8085,
      "step": 100
    },
    {
      "epoch": 0.01694784685309298,
      "grad_norm": 8.910755610595423,
      "learning_rate": 2.9996369033430674e-05,
      "loss": 0.8105,
      "step": 110
    },
    {
      "epoch": 0.018488560203374162,
      "grad_norm": 7.128375140746789,
      "learning_rate": 2.9994576071587345e-05,
      "loss": 0.7647,
      "step": 120
    },
    {
      "epoch": 0.020029273553655344,
      "grad_norm": 4.883749519467239,
      "learning_rate": 2.9992424611685575e-05,
      "loss": 0.7472,
      "step": 130
    },
    {
      "epoch": 0.021569986903936522,
      "grad_norm": 6.669698248447361,
      "learning_rate": 2.9989914705163582e-05,
      "loss": 0.7644,
      "step": 140
    },
    {
      "epoch": 0.023110700254217704,
      "grad_norm": 8.814324372744018,
      "learning_rate": 2.9987046412029506e-05,
      "loss": 0.7642,
      "step": 150
    },
    {
      "epoch": 0.024651413604498882,
      "grad_norm": 5.408352128962619,
      "learning_rate": 2.9983819800859976e-05,
      "loss": 0.7506,
      "step": 160
    },
    {
      "epoch": 0.026192126954780064,
      "grad_norm": 4.916006313310609,
      "learning_rate": 2.998023494879848e-05,
      "loss": 0.7086,
      "step": 170
    },
    {
      "epoch": 0.027732840305061242,
      "grad_norm": 6.4968532868316515,
      "learning_rate": 2.9976291941553508e-05,
      "loss": 0.7302,
      "step": 180
    },
    {
      "epoch": 0.029273553655342423,
      "grad_norm": 5.414282856209666,
      "learning_rate": 2.9971990873396512e-05,
      "loss": 0.7389,
      "step": 190
    },
    {
      "epoch": 0.030814267005623605,
      "grad_norm": 35.78019922037886,
      "learning_rate": 2.996733184715964e-05,
      "loss": 0.7247,
      "step": 200
    },
    {
      "epoch": 0.03235498035590478,
      "grad_norm": 4.344484335761467,
      "learning_rate": 2.9962314974233306e-05,
      "loss": 0.7239,
      "step": 210
    },
    {
      "epoch": 0.03389569370618596,
      "grad_norm": 4.761393885682967,
      "learning_rate": 2.995694037456349e-05,
      "loss": 0.7219,
      "step": 220
    },
    {
      "epoch": 0.03543640705646715,
      "grad_norm": 6.082748106236669,
      "learning_rate": 2.995120817664889e-05,
      "loss": 0.7036,
      "step": 230
    },
    {
      "epoch": 0.036977120406748325,
      "grad_norm": 7.925002223835823,
      "learning_rate": 2.9945118517537857e-05,
      "loss": 0.6795,
      "step": 240
    },
    {
      "epoch": 0.0385178337570295,
      "grad_norm": 4.33748938479485,
      "learning_rate": 2.9938671542825102e-05,
      "loss": 0.6894,
      "step": 250
    },
    {
      "epoch": 0.04005854710731069,
      "grad_norm": 3.419861222736367,
      "learning_rate": 2.993186740664821e-05,
      "loss": 0.674,
      "step": 260
    },
    {
      "epoch": 0.041599260457591866,
      "grad_norm": 4.418012569894865,
      "learning_rate": 2.9924706271683993e-05,
      "loss": 0.7091,
      "step": 270
    },
    {
      "epoch": 0.043139973807873044,
      "grad_norm": 5.5535932232942065,
      "learning_rate": 2.9917188309144548e-05,
      "loss": 0.7114,
      "step": 280
    },
    {
      "epoch": 0.04468068715815422,
      "grad_norm": 5.346051172390458,
      "learning_rate": 2.990931369877321e-05,
      "loss": 0.7092,
      "step": 290
    },
    {
      "epoch": 0.04622140050843541,
      "grad_norm": 5.04585392782648,
      "learning_rate": 2.9901082628840216e-05,
      "loss": 0.7079,
      "step": 300
    },
    {
      "epoch": 0.047762113858716586,
      "grad_norm": 5.3108067226217095,
      "learning_rate": 2.989249529613823e-05,
      "loss": 0.7044,
      "step": 310
    },
    {
      "epoch": 0.049302827208997764,
      "grad_norm": 12.936374781230974,
      "learning_rate": 2.9883551905977647e-05,
      "loss": 0.6795,
      "step": 320
    },
    {
      "epoch": 0.05084354055927895,
      "grad_norm": 4.969368758151501,
      "learning_rate": 2.987425267218164e-05,
      "loss": 0.7365,
      "step": 330
    },
    {
      "epoch": 0.05238425390956013,
      "grad_norm": 17.59963354249395,
      "learning_rate": 2.9864597817081083e-05,
      "loss": 0.6459,
      "step": 340
    },
    {
      "epoch": 0.053924967259841305,
      "grad_norm": 4.287709062556245,
      "learning_rate": 2.985458757150924e-05,
      "loss": 0.7151,
      "step": 350
    },
    {
      "epoch": 0.055465680610122484,
      "grad_norm": 2.353889201419343,
      "learning_rate": 2.9844222174796224e-05,
      "loss": 0.6982,
      "step": 360
    },
    {
      "epoch": 0.05700639396040367,
      "grad_norm": 6.487203178892158,
      "learning_rate": 2.983350187476328e-05,
      "loss": 0.6946,
      "step": 370
    },
    {
      "epoch": 0.05854710731068485,
      "grad_norm": 8.604226568596971,
      "learning_rate": 2.982242692771688e-05,
      "loss": 0.7024,
      "step": 380
    },
    {
      "epoch": 0.060087820660966025,
      "grad_norm": 3.7381224404208107,
      "learning_rate": 2.9810997598442558e-05,
      "loss": 0.6813,
      "step": 390
    },
    {
      "epoch": 0.06162853401124721,
      "grad_norm": 7.21302363512955,
      "learning_rate": 2.9799214160198622e-05,
      "loss": 0.6572,
      "step": 400
    },
    {
      "epoch": 0.06316924736152839,
      "grad_norm": 7.446657585655329,
      "learning_rate": 2.9787076894709592e-05,
      "loss": 0.6612,
      "step": 410
    },
    {
      "epoch": 0.06470996071180957,
      "grad_norm": 16.149479699980372,
      "learning_rate": 2.977458609215946e-05,
      "loss": 0.6823,
      "step": 420
    },
    {
      "epoch": 0.06625067406209074,
      "grad_norm": 3.654646693112352,
      "learning_rate": 2.9761742051184786e-05,
      "loss": 0.6941,
      "step": 430
    },
    {
      "epoch": 0.06779138741237192,
      "grad_norm": 7.468345726914931,
      "learning_rate": 2.9748545078867524e-05,
      "loss": 0.6254,
      "step": 440
    },
    {
      "epoch": 0.06933210076265311,
      "grad_norm": 6.559292545318311,
      "learning_rate": 2.9734995490727696e-05,
      "loss": 0.6977,
      "step": 450
    },
    {
      "epoch": 0.0708728141129343,
      "grad_norm": 8.749994768413615,
      "learning_rate": 2.9721093610715844e-05,
      "loss": 0.6742,
      "step": 460
    },
    {
      "epoch": 0.07241352746321547,
      "grad_norm": 14.746822726186753,
      "learning_rate": 2.9706839771205282e-05,
      "loss": 0.6986,
      "step": 470
    },
    {
      "epoch": 0.07395424081349665,
      "grad_norm": 2.1125271451210494,
      "learning_rate": 2.9692234312984156e-05,
      "loss": 0.6708,
      "step": 480
    },
    {
      "epoch": 0.07549495416377783,
      "grad_norm": 4.191787346884265,
      "learning_rate": 2.9677277585247296e-05,
      "loss": 0.6839,
      "step": 490
    },
    {
      "epoch": 0.077035667514059,
      "grad_norm": 6.099946769106883,
      "learning_rate": 2.9661969945587867e-05,
      "loss": 0.7253,
      "step": 500
    },
    {
      "epoch": 0.07857638086434018,
      "grad_norm": 8.272898482647978,
      "learning_rate": 2.9646311759988804e-05,
      "loss": 0.6972,
      "step": 510
    },
    {
      "epoch": 0.08011709421462138,
      "grad_norm": 12.38761120543235,
      "learning_rate": 2.9630303402814095e-05,
      "loss": 0.7174,
      "step": 520
    },
    {
      "epoch": 0.08165780756490255,
      "grad_norm": 7.826170900143438,
      "learning_rate": 2.961394525679979e-05,
      "loss": 0.7227,
      "step": 530
    },
    {
      "epoch": 0.08319852091518373,
      "grad_norm": 3.926770198625081,
      "learning_rate": 2.9597237713044888e-05,
      "loss": 0.6682,
      "step": 540
    },
    {
      "epoch": 0.08473923426546491,
      "grad_norm": 3.3408474311965692,
      "learning_rate": 2.9580181171001962e-05,
      "loss": 0.6634,
      "step": 550
    },
    {
      "epoch": 0.08627994761574609,
      "grad_norm": 15.954087186336258,
      "learning_rate": 2.956277603846761e-05,
      "loss": 0.7005,
      "step": 560
    },
    {
      "epoch": 0.08782066096602727,
      "grad_norm": 3.3366594091813475,
      "learning_rate": 2.9545022731572723e-05,
      "loss": 0.6752,
      "step": 570
    },
    {
      "epoch": 0.08936137431630845,
      "grad_norm": 4.359220549879328,
      "learning_rate": 2.9526921674772522e-05,
      "loss": 0.6985,
      "step": 580
    },
    {
      "epoch": 0.09090208766658964,
      "grad_norm": 5.585924353950877,
      "learning_rate": 2.95084733008364e-05,
      "loss": 0.6729,
      "step": 590
    },
    {
      "epoch": 0.09244280101687082,
      "grad_norm": 2.5002732127626075,
      "learning_rate": 2.94896780508376e-05,
      "loss": 0.6881,
      "step": 600
    },
    {
      "epoch": 0.093983514367152,
      "grad_norm": 8.22666868773466,
      "learning_rate": 2.9470536374142656e-05,
      "loss": 0.6918,
      "step": 610
    },
    {
      "epoch": 0.09552422771743317,
      "grad_norm": 14.050897991765508,
      "learning_rate": 2.9451048728400644e-05,
      "loss": 0.6584,
      "step": 620
    },
    {
      "epoch": 0.09706494106771435,
      "grad_norm": 3.117445397285045,
      "learning_rate": 2.9431215579532253e-05,
      "loss": 0.6495,
      "step": 630
    },
    {
      "epoch": 0.09860565441799553,
      "grad_norm": 7.000286096448409,
      "learning_rate": 2.9411037401718628e-05,
      "loss": 0.6568,
      "step": 640
    },
    {
      "epoch": 0.1001463677682767,
      "grad_norm": 3.09698455955492,
      "learning_rate": 2.939051467739006e-05,
      "loss": 0.7095,
      "step": 650
    },
    {
      "epoch": 0.1016870811185579,
      "grad_norm": 4.163429426840949,
      "learning_rate": 2.936964789721442e-05,
      "loss": 0.6573,
      "step": 660
    },
    {
      "epoch": 0.10322779446883908,
      "grad_norm": 3.282543980811516,
      "learning_rate": 2.934843756008546e-05,
      "loss": 0.6901,
      "step": 670
    },
    {
      "epoch": 0.10476850781912025,
      "grad_norm": 7.4918954929572115,
      "learning_rate": 2.932688417311085e-05,
      "loss": 0.6826,
      "step": 680
    },
    {
      "epoch": 0.10630922116940143,
      "grad_norm": 49.53066344289919,
      "learning_rate": 2.9304988251600084e-05,
      "loss": 0.6668,
      "step": 690
    },
    {
      "epoch": 0.10784993451968261,
      "grad_norm": 23.218366206451197,
      "learning_rate": 2.9282750319052154e-05,
      "loss": 0.6643,
      "step": 700
    },
    {
      "epoch": 0.10939064786996379,
      "grad_norm": 2.59944116748961,
      "learning_rate": 2.9260170907143012e-05,
      "loss": 0.6709,
      "step": 710
    },
    {
      "epoch": 0.11093136122024497,
      "grad_norm": 4.59006482180365,
      "learning_rate": 2.9237250555712887e-05,
      "loss": 0.6878,
      "step": 720
    },
    {
      "epoch": 0.11247207457052616,
      "grad_norm": 3.1508976338573507,
      "learning_rate": 2.9213989812753366e-05,
      "loss": 0.6512,
      "step": 730
    },
    {
      "epoch": 0.11401278792080734,
      "grad_norm": 4.5342197552006835,
      "learning_rate": 2.9190389234394285e-05,
      "loss": 0.6687,
      "step": 740
    },
    {
      "epoch": 0.11555350127108852,
      "grad_norm": 6.514898099534209,
      "learning_rate": 2.9166449384890446e-05,
      "loss": 0.6868,
      "step": 750
    },
    {
      "epoch": 0.1170942146213697,
      "grad_norm": 5.1251296980659005,
      "learning_rate": 2.9142170836608115e-05,
      "loss": 0.6869,
      "step": 760
    },
    {
      "epoch": 0.11863492797165087,
      "grad_norm": 5.896478552497953,
      "learning_rate": 2.9117554170011352e-05,
      "loss": 0.667,
      "step": 770
    },
    {
      "epoch": 0.12017564132193205,
      "grad_norm": 3.4672489526983483,
      "learning_rate": 2.909259997364811e-05,
      "loss": 0.6674,
      "step": 780
    },
    {
      "epoch": 0.12171635467221323,
      "grad_norm": 5.413662982418376,
      "learning_rate": 2.9067308844136193e-05,
      "loss": 0.6891,
      "step": 790
    },
    {
      "epoch": 0.12325706802249442,
      "grad_norm": 5.807164850831801,
      "learning_rate": 2.9041681386148966e-05,
      "loss": 0.6447,
      "step": 800
    },
    {
      "epoch": 0.1247977813727756,
      "grad_norm": 5.542855195057769,
      "learning_rate": 2.9015718212400918e-05,
      "loss": 0.6486,
      "step": 810
    },
    {
      "epoch": 0.12633849472305678,
      "grad_norm": 3.60243198052599,
      "learning_rate": 2.8989419943632992e-05,
      "loss": 0.6552,
      "step": 820
    },
    {
      "epoch": 0.12787920807333797,
      "grad_norm": 10.54124596601907,
      "learning_rate": 2.896278720859776e-05,
      "loss": 0.667,
      "step": 830
    },
    {
      "epoch": 0.12941992142361913,
      "grad_norm": 8.542824965925352,
      "learning_rate": 2.8935820644044398e-05,
      "loss": 0.697,
      "step": 840
    },
    {
      "epoch": 0.13096063477390033,
      "grad_norm": 6.327341591650607,
      "learning_rate": 2.890852089470343e-05,
      "loss": 0.65,
      "step": 850
    },
    {
      "epoch": 0.1325013481241815,
      "grad_norm": 7.687827002540841,
      "learning_rate": 2.888088861327135e-05,
      "loss": 0.6435,
      "step": 860
    },
    {
      "epoch": 0.13404206147446268,
      "grad_norm": 3.356453126127434,
      "learning_rate": 2.885292446039499e-05,
      "loss": 0.6721,
      "step": 870
    },
    {
      "epoch": 0.13558277482474385,
      "grad_norm": 8.406402473059597,
      "learning_rate": 2.8824629104655736e-05,
      "loss": 0.6694,
      "step": 880
    },
    {
      "epoch": 0.13712348817502504,
      "grad_norm": 11.653019434398818,
      "learning_rate": 2.8796003222553558e-05,
      "loss": 0.6531,
      "step": 890
    },
    {
      "epoch": 0.13866420152530623,
      "grad_norm": 6.371551478518258,
      "learning_rate": 2.8767047498490798e-05,
      "loss": 0.6568,
      "step": 900
    },
    {
      "epoch": 0.1402049148755874,
      "grad_norm": 26.71523757066426,
      "learning_rate": 2.8737762624755846e-05,
      "loss": 0.6857,
      "step": 910
    },
    {
      "epoch": 0.1417456282258686,
      "grad_norm": 4.417578021376778,
      "learning_rate": 2.8708149301506573e-05,
      "loss": 0.665,
      "step": 920
    },
    {
      "epoch": 0.14328634157614975,
      "grad_norm": 5.335327649767265,
      "learning_rate": 2.8678208236753577e-05,
      "loss": 0.7014,
      "step": 930
    },
    {
      "epoch": 0.14482705492643094,
      "grad_norm": 4.155520038033631,
      "learning_rate": 2.8647940146343278e-05,
      "loss": 0.6767,
      "step": 940
    },
    {
      "epoch": 0.1463677682767121,
      "grad_norm": 3.815046866792752,
      "learning_rate": 2.86173457539408e-05,
      "loss": 0.6557,
      "step": 950
    },
    {
      "epoch": 0.1479084816269933,
      "grad_norm": 3.7651811393538552,
      "learning_rate": 2.8586425791012648e-05,
      "loss": 0.675,
      "step": 960
    },
    {
      "epoch": 0.1494491949772745,
      "grad_norm": 4.382966553943605,
      "learning_rate": 2.8555180996809246e-05,
      "loss": 0.6313,
      "step": 970
    },
    {
      "epoch": 0.15098990832755566,
      "grad_norm": 10.103890347717131,
      "learning_rate": 2.8523612118347245e-05,
      "loss": 0.645,
      "step": 980
    },
    {
      "epoch": 0.15253062167783685,
      "grad_norm": 6.294074332584702,
      "learning_rate": 2.8491719910391685e-05,
      "loss": 0.659,
      "step": 990
    },
    {
      "epoch": 0.154071335028118,
      "grad_norm": 2.5808531227457565,
      "learning_rate": 2.845950513543791e-05,
      "loss": 0.6688,
      "step": 1000
    },
    {
      "epoch": 0.1556120483783992,
      "grad_norm": 2.927888770737132,
      "learning_rate": 2.842696856369338e-05,
      "loss": 0.6381,
      "step": 1010
    },
    {
      "epoch": 0.15715276172868037,
      "grad_norm": 14.062433268070832,
      "learning_rate": 2.8394110973059233e-05,
      "loss": 0.6946,
      "step": 1020
    },
    {
      "epoch": 0.15869347507896156,
      "grad_norm": 2.4470437840581054,
      "learning_rate": 2.8360933149111695e-05,
      "loss": 0.6844,
      "step": 1030
    },
    {
      "epoch": 0.16023418842924275,
      "grad_norm": 3.8078577514013343,
      "learning_rate": 2.8327435885083292e-05,
      "loss": 0.64,
      "step": 1040
    },
    {
      "epoch": 0.16177490177952392,
      "grad_norm": 25.680768915471432,
      "learning_rate": 2.8293619981843887e-05,
      "loss": 0.6329,
      "step": 1050
    },
    {
      "epoch": 0.1633156151298051,
      "grad_norm": 5.0369491995422715,
      "learning_rate": 2.8259486247881537e-05,
      "loss": 0.6604,
      "step": 1060
    },
    {
      "epoch": 0.16485632848008627,
      "grad_norm": 3.9026521516961608,
      "learning_rate": 2.8225035499283155e-05,
      "loss": 0.6564,
      "step": 1070
    },
    {
      "epoch": 0.16639704183036746,
      "grad_norm": 4.945417598915296,
      "learning_rate": 2.8190268559715017e-05,
      "loss": 0.6655,
      "step": 1080
    },
    {
      "epoch": 0.16793775518064863,
      "grad_norm": 3.222466850494984,
      "learning_rate": 2.815518626040304e-05,
      "loss": 0.6603,
      "step": 1090
    },
    {
      "epoch": 0.16947846853092982,
      "grad_norm": 6.539136587655263,
      "learning_rate": 2.811978944011293e-05,
      "loss": 0.7036,
      "step": 1100
    },
    {
      "epoch": 0.171019181881211,
      "grad_norm": 8.14211829139052,
      "learning_rate": 2.8084078945130117e-05,
      "loss": 0.6356,
      "step": 1110
    },
    {
      "epoch": 0.17255989523149218,
      "grad_norm": 4.1954200205175605,
      "learning_rate": 2.8048055629239543e-05,
      "loss": 0.6591,
      "step": 1120
    },
    {
      "epoch": 0.17410060858177337,
      "grad_norm": 4.333940585698679,
      "learning_rate": 2.8011720353705224e-05,
      "loss": 0.6575,
      "step": 1130
    },
    {
      "epoch": 0.17564132193205453,
      "grad_norm": 4.2805487065333,
      "learning_rate": 2.797507398724966e-05,
      "loss": 0.6624,
      "step": 1140
    },
    {
      "epoch": 0.17718203528233573,
      "grad_norm": 4.7173213185412,
      "learning_rate": 2.7938117406033086e-05,
      "loss": 0.623,
      "step": 1150
    },
    {
      "epoch": 0.1787227486326169,
      "grad_norm": 99.71383370833006,
      "learning_rate": 2.7900851493632508e-05,
      "loss": 0.6591,
      "step": 1160
    },
    {
      "epoch": 0.18026346198289808,
      "grad_norm": 3.6747989781213954,
      "learning_rate": 2.786327714102058e-05,
      "loss": 0.692,
      "step": 1170
    },
    {
      "epoch": 0.18180417533317927,
      "grad_norm": 2.5009166944220604,
      "learning_rate": 2.78253952465443e-05,
      "loss": 0.6614,
      "step": 1180
    },
    {
      "epoch": 0.18334488868346044,
      "grad_norm": 3.144011687958325,
      "learning_rate": 2.7787206715903543e-05,
      "loss": 0.6406,
      "step": 1190
    },
    {
      "epoch": 0.18488560203374163,
      "grad_norm": 4.063731315051197,
      "learning_rate": 2.7748712462129396e-05,
      "loss": 0.6444,
      "step": 1200
    }
  ],
  "logging_steps": 10,
  "max_steps": 6490,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0934282360979456e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|