|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.998994974874372,
  "eval_steps": 500,
  "global_step": 1119,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02680067001675042,
      "grad_norm": 31.10676739208888,
      "learning_rate": 5e-06,
      "loss": 1.0442,
      "step": 10
    },
    {
      "epoch": 0.05360134003350084,
      "grad_norm": 4.120159718355311,
      "learning_rate": 5e-06,
      "loss": 0.955,
      "step": 20
    },
    {
      "epoch": 0.08040201005025126,
      "grad_norm": 2.339286993290118,
      "learning_rate": 5e-06,
      "loss": 0.9144,
      "step": 30
    },
    {
      "epoch": 0.10720268006700168,
      "grad_norm": 1.0759658796503169,
      "learning_rate": 5e-06,
      "loss": 0.8882,
      "step": 40
    },
    {
      "epoch": 0.13400335008375208,
      "grad_norm": 0.6453086287600575,
      "learning_rate": 5e-06,
      "loss": 0.8699,
      "step": 50
    },
    {
      "epoch": 0.16080402010050251,
      "grad_norm": 1.1160652028087201,
      "learning_rate": 5e-06,
      "loss": 0.8652,
      "step": 60
    },
    {
      "epoch": 0.18760469011725292,
      "grad_norm": 0.6258812701119981,
      "learning_rate": 5e-06,
      "loss": 0.848,
      "step": 70
    },
    {
      "epoch": 0.21440536013400335,
      "grad_norm": 0.5590720583080133,
      "learning_rate": 5e-06,
      "loss": 0.8463,
      "step": 80
    },
    {
      "epoch": 0.24120603015075376,
      "grad_norm": 0.6795899721647743,
      "learning_rate": 5e-06,
      "loss": 0.8393,
      "step": 90
    },
    {
      "epoch": 0.26800670016750416,
      "grad_norm": 1.1809098922748584,
      "learning_rate": 5e-06,
      "loss": 0.8335,
      "step": 100
    },
    {
      "epoch": 0.2948073701842546,
      "grad_norm": 0.8420170258665194,
      "learning_rate": 5e-06,
      "loss": 0.8354,
      "step": 110
    },
    {
      "epoch": 0.32160804020100503,
      "grad_norm": 0.6851931492236875,
      "learning_rate": 5e-06,
      "loss": 0.8311,
      "step": 120
    },
    {
      "epoch": 0.34840871021775544,
      "grad_norm": 0.9823271515850558,
      "learning_rate": 5e-06,
      "loss": 0.8338,
      "step": 130
    },
    {
      "epoch": 0.37520938023450584,
      "grad_norm": 0.7764734742568185,
      "learning_rate": 5e-06,
      "loss": 0.8308,
      "step": 140
    },
    {
      "epoch": 0.4020100502512563,
      "grad_norm": 0.8463569347395117,
      "learning_rate": 5e-06,
      "loss": 0.8223,
      "step": 150
    },
    {
      "epoch": 0.4288107202680067,
      "grad_norm": 0.7229327877216101,
      "learning_rate": 5e-06,
      "loss": 0.8265,
      "step": 160
    },
    {
      "epoch": 0.4556113902847571,
      "grad_norm": 0.7406267992443841,
      "learning_rate": 5e-06,
      "loss": 0.8192,
      "step": 170
    },
    {
      "epoch": 0.4824120603015075,
      "grad_norm": 0.5245160572062765,
      "learning_rate": 5e-06,
      "loss": 0.8175,
      "step": 180
    },
    {
      "epoch": 0.509212730318258,
      "grad_norm": 0.5446187756240741,
      "learning_rate": 5e-06,
      "loss": 0.8173,
      "step": 190
    },
    {
      "epoch": 0.5360134003350083,
      "grad_norm": 0.5387389341379448,
      "learning_rate": 5e-06,
      "loss": 0.8152,
      "step": 200
    },
    {
      "epoch": 0.5628140703517588,
      "grad_norm": 0.5942914061122081,
      "learning_rate": 5e-06,
      "loss": 0.8158,
      "step": 210
    },
    {
      "epoch": 0.5896147403685092,
      "grad_norm": 0.7289915117140335,
      "learning_rate": 5e-06,
      "loss": 0.8157,
      "step": 220
    },
    {
      "epoch": 0.6164154103852596,
      "grad_norm": 0.6166043522729069,
      "learning_rate": 5e-06,
      "loss": 0.8154,
      "step": 230
    },
    {
      "epoch": 0.6432160804020101,
      "grad_norm": 0.5402182876037194,
      "learning_rate": 5e-06,
      "loss": 0.81,
      "step": 240
    },
    {
      "epoch": 0.6700167504187605,
      "grad_norm": 0.702136667602188,
      "learning_rate": 5e-06,
      "loss": 0.8086,
      "step": 250
    },
    {
      "epoch": 0.6968174204355109,
      "grad_norm": 0.5548041175483877,
      "learning_rate": 5e-06,
      "loss": 0.8067,
      "step": 260
    },
    {
      "epoch": 0.7236180904522613,
      "grad_norm": 0.5933880054884063,
      "learning_rate": 5e-06,
      "loss": 0.8046,
      "step": 270
    },
    {
      "epoch": 0.7504187604690117,
      "grad_norm": 0.6218650700836093,
      "learning_rate": 5e-06,
      "loss": 0.8045,
      "step": 280
    },
    {
      "epoch": 0.7772194304857621,
      "grad_norm": 0.6636079178178366,
      "learning_rate": 5e-06,
      "loss": 0.8029,
      "step": 290
    },
    {
      "epoch": 0.8040201005025126,
      "grad_norm": 0.7578337185188113,
      "learning_rate": 5e-06,
      "loss": 0.8017,
      "step": 300
    },
    {
      "epoch": 0.830820770519263,
      "grad_norm": 0.5964607129683942,
      "learning_rate": 5e-06,
      "loss": 0.8039,
      "step": 310
    },
    {
      "epoch": 0.8576214405360134,
      "grad_norm": 0.5731797399351095,
      "learning_rate": 5e-06,
      "loss": 0.8018,
      "step": 320
    },
    {
      "epoch": 0.8844221105527639,
      "grad_norm": 1.2550493135376506,
      "learning_rate": 5e-06,
      "loss": 0.7994,
      "step": 330
    },
    {
      "epoch": 0.9112227805695142,
      "grad_norm": 0.9368889967347727,
      "learning_rate": 5e-06,
      "loss": 0.8074,
      "step": 340
    },
    {
      "epoch": 0.9380234505862647,
      "grad_norm": 0.5921834072388994,
      "learning_rate": 5e-06,
      "loss": 0.8049,
      "step": 350
    },
    {
      "epoch": 0.964824120603015,
      "grad_norm": 0.5216457733637546,
      "learning_rate": 5e-06,
      "loss": 0.8019,
      "step": 360
    },
    {
      "epoch": 0.9916247906197655,
      "grad_norm": 0.6220842275028926,
      "learning_rate": 5e-06,
      "loss": 0.7989,
      "step": 370
    },
    {
      "epoch": 0.9996649916247906,
      "eval_loss": 0.7980747222900391,
      "eval_runtime": 396.9204,
      "eval_samples_per_second": 25.33,
      "eval_steps_per_second": 0.398,
      "step": 373
    },
    {
      "epoch": 1.018425460636516,
      "grad_norm": 0.8379563173937747,
      "learning_rate": 5e-06,
      "loss": 0.846,
      "step": 380
    },
    {
      "epoch": 1.0452261306532664,
      "grad_norm": 0.7007106059150332,
      "learning_rate": 5e-06,
      "loss": 0.7553,
      "step": 390
    },
    {
      "epoch": 1.0720268006700167,
      "grad_norm": 0.6985327962246497,
      "learning_rate": 5e-06,
      "loss": 0.7609,
      "step": 400
    },
    {
      "epoch": 1.0988274706867671,
      "grad_norm": 0.8231254452075508,
      "learning_rate": 5e-06,
      "loss": 0.7594,
      "step": 410
    },
    {
      "epoch": 1.1256281407035176,
      "grad_norm": 0.5396289186374543,
      "learning_rate": 5e-06,
      "loss": 0.7555,
      "step": 420
    },
    {
      "epoch": 1.152428810720268,
      "grad_norm": 0.5275185306049085,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 430
    },
    {
      "epoch": 1.1792294807370185,
      "grad_norm": 0.5322161709304825,
      "learning_rate": 5e-06,
      "loss": 0.7592,
      "step": 440
    },
    {
      "epoch": 1.2060301507537687,
      "grad_norm": 0.6335605844571386,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 450
    },
    {
      "epoch": 1.2328308207705192,
      "grad_norm": 0.6175137584423412,
      "learning_rate": 5e-06,
      "loss": 0.7545,
      "step": 460
    },
    {
      "epoch": 1.2596314907872697,
      "grad_norm": 0.5525760530230355,
      "learning_rate": 5e-06,
      "loss": 0.755,
      "step": 470
    },
    {
      "epoch": 1.2864321608040201,
      "grad_norm": 0.8168923150097971,
      "learning_rate": 5e-06,
      "loss": 0.7546,
      "step": 480
    },
    {
      "epoch": 1.3132328308207706,
      "grad_norm": 0.576393540227948,
      "learning_rate": 5e-06,
      "loss": 0.7613,
      "step": 490
    },
    {
      "epoch": 1.3400335008375208,
      "grad_norm": 0.5454484709306812,
      "learning_rate": 5e-06,
      "loss": 0.7558,
      "step": 500
    },
    {
      "epoch": 1.3668341708542713,
      "grad_norm": 0.519519630849878,
      "learning_rate": 5e-06,
      "loss": 0.7513,
      "step": 510
    },
    {
      "epoch": 1.3936348408710217,
      "grad_norm": 0.5412825055890007,
      "learning_rate": 5e-06,
      "loss": 0.757,
      "step": 520
    },
    {
      "epoch": 1.4204355108877722,
      "grad_norm": 0.6663985833801256,
      "learning_rate": 5e-06,
      "loss": 0.7549,
      "step": 530
    },
    {
      "epoch": 1.4472361809045227,
      "grad_norm": 0.5433262572367299,
      "learning_rate": 5e-06,
      "loss": 0.7546,
      "step": 540
    },
    {
      "epoch": 1.474036850921273,
      "grad_norm": 0.4990591351939153,
      "learning_rate": 5e-06,
      "loss": 0.7483,
      "step": 550
    },
    {
      "epoch": 1.5008375209380236,
      "grad_norm": 0.5826918311899233,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 560
    },
    {
      "epoch": 1.5276381909547738,
      "grad_norm": 0.5498876689910938,
      "learning_rate": 5e-06,
      "loss": 0.747,
      "step": 570
    },
    {
      "epoch": 1.5544388609715243,
      "grad_norm": 0.5033534721843692,
      "learning_rate": 5e-06,
      "loss": 0.7563,
      "step": 580
    },
    {
      "epoch": 1.5812395309882747,
      "grad_norm": 0.5475928059702524,
      "learning_rate": 5e-06,
      "loss": 0.7597,
      "step": 590
    },
    {
      "epoch": 1.608040201005025,
      "grad_norm": 0.5483658876220942,
      "learning_rate": 5e-06,
      "loss": 0.7538,
      "step": 600
    },
    {
      "epoch": 1.6348408710217757,
      "grad_norm": 0.550772312369244,
      "learning_rate": 5e-06,
      "loss": 0.7552,
      "step": 610
    },
    {
      "epoch": 1.661641541038526,
      "grad_norm": 0.541814614413297,
      "learning_rate": 5e-06,
      "loss": 0.7556,
      "step": 620
    },
    {
      "epoch": 1.6884422110552764,
      "grad_norm": 0.5623412222166373,
      "learning_rate": 5e-06,
      "loss": 0.7523,
      "step": 630
    },
    {
      "epoch": 1.7152428810720268,
      "grad_norm": 0.5486383001151054,
      "learning_rate": 5e-06,
      "loss": 0.7548,
      "step": 640
    },
    {
      "epoch": 1.742043551088777,
      "grad_norm": 0.6450442334835524,
      "learning_rate": 5e-06,
      "loss": 0.7506,
      "step": 650
    },
    {
      "epoch": 1.7688442211055277,
      "grad_norm": 0.5093346834442244,
      "learning_rate": 5e-06,
      "loss": 0.7526,
      "step": 660
    },
    {
      "epoch": 1.795644891122278,
      "grad_norm": 0.5882959805663536,
      "learning_rate": 5e-06,
      "loss": 0.756,
      "step": 670
    },
    {
      "epoch": 1.8224455611390284,
      "grad_norm": 0.6121773592899815,
      "learning_rate": 5e-06,
      "loss": 0.7516,
      "step": 680
    },
    {
      "epoch": 1.849246231155779,
      "grad_norm": 0.5053865601023015,
      "learning_rate": 5e-06,
      "loss": 0.7552,
      "step": 690
    },
    {
      "epoch": 1.8760469011725294,
      "grad_norm": 0.49653124526114595,
      "learning_rate": 5e-06,
      "loss": 0.7506,
      "step": 700
    },
    {
      "epoch": 1.9028475711892798,
      "grad_norm": 0.5356438120020067,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 710
    },
    {
      "epoch": 1.92964824120603,
      "grad_norm": 0.4792312103436412,
      "learning_rate": 5e-06,
      "loss": 0.7523,
      "step": 720
    },
    {
      "epoch": 1.9564489112227805,
      "grad_norm": 0.5887442908787863,
      "learning_rate": 5e-06,
      "loss": 0.7489,
      "step": 730
    },
    {
      "epoch": 1.983249581239531,
      "grad_norm": 0.6013288648401325,
      "learning_rate": 5e-06,
      "loss": 0.7568,
      "step": 740
    },
    {
      "epoch": 1.9993299832495812,
      "eval_loss": 0.7848142385482788,
      "eval_runtime": 399.6688,
      "eval_samples_per_second": 25.156,
      "eval_steps_per_second": 0.395,
      "step": 746
    },
    {
      "epoch": 2.0100502512562812,
      "grad_norm": 0.7899358115042834,
      "learning_rate": 5e-06,
      "loss": 0.806,
      "step": 750
    },
    {
      "epoch": 2.036850921273032,
      "grad_norm": 0.5438655957140969,
      "learning_rate": 5e-06,
      "loss": 0.7051,
      "step": 760
    },
    {
      "epoch": 2.063651591289782,
      "grad_norm": 0.5954978197528383,
      "learning_rate": 5e-06,
      "loss": 0.7054,
      "step": 770
    },
    {
      "epoch": 2.090452261306533,
      "grad_norm": 0.5330734638287157,
      "learning_rate": 5e-06,
      "loss": 0.7057,
      "step": 780
    },
    {
      "epoch": 2.117252931323283,
      "grad_norm": 0.5719591031814022,
      "learning_rate": 5e-06,
      "loss": 0.7074,
      "step": 790
    },
    {
      "epoch": 2.1440536013400333,
      "grad_norm": 0.5749015032749988,
      "learning_rate": 5e-06,
      "loss": 0.7035,
      "step": 800
    },
    {
      "epoch": 2.170854271356784,
      "grad_norm": 0.7000012184675283,
      "learning_rate": 5e-06,
      "loss": 0.7045,
      "step": 810
    },
    {
      "epoch": 2.1976549413735342,
      "grad_norm": 0.5221545575547275,
      "learning_rate": 5e-06,
      "loss": 0.7085,
      "step": 820
    },
    {
      "epoch": 2.224455611390285,
      "grad_norm": 0.5590282858104586,
      "learning_rate": 5e-06,
      "loss": 0.7067,
      "step": 830
    },
    {
      "epoch": 2.251256281407035,
      "grad_norm": 0.680247988597194,
      "learning_rate": 5e-06,
      "loss": 0.712,
      "step": 840
    },
    {
      "epoch": 2.2780569514237854,
      "grad_norm": 0.5724224597541392,
      "learning_rate": 5e-06,
      "loss": 0.7132,
      "step": 850
    },
    {
      "epoch": 2.304857621440536,
      "grad_norm": 0.5890303699337465,
      "learning_rate": 5e-06,
      "loss": 0.7126,
      "step": 860
    },
    {
      "epoch": 2.3316582914572863,
      "grad_norm": 0.7060913656445702,
      "learning_rate": 5e-06,
      "loss": 0.71,
      "step": 870
    },
    {
      "epoch": 2.358458961474037,
      "grad_norm": 0.5199319551859399,
      "learning_rate": 5e-06,
      "loss": 0.7095,
      "step": 880
    },
    {
      "epoch": 2.3852596314907872,
      "grad_norm": 0.5667231972196302,
      "learning_rate": 5e-06,
      "loss": 0.7091,
      "step": 890
    },
    {
      "epoch": 2.4120603015075375,
      "grad_norm": 0.5776579179225979,
      "learning_rate": 5e-06,
      "loss": 0.7061,
      "step": 900
    },
    {
      "epoch": 2.438860971524288,
      "grad_norm": 0.5068031699453062,
      "learning_rate": 5e-06,
      "loss": 0.709,
      "step": 910
    },
    {
      "epoch": 2.4656616415410384,
      "grad_norm": 0.5622723362640956,
      "learning_rate": 5e-06,
      "loss": 0.7092,
      "step": 920
    },
    {
      "epoch": 2.492462311557789,
      "grad_norm": 0.6678938168210493,
      "learning_rate": 5e-06,
      "loss": 0.7074,
      "step": 930
    },
    {
      "epoch": 2.5192629815745393,
      "grad_norm": 0.7495016986714225,
      "learning_rate": 5e-06,
      "loss": 0.7123,
      "step": 940
    },
    {
      "epoch": 2.5460636515912896,
      "grad_norm": 0.5890832065298138,
      "learning_rate": 5e-06,
      "loss": 0.7117,
      "step": 950
    },
    {
      "epoch": 2.5728643216080402,
      "grad_norm": 0.5358789200826624,
      "learning_rate": 5e-06,
      "loss": 0.7123,
      "step": 960
    },
    {
      "epoch": 2.5996649916247905,
      "grad_norm": 0.7046518448288711,
      "learning_rate": 5e-06,
      "loss": 0.7124,
      "step": 970
    },
    {
      "epoch": 2.626465661641541,
      "grad_norm": 0.5598275583119999,
      "learning_rate": 5e-06,
      "loss": 0.7095,
      "step": 980
    },
    {
      "epoch": 2.6532663316582914,
      "grad_norm": 0.6175269384061102,
      "learning_rate": 5e-06,
      "loss": 0.7098,
      "step": 990
    },
    {
      "epoch": 2.6800670016750416,
      "grad_norm": 0.5833546621317353,
      "learning_rate": 5e-06,
      "loss": 0.7114,
      "step": 1000
    },
    {
      "epoch": 2.7068676716917923,
      "grad_norm": 0.5577868241277573,
      "learning_rate": 5e-06,
      "loss": 0.7126,
      "step": 1010
    },
    {
      "epoch": 2.7336683417085426,
      "grad_norm": 0.5384748297588693,
      "learning_rate": 5e-06,
      "loss": 0.7107,
      "step": 1020
    },
    {
      "epoch": 2.7604690117252932,
      "grad_norm": 0.6077432046061476,
      "learning_rate": 5e-06,
      "loss": 0.708,
      "step": 1030
    },
    {
      "epoch": 2.7872696817420435,
      "grad_norm": 0.5845658826184227,
      "learning_rate": 5e-06,
      "loss": 0.7166,
      "step": 1040
    },
    {
      "epoch": 2.8140703517587937,
      "grad_norm": 0.5139932571124606,
      "learning_rate": 5e-06,
      "loss": 0.7112,
      "step": 1050
    },
    {
      "epoch": 2.8408710217755444,
      "grad_norm": 0.5214647759988249,
      "learning_rate": 5e-06,
      "loss": 0.7083,
      "step": 1060
    },
    {
      "epoch": 2.8676716917922946,
      "grad_norm": 0.6459251681734683,
      "learning_rate": 5e-06,
      "loss": 0.7165,
      "step": 1070
    },
    {
      "epoch": 2.8944723618090453,
      "grad_norm": 0.6289426280816557,
      "learning_rate": 5e-06,
      "loss": 0.7126,
      "step": 1080
    },
    {
      "epoch": 2.9212730318257956,
      "grad_norm": 0.4931010807851759,
      "learning_rate": 5e-06,
      "loss": 0.709,
      "step": 1090
    },
    {
      "epoch": 2.948073701842546,
      "grad_norm": 0.5257070901798452,
      "learning_rate": 5e-06,
      "loss": 0.7131,
      "step": 1100
    },
    {
      "epoch": 2.9748743718592965,
      "grad_norm": 0.5735309103540566,
      "learning_rate": 5e-06,
      "loss": 0.7167,
      "step": 1110
    },
    {
      "epoch": 2.998994974874372,
      "eval_loss": 0.784430980682373,
      "eval_runtime": 398.3496,
      "eval_samples_per_second": 25.239,
      "eval_steps_per_second": 0.397,
      "step": 1119
    },
    {
      "epoch": 2.998994974874372,
      "step": 1119,
      "total_flos": 1874157297991680.0,
      "train_loss": 0.7673776837946369,
      "train_runtime": 65939.4578,
      "train_samples_per_second": 8.691,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 1119,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1874157297991680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|