{
  "best_metric": 0.9351106639839034,
  "best_model_checkpoint": "/scratch/camembertv2/runs/results/flue-PAWS-X/camembertav2-base-bf16-p2-17000/max_seq_length-148-gradient_accumulation_steps-2-precision-fp32-learning_rate-5e-05-epochs-6-lr_scheduler-linear-warmup_steps-0/SEED-666/checkpoint-18522",
  "epoch": 5.999028340080971,
  "eval_steps": 500,
  "global_step": 18522,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.032388663967611336,
      "grad_norm": 5.7791056632995605,
      "learning_rate": 4.9730050750458916e-05,
      "loss": 0.5409,
      "step": 100
    },
    {
      "epoch": 0.06477732793522267,
      "grad_norm": 3.9132766723632812,
      "learning_rate": 4.946010150091783e-05,
      "loss": 0.3386,
      "step": 200
    },
    {
      "epoch": 0.09716599190283401,
      "grad_norm": 19.378026962280273,
      "learning_rate": 4.9190152251376743e-05,
      "loss": 0.3131,
      "step": 300
    },
    {
      "epoch": 0.12955465587044535,
      "grad_norm": 15.864294052124023,
      "learning_rate": 4.892020300183566e-05,
      "loss": 0.2785,
      "step": 400
    },
    {
      "epoch": 0.16194331983805668,
      "grad_norm": 12.896875381469727,
      "learning_rate": 4.865025375229457e-05,
      "loss": 0.2929,
      "step": 500
    },
    {
      "epoch": 0.19433198380566802,
      "grad_norm": 21.26336669921875,
      "learning_rate": 4.8380304502753484e-05,
      "loss": 0.2847,
      "step": 600
    },
    {
      "epoch": 0.22672064777327935,
      "grad_norm": 5.422796249389648,
      "learning_rate": 4.81103552532124e-05,
      "loss": 0.2673,
      "step": 700
    },
    {
      "epoch": 0.2591093117408907,
      "grad_norm": 3.9689624309539795,
      "learning_rate": 4.784040600367131e-05,
      "loss": 0.2561,
      "step": 800
    },
    {
      "epoch": 0.291497975708502,
      "grad_norm": 8.32761001586914,
      "learning_rate": 4.7570456754130226e-05,
      "loss": 0.2618,
      "step": 900
    },
    {
      "epoch": 0.32388663967611336,
      "grad_norm": 3.8402862548828125,
      "learning_rate": 4.730050750458914e-05,
      "loss": 0.2535,
      "step": 1000
    },
    {
      "epoch": 0.3562753036437247,
      "grad_norm": 38.90117645263672,
      "learning_rate": 4.703055825504805e-05,
      "loss": 0.2476,
      "step": 1100
    },
    {
      "epoch": 0.38866396761133604,
      "grad_norm": 8.632582664489746,
      "learning_rate": 4.6760609005506967e-05,
      "loss": 0.2504,
      "step": 1200
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 2.50603985786438,
      "learning_rate": 4.649065975596588e-05,
      "loss": 0.2375,
      "step": 1300
    },
    {
      "epoch": 0.4534412955465587,
      "grad_norm": 22.682083129882812,
      "learning_rate": 4.6220710506424794e-05,
      "loss": 0.2304,
      "step": 1400
    },
    {
      "epoch": 0.48582995951417,
      "grad_norm": 3.2134041786193848,
      "learning_rate": 4.595076125688371e-05,
      "loss": 0.2342,
      "step": 1500
    },
    {
      "epoch": 0.5182186234817814,
      "grad_norm": 9.210716247558594,
      "learning_rate": 4.568081200734262e-05,
      "loss": 0.2447,
      "step": 1600
    },
    {
      "epoch": 0.5506072874493927,
      "grad_norm": 7.8480143547058105,
      "learning_rate": 4.5410862757801535e-05,
      "loss": 0.2161,
      "step": 1700
    },
    {
      "epoch": 0.582995951417004,
      "grad_norm": 1.0597597360610962,
      "learning_rate": 4.514091350826045e-05,
      "loss": 0.2234,
      "step": 1800
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 1.914153814315796,
      "learning_rate": 4.487096425871936e-05,
      "loss": 0.2001,
      "step": 1900
    },
    {
      "epoch": 0.6477732793522267,
      "grad_norm": 7.569839000701904,
      "learning_rate": 4.4601015009178276e-05,
      "loss": 0.2354,
      "step": 2000
    },
    {
      "epoch": 0.680161943319838,
      "grad_norm": 9.363743782043457,
      "learning_rate": 4.433106575963719e-05,
      "loss": 0.1989,
      "step": 2100
    },
    {
      "epoch": 0.7125506072874493,
      "grad_norm": 0.2488391101360321,
      "learning_rate": 4.40611165100961e-05,
      "loss": 0.2227,
      "step": 2200
    },
    {
      "epoch": 0.7449392712550608,
      "grad_norm": 6.931863307952881,
      "learning_rate": 4.379116726055502e-05,
      "loss": 0.2296,
      "step": 2300
    },
    {
      "epoch": 0.7773279352226721,
      "grad_norm": 2.7890470027923584,
      "learning_rate": 4.352121801101393e-05,
      "loss": 0.2161,
      "step": 2400
    },
    {
      "epoch": 0.8097165991902834,
      "grad_norm": 0.6333386301994324,
      "learning_rate": 4.3251268761472844e-05,
      "loss": 0.21,
      "step": 2500
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.40448668599128723,
      "learning_rate": 4.298131951193176e-05,
      "loss": 0.2115,
      "step": 2600
    },
    {
      "epoch": 0.8744939271255061,
      "grad_norm": 10.913817405700684,
      "learning_rate": 4.271137026239067e-05,
      "loss": 0.1923,
      "step": 2700
    },
    {
      "epoch": 0.9068825910931174,
      "grad_norm": 0.3693946301937103,
      "learning_rate": 4.2441421012849585e-05,
      "loss": 0.2019,
      "step": 2800
    },
    {
      "epoch": 0.9392712550607287,
      "grad_norm": 2.760298252105713,
      "learning_rate": 4.21714717633085e-05,
      "loss": 0.2113,
      "step": 2900
    },
    {
      "epoch": 0.97165991902834,
      "grad_norm": 11.557462692260742,
      "learning_rate": 4.190152251376741e-05,
      "loss": 0.2073,
      "step": 3000
    },
    {
      "epoch": 0.9998380566801619,
      "eval_accuracy": 0.9144869215291751,
      "eval_loss": 0.28149500489234924,
      "eval_runtime": 6.5505,
      "eval_samples_per_second": 303.488,
      "eval_steps_per_second": 38.012,
      "step": 3087
    },
    {
      "epoch": 1.0040485829959513,
      "grad_norm": 11.277145385742188,
      "learning_rate": 4.1631573264226326e-05,
      "loss": 0.2007,
      "step": 3100
    },
    {
      "epoch": 1.0364372469635628,
      "grad_norm": 7.342209815979004,
      "learning_rate": 4.136162401468524e-05,
      "loss": 0.1369,
      "step": 3200
    },
    {
      "epoch": 1.0688259109311742,
      "grad_norm": 4.37772798538208,
      "learning_rate": 4.1091674765144154e-05,
      "loss": 0.1951,
      "step": 3300
    },
    {
      "epoch": 1.1012145748987854,
      "grad_norm": 10.176764488220215,
      "learning_rate": 4.0821725515603074e-05,
      "loss": 0.1621,
      "step": 3400
    },
    {
      "epoch": 1.1336032388663968,
      "grad_norm": 0.42846062779426575,
      "learning_rate": 4.055177626606198e-05,
      "loss": 0.1593,
      "step": 3500
    },
    {
      "epoch": 1.165991902834008,
      "grad_norm": 0.37047576904296875,
      "learning_rate": 4.0281827016520895e-05,
      "loss": 0.1654,
      "step": 3600
    },
    {
      "epoch": 1.1983805668016194,
      "grad_norm": 0.9531276822090149,
      "learning_rate": 4.001187776697981e-05,
      "loss": 0.1686,
      "step": 3700
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 6.070562362670898,
      "learning_rate": 3.974192851743873e-05,
      "loss": 0.1741,
      "step": 3800
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 13.438559532165527,
      "learning_rate": 3.9471979267897636e-05,
      "loss": 0.1887,
      "step": 3900
    },
    {
      "epoch": 1.2955465587044535,
      "grad_norm": 1.3613296747207642,
      "learning_rate": 3.920203001835655e-05,
      "loss": 0.1598,
      "step": 4000
    },
    {
      "epoch": 1.3279352226720649,
      "grad_norm": 12.832125663757324,
      "learning_rate": 3.893208076881546e-05,
      "loss": 0.1735,
      "step": 4100
    },
    {
      "epoch": 1.360323886639676,
      "grad_norm": 11.818835258483887,
      "learning_rate": 3.8662131519274384e-05,
      "loss": 0.1838,
      "step": 4200
    },
    {
      "epoch": 1.3927125506072875,
      "grad_norm": 0.46300917863845825,
      "learning_rate": 3.839218226973329e-05,
      "loss": 0.168,
      "step": 4300
    },
    {
      "epoch": 1.425101214574899,
      "grad_norm": 5.4205403327941895,
      "learning_rate": 3.8122233020192204e-05,
      "loss": 0.1675,
      "step": 4400
    },
    {
      "epoch": 1.45748987854251,
      "grad_norm": 1.2334966659545898,
      "learning_rate": 3.785228377065112e-05,
      "loss": 0.1651,
      "step": 4500
    },
    {
      "epoch": 1.4898785425101215,
      "grad_norm": 1.531936526298523,
      "learning_rate": 3.758233452111004e-05,
      "loss": 0.1577,
      "step": 4600
    },
    {
      "epoch": 1.522267206477733,
      "grad_norm": 14.39586353302002,
      "learning_rate": 3.7312385271568945e-05,
      "loss": 0.1925,
      "step": 4700
    },
    {
      "epoch": 1.5546558704453441,
      "grad_norm": 17.419506072998047,
      "learning_rate": 3.704243602202786e-05,
      "loss": 0.1405,
      "step": 4800
    },
    {
      "epoch": 1.5870445344129553,
      "grad_norm": 16.734468460083008,
      "learning_rate": 3.677248677248677e-05,
      "loss": 0.1511,
      "step": 4900
    },
    {
      "epoch": 1.6194331983805668,
      "grad_norm": 0.26591989398002625,
      "learning_rate": 3.650253752294569e-05,
      "loss": 0.1704,
      "step": 5000
    },
    {
      "epoch": 1.6518218623481782,
      "grad_norm": 11.863933563232422,
      "learning_rate": 3.62325882734046e-05,
      "loss": 0.1498,
      "step": 5100
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.1933823674917221,
      "learning_rate": 3.5962639023863514e-05,
      "loss": 0.1556,
      "step": 5200
    },
    {
      "epoch": 1.7165991902834008,
      "grad_norm": 5.462769508361816,
      "learning_rate": 3.569268977432243e-05,
      "loss": 0.1656,
      "step": 5300
    },
    {
      "epoch": 1.7489878542510122,
      "grad_norm": 0.5079367160797119,
      "learning_rate": 3.542274052478135e-05,
      "loss": 0.1637,
      "step": 5400
    },
    {
      "epoch": 1.7813765182186234,
      "grad_norm": 0.9675686955451965,
      "learning_rate": 3.5152791275240255e-05,
      "loss": 0.1475,
      "step": 5500
    },
    {
      "epoch": 1.8137651821862348,
      "grad_norm": 13.242751121520996,
      "learning_rate": 3.488284202569917e-05,
      "loss": 0.1799,
      "step": 5600
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.37781623005867004,
      "learning_rate": 3.461289277615808e-05,
      "loss": 0.1562,
      "step": 5700
    },
    {
      "epoch": 1.8785425101214575,
      "grad_norm": 11.740013122558594,
      "learning_rate": 3.4342943526617e-05,
      "loss": 0.1798,
      "step": 5800
    },
    {
      "epoch": 1.9109311740890689,
      "grad_norm": 7.2465009689331055,
      "learning_rate": 3.407299427707591e-05,
      "loss": 0.163,
      "step": 5900
    },
    {
      "epoch": 1.9433198380566803,
      "grad_norm": 1.1531609296798706,
      "learning_rate": 3.380304502753482e-05,
      "loss": 0.1347,
      "step": 6000
    },
    {
      "epoch": 1.9757085020242915,
      "grad_norm": 3.8277270793914795,
      "learning_rate": 3.353309577799374e-05,
      "loss": 0.1719,
      "step": 6100
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9225352112676056,
      "eval_loss": 0.3341009020805359,
      "eval_runtime": 5.9971,
      "eval_samples_per_second": 331.492,
      "eval_steps_per_second": 41.52,
      "step": 6175
    },
    {
      "epoch": 2.0080971659919027,
      "grad_norm": 31.751121520996094,
      "learning_rate": 3.326314652845266e-05,
      "loss": 0.1353,
      "step": 6200
    },
    {
      "epoch": 2.0404858299595143,
      "grad_norm": 6.975146770477295,
      "learning_rate": 3.2993197278911564e-05,
      "loss": 0.105,
      "step": 6300
    },
    {
      "epoch": 2.0728744939271255,
      "grad_norm": 2.166383981704712,
      "learning_rate": 3.272324802937048e-05,
      "loss": 0.1191,
      "step": 6400
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 14.97061824798584,
      "learning_rate": 3.245329877982939e-05,
      "loss": 0.1143,
      "step": 6500
    },
    {
      "epoch": 2.1376518218623484,
      "grad_norm": 26.100902557373047,
      "learning_rate": 3.218334953028831e-05,
      "loss": 0.1116,
      "step": 6600
    },
    {
      "epoch": 2.1700404858299596,
      "grad_norm": 1.7684509754180908,
      "learning_rate": 3.191340028074722e-05,
      "loss": 0.1035,
      "step": 6700
    },
    {
      "epoch": 2.2024291497975708,
      "grad_norm": 0.07033982872962952,
      "learning_rate": 3.164345103120613e-05,
      "loss": 0.1092,
      "step": 6800
    },
    {
      "epoch": 2.234817813765182,
      "grad_norm": 0.3883877396583557,
      "learning_rate": 3.1373501781665046e-05,
      "loss": 0.1079,
      "step": 6900
    },
    {
      "epoch": 2.2672064777327936,
      "grad_norm": 7.267897129058838,
      "learning_rate": 3.110355253212397e-05,
      "loss": 0.1172,
      "step": 7000
    },
    {
      "epoch": 2.299595141700405,
      "grad_norm": 5.313385963439941,
      "learning_rate": 3.0833603282582874e-05,
      "loss": 0.1346,
      "step": 7100
    },
    {
      "epoch": 2.331983805668016,
      "grad_norm": 19.593353271484375,
      "learning_rate": 3.056365403304179e-05,
      "loss": 0.1318,
      "step": 7200
    },
    {
      "epoch": 2.3643724696356276,
      "grad_norm": 2.72308611869812,
      "learning_rate": 3.0293704783500704e-05,
      "loss": 0.1138,
      "step": 7300
    },
    {
      "epoch": 2.396761133603239,
      "grad_norm": 8.528360366821289,
      "learning_rate": 3.0023755533959618e-05,
      "loss": 0.1267,
      "step": 7400
    },
    {
      "epoch": 2.42914979757085,
      "grad_norm": 3.0180954933166504,
      "learning_rate": 2.9753806284418528e-05,
      "loss": 0.1,
      "step": 7500
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 11.93277645111084,
      "learning_rate": 2.9483857034877442e-05,
      "loss": 0.1322,
      "step": 7600
    },
    {
      "epoch": 2.493927125506073,
      "grad_norm": 0.44001469016075134,
      "learning_rate": 2.921390778533636e-05,
      "loss": 0.1148,
      "step": 7700
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 9.29806900024414,
      "learning_rate": 2.8943958535795273e-05,
      "loss": 0.0979,
      "step": 7800
    },
    {
      "epoch": 2.5587044534412957,
      "grad_norm": 0.31297165155410767,
      "learning_rate": 2.8674009286254183e-05,
      "loss": 0.131,
      "step": 7900
    },
    {
      "epoch": 2.591093117408907,
      "grad_norm": 0.13621266186237335,
      "learning_rate": 2.8404060036713097e-05,
      "loss": 0.1314,
      "step": 8000
    },
    {
      "epoch": 2.623481781376518,
      "grad_norm": 14.039895057678223,
      "learning_rate": 2.8134110787172014e-05,
      "loss": 0.1137,
      "step": 8100
    },
    {
      "epoch": 2.6558704453441297,
      "grad_norm": 0.19245870411396027,
      "learning_rate": 2.7864161537630927e-05,
      "loss": 0.1169,
      "step": 8200
    },
    {
      "epoch": 2.688259109311741,
      "grad_norm": 37.211849212646484,
      "learning_rate": 2.7594212288089838e-05,
      "loss": 0.1052,
      "step": 8300
    },
    {
      "epoch": 2.720647773279352,
      "grad_norm": 4.244668483734131,
      "learning_rate": 2.732426303854875e-05,
      "loss": 0.1126,
      "step": 8400
    },
    {
      "epoch": 2.753036437246964,
      "grad_norm": 0.1630948930978775,
      "learning_rate": 2.705431378900767e-05,
      "loss": 0.1371,
      "step": 8500
    },
    {
      "epoch": 2.785425101214575,
      "grad_norm": 9.61884593963623,
      "learning_rate": 2.6784364539466582e-05,
      "loss": 0.1309,
      "step": 8600
    },
    {
      "epoch": 2.817813765182186,
      "grad_norm": 12.167716026306152,
      "learning_rate": 2.6514415289925492e-05,
      "loss": 0.1219,
      "step": 8700
    },
    {
      "epoch": 2.850202429149798,
      "grad_norm": 4.939400672912598,
      "learning_rate": 2.624446604038441e-05,
      "loss": 0.1097,
      "step": 8800
    },
    {
      "epoch": 2.882591093117409,
      "grad_norm": 10.696518898010254,
      "learning_rate": 2.5974516790843323e-05,
      "loss": 0.1477,
      "step": 8900
    },
    {
      "epoch": 2.91497975708502,
      "grad_norm": 16.492700576782227,
      "learning_rate": 2.5704567541302237e-05,
      "loss": 0.1158,
      "step": 9000
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 1.9808906316757202,
      "learning_rate": 2.5434618291761147e-05,
      "loss": 0.1113,
      "step": 9100
    },
    {
      "epoch": 2.979757085020243,
      "grad_norm": 0.19531062245368958,
      "learning_rate": 2.5164669042220064e-05,
      "loss": 0.1018,
      "step": 9200
    },
    {
      "epoch": 2.999838056680162,
      "eval_accuracy": 0.9255533199195171,
      "eval_loss": 0.3031398355960846,
      "eval_runtime": 6.0146,
      "eval_samples_per_second": 330.531,
      "eval_steps_per_second": 41.399,
      "step": 9262
    },
    {
      "epoch": 3.0121457489878543,
      "grad_norm": 0.03815652057528496,
      "learning_rate": 2.4894719792678978e-05,
      "loss": 0.1136,
      "step": 9300
    },
    {
      "epoch": 3.0445344129554655,
      "grad_norm": 7.544350624084473,
      "learning_rate": 2.462477054313789e-05,
      "loss": 0.0829,
      "step": 9400
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 29.120431900024414,
      "learning_rate": 2.4354821293596805e-05,
      "loss": 0.0879,
      "step": 9500
    },
    {
      "epoch": 3.1093117408906883,
      "grad_norm": 5.044871807098389,
      "learning_rate": 2.408487204405572e-05,
      "loss": 0.0895,
      "step": 9600
    },
    {
      "epoch": 3.1417004048582995,
      "grad_norm": 0.42247331142425537,
      "learning_rate": 2.3814922794514633e-05,
      "loss": 0.1025,
      "step": 9700
    },
    {
      "epoch": 3.174089068825911,
      "grad_norm": 19.73353385925293,
      "learning_rate": 2.3544973544973546e-05,
      "loss": 0.0755,
      "step": 9800
    },
    {
      "epoch": 3.2064777327935223,
      "grad_norm": 8.545650482177734,
      "learning_rate": 2.327502429543246e-05,
      "loss": 0.0968,
      "step": 9900
    },
    {
      "epoch": 3.2388663967611335,
      "grad_norm": 0.08128660172224045,
      "learning_rate": 2.3005075045891374e-05,
      "loss": 0.0906,
      "step": 10000
    },
    {
      "epoch": 3.2712550607287447,
      "grad_norm": 0.023657312616705894,
      "learning_rate": 2.2735125796350287e-05,
      "loss": 0.0759,
      "step": 10100
    },
    {
      "epoch": 3.3036437246963564,
      "grad_norm": 0.2730010151863098,
      "learning_rate": 2.24651765468092e-05,
      "loss": 0.0846,
      "step": 10200
    },
    {
      "epoch": 3.3360323886639676,
      "grad_norm": 0.05432628467679024,
      "learning_rate": 2.2195227297268115e-05,
      "loss": 0.1003,
      "step": 10300
    },
    {
      "epoch": 3.3684210526315788,
      "grad_norm": 0.24408192932605743,
      "learning_rate": 2.192527804772703e-05,
      "loss": 0.0864,
      "step": 10400
    },
    {
      "epoch": 3.4008097165991904,
      "grad_norm": 0.038083989173173904,
      "learning_rate": 2.1655328798185942e-05,
      "loss": 0.0567,
      "step": 10500
    },
    {
      "epoch": 3.4331983805668016,
      "grad_norm": 0.1453145295381546,
      "learning_rate": 2.1385379548644856e-05,
      "loss": 0.0673,
      "step": 10600
    },
    {
      "epoch": 3.465587044534413,
      "grad_norm": 8.611372947692871,
      "learning_rate": 2.111543029910377e-05,
      "loss": 0.0892,
      "step": 10700
    },
    {
      "epoch": 3.4979757085020244,
      "grad_norm": 0.1930588334798813,
      "learning_rate": 2.0845481049562683e-05,
      "loss": 0.0914,
      "step": 10800
    },
    {
      "epoch": 3.5303643724696356,
      "grad_norm": 0.07848736643791199,
      "learning_rate": 2.0575531800021597e-05,
      "loss": 0.0705,
      "step": 10900
    },
    {
      "epoch": 3.562753036437247,
      "grad_norm": 1.9848402738571167,
      "learning_rate": 2.030558255048051e-05,
      "loss": 0.0991,
      "step": 11000
    },
    {
      "epoch": 3.5951417004048585,
      "grad_norm": 7.717613220214844,
      "learning_rate": 2.0035633300939424e-05,
      "loss": 0.091,
      "step": 11100
    },
    {
      "epoch": 3.6275303643724697,
      "grad_norm": 0.13098488748073578,
      "learning_rate": 1.9765684051398338e-05,
      "loss": 0.0819,
      "step": 11200
    },
    {
      "epoch": 3.659919028340081,
      "grad_norm": 18.124767303466797,
      "learning_rate": 1.949573480185725e-05,
      "loss": 0.0778,
      "step": 11300
    },
    {
      "epoch": 3.6923076923076925,
      "grad_norm": 31.650789260864258,
      "learning_rate": 1.9225785552316165e-05,
      "loss": 0.099,
      "step": 11400
    },
    {
      "epoch": 3.7246963562753037,
      "grad_norm": 2.110788106918335,
      "learning_rate": 1.895583630277508e-05,
      "loss": 0.0847,
      "step": 11500
    },
    {
      "epoch": 3.757085020242915,
      "grad_norm": 6.184129238128662,
      "learning_rate": 1.8685887053233992e-05,
      "loss": 0.1054,
      "step": 11600
    },
    {
      "epoch": 3.7894736842105265,
      "grad_norm": 0.05220253765583038,
      "learning_rate": 1.8415937803692906e-05,
      "loss": 0.0987,
      "step": 11700
    },
    {
      "epoch": 3.8218623481781377,
      "grad_norm": 4.092259883880615,
      "learning_rate": 1.8145988554151823e-05,
      "loss": 0.0791,
      "step": 11800
    },
    {
      "epoch": 3.854251012145749,
      "grad_norm": 5.052979946136475,
      "learning_rate": 1.7876039304610734e-05,
      "loss": 0.0998,
      "step": 11900
    },
    {
      "epoch": 3.8866396761133606,
      "grad_norm": 0.6650911569595337,
      "learning_rate": 1.760609005506965e-05,
      "loss": 0.1091,
      "step": 12000
    },
    {
      "epoch": 3.919028340080972,
      "grad_norm": 0.04196294769644737,
      "learning_rate": 1.733614080552856e-05,
      "loss": 0.0733,
      "step": 12100
    },
    {
      "epoch": 3.951417004048583,
      "grad_norm": 8.808391571044922,
      "learning_rate": 1.7066191555987478e-05,
      "loss": 0.1036,
      "step": 12200
    },
    {
      "epoch": 3.983805668016194,
      "grad_norm": 0.19184300303459167,
      "learning_rate": 1.6796242306446388e-05,
      "loss": 0.1012,
      "step": 12300
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.9265593561368209,
      "eval_loss": 0.3330951929092407,
      "eval_runtime": 6.0161,
      "eval_samples_per_second": 330.449,
      "eval_steps_per_second": 41.389,
      "step": 12350
    },
    {
      "epoch": 4.016194331983805,
      "grad_norm": 0.1378416270017624,
      "learning_rate": 1.6526293056905305e-05,
      "loss": 0.0653,
      "step": 12400
    },
    {
      "epoch": 4.048582995951417,
      "grad_norm": 45.89274978637695,
      "learning_rate": 1.6256343807364216e-05,
      "loss": 0.0768,
      "step": 12500
    },
    {
      "epoch": 4.080971659919029,
      "grad_norm": 5.21702241897583,
      "learning_rate": 1.5986394557823133e-05,
      "loss": 0.0446,
      "step": 12600
    },
    {
      "epoch": 4.113360323886639,
      "grad_norm": 5.125068664550781,
      "learning_rate": 1.5716445308282043e-05,
      "loss": 0.0407,
      "step": 12700
    },
    {
      "epoch": 4.145748987854251,
      "grad_norm": 0.0502559095621109,
      "learning_rate": 1.544649605874096e-05,
      "loss": 0.0613,
      "step": 12800
    },
    {
      "epoch": 4.178137651821863,
      "grad_norm": 15.276534080505371,
      "learning_rate": 1.517654680919987e-05,
      "loss": 0.0852,
      "step": 12900
    },
    {
      "epoch": 4.2105263157894735,
      "grad_norm": 10.263306617736816,
      "learning_rate": 1.4906597559658786e-05,
      "loss": 0.0664,
      "step": 13000
    },
    {
      "epoch": 4.242914979757085,
      "grad_norm": 0.17894484102725983,
      "learning_rate": 1.4636648310117698e-05,
      "loss": 0.0402,
      "step": 13100
    },
    {
      "epoch": 4.275303643724697,
      "grad_norm": 0.08714679628610611,
      "learning_rate": 1.4366699060576613e-05,
      "loss": 0.0647,
      "step": 13200
    },
    {
      "epoch": 4.3076923076923075,
      "grad_norm": 0.9923548102378845,
      "learning_rate": 1.4096749811035525e-05,
      "loss": 0.0529,
      "step": 13300
    },
    {
      "epoch": 4.340080971659919,
      "grad_norm": 0.03006882779300213,
      "learning_rate": 1.382680056149444e-05,
      "loss": 0.0776,
      "step": 13400
    },
    {
      "epoch": 4.372469635627531,
      "grad_norm": 3.831218719482422,
      "learning_rate": 1.3556851311953352e-05,
      "loss": 0.0555,
      "step": 13500
    },
    {
      "epoch": 4.4048582995951415,
      "grad_norm": 0.18301153182983398,
      "learning_rate": 1.3286902062412268e-05,
      "loss": 0.076,
      "step": 13600
    },
    {
      "epoch": 4.437246963562753,
      "grad_norm": 27.794418334960938,
      "learning_rate": 1.301695281287118e-05,
      "loss": 0.0585,
      "step": 13700
    },
    {
      "epoch": 4.469635627530364,
      "grad_norm": 0.12287744879722595,
      "learning_rate": 1.2747003563330095e-05,
      "loss": 0.0469,
      "step": 13800
    },
    {
      "epoch": 4.502024291497976,
      "grad_norm": 0.07235557585954666,
      "learning_rate": 1.2477054313789007e-05,
      "loss": 0.051,
      "step": 13900
    },
    {
      "epoch": 4.534412955465587,
      "grad_norm": 0.013693173415958881,
      "learning_rate": 1.220710506424792e-05,
      "loss": 0.051,
      "step": 14000
    },
    {
      "epoch": 4.566801619433198,
      "grad_norm": 3.417243242263794,
      "learning_rate": 1.1937155814706834e-05,
      "loss": 0.0501,
      "step": 14100
    },
    {
      "epoch": 4.59919028340081,
      "grad_norm": 0.20653334259986877,
      "learning_rate": 1.1667206565165748e-05,
      "loss": 0.0655,
      "step": 14200
    },
    {
      "epoch": 4.631578947368421,
      "grad_norm": 0.392309308052063,
      "learning_rate": 1.1397257315624662e-05,
      "loss": 0.042,
      "step": 14300
    },
    {
      "epoch": 4.663967611336032,
      "grad_norm": 0.3196917474269867,
      "learning_rate": 1.1127308066083575e-05,
      "loss": 0.0778,
      "step": 14400
    },
    {
      "epoch": 4.696356275303644,
      "grad_norm": 0.024839555844664574,
      "learning_rate": 1.0857358816542489e-05,
      "loss": 0.0504,
      "step": 14500
    },
    {
      "epoch": 4.728744939271255,
      "grad_norm": 0.03586389869451523,
      "learning_rate": 1.0587409567001405e-05,
      "loss": 0.0595,
      "step": 14600
    },
    {
      "epoch": 4.761133603238866,
      "grad_norm": 28.398067474365234,
      "learning_rate": 1.0317460317460318e-05,
      "loss": 0.0611,
      "step": 14700
    },
    {
      "epoch": 4.793522267206478,
      "grad_norm": 0.17493298649787903,
      "learning_rate": 1.0047511067919232e-05,
      "loss": 0.0633,
      "step": 14800
    },
    {
      "epoch": 4.825910931174089,
      "grad_norm": 11.968632698059082,
      "learning_rate": 9.777561818378146e-06,
      "loss": 0.0733,
      "step": 14900
    },
    {
      "epoch": 4.8582995951417,
      "grad_norm": 0.1989884227514267,
      "learning_rate": 9.50761256883706e-06,
      "loss": 0.0579,
      "step": 15000
    },
    {
      "epoch": 4.890688259109312,
      "grad_norm": 0.01483625266700983,
      "learning_rate": 9.237663319295973e-06,
      "loss": 0.0526,
      "step": 15100
    },
    {
      "epoch": 4.923076923076923,
      "grad_norm": 59.61151885986328,
      "learning_rate": 8.967714069754887e-06,
      "loss": 0.0514,
      "step": 15200
    },
    {
      "epoch": 4.955465587044534,
      "grad_norm": 0.20129810273647308,
      "learning_rate": 8.6977648202138e-06,
      "loss": 0.057,
      "step": 15300
    },
    {
      "epoch": 4.987854251012146,
      "grad_norm": 0.1253397911787033,
      "learning_rate": 8.427815570672714e-06,
      "loss": 0.0667,
      "step": 15400
    },
    {
      "epoch": 4.9998380566801615,
      "eval_accuracy": 0.9280684104627767,
      "eval_loss": 0.3633970022201538,
      "eval_runtime": 5.8931,
      "eval_samples_per_second": 337.342,
      "eval_steps_per_second": 42.253,
      "step": 15437
    },
    {
      "epoch": 5.020242914979757,
      "grad_norm": 0.3967650830745697,
      "learning_rate": 8.157866321131628e-06,
      "loss": 0.044,
      "step": 15500
    },
    {
      "epoch": 5.052631578947368,
      "grad_norm": 0.5851262807846069,
      "learning_rate": 7.887917071590541e-06,
      "loss": 0.0332,
      "step": 15600
    },
    {
      "epoch": 5.08502024291498,
      "grad_norm": 8.97497272491455,
      "learning_rate": 7.617967822049456e-06,
      "loss": 0.0329,
      "step": 15700
    },
    {
      "epoch": 5.117408906882591,
      "grad_norm": 3.0476999282836914,
      "learning_rate": 7.3480185725083695e-06,
      "loss": 0.0539,
      "step": 15800
    },
    {
      "epoch": 5.149797570850202,
      "grad_norm": 0.06740197539329529,
      "learning_rate": 7.078069322967283e-06,
      "loss": 0.0391,
      "step": 15900
    },
    {
      "epoch": 5.182186234817814,
      "grad_norm": 0.9729518890380859,
      "learning_rate": 6.808120073426197e-06,
      "loss": 0.04,
      "step": 16000
    },
    {
      "epoch": 5.2145748987854255,
      "grad_norm": 0.013517899438738823,
      "learning_rate": 6.5381708238851105e-06,
      "loss": 0.0549,
      "step": 16100
    },
    {
      "epoch": 5.246963562753036,
      "grad_norm": 0.11938950419425964,
      "learning_rate": 6.268221574344024e-06,
      "loss": 0.0303,
      "step": 16200
    },
    {
      "epoch": 5.279352226720648,
      "grad_norm": 0.1333564966917038,
      "learning_rate": 5.998272324802937e-06,
      "loss": 0.0516,
      "step": 16300
    },
    {
      "epoch": 5.3117408906882595,
      "grad_norm": 0.005177986808121204,
      "learning_rate": 5.728323075261851e-06,
      "loss": 0.0389,
      "step": 16400
    },
    {
      "epoch": 5.34412955465587,
      "grad_norm": 0.1303425431251526,
      "learning_rate": 5.458373825720764e-06,
      "loss": 0.0305,
      "step": 16500
    },
    {
      "epoch": 5.376518218623482,
      "grad_norm": 0.05090256780385971,
      "learning_rate": 5.188424576179678e-06,
      "loss": 0.0393,
      "step": 16600
    },
    {
      "epoch": 5.4089068825910935,
      "grad_norm": 0.02094370871782303,
      "learning_rate": 4.918475326638592e-06,
      "loss": 0.0469,
      "step": 16700
    },
    {
      "epoch": 5.441295546558704,
      "grad_norm": 0.15372493863105774,
      "learning_rate": 4.6485260770975054e-06,
      "loss": 0.0443,
      "step": 16800
    },
    {
      "epoch": 5.473684210526316,
      "grad_norm": 0.010713031515479088,
      "learning_rate": 4.378576827556419e-06,
      "loss": 0.0486,
      "step": 16900
    },
    {
      "epoch": 5.506072874493928,
      "grad_norm": 0.3018239438533783,
      "learning_rate": 4.108627578015333e-06,
      "loss": 0.032,
      "step": 17000
    },
    {
      "epoch": 5.538461538461538,
      "grad_norm": 7.911437034606934,
      "learning_rate": 3.8386783284742465e-06,
      "loss": 0.026,
      "step": 17100
    },
    {
      "epoch": 5.57085020242915,
      "grad_norm": 0.027054868638515472,
      "learning_rate": 3.5687290789331606e-06,
      "loss": 0.0319,
      "step": 17200
    },
    {
      "epoch": 5.603238866396762,
      "grad_norm": 11.172321319580078,
      "learning_rate": 3.2987798293920743e-06,
      "loss": 0.0382,
      "step": 17300
    },
    {
      "epoch": 5.635627530364372,
      "grad_norm": 0.013983375392854214,
      "learning_rate": 3.0288305798509884e-06,
      "loss": 0.0326,
      "step": 17400
    },
    {
      "epoch": 5.668016194331984,
      "grad_norm": 0.19562393426895142,
      "learning_rate": 2.758881330309902e-06,
      "loss": 0.0309,
      "step": 17500
    },
    {
      "epoch": 5.700404858299595,
      "grad_norm": 0.030886543914675713,
      "learning_rate": 2.4889320807688157e-06,
      "loss": 0.0292,
      "step": 17600
    },
    {
      "epoch": 5.732793522267206,
      "grad_norm": 0.24884428083896637,
      "learning_rate": 2.2189828312277294e-06,
      "loss": 0.0508,
      "step": 17700
    },
    {
      "epoch": 5.765182186234818,
      "grad_norm": 0.27042293548583984,
      "learning_rate": 1.949033581686643e-06,
      "loss": 0.0355,
      "step": 17800
    },
    {
      "epoch": 5.797570850202429,
      "grad_norm": 0.02953782118856907,
      "learning_rate": 1.6790843321455568e-06,
      "loss": 0.0238,
      "step": 17900
    },
    {
      "epoch": 5.82995951417004,
      "grad_norm": 0.1051710918545723,
      "learning_rate": 1.4091350826044704e-06,
      "loss": 0.0495,
      "step": 18000
    },
    {
      "epoch": 5.862348178137652,
      "grad_norm": 0.07639595121145248,
      "learning_rate": 1.1391858330633841e-06,
      "loss": 0.0364,
      "step": 18100
    },
    {
      "epoch": 5.894736842105263,
      "grad_norm": 0.010212107561528683,
      "learning_rate": 8.692365835222979e-07,
      "loss": 0.0515,
      "step": 18200
    },
    {
      "epoch": 5.9271255060728745,
      "grad_norm": 3.152235984802246,
      "learning_rate": 5.992873339812116e-07,
      "loss": 0.0294,
      "step": 18300
    },
    {
      "epoch": 5.959514170040486,
      "grad_norm": 8.367449760437012,
      "learning_rate": 3.293380844401253e-07,
      "loss": 0.0503,
      "step": 18400
    },
    {
      "epoch": 5.991902834008097,
      "grad_norm": 0.03374595567584038,
      "learning_rate": 5.938883489903899e-08,
      "loss": 0.0289,
      "step": 18500
    },
    {
      "epoch": 5.999028340080971,
      "eval_accuracy": 0.9351106639839034,
      "eval_loss": 0.43116068840026855,
      "eval_runtime": 5.8835,
      "eval_samples_per_second": 337.896,
      "eval_steps_per_second": 42.322,
      "step": 18522
    },
    {
      "epoch": 5.999028340080971,
      "step": 18522,
      "total_flos": 1.3373133118742268e+16,
      "train_loss": 0.11955805376273435,
      "train_runtime": 3073.2453,
      "train_samples_per_second": 96.443,
      "train_steps_per_second": 6.027
    }
  ],
  "logging_steps": 100,
  "max_steps": 18522,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3373133118742268e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}