omnis4 / trainer_state.json
multitensor's picture
Upload folder using huggingface_hub
6e29354 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 389,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002570694087403599,
"grad_norm": 0.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.7749,
"step": 1
},
{
"epoch": 0.005141388174807198,
"grad_norm": 0.0,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.7549,
"step": 2
},
{
"epoch": 0.007712082262210797,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.7659,
"step": 3
},
{
"epoch": 0.010282776349614395,
"grad_norm": 0.0,
"learning_rate": 6.666666666666667e-06,
"loss": 1.7842,
"step": 4
},
{
"epoch": 0.012853470437017995,
"grad_norm": 0.0,
"learning_rate": 8.333333333333334e-06,
"loss": 1.8015,
"step": 5
},
{
"epoch": 0.015424164524421594,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 1.7534,
"step": 6
},
{
"epoch": 0.017994858611825194,
"grad_norm": 0.0,
"learning_rate": 1.1666666666666668e-05,
"loss": 1.7747,
"step": 7
},
{
"epoch": 0.02056555269922879,
"grad_norm": 0.0,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.7783,
"step": 8
},
{
"epoch": 0.02313624678663239,
"grad_norm": 0.0,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.803,
"step": 9
},
{
"epoch": 0.02570694087403599,
"grad_norm": 0.0,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.7859,
"step": 10
},
{
"epoch": 0.028277634961439587,
"grad_norm": 0.0,
"learning_rate": 1.8333333333333333e-05,
"loss": 1.7856,
"step": 11
},
{
"epoch": 0.030848329048843187,
"grad_norm": 0.0,
"learning_rate": 2e-05,
"loss": 1.7825,
"step": 12
},
{
"epoch": 0.033419023136246784,
"grad_norm": 0.0,
"learning_rate": 1.9999652796146877e-05,
"loss": 1.8127,
"step": 13
},
{
"epoch": 0.03598971722365039,
"grad_norm": 0.0,
"learning_rate": 1.9998611208697607e-05,
"loss": 1.8115,
"step": 14
},
{
"epoch": 0.038560411311053984,
"grad_norm": 0.0,
"learning_rate": 1.9996875309980824e-05,
"loss": 1.7544,
"step": 15
},
{
"epoch": 0.04113110539845758,
"grad_norm": 0.0,
"learning_rate": 1.9994445220538678e-05,
"loss": 1.7886,
"step": 16
},
{
"epoch": 0.043701799485861184,
"grad_norm": 0.0,
"learning_rate": 1.999132110911845e-05,
"loss": 1.7866,
"step": 17
},
{
"epoch": 0.04627249357326478,
"grad_norm": 0.0,
"learning_rate": 1.9987503192660842e-05,
"loss": 1.7612,
"step": 18
},
{
"epoch": 0.04884318766066838,
"grad_norm": 0.0,
"learning_rate": 1.9982991736284914e-05,
"loss": 1.7944,
"step": 19
},
{
"epoch": 0.05141388174807198,
"grad_norm": 0.0,
"learning_rate": 1.997778705326968e-05,
"loss": 1.7656,
"step": 20
},
{
"epoch": 0.05398457583547558,
"grad_norm": 0.0,
"learning_rate": 1.9971889505032337e-05,
"loss": 1.7554,
"step": 21
},
{
"epoch": 0.056555269922879174,
"grad_norm": 0.0,
"learning_rate": 1.9965299501103178e-05,
"loss": 1.7637,
"step": 22
},
{
"epoch": 0.05912596401028278,
"grad_norm": 0.0,
"learning_rate": 1.995801749909715e-05,
"loss": 1.7803,
"step": 23
},
{
"epoch": 0.061696658097686374,
"grad_norm": 0.0,
"learning_rate": 1.995004400468209e-05,
"loss": 1.7402,
"step": 24
},
{
"epoch": 0.06426735218508997,
"grad_norm": 0.0,
"learning_rate": 1.9941379571543597e-05,
"loss": 1.7017,
"step": 25
},
{
"epoch": 0.06683804627249357,
"grad_norm": 0.0,
"learning_rate": 1.9932024801346583e-05,
"loss": 1.7671,
"step": 26
},
{
"epoch": 0.06940874035989718,
"grad_norm": 0.0,
"learning_rate": 1.992198034369349e-05,
"loss": 1.7014,
"step": 27
},
{
"epoch": 0.07197943444730077,
"grad_norm": 0.0,
"learning_rate": 1.991124689607921e-05,
"loss": 1.7532,
"step": 28
},
{
"epoch": 0.07455012853470437,
"grad_norm": 0.0,
"learning_rate": 1.9899825203842613e-05,
"loss": 1.7129,
"step": 29
},
{
"epoch": 0.07712082262210797,
"grad_norm": 0.0,
"learning_rate": 1.988771606011481e-05,
"loss": 1.7126,
"step": 30
},
{
"epoch": 0.07969151670951156,
"grad_norm": 0.0,
"learning_rate": 1.987492030576407e-05,
"loss": 1.7393,
"step": 31
},
{
"epoch": 0.08226221079691516,
"grad_norm": 0.0,
"learning_rate": 1.986143882933744e-05,
"loss": 1.7742,
"step": 32
},
{
"epoch": 0.08483290488431877,
"grad_norm": 0.0,
"learning_rate": 1.9847272566999026e-05,
"loss": 1.7483,
"step": 33
},
{
"epoch": 0.08740359897172237,
"grad_norm": 0.0,
"learning_rate": 1.9832422502465013e-05,
"loss": 1.707,
"step": 34
},
{
"epoch": 0.08997429305912596,
"grad_norm": 0.0,
"learning_rate": 1.9816889666935318e-05,
"loss": 1.7507,
"step": 35
},
{
"epoch": 0.09254498714652956,
"grad_norm": 0.0,
"learning_rate": 1.9800675139022006e-05,
"loss": 1.7339,
"step": 36
},
{
"epoch": 0.09511568123393316,
"grad_norm": 0.0,
"learning_rate": 1.9783780044674402e-05,
"loss": 1.748,
"step": 37
},
{
"epoch": 0.09768637532133675,
"grad_norm": 0.0,
"learning_rate": 1.976620555710087e-05,
"loss": 1.686,
"step": 38
},
{
"epoch": 0.10025706940874037,
"grad_norm": 0.0,
"learning_rate": 1.974795289668737e-05,
"loss": 1.7043,
"step": 39
},
{
"epoch": 0.10282776349614396,
"grad_norm": 0.0,
"learning_rate": 1.972902333091271e-05,
"loss": 1.7646,
"step": 40
},
{
"epoch": 0.10539845758354756,
"grad_norm": 0.0,
"learning_rate": 1.9709418174260523e-05,
"loss": 1.6802,
"step": 41
},
{
"epoch": 0.10796915167095116,
"grad_norm": 0.0,
"learning_rate": 1.9689138788127994e-05,
"loss": 1.6775,
"step": 42
},
{
"epoch": 0.11053984575835475,
"grad_norm": 0.0,
"learning_rate": 1.966818658073133e-05,
"loss": 1.6633,
"step": 43
},
{
"epoch": 0.11311053984575835,
"grad_norm": 0.0,
"learning_rate": 1.9646563007007952e-05,
"loss": 1.7637,
"step": 44
},
{
"epoch": 0.11568123393316196,
"grad_norm": 0.0,
"learning_rate": 1.9624269568515486e-05,
"loss": 1.7087,
"step": 45
},
{
"epoch": 0.11825192802056556,
"grad_norm": 0.0,
"learning_rate": 1.960130781332748e-05,
"loss": 1.6562,
"step": 46
},
{
"epoch": 0.12082262210796915,
"grad_norm": 0.0,
"learning_rate": 1.957767933592591e-05,
"loss": 1.698,
"step": 47
},
{
"epoch": 0.12339331619537275,
"grad_norm": 0.0,
"learning_rate": 1.955338577709046e-05,
"loss": 1.7444,
"step": 48
},
{
"epoch": 0.12596401028277635,
"grad_norm": 0.0,
"learning_rate": 1.9528428823784567e-05,
"loss": 1.6743,
"step": 49
},
{
"epoch": 0.12853470437017994,
"grad_norm": 0.0,
"learning_rate": 1.9502810209038302e-05,
"loss": 1.6741,
"step": 50
},
{
"epoch": 0.13110539845758354,
"grad_norm": 0.0,
"learning_rate": 1.9476531711828027e-05,
"loss": 1.708,
"step": 51
},
{
"epoch": 0.13367609254498714,
"grad_norm": 0.0,
"learning_rate": 1.9449595156952827e-05,
"loss": 1.6587,
"step": 52
},
{
"epoch": 0.13624678663239073,
"grad_norm": 0.0,
"learning_rate": 1.9422002414907837e-05,
"loss": 1.6887,
"step": 53
},
{
"epoch": 0.13881748071979436,
"grad_norm": 0.0,
"learning_rate": 1.9393755401754324e-05,
"loss": 1.6714,
"step": 54
},
{
"epoch": 0.14138817480719795,
"grad_norm": 0.0,
"learning_rate": 1.936485607898665e-05,
"loss": 1.7432,
"step": 55
},
{
"epoch": 0.14395886889460155,
"grad_norm": 0.0,
"learning_rate": 1.9335306453396066e-05,
"loss": 1.6675,
"step": 56
},
{
"epoch": 0.14652956298200515,
"grad_norm": 0.0,
"learning_rate": 1.9305108576931336e-05,
"loss": 1.6436,
"step": 57
},
{
"epoch": 0.14910025706940874,
"grad_norm": 0.0,
"learning_rate": 1.927426454655627e-05,
"loss": 1.6853,
"step": 58
},
{
"epoch": 0.15167095115681234,
"grad_norm": 0.0,
"learning_rate": 1.924277650410412e-05,
"loss": 1.6641,
"step": 59
},
{
"epoch": 0.15424164524421594,
"grad_norm": 0.0,
"learning_rate": 1.9210646636128805e-05,
"loss": 1.7385,
"step": 60
},
{
"epoch": 0.15681233933161953,
"grad_norm": 0.0,
"learning_rate": 1.9177877173753127e-05,
"loss": 1.7178,
"step": 61
},
{
"epoch": 0.15938303341902313,
"grad_norm": 0.0,
"learning_rate": 1.91444703925138e-05,
"loss": 1.6785,
"step": 62
},
{
"epoch": 0.16195372750642673,
"grad_norm": 0.0,
"learning_rate": 1.9110428612203463e-05,
"loss": 1.6799,
"step": 63
},
{
"epoch": 0.16452442159383032,
"grad_norm": 0.0,
"learning_rate": 1.9075754196709574e-05,
"loss": 1.7075,
"step": 64
},
{
"epoch": 0.16709511568123395,
"grad_norm": 0.0,
"learning_rate": 1.904044955385026e-05,
"loss": 1.6621,
"step": 65
},
{
"epoch": 0.16966580976863754,
"grad_norm": 0.0,
"learning_rate": 1.9004517135207127e-05,
"loss": 1.6492,
"step": 66
},
{
"epoch": 0.17223650385604114,
"grad_norm": 0.0,
"learning_rate": 1.8967959435955027e-05,
"loss": 1.7297,
"step": 67
},
{
"epoch": 0.17480719794344474,
"grad_norm": 0.0,
"learning_rate": 1.893077899468876e-05,
"loss": 1.6882,
"step": 68
},
{
"epoch": 0.17737789203084833,
"grad_norm": 0.0,
"learning_rate": 1.889297839324682e-05,
"loss": 1.6714,
"step": 69
},
{
"epoch": 0.17994858611825193,
"grad_norm": 0.0,
"learning_rate": 1.8854560256532098e-05,
"loss": 1.6489,
"step": 70
},
{
"epoch": 0.18251928020565553,
"grad_norm": 0.0,
"learning_rate": 1.8815527252329624e-05,
"loss": 1.6721,
"step": 71
},
{
"epoch": 0.18508997429305912,
"grad_norm": 0.0,
"learning_rate": 1.8775882091121282e-05,
"loss": 1.6533,
"step": 72
},
{
"epoch": 0.18766066838046272,
"grad_norm": 0.0,
"learning_rate": 1.8735627525897618e-05,
"loss": 1.6443,
"step": 73
},
{
"epoch": 0.19023136246786632,
"grad_norm": 0.0,
"learning_rate": 1.8694766351966665e-05,
"loss": 1.6631,
"step": 74
},
{
"epoch": 0.1928020565552699,
"grad_norm": 0.0,
"learning_rate": 1.8653301406759827e-05,
"loss": 1.6873,
"step": 75
},
{
"epoch": 0.1953727506426735,
"grad_norm": 0.0,
"learning_rate": 1.8611235569634852e-05,
"loss": 1.7046,
"step": 76
},
{
"epoch": 0.19794344473007713,
"grad_norm": 0.0,
"learning_rate": 1.8568571761675893e-05,
"loss": 1.7002,
"step": 77
},
{
"epoch": 0.20051413881748073,
"grad_norm": 0.0,
"learning_rate": 1.8525312945490647e-05,
"loss": 1.698,
"step": 78
},
{
"epoch": 0.20308483290488433,
"grad_norm": 0.0,
"learning_rate": 1.8481462125004647e-05,
"loss": 1.6765,
"step": 79
},
{
"epoch": 0.20565552699228792,
"grad_norm": 0.0,
"learning_rate": 1.8437022345252666e-05,
"loss": 1.7185,
"step": 80
},
{
"epoch": 0.20822622107969152,
"grad_norm": 0.0,
"learning_rate": 1.8391996692167242e-05,
"loss": 1.6653,
"step": 81
},
{
"epoch": 0.21079691516709512,
"grad_norm": 0.0,
"learning_rate": 1.8346388292364438e-05,
"loss": 1.7129,
"step": 82
},
{
"epoch": 0.2133676092544987,
"grad_norm": 0.0,
"learning_rate": 1.8300200312926674e-05,
"loss": 1.6709,
"step": 83
},
{
"epoch": 0.2159383033419023,
"grad_norm": 0.0,
"learning_rate": 1.8253435961182844e-05,
"loss": 1.6597,
"step": 84
},
{
"epoch": 0.2185089974293059,
"grad_norm": 0.0,
"learning_rate": 1.8206098484485563e-05,
"loss": 1.6812,
"step": 85
},
{
"epoch": 0.2210796915167095,
"grad_norm": 0.0,
"learning_rate": 1.8158191169985696e-05,
"loss": 1.6792,
"step": 86
},
{
"epoch": 0.2236503856041131,
"grad_norm": 0.0,
"learning_rate": 1.810971734440408e-05,
"loss": 1.6404,
"step": 87
},
{
"epoch": 0.2262210796915167,
"grad_norm": 0.0,
"learning_rate": 1.806068037380052e-05,
"loss": 1.6528,
"step": 88
},
{
"epoch": 0.22879177377892032,
"grad_norm": 0.0,
"learning_rate": 1.801108366334004e-05,
"loss": 1.6775,
"step": 89
},
{
"epoch": 0.23136246786632392,
"grad_norm": 0.0,
"learning_rate": 1.796093065705644e-05,
"loss": 1.679,
"step": 90
},
{
"epoch": 0.23393316195372751,
"grad_norm": 0.0,
"learning_rate": 1.791022483761312e-05,
"loss": 1.658,
"step": 91
},
{
"epoch": 0.2365038560411311,
"grad_norm": 0.0,
"learning_rate": 1.7858969726061262e-05,
"loss": 1.6277,
"step": 92
},
{
"epoch": 0.2390745501285347,
"grad_norm": 0.0,
"learning_rate": 1.7807168881595304e-05,
"loss": 1.6602,
"step": 93
},
{
"epoch": 0.2416452442159383,
"grad_norm": 0.0,
"learning_rate": 1.7754825901305814e-05,
"loss": 1.6758,
"step": 94
},
{
"epoch": 0.2442159383033419,
"grad_norm": 0.0,
"learning_rate": 1.7701944419929673e-05,
"loss": 1.6353,
"step": 95
},
{
"epoch": 0.2467866323907455,
"grad_norm": 0.0,
"learning_rate": 1.7648528109597704e-05,
"loss": 1.6602,
"step": 96
},
{
"epoch": 0.2493573264781491,
"grad_norm": 0.0,
"learning_rate": 1.7594580679579654e-05,
"loss": 1.6404,
"step": 97
},
{
"epoch": 0.2519280205655527,
"grad_norm": 0.0,
"learning_rate": 1.7540105876026647e-05,
"loss": 1.6365,
"step": 98
},
{
"epoch": 0.2544987146529563,
"grad_norm": 0.0,
"learning_rate": 1.7485107481711014e-05,
"loss": 1.6353,
"step": 99
},
{
"epoch": 0.2570694087403599,
"grad_norm": 0.0,
"learning_rate": 1.7429589315763637e-05,
"loss": 1.6541,
"step": 100
},
{
"epoch": 0.2596401028277635,
"grad_norm": 0.0,
"learning_rate": 1.737355523340875e-05,
"loss": 1.6133,
"step": 101
},
{
"epoch": 0.2622107969151671,
"grad_norm": 0.0,
"learning_rate": 1.7317009125696208e-05,
"loss": 1.6687,
"step": 102
},
{
"epoch": 0.2647814910025707,
"grad_norm": 0.0,
"learning_rate": 1.725995491923131e-05,
"loss": 1.636,
"step": 103
},
{
"epoch": 0.26735218508997427,
"grad_norm": 0.0,
"learning_rate": 1.7202396575902118e-05,
"loss": 1.6497,
"step": 104
},
{
"epoch": 0.2699228791773779,
"grad_norm": 0.0,
"learning_rate": 1.714433809260435e-05,
"loss": 1.6458,
"step": 105
},
{
"epoch": 0.27249357326478146,
"grad_norm": 0.0,
"learning_rate": 1.7085783500963825e-05,
"loss": 1.624,
"step": 106
},
{
"epoch": 0.2750642673521851,
"grad_norm": 0.0,
"learning_rate": 1.702673686705651e-05,
"loss": 1.6353,
"step": 107
},
{
"epoch": 0.2776349614395887,
"grad_norm": 0.0,
"learning_rate": 1.6967202291126174e-05,
"loss": 1.6406,
"step": 108
},
{
"epoch": 0.2802056555269923,
"grad_norm": 0.0,
"learning_rate": 1.690718390729964e-05,
"loss": 1.6323,
"step": 109
},
{
"epoch": 0.2827763496143959,
"grad_norm": 0.0,
"learning_rate": 1.684668588329973e-05,
"loss": 1.665,
"step": 110
},
{
"epoch": 0.2853470437017995,
"grad_norm": 0.0,
"learning_rate": 1.6785712420155864e-05,
"loss": 1.635,
"step": 111
},
{
"epoch": 0.2879177377892031,
"grad_norm": 0.0,
"learning_rate": 1.67242677519123e-05,
"loss": 1.6335,
"step": 112
},
{
"epoch": 0.29048843187660667,
"grad_norm": 0.0,
"learning_rate": 1.6662356145334158e-05,
"loss": 1.6846,
"step": 113
},
{
"epoch": 0.2930591259640103,
"grad_norm": 0.0,
"learning_rate": 1.6599981899611103e-05,
"loss": 1.6353,
"step": 114
},
{
"epoch": 0.29562982005141386,
"grad_norm": 0.0,
"learning_rate": 1.653714934605883e-05,
"loss": 1.6189,
"step": 115
},
{
"epoch": 0.2982005141388175,
"grad_norm": 0.0,
"learning_rate": 1.647386284781828e-05,
"loss": 1.7021,
"step": 116
},
{
"epoch": 0.30077120822622105,
"grad_norm": 0.0,
"learning_rate": 1.6410126799552653e-05,
"loss": 1.6777,
"step": 117
},
{
"epoch": 0.3033419023136247,
"grad_norm": 0.0,
"learning_rate": 1.6345945627142264e-05,
"loss": 1.6377,
"step": 118
},
{
"epoch": 0.3059125964010283,
"grad_norm": 0.0,
"learning_rate": 1.628132378737718e-05,
"loss": 1.6616,
"step": 119
},
{
"epoch": 0.30848329048843187,
"grad_norm": 0.0,
"learning_rate": 1.6216265767647756e-05,
"loss": 1.616,
"step": 120
},
{
"epoch": 0.3110539845758355,
"grad_norm": 0.0,
"learning_rate": 1.615077608563302e-05,
"loss": 1.6816,
"step": 121
},
{
"epoch": 0.31362467866323906,
"grad_norm": 0.0,
"learning_rate": 1.6084859288986957e-05,
"loss": 1.6099,
"step": 122
},
{
"epoch": 0.3161953727506427,
"grad_norm": 0.0,
"learning_rate": 1.601851995502272e-05,
"loss": 1.6274,
"step": 123
},
{
"epoch": 0.31876606683804626,
"grad_norm": 0.0,
"learning_rate": 1.5951762690394788e-05,
"loss": 1.6663,
"step": 124
},
{
"epoch": 0.3213367609254499,
"grad_norm": 0.0,
"learning_rate": 1.5884592130779056e-05,
"loss": 1.6494,
"step": 125
},
{
"epoch": 0.32390745501285345,
"grad_norm": 0.0,
"learning_rate": 1.581701294055095e-05,
"loss": 1.614,
"step": 126
},
{
"epoch": 0.3264781491002571,
"grad_norm": 0.0,
"learning_rate": 1.5749029812461515e-05,
"loss": 1.6265,
"step": 127
},
{
"epoch": 0.32904884318766064,
"grad_norm": 0.0,
"learning_rate": 1.568064746731156e-05,
"loss": 1.5913,
"step": 128
},
{
"epoch": 0.33161953727506427,
"grad_norm": 0.0,
"learning_rate": 1.5611870653623826e-05,
"loss": 1.5984,
"step": 129
},
{
"epoch": 0.3341902313624679,
"grad_norm": 0.0,
"learning_rate": 1.5542704147313257e-05,
"loss": 1.6343,
"step": 130
},
{
"epoch": 0.33676092544987146,
"grad_norm": 0.0,
"learning_rate": 1.5473152751355353e-05,
"loss": 1.6355,
"step": 131
},
{
"epoch": 0.3393316195372751,
"grad_norm": 0.0,
"learning_rate": 1.5403221295452647e-05,
"loss": 1.647,
"step": 132
},
{
"epoch": 0.34190231362467866,
"grad_norm": 0.0,
"learning_rate": 1.5332914635699327e-05,
"loss": 1.6191,
"step": 133
},
{
"epoch": 0.3444730077120823,
"grad_norm": 0.0,
"learning_rate": 1.5262237654244026e-05,
"loss": 1.624,
"step": 134
},
{
"epoch": 0.34704370179948585,
"grad_norm": 0.0,
"learning_rate": 1.5191195258950804e-05,
"loss": 1.6055,
"step": 135
},
{
"epoch": 0.3496143958868895,
"grad_norm": 0.0,
"learning_rate": 1.5119792383058338e-05,
"loss": 1.6492,
"step": 136
},
{
"epoch": 0.35218508997429304,
"grad_norm": 0.0,
"learning_rate": 1.5048033984837352e-05,
"loss": 1.6155,
"step": 137
},
{
"epoch": 0.35475578406169667,
"grad_norm": 0.0,
"learning_rate": 1.4975925047246319e-05,
"loss": 1.6042,
"step": 138
},
{
"epoch": 0.35732647814910024,
"grad_norm": 0.0,
"learning_rate": 1.4903470577585433e-05,
"loss": 1.6367,
"step": 139
},
{
"epoch": 0.35989717223650386,
"grad_norm": 0.0,
"learning_rate": 1.4830675607148899e-05,
"loss": 1.5928,
"step": 140
},
{
"epoch": 0.36246786632390743,
"grad_norm": 0.0,
"learning_rate": 1.475754519087557e-05,
"loss": 1.6526,
"step": 141
},
{
"epoch": 0.36503856041131105,
"grad_norm": 0.0,
"learning_rate": 1.4684084406997903e-05,
"loss": 1.6362,
"step": 142
},
{
"epoch": 0.3676092544987147,
"grad_norm": 0.0,
"learning_rate": 1.4610298356689341e-05,
"loss": 1.6201,
"step": 143
},
{
"epoch": 0.37017994858611825,
"grad_norm": 0.0,
"learning_rate": 1.453619216371008e-05,
"loss": 1.6162,
"step": 144
},
{
"epoch": 0.37275064267352187,
"grad_norm": 0.0,
"learning_rate": 1.446177097405127e-05,
"loss": 1.6172,
"step": 145
},
{
"epoch": 0.37532133676092544,
"grad_norm": 0.0,
"learning_rate": 1.4387039955577668e-05,
"loss": 1.6301,
"step": 146
},
{
"epoch": 0.37789203084832906,
"grad_norm": 0.0,
"learning_rate": 1.4312004297668791e-05,
"loss": 1.6096,
"step": 147
},
{
"epoch": 0.38046272493573263,
"grad_norm": 0.0,
"learning_rate": 1.4236669210858544e-05,
"loss": 1.6152,
"step": 148
},
{
"epoch": 0.38303341902313626,
"grad_norm": 0.0,
"learning_rate": 1.4161039926473412e-05,
"loss": 1.6321,
"step": 149
},
{
"epoch": 0.3856041131105398,
"grad_norm": 0.0,
"learning_rate": 1.4085121696269185e-05,
"loss": 1.5957,
"step": 150
},
{
"epoch": 0.38817480719794345,
"grad_norm": 0.0,
"learning_rate": 1.4008919792066273e-05,
"loss": 1.6421,
"step": 151
},
{
"epoch": 0.390745501285347,
"grad_norm": 0.0,
"learning_rate": 1.3932439505383628e-05,
"loss": 1.6189,
"step": 152
},
{
"epoch": 0.39331619537275064,
"grad_norm": 0.0,
"learning_rate": 1.385568614707129e-05,
"loss": 1.6106,
"step": 153
},
{
"epoch": 0.39588688946015427,
"grad_norm": 0.0,
"learning_rate": 1.3778665046941616e-05,
"loss": 1.6321,
"step": 154
},
{
"epoch": 0.39845758354755784,
"grad_norm": 0.0,
"learning_rate": 1.3701381553399147e-05,
"loss": 1.5796,
"step": 155
},
{
"epoch": 0.40102827763496146,
"grad_norm": 0.0,
"learning_rate": 1.3623841033069232e-05,
"loss": 1.6555,
"step": 156
},
{
"epoch": 0.40359897172236503,
"grad_norm": 0.0,
"learning_rate": 1.3546048870425356e-05,
"loss": 1.6028,
"step": 157
},
{
"epoch": 0.40616966580976865,
"grad_norm": 0.0,
"learning_rate": 1.3468010467415248e-05,
"loss": 1.5969,
"step": 158
},
{
"epoch": 0.4087403598971722,
"grad_norm": 0.0,
"learning_rate": 1.3389731243085747e-05,
"loss": 1.6077,
"step": 159
},
{
"epoch": 0.41131105398457585,
"grad_norm": 0.0,
"learning_rate": 1.3311216633206514e-05,
"loss": 1.5762,
"step": 160
},
{
"epoch": 0.4138817480719794,
"grad_norm": 0.0,
"learning_rate": 1.3232472089892567e-05,
"loss": 1.6079,
"step": 161
},
{
"epoch": 0.41645244215938304,
"grad_norm": 0.0,
"learning_rate": 1.315350308122567e-05,
"loss": 1.5994,
"step": 162
},
{
"epoch": 0.4190231362467866,
"grad_norm": 0.0,
"learning_rate": 1.3074315090874652e-05,
"loss": 1.5732,
"step": 163
},
{
"epoch": 0.42159383033419023,
"grad_norm": 0.0,
"learning_rate": 1.2994913617714573e-05,
"loss": 1.5901,
"step": 164
},
{
"epoch": 0.4241645244215938,
"grad_norm": 0.0,
"learning_rate": 1.2915304175444929e-05,
"loss": 1.6138,
"step": 165
},
{
"epoch": 0.4267352185089974,
"grad_norm": 0.0,
"learning_rate": 1.2835492292206735e-05,
"loss": 1.5945,
"step": 166
},
{
"epoch": 0.42930591259640105,
"grad_norm": 0.0,
"learning_rate": 1.2755483510198668e-05,
"loss": 1.6067,
"step": 167
},
{
"epoch": 0.4318766066838046,
"grad_norm": 0.0,
"learning_rate": 1.2675283385292212e-05,
"loss": 1.5957,
"step": 168
},
{
"epoch": 0.43444730077120824,
"grad_norm": 0.0,
"learning_rate": 1.2594897486645836e-05,
"loss": 1.6089,
"step": 169
},
{
"epoch": 0.4370179948586118,
"grad_norm": 0.0,
"learning_rate": 1.2514331396318298e-05,
"loss": 1.6335,
"step": 170
},
{
"epoch": 0.43958868894601544,
"grad_norm": 0.0,
"learning_rate": 1.2433590708880991e-05,
"loss": 1.6406,
"step": 171
},
{
"epoch": 0.442159383033419,
"grad_norm": 0.0,
"learning_rate": 1.2352681031029476e-05,
"loss": 1.5759,
"step": 172
},
{
"epoch": 0.44473007712082263,
"grad_norm": 0.0,
"learning_rate": 1.2271607981194132e-05,
"loss": 1.5955,
"step": 173
},
{
"epoch": 0.4473007712082262,
"grad_norm": 0.0,
"learning_rate": 1.2190377189150016e-05,
"loss": 1.6069,
"step": 174
},
{
"epoch": 0.4498714652956298,
"grad_norm": 0.0,
"learning_rate": 1.2108994295625924e-05,
"loss": 1.5796,
"step": 175
},
{
"epoch": 0.4524421593830334,
"grad_norm": 0.0,
"learning_rate": 1.2027464951912703e-05,
"loss": 1.5952,
"step": 176
},
{
"epoch": 0.455012853470437,
"grad_norm": 0.0,
"learning_rate": 1.1945794819470805e-05,
"loss": 1.6213,
"step": 177
},
{
"epoch": 0.45758354755784064,
"grad_norm": 0.0,
"learning_rate": 1.1863989569537165e-05,
"loss": 1.5974,
"step": 178
},
{
"epoch": 0.4601542416452442,
"grad_norm": 0.0,
"learning_rate": 1.1782054882731377e-05,
"loss": 1.5188,
"step": 179
},
{
"epoch": 0.46272493573264784,
"grad_norm": 0.0,
"learning_rate": 1.1699996448661242e-05,
"loss": 1.5964,
"step": 180
},
{
"epoch": 0.4652956298200514,
"grad_norm": 0.0,
"learning_rate": 1.161781996552765e-05,
"loss": 1.5681,
"step": 181
},
{
"epoch": 0.46786632390745503,
"grad_norm": 0.0,
"learning_rate": 1.1535531139728918e-05,
"loss": 1.5938,
"step": 182
},
{
"epoch": 0.4704370179948586,
"grad_norm": 0.0,
"learning_rate": 1.1453135685464524e-05,
"loss": 1.574,
"step": 183
},
{
"epoch": 0.4730077120822622,
"grad_norm": 0.0,
"learning_rate": 1.1370639324338313e-05,
"loss": 1.5872,
"step": 184
},
{
"epoch": 0.4755784061696658,
"grad_norm": 0.0,
"learning_rate": 1.1288047784961166e-05,
"loss": 1.5806,
"step": 185
},
{
"epoch": 0.4781491002570694,
"grad_norm": 0.0,
"learning_rate": 1.1205366802553231e-05,
"loss": 1.5542,
"step": 186
},
{
"epoch": 0.480719794344473,
"grad_norm": 0.0,
"learning_rate": 1.1122602118545642e-05,
"loss": 1.5723,
"step": 187
},
{
"epoch": 0.4832904884318766,
"grad_norm": 0.0,
"learning_rate": 1.1039759480181836e-05,
"loss": 1.5645,
"step": 188
},
{
"epoch": 0.48586118251928023,
"grad_norm": 0.0,
"learning_rate": 1.0956844640118462e-05,
"loss": 1.5884,
"step": 189
},
{
"epoch": 0.4884318766066838,
"grad_norm": 0.0,
"learning_rate": 1.0873863356025911e-05,
"loss": 1.5559,
"step": 190
},
{
"epoch": 0.4910025706940874,
"grad_norm": 0.0,
"learning_rate": 1.0790821390188493e-05,
"loss": 1.5623,
"step": 191
},
{
"epoch": 0.493573264781491,
"grad_norm": 0.0,
"learning_rate": 1.0707724509104318e-05,
"loss": 1.5916,
"step": 192
},
{
"epoch": 0.4961439588688946,
"grad_norm": 0.0,
"learning_rate": 1.062457848308484e-05,
"loss": 1.5696,
"step": 193
},
{
"epoch": 0.4987146529562982,
"grad_norm": 0.0,
"learning_rate": 1.0541389085854177e-05,
"loss": 1.5913,
"step": 194
},
{
"epoch": 0.5012853470437018,
"grad_norm": 0.0,
"learning_rate": 1.0458162094148185e-05,
"loss": 1.5439,
"step": 195
},
{
"epoch": 0.5038560411311054,
"grad_norm": 0.0,
"learning_rate": 1.0374903287313307e-05,
"loss": 1.6013,
"step": 196
},
{
"epoch": 0.506426735218509,
"grad_norm": 0.0,
"learning_rate": 1.029161844690525e-05,
"loss": 1.5813,
"step": 197
},
{
"epoch": 0.5089974293059126,
"grad_norm": 0.0,
"learning_rate": 1.0208313356287505e-05,
"loss": 1.5757,
"step": 198
},
{
"epoch": 0.5115681233933161,
"grad_norm": 0.0,
"learning_rate": 1.0124993800229774e-05,
"loss": 1.5508,
"step": 199
},
{
"epoch": 0.5141388174807198,
"grad_norm": 0.0,
"learning_rate": 1.004166556450623e-05,
"loss": 1.5774,
"step": 200
},
{
"epoch": 0.5167095115681234,
"grad_norm": 0.0,
"learning_rate": 9.958334435493776e-06,
"loss": 1.594,
"step": 201
},
{
"epoch": 0.519280205655527,
"grad_norm": 0.0,
"learning_rate": 9.87500619977023e-06,
"loss": 1.5977,
"step": 202
},
{
"epoch": 0.5218508997429306,
"grad_norm": 0.0,
"learning_rate": 9.791686643712498e-06,
"loss": 1.5938,
"step": 203
},
{
"epoch": 0.5244215938303342,
"grad_norm": 0.0,
"learning_rate": 9.708381553094754e-06,
"loss": 1.5371,
"step": 204
},
{
"epoch": 0.5269922879177378,
"grad_norm": 0.0,
"learning_rate": 9.625096712686694e-06,
"loss": 1.5315,
"step": 205
},
{
"epoch": 0.5295629820051414,
"grad_norm": 0.0,
"learning_rate": 9.541837905851817e-06,
"loss": 1.5708,
"step": 206
},
{
"epoch": 0.532133676092545,
"grad_norm": 0.0,
"learning_rate": 9.458610914145826e-06,
"loss": 1.5691,
"step": 207
},
{
"epoch": 0.5347043701799485,
"grad_norm": 0.0,
"learning_rate": 9.375421516915165e-06,
"loss": 1.5881,
"step": 208
},
{
"epoch": 0.5372750642673522,
"grad_norm": 0.0,
"learning_rate": 9.292275490895685e-06,
"loss": 1.5732,
"step": 209
},
{
"epoch": 0.5398457583547558,
"grad_norm": 0.0,
"learning_rate": 9.209178609811509e-06,
"loss": 1.5562,
"step": 210
},
{
"epoch": 0.5424164524421594,
"grad_norm": 0.0,
"learning_rate": 9.126136643974094e-06,
"loss": 1.5603,
"step": 211
},
{
"epoch": 0.5449871465295629,
"grad_norm": 0.0,
"learning_rate": 9.043155359881538e-06,
"loss": 1.5352,
"step": 212
},
{
"epoch": 0.5475578406169666,
"grad_norm": 0.0,
"learning_rate": 8.960240519818167e-06,
"loss": 1.5647,
"step": 213
},
{
"epoch": 0.5501285347043702,
"grad_norm": 0.0,
"learning_rate": 8.877397881454358e-06,
"loss": 1.5747,
"step": 214
},
{
"epoch": 0.5526992287917738,
"grad_norm": 0.0,
"learning_rate": 8.79463319744677e-06,
"loss": 1.5586,
"step": 215
},
{
"epoch": 0.5552699228791774,
"grad_norm": 0.0,
"learning_rate": 8.711952215038837e-06,
"loss": 1.5527,
"step": 216
},
{
"epoch": 0.5578406169665809,
"grad_norm": 0.0,
"learning_rate": 8.629360675661693e-06,
"loss": 1.5374,
"step": 217
},
{
"epoch": 0.5604113110539846,
"grad_norm": 0.0,
"learning_rate": 8.546864314535478e-06,
"loss": 1.5647,
"step": 218
},
{
"epoch": 0.5629820051413882,
"grad_norm": 0.0,
"learning_rate": 8.464468860271084e-06,
"loss": 1.5356,
"step": 219
},
{
"epoch": 0.5655526992287918,
"grad_norm": 0.0,
"learning_rate": 8.382180034472353e-06,
"loss": 1.5483,
"step": 220
},
{
"epoch": 0.5681233933161953,
"grad_norm": 0.0,
"learning_rate": 8.30000355133876e-06,
"loss": 1.5386,
"step": 221
},
{
"epoch": 0.570694087403599,
"grad_norm": 0.0,
"learning_rate": 8.217945117268624e-06,
"loss": 1.5552,
"step": 222
},
{
"epoch": 0.5732647814910026,
"grad_norm": 0.0,
"learning_rate": 8.136010430462837e-06,
"loss": 1.5635,
"step": 223
},
{
"epoch": 0.5758354755784062,
"grad_norm": 0.0,
"learning_rate": 8.0542051805292e-06,
"loss": 1.5657,
"step": 224
},
{
"epoch": 0.5784061696658098,
"grad_norm": 0.0,
"learning_rate": 7.9725350480873e-06,
"loss": 1.5386,
"step": 225
},
{
"epoch": 0.5809768637532133,
"grad_norm": 0.0,
"learning_rate": 7.89100570437408e-06,
"loss": 1.6018,
"step": 226
},
{
"epoch": 0.583547557840617,
"grad_norm": 0.0,
"learning_rate": 7.809622810849986e-06,
"loss": 1.5396,
"step": 227
},
{
"epoch": 0.5861182519280206,
"grad_norm": 0.0,
"learning_rate": 7.72839201880587e-06,
"loss": 1.5474,
"step": 228
},
{
"epoch": 0.5886889460154242,
"grad_norm": 0.0,
"learning_rate": 7.647318968970528e-06,
"loss": 1.5654,
"step": 229
},
{
"epoch": 0.5912596401028277,
"grad_norm": 0.0,
"learning_rate": 7.566409291119008e-06,
"loss": 1.5732,
"step": 230
},
{
"epoch": 0.5938303341902313,
"grad_norm": 0.0,
"learning_rate": 7.485668603681706e-06,
"loss": 1.5779,
"step": 231
},
{
"epoch": 0.596401028277635,
"grad_norm": 0.0,
"learning_rate": 7.405102513354166e-06,
"loss": 1.5449,
"step": 232
},
{
"epoch": 0.5989717223650386,
"grad_norm": 0.0,
"learning_rate": 7.324716614707794e-06,
"loss": 1.5408,
"step": 233
},
{
"epoch": 0.6015424164524421,
"grad_norm": 0.0,
"learning_rate": 7.2445164898013345e-06,
"loss": 1.5403,
"step": 234
},
{
"epoch": 0.6041131105398457,
"grad_norm": 0.0,
"learning_rate": 7.1645077077932666e-06,
"loss": 1.5159,
"step": 235
},
{
"epoch": 0.6066838046272494,
"grad_norm": 0.0,
"learning_rate": 7.084695824555074e-06,
"loss": 1.5557,
"step": 236
},
{
"epoch": 0.609254498714653,
"grad_norm": 0.0,
"learning_rate": 7.005086382285426e-06,
"loss": 1.5625,
"step": 237
},
{
"epoch": 0.6118251928020566,
"grad_norm": 0.0,
"learning_rate": 6.925684909125354e-06,
"loss": 1.552,
"step": 238
},
{
"epoch": 0.6143958868894601,
"grad_norm": 0.0,
"learning_rate": 6.84649691877433e-06,
"loss": 1.5488,
"step": 239
},
{
"epoch": 0.6169665809768637,
"grad_norm": 0.0,
"learning_rate": 6.767527910107437e-06,
"loss": 1.5181,
"step": 240
},
{
"epoch": 0.6195372750642674,
"grad_norm": 0.0,
"learning_rate": 6.688783366793488e-06,
"loss": 1.5403,
"step": 241
},
{
"epoch": 0.622107969151671,
"grad_norm": 0.0,
"learning_rate": 6.610268756914254e-06,
"loss": 1.5662,
"step": 242
},
{
"epoch": 0.6246786632390745,
"grad_norm": 0.0,
"learning_rate": 6.5319895325847535e-06,
"loss": 1.5222,
"step": 243
},
{
"epoch": 0.6272493573264781,
"grad_norm": 0.0,
"learning_rate": 6.453951129574644e-06,
"loss": 1.5439,
"step": 244
},
{
"epoch": 0.6298200514138818,
"grad_norm": 0.0,
"learning_rate": 6.3761589669307745e-06,
"loss": 1.5312,
"step": 245
},
{
"epoch": 0.6323907455012854,
"grad_norm": 0.0,
"learning_rate": 6.298618446600856e-06,
"loss": 1.5383,
"step": 246
},
{
"epoch": 0.6349614395886889,
"grad_norm": 0.0,
"learning_rate": 6.221334953058389e-06,
"loss": 1.5393,
"step": 247
},
{
"epoch": 0.6375321336760925,
"grad_norm": 0.0,
"learning_rate": 6.144313852928712e-06,
"loss": 1.5247,
"step": 248
},
{
"epoch": 0.6401028277634961,
"grad_norm": 0.0,
"learning_rate": 6.067560494616374e-06,
"loss": 1.5454,
"step": 249
},
{
"epoch": 0.6426735218508998,
"grad_norm": 0.0,
"learning_rate": 5.9910802079337285e-06,
"loss": 1.5215,
"step": 250
},
{
"epoch": 0.6452442159383034,
"grad_norm": 0.0,
"learning_rate": 5.9148783037308154e-06,
"loss": 1.5427,
"step": 251
},
{
"epoch": 0.6478149100257069,
"grad_norm": 0.0,
"learning_rate": 5.838960073526589e-06,
"loss": 1.5427,
"step": 252
},
{
"epoch": 0.6503856041131105,
"grad_norm": 0.0,
"learning_rate": 5.763330789141457e-06,
"loss": 1.5552,
"step": 253
},
{
"epoch": 0.6529562982005142,
"grad_norm": 0.0,
"learning_rate": 5.687995702331211e-06,
"loss": 1.5388,
"step": 254
},
{
"epoch": 0.6555269922879178,
"grad_norm": 0.0,
"learning_rate": 5.612960044422335e-06,
"loss": 1.5854,
"step": 255
},
{
"epoch": 0.6580976863753213,
"grad_norm": 0.0,
"learning_rate": 5.538229025948729e-06,
"loss": 1.5588,
"step": 256
},
{
"epoch": 0.6606683804627249,
"grad_norm": 0.0,
"learning_rate": 5.463807836289921e-06,
"loss": 1.5217,
"step": 257
},
{
"epoch": 0.6632390745501285,
"grad_norm": 0.0,
"learning_rate": 5.389701643310661e-06,
"loss": 1.5066,
"step": 258
},
{
"epoch": 0.6658097686375322,
"grad_norm": 0.0,
"learning_rate": 5.3159155930021e-06,
"loss": 1.5327,
"step": 259
},
{
"epoch": 0.6683804627249358,
"grad_norm": 0.0,
"learning_rate": 5.2424548091244334e-06,
"loss": 1.5522,
"step": 260
},
{
"epoch": 0.6709511568123393,
"grad_norm": 0.0,
"learning_rate": 5.169324392851105e-06,
"loss": 1.543,
"step": 261
},
{
"epoch": 0.6735218508997429,
"grad_norm": 0.0,
"learning_rate": 5.096529422414571e-06,
"loss": 1.5483,
"step": 262
},
{
"epoch": 0.6760925449871465,
"grad_norm": 0.0,
"learning_rate": 5.0240749527536845e-06,
"loss": 1.5234,
"step": 263
},
{
"epoch": 0.6786632390745502,
"grad_norm": 0.0,
"learning_rate": 4.951966015162652e-06,
"loss": 1.5315,
"step": 264
},
{
"epoch": 0.6812339331619537,
"grad_norm": 0.0,
"learning_rate": 4.880207616941663e-06,
"loss": 1.5193,
"step": 265
},
{
"epoch": 0.6838046272493573,
"grad_norm": 0.0,
"learning_rate": 4.8088047410492e-06,
"loss": 1.5586,
"step": 266
},
{
"epoch": 0.6863753213367609,
"grad_norm": 0.0,
"learning_rate": 4.737762345755975e-06,
"loss": 1.481,
"step": 267
},
{
"epoch": 0.6889460154241646,
"grad_norm": 0.0,
"learning_rate": 4.667085364300678e-06,
"loss": 1.5869,
"step": 268
},
{
"epoch": 0.6915167095115681,
"grad_norm": 0.0,
"learning_rate": 4.596778704547359e-06,
"loss": 1.5366,
"step": 269
},
{
"epoch": 0.6940874035989717,
"grad_norm": 0.0,
"learning_rate": 4.526847248644652e-06,
"loss": 1.5007,
"step": 270
},
{
"epoch": 0.6966580976863753,
"grad_norm": 0.0,
"learning_rate": 4.457295852686746e-06,
"loss": 1.5352,
"step": 271
},
{
"epoch": 0.699228791773779,
"grad_norm": 0.0,
"learning_rate": 4.388129346376177e-06,
"loss": 1.5447,
"step": 272
},
{
"epoch": 0.7017994858611826,
"grad_norm": 0.0,
"learning_rate": 4.319352532688444e-06,
"loss": 1.5701,
"step": 273
},
{
"epoch": 0.7043701799485861,
"grad_norm": 0.0,
"learning_rate": 4.250970187538484e-06,
"loss": 1.5,
"step": 274
},
{
"epoch": 0.7069408740359897,
"grad_norm": 0.0,
"learning_rate": 4.182987059449056e-06,
"loss": 1.5513,
"step": 275
},
{
"epoch": 0.7095115681233933,
"grad_norm": 0.0,
"learning_rate": 4.115407869220948e-06,
"loss": 1.5007,
"step": 276
},
{
"epoch": 0.712082262210797,
"grad_norm": 0.0,
"learning_rate": 4.048237309605216e-06,
"loss": 1.5398,
"step": 277
},
{
"epoch": 0.7146529562982005,
"grad_norm": 0.0,
"learning_rate": 3.981480044977284e-06,
"loss": 1.5476,
"step": 278
},
{
"epoch": 0.7172236503856041,
"grad_norm": 0.0,
"learning_rate": 3.915140711013044e-06,
"loss": 1.5015,
"step": 279
},
{
"epoch": 0.7197943444730077,
"grad_norm": 0.0,
"learning_rate": 3.849223914366981e-06,
"loss": 1.5405,
"step": 280
},
{
"epoch": 0.7223650385604113,
"grad_norm": 0.0,
"learning_rate": 3.7837342323522454e-06,
"loss": 1.5413,
"step": 281
},
{
"epoch": 0.7249357326478149,
"grad_norm": 0.0,
"learning_rate": 3.7186762126228227e-06,
"loss": 1.5874,
"step": 282
},
{
"epoch": 0.7275064267352185,
"grad_norm": 0.0,
"learning_rate": 3.654054372857738e-06,
"loss": 1.5122,
"step": 283
},
{
"epoch": 0.7300771208226221,
"grad_norm": 0.0,
"learning_rate": 3.5898732004473523e-06,
"loss": 1.55,
"step": 284
},
{
"epoch": 0.7326478149100257,
"grad_norm": 0.0,
"learning_rate": 3.5261371521817247e-06,
"loss": 1.5337,
"step": 285
},
{
"epoch": 0.7352185089974294,
"grad_norm": 0.0,
"learning_rate": 3.462850653941171e-06,
"loss": 1.5159,
"step": 286
},
{
"epoch": 0.7377892030848329,
"grad_norm": 0.0,
"learning_rate": 3.4000181003889e-06,
"loss": 1.5139,
"step": 287
},
{
"epoch": 0.7403598971722365,
"grad_norm": 0.0,
"learning_rate": 3.337643854665843e-06,
"loss": 1.499,
"step": 288
},
{
"epoch": 0.7429305912596401,
"grad_norm": 0.0,
"learning_rate": 3.2757322480876996e-06,
"loss": 1.5149,
"step": 289
},
{
"epoch": 0.7455012853470437,
"grad_norm": 0.0,
"learning_rate": 3.2142875798441376e-06,
"loss": 1.5098,
"step": 290
},
{
"epoch": 0.7480719794344473,
"grad_norm": 0.0,
"learning_rate": 3.15331411670027e-06,
"loss": 1.5217,
"step": 291
},
{
"epoch": 0.7506426735218509,
"grad_norm": 0.0,
"learning_rate": 3.092816092700366e-06,
"loss": 1.5017,
"step": 292
},
{
"epoch": 0.7532133676092545,
"grad_norm": 0.0,
"learning_rate": 3.032797708873828e-06,
"loss": 1.5398,
"step": 293
},
{
"epoch": 0.7557840616966581,
"grad_norm": 0.0,
"learning_rate": 2.97326313294349e-06,
"loss": 1.4983,
"step": 294
},
{
"epoch": 0.7583547557840618,
"grad_norm": 0.0,
"learning_rate": 2.914216499036178e-06,
"loss": 1.5271,
"step": 295
},
{
"epoch": 0.7609254498714653,
"grad_norm": 0.0,
"learning_rate": 2.855661907395655e-06,
"loss": 1.5286,
"step": 296
},
{
"epoch": 0.7634961439588689,
"grad_norm": 0.0,
"learning_rate": 2.7976034240978834e-06,
"loss": 1.4954,
"step": 297
},
{
"epoch": 0.7660668380462725,
"grad_norm": 0.0,
"learning_rate": 2.740045080768694e-06,
"loss": 1.4653,
"step": 298
},
{
"epoch": 0.7686375321336761,
"grad_norm": 0.0,
"learning_rate": 2.6829908743037936e-06,
"loss": 1.5271,
"step": 299
},
{
"epoch": 0.7712082262210797,
"grad_norm": 0.0,
"learning_rate": 2.626444766591253e-06,
"loss": 1.48,
"step": 300
},
{
"epoch": 0.7737789203084833,
"grad_norm": 0.0,
"learning_rate": 2.570410684236365e-06,
"loss": 1.5093,
"step": 301
},
{
"epoch": 0.7763496143958869,
"grad_norm": 0.0,
"learning_rate": 2.514892518288988e-06,
"loss": 1.531,
"step": 302
},
{
"epoch": 0.7789203084832905,
"grad_norm": 0.0,
"learning_rate": 2.4598941239733555e-06,
"loss": 1.4795,
"step": 303
},
{
"epoch": 0.781491002570694,
"grad_norm": 0.0,
"learning_rate": 2.4054193204203457e-06,
"loss": 1.5056,
"step": 304
},
{
"epoch": 0.7840616966580977,
"grad_norm": 0.0,
"learning_rate": 2.3514718904022993e-06,
"loss": 1.4841,
"step": 305
},
{
"epoch": 0.7866323907455013,
"grad_norm": 0.0,
"learning_rate": 2.2980555800703273e-06,
"loss": 1.5337,
"step": 306
},
{
"epoch": 0.7892030848329049,
"grad_norm": 0.0,
"learning_rate": 2.2451740986941905e-06,
"loss": 1.5212,
"step": 307
},
{
"epoch": 0.7917737789203085,
"grad_norm": 0.0,
"learning_rate": 2.1928311184046967e-06,
"loss": 1.5308,
"step": 308
},
{
"epoch": 0.794344473007712,
"grad_norm": 0.0,
"learning_rate": 2.1410302739387424e-06,
"loss": 1.5159,
"step": 309
},
{
"epoch": 0.7969151670951157,
"grad_norm": 0.0,
"learning_rate": 2.0897751623868833e-06,
"loss": 1.5349,
"step": 310
},
{
"epoch": 0.7994858611825193,
"grad_norm": 0.0,
"learning_rate": 2.0390693429435626e-06,
"loss": 1.5029,
"step": 311
},
{
"epoch": 0.8020565552699229,
"grad_norm": 0.0,
"learning_rate": 1.9889163366599607e-06,
"loss": 1.519,
"step": 312
},
{
"epoch": 0.8046272493573264,
"grad_norm": 0.0,
"learning_rate": 1.939319626199483e-06,
"loss": 1.5054,
"step": 313
},
{
"epoch": 0.8071979434447301,
"grad_norm": 0.0,
"learning_rate": 1.890282655595922e-06,
"loss": 1.4736,
"step": 314
},
{
"epoch": 0.8097686375321337,
"grad_norm": 0.0,
"learning_rate": 1.8418088300143044e-06,
"loss": 1.5242,
"step": 315
},
{
"epoch": 0.8123393316195373,
"grad_norm": 0.0,
"learning_rate": 1.7939015155144378e-06,
"loss": 1.5208,
"step": 316
},
{
"epoch": 0.8149100257069408,
"grad_norm": 0.0,
"learning_rate": 1.7465640388171589e-06,
"loss": 1.5332,
"step": 317
},
{
"epoch": 0.8174807197943444,
"grad_norm": 0.0,
"learning_rate": 1.6997996870733268e-06,
"loss": 1.4978,
"step": 318
},
{
"epoch": 0.8200514138817481,
"grad_norm": 0.0,
"learning_rate": 1.6536117076355652e-06,
"loss": 1.4961,
"step": 319
},
{
"epoch": 0.8226221079691517,
"grad_norm": 0.0,
"learning_rate": 1.6080033078327585e-06,
"loss": 1.5559,
"step": 320
},
{
"epoch": 0.8251928020565553,
"grad_norm": 0.0,
"learning_rate": 1.5629776547473397e-06,
"loss": 1.5435,
"step": 321
},
{
"epoch": 0.8277634961439588,
"grad_norm": 0.0,
"learning_rate": 1.5185378749953538e-06,
"loss": 1.4744,
"step": 322
},
{
"epoch": 0.8303341902313625,
"grad_norm": 0.0,
"learning_rate": 1.4746870545093528e-06,
"loss": 1.4885,
"step": 323
},
{
"epoch": 0.8329048843187661,
"grad_norm": 0.0,
"learning_rate": 1.4314282383241097e-06,
"loss": 1.5088,
"step": 324
},
{
"epoch": 0.8354755784061697,
"grad_norm": 0.0,
"learning_rate": 1.388764430365147e-06,
"loss": 1.4971,
"step": 325
},
{
"epoch": 0.8380462724935732,
"grad_norm": 0.0,
"learning_rate": 1.3466985932401743e-06,
"loss": 1.5269,
"step": 326
},
{
"epoch": 0.8406169665809768,
"grad_norm": 0.0,
"learning_rate": 1.3052336480333372e-06,
"loss": 1.5088,
"step": 327
},
{
"epoch": 0.8431876606683805,
"grad_norm": 0.0,
"learning_rate": 1.2643724741023845e-06,
"loss": 1.5046,
"step": 328
},
{
"epoch": 0.8457583547557841,
"grad_norm": 0.0,
"learning_rate": 1.2241179088787192e-06,
"loss": 1.5217,
"step": 329
},
{
"epoch": 0.8483290488431876,
"grad_norm": 0.0,
"learning_rate": 1.1844727476703776e-06,
"loss": 1.4951,
"step": 330
},
{
"epoch": 0.8508997429305912,
"grad_norm": 0.0,
"learning_rate": 1.1454397434679022e-06,
"loss": 1.4941,
"step": 331
},
{
"epoch": 0.8534704370179949,
"grad_norm": 0.0,
"learning_rate": 1.1070216067531825e-06,
"loss": 1.5122,
"step": 332
},
{
"epoch": 0.8560411311053985,
"grad_norm": 0.0,
"learning_rate": 1.0692210053112451e-06,
"loss": 1.5427,
"step": 333
},
{
"epoch": 0.8586118251928021,
"grad_norm": 0.0,
"learning_rate": 1.032040564044975e-06,
"loss": 1.5278,
"step": 334
},
{
"epoch": 0.8611825192802056,
"grad_norm": 0.0,
"learning_rate": 9.954828647928727e-07,
"loss": 1.4768,
"step": 335
},
{
"epoch": 0.8637532133676092,
"grad_norm": 0.0,
"learning_rate": 9.595504461497441e-07,
"loss": 1.5066,
"step": 336
},
{
"epoch": 0.8663239074550129,
"grad_norm": 0.0,
"learning_rate": 9.242458032904311e-07,
"loss": 1.4871,
"step": 337
},
{
"epoch": 0.8688946015424165,
"grad_norm": 0.0,
"learning_rate": 8.895713877965373e-07,
"loss": 1.5212,
"step": 338
},
{
"epoch": 0.87146529562982,
"grad_norm": 0.0,
"learning_rate": 8.555296074861996e-07,
"loss": 1.4919,
"step": 339
},
{
"epoch": 0.8740359897172236,
"grad_norm": 0.0,
"learning_rate": 8.22122826246875e-07,
"loss": 1.5476,
"step": 340
},
{
"epoch": 0.8766066838046273,
"grad_norm": 0.0,
"learning_rate": 7.89353363871197e-07,
"loss": 1.5142,
"step": 341
},
{
"epoch": 0.8791773778920309,
"grad_norm": 0.0,
"learning_rate": 7.572234958958846e-07,
"loss": 1.5332,
"step": 342
},
{
"epoch": 0.8817480719794345,
"grad_norm": 0.0,
"learning_rate": 7.2573545344373e-07,
"loss": 1.4924,
"step": 343
},
{
"epoch": 0.884318766066838,
"grad_norm": 0.0,
"learning_rate": 6.948914230686688e-07,
"loss": 1.5181,
"step": 344
},
{
"epoch": 0.8868894601542416,
"grad_norm": 0.0,
"learning_rate": 6.646935466039373e-07,
"loss": 1.5137,
"step": 345
},
{
"epoch": 0.8894601542416453,
"grad_norm": 0.0,
"learning_rate": 6.351439210133492e-07,
"loss": 1.5056,
"step": 346
},
{
"epoch": 0.8920308483290489,
"grad_norm": 0.0,
"learning_rate": 6.062445982456777e-07,
"loss": 1.4688,
"step": 347
},
{
"epoch": 0.8946015424164524,
"grad_norm": 0.0,
"learning_rate": 5.77997585092166e-07,
"loss": 1.5146,
"step": 348
},
{
"epoch": 0.897172236503856,
"grad_norm": 0.0,
"learning_rate": 5.504048430471753e-07,
"loss": 1.4695,
"step": 349
},
{
"epoch": 0.8997429305912596,
"grad_norm": 0.0,
"learning_rate": 5.234682881719766e-07,
"loss": 1.5129,
"step": 350
},
{
"epoch": 0.9023136246786633,
"grad_norm": 0.0,
"learning_rate": 4.971897909616985e-07,
"loss": 1.5061,
"step": 351
},
{
"epoch": 0.9048843187660668,
"grad_norm": 0.0,
"learning_rate": 4.715711762154362e-07,
"loss": 1.4722,
"step": 352
},
{
"epoch": 0.9074550128534704,
"grad_norm": 0.0,
"learning_rate": 4.4661422290954495e-07,
"loss": 1.5056,
"step": 353
},
{
"epoch": 0.910025706940874,
"grad_norm": 0.0,
"learning_rate": 4.2232066407409067e-07,
"loss": 1.5017,
"step": 354
},
{
"epoch": 0.9125964010282777,
"grad_norm": 0.0,
"learning_rate": 3.986921866725202e-07,
"loss": 1.5393,
"step": 355
},
{
"epoch": 0.9151670951156813,
"grad_norm": 0.0,
"learning_rate": 3.7573043148451673e-07,
"loss": 1.5034,
"step": 356
},
{
"epoch": 0.9177377892030848,
"grad_norm": 0.0,
"learning_rate": 3.5343699299205003e-07,
"loss": 1.5139,
"step": 357
},
{
"epoch": 0.9203084832904884,
"grad_norm": 0.0,
"learning_rate": 3.3181341926867283e-07,
"loss": 1.4788,
"step": 358
},
{
"epoch": 0.922879177377892,
"grad_norm": 0.0,
"learning_rate": 3.1086121187200667e-07,
"loss": 1.4746,
"step": 359
},
{
"epoch": 0.9254498714652957,
"grad_norm": 0.0,
"learning_rate": 2.905818257394799e-07,
"loss": 1.5112,
"step": 360
},
{
"epoch": 0.9280205655526992,
"grad_norm": 0.0,
"learning_rate": 2.7097666908729283e-07,
"loss": 1.5071,
"step": 361
},
{
"epoch": 0.9305912596401028,
"grad_norm": 0.0,
"learning_rate": 2.520471033126326e-07,
"loss": 1.4773,
"step": 362
},
{
"epoch": 0.9331619537275064,
"grad_norm": 0.0,
"learning_rate": 2.3379444289913344e-07,
"loss": 1.5146,
"step": 363
},
{
"epoch": 0.9357326478149101,
"grad_norm": 0.0,
"learning_rate": 2.1621995532559947e-07,
"loss": 1.4978,
"step": 364
},
{
"epoch": 0.9383033419023136,
"grad_norm": 0.0,
"learning_rate": 1.9932486097799408e-07,
"loss": 1.5183,
"step": 365
},
{
"epoch": 0.9408740359897172,
"grad_norm": 0.0,
"learning_rate": 1.8311033306468552e-07,
"loss": 1.4761,
"step": 366
},
{
"epoch": 0.9434447300771208,
"grad_norm": 0.0,
"learning_rate": 1.6757749753498865e-07,
"loss": 1.509,
"step": 367
},
{
"epoch": 0.9460154241645244,
"grad_norm": 0.0,
"learning_rate": 1.5272743300097316e-07,
"loss": 1.5095,
"step": 368
},
{
"epoch": 0.9485861182519281,
"grad_norm": 0.0,
"learning_rate": 1.3856117066256225e-07,
"loss": 1.5361,
"step": 369
},
{
"epoch": 0.9511568123393316,
"grad_norm": 0.0,
"learning_rate": 1.2507969423593225e-07,
"loss": 1.5051,
"step": 370
},
{
"epoch": 0.9537275064267352,
"grad_norm": 0.0,
"learning_rate": 1.1228393988519381e-07,
"loss": 1.5532,
"step": 371
},
{
"epoch": 0.9562982005141388,
"grad_norm": 0.0,
"learning_rate": 1.0017479615738957e-07,
"loss": 1.553,
"step": 372
},
{
"epoch": 0.9588688946015425,
"grad_norm": 0.0,
"learning_rate": 8.875310392079118e-08,
"loss": 1.5125,
"step": 373
},
{
"epoch": 0.961439588688946,
"grad_norm": 0.0,
"learning_rate": 7.801965630651165e-08,
"loss": 1.4321,
"step": 374
},
{
"epoch": 0.9640102827763496,
"grad_norm": 0.0,
"learning_rate": 6.797519865342161e-08,
"loss": 1.5005,
"step": 375
},
{
"epoch": 0.9665809768637532,
"grad_norm": 0.0,
"learning_rate": 5.862042845640403e-08,
"loss": 1.4973,
"step": 376
},
{
"epoch": 0.9691516709511568,
"grad_norm": 0.0,
"learning_rate": 4.9955995317908514e-08,
"loss": 1.5449,
"step": 377
},
{
"epoch": 0.9717223650385605,
"grad_norm": 0.0,
"learning_rate": 4.198250090284961e-08,
"loss": 1.4795,
"step": 378
},
{
"epoch": 0.974293059125964,
"grad_norm": 0.0,
"learning_rate": 3.47004988968247e-08,
"loss": 1.5508,
"step": 379
},
{
"epoch": 0.9768637532133676,
"grad_norm": 0.0,
"learning_rate": 2.8110494967664713e-08,
"loss": 1.5095,
"step": 380
},
{
"epoch": 0.9794344473007712,
"grad_norm": 0.0,
"learning_rate": 2.221294673032004e-08,
"loss": 1.5146,
"step": 381
},
{
"epoch": 0.9820051413881749,
"grad_norm": 0.0,
"learning_rate": 1.7008263715085904e-08,
"loss": 1.5112,
"step": 382
},
{
"epoch": 0.9845758354755784,
"grad_norm": 0.0,
"learning_rate": 1.24968073391607e-08,
"loss": 1.5144,
"step": 383
},
{
"epoch": 0.987146529562982,
"grad_norm": 0.0,
"learning_rate": 8.678890881552715e-09,
"loss": 1.5459,
"step": 384
},
{
"epoch": 0.9897172236503856,
"grad_norm": 0.0,
"learning_rate": 5.554779461323101e-09,
"loss": 1.4885,
"step": 385
},
{
"epoch": 0.9922879177377892,
"grad_norm": 0.0,
"learning_rate": 3.1246900191761463e-09,
"loss": 1.4919,
"step": 386
},
{
"epoch": 0.9948586118251928,
"grad_norm": 0.0,
"learning_rate": 1.3887913023946652e-09,
"loss": 1.5034,
"step": 387
},
{
"epoch": 0.9974293059125964,
"grad_norm": 0.0,
"learning_rate": 3.4720385312492223e-10,
"loss": 1.4812,
"step": 388
},
{
"epoch": 1.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 1.3696,
"step": 389
},
{
"epoch": 1.0,
"step": 389,
"total_flos": 1.4102482311698186e+18,
"train_loss": 1.5950692380302056,
"train_runtime": 5789.3639,
"train_samples_per_second": 17.167,
"train_steps_per_second": 0.067
}
],
"logging_steps": 1.0,
"max_steps": 389,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4102482311698186e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}