{
"best_metric": 2.559772253036499,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.23802439750074383,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011901219875037191,
"grad_norm": 0.7740907073020935,
"learning_rate": 1.0100000000000002e-05,
"loss": 2.8831,
"step": 1
},
{
"epoch": 0.0011901219875037191,
"eval_loss": 3.97263765335083,
"eval_runtime": 212.008,
"eval_samples_per_second": 6.674,
"eval_steps_per_second": 1.67,
"step": 1
},
{
"epoch": 0.0023802439750074383,
"grad_norm": 0.8903758525848389,
"learning_rate": 2.0200000000000003e-05,
"loss": 2.9817,
"step": 2
},
{
"epoch": 0.0035703659625111574,
"grad_norm": 1.0427348613739014,
"learning_rate": 3.0299999999999998e-05,
"loss": 3.1211,
"step": 3
},
{
"epoch": 0.0047604879500148765,
"grad_norm": 1.058449387550354,
"learning_rate": 4.0400000000000006e-05,
"loss": 3.1337,
"step": 4
},
{
"epoch": 0.005950609937518596,
"grad_norm": 1.0233001708984375,
"learning_rate": 5.05e-05,
"loss": 3.1145,
"step": 5
},
{
"epoch": 0.007140731925022315,
"grad_norm": 1.0409022569656372,
"learning_rate": 6.0599999999999996e-05,
"loss": 3.0859,
"step": 6
},
{
"epoch": 0.008330853912526033,
"grad_norm": 1.2061635255813599,
"learning_rate": 7.07e-05,
"loss": 3.042,
"step": 7
},
{
"epoch": 0.009520975900029753,
"grad_norm": 1.5559790134429932,
"learning_rate": 8.080000000000001e-05,
"loss": 3.0282,
"step": 8
},
{
"epoch": 0.010711097887533471,
"grad_norm": 1.5155705213546753,
"learning_rate": 9.09e-05,
"loss": 2.9623,
"step": 9
},
{
"epoch": 0.011901219875037191,
"grad_norm": 1.4185665845870972,
"learning_rate": 0.000101,
"loss": 2.8997,
"step": 10
},
{
"epoch": 0.01309134186254091,
"grad_norm": 1.2759884595870972,
"learning_rate": 0.00010046842105263158,
"loss": 3.0715,
"step": 11
},
{
"epoch": 0.01428146385004463,
"grad_norm": 1.3131742477416992,
"learning_rate": 9.993684210526315e-05,
"loss": 3.1495,
"step": 12
},
{
"epoch": 0.015471585837548348,
"grad_norm": 1.293761968612671,
"learning_rate": 9.940526315789473e-05,
"loss": 2.9229,
"step": 13
},
{
"epoch": 0.016661707825052066,
"grad_norm": 1.2810025215148926,
"learning_rate": 9.887368421052632e-05,
"loss": 3.0321,
"step": 14
},
{
"epoch": 0.017851829812555786,
"grad_norm": 1.2333396673202515,
"learning_rate": 9.83421052631579e-05,
"loss": 2.9467,
"step": 15
},
{
"epoch": 0.019041951800059506,
"grad_norm": 1.2278225421905518,
"learning_rate": 9.781052631578948e-05,
"loss": 2.9293,
"step": 16
},
{
"epoch": 0.020232073787563226,
"grad_norm": 1.2646679878234863,
"learning_rate": 9.727894736842106e-05,
"loss": 2.9282,
"step": 17
},
{
"epoch": 0.021422195775066943,
"grad_norm": 1.4180850982666016,
"learning_rate": 9.674736842105263e-05,
"loss": 2.9221,
"step": 18
},
{
"epoch": 0.022612317762570663,
"grad_norm": 1.4099845886230469,
"learning_rate": 9.621578947368421e-05,
"loss": 2.9182,
"step": 19
},
{
"epoch": 0.023802439750074383,
"grad_norm": 1.5067027807235718,
"learning_rate": 9.568421052631578e-05,
"loss": 2.8319,
"step": 20
},
{
"epoch": 0.024992561737578103,
"grad_norm": 1.4886541366577148,
"learning_rate": 9.515263157894737e-05,
"loss": 2.992,
"step": 21
},
{
"epoch": 0.02618268372508182,
"grad_norm": 1.6468743085861206,
"learning_rate": 9.462105263157895e-05,
"loss": 3.0023,
"step": 22
},
{
"epoch": 0.02737280571258554,
"grad_norm": 1.5920535326004028,
"learning_rate": 9.408947368421054e-05,
"loss": 2.7943,
"step": 23
},
{
"epoch": 0.02856292770008926,
"grad_norm": 1.651477336883545,
"learning_rate": 9.355789473684211e-05,
"loss": 3.0897,
"step": 24
},
{
"epoch": 0.02975304968759298,
"grad_norm": 1.6968199014663696,
"learning_rate": 9.302631578947369e-05,
"loss": 2.9393,
"step": 25
},
{
"epoch": 0.030943171675096696,
"grad_norm": 1.7623414993286133,
"learning_rate": 9.249473684210526e-05,
"loss": 3.0614,
"step": 26
},
{
"epoch": 0.03213329366260042,
"grad_norm": 1.6790002584457397,
"learning_rate": 9.196315789473685e-05,
"loss": 2.8908,
"step": 27
},
{
"epoch": 0.03332341565010413,
"grad_norm": 1.7653381824493408,
"learning_rate": 9.143157894736843e-05,
"loss": 2.9874,
"step": 28
},
{
"epoch": 0.03451353763760785,
"grad_norm": 1.9056634902954102,
"learning_rate": 9.09e-05,
"loss": 2.9435,
"step": 29
},
{
"epoch": 0.03570365962511157,
"grad_norm": 1.9138984680175781,
"learning_rate": 9.036842105263158e-05,
"loss": 2.9898,
"step": 30
},
{
"epoch": 0.03689378161261529,
"grad_norm": 2.077247142791748,
"learning_rate": 8.983684210526316e-05,
"loss": 3.0406,
"step": 31
},
{
"epoch": 0.03808390360011901,
"grad_norm": 2.173475980758667,
"learning_rate": 8.930526315789474e-05,
"loss": 3.1873,
"step": 32
},
{
"epoch": 0.03927402558762273,
"grad_norm": 2.2418313026428223,
"learning_rate": 8.877368421052632e-05,
"loss": 3.0145,
"step": 33
},
{
"epoch": 0.04046414757512645,
"grad_norm": 2.7017998695373535,
"learning_rate": 8.82421052631579e-05,
"loss": 3.0392,
"step": 34
},
{
"epoch": 0.04165426956263017,
"grad_norm": 2.644977569580078,
"learning_rate": 8.771052631578948e-05,
"loss": 3.065,
"step": 35
},
{
"epoch": 0.042844391550133885,
"grad_norm": 2.72674822807312,
"learning_rate": 8.717894736842105e-05,
"loss": 3.0143,
"step": 36
},
{
"epoch": 0.044034513537637605,
"grad_norm": 2.9149155616760254,
"learning_rate": 8.664736842105263e-05,
"loss": 3.0145,
"step": 37
},
{
"epoch": 0.045224635525141325,
"grad_norm": 2.7656924724578857,
"learning_rate": 8.61157894736842e-05,
"loss": 2.678,
"step": 38
},
{
"epoch": 0.046414757512645045,
"grad_norm": 3.271090507507324,
"learning_rate": 8.55842105263158e-05,
"loss": 2.7121,
"step": 39
},
{
"epoch": 0.047604879500148765,
"grad_norm": 3.187629461288452,
"learning_rate": 8.505263157894737e-05,
"loss": 2.7628,
"step": 40
},
{
"epoch": 0.048795001487652485,
"grad_norm": 3.188955783843994,
"learning_rate": 8.452105263157896e-05,
"loss": 2.3208,
"step": 41
},
{
"epoch": 0.049985123475156205,
"grad_norm": 2.877542734146118,
"learning_rate": 8.398947368421053e-05,
"loss": 2.3759,
"step": 42
},
{
"epoch": 0.051175245462659925,
"grad_norm": 3.292560338973999,
"learning_rate": 8.345789473684211e-05,
"loss": 2.3581,
"step": 43
},
{
"epoch": 0.05236536745016364,
"grad_norm": 3.187638282775879,
"learning_rate": 8.292631578947368e-05,
"loss": 2.5877,
"step": 44
},
{
"epoch": 0.05355548943766736,
"grad_norm": 3.735719680786133,
"learning_rate": 8.239473684210526e-05,
"loss": 2.6599,
"step": 45
},
{
"epoch": 0.05474561142517108,
"grad_norm": 4.3021697998046875,
"learning_rate": 8.186315789473683e-05,
"loss": 2.5517,
"step": 46
},
{
"epoch": 0.0559357334126748,
"grad_norm": 3.9108691215515137,
"learning_rate": 8.133157894736842e-05,
"loss": 2.8813,
"step": 47
},
{
"epoch": 0.05712585540017852,
"grad_norm": 3.6961636543273926,
"learning_rate": 8.080000000000001e-05,
"loss": 2.5407,
"step": 48
},
{
"epoch": 0.05831597738768224,
"grad_norm": 3.648516893386841,
"learning_rate": 8.026842105263159e-05,
"loss": 2.2375,
"step": 49
},
{
"epoch": 0.05950609937518596,
"grad_norm": 5.250331878662109,
"learning_rate": 7.973684210526316e-05,
"loss": 2.8475,
"step": 50
},
{
"epoch": 0.05950609937518596,
"eval_loss": 3.4745068550109863,
"eval_runtime": 160.7821,
"eval_samples_per_second": 8.801,
"eval_steps_per_second": 2.202,
"step": 50
},
{
"epoch": 0.06069622136268968,
"grad_norm": 5.582263469696045,
"learning_rate": 7.920526315789474e-05,
"loss": 3.3864,
"step": 51
},
{
"epoch": 0.06188634335019339,
"grad_norm": 3.670656442642212,
"learning_rate": 7.867368421052631e-05,
"loss": 3.1908,
"step": 52
},
{
"epoch": 0.06307646533769712,
"grad_norm": 2.174717426300049,
"learning_rate": 7.814210526315789e-05,
"loss": 3.1241,
"step": 53
},
{
"epoch": 0.06426658732520084,
"grad_norm": 1.5080410242080688,
"learning_rate": 7.761052631578946e-05,
"loss": 3.0446,
"step": 54
},
{
"epoch": 0.06545670931270456,
"grad_norm": 1.178946614265442,
"learning_rate": 7.707894736842105e-05,
"loss": 2.8995,
"step": 55
},
{
"epoch": 0.06664683130020826,
"grad_norm": 1.1536166667938232,
"learning_rate": 7.654736842105264e-05,
"loss": 2.8091,
"step": 56
},
{
"epoch": 0.06783695328771198,
"grad_norm": 1.0446966886520386,
"learning_rate": 7.601578947368422e-05,
"loss": 2.8239,
"step": 57
},
{
"epoch": 0.0690270752752157,
"grad_norm": 0.9518328905105591,
"learning_rate": 7.548421052631579e-05,
"loss": 2.7598,
"step": 58
},
{
"epoch": 0.07021719726271942,
"grad_norm": 1.0942273139953613,
"learning_rate": 7.495263157894737e-05,
"loss": 2.7408,
"step": 59
},
{
"epoch": 0.07140731925022314,
"grad_norm": 1.1928379535675049,
"learning_rate": 7.442105263157894e-05,
"loss": 2.7216,
"step": 60
},
{
"epoch": 0.07259744123772686,
"grad_norm": 1.1087335348129272,
"learning_rate": 7.388947368421053e-05,
"loss": 2.8022,
"step": 61
},
{
"epoch": 0.07378756322523058,
"grad_norm": 1.1715253591537476,
"learning_rate": 7.335789473684211e-05,
"loss": 2.8765,
"step": 62
},
{
"epoch": 0.0749776852127343,
"grad_norm": 1.0879360437393188,
"learning_rate": 7.282631578947368e-05,
"loss": 2.5474,
"step": 63
},
{
"epoch": 0.07616780720023802,
"grad_norm": 1.0960795879364014,
"learning_rate": 7.229473684210527e-05,
"loss": 2.7884,
"step": 64
},
{
"epoch": 0.07735792918774174,
"grad_norm": 1.1867204904556274,
"learning_rate": 7.176315789473685e-05,
"loss": 2.8801,
"step": 65
},
{
"epoch": 0.07854805117524546,
"grad_norm": 1.1470366716384888,
"learning_rate": 7.123157894736842e-05,
"loss": 2.8809,
"step": 66
},
{
"epoch": 0.07973817316274918,
"grad_norm": 1.210035800933838,
"learning_rate": 7.07e-05,
"loss": 2.7099,
"step": 67
},
{
"epoch": 0.0809282951502529,
"grad_norm": 1.1540971994400024,
"learning_rate": 7.016842105263159e-05,
"loss": 2.7081,
"step": 68
},
{
"epoch": 0.08211841713775662,
"grad_norm": 1.1911342144012451,
"learning_rate": 6.963684210526316e-05,
"loss": 2.676,
"step": 69
},
{
"epoch": 0.08330853912526034,
"grad_norm": 1.2271251678466797,
"learning_rate": 6.910526315789474e-05,
"loss": 2.7204,
"step": 70
},
{
"epoch": 0.08449866111276406,
"grad_norm": 1.435076117515564,
"learning_rate": 6.857368421052631e-05,
"loss": 2.7106,
"step": 71
},
{
"epoch": 0.08568878310026777,
"grad_norm": 1.3325750827789307,
"learning_rate": 6.80421052631579e-05,
"loss": 2.7424,
"step": 72
},
{
"epoch": 0.08687890508777149,
"grad_norm": 1.4230831861495972,
"learning_rate": 6.751052631578948e-05,
"loss": 2.952,
"step": 73
},
{
"epoch": 0.08806902707527521,
"grad_norm": 1.4959286451339722,
"learning_rate": 6.697894736842105e-05,
"loss": 2.8185,
"step": 74
},
{
"epoch": 0.08925914906277893,
"grad_norm": 1.5184545516967773,
"learning_rate": 6.644736842105264e-05,
"loss": 2.7411,
"step": 75
},
{
"epoch": 0.09044927105028265,
"grad_norm": 1.5939208269119263,
"learning_rate": 6.591578947368422e-05,
"loss": 2.8724,
"step": 76
},
{
"epoch": 0.09163939303778637,
"grad_norm": 1.5517206192016602,
"learning_rate": 6.538421052631579e-05,
"loss": 2.9149,
"step": 77
},
{
"epoch": 0.09282951502529009,
"grad_norm": 1.5986747741699219,
"learning_rate": 6.485263157894737e-05,
"loss": 2.7335,
"step": 78
},
{
"epoch": 0.09401963701279381,
"grad_norm": 1.9907118082046509,
"learning_rate": 6.432105263157894e-05,
"loss": 3.0217,
"step": 79
},
{
"epoch": 0.09520975900029753,
"grad_norm": 2.0418686866760254,
"learning_rate": 6.378947368421053e-05,
"loss": 2.8225,
"step": 80
},
{
"epoch": 0.09639988098780125,
"grad_norm": 2.0640804767608643,
"learning_rate": 6.32578947368421e-05,
"loss": 3.1019,
"step": 81
},
{
"epoch": 0.09759000297530497,
"grad_norm": 2.187643527984619,
"learning_rate": 6.27263157894737e-05,
"loss": 2.915,
"step": 82
},
{
"epoch": 0.09878012496280869,
"grad_norm": 2.249582052230835,
"learning_rate": 6.219473684210527e-05,
"loss": 3.0026,
"step": 83
},
{
"epoch": 0.09997024695031241,
"grad_norm": 2.528813362121582,
"learning_rate": 6.166315789473685e-05,
"loss": 2.777,
"step": 84
},
{
"epoch": 0.10116036893781613,
"grad_norm": 2.0678341388702393,
"learning_rate": 6.113157894736842e-05,
"loss": 2.7042,
"step": 85
},
{
"epoch": 0.10235049092531985,
"grad_norm": 2.3720791339874268,
"learning_rate": 6.0599999999999996e-05,
"loss": 2.7912,
"step": 86
},
{
"epoch": 0.10354061291282357,
"grad_norm": 2.4685397148132324,
"learning_rate": 6.006842105263158e-05,
"loss": 2.8001,
"step": 87
},
{
"epoch": 0.10473073490032728,
"grad_norm": 2.406266927719116,
"learning_rate": 5.953684210526315e-05,
"loss": 2.558,
"step": 88
},
{
"epoch": 0.105920856887831,
"grad_norm": 2.5169339179992676,
"learning_rate": 5.900526315789474e-05,
"loss": 2.2928,
"step": 89
},
{
"epoch": 0.10711097887533472,
"grad_norm": 2.6541452407836914,
"learning_rate": 5.847368421052632e-05,
"loss": 2.7278,
"step": 90
},
{
"epoch": 0.10830110086283844,
"grad_norm": 2.8647027015686035,
"learning_rate": 5.79421052631579e-05,
"loss": 2.635,
"step": 91
},
{
"epoch": 0.10949122285034216,
"grad_norm": 3.1823761463165283,
"learning_rate": 5.7410526315789475e-05,
"loss": 2.2292,
"step": 92
},
{
"epoch": 0.11068134483784588,
"grad_norm": 3.207031726837158,
"learning_rate": 5.687894736842105e-05,
"loss": 2.7533,
"step": 93
},
{
"epoch": 0.1118714668253496,
"grad_norm": 3.163825273513794,
"learning_rate": 5.6347368421052625e-05,
"loss": 2.5126,
"step": 94
},
{
"epoch": 0.11306158881285332,
"grad_norm": 3.2235989570617676,
"learning_rate": 5.5815789473684214e-05,
"loss": 2.5196,
"step": 95
},
{
"epoch": 0.11425171080035704,
"grad_norm": 4.001104831695557,
"learning_rate": 5.5284210526315796e-05,
"loss": 3.0249,
"step": 96
},
{
"epoch": 0.11544183278786076,
"grad_norm": 3.1947779655456543,
"learning_rate": 5.475263157894737e-05,
"loss": 2.0786,
"step": 97
},
{
"epoch": 0.11663195477536448,
"grad_norm": 3.7150704860687256,
"learning_rate": 5.422105263157895e-05,
"loss": 2.1846,
"step": 98
},
{
"epoch": 0.1178220767628682,
"grad_norm": 3.942005157470703,
"learning_rate": 5.368947368421053e-05,
"loss": 2.2755,
"step": 99
},
{
"epoch": 0.11901219875037192,
"grad_norm": 8.126349449157715,
"learning_rate": 5.3157894736842104e-05,
"loss": 2.4846,
"step": 100
},
{
"epoch": 0.11901219875037192,
"eval_loss": 3.5016069412231445,
"eval_runtime": 160.8549,
"eval_samples_per_second": 8.797,
"eval_steps_per_second": 2.201,
"step": 100
},
{
"epoch": 0.12020232073787564,
"grad_norm": 8.165000915527344,
"learning_rate": 5.262631578947368e-05,
"loss": 3.5609,
"step": 101
},
{
"epoch": 0.12139244272537936,
"grad_norm": 6.8532938957214355,
"learning_rate": 5.209473684210527e-05,
"loss": 3.6081,
"step": 102
},
{
"epoch": 0.12258256471288308,
"grad_norm": 4.252460479736328,
"learning_rate": 5.1563157894736844e-05,
"loss": 3.2864,
"step": 103
},
{
"epoch": 0.12377268670038678,
"grad_norm": 2.2745885848999023,
"learning_rate": 5.1031578947368426e-05,
"loss": 3.0608,
"step": 104
},
{
"epoch": 0.1249628086878905,
"grad_norm": 1.3418879508972168,
"learning_rate": 5.05e-05,
"loss": 2.8344,
"step": 105
},
{
"epoch": 0.12615293067539424,
"grad_norm": 1.0786305665969849,
"learning_rate": 4.9968421052631576e-05,
"loss": 2.8097,
"step": 106
},
{
"epoch": 0.12734305266289794,
"grad_norm": 1.0196248292922974,
"learning_rate": 4.943684210526316e-05,
"loss": 2.7265,
"step": 107
},
{
"epoch": 0.12853317465040168,
"grad_norm": 0.9965652823448181,
"learning_rate": 4.890526315789474e-05,
"loss": 2.785,
"step": 108
},
{
"epoch": 0.12972329663790538,
"grad_norm": 0.9790583252906799,
"learning_rate": 4.8373684210526316e-05,
"loss": 2.704,
"step": 109
},
{
"epoch": 0.13091341862540912,
"grad_norm": 1.0119240283966064,
"learning_rate": 4.784210526315789e-05,
"loss": 2.7151,
"step": 110
},
{
"epoch": 0.13210354061291282,
"grad_norm": 0.9607682228088379,
"learning_rate": 4.731052631578947e-05,
"loss": 2.6745,
"step": 111
},
{
"epoch": 0.13329366260041653,
"grad_norm": 1.0079097747802734,
"learning_rate": 4.6778947368421055e-05,
"loss": 2.6822,
"step": 112
},
{
"epoch": 0.13448378458792026,
"grad_norm": 1.1215417385101318,
"learning_rate": 4.624736842105263e-05,
"loss": 2.6709,
"step": 113
},
{
"epoch": 0.13567390657542397,
"grad_norm": 1.1396487951278687,
"learning_rate": 4.571578947368421e-05,
"loss": 2.7219,
"step": 114
},
{
"epoch": 0.1368640285629277,
"grad_norm": 1.1168203353881836,
"learning_rate": 4.518421052631579e-05,
"loss": 2.6713,
"step": 115
},
{
"epoch": 0.1380541505504314,
"grad_norm": 1.1319602727890015,
"learning_rate": 4.465263157894737e-05,
"loss": 2.7036,
"step": 116
},
{
"epoch": 0.13924427253793514,
"grad_norm": 1.2012885808944702,
"learning_rate": 4.412105263157895e-05,
"loss": 2.7752,
"step": 117
},
{
"epoch": 0.14043439452543885,
"grad_norm": 1.2033405303955078,
"learning_rate": 4.358947368421053e-05,
"loss": 2.7599,
"step": 118
},
{
"epoch": 0.14162451651294258,
"grad_norm": 1.1886316537857056,
"learning_rate": 4.30578947368421e-05,
"loss": 2.6826,
"step": 119
},
{
"epoch": 0.1428146385004463,
"grad_norm": 1.200430154800415,
"learning_rate": 4.2526315789473685e-05,
"loss": 2.7017,
"step": 120
},
{
"epoch": 0.14400476048795002,
"grad_norm": 1.2769813537597656,
"learning_rate": 4.199473684210527e-05,
"loss": 2.7329,
"step": 121
},
{
"epoch": 0.14519488247545373,
"grad_norm": 1.3486050367355347,
"learning_rate": 4.146315789473684e-05,
"loss": 2.5735,
"step": 122
},
{
"epoch": 0.14638500446295746,
"grad_norm": 1.413003921508789,
"learning_rate": 4.093157894736842e-05,
"loss": 2.7845,
"step": 123
},
{
"epoch": 0.14757512645046117,
"grad_norm": 1.3913912773132324,
"learning_rate": 4.0400000000000006e-05,
"loss": 2.6509,
"step": 124
},
{
"epoch": 0.1487652484379649,
"grad_norm": 1.4366058111190796,
"learning_rate": 3.986842105263158e-05,
"loss": 2.6653,
"step": 125
},
{
"epoch": 0.1499553704254686,
"grad_norm": 1.4925942420959473,
"learning_rate": 3.933684210526316e-05,
"loss": 2.6635,
"step": 126
},
{
"epoch": 0.15114549241297234,
"grad_norm": 1.6500319242477417,
"learning_rate": 3.880526315789473e-05,
"loss": 2.674,
"step": 127
},
{
"epoch": 0.15233561440047605,
"grad_norm": 1.5842981338500977,
"learning_rate": 3.827368421052632e-05,
"loss": 2.7202,
"step": 128
},
{
"epoch": 0.15352573638797976,
"grad_norm": 1.6864389181137085,
"learning_rate": 3.7742105263157896e-05,
"loss": 2.8063,
"step": 129
},
{
"epoch": 0.1547158583754835,
"grad_norm": 1.876000165939331,
"learning_rate": 3.721052631578947e-05,
"loss": 2.644,
"step": 130
},
{
"epoch": 0.1559059803629872,
"grad_norm": 1.9258829355239868,
"learning_rate": 3.6678947368421054e-05,
"loss": 2.8121,
"step": 131
},
{
"epoch": 0.15709610235049093,
"grad_norm": 1.9644100666046143,
"learning_rate": 3.6147368421052636e-05,
"loss": 2.811,
"step": 132
},
{
"epoch": 0.15828622433799464,
"grad_norm": 2.027679681777954,
"learning_rate": 3.561578947368421e-05,
"loss": 2.6938,
"step": 133
},
{
"epoch": 0.15947634632549837,
"grad_norm": 2.0536186695098877,
"learning_rate": 3.508421052631579e-05,
"loss": 2.5575,
"step": 134
},
{
"epoch": 0.16066646831300208,
"grad_norm": 2.3553948402404785,
"learning_rate": 3.455263157894737e-05,
"loss": 2.8297,
"step": 135
},
{
"epoch": 0.1618565903005058,
"grad_norm": 2.237311601638794,
"learning_rate": 3.402105263157895e-05,
"loss": 2.6245,
"step": 136
},
{
"epoch": 0.16304671228800952,
"grad_norm": 2.386514663696289,
"learning_rate": 3.3489473684210526e-05,
"loss": 2.5633,
"step": 137
},
{
"epoch": 0.16423683427551325,
"grad_norm": 2.3817429542541504,
"learning_rate": 3.295789473684211e-05,
"loss": 2.3802,
"step": 138
},
{
"epoch": 0.16542695626301696,
"grad_norm": 2.4430129528045654,
"learning_rate": 3.242631578947368e-05,
"loss": 2.6482,
"step": 139
},
{
"epoch": 0.1666170782505207,
"grad_norm": 2.4865427017211914,
"learning_rate": 3.1894736842105265e-05,
"loss": 1.8909,
"step": 140
},
{
"epoch": 0.1678072002380244,
"grad_norm": 3.3364109992980957,
"learning_rate": 3.136315789473685e-05,
"loss": 2.3072,
"step": 141
},
{
"epoch": 0.16899732222552813,
"grad_norm": 3.3114304542541504,
"learning_rate": 3.083157894736842e-05,
"loss": 2.7235,
"step": 142
},
{
"epoch": 0.17018744421303184,
"grad_norm": 3.0344221591949463,
"learning_rate": 3.0299999999999998e-05,
"loss": 2.2445,
"step": 143
},
{
"epoch": 0.17137756620053554,
"grad_norm": 2.9184038639068604,
"learning_rate": 2.9768421052631577e-05,
"loss": 2.1895,
"step": 144
},
{
"epoch": 0.17256768818803928,
"grad_norm": 3.6383919715881348,
"learning_rate": 2.923684210526316e-05,
"loss": 2.4532,
"step": 145
},
{
"epoch": 0.17375781017554298,
"grad_norm": 3.0095598697662354,
"learning_rate": 2.8705263157894737e-05,
"loss": 2.0861,
"step": 146
},
{
"epoch": 0.17494793216304672,
"grad_norm": 3.4419445991516113,
"learning_rate": 2.8173684210526313e-05,
"loss": 2.3893,
"step": 147
},
{
"epoch": 0.17613805415055042,
"grad_norm": 4.293227195739746,
"learning_rate": 2.7642105263157898e-05,
"loss": 2.0973,
"step": 148
},
{
"epoch": 0.17732817613805416,
"grad_norm": 4.744500637054443,
"learning_rate": 2.7110526315789473e-05,
"loss": 2.3532,
"step": 149
},
{
"epoch": 0.17851829812555786,
"grad_norm": 6.184078216552734,
"learning_rate": 2.6578947368421052e-05,
"loss": 2.7711,
"step": 150
},
{
"epoch": 0.17851829812555786,
"eval_loss": 2.812873125076294,
"eval_runtime": 160.8573,
"eval_samples_per_second": 8.797,
"eval_steps_per_second": 2.201,
"step": 150
},
{
"epoch": 0.1797084201130616,
"grad_norm": 2.1665163040161133,
"learning_rate": 2.6047368421052634e-05,
"loss": 2.9363,
"step": 151
},
{
"epoch": 0.1808985421005653,
"grad_norm": 2.231947422027588,
"learning_rate": 2.5515789473684213e-05,
"loss": 2.8189,
"step": 152
},
{
"epoch": 0.18208866408806904,
"grad_norm": 1.9498807191848755,
"learning_rate": 2.4984210526315788e-05,
"loss": 2.8967,
"step": 153
},
{
"epoch": 0.18327878607557274,
"grad_norm": 1.6646301746368408,
"learning_rate": 2.445263157894737e-05,
"loss": 2.9605,
"step": 154
},
{
"epoch": 0.18446890806307648,
"grad_norm": 1.2766884565353394,
"learning_rate": 2.3921052631578946e-05,
"loss": 2.7265,
"step": 155
},
{
"epoch": 0.18565903005058018,
"grad_norm": 1.0804224014282227,
"learning_rate": 2.3389473684210528e-05,
"loss": 2.7534,
"step": 156
},
{
"epoch": 0.18684915203808392,
"grad_norm": 0.9548969268798828,
"learning_rate": 2.2857894736842106e-05,
"loss": 2.7941,
"step": 157
},
{
"epoch": 0.18803927402558762,
"grad_norm": 0.8820357918739319,
"learning_rate": 2.2326315789473685e-05,
"loss": 2.5207,
"step": 158
},
{
"epoch": 0.18922939601309136,
"grad_norm": 0.9637076258659363,
"learning_rate": 2.1794736842105264e-05,
"loss": 2.5984,
"step": 159
},
{
"epoch": 0.19041951800059506,
"grad_norm": 0.9365648627281189,
"learning_rate": 2.1263157894736842e-05,
"loss": 2.6507,
"step": 160
},
{
"epoch": 0.19160963998809877,
"grad_norm": 0.9047538638114929,
"learning_rate": 2.073157894736842e-05,
"loss": 2.5624,
"step": 161
},
{
"epoch": 0.1927997619756025,
"grad_norm": 0.9913797974586487,
"learning_rate": 2.0200000000000003e-05,
"loss": 2.7407,
"step": 162
},
{
"epoch": 0.1939898839631062,
"grad_norm": 0.9947323203086853,
"learning_rate": 1.966842105263158e-05,
"loss": 2.6398,
"step": 163
},
{
"epoch": 0.19518000595060994,
"grad_norm": 0.9551875591278076,
"learning_rate": 1.913684210526316e-05,
"loss": 2.6093,
"step": 164
},
{
"epoch": 0.19637012793811365,
"grad_norm": 0.9988086819648743,
"learning_rate": 1.8605263157894736e-05,
"loss": 2.5585,
"step": 165
},
{
"epoch": 0.19756024992561738,
"grad_norm": 1.087716817855835,
"learning_rate": 1.8073684210526318e-05,
"loss": 2.6282,
"step": 166
},
{
"epoch": 0.1987503719131211,
"grad_norm": 1.0601743459701538,
"learning_rate": 1.7542105263157897e-05,
"loss": 2.6258,
"step": 167
},
{
"epoch": 0.19994049390062482,
"grad_norm": 1.1024737358093262,
"learning_rate": 1.7010526315789475e-05,
"loss": 2.5256,
"step": 168
},
{
"epoch": 0.20113061588812853,
"grad_norm": 1.1294111013412476,
"learning_rate": 1.6478947368421054e-05,
"loss": 2.6306,
"step": 169
},
{
"epoch": 0.20232073787563226,
"grad_norm": 1.1903879642486572,
"learning_rate": 1.5947368421052633e-05,
"loss": 2.6052,
"step": 170
},
{
"epoch": 0.20351085986313597,
"grad_norm": 1.253252387046814,
"learning_rate": 1.541578947368421e-05,
"loss": 2.7537,
"step": 171
},
{
"epoch": 0.2047009818506397,
"grad_norm": 1.3783352375030518,
"learning_rate": 1.4884210526315788e-05,
"loss": 2.5608,
"step": 172
},
{
"epoch": 0.2058911038381434,
"grad_norm": 1.3314725160598755,
"learning_rate": 1.4352631578947369e-05,
"loss": 2.6971,
"step": 173
},
{
"epoch": 0.20708122582564714,
"grad_norm": 1.3991272449493408,
"learning_rate": 1.3821052631578949e-05,
"loss": 2.6963,
"step": 174
},
{
"epoch": 0.20827134781315085,
"grad_norm": 1.5228500366210938,
"learning_rate": 1.3289473684210526e-05,
"loss": 2.5793,
"step": 175
},
{
"epoch": 0.20946146980065455,
"grad_norm": 1.4984205961227417,
"learning_rate": 1.2757894736842106e-05,
"loss": 2.666,
"step": 176
},
{
"epoch": 0.2106515917881583,
"grad_norm": 1.7694042921066284,
"learning_rate": 1.2226315789473685e-05,
"loss": 2.8852,
"step": 177
},
{
"epoch": 0.211841713775662,
"grad_norm": 1.8036147356033325,
"learning_rate": 1.1694736842105264e-05,
"loss": 2.7626,
"step": 178
},
{
"epoch": 0.21303183576316573,
"grad_norm": 1.7980536222457886,
"learning_rate": 1.1163157894736842e-05,
"loss": 2.7129,
"step": 179
},
{
"epoch": 0.21422195775066943,
"grad_norm": 2.07534122467041,
"learning_rate": 1.0631578947368421e-05,
"loss": 2.9602,
"step": 180
},
{
"epoch": 0.21541207973817317,
"grad_norm": 2.0630900859832764,
"learning_rate": 1.0100000000000002e-05,
"loss": 2.765,
"step": 181
},
{
"epoch": 0.21660220172567687,
"grad_norm": 2.077697992324829,
"learning_rate": 9.56842105263158e-06,
"loss": 2.5767,
"step": 182
},
{
"epoch": 0.2177923237131806,
"grad_norm": 2.0414209365844727,
"learning_rate": 9.036842105263159e-06,
"loss": 2.7381,
"step": 183
},
{
"epoch": 0.21898244570068431,
"grad_norm": 2.3121683597564697,
"learning_rate": 8.505263157894738e-06,
"loss": 2.6139,
"step": 184
},
{
"epoch": 0.22017256768818805,
"grad_norm": 2.3920252323150635,
"learning_rate": 7.973684210526316e-06,
"loss": 2.8152,
"step": 185
},
{
"epoch": 0.22136268967569175,
"grad_norm": 2.3734066486358643,
"learning_rate": 7.442105263157894e-06,
"loss": 2.3302,
"step": 186
},
{
"epoch": 0.2225528116631955,
"grad_norm": 2.291586399078369,
"learning_rate": 6.9105263157894745e-06,
"loss": 2.7101,
"step": 187
},
{
"epoch": 0.2237429336506992,
"grad_norm": 2.3693385124206543,
"learning_rate": 6.378947368421053e-06,
"loss": 2.232,
"step": 188
},
{
"epoch": 0.22493305563820293,
"grad_norm": 3.0694398880004883,
"learning_rate": 5.847368421052632e-06,
"loss": 2.2262,
"step": 189
},
{
"epoch": 0.22612317762570663,
"grad_norm": 2.5530786514282227,
"learning_rate": 5.315789473684211e-06,
"loss": 2.2363,
"step": 190
},
{
"epoch": 0.22731329961321037,
"grad_norm": 2.73111629486084,
"learning_rate": 4.78421052631579e-06,
"loss": 2.3876,
"step": 191
},
{
"epoch": 0.22850342160071407,
"grad_norm": 2.807893753051758,
"learning_rate": 4.252631578947369e-06,
"loss": 2.261,
"step": 192
},
{
"epoch": 0.22969354358821778,
"grad_norm": 2.7763075828552246,
"learning_rate": 3.721052631578947e-06,
"loss": 2.0528,
"step": 193
},
{
"epoch": 0.23088366557572151,
"grad_norm": 3.2379202842712402,
"learning_rate": 3.1894736842105266e-06,
"loss": 2.5698,
"step": 194
},
{
"epoch": 0.23207378756322522,
"grad_norm": 4.352906227111816,
"learning_rate": 2.6578947368421053e-06,
"loss": 2.8109,
"step": 195
},
{
"epoch": 0.23326390955072895,
"grad_norm": 3.5866363048553467,
"learning_rate": 2.1263157894736844e-06,
"loss": 2.2815,
"step": 196
},
{
"epoch": 0.23445403153823266,
"grad_norm": 4.414037227630615,
"learning_rate": 1.5947368421052633e-06,
"loss": 2.4135,
"step": 197
},
{
"epoch": 0.2356441535257364,
"grad_norm": 4.259603500366211,
"learning_rate": 1.0631578947368422e-06,
"loss": 2.0559,
"step": 198
},
{
"epoch": 0.2368342755132401,
"grad_norm": 4.946102619171143,
"learning_rate": 5.315789473684211e-07,
"loss": 1.8129,
"step": 199
},
{
"epoch": 0.23802439750074383,
"grad_norm": 7.032172203063965,
"learning_rate": 0.0,
"loss": 3.0412,
"step": 200
},
{
"epoch": 0.23802439750074383,
"eval_loss": 2.559772253036499,
"eval_runtime": 160.6933,
"eval_samples_per_second": 8.806,
"eval_steps_per_second": 2.203,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.0321122932791706e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}