dada22231's picture
Training in progress, step 95, checkpoint
3d2992a verified
{
"best_metric": 0.21727269887924194,
"best_model_checkpoint": "miner_id_24/checkpoint-75",
"epoch": 0.5764125900644672,
"eval_steps": 25,
"global_step": 95,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006067500948047023,
"grad_norm": 2.3723528385162354,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.6663,
"step": 1
},
{
"epoch": 0.006067500948047023,
"eval_loss": 0.8734264373779297,
"eval_runtime": 3.9768,
"eval_samples_per_second": 12.573,
"eval_steps_per_second": 3.269,
"step": 1
},
{
"epoch": 0.012135001896094046,
"grad_norm": 1.9676845073699951,
"learning_rate": 6.666666666666667e-05,
"loss": 0.684,
"step": 2
},
{
"epoch": 0.01820250284414107,
"grad_norm": 1.108001947402954,
"learning_rate": 0.0001,
"loss": 0.5024,
"step": 3
},
{
"epoch": 0.02427000379218809,
"grad_norm": 1.2786028385162354,
"learning_rate": 9.997376600647783e-05,
"loss": 0.4912,
"step": 4
},
{
"epoch": 0.030337504740235114,
"grad_norm": 0.999189019203186,
"learning_rate": 9.989509461357426e-05,
"loss": 0.4644,
"step": 5
},
{
"epoch": 0.03640500568828214,
"grad_norm": 1.3803685903549194,
"learning_rate": 9.976407754861426e-05,
"loss": 0.456,
"step": 6
},
{
"epoch": 0.04247250663632916,
"grad_norm": 1.477198839187622,
"learning_rate": 9.958086757163489e-05,
"loss": 0.3871,
"step": 7
},
{
"epoch": 0.04854000758437618,
"grad_norm": 0.7586098313331604,
"learning_rate": 9.934567829727386e-05,
"loss": 0.4674,
"step": 8
},
{
"epoch": 0.05460750853242321,
"grad_norm": 0.9069706797599792,
"learning_rate": 9.905878394570453e-05,
"loss": 0.3627,
"step": 9
},
{
"epoch": 0.06067500948047023,
"grad_norm": 1.5008825063705444,
"learning_rate": 9.872051902290737e-05,
"loss": 0.3324,
"step": 10
},
{
"epoch": 0.06674251042851725,
"grad_norm": 1.6178452968597412,
"learning_rate": 9.833127793065098e-05,
"loss": 0.4529,
"step": 11
},
{
"epoch": 0.07281001137656427,
"grad_norm": 2.704998254776001,
"learning_rate": 9.789151450663723e-05,
"loss": 0.5521,
"step": 12
},
{
"epoch": 0.0788775123246113,
"grad_norm": 2.38661527633667,
"learning_rate": 9.740174149534693e-05,
"loss": 0.3018,
"step": 13
},
{
"epoch": 0.08494501327265833,
"grad_norm": 1.7349988222122192,
"learning_rate": 9.686252995020249e-05,
"loss": 0.3929,
"step": 14
},
{
"epoch": 0.09101251422070535,
"grad_norm": 1.261366605758667,
"learning_rate": 9.627450856774539e-05,
"loss": 0.3397,
"step": 15
},
{
"epoch": 0.09708001516875236,
"grad_norm": 0.6867349743843079,
"learning_rate": 9.563836295460398e-05,
"loss": 0.3019,
"step": 16
},
{
"epoch": 0.1031475161167994,
"grad_norm": 0.5942005515098572,
"learning_rate": 9.495483482810688e-05,
"loss": 0.3616,
"step": 17
},
{
"epoch": 0.10921501706484642,
"grad_norm": 0.5795318484306335,
"learning_rate": 9.422472115147382e-05,
"loss": 0.3124,
"step": 18
},
{
"epoch": 0.11528251801289344,
"grad_norm": 0.6129356622695923,
"learning_rate": 9.3448873204592e-05,
"loss": 0.3087,
"step": 19
},
{
"epoch": 0.12135001896094046,
"grad_norm": 0.5287042260169983,
"learning_rate": 9.2628195591462e-05,
"loss": 0.3619,
"step": 20
},
{
"epoch": 0.12741751990898748,
"grad_norm": 0.5691443085670471,
"learning_rate": 9.176364518546989e-05,
"loss": 0.3189,
"step": 21
},
{
"epoch": 0.1334850208570345,
"grad_norm": 0.6991756558418274,
"learning_rate": 9.08562300137157e-05,
"loss": 0.3265,
"step": 22
},
{
"epoch": 0.13955252180508154,
"grad_norm": 1.2463486194610596,
"learning_rate": 8.990700808169889e-05,
"loss": 0.3651,
"step": 23
},
{
"epoch": 0.14562002275312855,
"grad_norm": 1.522375464439392,
"learning_rate": 8.891708613973126e-05,
"loss": 0.4087,
"step": 24
},
{
"epoch": 0.15168752370117558,
"grad_norm": 1.229692816734314,
"learning_rate": 8.788761839251559e-05,
"loss": 0.2718,
"step": 25
},
{
"epoch": 0.15168752370117558,
"eval_loss": 0.2550545930862427,
"eval_runtime": 4.0733,
"eval_samples_per_second": 12.275,
"eval_steps_per_second": 3.192,
"step": 25
},
{
"epoch": 0.1577550246492226,
"grad_norm": 0.9358745217323303,
"learning_rate": 8.681980515339464e-05,
"loss": 0.2008,
"step": 26
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.5705291032791138,
"learning_rate": 8.571489144483944e-05,
"loss": 0.3452,
"step": 27
},
{
"epoch": 0.16989002654531665,
"grad_norm": 0.5917467474937439,
"learning_rate": 8.457416554680877e-05,
"loss": 0.2588,
"step": 28
},
{
"epoch": 0.17595752749336366,
"grad_norm": 0.6592450141906738,
"learning_rate": 8.339895749467238e-05,
"loss": 0.2964,
"step": 29
},
{
"epoch": 0.1820250284414107,
"grad_norm": 0.5917816758155823,
"learning_rate": 8.219063752844926e-05,
"loss": 0.3058,
"step": 30
},
{
"epoch": 0.18809252938945772,
"grad_norm": 0.5254703164100647,
"learning_rate": 8.095061449516903e-05,
"loss": 0.3066,
"step": 31
},
{
"epoch": 0.19416003033750473,
"grad_norm": 0.4927206337451935,
"learning_rate": 7.968033420621935e-05,
"loss": 0.3338,
"step": 32
},
{
"epoch": 0.20022753128555176,
"grad_norm": 0.617469310760498,
"learning_rate": 7.838127775159452e-05,
"loss": 0.3476,
"step": 33
},
{
"epoch": 0.2062950322335988,
"grad_norm": 0.5512767434120178,
"learning_rate": 7.705495977301078e-05,
"loss": 0.2595,
"step": 34
},
{
"epoch": 0.2123625331816458,
"grad_norm": 0.6990464925765991,
"learning_rate": 7.570292669790186e-05,
"loss": 0.288,
"step": 35
},
{
"epoch": 0.21843003412969283,
"grad_norm": 1.4301836490631104,
"learning_rate": 7.43267549363537e-05,
"loss": 0.4128,
"step": 36
},
{
"epoch": 0.22449753507773987,
"grad_norm": 1.702950119972229,
"learning_rate": 7.292804904308087e-05,
"loss": 0.4041,
"step": 37
},
{
"epoch": 0.23056503602578687,
"grad_norm": 1.1743119955062866,
"learning_rate": 7.150843984658754e-05,
"loss": 0.2015,
"step": 38
},
{
"epoch": 0.2366325369738339,
"grad_norm": 0.9276676774024963,
"learning_rate": 7.006958254769438e-05,
"loss": 0.3364,
"step": 39
},
{
"epoch": 0.2427000379218809,
"grad_norm": 0.53074711561203,
"learning_rate": 6.861315478964841e-05,
"loss": 0.2547,
"step": 40
},
{
"epoch": 0.24876753886992795,
"grad_norm": 0.4709409773349762,
"learning_rate": 6.714085470206609e-05,
"loss": 0.2583,
"step": 41
},
{
"epoch": 0.25483503981797495,
"grad_norm": 0.5634525418281555,
"learning_rate": 6.56543989209901e-05,
"loss": 0.3323,
"step": 42
},
{
"epoch": 0.260902540766022,
"grad_norm": 0.6031685471534729,
"learning_rate": 6.415552058736854e-05,
"loss": 0.2583,
"step": 43
},
{
"epoch": 0.266970041714069,
"grad_norm": 0.6985509395599365,
"learning_rate": 6.264596732629e-05,
"loss": 0.3017,
"step": 44
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.5389325022697449,
"learning_rate": 6.112749920933111e-05,
"loss": 0.2776,
"step": 45
},
{
"epoch": 0.2791050436101631,
"grad_norm": 0.4434327185153961,
"learning_rate": 5.960188670239154e-05,
"loss": 0.2907,
"step": 46
},
{
"epoch": 0.28517254455821006,
"grad_norm": 0.47085919976234436,
"learning_rate": 5.80709086014102e-05,
"loss": 0.2155,
"step": 47
},
{
"epoch": 0.2912400455062571,
"grad_norm": 0.9153717756271362,
"learning_rate": 5.653634995836856e-05,
"loss": 0.3143,
"step": 48
},
{
"epoch": 0.29730754645430413,
"grad_norm": 2.431823253631592,
"learning_rate": 5.500000000000001e-05,
"loss": 0.5086,
"step": 49
},
{
"epoch": 0.30337504740235116,
"grad_norm": 0.8440378904342651,
"learning_rate": 5.346365004163145e-05,
"loss": 0.3082,
"step": 50
},
{
"epoch": 0.30337504740235116,
"eval_loss": 0.22515492141246796,
"eval_runtime": 4.0292,
"eval_samples_per_second": 12.409,
"eval_steps_per_second": 3.226,
"step": 50
},
{
"epoch": 0.3094425483503982,
"grad_norm": 0.46408069133758545,
"learning_rate": 5.192909139858981e-05,
"loss": 0.2074,
"step": 51
},
{
"epoch": 0.3155100492984452,
"grad_norm": 0.3362777531147003,
"learning_rate": 5.0398113297608465e-05,
"loss": 0.2881,
"step": 52
},
{
"epoch": 0.3215775502464922,
"grad_norm": 0.33510658144950867,
"learning_rate": 4.887250079066892e-05,
"loss": 0.2266,
"step": 53
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.5662570595741272,
"learning_rate": 4.7354032673710005e-05,
"loss": 0.2653,
"step": 54
},
{
"epoch": 0.33371255214258627,
"grad_norm": 0.7332994937896729,
"learning_rate": 4.584447941263149e-05,
"loss": 0.3039,
"step": 55
},
{
"epoch": 0.3397800530906333,
"grad_norm": 0.45034223794937134,
"learning_rate": 4.43456010790099e-05,
"loss": 0.2767,
"step": 56
},
{
"epoch": 0.34584755403868034,
"grad_norm": 0.36071422696113586,
"learning_rate": 4.285914529793391e-05,
"loss": 0.2616,
"step": 57
},
{
"epoch": 0.3519150549867273,
"grad_norm": 0.38735389709472656,
"learning_rate": 4.13868452103516e-05,
"loss": 0.2729,
"step": 58
},
{
"epoch": 0.35798255593477435,
"grad_norm": 0.3561176657676697,
"learning_rate": 3.9930417452305626e-05,
"loss": 0.2106,
"step": 59
},
{
"epoch": 0.3640500568828214,
"grad_norm": 0.430941104888916,
"learning_rate": 3.8491560153412466e-05,
"loss": 0.285,
"step": 60
},
{
"epoch": 0.3701175578308684,
"grad_norm": 0.622871994972229,
"learning_rate": 3.707195095691913e-05,
"loss": 0.3367,
"step": 61
},
{
"epoch": 0.37618505877891545,
"grad_norm": 1.2697404623031616,
"learning_rate": 3.567324506364632e-05,
"loss": 0.3697,
"step": 62
},
{
"epoch": 0.3822525597269625,
"grad_norm": 0.40310418605804443,
"learning_rate": 3.4297073302098156e-05,
"loss": 0.1306,
"step": 63
},
{
"epoch": 0.38832006067500946,
"grad_norm": 0.2851163148880005,
"learning_rate": 3.2945040226989244e-05,
"loss": 0.2808,
"step": 64
},
{
"epoch": 0.3943875616230565,
"grad_norm": 0.2384623885154724,
"learning_rate": 3.16187222484055e-05,
"loss": 0.2313,
"step": 65
},
{
"epoch": 0.4004550625711035,
"grad_norm": 0.24203841388225555,
"learning_rate": 3.0319665793780648e-05,
"loss": 0.2467,
"step": 66
},
{
"epoch": 0.40652256351915056,
"grad_norm": 0.2760860323905945,
"learning_rate": 2.9049385504830985e-05,
"loss": 0.2661,
"step": 67
},
{
"epoch": 0.4125900644671976,
"grad_norm": 0.3492850959300995,
"learning_rate": 2.7809362471550748e-05,
"loss": 0.2071,
"step": 68
},
{
"epoch": 0.41865756541524457,
"grad_norm": 0.463875412940979,
"learning_rate": 2.660104250532764e-05,
"loss": 0.2345,
"step": 69
},
{
"epoch": 0.4247250663632916,
"grad_norm": 0.34055763483047485,
"learning_rate": 2.5425834453191232e-05,
"loss": 0.2777,
"step": 70
},
{
"epoch": 0.43079256731133864,
"grad_norm": 0.3323991894721985,
"learning_rate": 2.4285108555160577e-05,
"loss": 0.2575,
"step": 71
},
{
"epoch": 0.43686006825938567,
"grad_norm": 0.43374085426330566,
"learning_rate": 2.3180194846605367e-05,
"loss": 0.2336,
"step": 72
},
{
"epoch": 0.4429275692074327,
"grad_norm": 0.45842763781547546,
"learning_rate": 2.2112381607484417e-05,
"loss": 0.2808,
"step": 73
},
{
"epoch": 0.44899507015547974,
"grad_norm": 0.5284908413887024,
"learning_rate": 2.1082913860268765e-05,
"loss": 0.3155,
"step": 74
},
{
"epoch": 0.4550625711035267,
"grad_norm": 0.6865499019622803,
"learning_rate": 2.0092991918301108e-05,
"loss": 0.237,
"step": 75
},
{
"epoch": 0.4550625711035267,
"eval_loss": 0.21727269887924194,
"eval_runtime": 4.0624,
"eval_samples_per_second": 12.308,
"eval_steps_per_second": 3.2,
"step": 75
},
{
"epoch": 0.46113007205157375,
"grad_norm": 0.21528764069080353,
"learning_rate": 1.91437699862843e-05,
"loss": 0.1446,
"step": 76
},
{
"epoch": 0.4671975729996208,
"grad_norm": 0.2670177221298218,
"learning_rate": 1.8236354814530112e-05,
"loss": 0.2939,
"step": 77
},
{
"epoch": 0.4732650739476678,
"grad_norm": 0.2085307091474533,
"learning_rate": 1.7371804408538024e-05,
"loss": 0.1794,
"step": 78
},
{
"epoch": 0.47933257489571485,
"grad_norm": 0.2803487777709961,
"learning_rate": 1.6551126795408016e-05,
"loss": 0.2405,
"step": 79
},
{
"epoch": 0.4854000758437618,
"grad_norm": 0.3227778375148773,
"learning_rate": 1.577527884852619e-05,
"loss": 0.2841,
"step": 80
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.31022030115127563,
"learning_rate": 1.5045165171893116e-05,
"loss": 0.247,
"step": 81
},
{
"epoch": 0.4975350777398559,
"grad_norm": 0.2896697223186493,
"learning_rate": 1.4361637045396029e-05,
"loss": 0.2381,
"step": 82
},
{
"epoch": 0.5036025786879029,
"grad_norm": 0.4402921199798584,
"learning_rate": 1.3725491432254624e-05,
"loss": 0.2843,
"step": 83
},
{
"epoch": 0.5096700796359499,
"grad_norm": 0.3511703312397003,
"learning_rate": 1.313747004979751e-05,
"loss": 0.2593,
"step": 84
},
{
"epoch": 0.515737580583997,
"grad_norm": 0.3542587459087372,
"learning_rate": 1.2598258504653081e-05,
"loss": 0.1878,
"step": 85
},
{
"epoch": 0.521805081532044,
"grad_norm": 0.38901492953300476,
"learning_rate": 1.2108485493362765e-05,
"loss": 0.261,
"step": 86
},
{
"epoch": 0.5278725824800911,
"grad_norm": 0.5619277954101562,
"learning_rate": 1.1668722069349041e-05,
"loss": 0.2785,
"step": 87
},
{
"epoch": 0.533940083428138,
"grad_norm": 0.38067886233329773,
"learning_rate": 1.1279480977092635e-05,
"loss": 0.159,
"step": 88
},
{
"epoch": 0.540007584376185,
"grad_norm": 0.2707197666168213,
"learning_rate": 1.094121605429547e-05,
"loss": 0.2126,
"step": 89
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.22393597662448883,
"learning_rate": 1.0654321702726141e-05,
"loss": 0.2335,
"step": 90
},
{
"epoch": 0.5521425862722791,
"grad_norm": 0.24070705473423004,
"learning_rate": 1.0419132428365116e-05,
"loss": 0.213,
"step": 91
},
{
"epoch": 0.5582100872203262,
"grad_norm": 0.29574301838874817,
"learning_rate": 1.0235922451385733e-05,
"loss": 0.2586,
"step": 92
},
{
"epoch": 0.5642775881683731,
"grad_norm": 0.2932300865650177,
"learning_rate": 1.0104905386425733e-05,
"loss": 0.2572,
"step": 93
},
{
"epoch": 0.5703450891164201,
"grad_norm": 0.3075341284275055,
"learning_rate": 1.002623399352217e-05,
"loss": 0.2183,
"step": 94
},
{
"epoch": 0.5764125900644672,
"grad_norm": 0.3770557641983032,
"learning_rate": 1e-05,
"loss": 0.2778,
"step": 95
}
],
"logging_steps": 1,
"max_steps": 95,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.173151075448914e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}