{ "best_metric": 1.1961991786956787, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.07989134776703682, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015978269553407366, "eval_loss": 1.6500585079193115, "eval_runtime": 49.2063, "eval_samples_per_second": 53.55, "eval_steps_per_second": 13.393, "step": 1 }, { "epoch": 0.0015978269553407365, "grad_norm": 0.31304410099983215, "learning_rate": 4.24e-05, "loss": 1.6422, "step": 10 }, { "epoch": 0.003195653910681473, "grad_norm": 0.5056787729263306, "learning_rate": 8.48e-05, "loss": 1.6523, "step": 20 }, { "epoch": 0.00479348086602221, "grad_norm": 0.5264449715614319, "learning_rate": 0.0001272, "loss": 1.4867, "step": 30 }, { "epoch": 0.006391307821362946, "grad_norm": 0.7551277875900269, "learning_rate": 0.0001696, "loss": 1.4439, "step": 40 }, { "epoch": 0.007989134776703682, "grad_norm": 1.3374431133270264, "learning_rate": 0.000212, "loss": 1.2566, "step": 50 }, { "epoch": 0.007989134776703682, "eval_loss": 1.3853693008422852, "eval_runtime": 49.2099, "eval_samples_per_second": 53.546, "eval_steps_per_second": 13.392, "step": 50 }, { "epoch": 0.00958696173204442, "grad_norm": 0.36312878131866455, "learning_rate": 0.00021174178932754136, "loss": 1.538, "step": 60 }, { "epoch": 0.011184788687385157, "grad_norm": 0.49091804027557373, "learning_rate": 0.00021096841528660647, "loss": 1.4567, "step": 70 }, { "epoch": 0.012782615642725892, "grad_norm": 0.5926417112350464, "learning_rate": 0.0002096836456777834, "loss": 1.2948, "step": 80 }, { "epoch": 0.01438044259806663, "grad_norm": 0.6516161561012268, "learning_rate": 0.00020789373976946182, "loss": 1.1847, "step": 90 }, { "epoch": 0.015978269553407365, "grad_norm": 1.2274402379989624, "learning_rate": 0.0002056074178033063, "loss": 1.0668, "step": 100 }, { "epoch": 0.015978269553407365, "eval_loss": 1.308932900428772, "eval_runtime": 49.1943, "eval_samples_per_second": 53.563, "eval_steps_per_second": 13.396, "step": 100 }, { "epoch": 0.017576096508748102, "grad_norm": 0.3842248022556305, "learning_rate": 0.00020283581851011567, "loss": 1.5373, "step": 110 }, { "epoch": 0.01917392346408884, "grad_norm": 0.4492928981781006, "learning_rate": 0.00019959244484304625, "loss": 1.4419, "step": 120 }, { "epoch": 0.020771750419429576, "grad_norm": 0.5767350196838379, "learning_rate": 0.00019589309819258114, "loss": 1.3442, "step": 130 }, { "epoch": 0.022369577374770314, "grad_norm": 0.6538853049278259, "learning_rate": 0.00019175580140374444, "loss": 1.0945, "step": 140 }, { "epoch": 0.023967404330111047, "grad_norm": 1.4180448055267334, "learning_rate": 0.00018720071097061167, "loss": 1.0496, "step": 150 }, { "epoch": 0.023967404330111047, "eval_loss": 1.282002568244934, "eval_runtime": 49.137, "eval_samples_per_second": 53.626, "eval_steps_per_second": 13.411, "step": 150 }, { "epoch": 0.025565231285451784, "grad_norm": 0.38757893443107605, "learning_rate": 0.00018225001883589702, "loss": 1.4831, "step": 160 }, { "epoch": 0.02716305824079252, "grad_norm": 0.4853920340538025, "learning_rate": 0.00017692784427403898, "loss": 1.4429, "step": 170 }, { "epoch": 0.02876088519613326, "grad_norm": 0.5605722069740295, "learning_rate": 0.00017126011638451976, "loss": 1.2459, "step": 180 }, { "epoch": 0.030358712151473996, "grad_norm": 0.6170331239700317, "learning_rate": 0.00016527444776789915, "loss": 1.009, "step": 190 }, { "epoch": 0.03195653910681473, "grad_norm": 1.6762750148773193, "learning_rate": 0.00015900000000000002, "loss": 1.0656, "step": 200 }, { "epoch": 0.03195653910681473, "eval_loss": 1.2561191320419312, "eval_runtime": 49.3809, "eval_samples_per_second": 53.361, "eval_steps_per_second": 13.345, "step": 200 }, { "epoch": 0.03355436606215547, "grad_norm": 0.37146052718162537, "learning_rate": 0.0001524673415596422, "loss": 1.4539, "step": 210 }, { "epoch": 0.035152193017496204, "grad_norm": 0.47493624687194824, "learning_rate": 0.00014570829890208668, "loss": 1.3451, "step": 220 }, { "epoch": 0.036750019972836945, "grad_norm": 0.5821739435195923, "learning_rate": 0.00013875580140374443, "loss": 1.3524, "step": 230 }, { "epoch": 0.03834784692817768, "grad_norm": 0.6922153234481812, "learning_rate": 0.00013164372093356477, "loss": 1.1759, "step": 240 }, { "epoch": 0.03994567388351841, "grad_norm": 1.3326733112335205, "learning_rate": 0.00012440670683269464, "loss": 1.0188, "step": 250 }, { "epoch": 0.03994567388351841, "eval_loss": 1.2323858737945557, "eval_runtime": 49.4073, "eval_samples_per_second": 53.332, "eval_steps_per_second": 13.338, "step": 250 }, { "epoch": 0.04154350083885915, "grad_norm": 0.3916065990924835, "learning_rate": 0.00011708001710637128, "loss": 1.4353, "step": 260 }, { "epoch": 0.04314132779419989, "grad_norm": 0.47572290897369385, "learning_rate": 0.00010969934665046512, "loss": 1.3617, "step": 270 }, { "epoch": 0.04473915474954063, "grad_norm": 0.5964052081108093, "learning_rate": 0.00010230065334953492, "loss": 1.2104, "step": 280 }, { "epoch": 0.04633698170488136, "grad_norm": 0.7098691463470459, "learning_rate": 9.491998289362875e-05, "loss": 1.0909, "step": 290 }, { "epoch": 0.047934808660222095, "grad_norm": 1.1634002923965454, "learning_rate": 8.759329316730539e-05, "loss": 0.9742, "step": 300 }, { "epoch": 0.047934808660222095, "eval_loss": 1.2163466215133667, "eval_runtime": 49.1098, "eval_samples_per_second": 53.655, "eval_steps_per_second": 13.419, "step": 300 }, { "epoch": 0.049532635615562835, "grad_norm": 0.38519808650016785, "learning_rate": 8.035627906643523e-05, "loss": 1.4363, "step": 310 }, { "epoch": 0.05113046257090357, "grad_norm": 0.4723591208457947, "learning_rate": 7.324419859625559e-05, "loss": 1.3084, "step": 320 }, { "epoch": 0.05272828952624431, "grad_norm": 0.6243221163749695, "learning_rate": 6.629170109791332e-05, "loss": 1.2501, "step": 330 }, { "epoch": 0.05432611648158504, "grad_norm": 0.6776363849639893, "learning_rate": 5.9532658440357784e-05, "loss": 1.0853, "step": 340 }, { "epoch": 0.055923943436925784, "grad_norm": 1.2277010679244995, "learning_rate": 5.300000000000002e-05, "loss": 1.0173, "step": 350 }, { "epoch": 0.055923943436925784, "eval_loss": 1.2060250043869019, "eval_runtime": 49.0653, "eval_samples_per_second": 53.704, "eval_steps_per_second": 13.431, "step": 350 }, { "epoch": 0.05752177039226652, "grad_norm": 0.3953673839569092, "learning_rate": 4.672555223210085e-05, "loss": 1.4167, "step": 360 }, { "epoch": 0.05911959734760725, "grad_norm": 0.5251022577285767, "learning_rate": 4.073988361548022e-05, "loss": 1.3159, "step": 370 }, { "epoch": 0.06071742430294799, "grad_norm": 0.597756564617157, "learning_rate": 3.507215572596106e-05, "loss": 1.2607, "step": 380 }, { "epoch": 0.062315251258288726, "grad_norm": 0.7255125641822815, "learning_rate": 2.9749981164102997e-05, "loss": 1.177, "step": 390 }, { "epoch": 0.06391307821362946, "grad_norm": 1.2851899862289429, "learning_rate": 2.479928902938834e-05, "loss": 1.0985, "step": 400 }, { "epoch": 0.06391307821362946, "eval_loss": 1.1991453170776367, "eval_runtime": 49.1876, "eval_samples_per_second": 53.57, "eval_steps_per_second": 13.398, "step": 400 }, { "epoch": 0.06551090516897021, "grad_norm": 0.4205108880996704, "learning_rate": 2.024419859625558e-05, "loss": 1.3338, "step": 410 }, { "epoch": 0.06710873212431094, "grad_norm": 0.5228121280670166, "learning_rate": 1.610690180741885e-05, "loss": 1.3194, "step": 420 }, { "epoch": 0.06870655907965167, "grad_norm": 0.5643351674079895, "learning_rate": 1.240755515695374e-05, "loss": 1.1673, "step": 430 }, { "epoch": 0.07030438603499241, "grad_norm": 0.6846547722816467, "learning_rate": 9.164181489884296e-06, "loss": 1.1176, "step": 440 }, { "epoch": 0.07190221299033314, "grad_norm": 1.4567266702651978, "learning_rate": 6.392582196693718e-06, "loss": 0.9772, "step": 450 }, { "epoch": 0.07190221299033314, "eval_loss": 1.1967874765396118, "eval_runtime": 49.249, "eval_samples_per_second": 53.504, "eval_steps_per_second": 13.381, "step": 450 }, { "epoch": 0.07350003994567389, "grad_norm": 0.4065600037574768, "learning_rate": 4.106260230538197e-06, "loss": 1.4111, "step": 460 }, { "epoch": 0.07509786690101462, "grad_norm": 0.4918181002140045, "learning_rate": 2.316354322216597e-06, "loss": 1.3486, "step": 470 }, { "epoch": 0.07669569385635536, "grad_norm": 0.6364920735359192, "learning_rate": 1.0315847133935416e-06, "loss": 1.1948, "step": 480 }, { "epoch": 0.07829352081169609, "grad_norm": 0.657301127910614, "learning_rate": 2.582106724586351e-07, "loss": 1.042, "step": 490 }, { "epoch": 0.07989134776703682, "grad_norm": 1.1757533550262451, "learning_rate": 0.0, "loss": 0.9801, "step": 500 }, { "epoch": 0.07989134776703682, "eval_loss": 1.1961991786956787, "eval_runtime": 49.2155, "eval_samples_per_second": 53.54, "eval_steps_per_second": 13.39, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3947946861068288e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }