{ "best_metric": 0.011846823617815971, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.05767567047966932, "eval_steps": 100, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005767567047966932, "grad_norm": 3.5087673664093018, "learning_rate": 2e-05, "loss": 0.9661, "step": 1 }, { "epoch": 0.0005767567047966932, "eval_loss": 0.256265789270401, "eval_runtime": 215.9962, "eval_samples_per_second": 8.028, "eval_steps_per_second": 2.009, "step": 1 }, { "epoch": 0.0011535134095933864, "grad_norm": 5.137994289398193, "learning_rate": 4e-05, "loss": 1.1907, "step": 2 }, { "epoch": 0.0017302701143900798, "grad_norm": 6.508111000061035, "learning_rate": 6e-05, "loss": 1.3703, "step": 3 }, { "epoch": 0.002307026819186773, "grad_norm": 2.984405040740967, "learning_rate": 8e-05, "loss": 0.7919, "step": 4 }, { "epoch": 0.0028837835239834664, "grad_norm": 6.96499490737915, "learning_rate": 0.0001, "loss": 1.5441, "step": 5 }, { "epoch": 0.0034605402287801595, "grad_norm": 5.478267192840576, "learning_rate": 0.00012, "loss": 0.7325, "step": 6 }, { "epoch": 0.004037296933576853, "grad_norm": 2.7913830280303955, "learning_rate": 0.00014, "loss": 0.4881, "step": 7 }, { "epoch": 0.004614053638373546, "grad_norm": 3.1158297061920166, "learning_rate": 0.00016, "loss": 0.2994, "step": 8 }, { "epoch": 0.005190810343170239, "grad_norm": 3.412966012954712, "learning_rate": 0.00018, "loss": 0.2548, "step": 9 }, { "epoch": 0.005767567047966933, "grad_norm": 3.8502895832061768, "learning_rate": 0.0002, "loss": 0.1836, "step": 10 }, { "epoch": 0.0063443237527636255, "grad_norm": 2.5958235263824463, "learning_rate": 0.00019999852288943748, "loss": 0.0852, "step": 11 }, { "epoch": 0.006921080457560319, "grad_norm": 4.420118808746338, "learning_rate": 0.00019999409160138693, "loss": 0.1172, "step": 12 }, { "epoch": 0.007497837162357013, "grad_norm": 2.869602918624878, "learning_rate": 0.00019998670626675846, "loss": 0.051, "step": 13 }, { "epoch": 0.008074593867153706, "grad_norm": 1.1618932485580444, "learning_rate": 0.0001999763671037312, "loss": 0.0148, "step": 14 }, { "epoch": 0.008651350571950399, "grad_norm": 4.91132116317749, "learning_rate": 0.00019996307441774684, "loss": 0.0444, "step": 15 }, { "epoch": 0.009228107276747092, "grad_norm": 1.5534745454788208, "learning_rate": 0.00019994682860150073, "loss": 0.0288, "step": 16 }, { "epoch": 0.009804863981543786, "grad_norm": 2.5079476833343506, "learning_rate": 0.0001999276301349302, "loss": 0.0389, "step": 17 }, { "epoch": 0.010381620686340479, "grad_norm": 2.015561580657959, "learning_rate": 0.00019990547958520044, "loss": 0.0538, "step": 18 }, { "epoch": 0.010958377391137171, "grad_norm": 0.5642620325088501, "learning_rate": 0.00019988037760668766, "loss": 0.0044, "step": 19 }, { "epoch": 0.011535134095933866, "grad_norm": 1.0824285745620728, "learning_rate": 0.0001998523249409598, "loss": 0.0099, "step": 20 }, { "epoch": 0.012111890800730558, "grad_norm": 1.1586933135986328, "learning_rate": 0.00019982132241675463, "loss": 0.0149, "step": 21 }, { "epoch": 0.012688647505527251, "grad_norm": 1.043748378753662, "learning_rate": 0.00019978737094995526, "loss": 0.0111, "step": 22 }, { "epoch": 0.013265404210323945, "grad_norm": 1.1925742626190186, "learning_rate": 0.0001997504715435631, "loss": 0.0212, "step": 23 }, { "epoch": 0.013842160915120638, "grad_norm": 4.5300445556640625, "learning_rate": 0.00019971062528766824, "loss": 0.0469, "step": 24 }, { "epoch": 0.01441891761991733, "grad_norm": 1.0790067911148071, "learning_rate": 0.00019966783335941717, "loss": 0.026, "step": 25 }, { "epoch": 0.014995674324714025, "grad_norm": 0.853151798248291, "learning_rate": 0.00019962209702297808, "loss": 0.0242, "step": 26 }, { "epoch": 0.015572431029510718, "grad_norm": 1.722130537033081, "learning_rate": 0.00019957341762950344, "loss": 0.0478, "step": 27 }, { "epoch": 0.016149187734307412, "grad_norm": 2.9154410362243652, "learning_rate": 0.00019952179661709028, "loss": 0.0796, "step": 28 }, { "epoch": 0.016725944439104103, "grad_norm": 1.8094756603240967, "learning_rate": 0.00019946723551073732, "loss": 0.0222, "step": 29 }, { "epoch": 0.017302701143900798, "grad_norm": 2.117493152618408, "learning_rate": 0.0001994097359223004, "loss": 0.0215, "step": 30 }, { "epoch": 0.017879457848697492, "grad_norm": 0.26287028193473816, "learning_rate": 0.0001993492995504444, "loss": 0.0066, "step": 31 }, { "epoch": 0.018456214553494183, "grad_norm": 30.89515495300293, "learning_rate": 0.0001992859281805935, "loss": 0.0778, "step": 32 }, { "epoch": 0.019032971258290877, "grad_norm": 4.260768413543701, "learning_rate": 0.00019921962368487807, "loss": 0.0816, "step": 33 }, { "epoch": 0.019609727963087572, "grad_norm": 1.1326998472213745, "learning_rate": 0.00019915038802207944, "loss": 0.0209, "step": 34 }, { "epoch": 0.020186484667884263, "grad_norm": 4.575008869171143, "learning_rate": 0.00019907822323757225, "loss": 0.1154, "step": 35 }, { "epoch": 0.020763241372680957, "grad_norm": 5.928327560424805, "learning_rate": 0.00019900313146326384, "loss": 0.2387, "step": 36 }, { "epoch": 0.02133999807747765, "grad_norm": 21.27695655822754, "learning_rate": 0.00019892511491753124, "loss": 0.202, "step": 37 }, { "epoch": 0.021916754782274343, "grad_norm": 3.236907482147217, "learning_rate": 0.00019884417590515573, "loss": 0.1093, "step": 38 }, { "epoch": 0.022493511487071037, "grad_norm": 2.1399052143096924, "learning_rate": 0.00019876031681725467, "loss": 0.0944, "step": 39 }, { "epoch": 0.02307026819186773, "grad_norm": 3.7547476291656494, "learning_rate": 0.000198673540131211, "loss": 0.1158, "step": 40 }, { "epoch": 0.023647024896664422, "grad_norm": 2.028630256652832, "learning_rate": 0.0001985838484105999, "loss": 0.062, "step": 41 }, { "epoch": 0.024223781601461117, "grad_norm": 1.1861765384674072, "learning_rate": 0.0001984912443051131, "loss": 0.0637, "step": 42 }, { "epoch": 0.02480053830625781, "grad_norm": 4.997815132141113, "learning_rate": 0.0001983957305504807, "loss": 0.0905, "step": 43 }, { "epoch": 0.025377295011054502, "grad_norm": 3.761312484741211, "learning_rate": 0.0001982973099683902, "loss": 0.0537, "step": 44 }, { "epoch": 0.025954051715851197, "grad_norm": 8.485271453857422, "learning_rate": 0.0001981959854664032, "loss": 0.2039, "step": 45 }, { "epoch": 0.02653080842064789, "grad_norm": 4.167719841003418, "learning_rate": 0.00019809176003786954, "loss": 0.1507, "step": 46 }, { "epoch": 0.027107565125444582, "grad_norm": 13.948970794677734, "learning_rate": 0.00019798463676183888, "loss": 0.0811, "step": 47 }, { "epoch": 0.027684321830241276, "grad_norm": 16.908702850341797, "learning_rate": 0.00019787461880296965, "loss": 0.2963, "step": 48 }, { "epoch": 0.02826107853503797, "grad_norm": 2.1487314701080322, "learning_rate": 0.00019776170941143565, "loss": 0.0582, "step": 49 }, { "epoch": 0.02883783523983466, "grad_norm": 3.16274356842041, "learning_rate": 0.00019764591192282993, "loss": 0.0622, "step": 50 }, { "epoch": 0.029414591944631356, "grad_norm": 2.1081337928771973, "learning_rate": 0.0001975272297580664, "loss": 0.1675, "step": 51 }, { "epoch": 0.02999134864942805, "grad_norm": 1.3357646465301514, "learning_rate": 0.00019740566642327867, "loss": 0.0394, "step": 52 }, { "epoch": 0.03056810535422474, "grad_norm": 2.5485501289367676, "learning_rate": 0.00019728122550971642, "loss": 0.1227, "step": 53 }, { "epoch": 0.031144862059021436, "grad_norm": 7.7235870361328125, "learning_rate": 0.0001971539106936394, "loss": 0.1759, "step": 54 }, { "epoch": 0.03172161876381813, "grad_norm": 5.811506748199463, "learning_rate": 0.00019702372573620881, "loss": 0.1152, "step": 55 }, { "epoch": 0.032298375468614825, "grad_norm": 1.4860045909881592, "learning_rate": 0.00019689067448337618, "loss": 0.0469, "step": 56 }, { "epoch": 0.032875132173411516, "grad_norm": 1.4226983785629272, "learning_rate": 0.00019675476086576972, "loss": 0.0395, "step": 57 }, { "epoch": 0.03345188887820821, "grad_norm": 0.34651094675064087, "learning_rate": 0.0001966159888985782, "loss": 0.0185, "step": 58 }, { "epoch": 0.034028645583004904, "grad_norm": 0.8589548468589783, "learning_rate": 0.00019647436268143247, "loss": 0.0172, "step": 59 }, { "epoch": 0.034605402287801595, "grad_norm": 1.4422504901885986, "learning_rate": 0.00019632988639828406, "loss": 0.0369, "step": 60 }, { "epoch": 0.035182158992598286, "grad_norm": 2.7948107719421387, "learning_rate": 0.00019618256431728194, "loss": 0.0839, "step": 61 }, { "epoch": 0.035758915697394984, "grad_norm": 0.973739743232727, "learning_rate": 0.00019603240079064604, "loss": 0.0967, "step": 62 }, { "epoch": 0.036335672402191675, "grad_norm": 1.0291193723678589, "learning_rate": 0.00019587940025453908, "loss": 0.0121, "step": 63 }, { "epoch": 0.036912429106988366, "grad_norm": 3.7419233322143555, "learning_rate": 0.00019572356722893518, "loss": 0.0257, "step": 64 }, { "epoch": 0.037489185811785064, "grad_norm": 0.9348726868629456, "learning_rate": 0.00019556490631748651, "loss": 0.0038, "step": 65 }, { "epoch": 0.038065942516581755, "grad_norm": 2.067941904067993, "learning_rate": 0.00019540342220738724, "loss": 0.011, "step": 66 }, { "epoch": 0.038642699221378446, "grad_norm": 5.878370761871338, "learning_rate": 0.00019523911966923507, "loss": 0.0176, "step": 67 }, { "epoch": 0.039219455926175144, "grad_norm": 2.2632899284362793, "learning_rate": 0.00019507200355689026, "loss": 0.0224, "step": 68 }, { "epoch": 0.039796212630971835, "grad_norm": 3.178539752960205, "learning_rate": 0.00019490207880733232, "loss": 0.0314, "step": 69 }, { "epoch": 0.040372969335768526, "grad_norm": 1.259251594543457, "learning_rate": 0.0001947293504405141, "loss": 0.0035, "step": 70 }, { "epoch": 0.040949726040565224, "grad_norm": 0.4863012433052063, "learning_rate": 0.00019455382355921353, "loss": 0.0102, "step": 71 }, { "epoch": 0.041526482745361915, "grad_norm": 52.19420623779297, "learning_rate": 0.00019437550334888278, "loss": 0.1446, "step": 72 }, { "epoch": 0.042103239450158605, "grad_norm": 1.488041639328003, "learning_rate": 0.00019419439507749515, "loss": 0.0246, "step": 73 }, { "epoch": 0.0426799961549553, "grad_norm": 0.3983250856399536, "learning_rate": 0.0001940105040953895, "loss": 0.0038, "step": 74 }, { "epoch": 0.043256752859751994, "grad_norm": 1.9497828483581543, "learning_rate": 0.00019382383583511206, "loss": 0.0124, "step": 75 }, { "epoch": 0.043833509564548685, "grad_norm": 5.796697616577148, "learning_rate": 0.00019363439581125601, "loss": 0.0264, "step": 76 }, { "epoch": 0.04441026626934538, "grad_norm": 4.687071323394775, "learning_rate": 0.00019344218962029857, "loss": 0.017, "step": 77 }, { "epoch": 0.044987022974142074, "grad_norm": 3.4617161750793457, "learning_rate": 0.00019324722294043558, "loss": 0.0973, "step": 78 }, { "epoch": 0.045563779678938765, "grad_norm": 1.2890996932983398, "learning_rate": 0.00019304950153141393, "loss": 0.0152, "step": 79 }, { "epoch": 0.04614053638373546, "grad_norm": 2.0128302574157715, "learning_rate": 0.00019284903123436127, "loss": 0.035, "step": 80 }, { "epoch": 0.046717293088532154, "grad_norm": 0.14461462199687958, "learning_rate": 0.00019264581797161343, "loss": 0.004, "step": 81 }, { "epoch": 0.047294049793328845, "grad_norm": 0.43382060527801514, "learning_rate": 0.00019243986774653956, "loss": 0.0044, "step": 82 }, { "epoch": 0.04787080649812554, "grad_norm": 0.5092380046844482, "learning_rate": 0.00019223118664336467, "loss": 0.0032, "step": 83 }, { "epoch": 0.048447563202922234, "grad_norm": 0.5211856365203857, "learning_rate": 0.0001920197808269901, "loss": 0.0059, "step": 84 }, { "epoch": 0.049024319907718925, "grad_norm": 1.6433213949203491, "learning_rate": 0.00019180565654281103, "loss": 0.1625, "step": 85 }, { "epoch": 0.04960107661251562, "grad_norm": 2.427119016647339, "learning_rate": 0.00019158882011653235, "loss": 0.0246, "step": 86 }, { "epoch": 0.05017783331731231, "grad_norm": 0.8261755108833313, "learning_rate": 0.00019136927795398157, "loss": 0.0113, "step": 87 }, { "epoch": 0.050754590022109004, "grad_norm": 0.15122078359127045, "learning_rate": 0.00019114703654091961, "loss": 0.005, "step": 88 }, { "epoch": 0.0513313467269057, "grad_norm": 0.5468443632125854, "learning_rate": 0.00019092210244284926, "loss": 0.017, "step": 89 }, { "epoch": 0.05190810343170239, "grad_norm": 0.640923798084259, "learning_rate": 0.00019069448230482118, "loss": 0.0323, "step": 90 }, { "epoch": 0.052484860136499084, "grad_norm": 0.2306308001279831, "learning_rate": 0.00019046418285123754, "loss": 0.0039, "step": 91 }, { "epoch": 0.05306161684129578, "grad_norm": 0.3201711177825928, "learning_rate": 0.00019023121088565352, "loss": 0.0037, "step": 92 }, { "epoch": 0.05363837354609247, "grad_norm": 1.7555040121078491, "learning_rate": 0.00018999557329057605, "loss": 0.0232, "step": 93 }, { "epoch": 0.054215130250889164, "grad_norm": 0.04883858934044838, "learning_rate": 0.00018975727702726076, "loss": 0.0016, "step": 94 }, { "epoch": 0.05479188695568586, "grad_norm": 0.11811670660972595, "learning_rate": 0.00018951632913550626, "loss": 0.0031, "step": 95 }, { "epoch": 0.05536864366048255, "grad_norm": 0.3842681050300598, "learning_rate": 0.000189272736733446, "loss": 0.0032, "step": 96 }, { "epoch": 0.055945400365279244, "grad_norm": 0.34558194875717163, "learning_rate": 0.0001890265070173382, "loss": 0.0041, "step": 97 }, { "epoch": 0.05652215707007594, "grad_norm": 0.23080027103424072, "learning_rate": 0.0001887776472613532, "loss": 0.0052, "step": 98 }, { "epoch": 0.05709891377487263, "grad_norm": 0.8554419875144958, "learning_rate": 0.00018852616481735841, "loss": 0.0142, "step": 99 }, { "epoch": 0.05767567047966932, "grad_norm": 0.6974347233772278, "learning_rate": 0.00018827206711470137, "loss": 0.0136, "step": 100 }, { "epoch": 0.05767567047966932, "eval_loss": 0.011846823617815971, "eval_runtime": 218.0508, "eval_samples_per_second": 7.952, "eval_steps_per_second": 1.99, "step": 100 } ], "logging_steps": 1, "max_steps": 588, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0888783655901594e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }