{ "best_metric": 2.168654680252075, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.7992007992007992, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003996003996003996, "grad_norm": 0.5972241759300232, "learning_rate": 2.333333333333333e-06, "loss": 2.2493, "step": 1 }, { "epoch": 0.003996003996003996, "eval_loss": 4.059081077575684, "eval_runtime": 12.5696, "eval_samples_per_second": 33.573, "eval_steps_per_second": 8.433, "step": 1 }, { "epoch": 0.007992007992007992, "grad_norm": 0.6049998998641968, "learning_rate": 4.666666666666666e-06, "loss": 2.3726, "step": 2 }, { "epoch": 0.011988011988011988, "grad_norm": 0.6612269282341003, "learning_rate": 7e-06, "loss": 2.3547, "step": 3 }, { "epoch": 0.015984015984015984, "grad_norm": 0.7072306275367737, "learning_rate": 9.333333333333333e-06, "loss": 2.5046, "step": 4 }, { "epoch": 0.01998001998001998, "grad_norm": 0.7108502388000488, "learning_rate": 1.1666666666666665e-05, "loss": 2.7725, "step": 5 }, { "epoch": 0.023976023976023976, "grad_norm": 0.7299054861068726, "learning_rate": 1.4e-05, "loss": 2.7561, "step": 6 }, { "epoch": 0.027972027972027972, "grad_norm": 0.7699148058891296, "learning_rate": 1.633333333333333e-05, "loss": 3.1112, "step": 7 }, { "epoch": 0.03196803196803197, "grad_norm": 0.8154407143592834, "learning_rate": 1.8666666666666665e-05, "loss": 3.123, "step": 8 }, { "epoch": 0.03596403596403597, "grad_norm": 0.8263328075408936, "learning_rate": 2.1e-05, "loss": 3.184, "step": 9 }, { "epoch": 0.03996003996003996, "grad_norm": 0.9174614548683167, "learning_rate": 2.333333333333333e-05, "loss": 3.5817, "step": 10 }, { "epoch": 0.04395604395604396, "grad_norm": 0.9907087087631226, "learning_rate": 2.5666666666666663e-05, "loss": 3.5593, "step": 11 }, { "epoch": 0.04795204795204795, "grad_norm": 0.9425901174545288, "learning_rate": 2.8e-05, "loss": 3.3512, "step": 12 }, { "epoch": 0.05194805194805195, "grad_norm": 1.01284921169281, "learning_rate": 3.0333333333333333e-05, "loss": 3.2925, "step": 13 }, { "epoch": 0.055944055944055944, "grad_norm": 1.0988140106201172, "learning_rate": 3.266666666666666e-05, "loss": 3.3614, "step": 14 }, { "epoch": 0.059940059940059943, "grad_norm": 1.2063215970993042, "learning_rate": 3.5e-05, "loss": 3.8333, "step": 15 }, { "epoch": 0.06393606393606394, "grad_norm": 1.2403212785720825, "learning_rate": 3.733333333333333e-05, "loss": 3.4647, "step": 16 }, { "epoch": 0.06793206793206794, "grad_norm": 1.2335699796676636, "learning_rate": 3.9666666666666664e-05, "loss": 3.4031, "step": 17 }, { "epoch": 0.07192807192807193, "grad_norm": 1.3119879961013794, "learning_rate": 4.2e-05, "loss": 3.5779, "step": 18 }, { "epoch": 0.07592407592407592, "grad_norm": 1.3801676034927368, "learning_rate": 4.4333333333333324e-05, "loss": 3.6713, "step": 19 }, { "epoch": 0.07992007992007992, "grad_norm": 1.4695338010787964, "learning_rate": 4.666666666666666e-05, "loss": 3.7263, "step": 20 }, { "epoch": 0.08391608391608392, "grad_norm": 1.3599457740783691, "learning_rate": 4.899999999999999e-05, "loss": 3.2312, "step": 21 }, { "epoch": 0.08791208791208792, "grad_norm": 1.5197994709014893, "learning_rate": 5.1333333333333325e-05, "loss": 3.5302, "step": 22 }, { "epoch": 0.0919080919080919, "grad_norm": 1.4586033821105957, "learning_rate": 5.3666666666666666e-05, "loss": 3.4314, "step": 23 }, { "epoch": 0.0959040959040959, "grad_norm": 1.602725863456726, "learning_rate": 5.6e-05, "loss": 3.4305, "step": 24 }, { "epoch": 0.0999000999000999, "grad_norm": 1.5094614028930664, "learning_rate": 5.833333333333333e-05, "loss": 3.1046, "step": 25 }, { "epoch": 0.1038961038961039, "grad_norm": 1.5275474786758423, "learning_rate": 6.0666666666666666e-05, "loss": 3.0118, "step": 26 }, { "epoch": 0.10789210789210789, "grad_norm": 1.580427885055542, "learning_rate": 6.3e-05, "loss": 3.1404, "step": 27 }, { "epoch": 0.11188811188811189, "grad_norm": 1.5823817253112793, "learning_rate": 6.533333333333333e-05, "loss": 2.8777, "step": 28 }, { "epoch": 0.11588411588411589, "grad_norm": 1.5733855962753296, "learning_rate": 6.766666666666667e-05, "loss": 3.5837, "step": 29 }, { "epoch": 0.11988011988011989, "grad_norm": 1.3774138689041138, "learning_rate": 7e-05, "loss": 2.9621, "step": 30 }, { "epoch": 0.12387612387612387, "grad_norm": 1.3485286235809326, "learning_rate": 6.999402376603183e-05, "loss": 2.4754, "step": 31 }, { "epoch": 0.12787212787212787, "grad_norm": 1.6682289838790894, "learning_rate": 6.99760971050058e-05, "loss": 3.1928, "step": 32 }, { "epoch": 0.13186813186813187, "grad_norm": 1.6313886642456055, "learning_rate": 6.994622613886018e-05, "loss": 2.7337, "step": 33 }, { "epoch": 0.13586413586413587, "grad_norm": 1.89947509765625, "learning_rate": 6.990442106850258e-05, "loss": 3.4725, "step": 34 }, { "epoch": 0.13986013986013987, "grad_norm": 1.7983272075653076, "learning_rate": 6.98506961703262e-05, "loss": 3.2372, "step": 35 }, { "epoch": 0.14385614385614387, "grad_norm": 1.5611772537231445, "learning_rate": 6.978506979133457e-05, "loss": 2.2857, "step": 36 }, { "epoch": 0.14785214785214784, "grad_norm": 1.8788586854934692, "learning_rate": 6.9707564342876e-05, "loss": 3.0923, "step": 37 }, { "epoch": 0.15184815184815184, "grad_norm": 1.6326425075531006, "learning_rate": 6.96182062929901e-05, "loss": 2.4778, "step": 38 }, { "epoch": 0.15584415584415584, "grad_norm": 1.6984632015228271, "learning_rate": 6.951702615736908e-05, "loss": 2.7221, "step": 39 }, { "epoch": 0.15984015984015984, "grad_norm": 1.6536245346069336, "learning_rate": 6.940405848893656e-05, "loss": 2.6242, "step": 40 }, { "epoch": 0.16383616383616384, "grad_norm": 1.6980955600738525, "learning_rate": 6.92793418660478e-05, "loss": 2.6704, "step": 41 }, { "epoch": 0.16783216783216784, "grad_norm": 1.6571403741836548, "learning_rate": 6.914291887931528e-05, "loss": 2.4392, "step": 42 }, { "epoch": 0.17182817182817184, "grad_norm": 1.718841791152954, "learning_rate": 6.899483611706398e-05, "loss": 2.9214, "step": 43 }, { "epoch": 0.17582417582417584, "grad_norm": 1.6418437957763672, "learning_rate": 6.883514414942155e-05, "loss": 2.4861, "step": 44 }, { "epoch": 0.1798201798201798, "grad_norm": 1.6787574291229248, "learning_rate": 6.866389751104867e-05, "loss": 2.561, "step": 45 }, { "epoch": 0.1838161838161838, "grad_norm": 1.6907758712768555, "learning_rate": 6.848115468251542e-05, "loss": 2.7484, "step": 46 }, { "epoch": 0.1878121878121878, "grad_norm": 1.613139271736145, "learning_rate": 6.828697807033038e-05, "loss": 2.2981, "step": 47 }, { "epoch": 0.1918081918081918, "grad_norm": 1.923545479774475, "learning_rate": 6.808143398562868e-05, "loss": 2.5458, "step": 48 }, { "epoch": 0.1958041958041958, "grad_norm": 2.405118703842163, "learning_rate": 6.786459262152698e-05, "loss": 2.6176, "step": 49 }, { "epoch": 0.1998001998001998, "grad_norm": 2.002331495285034, "learning_rate": 6.763652802915244e-05, "loss": 2.5153, "step": 50 }, { "epoch": 0.1998001998001998, "eval_loss": 2.4798147678375244, "eval_runtime": 12.3169, "eval_samples_per_second": 34.262, "eval_steps_per_second": 8.606, "step": 50 }, { "epoch": 0.2037962037962038, "grad_norm": 2.9109737873077393, "learning_rate": 6.739731809235446e-05, "loss": 2.0977, "step": 51 }, { "epoch": 0.2077922077922078, "grad_norm": 2.381870985031128, "learning_rate": 6.71470445011073e-05, "loss": 2.2809, "step": 52 }, { "epoch": 0.21178821178821178, "grad_norm": 1.9585492610931396, "learning_rate": 6.688579272361309e-05, "loss": 2.0122, "step": 53 }, { "epoch": 0.21578421578421578, "grad_norm": 1.659829020500183, "learning_rate": 6.66136519771145e-05, "loss": 2.1264, "step": 54 }, { "epoch": 0.21978021978021978, "grad_norm": 1.2644081115722656, "learning_rate": 6.633071519742718e-05, "loss": 2.0217, "step": 55 }, { "epoch": 0.22377622377622378, "grad_norm": 1.1033685207366943, "learning_rate": 6.603707900720217e-05, "loss": 2.0627, "step": 56 }, { "epoch": 0.22777222777222778, "grad_norm": 1.0150108337402344, "learning_rate": 6.573284368292943e-05, "loss": 2.1786, "step": 57 }, { "epoch": 0.23176823176823177, "grad_norm": 0.9603342413902283, "learning_rate": 6.541811312069348e-05, "loss": 2.2729, "step": 58 }, { "epoch": 0.23576423576423577, "grad_norm": 1.048278570175171, "learning_rate": 6.509299480069303e-05, "loss": 2.3577, "step": 59 }, { "epoch": 0.23976023976023977, "grad_norm": 0.9779584407806396, "learning_rate": 6.47575997505365e-05, "loss": 2.6499, "step": 60 }, { "epoch": 0.24375624375624375, "grad_norm": 0.9486345052719116, "learning_rate": 6.441204250732624e-05, "loss": 2.5664, "step": 61 }, { "epoch": 0.24775224775224775, "grad_norm": 0.9750168323516846, "learning_rate": 6.405644107854427e-05, "loss": 2.4282, "step": 62 }, { "epoch": 0.2517482517482518, "grad_norm": 1.1880064010620117, "learning_rate": 6.369091690175273e-05, "loss": 2.8261, "step": 63 }, { "epoch": 0.25574425574425574, "grad_norm": 1.0042461156845093, "learning_rate": 6.331559480312315e-05, "loss": 2.394, "step": 64 }, { "epoch": 0.2597402597402597, "grad_norm": 0.944476306438446, "learning_rate": 6.293060295480838e-05, "loss": 2.4344, "step": 65 }, { "epoch": 0.26373626373626374, "grad_norm": 1.3060429096221924, "learning_rate": 6.25360728311719e-05, "loss": 2.4905, "step": 66 }, { "epoch": 0.2677322677322677, "grad_norm": 1.1383546590805054, "learning_rate": 6.213213916388954e-05, "loss": 2.4844, "step": 67 }, { "epoch": 0.27172827172827174, "grad_norm": 1.3176931142807007, "learning_rate": 6.171893989593859e-05, "loss": 2.39, "step": 68 }, { "epoch": 0.2757242757242757, "grad_norm": 1.3587539196014404, "learning_rate": 6.129661613449057e-05, "loss": 2.8022, "step": 69 }, { "epoch": 0.27972027972027974, "grad_norm": 1.3367782831192017, "learning_rate": 6.086531210272306e-05, "loss": 2.3988, "step": 70 }, { "epoch": 0.2837162837162837, "grad_norm": 1.3037152290344238, "learning_rate": 6.042517509056784e-05, "loss": 2.3973, "step": 71 }, { "epoch": 0.28771228771228774, "grad_norm": 1.149904489517212, "learning_rate": 5.997635540441133e-05, "loss": 2.1774, "step": 72 }, { "epoch": 0.2917082917082917, "grad_norm": 1.1675207614898682, "learning_rate": 5.9519006315765176e-05, "loss": 2.5556, "step": 73 }, { "epoch": 0.2957042957042957, "grad_norm": 0.9884352087974548, "learning_rate": 5.9053284008924185e-05, "loss": 2.328, "step": 74 }, { "epoch": 0.2997002997002997, "grad_norm": 1.2047306299209595, "learning_rate": 5.85793475276295e-05, "loss": 2.6521, "step": 75 }, { "epoch": 0.3036963036963037, "grad_norm": 1.2565317153930664, "learning_rate": 5.809735872075529e-05, "loss": 2.4167, "step": 76 }, { "epoch": 0.3076923076923077, "grad_norm": 1.2600446939468384, "learning_rate": 5.760748218703755e-05, "loss": 2.8223, "step": 77 }, { "epoch": 0.3116883116883117, "grad_norm": 1.2325770854949951, "learning_rate": 5.710988521886378e-05, "loss": 2.6209, "step": 78 }, { "epoch": 0.3156843156843157, "grad_norm": 1.5291202068328857, "learning_rate": 5.660473774514275e-05, "loss": 3.0327, "step": 79 }, { "epoch": 0.3196803196803197, "grad_norm": 1.9587080478668213, "learning_rate": 5.6092212273273975e-05, "loss": 2.5743, "step": 80 }, { "epoch": 0.32367632367632365, "grad_norm": 1.377360224723816, "learning_rate": 5.557248383023655e-05, "loss": 2.7031, "step": 81 }, { "epoch": 0.3276723276723277, "grad_norm": 1.2387492656707764, "learning_rate": 5.5045729902817676e-05, "loss": 2.5218, "step": 82 }, { "epoch": 0.33166833166833165, "grad_norm": 1.3606826066970825, "learning_rate": 5.4512130377000987e-05, "loss": 2.4445, "step": 83 }, { "epoch": 0.3356643356643357, "grad_norm": 1.3767836093902588, "learning_rate": 5.397186747653573e-05, "loss": 2.3777, "step": 84 }, { "epoch": 0.33966033966033965, "grad_norm": 1.6508392095565796, "learning_rate": 5.342512570070745e-05, "loss": 2.4937, "step": 85 }, { "epoch": 0.3436563436563437, "grad_norm": 1.4804351329803467, "learning_rate": 5.287209176133174e-05, "loss": 2.7918, "step": 86 }, { "epoch": 0.34765234765234765, "grad_norm": 1.7005175352096558, "learning_rate": 5.231295451899226e-05, "loss": 2.872, "step": 87 }, { "epoch": 0.3516483516483517, "grad_norm": 1.546638011932373, "learning_rate": 5.174790491854502e-05, "loss": 2.4164, "step": 88 }, { "epoch": 0.35564435564435565, "grad_norm": 1.698038101196289, "learning_rate": 5.117713592391096e-05, "loss": 2.777, "step": 89 }, { "epoch": 0.3596403596403596, "grad_norm": 1.5624505281448364, "learning_rate": 5.060084245217884e-05, "loss": 2.2866, "step": 90 }, { "epoch": 0.36363636363636365, "grad_norm": 1.46035897731781, "learning_rate": 5.0019221307041306e-05, "loss": 2.2441, "step": 91 }, { "epoch": 0.3676323676323676, "grad_norm": 1.7755813598632812, "learning_rate": 4.943247111158662e-05, "loss": 2.5309, "step": 92 }, { "epoch": 0.37162837162837165, "grad_norm": 1.2800326347351074, "learning_rate": 4.884079224046898e-05, "loss": 1.7132, "step": 93 }, { "epoch": 0.3756243756243756, "grad_norm": 1.3917478322982788, "learning_rate": 4.824438675148086e-05, "loss": 2.2276, "step": 94 }, { "epoch": 0.37962037962037964, "grad_norm": 1.4779376983642578, "learning_rate": 4.764345831655036e-05, "loss": 1.9192, "step": 95 }, { "epoch": 0.3836163836163836, "grad_norm": 1.580915093421936, "learning_rate": 4.703821215218748e-05, "loss": 2.1596, "step": 96 }, { "epoch": 0.3876123876123876, "grad_norm": 1.6140340566635132, "learning_rate": 4.642885494940291e-05, "loss": 2.132, "step": 97 }, { "epoch": 0.3916083916083916, "grad_norm": 1.5008978843688965, "learning_rate": 4.581559480312316e-05, "loss": 2.0517, "step": 98 }, { "epoch": 0.3956043956043956, "grad_norm": 1.6168243885040283, "learning_rate": 4.519864114112636e-05, "loss": 2.1317, "step": 99 }, { "epoch": 0.3996003996003996, "grad_norm": 2.193976640701294, "learning_rate": 4.45782046525229e-05, "loss": 2.9634, "step": 100 }, { "epoch": 0.3996003996003996, "eval_loss": 2.2858269214630127, "eval_runtime": 12.3231, "eval_samples_per_second": 34.245, "eval_steps_per_second": 8.602, "step": 100 }, { "epoch": 0.4035964035964036, "grad_norm": 2.1162538528442383, "learning_rate": 4.3954497215805244e-05, "loss": 2.0271, "step": 101 }, { "epoch": 0.4075924075924076, "grad_norm": 2.1069529056549072, "learning_rate": 4.332773182649165e-05, "loss": 1.9228, "step": 102 }, { "epoch": 0.4115884115884116, "grad_norm": 1.6667039394378662, "learning_rate": 4.2698122524388405e-05, "loss": 1.8665, "step": 103 }, { "epoch": 0.4155844155844156, "grad_norm": 1.4171080589294434, "learning_rate": 4.206588432049535e-05, "loss": 1.9626, "step": 104 }, { "epoch": 0.4195804195804196, "grad_norm": 1.287769079208374, "learning_rate": 4.143123312357996e-05, "loss": 1.8907, "step": 105 }, { "epoch": 0.42357642357642356, "grad_norm": 1.1539276838302612, "learning_rate": 4.079438566644454e-05, "loss": 2.1804, "step": 106 }, { "epoch": 0.4275724275724276, "grad_norm": 1.1381549835205078, "learning_rate": 4.015555943191231e-05, "loss": 2.3958, "step": 107 }, { "epoch": 0.43156843156843155, "grad_norm": 1.0985478162765503, "learning_rate": 3.9514972578557114e-05, "loss": 2.4117, "step": 108 }, { "epoch": 0.4355644355644356, "grad_norm": 1.124737024307251, "learning_rate": 3.8872843866202525e-05, "loss": 2.8618, "step": 109 }, { "epoch": 0.43956043956043955, "grad_norm": 1.0490049123764038, "learning_rate": 3.8229392581215565e-05, "loss": 2.6686, "step": 110 }, { "epoch": 0.4435564435564436, "grad_norm": 0.9649744629859924, "learning_rate": 3.7584838461620587e-05, "loss": 2.37, "step": 111 }, { "epoch": 0.44755244755244755, "grad_norm": 0.9374983310699463, "learning_rate": 3.693940162205895e-05, "loss": 2.3227, "step": 112 }, { "epoch": 0.4515484515484515, "grad_norm": 1.1069667339324951, "learning_rate": 3.629330247862007e-05, "loss": 2.4709, "step": 113 }, { "epoch": 0.45554445554445555, "grad_norm": 1.046639084815979, "learning_rate": 3.564676167356954e-05, "loss": 2.4697, "step": 114 }, { "epoch": 0.4595404595404595, "grad_norm": 1.0034219026565552, "learning_rate": 3.5e-05, "loss": 2.1744, "step": 115 }, { "epoch": 0.46353646353646355, "grad_norm": 1.123319387435913, "learning_rate": 3.435323832643046e-05, "loss": 2.404, "step": 116 }, { "epoch": 0.4675324675324675, "grad_norm": 1.2217833995819092, "learning_rate": 3.370669752137993e-05, "loss": 2.6623, "step": 117 }, { "epoch": 0.47152847152847155, "grad_norm": 1.2300748825073242, "learning_rate": 3.306059837794105e-05, "loss": 2.2746, "step": 118 }, { "epoch": 0.4755244755244755, "grad_norm": 1.172756552696228, "learning_rate": 3.241516153837941e-05, "loss": 2.5027, "step": 119 }, { "epoch": 0.47952047952047955, "grad_norm": 1.145707130432129, "learning_rate": 3.177060741878443e-05, "loss": 2.0651, "step": 120 }, { "epoch": 0.4835164835164835, "grad_norm": 2.8998000621795654, "learning_rate": 3.1127156133797475e-05, "loss": 2.4739, "step": 121 }, { "epoch": 0.4875124875124875, "grad_norm": 1.1812527179718018, "learning_rate": 3.048502742144289e-05, "loss": 2.1995, "step": 122 }, { "epoch": 0.4915084915084915, "grad_norm": 1.3116470575332642, "learning_rate": 2.984444056808768e-05, "loss": 2.0807, "step": 123 }, { "epoch": 0.4955044955044955, "grad_norm": 1.3395541906356812, "learning_rate": 2.9205614333555444e-05, "loss": 2.294, "step": 124 }, { "epoch": 0.4995004995004995, "grad_norm": 1.2578343152999878, "learning_rate": 2.856876687642003e-05, "loss": 2.3764, "step": 125 }, { "epoch": 0.5034965034965035, "grad_norm": 1.472633719444275, "learning_rate": 2.7934115679504645e-05, "loss": 2.546, "step": 126 }, { "epoch": 0.5074925074925075, "grad_norm": 1.339499831199646, "learning_rate": 2.7301877475611606e-05, "loss": 2.3383, "step": 127 }, { "epoch": 0.5114885114885115, "grad_norm": 1.4892079830169678, "learning_rate": 2.667226817350835e-05, "loss": 2.6243, "step": 128 }, { "epoch": 0.5154845154845155, "grad_norm": 1.5566977262496948, "learning_rate": 2.604550278419475e-05, "loss": 2.465, "step": 129 }, { "epoch": 0.5194805194805194, "grad_norm": 1.3911737203598022, "learning_rate": 2.54217953474771e-05, "loss": 2.3863, "step": 130 }, { "epoch": 0.5234765234765235, "grad_norm": 1.2917157411575317, "learning_rate": 2.4801358858873636e-05, "loss": 2.2096, "step": 131 }, { "epoch": 0.5274725274725275, "grad_norm": 1.4031473398208618, "learning_rate": 2.4184405196876842e-05, "loss": 2.2251, "step": 132 }, { "epoch": 0.5314685314685315, "grad_norm": 1.1976553201675415, "learning_rate": 2.3571145050597088e-05, "loss": 2.247, "step": 133 }, { "epoch": 0.5354645354645354, "grad_norm": 1.7376965284347534, "learning_rate": 2.296178784781251e-05, "loss": 2.3154, "step": 134 }, { "epoch": 0.5394605394605395, "grad_norm": 2.4723494052886963, "learning_rate": 2.2356541683449646e-05, "loss": 2.1628, "step": 135 }, { "epoch": 0.5434565434565435, "grad_norm": 1.3845605850219727, "learning_rate": 2.175561324851914e-05, "loss": 2.0909, "step": 136 }, { "epoch": 0.5474525474525475, "grad_norm": 1.3435039520263672, "learning_rate": 2.1159207759531013e-05, "loss": 1.919, "step": 137 }, { "epoch": 0.5514485514485514, "grad_norm": 1.2770930528640747, "learning_rate": 2.0567528888413382e-05, "loss": 2.117, "step": 138 }, { "epoch": 0.5554445554445554, "grad_norm": 1.2313694953918457, "learning_rate": 1.9980778692958684e-05, "loss": 2.1183, "step": 139 }, { "epoch": 0.5594405594405595, "grad_norm": 1.5269200801849365, "learning_rate": 1.9399157547821162e-05, "loss": 2.3234, "step": 140 }, { "epoch": 0.5634365634365635, "grad_norm": 1.1705524921417236, "learning_rate": 1.882286407608904e-05, "loss": 1.7432, "step": 141 }, { "epoch": 0.5674325674325674, "grad_norm": 1.3677781820297241, "learning_rate": 1.825209508145497e-05, "loss": 1.9979, "step": 142 }, { "epoch": 0.5714285714285714, "grad_norm": 1.4716687202453613, "learning_rate": 1.7687045481007746e-05, "loss": 2.3369, "step": 143 }, { "epoch": 0.5754245754245755, "grad_norm": 1.9906991720199585, "learning_rate": 1.712790823866826e-05, "loss": 2.2108, "step": 144 }, { "epoch": 0.5794205794205795, "grad_norm": 1.378624677658081, "learning_rate": 1.657487429929254e-05, "loss": 1.6657, "step": 145 }, { "epoch": 0.5834165834165834, "grad_norm": 1.4062309265136719, "learning_rate": 1.602813252346427e-05, "loss": 2.0265, "step": 146 }, { "epoch": 0.5874125874125874, "grad_norm": 1.5582996606826782, "learning_rate": 1.5487869622999004e-05, "loss": 2.1221, "step": 147 }, { "epoch": 0.5914085914085914, "grad_norm": 1.8642396926879883, "learning_rate": 1.4954270097182317e-05, "loss": 2.4251, "step": 148 }, { "epoch": 0.5954045954045954, "grad_norm": 1.7950752973556519, "learning_rate": 1.4427516169763444e-05, "loss": 2.4922, "step": 149 }, { "epoch": 0.5994005994005994, "grad_norm": 2.0279133319854736, "learning_rate": 1.3907787726726029e-05, "loss": 1.9366, "step": 150 }, { "epoch": 0.5994005994005994, "eval_loss": 2.1868927478790283, "eval_runtime": 12.3324, "eval_samples_per_second": 34.219, "eval_steps_per_second": 8.595, "step": 150 }, { "epoch": 0.6033966033966034, "grad_norm": 0.7327056527137756, "learning_rate": 1.339526225485725e-05, "loss": 1.5937, "step": 151 }, { "epoch": 0.6073926073926074, "grad_norm": 0.754828929901123, "learning_rate": 1.2890114781136224e-05, "loss": 1.6017, "step": 152 }, { "epoch": 0.6113886113886113, "grad_norm": 0.8262466788291931, "learning_rate": 1.239251781296245e-05, "loss": 1.8901, "step": 153 }, { "epoch": 0.6153846153846154, "grad_norm": 0.830879807472229, "learning_rate": 1.1902641279244715e-05, "loss": 1.7634, "step": 154 }, { "epoch": 0.6193806193806194, "grad_norm": 0.9203765988349915, "learning_rate": 1.1420652472370497e-05, "loss": 1.9953, "step": 155 }, { "epoch": 0.6233766233766234, "grad_norm": 0.8587179183959961, "learning_rate": 1.0946715991075805e-05, "loss": 2.2729, "step": 156 }, { "epoch": 0.6273726273726273, "grad_norm": 0.8361735343933105, "learning_rate": 1.0480993684234815e-05, "loss": 2.2542, "step": 157 }, { "epoch": 0.6313686313686314, "grad_norm": 0.9825781583786011, "learning_rate": 1.0023644595588671e-05, "loss": 2.5244, "step": 158 }, { "epoch": 0.6353646353646354, "grad_norm": 1.0092859268188477, "learning_rate": 9.57482490943216e-06, "loss": 2.5485, "step": 159 }, { "epoch": 0.6393606393606394, "grad_norm": 0.9623708128929138, "learning_rate": 9.134687897276934e-06, "loss": 2.4684, "step": 160 }, { "epoch": 0.6433566433566433, "grad_norm": 1.0368270874023438, "learning_rate": 8.703383865509432e-06, "loss": 2.2466, "step": 161 }, { "epoch": 0.6473526473526473, "grad_norm": 1.0287292003631592, "learning_rate": 8.281060104061394e-06, "loss": 2.4866, "step": 162 }, { "epoch": 0.6513486513486514, "grad_norm": 1.1260746717453003, "learning_rate": 7.867860836110453e-06, "loss": 2.453, "step": 163 }, { "epoch": 0.6553446553446554, "grad_norm": 1.2558013200759888, "learning_rate": 7.463927168828087e-06, "loss": 2.6436, "step": 164 }, { "epoch": 0.6593406593406593, "grad_norm": 1.0530807971954346, "learning_rate": 7.069397045191617e-06, "loss": 2.057, "step": 165 }, { "epoch": 0.6633366633366633, "grad_norm": 1.1688792705535889, "learning_rate": 6.684405196876842e-06, "loss": 2.2365, "step": 166 }, { "epoch": 0.6673326673326674, "grad_norm": 1.1205819845199585, "learning_rate": 6.309083098247264e-06, "loss": 1.9908, "step": 167 }, { "epoch": 0.6713286713286714, "grad_norm": 1.5774847269058228, "learning_rate": 5.943558921455733e-06, "loss": 2.5213, "step": 168 }, { "epoch": 0.6753246753246753, "grad_norm": 1.5109460353851318, "learning_rate": 5.587957492673759e-06, "loss": 2.6296, "step": 169 }, { "epoch": 0.6793206793206793, "grad_norm": 1.3312510251998901, "learning_rate": 5.2424002494635095e-06, "loss": 2.375, "step": 170 }, { "epoch": 0.6833166833166833, "grad_norm": 1.354404091835022, "learning_rate": 4.9070051993069636e-06, "loss": 2.3225, "step": 171 }, { "epoch": 0.6873126873126874, "grad_norm": 1.286742091178894, "learning_rate": 4.581886879306507e-06, "loss": 2.4046, "step": 172 }, { "epoch": 0.6913086913086913, "grad_norm": 1.2853087186813354, "learning_rate": 4.2671563170705725e-06, "loss": 2.1293, "step": 173 }, { "epoch": 0.6953046953046953, "grad_norm": 1.219387412071228, "learning_rate": 3.962920992797834e-06, "loss": 2.2614, "step": 174 }, { "epoch": 0.6993006993006993, "grad_norm": 1.3206144571304321, "learning_rate": 3.6692848025728216e-06, "loss": 2.4399, "step": 175 }, { "epoch": 0.7032967032967034, "grad_norm": 1.8097885847091675, "learning_rate": 3.38634802288549e-06, "loss": 2.0227, "step": 176 }, { "epoch": 0.7072927072927073, "grad_norm": 1.4352858066558838, "learning_rate": 3.1142072763869042e-06, "loss": 2.1061, "step": 177 }, { "epoch": 0.7112887112887113, "grad_norm": 1.3352255821228027, "learning_rate": 2.852955498892694e-06, "loss": 2.1514, "step": 178 }, { "epoch": 0.7152847152847153, "grad_norm": 1.519750714302063, "learning_rate": 2.6026819076455325e-06, "loss": 2.1995, "step": 179 }, { "epoch": 0.7192807192807192, "grad_norm": 1.6516913175582886, "learning_rate": 2.36347197084755e-06, "loss": 2.56, "step": 180 }, { "epoch": 0.7232767232767233, "grad_norm": 1.4886879920959473, "learning_rate": 2.1354073784730253e-06, "loss": 2.3716, "step": 181 }, { "epoch": 0.7272727272727273, "grad_norm": 1.7722868919372559, "learning_rate": 1.9185660143713184e-06, "loss": 2.7071, "step": 182 }, { "epoch": 0.7312687312687313, "grad_norm": 1.390089750289917, "learning_rate": 1.7130219296696263e-06, "loss": 2.0848, "step": 183 }, { "epoch": 0.7352647352647352, "grad_norm": 1.5310990810394287, "learning_rate": 1.5188453174845743e-06, "loss": 2.1562, "step": 184 }, { "epoch": 0.7392607392607392, "grad_norm": 1.7777762413024902, "learning_rate": 1.3361024889513333e-06, "loss": 2.6377, "step": 185 }, { "epoch": 0.7432567432567433, "grad_norm": 1.459740161895752, "learning_rate": 1.16485585057844e-06, "loss": 1.9993, "step": 186 }, { "epoch": 0.7472527472527473, "grad_norm": 1.5496718883514404, "learning_rate": 1.0051638829360127e-06, "loss": 2.5303, "step": 187 }, { "epoch": 0.7512487512487512, "grad_norm": 1.6609615087509155, "learning_rate": 8.570811206847189e-07, "loss": 2.4637, "step": 188 }, { "epoch": 0.7552447552447552, "grad_norm": 1.4042152166366577, "learning_rate": 7.206581339521939e-07, "loss": 1.9604, "step": 189 }, { "epoch": 0.7592407592407593, "grad_norm": 1.4808423519134521, "learning_rate": 5.959415110634375e-07, "loss": 2.2657, "step": 190 }, { "epoch": 0.7632367632367633, "grad_norm": 1.4611728191375732, "learning_rate": 4.829738426309099e-07, "loss": 2.0324, "step": 191 }, { "epoch": 0.7672327672327672, "grad_norm": 1.6654783487319946, "learning_rate": 3.817937070098914e-07, "loss": 2.4196, "step": 192 }, { "epoch": 0.7712287712287712, "grad_norm": 1.9148050546646118, "learning_rate": 2.9243565712400384e-07, "loss": 2.1463, "step": 193 }, { "epoch": 0.7752247752247752, "grad_norm": 1.5592164993286133, "learning_rate": 2.1493020866542365e-07, "loss": 2.0875, "step": 194 }, { "epoch": 0.7792207792207793, "grad_norm": 1.38995361328125, "learning_rate": 1.4930382967379363e-07, "loss": 1.7449, "step": 195 }, { "epoch": 0.7832167832167832, "grad_norm": 1.3022801876068115, "learning_rate": 9.557893149741924e-08, "loss": 1.7856, "step": 196 }, { "epoch": 0.7872127872127872, "grad_norm": 1.5510423183441162, "learning_rate": 5.377386113981197e-08, "loss": 2.0427, "step": 197 }, { "epoch": 0.7912087912087912, "grad_norm": 1.591604232788086, "learning_rate": 2.3902894994198286e-08, "loss": 2.2617, "step": 198 }, { "epoch": 0.7952047952047953, "grad_norm": 1.8469445705413818, "learning_rate": 5.976233968155164e-09, "loss": 2.4747, "step": 199 }, { "epoch": 0.7992007992007992, "grad_norm": 3.1092138290405273, "learning_rate": 0.0, "loss": 2.4267, "step": 200 }, { "epoch": 0.7992007992007992, "eval_loss": 2.168654680252075, "eval_runtime": 12.3256, "eval_samples_per_second": 34.238, "eval_steps_per_second": 8.6, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.27587608625152e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }