{ "best_metric": 1.5968632698059082, "best_model_checkpoint": "miner_id_24/checkpoint-130", "epoch": 0.04518178609248147, "eval_steps": 10, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002823861630780092, "grad_norm": 1.2056736946105957, "learning_rate": 2e-05, "loss": 1.9521, "step": 1 }, { "epoch": 0.0002823861630780092, "eval_loss": 2.352551221847534, "eval_runtime": 133.636, "eval_samples_per_second": 5.582, "eval_steps_per_second": 5.582, "step": 1 }, { "epoch": 0.0005647723261560184, "grad_norm": 4.347576141357422, "learning_rate": 4e-05, "loss": 3.1192, "step": 2 }, { "epoch": 0.0008471584892340275, "grad_norm": 0.7937899231910706, "learning_rate": 6e-05, "loss": 1.1903, "step": 3 }, { "epoch": 0.0011295446523120368, "grad_norm": 3.9262712001800537, "learning_rate": 8e-05, "loss": 3.5143, "step": 4 }, { "epoch": 0.0014119308153900459, "grad_norm": 2.7941317558288574, "learning_rate": 0.0001, "loss": 2.9205, "step": 5 }, { "epoch": 0.001694316978468055, "grad_norm": 0.9424725770950317, "learning_rate": 0.00012, "loss": 2.582, "step": 6 }, { "epoch": 0.001976703141546064, "grad_norm": 1.8026996850967407, "learning_rate": 0.00014, "loss": 2.0555, "step": 7 }, { "epoch": 0.0022590893046240735, "grad_norm": 1.0503661632537842, "learning_rate": 0.00016, "loss": 2.6065, "step": 8 }, { "epoch": 0.0025414754677020824, "grad_norm": 2.3215556144714355, "learning_rate": 0.00018, "loss": 2.6192, "step": 9 }, { "epoch": 0.0028238616307800918, "grad_norm": 1.60389244556427, "learning_rate": 0.0002, "loss": 2.1819, "step": 10 }, { "epoch": 0.0028238616307800918, "eval_loss": 2.1240005493164062, "eval_runtime": 133.7157, "eval_samples_per_second": 5.579, "eval_steps_per_second": 5.579, "step": 10 }, { "epoch": 0.003106247793858101, "grad_norm": 1.0237597227096558, "learning_rate": 0.0001999979446958366, "loss": 1.8844, "step": 11 }, { "epoch": 0.00338863395693611, "grad_norm": 4.726717472076416, "learning_rate": 0.00019999177886783194, "loss": 3.1172, "step": 12 }, { "epoch": 0.0036710201200141194, "grad_norm": 1.6403062343597412, "learning_rate": 0.00019998150276943902, "loss": 1.7007, "step": 13 }, { "epoch": 0.003953406283092128, "grad_norm": 2.1164722442626953, "learning_rate": 0.000199967116823068, "loss": 2.5321, "step": 14 }, { "epoch": 0.004235792446170138, "grad_norm": 3.5340867042541504, "learning_rate": 0.0001999486216200688, "loss": 2.3608, "step": 15 }, { "epoch": 0.004518178609248147, "grad_norm": 5.55496072769165, "learning_rate": 0.00019992601792070679, "loss": 1.6141, "step": 16 }, { "epoch": 0.004800564772326156, "grad_norm": 7.0048136711120605, "learning_rate": 0.00019989930665413147, "loss": 2.2728, "step": 17 }, { "epoch": 0.005082950935404165, "grad_norm": 1.7807990312576294, "learning_rate": 0.00019986848891833845, "loss": 1.6759, "step": 18 }, { "epoch": 0.005365337098482174, "grad_norm": 3.1774418354034424, "learning_rate": 0.0001998335659801241, "loss": 0.8362, "step": 19 }, { "epoch": 0.0056477232615601836, "grad_norm": 5.73573637008667, "learning_rate": 0.00019979453927503364, "loss": 2.7934, "step": 20 }, { "epoch": 0.0056477232615601836, "eval_loss": 1.7092158794403076, "eval_runtime": 133.3036, "eval_samples_per_second": 5.596, "eval_steps_per_second": 5.596, "step": 20 }, { "epoch": 0.005930109424638193, "grad_norm": 1.705460548400879, "learning_rate": 0.00019975141040730207, "loss": 1.3571, "step": 21 }, { "epoch": 0.006212495587716202, "grad_norm": 1.3597909212112427, "learning_rate": 0.0001997041811497882, "loss": 1.491, "step": 22 }, { "epoch": 0.006494881750794211, "grad_norm": 1.3038731813430786, "learning_rate": 0.00019965285344390184, "loss": 2.4389, "step": 23 }, { "epoch": 0.00677726791387222, "grad_norm": 2.3923144340515137, "learning_rate": 0.00019959742939952392, "loss": 1.3591, "step": 24 }, { "epoch": 0.0070596540769502295, "grad_norm": 2.964477300643921, "learning_rate": 0.00019953791129491983, "loss": 1.1022, "step": 25 }, { "epoch": 0.007342040240028239, "grad_norm": 3.184072732925415, "learning_rate": 0.00019947430157664576, "loss": 1.4837, "step": 26 }, { "epoch": 0.007624426403106248, "grad_norm": 1.576446294784546, "learning_rate": 0.00019940660285944803, "loss": 2.0992, "step": 27 }, { "epoch": 0.007906812566184257, "grad_norm": 2.878796100616455, "learning_rate": 0.00019933481792615583, "loss": 2.4712, "step": 28 }, { "epoch": 0.008189198729262267, "grad_norm": 2.5952467918395996, "learning_rate": 0.0001992589497275665, "loss": 1.6412, "step": 29 }, { "epoch": 0.008471584892340275, "grad_norm": 2.8451197147369385, "learning_rate": 0.0001991790013823246, "loss": 1.6372, "step": 30 }, { "epoch": 0.008471584892340275, "eval_loss": 1.6834224462509155, "eval_runtime": 133.2875, "eval_samples_per_second": 5.597, "eval_steps_per_second": 5.597, "step": 30 }, { "epoch": 0.008753971055418284, "grad_norm": 2.855771780014038, "learning_rate": 0.00019909497617679348, "loss": 2.608, "step": 31 }, { "epoch": 0.009036357218496294, "grad_norm": 2.0999650955200195, "learning_rate": 0.0001990068775649202, "loss": 1.6182, "step": 32 }, { "epoch": 0.009318743381574303, "grad_norm": 5.745433330535889, "learning_rate": 0.00019891470916809362, "loss": 1.6578, "step": 33 }, { "epoch": 0.009601129544652313, "grad_norm": 1.7802093029022217, "learning_rate": 0.00019881847477499557, "loss": 1.8047, "step": 34 }, { "epoch": 0.009883515707730321, "grad_norm": 3.094785451889038, "learning_rate": 0.00019871817834144504, "loss": 1.2005, "step": 35 }, { "epoch": 0.01016590187080833, "grad_norm": 2.3366200923919678, "learning_rate": 0.0001986138239902355, "loss": 2.0731, "step": 36 }, { "epoch": 0.01044828803388634, "grad_norm": 3.872102737426758, "learning_rate": 0.0001985054160109657, "loss": 1.301, "step": 37 }, { "epoch": 0.010730674196964348, "grad_norm": 1.3710724115371704, "learning_rate": 0.00019839295885986296, "loss": 1.4742, "step": 38 }, { "epoch": 0.011013060360042359, "grad_norm": 2.481275796890259, "learning_rate": 0.0001982764571596004, "loss": 1.5866, "step": 39 }, { "epoch": 0.011295446523120367, "grad_norm": 1.8309324979782104, "learning_rate": 0.00019815591569910654, "loss": 2.644, "step": 40 }, { "epoch": 0.011295446523120367, "eval_loss": 1.6503487825393677, "eval_runtime": 133.397, "eval_samples_per_second": 5.592, "eval_steps_per_second": 5.592, "step": 40 }, { "epoch": 0.011577832686198376, "grad_norm": 1.9744967222213745, "learning_rate": 0.00019803133943336874, "loss": 2.3377, "step": 41 }, { "epoch": 0.011860218849276386, "grad_norm": 1.438886046409607, "learning_rate": 0.0001979027334832293, "loss": 1.4721, "step": 42 }, { "epoch": 0.012142605012354394, "grad_norm": 3.23305082321167, "learning_rate": 0.00019777010313517518, "loss": 1.7882, "step": 43 }, { "epoch": 0.012424991175432405, "grad_norm": 5.8521013259887695, "learning_rate": 0.00019763345384112043, "loss": 1.2744, "step": 44 }, { "epoch": 0.012707377338510413, "grad_norm": 5.29448127746582, "learning_rate": 0.00019749279121818235, "loss": 1.0265, "step": 45 }, { "epoch": 0.012989763501588421, "grad_norm": 3.448343515396118, "learning_rate": 0.00019734812104845047, "loss": 2.1143, "step": 46 }, { "epoch": 0.013272149664666432, "grad_norm": 2.209937810897827, "learning_rate": 0.00019719944927874881, "loss": 1.325, "step": 47 }, { "epoch": 0.01355453582774444, "grad_norm": 1.200800895690918, "learning_rate": 0.0001970467820203915, "loss": 1.8635, "step": 48 }, { "epoch": 0.01383692199082245, "grad_norm": 1.2242612838745117, "learning_rate": 0.00019689012554893154, "loss": 1.7063, "step": 49 }, { "epoch": 0.014119308153900459, "grad_norm": 1.140673279762268, "learning_rate": 0.00019672948630390294, "loss": 1.315, "step": 50 }, { "epoch": 0.014119308153900459, "eval_loss": 1.6434087753295898, "eval_runtime": 133.392, "eval_samples_per_second": 5.593, "eval_steps_per_second": 5.593, "step": 50 }, { "epoch": 0.014401694316978467, "grad_norm": 2.5457074642181396, "learning_rate": 0.00019656487088855592, "loss": 1.5371, "step": 51 }, { "epoch": 0.014684080480056478, "grad_norm": 2.4005753993988037, "learning_rate": 0.00019639628606958533, "loss": 1.0716, "step": 52 }, { "epoch": 0.014966466643134486, "grad_norm": 2.4281392097473145, "learning_rate": 0.0001962237387768529, "loss": 2.1807, "step": 53 }, { "epoch": 0.015248852806212496, "grad_norm": 2.5013136863708496, "learning_rate": 0.00019604723610310194, "loss": 1.0288, "step": 54 }, { "epoch": 0.015531238969290505, "grad_norm": 2.0805702209472656, "learning_rate": 0.00019586678530366606, "loss": 1.1582, "step": 55 }, { "epoch": 0.015813625132368513, "grad_norm": 7.539531707763672, "learning_rate": 0.00019568239379617088, "loss": 1.7791, "step": 56 }, { "epoch": 0.016096011295446522, "grad_norm": 1.5069524049758911, "learning_rate": 0.00019549406916022905, "loss": 1.6449, "step": 57 }, { "epoch": 0.016378397458524534, "grad_norm": 1.1627360582351685, "learning_rate": 0.00019530181913712872, "loss": 1.2881, "step": 58 }, { "epoch": 0.016660783621602542, "grad_norm": 5.9471282958984375, "learning_rate": 0.00019510565162951537, "loss": 1.6948, "step": 59 }, { "epoch": 0.01694316978468055, "grad_norm": 1.4962432384490967, "learning_rate": 0.00019490557470106686, "loss": 1.3254, "step": 60 }, { "epoch": 0.01694316978468055, "eval_loss": 1.633779525756836, "eval_runtime": 133.2937, "eval_samples_per_second": 5.597, "eval_steps_per_second": 5.597, "step": 60 }, { "epoch": 0.01722555594775856, "grad_norm": 1.3225600719451904, "learning_rate": 0.00019470159657616215, "loss": 1.7811, "step": 61 }, { "epoch": 0.017507942110836568, "grad_norm": 1.632053017616272, "learning_rate": 0.00019449372563954293, "loss": 1.8656, "step": 62 }, { "epoch": 0.01779032827391458, "grad_norm": 1.7111440896987915, "learning_rate": 0.0001942819704359693, "loss": 0.8098, "step": 63 }, { "epoch": 0.018072714436992588, "grad_norm": 1.5115035772323608, "learning_rate": 0.00019406633966986828, "loss": 1.0582, "step": 64 }, { "epoch": 0.018355100600070597, "grad_norm": 2.999513626098633, "learning_rate": 0.00019384684220497605, "loss": 2.1569, "step": 65 }, { "epoch": 0.018637486763148605, "grad_norm": 1.0796102285385132, "learning_rate": 0.00019362348706397373, "loss": 1.4516, "step": 66 }, { "epoch": 0.018919872926226614, "grad_norm": 2.9733681678771973, "learning_rate": 0.00019339628342811632, "loss": 1.6857, "step": 67 }, { "epoch": 0.019202259089304625, "grad_norm": 2.086916923522949, "learning_rate": 0.0001931652406368554, "loss": 0.7689, "step": 68 }, { "epoch": 0.019484645252382634, "grad_norm": 1.511246919631958, "learning_rate": 0.0001929303681874552, "loss": 1.5469, "step": 69 }, { "epoch": 0.019767031415460642, "grad_norm": 2.029017925262451, "learning_rate": 0.0001926916757346022, "loss": 1.3705, "step": 70 }, { "epoch": 0.019767031415460642, "eval_loss": 1.6125953197479248, "eval_runtime": 133.426, "eval_samples_per_second": 5.591, "eval_steps_per_second": 5.591, "step": 70 }, { "epoch": 0.02004941757853865, "grad_norm": 2.52913761138916, "learning_rate": 0.00019244917309000817, "loss": 1.722, "step": 71 }, { "epoch": 0.02033180374161666, "grad_norm": 7.770167350769043, "learning_rate": 0.00019220287022200707, "loss": 2.7036, "step": 72 }, { "epoch": 0.02061418990469467, "grad_norm": 1.7608412504196167, "learning_rate": 0.0001919527772551451, "loss": 1.2768, "step": 73 }, { "epoch": 0.02089657606777268, "grad_norm": 2.3405442237854004, "learning_rate": 0.00019169890446976454, "loss": 2.2241, "step": 74 }, { "epoch": 0.02117896223085069, "grad_norm": 2.386042356491089, "learning_rate": 0.00019144126230158127, "loss": 1.3922, "step": 75 }, { "epoch": 0.021461348393928697, "grad_norm": 2.280710458755493, "learning_rate": 0.0001911798613412557, "loss": 1.608, "step": 76 }, { "epoch": 0.021743734557006705, "grad_norm": 1.2972298860549927, "learning_rate": 0.0001909147123339575, "loss": 2.1776, "step": 77 }, { "epoch": 0.022026120720084717, "grad_norm": 1.4631404876708984, "learning_rate": 0.0001906458261789238, "loss": 3.1008, "step": 78 }, { "epoch": 0.022308506883162726, "grad_norm": 1.0595492124557495, "learning_rate": 0.00019037321392901136, "loss": 1.3511, "step": 79 }, { "epoch": 0.022590893046240734, "grad_norm": 0.9610152244567871, "learning_rate": 0.0001900968867902419, "loss": 2.1504, "step": 80 }, { "epoch": 0.022590893046240734, "eval_loss": 1.602448582649231, "eval_runtime": 133.5988, "eval_samples_per_second": 5.584, "eval_steps_per_second": 5.584, "step": 80 }, { "epoch": 0.022873279209318743, "grad_norm": 1.699086308479309, "learning_rate": 0.0001898168561213419, "loss": 1.3892, "step": 81 }, { "epoch": 0.02315566537239675, "grad_norm": 1.2091821432113647, "learning_rate": 0.0001895331334332753, "loss": 1.5162, "step": 82 }, { "epoch": 0.023438051535474763, "grad_norm": 1.6631978750228882, "learning_rate": 0.0001892457303887706, "loss": 0.8076, "step": 83 }, { "epoch": 0.02372043769855277, "grad_norm": 1.577644944190979, "learning_rate": 0.0001889546588018412, "loss": 1.4393, "step": 84 }, { "epoch": 0.02400282386163078, "grad_norm": 1.207412838935852, "learning_rate": 0.00018865993063730004, "loss": 2.1015, "step": 85 }, { "epoch": 0.02428521002470879, "grad_norm": 2.7810745239257812, "learning_rate": 0.00018836155801026753, "loss": 1.4547, "step": 86 }, { "epoch": 0.024567596187786797, "grad_norm": 2.054161787033081, "learning_rate": 0.0001880595531856738, "loss": 1.3083, "step": 87 }, { "epoch": 0.02484998235086481, "grad_norm": 3.753908634185791, "learning_rate": 0.00018775392857775432, "loss": 2.8305, "step": 88 }, { "epoch": 0.025132368513942818, "grad_norm": 4.611723899841309, "learning_rate": 0.00018744469674953956, "loss": 1.7302, "step": 89 }, { "epoch": 0.025414754677020826, "grad_norm": 2.27549409866333, "learning_rate": 0.00018713187041233896, "loss": 1.8115, "step": 90 }, { "epoch": 0.025414754677020826, "eval_loss": 1.6072229146957397, "eval_runtime": 133.6033, "eval_samples_per_second": 5.584, "eval_steps_per_second": 5.584, "step": 90 }, { "epoch": 0.025697140840098835, "grad_norm": 1.4146398305892944, "learning_rate": 0.00018681546242521786, "loss": 1.1993, "step": 91 }, { "epoch": 0.025979527003176843, "grad_norm": 2.2005670070648193, "learning_rate": 0.00018649548579446936, "loss": 1.9517, "step": 92 }, { "epoch": 0.026261913166254855, "grad_norm": 1.241758108139038, "learning_rate": 0.0001861719536730795, "loss": 2.0892, "step": 93 }, { "epoch": 0.026544299329332863, "grad_norm": 1.4617339372634888, "learning_rate": 0.00018584487936018661, "loss": 2.2815, "step": 94 }, { "epoch": 0.026826685492410872, "grad_norm": 1.677581548690796, "learning_rate": 0.00018551427630053463, "loss": 1.75, "step": 95 }, { "epoch": 0.02710907165548888, "grad_norm": 3.2750422954559326, "learning_rate": 0.00018518015808392045, "loss": 1.8473, "step": 96 }, { "epoch": 0.02739145781856689, "grad_norm": 1.7410293817520142, "learning_rate": 0.00018484253844463526, "loss": 1.2498, "step": 97 }, { "epoch": 0.0276738439816449, "grad_norm": 1.0431251525878906, "learning_rate": 0.00018450143126090015, "loss": 2.3196, "step": 98 }, { "epoch": 0.02795623014472291, "grad_norm": 2.758586883544922, "learning_rate": 0.00018415685055429533, "loss": 1.9701, "step": 99 }, { "epoch": 0.028238616307800918, "grad_norm": 1.7685903310775757, "learning_rate": 0.00018380881048918405, "loss": 1.4758, "step": 100 }, { "epoch": 0.028238616307800918, "eval_loss": 1.6011497974395752, "eval_runtime": 133.8006, "eval_samples_per_second": 5.575, "eval_steps_per_second": 5.575, "step": 100 }, { "epoch": 0.028521002470878926, "grad_norm": 1.1203055381774902, "learning_rate": 0.00018345732537213027, "loss": 1.7475, "step": 101 }, { "epoch": 0.028803388633956935, "grad_norm": 1.274515986442566, "learning_rate": 0.00018310240965131041, "loss": 2.6023, "step": 102 }, { "epoch": 0.029085774797034947, "grad_norm": 2.5792765617370605, "learning_rate": 0.00018274407791591966, "loss": 1.1908, "step": 103 }, { "epoch": 0.029368160960112955, "grad_norm": 1.466035008430481, "learning_rate": 0.00018238234489557215, "loss": 0.7359, "step": 104 }, { "epoch": 0.029650547123190964, "grad_norm": 3.4681172370910645, "learning_rate": 0.0001820172254596956, "loss": 1.8144, "step": 105 }, { "epoch": 0.029932933286268972, "grad_norm": 4.0510993003845215, "learning_rate": 0.00018164873461691986, "loss": 0.7832, "step": 106 }, { "epoch": 0.03021531944934698, "grad_norm": 5.226031303405762, "learning_rate": 0.00018127688751446027, "loss": 1.7575, "step": 107 }, { "epoch": 0.030497705612424993, "grad_norm": 1.0487242937088013, "learning_rate": 0.00018090169943749476, "loss": 2.077, "step": 108 }, { "epoch": 0.030780091775503, "grad_norm": 1.5338118076324463, "learning_rate": 0.0001805231858085356, "loss": 1.5191, "step": 109 }, { "epoch": 0.03106247793858101, "grad_norm": 1.2566704750061035, "learning_rate": 0.00018014136218679567, "loss": 1.756, "step": 110 }, { "epoch": 0.03106247793858101, "eval_loss": 1.6045676469802856, "eval_runtime": 133.6566, "eval_samples_per_second": 5.581, "eval_steps_per_second": 5.581, "step": 110 }, { "epoch": 0.03134486410165902, "grad_norm": 0.7646064758300781, "learning_rate": 0.00017975624426754848, "loss": 1.6671, "step": 111 }, { "epoch": 0.03162725026473703, "grad_norm": 2.133544445037842, "learning_rate": 0.00017936784788148328, "loss": 1.5909, "step": 112 }, { "epoch": 0.03190963642781504, "grad_norm": 2.059943199157715, "learning_rate": 0.00017897618899405423, "loss": 1.7489, "step": 113 }, { "epoch": 0.032192022590893044, "grad_norm": 0.8779903650283813, "learning_rate": 0.00017858128370482426, "loss": 1.4871, "step": 114 }, { "epoch": 0.032474408753971055, "grad_norm": 1.5168753862380981, "learning_rate": 0.000178183148246803, "loss": 1.8045, "step": 115 }, { "epoch": 0.03275679491704907, "grad_norm": 1.1241475343704224, "learning_rate": 0.00017778179898577973, "loss": 1.64, "step": 116 }, { "epoch": 0.03303918108012707, "grad_norm": 9.078608512878418, "learning_rate": 0.00017737725241965069, "loss": 2.7736, "step": 117 }, { "epoch": 0.033321567243205084, "grad_norm": 3.2590787410736084, "learning_rate": 0.00017696952517774062, "loss": 2.5064, "step": 118 }, { "epoch": 0.03360395340628309, "grad_norm": 2.293269395828247, "learning_rate": 0.00017655863402011947, "loss": 2.146, "step": 119 }, { "epoch": 0.0338863395693611, "grad_norm": 1.5803933143615723, "learning_rate": 0.00017614459583691346, "loss": 1.4979, "step": 120 }, { "epoch": 0.0338863395693611, "eval_loss": 1.603722095489502, "eval_runtime": 133.2987, "eval_samples_per_second": 5.596, "eval_steps_per_second": 5.596, "step": 120 }, { "epoch": 0.03416872573243911, "grad_norm": 1.4283939599990845, "learning_rate": 0.00017572742764761055, "loss": 1.4789, "step": 121 }, { "epoch": 0.03445111189551712, "grad_norm": 1.3361456394195557, "learning_rate": 0.00017530714660036112, "loss": 0.784, "step": 122 }, { "epoch": 0.03473349805859513, "grad_norm": 1.0861424207687378, "learning_rate": 0.00017488376997127283, "loss": 2.2809, "step": 123 }, { "epoch": 0.035015884221673135, "grad_norm": 4.459283351898193, "learning_rate": 0.0001744573151637007, "loss": 1.4483, "step": 124 }, { "epoch": 0.03529827038475115, "grad_norm": 1.324436902999878, "learning_rate": 0.00017402779970753155, "loss": 2.6136, "step": 125 }, { "epoch": 0.03558065654782916, "grad_norm": 3.7964041233062744, "learning_rate": 0.0001735952412584635, "loss": 1.092, "step": 126 }, { "epoch": 0.035863042710907164, "grad_norm": 2.560436725616455, "learning_rate": 0.00017315965759728014, "loss": 1.4307, "step": 127 }, { "epoch": 0.036145428873985176, "grad_norm": 1.473990797996521, "learning_rate": 0.00017272106662911973, "loss": 1.1344, "step": 128 }, { "epoch": 0.03642781503706318, "grad_norm": 3.3736298084259033, "learning_rate": 0.00017227948638273916, "loss": 1.5746, "step": 129 }, { "epoch": 0.03671020120014119, "grad_norm": 1.5858126878738403, "learning_rate": 0.00017183493500977278, "loss": 1.3798, "step": 130 }, { "epoch": 0.03671020120014119, "eval_loss": 1.5968632698059082, "eval_runtime": 133.2364, "eval_samples_per_second": 5.599, "eval_steps_per_second": 5.599, "step": 130 }, { "epoch": 0.036992587363219205, "grad_norm": 0.9281368851661682, "learning_rate": 0.0001713874307839863, "loss": 1.8154, "step": 131 }, { "epoch": 0.03727497352629721, "grad_norm": 2.6511611938476562, "learning_rate": 0.0001709369921005258, "loss": 1.4, "step": 132 }, { "epoch": 0.03755735968937522, "grad_norm": 1.9646121263504028, "learning_rate": 0.00017048363747516117, "loss": 1.4126, "step": 133 }, { "epoch": 0.03783974585245323, "grad_norm": 1.7208032608032227, "learning_rate": 0.00017002738554352552, "loss": 0.5568, "step": 134 }, { "epoch": 0.03812213201553124, "grad_norm": 2.9022722244262695, "learning_rate": 0.00016956825506034867, "loss": 1.6914, "step": 135 }, { "epoch": 0.03840451817860925, "grad_norm": 1.368131160736084, "learning_rate": 0.00016910626489868649, "loss": 1.5647, "step": 136 }, { "epoch": 0.038686904341687256, "grad_norm": 1.5058932304382324, "learning_rate": 0.00016864143404914504, "loss": 2.4011, "step": 137 }, { "epoch": 0.03896929050476527, "grad_norm": 2.3039586544036865, "learning_rate": 0.00016817378161909996, "loss": 0.9973, "step": 138 }, { "epoch": 0.03925167666784327, "grad_norm": 1.9210929870605469, "learning_rate": 0.00016770332683191096, "loss": 1.7679, "step": 139 }, { "epoch": 0.039534062830921285, "grad_norm": 1.3414863348007202, "learning_rate": 0.0001672300890261317, "loss": 1.4788, "step": 140 }, { "epoch": 0.039534062830921285, "eval_loss": 1.6231486797332764, "eval_runtime": 133.2243, "eval_samples_per_second": 5.6, "eval_steps_per_second": 5.6, "step": 140 }, { "epoch": 0.0398164489939993, "grad_norm": 1.8461261987686157, "learning_rate": 0.0001667540876547148, "loss": 2.1281, "step": 141 }, { "epoch": 0.0400988351570773, "grad_norm": 3.0568318367004395, "learning_rate": 0.0001662753422842123, "loss": 1.476, "step": 142 }, { "epoch": 0.040381221320155314, "grad_norm": 1.1168292760849, "learning_rate": 0.00016579387259397127, "loss": 0.9893, "step": 143 }, { "epoch": 0.04066360748323332, "grad_norm": 2.078538417816162, "learning_rate": 0.00016530969837532487, "loss": 0.8452, "step": 144 }, { "epoch": 0.04094599364631133, "grad_norm": 2.543635845184326, "learning_rate": 0.00016482283953077887, "loss": 1.5892, "step": 145 }, { "epoch": 0.04122837980938934, "grad_norm": 1.8149226903915405, "learning_rate": 0.00016433331607319343, "loss": 1.4509, "step": 146 }, { "epoch": 0.04151076597246735, "grad_norm": 1.4729820489883423, "learning_rate": 0.00016384114812496056, "loss": 1.5081, "step": 147 }, { "epoch": 0.04179315213554536, "grad_norm": 2.2723262310028076, "learning_rate": 0.00016334635591717703, "loss": 2.19, "step": 148 }, { "epoch": 0.042075538298623365, "grad_norm": 1.748171091079712, "learning_rate": 0.00016284895978881236, "loss": 2.2346, "step": 149 }, { "epoch": 0.04235792446170138, "grad_norm": 1.4556044340133667, "learning_rate": 0.00016234898018587337, "loss": 1.1614, "step": 150 }, { "epoch": 0.04235792446170138, "eval_loss": 1.6199051141738892, "eval_runtime": 133.3692, "eval_samples_per_second": 5.593, "eval_steps_per_second": 5.593, "step": 150 }, { "epoch": 0.04264031062477939, "grad_norm": 1.4214565753936768, "learning_rate": 0.00016184643766056317, "loss": 1.9267, "step": 151 }, { "epoch": 0.042922696787857394, "grad_norm": 1.2661665678024292, "learning_rate": 0.00016134135287043669, "loss": 1.8779, "step": 152 }, { "epoch": 0.043205082950935406, "grad_norm": 2.253584384918213, "learning_rate": 0.00016083374657755134, "loss": 1.2, "step": 153 }, { "epoch": 0.04348746911401341, "grad_norm": 2.3451638221740723, "learning_rate": 0.00016032363964761363, "loss": 2.1057, "step": 154 }, { "epoch": 0.04376985527709142, "grad_norm": 2.77101731300354, "learning_rate": 0.00015981105304912162, "loss": 1.8791, "step": 155 }, { "epoch": 0.044052241440169435, "grad_norm": 1.678722620010376, "learning_rate": 0.00015929600785250257, "loss": 2.4479, "step": 156 }, { "epoch": 0.04433462760324744, "grad_norm": 1.2198508977890015, "learning_rate": 0.00015877852522924732, "loss": 2.3366, "step": 157 }, { "epoch": 0.04461701376632545, "grad_norm": 5.628009796142578, "learning_rate": 0.0001582586264510396, "loss": 1.3177, "step": 158 }, { "epoch": 0.04489939992940346, "grad_norm": 2.065458297729492, "learning_rate": 0.00015773633288888197, "loss": 2.0971, "step": 159 }, { "epoch": 0.04518178609248147, "grad_norm": 0.9564564824104309, "learning_rate": 0.00015721166601221698, "loss": 1.3886, "step": 160 }, { "epoch": 0.04518178609248147, "eval_loss": 1.6064260005950928, "eval_runtime": 133.4242, "eval_samples_per_second": 5.591, "eval_steps_per_second": 5.591, "step": 160 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.566085736300544e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }