{ "best_metric": 1.6035598516464233, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.0338863395693611, "eval_steps": 10, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002823861630780092, "grad_norm": 1.2318695783615112, "learning_rate": 2e-05, "loss": 2.0472, "step": 1 }, { "epoch": 0.0002823861630780092, "eval_loss": 2.352551221847534, "eval_runtime": 134.4924, "eval_samples_per_second": 5.547, "eval_steps_per_second": 5.547, "step": 1 }, { "epoch": 0.0005647723261560184, "grad_norm": 3.381352663040161, "learning_rate": 4e-05, "loss": 2.7995, "step": 2 }, { "epoch": 0.0008471584892340275, "grad_norm": 0.7312057018280029, "learning_rate": 6e-05, "loss": 1.1274, "step": 3 }, { "epoch": 0.0011295446523120368, "grad_norm": 1.382169485092163, "learning_rate": 8e-05, "loss": 3.0301, "step": 4 }, { "epoch": 0.0014119308153900459, "grad_norm": 1.6353622674942017, "learning_rate": 0.0001, "loss": 2.7291, "step": 5 }, { "epoch": 0.001694316978468055, "grad_norm": 0.6910256147384644, "learning_rate": 0.00012, "loss": 2.8307, "step": 6 }, { "epoch": 0.001976703141546064, "grad_norm": 1.4815878868103027, "learning_rate": 0.00014, "loss": 2.0774, "step": 7 }, { "epoch": 0.0022590893046240735, "grad_norm": 1.042596459388733, "learning_rate": 0.00016, "loss": 2.6145, "step": 8 }, { "epoch": 0.0025414754677020824, "grad_norm": 0.8719121217727661, "learning_rate": 0.00018, "loss": 2.9212, "step": 9 }, { "epoch": 0.0028238616307800918, "grad_norm": 1.0421559810638428, "learning_rate": 0.0002, "loss": 2.3507, "step": 10 }, { "epoch": 0.0028238616307800918, "eval_loss": 2.1853621006011963, "eval_runtime": 133.969, "eval_samples_per_second": 5.568, "eval_steps_per_second": 5.568, "step": 10 }, { "epoch": 0.003106247793858101, "grad_norm": 0.9601467847824097, "learning_rate": 0.0001999979446958366, "loss": 1.9811, "step": 11 }, { "epoch": 0.00338863395693611, "grad_norm": 1.353819489479065, "learning_rate": 0.00019999177886783194, "loss": 2.373, "step": 12 }, { "epoch": 0.0036710201200141194, "grad_norm": 0.897077202796936, "learning_rate": 0.00019998150276943902, "loss": 1.696, "step": 13 }, { "epoch": 0.003953406283092128, "grad_norm": 1.358173131942749, "learning_rate": 0.000199967116823068, "loss": 2.3396, "step": 14 }, { "epoch": 0.004235792446170138, "grad_norm": 1.0444282293319702, "learning_rate": 0.0001999486216200688, "loss": 1.482, "step": 15 }, { "epoch": 0.004518178609248147, "grad_norm": 3.5194482803344727, "learning_rate": 0.00019992601792070679, "loss": 1.6677, "step": 16 }, { "epoch": 0.004800564772326156, "grad_norm": 3.523196220397949, "learning_rate": 0.00019989930665413147, "loss": 2.393, "step": 17 }, { "epoch": 0.005082950935404165, "grad_norm": 0.9800947904586792, "learning_rate": 0.00019986848891833845, "loss": 1.6947, "step": 18 }, { "epoch": 0.005365337098482174, "grad_norm": 1.540337324142456, "learning_rate": 0.0001998335659801241, "loss": 0.563, "step": 19 }, { "epoch": 0.0056477232615601836, "grad_norm": 1.577869176864624, "learning_rate": 0.00019979453927503364, "loss": 2.0525, "step": 20 }, { "epoch": 0.0056477232615601836, "eval_loss": 1.7350926399230957, "eval_runtime": 133.9613, "eval_samples_per_second": 5.569, "eval_steps_per_second": 5.569, "step": 20 }, { "epoch": 0.005930109424638193, "grad_norm": 0.876990020275116, "learning_rate": 0.00019975141040730207, "loss": 1.2469, "step": 21 }, { "epoch": 0.006212495587716202, "grad_norm": 1.1722772121429443, "learning_rate": 0.0001997041811497882, "loss": 2.2798, "step": 22 }, { "epoch": 0.006494881750794211, "grad_norm": 0.9232650995254517, "learning_rate": 0.00019965285344390184, "loss": 2.5319, "step": 23 }, { "epoch": 0.00677726791387222, "grad_norm": 1.6948198080062866, "learning_rate": 0.00019959742939952392, "loss": 1.6864, "step": 24 }, { "epoch": 0.0070596540769502295, "grad_norm": 1.277117133140564, "learning_rate": 0.00019953791129491983, "loss": 2.1785, "step": 25 }, { "epoch": 0.007342040240028239, "grad_norm": 3.347221851348877, "learning_rate": 0.00019947430157664576, "loss": 1.5917, "step": 26 }, { "epoch": 0.007624426403106248, "grad_norm": 1.2190711498260498, "learning_rate": 0.00019940660285944803, "loss": 2.3655, "step": 27 }, { "epoch": 0.007906812566184257, "grad_norm": 1.9080499410629272, "learning_rate": 0.00019933481792615583, "loss": 2.0602, "step": 28 }, { "epoch": 0.008189198729262267, "grad_norm": 1.5589416027069092, "learning_rate": 0.0001992589497275665, "loss": 1.9817, "step": 29 }, { "epoch": 0.008471584892340275, "grad_norm": 2.701538562774658, "learning_rate": 0.0001991790013823246, "loss": 1.434, "step": 30 }, { "epoch": 0.008471584892340275, "eval_loss": 1.6901509761810303, "eval_runtime": 134.7409, "eval_samples_per_second": 5.537, "eval_steps_per_second": 5.537, "step": 30 }, { "epoch": 0.008753971055418284, "grad_norm": 1.0493268966674805, "learning_rate": 0.00019909497617679348, "loss": 2.413, "step": 31 }, { "epoch": 0.009036357218496294, "grad_norm": 1.5526515245437622, "learning_rate": 0.0001990068775649202, "loss": 2.2275, "step": 32 }, { "epoch": 0.009318743381574303, "grad_norm": 1.4149539470672607, "learning_rate": 0.00019891470916809362, "loss": 2.4908, "step": 33 }, { "epoch": 0.009601129544652313, "grad_norm": 1.2589884996414185, "learning_rate": 0.00019881847477499557, "loss": 1.7045, "step": 34 }, { "epoch": 0.009883515707730321, "grad_norm": 2.9048409461975098, "learning_rate": 0.00019871817834144504, "loss": 1.6211, "step": 35 }, { "epoch": 0.01016590187080833, "grad_norm": 1.029226303100586, "learning_rate": 0.0001986138239902355, "loss": 2.1988, "step": 36 }, { "epoch": 0.01044828803388634, "grad_norm": 1.716841697692871, "learning_rate": 0.0001985054160109657, "loss": 1.7938, "step": 37 }, { "epoch": 0.010730674196964348, "grad_norm": 1.0365818738937378, "learning_rate": 0.00019839295885986296, "loss": 1.896, "step": 38 }, { "epoch": 0.011013060360042359, "grad_norm": 1.4247523546218872, "learning_rate": 0.0001982764571596004, "loss": 2.0023, "step": 39 }, { "epoch": 0.011295446523120367, "grad_norm": 1.0945035219192505, "learning_rate": 0.00019815591569910654, "loss": 2.4118, "step": 40 }, { "epoch": 0.011295446523120367, "eval_loss": 1.651785135269165, "eval_runtime": 134.7054, "eval_samples_per_second": 5.538, "eval_steps_per_second": 5.538, "step": 40 }, { "epoch": 0.011577832686198376, "grad_norm": 1.3919488191604614, "learning_rate": 0.00019803133943336874, "loss": 2.096, "step": 41 }, { "epoch": 0.011860218849276386, "grad_norm": 1.0962899923324585, "learning_rate": 0.0001979027334832293, "loss": 2.0469, "step": 42 }, { "epoch": 0.012142605012354394, "grad_norm": 1.147463083267212, "learning_rate": 0.00019777010313517518, "loss": 2.0405, "step": 43 }, { "epoch": 0.012424991175432405, "grad_norm": 2.677388906478882, "learning_rate": 0.00019763345384112043, "loss": 1.0599, "step": 44 }, { "epoch": 0.012707377338510413, "grad_norm": 1.20905339717865, "learning_rate": 0.00019749279121818235, "loss": 0.6549, "step": 45 }, { "epoch": 0.012989763501588421, "grad_norm": 1.109489917755127, "learning_rate": 0.00019734812104845047, "loss": 1.9309, "step": 46 }, { "epoch": 0.013272149664666432, "grad_norm": 1.518839955329895, "learning_rate": 0.00019719944927874881, "loss": 1.8122, "step": 47 }, { "epoch": 0.01355453582774444, "grad_norm": 0.8112537860870361, "learning_rate": 0.0001970467820203915, "loss": 1.8915, "step": 48 }, { "epoch": 0.01383692199082245, "grad_norm": 0.7357921004295349, "learning_rate": 0.00019689012554893154, "loss": 2.1672, "step": 49 }, { "epoch": 0.014119308153900459, "grad_norm": 0.7238839268684387, "learning_rate": 0.00019672948630390294, "loss": 0.9114, "step": 50 }, { "epoch": 0.014119308153900459, "eval_loss": 1.624532699584961, "eval_runtime": 134.5228, "eval_samples_per_second": 5.546, "eval_steps_per_second": 5.546, "step": 50 }, { "epoch": 0.014401694316978467, "grad_norm": 4.2082200050354, "learning_rate": 0.00019656487088855592, "loss": 1.5288, "step": 51 }, { "epoch": 0.014684080480056478, "grad_norm": 0.6968432068824768, "learning_rate": 0.00019639628606958533, "loss": 1.4653, "step": 52 }, { "epoch": 0.014966466643134486, "grad_norm": 1.9569013118743896, "learning_rate": 0.0001962237387768529, "loss": 1.7478, "step": 53 }, { "epoch": 0.015248852806212496, "grad_norm": 1.2472929954528809, "learning_rate": 0.00019604723610310194, "loss": 1.1666, "step": 54 }, { "epoch": 0.015531238969290505, "grad_norm": 0.7596736550331116, "learning_rate": 0.00019586678530366606, "loss": 1.7902, "step": 55 }, { "epoch": 0.015813625132368513, "grad_norm": 1.2892470359802246, "learning_rate": 0.00019568239379617088, "loss": 1.8049, "step": 56 }, { "epoch": 0.016096011295446522, "grad_norm": 1.5124154090881348, "learning_rate": 0.00019549406916022905, "loss": 1.7232, "step": 57 }, { "epoch": 0.016378397458524534, "grad_norm": 1.0523513555526733, "learning_rate": 0.00019530181913712872, "loss": 1.772, "step": 58 }, { "epoch": 0.016660783621602542, "grad_norm": 2.185861825942993, "learning_rate": 0.00019510565162951537, "loss": 2.3071, "step": 59 }, { "epoch": 0.01694316978468055, "grad_norm": 0.6275917887687683, "learning_rate": 0.00019490557470106686, "loss": 2.0081, "step": 60 }, { "epoch": 0.01694316978468055, "eval_loss": 1.624603033065796, "eval_runtime": 134.5052, "eval_samples_per_second": 5.546, "eval_steps_per_second": 5.546, "step": 60 }, { "epoch": 0.01722555594775856, "grad_norm": 0.9301966428756714, "learning_rate": 0.00019470159657616215, "loss": 1.7401, "step": 61 }, { "epoch": 0.017507942110836568, "grad_norm": 1.2353283166885376, "learning_rate": 0.00019449372563954293, "loss": 1.7241, "step": 62 }, { "epoch": 0.01779032827391458, "grad_norm": 1.0061556100845337, "learning_rate": 0.0001942819704359693, "loss": 0.5784, "step": 63 }, { "epoch": 0.018072714436992588, "grad_norm": 1.062904953956604, "learning_rate": 0.00019406633966986828, "loss": 2.1718, "step": 64 }, { "epoch": 0.018355100600070597, "grad_norm": 1.897362470626831, "learning_rate": 0.00019384684220497605, "loss": 1.9466, "step": 65 }, { "epoch": 0.018637486763148605, "grad_norm": 1.1101641654968262, "learning_rate": 0.00019362348706397373, "loss": 1.8438, "step": 66 }, { "epoch": 0.018919872926226614, "grad_norm": 0.8784818649291992, "learning_rate": 0.00019339628342811632, "loss": 0.844, "step": 67 }, { "epoch": 0.019202259089304625, "grad_norm": 1.256935477256775, "learning_rate": 0.0001931652406368554, "loss": 0.9449, "step": 68 }, { "epoch": 0.019484645252382634, "grad_norm": 1.1505467891693115, "learning_rate": 0.0001929303681874552, "loss": 1.9627, "step": 69 }, { "epoch": 0.019767031415460642, "grad_norm": 1.6793428659439087, "learning_rate": 0.0001926916757346022, "loss": 2.6109, "step": 70 }, { "epoch": 0.019767031415460642, "eval_loss": 1.6243711709976196, "eval_runtime": 134.6115, "eval_samples_per_second": 5.542, "eval_steps_per_second": 5.542, "step": 70 }, { "epoch": 0.02004941757853865, "grad_norm": 2.2794175148010254, "learning_rate": 0.00019244917309000817, "loss": 1.624, "step": 71 }, { "epoch": 0.02033180374161666, "grad_norm": 1.5373339653015137, "learning_rate": 0.00019220287022200707, "loss": 1.8068, "step": 72 }, { "epoch": 0.02061418990469467, "grad_norm": 0.9991006255149841, "learning_rate": 0.0001919527772551451, "loss": 1.6704, "step": 73 }, { "epoch": 0.02089657606777268, "grad_norm": 1.392340064048767, "learning_rate": 0.00019169890446976454, "loss": 2.378, "step": 74 }, { "epoch": 0.02117896223085069, "grad_norm": 1.1224641799926758, "learning_rate": 0.00019144126230158127, "loss": 1.6227, "step": 75 }, { "epoch": 0.021461348393928697, "grad_norm": 1.2003456354141235, "learning_rate": 0.0001911798613412557, "loss": 2.4362, "step": 76 }, { "epoch": 0.021743734557006705, "grad_norm": 1.102264642715454, "learning_rate": 0.0001909147123339575, "loss": 2.3662, "step": 77 }, { "epoch": 0.022026120720084717, "grad_norm": 0.7298341989517212, "learning_rate": 0.0001906458261789238, "loss": 2.8474, "step": 78 }, { "epoch": 0.022308506883162726, "grad_norm": 1.0796235799789429, "learning_rate": 0.00019037321392901136, "loss": 1.5521, "step": 79 }, { "epoch": 0.022590893046240734, "grad_norm": 0.7039556503295898, "learning_rate": 0.0001900968867902419, "loss": 2.47, "step": 80 }, { "epoch": 0.022590893046240734, "eval_loss": 1.617271900177002, "eval_runtime": 134.9383, "eval_samples_per_second": 5.528, "eval_steps_per_second": 5.528, "step": 80 }, { "epoch": 0.022873279209318743, "grad_norm": 0.908437192440033, "learning_rate": 0.0001898168561213419, "loss": 1.9605, "step": 81 }, { "epoch": 0.02315566537239675, "grad_norm": 0.8091934323310852, "learning_rate": 0.0001895331334332753, "loss": 1.5024, "step": 82 }, { "epoch": 0.023438051535474763, "grad_norm": 1.5132545232772827, "learning_rate": 0.0001892457303887706, "loss": 1.0696, "step": 83 }, { "epoch": 0.02372043769855277, "grad_norm": 1.2116718292236328, "learning_rate": 0.0001889546588018412, "loss": 1.7115, "step": 84 }, { "epoch": 0.02400282386163078, "grad_norm": 0.8074257969856262, "learning_rate": 0.00018865993063730004, "loss": 2.7291, "step": 85 }, { "epoch": 0.02428521002470879, "grad_norm": 1.1341824531555176, "learning_rate": 0.00018836155801026753, "loss": 1.8977, "step": 86 }, { "epoch": 0.024567596187786797, "grad_norm": 1.9230746030807495, "learning_rate": 0.0001880595531856738, "loss": 1.5919, "step": 87 }, { "epoch": 0.02484998235086481, "grad_norm": 0.9849350452423096, "learning_rate": 0.00018775392857775432, "loss": 2.8404, "step": 88 }, { "epoch": 0.025132368513942818, "grad_norm": 2.4642324447631836, "learning_rate": 0.00018744469674953956, "loss": 1.2864, "step": 89 }, { "epoch": 0.025414754677020826, "grad_norm": 1.7967941761016846, "learning_rate": 0.00018713187041233896, "loss": 1.8635, "step": 90 }, { "epoch": 0.025414754677020826, "eval_loss": 1.6206018924713135, "eval_runtime": 134.6331, "eval_samples_per_second": 5.541, "eval_steps_per_second": 5.541, "step": 90 }, { "epoch": 0.025697140840098835, "grad_norm": 1.2300053834915161, "learning_rate": 0.00018681546242521786, "loss": 2.5979, "step": 91 }, { "epoch": 0.025979527003176843, "grad_norm": 1.4374971389770508, "learning_rate": 0.00018649548579446936, "loss": 1.9891, "step": 92 }, { "epoch": 0.026261913166254855, "grad_norm": 1.0630384683609009, "learning_rate": 0.0001861719536730795, "loss": 2.2702, "step": 93 }, { "epoch": 0.026544299329332863, "grad_norm": 1.124637246131897, "learning_rate": 0.00018584487936018661, "loss": 2.8282, "step": 94 }, { "epoch": 0.026826685492410872, "grad_norm": 1.3020342588424683, "learning_rate": 0.00018551427630053463, "loss": 1.946, "step": 95 }, { "epoch": 0.02710907165548888, "grad_norm": 1.0363761186599731, "learning_rate": 0.00018518015808392045, "loss": 1.4936, "step": 96 }, { "epoch": 0.02739145781856689, "grad_norm": 1.612716794013977, "learning_rate": 0.00018484253844463526, "loss": 1.2949, "step": 97 }, { "epoch": 0.0276738439816449, "grad_norm": 0.5706181526184082, "learning_rate": 0.00018450143126090015, "loss": 1.3093, "step": 98 }, { "epoch": 0.02795623014472291, "grad_norm": 1.2899292707443237, "learning_rate": 0.00018415685055429533, "loss": 2.5082, "step": 99 }, { "epoch": 0.028238616307800918, "grad_norm": 1.1273449659347534, "learning_rate": 0.00018380881048918405, "loss": 1.8346, "step": 100 }, { "epoch": 0.028238616307800918, "eval_loss": 1.6035598516464233, "eval_runtime": 134.1623, "eval_samples_per_second": 5.56, "eval_steps_per_second": 5.56, "step": 100 }, { "epoch": 0.028521002470878926, "grad_norm": 1.1187326908111572, "learning_rate": 0.00018345732537213027, "loss": 2.3691, "step": 101 }, { "epoch": 0.028803388633956935, "grad_norm": 1.2883691787719727, "learning_rate": 0.00018310240965131041, "loss": 2.9992, "step": 102 }, { "epoch": 0.029085774797034947, "grad_norm": 1.5844606161117554, "learning_rate": 0.00018274407791591966, "loss": 1.4277, "step": 103 }, { "epoch": 0.029368160960112955, "grad_norm": 1.5807751417160034, "learning_rate": 0.00018238234489557215, "loss": 0.6992, "step": 104 }, { "epoch": 0.029650547123190964, "grad_norm": 1.4340300559997559, "learning_rate": 0.0001820172254596956, "loss": 2.2425, "step": 105 }, { "epoch": 0.029932933286268972, "grad_norm": 1.3567688465118408, "learning_rate": 0.00018164873461691986, "loss": 0.7425, "step": 106 }, { "epoch": 0.03021531944934698, "grad_norm": 1.126590609550476, "learning_rate": 0.00018127688751446027, "loss": 1.8765, "step": 107 }, { "epoch": 0.030497705612424993, "grad_norm": 0.9429426193237305, "learning_rate": 0.00018090169943749476, "loss": 2.088, "step": 108 }, { "epoch": 0.030780091775503, "grad_norm": 1.0398406982421875, "learning_rate": 0.0001805231858085356, "loss": 1.8745, "step": 109 }, { "epoch": 0.03106247793858101, "grad_norm": 0.563444197177887, "learning_rate": 0.00018014136218679567, "loss": 2.4523, "step": 110 }, { "epoch": 0.03106247793858101, "eval_loss": 1.6250616312026978, "eval_runtime": 134.2663, "eval_samples_per_second": 5.556, "eval_steps_per_second": 5.556, "step": 110 }, { "epoch": 0.03134486410165902, "grad_norm": 0.664234459400177, "learning_rate": 0.00017975624426754848, "loss": 1.4157, "step": 111 }, { "epoch": 0.03162725026473703, "grad_norm": 1.8084183931350708, "learning_rate": 0.00017936784788148328, "loss": 1.9874, "step": 112 }, { "epoch": 0.03190963642781504, "grad_norm": 1.104672908782959, "learning_rate": 0.00017897618899405423, "loss": 2.7257, "step": 113 }, { "epoch": 0.032192022590893044, "grad_norm": 0.7887753844261169, "learning_rate": 0.00017858128370482426, "loss": 2.7542, "step": 114 }, { "epoch": 0.032474408753971055, "grad_norm": 1.3737729787826538, "learning_rate": 0.000178183148246803, "loss": 2.0767, "step": 115 }, { "epoch": 0.03275679491704907, "grad_norm": 0.7426419258117676, "learning_rate": 0.00017778179898577973, "loss": 1.4909, "step": 116 }, { "epoch": 0.03303918108012707, "grad_norm": 2.0367558002471924, "learning_rate": 0.00017737725241965069, "loss": 1.6603, "step": 117 }, { "epoch": 0.033321567243205084, "grad_norm": 0.9449915289878845, "learning_rate": 0.00017696952517774062, "loss": 2.323, "step": 118 }, { "epoch": 0.03360395340628309, "grad_norm": 2.1352381706237793, "learning_rate": 0.00017655863402011947, "loss": 1.9585, "step": 119 }, { "epoch": 0.0338863395693611, "grad_norm": 1.0839155912399292, "learning_rate": 0.00017614459583691346, "loss": 1.5273, "step": 120 }, { "epoch": 0.0338863395693611, "eval_loss": 1.6136579513549805, "eval_runtime": 134.3244, "eval_samples_per_second": 5.554, "eval_steps_per_second": 5.554, "step": 120 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.174564302225408e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }