|
{ |
|
"best_metric": 0.24994200468063354, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-100", |
|
"epoch": 1.006859382655561, |
|
"eval_steps": 50, |
|
"global_step": 128, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007839294463498285, |
|
"grad_norm": 151.8634796142578, |
|
"learning_rate": 2e-05, |
|
"loss": 21.9839, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007839294463498285, |
|
"eval_loss": 1.2963035106658936, |
|
"eval_runtime": 59.0536, |
|
"eval_samples_per_second": 7.282, |
|
"eval_steps_per_second": 1.829, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01567858892699657, |
|
"grad_norm": 134.68370056152344, |
|
"learning_rate": 4e-05, |
|
"loss": 20.4607, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.023517883390494855, |
|
"grad_norm": 106.47999572753906, |
|
"learning_rate": 6e-05, |
|
"loss": 18.8466, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03135717785399314, |
|
"grad_norm": 68.35124969482422, |
|
"learning_rate": 8e-05, |
|
"loss": 14.5929, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.039196472317491425, |
|
"grad_norm": 50.851531982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 10.9012, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04703576678098971, |
|
"grad_norm": 37.46467971801758, |
|
"learning_rate": 0.00012, |
|
"loss": 7.959, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.054875061244487995, |
|
"grad_norm": 19.023597717285156, |
|
"learning_rate": 0.00014, |
|
"loss": 6.095, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.06271435570798628, |
|
"grad_norm": 11.678672790527344, |
|
"learning_rate": 0.00016, |
|
"loss": 5.9509, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.07055365017148457, |
|
"grad_norm": 9.556014060974121, |
|
"learning_rate": 0.00018, |
|
"loss": 4.9657, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07839294463498285, |
|
"grad_norm": 9.676369667053223, |
|
"learning_rate": 0.0002, |
|
"loss": 5.2787, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08623223909848114, |
|
"grad_norm": 6.937280178070068, |
|
"learning_rate": 0.00019996456111234527, |
|
"loss": 4.9893, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.09407153356197942, |
|
"grad_norm": 6.7998046875, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 5.3298, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.10191082802547771, |
|
"grad_norm": 6.393218040466309, |
|
"learning_rate": 0.000199681200703075, |
|
"loss": 4.416, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.10975012248897599, |
|
"grad_norm": 6.558236122131348, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 4.915, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.11758941695247428, |
|
"grad_norm": 5.369983196258545, |
|
"learning_rate": 0.00019911528310040074, |
|
"loss": 4.0733, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12542871141597256, |
|
"grad_norm": 6.554808616638184, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 5.1536, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.13326800587947085, |
|
"grad_norm": 10.400140762329102, |
|
"learning_rate": 0.00019826841245925212, |
|
"loss": 4.4356, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.14110730034296914, |
|
"grad_norm": 6.700842380523682, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 4.9583, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1489465948064674, |
|
"grad_norm": 7.305096626281738, |
|
"learning_rate": 0.00019714298932647098, |
|
"loss": 4.8479, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1567858892699657, |
|
"grad_norm": 5.645997524261475, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 5.104, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.164625183733464, |
|
"grad_norm": 5.677758693695068, |
|
"learning_rate": 0.00019574220383620055, |
|
"loss": 4.1287, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.17246447819696228, |
|
"grad_norm": 5.749682903289795, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 4.3489, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.18030377266046055, |
|
"grad_norm": 6.3827948570251465, |
|
"learning_rate": 0.00019407002666710336, |
|
"loss": 4.252, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.18814306712395884, |
|
"grad_norm": 5.22667932510376, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 4.2697, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.19598236158745713, |
|
"grad_norm": 5.128467559814453, |
|
"learning_rate": 0.00019213119778704128, |
|
"loss": 4.1894, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.20382165605095542, |
|
"grad_norm": 5.749606132507324, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 4.4464, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2116609505144537, |
|
"grad_norm": 5.480006694793701, |
|
"learning_rate": 0.00018993121301712193, |
|
"loss": 4.5645, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.21950024497795198, |
|
"grad_norm": 5.424717903137207, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 4.6284, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.22733953944145027, |
|
"grad_norm": 44.62896728515625, |
|
"learning_rate": 0.00018747630845319612, |
|
"loss": 4.5385, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.23517883390494856, |
|
"grad_norm": 6.1519622802734375, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 4.7595, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.24301812836844683, |
|
"grad_norm": 6.095805644989014, |
|
"learning_rate": 0.0001847734427889671, |
|
"loss": 4.3621, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2508574228319451, |
|
"grad_norm": 5.428371906280518, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 4.8596, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2586967172954434, |
|
"grad_norm": 4.395357131958008, |
|
"learning_rate": 0.0001818302775908169, |
|
"loss": 4.1379, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2665360117589417, |
|
"grad_norm": 10.262712478637695, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 4.634, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.27437530622244, |
|
"grad_norm": 5.000908851623535, |
|
"learning_rate": 0.00017865515558026428, |
|
"loss": 4.5441, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2822146006859383, |
|
"grad_norm": 6.854843616485596, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 4.1148, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2900538951494365, |
|
"grad_norm": 4.968626976013184, |
|
"learning_rate": 0.00017525707698561385, |
|
"loss": 4.2543, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2978931896129348, |
|
"grad_norm": 4.810691833496094, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 4.521, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3057324840764331, |
|
"grad_norm": 4.835721015930176, |
|
"learning_rate": 0.00017164567402983152, |
|
"loss": 4.0595, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.3135717785399314, |
|
"grad_norm": 4.842018127441406, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 4.0795, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3214110730034297, |
|
"grad_norm": 4.61956787109375, |
|
"learning_rate": 0.00016783118362696163, |
|
"loss": 4.0167, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.329250367466928, |
|
"grad_norm": 4.931549072265625, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 4.1662, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3370896619304263, |
|
"grad_norm": 4.845169544219971, |
|
"learning_rate": 0.00016382441836448202, |
|
"loss": 4.1187, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.34492895639392457, |
|
"grad_norm": 4.64799690246582, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 4.3107, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.3527682508574228, |
|
"grad_norm": 4.621283531188965, |
|
"learning_rate": 0.00015963673585385016, |
|
"loss": 4.2497, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3606075453209211, |
|
"grad_norm": 4.852513313293457, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 4.337, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3684468397844194, |
|
"grad_norm": 4.936686038970947, |
|
"learning_rate": 0.00015528000653611935, |
|
"loss": 4.4534, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3762861342479177, |
|
"grad_norm": 4.867508411407471, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 3.6969, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.384125428711416, |
|
"grad_norm": 4.731996536254883, |
|
"learning_rate": 0.000150766580033884, |
|
"loss": 3.8874, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.39196472317491426, |
|
"grad_norm": 4.869089126586914, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 5.1876, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.39196472317491426, |
|
"eval_loss": 0.2635273039340973, |
|
"eval_runtime": 59.8371, |
|
"eval_samples_per_second": 7.186, |
|
"eval_steps_per_second": 1.805, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.39980401763841256, |
|
"grad_norm": 4.403170585632324, |
|
"learning_rate": 0.0001461092501449326, |
|
"loss": 4.2116, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.40764331210191085, |
|
"grad_norm": 4.539646625518799, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 4.2727, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.41548260656540914, |
|
"grad_norm": 4.992217063903809, |
|
"learning_rate": 0.00014132121857683783, |
|
"loss": 4.2613, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.4233219010289074, |
|
"grad_norm": 4.952049732208252, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 4.3292, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.43116119549240567, |
|
"grad_norm": 4.661520481109619, |
|
"learning_rate": 0.00013641605752528224, |
|
"loss": 4.7227, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.43900048995590396, |
|
"grad_norm": 4.57968807220459, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 4.332, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.44683978441940225, |
|
"grad_norm": 4.427809715270996, |
|
"learning_rate": 0.0001314076712021949, |
|
"loss": 4.0129, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.45467907888290054, |
|
"grad_norm": 4.379388809204102, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 3.625, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.46251837334639884, |
|
"grad_norm": 4.387825012207031, |
|
"learning_rate": 0.00012631025642275212, |
|
"loss": 4.0837, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.47035766780989713, |
|
"grad_norm": 4.690913677215576, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 4.481, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4781969622733954, |
|
"grad_norm": 4.645357608795166, |
|
"learning_rate": 0.00012113826236296244, |
|
"loss": 4.2539, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.48603625673689366, |
|
"grad_norm": 4.611278057098389, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 4.4024, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.49387555120039195, |
|
"grad_norm": 4.584770202636719, |
|
"learning_rate": 0.00011590634960190721, |
|
"loss": 3.708, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5017148456638902, |
|
"grad_norm": 4.215365409851074, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 3.9509, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5095541401273885, |
|
"grad_norm": 4.522916316986084, |
|
"learning_rate": 0.00011062934856473655, |
|
"loss": 4.1005, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5173934345908868, |
|
"grad_norm": 4.4469990730285645, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 3.5071, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5252327290543851, |
|
"grad_norm": 4.294273376464844, |
|
"learning_rate": 0.00010532221748421787, |
|
"loss": 3.8467, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5330720235178834, |
|
"grad_norm": 4.412168025970459, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 4.0123, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5409113179813817, |
|
"grad_norm": 4.205385684967041, |
|
"learning_rate": 0.0001, |
|
"loss": 4.1511, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.54875061244488, |
|
"grad_norm": 4.552511692047119, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 3.9935, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5565899069083783, |
|
"grad_norm": 4.413698196411133, |
|
"learning_rate": 9.467778251578217e-05, |
|
"loss": 3.9498, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5644292013718766, |
|
"grad_norm": 4.40337610244751, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 4.3969, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5722684958353749, |
|
"grad_norm": 4.428593158721924, |
|
"learning_rate": 8.937065143526347e-05, |
|
"loss": 3.8083, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.580107790298873, |
|
"grad_norm": 4.122357368469238, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 4.0771, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5879470847623713, |
|
"grad_norm": 4.366419792175293, |
|
"learning_rate": 8.409365039809281e-05, |
|
"loss": 4.1221, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5957863792258696, |
|
"grad_norm": 4.299376964569092, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 4.1303, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.6036256736893679, |
|
"grad_norm": 4.406579494476318, |
|
"learning_rate": 7.886173763703757e-05, |
|
"loss": 3.9516, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.6114649681528662, |
|
"grad_norm": 4.291518211364746, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 4.2969, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.6193042626163645, |
|
"grad_norm": 4.067143440246582, |
|
"learning_rate": 7.368974357724789e-05, |
|
"loss": 4.2154, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6271435570798628, |
|
"grad_norm": 4.134098052978516, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 3.7089, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6349828515433611, |
|
"grad_norm": 3.921808958053589, |
|
"learning_rate": 6.859232879780515e-05, |
|
"loss": 4.3355, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6428221460068594, |
|
"grad_norm": 4.351531982421875, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 4.2442, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6506614404703577, |
|
"grad_norm": 4.067043304443359, |
|
"learning_rate": 6.358394247471778e-05, |
|
"loss": 4.1614, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.658500734933856, |
|
"grad_norm": 4.547780990600586, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 4.256, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6663400293973543, |
|
"grad_norm": 4.304778575897217, |
|
"learning_rate": 5.867878142316221e-05, |
|
"loss": 4.2114, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6741793238608526, |
|
"grad_norm": 3.7989909648895264, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 3.8491, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6820186183243508, |
|
"grad_norm": 4.27288293838501, |
|
"learning_rate": 5.38907498550674e-05, |
|
"loss": 4.041, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6898579127878491, |
|
"grad_norm": 4.575046062469482, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 4.0219, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.6976972072513474, |
|
"grad_norm": 4.415938854217529, |
|
"learning_rate": 4.9233419966116036e-05, |
|
"loss": 4.0358, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.7055365017148456, |
|
"grad_norm": 4.7177934646606445, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 4.0491, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7133757961783439, |
|
"grad_norm": 3.931637763977051, |
|
"learning_rate": 4.47199934638807e-05, |
|
"loss": 3.836, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.7212150906418422, |
|
"grad_norm": 4.113886833190918, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 3.6984, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7290543851053405, |
|
"grad_norm": 4.356851100921631, |
|
"learning_rate": 4.036326414614985e-05, |
|
"loss": 4.158, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7368936795688388, |
|
"grad_norm": 4.171023368835449, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 3.8695, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7447329740323371, |
|
"grad_norm": 4.107118606567383, |
|
"learning_rate": 3.617558163551802e-05, |
|
"loss": 3.9032, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7525722684958354, |
|
"grad_norm": 4.645840167999268, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 4.6291, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7604115629593337, |
|
"grad_norm": 3.7909698486328125, |
|
"learning_rate": 3.216881637303839e-05, |
|
"loss": 3.776, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.768250857422832, |
|
"grad_norm": 4.037353038787842, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 3.8556, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7760901518863302, |
|
"grad_norm": 4.654599189758301, |
|
"learning_rate": 2.8354325970168484e-05, |
|
"loss": 3.6748, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.7839294463498285, |
|
"grad_norm": 4.140764236450195, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 3.9498, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7839294463498285, |
|
"eval_loss": 0.24994200468063354, |
|
"eval_runtime": 59.8648, |
|
"eval_samples_per_second": 7.183, |
|
"eval_steps_per_second": 1.804, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7917687408133268, |
|
"grad_norm": 4.1459736824035645, |
|
"learning_rate": 2.4742923014386156e-05, |
|
"loss": 4.0253, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7996080352768251, |
|
"grad_norm": 4.166021347045898, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 4.1193, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.8074473297403234, |
|
"grad_norm": 4.5724639892578125, |
|
"learning_rate": 2.1344844419735755e-05, |
|
"loss": 4.1613, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.8152866242038217, |
|
"grad_norm": 3.7771105766296387, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 3.577, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.82312591866732, |
|
"grad_norm": 4.427511215209961, |
|
"learning_rate": 1.8169722409183097e-05, |
|
"loss": 4.0308, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8309652131308183, |
|
"grad_norm": 4.253128528594971, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 4.358, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8388045075943165, |
|
"grad_norm": 3.6124472618103027, |
|
"learning_rate": 1.522655721103291e-05, |
|
"loss": 3.8505, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8466438020578148, |
|
"grad_norm": 4.112203598022461, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 3.9511, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.854483096521313, |
|
"grad_norm": 3.7493703365325928, |
|
"learning_rate": 1.2523691546803873e-05, |
|
"loss": 3.8253, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8623223909848113, |
|
"grad_norm": 4.115346431732178, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 3.8934, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8701616854483096, |
|
"grad_norm": 4.116607666015625, |
|
"learning_rate": 1.0068786982878087e-05, |
|
"loss": 3.7177, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8780009799118079, |
|
"grad_norm": 3.9183499813079834, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 3.8943, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.8858402743753062, |
|
"grad_norm": 4.0384039878845215, |
|
"learning_rate": 7.868802212958703e-06, |
|
"loss": 3.4521, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8936795688388045, |
|
"grad_norm": 4.025205612182617, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 3.8196, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.9015188633023028, |
|
"grad_norm": 4.01348352432251, |
|
"learning_rate": 5.929973332896677e-06, |
|
"loss": 3.9242, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.9093581577658011, |
|
"grad_norm": 4.146557331085205, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 4.0043, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.9171974522292994, |
|
"grad_norm": 4.394860744476318, |
|
"learning_rate": 4.257796163799455e-06, |
|
"loss": 3.8837, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.9250367466927977, |
|
"grad_norm": 4.56512451171875, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 4.3846, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.932876041156296, |
|
"grad_norm": 3.989962577819824, |
|
"learning_rate": 2.857010673529015e-06, |
|
"loss": 3.3769, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.9407153356197943, |
|
"grad_norm": 4.115790843963623, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 3.7523, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9485546300832925, |
|
"grad_norm": 4.354365348815918, |
|
"learning_rate": 1.7315875407479032e-06, |
|
"loss": 3.7809, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9563939245467908, |
|
"grad_norm": 4.128818035125732, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 3.8053, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.964233219010289, |
|
"grad_norm": 4.223034858703613, |
|
"learning_rate": 8.847168995992916e-07, |
|
"loss": 3.5431, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9720725134737873, |
|
"grad_norm": 4.52358341217041, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 4.4289, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9799118079372856, |
|
"grad_norm": 3.970877170562744, |
|
"learning_rate": 3.1879929692498757e-07, |
|
"loss": 3.9163, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9877511024007839, |
|
"grad_norm": 4.14064359664917, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 3.9214, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9955903968642822, |
|
"grad_norm": 3.948699712753296, |
|
"learning_rate": 3.5438887654737355e-08, |
|
"loss": 4.2029, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.006859382655561, |
|
"grad_norm": 3.799189567565918, |
|
"learning_rate": 0.0, |
|
"loss": 3.387, |
|
"step": 128 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 128, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 2, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.203212659984957e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|