diff --git "a/checkpoint-597/trainer_state.json" "b/checkpoint-597/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-597/trainer_state.json" @@ -0,0 +1,4296 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9547738693467336, + "eval_steps": 50, + "global_step": 597, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 0.6762334704399109, + "learning_rate": 1e-05, + "loss": 1.3026, + "step": 1 + }, + { + "epoch": 0.01, + "eval_loss": 1.343465805053711, + "eval_runtime": 2.9584, + "eval_samples_per_second": 33.802, + "eval_steps_per_second": 16.901, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 0.7722721695899963, + "learning_rate": 2e-05, + "loss": 1.5419, + "step": 2 + }, + { + "epoch": 0.02, + "grad_norm": 0.6532348394393921, + "learning_rate": 3e-05, + "loss": 1.4429, + "step": 3 + }, + { + "epoch": 0.02, + "grad_norm": 0.8427589535713196, + "learning_rate": 4e-05, + "loss": 1.4, + "step": 4 + }, + { + "epoch": 0.03, + "grad_norm": 0.9355791807174683, + "learning_rate": 5e-05, + "loss": 1.2583, + "step": 5 + }, + { + "epoch": 0.03, + "grad_norm": 0.6357808113098145, + "learning_rate": 6e-05, + "loss": 1.2655, + "step": 6 + }, + { + "epoch": 0.04, + "grad_norm": 0.963829517364502, + "learning_rate": 7e-05, + "loss": 1.42, + "step": 7 + }, + { + "epoch": 0.04, + "grad_norm": 0.6698102951049805, + "learning_rate": 8e-05, + "loss": 1.3938, + "step": 8 + }, + { + "epoch": 0.05, + "grad_norm": 0.5394894480705261, + "learning_rate": 9e-05, + "loss": 1.2234, + "step": 9 + }, + { + "epoch": 0.05, + "grad_norm": 0.8773290514945984, + "learning_rate": 0.0001, + "loss": 1.4257, + "step": 10 + }, + { + "epoch": 0.06, + "grad_norm": 0.7960235476493835, + "learning_rate": 0.00011000000000000002, + "loss": 1.4272, + "step": 11 + }, + { + "epoch": 0.06, + "grad_norm": 0.7909610867500305, + "learning_rate": 0.00012, + "loss": 1.352, + "step": 12 + }, + { + "epoch": 0.07, + "grad_norm": 0.8417578339576721, + "learning_rate": 0.00013000000000000002, + "loss": 1.2048, + "step": 13 + }, + { + "epoch": 0.07, + "grad_norm": 0.8076886534690857, + "learning_rate": 0.00014, + "loss": 1.4186, + "step": 14 + }, + { + "epoch": 0.08, + "grad_norm": 0.7543106079101562, + "learning_rate": 0.00015000000000000001, + "loss": 1.0873, + "step": 15 + }, + { + "epoch": 0.08, + "grad_norm": 0.9430835247039795, + "learning_rate": 0.00016, + "loss": 1.4061, + "step": 16 + }, + { + "epoch": 0.09, + "grad_norm": 0.7473496794700623, + "learning_rate": 0.00017, + "loss": 1.1407, + "step": 17 + }, + { + "epoch": 0.09, + "grad_norm": 0.8123806715011597, + "learning_rate": 0.00018, + "loss": 1.4394, + "step": 18 + }, + { + "epoch": 0.1, + "grad_norm": 0.7778059244155884, + "learning_rate": 0.00019, + "loss": 1.2752, + "step": 19 + }, + { + "epoch": 0.1, + "grad_norm": 0.7027471661567688, + "learning_rate": 0.0002, + "loss": 1.3107, + "step": 20 + }, + { + "epoch": 0.11, + "grad_norm": 0.8443830609321594, + "learning_rate": 0.00019999918050612108, + "loss": 1.2204, + "step": 21 + }, + { + "epoch": 0.11, + "grad_norm": 0.6853266358375549, + "learning_rate": 0.00019999672203791565, + "loss": 1.2231, + "step": 22 + }, + { + "epoch": 0.12, + "grad_norm": 0.8757483959197998, + "learning_rate": 0.00019999262463567773, + "loss": 1.2069, + "step": 23 + }, + { + "epoch": 0.12, + "grad_norm": 0.7184014320373535, + "learning_rate": 0.00019998688836656323, + "loss": 1.2124, + "step": 24 + }, + { + "epoch": 0.13, + "grad_norm": 0.6530072093009949, + "learning_rate": 0.0001999795133245889, + "loss": 1.1672, + "step": 25 + }, + { + "epoch": 0.13, + "grad_norm": 0.7211533188819885, + "learning_rate": 0.0001999704996306308, + "loss": 1.3207, + "step": 26 + }, + { + "epoch": 0.14, + "grad_norm": 0.7048207521438599, + "learning_rate": 0.00019995984743242226, + "loss": 1.2003, + "step": 27 + }, + { + "epoch": 0.14, + "grad_norm": 0.6881248354911804, + "learning_rate": 0.00019994755690455152, + "loss": 1.117, + "step": 28 + }, + { + "epoch": 0.15, + "grad_norm": 0.7877801656723022, + "learning_rate": 0.00019993362824845875, + "loss": 1.0531, + "step": 29 + }, + { + "epoch": 0.15, + "grad_norm": 0.749905526638031, + "learning_rate": 0.000199918061692433, + "loss": 1.1462, + "step": 30 + }, + { + "epoch": 0.16, + "grad_norm": 0.67184978723526, + "learning_rate": 0.00019990085749160822, + "loss": 1.0939, + "step": 31 + }, + { + "epoch": 0.16, + "grad_norm": 0.6622844934463501, + "learning_rate": 0.0001998820159279591, + "loss": 1.1369, + "step": 32 + }, + { + "epoch": 0.17, + "grad_norm": 0.763306736946106, + "learning_rate": 0.00019986153731029656, + "loss": 1.3525, + "step": 33 + }, + { + "epoch": 0.17, + "grad_norm": 0.6171010136604309, + "learning_rate": 0.0001998394219742627, + "loss": 0.8807, + "step": 34 + }, + { + "epoch": 0.18, + "grad_norm": 0.7575845718383789, + "learning_rate": 0.00019981567028232514, + "loss": 1.206, + "step": 35 + }, + { + "epoch": 0.18, + "grad_norm": 0.5694592595100403, + "learning_rate": 0.00019979028262377118, + "loss": 0.9079, + "step": 36 + }, + { + "epoch": 0.19, + "grad_norm": 0.7056426405906677, + "learning_rate": 0.00019976325941470146, + "loss": 1.1133, + "step": 37 + }, + { + "epoch": 0.19, + "grad_norm": 0.6812122464179993, + "learning_rate": 0.00019973460109802305, + "loss": 1.2707, + "step": 38 + }, + { + "epoch": 0.2, + "grad_norm": 0.5790569186210632, + "learning_rate": 0.0001997043081434423, + "loss": 1.0047, + "step": 39 + }, + { + "epoch": 0.2, + "grad_norm": 0.6529936790466309, + "learning_rate": 0.00019967238104745696, + "loss": 1.0917, + "step": 40 + }, + { + "epoch": 0.21, + "grad_norm": 0.6274911165237427, + "learning_rate": 0.00019963882033334826, + "loss": 1.2586, + "step": 41 + }, + { + "epoch": 0.21, + "grad_norm": 0.6666668653488159, + "learning_rate": 0.00019960362655117218, + "loss": 1.1187, + "step": 42 + }, + { + "epoch": 0.22, + "grad_norm": 0.6239954233169556, + "learning_rate": 0.00019956680027775051, + "loss": 1.0343, + "step": 43 + }, + { + "epoch": 0.22, + "grad_norm": 0.6892250180244446, + "learning_rate": 0.0001995283421166614, + "loss": 1.0254, + "step": 44 + }, + { + "epoch": 0.23, + "grad_norm": 0.7392664551734924, + "learning_rate": 0.00019948825269822934, + "loss": 1.0592, + "step": 45 + }, + { + "epoch": 0.23, + "grad_norm": 0.7541553378105164, + "learning_rate": 0.00019944653267951504, + "loss": 1.2297, + "step": 46 + }, + { + "epoch": 0.24, + "grad_norm": 0.685874342918396, + "learning_rate": 0.00019940318274430449, + "loss": 1.321, + "step": 47 + }, + { + "epoch": 0.24, + "grad_norm": 0.7901135087013245, + "learning_rate": 0.00019935820360309777, + "loss": 1.2583, + "step": 48 + }, + { + "epoch": 0.25, + "grad_norm": 0.6619594693183899, + "learning_rate": 0.00019931159599309757, + "loss": 0.9762, + "step": 49 + }, + { + "epoch": 0.25, + "grad_norm": 0.6059371829032898, + "learning_rate": 0.00019926336067819684, + "loss": 1.1146, + "step": 50 + }, + { + "epoch": 0.25, + "eval_loss": 1.1476221084594727, + "eval_runtime": 2.9589, + "eval_samples_per_second": 33.796, + "eval_steps_per_second": 16.898, + "step": 50 + }, + { + "epoch": 0.26, + "grad_norm": 0.6533025503158569, + "learning_rate": 0.00019921349844896654, + "loss": 1.2439, + "step": 51 + }, + { + "epoch": 0.26, + "grad_norm": 0.5473713278770447, + "learning_rate": 0.00019916201012264254, + "loss": 0.8464, + "step": 52 + }, + { + "epoch": 0.27, + "grad_norm": 0.6035101413726807, + "learning_rate": 0.00019910889654311208, + "loss": 1.1297, + "step": 53 + }, + { + "epoch": 0.27, + "grad_norm": 0.7092946767807007, + "learning_rate": 0.00019905415858090036, + "loss": 1.0365, + "step": 54 + }, + { + "epoch": 0.28, + "grad_norm": 0.602556049823761, + "learning_rate": 0.00019899779713315575, + "loss": 1.1238, + "step": 55 + }, + { + "epoch": 0.28, + "grad_norm": 0.6566863059997559, + "learning_rate": 0.00019893981312363562, + "loss": 1.1097, + "step": 56 + }, + { + "epoch": 0.29, + "grad_norm": 0.6582695245742798, + "learning_rate": 0.00019888020750269067, + "loss": 1.3681, + "step": 57 + }, + { + "epoch": 0.29, + "grad_norm": 0.509901225566864, + "learning_rate": 0.00019881898124724981, + "loss": 0.7163, + "step": 58 + }, + { + "epoch": 0.3, + "grad_norm": 0.6406445503234863, + "learning_rate": 0.0001987561353608038, + "loss": 1.1309, + "step": 59 + }, + { + "epoch": 0.3, + "grad_norm": 0.5770175457000732, + "learning_rate": 0.00019869167087338907, + "loss": 1.1706, + "step": 60 + }, + { + "epoch": 0.31, + "grad_norm": 0.6582055687904358, + "learning_rate": 0.00019862558884157068, + "loss": 1.1121, + "step": 61 + }, + { + "epoch": 0.31, + "grad_norm": 0.7646100521087646, + "learning_rate": 0.00019855789034842504, + "loss": 1.1313, + "step": 62 + }, + { + "epoch": 0.32, + "grad_norm": 0.7127470970153809, + "learning_rate": 0.00019848857650352214, + "loss": 1.258, + "step": 63 + }, + { + "epoch": 0.32, + "grad_norm": 0.5721624493598938, + "learning_rate": 0.00019841764844290744, + "loss": 1.0163, + "step": 64 + }, + { + "epoch": 0.33, + "grad_norm": 0.6494898796081543, + "learning_rate": 0.00019834510732908315, + "loss": 1.1974, + "step": 65 + }, + { + "epoch": 0.33, + "grad_norm": 0.6703062057495117, + "learning_rate": 0.00019827095435098925, + "loss": 1.1376, + "step": 66 + }, + { + "epoch": 0.34, + "grad_norm": 0.696711003780365, + "learning_rate": 0.000198195190723984, + "loss": 0.9931, + "step": 67 + }, + { + "epoch": 0.34, + "grad_norm": 0.6563432216644287, + "learning_rate": 0.0001981178176898239, + "loss": 1.2047, + "step": 68 + }, + { + "epoch": 0.35, + "grad_norm": 0.7269361019134521, + "learning_rate": 0.0001980388365166436, + "loss": 1.6113, + "step": 69 + }, + { + "epoch": 0.35, + "grad_norm": 0.6356198191642761, + "learning_rate": 0.0001979582484989348, + "loss": 1.3778, + "step": 70 + }, + { + "epoch": 0.36, + "grad_norm": 0.6009278893470764, + "learning_rate": 0.00019787605495752528, + "loss": 1.2131, + "step": 71 + }, + { + "epoch": 0.36, + "grad_norm": 0.49109163880348206, + "learning_rate": 0.00019779225723955707, + "loss": 0.8246, + "step": 72 + }, + { + "epoch": 0.37, + "grad_norm": 0.5709823966026306, + "learning_rate": 0.00019770685671846456, + "loss": 1.0578, + "step": 73 + }, + { + "epoch": 0.37, + "grad_norm": 0.5613502860069275, + "learning_rate": 0.0001976198547939518, + "loss": 0.8883, + "step": 74 + }, + { + "epoch": 0.38, + "grad_norm": 0.750335156917572, + "learning_rate": 0.0001975312528919697, + "loss": 1.1836, + "step": 75 + }, + { + "epoch": 0.38, + "grad_norm": 0.6157568693161011, + "learning_rate": 0.00019744105246469263, + "loss": 1.0637, + "step": 76 + }, + { + "epoch": 0.39, + "grad_norm": 0.6417941451072693, + "learning_rate": 0.00019734925499049447, + "loss": 1.2824, + "step": 77 + }, + { + "epoch": 0.39, + "grad_norm": 0.8214441537857056, + "learning_rate": 0.0001972558619739246, + "loss": 1.1942, + "step": 78 + }, + { + "epoch": 0.4, + "grad_norm": 0.6943228244781494, + "learning_rate": 0.00019716087494568317, + "loss": 1.3261, + "step": 79 + }, + { + "epoch": 0.4, + "grad_norm": 0.739622950553894, + "learning_rate": 0.00019706429546259593, + "loss": 1.2639, + "step": 80 + }, + { + "epoch": 0.41, + "grad_norm": 0.6374944448471069, + "learning_rate": 0.00019696612510758876, + "loss": 0.9929, + "step": 81 + }, + { + "epoch": 0.41, + "grad_norm": 0.7595279812812805, + "learning_rate": 0.00019686636548966178, + "loss": 1.2859, + "step": 82 + }, + { + "epoch": 0.42, + "grad_norm": 0.6465960144996643, + "learning_rate": 0.00019676501824386294, + "loss": 1.0333, + "step": 83 + }, + { + "epoch": 0.42, + "grad_norm": 0.7063401341438293, + "learning_rate": 0.00019666208503126112, + "loss": 1.2189, + "step": 84 + }, + { + "epoch": 0.43, + "grad_norm": 0.631826639175415, + "learning_rate": 0.00019655756753891916, + "loss": 1.2583, + "step": 85 + }, + { + "epoch": 0.43, + "grad_norm": 0.6506052017211914, + "learning_rate": 0.0001964514674798659, + "loss": 1.2019, + "step": 86 + }, + { + "epoch": 0.44, + "grad_norm": 0.7421661615371704, + "learning_rate": 0.00019634378659306832, + "loss": 1.2122, + "step": 87 + }, + { + "epoch": 0.44, + "grad_norm": 0.5749310851097107, + "learning_rate": 0.00019623452664340306, + "loss": 1.0522, + "step": 88 + }, + { + "epoch": 0.45, + "grad_norm": 0.6523499488830566, + "learning_rate": 0.0001961236894216272, + "loss": 1.2135, + "step": 89 + }, + { + "epoch": 0.45, + "grad_norm": 0.5970554947853088, + "learning_rate": 0.00019601127674434928, + "loss": 1.0297, + "step": 90 + }, + { + "epoch": 0.46, + "grad_norm": 0.587348461151123, + "learning_rate": 0.00019589729045399934, + "loss": 1.0214, + "step": 91 + }, + { + "epoch": 0.46, + "grad_norm": 0.6518609523773193, + "learning_rate": 0.00019578173241879872, + "loss": 0.9928, + "step": 92 + }, + { + "epoch": 0.47, + "grad_norm": 0.7513082027435303, + "learning_rate": 0.00019566460453272945, + "loss": 1.1204, + "step": 93 + }, + { + "epoch": 0.47, + "grad_norm": 0.8648024201393127, + "learning_rate": 0.0001955459087155033, + "loss": 1.3671, + "step": 94 + }, + { + "epoch": 0.48, + "grad_norm": 0.6207080483436584, + "learning_rate": 0.0001954256469125301, + "loss": 1.1286, + "step": 95 + }, + { + "epoch": 0.48, + "grad_norm": 0.6174007058143616, + "learning_rate": 0.0001953038210948861, + "loss": 1.145, + "step": 96 + }, + { + "epoch": 0.49, + "grad_norm": 0.6160337328910828, + "learning_rate": 0.00019518043325928157, + "loss": 1.2688, + "step": 97 + }, + { + "epoch": 0.49, + "grad_norm": 0.662702202796936, + "learning_rate": 0.00019505548542802804, + "loss": 1.1212, + "step": 98 + }, + { + "epoch": 0.5, + "grad_norm": 0.7133952379226685, + "learning_rate": 0.00019492897964900512, + "loss": 1.0514, + "step": 99 + }, + { + "epoch": 0.5, + "grad_norm": 0.7767614126205444, + "learning_rate": 0.00019480091799562704, + "loss": 1.2387, + "step": 100 + }, + { + "epoch": 0.5, + "eval_loss": 1.1319388151168823, + "eval_runtime": 2.9089, + "eval_samples_per_second": 34.377, + "eval_steps_per_second": 17.189, + "step": 100 + }, + { + "epoch": 0.51, + "grad_norm": 0.6398429870605469, + "learning_rate": 0.00019467130256680868, + "loss": 1.0076, + "step": 101 + }, + { + "epoch": 0.51, + "grad_norm": 0.6510715484619141, + "learning_rate": 0.00019454013548693102, + "loss": 1.2372, + "step": 102 + }, + { + "epoch": 0.52, + "grad_norm": 0.7204650044441223, + "learning_rate": 0.00019440741890580643, + "loss": 1.0999, + "step": 103 + }, + { + "epoch": 0.52, + "grad_norm": 0.6531095504760742, + "learning_rate": 0.00019427315499864344, + "loss": 1.1123, + "step": 104 + }, + { + "epoch": 0.53, + "grad_norm": 0.5871708989143372, + "learning_rate": 0.00019413734596601104, + "loss": 1.2162, + "step": 105 + }, + { + "epoch": 0.53, + "grad_norm": 0.6323477625846863, + "learning_rate": 0.00019399999403380266, + "loss": 1.1369, + "step": 106 + }, + { + "epoch": 0.54, + "grad_norm": 0.6977123618125916, + "learning_rate": 0.00019386110145319963, + "loss": 1.0952, + "step": 107 + }, + { + "epoch": 0.54, + "grad_norm": 0.6638639569282532, + "learning_rate": 0.00019372067050063438, + "loss": 1.1125, + "step": 108 + }, + { + "epoch": 0.55, + "grad_norm": 0.6010698676109314, + "learning_rate": 0.000193578703477753, + "loss": 1.1715, + "step": 109 + }, + { + "epoch": 0.55, + "grad_norm": 0.5837023258209229, + "learning_rate": 0.00019343520271137763, + "loss": 0.8489, + "step": 110 + }, + { + "epoch": 0.56, + "grad_norm": 0.6870157718658447, + "learning_rate": 0.0001932901705534683, + "loss": 1.0953, + "step": 111 + }, + { + "epoch": 0.56, + "grad_norm": 0.5713046789169312, + "learning_rate": 0.00019314360938108425, + "loss": 1.1113, + "step": 112 + }, + { + "epoch": 0.57, + "grad_norm": 0.5966447591781616, + "learning_rate": 0.00019299552159634517, + "loss": 1.2646, + "step": 113 + }, + { + "epoch": 0.57, + "grad_norm": 0.6116918921470642, + "learning_rate": 0.00019284590962639176, + "loss": 1.0807, + "step": 114 + }, + { + "epoch": 0.58, + "grad_norm": 0.5885886549949646, + "learning_rate": 0.0001926947759233459, + "loss": 0.9551, + "step": 115 + }, + { + "epoch": 0.58, + "grad_norm": 0.5844876766204834, + "learning_rate": 0.00019254212296427044, + "loss": 1.0009, + "step": 116 + }, + { + "epoch": 0.59, + "grad_norm": 0.5967299342155457, + "learning_rate": 0.0001923879532511287, + "loss": 0.863, + "step": 117 + }, + { + "epoch": 0.59, + "grad_norm": 0.543732762336731, + "learning_rate": 0.0001922322693107434, + "loss": 0.8331, + "step": 118 + }, + { + "epoch": 0.6, + "grad_norm": 0.6925728917121887, + "learning_rate": 0.0001920750736947553, + "loss": 1.1044, + "step": 119 + }, + { + "epoch": 0.6, + "grad_norm": 0.5720507502555847, + "learning_rate": 0.00019191636897958122, + "loss": 1.2173, + "step": 120 + }, + { + "epoch": 0.61, + "grad_norm": 0.6664772033691406, + "learning_rate": 0.0001917561577663721, + "loss": 0.9849, + "step": 121 + }, + { + "epoch": 0.61, + "grad_norm": 0.6026978492736816, + "learning_rate": 0.00019159444268097012, + "loss": 1.2952, + "step": 122 + }, + { + "epoch": 0.62, + "grad_norm": 0.6648169755935669, + "learning_rate": 0.00019143122637386566, + "loss": 0.8417, + "step": 123 + }, + { + "epoch": 0.62, + "grad_norm": 0.7643215656280518, + "learning_rate": 0.00019126651152015403, + "loss": 1.1142, + "step": 124 + }, + { + "epoch": 0.63, + "grad_norm": 0.6389123797416687, + "learning_rate": 0.00019110030081949156, + "loss": 1.2387, + "step": 125 + }, + { + "epoch": 0.63, + "grad_norm": 0.7826026678085327, + "learning_rate": 0.00019093259699605125, + "loss": 1.1407, + "step": 126 + }, + { + "epoch": 0.64, + "grad_norm": 0.6801394820213318, + "learning_rate": 0.0001907634027984782, + "loss": 0.932, + "step": 127 + }, + { + "epoch": 0.64, + "grad_norm": 0.6450052857398987, + "learning_rate": 0.0001905927209998447, + "loss": 1.3197, + "step": 128 + }, + { + "epoch": 0.65, + "grad_norm": 0.6216878890991211, + "learning_rate": 0.00019042055439760444, + "loss": 1.2593, + "step": 129 + }, + { + "epoch": 0.65, + "grad_norm": 0.6000977158546448, + "learning_rate": 0.000190246905813547, + "loss": 0.9974, + "step": 130 + }, + { + "epoch": 0.66, + "grad_norm": 0.5806196928024292, + "learning_rate": 0.0001900717780937514, + "loss": 1.1792, + "step": 131 + }, + { + "epoch": 0.66, + "grad_norm": 0.6986164450645447, + "learning_rate": 0.00018989517410853955, + "loss": 1.252, + "step": 132 + }, + { + "epoch": 0.67, + "grad_norm": 0.6852320432662964, + "learning_rate": 0.0001897170967524291, + "loss": 1.098, + "step": 133 + }, + { + "epoch": 0.67, + "grad_norm": 0.6186272501945496, + "learning_rate": 0.00018953754894408616, + "loss": 1.1099, + "step": 134 + }, + { + "epoch": 0.68, + "grad_norm": 0.7196840643882751, + "learning_rate": 0.0001893565336262773, + "loss": 1.1809, + "step": 135 + }, + { + "epoch": 0.68, + "grad_norm": 0.6523413062095642, + "learning_rate": 0.00018917405376582145, + "loss": 1.2383, + "step": 136 + }, + { + "epoch": 0.69, + "grad_norm": 0.7788291573524475, + "learning_rate": 0.00018899011235354115, + "loss": 1.023, + "step": 137 + }, + { + "epoch": 0.69, + "grad_norm": 0.5616946220397949, + "learning_rate": 0.00018880471240421365, + "loss": 0.8242, + "step": 138 + }, + { + "epoch": 0.7, + "grad_norm": 0.6670994758605957, + "learning_rate": 0.00018861785695652142, + "loss": 1.2797, + "step": 139 + }, + { + "epoch": 0.7, + "grad_norm": 0.6285648345947266, + "learning_rate": 0.00018842954907300236, + "loss": 1.0959, + "step": 140 + }, + { + "epoch": 0.71, + "grad_norm": 0.6495100855827332, + "learning_rate": 0.00018823979183999964, + "loss": 1.1426, + "step": 141 + }, + { + "epoch": 0.71, + "grad_norm": 0.7513198256492615, + "learning_rate": 0.00018804858836761107, + "loss": 1.2578, + "step": 142 + }, + { + "epoch": 0.72, + "grad_norm": 0.5422288775444031, + "learning_rate": 0.0001878559417896382, + "loss": 0.9833, + "step": 143 + }, + { + "epoch": 0.72, + "grad_norm": 0.605277419090271, + "learning_rate": 0.0001876618552635348, + "loss": 1.2323, + "step": 144 + }, + { + "epoch": 0.73, + "grad_norm": 0.7177323698997498, + "learning_rate": 0.00018746633197035527, + "loss": 1.2153, + "step": 145 + }, + { + "epoch": 0.73, + "grad_norm": 0.5417729020118713, + "learning_rate": 0.00018726937511470246, + "loss": 0.9367, + "step": 146 + }, + { + "epoch": 0.74, + "grad_norm": 0.6895157098770142, + "learning_rate": 0.00018707098792467515, + "loss": 1.3363, + "step": 147 + }, + { + "epoch": 0.74, + "grad_norm": 0.5565975308418274, + "learning_rate": 0.00018687117365181512, + "loss": 1.0385, + "step": 148 + }, + { + "epoch": 0.75, + "grad_norm": 0.7168130278587341, + "learning_rate": 0.00018666993557105377, + "loss": 1.2281, + "step": 149 + }, + { + "epoch": 0.75, + "grad_norm": 0.839598536491394, + "learning_rate": 0.00018646727698065865, + "loss": 1.4159, + "step": 150 + }, + { + "epoch": 0.75, + "eval_loss": 1.119249939918518, + "eval_runtime": 2.9417, + "eval_samples_per_second": 33.994, + "eval_steps_per_second": 16.997, + "step": 150 + }, + { + "epoch": 0.76, + "grad_norm": 0.5981218814849854, + "learning_rate": 0.00018626320120217923, + "loss": 1.0671, + "step": 151 + }, + { + "epoch": 0.76, + "grad_norm": 0.6944805383682251, + "learning_rate": 0.00018605771158039253, + "loss": 1.3229, + "step": 152 + }, + { + "epoch": 0.77, + "grad_norm": 0.6238952875137329, + "learning_rate": 0.00018585081148324832, + "loss": 1.1578, + "step": 153 + }, + { + "epoch": 0.77, + "grad_norm": 0.6363958120346069, + "learning_rate": 0.00018564250430181387, + "loss": 1.3265, + "step": 154 + }, + { + "epoch": 0.78, + "grad_norm": 0.5761409401893616, + "learning_rate": 0.00018543279345021834, + "loss": 1.1844, + "step": 155 + }, + { + "epoch": 0.78, + "grad_norm": 0.810093104839325, + "learning_rate": 0.00018522168236559695, + "loss": 1.2033, + "step": 156 + }, + { + "epoch": 0.79, + "grad_norm": 0.7487497329711914, + "learning_rate": 0.0001850091745080345, + "loss": 1.1043, + "step": 157 + }, + { + "epoch": 0.79, + "grad_norm": 0.6162795424461365, + "learning_rate": 0.00018479527336050878, + "loss": 1.2486, + "step": 158 + }, + { + "epoch": 0.8, + "grad_norm": 0.5720970034599304, + "learning_rate": 0.00018457998242883344, + "loss": 1.0381, + "step": 159 + }, + { + "epoch": 0.8, + "grad_norm": 0.6686292886734009, + "learning_rate": 0.00018436330524160047, + "loss": 1.502, + "step": 160 + }, + { + "epoch": 0.81, + "grad_norm": 0.5931655764579773, + "learning_rate": 0.00018414524535012244, + "loss": 1.0813, + "step": 161 + }, + { + "epoch": 0.81, + "grad_norm": 0.6548634171485901, + "learning_rate": 0.00018392580632837423, + "loss": 1.3147, + "step": 162 + }, + { + "epoch": 0.82, + "grad_norm": 0.559681236743927, + "learning_rate": 0.00018370499177293464, + "loss": 1.1096, + "step": 163 + }, + { + "epoch": 0.82, + "grad_norm": 0.6365666389465332, + "learning_rate": 0.00018348280530292713, + "loss": 1.2062, + "step": 164 + }, + { + "epoch": 0.83, + "grad_norm": 0.616242527961731, + "learning_rate": 0.00018325925055996076, + "loss": 1.1219, + "step": 165 + }, + { + "epoch": 0.83, + "grad_norm": 0.6588903069496155, + "learning_rate": 0.0001830343312080704, + "loss": 1.2697, + "step": 166 + }, + { + "epoch": 0.84, + "grad_norm": 0.5880855321884155, + "learning_rate": 0.00018280805093365672, + "loss": 1.1511, + "step": 167 + }, + { + "epoch": 0.84, + "grad_norm": 0.7549880743026733, + "learning_rate": 0.00018258041344542566, + "loss": 1.2181, + "step": 168 + }, + { + "epoch": 0.85, + "grad_norm": 0.6862443089485168, + "learning_rate": 0.00018235142247432782, + "loss": 1.8496, + "step": 169 + }, + { + "epoch": 0.85, + "grad_norm": 0.5903118848800659, + "learning_rate": 0.0001821210817734972, + "loss": 1.2092, + "step": 170 + }, + { + "epoch": 0.86, + "grad_norm": 0.6936279535293579, + "learning_rate": 0.00018188939511818965, + "loss": 1.0635, + "step": 171 + }, + { + "epoch": 0.86, + "grad_norm": 0.6887457370758057, + "learning_rate": 0.0001816563663057211, + "loss": 0.9387, + "step": 172 + }, + { + "epoch": 0.87, + "grad_norm": 0.6930254101753235, + "learning_rate": 0.00018142199915540527, + "loss": 1.1651, + "step": 173 + }, + { + "epoch": 0.87, + "grad_norm": 0.6529977321624756, + "learning_rate": 0.00018118629750849105, + "loss": 1.2512, + "step": 174 + }, + { + "epoch": 0.88, + "grad_norm": 0.705954372882843, + "learning_rate": 0.0001809492652280996, + "loss": 1.2601, + "step": 175 + }, + { + "epoch": 0.88, + "grad_norm": 0.6263706088066101, + "learning_rate": 0.00018071090619916093, + "loss": 1.0446, + "step": 176 + }, + { + "epoch": 0.89, + "grad_norm": 0.7754440307617188, + "learning_rate": 0.00018047122432835038, + "loss": 1.2517, + "step": 177 + }, + { + "epoch": 0.89, + "grad_norm": 0.6904909610748291, + "learning_rate": 0.0001802302235440245, + "loss": 1.3028, + "step": 178 + }, + { + "epoch": 0.9, + "grad_norm": 0.6373815536499023, + "learning_rate": 0.0001799879077961566, + "loss": 0.7538, + "step": 179 + }, + { + "epoch": 0.9, + "grad_norm": 0.6192349791526794, + "learning_rate": 0.00017974428105627208, + "loss": 1.1583, + "step": 180 + }, + { + "epoch": 0.91, + "grad_norm": 0.6500440239906311, + "learning_rate": 0.00017949934731738347, + "loss": 1.189, + "step": 181 + }, + { + "epoch": 0.91, + "grad_norm": 0.5701293349266052, + "learning_rate": 0.0001792531105939247, + "loss": 0.9937, + "step": 182 + }, + { + "epoch": 0.92, + "grad_norm": 0.6383854150772095, + "learning_rate": 0.0001790055749216856, + "loss": 1.0381, + "step": 183 + }, + { + "epoch": 0.92, + "grad_norm": 0.7212352156639099, + "learning_rate": 0.00017875674435774547, + "loss": 1.2023, + "step": 184 + }, + { + "epoch": 0.93, + "grad_norm": 0.7195665836334229, + "learning_rate": 0.00017850662298040678, + "loss": 1.4138, + "step": 185 + }, + { + "epoch": 0.93, + "grad_norm": 0.6174137592315674, + "learning_rate": 0.0001782552148891283, + "loss": 0.8007, + "step": 186 + }, + { + "epoch": 0.94, + "grad_norm": 0.672179102897644, + "learning_rate": 0.00017800252420445788, + "loss": 1.1403, + "step": 187 + }, + { + "epoch": 0.94, + "grad_norm": 0.6487817168235779, + "learning_rate": 0.00017774855506796496, + "loss": 1.169, + "step": 188 + }, + { + "epoch": 0.95, + "grad_norm": 0.7027740478515625, + "learning_rate": 0.0001774933116421725, + "loss": 1.2268, + "step": 189 + }, + { + "epoch": 0.95, + "grad_norm": 0.7178415060043335, + "learning_rate": 0.00017723679811048904, + "loss": 1.2785, + "step": 190 + }, + { + "epoch": 0.96, + "grad_norm": 0.682354748249054, + "learning_rate": 0.00017697901867713995, + "loss": 1.2195, + "step": 191 + }, + { + "epoch": 0.96, + "grad_norm": 0.7199010252952576, + "learning_rate": 0.00017671997756709863, + "loss": 1.4132, + "step": 192 + }, + { + "epoch": 0.97, + "grad_norm": 0.7743118405342102, + "learning_rate": 0.0001764596790260171, + "loss": 0.9824, + "step": 193 + }, + { + "epoch": 0.97, + "grad_norm": 0.7540227174758911, + "learning_rate": 0.00017619812732015664, + "loss": 1.0527, + "step": 194 + }, + { + "epoch": 0.98, + "grad_norm": 0.6113067269325256, + "learning_rate": 0.00017593532673631766, + "loss": 1.2446, + "step": 195 + }, + { + "epoch": 0.98, + "grad_norm": 0.6951828598976135, + "learning_rate": 0.00017567128158176953, + "loss": 1.3333, + "step": 196 + }, + { + "epoch": 0.99, + "grad_norm": 0.570866584777832, + "learning_rate": 0.00017540599618418007, + "loss": 1.0012, + "step": 197 + }, + { + "epoch": 0.99, + "grad_norm": 0.5432811379432678, + "learning_rate": 0.00017513947489154443, + "loss": 1.1343, + "step": 198 + }, + { + "epoch": 1.0, + "grad_norm": 0.6711558103561401, + "learning_rate": 0.00017487172207211396, + "loss": 1.0945, + "step": 199 + }, + { + "epoch": 1.01, + "grad_norm": 0.675626814365387, + "learning_rate": 0.0001746027421143246, + "loss": 1.2807, + "step": 200 + }, + { + "epoch": 1.01, + "eval_loss": 1.1153115034103394, + "eval_runtime": 3.0007, + "eval_samples_per_second": 33.326, + "eval_steps_per_second": 16.663, + "step": 200 + }, + { + "epoch": 1.01, + "grad_norm": 0.6204088926315308, + "learning_rate": 0.00017433253942672496, + "loss": 1.2167, + "step": 201 + }, + { + "epoch": 1.02, + "grad_norm": 0.6080848574638367, + "learning_rate": 0.000174061118437904, + "loss": 0.979, + "step": 202 + }, + { + "epoch": 1.02, + "grad_norm": 0.8325397372245789, + "learning_rate": 0.00017378848359641847, + "loss": 0.9095, + "step": 203 + }, + { + "epoch": 1.01, + "grad_norm": 0.6108893752098083, + "learning_rate": 0.00017351463937072004, + "loss": 1.0784, + "step": 204 + }, + { + "epoch": 1.01, + "grad_norm": 0.6140009164810181, + "learning_rate": 0.00017323959024908209, + "loss": 1.131, + "step": 205 + }, + { + "epoch": 1.02, + "grad_norm": 0.7503536343574524, + "learning_rate": 0.00017296334073952605, + "loss": 1.0152, + "step": 206 + }, + { + "epoch": 1.02, + "grad_norm": 0.6903036236763, + "learning_rate": 0.0001726858953697475, + "loss": 1.1751, + "step": 207 + }, + { + "epoch": 1.03, + "grad_norm": 0.6842136979103088, + "learning_rate": 0.00017240725868704218, + "loss": 0.9362, + "step": 208 + }, + { + "epoch": 1.03, + "grad_norm": 0.6317443251609802, + "learning_rate": 0.00017212743525823112, + "loss": 1.0199, + "step": 209 + }, + { + "epoch": 1.04, + "grad_norm": 0.6331597566604614, + "learning_rate": 0.0001718464296695861, + "loss": 0.8634, + "step": 210 + }, + { + "epoch": 1.04, + "grad_norm": 0.7953663468360901, + "learning_rate": 0.0001715642465267543, + "loss": 1.0635, + "step": 211 + }, + { + "epoch": 1.05, + "grad_norm": 0.6130046248435974, + "learning_rate": 0.00017128089045468294, + "loss": 0.8426, + "step": 212 + }, + { + "epoch": 1.05, + "grad_norm": 0.5984789729118347, + "learning_rate": 0.00017099636609754329, + "loss": 0.7435, + "step": 213 + }, + { + "epoch": 1.06, + "grad_norm": 0.8032707571983337, + "learning_rate": 0.00017071067811865476, + "loss": 0.9271, + "step": 214 + }, + { + "epoch": 1.06, + "grad_norm": 0.78273606300354, + "learning_rate": 0.00017042383120040834, + "loss": 0.8695, + "step": 215 + }, + { + "epoch": 1.07, + "grad_norm": 0.7779294848442078, + "learning_rate": 0.00017013583004418993, + "loss": 1.085, + "step": 216 + }, + { + "epoch": 1.07, + "grad_norm": 0.7201984524726868, + "learning_rate": 0.00016984667937030318, + "loss": 0.8079, + "step": 217 + }, + { + "epoch": 1.08, + "grad_norm": 0.6246169805526733, + "learning_rate": 0.00016955638391789228, + "loss": 0.7941, + "step": 218 + }, + { + "epoch": 1.08, + "grad_norm": 0.7627923488616943, + "learning_rate": 0.00016926494844486412, + "loss": 0.9281, + "step": 219 + }, + { + "epoch": 1.09, + "grad_norm": 0.6979169249534607, + "learning_rate": 0.00016897237772781044, + "loss": 0.8461, + "step": 220 + }, + { + "epoch": 1.09, + "grad_norm": 0.7872811555862427, + "learning_rate": 0.00016867867656192946, + "loss": 0.9413, + "step": 221 + }, + { + "epoch": 1.1, + "grad_norm": 0.7482172846794128, + "learning_rate": 0.00016838384976094738, + "loss": 0.9107, + "step": 222 + }, + { + "epoch": 1.1, + "grad_norm": 0.8587368130683899, + "learning_rate": 0.00016808790215703935, + "loss": 0.9886, + "step": 223 + }, + { + "epoch": 1.11, + "grad_norm": 0.732606828212738, + "learning_rate": 0.00016779083860075033, + "loss": 0.6831, + "step": 224 + }, + { + "epoch": 1.11, + "grad_norm": 0.9272279143333435, + "learning_rate": 0.0001674926639609157, + "loss": 1.1396, + "step": 225 + }, + { + "epoch": 1.12, + "grad_norm": 0.6473172307014465, + "learning_rate": 0.00016719338312458124, + "loss": 0.8299, + "step": 226 + }, + { + "epoch": 1.12, + "grad_norm": 0.8427954316139221, + "learning_rate": 0.00016689300099692332, + "loss": 0.9203, + "step": 227 + }, + { + "epoch": 1.13, + "grad_norm": 0.8205825090408325, + "learning_rate": 0.00016659152250116812, + "loss": 0.8532, + "step": 228 + }, + { + "epoch": 1.13, + "grad_norm": 0.7522780299186707, + "learning_rate": 0.00016628895257851135, + "loss": 0.7687, + "step": 229 + }, + { + "epoch": 1.14, + "grad_norm": 0.8582683205604553, + "learning_rate": 0.000165985296188037, + "loss": 0.9217, + "step": 230 + }, + { + "epoch": 1.14, + "grad_norm": 0.8408709168434143, + "learning_rate": 0.0001656805583066361, + "loss": 1.0371, + "step": 231 + }, + { + "epoch": 1.15, + "grad_norm": 0.9851942658424377, + "learning_rate": 0.00016537474392892528, + "loss": 1.044, + "step": 232 + }, + { + "epoch": 1.15, + "grad_norm": 0.8463842868804932, + "learning_rate": 0.00016506785806716465, + "loss": 0.9521, + "step": 233 + }, + { + "epoch": 1.16, + "grad_norm": 0.825255811214447, + "learning_rate": 0.00016475990575117605, + "loss": 0.8524, + "step": 234 + }, + { + "epoch": 1.16, + "grad_norm": 1.1519947052001953, + "learning_rate": 0.0001644508920282601, + "loss": 0.9906, + "step": 235 + }, + { + "epoch": 1.17, + "grad_norm": 0.8102966547012329, + "learning_rate": 0.000164140821963114, + "loss": 0.9192, + "step": 236 + }, + { + "epoch": 1.17, + "grad_norm": 1.0159798860549927, + "learning_rate": 0.0001638297006377481, + "loss": 1.0234, + "step": 237 + }, + { + "epoch": 1.18, + "grad_norm": 1.0157923698425293, + "learning_rate": 0.00016351753315140287, + "loss": 0.8921, + "step": 238 + }, + { + "epoch": 1.18, + "grad_norm": 0.8466264009475708, + "learning_rate": 0.00016320432462046516, + "loss": 0.7098, + "step": 239 + }, + { + "epoch": 1.19, + "grad_norm": 0.8298121690750122, + "learning_rate": 0.00016289008017838445, + "loss": 0.8517, + "step": 240 + }, + { + "epoch": 1.19, + "grad_norm": 1.2163349390029907, + "learning_rate": 0.00016257480497558873, + "loss": 1.1172, + "step": 241 + }, + { + "epoch": 1.2, + "grad_norm": 0.9839556217193604, + "learning_rate": 0.0001622585041793999, + "loss": 1.1022, + "step": 242 + }, + { + "epoch": 1.2, + "grad_norm": 0.7986888289451599, + "learning_rate": 0.00016194118297394936, + "loss": 0.7826, + "step": 243 + }, + { + "epoch": 1.21, + "grad_norm": 0.9318971037864685, + "learning_rate": 0.00016162284656009274, + "loss": 0.8899, + "step": 244 + }, + { + "epoch": 1.21, + "grad_norm": 1.0234252214431763, + "learning_rate": 0.00016130350015532496, + "loss": 0.8831, + "step": 245 + }, + { + "epoch": 1.22, + "grad_norm": 0.8264230489730835, + "learning_rate": 0.00016098314899369446, + "loss": 1.1389, + "step": 246 + }, + { + "epoch": 1.22, + "grad_norm": 0.8845193982124329, + "learning_rate": 0.0001606617983257176, + "loss": 1.0822, + "step": 247 + }, + { + "epoch": 1.23, + "grad_norm": 0.9044338464736938, + "learning_rate": 0.00016033945341829248, + "loss": 1.0556, + "step": 248 + }, + { + "epoch": 1.23, + "grad_norm": 0.9660309553146362, + "learning_rate": 0.00016001611955461265, + "loss": 1.0331, + "step": 249 + }, + { + "epoch": 1.24, + "grad_norm": 1.0728851556777954, + "learning_rate": 0.0001596918020340805, + "loss": 1.0465, + "step": 250 + }, + { + "epoch": 1.24, + "eval_loss": 1.1568788290023804, + "eval_runtime": 2.9063, + "eval_samples_per_second": 34.408, + "eval_steps_per_second": 17.204, + "step": 250 + }, + { + "epoch": 1.24, + "grad_norm": 0.9447798728942871, + "learning_rate": 0.00015936650617222063, + "loss": 0.9487, + "step": 251 + }, + { + "epoch": 1.25, + "grad_norm": 1.0429767370224, + "learning_rate": 0.00015904023730059228, + "loss": 1.006, + "step": 252 + }, + { + "epoch": 1.25, + "grad_norm": 0.9871753454208374, + "learning_rate": 0.00015871300076670234, + "loss": 0.9494, + "step": 253 + }, + { + "epoch": 1.26, + "grad_norm": 0.7644299268722534, + "learning_rate": 0.00015838480193391754, + "loss": 0.6077, + "step": 254 + }, + { + "epoch": 1.26, + "grad_norm": 1.1654846668243408, + "learning_rate": 0.0001580556461813766, + "loss": 1.0632, + "step": 255 + }, + { + "epoch": 1.27, + "grad_norm": 1.0508393049240112, + "learning_rate": 0.00015772553890390197, + "loss": 0.8754, + "step": 256 + }, + { + "epoch": 1.27, + "grad_norm": 0.8676743507385254, + "learning_rate": 0.0001573944855119115, + "loss": 1.007, + "step": 257 + }, + { + "epoch": 1.28, + "grad_norm": 1.178464412689209, + "learning_rate": 0.00015706249143132982, + "loss": 1.041, + "step": 258 + }, + { + "epoch": 1.28, + "grad_norm": 1.0226370096206665, + "learning_rate": 0.00015672956210349923, + "loss": 1.1114, + "step": 259 + }, + { + "epoch": 1.29, + "grad_norm": 0.9840787649154663, + "learning_rate": 0.00015639570298509064, + "loss": 0.9043, + "step": 260 + }, + { + "epoch": 1.29, + "grad_norm": 1.0564519166946411, + "learning_rate": 0.0001560609195480142, + "loss": 0.9696, + "step": 261 + }, + { + "epoch": 1.3, + "grad_norm": 0.9174713492393494, + "learning_rate": 0.00015572521727932935, + "loss": 0.9849, + "step": 262 + }, + { + "epoch": 1.3, + "grad_norm": 0.7333153486251831, + "learning_rate": 0.00015538860168115527, + "loss": 0.7286, + "step": 263 + }, + { + "epoch": 1.31, + "grad_norm": 0.9282216429710388, + "learning_rate": 0.00015505107827058036, + "loss": 0.8975, + "step": 264 + }, + { + "epoch": 1.31, + "grad_norm": 1.003192663192749, + "learning_rate": 0.00015471265257957202, + "loss": 1.1836, + "step": 265 + }, + { + "epoch": 1.32, + "grad_norm": 0.8726491928100586, + "learning_rate": 0.00015437333015488587, + "loss": 0.9313, + "step": 266 + }, + { + "epoch": 1.32, + "grad_norm": 0.9721888899803162, + "learning_rate": 0.00015403311655797492, + "loss": 0.8935, + "step": 267 + }, + { + "epoch": 1.33, + "grad_norm": 1.0440247058868408, + "learning_rate": 0.0001536920173648984, + "loss": 0.9741, + "step": 268 + }, + { + "epoch": 1.33, + "grad_norm": 0.9814698100090027, + "learning_rate": 0.00015335003816623028, + "loss": 0.8982, + "step": 269 + }, + { + "epoch": 1.34, + "grad_norm": 0.904926598072052, + "learning_rate": 0.00015300718456696778, + "loss": 0.8579, + "step": 270 + }, + { + "epoch": 1.34, + "grad_norm": 1.0483490228652954, + "learning_rate": 0.00015266346218643947, + "loss": 0.8108, + "step": 271 + }, + { + "epoch": 1.35, + "grad_norm": 0.9156501293182373, + "learning_rate": 0.000152318876658213, + "loss": 0.9442, + "step": 272 + }, + { + "epoch": 1.35, + "grad_norm": 0.9268532395362854, + "learning_rate": 0.00015197343363000307, + "loss": 0.8243, + "step": 273 + }, + { + "epoch": 1.36, + "grad_norm": 0.8675321340560913, + "learning_rate": 0.00015162713876357858, + "loss": 0.7758, + "step": 274 + }, + { + "epoch": 1.36, + "grad_norm": 0.9131675362586975, + "learning_rate": 0.00015127999773467002, + "loss": 0.8845, + "step": 275 + }, + { + "epoch": 1.37, + "grad_norm": 1.0260008573532104, + "learning_rate": 0.00015093201623287631, + "loss": 0.9032, + "step": 276 + }, + { + "epoch": 1.37, + "grad_norm": 1.044528841972351, + "learning_rate": 0.00015058319996157172, + "loss": 1.0489, + "step": 277 + }, + { + "epoch": 1.38, + "grad_norm": 0.9959388375282288, + "learning_rate": 0.0001502335546378122, + "loss": 0.858, + "step": 278 + }, + { + "epoch": 1.38, + "grad_norm": 0.8414021730422974, + "learning_rate": 0.00014988308599224183, + "loss": 0.782, + "step": 279 + }, + { + "epoch": 1.39, + "grad_norm": 0.9205671548843384, + "learning_rate": 0.00014953179976899878, + "loss": 0.8376, + "step": 280 + }, + { + "epoch": 1.39, + "grad_norm": 0.9481040239334106, + "learning_rate": 0.0001491797017256212, + "loss": 0.851, + "step": 281 + }, + { + "epoch": 1.4, + "grad_norm": 0.8266577124595642, + "learning_rate": 0.00014882679763295306, + "loss": 0.7228, + "step": 282 + }, + { + "epoch": 1.4, + "grad_norm": 1.0222742557525635, + "learning_rate": 0.0001484730932750491, + "loss": 0.7955, + "step": 283 + }, + { + "epoch": 1.41, + "grad_norm": 1.0014468431472778, + "learning_rate": 0.00014811859444908052, + "loss": 0.9107, + "step": 284 + }, + { + "epoch": 1.41, + "grad_norm": 0.9157910346984863, + "learning_rate": 0.00014776330696523963, + "loss": 1.0208, + "step": 285 + }, + { + "epoch": 1.42, + "grad_norm": 1.0565227270126343, + "learning_rate": 0.00014740723664664483, + "loss": 0.6496, + "step": 286 + }, + { + "epoch": 1.42, + "grad_norm": 1.0323175191879272, + "learning_rate": 0.00014705038932924503, + "loss": 1.0043, + "step": 287 + }, + { + "epoch": 1.43, + "grad_norm": 1.0063213109970093, + "learning_rate": 0.00014669277086172406, + "loss": 1.1286, + "step": 288 + }, + { + "epoch": 1.43, + "grad_norm": 0.8602890968322754, + "learning_rate": 0.00014633438710540489, + "loss": 0.7254, + "step": 289 + }, + { + "epoch": 1.44, + "grad_norm": 0.9782769083976746, + "learning_rate": 0.00014597524393415335, + "loss": 0.7086, + "step": 290 + }, + { + "epoch": 1.44, + "grad_norm": 0.9836515784263611, + "learning_rate": 0.00014561534723428205, + "loss": 0.8405, + "step": 291 + }, + { + "epoch": 1.45, + "grad_norm": 1.0674123764038086, + "learning_rate": 0.00014525470290445392, + "loss": 1.0317, + "step": 292 + }, + { + "epoch": 1.45, + "grad_norm": 0.9632031917572021, + "learning_rate": 0.00014489331685558525, + "loss": 0.9473, + "step": 293 + }, + { + "epoch": 1.46, + "grad_norm": 1.0105828046798706, + "learning_rate": 0.00014453119501074924, + "loss": 0.8199, + "step": 294 + }, + { + "epoch": 1.46, + "grad_norm": 1.0012938976287842, + "learning_rate": 0.00014416834330507856, + "loss": 0.9099, + "step": 295 + }, + { + "epoch": 1.47, + "grad_norm": 1.0367400646209717, + "learning_rate": 0.00014380476768566824, + "loss": 1.0958, + "step": 296 + }, + { + "epoch": 1.47, + "grad_norm": 0.7329337000846863, + "learning_rate": 0.00014344047411147818, + "loss": 0.6189, + "step": 297 + }, + { + "epoch": 1.48, + "grad_norm": 0.9014643430709839, + "learning_rate": 0.00014307546855323549, + "loss": 0.8168, + "step": 298 + }, + { + "epoch": 1.48, + "grad_norm": 0.7568360567092896, + "learning_rate": 0.00014270975699333654, + "loss": 0.7857, + "step": 299 + }, + { + "epoch": 1.49, + "grad_norm": 0.9573479890823364, + "learning_rate": 0.00014234334542574906, + "loss": 0.9577, + "step": 300 + }, + { + "epoch": 1.49, + "eval_loss": 1.149274230003357, + "eval_runtime": 2.9222, + "eval_samples_per_second": 34.22, + "eval_steps_per_second": 17.11, + "step": 300 + }, + { + "epoch": 1.49, + "grad_norm": 0.8575048446655273, + "learning_rate": 0.00014197623985591373, + "loss": 0.8521, + "step": 301 + }, + { + "epoch": 1.5, + "grad_norm": 0.990980863571167, + "learning_rate": 0.00014160844630064595, + "loss": 1.0642, + "step": 302 + }, + { + "epoch": 1.5, + "grad_norm": 1.1145374774932861, + "learning_rate": 0.00014123997078803707, + "loss": 0.8963, + "step": 303 + }, + { + "epoch": 1.51, + "grad_norm": 0.9661507606506348, + "learning_rate": 0.00014087081935735564, + "loss": 0.9473, + "step": 304 + }, + { + "epoch": 1.51, + "grad_norm": 1.019618272781372, + "learning_rate": 0.00014050099805894837, + "loss": 0.9048, + "step": 305 + }, + { + "epoch": 1.52, + "grad_norm": 0.871661365032196, + "learning_rate": 0.00014013051295414108, + "loss": 0.6644, + "step": 306 + }, + { + "epoch": 1.52, + "grad_norm": 0.9834782481193542, + "learning_rate": 0.00013975937011513932, + "loss": 0.9226, + "step": 307 + }, + { + "epoch": 1.53, + "grad_norm": 0.9938518404960632, + "learning_rate": 0.00013938757562492873, + "loss": 0.9608, + "step": 308 + }, + { + "epoch": 1.53, + "grad_norm": 1.0692541599273682, + "learning_rate": 0.00013901513557717553, + "loss": 0.9646, + "step": 309 + }, + { + "epoch": 1.54, + "grad_norm": 1.039904236793518, + "learning_rate": 0.00013864205607612648, + "loss": 0.7799, + "step": 310 + }, + { + "epoch": 1.54, + "grad_norm": 0.9138852953910828, + "learning_rate": 0.000138268343236509, + "loss": 0.8297, + "step": 311 + }, + { + "epoch": 1.55, + "grad_norm": 1.01775324344635, + "learning_rate": 0.00013789400318343068, + "loss": 0.8992, + "step": 312 + }, + { + "epoch": 1.55, + "grad_norm": 1.0052934885025024, + "learning_rate": 0.0001375190420522792, + "loss": 0.8212, + "step": 313 + }, + { + "epoch": 1.56, + "grad_norm": 1.0567269325256348, + "learning_rate": 0.00013714346598862166, + "loss": 1.0402, + "step": 314 + }, + { + "epoch": 1.56, + "grad_norm": 0.8707680702209473, + "learning_rate": 0.00013676728114810367, + "loss": 0.8864, + "step": 315 + }, + { + "epoch": 1.57, + "grad_norm": 0.959578812122345, + "learning_rate": 0.00013639049369634876, + "loss": 0.7048, + "step": 316 + }, + { + "epoch": 1.57, + "grad_norm": 1.0675721168518066, + "learning_rate": 0.00013601310980885714, + "loss": 1.0025, + "step": 317 + }, + { + "epoch": 1.58, + "grad_norm": 0.8831722736358643, + "learning_rate": 0.0001356351356709045, + "loss": 0.8058, + "step": 318 + }, + { + "epoch": 1.58, + "grad_norm": 1.0400885343551636, + "learning_rate": 0.00013525657747744072, + "loss": 1.0273, + "step": 319 + }, + { + "epoch": 1.59, + "grad_norm": 1.0046364068984985, + "learning_rate": 0.00013487744143298822, + "loss": 0.8441, + "step": 320 + }, + { + "epoch": 1.59, + "grad_norm": 1.029714822769165, + "learning_rate": 0.0001344977337515404, + "loss": 0.7771, + "step": 321 + }, + { + "epoch": 1.6, + "grad_norm": 0.8168841004371643, + "learning_rate": 0.0001341174606564596, + "loss": 0.8024, + "step": 322 + }, + { + "epoch": 1.6, + "grad_norm": 0.9833108186721802, + "learning_rate": 0.00013373662838037537, + "loss": 0.9065, + "step": 323 + }, + { + "epoch": 1.61, + "grad_norm": 0.9366996884346008, + "learning_rate": 0.00013335524316508208, + "loss": 0.9436, + "step": 324 + }, + { + "epoch": 1.61, + "grad_norm": 0.8757138848304749, + "learning_rate": 0.00013297331126143667, + "loss": 0.8399, + "step": 325 + }, + { + "epoch": 1.62, + "grad_norm": 1.1467972993850708, + "learning_rate": 0.00013259083892925633, + "loss": 1.1416, + "step": 326 + }, + { + "epoch": 1.62, + "grad_norm": 0.9916189312934875, + "learning_rate": 0.00013220783243721572, + "loss": 0.9531, + "step": 327 + }, + { + "epoch": 1.63, + "grad_norm": 0.9911974668502808, + "learning_rate": 0.0001318242980627444, + "loss": 0.9476, + "step": 328 + }, + { + "epoch": 1.63, + "grad_norm": 1.0219913721084595, + "learning_rate": 0.0001314402420919238, + "loss": 0.9288, + "step": 329 + }, + { + "epoch": 1.64, + "grad_norm": 1.0889464616775513, + "learning_rate": 0.00013105567081938424, + "loss": 0.8025, + "step": 330 + }, + { + "epoch": 1.64, + "grad_norm": 0.8797928690910339, + "learning_rate": 0.00013067059054820183, + "loss": 0.9002, + "step": 331 + }, + { + "epoch": 1.65, + "grad_norm": 1.0043346881866455, + "learning_rate": 0.00013028500758979506, + "loss": 0.8971, + "step": 332 + }, + { + "epoch": 1.65, + "grad_norm": 0.9221352934837341, + "learning_rate": 0.00012989892826382145, + "loss": 0.8181, + "step": 333 + }, + { + "epoch": 1.66, + "grad_norm": 1.2053778171539307, + "learning_rate": 0.00012951235889807386, + "loss": 0.9374, + "step": 334 + }, + { + "epoch": 1.66, + "grad_norm": 1.2230528593063354, + "learning_rate": 0.00012912530582837682, + "loss": 0.9123, + "step": 335 + }, + { + "epoch": 1.67, + "grad_norm": 0.8403642773628235, + "learning_rate": 0.00012873777539848283, + "loss": 0.9323, + "step": 336 + }, + { + "epoch": 1.67, + "grad_norm": 1.1632657051086426, + "learning_rate": 0.00012834977395996818, + "loss": 1.1916, + "step": 337 + }, + { + "epoch": 1.68, + "grad_norm": 0.9937611222267151, + "learning_rate": 0.0001279613078721289, + "loss": 1.141, + "step": 338 + }, + { + "epoch": 1.68, + "grad_norm": 0.8973978161811829, + "learning_rate": 0.0001275723835018767, + "loss": 0.8399, + "step": 339 + }, + { + "epoch": 1.69, + "grad_norm": 1.0466402769088745, + "learning_rate": 0.0001271830072236343, + "loss": 0.8127, + "step": 340 + }, + { + "epoch": 1.69, + "grad_norm": 0.9691051244735718, + "learning_rate": 0.0001267931854192313, + "loss": 0.9794, + "step": 341 + }, + { + "epoch": 1.7, + "grad_norm": 0.925682544708252, + "learning_rate": 0.0001264029244777993, + "loss": 0.8233, + "step": 342 + }, + { + "epoch": 1.7, + "grad_norm": 0.9783278703689575, + "learning_rate": 0.00012601223079566743, + "loss": 0.9542, + "step": 343 + }, + { + "epoch": 1.71, + "grad_norm": 0.9945007562637329, + "learning_rate": 0.00012562111077625722, + "loss": 1.0757, + "step": 344 + }, + { + "epoch": 1.71, + "grad_norm": 1.1597148180007935, + "learning_rate": 0.000125229570829978, + "loss": 1.1052, + "step": 345 + }, + { + "epoch": 1.72, + "grad_norm": 0.7987023591995239, + "learning_rate": 0.0001248376173741215, + "loss": 0.8602, + "step": 346 + }, + { + "epoch": 1.72, + "grad_norm": 0.8969370126724243, + "learning_rate": 0.00012444525683275688, + "loss": 1.6019, + "step": 347 + }, + { + "epoch": 1.73, + "grad_norm": 1.0622583627700806, + "learning_rate": 0.00012405249563662537, + "loss": 1.0735, + "step": 348 + }, + { + "epoch": 1.73, + "grad_norm": 1.0987950563430786, + "learning_rate": 0.00012365934022303491, + "loss": 0.9973, + "step": 349 + }, + { + "epoch": 1.74, + "grad_norm": 0.9930221438407898, + "learning_rate": 0.00012326579703575462, + "loss": 1.1257, + "step": 350 + }, + { + "epoch": 1.74, + "eval_loss": 1.1461950540542603, + "eval_runtime": 2.9343, + "eval_samples_per_second": 34.08, + "eval_steps_per_second": 17.04, + "step": 350 + }, + { + "epoch": 1.74, + "grad_norm": 1.0799540281295776, + "learning_rate": 0.00012287187252490913, + "loss": 0.8758, + "step": 351 + }, + { + "epoch": 1.75, + "grad_norm": 1.0633143186569214, + "learning_rate": 0.00012247757314687297, + "loss": 1.0396, + "step": 352 + }, + { + "epoch": 1.75, + "grad_norm": 0.9504884481430054, + "learning_rate": 0.00012208290536416463, + "loss": 0.8192, + "step": 353 + }, + { + "epoch": 1.76, + "grad_norm": 0.8587303161621094, + "learning_rate": 0.00012168787564534078, + "loss": 0.748, + "step": 354 + }, + { + "epoch": 1.76, + "grad_norm": 1.3652898073196411, + "learning_rate": 0.0001212924904648902, + "loss": 1.0768, + "step": 355 + }, + { + "epoch": 1.77, + "grad_norm": 1.0679266452789307, + "learning_rate": 0.00012089675630312754, + "loss": 0.9099, + "step": 356 + }, + { + "epoch": 1.77, + "grad_norm": 1.2426522970199585, + "learning_rate": 0.00012050067964608724, + "loss": 0.9869, + "step": 357 + }, + { + "epoch": 1.78, + "grad_norm": 0.9639490246772766, + "learning_rate": 0.00012010426698541728, + "loss": 0.6993, + "step": 358 + }, + { + "epoch": 1.78, + "grad_norm": 1.1884175539016724, + "learning_rate": 0.0001197075248182726, + "loss": 0.9868, + "step": 359 + }, + { + "epoch": 1.79, + "grad_norm": 0.9860052466392517, + "learning_rate": 0.00011931045964720881, + "loss": 0.7148, + "step": 360 + }, + { + "epoch": 1.79, + "grad_norm": 0.8812693357467651, + "learning_rate": 0.00011891307798007536, + "loss": 0.9295, + "step": 361 + }, + { + "epoch": 1.8, + "grad_norm": 1.032242774963379, + "learning_rate": 0.00011851538632990921, + "loss": 1.2292, + "step": 362 + }, + { + "epoch": 1.8, + "grad_norm": 0.9777809381484985, + "learning_rate": 0.00011811739121482777, + "loss": 1.0646, + "step": 363 + }, + { + "epoch": 1.81, + "grad_norm": 1.0464228391647339, + "learning_rate": 0.0001177190991579223, + "loss": 0.9703, + "step": 364 + }, + { + "epoch": 1.81, + "grad_norm": 0.9763212203979492, + "learning_rate": 0.00011732051668715081, + "loss": 0.7753, + "step": 365 + }, + { + "epoch": 1.82, + "grad_norm": 1.114912748336792, + "learning_rate": 0.00011692165033523117, + "loss": 0.9979, + "step": 366 + }, + { + "epoch": 1.82, + "grad_norm": 0.8752657771110535, + "learning_rate": 0.00011652250663953415, + "loss": 0.9964, + "step": 367 + }, + { + "epoch": 1.83, + "grad_norm": 0.9158682823181152, + "learning_rate": 0.00011612309214197599, + "loss": 0.7576, + "step": 368 + }, + { + "epoch": 1.83, + "grad_norm": 0.8457457423210144, + "learning_rate": 0.00011572341338891144, + "loss": 0.9144, + "step": 369 + }, + { + "epoch": 1.84, + "grad_norm": 1.0021049976348877, + "learning_rate": 0.00011532347693102632, + "loss": 0.9226, + "step": 370 + }, + { + "epoch": 1.84, + "grad_norm": 0.9614117741584778, + "learning_rate": 0.00011492328932323022, + "loss": 1.0214, + "step": 371 + }, + { + "epoch": 1.85, + "grad_norm": 0.9289172291755676, + "learning_rate": 0.00011452285712454904, + "loss": 0.8793, + "step": 372 + }, + { + "epoch": 1.85, + "grad_norm": 1.0654929876327515, + "learning_rate": 0.00011412218689801748, + "loss": 1.1519, + "step": 373 + }, + { + "epoch": 1.86, + "grad_norm": 1.0563515424728394, + "learning_rate": 0.00011372128521057155, + "loss": 0.9859, + "step": 374 + }, + { + "epoch": 1.86, + "grad_norm": 1.011228322982788, + "learning_rate": 0.00011332015863294076, + "loss": 0.9138, + "step": 375 + }, + { + "epoch": 1.87, + "grad_norm": 0.942287802696228, + "learning_rate": 0.00011291881373954065, + "loss": 0.8865, + "step": 376 + }, + { + "epoch": 1.87, + "grad_norm": 0.9734610319137573, + "learning_rate": 0.00011251725710836489, + "loss": 0.8578, + "step": 377 + }, + { + "epoch": 1.88, + "grad_norm": 1.184990406036377, + "learning_rate": 0.00011211549532087749, + "loss": 1.0107, + "step": 378 + }, + { + "epoch": 1.88, + "grad_norm": 1.033831238746643, + "learning_rate": 0.00011171353496190498, + "loss": 1.0496, + "step": 379 + }, + { + "epoch": 1.89, + "grad_norm": 1.018054485321045, + "learning_rate": 0.00011131138261952845, + "loss": 0.8782, + "step": 380 + }, + { + "epoch": 1.89, + "grad_norm": 0.9694205522537231, + "learning_rate": 0.00011090904488497549, + "loss": 0.9928, + "step": 381 + }, + { + "epoch": 1.9, + "grad_norm": 0.9095280170440674, + "learning_rate": 0.0001105065283525124, + "loss": 0.9821, + "step": 382 + }, + { + "epoch": 1.9, + "grad_norm": 0.8029172420501709, + "learning_rate": 0.00011010383961933581, + "loss": 0.6811, + "step": 383 + }, + { + "epoch": 1.91, + "grad_norm": 0.9388089776039124, + "learning_rate": 0.00010970098528546481, + "loss": 0.9703, + "step": 384 + }, + { + "epoch": 1.91, + "grad_norm": 0.8639506697654724, + "learning_rate": 0.00010929797195363259, + "loss": 0.8579, + "step": 385 + }, + { + "epoch": 1.92, + "grad_norm": 1.001845121383667, + "learning_rate": 0.0001088948062291783, + "loss": 1.038, + "step": 386 + }, + { + "epoch": 1.92, + "grad_norm": 0.9668776392936707, + "learning_rate": 0.00010849149471993882, + "loss": 0.9457, + "step": 387 + }, + { + "epoch": 1.93, + "grad_norm": 0.8607358932495117, + "learning_rate": 0.00010808804403614043, + "loss": 0.8795, + "step": 388 + }, + { + "epoch": 1.93, + "grad_norm": 1.0189685821533203, + "learning_rate": 0.00010768446079029044, + "loss": 0.9203, + "step": 389 + }, + { + "epoch": 1.94, + "grad_norm": 0.9952776432037354, + "learning_rate": 0.0001072807515970688, + "loss": 1.0368, + "step": 390 + }, + { + "epoch": 1.94, + "grad_norm": 1.057427167892456, + "learning_rate": 0.00010687692307321984, + "loss": 1.0568, + "step": 391 + }, + { + "epoch": 1.95, + "grad_norm": 0.822589099407196, + "learning_rate": 0.00010647298183744359, + "loss": 0.9598, + "step": 392 + }, + { + "epoch": 1.95, + "grad_norm": 0.9903733730316162, + "learning_rate": 0.00010606893451028743, + "loss": 1.0595, + "step": 393 + }, + { + "epoch": 1.96, + "grad_norm": 1.0125857591629028, + "learning_rate": 0.00010566478771403763, + "loss": 0.9646, + "step": 394 + }, + { + "epoch": 1.96, + "grad_norm": 0.899347722530365, + "learning_rate": 0.00010526054807261067, + "loss": 1.0054, + "step": 395 + }, + { + "epoch": 1.97, + "grad_norm": 1.0629827976226807, + "learning_rate": 0.00010485622221144484, + "loss": 0.9319, + "step": 396 + }, + { + "epoch": 1.97, + "grad_norm": 0.9910023212432861, + "learning_rate": 0.00010445181675739144, + "loss": 0.9388, + "step": 397 + }, + { + "epoch": 1.98, + "grad_norm": 0.8644474744796753, + "learning_rate": 0.00010404733833860639, + "loss": 0.8007, + "step": 398 + }, + { + "epoch": 1.98, + "grad_norm": 0.9804188013076782, + "learning_rate": 0.00010364279358444144, + "loss": 0.9715, + "step": 399 + }, + { + "epoch": 1.99, + "grad_norm": 0.9533838033676147, + "learning_rate": 0.00010323818912533561, + "loss": 0.9404, + "step": 400 + }, + { + "epoch": 1.99, + "eval_loss": 1.1519674062728882, + "eval_runtime": 2.9242, + "eval_samples_per_second": 34.198, + "eval_steps_per_second": 17.099, + "step": 400 + }, + { + "epoch": 1.99, + "grad_norm": 0.9107962250709534, + "learning_rate": 0.00010283353159270643, + "loss": 0.9431, + "step": 401 + }, + { + "epoch": 2.0, + "grad_norm": 1.091841220855713, + "learning_rate": 0.00010242882761884131, + "loss": 0.9116, + "step": 402 + }, + { + "epoch": 2.01, + "grad_norm": 1.0149590969085693, + "learning_rate": 0.00010202408383678888, + "loss": 0.9091, + "step": 403 + }, + { + "epoch": 2.01, + "grad_norm": 1.1325241327285767, + "learning_rate": 0.00010161930688025017, + "loss": 1.0873, + "step": 404 + }, + { + "epoch": 2.02, + "grad_norm": 1.0526857376098633, + "learning_rate": 0.0001012145033834699, + "loss": 0.9299, + "step": 405 + }, + { + "epoch": 2.02, + "grad_norm": 1.074191689491272, + "learning_rate": 0.00010080967998112787, + "loss": 1.1391, + "step": 406 + }, + { + "epoch": 2.03, + "grad_norm": 1.1719715595245361, + "learning_rate": 0.00010040484330823006, + "loss": 0.7161, + "step": 407 + }, + { + "epoch": 2.01, + "grad_norm": 0.9399845004081726, + "learning_rate": 0.0001, + "loss": 0.7587, + "step": 408 + }, + { + "epoch": 2.01, + "grad_norm": 0.8841493725776672, + "learning_rate": 9.959515669176996e-05, + "loss": 0.6094, + "step": 409 + }, + { + "epoch": 2.02, + "grad_norm": 0.8972917199134827, + "learning_rate": 9.919032001887215e-05, + "loss": 0.823, + "step": 410 + }, + { + "epoch": 2.02, + "grad_norm": 1.1433557271957397, + "learning_rate": 9.878549661653012e-05, + "loss": 0.8466, + "step": 411 + }, + { + "epoch": 2.03, + "grad_norm": 1.080410361289978, + "learning_rate": 9.838069311974986e-05, + "loss": 0.6281, + "step": 412 + }, + { + "epoch": 2.03, + "grad_norm": 0.9081548452377319, + "learning_rate": 9.797591616321114e-05, + "loss": 0.7148, + "step": 413 + }, + { + "epoch": 2.04, + "grad_norm": 1.0240696668624878, + "learning_rate": 9.757117238115871e-05, + "loss": 0.5947, + "step": 414 + }, + { + "epoch": 2.04, + "grad_norm": 1.038631796836853, + "learning_rate": 9.716646840729361e-05, + "loss": 0.5712, + "step": 415 + }, + { + "epoch": 2.05, + "grad_norm": 1.0166879892349243, + "learning_rate": 9.676181087466444e-05, + "loss": 0.5389, + "step": 416 + }, + { + "epoch": 2.05, + "grad_norm": 1.2785813808441162, + "learning_rate": 9.635720641555858e-05, + "loss": 0.6171, + "step": 417 + }, + { + "epoch": 2.06, + "grad_norm": 1.2383880615234375, + "learning_rate": 9.595266166139366e-05, + "loss": 0.5927, + "step": 418 + }, + { + "epoch": 2.06, + "grad_norm": 1.4389182329177856, + "learning_rate": 9.554818324260859e-05, + "loss": 0.6568, + "step": 419 + }, + { + "epoch": 2.07, + "grad_norm": 1.3202635049819946, + "learning_rate": 9.514377778855521e-05, + "loss": 0.7599, + "step": 420 + }, + { + "epoch": 2.07, + "grad_norm": 1.3828835487365723, + "learning_rate": 9.473945192738933e-05, + "loss": 0.5168, + "step": 421 + }, + { + "epoch": 2.08, + "grad_norm": 0.9763804078102112, + "learning_rate": 9.433521228596237e-05, + "loss": 0.4694, + "step": 422 + }, + { + "epoch": 2.08, + "grad_norm": 1.2587525844573975, + "learning_rate": 9.393106548971256e-05, + "loss": 0.7283, + "step": 423 + }, + { + "epoch": 2.09, + "grad_norm": 0.9997501969337463, + "learning_rate": 9.352701816255643e-05, + "loss": 0.5682, + "step": 424 + }, + { + "epoch": 2.09, + "grad_norm": 1.4405382871627808, + "learning_rate": 9.312307692678017e-05, + "loss": 1.1634, + "step": 425 + }, + { + "epoch": 2.1, + "grad_norm": 1.2368428707122803, + "learning_rate": 9.27192484029312e-05, + "loss": 0.6116, + "step": 426 + }, + { + "epoch": 2.1, + "grad_norm": 0.9565535187721252, + "learning_rate": 9.231553920970958e-05, + "loss": 0.455, + "step": 427 + }, + { + "epoch": 2.11, + "grad_norm": 1.2496212720870972, + "learning_rate": 9.19119559638596e-05, + "loss": 0.706, + "step": 428 + }, + { + "epoch": 2.11, + "grad_norm": 1.08584725856781, + "learning_rate": 9.150850528006119e-05, + "loss": 0.6962, + "step": 429 + }, + { + "epoch": 2.12, + "grad_norm": 1.161963939666748, + "learning_rate": 9.110519377082172e-05, + "loss": 0.5892, + "step": 430 + }, + { + "epoch": 2.12, + "grad_norm": 1.1196906566619873, + "learning_rate": 9.070202804636745e-05, + "loss": 0.7261, + "step": 431 + }, + { + "epoch": 2.13, + "grad_norm": 1.372056007385254, + "learning_rate": 9.02990147145352e-05, + "loss": 0.7534, + "step": 432 + }, + { + "epoch": 2.13, + "grad_norm": 1.2965703010559082, + "learning_rate": 8.98961603806642e-05, + "loss": 0.5013, + "step": 433 + }, + { + "epoch": 2.14, + "grad_norm": 1.2913953065872192, + "learning_rate": 8.949347164748762e-05, + "loss": 0.7149, + "step": 434 + }, + { + "epoch": 2.14, + "grad_norm": 1.0722301006317139, + "learning_rate": 8.909095511502452e-05, + "loss": 0.539, + "step": 435 + }, + { + "epoch": 2.15, + "grad_norm": 1.3729982376098633, + "learning_rate": 8.868861738047158e-05, + "loss": 0.7224, + "step": 436 + }, + { + "epoch": 2.15, + "grad_norm": 1.3238959312438965, + "learning_rate": 8.828646503809504e-05, + "loss": 0.7495, + "step": 437 + }, + { + "epoch": 2.16, + "grad_norm": 1.4315913915634155, + "learning_rate": 8.788450467912255e-05, + "loss": 0.5041, + "step": 438 + }, + { + "epoch": 2.16, + "grad_norm": 1.145209789276123, + "learning_rate": 8.748274289163514e-05, + "loss": 0.6526, + "step": 439 + }, + { + "epoch": 2.17, + "grad_norm": 1.3024333715438843, + "learning_rate": 8.70811862604594e-05, + "loss": 0.7016, + "step": 440 + }, + { + "epoch": 2.17, + "grad_norm": 1.524943232536316, + "learning_rate": 8.667984136705928e-05, + "loss": 0.7276, + "step": 441 + }, + { + "epoch": 2.18, + "grad_norm": 1.4063531160354614, + "learning_rate": 8.627871478942851e-05, + "loss": 0.6246, + "step": 442 + }, + { + "epoch": 2.18, + "grad_norm": 1.2883118391036987, + "learning_rate": 8.587781310198255e-05, + "loss": 0.7363, + "step": 443 + }, + { + "epoch": 2.19, + "grad_norm": 1.2209841012954712, + "learning_rate": 8.5477142875451e-05, + "loss": 0.5598, + "step": 444 + }, + { + "epoch": 2.19, + "grad_norm": 0.9916577339172363, + "learning_rate": 8.507671067676979e-05, + "loss": 0.4323, + "step": 445 + }, + { + "epoch": 2.2, + "grad_norm": 1.306430459022522, + "learning_rate": 8.467652306897369e-05, + "loss": 0.7043, + "step": 446 + }, + { + "epoch": 2.2, + "grad_norm": 1.0825719833374023, + "learning_rate": 8.427658661108857e-05, + "loss": 0.634, + "step": 447 + }, + { + "epoch": 2.21, + "grad_norm": 1.1884212493896484, + "learning_rate": 8.387690785802402e-05, + "loss": 0.7186, + "step": 448 + }, + { + "epoch": 2.21, + "grad_norm": 1.6867362260818481, + "learning_rate": 8.347749336046586e-05, + "loss": 0.6552, + "step": 449 + }, + { + "epoch": 2.22, + "grad_norm": 1.268347978591919, + "learning_rate": 8.307834966476884e-05, + "loss": 0.7161, + "step": 450 + }, + { + "epoch": 2.22, + "eval_loss": 1.2602972984313965, + "eval_runtime": 2.9469, + "eval_samples_per_second": 33.934, + "eval_steps_per_second": 16.967, + "step": 450 + }, + { + "epoch": 2.22, + "grad_norm": 0.998717188835144, + "learning_rate": 8.267948331284923e-05, + "loss": 0.5212, + "step": 451 + }, + { + "epoch": 2.23, + "grad_norm": 1.153731346130371, + "learning_rate": 8.228090084207774e-05, + "loss": 0.6208, + "step": 452 + }, + { + "epoch": 2.23, + "grad_norm": 1.3108233213424683, + "learning_rate": 8.188260878517224e-05, + "loss": 0.6973, + "step": 453 + }, + { + "epoch": 2.24, + "grad_norm": 1.1354055404663086, + "learning_rate": 8.14846136700908e-05, + "loss": 0.4217, + "step": 454 + }, + { + "epoch": 2.24, + "grad_norm": 1.1650023460388184, + "learning_rate": 8.108692201992465e-05, + "loss": 0.4248, + "step": 455 + }, + { + "epoch": 2.25, + "grad_norm": 1.2203434705734253, + "learning_rate": 8.068954035279121e-05, + "loss": 0.6691, + "step": 456 + }, + { + "epoch": 2.25, + "grad_norm": 1.2530115842819214, + "learning_rate": 8.02924751817274e-05, + "loss": 0.6395, + "step": 457 + }, + { + "epoch": 2.26, + "grad_norm": 1.2986165285110474, + "learning_rate": 7.989573301458273e-05, + "loss": 0.8401, + "step": 458 + }, + { + "epoch": 2.26, + "grad_norm": 1.263421654701233, + "learning_rate": 7.949932035391278e-05, + "loss": 0.5025, + "step": 459 + }, + { + "epoch": 2.27, + "grad_norm": 1.4409805536270142, + "learning_rate": 7.91032436968725e-05, + "loss": 0.882, + "step": 460 + }, + { + "epoch": 2.27, + "grad_norm": 1.6700172424316406, + "learning_rate": 7.870750953510984e-05, + "loss": 0.8917, + "step": 461 + }, + { + "epoch": 2.28, + "grad_norm": 1.1698029041290283, + "learning_rate": 7.831212435465924e-05, + "loss": 0.664, + "step": 462 + }, + { + "epoch": 2.28, + "grad_norm": 1.5076547861099243, + "learning_rate": 7.79170946358354e-05, + "loss": 0.8633, + "step": 463 + }, + { + "epoch": 2.29, + "grad_norm": 1.0880191326141357, + "learning_rate": 7.75224268531271e-05, + "loss": 0.5256, + "step": 464 + }, + { + "epoch": 2.29, + "grad_norm": 1.05411696434021, + "learning_rate": 7.71281274750909e-05, + "loss": 0.5846, + "step": 465 + }, + { + "epoch": 2.3, + "grad_norm": 1.4615259170532227, + "learning_rate": 7.673420296424541e-05, + "loss": 0.8497, + "step": 466 + }, + { + "epoch": 2.3, + "grad_norm": 1.4441969394683838, + "learning_rate": 7.634065977696511e-05, + "loss": 0.7554, + "step": 467 + }, + { + "epoch": 2.31, + "grad_norm": 1.2453029155731201, + "learning_rate": 7.594750436337467e-05, + "loss": 0.6189, + "step": 468 + }, + { + "epoch": 2.31, + "grad_norm": 1.3973779678344727, + "learning_rate": 7.555474316724313e-05, + "loss": 0.7063, + "step": 469 + }, + { + "epoch": 2.32, + "grad_norm": 1.1818283796310425, + "learning_rate": 7.516238262587851e-05, + "loss": 0.6328, + "step": 470 + }, + { + "epoch": 2.32, + "grad_norm": 1.1389139890670776, + "learning_rate": 7.4770429170022e-05, + "loss": 0.648, + "step": 471 + }, + { + "epoch": 2.33, + "grad_norm": 1.4820585250854492, + "learning_rate": 7.437888922374276e-05, + "loss": 0.7222, + "step": 472 + }, + { + "epoch": 2.33, + "grad_norm": 1.3325060606002808, + "learning_rate": 7.398776920433258e-05, + "loss": 0.6432, + "step": 473 + }, + { + "epoch": 2.34, + "grad_norm": 1.0379180908203125, + "learning_rate": 7.35970755222007e-05, + "loss": 0.3837, + "step": 474 + }, + { + "epoch": 2.34, + "grad_norm": 1.351940631866455, + "learning_rate": 7.320681458076871e-05, + "loss": 0.6917, + "step": 475 + }, + { + "epoch": 2.35, + "grad_norm": 1.2660441398620605, + "learning_rate": 7.281699277636572e-05, + "loss": 0.6345, + "step": 476 + }, + { + "epoch": 2.35, + "grad_norm": 1.4925462007522583, + "learning_rate": 7.242761649812335e-05, + "loss": 0.4858, + "step": 477 + }, + { + "epoch": 2.36, + "grad_norm": 1.315204381942749, + "learning_rate": 7.20386921278711e-05, + "loss": 0.7035, + "step": 478 + }, + { + "epoch": 2.36, + "grad_norm": 1.4045330286026, + "learning_rate": 7.165022604003186e-05, + "loss": 0.809, + "step": 479 + }, + { + "epoch": 2.37, + "grad_norm": 1.2216744422912598, + "learning_rate": 7.126222460151719e-05, + "loss": 0.584, + "step": 480 + }, + { + "epoch": 2.37, + "grad_norm": 1.272883415222168, + "learning_rate": 7.08746941716232e-05, + "loss": 0.613, + "step": 481 + }, + { + "epoch": 2.38, + "grad_norm": 1.1015321016311646, + "learning_rate": 7.048764110192618e-05, + "loss": 0.4539, + "step": 482 + }, + { + "epoch": 2.38, + "grad_norm": 1.173862099647522, + "learning_rate": 7.010107173617857e-05, + "loss": 0.6842, + "step": 483 + }, + { + "epoch": 2.39, + "grad_norm": 1.3101396560668945, + "learning_rate": 6.971499241020495e-05, + "loss": 0.6377, + "step": 484 + }, + { + "epoch": 2.39, + "grad_norm": 1.1513952016830444, + "learning_rate": 6.932940945179818e-05, + "loss": 0.502, + "step": 485 + }, + { + "epoch": 2.4, + "grad_norm": 1.2137222290039062, + "learning_rate": 6.894432918061579e-05, + "loss": 0.6232, + "step": 486 + }, + { + "epoch": 2.4, + "grad_norm": 1.0849742889404297, + "learning_rate": 6.855975790807623e-05, + "loss": 0.4799, + "step": 487 + }, + { + "epoch": 2.41, + "grad_norm": 1.1737949848175049, + "learning_rate": 6.817570193725564e-05, + "loss": 0.5119, + "step": 488 + }, + { + "epoch": 2.41, + "grad_norm": 1.3420112133026123, + "learning_rate": 6.77921675627843e-05, + "loss": 0.7176, + "step": 489 + }, + { + "epoch": 2.42, + "grad_norm": 1.3262616395950317, + "learning_rate": 6.740916107074372e-05, + "loss": 0.7479, + "step": 490 + }, + { + "epoch": 2.42, + "grad_norm": 1.3177785873413086, + "learning_rate": 6.702668873856338e-05, + "loss": 0.6498, + "step": 491 + }, + { + "epoch": 2.43, + "grad_norm": 1.3273133039474487, + "learning_rate": 6.664475683491796e-05, + "loss": 0.6036, + "step": 492 + }, + { + "epoch": 2.43, + "grad_norm": 1.1320433616638184, + "learning_rate": 6.626337161962461e-05, + "loss": 0.5075, + "step": 493 + }, + { + "epoch": 2.44, + "grad_norm": 1.2999693155288696, + "learning_rate": 6.588253934354039e-05, + "loss": 0.5805, + "step": 494 + }, + { + "epoch": 2.44, + "grad_norm": 1.2638920545578003, + "learning_rate": 6.550226624845961e-05, + "loss": 0.6831, + "step": 495 + }, + { + "epoch": 2.45, + "grad_norm": 1.246358871459961, + "learning_rate": 6.512255856701177e-05, + "loss": 0.5891, + "step": 496 + }, + { + "epoch": 2.45, + "grad_norm": 1.216341257095337, + "learning_rate": 6.474342252255927e-05, + "loss": 0.6533, + "step": 497 + }, + { + "epoch": 2.46, + "grad_norm": 1.4384123086929321, + "learning_rate": 6.43648643290955e-05, + "loss": 0.7545, + "step": 498 + }, + { + "epoch": 2.46, + "grad_norm": 1.2650271654129028, + "learning_rate": 6.398689019114289e-05, + "loss": 0.7225, + "step": 499 + }, + { + "epoch": 2.47, + "grad_norm": 1.2374640703201294, + "learning_rate": 6.360950630365126e-05, + "loss": 0.5897, + "step": 500 + }, + { + "epoch": 2.47, + "eval_loss": 1.2661257982254028, + "eval_runtime": 2.939, + "eval_samples_per_second": 34.025, + "eval_steps_per_second": 17.013, + "step": 500 + }, + { + "epoch": 2.47, + "grad_norm": 1.6078161001205444, + "learning_rate": 6.323271885189635e-05, + "loss": 0.5883, + "step": 501 + }, + { + "epoch": 2.48, + "grad_norm": 1.2864457368850708, + "learning_rate": 6.285653401137837e-05, + "loss": 0.6071, + "step": 502 + }, + { + "epoch": 2.48, + "grad_norm": 1.3440560102462769, + "learning_rate": 6.248095794772079e-05, + "loss": 0.7475, + "step": 503 + }, + { + "epoch": 2.49, + "grad_norm": 1.1603760719299316, + "learning_rate": 6.210599681656933e-05, + "loss": 0.6603, + "step": 504 + }, + { + "epoch": 2.49, + "grad_norm": 1.2979274988174438, + "learning_rate": 6.173165676349103e-05, + "loss": 0.6754, + "step": 505 + }, + { + "epoch": 2.5, + "grad_norm": 1.1852139234542847, + "learning_rate": 6.135794392387353e-05, + "loss": 0.6516, + "step": 506 + }, + { + "epoch": 2.5, + "grad_norm": 1.2540236711502075, + "learning_rate": 6.0984864422824496e-05, + "loss": 0.5239, + "step": 507 + }, + { + "epoch": 2.51, + "grad_norm": 1.5349066257476807, + "learning_rate": 6.061242437507131e-05, + "loss": 0.5365, + "step": 508 + }, + { + "epoch": 2.51, + "grad_norm": 1.258698582649231, + "learning_rate": 6.024062988486072e-05, + "loss": 0.7099, + "step": 509 + }, + { + "epoch": 2.52, + "grad_norm": 1.2852509021759033, + "learning_rate": 5.986948704585895e-05, + "loss": 0.4977, + "step": 510 + }, + { + "epoch": 2.52, + "grad_norm": 1.3446050882339478, + "learning_rate": 5.949900194105167e-05, + "loss": 0.6753, + "step": 511 + }, + { + "epoch": 2.53, + "grad_norm": 1.2775357961654663, + "learning_rate": 5.9129180642644414e-05, + "loss": 0.5833, + "step": 512 + }, + { + "epoch": 2.53, + "grad_norm": 1.2563271522521973, + "learning_rate": 5.8760029211962954e-05, + "loss": 0.5167, + "step": 513 + }, + { + "epoch": 2.54, + "grad_norm": 1.1414707899093628, + "learning_rate": 5.839155369935407e-05, + "loss": 0.6838, + "step": 514 + }, + { + "epoch": 2.54, + "grad_norm": 1.5055315494537354, + "learning_rate": 5.802376014408632e-05, + "loss": 0.672, + "step": 515 + }, + { + "epoch": 2.55, + "grad_norm": 1.3252393007278442, + "learning_rate": 5.765665457425102e-05, + "loss": 0.7256, + "step": 516 + }, + { + "epoch": 2.55, + "grad_norm": 1.1352269649505615, + "learning_rate": 5.729024300666349e-05, + "loss": 0.5319, + "step": 517 + }, + { + "epoch": 2.56, + "grad_norm": 1.3474462032318115, + "learning_rate": 5.6924531446764504e-05, + "loss": 0.5271, + "step": 518 + }, + { + "epoch": 2.56, + "grad_norm": 1.1647965908050537, + "learning_rate": 5.6559525888521815e-05, + "loss": 0.5496, + "step": 519 + }, + { + "epoch": 2.57, + "grad_norm": 1.3941562175750732, + "learning_rate": 5.6195232314331766e-05, + "loss": 0.5551, + "step": 520 + }, + { + "epoch": 2.57, + "grad_norm": 1.4575625658035278, + "learning_rate": 5.5831656694921465e-05, + "loss": 0.6719, + "step": 521 + }, + { + "epoch": 2.58, + "grad_norm": 1.2469514608383179, + "learning_rate": 5.5468804989250786e-05, + "loss": 0.6593, + "step": 522 + }, + { + "epoch": 2.58, + "grad_norm": 1.3567513227462769, + "learning_rate": 5.510668314441474e-05, + "loss": 0.5666, + "step": 523 + }, + { + "epoch": 2.59, + "grad_norm": 1.294553279876709, + "learning_rate": 5.474529709554612e-05, + "loss": 0.6345, + "step": 524 + }, + { + "epoch": 2.59, + "grad_norm": 1.0715196132659912, + "learning_rate": 5.438465276571796e-05, + "loss": 0.401, + "step": 525 + }, + { + "epoch": 2.6, + "grad_norm": 1.3244280815124512, + "learning_rate": 5.402475606584669e-05, + "loss": 0.6757, + "step": 526 + }, + { + "epoch": 2.6, + "grad_norm": 1.2300620079040527, + "learning_rate": 5.366561289459512e-05, + "loss": 0.7366, + "step": 527 + }, + { + "epoch": 2.61, + "grad_norm": 1.0702522993087769, + "learning_rate": 5.3307229138275936e-05, + "loss": 0.4266, + "step": 528 + }, + { + "epoch": 2.61, + "grad_norm": 1.2548829317092896, + "learning_rate": 5.2949610670755e-05, + "loss": 0.8007, + "step": 529 + }, + { + "epoch": 2.62, + "grad_norm": 1.3317065238952637, + "learning_rate": 5.259276335335521e-05, + "loss": 0.6294, + "step": 530 + }, + { + "epoch": 2.62, + "grad_norm": 1.2665762901306152, + "learning_rate": 5.223669303476041e-05, + "loss": 0.514, + "step": 531 + }, + { + "epoch": 2.63, + "grad_norm": 1.2807515859603882, + "learning_rate": 5.1881405550919493e-05, + "loss": 0.6037, + "step": 532 + }, + { + "epoch": 2.63, + "grad_norm": 1.541114330291748, + "learning_rate": 5.152690672495091e-05, + "loss": 0.6603, + "step": 533 + }, + { + "epoch": 2.64, + "grad_norm": 1.299155831336975, + "learning_rate": 5.117320236704697e-05, + "loss": 0.5944, + "step": 534 + }, + { + "epoch": 2.64, + "grad_norm": 1.4731584787368774, + "learning_rate": 5.08202982743788e-05, + "loss": 0.7807, + "step": 535 + }, + { + "epoch": 2.65, + "grad_norm": 1.2540079355239868, + "learning_rate": 5.0468200231001286e-05, + "loss": 0.6043, + "step": 536 + }, + { + "epoch": 2.65, + "grad_norm": 1.2161357402801514, + "learning_rate": 5.01169140077582e-05, + "loss": 0.5168, + "step": 537 + }, + { + "epoch": 2.66, + "grad_norm": 1.5545177459716797, + "learning_rate": 4.976644536218783e-05, + "loss": 0.5285, + "step": 538 + }, + { + "epoch": 2.66, + "grad_norm": 1.5177483558654785, + "learning_rate": 4.9416800038428324e-05, + "loss": 0.6826, + "step": 539 + }, + { + "epoch": 2.67, + "grad_norm": 1.369188666343689, + "learning_rate": 4.9067983767123736e-05, + "loss": 0.7984, + "step": 540 + }, + { + "epoch": 2.67, + "grad_norm": 1.6426352262496948, + "learning_rate": 4.8720002265330015e-05, + "loss": 0.6126, + "step": 541 + }, + { + "epoch": 2.68, + "grad_norm": 1.1411386728286743, + "learning_rate": 4.837286123642141e-05, + "loss": 0.6635, + "step": 542 + }, + { + "epoch": 2.68, + "grad_norm": 1.2911747694015503, + "learning_rate": 4.8026566369996926e-05, + "loss": 0.4522, + "step": 543 + }, + { + "epoch": 2.69, + "grad_norm": 1.55097496509552, + "learning_rate": 4.768112334178699e-05, + "loss": 0.8282, + "step": 544 + }, + { + "epoch": 2.69, + "grad_norm": 1.736786961555481, + "learning_rate": 4.733653781356055e-05, + "loss": 0.6144, + "step": 545 + }, + { + "epoch": 2.7, + "grad_norm": 1.2241405248641968, + "learning_rate": 4.699281543303222e-05, + "loss": 0.5656, + "step": 546 + }, + { + "epoch": 2.7, + "grad_norm": 1.30910325050354, + "learning_rate": 4.6649961833769715e-05, + "loss": 0.5732, + "step": 547 + }, + { + "epoch": 2.71, + "grad_norm": 1.780985713005066, + "learning_rate": 4.630798263510162e-05, + "loss": 0.5689, + "step": 548 + }, + { + "epoch": 2.71, + "grad_norm": 1.4643489122390747, + "learning_rate": 4.596688344202509e-05, + "loss": 0.633, + "step": 549 + }, + { + "epoch": 2.72, + "grad_norm": 1.273721694946289, + "learning_rate": 4.562666984511416e-05, + "loss": 0.5271, + "step": 550 + }, + { + "epoch": 2.72, + "eval_loss": 1.281378149986267, + "eval_runtime": 3.0044, + "eval_samples_per_second": 33.285, + "eval_steps_per_second": 16.642, + "step": 550 + }, + { + "epoch": 2.72, + "grad_norm": 1.3252663612365723, + "learning_rate": 4.528734742042803e-05, + "loss": 0.4885, + "step": 551 + }, + { + "epoch": 2.73, + "grad_norm": 1.159148097038269, + "learning_rate": 4.494892172941965e-05, + "loss": 0.4881, + "step": 552 + }, + { + "epoch": 2.73, + "grad_norm": 1.4068233966827393, + "learning_rate": 4.461139831884474e-05, + "loss": 0.6787, + "step": 553 + }, + { + "epoch": 2.74, + "grad_norm": 1.279906153678894, + "learning_rate": 4.427478272067066e-05, + "loss": 0.5426, + "step": 554 + }, + { + "epoch": 2.74, + "grad_norm": 1.1998430490493774, + "learning_rate": 4.393908045198585e-05, + "loss": 0.5433, + "step": 555 + }, + { + "epoch": 2.75, + "grad_norm": 1.3037670850753784, + "learning_rate": 4.360429701490934e-05, + "loss": 0.5773, + "step": 556 + }, + { + "epoch": 2.75, + "grad_norm": 1.260678768157959, + "learning_rate": 4.327043789650078e-05, + "loss": 0.4421, + "step": 557 + }, + { + "epoch": 2.76, + "grad_norm": 0.9158841967582703, + "learning_rate": 4.2937508568670194e-05, + "loss": 0.4472, + "step": 558 + }, + { + "epoch": 2.76, + "grad_norm": 1.4653347730636597, + "learning_rate": 4.2605514488088515e-05, + "loss": 0.7012, + "step": 559 + }, + { + "epoch": 2.77, + "grad_norm": 1.3992079496383667, + "learning_rate": 4.227446109609809e-05, + "loss": 0.5479, + "step": 560 + }, + { + "epoch": 2.77, + "grad_norm": 1.353127360343933, + "learning_rate": 4.1944353818623424e-05, + "loss": 0.7026, + "step": 561 + }, + { + "epoch": 2.78, + "grad_norm": 1.393446683883667, + "learning_rate": 4.161519806608247e-05, + "loss": 0.5551, + "step": 562 + }, + { + "epoch": 2.78, + "grad_norm": 1.1399997472763062, + "learning_rate": 4.12869992332977e-05, + "loss": 0.3954, + "step": 563 + }, + { + "epoch": 2.79, + "grad_norm": 1.5926597118377686, + "learning_rate": 4.0959762699407766e-05, + "loss": 0.6895, + "step": 564 + }, + { + "epoch": 2.79, + "grad_norm": 1.2445831298828125, + "learning_rate": 4.0633493827779425e-05, + "loss": 0.5555, + "step": 565 + }, + { + "epoch": 2.8, + "grad_norm": 1.3105766773223877, + "learning_rate": 4.030819796591949e-05, + "loss": 0.7623, + "step": 566 + }, + { + "epoch": 2.8, + "grad_norm": 1.4394389390945435, + "learning_rate": 3.9983880445387366e-05, + "loss": 0.6299, + "step": 567 + }, + { + "epoch": 2.81, + "grad_norm": 1.4189199209213257, + "learning_rate": 3.966054658170754e-05, + "loss": 0.601, + "step": 568 + }, + { + "epoch": 2.81, + "grad_norm": 1.0969223976135254, + "learning_rate": 3.9338201674282406e-05, + "loss": 0.3905, + "step": 569 + }, + { + "epoch": 2.82, + "grad_norm": 1.2229801416397095, + "learning_rate": 3.9016851006305545e-05, + "loss": 0.5059, + "step": 570 + }, + { + "epoch": 2.82, + "grad_norm": 1.474869966506958, + "learning_rate": 3.869649984467504e-05, + "loss": 0.6408, + "step": 571 + }, + { + "epoch": 2.83, + "grad_norm": 1.5417041778564453, + "learning_rate": 3.8377153439907266e-05, + "loss": 0.6754, + "step": 572 + }, + { + "epoch": 2.83, + "grad_norm": 1.4375914335250854, + "learning_rate": 3.8058817026050677e-05, + "loss": 0.681, + "step": 573 + }, + { + "epoch": 2.84, + "grad_norm": 1.1246694326400757, + "learning_rate": 3.774149582060012e-05, + "loss": 0.5772, + "step": 574 + }, + { + "epoch": 2.84, + "grad_norm": 1.3641024827957153, + "learning_rate": 3.742519502441132e-05, + "loss": 0.7361, + "step": 575 + }, + { + "epoch": 2.85, + "grad_norm": 1.533368706703186, + "learning_rate": 3.710991982161555e-05, + "loss": 0.5878, + "step": 576 + }, + { + "epoch": 2.85, + "grad_norm": 1.2794849872589111, + "learning_rate": 3.679567537953485e-05, + "loss": 0.5081, + "step": 577 + }, + { + "epoch": 2.86, + "grad_norm": 1.4266434907913208, + "learning_rate": 3.648246684859716e-05, + "loss": 0.7266, + "step": 578 + }, + { + "epoch": 2.86, + "grad_norm": 1.4641722440719604, + "learning_rate": 3.617029936225193e-05, + "loss": 0.6243, + "step": 579 + }, + { + "epoch": 2.87, + "grad_norm": 1.2470731735229492, + "learning_rate": 3.585917803688603e-05, + "loss": 0.5591, + "step": 580 + }, + { + "epoch": 2.87, + "grad_norm": 1.4247914552688599, + "learning_rate": 3.55491079717399e-05, + "loss": 0.6843, + "step": 581 + }, + { + "epoch": 2.88, + "grad_norm": 1.4032796621322632, + "learning_rate": 3.5240094248824e-05, + "loss": 0.6464, + "step": 582 + }, + { + "epoch": 2.88, + "grad_norm": 2.002753257751465, + "learning_rate": 3.493214193283536e-05, + "loss": 0.5833, + "step": 583 + }, + { + "epoch": 2.89, + "grad_norm": 1.3017632961273193, + "learning_rate": 3.4625256071074773e-05, + "loss": 0.7035, + "step": 584 + }, + { + "epoch": 2.89, + "grad_norm": 1.2375082969665527, + "learning_rate": 3.4319441693363906e-05, + "loss": 0.6327, + "step": 585 + }, + { + "epoch": 2.9, + "grad_norm": 1.4240336418151855, + "learning_rate": 3.4014703811963025e-05, + "loss": 0.7169, + "step": 586 + }, + { + "epoch": 2.9, + "grad_norm": 1.3496601581573486, + "learning_rate": 3.3711047421488675e-05, + "loss": 0.5654, + "step": 587 + }, + { + "epoch": 2.91, + "grad_norm": 1.3548609018325806, + "learning_rate": 3.340847749883191e-05, + "loss": 0.5809, + "step": 588 + }, + { + "epoch": 2.91, + "grad_norm": 1.3450309038162231, + "learning_rate": 3.3106999003076746e-05, + "loss": 0.7006, + "step": 589 + }, + { + "epoch": 2.92, + "grad_norm": 1.364551305770874, + "learning_rate": 3.280661687541876e-05, + "loss": 0.5856, + "step": 590 + }, + { + "epoch": 2.92, + "grad_norm": 1.390633225440979, + "learning_rate": 3.2507336039084314e-05, + "loss": 0.58, + "step": 591 + }, + { + "epoch": 2.93, + "grad_norm": 1.3074802160263062, + "learning_rate": 3.2209161399249674e-05, + "loss": 0.5153, + "step": 592 + }, + { + "epoch": 2.93, + "grad_norm": 1.4046270847320557, + "learning_rate": 3.191209784296068e-05, + "loss": 0.7275, + "step": 593 + }, + { + "epoch": 2.94, + "grad_norm": 1.3784432411193848, + "learning_rate": 3.161615023905265e-05, + "loss": 0.7039, + "step": 594 + }, + { + "epoch": 2.94, + "grad_norm": 1.3270719051361084, + "learning_rate": 3.132132343807056e-05, + "loss": 0.7857, + "step": 595 + }, + { + "epoch": 2.95, + "grad_norm": 1.4376825094223022, + "learning_rate": 3.102762227218957e-05, + "loss": 0.7024, + "step": 596 + }, + { + "epoch": 2.95, + "grad_norm": 1.263853907585144, + "learning_rate": 3.073505155513591e-05, + "loss": 0.6328, + "step": 597 + } + ], + "logging_steps": 1, + "max_steps": 796, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 199, + "total_flos": 2.301389698719744e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}