{ "best_metric": 1.2935380935668945, "best_model_checkpoint": "/export/data/salmasia/tradutor/checkpoints/hf_llama3_lora/checkpoint-7000", "epoch": 3.1645937801770776, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00037230515060906794, "grad_norm": 1.4287878274917603, "learning_rate": 2e-08, "loss": 1.8524, "step": 1 }, { "epoch": 0.0007446103012181359, "grad_norm": 1.438577651977539, "learning_rate": 4e-08, "loss": 1.8624, "step": 2 }, { "epoch": 0.001116915451827204, "grad_norm": 1.4175422191619873, "learning_rate": 6.000000000000001e-08, "loss": 1.8535, "step": 3 }, { "epoch": 0.0014892206024362717, "grad_norm": 1.3822880983352661, "learning_rate": 8e-08, "loss": 1.8581, "step": 4 }, { "epoch": 0.0018615257530453398, "grad_norm": 1.4041341543197632, "learning_rate": 1.0000000000000001e-07, "loss": 1.8706, "step": 5 }, { "epoch": 0.002233830903654408, "grad_norm": 1.352337121963501, "learning_rate": 1.2000000000000002e-07, "loss": 1.8438, "step": 6 }, { "epoch": 0.002606136054263476, "grad_norm": 1.3975361585617065, "learning_rate": 1.4e-07, "loss": 1.8411, "step": 7 }, { "epoch": 0.0029784412048725435, "grad_norm": 1.3485698699951172, "learning_rate": 1.6e-07, "loss": 1.8473, "step": 8 }, { "epoch": 0.0033507463554816115, "grad_norm": 1.4282227754592896, "learning_rate": 1.8e-07, "loss": 1.8677, "step": 9 }, { "epoch": 0.0037230515060906796, "grad_norm": 1.3968946933746338, "learning_rate": 2.0000000000000002e-07, "loss": 1.876, "step": 10 }, { "epoch": 0.004095356656699747, "grad_norm": 1.4412802457809448, "learning_rate": 2.2e-07, "loss": 1.8736, "step": 11 }, { "epoch": 0.004467661807308816, "grad_norm": 1.3732651472091675, "learning_rate": 2.4000000000000003e-07, "loss": 1.8566, "step": 12 }, { "epoch": 0.004839966957917883, "grad_norm": 1.444575309753418, "learning_rate": 2.6e-07, "loss": 1.8654, "step": 13 }, { "epoch": 0.005212272108526952, "grad_norm": 1.4188005924224854, "learning_rate": 2.8e-07, "loss": 1.8672, "step": 14 }, { "epoch": 0.005584577259136019, "grad_norm": 1.3773771524429321, "learning_rate": 3.0000000000000004e-07, "loss": 1.8475, "step": 15 }, { "epoch": 0.005956882409745087, "grad_norm": 1.4325660467147827, "learning_rate": 3.2e-07, "loss": 1.8845, "step": 16 }, { "epoch": 0.0063291875603541554, "grad_norm": 1.366593599319458, "learning_rate": 3.4000000000000003e-07, "loss": 1.8627, "step": 17 }, { "epoch": 0.006701492710963223, "grad_norm": 1.3906618356704712, "learning_rate": 3.6e-07, "loss": 1.8424, "step": 18 }, { "epoch": 0.0070737978615722915, "grad_norm": 1.3560819625854492, "learning_rate": 3.8e-07, "loss": 1.8623, "step": 19 }, { "epoch": 0.007446103012181359, "grad_norm": 1.3274669647216797, "learning_rate": 4.0000000000000003e-07, "loss": 1.8461, "step": 20 }, { "epoch": 0.007818408162790428, "grad_norm": 1.3316004276275635, "learning_rate": 4.2000000000000006e-07, "loss": 1.8471, "step": 21 }, { "epoch": 0.008190713313399494, "grad_norm": 1.3909695148468018, "learning_rate": 4.4e-07, "loss": 1.8484, "step": 22 }, { "epoch": 0.008563018464008563, "grad_norm": 1.3994210958480835, "learning_rate": 4.6000000000000004e-07, "loss": 1.8488, "step": 23 }, { "epoch": 0.008935323614617631, "grad_norm": 1.4674007892608643, "learning_rate": 4.800000000000001e-07, "loss": 1.8548, "step": 24 }, { "epoch": 0.0093076287652267, "grad_norm": 1.4008136987686157, "learning_rate": 5.000000000000001e-07, "loss": 1.8322, "step": 25 }, { "epoch": 0.009679933915835767, "grad_norm": 1.386723279953003, "learning_rate": 5.2e-07, "loss": 1.8451, "step": 26 }, { "epoch": 0.010052239066444835, "grad_norm": 1.398271918296814, "learning_rate": 5.4e-07, "loss": 1.8389, "step": 27 }, { "epoch": 0.010424544217053903, "grad_norm": 1.4241478443145752, "learning_rate": 5.6e-07, "loss": 1.8636, "step": 28 }, { "epoch": 0.01079684936766297, "grad_norm": 1.4040849208831787, "learning_rate": 5.800000000000001e-07, "loss": 1.8427, "step": 29 }, { "epoch": 0.011169154518272039, "grad_norm": 1.3591426610946655, "learning_rate": 6.000000000000001e-07, "loss": 1.8354, "step": 30 }, { "epoch": 0.011541459668881107, "grad_norm": 1.4355502128601074, "learning_rate": 6.200000000000001e-07, "loss": 1.8536, "step": 31 }, { "epoch": 0.011913764819490174, "grad_norm": 1.4141122102737427, "learning_rate": 6.4e-07, "loss": 1.8295, "step": 32 }, { "epoch": 0.012286069970099242, "grad_norm": 1.4399964809417725, "learning_rate": 6.6e-07, "loss": 1.8555, "step": 33 }, { "epoch": 0.012658375120708311, "grad_norm": 1.3795485496520996, "learning_rate": 6.800000000000001e-07, "loss": 1.8304, "step": 34 }, { "epoch": 0.01303068027131738, "grad_norm": 1.4039381742477417, "learning_rate": 7.000000000000001e-07, "loss": 1.8467, "step": 35 }, { "epoch": 0.013402985421926446, "grad_norm": 1.4005100727081299, "learning_rate": 7.2e-07, "loss": 1.8253, "step": 36 }, { "epoch": 0.013775290572535515, "grad_norm": 1.415635347366333, "learning_rate": 7.4e-07, "loss": 1.8458, "step": 37 }, { "epoch": 0.014147595723144583, "grad_norm": 1.39545738697052, "learning_rate": 7.6e-07, "loss": 1.8264, "step": 38 }, { "epoch": 0.01451990087375365, "grad_norm": 1.3957033157348633, "learning_rate": 7.8e-07, "loss": 1.8404, "step": 39 }, { "epoch": 0.014892206024362718, "grad_norm": 1.3806068897247314, "learning_rate": 8.000000000000001e-07, "loss": 1.828, "step": 40 }, { "epoch": 0.015264511174971787, "grad_norm": 1.387890338897705, "learning_rate": 8.200000000000001e-07, "loss": 1.8302, "step": 41 }, { "epoch": 0.015636816325580855, "grad_norm": 1.3517177104949951, "learning_rate": 8.400000000000001e-07, "loss": 1.8021, "step": 42 }, { "epoch": 0.016009121476189922, "grad_norm": 1.3489928245544434, "learning_rate": 8.6e-07, "loss": 1.8084, "step": 43 }, { "epoch": 0.01638142662679899, "grad_norm": 1.352052092552185, "learning_rate": 8.8e-07, "loss": 1.8143, "step": 44 }, { "epoch": 0.01675373177740806, "grad_norm": 1.324896216392517, "learning_rate": 9.000000000000001e-07, "loss": 1.7936, "step": 45 }, { "epoch": 0.017126036928017126, "grad_norm": 1.2951364517211914, "learning_rate": 9.200000000000001e-07, "loss": 1.8092, "step": 46 }, { "epoch": 0.017498342078626192, "grad_norm": 1.3121585845947266, "learning_rate": 9.400000000000001e-07, "loss": 1.8096, "step": 47 }, { "epoch": 0.017870647229235263, "grad_norm": 1.359284520149231, "learning_rate": 9.600000000000001e-07, "loss": 1.822, "step": 48 }, { "epoch": 0.01824295237984433, "grad_norm": 1.3160185813903809, "learning_rate": 9.800000000000001e-07, "loss": 1.8113, "step": 49 }, { "epoch": 0.0186152575304534, "grad_norm": 1.2798627614974976, "learning_rate": 1.0000000000000002e-06, "loss": 1.7942, "step": 50 }, { "epoch": 0.018987562681062466, "grad_norm": 1.3182300329208374, "learning_rate": 1.02e-06, "loss": 1.8119, "step": 51 }, { "epoch": 0.019359867831671533, "grad_norm": 1.3083176612854004, "learning_rate": 1.04e-06, "loss": 1.8189, "step": 52 }, { "epoch": 0.019732172982280603, "grad_norm": 1.3400063514709473, "learning_rate": 1.06e-06, "loss": 1.837, "step": 53 }, { "epoch": 0.02010447813288967, "grad_norm": 1.2751755714416504, "learning_rate": 1.08e-06, "loss": 1.7936, "step": 54 }, { "epoch": 0.020476783283498737, "grad_norm": 1.223643183708191, "learning_rate": 1.1e-06, "loss": 1.7851, "step": 55 }, { "epoch": 0.020849088434107807, "grad_norm": 1.2413800954818726, "learning_rate": 1.12e-06, "loss": 1.7893, "step": 56 }, { "epoch": 0.021221393584716874, "grad_norm": 1.196629524230957, "learning_rate": 1.14e-06, "loss": 1.8031, "step": 57 }, { "epoch": 0.02159369873532594, "grad_norm": 1.158254861831665, "learning_rate": 1.1600000000000001e-06, "loss": 1.79, "step": 58 }, { "epoch": 0.02196600388593501, "grad_norm": 1.0098892450332642, "learning_rate": 1.1800000000000001e-06, "loss": 1.7772, "step": 59 }, { "epoch": 0.022338309036544077, "grad_norm": 0.949759304523468, "learning_rate": 1.2000000000000002e-06, "loss": 1.7582, "step": 60 }, { "epoch": 0.022710614187153144, "grad_norm": 0.9904881119728088, "learning_rate": 1.2200000000000002e-06, "loss": 1.775, "step": 61 }, { "epoch": 0.023082919337762214, "grad_norm": 0.9897575378417969, "learning_rate": 1.2400000000000002e-06, "loss": 1.7599, "step": 62 }, { "epoch": 0.02345522448837128, "grad_norm": 0.9522385597229004, "learning_rate": 1.26e-06, "loss": 1.7643, "step": 63 }, { "epoch": 0.023827529638980348, "grad_norm": 0.9590041637420654, "learning_rate": 1.28e-06, "loss": 1.7782, "step": 64 }, { "epoch": 0.024199834789589418, "grad_norm": 0.910637378692627, "learning_rate": 1.3e-06, "loss": 1.7478, "step": 65 }, { "epoch": 0.024572139940198485, "grad_norm": 0.9400556087493896, "learning_rate": 1.32e-06, "loss": 1.745, "step": 66 }, { "epoch": 0.02494444509080755, "grad_norm": 0.9209262728691101, "learning_rate": 1.34e-06, "loss": 1.7594, "step": 67 }, { "epoch": 0.025316750241416622, "grad_norm": 0.896427571773529, "learning_rate": 1.3600000000000001e-06, "loss": 1.7446, "step": 68 }, { "epoch": 0.02568905539202569, "grad_norm": 0.895569920539856, "learning_rate": 1.3800000000000001e-06, "loss": 1.769, "step": 69 }, { "epoch": 0.02606136054263476, "grad_norm": 0.7845183610916138, "learning_rate": 1.4000000000000001e-06, "loss": 1.7196, "step": 70 }, { "epoch": 0.026433665693243825, "grad_norm": 0.800564706325531, "learning_rate": 1.42e-06, "loss": 1.732, "step": 71 }, { "epoch": 0.026805970843852892, "grad_norm": 0.6883242130279541, "learning_rate": 1.44e-06, "loss": 1.7251, "step": 72 }, { "epoch": 0.027178275994461962, "grad_norm": 0.6776332259178162, "learning_rate": 1.46e-06, "loss": 1.7311, "step": 73 }, { "epoch": 0.02755058114507103, "grad_norm": 0.6384090781211853, "learning_rate": 1.48e-06, "loss": 1.7215, "step": 74 }, { "epoch": 0.027922886295680096, "grad_norm": 0.647678017616272, "learning_rate": 1.5e-06, "loss": 1.7038, "step": 75 }, { "epoch": 0.028295191446289166, "grad_norm": 0.6253066658973694, "learning_rate": 1.52e-06, "loss": 1.7308, "step": 76 }, { "epoch": 0.028667496596898233, "grad_norm": 0.599458634853363, "learning_rate": 1.54e-06, "loss": 1.6998, "step": 77 }, { "epoch": 0.0290398017475073, "grad_norm": 0.5987157225608826, "learning_rate": 1.56e-06, "loss": 1.7092, "step": 78 }, { "epoch": 0.02941210689811637, "grad_norm": 0.5856855511665344, "learning_rate": 1.5800000000000001e-06, "loss": 1.7001, "step": 79 }, { "epoch": 0.029784412048725437, "grad_norm": 0.5748167037963867, "learning_rate": 1.6000000000000001e-06, "loss": 1.6861, "step": 80 }, { "epoch": 0.030156717199334503, "grad_norm": 0.5907240509986877, "learning_rate": 1.6200000000000002e-06, "loss": 1.7043, "step": 81 }, { "epoch": 0.030529022349943574, "grad_norm": 0.5696431994438171, "learning_rate": 1.6400000000000002e-06, "loss": 1.6871, "step": 82 }, { "epoch": 0.03090132750055264, "grad_norm": 0.5690540075302124, "learning_rate": 1.6600000000000002e-06, "loss": 1.6584, "step": 83 }, { "epoch": 0.03127363265116171, "grad_norm": 0.5912604331970215, "learning_rate": 1.6800000000000002e-06, "loss": 1.6724, "step": 84 }, { "epoch": 0.031645937801770774, "grad_norm": 0.595125138759613, "learning_rate": 1.7000000000000002e-06, "loss": 1.6629, "step": 85 }, { "epoch": 0.032018242952379844, "grad_norm": 0.5880417227745056, "learning_rate": 1.72e-06, "loss": 1.6722, "step": 86 }, { "epoch": 0.032390548102988914, "grad_norm": 0.621819019317627, "learning_rate": 1.74e-06, "loss": 1.6583, "step": 87 }, { "epoch": 0.03276285325359798, "grad_norm": 0.6170995831489563, "learning_rate": 1.76e-06, "loss": 1.6434, "step": 88 }, { "epoch": 0.03313515840420705, "grad_norm": 0.6138344407081604, "learning_rate": 1.7800000000000001e-06, "loss": 1.6368, "step": 89 }, { "epoch": 0.03350746355481612, "grad_norm": 0.6477394700050354, "learning_rate": 1.8000000000000001e-06, "loss": 1.6606, "step": 90 }, { "epoch": 0.03387976870542518, "grad_norm": 0.6119678616523743, "learning_rate": 1.8200000000000002e-06, "loss": 1.6266, "step": 91 }, { "epoch": 0.03425207385603425, "grad_norm": 0.612330436706543, "learning_rate": 1.8400000000000002e-06, "loss": 1.618, "step": 92 }, { "epoch": 0.03462437900664332, "grad_norm": 0.6225689053535461, "learning_rate": 1.8600000000000002e-06, "loss": 1.6234, "step": 93 }, { "epoch": 0.034996684157252385, "grad_norm": 0.5868818759918213, "learning_rate": 1.8800000000000002e-06, "loss": 1.6091, "step": 94 }, { "epoch": 0.035368989307861455, "grad_norm": 0.5659094452857971, "learning_rate": 1.9000000000000002e-06, "loss": 1.6211, "step": 95 }, { "epoch": 0.035741294458470525, "grad_norm": 0.550031840801239, "learning_rate": 1.9200000000000003e-06, "loss": 1.6023, "step": 96 }, { "epoch": 0.03611359960907959, "grad_norm": 0.5339946746826172, "learning_rate": 1.94e-06, "loss": 1.607, "step": 97 }, { "epoch": 0.03648590475968866, "grad_norm": 0.5484851598739624, "learning_rate": 1.9600000000000003e-06, "loss": 1.6088, "step": 98 }, { "epoch": 0.03685820991029773, "grad_norm": 0.5426100492477417, "learning_rate": 1.98e-06, "loss": 1.5946, "step": 99 }, { "epoch": 0.0372305150609068, "grad_norm": 0.53022700548172, "learning_rate": 2.0000000000000003e-06, "loss": 1.5923, "step": 100 }, { "epoch": 0.03760282021151586, "grad_norm": 0.5056052207946777, "learning_rate": 2.02e-06, "loss": 1.5901, "step": 101 }, { "epoch": 0.03797512536212493, "grad_norm": 0.5120390057563782, "learning_rate": 2.04e-06, "loss": 1.5924, "step": 102 }, { "epoch": 0.038347430512734, "grad_norm": 0.5347453355789185, "learning_rate": 2.06e-06, "loss": 1.5924, "step": 103 }, { "epoch": 0.038719735663343066, "grad_norm": 0.535159170627594, "learning_rate": 2.08e-06, "loss": 1.5821, "step": 104 }, { "epoch": 0.039092040813952136, "grad_norm": 0.5138244032859802, "learning_rate": 2.1000000000000002e-06, "loss": 1.5981, "step": 105 }, { "epoch": 0.03946434596456121, "grad_norm": 0.5143431425094604, "learning_rate": 2.12e-06, "loss": 1.5742, "step": 106 }, { "epoch": 0.03983665111517027, "grad_norm": 0.5397112965583801, "learning_rate": 2.1400000000000003e-06, "loss": 1.5919, "step": 107 }, { "epoch": 0.04020895626577934, "grad_norm": 0.503512978553772, "learning_rate": 2.16e-06, "loss": 1.5856, "step": 108 }, { "epoch": 0.04058126141638841, "grad_norm": 0.5465199947357178, "learning_rate": 2.1800000000000003e-06, "loss": 1.5853, "step": 109 }, { "epoch": 0.040953566566997474, "grad_norm": 0.561886727809906, "learning_rate": 2.2e-06, "loss": 1.5841, "step": 110 }, { "epoch": 0.041325871717606544, "grad_norm": 0.5566477179527283, "learning_rate": 2.2200000000000003e-06, "loss": 1.5819, "step": 111 }, { "epoch": 0.041698176868215614, "grad_norm": 0.5444602370262146, "learning_rate": 2.24e-06, "loss": 1.5702, "step": 112 }, { "epoch": 0.04207048201882468, "grad_norm": 0.5601593852043152, "learning_rate": 2.2600000000000004e-06, "loss": 1.5667, "step": 113 }, { "epoch": 0.04244278716943375, "grad_norm": 0.5660892128944397, "learning_rate": 2.28e-06, "loss": 1.5746, "step": 114 }, { "epoch": 0.04281509232004282, "grad_norm": 0.5766957402229309, "learning_rate": 2.3000000000000004e-06, "loss": 1.5627, "step": 115 }, { "epoch": 0.04318739747065188, "grad_norm": 0.6171875596046448, "learning_rate": 2.3200000000000002e-06, "loss": 1.5807, "step": 116 }, { "epoch": 0.04355970262126095, "grad_norm": 0.5867440700531006, "learning_rate": 2.3400000000000005e-06, "loss": 1.5642, "step": 117 }, { "epoch": 0.04393200777187002, "grad_norm": 0.5847681164741516, "learning_rate": 2.3600000000000003e-06, "loss": 1.5593, "step": 118 }, { "epoch": 0.044304312922479085, "grad_norm": 0.5901392102241516, "learning_rate": 2.38e-06, "loss": 1.5697, "step": 119 }, { "epoch": 0.044676618073088155, "grad_norm": 0.5898075699806213, "learning_rate": 2.4000000000000003e-06, "loss": 1.5522, "step": 120 }, { "epoch": 0.045048923223697225, "grad_norm": 0.5409351587295532, "learning_rate": 2.42e-06, "loss": 1.5498, "step": 121 }, { "epoch": 0.04542122837430629, "grad_norm": 0.5224587321281433, "learning_rate": 2.4400000000000004e-06, "loss": 1.5567, "step": 122 }, { "epoch": 0.04579353352491536, "grad_norm": 0.48012423515319824, "learning_rate": 2.46e-06, "loss": 1.5345, "step": 123 }, { "epoch": 0.04616583867552443, "grad_norm": 0.44709786772727966, "learning_rate": 2.4800000000000004e-06, "loss": 1.5434, "step": 124 }, { "epoch": 0.04653814382613349, "grad_norm": 0.43979910016059875, "learning_rate": 2.5e-06, "loss": 1.5461, "step": 125 }, { "epoch": 0.04691044897674256, "grad_norm": 0.3758682608604431, "learning_rate": 2.52e-06, "loss": 1.5484, "step": 126 }, { "epoch": 0.04728275412735163, "grad_norm": 0.34224575757980347, "learning_rate": 2.5400000000000002e-06, "loss": 1.5238, "step": 127 }, { "epoch": 0.047655059277960696, "grad_norm": 0.3217172622680664, "learning_rate": 2.56e-06, "loss": 1.5567, "step": 128 }, { "epoch": 0.048027364428569766, "grad_norm": 0.2725611925125122, "learning_rate": 2.5800000000000003e-06, "loss": 1.5349, "step": 129 }, { "epoch": 0.048399669579178836, "grad_norm": 0.22641003131866455, "learning_rate": 2.6e-06, "loss": 1.533, "step": 130 }, { "epoch": 0.0487719747297879, "grad_norm": 0.19425782561302185, "learning_rate": 2.6200000000000003e-06, "loss": 1.5486, "step": 131 }, { "epoch": 0.04914427988039697, "grad_norm": 0.18164916336536407, "learning_rate": 2.64e-06, "loss": 1.5494, "step": 132 }, { "epoch": 0.04951658503100604, "grad_norm": 0.16984394192695618, "learning_rate": 2.6600000000000004e-06, "loss": 1.5342, "step": 133 }, { "epoch": 0.0498888901816151, "grad_norm": 0.15408554673194885, "learning_rate": 2.68e-06, "loss": 1.5349, "step": 134 }, { "epoch": 0.05026119533222417, "grad_norm": 0.1477842777967453, "learning_rate": 2.7000000000000004e-06, "loss": 1.5492, "step": 135 }, { "epoch": 0.050633500482833244, "grad_norm": 0.14385773241519928, "learning_rate": 2.7200000000000002e-06, "loss": 1.5415, "step": 136 }, { "epoch": 0.05100580563344231, "grad_norm": 0.1423702985048294, "learning_rate": 2.7400000000000004e-06, "loss": 1.5231, "step": 137 }, { "epoch": 0.05137811078405138, "grad_norm": 0.4125309884548187, "learning_rate": 2.7600000000000003e-06, "loss": 1.5381, "step": 138 }, { "epoch": 0.05175041593466045, "grad_norm": 0.14331988990306854, "learning_rate": 2.7800000000000005e-06, "loss": 1.5438, "step": 139 }, { "epoch": 0.05212272108526952, "grad_norm": 0.12616126239299774, "learning_rate": 2.8000000000000003e-06, "loss": 1.5481, "step": 140 }, { "epoch": 0.05249502623587858, "grad_norm": 0.1296859085559845, "learning_rate": 2.82e-06, "loss": 1.5316, "step": 141 }, { "epoch": 0.05286733138648765, "grad_norm": 0.14372870326042175, "learning_rate": 2.84e-06, "loss": 1.5314, "step": 142 }, { "epoch": 0.05323963653709672, "grad_norm": 0.12012794613838196, "learning_rate": 2.86e-06, "loss": 1.5256, "step": 143 }, { "epoch": 0.053611941687705784, "grad_norm": 0.1238911896944046, "learning_rate": 2.88e-06, "loss": 1.5376, "step": 144 }, { "epoch": 0.053984246838314855, "grad_norm": 0.11403171718120575, "learning_rate": 2.9e-06, "loss": 1.5313, "step": 145 }, { "epoch": 0.054356551988923925, "grad_norm": 0.11582870781421661, "learning_rate": 2.92e-06, "loss": 1.5293, "step": 146 }, { "epoch": 0.05472885713953299, "grad_norm": 0.11598813533782959, "learning_rate": 2.9400000000000002e-06, "loss": 1.5387, "step": 147 }, { "epoch": 0.05510116229014206, "grad_norm": 0.11165577918291092, "learning_rate": 2.96e-06, "loss": 1.5195, "step": 148 }, { "epoch": 0.05547346744075113, "grad_norm": 0.1125321313738823, "learning_rate": 2.9800000000000003e-06, "loss": 1.521, "step": 149 }, { "epoch": 0.05584577259136019, "grad_norm": 0.11534541845321655, "learning_rate": 3e-06, "loss": 1.5368, "step": 150 }, { "epoch": 0.05621807774196926, "grad_norm": 0.11158929765224457, "learning_rate": 3.0200000000000003e-06, "loss": 1.53, "step": 151 }, { "epoch": 0.05659038289257833, "grad_norm": 0.11701493710279465, "learning_rate": 3.04e-06, "loss": 1.517, "step": 152 }, { "epoch": 0.056962688043187396, "grad_norm": 0.10573873668909073, "learning_rate": 3.0600000000000003e-06, "loss": 1.5479, "step": 153 }, { "epoch": 0.057334993193796466, "grad_norm": 0.10436706990003586, "learning_rate": 3.08e-06, "loss": 1.5376, "step": 154 }, { "epoch": 0.057707298344405536, "grad_norm": 0.11538543552160263, "learning_rate": 3.1000000000000004e-06, "loss": 1.5405, "step": 155 }, { "epoch": 0.0580796034950146, "grad_norm": 0.11722234636545181, "learning_rate": 3.12e-06, "loss": 1.5249, "step": 156 }, { "epoch": 0.05845190864562367, "grad_norm": 0.11727150529623032, "learning_rate": 3.1400000000000004e-06, "loss": 1.5279, "step": 157 }, { "epoch": 0.05882421379623274, "grad_norm": 0.10549698024988174, "learning_rate": 3.1600000000000002e-06, "loss": 1.5101, "step": 158 }, { "epoch": 0.0591965189468418, "grad_norm": 0.10655630379915237, "learning_rate": 3.1800000000000005e-06, "loss": 1.523, "step": 159 }, { "epoch": 0.05956882409745087, "grad_norm": 0.10541582852602005, "learning_rate": 3.2000000000000003e-06, "loss": 1.5114, "step": 160 }, { "epoch": 0.05994112924805994, "grad_norm": 0.1057916209101677, "learning_rate": 3.2200000000000005e-06, "loss": 1.5146, "step": 161 }, { "epoch": 0.06031343439866901, "grad_norm": 0.10496512800455093, "learning_rate": 3.2400000000000003e-06, "loss": 1.5168, "step": 162 }, { "epoch": 0.06068573954927808, "grad_norm": 0.09892502427101135, "learning_rate": 3.2600000000000006e-06, "loss": 1.5053, "step": 163 }, { "epoch": 0.06105804469988715, "grad_norm": 0.10325302928686142, "learning_rate": 3.2800000000000004e-06, "loss": 1.5157, "step": 164 }, { "epoch": 0.06143034985049621, "grad_norm": 0.11229792982339859, "learning_rate": 3.3000000000000006e-06, "loss": 1.4997, "step": 165 }, { "epoch": 0.06180265500110528, "grad_norm": 0.10319443047046661, "learning_rate": 3.3200000000000004e-06, "loss": 1.5175, "step": 166 }, { "epoch": 0.06217496015171435, "grad_norm": 0.10021348297595978, "learning_rate": 3.3400000000000006e-06, "loss": 1.5383, "step": 167 }, { "epoch": 0.06254726530232342, "grad_norm": 0.09915069490671158, "learning_rate": 3.3600000000000004e-06, "loss": 1.5226, "step": 168 }, { "epoch": 0.06291957045293249, "grad_norm": 0.10507778823375702, "learning_rate": 3.3800000000000007e-06, "loss": 1.5251, "step": 169 }, { "epoch": 0.06329187560354155, "grad_norm": 0.10222301632165909, "learning_rate": 3.4000000000000005e-06, "loss": 1.5287, "step": 170 }, { "epoch": 0.06366418075415062, "grad_norm": 0.10224229842424393, "learning_rate": 3.4200000000000007e-06, "loss": 1.5047, "step": 171 }, { "epoch": 0.06403648590475969, "grad_norm": 0.09706564992666245, "learning_rate": 3.44e-06, "loss": 1.5136, "step": 172 }, { "epoch": 0.06440879105536876, "grad_norm": 0.09412699937820435, "learning_rate": 3.46e-06, "loss": 1.5141, "step": 173 }, { "epoch": 0.06478109620597783, "grad_norm": 0.09892858564853668, "learning_rate": 3.48e-06, "loss": 1.5198, "step": 174 }, { "epoch": 0.0651534013565869, "grad_norm": 0.10099875926971436, "learning_rate": 3.5e-06, "loss": 1.5338, "step": 175 }, { "epoch": 0.06552570650719595, "grad_norm": 0.0975039079785347, "learning_rate": 3.52e-06, "loss": 1.5202, "step": 176 }, { "epoch": 0.06589801165780503, "grad_norm": 0.09733244776725769, "learning_rate": 3.54e-06, "loss": 1.5256, "step": 177 }, { "epoch": 0.0662703168084141, "grad_norm": 0.10019107162952423, "learning_rate": 3.5600000000000002e-06, "loss": 1.5048, "step": 178 }, { "epoch": 0.06664262195902317, "grad_norm": 0.10042434930801392, "learning_rate": 3.58e-06, "loss": 1.499, "step": 179 }, { "epoch": 0.06701492710963224, "grad_norm": 0.10034509003162384, "learning_rate": 3.6000000000000003e-06, "loss": 1.5059, "step": 180 }, { "epoch": 0.0673872322602413, "grad_norm": 0.09680823236703873, "learning_rate": 3.62e-06, "loss": 1.5074, "step": 181 }, { "epoch": 0.06775953741085036, "grad_norm": 0.10564741492271423, "learning_rate": 3.6400000000000003e-06, "loss": 1.5014, "step": 182 }, { "epoch": 0.06813184256145943, "grad_norm": 0.09912260621786118, "learning_rate": 3.66e-06, "loss": 1.5329, "step": 183 }, { "epoch": 0.0685041477120685, "grad_norm": 0.09995097666978836, "learning_rate": 3.6800000000000003e-06, "loss": 1.4999, "step": 184 }, { "epoch": 0.06887645286267757, "grad_norm": 0.10238537192344666, "learning_rate": 3.7e-06, "loss": 1.5197, "step": 185 }, { "epoch": 0.06924875801328664, "grad_norm": 0.09943666309118271, "learning_rate": 3.7200000000000004e-06, "loss": 1.5234, "step": 186 }, { "epoch": 0.06962106316389571, "grad_norm": 0.09201087057590485, "learning_rate": 3.74e-06, "loss": 1.5103, "step": 187 }, { "epoch": 0.06999336831450477, "grad_norm": 0.09809333086013794, "learning_rate": 3.7600000000000004e-06, "loss": 1.5107, "step": 188 }, { "epoch": 0.07036567346511384, "grad_norm": 0.09340554475784302, "learning_rate": 3.7800000000000002e-06, "loss": 1.517, "step": 189 }, { "epoch": 0.07073797861572291, "grad_norm": 0.10037955641746521, "learning_rate": 3.8000000000000005e-06, "loss": 1.51, "step": 190 }, { "epoch": 0.07111028376633198, "grad_norm": 0.0913090705871582, "learning_rate": 3.820000000000001e-06, "loss": 1.5033, "step": 191 }, { "epoch": 0.07148258891694105, "grad_norm": 0.09793171286582947, "learning_rate": 3.8400000000000005e-06, "loss": 1.5268, "step": 192 }, { "epoch": 0.07185489406755012, "grad_norm": 0.09348586201667786, "learning_rate": 3.86e-06, "loss": 1.5046, "step": 193 }, { "epoch": 0.07222719921815918, "grad_norm": 0.09434423595666885, "learning_rate": 3.88e-06, "loss": 1.498, "step": 194 }, { "epoch": 0.07259950436876825, "grad_norm": 0.08928900957107544, "learning_rate": 3.900000000000001e-06, "loss": 1.4918, "step": 195 }, { "epoch": 0.07297180951937732, "grad_norm": 0.0911901518702507, "learning_rate": 3.920000000000001e-06, "loss": 1.4903, "step": 196 }, { "epoch": 0.07334411466998639, "grad_norm": 0.08914012461900711, "learning_rate": 3.94e-06, "loss": 1.512, "step": 197 }, { "epoch": 0.07371641982059546, "grad_norm": 0.0897861197590828, "learning_rate": 3.96e-06, "loss": 1.5127, "step": 198 }, { "epoch": 0.07408872497120453, "grad_norm": 0.0919618159532547, "learning_rate": 3.980000000000001e-06, "loss": 1.5032, "step": 199 }, { "epoch": 0.0744610301218136, "grad_norm": 0.0871221199631691, "learning_rate": 4.000000000000001e-06, "loss": 1.5092, "step": 200 }, { "epoch": 0.07483333527242265, "grad_norm": 0.08722933381795883, "learning_rate": 4.0200000000000005e-06, "loss": 1.4905, "step": 201 }, { "epoch": 0.07520564042303172, "grad_norm": 0.08796436339616776, "learning_rate": 4.04e-06, "loss": 1.4992, "step": 202 }, { "epoch": 0.0755779455736408, "grad_norm": 0.09674139320850372, "learning_rate": 4.060000000000001e-06, "loss": 1.5032, "step": 203 }, { "epoch": 0.07595025072424987, "grad_norm": 0.08799508959054947, "learning_rate": 4.08e-06, "loss": 1.5124, "step": 204 }, { "epoch": 0.07632255587485894, "grad_norm": 0.08769244700670242, "learning_rate": 4.1e-06, "loss": 1.5087, "step": 205 }, { "epoch": 0.076694861025468, "grad_norm": 0.08863134682178497, "learning_rate": 4.12e-06, "loss": 1.5129, "step": 206 }, { "epoch": 0.07706716617607706, "grad_norm": 0.1002495214343071, "learning_rate": 4.14e-06, "loss": 1.5024, "step": 207 }, { "epoch": 0.07743947132668613, "grad_norm": 0.09144977480173111, "learning_rate": 4.16e-06, "loss": 1.5006, "step": 208 }, { "epoch": 0.0778117764772952, "grad_norm": 0.10466210544109344, "learning_rate": 4.18e-06, "loss": 1.5022, "step": 209 }, { "epoch": 0.07818408162790427, "grad_norm": 0.08975500613451004, "learning_rate": 4.2000000000000004e-06, "loss": 1.4965, "step": 210 }, { "epoch": 0.07855638677851334, "grad_norm": 0.08787556737661362, "learning_rate": 4.22e-06, "loss": 1.5042, "step": 211 }, { "epoch": 0.07892869192912241, "grad_norm": 0.08773490786552429, "learning_rate": 4.24e-06, "loss": 1.5138, "step": 212 }, { "epoch": 0.07930099707973147, "grad_norm": 0.08304975181818008, "learning_rate": 4.26e-06, "loss": 1.507, "step": 213 }, { "epoch": 0.07967330223034054, "grad_norm": 0.08493588864803314, "learning_rate": 4.2800000000000005e-06, "loss": 1.5017, "step": 214 }, { "epoch": 0.08004560738094961, "grad_norm": 0.08817660808563232, "learning_rate": 4.3e-06, "loss": 1.5068, "step": 215 }, { "epoch": 0.08041791253155868, "grad_norm": 0.08963938057422638, "learning_rate": 4.32e-06, "loss": 1.5263, "step": 216 }, { "epoch": 0.08079021768216775, "grad_norm": 0.08390969783067703, "learning_rate": 4.34e-06, "loss": 1.5, "step": 217 }, { "epoch": 0.08116252283277682, "grad_norm": 0.08665873855352402, "learning_rate": 4.360000000000001e-06, "loss": 1.4882, "step": 218 }, { "epoch": 0.08153482798338588, "grad_norm": 0.08230563998222351, "learning_rate": 4.38e-06, "loss": 1.4888, "step": 219 }, { "epoch": 0.08190713313399495, "grad_norm": 0.08207309246063232, "learning_rate": 4.4e-06, "loss": 1.4954, "step": 220 }, { "epoch": 0.08227943828460402, "grad_norm": 0.08314842730760574, "learning_rate": 4.42e-06, "loss": 1.5103, "step": 221 }, { "epoch": 0.08265174343521309, "grad_norm": 0.08730518817901611, "learning_rate": 4.440000000000001e-06, "loss": 1.4845, "step": 222 }, { "epoch": 0.08302404858582216, "grad_norm": 0.09334740787744522, "learning_rate": 4.4600000000000005e-06, "loss": 1.4726, "step": 223 }, { "epoch": 0.08339635373643123, "grad_norm": 0.0923716351389885, "learning_rate": 4.48e-06, "loss": 1.5005, "step": 224 }, { "epoch": 0.08376865888704028, "grad_norm": 0.08330400288105011, "learning_rate": 4.5e-06, "loss": 1.4939, "step": 225 }, { "epoch": 0.08414096403764935, "grad_norm": 0.08828233182430267, "learning_rate": 4.520000000000001e-06, "loss": 1.4926, "step": 226 }, { "epoch": 0.08451326918825842, "grad_norm": 0.08477775007486343, "learning_rate": 4.540000000000001e-06, "loss": 1.4872, "step": 227 }, { "epoch": 0.0848855743388675, "grad_norm": 0.08313533663749695, "learning_rate": 4.56e-06, "loss": 1.488, "step": 228 }, { "epoch": 0.08525787948947657, "grad_norm": 0.08070395141839981, "learning_rate": 4.58e-06, "loss": 1.498, "step": 229 }, { "epoch": 0.08563018464008564, "grad_norm": 0.08496372401714325, "learning_rate": 4.600000000000001e-06, "loss": 1.4995, "step": 230 }, { "epoch": 0.08600248979069469, "grad_norm": 0.08610887080430984, "learning_rate": 4.620000000000001e-06, "loss": 1.4861, "step": 231 }, { "epoch": 0.08637479494130376, "grad_norm": 0.1109740138053894, "learning_rate": 4.6400000000000005e-06, "loss": 1.4997, "step": 232 }, { "epoch": 0.08674710009191283, "grad_norm": 0.0828074961900711, "learning_rate": 4.66e-06, "loss": 1.4768, "step": 233 }, { "epoch": 0.0871194052425219, "grad_norm": 0.10130389034748077, "learning_rate": 4.680000000000001e-06, "loss": 1.5143, "step": 234 }, { "epoch": 0.08749171039313097, "grad_norm": 0.07845676690340042, "learning_rate": 4.7e-06, "loss": 1.4835, "step": 235 }, { "epoch": 0.08786401554374004, "grad_norm": 0.0835128203034401, "learning_rate": 4.7200000000000005e-06, "loss": 1.487, "step": 236 }, { "epoch": 0.0882363206943491, "grad_norm": 0.07999230176210403, "learning_rate": 4.74e-06, "loss": 1.4768, "step": 237 }, { "epoch": 0.08860862584495817, "grad_norm": 0.08866851031780243, "learning_rate": 4.76e-06, "loss": 1.4877, "step": 238 }, { "epoch": 0.08898093099556724, "grad_norm": 0.09091200679540634, "learning_rate": 4.78e-06, "loss": 1.4663, "step": 239 }, { "epoch": 0.08935323614617631, "grad_norm": 0.09118737280368805, "learning_rate": 4.800000000000001e-06, "loss": 1.4818, "step": 240 }, { "epoch": 0.08972554129678538, "grad_norm": 0.09198911488056183, "learning_rate": 4.8200000000000004e-06, "loss": 1.486, "step": 241 }, { "epoch": 0.09009784644739445, "grad_norm": 0.08087292313575745, "learning_rate": 4.84e-06, "loss": 1.4839, "step": 242 }, { "epoch": 0.09047015159800352, "grad_norm": 0.0856778621673584, "learning_rate": 4.86e-06, "loss": 1.4882, "step": 243 }, { "epoch": 0.09084245674861258, "grad_norm": 0.09004831314086914, "learning_rate": 4.880000000000001e-06, "loss": 1.4977, "step": 244 }, { "epoch": 0.09121476189922165, "grad_norm": 0.08808062225580215, "learning_rate": 4.9000000000000005e-06, "loss": 1.4961, "step": 245 }, { "epoch": 0.09158706704983072, "grad_norm": 0.10897014290094376, "learning_rate": 4.92e-06, "loss": 1.4837, "step": 246 }, { "epoch": 0.09195937220043979, "grad_norm": 0.12116125226020813, "learning_rate": 4.94e-06, "loss": 1.4899, "step": 247 }, { "epoch": 0.09233167735104886, "grad_norm": 0.09493674337863922, "learning_rate": 4.960000000000001e-06, "loss": 1.4834, "step": 248 }, { "epoch": 0.09270398250165793, "grad_norm": 0.13177277147769928, "learning_rate": 4.980000000000001e-06, "loss": 1.4721, "step": 249 }, { "epoch": 0.09307628765226698, "grad_norm": 0.11521487683057785, "learning_rate": 5e-06, "loss": 1.5012, "step": 250 }, { "epoch": 0.09344859280287605, "grad_norm": 0.106010802090168, "learning_rate": 5.02e-06, "loss": 1.5041, "step": 251 }, { "epoch": 0.09382089795348512, "grad_norm": 0.08309640735387802, "learning_rate": 5.04e-06, "loss": 1.4651, "step": 252 }, { "epoch": 0.0941932031040942, "grad_norm": 0.23099473118782043, "learning_rate": 5.060000000000001e-06, "loss": 1.4706, "step": 253 }, { "epoch": 0.09456550825470326, "grad_norm": 0.08957033604383469, "learning_rate": 5.0800000000000005e-06, "loss": 1.4851, "step": 254 }, { "epoch": 0.09493781340531234, "grad_norm": 0.08968747407197952, "learning_rate": 5.1e-06, "loss": 1.4615, "step": 255 }, { "epoch": 0.09531011855592139, "grad_norm": 0.09505753219127655, "learning_rate": 5.12e-06, "loss": 1.4852, "step": 256 }, { "epoch": 0.09568242370653046, "grad_norm": 0.08881967514753342, "learning_rate": 5.140000000000001e-06, "loss": 1.5098, "step": 257 }, { "epoch": 0.09605472885713953, "grad_norm": 0.08623534440994263, "learning_rate": 5.1600000000000006e-06, "loss": 1.466, "step": 258 }, { "epoch": 0.0964270340077486, "grad_norm": 0.08393881469964981, "learning_rate": 5.18e-06, "loss": 1.4932, "step": 259 }, { "epoch": 0.09679933915835767, "grad_norm": 0.0790511816740036, "learning_rate": 5.2e-06, "loss": 1.489, "step": 260 }, { "epoch": 0.09717164430896674, "grad_norm": 0.08720332384109497, "learning_rate": 5.220000000000001e-06, "loss": 1.4878, "step": 261 }, { "epoch": 0.0975439494595758, "grad_norm": 0.09274443984031677, "learning_rate": 5.240000000000001e-06, "loss": 1.4629, "step": 262 }, { "epoch": 0.09791625461018487, "grad_norm": 0.0823291540145874, "learning_rate": 5.2600000000000005e-06, "loss": 1.4798, "step": 263 }, { "epoch": 0.09828855976079394, "grad_norm": 0.10442589223384857, "learning_rate": 5.28e-06, "loss": 1.4885, "step": 264 }, { "epoch": 0.09866086491140301, "grad_norm": 0.08945530652999878, "learning_rate": 5.300000000000001e-06, "loss": 1.4802, "step": 265 }, { "epoch": 0.09903317006201208, "grad_norm": 0.11119077354669571, "learning_rate": 5.320000000000001e-06, "loss": 1.4888, "step": 266 }, { "epoch": 0.09940547521262115, "grad_norm": 0.09534472972154617, "learning_rate": 5.3400000000000005e-06, "loss": 1.4838, "step": 267 }, { "epoch": 0.0997777803632302, "grad_norm": 0.08682861924171448, "learning_rate": 5.36e-06, "loss": 1.4617, "step": 268 }, { "epoch": 0.10015008551383928, "grad_norm": 0.10255074501037598, "learning_rate": 5.380000000000001e-06, "loss": 1.4767, "step": 269 }, { "epoch": 0.10052239066444835, "grad_norm": 0.09030777961015701, "learning_rate": 5.400000000000001e-06, "loss": 1.463, "step": 270 }, { "epoch": 0.10089469581505742, "grad_norm": 0.09011770784854889, "learning_rate": 5.420000000000001e-06, "loss": 1.484, "step": 271 }, { "epoch": 0.10126700096566649, "grad_norm": 0.08929470181465149, "learning_rate": 5.4400000000000004e-06, "loss": 1.4789, "step": 272 }, { "epoch": 0.10163930611627556, "grad_norm": 0.09251522272825241, "learning_rate": 5.460000000000001e-06, "loss": 1.4682, "step": 273 }, { "epoch": 0.10201161126688461, "grad_norm": 0.11165442317724228, "learning_rate": 5.480000000000001e-06, "loss": 1.4829, "step": 274 }, { "epoch": 0.10238391641749368, "grad_norm": 0.09440817683935165, "learning_rate": 5.500000000000001e-06, "loss": 1.4769, "step": 275 }, { "epoch": 0.10275622156810275, "grad_norm": 0.11706740409135818, "learning_rate": 5.5200000000000005e-06, "loss": 1.477, "step": 276 }, { "epoch": 0.10312852671871182, "grad_norm": 0.08593502640724182, "learning_rate": 5.540000000000001e-06, "loss": 1.4727, "step": 277 }, { "epoch": 0.1035008318693209, "grad_norm": 0.0881851464509964, "learning_rate": 5.560000000000001e-06, "loss": 1.4852, "step": 278 }, { "epoch": 0.10387313701992996, "grad_norm": 0.11567908525466919, "learning_rate": 5.580000000000001e-06, "loss": 1.4608, "step": 279 }, { "epoch": 0.10424544217053903, "grad_norm": 0.09296432882547379, "learning_rate": 5.600000000000001e-06, "loss": 1.4811, "step": 280 }, { "epoch": 0.10461774732114809, "grad_norm": 0.10216089338064194, "learning_rate": 5.620000000000001e-06, "loss": 1.4682, "step": 281 }, { "epoch": 0.10499005247175716, "grad_norm": 0.08564305305480957, "learning_rate": 5.64e-06, "loss": 1.4832, "step": 282 }, { "epoch": 0.10536235762236623, "grad_norm": 0.11430171877145767, "learning_rate": 5.66e-06, "loss": 1.4754, "step": 283 }, { "epoch": 0.1057346627729753, "grad_norm": 0.09230099618434906, "learning_rate": 5.68e-06, "loss": 1.4765, "step": 284 }, { "epoch": 0.10610696792358437, "grad_norm": 0.14818595349788666, "learning_rate": 5.7e-06, "loss": 1.4642, "step": 285 }, { "epoch": 0.10647927307419344, "grad_norm": 0.1206260696053505, "learning_rate": 5.72e-06, "loss": 1.462, "step": 286 }, { "epoch": 0.1068515782248025, "grad_norm": 0.11496801674365997, "learning_rate": 5.74e-06, "loss": 1.4741, "step": 287 }, { "epoch": 0.10722388337541157, "grad_norm": 0.09007902443408966, "learning_rate": 5.76e-06, "loss": 1.472, "step": 288 }, { "epoch": 0.10759618852602064, "grad_norm": 0.0860317051410675, "learning_rate": 5.78e-06, "loss": 1.4814, "step": 289 }, { "epoch": 0.10796849367662971, "grad_norm": 0.08847720175981522, "learning_rate": 5.8e-06, "loss": 1.473, "step": 290 }, { "epoch": 0.10834079882723878, "grad_norm": 0.09280810505151749, "learning_rate": 5.82e-06, "loss": 1.4628, "step": 291 }, { "epoch": 0.10871310397784785, "grad_norm": 0.0906905010342598, "learning_rate": 5.84e-06, "loss": 1.4637, "step": 292 }, { "epoch": 0.1090854091284569, "grad_norm": 0.08728771656751633, "learning_rate": 5.86e-06, "loss": 1.4667, "step": 293 }, { "epoch": 0.10945771427906598, "grad_norm": 0.09215451776981354, "learning_rate": 5.8800000000000005e-06, "loss": 1.4588, "step": 294 }, { "epoch": 0.10983001942967505, "grad_norm": 0.09837444871664047, "learning_rate": 5.9e-06, "loss": 1.464, "step": 295 }, { "epoch": 0.11020232458028412, "grad_norm": 0.08866247534751892, "learning_rate": 5.92e-06, "loss": 1.4585, "step": 296 }, { "epoch": 0.11057462973089319, "grad_norm": 0.11951064318418503, "learning_rate": 5.94e-06, "loss": 1.4612, "step": 297 }, { "epoch": 0.11094693488150226, "grad_norm": 0.08314000070095062, "learning_rate": 5.9600000000000005e-06, "loss": 1.4635, "step": 298 }, { "epoch": 0.11131924003211131, "grad_norm": 0.12508852779865265, "learning_rate": 5.98e-06, "loss": 1.4722, "step": 299 }, { "epoch": 0.11169154518272038, "grad_norm": 0.11367225646972656, "learning_rate": 6e-06, "loss": 1.4722, "step": 300 }, { "epoch": 0.11206385033332945, "grad_norm": 0.1337917000055313, "learning_rate": 6.02e-06, "loss": 1.4531, "step": 301 }, { "epoch": 0.11243615548393852, "grad_norm": 0.10296988487243652, "learning_rate": 6.040000000000001e-06, "loss": 1.4355, "step": 302 }, { "epoch": 0.1128084606345476, "grad_norm": 0.09838810563087463, "learning_rate": 6.0600000000000004e-06, "loss": 1.4705, "step": 303 }, { "epoch": 0.11318076578515666, "grad_norm": 0.14201873540878296, "learning_rate": 6.08e-06, "loss": 1.465, "step": 304 }, { "epoch": 0.11355307093576572, "grad_norm": 0.10391878336668015, "learning_rate": 6.1e-06, "loss": 1.4691, "step": 305 }, { "epoch": 0.11392537608637479, "grad_norm": 0.0908937081694603, "learning_rate": 6.120000000000001e-06, "loss": 1.4738, "step": 306 }, { "epoch": 0.11429768123698386, "grad_norm": 0.1304592341184616, "learning_rate": 6.1400000000000005e-06, "loss": 1.4564, "step": 307 }, { "epoch": 0.11466998638759293, "grad_norm": 0.11030007898807526, "learning_rate": 6.16e-06, "loss": 1.4681, "step": 308 }, { "epoch": 0.115042291538202, "grad_norm": 0.1404293328523636, "learning_rate": 6.18e-06, "loss": 1.4489, "step": 309 }, { "epoch": 0.11541459668881107, "grad_norm": 0.15717652440071106, "learning_rate": 6.200000000000001e-06, "loss": 1.4619, "step": 310 }, { "epoch": 0.11578690183942013, "grad_norm": 0.08294162154197693, "learning_rate": 6.220000000000001e-06, "loss": 1.4564, "step": 311 }, { "epoch": 0.1161592069900292, "grad_norm": 0.12932319939136505, "learning_rate": 6.24e-06, "loss": 1.4615, "step": 312 }, { "epoch": 0.11653151214063827, "grad_norm": 0.1291813999414444, "learning_rate": 6.26e-06, "loss": 1.4736, "step": 313 }, { "epoch": 0.11690381729124734, "grad_norm": 0.08296732604503632, "learning_rate": 6.280000000000001e-06, "loss": 1.4592, "step": 314 }, { "epoch": 0.11727612244185641, "grad_norm": 0.1045786589384079, "learning_rate": 6.300000000000001e-06, "loss": 1.4473, "step": 315 }, { "epoch": 0.11764842759246548, "grad_norm": 0.14720311760902405, "learning_rate": 6.3200000000000005e-06, "loss": 1.45, "step": 316 }, { "epoch": 0.11802073274307454, "grad_norm": 0.08579614013433456, "learning_rate": 6.34e-06, "loss": 1.4448, "step": 317 }, { "epoch": 0.1183930378936836, "grad_norm": 0.12133117765188217, "learning_rate": 6.360000000000001e-06, "loss": 1.4536, "step": 318 }, { "epoch": 0.11876534304429268, "grad_norm": 0.09560643136501312, "learning_rate": 6.380000000000001e-06, "loss": 1.4501, "step": 319 }, { "epoch": 0.11913764819490175, "grad_norm": 0.08440724015235901, "learning_rate": 6.4000000000000006e-06, "loss": 1.4516, "step": 320 }, { "epoch": 0.11950995334551082, "grad_norm": 0.0895630419254303, "learning_rate": 6.42e-06, "loss": 1.4537, "step": 321 }, { "epoch": 0.11988225849611989, "grad_norm": 0.10389460623264313, "learning_rate": 6.440000000000001e-06, "loss": 1.4552, "step": 322 }, { "epoch": 0.12025456364672896, "grad_norm": 0.09369999170303345, "learning_rate": 6.460000000000001e-06, "loss": 1.4544, "step": 323 }, { "epoch": 0.12062686879733801, "grad_norm": 0.099007248878479, "learning_rate": 6.480000000000001e-06, "loss": 1.4891, "step": 324 }, { "epoch": 0.12099917394794708, "grad_norm": 0.09093964099884033, "learning_rate": 6.5000000000000004e-06, "loss": 1.4671, "step": 325 }, { "epoch": 0.12137147909855615, "grad_norm": 0.09165001660585403, "learning_rate": 6.520000000000001e-06, "loss": 1.4577, "step": 326 }, { "epoch": 0.12174378424916522, "grad_norm": 0.09465596079826355, "learning_rate": 6.540000000000001e-06, "loss": 1.4737, "step": 327 }, { "epoch": 0.1221160893997743, "grad_norm": 0.09906231611967087, "learning_rate": 6.560000000000001e-06, "loss": 1.4587, "step": 328 }, { "epoch": 0.12248839455038336, "grad_norm": 0.09170754998922348, "learning_rate": 6.5800000000000005e-06, "loss": 1.4604, "step": 329 }, { "epoch": 0.12286069970099242, "grad_norm": 0.08741523325443268, "learning_rate": 6.600000000000001e-06, "loss": 1.4563, "step": 330 }, { "epoch": 0.12323300485160149, "grad_norm": 0.1020127385854721, "learning_rate": 6.620000000000001e-06, "loss": 1.4656, "step": 331 }, { "epoch": 0.12360531000221056, "grad_norm": 0.09506560862064362, "learning_rate": 6.640000000000001e-06, "loss": 1.4752, "step": 332 }, { "epoch": 0.12397761515281963, "grad_norm": 0.1042810007929802, "learning_rate": 6.660000000000001e-06, "loss": 1.4593, "step": 333 }, { "epoch": 0.1243499203034287, "grad_norm": 0.09513174742460251, "learning_rate": 6.680000000000001e-06, "loss": 1.4636, "step": 334 }, { "epoch": 0.12472222545403777, "grad_norm": 0.09068495035171509, "learning_rate": 6.700000000000001e-06, "loss": 1.4729, "step": 335 }, { "epoch": 0.12509453060464684, "grad_norm": 0.09143901616334915, "learning_rate": 6.720000000000001e-06, "loss": 1.4528, "step": 336 }, { "epoch": 0.1254668357552559, "grad_norm": 0.08922934532165527, "learning_rate": 6.740000000000001e-06, "loss": 1.4668, "step": 337 }, { "epoch": 0.12583914090586498, "grad_norm": 0.10561850666999817, "learning_rate": 6.760000000000001e-06, "loss": 1.4536, "step": 338 }, { "epoch": 0.12621144605647402, "grad_norm": 0.09490080922842026, "learning_rate": 6.780000000000001e-06, "loss": 1.4604, "step": 339 }, { "epoch": 0.1265837512070831, "grad_norm": 0.09219174832105637, "learning_rate": 6.800000000000001e-06, "loss": 1.4577, "step": 340 }, { "epoch": 0.12695605635769217, "grad_norm": 0.11259440332651138, "learning_rate": 6.820000000000001e-06, "loss": 1.461, "step": 341 }, { "epoch": 0.12732836150830124, "grad_norm": 0.10154417902231216, "learning_rate": 6.8400000000000014e-06, "loss": 1.4539, "step": 342 }, { "epoch": 0.1277006666589103, "grad_norm": 0.08527655154466629, "learning_rate": 6.860000000000001e-06, "loss": 1.4516, "step": 343 }, { "epoch": 0.12807297180951938, "grad_norm": 0.10697508603334427, "learning_rate": 6.88e-06, "loss": 1.4519, "step": 344 }, { "epoch": 0.12844527696012845, "grad_norm": 0.1440954953432083, "learning_rate": 6.9e-06, "loss": 1.4394, "step": 345 }, { "epoch": 0.12881758211073752, "grad_norm": 0.10360608994960785, "learning_rate": 6.92e-06, "loss": 1.4472, "step": 346 }, { "epoch": 0.1291898872613466, "grad_norm": 0.09860967099666595, "learning_rate": 6.9400000000000005e-06, "loss": 1.4362, "step": 347 }, { "epoch": 0.12956219241195566, "grad_norm": 0.10211774706840515, "learning_rate": 6.96e-06, "loss": 1.4497, "step": 348 }, { "epoch": 0.12993449756256473, "grad_norm": 0.10965809226036072, "learning_rate": 6.98e-06, "loss": 1.457, "step": 349 }, { "epoch": 0.1303068027131738, "grad_norm": 0.09159507602453232, "learning_rate": 7e-06, "loss": 1.4512, "step": 350 }, { "epoch": 0.13067910786378284, "grad_norm": 0.11210039258003235, "learning_rate": 7.0200000000000006e-06, "loss": 1.4418, "step": 351 }, { "epoch": 0.1310514130143919, "grad_norm": 0.0950743705034256, "learning_rate": 7.04e-06, "loss": 1.4443, "step": 352 }, { "epoch": 0.13142371816500098, "grad_norm": 0.10520799458026886, "learning_rate": 7.06e-06, "loss": 1.4587, "step": 353 }, { "epoch": 0.13179602331561005, "grad_norm": 0.09574756771326065, "learning_rate": 7.08e-06, "loss": 1.4353, "step": 354 }, { "epoch": 0.13216832846621912, "grad_norm": 0.09506519138813019, "learning_rate": 7.100000000000001e-06, "loss": 1.4425, "step": 355 }, { "epoch": 0.1325406336168282, "grad_norm": 0.09055452048778534, "learning_rate": 7.1200000000000004e-06, "loss": 1.4514, "step": 356 }, { "epoch": 0.13291293876743726, "grad_norm": 0.08780631422996521, "learning_rate": 7.14e-06, "loss": 1.4378, "step": 357 }, { "epoch": 0.13328524391804633, "grad_norm": 0.09684795886278152, "learning_rate": 7.16e-06, "loss": 1.4358, "step": 358 }, { "epoch": 0.1336575490686554, "grad_norm": 0.09261483699083328, "learning_rate": 7.180000000000001e-06, "loss": 1.4521, "step": 359 }, { "epoch": 0.13402985421926447, "grad_norm": 0.09582682698965073, "learning_rate": 7.2000000000000005e-06, "loss": 1.4393, "step": 360 }, { "epoch": 0.13440215936987354, "grad_norm": 0.0970984697341919, "learning_rate": 7.22e-06, "loss": 1.4289, "step": 361 }, { "epoch": 0.1347744645204826, "grad_norm": 0.10083375871181488, "learning_rate": 7.24e-06, "loss": 1.4493, "step": 362 }, { "epoch": 0.13514676967109168, "grad_norm": 0.11022008955478668, "learning_rate": 7.260000000000001e-06, "loss": 1.4504, "step": 363 }, { "epoch": 0.13551907482170072, "grad_norm": 0.09099752455949783, "learning_rate": 7.280000000000001e-06, "loss": 1.4439, "step": 364 }, { "epoch": 0.1358913799723098, "grad_norm": 0.13994981348514557, "learning_rate": 7.3e-06, "loss": 1.4537, "step": 365 }, { "epoch": 0.13626368512291887, "grad_norm": 0.0927768424153328, "learning_rate": 7.32e-06, "loss": 1.4577, "step": 366 }, { "epoch": 0.13663599027352794, "grad_norm": 0.12550784647464752, "learning_rate": 7.340000000000001e-06, "loss": 1.4595, "step": 367 }, { "epoch": 0.137008295424137, "grad_norm": 0.0998741090297699, "learning_rate": 7.360000000000001e-06, "loss": 1.4606, "step": 368 }, { "epoch": 0.13738060057474608, "grad_norm": 0.09384217113256454, "learning_rate": 7.3800000000000005e-06, "loss": 1.4388, "step": 369 }, { "epoch": 0.13775290572535515, "grad_norm": 0.10957197099924088, "learning_rate": 7.4e-06, "loss": 1.4463, "step": 370 }, { "epoch": 0.13812521087596422, "grad_norm": 0.10382582992315292, "learning_rate": 7.420000000000001e-06, "loss": 1.4393, "step": 371 }, { "epoch": 0.1384975160265733, "grad_norm": 0.09171084314584732, "learning_rate": 7.440000000000001e-06, "loss": 1.4493, "step": 372 }, { "epoch": 0.13886982117718236, "grad_norm": 0.10419381409883499, "learning_rate": 7.4600000000000006e-06, "loss": 1.4589, "step": 373 }, { "epoch": 0.13924212632779143, "grad_norm": 0.0996531993150711, "learning_rate": 7.48e-06, "loss": 1.4258, "step": 374 }, { "epoch": 0.1396144314784005, "grad_norm": 0.09250695258378983, "learning_rate": 7.500000000000001e-06, "loss": 1.4458, "step": 375 }, { "epoch": 0.13998673662900954, "grad_norm": 0.09015652537345886, "learning_rate": 7.520000000000001e-06, "loss": 1.452, "step": 376 }, { "epoch": 0.1403590417796186, "grad_norm": 0.11032413691282272, "learning_rate": 7.540000000000001e-06, "loss": 1.4307, "step": 377 }, { "epoch": 0.14073134693022768, "grad_norm": 0.09179755300283432, "learning_rate": 7.5600000000000005e-06, "loss": 1.4462, "step": 378 }, { "epoch": 0.14110365208083675, "grad_norm": 0.09686073660850525, "learning_rate": 7.58e-06, "loss": 1.4362, "step": 379 }, { "epoch": 0.14147595723144582, "grad_norm": 0.09349130094051361, "learning_rate": 7.600000000000001e-06, "loss": 1.4358, "step": 380 }, { "epoch": 0.1418482623820549, "grad_norm": 0.10256840288639069, "learning_rate": 7.620000000000001e-06, "loss": 1.4469, "step": 381 }, { "epoch": 0.14222056753266396, "grad_norm": 0.09944120049476624, "learning_rate": 7.640000000000001e-06, "loss": 1.4485, "step": 382 }, { "epoch": 0.14259287268327303, "grad_norm": 0.09924148768186569, "learning_rate": 7.660000000000001e-06, "loss": 1.4382, "step": 383 }, { "epoch": 0.1429651778338821, "grad_norm": 0.09792005270719528, "learning_rate": 7.680000000000001e-06, "loss": 1.4256, "step": 384 }, { "epoch": 0.14333748298449117, "grad_norm": 0.08989827334880829, "learning_rate": 7.7e-06, "loss": 1.441, "step": 385 }, { "epoch": 0.14370978813510024, "grad_norm": 0.13259711861610413, "learning_rate": 7.72e-06, "loss": 1.4604, "step": 386 }, { "epoch": 0.1440820932857093, "grad_norm": 0.10013754665851593, "learning_rate": 7.74e-06, "loss": 1.4319, "step": 387 }, { "epoch": 0.14445439843631835, "grad_norm": 0.10801331698894501, "learning_rate": 7.76e-06, "loss": 1.4358, "step": 388 }, { "epoch": 0.14482670358692742, "grad_norm": 0.0908760130405426, "learning_rate": 7.78e-06, "loss": 1.4496, "step": 389 }, { "epoch": 0.1451990087375365, "grad_norm": 0.09459855407476425, "learning_rate": 7.800000000000002e-06, "loss": 1.4482, "step": 390 }, { "epoch": 0.14557131388814556, "grad_norm": 0.09493505209684372, "learning_rate": 7.820000000000001e-06, "loss": 1.443, "step": 391 }, { "epoch": 0.14594361903875464, "grad_norm": 0.09393203258514404, "learning_rate": 7.840000000000001e-06, "loss": 1.4461, "step": 392 }, { "epoch": 0.1463159241893637, "grad_norm": 0.0932602807879448, "learning_rate": 7.860000000000001e-06, "loss": 1.4193, "step": 393 }, { "epoch": 0.14668822933997278, "grad_norm": 0.10115011781454086, "learning_rate": 7.88e-06, "loss": 1.4143, "step": 394 }, { "epoch": 0.14706053449058185, "grad_norm": 0.10510238260030746, "learning_rate": 7.9e-06, "loss": 1.4377, "step": 395 }, { "epoch": 0.14743283964119092, "grad_norm": 0.09857072681188583, "learning_rate": 7.92e-06, "loss": 1.4537, "step": 396 }, { "epoch": 0.14780514479179999, "grad_norm": 0.11328350752592087, "learning_rate": 7.94e-06, "loss": 1.4274, "step": 397 }, { "epoch": 0.14817744994240906, "grad_norm": 0.10213533043861389, "learning_rate": 7.960000000000002e-06, "loss": 1.4282, "step": 398 }, { "epoch": 0.14854975509301813, "grad_norm": 0.1178865134716034, "learning_rate": 7.980000000000002e-06, "loss": 1.4333, "step": 399 }, { "epoch": 0.1489220602436272, "grad_norm": 0.11789101362228394, "learning_rate": 8.000000000000001e-06, "loss": 1.4413, "step": 400 }, { "epoch": 0.14929436539423624, "grad_norm": 0.10633790493011475, "learning_rate": 8.020000000000001e-06, "loss": 1.4281, "step": 401 }, { "epoch": 0.1496666705448453, "grad_norm": 0.11100617051124573, "learning_rate": 8.040000000000001e-06, "loss": 1.4299, "step": 402 }, { "epoch": 0.15003897569545438, "grad_norm": 0.09987955540418625, "learning_rate": 8.06e-06, "loss": 1.4396, "step": 403 }, { "epoch": 0.15041128084606345, "grad_norm": 0.13840006291866302, "learning_rate": 8.08e-06, "loss": 1.4378, "step": 404 }, { "epoch": 0.15078358599667252, "grad_norm": 0.10468387603759766, "learning_rate": 8.1e-06, "loss": 1.4361, "step": 405 }, { "epoch": 0.1511558911472816, "grad_norm": 0.09684094786643982, "learning_rate": 8.120000000000002e-06, "loss": 1.4408, "step": 406 }, { "epoch": 0.15152819629789066, "grad_norm": 0.1058192178606987, "learning_rate": 8.14e-06, "loss": 1.4353, "step": 407 }, { "epoch": 0.15190050144849973, "grad_norm": 0.11632636189460754, "learning_rate": 8.16e-06, "loss": 1.4331, "step": 408 }, { "epoch": 0.1522728065991088, "grad_norm": 0.10219406336545944, "learning_rate": 8.18e-06, "loss": 1.4527, "step": 409 }, { "epoch": 0.15264511174971787, "grad_norm": 0.110807865858078, "learning_rate": 8.2e-06, "loss": 1.4389, "step": 410 }, { "epoch": 0.15301741690032694, "grad_norm": 0.09530390799045563, "learning_rate": 8.220000000000001e-06, "loss": 1.4252, "step": 411 }, { "epoch": 0.153389722050936, "grad_norm": 0.10499103367328644, "learning_rate": 8.24e-06, "loss": 1.4429, "step": 412 }, { "epoch": 0.15376202720154505, "grad_norm": 0.1094844788312912, "learning_rate": 8.26e-06, "loss": 1.4172, "step": 413 }, { "epoch": 0.15413433235215412, "grad_norm": 0.10519949346780777, "learning_rate": 8.28e-06, "loss": 1.4371, "step": 414 }, { "epoch": 0.1545066375027632, "grad_norm": 0.10553234815597534, "learning_rate": 8.3e-06, "loss": 1.4409, "step": 415 }, { "epoch": 0.15487894265337226, "grad_norm": 0.10840829461812973, "learning_rate": 8.32e-06, "loss": 1.4345, "step": 416 }, { "epoch": 0.15525124780398133, "grad_norm": 0.09976733475923538, "learning_rate": 8.34e-06, "loss": 1.4508, "step": 417 }, { "epoch": 0.1556235529545904, "grad_norm": 0.10259655117988586, "learning_rate": 8.36e-06, "loss": 1.4196, "step": 418 }, { "epoch": 0.15599585810519948, "grad_norm": 0.10481464862823486, "learning_rate": 8.380000000000001e-06, "loss": 1.4367, "step": 419 }, { "epoch": 0.15636816325580855, "grad_norm": 0.1130518987774849, "learning_rate": 8.400000000000001e-06, "loss": 1.4479, "step": 420 }, { "epoch": 0.15674046840641762, "grad_norm": 0.11022792756557465, "learning_rate": 8.42e-06, "loss": 1.421, "step": 421 }, { "epoch": 0.15711277355702669, "grad_norm": 0.0973302498459816, "learning_rate": 8.44e-06, "loss": 1.4445, "step": 422 }, { "epoch": 0.15748507870763576, "grad_norm": 0.09701745212078094, "learning_rate": 8.46e-06, "loss": 1.42, "step": 423 }, { "epoch": 0.15785738385824483, "grad_norm": 0.09979844093322754, "learning_rate": 8.48e-06, "loss": 1.4221, "step": 424 }, { "epoch": 0.15822968900885387, "grad_norm": 0.099349245429039, "learning_rate": 8.5e-06, "loss": 1.4131, "step": 425 }, { "epoch": 0.15860199415946294, "grad_norm": 0.10651430487632751, "learning_rate": 8.52e-06, "loss": 1.4249, "step": 426 }, { "epoch": 0.158974299310072, "grad_norm": 0.11765170842409134, "learning_rate": 8.540000000000001e-06, "loss": 1.4503, "step": 427 }, { "epoch": 0.15934660446068108, "grad_norm": 0.10456649959087372, "learning_rate": 8.560000000000001e-06, "loss": 1.4261, "step": 428 }, { "epoch": 0.15971890961129015, "grad_norm": 0.13042724132537842, "learning_rate": 8.580000000000001e-06, "loss": 1.4238, "step": 429 }, { "epoch": 0.16009121476189922, "grad_norm": 0.10613591969013214, "learning_rate": 8.6e-06, "loss": 1.4097, "step": 430 }, { "epoch": 0.1604635199125083, "grad_norm": 0.10777163505554199, "learning_rate": 8.62e-06, "loss": 1.4327, "step": 431 }, { "epoch": 0.16083582506311736, "grad_norm": 0.1047920435667038, "learning_rate": 8.64e-06, "loss": 1.4247, "step": 432 }, { "epoch": 0.16120813021372643, "grad_norm": 0.09752184897661209, "learning_rate": 8.66e-06, "loss": 1.4327, "step": 433 }, { "epoch": 0.1615804353643355, "grad_norm": 0.1117076650261879, "learning_rate": 8.68e-06, "loss": 1.4327, "step": 434 }, { "epoch": 0.16195274051494457, "grad_norm": 0.11003684252500534, "learning_rate": 8.700000000000001e-06, "loss": 1.4211, "step": 435 }, { "epoch": 0.16232504566555364, "grad_norm": 0.09675519168376923, "learning_rate": 8.720000000000001e-06, "loss": 1.435, "step": 436 }, { "epoch": 0.16269735081616268, "grad_norm": 0.09906768053770065, "learning_rate": 8.740000000000001e-06, "loss": 1.4381, "step": 437 }, { "epoch": 0.16306965596677175, "grad_norm": 0.10028046369552612, "learning_rate": 8.76e-06, "loss": 1.4251, "step": 438 }, { "epoch": 0.16344196111738082, "grad_norm": 0.11393031477928162, "learning_rate": 8.78e-06, "loss": 1.4234, "step": 439 }, { "epoch": 0.1638142662679899, "grad_norm": 0.10733836144208908, "learning_rate": 8.8e-06, "loss": 1.4168, "step": 440 }, { "epoch": 0.16418657141859896, "grad_norm": 0.10900059342384338, "learning_rate": 8.82e-06, "loss": 1.4356, "step": 441 }, { "epoch": 0.16455887656920803, "grad_norm": 0.10403525829315186, "learning_rate": 8.84e-06, "loss": 1.4058, "step": 442 }, { "epoch": 0.1649311817198171, "grad_norm": 0.10996660590171814, "learning_rate": 8.860000000000002e-06, "loss": 1.4317, "step": 443 }, { "epoch": 0.16530348687042618, "grad_norm": 0.1056893914937973, "learning_rate": 8.880000000000001e-06, "loss": 1.4258, "step": 444 }, { "epoch": 0.16567579202103525, "grad_norm": 0.10371371358633041, "learning_rate": 8.900000000000001e-06, "loss": 1.4261, "step": 445 }, { "epoch": 0.16604809717164432, "grad_norm": 0.10245388001203537, "learning_rate": 8.920000000000001e-06, "loss": 1.4237, "step": 446 }, { "epoch": 0.16642040232225339, "grad_norm": 0.11346007138490677, "learning_rate": 8.94e-06, "loss": 1.4171, "step": 447 }, { "epoch": 0.16679270747286246, "grad_norm": 0.1109023243188858, "learning_rate": 8.96e-06, "loss": 1.4311, "step": 448 }, { "epoch": 0.16716501262347153, "grad_norm": 0.10533205419778824, "learning_rate": 8.98e-06, "loss": 1.4159, "step": 449 }, { "epoch": 0.16753731777408057, "grad_norm": 0.1102309301495552, "learning_rate": 9e-06, "loss": 1.425, "step": 450 }, { "epoch": 0.16790962292468964, "grad_norm": 0.10830759257078171, "learning_rate": 9.020000000000002e-06, "loss": 1.4188, "step": 451 }, { "epoch": 0.1682819280752987, "grad_norm": 0.11227082461118698, "learning_rate": 9.040000000000002e-06, "loss": 1.4189, "step": 452 }, { "epoch": 0.16865423322590778, "grad_norm": 0.11561018973588943, "learning_rate": 9.060000000000001e-06, "loss": 1.4225, "step": 453 }, { "epoch": 0.16902653837651685, "grad_norm": 0.10718375444412231, "learning_rate": 9.080000000000001e-06, "loss": 1.437, "step": 454 }, { "epoch": 0.16939884352712592, "grad_norm": 0.11734096705913544, "learning_rate": 9.100000000000001e-06, "loss": 1.4243, "step": 455 }, { "epoch": 0.169771148677735, "grad_norm": 0.11969781666994095, "learning_rate": 9.12e-06, "loss": 1.4048, "step": 456 }, { "epoch": 0.17014345382834406, "grad_norm": 0.1117396354675293, "learning_rate": 9.14e-06, "loss": 1.4281, "step": 457 }, { "epoch": 0.17051575897895313, "grad_norm": 0.10435774177312851, "learning_rate": 9.16e-06, "loss": 1.429, "step": 458 }, { "epoch": 0.1708880641295622, "grad_norm": 0.10865868628025055, "learning_rate": 9.180000000000002e-06, "loss": 1.4092, "step": 459 }, { "epoch": 0.17126036928017127, "grad_norm": 0.1147746816277504, "learning_rate": 9.200000000000002e-06, "loss": 1.4197, "step": 460 }, { "epoch": 0.17163267443078034, "grad_norm": 0.1095675453543663, "learning_rate": 9.220000000000002e-06, "loss": 1.425, "step": 461 }, { "epoch": 0.17200497958138938, "grad_norm": 0.10761556029319763, "learning_rate": 9.240000000000001e-06, "loss": 1.4152, "step": 462 }, { "epoch": 0.17237728473199845, "grad_norm": 0.11335011571645737, "learning_rate": 9.260000000000001e-06, "loss": 1.4224, "step": 463 }, { "epoch": 0.17274958988260752, "grad_norm": 0.10845465958118439, "learning_rate": 9.280000000000001e-06, "loss": 1.4005, "step": 464 }, { "epoch": 0.1731218950332166, "grad_norm": 0.1037868782877922, "learning_rate": 9.3e-06, "loss": 1.4277, "step": 465 }, { "epoch": 0.17349420018382566, "grad_norm": 0.11700176447629929, "learning_rate": 9.32e-06, "loss": 1.4232, "step": 466 }, { "epoch": 0.17386650533443473, "grad_norm": 0.10587536543607712, "learning_rate": 9.340000000000002e-06, "loss": 1.4399, "step": 467 }, { "epoch": 0.1742388104850438, "grad_norm": 0.11157836019992828, "learning_rate": 9.360000000000002e-06, "loss": 1.4159, "step": 468 }, { "epoch": 0.17461111563565287, "grad_norm": 0.10884707421064377, "learning_rate": 9.38e-06, "loss": 1.4088, "step": 469 }, { "epoch": 0.17498342078626195, "grad_norm": 0.11198698729276657, "learning_rate": 9.4e-06, "loss": 1.3952, "step": 470 }, { "epoch": 0.17535572593687102, "grad_norm": 0.1030043363571167, "learning_rate": 9.42e-06, "loss": 1.4294, "step": 471 }, { "epoch": 0.17572803108748009, "grad_norm": 0.11066511273384094, "learning_rate": 9.440000000000001e-06, "loss": 1.4114, "step": 472 }, { "epoch": 0.17610033623808916, "grad_norm": 0.1187884584069252, "learning_rate": 9.460000000000001e-06, "loss": 1.415, "step": 473 }, { "epoch": 0.1764726413886982, "grad_norm": 0.09824799001216888, "learning_rate": 9.48e-06, "loss": 1.4166, "step": 474 }, { "epoch": 0.17684494653930727, "grad_norm": 0.09938450902700424, "learning_rate": 9.5e-06, "loss": 1.4139, "step": 475 }, { "epoch": 0.17721725168991634, "grad_norm": 0.11766441911458969, "learning_rate": 9.52e-06, "loss": 1.4234, "step": 476 }, { "epoch": 0.1775895568405254, "grad_norm": 0.10825181007385254, "learning_rate": 9.54e-06, "loss": 1.4207, "step": 477 }, { "epoch": 0.17796186199113448, "grad_norm": 0.10996980965137482, "learning_rate": 9.56e-06, "loss": 1.4228, "step": 478 }, { "epoch": 0.17833416714174355, "grad_norm": 0.11518129706382751, "learning_rate": 9.58e-06, "loss": 1.4155, "step": 479 }, { "epoch": 0.17870647229235262, "grad_norm": 0.1234855204820633, "learning_rate": 9.600000000000001e-06, "loss": 1.4309, "step": 480 }, { "epoch": 0.1790787774429617, "grad_norm": 0.10811053216457367, "learning_rate": 9.620000000000001e-06, "loss": 1.4211, "step": 481 }, { "epoch": 0.17945108259357076, "grad_norm": 0.10534091293811798, "learning_rate": 9.640000000000001e-06, "loss": 1.4175, "step": 482 }, { "epoch": 0.17982338774417983, "grad_norm": 0.12702758610248566, "learning_rate": 9.66e-06, "loss": 1.4104, "step": 483 }, { "epoch": 0.1801956928947889, "grad_norm": 0.10806774348020554, "learning_rate": 9.68e-06, "loss": 1.4136, "step": 484 }, { "epoch": 0.18056799804539797, "grad_norm": 0.10667596012353897, "learning_rate": 9.7e-06, "loss": 1.4111, "step": 485 }, { "epoch": 0.18094030319600704, "grad_norm": 0.1084335595369339, "learning_rate": 9.72e-06, "loss": 1.424, "step": 486 }, { "epoch": 0.18131260834661608, "grad_norm": 0.10393060743808746, "learning_rate": 9.74e-06, "loss": 1.413, "step": 487 }, { "epoch": 0.18168491349722515, "grad_norm": 0.11034102737903595, "learning_rate": 9.760000000000001e-06, "loss": 1.4215, "step": 488 }, { "epoch": 0.18205721864783422, "grad_norm": 0.1163318008184433, "learning_rate": 9.780000000000001e-06, "loss": 1.4123, "step": 489 }, { "epoch": 0.1824295237984433, "grad_norm": 0.11191318929195404, "learning_rate": 9.800000000000001e-06, "loss": 1.4292, "step": 490 }, { "epoch": 0.18280182894905236, "grad_norm": 0.10725145041942596, "learning_rate": 9.820000000000001e-06, "loss": 1.4084, "step": 491 }, { "epoch": 0.18317413409966143, "grad_norm": 0.11110781133174896, "learning_rate": 9.84e-06, "loss": 1.394, "step": 492 }, { "epoch": 0.1835464392502705, "grad_norm": 0.10888979583978653, "learning_rate": 9.86e-06, "loss": 1.4129, "step": 493 }, { "epoch": 0.18391874440087957, "grad_norm": 0.11001749336719513, "learning_rate": 9.88e-06, "loss": 1.4248, "step": 494 }, { "epoch": 0.18429104955148864, "grad_norm": 0.10707477480173111, "learning_rate": 9.9e-06, "loss": 1.4088, "step": 495 }, { "epoch": 0.18466335470209772, "grad_norm": 0.12006059288978577, "learning_rate": 9.920000000000002e-06, "loss": 1.4126, "step": 496 }, { "epoch": 0.18503565985270679, "grad_norm": 0.10763873159885406, "learning_rate": 9.940000000000001e-06, "loss": 1.4219, "step": 497 }, { "epoch": 0.18540796500331586, "grad_norm": 0.10750039666891098, "learning_rate": 9.960000000000001e-06, "loss": 1.3938, "step": 498 }, { "epoch": 0.1857802701539249, "grad_norm": 0.10844654589891434, "learning_rate": 9.980000000000001e-06, "loss": 1.4257, "step": 499 }, { "epoch": 0.18615257530453397, "grad_norm": 0.10777752101421356, "learning_rate": 1e-05, "loss": 1.4042, "step": 500 }, { "epoch": 0.18615257530453397, "eval_loss": 1.3990447521209717, "eval_runtime": 16.7137, "eval_samples_per_second": 103.747, "eval_steps_per_second": 5.205, "step": 500 }, { "epoch": 0.18652488045514304, "grad_norm": 0.12805776298046112, "learning_rate": 1.002e-05, "loss": 1.3923, "step": 501 }, { "epoch": 0.1868971856057521, "grad_norm": 0.11597350984811783, "learning_rate": 1.004e-05, "loss": 1.4246, "step": 502 }, { "epoch": 0.18726949075636118, "grad_norm": 0.12692782282829285, "learning_rate": 1.006e-05, "loss": 1.4091, "step": 503 }, { "epoch": 0.18764179590697025, "grad_norm": 0.11857876926660538, "learning_rate": 1.008e-05, "loss": 1.4025, "step": 504 }, { "epoch": 0.18801410105757932, "grad_norm": 0.11153510212898254, "learning_rate": 1.0100000000000002e-05, "loss": 1.4069, "step": 505 }, { "epoch": 0.1883864062081884, "grad_norm": 0.11478788405656815, "learning_rate": 1.0120000000000001e-05, "loss": 1.4043, "step": 506 }, { "epoch": 0.18875871135879746, "grad_norm": 0.11333264410495758, "learning_rate": 1.0140000000000001e-05, "loss": 1.4, "step": 507 }, { "epoch": 0.18913101650940653, "grad_norm": 0.12227758020162582, "learning_rate": 1.0160000000000001e-05, "loss": 1.4023, "step": 508 }, { "epoch": 0.1895033216600156, "grad_norm": 0.12102984637022018, "learning_rate": 1.018e-05, "loss": 1.4056, "step": 509 }, { "epoch": 0.18987562681062467, "grad_norm": 0.11923938989639282, "learning_rate": 1.02e-05, "loss": 1.4019, "step": 510 }, { "epoch": 0.1902479319612337, "grad_norm": 0.12334270030260086, "learning_rate": 1.022e-05, "loss": 1.3959, "step": 511 }, { "epoch": 0.19062023711184278, "grad_norm": 0.12032870948314667, "learning_rate": 1.024e-05, "loss": 1.3994, "step": 512 }, { "epoch": 0.19099254226245185, "grad_norm": 0.1170472502708435, "learning_rate": 1.0260000000000002e-05, "loss": 1.3959, "step": 513 }, { "epoch": 0.19136484741306092, "grad_norm": 0.10825354605913162, "learning_rate": 1.0280000000000002e-05, "loss": 1.4031, "step": 514 }, { "epoch": 0.19173715256367, "grad_norm": 0.11196181178092957, "learning_rate": 1.0300000000000001e-05, "loss": 1.4048, "step": 515 }, { "epoch": 0.19210945771427906, "grad_norm": 0.11114434897899628, "learning_rate": 1.0320000000000001e-05, "loss": 1.4064, "step": 516 }, { "epoch": 0.19248176286488813, "grad_norm": 0.11616680771112442, "learning_rate": 1.0340000000000001e-05, "loss": 1.3967, "step": 517 }, { "epoch": 0.1928540680154972, "grad_norm": 0.11657610535621643, "learning_rate": 1.036e-05, "loss": 1.4152, "step": 518 }, { "epoch": 0.19322637316610627, "grad_norm": 0.10928953438997269, "learning_rate": 1.038e-05, "loss": 1.4026, "step": 519 }, { "epoch": 0.19359867831671534, "grad_norm": 0.11354228109121323, "learning_rate": 1.04e-05, "loss": 1.4026, "step": 520 }, { "epoch": 0.19397098346732441, "grad_norm": 0.12005306780338287, "learning_rate": 1.0420000000000002e-05, "loss": 1.4094, "step": 521 }, { "epoch": 0.19434328861793349, "grad_norm": 0.12376662343740463, "learning_rate": 1.0440000000000002e-05, "loss": 1.4076, "step": 522 }, { "epoch": 0.19471559376854256, "grad_norm": 0.1191818043589592, "learning_rate": 1.0460000000000001e-05, "loss": 1.4055, "step": 523 }, { "epoch": 0.1950878989191516, "grad_norm": 0.11545392870903015, "learning_rate": 1.0480000000000001e-05, "loss": 1.3881, "step": 524 }, { "epoch": 0.19546020406976067, "grad_norm": 0.1315152794122696, "learning_rate": 1.0500000000000001e-05, "loss": 1.4119, "step": 525 }, { "epoch": 0.19583250922036974, "grad_norm": 0.11896664649248123, "learning_rate": 1.0520000000000001e-05, "loss": 1.4165, "step": 526 }, { "epoch": 0.1962048143709788, "grad_norm": 0.11113154143095016, "learning_rate": 1.054e-05, "loss": 1.3992, "step": 527 }, { "epoch": 0.19657711952158788, "grad_norm": 0.13099145889282227, "learning_rate": 1.056e-05, "loss": 1.3997, "step": 528 }, { "epoch": 0.19694942467219695, "grad_norm": 0.11786054819822311, "learning_rate": 1.0580000000000002e-05, "loss": 1.3967, "step": 529 }, { "epoch": 0.19732172982280602, "grad_norm": 0.1341516375541687, "learning_rate": 1.0600000000000002e-05, "loss": 1.3993, "step": 530 }, { "epoch": 0.1976940349734151, "grad_norm": 0.12682707607746124, "learning_rate": 1.0620000000000002e-05, "loss": 1.4148, "step": 531 }, { "epoch": 0.19806634012402416, "grad_norm": 0.1188652366399765, "learning_rate": 1.0640000000000001e-05, "loss": 1.3933, "step": 532 }, { "epoch": 0.19843864527463323, "grad_norm": 0.11299144476652145, "learning_rate": 1.0660000000000001e-05, "loss": 1.4004, "step": 533 }, { "epoch": 0.1988109504252423, "grad_norm": 0.12654365599155426, "learning_rate": 1.0680000000000001e-05, "loss": 1.3929, "step": 534 }, { "epoch": 0.19918325557585137, "grad_norm": 0.12072401493787766, "learning_rate": 1.0700000000000001e-05, "loss": 1.3967, "step": 535 }, { "epoch": 0.1995555607264604, "grad_norm": 0.12298454344272614, "learning_rate": 1.072e-05, "loss": 1.4072, "step": 536 }, { "epoch": 0.19992786587706948, "grad_norm": 0.127255380153656, "learning_rate": 1.0740000000000002e-05, "loss": 1.408, "step": 537 }, { "epoch": 0.20030017102767855, "grad_norm": 0.12813320755958557, "learning_rate": 1.0760000000000002e-05, "loss": 1.4025, "step": 538 }, { "epoch": 0.20067247617828762, "grad_norm": 0.12207869440317154, "learning_rate": 1.0780000000000002e-05, "loss": 1.3844, "step": 539 }, { "epoch": 0.2010447813288967, "grad_norm": 0.11978594213724136, "learning_rate": 1.0800000000000002e-05, "loss": 1.3953, "step": 540 }, { "epoch": 0.20141708647950576, "grad_norm": 0.12205009162425995, "learning_rate": 1.0820000000000001e-05, "loss": 1.3904, "step": 541 }, { "epoch": 0.20178939163011483, "grad_norm": 0.11987481266260147, "learning_rate": 1.0840000000000001e-05, "loss": 1.3985, "step": 542 }, { "epoch": 0.2021616967807239, "grad_norm": 0.12077482044696808, "learning_rate": 1.0860000000000001e-05, "loss": 1.4081, "step": 543 }, { "epoch": 0.20253400193133297, "grad_norm": 0.11721371859312057, "learning_rate": 1.0880000000000001e-05, "loss": 1.4001, "step": 544 }, { "epoch": 0.20290630708194204, "grad_norm": 0.11542621999979019, "learning_rate": 1.0900000000000002e-05, "loss": 1.4109, "step": 545 }, { "epoch": 0.20327861223255111, "grad_norm": 0.12170755863189697, "learning_rate": 1.0920000000000002e-05, "loss": 1.3943, "step": 546 }, { "epoch": 0.20365091738316018, "grad_norm": 0.11331729590892792, "learning_rate": 1.0940000000000002e-05, "loss": 1.404, "step": 547 }, { "epoch": 0.20402322253376923, "grad_norm": 0.12473474442958832, "learning_rate": 1.0960000000000002e-05, "loss": 1.399, "step": 548 }, { "epoch": 0.2043955276843783, "grad_norm": 0.12112352252006531, "learning_rate": 1.0980000000000002e-05, "loss": 1.394, "step": 549 }, { "epoch": 0.20476783283498737, "grad_norm": 0.11570420861244202, "learning_rate": 1.1000000000000001e-05, "loss": 1.3961, "step": 550 }, { "epoch": 0.20514013798559644, "grad_norm": 0.12518157064914703, "learning_rate": 1.1020000000000001e-05, "loss": 1.3929, "step": 551 }, { "epoch": 0.2055124431362055, "grad_norm": 0.12412535399198532, "learning_rate": 1.1040000000000001e-05, "loss": 1.3953, "step": 552 }, { "epoch": 0.20588474828681458, "grad_norm": 0.12580837309360504, "learning_rate": 1.1060000000000003e-05, "loss": 1.3887, "step": 553 }, { "epoch": 0.20625705343742365, "grad_norm": 0.12166325002908707, "learning_rate": 1.1080000000000002e-05, "loss": 1.4035, "step": 554 }, { "epoch": 0.20662935858803272, "grad_norm": 0.11380494385957718, "learning_rate": 1.1100000000000002e-05, "loss": 1.3881, "step": 555 }, { "epoch": 0.2070016637386418, "grad_norm": 0.12607711553573608, "learning_rate": 1.1120000000000002e-05, "loss": 1.3911, "step": 556 }, { "epoch": 0.20737396888925086, "grad_norm": 0.1209443137049675, "learning_rate": 1.1140000000000002e-05, "loss": 1.4025, "step": 557 }, { "epoch": 0.20774627403985993, "grad_norm": 0.11875788867473602, "learning_rate": 1.1160000000000002e-05, "loss": 1.382, "step": 558 }, { "epoch": 0.208118579190469, "grad_norm": 0.12261742353439331, "learning_rate": 1.1180000000000001e-05, "loss": 1.39, "step": 559 }, { "epoch": 0.20849088434107807, "grad_norm": 0.13006705045700073, "learning_rate": 1.1200000000000001e-05, "loss": 1.3926, "step": 560 }, { "epoch": 0.2088631894916871, "grad_norm": 0.1237824410200119, "learning_rate": 1.1220000000000003e-05, "loss": 1.3977, "step": 561 }, { "epoch": 0.20923549464229618, "grad_norm": 0.12436344474554062, "learning_rate": 1.1240000000000002e-05, "loss": 1.3924, "step": 562 }, { "epoch": 0.20960779979290525, "grad_norm": 0.12768109142780304, "learning_rate": 1.126e-05, "loss": 1.4062, "step": 563 }, { "epoch": 0.20998010494351432, "grad_norm": 0.12379388511180878, "learning_rate": 1.128e-05, "loss": 1.3951, "step": 564 }, { "epoch": 0.2103524100941234, "grad_norm": 0.12182633578777313, "learning_rate": 1.13e-05, "loss": 1.3969, "step": 565 }, { "epoch": 0.21072471524473246, "grad_norm": 0.12477164715528488, "learning_rate": 1.132e-05, "loss": 1.3954, "step": 566 }, { "epoch": 0.21109702039534153, "grad_norm": 0.12322055548429489, "learning_rate": 1.134e-05, "loss": 1.4005, "step": 567 }, { "epoch": 0.2114693255459506, "grad_norm": 0.12597453594207764, "learning_rate": 1.136e-05, "loss": 1.3974, "step": 568 }, { "epoch": 0.21184163069655967, "grad_norm": 0.13351480662822723, "learning_rate": 1.138e-05, "loss": 1.3914, "step": 569 }, { "epoch": 0.21221393584716874, "grad_norm": 0.13195490837097168, "learning_rate": 1.14e-05, "loss": 1.4162, "step": 570 }, { "epoch": 0.21258624099777781, "grad_norm": 0.12694774568080902, "learning_rate": 1.142e-05, "loss": 1.3844, "step": 571 }, { "epoch": 0.21295854614838688, "grad_norm": 0.12894363701343536, "learning_rate": 1.144e-05, "loss": 1.387, "step": 572 }, { "epoch": 0.21333085129899593, "grad_norm": 0.11793356388807297, "learning_rate": 1.146e-05, "loss": 1.3881, "step": 573 }, { "epoch": 0.213703156449605, "grad_norm": 0.13151799142360687, "learning_rate": 1.148e-05, "loss": 1.3872, "step": 574 }, { "epoch": 0.21407546160021407, "grad_norm": 0.1286250352859497, "learning_rate": 1.15e-05, "loss": 1.3668, "step": 575 }, { "epoch": 0.21444776675082314, "grad_norm": 0.13224467635154724, "learning_rate": 1.152e-05, "loss": 1.4045, "step": 576 }, { "epoch": 0.2148200719014322, "grad_norm": 0.12700480222702026, "learning_rate": 1.154e-05, "loss": 1.3814, "step": 577 }, { "epoch": 0.21519237705204128, "grad_norm": 0.13367070257663727, "learning_rate": 1.156e-05, "loss": 1.3868, "step": 578 }, { "epoch": 0.21556468220265035, "grad_norm": 0.12190359085798264, "learning_rate": 1.1580000000000001e-05, "loss": 1.3981, "step": 579 }, { "epoch": 0.21593698735325942, "grad_norm": 0.1240309402346611, "learning_rate": 1.16e-05, "loss": 1.3945, "step": 580 }, { "epoch": 0.2163092925038685, "grad_norm": 0.15947964787483215, "learning_rate": 1.162e-05, "loss": 1.3889, "step": 581 }, { "epoch": 0.21668159765447756, "grad_norm": 0.130709707736969, "learning_rate": 1.164e-05, "loss": 1.3987, "step": 582 }, { "epoch": 0.21705390280508663, "grad_norm": 0.12745317816734314, "learning_rate": 1.166e-05, "loss": 1.3802, "step": 583 }, { "epoch": 0.2174262079556957, "grad_norm": 0.12209773063659668, "learning_rate": 1.168e-05, "loss": 1.3798, "step": 584 }, { "epoch": 0.21779851310630474, "grad_norm": 0.14180858433246613, "learning_rate": 1.17e-05, "loss": 1.3814, "step": 585 }, { "epoch": 0.2181708182569138, "grad_norm": 0.11788824945688248, "learning_rate": 1.172e-05, "loss": 1.3758, "step": 586 }, { "epoch": 0.21854312340752288, "grad_norm": 0.12975317239761353, "learning_rate": 1.1740000000000001e-05, "loss": 1.395, "step": 587 }, { "epoch": 0.21891542855813195, "grad_norm": 0.12418822199106216, "learning_rate": 1.1760000000000001e-05, "loss": 1.4044, "step": 588 }, { "epoch": 0.21928773370874102, "grad_norm": 0.12132619321346283, "learning_rate": 1.178e-05, "loss": 1.3866, "step": 589 }, { "epoch": 0.2196600388593501, "grad_norm": 0.1325748711824417, "learning_rate": 1.18e-05, "loss": 1.3903, "step": 590 }, { "epoch": 0.22003234400995916, "grad_norm": 0.1266520470380783, "learning_rate": 1.182e-05, "loss": 1.3894, "step": 591 }, { "epoch": 0.22040464916056823, "grad_norm": 0.12683559954166412, "learning_rate": 1.184e-05, "loss": 1.3948, "step": 592 }, { "epoch": 0.2207769543111773, "grad_norm": 0.12880323827266693, "learning_rate": 1.186e-05, "loss": 1.3756, "step": 593 }, { "epoch": 0.22114925946178637, "grad_norm": 0.12198843061923981, "learning_rate": 1.188e-05, "loss": 1.3821, "step": 594 }, { "epoch": 0.22152156461239544, "grad_norm": 0.13132554292678833, "learning_rate": 1.1900000000000001e-05, "loss": 1.3877, "step": 595 }, { "epoch": 0.22189386976300451, "grad_norm": 0.1291092336177826, "learning_rate": 1.1920000000000001e-05, "loss": 1.4057, "step": 596 }, { "epoch": 0.22226617491361356, "grad_norm": 0.13351795077323914, "learning_rate": 1.1940000000000001e-05, "loss": 1.3652, "step": 597 }, { "epoch": 0.22263848006422263, "grad_norm": 0.12667742371559143, "learning_rate": 1.196e-05, "loss": 1.386, "step": 598 }, { "epoch": 0.2230107852148317, "grad_norm": 0.13008779287338257, "learning_rate": 1.198e-05, "loss": 1.3887, "step": 599 }, { "epoch": 0.22338309036544077, "grad_norm": 0.12079621851444244, "learning_rate": 1.2e-05, "loss": 1.3855, "step": 600 }, { "epoch": 0.22375539551604984, "grad_norm": 0.13083826005458832, "learning_rate": 1.202e-05, "loss": 1.3819, "step": 601 }, { "epoch": 0.2241277006666589, "grad_norm": 0.13189451396465302, "learning_rate": 1.204e-05, "loss": 1.3667, "step": 602 }, { "epoch": 0.22450000581726798, "grad_norm": 0.1339392513036728, "learning_rate": 1.2060000000000001e-05, "loss": 1.3802, "step": 603 }, { "epoch": 0.22487231096787705, "grad_norm": 0.12915685772895813, "learning_rate": 1.2080000000000001e-05, "loss": 1.3871, "step": 604 }, { "epoch": 0.22524461611848612, "grad_norm": 0.13761506974697113, "learning_rate": 1.2100000000000001e-05, "loss": 1.392, "step": 605 }, { "epoch": 0.2256169212690952, "grad_norm": 0.1275719255208969, "learning_rate": 1.2120000000000001e-05, "loss": 1.3715, "step": 606 }, { "epoch": 0.22598922641970426, "grad_norm": 0.16112855076789856, "learning_rate": 1.214e-05, "loss": 1.3759, "step": 607 }, { "epoch": 0.22636153157031333, "grad_norm": 0.13108204305171967, "learning_rate": 1.216e-05, "loss": 1.375, "step": 608 }, { "epoch": 0.2267338367209224, "grad_norm": 0.13481353223323822, "learning_rate": 1.218e-05, "loss": 1.3704, "step": 609 }, { "epoch": 0.22710614187153144, "grad_norm": 0.144792377948761, "learning_rate": 1.22e-05, "loss": 1.3865, "step": 610 }, { "epoch": 0.2274784470221405, "grad_norm": 0.13363252580165863, "learning_rate": 1.2220000000000002e-05, "loss": 1.3826, "step": 611 }, { "epoch": 0.22785075217274958, "grad_norm": 0.14733262360095978, "learning_rate": 1.2240000000000001e-05, "loss": 1.3875, "step": 612 }, { "epoch": 0.22822305732335865, "grad_norm": 0.13364124298095703, "learning_rate": 1.2260000000000001e-05, "loss": 1.3885, "step": 613 }, { "epoch": 0.22859536247396772, "grad_norm": 0.13215725123882294, "learning_rate": 1.2280000000000001e-05, "loss": 1.3644, "step": 614 }, { "epoch": 0.2289676676245768, "grad_norm": 0.14032886922359467, "learning_rate": 1.23e-05, "loss": 1.3931, "step": 615 }, { "epoch": 0.22933997277518586, "grad_norm": 0.15233881771564484, "learning_rate": 1.232e-05, "loss": 1.3771, "step": 616 }, { "epoch": 0.22971227792579493, "grad_norm": 0.14122669398784637, "learning_rate": 1.234e-05, "loss": 1.3774, "step": 617 }, { "epoch": 0.230084583076404, "grad_norm": 0.13818544149398804, "learning_rate": 1.236e-05, "loss": 1.374, "step": 618 }, { "epoch": 0.23045688822701307, "grad_norm": 0.14192582666873932, "learning_rate": 1.2380000000000002e-05, "loss": 1.3773, "step": 619 }, { "epoch": 0.23082919337762214, "grad_norm": 0.1523938775062561, "learning_rate": 1.2400000000000002e-05, "loss": 1.3832, "step": 620 }, { "epoch": 0.23120149852823121, "grad_norm": 0.1431209146976471, "learning_rate": 1.2420000000000001e-05, "loss": 1.3765, "step": 621 }, { "epoch": 0.23157380367884026, "grad_norm": 0.13690458238124847, "learning_rate": 1.2440000000000001e-05, "loss": 1.3791, "step": 622 }, { "epoch": 0.23194610882944933, "grad_norm": 0.14077654480934143, "learning_rate": 1.2460000000000001e-05, "loss": 1.3785, "step": 623 }, { "epoch": 0.2323184139800584, "grad_norm": 0.15121321380138397, "learning_rate": 1.248e-05, "loss": 1.3945, "step": 624 }, { "epoch": 0.23269071913066747, "grad_norm": 0.1371421217918396, "learning_rate": 1.25e-05, "loss": 1.376, "step": 625 }, { "epoch": 0.23306302428127654, "grad_norm": 0.14253154397010803, "learning_rate": 1.252e-05, "loss": 1.3823, "step": 626 }, { "epoch": 0.2334353294318856, "grad_norm": 0.1420959234237671, "learning_rate": 1.254e-05, "loss": 1.3702, "step": 627 }, { "epoch": 0.23380763458249468, "grad_norm": 0.15104436874389648, "learning_rate": 1.2560000000000002e-05, "loss": 1.385, "step": 628 }, { "epoch": 0.23417993973310375, "grad_norm": 0.1486346274614334, "learning_rate": 1.2580000000000002e-05, "loss": 1.3792, "step": 629 }, { "epoch": 0.23455224488371282, "grad_norm": 0.16748331487178802, "learning_rate": 1.2600000000000001e-05, "loss": 1.3916, "step": 630 }, { "epoch": 0.2349245500343219, "grad_norm": 0.14780151844024658, "learning_rate": 1.2620000000000001e-05, "loss": 1.3827, "step": 631 }, { "epoch": 0.23529685518493096, "grad_norm": 0.152265265583992, "learning_rate": 1.2640000000000001e-05, "loss": 1.3812, "step": 632 }, { "epoch": 0.23566916033554003, "grad_norm": 0.1454855501651764, "learning_rate": 1.266e-05, "loss": 1.3907, "step": 633 }, { "epoch": 0.23604146548614907, "grad_norm": 0.14732711017131805, "learning_rate": 1.268e-05, "loss": 1.3651, "step": 634 }, { "epoch": 0.23641377063675814, "grad_norm": 0.1476028710603714, "learning_rate": 1.27e-05, "loss": 1.3723, "step": 635 }, { "epoch": 0.2367860757873672, "grad_norm": 0.14366750419139862, "learning_rate": 1.2720000000000002e-05, "loss": 1.3904, "step": 636 }, { "epoch": 0.23715838093797628, "grad_norm": 0.15820929408073425, "learning_rate": 1.2740000000000002e-05, "loss": 1.3852, "step": 637 }, { "epoch": 0.23753068608858535, "grad_norm": 0.1551392823457718, "learning_rate": 1.2760000000000001e-05, "loss": 1.389, "step": 638 }, { "epoch": 0.23790299123919442, "grad_norm": 0.13778699934482574, "learning_rate": 1.2780000000000001e-05, "loss": 1.3627, "step": 639 }, { "epoch": 0.2382752963898035, "grad_norm": 0.13657543063163757, "learning_rate": 1.2800000000000001e-05, "loss": 1.3941, "step": 640 }, { "epoch": 0.23864760154041256, "grad_norm": 0.13666628301143646, "learning_rate": 1.2820000000000001e-05, "loss": 1.3729, "step": 641 }, { "epoch": 0.23901990669102163, "grad_norm": 0.14898766577243805, "learning_rate": 1.284e-05, "loss": 1.3823, "step": 642 }, { "epoch": 0.2393922118416307, "grad_norm": 0.1644555926322937, "learning_rate": 1.286e-05, "loss": 1.3794, "step": 643 }, { "epoch": 0.23976451699223977, "grad_norm": 0.1226273775100708, "learning_rate": 1.2880000000000002e-05, "loss": 1.3631, "step": 644 }, { "epoch": 0.24013682214284884, "grad_norm": 0.16430190205574036, "learning_rate": 1.2900000000000002e-05, "loss": 1.3936, "step": 645 }, { "epoch": 0.2405091272934579, "grad_norm": 0.15783290565013885, "learning_rate": 1.2920000000000002e-05, "loss": 1.3951, "step": 646 }, { "epoch": 0.24088143244406696, "grad_norm": 0.14612354338169098, "learning_rate": 1.2940000000000001e-05, "loss": 1.3701, "step": 647 }, { "epoch": 0.24125373759467603, "grad_norm": 0.15059353411197662, "learning_rate": 1.2960000000000001e-05, "loss": 1.3649, "step": 648 }, { "epoch": 0.2416260427452851, "grad_norm": 0.15658129751682281, "learning_rate": 1.2980000000000001e-05, "loss": 1.3793, "step": 649 }, { "epoch": 0.24199834789589417, "grad_norm": 0.14705069363117218, "learning_rate": 1.3000000000000001e-05, "loss": 1.3816, "step": 650 }, { "epoch": 0.24237065304650324, "grad_norm": 0.13878506422042847, "learning_rate": 1.302e-05, "loss": 1.3881, "step": 651 }, { "epoch": 0.2427429581971123, "grad_norm": 0.1551000475883484, "learning_rate": 1.3040000000000002e-05, "loss": 1.3759, "step": 652 }, { "epoch": 0.24311526334772138, "grad_norm": 0.15696851909160614, "learning_rate": 1.3060000000000002e-05, "loss": 1.3744, "step": 653 }, { "epoch": 0.24348756849833045, "grad_norm": 0.13662178814411163, "learning_rate": 1.3080000000000002e-05, "loss": 1.3699, "step": 654 }, { "epoch": 0.24385987364893952, "grad_norm": 0.14424225687980652, "learning_rate": 1.3100000000000002e-05, "loss": 1.371, "step": 655 }, { "epoch": 0.2442321787995486, "grad_norm": 0.17988137900829315, "learning_rate": 1.3120000000000001e-05, "loss": 1.3955, "step": 656 }, { "epoch": 0.24460448395015766, "grad_norm": 0.1564609706401825, "learning_rate": 1.3140000000000001e-05, "loss": 1.3792, "step": 657 }, { "epoch": 0.24497678910076673, "grad_norm": 0.1378934532403946, "learning_rate": 1.3160000000000001e-05, "loss": 1.3501, "step": 658 }, { "epoch": 0.24534909425137577, "grad_norm": 0.14894217252731323, "learning_rate": 1.3180000000000001e-05, "loss": 1.3716, "step": 659 }, { "epoch": 0.24572139940198484, "grad_norm": 0.1473313868045807, "learning_rate": 1.3200000000000002e-05, "loss": 1.3666, "step": 660 }, { "epoch": 0.2460937045525939, "grad_norm": 0.14587174355983734, "learning_rate": 1.3220000000000002e-05, "loss": 1.3887, "step": 661 }, { "epoch": 0.24646600970320298, "grad_norm": 0.14664749801158905, "learning_rate": 1.3240000000000002e-05, "loss": 1.3727, "step": 662 }, { "epoch": 0.24683831485381205, "grad_norm": 0.1472824066877365, "learning_rate": 1.3260000000000002e-05, "loss": 1.3676, "step": 663 }, { "epoch": 0.24721062000442112, "grad_norm": 0.1401338130235672, "learning_rate": 1.3280000000000002e-05, "loss": 1.3873, "step": 664 }, { "epoch": 0.2475829251550302, "grad_norm": 0.13379812240600586, "learning_rate": 1.3300000000000001e-05, "loss": 1.3625, "step": 665 }, { "epoch": 0.24795523030563926, "grad_norm": 0.14346960186958313, "learning_rate": 1.3320000000000001e-05, "loss": 1.359, "step": 666 }, { "epoch": 0.24832753545624833, "grad_norm": 0.14162355661392212, "learning_rate": 1.3340000000000001e-05, "loss": 1.3702, "step": 667 }, { "epoch": 0.2486998406068574, "grad_norm": 0.15658289194107056, "learning_rate": 1.3360000000000003e-05, "loss": 1.3698, "step": 668 }, { "epoch": 0.24907214575746647, "grad_norm": 0.13803306221961975, "learning_rate": 1.3380000000000002e-05, "loss": 1.3588, "step": 669 }, { "epoch": 0.24944445090807554, "grad_norm": 0.14932705461978912, "learning_rate": 1.3400000000000002e-05, "loss": 1.3748, "step": 670 }, { "epoch": 0.24981675605868459, "grad_norm": 0.3856603503227234, "learning_rate": 1.3420000000000002e-05, "loss": 1.3876, "step": 671 }, { "epoch": 0.2501890612092937, "grad_norm": 0.14300397038459778, "learning_rate": 1.3440000000000002e-05, "loss": 1.36, "step": 672 }, { "epoch": 0.2505613663599027, "grad_norm": 0.15117305517196655, "learning_rate": 1.3460000000000002e-05, "loss": 1.3943, "step": 673 }, { "epoch": 0.2509336715105118, "grad_norm": 0.15544290840625763, "learning_rate": 1.3480000000000001e-05, "loss": 1.3826, "step": 674 }, { "epoch": 0.25130597666112087, "grad_norm": 0.14352373778820038, "learning_rate": 1.3500000000000001e-05, "loss": 1.3577, "step": 675 }, { "epoch": 0.25167828181172996, "grad_norm": 0.14757125079631805, "learning_rate": 1.3520000000000003e-05, "loss": 1.3706, "step": 676 }, { "epoch": 0.252050586962339, "grad_norm": 0.14953893423080444, "learning_rate": 1.3540000000000003e-05, "loss": 1.3656, "step": 677 }, { "epoch": 0.25242289211294805, "grad_norm": 0.15957669913768768, "learning_rate": 1.3560000000000002e-05, "loss": 1.3569, "step": 678 }, { "epoch": 0.25279519726355715, "grad_norm": 0.15265138447284698, "learning_rate": 1.3580000000000002e-05, "loss": 1.3636, "step": 679 }, { "epoch": 0.2531675024141662, "grad_norm": 0.1513211727142334, "learning_rate": 1.3600000000000002e-05, "loss": 1.3717, "step": 680 }, { "epoch": 0.2535398075647753, "grad_norm": 0.15139970183372498, "learning_rate": 1.3620000000000002e-05, "loss": 1.3776, "step": 681 }, { "epoch": 0.25391211271538433, "grad_norm": 0.15319296717643738, "learning_rate": 1.3640000000000002e-05, "loss": 1.3674, "step": 682 }, { "epoch": 0.25428441786599343, "grad_norm": 0.14396759867668152, "learning_rate": 1.3660000000000001e-05, "loss": 1.3739, "step": 683 }, { "epoch": 0.25465672301660247, "grad_norm": 0.14723047614097595, "learning_rate": 1.3680000000000003e-05, "loss": 1.3777, "step": 684 }, { "epoch": 0.25502902816721157, "grad_norm": 0.13962876796722412, "learning_rate": 1.3700000000000003e-05, "loss": 1.3676, "step": 685 }, { "epoch": 0.2554013333178206, "grad_norm": 0.14297540485858917, "learning_rate": 1.3720000000000002e-05, "loss": 1.3725, "step": 686 }, { "epoch": 0.2557736384684297, "grad_norm": 0.14365188777446747, "learning_rate": 1.3740000000000002e-05, "loss": 1.3547, "step": 687 }, { "epoch": 0.25614594361903875, "grad_norm": 0.14428015053272247, "learning_rate": 1.376e-05, "loss": 1.3883, "step": 688 }, { "epoch": 0.25651824876964785, "grad_norm": 0.14234168827533722, "learning_rate": 1.378e-05, "loss": 1.345, "step": 689 }, { "epoch": 0.2568905539202569, "grad_norm": 0.14778351783752441, "learning_rate": 1.38e-05, "loss": 1.3773, "step": 690 }, { "epoch": 0.25726285907086593, "grad_norm": 0.14804288744926453, "learning_rate": 1.382e-05, "loss": 1.3877, "step": 691 }, { "epoch": 0.25763516422147503, "grad_norm": 0.1505001187324524, "learning_rate": 1.384e-05, "loss": 1.3638, "step": 692 }, { "epoch": 0.2580074693720841, "grad_norm": 0.1481354534626007, "learning_rate": 1.386e-05, "loss": 1.3624, "step": 693 }, { "epoch": 0.2583797745226932, "grad_norm": 0.1433618813753128, "learning_rate": 1.3880000000000001e-05, "loss": 1.3531, "step": 694 }, { "epoch": 0.2587520796733022, "grad_norm": 0.14664116501808167, "learning_rate": 1.39e-05, "loss": 1.3705, "step": 695 }, { "epoch": 0.2591243848239113, "grad_norm": 0.13810987770557404, "learning_rate": 1.392e-05, "loss": 1.3528, "step": 696 }, { "epoch": 0.25949668997452036, "grad_norm": 0.14705303311347961, "learning_rate": 1.394e-05, "loss": 1.3598, "step": 697 }, { "epoch": 0.25986899512512945, "grad_norm": 0.14440058171749115, "learning_rate": 1.396e-05, "loss": 1.3584, "step": 698 }, { "epoch": 0.2602413002757385, "grad_norm": 0.14364704489707947, "learning_rate": 1.398e-05, "loss": 1.3734, "step": 699 }, { "epoch": 0.2606136054263476, "grad_norm": 0.1593163013458252, "learning_rate": 1.4e-05, "loss": 1.3782, "step": 700 }, { "epoch": 0.26098591057695664, "grad_norm": 0.15553325414657593, "learning_rate": 1.402e-05, "loss": 1.3768, "step": 701 }, { "epoch": 0.2613582157275657, "grad_norm": 0.15635879337787628, "learning_rate": 1.4040000000000001e-05, "loss": 1.3541, "step": 702 }, { "epoch": 0.2617305208781748, "grad_norm": 0.15832242369651794, "learning_rate": 1.4060000000000001e-05, "loss": 1.3801, "step": 703 }, { "epoch": 0.2621028260287838, "grad_norm": 0.145589679479599, "learning_rate": 1.408e-05, "loss": 1.3701, "step": 704 }, { "epoch": 0.2624751311793929, "grad_norm": 0.1448379009962082, "learning_rate": 1.41e-05, "loss": 1.3817, "step": 705 }, { "epoch": 0.26284743633000196, "grad_norm": 0.15079927444458008, "learning_rate": 1.412e-05, "loss": 1.3642, "step": 706 }, { "epoch": 0.26321974148061106, "grad_norm": 0.15496708452701569, "learning_rate": 1.414e-05, "loss": 1.383, "step": 707 }, { "epoch": 0.2635920466312201, "grad_norm": 0.14300493896007538, "learning_rate": 1.416e-05, "loss": 1.3664, "step": 708 }, { "epoch": 0.2639643517818292, "grad_norm": 0.15340474247932434, "learning_rate": 1.418e-05, "loss": 1.371, "step": 709 }, { "epoch": 0.26433665693243824, "grad_norm": 0.14752480387687683, "learning_rate": 1.4200000000000001e-05, "loss": 1.3754, "step": 710 }, { "epoch": 0.26470896208304734, "grad_norm": 0.15268085896968842, "learning_rate": 1.4220000000000001e-05, "loss": 1.3604, "step": 711 }, { "epoch": 0.2650812672336564, "grad_norm": 0.15046533942222595, "learning_rate": 1.4240000000000001e-05, "loss": 1.3566, "step": 712 }, { "epoch": 0.2654535723842655, "grad_norm": 0.14135941863059998, "learning_rate": 1.426e-05, "loss": 1.3729, "step": 713 }, { "epoch": 0.2658258775348745, "grad_norm": 0.14512401819229126, "learning_rate": 1.428e-05, "loss": 1.3521, "step": 714 }, { "epoch": 0.26619818268548356, "grad_norm": 0.15231768786907196, "learning_rate": 1.43e-05, "loss": 1.3648, "step": 715 }, { "epoch": 0.26657048783609266, "grad_norm": 0.14620406925678253, "learning_rate": 1.432e-05, "loss": 1.3534, "step": 716 }, { "epoch": 0.2669427929867017, "grad_norm": 0.15307819843292236, "learning_rate": 1.434e-05, "loss": 1.3661, "step": 717 }, { "epoch": 0.2673150981373108, "grad_norm": 0.15813127160072327, "learning_rate": 1.4360000000000001e-05, "loss": 1.3647, "step": 718 }, { "epoch": 0.26768740328791985, "grad_norm": 0.15037232637405396, "learning_rate": 1.4380000000000001e-05, "loss": 1.3595, "step": 719 }, { "epoch": 0.26805970843852894, "grad_norm": 0.1518103927373886, "learning_rate": 1.4400000000000001e-05, "loss": 1.356, "step": 720 }, { "epoch": 0.268432013589138, "grad_norm": 0.15428012609481812, "learning_rate": 1.4420000000000001e-05, "loss": 1.3636, "step": 721 }, { "epoch": 0.2688043187397471, "grad_norm": 0.15426598489284515, "learning_rate": 1.444e-05, "loss": 1.355, "step": 722 }, { "epoch": 0.2691766238903561, "grad_norm": 0.15186642110347748, "learning_rate": 1.446e-05, "loss": 1.3409, "step": 723 }, { "epoch": 0.2695489290409652, "grad_norm": 0.15585435926914215, "learning_rate": 1.448e-05, "loss": 1.3503, "step": 724 }, { "epoch": 0.26992123419157427, "grad_norm": 0.1601250320672989, "learning_rate": 1.45e-05, "loss": 1.3641, "step": 725 }, { "epoch": 0.27029353934218336, "grad_norm": 0.15690714120864868, "learning_rate": 1.4520000000000002e-05, "loss": 1.3681, "step": 726 }, { "epoch": 0.2706658444927924, "grad_norm": 0.1501239687204361, "learning_rate": 1.4540000000000001e-05, "loss": 1.361, "step": 727 }, { "epoch": 0.27103814964340145, "grad_norm": 0.15437743067741394, "learning_rate": 1.4560000000000001e-05, "loss": 1.3699, "step": 728 }, { "epoch": 0.27141045479401055, "grad_norm": 0.14239777624607086, "learning_rate": 1.4580000000000001e-05, "loss": 1.352, "step": 729 }, { "epoch": 0.2717827599446196, "grad_norm": 0.15204055607318878, "learning_rate": 1.46e-05, "loss": 1.3609, "step": 730 }, { "epoch": 0.2721550650952287, "grad_norm": 0.14352013170719147, "learning_rate": 1.462e-05, "loss": 1.3316, "step": 731 }, { "epoch": 0.27252737024583773, "grad_norm": 0.1465366929769516, "learning_rate": 1.464e-05, "loss": 1.3621, "step": 732 }, { "epoch": 0.27289967539644683, "grad_norm": 0.1439734250307083, "learning_rate": 1.466e-05, "loss": 1.3471, "step": 733 }, { "epoch": 0.27327198054705587, "grad_norm": 0.16148164868354797, "learning_rate": 1.4680000000000002e-05, "loss": 1.3531, "step": 734 }, { "epoch": 0.27364428569766497, "grad_norm": 0.15046028792858124, "learning_rate": 1.4700000000000002e-05, "loss": 1.357, "step": 735 }, { "epoch": 0.274016590848274, "grad_norm": 0.1497603803873062, "learning_rate": 1.4720000000000001e-05, "loss": 1.3609, "step": 736 }, { "epoch": 0.2743888959988831, "grad_norm": 0.1596868485212326, "learning_rate": 1.4740000000000001e-05, "loss": 1.3569, "step": 737 }, { "epoch": 0.27476120114949215, "grad_norm": 0.1531444936990738, "learning_rate": 1.4760000000000001e-05, "loss": 1.3436, "step": 738 }, { "epoch": 0.2751335063001012, "grad_norm": 0.14884676039218903, "learning_rate": 1.478e-05, "loss": 1.3583, "step": 739 }, { "epoch": 0.2755058114507103, "grad_norm": 0.1622639298439026, "learning_rate": 1.48e-05, "loss": 1.3598, "step": 740 }, { "epoch": 0.27587811660131933, "grad_norm": 0.16207291185855865, "learning_rate": 1.482e-05, "loss": 1.3771, "step": 741 }, { "epoch": 0.27625042175192843, "grad_norm": 0.16403760015964508, "learning_rate": 1.4840000000000002e-05, "loss": 1.3618, "step": 742 }, { "epoch": 0.2766227269025375, "grad_norm": 0.1647598147392273, "learning_rate": 1.4860000000000002e-05, "loss": 1.3542, "step": 743 }, { "epoch": 0.2769950320531466, "grad_norm": 0.1558753401041031, "learning_rate": 1.4880000000000002e-05, "loss": 1.3527, "step": 744 }, { "epoch": 0.2773673372037556, "grad_norm": 0.1733637899160385, "learning_rate": 1.4900000000000001e-05, "loss": 1.3538, "step": 745 }, { "epoch": 0.2777396423543647, "grad_norm": 0.15772801637649536, "learning_rate": 1.4920000000000001e-05, "loss": 1.3492, "step": 746 }, { "epoch": 0.27811194750497376, "grad_norm": 0.15663520991802216, "learning_rate": 1.4940000000000001e-05, "loss": 1.3565, "step": 747 }, { "epoch": 0.27848425265558285, "grad_norm": 0.15726646780967712, "learning_rate": 1.496e-05, "loss": 1.3591, "step": 748 }, { "epoch": 0.2788565578061919, "grad_norm": 0.17913725972175598, "learning_rate": 1.498e-05, "loss": 1.3504, "step": 749 }, { "epoch": 0.279228862956801, "grad_norm": 0.15114524960517883, "learning_rate": 1.5000000000000002e-05, "loss": 1.3499, "step": 750 }, { "epoch": 0.27960116810741004, "grad_norm": 0.15915299952030182, "learning_rate": 1.5020000000000002e-05, "loss": 1.3364, "step": 751 }, { "epoch": 0.2799734732580191, "grad_norm": 0.15501591563224792, "learning_rate": 1.5040000000000002e-05, "loss": 1.3473, "step": 752 }, { "epoch": 0.2803457784086282, "grad_norm": 0.1513669490814209, "learning_rate": 1.5060000000000001e-05, "loss": 1.3547, "step": 753 }, { "epoch": 0.2807180835592372, "grad_norm": 0.16978231072425842, "learning_rate": 1.5080000000000001e-05, "loss": 1.3447, "step": 754 }, { "epoch": 0.2810903887098463, "grad_norm": 0.16618570685386658, "learning_rate": 1.5100000000000001e-05, "loss": 1.3493, "step": 755 }, { "epoch": 0.28146269386045536, "grad_norm": 0.15305636823177338, "learning_rate": 1.5120000000000001e-05, "loss": 1.3675, "step": 756 }, { "epoch": 0.28183499901106446, "grad_norm": 0.16489024460315704, "learning_rate": 1.514e-05, "loss": 1.3465, "step": 757 }, { "epoch": 0.2822073041616735, "grad_norm": 0.1630878895521164, "learning_rate": 1.516e-05, "loss": 1.3551, "step": 758 }, { "epoch": 0.2825796093122826, "grad_norm": 0.15557344257831573, "learning_rate": 1.5180000000000002e-05, "loss": 1.3452, "step": 759 }, { "epoch": 0.28295191446289164, "grad_norm": 0.16549739241600037, "learning_rate": 1.5200000000000002e-05, "loss": 1.3515, "step": 760 }, { "epoch": 0.28332421961350074, "grad_norm": 0.1618327498435974, "learning_rate": 1.5220000000000002e-05, "loss": 1.3422, "step": 761 }, { "epoch": 0.2836965247641098, "grad_norm": 0.16406500339508057, "learning_rate": 1.5240000000000001e-05, "loss": 1.354, "step": 762 }, { "epoch": 0.2840688299147189, "grad_norm": 0.1552378088235855, "learning_rate": 1.5260000000000003e-05, "loss": 1.3644, "step": 763 }, { "epoch": 0.2844411350653279, "grad_norm": 0.16994218528270721, "learning_rate": 1.5280000000000003e-05, "loss": 1.3639, "step": 764 }, { "epoch": 0.28481344021593696, "grad_norm": 0.15901988744735718, "learning_rate": 1.5300000000000003e-05, "loss": 1.3657, "step": 765 }, { "epoch": 0.28518574536654606, "grad_norm": 0.164791077375412, "learning_rate": 1.5320000000000002e-05, "loss": 1.3518, "step": 766 }, { "epoch": 0.2855580505171551, "grad_norm": 0.17213784158229828, "learning_rate": 1.5340000000000002e-05, "loss": 1.3479, "step": 767 }, { "epoch": 0.2859303556677642, "grad_norm": 0.15734751522541046, "learning_rate": 1.5360000000000002e-05, "loss": 1.3589, "step": 768 }, { "epoch": 0.28630266081837324, "grad_norm": 0.17351983487606049, "learning_rate": 1.5380000000000002e-05, "loss": 1.3469, "step": 769 }, { "epoch": 0.28667496596898234, "grad_norm": 0.1618126779794693, "learning_rate": 1.54e-05, "loss": 1.3508, "step": 770 }, { "epoch": 0.2870472711195914, "grad_norm": 0.15765798091888428, "learning_rate": 1.542e-05, "loss": 1.3704, "step": 771 }, { "epoch": 0.2874195762702005, "grad_norm": 0.17241904139518738, "learning_rate": 1.544e-05, "loss": 1.3537, "step": 772 }, { "epoch": 0.2877918814208095, "grad_norm": 0.15991517901420593, "learning_rate": 1.546e-05, "loss": 1.3434, "step": 773 }, { "epoch": 0.2881641865714186, "grad_norm": 0.18322332203388214, "learning_rate": 1.548e-05, "loss": 1.3534, "step": 774 }, { "epoch": 0.28853649172202767, "grad_norm": 0.1581200808286667, "learning_rate": 1.55e-05, "loss": 1.357, "step": 775 }, { "epoch": 0.2889087968726367, "grad_norm": 0.16509777307510376, "learning_rate": 1.552e-05, "loss": 1.3659, "step": 776 }, { "epoch": 0.2892811020232458, "grad_norm": 0.1753901094198227, "learning_rate": 1.554e-05, "loss": 1.3456, "step": 777 }, { "epoch": 0.28965340717385485, "grad_norm": 0.16197755932807922, "learning_rate": 1.556e-05, "loss": 1.3508, "step": 778 }, { "epoch": 0.29002571232446395, "grad_norm": 0.16059085726737976, "learning_rate": 1.5580000000000003e-05, "loss": 1.343, "step": 779 }, { "epoch": 0.290398017475073, "grad_norm": 0.170423224568367, "learning_rate": 1.5600000000000003e-05, "loss": 1.3545, "step": 780 }, { "epoch": 0.2907703226256821, "grad_norm": 0.1572631150484085, "learning_rate": 1.5620000000000003e-05, "loss": 1.3468, "step": 781 }, { "epoch": 0.29114262777629113, "grad_norm": 0.16300170123577118, "learning_rate": 1.5640000000000003e-05, "loss": 1.3604, "step": 782 }, { "epoch": 0.2915149329269002, "grad_norm": 0.1643751561641693, "learning_rate": 1.5660000000000003e-05, "loss": 1.3446, "step": 783 }, { "epoch": 0.29188723807750927, "grad_norm": 0.16648028790950775, "learning_rate": 1.5680000000000002e-05, "loss": 1.3365, "step": 784 }, { "epoch": 0.29225954322811837, "grad_norm": 0.1662505716085434, "learning_rate": 1.5700000000000002e-05, "loss": 1.3469, "step": 785 }, { "epoch": 0.2926318483787274, "grad_norm": 0.16111180186271667, "learning_rate": 1.5720000000000002e-05, "loss": 1.3461, "step": 786 }, { "epoch": 0.2930041535293365, "grad_norm": 0.16680561006069183, "learning_rate": 1.5740000000000002e-05, "loss": 1.3457, "step": 787 }, { "epoch": 0.29337645867994555, "grad_norm": 0.18159611523151398, "learning_rate": 1.576e-05, "loss": 1.3601, "step": 788 }, { "epoch": 0.2937487638305546, "grad_norm": 0.1730850487947464, "learning_rate": 1.578e-05, "loss": 1.3629, "step": 789 }, { "epoch": 0.2941210689811637, "grad_norm": 0.17999385297298431, "learning_rate": 1.58e-05, "loss": 1.3558, "step": 790 }, { "epoch": 0.29449337413177273, "grad_norm": 0.16653069853782654, "learning_rate": 1.582e-05, "loss": 1.3508, "step": 791 }, { "epoch": 0.29486567928238183, "grad_norm": 0.16509103775024414, "learning_rate": 1.584e-05, "loss": 1.3411, "step": 792 }, { "epoch": 0.2952379844329909, "grad_norm": 0.17233438789844513, "learning_rate": 1.586e-05, "loss": 1.3429, "step": 793 }, { "epoch": 0.29561028958359997, "grad_norm": 0.16929791867733002, "learning_rate": 1.588e-05, "loss": 1.3628, "step": 794 }, { "epoch": 0.295982594734209, "grad_norm": 0.16331572830677032, "learning_rate": 1.5900000000000004e-05, "loss": 1.346, "step": 795 }, { "epoch": 0.2963548998848181, "grad_norm": 0.1649729311466217, "learning_rate": 1.5920000000000003e-05, "loss": 1.35, "step": 796 }, { "epoch": 0.29672720503542716, "grad_norm": 0.16012988984584808, "learning_rate": 1.5940000000000003e-05, "loss": 1.3463, "step": 797 }, { "epoch": 0.29709951018603625, "grad_norm": 0.16761143505573273, "learning_rate": 1.5960000000000003e-05, "loss": 1.3485, "step": 798 }, { "epoch": 0.2974718153366453, "grad_norm": 0.17172127962112427, "learning_rate": 1.5980000000000003e-05, "loss": 1.351, "step": 799 }, { "epoch": 0.2978441204872544, "grad_norm": 0.16175536811351776, "learning_rate": 1.6000000000000003e-05, "loss": 1.3546, "step": 800 }, { "epoch": 0.29821642563786344, "grad_norm": 0.17179609835147858, "learning_rate": 1.6020000000000002e-05, "loss": 1.3506, "step": 801 }, { "epoch": 0.2985887307884725, "grad_norm": 0.1593533307313919, "learning_rate": 1.6040000000000002e-05, "loss": 1.3578, "step": 802 }, { "epoch": 0.2989610359390816, "grad_norm": 0.17455071210861206, "learning_rate": 1.6060000000000002e-05, "loss": 1.3424, "step": 803 }, { "epoch": 0.2993333410896906, "grad_norm": 0.1619621366262436, "learning_rate": 1.6080000000000002e-05, "loss": 1.3344, "step": 804 }, { "epoch": 0.2997056462402997, "grad_norm": 0.1549704223871231, "learning_rate": 1.6100000000000002e-05, "loss": 1.3478, "step": 805 }, { "epoch": 0.30007795139090876, "grad_norm": 0.16837717592716217, "learning_rate": 1.612e-05, "loss": 1.3585, "step": 806 }, { "epoch": 0.30045025654151786, "grad_norm": 0.15680178999900818, "learning_rate": 1.614e-05, "loss": 1.3474, "step": 807 }, { "epoch": 0.3008225616921269, "grad_norm": 0.16553205251693726, "learning_rate": 1.616e-05, "loss": 1.3576, "step": 808 }, { "epoch": 0.301194866842736, "grad_norm": 0.16092002391815186, "learning_rate": 1.618e-05, "loss": 1.3503, "step": 809 }, { "epoch": 0.30156717199334504, "grad_norm": 0.16862614452838898, "learning_rate": 1.62e-05, "loss": 1.3556, "step": 810 }, { "epoch": 0.30193947714395414, "grad_norm": 0.16667672991752625, "learning_rate": 1.6220000000000004e-05, "loss": 1.3602, "step": 811 }, { "epoch": 0.3023117822945632, "grad_norm": 0.16125406324863434, "learning_rate": 1.6240000000000004e-05, "loss": 1.3445, "step": 812 }, { "epoch": 0.3026840874451722, "grad_norm": 0.16460643708705902, "learning_rate": 1.626e-05, "loss": 1.3546, "step": 813 }, { "epoch": 0.3030563925957813, "grad_norm": 0.15730057656764984, "learning_rate": 1.628e-05, "loss": 1.3461, "step": 814 }, { "epoch": 0.30342869774639036, "grad_norm": 0.16555914282798767, "learning_rate": 1.63e-05, "loss": 1.3506, "step": 815 }, { "epoch": 0.30380100289699946, "grad_norm": 0.15877728164196014, "learning_rate": 1.632e-05, "loss": 1.3536, "step": 816 }, { "epoch": 0.3041733080476085, "grad_norm": 0.1610872894525528, "learning_rate": 1.634e-05, "loss": 1.3461, "step": 817 }, { "epoch": 0.3045456131982176, "grad_norm": 0.17202696204185486, "learning_rate": 1.636e-05, "loss": 1.3405, "step": 818 }, { "epoch": 0.30491791834882664, "grad_norm": 0.16373762488365173, "learning_rate": 1.638e-05, "loss": 1.3514, "step": 819 }, { "epoch": 0.30529022349943574, "grad_norm": 0.17125077545642853, "learning_rate": 1.64e-05, "loss": 1.3451, "step": 820 }, { "epoch": 0.3056625286500448, "grad_norm": 0.18695010244846344, "learning_rate": 1.6420000000000002e-05, "loss": 1.3443, "step": 821 }, { "epoch": 0.3060348338006539, "grad_norm": 0.1650739312171936, "learning_rate": 1.6440000000000002e-05, "loss": 1.3512, "step": 822 }, { "epoch": 0.3064071389512629, "grad_norm": 0.18323633074760437, "learning_rate": 1.646e-05, "loss": 1.3426, "step": 823 }, { "epoch": 0.306779444101872, "grad_norm": 0.1654973030090332, "learning_rate": 1.648e-05, "loss": 1.3452, "step": 824 }, { "epoch": 0.30715174925248107, "grad_norm": 0.1673707365989685, "learning_rate": 1.65e-05, "loss": 1.3493, "step": 825 }, { "epoch": 0.3075240544030901, "grad_norm": 0.16768096387386322, "learning_rate": 1.652e-05, "loss": 1.3484, "step": 826 }, { "epoch": 0.3078963595536992, "grad_norm": 0.15851683914661407, "learning_rate": 1.654e-05, "loss": 1.3481, "step": 827 }, { "epoch": 0.30826866470430825, "grad_norm": 0.1723686009645462, "learning_rate": 1.656e-05, "loss": 1.3566, "step": 828 }, { "epoch": 0.30864096985491735, "grad_norm": 0.16892585158348083, "learning_rate": 1.658e-05, "loss": 1.3563, "step": 829 }, { "epoch": 0.3090132750055264, "grad_norm": 0.16549967229366302, "learning_rate": 1.66e-05, "loss": 1.3514, "step": 830 }, { "epoch": 0.3093855801561355, "grad_norm": 0.16105307638645172, "learning_rate": 1.662e-05, "loss": 1.3528, "step": 831 }, { "epoch": 0.30975788530674453, "grad_norm": 0.1725122034549713, "learning_rate": 1.664e-05, "loss": 1.3408, "step": 832 }, { "epoch": 0.3101301904573536, "grad_norm": 0.16419732570648193, "learning_rate": 1.666e-05, "loss": 1.3422, "step": 833 }, { "epoch": 0.31050249560796267, "grad_norm": 0.16626691818237305, "learning_rate": 1.668e-05, "loss": 1.332, "step": 834 }, { "epoch": 0.31087480075857177, "grad_norm": 0.16400444507598877, "learning_rate": 1.67e-05, "loss": 1.3378, "step": 835 }, { "epoch": 0.3112471059091808, "grad_norm": 0.1672375500202179, "learning_rate": 1.672e-05, "loss": 1.334, "step": 836 }, { "epoch": 0.31161941105978985, "grad_norm": 0.16108368337154388, "learning_rate": 1.6740000000000002e-05, "loss": 1.3525, "step": 837 }, { "epoch": 0.31199171621039895, "grad_norm": 0.16484303772449493, "learning_rate": 1.6760000000000002e-05, "loss": 1.3541, "step": 838 }, { "epoch": 0.312364021361008, "grad_norm": 0.16695603728294373, "learning_rate": 1.6780000000000002e-05, "loss": 1.3298, "step": 839 }, { "epoch": 0.3127363265116171, "grad_norm": 0.1630071997642517, "learning_rate": 1.6800000000000002e-05, "loss": 1.3388, "step": 840 }, { "epoch": 0.31310863166222613, "grad_norm": 0.17112383246421814, "learning_rate": 1.682e-05, "loss": 1.3464, "step": 841 }, { "epoch": 0.31348093681283523, "grad_norm": 0.1732318103313446, "learning_rate": 1.684e-05, "loss": 1.3344, "step": 842 }, { "epoch": 0.3138532419634443, "grad_norm": 0.17030829191207886, "learning_rate": 1.686e-05, "loss": 1.3411, "step": 843 }, { "epoch": 0.31422554711405337, "grad_norm": 0.16124574840068817, "learning_rate": 1.688e-05, "loss": 1.3473, "step": 844 }, { "epoch": 0.3145978522646624, "grad_norm": 0.17811204493045807, "learning_rate": 1.69e-05, "loss": 1.3522, "step": 845 }, { "epoch": 0.3149701574152715, "grad_norm": 0.17002224922180176, "learning_rate": 1.692e-05, "loss": 1.3511, "step": 846 }, { "epoch": 0.31534246256588055, "grad_norm": 0.19199255108833313, "learning_rate": 1.694e-05, "loss": 1.3263, "step": 847 }, { "epoch": 0.31571476771648965, "grad_norm": 0.1713089942932129, "learning_rate": 1.696e-05, "loss": 1.3383, "step": 848 }, { "epoch": 0.3160870728670987, "grad_norm": 0.17415201663970947, "learning_rate": 1.698e-05, "loss": 1.3336, "step": 849 }, { "epoch": 0.31645937801770774, "grad_norm": 0.1600249856710434, "learning_rate": 1.7e-05, "loss": 1.3639, "step": 850 }, { "epoch": 0.31683168316831684, "grad_norm": 0.17309750616550446, "learning_rate": 1.702e-05, "loss": 1.3371, "step": 851 }, { "epoch": 0.3172039883189259, "grad_norm": 0.16656599938869476, "learning_rate": 1.704e-05, "loss": 1.3367, "step": 852 }, { "epoch": 0.317576293469535, "grad_norm": 0.17623035609722137, "learning_rate": 1.7060000000000003e-05, "loss": 1.3414, "step": 853 }, { "epoch": 0.317948598620144, "grad_norm": 0.16454781591892242, "learning_rate": 1.7080000000000002e-05, "loss": 1.3341, "step": 854 }, { "epoch": 0.3183209037707531, "grad_norm": 0.16562296450138092, "learning_rate": 1.7100000000000002e-05, "loss": 1.3321, "step": 855 }, { "epoch": 0.31869320892136216, "grad_norm": 0.16431719064712524, "learning_rate": 1.7120000000000002e-05, "loss": 1.3332, "step": 856 }, { "epoch": 0.31906551407197126, "grad_norm": 0.16330453753471375, "learning_rate": 1.7140000000000002e-05, "loss": 1.3376, "step": 857 }, { "epoch": 0.3194378192225803, "grad_norm": 0.1680067926645279, "learning_rate": 1.7160000000000002e-05, "loss": 1.3343, "step": 858 }, { "epoch": 0.3198101243731894, "grad_norm": 0.1762932538986206, "learning_rate": 1.718e-05, "loss": 1.3387, "step": 859 }, { "epoch": 0.32018242952379844, "grad_norm": 0.1647929847240448, "learning_rate": 1.72e-05, "loss": 1.3451, "step": 860 }, { "epoch": 0.32055473467440754, "grad_norm": 0.16490478813648224, "learning_rate": 1.722e-05, "loss": 1.3486, "step": 861 }, { "epoch": 0.3209270398250166, "grad_norm": 0.17381954193115234, "learning_rate": 1.724e-05, "loss": 1.3598, "step": 862 }, { "epoch": 0.3212993449756256, "grad_norm": 0.17484751343727112, "learning_rate": 1.726e-05, "loss": 1.3432, "step": 863 }, { "epoch": 0.3216716501262347, "grad_norm": 0.17640666663646698, "learning_rate": 1.728e-05, "loss": 1.3512, "step": 864 }, { "epoch": 0.32204395527684376, "grad_norm": 0.17663167417049408, "learning_rate": 1.73e-05, "loss": 1.3646, "step": 865 }, { "epoch": 0.32241626042745286, "grad_norm": 0.1713891476392746, "learning_rate": 1.732e-05, "loss": 1.3256, "step": 866 }, { "epoch": 0.3227885655780619, "grad_norm": 0.17021964490413666, "learning_rate": 1.734e-05, "loss": 1.332, "step": 867 }, { "epoch": 0.323160870728671, "grad_norm": 0.16920128464698792, "learning_rate": 1.736e-05, "loss": 1.3463, "step": 868 }, { "epoch": 0.32353317587928004, "grad_norm": 0.18202915787696838, "learning_rate": 1.7380000000000003e-05, "loss": 1.3461, "step": 869 }, { "epoch": 0.32390548102988914, "grad_norm": 0.16767475008964539, "learning_rate": 1.7400000000000003e-05, "loss": 1.33, "step": 870 }, { "epoch": 0.3242777861804982, "grad_norm": 0.18158170580863953, "learning_rate": 1.7420000000000003e-05, "loss": 1.3504, "step": 871 }, { "epoch": 0.3246500913311073, "grad_norm": 0.17010337114334106, "learning_rate": 1.7440000000000002e-05, "loss": 1.3229, "step": 872 }, { "epoch": 0.3250223964817163, "grad_norm": 0.1630079299211502, "learning_rate": 1.7460000000000002e-05, "loss": 1.3142, "step": 873 }, { "epoch": 0.32539470163232537, "grad_norm": 0.176998108625412, "learning_rate": 1.7480000000000002e-05, "loss": 1.3371, "step": 874 }, { "epoch": 0.32576700678293447, "grad_norm": 0.18017543852329254, "learning_rate": 1.7500000000000002e-05, "loss": 1.3562, "step": 875 }, { "epoch": 0.3261393119335435, "grad_norm": 0.17135824263095856, "learning_rate": 1.752e-05, "loss": 1.3387, "step": 876 }, { "epoch": 0.3265116170841526, "grad_norm": 0.17937518656253815, "learning_rate": 1.754e-05, "loss": 1.3389, "step": 877 }, { "epoch": 0.32688392223476165, "grad_norm": 0.1736384481191635, "learning_rate": 1.756e-05, "loss": 1.3556, "step": 878 }, { "epoch": 0.32725622738537075, "grad_norm": 0.162176251411438, "learning_rate": 1.758e-05, "loss": 1.3499, "step": 879 }, { "epoch": 0.3276285325359798, "grad_norm": 0.17646604776382446, "learning_rate": 1.76e-05, "loss": 1.3404, "step": 880 }, { "epoch": 0.3280008376865889, "grad_norm": 0.18382136523723602, "learning_rate": 1.762e-05, "loss": 1.3429, "step": 881 }, { "epoch": 0.32837314283719793, "grad_norm": 0.19566792249679565, "learning_rate": 1.764e-05, "loss": 1.3399, "step": 882 }, { "epoch": 0.328745447987807, "grad_norm": 0.17790797352790833, "learning_rate": 1.766e-05, "loss": 1.3413, "step": 883 }, { "epoch": 0.32911775313841607, "grad_norm": 0.16908185184001923, "learning_rate": 1.768e-05, "loss": 1.3363, "step": 884 }, { "epoch": 0.32949005828902517, "grad_norm": 0.17043371498584747, "learning_rate": 1.77e-05, "loss": 1.3415, "step": 885 }, { "epoch": 0.3298623634396342, "grad_norm": 0.18717624247074127, "learning_rate": 1.7720000000000003e-05, "loss": 1.3382, "step": 886 }, { "epoch": 0.33023466859024325, "grad_norm": 0.17335492372512817, "learning_rate": 1.7740000000000003e-05, "loss": 1.3391, "step": 887 }, { "epoch": 0.33060697374085235, "grad_norm": 0.1830221712589264, "learning_rate": 1.7760000000000003e-05, "loss": 1.3435, "step": 888 }, { "epoch": 0.3309792788914614, "grad_norm": 0.17462217807769775, "learning_rate": 1.7780000000000003e-05, "loss": 1.3322, "step": 889 }, { "epoch": 0.3313515840420705, "grad_norm": 0.17677493393421173, "learning_rate": 1.7800000000000002e-05, "loss": 1.3234, "step": 890 }, { "epoch": 0.33172388919267953, "grad_norm": 0.18080869317054749, "learning_rate": 1.7820000000000002e-05, "loss": 1.3442, "step": 891 }, { "epoch": 0.33209619434328863, "grad_norm": 0.1581638604402542, "learning_rate": 1.7840000000000002e-05, "loss": 1.3198, "step": 892 }, { "epoch": 0.3324684994938977, "grad_norm": 0.18022367358207703, "learning_rate": 1.7860000000000002e-05, "loss": 1.3182, "step": 893 }, { "epoch": 0.33284080464450677, "grad_norm": 0.19160951673984528, "learning_rate": 1.788e-05, "loss": 1.3365, "step": 894 }, { "epoch": 0.3332131097951158, "grad_norm": 0.18681097030639648, "learning_rate": 1.79e-05, "loss": 1.337, "step": 895 }, { "epoch": 0.3335854149457249, "grad_norm": 0.17079736292362213, "learning_rate": 1.792e-05, "loss": 1.3371, "step": 896 }, { "epoch": 0.33395772009633395, "grad_norm": 0.17504505813121796, "learning_rate": 1.794e-05, "loss": 1.3492, "step": 897 }, { "epoch": 0.33433002524694305, "grad_norm": 0.18182472884655, "learning_rate": 1.796e-05, "loss": 1.3276, "step": 898 }, { "epoch": 0.3347023303975521, "grad_norm": 0.1663830280303955, "learning_rate": 1.798e-05, "loss": 1.3366, "step": 899 }, { "epoch": 0.33507463554816114, "grad_norm": 0.18775950372219086, "learning_rate": 1.8e-05, "loss": 1.3478, "step": 900 }, { "epoch": 0.33544694069877024, "grad_norm": 0.16797006130218506, "learning_rate": 1.802e-05, "loss": 1.3456, "step": 901 }, { "epoch": 0.3358192458493793, "grad_norm": 0.16422320902347565, "learning_rate": 1.8040000000000003e-05, "loss": 1.327, "step": 902 }, { "epoch": 0.3361915509999884, "grad_norm": 0.1987656205892563, "learning_rate": 1.8060000000000003e-05, "loss": 1.3367, "step": 903 }, { "epoch": 0.3365638561505974, "grad_norm": 0.16781911253929138, "learning_rate": 1.8080000000000003e-05, "loss": 1.3366, "step": 904 }, { "epoch": 0.3369361613012065, "grad_norm": 0.17309685051441193, "learning_rate": 1.8100000000000003e-05, "loss": 1.3398, "step": 905 }, { "epoch": 0.33730846645181556, "grad_norm": 0.17799708247184753, "learning_rate": 1.8120000000000003e-05, "loss": 1.3504, "step": 906 }, { "epoch": 0.33768077160242466, "grad_norm": 0.16965223848819733, "learning_rate": 1.8140000000000003e-05, "loss": 1.3352, "step": 907 }, { "epoch": 0.3380530767530337, "grad_norm": 0.1705309897661209, "learning_rate": 1.8160000000000002e-05, "loss": 1.3119, "step": 908 }, { "epoch": 0.3384253819036428, "grad_norm": 0.18711362779140472, "learning_rate": 1.8180000000000002e-05, "loss": 1.3077, "step": 909 }, { "epoch": 0.33879768705425184, "grad_norm": 0.1738344430923462, "learning_rate": 1.8200000000000002e-05, "loss": 1.3292, "step": 910 }, { "epoch": 0.3391699922048609, "grad_norm": 0.17799516022205353, "learning_rate": 1.8220000000000002e-05, "loss": 1.3294, "step": 911 }, { "epoch": 0.33954229735547, "grad_norm": 0.18020783364772797, "learning_rate": 1.824e-05, "loss": 1.3383, "step": 912 }, { "epoch": 0.339914602506079, "grad_norm": 0.16939756274223328, "learning_rate": 1.826e-05, "loss": 1.3291, "step": 913 }, { "epoch": 0.3402869076566881, "grad_norm": 0.17973625659942627, "learning_rate": 1.828e-05, "loss": 1.3315, "step": 914 }, { "epoch": 0.34065921280729716, "grad_norm": 0.1785053163766861, "learning_rate": 1.83e-05, "loss": 1.3386, "step": 915 }, { "epoch": 0.34103151795790626, "grad_norm": 0.17944440245628357, "learning_rate": 1.832e-05, "loss": 1.3356, "step": 916 }, { "epoch": 0.3414038231085153, "grad_norm": 0.1690181940793991, "learning_rate": 1.834e-05, "loss": 1.3428, "step": 917 }, { "epoch": 0.3417761282591244, "grad_norm": 0.16925571858882904, "learning_rate": 1.8360000000000004e-05, "loss": 1.335, "step": 918 }, { "epoch": 0.34214843340973344, "grad_norm": 0.17593559622764587, "learning_rate": 1.8380000000000004e-05, "loss": 1.3326, "step": 919 }, { "epoch": 0.34252073856034254, "grad_norm": 0.1841055005788803, "learning_rate": 1.8400000000000003e-05, "loss": 1.3245, "step": 920 }, { "epoch": 0.3428930437109516, "grad_norm": 0.18622370064258575, "learning_rate": 1.8420000000000003e-05, "loss": 1.3273, "step": 921 }, { "epoch": 0.3432653488615607, "grad_norm": 0.17079919576644897, "learning_rate": 1.8440000000000003e-05, "loss": 1.3233, "step": 922 }, { "epoch": 0.3436376540121697, "grad_norm": 0.17922057211399078, "learning_rate": 1.8460000000000003e-05, "loss": 1.3385, "step": 923 }, { "epoch": 0.34400995916277877, "grad_norm": 0.1806870400905609, "learning_rate": 1.8480000000000003e-05, "loss": 1.3414, "step": 924 }, { "epoch": 0.34438226431338786, "grad_norm": 0.1918867528438568, "learning_rate": 1.8500000000000002e-05, "loss": 1.3441, "step": 925 }, { "epoch": 0.3447545694639969, "grad_norm": 0.18761901557445526, "learning_rate": 1.8520000000000002e-05, "loss": 1.3352, "step": 926 }, { "epoch": 0.345126874614606, "grad_norm": 0.17143699526786804, "learning_rate": 1.8540000000000002e-05, "loss": 1.3225, "step": 927 }, { "epoch": 0.34549917976521505, "grad_norm": 0.1826808899641037, "learning_rate": 1.8560000000000002e-05, "loss": 1.338, "step": 928 }, { "epoch": 0.34587148491582415, "grad_norm": 0.1739514321088791, "learning_rate": 1.858e-05, "loss": 1.3572, "step": 929 }, { "epoch": 0.3462437900664332, "grad_norm": 0.18177390098571777, "learning_rate": 1.86e-05, "loss": 1.3125, "step": 930 }, { "epoch": 0.3466160952170423, "grad_norm": 0.17709952592849731, "learning_rate": 1.862e-05, "loss": 1.326, "step": 931 }, { "epoch": 0.34698840036765133, "grad_norm": 0.1684853583574295, "learning_rate": 1.864e-05, "loss": 1.3176, "step": 932 }, { "epoch": 0.3473607055182604, "grad_norm": 0.18772630393505096, "learning_rate": 1.866e-05, "loss": 1.3246, "step": 933 }, { "epoch": 0.34773301066886947, "grad_norm": 0.1772524118423462, "learning_rate": 1.8680000000000004e-05, "loss": 1.3463, "step": 934 }, { "epoch": 0.34810531581947857, "grad_norm": 0.1651849001646042, "learning_rate": 1.8700000000000004e-05, "loss": 1.3414, "step": 935 }, { "epoch": 0.3484776209700876, "grad_norm": 0.19098713994026184, "learning_rate": 1.8720000000000004e-05, "loss": 1.3423, "step": 936 }, { "epoch": 0.34884992612069665, "grad_norm": 0.18180294334888458, "learning_rate": 1.8740000000000004e-05, "loss": 1.324, "step": 937 }, { "epoch": 0.34922223127130575, "grad_norm": 0.1721077859401703, "learning_rate": 1.876e-05, "loss": 1.3337, "step": 938 }, { "epoch": 0.3495945364219148, "grad_norm": 0.1962614804506302, "learning_rate": 1.878e-05, "loss": 1.3268, "step": 939 }, { "epoch": 0.3499668415725239, "grad_norm": 0.17538578808307648, "learning_rate": 1.88e-05, "loss": 1.3238, "step": 940 }, { "epoch": 0.35033914672313293, "grad_norm": 0.17809061706066132, "learning_rate": 1.882e-05, "loss": 1.3135, "step": 941 }, { "epoch": 0.35071145187374203, "grad_norm": 0.182732954621315, "learning_rate": 1.884e-05, "loss": 1.328, "step": 942 }, { "epoch": 0.3510837570243511, "grad_norm": 0.1797078251838684, "learning_rate": 1.886e-05, "loss": 1.3315, "step": 943 }, { "epoch": 0.35145606217496017, "grad_norm": 0.187151238322258, "learning_rate": 1.8880000000000002e-05, "loss": 1.3277, "step": 944 }, { "epoch": 0.3518283673255692, "grad_norm": 0.16728267073631287, "learning_rate": 1.8900000000000002e-05, "loss": 1.3369, "step": 945 }, { "epoch": 0.3522006724761783, "grad_norm": 0.1842220276594162, "learning_rate": 1.8920000000000002e-05, "loss": 1.3236, "step": 946 }, { "epoch": 0.35257297762678735, "grad_norm": 0.18164734542369843, "learning_rate": 1.894e-05, "loss": 1.3348, "step": 947 }, { "epoch": 0.3529452827773964, "grad_norm": 0.17997892200946808, "learning_rate": 1.896e-05, "loss": 1.3231, "step": 948 }, { "epoch": 0.3533175879280055, "grad_norm": 0.19494079053401947, "learning_rate": 1.898e-05, "loss": 1.3258, "step": 949 }, { "epoch": 0.35368989307861454, "grad_norm": 0.17961707711219788, "learning_rate": 1.9e-05, "loss": 1.3301, "step": 950 }, { "epoch": 0.35406219822922363, "grad_norm": 0.17583470046520233, "learning_rate": 1.902e-05, "loss": 1.3331, "step": 951 }, { "epoch": 0.3544345033798327, "grad_norm": 0.18307524919509888, "learning_rate": 1.904e-05, "loss": 1.3396, "step": 952 }, { "epoch": 0.3548068085304418, "grad_norm": 0.1822507530450821, "learning_rate": 1.906e-05, "loss": 1.3372, "step": 953 }, { "epoch": 0.3551791136810508, "grad_norm": 0.18464961647987366, "learning_rate": 1.908e-05, "loss": 1.3201, "step": 954 }, { "epoch": 0.3555514188316599, "grad_norm": 0.1767146736383438, "learning_rate": 1.91e-05, "loss": 1.3194, "step": 955 }, { "epoch": 0.35592372398226896, "grad_norm": 0.19084005057811737, "learning_rate": 1.912e-05, "loss": 1.3535, "step": 956 }, { "epoch": 0.35629602913287806, "grad_norm": 0.19618546962738037, "learning_rate": 1.914e-05, "loss": 1.3393, "step": 957 }, { "epoch": 0.3566683342834871, "grad_norm": 0.18410375714302063, "learning_rate": 1.916e-05, "loss": 1.3131, "step": 958 }, { "epoch": 0.3570406394340962, "grad_norm": 0.20000839233398438, "learning_rate": 1.918e-05, "loss": 1.3226, "step": 959 }, { "epoch": 0.35741294458470524, "grad_norm": 0.17728938162326813, "learning_rate": 1.9200000000000003e-05, "loss": 1.3195, "step": 960 }, { "epoch": 0.3577852497353143, "grad_norm": 0.19262300431728363, "learning_rate": 1.9220000000000002e-05, "loss": 1.3395, "step": 961 }, { "epoch": 0.3581575548859234, "grad_norm": 0.2089599370956421, "learning_rate": 1.9240000000000002e-05, "loss": 1.3296, "step": 962 }, { "epoch": 0.3585298600365324, "grad_norm": 0.19653920829296112, "learning_rate": 1.9260000000000002e-05, "loss": 1.3266, "step": 963 }, { "epoch": 0.3589021651871415, "grad_norm": 0.18377585709095, "learning_rate": 1.9280000000000002e-05, "loss": 1.3331, "step": 964 }, { "epoch": 0.35927447033775056, "grad_norm": 0.18499638140201569, "learning_rate": 1.93e-05, "loss": 1.3195, "step": 965 }, { "epoch": 0.35964677548835966, "grad_norm": 0.19310572743415833, "learning_rate": 1.932e-05, "loss": 1.3545, "step": 966 }, { "epoch": 0.3600190806389687, "grad_norm": 0.19002506136894226, "learning_rate": 1.934e-05, "loss": 1.341, "step": 967 }, { "epoch": 0.3603913857895778, "grad_norm": 0.1937621384859085, "learning_rate": 1.936e-05, "loss": 1.32, "step": 968 }, { "epoch": 0.36076369094018684, "grad_norm": 0.1868428736925125, "learning_rate": 1.938e-05, "loss": 1.3172, "step": 969 }, { "epoch": 0.36113599609079594, "grad_norm": 0.1846293956041336, "learning_rate": 1.94e-05, "loss": 1.3261, "step": 970 }, { "epoch": 0.361508301241405, "grad_norm": 0.18833589553833008, "learning_rate": 1.942e-05, "loss": 1.3307, "step": 971 }, { "epoch": 0.3618806063920141, "grad_norm": 0.1832055151462555, "learning_rate": 1.944e-05, "loss": 1.3167, "step": 972 }, { "epoch": 0.3622529115426231, "grad_norm": 0.1855573207139969, "learning_rate": 1.946e-05, "loss": 1.3279, "step": 973 }, { "epoch": 0.36262521669323217, "grad_norm": 0.1866430938243866, "learning_rate": 1.948e-05, "loss": 1.3361, "step": 974 }, { "epoch": 0.36299752184384126, "grad_norm": 0.1905186027288437, "learning_rate": 1.95e-05, "loss": 1.3255, "step": 975 }, { "epoch": 0.3633698269944503, "grad_norm": 0.1822662651538849, "learning_rate": 1.9520000000000003e-05, "loss": 1.314, "step": 976 }, { "epoch": 0.3637421321450594, "grad_norm": 0.18091034889221191, "learning_rate": 1.9540000000000003e-05, "loss": 1.3267, "step": 977 }, { "epoch": 0.36411443729566845, "grad_norm": 0.18915344774723053, "learning_rate": 1.9560000000000002e-05, "loss": 1.3161, "step": 978 }, { "epoch": 0.36448674244627755, "grad_norm": 0.17233604192733765, "learning_rate": 1.9580000000000002e-05, "loss": 1.3453, "step": 979 }, { "epoch": 0.3648590475968866, "grad_norm": 0.18806397914886475, "learning_rate": 1.9600000000000002e-05, "loss": 1.3442, "step": 980 }, { "epoch": 0.3652313527474957, "grad_norm": 0.19142785668373108, "learning_rate": 1.9620000000000002e-05, "loss": 1.3218, "step": 981 }, { "epoch": 0.36560365789810473, "grad_norm": 0.17856691777706146, "learning_rate": 1.9640000000000002e-05, "loss": 1.3334, "step": 982 }, { "epoch": 0.3659759630487138, "grad_norm": 0.189533069729805, "learning_rate": 1.966e-05, "loss": 1.3201, "step": 983 }, { "epoch": 0.36634826819932287, "grad_norm": 0.19316861033439636, "learning_rate": 1.968e-05, "loss": 1.3177, "step": 984 }, { "epoch": 0.3667205733499319, "grad_norm": 0.18213561177253723, "learning_rate": 1.97e-05, "loss": 1.3317, "step": 985 }, { "epoch": 0.367092878500541, "grad_norm": 0.21344400942325592, "learning_rate": 1.972e-05, "loss": 1.3263, "step": 986 }, { "epoch": 0.36746518365115005, "grad_norm": 0.1816537231206894, "learning_rate": 1.974e-05, "loss": 1.3155, "step": 987 }, { "epoch": 0.36783748880175915, "grad_norm": 0.18131639063358307, "learning_rate": 1.976e-05, "loss": 1.3265, "step": 988 }, { "epoch": 0.3682097939523682, "grad_norm": 0.21126648783683777, "learning_rate": 1.978e-05, "loss": 1.3293, "step": 989 }, { "epoch": 0.3685820991029773, "grad_norm": 0.17451252043247223, "learning_rate": 1.98e-05, "loss": 1.3293, "step": 990 }, { "epoch": 0.36895440425358633, "grad_norm": 0.17249158024787903, "learning_rate": 1.982e-05, "loss": 1.3242, "step": 991 }, { "epoch": 0.36932670940419543, "grad_norm": 0.18453699350357056, "learning_rate": 1.9840000000000003e-05, "loss": 1.335, "step": 992 }, { "epoch": 0.3696990145548045, "grad_norm": 0.16820859909057617, "learning_rate": 1.9860000000000003e-05, "loss": 1.3249, "step": 993 }, { "epoch": 0.37007131970541357, "grad_norm": 0.19031395018100739, "learning_rate": 1.9880000000000003e-05, "loss": 1.3255, "step": 994 }, { "epoch": 0.3704436248560226, "grad_norm": 0.19492731988430023, "learning_rate": 1.9900000000000003e-05, "loss": 1.3323, "step": 995 }, { "epoch": 0.3708159300066317, "grad_norm": 0.17573820054531097, "learning_rate": 1.9920000000000002e-05, "loss": 1.3371, "step": 996 }, { "epoch": 0.37118823515724075, "grad_norm": 0.18719618022441864, "learning_rate": 1.9940000000000002e-05, "loss": 1.3166, "step": 997 }, { "epoch": 0.3715605403078498, "grad_norm": 0.17977416515350342, "learning_rate": 1.9960000000000002e-05, "loss": 1.323, "step": 998 }, { "epoch": 0.3719328454584589, "grad_norm": 0.1703476756811142, "learning_rate": 1.9980000000000002e-05, "loss": 1.3185, "step": 999 }, { "epoch": 0.37230515060906794, "grad_norm": 0.1878376603126526, "learning_rate": 2e-05, "loss": 1.3445, "step": 1000 }, { "epoch": 0.37230515060906794, "eval_loss": 1.3607902526855469, "eval_runtime": 16.6559, "eval_samples_per_second": 104.108, "eval_steps_per_second": 5.223, "step": 1000 }, { "epoch": 0.37267745575967703, "grad_norm": 0.18507210910320282, "learning_rate": 1.9999999926150316e-05, "loss": 1.3343, "step": 1001 }, { "epoch": 0.3730497609102861, "grad_norm": 0.17154212296009064, "learning_rate": 1.999999970460126e-05, "loss": 1.3407, "step": 1002 }, { "epoch": 0.3734220660608952, "grad_norm": 0.1845930516719818, "learning_rate": 1.9999999335352835e-05, "loss": 1.3236, "step": 1003 }, { "epoch": 0.3737943712115042, "grad_norm": 0.1764208972454071, "learning_rate": 1.9999998818405046e-05, "loss": 1.3242, "step": 1004 }, { "epoch": 0.3741666763621133, "grad_norm": 0.1841861754655838, "learning_rate": 1.99999981537579e-05, "loss": 1.3287, "step": 1005 }, { "epoch": 0.37453898151272236, "grad_norm": 0.17870505154132843, "learning_rate": 1.9999997341411412e-05, "loss": 1.3134, "step": 1006 }, { "epoch": 0.37491128666333146, "grad_norm": 0.18854598701000214, "learning_rate": 1.999999638136559e-05, "loss": 1.3089, "step": 1007 }, { "epoch": 0.3752835918139405, "grad_norm": 0.19055992364883423, "learning_rate": 1.9999995273620453e-05, "loss": 1.3269, "step": 1008 }, { "epoch": 0.3756558969645496, "grad_norm": 0.1775302290916443, "learning_rate": 1.9999994018176008e-05, "loss": 1.3235, "step": 1009 }, { "epoch": 0.37602820211515864, "grad_norm": 0.17888285219669342, "learning_rate": 1.999999261503228e-05, "loss": 1.3249, "step": 1010 }, { "epoch": 0.3764005072657677, "grad_norm": 0.17626729607582092, "learning_rate": 1.999999106418929e-05, "loss": 1.3312, "step": 1011 }, { "epoch": 0.3767728124163768, "grad_norm": 0.1689792424440384, "learning_rate": 1.999998936564706e-05, "loss": 1.3235, "step": 1012 }, { "epoch": 0.3771451175669858, "grad_norm": 0.18726208806037903, "learning_rate": 1.9999987519405614e-05, "loss": 1.3249, "step": 1013 }, { "epoch": 0.3775174227175949, "grad_norm": 0.1736321747303009, "learning_rate": 1.9999985525464977e-05, "loss": 1.3251, "step": 1014 }, { "epoch": 0.37788972786820396, "grad_norm": 0.1852688491344452, "learning_rate": 1.9999983383825186e-05, "loss": 1.3272, "step": 1015 }, { "epoch": 0.37826203301881306, "grad_norm": 0.17666661739349365, "learning_rate": 1.9999981094486264e-05, "loss": 1.3341, "step": 1016 }, { "epoch": 0.3786343381694221, "grad_norm": 0.17533712089061737, "learning_rate": 1.9999978657448252e-05, "loss": 1.3045, "step": 1017 }, { "epoch": 0.3790066433200312, "grad_norm": 0.1879895031452179, "learning_rate": 1.999997607271118e-05, "loss": 1.3311, "step": 1018 }, { "epoch": 0.37937894847064024, "grad_norm": 0.17512273788452148, "learning_rate": 1.999997334027509e-05, "loss": 1.3284, "step": 1019 }, { "epoch": 0.37975125362124934, "grad_norm": 0.17723974585533142, "learning_rate": 1.9999970460140023e-05, "loss": 1.3109, "step": 1020 }, { "epoch": 0.3801235587718584, "grad_norm": 0.1877744346857071, "learning_rate": 1.999996743230602e-05, "loss": 1.3204, "step": 1021 }, { "epoch": 0.3804958639224674, "grad_norm": 0.17423878610134125, "learning_rate": 1.9999964256773125e-05, "loss": 1.3204, "step": 1022 }, { "epoch": 0.3808681690730765, "grad_norm": 0.19012653827667236, "learning_rate": 1.9999960933541383e-05, "loss": 1.3345, "step": 1023 }, { "epoch": 0.38124047422368557, "grad_norm": 0.19567571580410004, "learning_rate": 1.999995746261085e-05, "loss": 1.3195, "step": 1024 }, { "epoch": 0.38161277937429466, "grad_norm": 0.17858469486236572, "learning_rate": 1.999995384398157e-05, "loss": 1.3161, "step": 1025 }, { "epoch": 0.3819850845249037, "grad_norm": 0.20314441621303558, "learning_rate": 1.9999950077653597e-05, "loss": 1.3214, "step": 1026 }, { "epoch": 0.3823573896755128, "grad_norm": 0.19132466614246368, "learning_rate": 1.9999946163626993e-05, "loss": 1.3227, "step": 1027 }, { "epoch": 0.38272969482612185, "grad_norm": 0.18081142008304596, "learning_rate": 1.999994210190181e-05, "loss": 1.3226, "step": 1028 }, { "epoch": 0.38310199997673094, "grad_norm": 0.1872538924217224, "learning_rate": 1.999993789247811e-05, "loss": 1.321, "step": 1029 }, { "epoch": 0.38347430512734, "grad_norm": 0.16693538427352905, "learning_rate": 1.9999933535355955e-05, "loss": 1.32, "step": 1030 }, { "epoch": 0.3838466102779491, "grad_norm": 0.18578219413757324, "learning_rate": 1.9999929030535406e-05, "loss": 1.3046, "step": 1031 }, { "epoch": 0.3842189154285581, "grad_norm": 0.19395247101783752, "learning_rate": 1.999992437801654e-05, "loss": 1.3357, "step": 1032 }, { "epoch": 0.3845912205791672, "grad_norm": 0.18756742775440216, "learning_rate": 1.9999919577799415e-05, "loss": 1.3373, "step": 1033 }, { "epoch": 0.38496352572977627, "grad_norm": 0.18206077814102173, "learning_rate": 1.9999914629884104e-05, "loss": 1.3087, "step": 1034 }, { "epoch": 0.3853358308803853, "grad_norm": 0.18908272683620453, "learning_rate": 1.999990953427068e-05, "loss": 1.3191, "step": 1035 }, { "epoch": 0.3857081360309944, "grad_norm": 0.19302143156528473, "learning_rate": 1.9999904290959225e-05, "loss": 1.3283, "step": 1036 }, { "epoch": 0.38608044118160345, "grad_norm": 0.1931566596031189, "learning_rate": 1.999989889994981e-05, "loss": 1.3368, "step": 1037 }, { "epoch": 0.38645274633221255, "grad_norm": 0.18609251081943512, "learning_rate": 1.9999893361242512e-05, "loss": 1.3135, "step": 1038 }, { "epoch": 0.3868250514828216, "grad_norm": 0.19373655319213867, "learning_rate": 1.9999887674837416e-05, "loss": 1.3423, "step": 1039 }, { "epoch": 0.3871973566334307, "grad_norm": 0.2070687711238861, "learning_rate": 1.9999881840734613e-05, "loss": 1.3103, "step": 1040 }, { "epoch": 0.38756966178403973, "grad_norm": 0.18242163956165314, "learning_rate": 1.999987585893418e-05, "loss": 1.3367, "step": 1041 }, { "epoch": 0.38794196693464883, "grad_norm": 0.17910490930080414, "learning_rate": 1.9999869729436205e-05, "loss": 1.3158, "step": 1042 }, { "epoch": 0.38831427208525787, "grad_norm": 0.20917744934558868, "learning_rate": 1.9999863452240784e-05, "loss": 1.3203, "step": 1043 }, { "epoch": 0.38868657723586697, "grad_norm": 0.16768380999565125, "learning_rate": 1.9999857027348008e-05, "loss": 1.324, "step": 1044 }, { "epoch": 0.389058882386476, "grad_norm": 0.1901809275150299, "learning_rate": 1.999985045475797e-05, "loss": 1.333, "step": 1045 }, { "epoch": 0.3894311875370851, "grad_norm": 0.18755842745304108, "learning_rate": 1.9999843734470768e-05, "loss": 1.3174, "step": 1046 }, { "epoch": 0.38980349268769415, "grad_norm": 0.18284977972507477, "learning_rate": 1.9999836866486505e-05, "loss": 1.3204, "step": 1047 }, { "epoch": 0.3901757978383032, "grad_norm": 0.19717617332935333, "learning_rate": 1.9999829850805273e-05, "loss": 1.3158, "step": 1048 }, { "epoch": 0.3905481029889123, "grad_norm": 0.19506654143333435, "learning_rate": 1.9999822687427188e-05, "loss": 1.3254, "step": 1049 }, { "epoch": 0.39092040813952134, "grad_norm": 0.18811020255088806, "learning_rate": 1.9999815376352346e-05, "loss": 1.3195, "step": 1050 }, { "epoch": 0.39129271329013043, "grad_norm": 0.19153869152069092, "learning_rate": 1.9999807917580858e-05, "loss": 1.3346, "step": 1051 }, { "epoch": 0.3916650184407395, "grad_norm": 0.17620836198329926, "learning_rate": 1.9999800311112838e-05, "loss": 1.3085, "step": 1052 }, { "epoch": 0.3920373235913486, "grad_norm": 0.18910714983940125, "learning_rate": 1.999979255694839e-05, "loss": 1.3138, "step": 1053 }, { "epoch": 0.3924096287419576, "grad_norm": 0.1759803593158722, "learning_rate": 1.999978465508764e-05, "loss": 1.3205, "step": 1054 }, { "epoch": 0.3927819338925667, "grad_norm": 0.1760639101266861, "learning_rate": 1.9999776605530693e-05, "loss": 1.3111, "step": 1055 }, { "epoch": 0.39315423904317576, "grad_norm": 0.1799839735031128, "learning_rate": 1.9999768408277674e-05, "loss": 1.3128, "step": 1056 }, { "epoch": 0.39352654419378486, "grad_norm": 0.18230627477169037, "learning_rate": 1.9999760063328705e-05, "loss": 1.3294, "step": 1057 }, { "epoch": 0.3938988493443939, "grad_norm": 0.17548716068267822, "learning_rate": 1.9999751570683905e-05, "loss": 1.3224, "step": 1058 }, { "epoch": 0.39427115449500294, "grad_norm": 0.17289309203624725, "learning_rate": 1.9999742930343404e-05, "loss": 1.3113, "step": 1059 }, { "epoch": 0.39464345964561204, "grad_norm": 0.18272966146469116, "learning_rate": 1.9999734142307326e-05, "loss": 1.3166, "step": 1060 }, { "epoch": 0.3950157647962211, "grad_norm": 0.1845044642686844, "learning_rate": 1.99997252065758e-05, "loss": 1.3138, "step": 1061 }, { "epoch": 0.3953880699468302, "grad_norm": 0.1838754415512085, "learning_rate": 1.9999716123148966e-05, "loss": 1.312, "step": 1062 }, { "epoch": 0.3957603750974392, "grad_norm": 0.18985752761363983, "learning_rate": 1.999970689202695e-05, "loss": 1.3226, "step": 1063 }, { "epoch": 0.3961326802480483, "grad_norm": 0.1912066787481308, "learning_rate": 1.999969751320989e-05, "loss": 1.3408, "step": 1064 }, { "epoch": 0.39650498539865736, "grad_norm": 0.1811860352754593, "learning_rate": 1.9999687986697925e-05, "loss": 1.3085, "step": 1065 }, { "epoch": 0.39687729054926646, "grad_norm": 0.18986211717128754, "learning_rate": 1.9999678312491194e-05, "loss": 1.3329, "step": 1066 }, { "epoch": 0.3972495956998755, "grad_norm": 0.19396282732486725, "learning_rate": 1.9999668490589848e-05, "loss": 1.3115, "step": 1067 }, { "epoch": 0.3976219008504846, "grad_norm": 0.19043684005737305, "learning_rate": 1.999965852099402e-05, "loss": 1.3174, "step": 1068 }, { "epoch": 0.39799420600109364, "grad_norm": 0.18925096094608307, "learning_rate": 1.9999648403703867e-05, "loss": 1.3015, "step": 1069 }, { "epoch": 0.39836651115170274, "grad_norm": 0.19922588765621185, "learning_rate": 1.9999638138719532e-05, "loss": 1.3154, "step": 1070 }, { "epoch": 0.3987388163023118, "grad_norm": 0.1778346300125122, "learning_rate": 1.999962772604117e-05, "loss": 1.314, "step": 1071 }, { "epoch": 0.3991111214529208, "grad_norm": 0.18273292481899261, "learning_rate": 1.9999617165668935e-05, "loss": 1.2956, "step": 1072 }, { "epoch": 0.3994834266035299, "grad_norm": 0.18473650515079498, "learning_rate": 1.999960645760298e-05, "loss": 1.3015, "step": 1073 }, { "epoch": 0.39985573175413897, "grad_norm": 0.20028461515903473, "learning_rate": 1.9999595601843466e-05, "loss": 1.3271, "step": 1074 }, { "epoch": 0.40022803690474806, "grad_norm": 0.1781882643699646, "learning_rate": 1.9999584598390556e-05, "loss": 1.3189, "step": 1075 }, { "epoch": 0.4006003420553571, "grad_norm": 0.18099310994148254, "learning_rate": 1.9999573447244404e-05, "loss": 1.3319, "step": 1076 }, { "epoch": 0.4009726472059662, "grad_norm": 0.18824267387390137, "learning_rate": 1.9999562148405184e-05, "loss": 1.315, "step": 1077 }, { "epoch": 0.40134495235657525, "grad_norm": 0.18727093935012817, "learning_rate": 1.9999550701873056e-05, "loss": 1.3246, "step": 1078 }, { "epoch": 0.40171725750718434, "grad_norm": 0.19821962714195251, "learning_rate": 1.9999539107648195e-05, "loss": 1.2926, "step": 1079 }, { "epoch": 0.4020895626577934, "grad_norm": 0.18535029888153076, "learning_rate": 1.9999527365730766e-05, "loss": 1.3103, "step": 1080 }, { "epoch": 0.4024618678084025, "grad_norm": 0.18775507807731628, "learning_rate": 1.9999515476120945e-05, "loss": 1.3188, "step": 1081 }, { "epoch": 0.4028341729590115, "grad_norm": 0.18621505796909332, "learning_rate": 1.999950343881891e-05, "loss": 1.3179, "step": 1082 }, { "epoch": 0.4032064781096206, "grad_norm": 0.18887226283550262, "learning_rate": 1.9999491253824833e-05, "loss": 1.3146, "step": 1083 }, { "epoch": 0.40357878326022967, "grad_norm": 0.18072552978992462, "learning_rate": 1.9999478921138898e-05, "loss": 1.3137, "step": 1084 }, { "epoch": 0.4039510884108387, "grad_norm": 0.18956084549427032, "learning_rate": 1.999946644076129e-05, "loss": 1.3064, "step": 1085 }, { "epoch": 0.4043233935614478, "grad_norm": 0.19679097831249237, "learning_rate": 1.999945381269219e-05, "loss": 1.2912, "step": 1086 }, { "epoch": 0.40469569871205685, "grad_norm": 0.18827444314956665, "learning_rate": 1.9999441036931784e-05, "loss": 1.328, "step": 1087 }, { "epoch": 0.40506800386266595, "grad_norm": 0.1727646291255951, "learning_rate": 1.999942811348026e-05, "loss": 1.3148, "step": 1088 }, { "epoch": 0.405440309013275, "grad_norm": 0.18676549196243286, "learning_rate": 1.999941504233781e-05, "loss": 1.3315, "step": 1089 }, { "epoch": 0.4058126141638841, "grad_norm": 0.18589061498641968, "learning_rate": 1.9999401823504628e-05, "loss": 1.3326, "step": 1090 }, { "epoch": 0.40618491931449313, "grad_norm": 0.19661076366901398, "learning_rate": 1.999938845698091e-05, "loss": 1.3221, "step": 1091 }, { "epoch": 0.40655722446510223, "grad_norm": 0.1824602484703064, "learning_rate": 1.9999374942766853e-05, "loss": 1.3216, "step": 1092 }, { "epoch": 0.40692952961571127, "grad_norm": 0.1898437738418579, "learning_rate": 1.999936128086265e-05, "loss": 1.3212, "step": 1093 }, { "epoch": 0.40730183476632037, "grad_norm": 0.18662068247795105, "learning_rate": 1.9999347471268517e-05, "loss": 1.316, "step": 1094 }, { "epoch": 0.4076741399169294, "grad_norm": 0.17926158010959625, "learning_rate": 1.9999333513984644e-05, "loss": 1.3243, "step": 1095 }, { "epoch": 0.40804644506753845, "grad_norm": 0.18369993567466736, "learning_rate": 1.9999319409011243e-05, "loss": 1.3001, "step": 1096 }, { "epoch": 0.40841875021814755, "grad_norm": 0.18747882544994354, "learning_rate": 1.9999305156348523e-05, "loss": 1.3137, "step": 1097 }, { "epoch": 0.4087910553687566, "grad_norm": 0.17961914837360382, "learning_rate": 1.999929075599669e-05, "loss": 1.3138, "step": 1098 }, { "epoch": 0.4091633605193657, "grad_norm": 0.1906268298625946, "learning_rate": 1.9999276207955965e-05, "loss": 1.3301, "step": 1099 }, { "epoch": 0.40953566566997474, "grad_norm": 0.1943434178829193, "learning_rate": 1.9999261512226556e-05, "loss": 1.3254, "step": 1100 }, { "epoch": 0.40990797082058383, "grad_norm": 0.17660486698150635, "learning_rate": 1.999924666880868e-05, "loss": 1.3178, "step": 1101 }, { "epoch": 0.4102802759711929, "grad_norm": 0.1914491206407547, "learning_rate": 1.999923167770256e-05, "loss": 1.303, "step": 1102 }, { "epoch": 0.410652581121802, "grad_norm": 0.18666186928749084, "learning_rate": 1.9999216538908416e-05, "loss": 1.3144, "step": 1103 }, { "epoch": 0.411024886272411, "grad_norm": 0.18117761611938477, "learning_rate": 1.9999201252426473e-05, "loss": 1.3063, "step": 1104 }, { "epoch": 0.4113971914230201, "grad_norm": 0.1789981722831726, "learning_rate": 1.9999185818256953e-05, "loss": 1.3212, "step": 1105 }, { "epoch": 0.41176949657362916, "grad_norm": 0.18627247214317322, "learning_rate": 1.9999170236400087e-05, "loss": 1.3088, "step": 1106 }, { "epoch": 0.41214180172423825, "grad_norm": 0.18642041087150574, "learning_rate": 1.99991545068561e-05, "loss": 1.3296, "step": 1107 }, { "epoch": 0.4125141068748473, "grad_norm": 0.1862434297800064, "learning_rate": 1.999913862962523e-05, "loss": 1.3261, "step": 1108 }, { "epoch": 0.41288641202545634, "grad_norm": 0.1830601990222931, "learning_rate": 1.9999122604707714e-05, "loss": 1.3262, "step": 1109 }, { "epoch": 0.41325871717606544, "grad_norm": 0.1928444504737854, "learning_rate": 1.9999106432103785e-05, "loss": 1.3062, "step": 1110 }, { "epoch": 0.4136310223266745, "grad_norm": 0.1889939159154892, "learning_rate": 1.9999090111813674e-05, "loss": 1.3028, "step": 1111 }, { "epoch": 0.4140033274772836, "grad_norm": 0.18017908930778503, "learning_rate": 1.9999073643837637e-05, "loss": 1.3231, "step": 1112 }, { "epoch": 0.4143756326278926, "grad_norm": 0.18042001128196716, "learning_rate": 1.9999057028175906e-05, "loss": 1.301, "step": 1113 }, { "epoch": 0.4147479377785017, "grad_norm": 0.1881111115217209, "learning_rate": 1.999904026482873e-05, "loss": 1.3057, "step": 1114 }, { "epoch": 0.41512024292911076, "grad_norm": 0.17957095801830292, "learning_rate": 1.9999023353796357e-05, "loss": 1.316, "step": 1115 }, { "epoch": 0.41549254807971986, "grad_norm": 0.19074207544326782, "learning_rate": 1.999900629507904e-05, "loss": 1.3326, "step": 1116 }, { "epoch": 0.4158648532303289, "grad_norm": 0.19031281769275665, "learning_rate": 1.9998989088677027e-05, "loss": 1.3277, "step": 1117 }, { "epoch": 0.416237158380938, "grad_norm": 0.17994950711727142, "learning_rate": 1.9998971734590567e-05, "loss": 1.3236, "step": 1118 }, { "epoch": 0.41660946353154704, "grad_norm": 0.17953146994113922, "learning_rate": 1.9998954232819928e-05, "loss": 1.3225, "step": 1119 }, { "epoch": 0.41698176868215614, "grad_norm": 0.18217253684997559, "learning_rate": 1.9998936583365358e-05, "loss": 1.3234, "step": 1120 }, { "epoch": 0.4173540738327652, "grad_norm": 0.1890113651752472, "learning_rate": 1.9998918786227124e-05, "loss": 1.3206, "step": 1121 }, { "epoch": 0.4177263789833742, "grad_norm": 0.18318399786949158, "learning_rate": 1.999890084140549e-05, "loss": 1.3222, "step": 1122 }, { "epoch": 0.4180986841339833, "grad_norm": 0.19415515661239624, "learning_rate": 1.9998882748900714e-05, "loss": 1.3069, "step": 1123 }, { "epoch": 0.41847098928459237, "grad_norm": 0.1776365041732788, "learning_rate": 1.9998864508713068e-05, "loss": 1.3146, "step": 1124 }, { "epoch": 0.41884329443520146, "grad_norm": 0.18183977901935577, "learning_rate": 1.9998846120842824e-05, "loss": 1.3096, "step": 1125 }, { "epoch": 0.4192155995858105, "grad_norm": 0.19202455878257751, "learning_rate": 1.9998827585290245e-05, "loss": 1.2982, "step": 1126 }, { "epoch": 0.4195879047364196, "grad_norm": 0.18983449041843414, "learning_rate": 1.9998808902055616e-05, "loss": 1.32, "step": 1127 }, { "epoch": 0.41996020988702865, "grad_norm": 0.17936941981315613, "learning_rate": 1.9998790071139202e-05, "loss": 1.3271, "step": 1128 }, { "epoch": 0.42033251503763774, "grad_norm": 0.18909084796905518, "learning_rate": 1.9998771092541287e-05, "loss": 1.3014, "step": 1129 }, { "epoch": 0.4207048201882468, "grad_norm": 0.19434739649295807, "learning_rate": 1.9998751966262154e-05, "loss": 1.3097, "step": 1130 }, { "epoch": 0.4210771253388559, "grad_norm": 0.17867596447467804, "learning_rate": 1.9998732692302077e-05, "loss": 1.3138, "step": 1131 }, { "epoch": 0.4214494304894649, "grad_norm": 0.17590618133544922, "learning_rate": 1.999871327066135e-05, "loss": 1.3162, "step": 1132 }, { "epoch": 0.42182173564007397, "grad_norm": 0.19735798239707947, "learning_rate": 1.999869370134025e-05, "loss": 1.3109, "step": 1133 }, { "epoch": 0.42219404079068307, "grad_norm": 0.18292424082756042, "learning_rate": 1.999867398433908e-05, "loss": 1.3073, "step": 1134 }, { "epoch": 0.4225663459412921, "grad_norm": 0.1984787881374359, "learning_rate": 1.9998654119658115e-05, "loss": 1.3127, "step": 1135 }, { "epoch": 0.4229386510919012, "grad_norm": 0.19433364272117615, "learning_rate": 1.999863410729766e-05, "loss": 1.3199, "step": 1136 }, { "epoch": 0.42331095624251025, "grad_norm": 0.1794789433479309, "learning_rate": 1.9998613947258006e-05, "loss": 1.3065, "step": 1137 }, { "epoch": 0.42368326139311935, "grad_norm": 0.18570461869239807, "learning_rate": 1.9998593639539453e-05, "loss": 1.3106, "step": 1138 }, { "epoch": 0.4240555665437284, "grad_norm": 0.18316258490085602, "learning_rate": 1.9998573184142294e-05, "loss": 1.3232, "step": 1139 }, { "epoch": 0.4244278716943375, "grad_norm": 0.19057685136795044, "learning_rate": 1.9998552581066842e-05, "loss": 1.3216, "step": 1140 }, { "epoch": 0.42480017684494653, "grad_norm": 0.17034713923931122, "learning_rate": 1.9998531830313394e-05, "loss": 1.3071, "step": 1141 }, { "epoch": 0.42517248199555563, "grad_norm": 0.19503635168075562, "learning_rate": 1.999851093188226e-05, "loss": 1.3007, "step": 1142 }, { "epoch": 0.42554478714616467, "grad_norm": 0.18633443117141724, "learning_rate": 1.9998489885773746e-05, "loss": 1.3197, "step": 1143 }, { "epoch": 0.42591709229677377, "grad_norm": 0.16780251264572144, "learning_rate": 1.999846869198816e-05, "loss": 1.3104, "step": 1144 }, { "epoch": 0.4262893974473828, "grad_norm": 0.1772105097770691, "learning_rate": 1.9998447350525822e-05, "loss": 1.3154, "step": 1145 }, { "epoch": 0.42666170259799185, "grad_norm": 0.1707058846950531, "learning_rate": 1.9998425861387045e-05, "loss": 1.3108, "step": 1146 }, { "epoch": 0.42703400774860095, "grad_norm": 0.1806851178407669, "learning_rate": 1.9998404224572147e-05, "loss": 1.3152, "step": 1147 }, { "epoch": 0.42740631289921, "grad_norm": 0.19098347425460815, "learning_rate": 1.9998382440081442e-05, "loss": 1.3187, "step": 1148 }, { "epoch": 0.4277786180498191, "grad_norm": 0.18459391593933105, "learning_rate": 1.999836050791526e-05, "loss": 1.3139, "step": 1149 }, { "epoch": 0.42815092320042814, "grad_norm": 0.1884915679693222, "learning_rate": 1.9998338428073916e-05, "loss": 1.318, "step": 1150 }, { "epoch": 0.42852322835103723, "grad_norm": 0.181873619556427, "learning_rate": 1.9998316200557742e-05, "loss": 1.3283, "step": 1151 }, { "epoch": 0.4288955335016463, "grad_norm": 0.17732377350330353, "learning_rate": 1.9998293825367066e-05, "loss": 1.2947, "step": 1152 }, { "epoch": 0.4292678386522554, "grad_norm": 0.17769268155097961, "learning_rate": 1.999827130250222e-05, "loss": 1.3118, "step": 1153 }, { "epoch": 0.4296401438028644, "grad_norm": 0.1880146563053131, "learning_rate": 1.9998248631963532e-05, "loss": 1.3123, "step": 1154 }, { "epoch": 0.4300124489534735, "grad_norm": 0.18913207948207855, "learning_rate": 1.9998225813751338e-05, "loss": 1.3172, "step": 1155 }, { "epoch": 0.43038475410408256, "grad_norm": 0.19146108627319336, "learning_rate": 1.999820284786598e-05, "loss": 1.3207, "step": 1156 }, { "epoch": 0.43075705925469165, "grad_norm": 0.16881105303764343, "learning_rate": 1.9998179734307794e-05, "loss": 1.3047, "step": 1157 }, { "epoch": 0.4311293644053007, "grad_norm": 0.18325145542621613, "learning_rate": 1.9998156473077114e-05, "loss": 1.3047, "step": 1158 }, { "epoch": 0.43150166955590974, "grad_norm": 0.19172656536102295, "learning_rate": 1.9998133064174297e-05, "loss": 1.2973, "step": 1159 }, { "epoch": 0.43187397470651884, "grad_norm": 0.18118645250797272, "learning_rate": 1.9998109507599678e-05, "loss": 1.3079, "step": 1160 }, { "epoch": 0.4322462798571279, "grad_norm": 0.1834815889596939, "learning_rate": 1.999808580335361e-05, "loss": 1.3397, "step": 1161 }, { "epoch": 0.432618585007737, "grad_norm": 0.19053733348846436, "learning_rate": 1.9998061951436444e-05, "loss": 1.3065, "step": 1162 }, { "epoch": 0.432990890158346, "grad_norm": 0.18071451783180237, "learning_rate": 1.999803795184853e-05, "loss": 1.3144, "step": 1163 }, { "epoch": 0.4333631953089551, "grad_norm": 0.17997939884662628, "learning_rate": 1.9998013804590223e-05, "loss": 1.311, "step": 1164 }, { "epoch": 0.43373550045956416, "grad_norm": 0.19499291479587555, "learning_rate": 1.999798950966188e-05, "loss": 1.3367, "step": 1165 }, { "epoch": 0.43410780561017326, "grad_norm": 0.20583321154117584, "learning_rate": 1.9997965067063856e-05, "loss": 1.312, "step": 1166 }, { "epoch": 0.4344801107607823, "grad_norm": 0.18382562696933746, "learning_rate": 1.9997940476796516e-05, "loss": 1.3215, "step": 1167 }, { "epoch": 0.4348524159113914, "grad_norm": 0.19162672758102417, "learning_rate": 1.9997915738860224e-05, "loss": 1.3025, "step": 1168 }, { "epoch": 0.43522472106200044, "grad_norm": 0.1860532909631729, "learning_rate": 1.9997890853255346e-05, "loss": 1.2959, "step": 1169 }, { "epoch": 0.4355970262126095, "grad_norm": 0.1830555945634842, "learning_rate": 1.9997865819982247e-05, "loss": 1.3056, "step": 1170 }, { "epoch": 0.4359693313632186, "grad_norm": 0.18392489850521088, "learning_rate": 1.9997840639041293e-05, "loss": 1.3165, "step": 1171 }, { "epoch": 0.4363416365138276, "grad_norm": 0.18183907866477966, "learning_rate": 1.9997815310432864e-05, "loss": 1.3102, "step": 1172 }, { "epoch": 0.4367139416644367, "grad_norm": 0.19381216168403625, "learning_rate": 1.999778983415733e-05, "loss": 1.2985, "step": 1173 }, { "epoch": 0.43708624681504576, "grad_norm": 0.18290908634662628, "learning_rate": 1.9997764210215067e-05, "loss": 1.3021, "step": 1174 }, { "epoch": 0.43745855196565486, "grad_norm": 0.19478839635849, "learning_rate": 1.9997738438606454e-05, "loss": 1.3154, "step": 1175 }, { "epoch": 0.4378308571162639, "grad_norm": 0.1829633265733719, "learning_rate": 1.999771251933187e-05, "loss": 1.3105, "step": 1176 }, { "epoch": 0.438203162266873, "grad_norm": 0.1946062445640564, "learning_rate": 1.9997686452391703e-05, "loss": 1.315, "step": 1177 }, { "epoch": 0.43857546741748205, "grad_norm": 0.185636505484581, "learning_rate": 1.999766023778633e-05, "loss": 1.3065, "step": 1178 }, { "epoch": 0.43894777256809114, "grad_norm": 0.1923389434814453, "learning_rate": 1.9997633875516148e-05, "loss": 1.306, "step": 1179 }, { "epoch": 0.4393200777187002, "grad_norm": 0.18182942271232605, "learning_rate": 1.999760736558154e-05, "loss": 1.295, "step": 1180 }, { "epoch": 0.4396923828693093, "grad_norm": 0.1975245475769043, "learning_rate": 1.9997580707982896e-05, "loss": 1.3185, "step": 1181 }, { "epoch": 0.4400646880199183, "grad_norm": 0.1868218183517456, "learning_rate": 1.999755390272061e-05, "loss": 1.3127, "step": 1182 }, { "epoch": 0.44043699317052737, "grad_norm": 0.18964411318302155, "learning_rate": 1.9997526949795087e-05, "loss": 1.297, "step": 1183 }, { "epoch": 0.44080929832113647, "grad_norm": 0.19626370072364807, "learning_rate": 1.9997499849206715e-05, "loss": 1.2892, "step": 1184 }, { "epoch": 0.4411816034717455, "grad_norm": 0.1894233077764511, "learning_rate": 1.99974726009559e-05, "loss": 1.3238, "step": 1185 }, { "epoch": 0.4415539086223546, "grad_norm": 0.20008660852909088, "learning_rate": 1.9997445205043037e-05, "loss": 1.3053, "step": 1186 }, { "epoch": 0.44192621377296365, "grad_norm": 0.18232440948486328, "learning_rate": 1.999741766146854e-05, "loss": 1.3007, "step": 1187 }, { "epoch": 0.44229851892357275, "grad_norm": 0.19060754776000977, "learning_rate": 1.999738997023281e-05, "loss": 1.2931, "step": 1188 }, { "epoch": 0.4426708240741818, "grad_norm": 0.2020169347524643, "learning_rate": 1.999736213133626e-05, "loss": 1.3155, "step": 1189 }, { "epoch": 0.4430431292247909, "grad_norm": 0.1948639452457428, "learning_rate": 1.9997334144779295e-05, "loss": 1.3057, "step": 1190 }, { "epoch": 0.44341543437539993, "grad_norm": 0.1938532143831253, "learning_rate": 1.9997306010562334e-05, "loss": 1.2947, "step": 1191 }, { "epoch": 0.44378773952600903, "grad_norm": 0.21005672216415405, "learning_rate": 1.9997277728685788e-05, "loss": 1.3209, "step": 1192 }, { "epoch": 0.44416004467661807, "grad_norm": 0.20058314502239227, "learning_rate": 1.9997249299150078e-05, "loss": 1.2951, "step": 1193 }, { "epoch": 0.4445323498272271, "grad_norm": 0.1923152059316635, "learning_rate": 1.9997220721955627e-05, "loss": 1.3118, "step": 1194 }, { "epoch": 0.4449046549778362, "grad_norm": 0.19684681296348572, "learning_rate": 1.9997191997102853e-05, "loss": 1.3241, "step": 1195 }, { "epoch": 0.44527696012844525, "grad_norm": 0.19519715011119843, "learning_rate": 1.9997163124592175e-05, "loss": 1.3147, "step": 1196 }, { "epoch": 0.44564926527905435, "grad_norm": 0.19331727921962738, "learning_rate": 1.9997134104424033e-05, "loss": 1.3278, "step": 1197 }, { "epoch": 0.4460215704296634, "grad_norm": 0.19848302006721497, "learning_rate": 1.999710493659884e-05, "loss": 1.307, "step": 1198 }, { "epoch": 0.4463938755802725, "grad_norm": 0.19539180397987366, "learning_rate": 1.999707562111704e-05, "loss": 1.2933, "step": 1199 }, { "epoch": 0.44676618073088153, "grad_norm": 0.18977102637290955, "learning_rate": 1.999704615797906e-05, "loss": 1.3087, "step": 1200 }, { "epoch": 0.44713848588149063, "grad_norm": 0.2267744392156601, "learning_rate": 1.9997016547185333e-05, "loss": 1.3055, "step": 1201 }, { "epoch": 0.4475107910320997, "grad_norm": 0.19115746021270752, "learning_rate": 1.9996986788736298e-05, "loss": 1.2991, "step": 1202 }, { "epoch": 0.4478830961827088, "grad_norm": 0.1895446479320526, "learning_rate": 1.99969568826324e-05, "loss": 1.3006, "step": 1203 }, { "epoch": 0.4482554013333178, "grad_norm": 0.18717582523822784, "learning_rate": 1.999692682887407e-05, "loss": 1.3014, "step": 1204 }, { "epoch": 0.4486277064839269, "grad_norm": 0.19228947162628174, "learning_rate": 1.9996896627461764e-05, "loss": 1.3189, "step": 1205 }, { "epoch": 0.44900001163453596, "grad_norm": 0.19240860641002655, "learning_rate": 1.999686627839592e-05, "loss": 1.2877, "step": 1206 }, { "epoch": 0.449372316785145, "grad_norm": 0.18697458505630493, "learning_rate": 1.999683578167699e-05, "loss": 1.3058, "step": 1207 }, { "epoch": 0.4497446219357541, "grad_norm": 0.21261201798915863, "learning_rate": 1.999680513730542e-05, "loss": 1.3055, "step": 1208 }, { "epoch": 0.45011692708636314, "grad_norm": 0.19081415235996246, "learning_rate": 1.9996774345281668e-05, "loss": 1.2985, "step": 1209 }, { "epoch": 0.45048923223697224, "grad_norm": 0.18838602304458618, "learning_rate": 1.9996743405606188e-05, "loss": 1.3032, "step": 1210 }, { "epoch": 0.4508615373875813, "grad_norm": 0.19826094806194305, "learning_rate": 1.999671231827943e-05, "loss": 1.3116, "step": 1211 }, { "epoch": 0.4512338425381904, "grad_norm": 0.1974056512117386, "learning_rate": 1.999668108330186e-05, "loss": 1.3211, "step": 1212 }, { "epoch": 0.4516061476887994, "grad_norm": 0.1901020109653473, "learning_rate": 1.999664970067394e-05, "loss": 1.303, "step": 1213 }, { "epoch": 0.4519784528394085, "grad_norm": 0.18617480993270874, "learning_rate": 1.999661817039613e-05, "loss": 1.3013, "step": 1214 }, { "epoch": 0.45235075799001756, "grad_norm": 0.20741531252861023, "learning_rate": 1.9996586492468895e-05, "loss": 1.3031, "step": 1215 }, { "epoch": 0.45272306314062666, "grad_norm": 0.18789881467819214, "learning_rate": 1.999655466689271e-05, "loss": 1.3058, "step": 1216 }, { "epoch": 0.4530953682912357, "grad_norm": 0.1936379075050354, "learning_rate": 1.9996522693668034e-05, "loss": 1.311, "step": 1217 }, { "epoch": 0.4534676734418448, "grad_norm": 0.20415879786014557, "learning_rate": 1.9996490572795348e-05, "loss": 1.32, "step": 1218 }, { "epoch": 0.45383997859245384, "grad_norm": 0.19625741243362427, "learning_rate": 1.9996458304275125e-05, "loss": 1.3058, "step": 1219 }, { "epoch": 0.4542122837430629, "grad_norm": 0.20820340514183044, "learning_rate": 1.999642588810784e-05, "loss": 1.3154, "step": 1220 }, { "epoch": 0.454584588893672, "grad_norm": 0.2056087851524353, "learning_rate": 1.9996393324293972e-05, "loss": 1.3051, "step": 1221 }, { "epoch": 0.454956894044281, "grad_norm": 0.19374904036521912, "learning_rate": 1.9996360612833997e-05, "loss": 1.3236, "step": 1222 }, { "epoch": 0.4553291991948901, "grad_norm": 0.18062308430671692, "learning_rate": 1.999632775372841e-05, "loss": 1.292, "step": 1223 }, { "epoch": 0.45570150434549916, "grad_norm": 0.20089417695999146, "learning_rate": 1.9996294746977686e-05, "loss": 1.3074, "step": 1224 }, { "epoch": 0.45607380949610826, "grad_norm": 0.19090351462364197, "learning_rate": 1.9996261592582312e-05, "loss": 1.3025, "step": 1225 }, { "epoch": 0.4564461146467173, "grad_norm": 0.19648805260658264, "learning_rate": 1.9996228290542787e-05, "loss": 1.3098, "step": 1226 }, { "epoch": 0.4568184197973264, "grad_norm": 0.1995651125907898, "learning_rate": 1.99961948408596e-05, "loss": 1.3083, "step": 1227 }, { "epoch": 0.45719072494793545, "grad_norm": 0.1871064007282257, "learning_rate": 1.9996161243533238e-05, "loss": 1.2835, "step": 1228 }, { "epoch": 0.45756303009854454, "grad_norm": 0.19911763072013855, "learning_rate": 1.9996127498564203e-05, "loss": 1.3044, "step": 1229 }, { "epoch": 0.4579353352491536, "grad_norm": 0.17743955552577972, "learning_rate": 1.9996093605952992e-05, "loss": 1.3067, "step": 1230 }, { "epoch": 0.45830764039976263, "grad_norm": 0.19110862910747528, "learning_rate": 1.9996059565700103e-05, "loss": 1.2972, "step": 1231 }, { "epoch": 0.4586799455503717, "grad_norm": 0.18435370922088623, "learning_rate": 1.9996025377806044e-05, "loss": 1.3045, "step": 1232 }, { "epoch": 0.45905225070098077, "grad_norm": 0.18806539475917816, "learning_rate": 1.999599104227132e-05, "loss": 1.3115, "step": 1233 }, { "epoch": 0.45942455585158987, "grad_norm": 0.18353897333145142, "learning_rate": 1.9995956559096432e-05, "loss": 1.3079, "step": 1234 }, { "epoch": 0.4597968610021989, "grad_norm": 0.18023177981376648, "learning_rate": 1.9995921928281893e-05, "loss": 1.3037, "step": 1235 }, { "epoch": 0.460169166152808, "grad_norm": 0.18274232745170593, "learning_rate": 1.9995887149828216e-05, "loss": 1.3211, "step": 1236 }, { "epoch": 0.46054147130341705, "grad_norm": 0.18951238691806793, "learning_rate": 1.9995852223735914e-05, "loss": 1.299, "step": 1237 }, { "epoch": 0.46091377645402615, "grad_norm": 0.1818682998418808, "learning_rate": 1.9995817150005502e-05, "loss": 1.2999, "step": 1238 }, { "epoch": 0.4612860816046352, "grad_norm": 0.20125912129878998, "learning_rate": 1.9995781928637494e-05, "loss": 1.3045, "step": 1239 }, { "epoch": 0.4616583867552443, "grad_norm": 0.16718147695064545, "learning_rate": 1.9995746559632417e-05, "loss": 1.2862, "step": 1240 }, { "epoch": 0.46203069190585333, "grad_norm": 0.19009731709957123, "learning_rate": 1.999571104299079e-05, "loss": 1.3052, "step": 1241 }, { "epoch": 0.46240299705646243, "grad_norm": 0.18542075157165527, "learning_rate": 1.999567537871314e-05, "loss": 1.2998, "step": 1242 }, { "epoch": 0.46277530220707147, "grad_norm": 0.18148809671401978, "learning_rate": 1.999563956679999e-05, "loss": 1.3011, "step": 1243 }, { "epoch": 0.4631476073576805, "grad_norm": 0.1700984239578247, "learning_rate": 1.9995603607251873e-05, "loss": 1.3053, "step": 1244 }, { "epoch": 0.4635199125082896, "grad_norm": 0.19107471406459808, "learning_rate": 1.9995567500069314e-05, "loss": 1.3124, "step": 1245 }, { "epoch": 0.46389221765889865, "grad_norm": 0.18561914563179016, "learning_rate": 1.9995531245252854e-05, "loss": 1.3055, "step": 1246 }, { "epoch": 0.46426452280950775, "grad_norm": 0.18733622133731842, "learning_rate": 1.9995494842803026e-05, "loss": 1.3053, "step": 1247 }, { "epoch": 0.4646368279601168, "grad_norm": 0.18674108386039734, "learning_rate": 1.9995458292720364e-05, "loss": 1.3043, "step": 1248 }, { "epoch": 0.4650091331107259, "grad_norm": 0.17907080054283142, "learning_rate": 1.9995421595005408e-05, "loss": 1.3161, "step": 1249 }, { "epoch": 0.46538143826133493, "grad_norm": 0.17312780022621155, "learning_rate": 1.9995384749658705e-05, "loss": 1.321, "step": 1250 }, { "epoch": 0.46575374341194403, "grad_norm": 0.18137013912200928, "learning_rate": 1.99953477566808e-05, "loss": 1.3175, "step": 1251 }, { "epoch": 0.4661260485625531, "grad_norm": 0.1776915341615677, "learning_rate": 1.999531061607223e-05, "loss": 1.3135, "step": 1252 }, { "epoch": 0.4664983537131622, "grad_norm": 0.18204154074192047, "learning_rate": 1.9995273327833553e-05, "loss": 1.2955, "step": 1253 }, { "epoch": 0.4668706588637712, "grad_norm": 0.18510939180850983, "learning_rate": 1.999523589196531e-05, "loss": 1.2945, "step": 1254 }, { "epoch": 0.4672429640143803, "grad_norm": 0.17913533747196198, "learning_rate": 1.999519830846807e-05, "loss": 1.3177, "step": 1255 }, { "epoch": 0.46761526916498936, "grad_norm": 0.18509091436862946, "learning_rate": 1.9995160577342375e-05, "loss": 1.3117, "step": 1256 }, { "epoch": 0.4679875743155984, "grad_norm": 0.19263845682144165, "learning_rate": 1.999512269858878e-05, "loss": 1.2951, "step": 1257 }, { "epoch": 0.4683598794662075, "grad_norm": 0.1835833191871643, "learning_rate": 1.9995084672207855e-05, "loss": 1.3027, "step": 1258 }, { "epoch": 0.46873218461681654, "grad_norm": 0.18313245475292206, "learning_rate": 1.9995046498200158e-05, "loss": 1.2837, "step": 1259 }, { "epoch": 0.46910448976742564, "grad_norm": 0.1853981614112854, "learning_rate": 1.9995008176566247e-05, "loss": 1.2973, "step": 1260 }, { "epoch": 0.4694767949180347, "grad_norm": 0.20372353494167328, "learning_rate": 1.9994969707306697e-05, "loss": 1.3229, "step": 1261 }, { "epoch": 0.4698491000686438, "grad_norm": 0.183163583278656, "learning_rate": 1.9994931090422067e-05, "loss": 1.3065, "step": 1262 }, { "epoch": 0.4702214052192528, "grad_norm": 0.18896111845970154, "learning_rate": 1.9994892325912937e-05, "loss": 1.3135, "step": 1263 }, { "epoch": 0.4705937103698619, "grad_norm": 0.18443109095096588, "learning_rate": 1.999485341377987e-05, "loss": 1.3002, "step": 1264 }, { "epoch": 0.47096601552047096, "grad_norm": 0.19512967765331268, "learning_rate": 1.9994814354023446e-05, "loss": 1.2936, "step": 1265 }, { "epoch": 0.47133832067108006, "grad_norm": 0.1836230754852295, "learning_rate": 1.9994775146644245e-05, "loss": 1.2921, "step": 1266 }, { "epoch": 0.4717106258216891, "grad_norm": 0.20863986015319824, "learning_rate": 1.999473579164284e-05, "loss": 1.3075, "step": 1267 }, { "epoch": 0.47208293097229814, "grad_norm": 0.19831794500350952, "learning_rate": 1.999469628901981e-05, "loss": 1.3106, "step": 1268 }, { "epoch": 0.47245523612290724, "grad_norm": 0.18101930618286133, "learning_rate": 1.999465663877575e-05, "loss": 1.2915, "step": 1269 }, { "epoch": 0.4728275412735163, "grad_norm": 0.20090216398239136, "learning_rate": 1.9994616840911237e-05, "loss": 1.2941, "step": 1270 }, { "epoch": 0.4731998464241254, "grad_norm": 0.20153005421161652, "learning_rate": 1.9994576895426858e-05, "loss": 1.3137, "step": 1271 }, { "epoch": 0.4735721515747344, "grad_norm": 0.18877728283405304, "learning_rate": 1.999453680232321e-05, "loss": 1.2897, "step": 1272 }, { "epoch": 0.4739444567253435, "grad_norm": 0.19803088903427124, "learning_rate": 1.9994496561600874e-05, "loss": 1.2905, "step": 1273 }, { "epoch": 0.47431676187595256, "grad_norm": 0.19023656845092773, "learning_rate": 1.9994456173260457e-05, "loss": 1.3019, "step": 1274 }, { "epoch": 0.47468906702656166, "grad_norm": 0.17500866949558258, "learning_rate": 1.9994415637302545e-05, "loss": 1.3089, "step": 1275 }, { "epoch": 0.4750613721771707, "grad_norm": 0.18865026533603668, "learning_rate": 1.9994374953727747e-05, "loss": 1.2977, "step": 1276 }, { "epoch": 0.4754336773277798, "grad_norm": 0.176120787858963, "learning_rate": 1.9994334122536654e-05, "loss": 1.2825, "step": 1277 }, { "epoch": 0.47580598247838884, "grad_norm": 0.18565769493579865, "learning_rate": 1.9994293143729873e-05, "loss": 1.3182, "step": 1278 }, { "epoch": 0.47617828762899794, "grad_norm": 0.18790839612483978, "learning_rate": 1.9994252017308012e-05, "loss": 1.3215, "step": 1279 }, { "epoch": 0.476550592779607, "grad_norm": 0.19500873982906342, "learning_rate": 1.9994210743271675e-05, "loss": 1.3114, "step": 1280 }, { "epoch": 0.476922897930216, "grad_norm": 0.1855781525373459, "learning_rate": 1.9994169321621474e-05, "loss": 1.3051, "step": 1281 }, { "epoch": 0.4772952030808251, "grad_norm": 0.1813666671514511, "learning_rate": 1.9994127752358014e-05, "loss": 1.2957, "step": 1282 }, { "epoch": 0.47766750823143417, "grad_norm": 0.20357996225357056, "learning_rate": 1.999408603548192e-05, "loss": 1.2988, "step": 1283 }, { "epoch": 0.47803981338204327, "grad_norm": 0.17496971786022186, "learning_rate": 1.99940441709938e-05, "loss": 1.3004, "step": 1284 }, { "epoch": 0.4784121185326523, "grad_norm": 0.19494318962097168, "learning_rate": 1.9994002158894274e-05, "loss": 1.3138, "step": 1285 }, { "epoch": 0.4787844236832614, "grad_norm": 0.19843032956123352, "learning_rate": 1.9993959999183964e-05, "loss": 1.3031, "step": 1286 }, { "epoch": 0.47915672883387045, "grad_norm": 0.19136017560958862, "learning_rate": 1.9993917691863493e-05, "loss": 1.3004, "step": 1287 }, { "epoch": 0.47952903398447955, "grad_norm": 0.18518052995204926, "learning_rate": 1.9993875236933486e-05, "loss": 1.2824, "step": 1288 }, { "epoch": 0.4799013391350886, "grad_norm": 0.18933340907096863, "learning_rate": 1.9993832634394564e-05, "loss": 1.2983, "step": 1289 }, { "epoch": 0.4802736442856977, "grad_norm": 0.1836833357810974, "learning_rate": 1.9993789884247365e-05, "loss": 1.2916, "step": 1290 }, { "epoch": 0.48064594943630673, "grad_norm": 0.17370636761188507, "learning_rate": 1.9993746986492515e-05, "loss": 1.2841, "step": 1291 }, { "epoch": 0.4810182545869158, "grad_norm": 0.19281500577926636, "learning_rate": 1.999370394113065e-05, "loss": 1.3149, "step": 1292 }, { "epoch": 0.48139055973752487, "grad_norm": 0.1878812462091446, "learning_rate": 1.999366074816241e-05, "loss": 1.3104, "step": 1293 }, { "epoch": 0.4817628648881339, "grad_norm": 0.18660883605480194, "learning_rate": 1.999361740758842e-05, "loss": 1.3022, "step": 1294 }, { "epoch": 0.482135170038743, "grad_norm": 0.1894964575767517, "learning_rate": 1.999357391940933e-05, "loss": 1.3056, "step": 1295 }, { "epoch": 0.48250747518935205, "grad_norm": 0.18290723860263824, "learning_rate": 1.999353028362578e-05, "loss": 1.3045, "step": 1296 }, { "epoch": 0.48287978033996115, "grad_norm": 0.1781376600265503, "learning_rate": 1.9993486500238417e-05, "loss": 1.3114, "step": 1297 }, { "epoch": 0.4832520854905702, "grad_norm": 0.1864173412322998, "learning_rate": 1.9993442569247885e-05, "loss": 1.3034, "step": 1298 }, { "epoch": 0.4836243906411793, "grad_norm": 0.19244234263896942, "learning_rate": 1.9993398490654835e-05, "loss": 1.3071, "step": 1299 }, { "epoch": 0.48399669579178833, "grad_norm": 0.18302254378795624, "learning_rate": 1.9993354264459913e-05, "loss": 1.3059, "step": 1300 }, { "epoch": 0.48436900094239743, "grad_norm": 0.17523464560508728, "learning_rate": 1.9993309890663775e-05, "loss": 1.2919, "step": 1301 }, { "epoch": 0.4847413060930065, "grad_norm": 0.18764130771160126, "learning_rate": 1.999326536926708e-05, "loss": 1.2931, "step": 1302 }, { "epoch": 0.4851136112436156, "grad_norm": 0.1812962144613266, "learning_rate": 1.9993220700270484e-05, "loss": 1.289, "step": 1303 }, { "epoch": 0.4854859163942246, "grad_norm": 0.1877674013376236, "learning_rate": 1.9993175883674642e-05, "loss": 1.2957, "step": 1304 }, { "epoch": 0.48585822154483366, "grad_norm": 0.24576683342456818, "learning_rate": 1.9993130919480223e-05, "loss": 1.3056, "step": 1305 }, { "epoch": 0.48623052669544276, "grad_norm": 0.185043066740036, "learning_rate": 1.9993085807687883e-05, "loss": 1.297, "step": 1306 }, { "epoch": 0.4866028318460518, "grad_norm": 0.19536662101745605, "learning_rate": 1.9993040548298297e-05, "loss": 1.3053, "step": 1307 }, { "epoch": 0.4869751369966609, "grad_norm": 0.17894597351551056, "learning_rate": 1.9992995141312126e-05, "loss": 1.3136, "step": 1308 }, { "epoch": 0.48734744214726994, "grad_norm": 0.1845971643924713, "learning_rate": 1.9992949586730046e-05, "loss": 1.3001, "step": 1309 }, { "epoch": 0.48771974729787904, "grad_norm": 0.18812014162540436, "learning_rate": 1.9992903884552727e-05, "loss": 1.2948, "step": 1310 }, { "epoch": 0.4880920524484881, "grad_norm": 0.18313339352607727, "learning_rate": 1.9992858034780848e-05, "loss": 1.3007, "step": 1311 }, { "epoch": 0.4884643575990972, "grad_norm": 0.18373283743858337, "learning_rate": 1.9992812037415077e-05, "loss": 1.2924, "step": 1312 }, { "epoch": 0.4888366627497062, "grad_norm": 0.1760503053665161, "learning_rate": 1.9992765892456102e-05, "loss": 1.2953, "step": 1313 }, { "epoch": 0.4892089679003153, "grad_norm": 0.18641230463981628, "learning_rate": 1.99927195999046e-05, "loss": 1.288, "step": 1314 }, { "epoch": 0.48958127305092436, "grad_norm": 0.1766699105501175, "learning_rate": 1.999267315976126e-05, "loss": 1.2889, "step": 1315 }, { "epoch": 0.48995357820153346, "grad_norm": 0.17779386043548584, "learning_rate": 1.9992626572026764e-05, "loss": 1.2969, "step": 1316 }, { "epoch": 0.4903258833521425, "grad_norm": 0.1781335324048996, "learning_rate": 1.9992579836701796e-05, "loss": 1.2939, "step": 1317 }, { "epoch": 0.49069818850275154, "grad_norm": 0.18048399686813354, "learning_rate": 1.9992532953787057e-05, "loss": 1.29, "step": 1318 }, { "epoch": 0.49107049365336064, "grad_norm": 0.18843825161457062, "learning_rate": 1.999248592328323e-05, "loss": 1.3121, "step": 1319 }, { "epoch": 0.4914427988039697, "grad_norm": 0.1733110100030899, "learning_rate": 1.9992438745191017e-05, "loss": 1.3121, "step": 1320 }, { "epoch": 0.4918151039545788, "grad_norm": 0.18750248849391937, "learning_rate": 1.999239141951111e-05, "loss": 1.298, "step": 1321 }, { "epoch": 0.4921874091051878, "grad_norm": 0.18104903399944305, "learning_rate": 1.9992343946244205e-05, "loss": 1.3, "step": 1322 }, { "epoch": 0.4925597142557969, "grad_norm": 0.1810617595911026, "learning_rate": 1.9992296325391004e-05, "loss": 1.3075, "step": 1323 }, { "epoch": 0.49293201940640596, "grad_norm": 0.18038879334926605, "learning_rate": 1.999224855695222e-05, "loss": 1.304, "step": 1324 }, { "epoch": 0.49330432455701506, "grad_norm": 0.17841710150241852, "learning_rate": 1.999220064092855e-05, "loss": 1.3075, "step": 1325 }, { "epoch": 0.4936766297076241, "grad_norm": 0.18616177141666412, "learning_rate": 1.9992152577320706e-05, "loss": 1.2902, "step": 1326 }, { "epoch": 0.4940489348582332, "grad_norm": 0.18415139615535736, "learning_rate": 1.999210436612939e-05, "loss": 1.3003, "step": 1327 }, { "epoch": 0.49442124000884224, "grad_norm": 0.2058713585138321, "learning_rate": 1.9992056007355323e-05, "loss": 1.3074, "step": 1328 }, { "epoch": 0.49479354515945134, "grad_norm": 0.17545197904109955, "learning_rate": 1.9992007500999216e-05, "loss": 1.3167, "step": 1329 }, { "epoch": 0.4951658503100604, "grad_norm": 0.1859792321920395, "learning_rate": 1.9991958847061786e-05, "loss": 1.3226, "step": 1330 }, { "epoch": 0.4955381554606694, "grad_norm": 0.2005636990070343, "learning_rate": 1.999191004554375e-05, "loss": 1.3016, "step": 1331 }, { "epoch": 0.4959104606112785, "grad_norm": 0.17621280252933502, "learning_rate": 1.999186109644583e-05, "loss": 1.2963, "step": 1332 }, { "epoch": 0.49628276576188757, "grad_norm": 0.18345515429973602, "learning_rate": 1.9991811999768747e-05, "loss": 1.2984, "step": 1333 }, { "epoch": 0.49665507091249667, "grad_norm": 0.191512331366539, "learning_rate": 1.999176275551323e-05, "loss": 1.2894, "step": 1334 }, { "epoch": 0.4970273760631057, "grad_norm": 0.18425214290618896, "learning_rate": 1.9991713363680002e-05, "loss": 1.2952, "step": 1335 }, { "epoch": 0.4973996812137148, "grad_norm": 0.4703981280326843, "learning_rate": 1.9991663824269797e-05, "loss": 1.3056, "step": 1336 }, { "epoch": 0.49777198636432385, "grad_norm": 0.19318270683288574, "learning_rate": 1.999161413728334e-05, "loss": 1.2895, "step": 1337 }, { "epoch": 0.49814429151493295, "grad_norm": 0.19595178961753845, "learning_rate": 1.9991564302721374e-05, "loss": 1.2952, "step": 1338 }, { "epoch": 0.498516596665542, "grad_norm": 0.20309020578861237, "learning_rate": 1.9991514320584628e-05, "loss": 1.3124, "step": 1339 }, { "epoch": 0.4988889018161511, "grad_norm": 0.19227808713912964, "learning_rate": 1.9991464190873845e-05, "loss": 1.2956, "step": 1340 }, { "epoch": 0.49926120696676013, "grad_norm": 0.19931499660015106, "learning_rate": 1.999141391358976e-05, "loss": 1.2961, "step": 1341 }, { "epoch": 0.49963351211736917, "grad_norm": 0.19872671365737915, "learning_rate": 1.999136348873312e-05, "loss": 1.2977, "step": 1342 }, { "epoch": 0.5000058172679782, "grad_norm": 0.1944931149482727, "learning_rate": 1.999131291630467e-05, "loss": 1.2856, "step": 1343 }, { "epoch": 0.5003781224185874, "grad_norm": 0.19236011803150177, "learning_rate": 1.9991262196305153e-05, "loss": 1.2963, "step": 1344 }, { "epoch": 0.5007504275691964, "grad_norm": 0.19329576194286346, "learning_rate": 1.999121132873532e-05, "loss": 1.3028, "step": 1345 }, { "epoch": 0.5011227327198055, "grad_norm": 0.17316564917564392, "learning_rate": 1.9991160313595924e-05, "loss": 1.2826, "step": 1346 }, { "epoch": 0.5014950378704145, "grad_norm": 0.19512668251991272, "learning_rate": 1.9991109150887715e-05, "loss": 1.3035, "step": 1347 }, { "epoch": 0.5018673430210236, "grad_norm": 0.19115027785301208, "learning_rate": 1.9991057840611453e-05, "loss": 1.3046, "step": 1348 }, { "epoch": 0.5022396481716327, "grad_norm": 0.19715119898319244, "learning_rate": 1.9991006382767892e-05, "loss": 1.3129, "step": 1349 }, { "epoch": 0.5026119533222417, "grad_norm": 0.18349628150463104, "learning_rate": 1.9990954777357795e-05, "loss": 1.3005, "step": 1350 }, { "epoch": 0.5029842584728508, "grad_norm": 0.19362014532089233, "learning_rate": 1.999090302438192e-05, "loss": 1.3151, "step": 1351 }, { "epoch": 0.5033565636234599, "grad_norm": 0.18228283524513245, "learning_rate": 1.999085112384104e-05, "loss": 1.3068, "step": 1352 }, { "epoch": 0.503728868774069, "grad_norm": 0.193775936961174, "learning_rate": 1.9990799075735912e-05, "loss": 1.3139, "step": 1353 }, { "epoch": 0.504101173924678, "grad_norm": 0.18167784810066223, "learning_rate": 1.999074688006731e-05, "loss": 1.2844, "step": 1354 }, { "epoch": 0.5044734790752871, "grad_norm": 0.19863297045230865, "learning_rate": 1.9990694536836002e-05, "loss": 1.2932, "step": 1355 }, { "epoch": 0.5048457842258961, "grad_norm": 0.189836323261261, "learning_rate": 1.9990642046042766e-05, "loss": 1.284, "step": 1356 }, { "epoch": 0.5052180893765053, "grad_norm": 0.19672173261642456, "learning_rate": 1.9990589407688373e-05, "loss": 1.3062, "step": 1357 }, { "epoch": 0.5055903945271143, "grad_norm": 0.18417510390281677, "learning_rate": 1.99905366217736e-05, "loss": 1.303, "step": 1358 }, { "epoch": 0.5059626996777233, "grad_norm": 0.18591108918190002, "learning_rate": 1.999048368829923e-05, "loss": 1.291, "step": 1359 }, { "epoch": 0.5063350048283324, "grad_norm": 0.1832229644060135, "learning_rate": 1.9990430607266038e-05, "loss": 1.2888, "step": 1360 }, { "epoch": 0.5067073099789415, "grad_norm": 0.21367454528808594, "learning_rate": 1.999037737867482e-05, "loss": 1.307, "step": 1361 }, { "epoch": 0.5070796151295506, "grad_norm": 0.18742305040359497, "learning_rate": 1.999032400252635e-05, "loss": 1.2955, "step": 1362 }, { "epoch": 0.5074519202801596, "grad_norm": 0.18082669377326965, "learning_rate": 1.9990270478821422e-05, "loss": 1.2794, "step": 1363 }, { "epoch": 0.5078242254307687, "grad_norm": 0.19011190533638, "learning_rate": 1.9990216807560827e-05, "loss": 1.2864, "step": 1364 }, { "epoch": 0.5081965305813778, "grad_norm": 0.18790535628795624, "learning_rate": 1.9990162988745357e-05, "loss": 1.2974, "step": 1365 }, { "epoch": 0.5085688357319869, "grad_norm": 0.19116294384002686, "learning_rate": 1.9990109022375807e-05, "loss": 1.2972, "step": 1366 }, { "epoch": 0.5089411408825959, "grad_norm": 0.19198502600193024, "learning_rate": 1.999005490845297e-05, "loss": 1.3173, "step": 1367 }, { "epoch": 0.5093134460332049, "grad_norm": 0.19381704926490784, "learning_rate": 1.9990000646977653e-05, "loss": 1.3012, "step": 1368 }, { "epoch": 0.509685751183814, "grad_norm": 0.1835579127073288, "learning_rate": 1.9989946237950653e-05, "loss": 1.3028, "step": 1369 }, { "epoch": 0.5100580563344231, "grad_norm": 0.18221747875213623, "learning_rate": 1.998989168137277e-05, "loss": 1.2992, "step": 1370 }, { "epoch": 0.5104303614850322, "grad_norm": 0.18950197100639343, "learning_rate": 1.998983697724482e-05, "loss": 1.2997, "step": 1371 }, { "epoch": 0.5108026666356412, "grad_norm": 0.18854863941669464, "learning_rate": 1.99897821255676e-05, "loss": 1.3083, "step": 1372 }, { "epoch": 0.5111749717862503, "grad_norm": 0.1852334886789322, "learning_rate": 1.9989727126341927e-05, "loss": 1.2923, "step": 1373 }, { "epoch": 0.5115472769368594, "grad_norm": 0.18260884284973145, "learning_rate": 1.998967197956861e-05, "loss": 1.2941, "step": 1374 }, { "epoch": 0.5119195820874685, "grad_norm": 0.1905146837234497, "learning_rate": 1.9989616685248468e-05, "loss": 1.3061, "step": 1375 }, { "epoch": 0.5122918872380775, "grad_norm": 0.17799590528011322, "learning_rate": 1.9989561243382313e-05, "loss": 1.305, "step": 1376 }, { "epoch": 0.5126641923886865, "grad_norm": 0.18852990865707397, "learning_rate": 1.9989505653970963e-05, "loss": 1.3001, "step": 1377 }, { "epoch": 0.5130364975392957, "grad_norm": 0.19264602661132812, "learning_rate": 1.9989449917015242e-05, "loss": 1.2991, "step": 1378 }, { "epoch": 0.5134088026899047, "grad_norm": 0.1937609314918518, "learning_rate": 1.9989394032515974e-05, "loss": 1.2826, "step": 1379 }, { "epoch": 0.5137811078405138, "grad_norm": 0.19785352051258087, "learning_rate": 1.9989338000473982e-05, "loss": 1.2972, "step": 1380 }, { "epoch": 0.5141534129911228, "grad_norm": 0.18624554574489594, "learning_rate": 1.9989281820890095e-05, "loss": 1.2628, "step": 1381 }, { "epoch": 0.5145257181417319, "grad_norm": 0.18756809830665588, "learning_rate": 1.9989225493765144e-05, "loss": 1.2842, "step": 1382 }, { "epoch": 0.514898023292341, "grad_norm": 0.18710888922214508, "learning_rate": 1.9989169019099956e-05, "loss": 1.2887, "step": 1383 }, { "epoch": 0.5152703284429501, "grad_norm": 0.17629897594451904, "learning_rate": 1.9989112396895374e-05, "loss": 1.2993, "step": 1384 }, { "epoch": 0.5156426335935591, "grad_norm": 0.18918879330158234, "learning_rate": 1.9989055627152222e-05, "loss": 1.304, "step": 1385 }, { "epoch": 0.5160149387441682, "grad_norm": 0.1894582360982895, "learning_rate": 1.998899870987135e-05, "loss": 1.2917, "step": 1386 }, { "epoch": 0.5163872438947773, "grad_norm": 0.18650588393211365, "learning_rate": 1.9988941645053594e-05, "loss": 1.2985, "step": 1387 }, { "epoch": 0.5167595490453863, "grad_norm": 0.17856687307357788, "learning_rate": 1.9988884432699795e-05, "loss": 1.2898, "step": 1388 }, { "epoch": 0.5171318541959954, "grad_norm": 0.19840949773788452, "learning_rate": 1.9988827072810798e-05, "loss": 1.2982, "step": 1389 }, { "epoch": 0.5175041593466044, "grad_norm": 0.18756093084812164, "learning_rate": 1.9988769565387454e-05, "loss": 1.294, "step": 1390 }, { "epoch": 0.5178764644972135, "grad_norm": 0.17636418342590332, "learning_rate": 1.9988711910430613e-05, "loss": 1.291, "step": 1391 }, { "epoch": 0.5182487696478226, "grad_norm": 0.18831880390644073, "learning_rate": 1.998865410794112e-05, "loss": 1.2831, "step": 1392 }, { "epoch": 0.5186210747984317, "grad_norm": 0.17440542578697205, "learning_rate": 1.9988596157919836e-05, "loss": 1.305, "step": 1393 }, { "epoch": 0.5189933799490407, "grad_norm": 0.18249642848968506, "learning_rate": 1.9988538060367612e-05, "loss": 1.2852, "step": 1394 }, { "epoch": 0.5193656850996498, "grad_norm": 0.18289236724376678, "learning_rate": 1.9988479815285308e-05, "loss": 1.2902, "step": 1395 }, { "epoch": 0.5197379902502589, "grad_norm": 0.18215063214302063, "learning_rate": 1.998842142267378e-05, "loss": 1.2896, "step": 1396 }, { "epoch": 0.520110295400868, "grad_norm": 0.18357405066490173, "learning_rate": 1.99883628825339e-05, "loss": 1.2959, "step": 1397 }, { "epoch": 0.520482600551477, "grad_norm": 0.18299397826194763, "learning_rate": 1.9988304194866527e-05, "loss": 1.3121, "step": 1398 }, { "epoch": 0.520854905702086, "grad_norm": 0.18076153099536896, "learning_rate": 1.9988245359672523e-05, "loss": 1.3005, "step": 1399 }, { "epoch": 0.5212272108526952, "grad_norm": 0.17696769535541534, "learning_rate": 1.9988186376952766e-05, "loss": 1.2815, "step": 1400 }, { "epoch": 0.5215995160033042, "grad_norm": 0.18042761087417603, "learning_rate": 1.998812724670812e-05, "loss": 1.2893, "step": 1401 }, { "epoch": 0.5219718211539133, "grad_norm": 0.1830262839794159, "learning_rate": 1.9988067968939463e-05, "loss": 1.2929, "step": 1402 }, { "epoch": 0.5223441263045223, "grad_norm": 0.18442308902740479, "learning_rate": 1.998800854364767e-05, "loss": 1.2998, "step": 1403 }, { "epoch": 0.5227164314551314, "grad_norm": 0.1778024435043335, "learning_rate": 1.9987948970833616e-05, "loss": 1.2871, "step": 1404 }, { "epoch": 0.5230887366057405, "grad_norm": 0.17833255231380463, "learning_rate": 1.9987889250498185e-05, "loss": 1.2935, "step": 1405 }, { "epoch": 0.5234610417563496, "grad_norm": 0.1755494475364685, "learning_rate": 1.9987829382642253e-05, "loss": 1.2908, "step": 1406 }, { "epoch": 0.5238333469069586, "grad_norm": 0.18358327448368073, "learning_rate": 1.998776936726671e-05, "loss": 1.2984, "step": 1407 }, { "epoch": 0.5242056520575676, "grad_norm": 0.1703406274318695, "learning_rate": 1.998770920437244e-05, "loss": 1.2906, "step": 1408 }, { "epoch": 0.5245779572081768, "grad_norm": 0.1967104971408844, "learning_rate": 1.9987648893960334e-05, "loss": 1.293, "step": 1409 }, { "epoch": 0.5249502623587858, "grad_norm": 0.19022947549819946, "learning_rate": 1.998758843603128e-05, "loss": 1.2919, "step": 1410 }, { "epoch": 0.5253225675093949, "grad_norm": 0.17558811604976654, "learning_rate": 1.9987527830586168e-05, "loss": 1.2928, "step": 1411 }, { "epoch": 0.5256948726600039, "grad_norm": 0.1828990876674652, "learning_rate": 1.99874670776259e-05, "loss": 1.2967, "step": 1412 }, { "epoch": 0.5260671778106131, "grad_norm": 0.18160808086395264, "learning_rate": 1.9987406177151368e-05, "loss": 1.2936, "step": 1413 }, { "epoch": 0.5264394829612221, "grad_norm": 0.1777198314666748, "learning_rate": 1.9987345129163472e-05, "loss": 1.2832, "step": 1414 }, { "epoch": 0.5268117881118312, "grad_norm": 0.1877478063106537, "learning_rate": 1.998728393366312e-05, "loss": 1.3125, "step": 1415 }, { "epoch": 0.5271840932624402, "grad_norm": 0.17702911794185638, "learning_rate": 1.9987222590651206e-05, "loss": 1.2747, "step": 1416 }, { "epoch": 0.5275563984130492, "grad_norm": 0.19214600324630737, "learning_rate": 1.9987161100128646e-05, "loss": 1.2897, "step": 1417 }, { "epoch": 0.5279287035636584, "grad_norm": 0.18091824650764465, "learning_rate": 1.9987099462096342e-05, "loss": 1.29, "step": 1418 }, { "epoch": 0.5283010087142674, "grad_norm": 0.19012725353240967, "learning_rate": 1.9987037676555205e-05, "loss": 1.2834, "step": 1419 }, { "epoch": 0.5286733138648765, "grad_norm": 0.18362641334533691, "learning_rate": 1.9986975743506146e-05, "loss": 1.2939, "step": 1420 }, { "epoch": 0.5290456190154855, "grad_norm": 0.17870792746543884, "learning_rate": 1.9986913662950084e-05, "loss": 1.2931, "step": 1421 }, { "epoch": 0.5294179241660947, "grad_norm": 0.16860565543174744, "learning_rate": 1.9986851434887934e-05, "loss": 1.3015, "step": 1422 }, { "epoch": 0.5297902293167037, "grad_norm": 0.18669748306274414, "learning_rate": 1.9986789059320614e-05, "loss": 1.2966, "step": 1423 }, { "epoch": 0.5301625344673128, "grad_norm": 0.1844332367181778, "learning_rate": 1.998672653624905e-05, "loss": 1.2936, "step": 1424 }, { "epoch": 0.5305348396179218, "grad_norm": 0.18035277724266052, "learning_rate": 1.998666386567416e-05, "loss": 1.2964, "step": 1425 }, { "epoch": 0.530907144768531, "grad_norm": 0.19097861647605896, "learning_rate": 1.998660104759687e-05, "loss": 1.305, "step": 1426 }, { "epoch": 0.53127944991914, "grad_norm": 0.17276540398597717, "learning_rate": 1.998653808201811e-05, "loss": 1.3028, "step": 1427 }, { "epoch": 0.531651755069749, "grad_norm": 0.19778695702552795, "learning_rate": 1.9986474968938808e-05, "loss": 1.2907, "step": 1428 }, { "epoch": 0.5320240602203581, "grad_norm": 0.18655353784561157, "learning_rate": 1.99864117083599e-05, "loss": 1.2774, "step": 1429 }, { "epoch": 0.5323963653709671, "grad_norm": 0.18784590065479279, "learning_rate": 1.9986348300282318e-05, "loss": 1.2953, "step": 1430 }, { "epoch": 0.5327686705215763, "grad_norm": 0.17535433173179626, "learning_rate": 1.9986284744706995e-05, "loss": 1.2927, "step": 1431 }, { "epoch": 0.5331409756721853, "grad_norm": 0.1903795450925827, "learning_rate": 1.9986221041634874e-05, "loss": 1.2778, "step": 1432 }, { "epoch": 0.5335132808227944, "grad_norm": 0.1824638694524765, "learning_rate": 1.9986157191066897e-05, "loss": 1.2905, "step": 1433 }, { "epoch": 0.5338855859734034, "grad_norm": 0.17660009860992432, "learning_rate": 1.9986093193004005e-05, "loss": 1.2794, "step": 1434 }, { "epoch": 0.5342578911240126, "grad_norm": 0.18862681090831757, "learning_rate": 1.998602904744714e-05, "loss": 1.2997, "step": 1435 }, { "epoch": 0.5346301962746216, "grad_norm": 0.18798676133155823, "learning_rate": 1.9985964754397256e-05, "loss": 1.2973, "step": 1436 }, { "epoch": 0.5350025014252306, "grad_norm": 0.18016012012958527, "learning_rate": 1.9985900313855297e-05, "loss": 1.2902, "step": 1437 }, { "epoch": 0.5353748065758397, "grad_norm": 0.18391664326190948, "learning_rate": 1.9985835725822217e-05, "loss": 1.2734, "step": 1438 }, { "epoch": 0.5357471117264488, "grad_norm": 0.2144862860441208, "learning_rate": 1.998577099029897e-05, "loss": 1.2937, "step": 1439 }, { "epoch": 0.5361194168770579, "grad_norm": 0.18601621687412262, "learning_rate": 1.9985706107286515e-05, "loss": 1.2836, "step": 1440 }, { "epoch": 0.5364917220276669, "grad_norm": 0.18405480682849884, "learning_rate": 1.9985641076785806e-05, "loss": 1.2814, "step": 1441 }, { "epoch": 0.536864027178276, "grad_norm": 0.19651199877262115, "learning_rate": 1.9985575898797803e-05, "loss": 1.2892, "step": 1442 }, { "epoch": 0.537236332328885, "grad_norm": 0.18740807473659515, "learning_rate": 1.9985510573323474e-05, "loss": 1.3043, "step": 1443 }, { "epoch": 0.5376086374794942, "grad_norm": 0.17624454200267792, "learning_rate": 1.998544510036378e-05, "loss": 1.2874, "step": 1444 }, { "epoch": 0.5379809426301032, "grad_norm": 0.19030825793743134, "learning_rate": 1.9985379479919685e-05, "loss": 1.2915, "step": 1445 }, { "epoch": 0.5383532477807123, "grad_norm": 0.18553490936756134, "learning_rate": 1.9985313711992164e-05, "loss": 1.3027, "step": 1446 }, { "epoch": 0.5387255529313213, "grad_norm": 0.17176979780197144, "learning_rate": 1.998524779658219e-05, "loss": 1.2953, "step": 1447 }, { "epoch": 0.5390978580819304, "grad_norm": 0.1822216957807541, "learning_rate": 1.9985181733690728e-05, "loss": 1.2795, "step": 1448 }, { "epoch": 0.5394701632325395, "grad_norm": 0.18704013526439667, "learning_rate": 1.9985115523318758e-05, "loss": 1.2843, "step": 1449 }, { "epoch": 0.5398424683831485, "grad_norm": 0.18013562262058258, "learning_rate": 1.998504916546726e-05, "loss": 1.2947, "step": 1450 }, { "epoch": 0.5402147735337576, "grad_norm": 0.1960912048816681, "learning_rate": 1.998498266013721e-05, "loss": 1.3077, "step": 1451 }, { "epoch": 0.5405870786843667, "grad_norm": 0.18962568044662476, "learning_rate": 1.9984916007329596e-05, "loss": 1.2998, "step": 1452 }, { "epoch": 0.5409593838349758, "grad_norm": 0.17837461829185486, "learning_rate": 1.99848492070454e-05, "loss": 1.2742, "step": 1453 }, { "epoch": 0.5413316889855848, "grad_norm": 0.17769278585910797, "learning_rate": 1.9984782259285604e-05, "loss": 1.29, "step": 1454 }, { "epoch": 0.5417039941361939, "grad_norm": 0.19605344533920288, "learning_rate": 1.9984715164051203e-05, "loss": 1.2954, "step": 1455 }, { "epoch": 0.5420762992868029, "grad_norm": 0.18324050307273865, "learning_rate": 1.9984647921343185e-05, "loss": 1.3072, "step": 1456 }, { "epoch": 0.542448604437412, "grad_norm": 0.18658418953418732, "learning_rate": 1.9984580531162544e-05, "loss": 1.3097, "step": 1457 }, { "epoch": 0.5428209095880211, "grad_norm": 0.1968865692615509, "learning_rate": 1.9984512993510275e-05, "loss": 1.2943, "step": 1458 }, { "epoch": 0.5431932147386301, "grad_norm": 0.19187164306640625, "learning_rate": 1.9984445308387377e-05, "loss": 1.29, "step": 1459 }, { "epoch": 0.5435655198892392, "grad_norm": 0.18562346696853638, "learning_rate": 1.9984377475794847e-05, "loss": 1.2895, "step": 1460 }, { "epoch": 0.5439378250398483, "grad_norm": 0.18785440921783447, "learning_rate": 1.998430949573369e-05, "loss": 1.3003, "step": 1461 }, { "epoch": 0.5443101301904574, "grad_norm": 0.17857135832309723, "learning_rate": 1.9984241368204907e-05, "loss": 1.2806, "step": 1462 }, { "epoch": 0.5446824353410664, "grad_norm": 0.18769045174121857, "learning_rate": 1.99841730932095e-05, "loss": 1.2866, "step": 1463 }, { "epoch": 0.5450547404916755, "grad_norm": 0.1790071427822113, "learning_rate": 1.9984104670748493e-05, "loss": 1.2814, "step": 1464 }, { "epoch": 0.5454270456422845, "grad_norm": 0.19664452970027924, "learning_rate": 1.998403610082288e-05, "loss": 1.2965, "step": 1465 }, { "epoch": 0.5457993507928937, "grad_norm": 0.19964231550693512, "learning_rate": 1.9983967383433685e-05, "loss": 1.2875, "step": 1466 }, { "epoch": 0.5461716559435027, "grad_norm": 0.1774752289056778, "learning_rate": 1.9983898518581913e-05, "loss": 1.2741, "step": 1467 }, { "epoch": 0.5465439610941117, "grad_norm": 0.18312522768974304, "learning_rate": 1.998382950626859e-05, "loss": 1.2932, "step": 1468 }, { "epoch": 0.5469162662447208, "grad_norm": 0.19389499723911285, "learning_rate": 1.998376034649473e-05, "loss": 1.2921, "step": 1469 }, { "epoch": 0.5472885713953299, "grad_norm": 0.22545579075813293, "learning_rate": 1.9983691039261358e-05, "loss": 1.3011, "step": 1470 }, { "epoch": 0.547660876545939, "grad_norm": 0.17456892132759094, "learning_rate": 1.9983621584569496e-05, "loss": 1.2762, "step": 1471 }, { "epoch": 0.548033181696548, "grad_norm": 0.20221838355064392, "learning_rate": 1.9983551982420168e-05, "loss": 1.2822, "step": 1472 }, { "epoch": 0.5484054868471571, "grad_norm": 0.17790015041828156, "learning_rate": 1.9983482232814405e-05, "loss": 1.2854, "step": 1473 }, { "epoch": 0.5487777919977662, "grad_norm": 0.19563356041908264, "learning_rate": 1.9983412335753237e-05, "loss": 1.2901, "step": 1474 }, { "epoch": 0.5491500971483753, "grad_norm": 0.17833010852336884, "learning_rate": 1.9983342291237693e-05, "loss": 1.2993, "step": 1475 }, { "epoch": 0.5495224022989843, "grad_norm": 0.18136514723300934, "learning_rate": 1.998327209926881e-05, "loss": 1.2829, "step": 1476 }, { "epoch": 0.5498947074495933, "grad_norm": 0.19292525947093964, "learning_rate": 1.9983201759847627e-05, "loss": 1.2967, "step": 1477 }, { "epoch": 0.5502670126002024, "grad_norm": 0.1802646517753601, "learning_rate": 1.9983131272975178e-05, "loss": 1.2871, "step": 1478 }, { "epoch": 0.5506393177508115, "grad_norm": 0.18988975882530212, "learning_rate": 1.9983060638652507e-05, "loss": 1.2881, "step": 1479 }, { "epoch": 0.5510116229014206, "grad_norm": 0.18343466520309448, "learning_rate": 1.9982989856880655e-05, "loss": 1.2826, "step": 1480 }, { "epoch": 0.5513839280520296, "grad_norm": 0.18973736464977264, "learning_rate": 1.9982918927660676e-05, "loss": 1.277, "step": 1481 }, { "epoch": 0.5517562332026387, "grad_norm": 0.1893860101699829, "learning_rate": 1.9982847850993605e-05, "loss": 1.2789, "step": 1482 }, { "epoch": 0.5521285383532478, "grad_norm": 0.17867526412010193, "learning_rate": 1.9982776626880498e-05, "loss": 1.2916, "step": 1483 }, { "epoch": 0.5525008435038569, "grad_norm": 0.1821223944425583, "learning_rate": 1.998270525532241e-05, "loss": 1.2678, "step": 1484 }, { "epoch": 0.5528731486544659, "grad_norm": 0.18600653111934662, "learning_rate": 1.998263373632039e-05, "loss": 1.2878, "step": 1485 }, { "epoch": 0.553245453805075, "grad_norm": 0.17727398872375488, "learning_rate": 1.9982562069875495e-05, "loss": 1.293, "step": 1486 }, { "epoch": 0.5536177589556841, "grad_norm": 0.1803734004497528, "learning_rate": 1.9982490255988786e-05, "loss": 1.2873, "step": 1487 }, { "epoch": 0.5539900641062931, "grad_norm": 0.1834772229194641, "learning_rate": 1.9982418294661322e-05, "loss": 1.2884, "step": 1488 }, { "epoch": 0.5543623692569022, "grad_norm": 0.1872669756412506, "learning_rate": 1.998234618589417e-05, "loss": 1.2798, "step": 1489 }, { "epoch": 0.5547346744075112, "grad_norm": 0.18299680948257446, "learning_rate": 1.9982273929688384e-05, "loss": 1.3008, "step": 1490 }, { "epoch": 0.5551069795581203, "grad_norm": 0.1841609627008438, "learning_rate": 1.9982201526045044e-05, "loss": 1.2893, "step": 1491 }, { "epoch": 0.5554792847087294, "grad_norm": 0.19560469686985016, "learning_rate": 1.9982128974965215e-05, "loss": 1.2846, "step": 1492 }, { "epoch": 0.5558515898593385, "grad_norm": 0.17704376578330994, "learning_rate": 1.998205627644996e-05, "loss": 1.2972, "step": 1493 }, { "epoch": 0.5562238950099475, "grad_norm": 0.18441899120807648, "learning_rate": 1.9981983430500368e-05, "loss": 1.3004, "step": 1494 }, { "epoch": 0.5565962001605566, "grad_norm": 0.19001679122447968, "learning_rate": 1.9981910437117502e-05, "loss": 1.2903, "step": 1495 }, { "epoch": 0.5569685053111657, "grad_norm": 0.19117416441440582, "learning_rate": 1.998183729630245e-05, "loss": 1.287, "step": 1496 }, { "epoch": 0.5573408104617747, "grad_norm": 0.1872899830341339, "learning_rate": 1.9981764008056283e-05, "loss": 1.291, "step": 1497 }, { "epoch": 0.5577131156123838, "grad_norm": 0.19934000074863434, "learning_rate": 1.998169057238009e-05, "loss": 1.2897, "step": 1498 }, { "epoch": 0.5580854207629928, "grad_norm": 0.1839592605829239, "learning_rate": 1.9981616989274955e-05, "loss": 1.2772, "step": 1499 }, { "epoch": 0.558457725913602, "grad_norm": 0.18648071587085724, "learning_rate": 1.998154325874196e-05, "loss": 1.291, "step": 1500 }, { "epoch": 0.558457725913602, "eval_loss": 1.3492937088012695, "eval_runtime": 16.1571, "eval_samples_per_second": 107.321, "eval_steps_per_second": 5.385, "step": 1500 }, { "epoch": 0.558830031064211, "grad_norm": 0.18601854145526886, "learning_rate": 1.9981469380782205e-05, "loss": 1.2898, "step": 1501 }, { "epoch": 0.5592023362148201, "grad_norm": 0.18247583508491516, "learning_rate": 1.9981395355396764e-05, "loss": 1.276, "step": 1502 }, { "epoch": 0.5595746413654291, "grad_norm": 0.19537685811519623, "learning_rate": 1.9981321182586746e-05, "loss": 1.2851, "step": 1503 }, { "epoch": 0.5599469465160382, "grad_norm": 0.17951083183288574, "learning_rate": 1.998124686235324e-05, "loss": 1.2892, "step": 1504 }, { "epoch": 0.5603192516666473, "grad_norm": 0.1824089139699936, "learning_rate": 1.998117239469734e-05, "loss": 1.285, "step": 1505 }, { "epoch": 0.5606915568172564, "grad_norm": 0.19962851703166962, "learning_rate": 1.9981097779620156e-05, "loss": 1.2866, "step": 1506 }, { "epoch": 0.5610638619678654, "grad_norm": 0.17792317271232605, "learning_rate": 1.998102301712278e-05, "loss": 1.281, "step": 1507 }, { "epoch": 0.5614361671184744, "grad_norm": 0.20243586599826813, "learning_rate": 1.9980948107206323e-05, "loss": 1.3045, "step": 1508 }, { "epoch": 0.5618084722690836, "grad_norm": 0.1958632916212082, "learning_rate": 1.998087304987189e-05, "loss": 1.2902, "step": 1509 }, { "epoch": 0.5621807774196926, "grad_norm": 0.17324163019657135, "learning_rate": 1.9980797845120583e-05, "loss": 1.2868, "step": 1510 }, { "epoch": 0.5625530825703017, "grad_norm": 0.18130120635032654, "learning_rate": 1.998072249295352e-05, "loss": 1.289, "step": 1511 }, { "epoch": 0.5629253877209107, "grad_norm": 0.18327681720256805, "learning_rate": 1.9980646993371816e-05, "loss": 1.269, "step": 1512 }, { "epoch": 0.5632976928715199, "grad_norm": 0.1806684136390686, "learning_rate": 1.998057134637658e-05, "loss": 1.2989, "step": 1513 }, { "epoch": 0.5636699980221289, "grad_norm": 0.18309220671653748, "learning_rate": 1.998049555196893e-05, "loss": 1.287, "step": 1514 }, { "epoch": 0.564042303172738, "grad_norm": 0.19237829744815826, "learning_rate": 1.998041961014999e-05, "loss": 1.2894, "step": 1515 }, { "epoch": 0.564414608323347, "grad_norm": 0.1768263727426529, "learning_rate": 1.998034352092088e-05, "loss": 1.2811, "step": 1516 }, { "epoch": 0.564786913473956, "grad_norm": 0.17326796054840088, "learning_rate": 1.9980267284282718e-05, "loss": 1.2783, "step": 1517 }, { "epoch": 0.5651592186245652, "grad_norm": 0.18072554469108582, "learning_rate": 1.9980190900236637e-05, "loss": 1.2742, "step": 1518 }, { "epoch": 0.5655315237751742, "grad_norm": 0.17865929007530212, "learning_rate": 1.998011436878376e-05, "loss": 1.281, "step": 1519 }, { "epoch": 0.5659038289257833, "grad_norm": 0.18478450179100037, "learning_rate": 1.998003768992522e-05, "loss": 1.2918, "step": 1520 }, { "epoch": 0.5662761340763923, "grad_norm": 0.18366698920726776, "learning_rate": 1.9979960863662155e-05, "loss": 1.2721, "step": 1521 }, { "epoch": 0.5666484392270015, "grad_norm": 0.17557835578918457, "learning_rate": 1.997988388999569e-05, "loss": 1.2887, "step": 1522 }, { "epoch": 0.5670207443776105, "grad_norm": 0.19200293719768524, "learning_rate": 1.997980676892697e-05, "loss": 1.2778, "step": 1523 }, { "epoch": 0.5673930495282196, "grad_norm": 0.18071214854717255, "learning_rate": 1.9979729500457125e-05, "loss": 1.2984, "step": 1524 }, { "epoch": 0.5677653546788286, "grad_norm": 0.18595604598522186, "learning_rate": 1.9979652084587305e-05, "loss": 1.2879, "step": 1525 }, { "epoch": 0.5681376598294378, "grad_norm": 0.17925933003425598, "learning_rate": 1.9979574521318648e-05, "loss": 1.2851, "step": 1526 }, { "epoch": 0.5685099649800468, "grad_norm": 0.1706983745098114, "learning_rate": 1.9979496810652303e-05, "loss": 1.2867, "step": 1527 }, { "epoch": 0.5688822701306558, "grad_norm": 0.1855979859828949, "learning_rate": 1.9979418952589417e-05, "loss": 1.3028, "step": 1528 }, { "epoch": 0.5692545752812649, "grad_norm": 0.1966056227684021, "learning_rate": 1.997934094713114e-05, "loss": 1.2808, "step": 1529 }, { "epoch": 0.5696268804318739, "grad_norm": 0.1873730570077896, "learning_rate": 1.997926279427862e-05, "loss": 1.2833, "step": 1530 }, { "epoch": 0.5699991855824831, "grad_norm": 0.19508777558803558, "learning_rate": 1.9979184494033016e-05, "loss": 1.2821, "step": 1531 }, { "epoch": 0.5703714907330921, "grad_norm": 0.17242860794067383, "learning_rate": 1.9979106046395487e-05, "loss": 1.2833, "step": 1532 }, { "epoch": 0.5707437958837012, "grad_norm": 0.1809634268283844, "learning_rate": 1.9979027451367185e-05, "loss": 1.2796, "step": 1533 }, { "epoch": 0.5711161010343102, "grad_norm": 0.17092421650886536, "learning_rate": 1.9978948708949274e-05, "loss": 1.2873, "step": 1534 }, { "epoch": 0.5714884061849194, "grad_norm": 0.19138309359550476, "learning_rate": 1.9978869819142915e-05, "loss": 1.2923, "step": 1535 }, { "epoch": 0.5718607113355284, "grad_norm": 0.18097564578056335, "learning_rate": 1.997879078194928e-05, "loss": 1.2754, "step": 1536 }, { "epoch": 0.5722330164861374, "grad_norm": 0.1787206530570984, "learning_rate": 1.9978711597369528e-05, "loss": 1.2976, "step": 1537 }, { "epoch": 0.5726053216367465, "grad_norm": 0.1908443570137024, "learning_rate": 1.997863226540483e-05, "loss": 1.2946, "step": 1538 }, { "epoch": 0.5729776267873555, "grad_norm": 0.1926680952310562, "learning_rate": 1.9978552786056364e-05, "loss": 1.2935, "step": 1539 }, { "epoch": 0.5733499319379647, "grad_norm": 0.18011966347694397, "learning_rate": 1.9978473159325296e-05, "loss": 1.2915, "step": 1540 }, { "epoch": 0.5737222370885737, "grad_norm": 0.17944326996803284, "learning_rate": 1.997839338521281e-05, "loss": 1.2818, "step": 1541 }, { "epoch": 0.5740945422391828, "grad_norm": 0.1845857799053192, "learning_rate": 1.9978313463720073e-05, "loss": 1.2721, "step": 1542 }, { "epoch": 0.5744668473897918, "grad_norm": 0.18156015872955322, "learning_rate": 1.997823339484828e-05, "loss": 1.2868, "step": 1543 }, { "epoch": 0.574839152540401, "grad_norm": 0.18522031605243683, "learning_rate": 1.99781531785986e-05, "loss": 1.2795, "step": 1544 }, { "epoch": 0.57521145769101, "grad_norm": 0.20020869374275208, "learning_rate": 1.9978072814972226e-05, "loss": 1.3067, "step": 1545 }, { "epoch": 0.575583762841619, "grad_norm": 0.17684873938560486, "learning_rate": 1.9977992303970342e-05, "loss": 1.2903, "step": 1546 }, { "epoch": 0.5759560679922281, "grad_norm": 0.18267862498760223, "learning_rate": 1.997791164559414e-05, "loss": 1.2972, "step": 1547 }, { "epoch": 0.5763283731428372, "grad_norm": 0.17601077258586884, "learning_rate": 1.9977830839844808e-05, "loss": 1.278, "step": 1548 }, { "epoch": 0.5767006782934463, "grad_norm": 0.18240796029567719, "learning_rate": 1.997774988672354e-05, "loss": 1.2753, "step": 1549 }, { "epoch": 0.5770729834440553, "grad_norm": 0.19379922747612, "learning_rate": 1.9977668786231536e-05, "loss": 1.2932, "step": 1550 }, { "epoch": 0.5774452885946644, "grad_norm": 0.19803746044635773, "learning_rate": 1.9977587538369985e-05, "loss": 1.2876, "step": 1551 }, { "epoch": 0.5778175937452734, "grad_norm": 0.19004493951797485, "learning_rate": 1.9977506143140094e-05, "loss": 1.2687, "step": 1552 }, { "epoch": 0.5781898988958826, "grad_norm": 0.18594302237033844, "learning_rate": 1.9977424600543065e-05, "loss": 1.2875, "step": 1553 }, { "epoch": 0.5785622040464916, "grad_norm": 0.193922758102417, "learning_rate": 1.9977342910580097e-05, "loss": 1.2779, "step": 1554 }, { "epoch": 0.5789345091971007, "grad_norm": 0.19569186866283417, "learning_rate": 1.9977261073252405e-05, "loss": 1.2926, "step": 1555 }, { "epoch": 0.5793068143477097, "grad_norm": 0.18837492167949677, "learning_rate": 1.9977179088561193e-05, "loss": 1.2963, "step": 1556 }, { "epoch": 0.5796791194983189, "grad_norm": 0.17669405043125153, "learning_rate": 1.9977096956507668e-05, "loss": 1.2761, "step": 1557 }, { "epoch": 0.5800514246489279, "grad_norm": 0.191486656665802, "learning_rate": 1.997701467709305e-05, "loss": 1.2913, "step": 1558 }, { "epoch": 0.5804237297995369, "grad_norm": 0.18390990793704987, "learning_rate": 1.997693225031855e-05, "loss": 1.2948, "step": 1559 }, { "epoch": 0.580796034950146, "grad_norm": 0.18185609579086304, "learning_rate": 1.9976849676185384e-05, "loss": 1.283, "step": 1560 }, { "epoch": 0.5811683401007551, "grad_norm": 0.1948121190071106, "learning_rate": 1.997676695469478e-05, "loss": 1.2848, "step": 1561 }, { "epoch": 0.5815406452513642, "grad_norm": 0.18750061094760895, "learning_rate": 1.997668408584795e-05, "loss": 1.2828, "step": 1562 }, { "epoch": 0.5819129504019732, "grad_norm": 0.1849110871553421, "learning_rate": 1.9976601069646126e-05, "loss": 1.2883, "step": 1563 }, { "epoch": 0.5822852555525823, "grad_norm": 0.18599045276641846, "learning_rate": 1.9976517906090528e-05, "loss": 1.2742, "step": 1564 }, { "epoch": 0.5826575607031913, "grad_norm": 0.1824532300233841, "learning_rate": 1.997643459518239e-05, "loss": 1.2835, "step": 1565 }, { "epoch": 0.5830298658538005, "grad_norm": 0.18060551583766937, "learning_rate": 1.9976351136922934e-05, "loss": 1.2756, "step": 1566 }, { "epoch": 0.5834021710044095, "grad_norm": 0.18784838914871216, "learning_rate": 1.99762675313134e-05, "loss": 1.2935, "step": 1567 }, { "epoch": 0.5837744761550185, "grad_norm": 0.25187990069389343, "learning_rate": 1.9976183778355018e-05, "loss": 1.2882, "step": 1568 }, { "epoch": 0.5841467813056276, "grad_norm": 0.17487937211990356, "learning_rate": 1.997609987804903e-05, "loss": 1.2734, "step": 1569 }, { "epoch": 0.5845190864562367, "grad_norm": 0.18456514179706573, "learning_rate": 1.9976015830396676e-05, "loss": 1.2816, "step": 1570 }, { "epoch": 0.5848913916068458, "grad_norm": 0.18243633210659027, "learning_rate": 1.997593163539919e-05, "loss": 1.2809, "step": 1571 }, { "epoch": 0.5852636967574548, "grad_norm": 0.17159947752952576, "learning_rate": 1.9975847293057822e-05, "loss": 1.274, "step": 1572 }, { "epoch": 0.5856360019080639, "grad_norm": 0.18149283528327942, "learning_rate": 1.9975762803373815e-05, "loss": 1.2842, "step": 1573 }, { "epoch": 0.586008307058673, "grad_norm": 0.19415758550167084, "learning_rate": 1.9975678166348417e-05, "loss": 1.2903, "step": 1574 }, { "epoch": 0.5863806122092821, "grad_norm": 0.18272031843662262, "learning_rate": 1.9975593381982877e-05, "loss": 1.2931, "step": 1575 }, { "epoch": 0.5867529173598911, "grad_norm": 0.19155685603618622, "learning_rate": 1.997550845027845e-05, "loss": 1.2876, "step": 1576 }, { "epoch": 0.5871252225105001, "grad_norm": 0.17365863919258118, "learning_rate": 1.9975423371236392e-05, "loss": 1.2757, "step": 1577 }, { "epoch": 0.5874975276611092, "grad_norm": 0.1818249225616455, "learning_rate": 1.9975338144857954e-05, "loss": 1.2853, "step": 1578 }, { "epoch": 0.5878698328117183, "grad_norm": 0.17870758473873138, "learning_rate": 1.99752527711444e-05, "loss": 1.2682, "step": 1579 }, { "epoch": 0.5882421379623274, "grad_norm": 0.18116502463817596, "learning_rate": 1.9975167250096985e-05, "loss": 1.2774, "step": 1580 }, { "epoch": 0.5886144431129364, "grad_norm": 0.17226383090019226, "learning_rate": 1.997508158171698e-05, "loss": 1.2666, "step": 1581 }, { "epoch": 0.5889867482635455, "grad_norm": 0.18359726667404175, "learning_rate": 1.9974995766005644e-05, "loss": 1.2794, "step": 1582 }, { "epoch": 0.5893590534141546, "grad_norm": 0.21769051253795624, "learning_rate": 1.9974909802964244e-05, "loss": 1.2836, "step": 1583 }, { "epoch": 0.5897313585647637, "grad_norm": 0.18334870040416718, "learning_rate": 1.9974823692594054e-05, "loss": 1.2834, "step": 1584 }, { "epoch": 0.5901036637153727, "grad_norm": 0.18716122210025787, "learning_rate": 1.9974737434896346e-05, "loss": 1.2893, "step": 1585 }, { "epoch": 0.5904759688659817, "grad_norm": 0.21679291129112244, "learning_rate": 1.997465102987239e-05, "loss": 1.264, "step": 1586 }, { "epoch": 0.5908482740165909, "grad_norm": 0.1788294017314911, "learning_rate": 1.9974564477523462e-05, "loss": 1.2765, "step": 1587 }, { "epoch": 0.5912205791671999, "grad_norm": 0.18102407455444336, "learning_rate": 1.9974477777850847e-05, "loss": 1.2636, "step": 1588 }, { "epoch": 0.591592884317809, "grad_norm": 0.17994816601276398, "learning_rate": 1.997439093085582e-05, "loss": 1.2746, "step": 1589 }, { "epoch": 0.591965189468418, "grad_norm": 0.18664704263210297, "learning_rate": 1.9974303936539665e-05, "loss": 1.2926, "step": 1590 }, { "epoch": 0.5923374946190271, "grad_norm": 0.18071360886096954, "learning_rate": 1.9974216794903666e-05, "loss": 1.2776, "step": 1591 }, { "epoch": 0.5927097997696362, "grad_norm": 0.19198505580425262, "learning_rate": 1.9974129505949112e-05, "loss": 1.2895, "step": 1592 }, { "epoch": 0.5930821049202453, "grad_norm": 0.1870136260986328, "learning_rate": 1.997404206967729e-05, "loss": 1.2733, "step": 1593 }, { "epoch": 0.5934544100708543, "grad_norm": 0.18357659876346588, "learning_rate": 1.9973954486089494e-05, "loss": 1.2859, "step": 1594 }, { "epoch": 0.5938267152214634, "grad_norm": 0.20039206743240356, "learning_rate": 1.9973866755187012e-05, "loss": 1.2969, "step": 1595 }, { "epoch": 0.5941990203720725, "grad_norm": 0.1847948431968689, "learning_rate": 1.997377887697115e-05, "loss": 1.2811, "step": 1596 }, { "epoch": 0.5945713255226815, "grad_norm": 0.18077334761619568, "learning_rate": 1.9973690851443198e-05, "loss": 1.2874, "step": 1597 }, { "epoch": 0.5949436306732906, "grad_norm": 0.17171575129032135, "learning_rate": 1.9973602678604454e-05, "loss": 1.2942, "step": 1598 }, { "epoch": 0.5953159358238996, "grad_norm": 0.17874233424663544, "learning_rate": 1.9973514358456228e-05, "loss": 1.2938, "step": 1599 }, { "epoch": 0.5956882409745088, "grad_norm": 0.19295063614845276, "learning_rate": 1.997342589099982e-05, "loss": 1.3016, "step": 1600 }, { "epoch": 0.5960605461251178, "grad_norm": 0.18213194608688354, "learning_rate": 1.9973337276236538e-05, "loss": 1.2787, "step": 1601 }, { "epoch": 0.5964328512757269, "grad_norm": 0.18399560451507568, "learning_rate": 1.997324851416769e-05, "loss": 1.2824, "step": 1602 }, { "epoch": 0.5968051564263359, "grad_norm": 0.184067040681839, "learning_rate": 1.9973159604794587e-05, "loss": 1.2785, "step": 1603 }, { "epoch": 0.597177461576945, "grad_norm": 0.18284447491168976, "learning_rate": 1.9973070548118545e-05, "loss": 1.261, "step": 1604 }, { "epoch": 0.5975497667275541, "grad_norm": 0.18606549501419067, "learning_rate": 1.9972981344140875e-05, "loss": 1.2841, "step": 1605 }, { "epoch": 0.5979220718781632, "grad_norm": 0.18835964798927307, "learning_rate": 1.9972891992862895e-05, "loss": 1.28, "step": 1606 }, { "epoch": 0.5982943770287722, "grad_norm": 0.18669797480106354, "learning_rate": 1.997280249428593e-05, "loss": 1.2893, "step": 1607 }, { "epoch": 0.5986666821793812, "grad_norm": 0.19121499359607697, "learning_rate": 1.9972712848411292e-05, "loss": 1.2847, "step": 1608 }, { "epoch": 0.5990389873299904, "grad_norm": 0.18842531740665436, "learning_rate": 1.9972623055240316e-05, "loss": 1.2856, "step": 1609 }, { "epoch": 0.5994112924805994, "grad_norm": 0.18126575648784637, "learning_rate": 1.9972533114774322e-05, "loss": 1.2767, "step": 1610 }, { "epoch": 0.5997835976312085, "grad_norm": 0.17144346237182617, "learning_rate": 1.997244302701464e-05, "loss": 1.2738, "step": 1611 }, { "epoch": 0.6001559027818175, "grad_norm": 0.17671994864940643, "learning_rate": 1.99723527919626e-05, "loss": 1.2693, "step": 1612 }, { "epoch": 0.6005282079324266, "grad_norm": 0.17835721373558044, "learning_rate": 1.9972262409619534e-05, "loss": 1.2738, "step": 1613 }, { "epoch": 0.6009005130830357, "grad_norm": 0.17789305746555328, "learning_rate": 1.997217187998678e-05, "loss": 1.2877, "step": 1614 }, { "epoch": 0.6012728182336448, "grad_norm": 0.18692655861377716, "learning_rate": 1.9972081203065672e-05, "loss": 1.2754, "step": 1615 }, { "epoch": 0.6016451233842538, "grad_norm": 0.1710839867591858, "learning_rate": 1.997199037885755e-05, "loss": 1.2831, "step": 1616 }, { "epoch": 0.6020174285348628, "grad_norm": 0.1843128800392151, "learning_rate": 1.9971899407363757e-05, "loss": 1.281, "step": 1617 }, { "epoch": 0.602389733685472, "grad_norm": 0.19229008257389069, "learning_rate": 1.9971808288585636e-05, "loss": 1.3007, "step": 1618 }, { "epoch": 0.602762038836081, "grad_norm": 0.18288210034370422, "learning_rate": 1.997171702252453e-05, "loss": 1.2852, "step": 1619 }, { "epoch": 0.6031343439866901, "grad_norm": 0.18607817590236664, "learning_rate": 1.997162560918179e-05, "loss": 1.2698, "step": 1620 }, { "epoch": 0.6035066491372991, "grad_norm": 0.1832936704158783, "learning_rate": 1.997153404855877e-05, "loss": 1.2929, "step": 1621 }, { "epoch": 0.6038789542879083, "grad_norm": 0.1910560131072998, "learning_rate": 1.9971442340656812e-05, "loss": 1.2891, "step": 1622 }, { "epoch": 0.6042512594385173, "grad_norm": 0.19132542610168457, "learning_rate": 1.997135048547728e-05, "loss": 1.2862, "step": 1623 }, { "epoch": 0.6046235645891264, "grad_norm": 0.18313977122306824, "learning_rate": 1.9971258483021526e-05, "loss": 1.278, "step": 1624 }, { "epoch": 0.6049958697397354, "grad_norm": 0.17760074138641357, "learning_rate": 1.997116633329091e-05, "loss": 1.2783, "step": 1625 }, { "epoch": 0.6053681748903444, "grad_norm": 0.18431836366653442, "learning_rate": 1.997107403628679e-05, "loss": 1.28, "step": 1626 }, { "epoch": 0.6057404800409536, "grad_norm": 0.18512500822544098, "learning_rate": 1.9970981592010538e-05, "loss": 1.2817, "step": 1627 }, { "epoch": 0.6061127851915626, "grad_norm": 0.18769600987434387, "learning_rate": 1.9970889000463512e-05, "loss": 1.2779, "step": 1628 }, { "epoch": 0.6064850903421717, "grad_norm": 0.17421449720859528, "learning_rate": 1.997079626164708e-05, "loss": 1.2822, "step": 1629 }, { "epoch": 0.6068573954927807, "grad_norm": 0.1772501915693283, "learning_rate": 1.997070337556261e-05, "loss": 1.2774, "step": 1630 }, { "epoch": 0.6072297006433899, "grad_norm": 0.1793195903301239, "learning_rate": 1.9970610342211484e-05, "loss": 1.2789, "step": 1631 }, { "epoch": 0.6076020057939989, "grad_norm": 0.18559563159942627, "learning_rate": 1.9970517161595063e-05, "loss": 1.2971, "step": 1632 }, { "epoch": 0.607974310944608, "grad_norm": 0.18821711838245392, "learning_rate": 1.997042383371473e-05, "loss": 1.2901, "step": 1633 }, { "epoch": 0.608346616095217, "grad_norm": 0.18289709091186523, "learning_rate": 1.9970330358571862e-05, "loss": 1.2772, "step": 1634 }, { "epoch": 0.6087189212458262, "grad_norm": 0.18088442087173462, "learning_rate": 1.9970236736167846e-05, "loss": 1.2877, "step": 1635 }, { "epoch": 0.6090912263964352, "grad_norm": 0.1836581975221634, "learning_rate": 1.9970142966504053e-05, "loss": 1.2779, "step": 1636 }, { "epoch": 0.6094635315470442, "grad_norm": 0.18793676793575287, "learning_rate": 1.9970049049581878e-05, "loss": 1.2817, "step": 1637 }, { "epoch": 0.6098358366976533, "grad_norm": 0.18134282529354095, "learning_rate": 1.9969954985402702e-05, "loss": 1.2842, "step": 1638 }, { "epoch": 0.6102081418482623, "grad_norm": 0.1860257387161255, "learning_rate": 1.9969860773967916e-05, "loss": 1.292, "step": 1639 }, { "epoch": 0.6105804469988715, "grad_norm": 0.1779407411813736, "learning_rate": 1.9969766415278916e-05, "loss": 1.2869, "step": 1640 }, { "epoch": 0.6109527521494805, "grad_norm": 0.17907682061195374, "learning_rate": 1.9969671909337086e-05, "loss": 1.2777, "step": 1641 }, { "epoch": 0.6113250573000896, "grad_norm": 0.21232067048549652, "learning_rate": 1.996957725614383e-05, "loss": 1.2694, "step": 1642 }, { "epoch": 0.6116973624506986, "grad_norm": 0.1791219562292099, "learning_rate": 1.9969482455700544e-05, "loss": 1.2757, "step": 1643 }, { "epoch": 0.6120696676013078, "grad_norm": 0.17560942471027374, "learning_rate": 1.996938750800863e-05, "loss": 1.2838, "step": 1644 }, { "epoch": 0.6124419727519168, "grad_norm": 0.19401302933692932, "learning_rate": 1.9969292413069485e-05, "loss": 1.2896, "step": 1645 }, { "epoch": 0.6128142779025259, "grad_norm": 0.1834617555141449, "learning_rate": 1.9969197170884517e-05, "loss": 1.2824, "step": 1646 }, { "epoch": 0.6131865830531349, "grad_norm": 0.18403379619121552, "learning_rate": 1.9969101781455132e-05, "loss": 1.2783, "step": 1647 }, { "epoch": 0.613558888203744, "grad_norm": 0.1884704977273941, "learning_rate": 1.996900624478274e-05, "loss": 1.2783, "step": 1648 }, { "epoch": 0.6139311933543531, "grad_norm": 0.1792110949754715, "learning_rate": 1.9968910560868757e-05, "loss": 1.288, "step": 1649 }, { "epoch": 0.6143034985049621, "grad_norm": 0.1805010288953781, "learning_rate": 1.9968814729714584e-05, "loss": 1.2694, "step": 1650 }, { "epoch": 0.6146758036555712, "grad_norm": 0.17859011888504028, "learning_rate": 1.9968718751321643e-05, "loss": 1.2631, "step": 1651 }, { "epoch": 0.6150481088061802, "grad_norm": 0.1796571910381317, "learning_rate": 1.9968622625691353e-05, "loss": 1.2819, "step": 1652 }, { "epoch": 0.6154204139567894, "grad_norm": 0.1841122955083847, "learning_rate": 1.9968526352825135e-05, "loss": 1.27, "step": 1653 }, { "epoch": 0.6157927191073984, "grad_norm": 0.17526361346244812, "learning_rate": 1.9968429932724404e-05, "loss": 1.2927, "step": 1654 }, { "epoch": 0.6161650242580075, "grad_norm": 0.17394275963306427, "learning_rate": 1.996833336539059e-05, "loss": 1.2978, "step": 1655 }, { "epoch": 0.6165373294086165, "grad_norm": 0.17003491520881653, "learning_rate": 1.996823665082512e-05, "loss": 1.2652, "step": 1656 }, { "epoch": 0.6169096345592257, "grad_norm": 0.18178239464759827, "learning_rate": 1.9968139789029418e-05, "loss": 1.2768, "step": 1657 }, { "epoch": 0.6172819397098347, "grad_norm": 0.17376887798309326, "learning_rate": 1.9968042780004917e-05, "loss": 1.2898, "step": 1658 }, { "epoch": 0.6176542448604437, "grad_norm": 0.179343119263649, "learning_rate": 1.9967945623753052e-05, "loss": 1.2641, "step": 1659 }, { "epoch": 0.6180265500110528, "grad_norm": 0.1855798065662384, "learning_rate": 1.9967848320275253e-05, "loss": 1.2841, "step": 1660 }, { "epoch": 0.6183988551616619, "grad_norm": 0.17375251650810242, "learning_rate": 1.996775086957296e-05, "loss": 1.2759, "step": 1661 }, { "epoch": 0.618771160312271, "grad_norm": 0.1763077974319458, "learning_rate": 1.9967653271647613e-05, "loss": 1.274, "step": 1662 }, { "epoch": 0.61914346546288, "grad_norm": 0.17887422442436218, "learning_rate": 1.9967555526500652e-05, "loss": 1.2685, "step": 1663 }, { "epoch": 0.6195157706134891, "grad_norm": 0.17712260782718658, "learning_rate": 1.9967457634133524e-05, "loss": 1.2755, "step": 1664 }, { "epoch": 0.6198880757640981, "grad_norm": 0.1702347993850708, "learning_rate": 1.996735959454767e-05, "loss": 1.2645, "step": 1665 }, { "epoch": 0.6202603809147073, "grad_norm": 0.17374765872955322, "learning_rate": 1.996726140774454e-05, "loss": 1.2772, "step": 1666 }, { "epoch": 0.6206326860653163, "grad_norm": 0.17728720605373383, "learning_rate": 1.9967163073725585e-05, "loss": 1.2669, "step": 1667 }, { "epoch": 0.6210049912159253, "grad_norm": 0.18125899136066437, "learning_rate": 1.9967064592492258e-05, "loss": 1.2798, "step": 1668 }, { "epoch": 0.6213772963665344, "grad_norm": 0.17096486687660217, "learning_rate": 1.996696596404601e-05, "loss": 1.2666, "step": 1669 }, { "epoch": 0.6217496015171435, "grad_norm": 0.18018490076065063, "learning_rate": 1.9966867188388303e-05, "loss": 1.2884, "step": 1670 }, { "epoch": 0.6221219066677526, "grad_norm": 0.17836543917655945, "learning_rate": 1.9966768265520593e-05, "loss": 1.2874, "step": 1671 }, { "epoch": 0.6224942118183616, "grad_norm": 0.17728979885578156, "learning_rate": 1.996666919544434e-05, "loss": 1.2756, "step": 1672 }, { "epoch": 0.6228665169689707, "grad_norm": 0.18157729506492615, "learning_rate": 1.9966569978161008e-05, "loss": 1.2668, "step": 1673 }, { "epoch": 0.6232388221195797, "grad_norm": 0.1761295050382614, "learning_rate": 1.9966470613672064e-05, "loss": 1.2999, "step": 1674 }, { "epoch": 0.6236111272701889, "grad_norm": 0.18089498579502106, "learning_rate": 1.9966371101978975e-05, "loss": 1.2796, "step": 1675 }, { "epoch": 0.6239834324207979, "grad_norm": 0.17708507180213928, "learning_rate": 1.996627144308321e-05, "loss": 1.2575, "step": 1676 }, { "epoch": 0.6243557375714069, "grad_norm": 0.17729350924491882, "learning_rate": 1.9966171636986238e-05, "loss": 1.2783, "step": 1677 }, { "epoch": 0.624728042722016, "grad_norm": 0.18605685234069824, "learning_rate": 1.996607168368954e-05, "loss": 1.271, "step": 1678 }, { "epoch": 0.6251003478726251, "grad_norm": 0.1942148506641388, "learning_rate": 1.9965971583194587e-05, "loss": 1.2831, "step": 1679 }, { "epoch": 0.6254726530232342, "grad_norm": 0.18195581436157227, "learning_rate": 1.996587133550286e-05, "loss": 1.278, "step": 1680 }, { "epoch": 0.6258449581738432, "grad_norm": 0.17865653336048126, "learning_rate": 1.996577094061584e-05, "loss": 1.2773, "step": 1681 }, { "epoch": 0.6262172633244523, "grad_norm": 0.1941101998090744, "learning_rate": 1.9965670398535004e-05, "loss": 1.2721, "step": 1682 }, { "epoch": 0.6265895684750614, "grad_norm": 0.18968285620212555, "learning_rate": 1.9965569709261845e-05, "loss": 1.2871, "step": 1683 }, { "epoch": 0.6269618736256705, "grad_norm": 0.18020257353782654, "learning_rate": 1.996546887279785e-05, "loss": 1.2824, "step": 1684 }, { "epoch": 0.6273341787762795, "grad_norm": 0.1798500269651413, "learning_rate": 1.99653678891445e-05, "loss": 1.2753, "step": 1685 }, { "epoch": 0.6277064839268885, "grad_norm": 0.16887503862380981, "learning_rate": 1.9965266758303295e-05, "loss": 1.2799, "step": 1686 }, { "epoch": 0.6280787890774976, "grad_norm": 0.1764368861913681, "learning_rate": 1.9965165480275722e-05, "loss": 1.2649, "step": 1687 }, { "epoch": 0.6284510942281067, "grad_norm": 0.19635643064975739, "learning_rate": 1.9965064055063282e-05, "loss": 1.2691, "step": 1688 }, { "epoch": 0.6288233993787158, "grad_norm": 0.1750660389661789, "learning_rate": 1.996496248266747e-05, "loss": 1.2775, "step": 1689 }, { "epoch": 0.6291957045293248, "grad_norm": 0.17725105583667755, "learning_rate": 1.996486076308979e-05, "loss": 1.2829, "step": 1690 }, { "epoch": 0.6295680096799339, "grad_norm": 0.1825384497642517, "learning_rate": 1.9964758896331743e-05, "loss": 1.2577, "step": 1691 }, { "epoch": 0.629940314830543, "grad_norm": 0.18121573328971863, "learning_rate": 1.996465688239483e-05, "loss": 1.2805, "step": 1692 }, { "epoch": 0.6303126199811521, "grad_norm": 0.18279202282428741, "learning_rate": 1.996455472128056e-05, "loss": 1.2794, "step": 1693 }, { "epoch": 0.6306849251317611, "grad_norm": 0.17859618365764618, "learning_rate": 1.996445241299044e-05, "loss": 1.2718, "step": 1694 }, { "epoch": 0.6310572302823702, "grad_norm": 0.18363523483276367, "learning_rate": 1.9964349957525988e-05, "loss": 1.2731, "step": 1695 }, { "epoch": 0.6314295354329793, "grad_norm": 0.1790492683649063, "learning_rate": 1.9964247354888712e-05, "loss": 1.2734, "step": 1696 }, { "epoch": 0.6318018405835883, "grad_norm": 0.17761005461215973, "learning_rate": 1.9964144605080125e-05, "loss": 1.2536, "step": 1697 }, { "epoch": 0.6321741457341974, "grad_norm": 0.18184557557106018, "learning_rate": 1.9964041708101745e-05, "loss": 1.2721, "step": 1698 }, { "epoch": 0.6325464508848064, "grad_norm": 0.18488840758800507, "learning_rate": 1.99639386639551e-05, "loss": 1.2795, "step": 1699 }, { "epoch": 0.6329187560354155, "grad_norm": 0.1846456080675125, "learning_rate": 1.9963835472641704e-05, "loss": 1.2744, "step": 1700 }, { "epoch": 0.6332910611860246, "grad_norm": 0.1817891150712967, "learning_rate": 1.9963732134163084e-05, "loss": 1.2856, "step": 1701 }, { "epoch": 0.6336633663366337, "grad_norm": 0.18094323575496674, "learning_rate": 1.9963628648520767e-05, "loss": 1.28, "step": 1702 }, { "epoch": 0.6340356714872427, "grad_norm": 0.18917135894298553, "learning_rate": 1.9963525015716277e-05, "loss": 1.277, "step": 1703 }, { "epoch": 0.6344079766378518, "grad_norm": 0.19249355792999268, "learning_rate": 1.996342123575115e-05, "loss": 1.2728, "step": 1704 }, { "epoch": 0.6347802817884609, "grad_norm": 0.17361187934875488, "learning_rate": 1.9963317308626916e-05, "loss": 1.2699, "step": 1705 }, { "epoch": 0.63515258693907, "grad_norm": 0.18167155981063843, "learning_rate": 1.996321323434511e-05, "loss": 1.2732, "step": 1706 }, { "epoch": 0.635524892089679, "grad_norm": 0.18852338194847107, "learning_rate": 1.9963109012907268e-05, "loss": 1.2791, "step": 1707 }, { "epoch": 0.635897197240288, "grad_norm": 0.17643553018569946, "learning_rate": 1.9963004644314937e-05, "loss": 1.2737, "step": 1708 }, { "epoch": 0.6362695023908972, "grad_norm": 0.18324004113674164, "learning_rate": 1.9962900128569645e-05, "loss": 1.2779, "step": 1709 }, { "epoch": 0.6366418075415062, "grad_norm": 0.18091078102588654, "learning_rate": 1.9962795465672952e-05, "loss": 1.2825, "step": 1710 }, { "epoch": 0.6370141126921153, "grad_norm": 0.17854343354701996, "learning_rate": 1.996269065562639e-05, "loss": 1.2876, "step": 1711 }, { "epoch": 0.6373864178427243, "grad_norm": 0.16912639141082764, "learning_rate": 1.9962585698431513e-05, "loss": 1.2463, "step": 1712 }, { "epoch": 0.6377587229933334, "grad_norm": 0.18469521403312683, "learning_rate": 1.9962480594089867e-05, "loss": 1.2723, "step": 1713 }, { "epoch": 0.6381310281439425, "grad_norm": 0.1734856516122818, "learning_rate": 1.9962375342603013e-05, "loss": 1.2732, "step": 1714 }, { "epoch": 0.6385033332945516, "grad_norm": 0.17890629172325134, "learning_rate": 1.99622699439725e-05, "loss": 1.2711, "step": 1715 }, { "epoch": 0.6388756384451606, "grad_norm": 0.18328309059143066, "learning_rate": 1.996216439819988e-05, "loss": 1.2765, "step": 1716 }, { "epoch": 0.6392479435957696, "grad_norm": 0.18121285736560822, "learning_rate": 1.9962058705286722e-05, "loss": 1.2854, "step": 1717 }, { "epoch": 0.6396202487463788, "grad_norm": 0.18126457929611206, "learning_rate": 1.9961952865234582e-05, "loss": 1.2748, "step": 1718 }, { "epoch": 0.6399925538969878, "grad_norm": 0.17969095706939697, "learning_rate": 1.9961846878045024e-05, "loss": 1.2606, "step": 1719 }, { "epoch": 0.6403648590475969, "grad_norm": 0.18220089375972748, "learning_rate": 1.9961740743719612e-05, "loss": 1.2719, "step": 1720 }, { "epoch": 0.6407371641982059, "grad_norm": 0.1800280064344406, "learning_rate": 1.996163446225991e-05, "loss": 1.2857, "step": 1721 }, { "epoch": 0.6411094693488151, "grad_norm": 0.17898349463939667, "learning_rate": 1.9961528033667498e-05, "loss": 1.27, "step": 1722 }, { "epoch": 0.6414817744994241, "grad_norm": 0.18221056461334229, "learning_rate": 1.996142145794394e-05, "loss": 1.2665, "step": 1723 }, { "epoch": 0.6418540796500332, "grad_norm": 0.18029160797595978, "learning_rate": 1.996131473509081e-05, "loss": 1.2915, "step": 1724 }, { "epoch": 0.6422263848006422, "grad_norm": 0.17953678965568542, "learning_rate": 1.9961207865109688e-05, "loss": 1.2822, "step": 1725 }, { "epoch": 0.6425986899512512, "grad_norm": 0.20553520321846008, "learning_rate": 1.9961100848002154e-05, "loss": 1.2923, "step": 1726 }, { "epoch": 0.6429709951018604, "grad_norm": 0.1798718124628067, "learning_rate": 1.996099368376978e-05, "loss": 1.2898, "step": 1727 }, { "epoch": 0.6433433002524694, "grad_norm": 0.18341365456581116, "learning_rate": 1.996088637241416e-05, "loss": 1.2693, "step": 1728 }, { "epoch": 0.6437156054030785, "grad_norm": 0.18246109783649445, "learning_rate": 1.996077891393687e-05, "loss": 1.2783, "step": 1729 }, { "epoch": 0.6440879105536875, "grad_norm": 0.1854100227355957, "learning_rate": 1.9960671308339503e-05, "loss": 1.2791, "step": 1730 }, { "epoch": 0.6444602157042967, "grad_norm": 0.1951596736907959, "learning_rate": 1.9960563555623644e-05, "loss": 1.2851, "step": 1731 }, { "epoch": 0.6448325208549057, "grad_norm": 0.20141799747943878, "learning_rate": 1.9960455655790885e-05, "loss": 1.277, "step": 1732 }, { "epoch": 0.6452048260055148, "grad_norm": 0.1731090396642685, "learning_rate": 1.996034760884282e-05, "loss": 1.2755, "step": 1733 }, { "epoch": 0.6455771311561238, "grad_norm": 0.2005818784236908, "learning_rate": 1.9960239414781052e-05, "loss": 1.2724, "step": 1734 }, { "epoch": 0.645949436306733, "grad_norm": 0.1791912168264389, "learning_rate": 1.996013107360717e-05, "loss": 1.2689, "step": 1735 }, { "epoch": 0.646321741457342, "grad_norm": 0.1755741536617279, "learning_rate": 1.9960022585322774e-05, "loss": 1.2772, "step": 1736 }, { "epoch": 0.646694046607951, "grad_norm": 0.1764545440673828, "learning_rate": 1.9959913949929474e-05, "loss": 1.2745, "step": 1737 }, { "epoch": 0.6470663517585601, "grad_norm": 0.1788182109594345, "learning_rate": 1.9959805167428868e-05, "loss": 1.2635, "step": 1738 }, { "epoch": 0.6474386569091691, "grad_norm": 0.1874772310256958, "learning_rate": 1.9959696237822566e-05, "loss": 1.2883, "step": 1739 }, { "epoch": 0.6478109620597783, "grad_norm": 0.18383602797985077, "learning_rate": 1.9959587161112174e-05, "loss": 1.2667, "step": 1740 }, { "epoch": 0.6481832672103873, "grad_norm": 0.1755083203315735, "learning_rate": 1.9959477937299305e-05, "loss": 1.2857, "step": 1741 }, { "epoch": 0.6485555723609964, "grad_norm": 0.1754198670387268, "learning_rate": 1.995936856638557e-05, "loss": 1.2582, "step": 1742 }, { "epoch": 0.6489278775116054, "grad_norm": 0.18323983252048492, "learning_rate": 1.9959259048372593e-05, "loss": 1.2825, "step": 1743 }, { "epoch": 0.6493001826622146, "grad_norm": 0.17415131628513336, "learning_rate": 1.995914938326198e-05, "loss": 1.2624, "step": 1744 }, { "epoch": 0.6496724878128236, "grad_norm": 0.18525010347366333, "learning_rate": 1.9959039571055356e-05, "loss": 1.2923, "step": 1745 }, { "epoch": 0.6500447929634326, "grad_norm": 0.1781807243824005, "learning_rate": 1.995892961175434e-05, "loss": 1.2821, "step": 1746 }, { "epoch": 0.6504170981140417, "grad_norm": 0.1848139613866806, "learning_rate": 1.995881950536056e-05, "loss": 1.2953, "step": 1747 }, { "epoch": 0.6507894032646507, "grad_norm": 0.17933423817157745, "learning_rate": 1.9958709251875642e-05, "loss": 1.2666, "step": 1748 }, { "epoch": 0.6511617084152599, "grad_norm": 0.17203424870967865, "learning_rate": 1.9958598851301218e-05, "loss": 1.2691, "step": 1749 }, { "epoch": 0.6515340135658689, "grad_norm": 0.1798200160264969, "learning_rate": 1.995848830363891e-05, "loss": 1.2672, "step": 1750 }, { "epoch": 0.651906318716478, "grad_norm": 0.19163981080055237, "learning_rate": 1.9958377608890348e-05, "loss": 1.2615, "step": 1751 }, { "epoch": 0.652278623867087, "grad_norm": 0.177472323179245, "learning_rate": 1.9958266767057183e-05, "loss": 1.2775, "step": 1752 }, { "epoch": 0.6526509290176962, "grad_norm": 0.17973048985004425, "learning_rate": 1.995815577814104e-05, "loss": 1.2709, "step": 1753 }, { "epoch": 0.6530232341683052, "grad_norm": 0.20184633135795593, "learning_rate": 1.995804464214356e-05, "loss": 1.2796, "step": 1754 }, { "epoch": 0.6533955393189143, "grad_norm": 0.1837623566389084, "learning_rate": 1.9957933359066385e-05, "loss": 1.2679, "step": 1755 }, { "epoch": 0.6537678444695233, "grad_norm": 0.17423506081104279, "learning_rate": 1.995782192891116e-05, "loss": 1.2671, "step": 1756 }, { "epoch": 0.6541401496201324, "grad_norm": 0.1909656673669815, "learning_rate": 1.9957710351679533e-05, "loss": 1.2775, "step": 1757 }, { "epoch": 0.6545124547707415, "grad_norm": 0.19637969136238098, "learning_rate": 1.9957598627373145e-05, "loss": 1.2737, "step": 1758 }, { "epoch": 0.6548847599213505, "grad_norm": 0.18318361043930054, "learning_rate": 1.995748675599365e-05, "loss": 1.2772, "step": 1759 }, { "epoch": 0.6552570650719596, "grad_norm": 0.1808813214302063, "learning_rate": 1.9957374737542702e-05, "loss": 1.2918, "step": 1760 }, { "epoch": 0.6556293702225686, "grad_norm": 0.1866048276424408, "learning_rate": 1.9957262572021955e-05, "loss": 1.2677, "step": 1761 }, { "epoch": 0.6560016753731778, "grad_norm": 0.18360257148742676, "learning_rate": 1.9957150259433065e-05, "loss": 1.2832, "step": 1762 }, { "epoch": 0.6563739805237868, "grad_norm": 0.17542779445648193, "learning_rate": 1.995703779977769e-05, "loss": 1.2695, "step": 1763 }, { "epoch": 0.6567462856743959, "grad_norm": 0.1828540414571762, "learning_rate": 1.995692519305749e-05, "loss": 1.2893, "step": 1764 }, { "epoch": 0.6571185908250049, "grad_norm": 0.18941769003868103, "learning_rate": 1.995681243927413e-05, "loss": 1.2742, "step": 1765 }, { "epoch": 0.657490895975614, "grad_norm": 0.1731724739074707, "learning_rate": 1.9956699538429275e-05, "loss": 1.2966, "step": 1766 }, { "epoch": 0.6578632011262231, "grad_norm": 0.18366792798042297, "learning_rate": 1.9956586490524596e-05, "loss": 1.2779, "step": 1767 }, { "epoch": 0.6582355062768321, "grad_norm": 0.18383246660232544, "learning_rate": 1.9956473295561756e-05, "loss": 1.267, "step": 1768 }, { "epoch": 0.6586078114274412, "grad_norm": 0.19387759268283844, "learning_rate": 1.9956359953542433e-05, "loss": 1.2768, "step": 1769 }, { "epoch": 0.6589801165780503, "grad_norm": 0.18272364139556885, "learning_rate": 1.9956246464468294e-05, "loss": 1.2654, "step": 1770 }, { "epoch": 0.6593524217286594, "grad_norm": 0.17363867163658142, "learning_rate": 1.9956132828341022e-05, "loss": 1.2724, "step": 1771 }, { "epoch": 0.6597247268792684, "grad_norm": 0.18008361756801605, "learning_rate": 1.9956019045162294e-05, "loss": 1.2799, "step": 1772 }, { "epoch": 0.6600970320298775, "grad_norm": 0.18820790946483612, "learning_rate": 1.995590511493379e-05, "loss": 1.2741, "step": 1773 }, { "epoch": 0.6604693371804865, "grad_norm": 0.18072496354579926, "learning_rate": 1.995579103765719e-05, "loss": 1.2887, "step": 1774 }, { "epoch": 0.6608416423310957, "grad_norm": 0.18533405661582947, "learning_rate": 1.9955676813334182e-05, "loss": 1.2798, "step": 1775 }, { "epoch": 0.6612139474817047, "grad_norm": 0.19441775977611542, "learning_rate": 1.9955562441966452e-05, "loss": 1.2523, "step": 1776 }, { "epoch": 0.6615862526323137, "grad_norm": 0.19145584106445312, "learning_rate": 1.995544792355569e-05, "loss": 1.2635, "step": 1777 }, { "epoch": 0.6619585577829228, "grad_norm": 0.2026701271533966, "learning_rate": 1.9955333258103586e-05, "loss": 1.2747, "step": 1778 }, { "epoch": 0.6623308629335319, "grad_norm": 0.18436765670776367, "learning_rate": 1.9955218445611834e-05, "loss": 1.2653, "step": 1779 }, { "epoch": 0.662703168084141, "grad_norm": 0.18256279826164246, "learning_rate": 1.9955103486082135e-05, "loss": 1.2807, "step": 1780 }, { "epoch": 0.66307547323475, "grad_norm": 0.1890406608581543, "learning_rate": 1.9954988379516177e-05, "loss": 1.2832, "step": 1781 }, { "epoch": 0.6634477783853591, "grad_norm": 0.1824447363615036, "learning_rate": 1.995487312591567e-05, "loss": 1.2533, "step": 1782 }, { "epoch": 0.6638200835359682, "grad_norm": 0.1869254857301712, "learning_rate": 1.9954757725282308e-05, "loss": 1.2811, "step": 1783 }, { "epoch": 0.6641923886865773, "grad_norm": 0.18558120727539062, "learning_rate": 1.99546421776178e-05, "loss": 1.2687, "step": 1784 }, { "epoch": 0.6645646938371863, "grad_norm": 0.18721050024032593, "learning_rate": 1.995452648292385e-05, "loss": 1.2834, "step": 1785 }, { "epoch": 0.6649369989877953, "grad_norm": 0.1912430077791214, "learning_rate": 1.9954410641202173e-05, "loss": 1.2801, "step": 1786 }, { "epoch": 0.6653093041384044, "grad_norm": 0.1814345419406891, "learning_rate": 1.995429465245447e-05, "loss": 1.2739, "step": 1787 }, { "epoch": 0.6656816092890135, "grad_norm": 0.18435440957546234, "learning_rate": 1.9954178516682464e-05, "loss": 1.2652, "step": 1788 }, { "epoch": 0.6660539144396226, "grad_norm": 0.18596956133842468, "learning_rate": 1.9954062233887866e-05, "loss": 1.2661, "step": 1789 }, { "epoch": 0.6664262195902316, "grad_norm": 0.1784246861934662, "learning_rate": 1.995394580407239e-05, "loss": 1.2688, "step": 1790 }, { "epoch": 0.6667985247408407, "grad_norm": 0.1948971003293991, "learning_rate": 1.9953829227237762e-05, "loss": 1.2825, "step": 1791 }, { "epoch": 0.6671708298914498, "grad_norm": 0.17607876658439636, "learning_rate": 1.9953712503385702e-05, "loss": 1.2722, "step": 1792 }, { "epoch": 0.6675431350420589, "grad_norm": 0.19248910248279572, "learning_rate": 1.995359563251793e-05, "loss": 1.2908, "step": 1793 }, { "epoch": 0.6679154401926679, "grad_norm": 0.1824008673429489, "learning_rate": 1.9953478614636178e-05, "loss": 1.2737, "step": 1794 }, { "epoch": 0.668287745343277, "grad_norm": 0.17767947912216187, "learning_rate": 1.9953361449742167e-05, "loss": 1.2698, "step": 1795 }, { "epoch": 0.6686600504938861, "grad_norm": 0.20070941746234894, "learning_rate": 1.995324413783764e-05, "loss": 1.2716, "step": 1796 }, { "epoch": 0.6690323556444951, "grad_norm": 0.1810212880373001, "learning_rate": 1.9953126678924315e-05, "loss": 1.2641, "step": 1797 }, { "epoch": 0.6694046607951042, "grad_norm": 0.18314088881015778, "learning_rate": 1.9953009073003935e-05, "loss": 1.2713, "step": 1798 }, { "epoch": 0.6697769659457132, "grad_norm": 0.1876356452703476, "learning_rate": 1.9952891320078235e-05, "loss": 1.279, "step": 1799 }, { "epoch": 0.6701492710963223, "grad_norm": 0.18788988888263702, "learning_rate": 1.9952773420148958e-05, "loss": 1.2786, "step": 1800 }, { "epoch": 0.6705215762469314, "grad_norm": 0.17897135019302368, "learning_rate": 1.995265537321784e-05, "loss": 1.2746, "step": 1801 }, { "epoch": 0.6708938813975405, "grad_norm": 0.1953326016664505, "learning_rate": 1.9952537179286623e-05, "loss": 1.2815, "step": 1802 }, { "epoch": 0.6712661865481495, "grad_norm": 0.19297561049461365, "learning_rate": 1.995241883835706e-05, "loss": 1.2698, "step": 1803 }, { "epoch": 0.6716384916987586, "grad_norm": 0.18052466213703156, "learning_rate": 1.99523003504309e-05, "loss": 1.2744, "step": 1804 }, { "epoch": 0.6720107968493677, "grad_norm": 0.1788993775844574, "learning_rate": 1.9952181715509883e-05, "loss": 1.2621, "step": 1805 }, { "epoch": 0.6723831019999768, "grad_norm": 0.18485794961452484, "learning_rate": 1.9952062933595765e-05, "loss": 1.2767, "step": 1806 }, { "epoch": 0.6727554071505858, "grad_norm": 0.18124793469905853, "learning_rate": 1.9951944004690308e-05, "loss": 1.2811, "step": 1807 }, { "epoch": 0.6731277123011948, "grad_norm": 0.1836545318365097, "learning_rate": 1.9951824928795255e-05, "loss": 1.2692, "step": 1808 }, { "epoch": 0.673500017451804, "grad_norm": 0.1730068475008011, "learning_rate": 1.9951705705912377e-05, "loss": 1.2753, "step": 1809 }, { "epoch": 0.673872322602413, "grad_norm": 0.17720746994018555, "learning_rate": 1.995158633604343e-05, "loss": 1.2653, "step": 1810 }, { "epoch": 0.6742446277530221, "grad_norm": 0.1764315515756607, "learning_rate": 1.995146681919018e-05, "loss": 1.2608, "step": 1811 }, { "epoch": 0.6746169329036311, "grad_norm": 0.19442118704319, "learning_rate": 1.9951347155354386e-05, "loss": 1.2774, "step": 1812 }, { "epoch": 0.6749892380542402, "grad_norm": 0.18689338862895966, "learning_rate": 1.995122734453782e-05, "loss": 1.2659, "step": 1813 }, { "epoch": 0.6753615432048493, "grad_norm": 0.17291222512722015, "learning_rate": 1.995110738674225e-05, "loss": 1.2637, "step": 1814 }, { "epoch": 0.6757338483554584, "grad_norm": 0.2008456140756607, "learning_rate": 1.995098728196945e-05, "loss": 1.2837, "step": 1815 }, { "epoch": 0.6761061535060674, "grad_norm": 0.1726778745651245, "learning_rate": 1.995086703022119e-05, "loss": 1.2876, "step": 1816 }, { "epoch": 0.6764784586566764, "grad_norm": 0.18805347383022308, "learning_rate": 1.9950746631499252e-05, "loss": 1.2486, "step": 1817 }, { "epoch": 0.6768507638072856, "grad_norm": 0.1827612966299057, "learning_rate": 1.9950626085805406e-05, "loss": 1.2832, "step": 1818 }, { "epoch": 0.6772230689578946, "grad_norm": 0.17770785093307495, "learning_rate": 1.995050539314144e-05, "loss": 1.2713, "step": 1819 }, { "epoch": 0.6775953741085037, "grad_norm": 0.18160304427146912, "learning_rate": 1.9950384553509134e-05, "loss": 1.2836, "step": 1820 }, { "epoch": 0.6779676792591127, "grad_norm": 0.1883334517478943, "learning_rate": 1.9950263566910275e-05, "loss": 1.2813, "step": 1821 }, { "epoch": 0.6783399844097218, "grad_norm": 0.18117238581180573, "learning_rate": 1.9950142433346642e-05, "loss": 1.2715, "step": 1822 }, { "epoch": 0.6787122895603309, "grad_norm": 0.1771121472120285, "learning_rate": 1.9950021152820032e-05, "loss": 1.266, "step": 1823 }, { "epoch": 0.67908459471094, "grad_norm": 0.18538367748260498, "learning_rate": 1.9949899725332233e-05, "loss": 1.2638, "step": 1824 }, { "epoch": 0.679456899861549, "grad_norm": 0.17744530737400055, "learning_rate": 1.994977815088504e-05, "loss": 1.2669, "step": 1825 }, { "epoch": 0.679829205012158, "grad_norm": 0.17428846657276154, "learning_rate": 1.9949656429480252e-05, "loss": 1.2753, "step": 1826 }, { "epoch": 0.6802015101627672, "grad_norm": 0.2017204761505127, "learning_rate": 1.9949534561119658e-05, "loss": 1.2723, "step": 1827 }, { "epoch": 0.6805738153133762, "grad_norm": 0.18561388552188873, "learning_rate": 1.9949412545805065e-05, "loss": 1.2748, "step": 1828 }, { "epoch": 0.6809461204639853, "grad_norm": 0.18007059395313263, "learning_rate": 1.9949290383538272e-05, "loss": 1.2579, "step": 1829 }, { "epoch": 0.6813184256145943, "grad_norm": 0.17988115549087524, "learning_rate": 1.9949168074321088e-05, "loss": 1.2769, "step": 1830 }, { "epoch": 0.6816907307652035, "grad_norm": 0.19387783110141754, "learning_rate": 1.9949045618155312e-05, "loss": 1.2746, "step": 1831 }, { "epoch": 0.6820630359158125, "grad_norm": 0.1811383217573166, "learning_rate": 1.994892301504276e-05, "loss": 1.2784, "step": 1832 }, { "epoch": 0.6824353410664216, "grad_norm": 0.17574800550937653, "learning_rate": 1.9948800264985236e-05, "loss": 1.2688, "step": 1833 }, { "epoch": 0.6828076462170306, "grad_norm": 0.17047591507434845, "learning_rate": 1.9948677367984558e-05, "loss": 1.2783, "step": 1834 }, { "epoch": 0.6831799513676396, "grad_norm": 0.18834684789180756, "learning_rate": 1.994855432404254e-05, "loss": 1.2685, "step": 1835 }, { "epoch": 0.6835522565182488, "grad_norm": 0.17548666894435883, "learning_rate": 1.9948431133160998e-05, "loss": 1.269, "step": 1836 }, { "epoch": 0.6839245616688578, "grad_norm": 0.17728163301944733, "learning_rate": 1.9948307795341755e-05, "loss": 1.2748, "step": 1837 }, { "epoch": 0.6842968668194669, "grad_norm": 0.18654660880565643, "learning_rate": 1.9948184310586625e-05, "loss": 1.2839, "step": 1838 }, { "epoch": 0.6846691719700759, "grad_norm": 0.18166908621788025, "learning_rate": 1.9948060678897443e-05, "loss": 1.2677, "step": 1839 }, { "epoch": 0.6850414771206851, "grad_norm": 0.17448103427886963, "learning_rate": 1.9947936900276023e-05, "loss": 1.2738, "step": 1840 }, { "epoch": 0.6854137822712941, "grad_norm": 0.17304863035678864, "learning_rate": 1.9947812974724203e-05, "loss": 1.2705, "step": 1841 }, { "epoch": 0.6857860874219032, "grad_norm": 0.17068156599998474, "learning_rate": 1.994768890224381e-05, "loss": 1.2782, "step": 1842 }, { "epoch": 0.6861583925725122, "grad_norm": 0.17635151743888855, "learning_rate": 1.9947564682836678e-05, "loss": 1.2707, "step": 1843 }, { "epoch": 0.6865306977231214, "grad_norm": 0.17905697226524353, "learning_rate": 1.9947440316504636e-05, "loss": 1.2628, "step": 1844 }, { "epoch": 0.6869030028737304, "grad_norm": 0.17490267753601074, "learning_rate": 1.9947315803249525e-05, "loss": 1.257, "step": 1845 }, { "epoch": 0.6872753080243394, "grad_norm": 0.18009649217128754, "learning_rate": 1.9947191143073185e-05, "loss": 1.285, "step": 1846 }, { "epoch": 0.6876476131749485, "grad_norm": 0.19091250002384186, "learning_rate": 1.994706633597746e-05, "loss": 1.2647, "step": 1847 }, { "epoch": 0.6880199183255575, "grad_norm": 0.183569997549057, "learning_rate": 1.994694138196418e-05, "loss": 1.2662, "step": 1848 }, { "epoch": 0.6883922234761667, "grad_norm": 0.18256776034832, "learning_rate": 1.994681628103521e-05, "loss": 1.2643, "step": 1849 }, { "epoch": 0.6887645286267757, "grad_norm": 0.17568175494670868, "learning_rate": 1.9946691033192384e-05, "loss": 1.2724, "step": 1850 }, { "epoch": 0.6891368337773848, "grad_norm": 0.18592911958694458, "learning_rate": 1.9946565638437552e-05, "loss": 1.2602, "step": 1851 }, { "epoch": 0.6895091389279938, "grad_norm": 0.18410173058509827, "learning_rate": 1.9946440096772574e-05, "loss": 1.2539, "step": 1852 }, { "epoch": 0.689881444078603, "grad_norm": 0.18612074851989746, "learning_rate": 1.99463144081993e-05, "loss": 1.265, "step": 1853 }, { "epoch": 0.690253749229212, "grad_norm": 0.19011899828910828, "learning_rate": 1.9946188572719585e-05, "loss": 1.2938, "step": 1854 }, { "epoch": 0.690626054379821, "grad_norm": 0.18528851866722107, "learning_rate": 1.9946062590335287e-05, "loss": 1.2757, "step": 1855 }, { "epoch": 0.6909983595304301, "grad_norm": 0.1764160841703415, "learning_rate": 1.9945936461048273e-05, "loss": 1.2859, "step": 1856 }, { "epoch": 0.6913706646810392, "grad_norm": 0.17735952138900757, "learning_rate": 1.9945810184860396e-05, "loss": 1.2706, "step": 1857 }, { "epoch": 0.6917429698316483, "grad_norm": 0.18873561918735504, "learning_rate": 1.9945683761773533e-05, "loss": 1.2833, "step": 1858 }, { "epoch": 0.6921152749822573, "grad_norm": 0.17699329555034637, "learning_rate": 1.9945557191789543e-05, "loss": 1.2709, "step": 1859 }, { "epoch": 0.6924875801328664, "grad_norm": 0.18327003717422485, "learning_rate": 1.9945430474910295e-05, "loss": 1.2801, "step": 1860 }, { "epoch": 0.6928598852834754, "grad_norm": 0.18743474781513214, "learning_rate": 1.9945303611137665e-05, "loss": 1.2724, "step": 1861 }, { "epoch": 0.6932321904340846, "grad_norm": 0.17246921360492706, "learning_rate": 1.9945176600473526e-05, "loss": 1.252, "step": 1862 }, { "epoch": 0.6936044955846936, "grad_norm": 0.17474091053009033, "learning_rate": 1.994504944291975e-05, "loss": 1.2746, "step": 1863 }, { "epoch": 0.6939768007353027, "grad_norm": 0.19760233163833618, "learning_rate": 1.994492213847822e-05, "loss": 1.2925, "step": 1864 }, { "epoch": 0.6943491058859117, "grad_norm": 0.32504546642303467, "learning_rate": 1.9944794687150812e-05, "loss": 1.2715, "step": 1865 }, { "epoch": 0.6947214110365209, "grad_norm": 0.16915461421012878, "learning_rate": 1.9944667088939414e-05, "loss": 1.2777, "step": 1866 }, { "epoch": 0.6950937161871299, "grad_norm": 0.18415702879428864, "learning_rate": 1.9944539343845905e-05, "loss": 1.2666, "step": 1867 }, { "epoch": 0.6954660213377389, "grad_norm": 0.19284015893936157, "learning_rate": 1.994441145187217e-05, "loss": 1.2955, "step": 1868 }, { "epoch": 0.695838326488348, "grad_norm": 0.1781989485025406, "learning_rate": 1.994428341302011e-05, "loss": 1.2754, "step": 1869 }, { "epoch": 0.6962106316389571, "grad_norm": 0.18543171882629395, "learning_rate": 1.9944155227291603e-05, "loss": 1.2615, "step": 1870 }, { "epoch": 0.6965829367895662, "grad_norm": 0.1806837022304535, "learning_rate": 1.9944026894688547e-05, "loss": 1.2585, "step": 1871 }, { "epoch": 0.6969552419401752, "grad_norm": 0.18426816165447235, "learning_rate": 1.9943898415212842e-05, "loss": 1.2783, "step": 1872 }, { "epoch": 0.6973275470907843, "grad_norm": 0.17620044946670532, "learning_rate": 1.9943769788866377e-05, "loss": 1.2815, "step": 1873 }, { "epoch": 0.6976998522413933, "grad_norm": 0.1815589815378189, "learning_rate": 1.9943641015651057e-05, "loss": 1.2715, "step": 1874 }, { "epoch": 0.6980721573920025, "grad_norm": 0.1777685433626175, "learning_rate": 1.9943512095568785e-05, "loss": 1.2717, "step": 1875 }, { "epoch": 0.6984444625426115, "grad_norm": 0.19864802062511444, "learning_rate": 1.9943383028621463e-05, "loss": 1.2827, "step": 1876 }, { "epoch": 0.6988167676932205, "grad_norm": 0.1869768351316452, "learning_rate": 1.9943253814810998e-05, "loss": 1.276, "step": 1877 }, { "epoch": 0.6991890728438296, "grad_norm": 0.18127432465553284, "learning_rate": 1.9943124454139298e-05, "loss": 1.258, "step": 1878 }, { "epoch": 0.6995613779944387, "grad_norm": 0.18956811726093292, "learning_rate": 1.9942994946608273e-05, "loss": 1.2536, "step": 1879 }, { "epoch": 0.6999336831450478, "grad_norm": 0.19713161885738373, "learning_rate": 1.9942865292219837e-05, "loss": 1.272, "step": 1880 }, { "epoch": 0.7003059882956568, "grad_norm": 0.1791374236345291, "learning_rate": 1.9942735490975903e-05, "loss": 1.2739, "step": 1881 }, { "epoch": 0.7006782934462659, "grad_norm": 0.17911037802696228, "learning_rate": 1.9942605542878393e-05, "loss": 1.2725, "step": 1882 }, { "epoch": 0.701050598596875, "grad_norm": 0.1858641803264618, "learning_rate": 1.9942475447929223e-05, "loss": 1.2588, "step": 1883 }, { "epoch": 0.7014229037474841, "grad_norm": 0.1832643300294876, "learning_rate": 1.9942345206130313e-05, "loss": 1.2863, "step": 1884 }, { "epoch": 0.7017952088980931, "grad_norm": 0.18985310196876526, "learning_rate": 1.9942214817483588e-05, "loss": 1.2711, "step": 1885 }, { "epoch": 0.7021675140487021, "grad_norm": 0.8048028945922852, "learning_rate": 1.9942084281990973e-05, "loss": 1.2672, "step": 1886 }, { "epoch": 0.7025398191993112, "grad_norm": 0.19335870444774628, "learning_rate": 1.9941953599654398e-05, "loss": 1.2665, "step": 1887 }, { "epoch": 0.7029121243499203, "grad_norm": 0.17363718152046204, "learning_rate": 1.9941822770475795e-05, "loss": 1.2456, "step": 1888 }, { "epoch": 0.7032844295005294, "grad_norm": 0.18014384806156158, "learning_rate": 1.9941691794457088e-05, "loss": 1.2661, "step": 1889 }, { "epoch": 0.7036567346511384, "grad_norm": 0.17859698832035065, "learning_rate": 1.9941560671600223e-05, "loss": 1.2699, "step": 1890 }, { "epoch": 0.7040290398017475, "grad_norm": 0.17710040509700775, "learning_rate": 1.9941429401907126e-05, "loss": 1.2629, "step": 1891 }, { "epoch": 0.7044013449523566, "grad_norm": 0.186106339097023, "learning_rate": 1.9941297985379747e-05, "loss": 1.2573, "step": 1892 }, { "epoch": 0.7047736501029657, "grad_norm": 0.1873396337032318, "learning_rate": 1.9941166422020016e-05, "loss": 1.2468, "step": 1893 }, { "epoch": 0.7051459552535747, "grad_norm": 0.1858346313238144, "learning_rate": 1.9941034711829878e-05, "loss": 1.2776, "step": 1894 }, { "epoch": 0.7055182604041838, "grad_norm": 0.18264545500278473, "learning_rate": 1.9940902854811284e-05, "loss": 1.278, "step": 1895 }, { "epoch": 0.7058905655547928, "grad_norm": 0.18094655871391296, "learning_rate": 1.9940770850966184e-05, "loss": 1.2737, "step": 1896 }, { "epoch": 0.706262870705402, "grad_norm": 0.18092148005962372, "learning_rate": 1.9940638700296514e-05, "loss": 1.2721, "step": 1897 }, { "epoch": 0.706635175856011, "grad_norm": 0.18232271075248718, "learning_rate": 1.994050640280424e-05, "loss": 1.2564, "step": 1898 }, { "epoch": 0.70700748100662, "grad_norm": 0.17807061970233917, "learning_rate": 1.9940373958491308e-05, "loss": 1.2714, "step": 1899 }, { "epoch": 0.7073797861572291, "grad_norm": 0.17300982773303986, "learning_rate": 1.9940241367359675e-05, "loss": 1.255, "step": 1900 }, { "epoch": 0.7077520913078382, "grad_norm": 0.18998445570468903, "learning_rate": 1.9940108629411305e-05, "loss": 1.2794, "step": 1901 }, { "epoch": 0.7081243964584473, "grad_norm": 0.1696198284626007, "learning_rate": 1.9939975744648152e-05, "loss": 1.2784, "step": 1902 }, { "epoch": 0.7084967016090563, "grad_norm": 0.1855718344449997, "learning_rate": 1.993984271307218e-05, "loss": 1.2728, "step": 1903 }, { "epoch": 0.7088690067596654, "grad_norm": 0.18481917679309845, "learning_rate": 1.9939709534685353e-05, "loss": 1.2794, "step": 1904 }, { "epoch": 0.7092413119102745, "grad_norm": 0.18903714418411255, "learning_rate": 1.9939576209489648e-05, "loss": 1.2656, "step": 1905 }, { "epoch": 0.7096136170608836, "grad_norm": 0.17437592148780823, "learning_rate": 1.993944273748702e-05, "loss": 1.2737, "step": 1906 }, { "epoch": 0.7099859222114926, "grad_norm": 0.17127926647663116, "learning_rate": 1.9939309118679445e-05, "loss": 1.2662, "step": 1907 }, { "epoch": 0.7103582273621016, "grad_norm": 0.1836404949426651, "learning_rate": 1.99391753530689e-05, "loss": 1.2774, "step": 1908 }, { "epoch": 0.7107305325127107, "grad_norm": 0.18096838891506195, "learning_rate": 1.993904144065736e-05, "loss": 1.2775, "step": 1909 }, { "epoch": 0.7111028376633198, "grad_norm": 0.17942848801612854, "learning_rate": 1.9938907381446802e-05, "loss": 1.2671, "step": 1910 }, { "epoch": 0.7114751428139289, "grad_norm": 0.1706017553806305, "learning_rate": 1.9938773175439205e-05, "loss": 1.2683, "step": 1911 }, { "epoch": 0.7118474479645379, "grad_norm": 0.1811804175376892, "learning_rate": 1.9938638822636555e-05, "loss": 1.2771, "step": 1912 }, { "epoch": 0.712219753115147, "grad_norm": 0.1840972751379013, "learning_rate": 1.9938504323040826e-05, "loss": 1.268, "step": 1913 }, { "epoch": 0.7125920582657561, "grad_norm": 0.17353455722332, "learning_rate": 1.9938369676654015e-05, "loss": 1.2617, "step": 1914 }, { "epoch": 0.7129643634163652, "grad_norm": 0.16893000900745392, "learning_rate": 1.993823488347811e-05, "loss": 1.2651, "step": 1915 }, { "epoch": 0.7133366685669742, "grad_norm": 0.17910560965538025, "learning_rate": 1.9938099943515098e-05, "loss": 1.2633, "step": 1916 }, { "epoch": 0.7137089737175832, "grad_norm": 0.18880413472652435, "learning_rate": 1.9937964856766975e-05, "loss": 1.2445, "step": 1917 }, { "epoch": 0.7140812788681924, "grad_norm": 0.17777028679847717, "learning_rate": 1.9937829623235733e-05, "loss": 1.2701, "step": 1918 }, { "epoch": 0.7144535840188014, "grad_norm": 0.17710170149803162, "learning_rate": 1.993769424292337e-05, "loss": 1.2697, "step": 1919 }, { "epoch": 0.7148258891694105, "grad_norm": 0.17528566718101501, "learning_rate": 1.993755871583189e-05, "loss": 1.261, "step": 1920 }, { "epoch": 0.7151981943200195, "grad_norm": 0.17900541424751282, "learning_rate": 1.993742304196329e-05, "loss": 1.2624, "step": 1921 }, { "epoch": 0.7155704994706286, "grad_norm": 0.18231788277626038, "learning_rate": 1.9937287221319576e-05, "loss": 1.2659, "step": 1922 }, { "epoch": 0.7159428046212377, "grad_norm": 0.1796240657567978, "learning_rate": 1.993715125390275e-05, "loss": 1.2658, "step": 1923 }, { "epoch": 0.7163151097718468, "grad_norm": 0.16966049373149872, "learning_rate": 1.9937015139714825e-05, "loss": 1.2611, "step": 1924 }, { "epoch": 0.7166874149224558, "grad_norm": 0.18244795501232147, "learning_rate": 1.993687887875781e-05, "loss": 1.2567, "step": 1925 }, { "epoch": 0.7170597200730648, "grad_norm": 0.17813333868980408, "learning_rate": 1.993674247103372e-05, "loss": 1.2639, "step": 1926 }, { "epoch": 0.717432025223674, "grad_norm": 0.17147675156593323, "learning_rate": 1.9936605916544566e-05, "loss": 1.2508, "step": 1927 }, { "epoch": 0.717804330374283, "grad_norm": 0.1758221685886383, "learning_rate": 1.9936469215292366e-05, "loss": 1.2537, "step": 1928 }, { "epoch": 0.7181766355248921, "grad_norm": 0.1679316908121109, "learning_rate": 1.993633236727914e-05, "loss": 1.2708, "step": 1929 }, { "epoch": 0.7185489406755011, "grad_norm": 0.18029126524925232, "learning_rate": 1.9936195372506906e-05, "loss": 1.2628, "step": 1930 }, { "epoch": 0.7189212458261103, "grad_norm": 0.1729169636964798, "learning_rate": 1.9936058230977694e-05, "loss": 1.2676, "step": 1931 }, { "epoch": 0.7192935509767193, "grad_norm": 0.1810874193906784, "learning_rate": 1.993592094269352e-05, "loss": 1.2575, "step": 1932 }, { "epoch": 0.7196658561273284, "grad_norm": 0.17737914621829987, "learning_rate": 1.993578350765642e-05, "loss": 1.2801, "step": 1933 }, { "epoch": 0.7200381612779374, "grad_norm": 0.18004107475280762, "learning_rate": 1.9935645925868424e-05, "loss": 1.2596, "step": 1934 }, { "epoch": 0.7204104664285464, "grad_norm": 0.17037642002105713, "learning_rate": 1.9935508197331556e-05, "loss": 1.2656, "step": 1935 }, { "epoch": 0.7207827715791556, "grad_norm": 0.1718427985906601, "learning_rate": 1.993537032204786e-05, "loss": 1.2734, "step": 1936 }, { "epoch": 0.7211550767297646, "grad_norm": 0.19909080862998962, "learning_rate": 1.9935232300019364e-05, "loss": 1.2584, "step": 1937 }, { "epoch": 0.7215273818803737, "grad_norm": 0.17230063676834106, "learning_rate": 1.9935094131248113e-05, "loss": 1.2605, "step": 1938 }, { "epoch": 0.7218996870309827, "grad_norm": 0.18916752934455872, "learning_rate": 1.9934955815736145e-05, "loss": 1.2758, "step": 1939 }, { "epoch": 0.7222719921815919, "grad_norm": 0.1908046007156372, "learning_rate": 1.99348173534855e-05, "loss": 1.2634, "step": 1940 }, { "epoch": 0.7226442973322009, "grad_norm": 0.17501883208751678, "learning_rate": 1.9934678744498234e-05, "loss": 1.2604, "step": 1941 }, { "epoch": 0.72301660248281, "grad_norm": 0.17639294266700745, "learning_rate": 1.993453998877638e-05, "loss": 1.2641, "step": 1942 }, { "epoch": 0.723388907633419, "grad_norm": 0.19345468282699585, "learning_rate": 1.9934401086321995e-05, "loss": 1.2709, "step": 1943 }, { "epoch": 0.7237612127840282, "grad_norm": 0.18009069561958313, "learning_rate": 1.9934262037137132e-05, "loss": 1.2635, "step": 1944 }, { "epoch": 0.7241335179346372, "grad_norm": 0.19029593467712402, "learning_rate": 1.993412284122384e-05, "loss": 1.2532, "step": 1945 }, { "epoch": 0.7245058230852462, "grad_norm": 0.19446800649166107, "learning_rate": 1.9933983498584175e-05, "loss": 1.2771, "step": 1946 }, { "epoch": 0.7248781282358553, "grad_norm": 0.17743223905563354, "learning_rate": 1.99338440092202e-05, "loss": 1.276, "step": 1947 }, { "epoch": 0.7252504333864643, "grad_norm": 0.18026785552501678, "learning_rate": 1.9933704373133967e-05, "loss": 1.2737, "step": 1948 }, { "epoch": 0.7256227385370735, "grad_norm": 0.191921204328537, "learning_rate": 1.993356459032755e-05, "loss": 1.2722, "step": 1949 }, { "epoch": 0.7259950436876825, "grad_norm": 0.1790904849767685, "learning_rate": 1.9933424660803006e-05, "loss": 1.2661, "step": 1950 }, { "epoch": 0.7263673488382916, "grad_norm": 0.19434896111488342, "learning_rate": 1.99332845845624e-05, "loss": 1.2543, "step": 1951 }, { "epoch": 0.7267396539889006, "grad_norm": 0.17882947623729706, "learning_rate": 1.993314436160781e-05, "loss": 1.2631, "step": 1952 }, { "epoch": 0.7271119591395098, "grad_norm": 0.18903744220733643, "learning_rate": 1.99330039919413e-05, "loss": 1.2721, "step": 1953 }, { "epoch": 0.7274842642901188, "grad_norm": 0.17733576893806458, "learning_rate": 1.993286347556494e-05, "loss": 1.2782, "step": 1954 }, { "epoch": 0.7278565694407279, "grad_norm": 0.1745980829000473, "learning_rate": 1.9932722812480813e-05, "loss": 1.2609, "step": 1955 }, { "epoch": 0.7282288745913369, "grad_norm": 0.18172860145568848, "learning_rate": 1.9932582002690993e-05, "loss": 1.2645, "step": 1956 }, { "epoch": 0.728601179741946, "grad_norm": 0.17440102994441986, "learning_rate": 1.9932441046197558e-05, "loss": 1.2739, "step": 1957 }, { "epoch": 0.7289734848925551, "grad_norm": 0.17599335312843323, "learning_rate": 1.9932299943002596e-05, "loss": 1.2626, "step": 1958 }, { "epoch": 0.7293457900431641, "grad_norm": 0.18249286711215973, "learning_rate": 1.9932158693108183e-05, "loss": 1.2629, "step": 1959 }, { "epoch": 0.7297180951937732, "grad_norm": 0.19463662803173065, "learning_rate": 1.9932017296516414e-05, "loss": 1.2589, "step": 1960 }, { "epoch": 0.7300904003443822, "grad_norm": 0.17670099437236786, "learning_rate": 1.9931875753229367e-05, "loss": 1.2893, "step": 1961 }, { "epoch": 0.7304627054949914, "grad_norm": 0.181423619389534, "learning_rate": 1.9931734063249143e-05, "loss": 1.2583, "step": 1962 }, { "epoch": 0.7308350106456004, "grad_norm": 0.19615116715431213, "learning_rate": 1.993159222657783e-05, "loss": 1.273, "step": 1963 }, { "epoch": 0.7312073157962095, "grad_norm": 0.1963336318731308, "learning_rate": 1.9931450243217522e-05, "loss": 1.2694, "step": 1964 }, { "epoch": 0.7315796209468185, "grad_norm": 0.18081746995449066, "learning_rate": 1.993130811317032e-05, "loss": 1.2604, "step": 1965 }, { "epoch": 0.7319519260974277, "grad_norm": 0.19056904315948486, "learning_rate": 1.9931165836438314e-05, "loss": 1.2732, "step": 1966 }, { "epoch": 0.7323242312480367, "grad_norm": 0.18734106421470642, "learning_rate": 1.9931023413023615e-05, "loss": 1.2747, "step": 1967 }, { "epoch": 0.7326965363986457, "grad_norm": 0.17514047026634216, "learning_rate": 1.9930880842928325e-05, "loss": 1.2536, "step": 1968 }, { "epoch": 0.7330688415492548, "grad_norm": 0.17114852368831635, "learning_rate": 1.993073812615455e-05, "loss": 1.2664, "step": 1969 }, { "epoch": 0.7334411466998638, "grad_norm": 0.187078058719635, "learning_rate": 1.993059526270439e-05, "loss": 1.265, "step": 1970 }, { "epoch": 0.733813451850473, "grad_norm": 0.1790546178817749, "learning_rate": 1.9930452252579967e-05, "loss": 1.2676, "step": 1971 }, { "epoch": 0.734185757001082, "grad_norm": 0.18027950823307037, "learning_rate": 1.9930309095783386e-05, "loss": 1.2745, "step": 1972 }, { "epoch": 0.7345580621516911, "grad_norm": 0.18280746042728424, "learning_rate": 1.993016579231676e-05, "loss": 1.2724, "step": 1973 }, { "epoch": 0.7349303673023001, "grad_norm": 0.18688839673995972, "learning_rate": 1.9930022342182213e-05, "loss": 1.2695, "step": 1974 }, { "epoch": 0.7353026724529093, "grad_norm": 0.17303088307380676, "learning_rate": 1.9929878745381855e-05, "loss": 1.2601, "step": 1975 }, { "epoch": 0.7356749776035183, "grad_norm": 0.17247672379016876, "learning_rate": 1.992973500191781e-05, "loss": 1.2698, "step": 1976 }, { "epoch": 0.7360472827541273, "grad_norm": 0.18683381378650665, "learning_rate": 1.9929591111792206e-05, "loss": 1.2856, "step": 1977 }, { "epoch": 0.7364195879047364, "grad_norm": 0.17574839293956757, "learning_rate": 1.9929447075007164e-05, "loss": 1.2798, "step": 1978 }, { "epoch": 0.7367918930553455, "grad_norm": 0.177045539021492, "learning_rate": 1.992930289156481e-05, "loss": 1.2671, "step": 1979 }, { "epoch": 0.7371641982059546, "grad_norm": 0.1845403015613556, "learning_rate": 1.9929158561467276e-05, "loss": 1.2516, "step": 1980 }, { "epoch": 0.7375365033565636, "grad_norm": 0.18033714592456818, "learning_rate": 1.9929014084716695e-05, "loss": 1.2661, "step": 1981 }, { "epoch": 0.7379088085071727, "grad_norm": 0.17001290619373322, "learning_rate": 1.9928869461315197e-05, "loss": 1.2729, "step": 1982 }, { "epoch": 0.7382811136577817, "grad_norm": 0.1728724241256714, "learning_rate": 1.992872469126492e-05, "loss": 1.2679, "step": 1983 }, { "epoch": 0.7386534188083909, "grad_norm": 0.1804700642824173, "learning_rate": 1.9928579774568005e-05, "loss": 1.2785, "step": 1984 }, { "epoch": 0.7390257239589999, "grad_norm": 0.174544095993042, "learning_rate": 1.9928434711226586e-05, "loss": 1.2484, "step": 1985 }, { "epoch": 0.739398029109609, "grad_norm": 0.18159344792366028, "learning_rate": 1.9928289501242812e-05, "loss": 1.2658, "step": 1986 }, { "epoch": 0.739770334260218, "grad_norm": 0.16609835624694824, "learning_rate": 1.9928144144618824e-05, "loss": 1.266, "step": 1987 }, { "epoch": 0.7401426394108271, "grad_norm": 0.1732049435377121, "learning_rate": 1.992799864135677e-05, "loss": 1.2598, "step": 1988 }, { "epoch": 0.7405149445614362, "grad_norm": 0.17838548123836517, "learning_rate": 1.9927852991458802e-05, "loss": 1.2866, "step": 1989 }, { "epoch": 0.7408872497120452, "grad_norm": 0.17985506355762482, "learning_rate": 1.9927707194927067e-05, "loss": 1.2706, "step": 1990 }, { "epoch": 0.7412595548626543, "grad_norm": 0.1812310814857483, "learning_rate": 1.9927561251763717e-05, "loss": 1.2751, "step": 1991 }, { "epoch": 0.7416318600132634, "grad_norm": 0.1740981936454773, "learning_rate": 1.9927415161970913e-05, "loss": 1.2769, "step": 1992 }, { "epoch": 0.7420041651638725, "grad_norm": 0.1680895835161209, "learning_rate": 1.9927268925550808e-05, "loss": 1.2502, "step": 1993 }, { "epoch": 0.7423764703144815, "grad_norm": 0.17788243293762207, "learning_rate": 1.992712254250557e-05, "loss": 1.2735, "step": 1994 }, { "epoch": 0.7427487754650905, "grad_norm": 0.17746172845363617, "learning_rate": 1.9926976012837345e-05, "loss": 1.2665, "step": 1995 }, { "epoch": 0.7431210806156996, "grad_norm": 0.1807558834552765, "learning_rate": 1.9926829336548314e-05, "loss": 1.263, "step": 1996 }, { "epoch": 0.7434933857663087, "grad_norm": 0.17683039605617523, "learning_rate": 1.9926682513640634e-05, "loss": 1.2665, "step": 1997 }, { "epoch": 0.7438656909169178, "grad_norm": 0.1726188063621521, "learning_rate": 1.992653554411648e-05, "loss": 1.279, "step": 1998 }, { "epoch": 0.7442379960675268, "grad_norm": 0.1840149462223053, "learning_rate": 1.9926388427978016e-05, "loss": 1.267, "step": 1999 }, { "epoch": 0.7446103012181359, "grad_norm": 0.17595337331295013, "learning_rate": 1.992624116522742e-05, "loss": 1.264, "step": 2000 }, { "epoch": 0.7446103012181359, "eval_loss": 1.3380753993988037, "eval_runtime": 15.9929, "eval_samples_per_second": 108.423, "eval_steps_per_second": 5.44, "step": 2000 }, { "epoch": 0.744982606368745, "grad_norm": 0.17696943879127502, "learning_rate": 1.9926093755866862e-05, "loss": 1.2773, "step": 2001 }, { "epoch": 0.7453549115193541, "grad_norm": 0.1840478777885437, "learning_rate": 1.9925946199898526e-05, "loss": 1.2484, "step": 2002 }, { "epoch": 0.7457272166699631, "grad_norm": 0.16878820955753326, "learning_rate": 1.9925798497324583e-05, "loss": 1.2478, "step": 2003 }, { "epoch": 0.7460995218205722, "grad_norm": 0.17494863271713257, "learning_rate": 1.992565064814722e-05, "loss": 1.2547, "step": 2004 }, { "epoch": 0.7464718269711813, "grad_norm": 0.17591069638729095, "learning_rate": 1.992550265236862e-05, "loss": 1.2644, "step": 2005 }, { "epoch": 0.7468441321217903, "grad_norm": 0.17405180633068085, "learning_rate": 1.992535450999097e-05, "loss": 1.2685, "step": 2006 }, { "epoch": 0.7472164372723994, "grad_norm": 0.17987771332263947, "learning_rate": 1.9925206221016456e-05, "loss": 1.2792, "step": 2007 }, { "epoch": 0.7475887424230084, "grad_norm": 0.1718064546585083, "learning_rate": 1.992505778544727e-05, "loss": 1.2552, "step": 2008 }, { "epoch": 0.7479610475736175, "grad_norm": 0.16629862785339355, "learning_rate": 1.9924909203285604e-05, "loss": 1.2621, "step": 2009 }, { "epoch": 0.7483333527242266, "grad_norm": 0.16779156029224396, "learning_rate": 1.9924760474533654e-05, "loss": 1.2569, "step": 2010 }, { "epoch": 0.7487056578748357, "grad_norm": 0.17635951936244965, "learning_rate": 1.992461159919361e-05, "loss": 1.2617, "step": 2011 }, { "epoch": 0.7490779630254447, "grad_norm": 0.17284058034420013, "learning_rate": 1.9924462577267676e-05, "loss": 1.2622, "step": 2012 }, { "epoch": 0.7494502681760538, "grad_norm": 0.16899628937244415, "learning_rate": 1.9924313408758053e-05, "loss": 1.2684, "step": 2013 }, { "epoch": 0.7498225733266629, "grad_norm": 0.17767173051834106, "learning_rate": 1.9924164093666946e-05, "loss": 1.2624, "step": 2014 }, { "epoch": 0.750194878477272, "grad_norm": 0.17249037325382233, "learning_rate": 1.9924014631996557e-05, "loss": 1.2647, "step": 2015 }, { "epoch": 0.750567183627881, "grad_norm": 0.1744518280029297, "learning_rate": 1.9923865023749095e-05, "loss": 1.2541, "step": 2016 }, { "epoch": 0.75093948877849, "grad_norm": 0.17807431519031525, "learning_rate": 1.9923715268926765e-05, "loss": 1.2731, "step": 2017 }, { "epoch": 0.7513117939290992, "grad_norm": 0.17543937265872955, "learning_rate": 1.992356536753179e-05, "loss": 1.2463, "step": 2018 }, { "epoch": 0.7516840990797082, "grad_norm": 0.17383424937725067, "learning_rate": 1.9923415319566372e-05, "loss": 1.2479, "step": 2019 }, { "epoch": 0.7520564042303173, "grad_norm": 0.181121364235878, "learning_rate": 1.9923265125032736e-05, "loss": 1.2617, "step": 2020 }, { "epoch": 0.7524287093809263, "grad_norm": 0.18264862895011902, "learning_rate": 1.9923114783933096e-05, "loss": 1.2607, "step": 2021 }, { "epoch": 0.7528010145315354, "grad_norm": 0.1814924031496048, "learning_rate": 1.9922964296269672e-05, "loss": 1.2828, "step": 2022 }, { "epoch": 0.7531733196821445, "grad_norm": 0.17525698244571686, "learning_rate": 1.992281366204469e-05, "loss": 1.2538, "step": 2023 }, { "epoch": 0.7535456248327536, "grad_norm": 0.17692695558071136, "learning_rate": 1.9922662881260374e-05, "loss": 1.2701, "step": 2024 }, { "epoch": 0.7539179299833626, "grad_norm": 0.17601527273654938, "learning_rate": 1.9922511953918945e-05, "loss": 1.2625, "step": 2025 }, { "epoch": 0.7542902351339716, "grad_norm": 0.17493323981761932, "learning_rate": 1.992236088002264e-05, "loss": 1.256, "step": 2026 }, { "epoch": 0.7546625402845808, "grad_norm": 0.1716911792755127, "learning_rate": 1.992220965957369e-05, "loss": 1.2375, "step": 2027 }, { "epoch": 0.7550348454351898, "grad_norm": 0.1788312941789627, "learning_rate": 1.9922058292574323e-05, "loss": 1.2571, "step": 2028 }, { "epoch": 0.7554071505857989, "grad_norm": 0.1795925498008728, "learning_rate": 1.9921906779026775e-05, "loss": 1.2737, "step": 2029 }, { "epoch": 0.7557794557364079, "grad_norm": 0.17507243156433105, "learning_rate": 1.9921755118933292e-05, "loss": 1.2625, "step": 2030 }, { "epoch": 0.756151760887017, "grad_norm": 0.17492863535881042, "learning_rate": 1.992160331229611e-05, "loss": 1.268, "step": 2031 }, { "epoch": 0.7565240660376261, "grad_norm": 0.18614165484905243, "learning_rate": 1.992145135911746e-05, "loss": 1.2878, "step": 2032 }, { "epoch": 0.7568963711882352, "grad_norm": 0.18001849949359894, "learning_rate": 1.9921299259399604e-05, "loss": 1.2622, "step": 2033 }, { "epoch": 0.7572686763388442, "grad_norm": 0.18375787138938904, "learning_rate": 1.9921147013144782e-05, "loss": 1.2825, "step": 2034 }, { "epoch": 0.7576409814894532, "grad_norm": 0.17793749272823334, "learning_rate": 1.9920994620355236e-05, "loss": 1.2546, "step": 2035 }, { "epoch": 0.7580132866400624, "grad_norm": 0.18328694999217987, "learning_rate": 1.9920842081033225e-05, "loss": 1.2607, "step": 2036 }, { "epoch": 0.7583855917906714, "grad_norm": 0.17626026272773743, "learning_rate": 1.9920689395180996e-05, "loss": 1.27, "step": 2037 }, { "epoch": 0.7587578969412805, "grad_norm": 0.1846167892217636, "learning_rate": 1.9920536562800808e-05, "loss": 1.2657, "step": 2038 }, { "epoch": 0.7591302020918895, "grad_norm": 0.1859884262084961, "learning_rate": 1.9920383583894922e-05, "loss": 1.2806, "step": 2039 }, { "epoch": 0.7595025072424987, "grad_norm": 0.17855176329612732, "learning_rate": 1.992023045846559e-05, "loss": 1.2725, "step": 2040 }, { "epoch": 0.7598748123931077, "grad_norm": 0.17177069187164307, "learning_rate": 1.9920077186515076e-05, "loss": 1.2623, "step": 2041 }, { "epoch": 0.7602471175437168, "grad_norm": 0.18153630197048187, "learning_rate": 1.9919923768045646e-05, "loss": 1.2827, "step": 2042 }, { "epoch": 0.7606194226943258, "grad_norm": 0.18789653480052948, "learning_rate": 1.9919770203059564e-05, "loss": 1.2675, "step": 2043 }, { "epoch": 0.7609917278449349, "grad_norm": 0.18295817077159882, "learning_rate": 1.99196164915591e-05, "loss": 1.2635, "step": 2044 }, { "epoch": 0.761364032995544, "grad_norm": 0.18244396150112152, "learning_rate": 1.991946263354652e-05, "loss": 1.2586, "step": 2045 }, { "epoch": 0.761736338146153, "grad_norm": 0.19641557335853577, "learning_rate": 1.99193086290241e-05, "loss": 1.2838, "step": 2046 }, { "epoch": 0.7621086432967621, "grad_norm": 0.1869594007730484, "learning_rate": 1.9919154477994117e-05, "loss": 1.262, "step": 2047 }, { "epoch": 0.7624809484473711, "grad_norm": 0.1961170732975006, "learning_rate": 1.991900018045884e-05, "loss": 1.2574, "step": 2048 }, { "epoch": 0.7628532535979803, "grad_norm": 0.18542712926864624, "learning_rate": 1.9918845736420554e-05, "loss": 1.2627, "step": 2049 }, { "epoch": 0.7632255587485893, "grad_norm": 0.19344452023506165, "learning_rate": 1.9918691145881542e-05, "loss": 1.2757, "step": 2050 }, { "epoch": 0.7635978638991984, "grad_norm": 0.19451677799224854, "learning_rate": 1.9918536408844082e-05, "loss": 1.265, "step": 2051 }, { "epoch": 0.7639701690498074, "grad_norm": 0.19341467320919037, "learning_rate": 1.9918381525310464e-05, "loss": 1.2655, "step": 2052 }, { "epoch": 0.7643424742004166, "grad_norm": 0.1805962473154068, "learning_rate": 1.991822649528297e-05, "loss": 1.2639, "step": 2053 }, { "epoch": 0.7647147793510256, "grad_norm": 0.19268319010734558, "learning_rate": 1.9918071318763898e-05, "loss": 1.2563, "step": 2054 }, { "epoch": 0.7650870845016347, "grad_norm": 0.1859290599822998, "learning_rate": 1.991791599575553e-05, "loss": 1.2654, "step": 2055 }, { "epoch": 0.7654593896522437, "grad_norm": 0.18508413434028625, "learning_rate": 1.991776052626017e-05, "loss": 1.2577, "step": 2056 }, { "epoch": 0.7658316948028527, "grad_norm": 0.19763630628585815, "learning_rate": 1.9917604910280106e-05, "loss": 1.2622, "step": 2057 }, { "epoch": 0.7662039999534619, "grad_norm": 0.16519156098365784, "learning_rate": 1.991744914781764e-05, "loss": 1.2553, "step": 2058 }, { "epoch": 0.7665763051040709, "grad_norm": 0.191755473613739, "learning_rate": 1.991729323887507e-05, "loss": 1.2623, "step": 2059 }, { "epoch": 0.76694861025468, "grad_norm": 0.19937783479690552, "learning_rate": 1.9917137183454706e-05, "loss": 1.2631, "step": 2060 }, { "epoch": 0.767320915405289, "grad_norm": 0.17880156636238098, "learning_rate": 1.9916980981558846e-05, "loss": 1.2731, "step": 2061 }, { "epoch": 0.7676932205558982, "grad_norm": 0.17433685064315796, "learning_rate": 1.99168246331898e-05, "loss": 1.2617, "step": 2062 }, { "epoch": 0.7680655257065072, "grad_norm": 0.19513197243213654, "learning_rate": 1.9916668138349873e-05, "loss": 1.2746, "step": 2063 }, { "epoch": 0.7684378308571163, "grad_norm": 0.16553771495819092, "learning_rate": 1.9916511497041388e-05, "loss": 1.2607, "step": 2064 }, { "epoch": 0.7688101360077253, "grad_norm": 0.1720885932445526, "learning_rate": 1.9916354709266645e-05, "loss": 1.2665, "step": 2065 }, { "epoch": 0.7691824411583345, "grad_norm": 0.18258140981197357, "learning_rate": 1.9916197775027967e-05, "loss": 1.2586, "step": 2066 }, { "epoch": 0.7695547463089435, "grad_norm": 0.18003836274147034, "learning_rate": 1.991604069432767e-05, "loss": 1.2594, "step": 2067 }, { "epoch": 0.7699270514595525, "grad_norm": 0.1830645203590393, "learning_rate": 1.991588346716807e-05, "loss": 1.2621, "step": 2068 }, { "epoch": 0.7702993566101616, "grad_norm": 0.1741950958967209, "learning_rate": 1.9915726093551497e-05, "loss": 1.2628, "step": 2069 }, { "epoch": 0.7706716617607706, "grad_norm": 0.17035536468029022, "learning_rate": 1.991556857348027e-05, "loss": 1.2504, "step": 2070 }, { "epoch": 0.7710439669113798, "grad_norm": 0.1875630021095276, "learning_rate": 1.9915410906956723e-05, "loss": 1.2616, "step": 2071 }, { "epoch": 0.7714162720619888, "grad_norm": 0.1755642145872116, "learning_rate": 1.9915253093983175e-05, "loss": 1.2758, "step": 2072 }, { "epoch": 0.7717885772125979, "grad_norm": 0.17393867671489716, "learning_rate": 1.991509513456196e-05, "loss": 1.2584, "step": 2073 }, { "epoch": 0.7721608823632069, "grad_norm": 0.186118945479393, "learning_rate": 1.9914937028695412e-05, "loss": 1.2733, "step": 2074 }, { "epoch": 0.772533187513816, "grad_norm": 0.1890277862548828, "learning_rate": 1.991477877638587e-05, "loss": 1.2652, "step": 2075 }, { "epoch": 0.7729054926644251, "grad_norm": 0.18629412353038788, "learning_rate": 1.9914620377635666e-05, "loss": 1.2573, "step": 2076 }, { "epoch": 0.7732777978150341, "grad_norm": 0.17759479582309723, "learning_rate": 1.9914461832447142e-05, "loss": 1.2551, "step": 2077 }, { "epoch": 0.7736501029656432, "grad_norm": 0.1834801733493805, "learning_rate": 1.9914303140822634e-05, "loss": 1.252, "step": 2078 }, { "epoch": 0.7740224081162523, "grad_norm": 0.17973852157592773, "learning_rate": 1.9914144302764497e-05, "loss": 1.2553, "step": 2079 }, { "epoch": 0.7743947132668614, "grad_norm": 0.17896102368831635, "learning_rate": 1.9913985318275068e-05, "loss": 1.2553, "step": 2080 }, { "epoch": 0.7747670184174704, "grad_norm": 0.18261843919754028, "learning_rate": 1.99138261873567e-05, "loss": 1.2491, "step": 2081 }, { "epoch": 0.7751393235680795, "grad_norm": 0.19672666490077972, "learning_rate": 1.9913666910011737e-05, "loss": 1.2655, "step": 2082 }, { "epoch": 0.7755116287186885, "grad_norm": 0.18677249550819397, "learning_rate": 1.9913507486242537e-05, "loss": 1.2749, "step": 2083 }, { "epoch": 0.7758839338692977, "grad_norm": 0.18855483829975128, "learning_rate": 1.9913347916051458e-05, "loss": 1.2729, "step": 2084 }, { "epoch": 0.7762562390199067, "grad_norm": 0.18166404962539673, "learning_rate": 1.9913188199440848e-05, "loss": 1.253, "step": 2085 }, { "epoch": 0.7766285441705157, "grad_norm": 0.18028071522712708, "learning_rate": 1.9913028336413074e-05, "loss": 1.2587, "step": 2086 }, { "epoch": 0.7770008493211248, "grad_norm": 0.18412554264068604, "learning_rate": 1.991286832697049e-05, "loss": 1.2503, "step": 2087 }, { "epoch": 0.7773731544717339, "grad_norm": 0.18333709239959717, "learning_rate": 1.9912708171115463e-05, "loss": 1.2636, "step": 2088 }, { "epoch": 0.777745459622343, "grad_norm": 0.18019062280654907, "learning_rate": 1.991254786885036e-05, "loss": 1.2577, "step": 2089 }, { "epoch": 0.778117764772952, "grad_norm": 0.18314556777477264, "learning_rate": 1.991238742017755e-05, "loss": 1.2482, "step": 2090 }, { "epoch": 0.7784900699235611, "grad_norm": 0.17646336555480957, "learning_rate": 1.9912226825099395e-05, "loss": 1.262, "step": 2091 }, { "epoch": 0.7788623750741702, "grad_norm": 0.19445347785949707, "learning_rate": 1.9912066083618275e-05, "loss": 1.256, "step": 2092 }, { "epoch": 0.7792346802247793, "grad_norm": 0.17373862862586975, "learning_rate": 1.991190519573656e-05, "loss": 1.2702, "step": 2093 }, { "epoch": 0.7796069853753883, "grad_norm": 0.16923663020133972, "learning_rate": 1.9911744161456624e-05, "loss": 1.2482, "step": 2094 }, { "epoch": 0.7799792905259973, "grad_norm": 0.1783125400543213, "learning_rate": 1.9911582980780854e-05, "loss": 1.2623, "step": 2095 }, { "epoch": 0.7803515956766064, "grad_norm": 0.18715058267116547, "learning_rate": 1.9911421653711624e-05, "loss": 1.2634, "step": 2096 }, { "epoch": 0.7807239008272155, "grad_norm": 0.17201177775859833, "learning_rate": 1.9911260180251316e-05, "loss": 1.2659, "step": 2097 }, { "epoch": 0.7810962059778246, "grad_norm": 0.1753210425376892, "learning_rate": 1.991109856040232e-05, "loss": 1.2558, "step": 2098 }, { "epoch": 0.7814685111284336, "grad_norm": 0.16961847245693207, "learning_rate": 1.991093679416702e-05, "loss": 1.2614, "step": 2099 }, { "epoch": 0.7818408162790427, "grad_norm": 0.18109846115112305, "learning_rate": 1.9910774881547803e-05, "loss": 1.2597, "step": 2100 }, { "epoch": 0.7822131214296518, "grad_norm": 0.16653625667095184, "learning_rate": 1.9910612822547063e-05, "loss": 1.2603, "step": 2101 }, { "epoch": 0.7825854265802609, "grad_norm": 0.17125585675239563, "learning_rate": 1.9910450617167198e-05, "loss": 1.2597, "step": 2102 }, { "epoch": 0.7829577317308699, "grad_norm": 0.18307803571224213, "learning_rate": 1.9910288265410593e-05, "loss": 1.2728, "step": 2103 }, { "epoch": 0.783330036881479, "grad_norm": 0.17665451765060425, "learning_rate": 1.9910125767279655e-05, "loss": 1.2727, "step": 2104 }, { "epoch": 0.783702342032088, "grad_norm": 0.188068225979805, "learning_rate": 1.9909963122776785e-05, "loss": 1.2526, "step": 2105 }, { "epoch": 0.7840746471826971, "grad_norm": 0.1769012063741684, "learning_rate": 1.9909800331904375e-05, "loss": 1.2541, "step": 2106 }, { "epoch": 0.7844469523333062, "grad_norm": 0.1765563189983368, "learning_rate": 1.9909637394664842e-05, "loss": 1.2499, "step": 2107 }, { "epoch": 0.7848192574839152, "grad_norm": 0.19480668008327484, "learning_rate": 1.9909474311060583e-05, "loss": 1.2617, "step": 2108 }, { "epoch": 0.7851915626345243, "grad_norm": 0.1799997091293335, "learning_rate": 1.9909311081094012e-05, "loss": 1.2636, "step": 2109 }, { "epoch": 0.7855638677851334, "grad_norm": 0.16892112791538239, "learning_rate": 1.9909147704767537e-05, "loss": 1.2525, "step": 2110 }, { "epoch": 0.7859361729357425, "grad_norm": 0.17401593923568726, "learning_rate": 1.9908984182083574e-05, "loss": 1.2497, "step": 2111 }, { "epoch": 0.7863084780863515, "grad_norm": 0.18398383259773254, "learning_rate": 1.9908820513044535e-05, "loss": 1.2704, "step": 2112 }, { "epoch": 0.7866807832369606, "grad_norm": 0.17597925662994385, "learning_rate": 1.990865669765284e-05, "loss": 1.2628, "step": 2113 }, { "epoch": 0.7870530883875697, "grad_norm": 0.17537419497966766, "learning_rate": 1.9908492735910907e-05, "loss": 1.2674, "step": 2114 }, { "epoch": 0.7874253935381788, "grad_norm": 0.198894664645195, "learning_rate": 1.990832862782116e-05, "loss": 1.273, "step": 2115 }, { "epoch": 0.7877976986887878, "grad_norm": 0.18598125874996185, "learning_rate": 1.9908164373386016e-05, "loss": 1.2714, "step": 2116 }, { "epoch": 0.7881700038393968, "grad_norm": 0.18410056829452515, "learning_rate": 1.990799997260791e-05, "loss": 1.2774, "step": 2117 }, { "epoch": 0.7885423089900059, "grad_norm": 0.17665806412696838, "learning_rate": 1.9907835425489263e-05, "loss": 1.2697, "step": 2118 }, { "epoch": 0.788914614140615, "grad_norm": 0.18189984560012817, "learning_rate": 1.990767073203251e-05, "loss": 1.2582, "step": 2119 }, { "epoch": 0.7892869192912241, "grad_norm": 0.18040038645267487, "learning_rate": 1.9907505892240084e-05, "loss": 1.2587, "step": 2120 }, { "epoch": 0.7896592244418331, "grad_norm": 0.18493278324604034, "learning_rate": 1.9907340906114418e-05, "loss": 1.2563, "step": 2121 }, { "epoch": 0.7900315295924422, "grad_norm": 0.19034142792224884, "learning_rate": 1.9907175773657945e-05, "loss": 1.2633, "step": 2122 }, { "epoch": 0.7904038347430513, "grad_norm": 0.18734246492385864, "learning_rate": 1.990701049487311e-05, "loss": 1.2665, "step": 2123 }, { "epoch": 0.7907761398936604, "grad_norm": 0.18347962200641632, "learning_rate": 1.9906845069762352e-05, "loss": 1.2608, "step": 2124 }, { "epoch": 0.7911484450442694, "grad_norm": 0.18983064591884613, "learning_rate": 1.9906679498328114e-05, "loss": 1.2524, "step": 2125 }, { "epoch": 0.7915207501948784, "grad_norm": 0.17927458882331848, "learning_rate": 1.990651378057284e-05, "loss": 1.2826, "step": 2126 }, { "epoch": 0.7918930553454876, "grad_norm": 0.1906890869140625, "learning_rate": 1.990634791649898e-05, "loss": 1.2494, "step": 2127 }, { "epoch": 0.7922653604960966, "grad_norm": 0.19961805641651154, "learning_rate": 1.9906181906108983e-05, "loss": 1.2436, "step": 2128 }, { "epoch": 0.7926376656467057, "grad_norm": 0.18504062294960022, "learning_rate": 1.9906015749405302e-05, "loss": 1.2439, "step": 2129 }, { "epoch": 0.7930099707973147, "grad_norm": 0.18666766583919525, "learning_rate": 1.9905849446390387e-05, "loss": 1.2538, "step": 2130 }, { "epoch": 0.7933822759479238, "grad_norm": 0.19338330626487732, "learning_rate": 1.99056829970667e-05, "loss": 1.2609, "step": 2131 }, { "epoch": 0.7937545810985329, "grad_norm": 0.19320982694625854, "learning_rate": 1.9905516401436698e-05, "loss": 1.2667, "step": 2132 }, { "epoch": 0.794126886249142, "grad_norm": 0.18429437279701233, "learning_rate": 1.9905349659502836e-05, "loss": 1.2662, "step": 2133 }, { "epoch": 0.794499191399751, "grad_norm": 0.1833900511264801, "learning_rate": 1.9905182771267583e-05, "loss": 1.2662, "step": 2134 }, { "epoch": 0.79487149655036, "grad_norm": 0.19020430743694305, "learning_rate": 1.9905015736733406e-05, "loss": 1.2428, "step": 2135 }, { "epoch": 0.7952438017009692, "grad_norm": 0.18528495728969574, "learning_rate": 1.9904848555902764e-05, "loss": 1.2643, "step": 2136 }, { "epoch": 0.7956161068515782, "grad_norm": 0.1845444291830063, "learning_rate": 1.990468122877813e-05, "loss": 1.2741, "step": 2137 }, { "epoch": 0.7959884120021873, "grad_norm": 0.1760178506374359, "learning_rate": 1.9904513755361978e-05, "loss": 1.2555, "step": 2138 }, { "epoch": 0.7963607171527963, "grad_norm": 0.1901838481426239, "learning_rate": 1.990434613565678e-05, "loss": 1.2593, "step": 2139 }, { "epoch": 0.7967330223034055, "grad_norm": 0.19051726162433624, "learning_rate": 1.990417836966501e-05, "loss": 1.2585, "step": 2140 }, { "epoch": 0.7971053274540145, "grad_norm": 0.17843446135520935, "learning_rate": 1.9904010457389144e-05, "loss": 1.2613, "step": 2141 }, { "epoch": 0.7974776326046236, "grad_norm": 0.16943950951099396, "learning_rate": 1.990384239883167e-05, "loss": 1.2496, "step": 2142 }, { "epoch": 0.7978499377552326, "grad_norm": 0.19439001381397247, "learning_rate": 1.9903674193995064e-05, "loss": 1.2617, "step": 2143 }, { "epoch": 0.7982222429058417, "grad_norm": 0.17899306118488312, "learning_rate": 1.990350584288181e-05, "loss": 1.2463, "step": 2144 }, { "epoch": 0.7985945480564508, "grad_norm": 0.1824941188097, "learning_rate": 1.99033373454944e-05, "loss": 1.243, "step": 2145 }, { "epoch": 0.7989668532070598, "grad_norm": 0.1979120671749115, "learning_rate": 1.9903168701835314e-05, "loss": 1.2497, "step": 2146 }, { "epoch": 0.7993391583576689, "grad_norm": 0.18462349474430084, "learning_rate": 1.990299991190705e-05, "loss": 1.2597, "step": 2147 }, { "epoch": 0.7997114635082779, "grad_norm": 0.18362213671207428, "learning_rate": 1.9902830975712096e-05, "loss": 1.2657, "step": 2148 }, { "epoch": 0.8000837686588871, "grad_norm": 0.18074361979961395, "learning_rate": 1.9902661893252955e-05, "loss": 1.263, "step": 2149 }, { "epoch": 0.8004560738094961, "grad_norm": 0.1755630373954773, "learning_rate": 1.9902492664532116e-05, "loss": 1.2558, "step": 2150 }, { "epoch": 0.8008283789601052, "grad_norm": 0.1772383749485016, "learning_rate": 1.9902323289552084e-05, "loss": 1.2668, "step": 2151 }, { "epoch": 0.8012006841107142, "grad_norm": 0.18553152680397034, "learning_rate": 1.9902153768315355e-05, "loss": 1.2548, "step": 2152 }, { "epoch": 0.8015729892613234, "grad_norm": 0.18078090250492096, "learning_rate": 1.9901984100824442e-05, "loss": 1.2535, "step": 2153 }, { "epoch": 0.8019452944119324, "grad_norm": 0.19977954030036926, "learning_rate": 1.990181428708184e-05, "loss": 1.2606, "step": 2154 }, { "epoch": 0.8023175995625415, "grad_norm": 0.18592652678489685, "learning_rate": 1.9901644327090063e-05, "loss": 1.2663, "step": 2155 }, { "epoch": 0.8026899047131505, "grad_norm": 0.17959389090538025, "learning_rate": 1.9901474220851625e-05, "loss": 1.2591, "step": 2156 }, { "epoch": 0.8030622098637595, "grad_norm": 0.19428277015686035, "learning_rate": 1.9901303968369028e-05, "loss": 1.2703, "step": 2157 }, { "epoch": 0.8034345150143687, "grad_norm": 0.17926783859729767, "learning_rate": 1.9901133569644794e-05, "loss": 1.2682, "step": 2158 }, { "epoch": 0.8038068201649777, "grad_norm": 0.18147458136081696, "learning_rate": 1.9900963024681442e-05, "loss": 1.2536, "step": 2159 }, { "epoch": 0.8041791253155868, "grad_norm": 0.18848387897014618, "learning_rate": 1.990079233348149e-05, "loss": 1.2543, "step": 2160 }, { "epoch": 0.8045514304661958, "grad_norm": 0.20593321323394775, "learning_rate": 1.990062149604745e-05, "loss": 1.2601, "step": 2161 }, { "epoch": 0.804923735616805, "grad_norm": 0.17839789390563965, "learning_rate": 1.990045051238185e-05, "loss": 1.253, "step": 2162 }, { "epoch": 0.805296040767414, "grad_norm": 0.18125569820404053, "learning_rate": 1.9900279382487223e-05, "loss": 1.2479, "step": 2163 }, { "epoch": 0.805668345918023, "grad_norm": 0.1832691729068756, "learning_rate": 1.990010810636609e-05, "loss": 1.2563, "step": 2164 }, { "epoch": 0.8060406510686321, "grad_norm": 0.18866664171218872, "learning_rate": 1.9899936684020983e-05, "loss": 1.2379, "step": 2165 }, { "epoch": 0.8064129562192413, "grad_norm": 0.17657117545604706, "learning_rate": 1.989976511545443e-05, "loss": 1.2568, "step": 2166 }, { "epoch": 0.8067852613698503, "grad_norm": 0.17251810431480408, "learning_rate": 1.989959340066897e-05, "loss": 1.2435, "step": 2167 }, { "epoch": 0.8071575665204593, "grad_norm": 0.1733696609735489, "learning_rate": 1.9899421539667132e-05, "loss": 1.2434, "step": 2168 }, { "epoch": 0.8075298716710684, "grad_norm": 0.1835404336452484, "learning_rate": 1.989924953245146e-05, "loss": 1.2721, "step": 2169 }, { "epoch": 0.8079021768216774, "grad_norm": 0.18328076601028442, "learning_rate": 1.9899077379024497e-05, "loss": 1.2567, "step": 2170 }, { "epoch": 0.8082744819722866, "grad_norm": 0.18229638040065765, "learning_rate": 1.989890507938878e-05, "loss": 1.2604, "step": 2171 }, { "epoch": 0.8086467871228956, "grad_norm": 0.18881352245807648, "learning_rate": 1.989873263354686e-05, "loss": 1.2579, "step": 2172 }, { "epoch": 0.8090190922735047, "grad_norm": 0.179013192653656, "learning_rate": 1.9898560041501277e-05, "loss": 1.2631, "step": 2173 }, { "epoch": 0.8093913974241137, "grad_norm": 0.18358515202999115, "learning_rate": 1.9898387303254584e-05, "loss": 1.2431, "step": 2174 }, { "epoch": 0.8097637025747229, "grad_norm": 0.179754838347435, "learning_rate": 1.989821441880933e-05, "loss": 1.2645, "step": 2175 }, { "epoch": 0.8101360077253319, "grad_norm": 0.18313923478126526, "learning_rate": 1.989804138816807e-05, "loss": 1.2684, "step": 2176 }, { "epoch": 0.8105083128759409, "grad_norm": 0.19836069643497467, "learning_rate": 1.9897868211333362e-05, "loss": 1.2612, "step": 2177 }, { "epoch": 0.81088061802655, "grad_norm": 0.19085316359996796, "learning_rate": 1.9897694888307763e-05, "loss": 1.2471, "step": 2178 }, { "epoch": 0.811252923177159, "grad_norm": 0.18687599897384644, "learning_rate": 1.9897521419093828e-05, "loss": 1.2446, "step": 2179 }, { "epoch": 0.8116252283277682, "grad_norm": 0.1764868050813675, "learning_rate": 1.9897347803694126e-05, "loss": 1.2665, "step": 2180 }, { "epoch": 0.8119975334783772, "grad_norm": 0.2134658545255661, "learning_rate": 1.9897174042111214e-05, "loss": 1.2447, "step": 2181 }, { "epoch": 0.8123698386289863, "grad_norm": 0.1877439022064209, "learning_rate": 1.9897000134347665e-05, "loss": 1.2686, "step": 2182 }, { "epoch": 0.8127421437795953, "grad_norm": 0.1730710119009018, "learning_rate": 1.9896826080406046e-05, "loss": 1.2661, "step": 2183 }, { "epoch": 0.8131144489302045, "grad_norm": 0.2091267704963684, "learning_rate": 1.9896651880288926e-05, "loss": 1.2707, "step": 2184 }, { "epoch": 0.8134867540808135, "grad_norm": 0.19189974665641785, "learning_rate": 1.9896477533998883e-05, "loss": 1.2617, "step": 2185 }, { "epoch": 0.8138590592314225, "grad_norm": 0.18066149950027466, "learning_rate": 1.989630304153848e-05, "loss": 1.2609, "step": 2186 }, { "epoch": 0.8142313643820316, "grad_norm": 0.2049744427204132, "learning_rate": 1.9896128402910307e-05, "loss": 1.2573, "step": 2187 }, { "epoch": 0.8146036695326407, "grad_norm": 0.1748519241809845, "learning_rate": 1.9895953618116935e-05, "loss": 1.2437, "step": 2188 }, { "epoch": 0.8149759746832498, "grad_norm": 0.17954249680042267, "learning_rate": 1.9895778687160954e-05, "loss": 1.2665, "step": 2189 }, { "epoch": 0.8153482798338588, "grad_norm": 0.18017897009849548, "learning_rate": 1.989560361004494e-05, "loss": 1.2545, "step": 2190 }, { "epoch": 0.8157205849844679, "grad_norm": 0.20069225132465363, "learning_rate": 1.9895428386771482e-05, "loss": 1.2736, "step": 2191 }, { "epoch": 0.8160928901350769, "grad_norm": 0.17469745874404907, "learning_rate": 1.989525301734317e-05, "loss": 1.2449, "step": 2192 }, { "epoch": 0.8164651952856861, "grad_norm": 0.18415339291095734, "learning_rate": 1.9895077501762588e-05, "loss": 1.2507, "step": 2193 }, { "epoch": 0.8168375004362951, "grad_norm": 0.18301935493946075, "learning_rate": 1.9894901840032336e-05, "loss": 1.254, "step": 2194 }, { "epoch": 0.8172098055869041, "grad_norm": 0.18746277689933777, "learning_rate": 1.9894726032155e-05, "loss": 1.2621, "step": 2195 }, { "epoch": 0.8175821107375132, "grad_norm": 0.17902565002441406, "learning_rate": 1.9894550078133186e-05, "loss": 1.267, "step": 2196 }, { "epoch": 0.8179544158881223, "grad_norm": 0.1712873876094818, "learning_rate": 1.9894373977969486e-05, "loss": 1.2545, "step": 2197 }, { "epoch": 0.8183267210387314, "grad_norm": 0.1745031774044037, "learning_rate": 1.9894197731666506e-05, "loss": 1.244, "step": 2198 }, { "epoch": 0.8186990261893404, "grad_norm": 0.18271346390247345, "learning_rate": 1.9894021339226843e-05, "loss": 1.263, "step": 2199 }, { "epoch": 0.8190713313399495, "grad_norm": 0.17454905807971954, "learning_rate": 1.989384480065311e-05, "loss": 1.2636, "step": 2200 }, { "epoch": 0.8194436364905586, "grad_norm": 0.17742374539375305, "learning_rate": 1.9893668115947906e-05, "loss": 1.2489, "step": 2201 }, { "epoch": 0.8198159416411677, "grad_norm": 0.17203061282634735, "learning_rate": 1.9893491285113845e-05, "loss": 1.248, "step": 2202 }, { "epoch": 0.8201882467917767, "grad_norm": 0.1837659776210785, "learning_rate": 1.989331430815354e-05, "loss": 1.2582, "step": 2203 }, { "epoch": 0.8205605519423858, "grad_norm": 0.17900030314922333, "learning_rate": 1.9893137185069603e-05, "loss": 1.2472, "step": 2204 }, { "epoch": 0.8209328570929948, "grad_norm": 0.17265020310878754, "learning_rate": 1.9892959915864652e-05, "loss": 1.2608, "step": 2205 }, { "epoch": 0.821305162243604, "grad_norm": 0.1822979599237442, "learning_rate": 1.9892782500541304e-05, "loss": 1.2477, "step": 2206 }, { "epoch": 0.821677467394213, "grad_norm": 0.18672309815883636, "learning_rate": 1.9892604939102177e-05, "loss": 1.2489, "step": 2207 }, { "epoch": 0.822049772544822, "grad_norm": 0.18067359924316406, "learning_rate": 1.9892427231549897e-05, "loss": 1.2511, "step": 2208 }, { "epoch": 0.8224220776954311, "grad_norm": 0.18508952856063843, "learning_rate": 1.9892249377887086e-05, "loss": 1.2522, "step": 2209 }, { "epoch": 0.8227943828460402, "grad_norm": 0.17746296525001526, "learning_rate": 1.9892071378116378e-05, "loss": 1.2487, "step": 2210 }, { "epoch": 0.8231666879966493, "grad_norm": 0.1730239987373352, "learning_rate": 1.9891893232240394e-05, "loss": 1.2631, "step": 2211 }, { "epoch": 0.8235389931472583, "grad_norm": 0.1845473349094391, "learning_rate": 1.9891714940261764e-05, "loss": 1.2595, "step": 2212 }, { "epoch": 0.8239112982978674, "grad_norm": 0.18520519137382507, "learning_rate": 1.989153650218313e-05, "loss": 1.2479, "step": 2213 }, { "epoch": 0.8242836034484765, "grad_norm": 0.19345776736736298, "learning_rate": 1.989135791800712e-05, "loss": 1.2744, "step": 2214 }, { "epoch": 0.8246559085990856, "grad_norm": 0.17609384655952454, "learning_rate": 1.9891179187736375e-05, "loss": 1.2465, "step": 2215 }, { "epoch": 0.8250282137496946, "grad_norm": 0.1815994530916214, "learning_rate": 1.9891000311373533e-05, "loss": 1.2523, "step": 2216 }, { "epoch": 0.8254005189003036, "grad_norm": 0.19329750537872314, "learning_rate": 1.9890821288921238e-05, "loss": 1.2651, "step": 2217 }, { "epoch": 0.8257728240509127, "grad_norm": 0.18610511720180511, "learning_rate": 1.9890642120382132e-05, "loss": 1.2453, "step": 2218 }, { "epoch": 0.8261451292015218, "grad_norm": 0.1695135533809662, "learning_rate": 1.9890462805758863e-05, "loss": 1.2745, "step": 2219 }, { "epoch": 0.8265174343521309, "grad_norm": 0.18071943521499634, "learning_rate": 1.9890283345054082e-05, "loss": 1.2737, "step": 2220 }, { "epoch": 0.8268897395027399, "grad_norm": 0.18198001384735107, "learning_rate": 1.9890103738270433e-05, "loss": 1.2495, "step": 2221 }, { "epoch": 0.827262044653349, "grad_norm": 0.1641894280910492, "learning_rate": 1.9889923985410576e-05, "loss": 1.2556, "step": 2222 }, { "epoch": 0.8276343498039581, "grad_norm": 0.18489384651184082, "learning_rate": 1.9889744086477162e-05, "loss": 1.2591, "step": 2223 }, { "epoch": 0.8280066549545672, "grad_norm": 0.1797334849834442, "learning_rate": 1.9889564041472846e-05, "loss": 1.2376, "step": 2224 }, { "epoch": 0.8283789601051762, "grad_norm": 0.16851265728473663, "learning_rate": 1.988938385040029e-05, "loss": 1.2442, "step": 2225 }, { "epoch": 0.8287512652557852, "grad_norm": 0.17845316231250763, "learning_rate": 1.9889203513262153e-05, "loss": 1.2432, "step": 2226 }, { "epoch": 0.8291235704063944, "grad_norm": 0.18235254287719727, "learning_rate": 1.9889023030061106e-05, "loss": 1.2593, "step": 2227 }, { "epoch": 0.8294958755570034, "grad_norm": 0.1719173640012741, "learning_rate": 1.9888842400799805e-05, "loss": 1.2567, "step": 2228 }, { "epoch": 0.8298681807076125, "grad_norm": 0.17865481972694397, "learning_rate": 1.9888661625480927e-05, "loss": 1.2712, "step": 2229 }, { "epoch": 0.8302404858582215, "grad_norm": 0.1867910474538803, "learning_rate": 1.9888480704107135e-05, "loss": 1.2432, "step": 2230 }, { "epoch": 0.8306127910088306, "grad_norm": 0.17481322586536407, "learning_rate": 1.9888299636681105e-05, "loss": 1.2538, "step": 2231 }, { "epoch": 0.8309850961594397, "grad_norm": 0.18114839494228363, "learning_rate": 1.9888118423205504e-05, "loss": 1.2521, "step": 2232 }, { "epoch": 0.8313574013100488, "grad_norm": 0.17833612859249115, "learning_rate": 1.988793706368302e-05, "loss": 1.2607, "step": 2233 }, { "epoch": 0.8317297064606578, "grad_norm": 0.17668797075748444, "learning_rate": 1.9887755558116324e-05, "loss": 1.2573, "step": 2234 }, { "epoch": 0.8321020116112668, "grad_norm": 0.1729530692100525, "learning_rate": 1.9887573906508103e-05, "loss": 1.2558, "step": 2235 }, { "epoch": 0.832474316761876, "grad_norm": 0.19922944903373718, "learning_rate": 1.988739210886103e-05, "loss": 1.2743, "step": 2236 }, { "epoch": 0.832846621912485, "grad_norm": 0.18021075427532196, "learning_rate": 1.98872101651778e-05, "loss": 1.2672, "step": 2237 }, { "epoch": 0.8332189270630941, "grad_norm": 0.1811213493347168, "learning_rate": 1.9887028075461096e-05, "loss": 1.247, "step": 2238 }, { "epoch": 0.8335912322137031, "grad_norm": 0.18018268048763275, "learning_rate": 1.9886845839713604e-05, "loss": 1.2451, "step": 2239 }, { "epoch": 0.8339635373643123, "grad_norm": 0.1896362155675888, "learning_rate": 1.9886663457938025e-05, "loss": 1.2558, "step": 2240 }, { "epoch": 0.8343358425149213, "grad_norm": 0.1847160905599594, "learning_rate": 1.9886480930137046e-05, "loss": 1.2619, "step": 2241 }, { "epoch": 0.8347081476655304, "grad_norm": 0.17278259992599487, "learning_rate": 1.988629825631336e-05, "loss": 1.258, "step": 2242 }, { "epoch": 0.8350804528161394, "grad_norm": 0.17939510941505432, "learning_rate": 1.9886115436469674e-05, "loss": 1.2665, "step": 2243 }, { "epoch": 0.8354527579667484, "grad_norm": 0.17530454695224762, "learning_rate": 1.9885932470608676e-05, "loss": 1.2531, "step": 2244 }, { "epoch": 0.8358250631173576, "grad_norm": 0.20562461018562317, "learning_rate": 1.9885749358733086e-05, "loss": 1.2467, "step": 2245 }, { "epoch": 0.8361973682679666, "grad_norm": 0.17135387659072876, "learning_rate": 1.988556610084559e-05, "loss": 1.2486, "step": 2246 }, { "epoch": 0.8365696734185757, "grad_norm": 0.18840156495571136, "learning_rate": 1.9885382696948906e-05, "loss": 1.2536, "step": 2247 }, { "epoch": 0.8369419785691847, "grad_norm": 0.1828533560037613, "learning_rate": 1.988519914704574e-05, "loss": 1.2581, "step": 2248 }, { "epoch": 0.8373142837197939, "grad_norm": 0.1792685091495514, "learning_rate": 1.988501545113881e-05, "loss": 1.2597, "step": 2249 }, { "epoch": 0.8376865888704029, "grad_norm": 0.18444480001926422, "learning_rate": 1.9884831609230813e-05, "loss": 1.2414, "step": 2250 }, { "epoch": 0.838058894021012, "grad_norm": 0.18098483979701996, "learning_rate": 1.9884647621324475e-05, "loss": 1.2448, "step": 2251 }, { "epoch": 0.838431199171621, "grad_norm": 0.1816195845603943, "learning_rate": 1.9884463487422515e-05, "loss": 1.2499, "step": 2252 }, { "epoch": 0.83880350432223, "grad_norm": 0.19393447041511536, "learning_rate": 1.9884279207527647e-05, "loss": 1.2584, "step": 2253 }, { "epoch": 0.8391758094728392, "grad_norm": 0.18464012444019318, "learning_rate": 1.9884094781642592e-05, "loss": 1.2544, "step": 2254 }, { "epoch": 0.8395481146234482, "grad_norm": 0.17956916987895966, "learning_rate": 1.9883910209770083e-05, "loss": 1.2602, "step": 2255 }, { "epoch": 0.8399204197740573, "grad_norm": 0.18106333911418915, "learning_rate": 1.988372549191284e-05, "loss": 1.2553, "step": 2256 }, { "epoch": 0.8402927249246663, "grad_norm": 0.18379664421081543, "learning_rate": 1.9883540628073592e-05, "loss": 1.2517, "step": 2257 }, { "epoch": 0.8406650300752755, "grad_norm": 0.17581118643283844, "learning_rate": 1.9883355618255068e-05, "loss": 1.2534, "step": 2258 }, { "epoch": 0.8410373352258845, "grad_norm": 0.17392081022262573, "learning_rate": 1.988317046246e-05, "loss": 1.2616, "step": 2259 }, { "epoch": 0.8414096403764936, "grad_norm": 0.1733708381652832, "learning_rate": 1.9882985160691127e-05, "loss": 1.2549, "step": 2260 }, { "epoch": 0.8417819455271026, "grad_norm": 0.1678503155708313, "learning_rate": 1.9882799712951182e-05, "loss": 1.2653, "step": 2261 }, { "epoch": 0.8421542506777118, "grad_norm": 0.17663797736167908, "learning_rate": 1.9882614119242906e-05, "loss": 1.2556, "step": 2262 }, { "epoch": 0.8425265558283208, "grad_norm": 0.17948319017887115, "learning_rate": 1.988242837956904e-05, "loss": 1.2459, "step": 2263 }, { "epoch": 0.8428988609789299, "grad_norm": 0.17682886123657227, "learning_rate": 1.9882242493932327e-05, "loss": 1.2554, "step": 2264 }, { "epoch": 0.8432711661295389, "grad_norm": 0.17694690823554993, "learning_rate": 1.9882056462335513e-05, "loss": 1.2536, "step": 2265 }, { "epoch": 0.8436434712801479, "grad_norm": 0.18028870224952698, "learning_rate": 1.9881870284781345e-05, "loss": 1.2572, "step": 2266 }, { "epoch": 0.8440157764307571, "grad_norm": 0.17367003858089447, "learning_rate": 1.9881683961272572e-05, "loss": 1.2496, "step": 2267 }, { "epoch": 0.8443880815813661, "grad_norm": 0.18058733642101288, "learning_rate": 1.988149749181195e-05, "loss": 1.2648, "step": 2268 }, { "epoch": 0.8447603867319752, "grad_norm": 0.1787886917591095, "learning_rate": 1.9881310876402225e-05, "loss": 1.2452, "step": 2269 }, { "epoch": 0.8451326918825842, "grad_norm": 0.17933231592178345, "learning_rate": 1.988112411504616e-05, "loss": 1.2531, "step": 2270 }, { "epoch": 0.8455049970331934, "grad_norm": 0.16929695010185242, "learning_rate": 1.9880937207746515e-05, "loss": 1.2432, "step": 2271 }, { "epoch": 0.8458773021838024, "grad_norm": 0.17460580170154572, "learning_rate": 1.9880750154506048e-05, "loss": 1.2518, "step": 2272 }, { "epoch": 0.8462496073344115, "grad_norm": 0.19186393916606903, "learning_rate": 1.9880562955327516e-05, "loss": 1.2534, "step": 2273 }, { "epoch": 0.8466219124850205, "grad_norm": 0.17282183468341827, "learning_rate": 1.9880375610213694e-05, "loss": 1.241, "step": 2274 }, { "epoch": 0.8469942176356297, "grad_norm": 0.16319245100021362, "learning_rate": 1.9880188119167345e-05, "loss": 1.2538, "step": 2275 }, { "epoch": 0.8473665227862387, "grad_norm": 0.1815318912267685, "learning_rate": 1.988000048219123e-05, "loss": 1.2561, "step": 2276 }, { "epoch": 0.8477388279368477, "grad_norm": 0.17304065823554993, "learning_rate": 1.9879812699288136e-05, "loss": 1.2545, "step": 2277 }, { "epoch": 0.8481111330874568, "grad_norm": 0.17184028029441833, "learning_rate": 1.9879624770460827e-05, "loss": 1.2637, "step": 2278 }, { "epoch": 0.8484834382380658, "grad_norm": 0.1777077466249466, "learning_rate": 1.9879436695712076e-05, "loss": 1.2599, "step": 2279 }, { "epoch": 0.848855743388675, "grad_norm": 0.17770980298519135, "learning_rate": 1.9879248475044668e-05, "loss": 1.2289, "step": 2280 }, { "epoch": 0.849228048539284, "grad_norm": 0.1785004436969757, "learning_rate": 1.987906010846138e-05, "loss": 1.2577, "step": 2281 }, { "epoch": 0.8496003536898931, "grad_norm": 0.16683989763259888, "learning_rate": 1.9878871595964993e-05, "loss": 1.2454, "step": 2282 }, { "epoch": 0.8499726588405021, "grad_norm": 0.18355853855609894, "learning_rate": 1.9878682937558297e-05, "loss": 1.284, "step": 2283 }, { "epoch": 0.8503449639911113, "grad_norm": 0.1772661954164505, "learning_rate": 1.987849413324407e-05, "loss": 1.2512, "step": 2284 }, { "epoch": 0.8507172691417203, "grad_norm": 0.17306406795978546, "learning_rate": 1.9878305183025103e-05, "loss": 1.2589, "step": 2285 }, { "epoch": 0.8510895742923293, "grad_norm": 0.16759039461612701, "learning_rate": 1.987811608690419e-05, "loss": 1.2458, "step": 2286 }, { "epoch": 0.8514618794429384, "grad_norm": 0.17323878407478333, "learning_rate": 1.9877926844884126e-05, "loss": 1.2559, "step": 2287 }, { "epoch": 0.8518341845935475, "grad_norm": 0.16814987361431122, "learning_rate": 1.9877737456967698e-05, "loss": 1.2589, "step": 2288 }, { "epoch": 0.8522064897441566, "grad_norm": 0.16971348226070404, "learning_rate": 1.987754792315771e-05, "loss": 1.2541, "step": 2289 }, { "epoch": 0.8525787948947656, "grad_norm": 0.16793325543403625, "learning_rate": 1.9877358243456956e-05, "loss": 1.2716, "step": 2290 }, { "epoch": 0.8529511000453747, "grad_norm": 0.17254911363124847, "learning_rate": 1.9877168417868244e-05, "loss": 1.253, "step": 2291 }, { "epoch": 0.8533234051959837, "grad_norm": 0.17216616868972778, "learning_rate": 1.9876978446394372e-05, "loss": 1.2495, "step": 2292 }, { "epoch": 0.8536957103465929, "grad_norm": 0.1849498599767685, "learning_rate": 1.9876788329038147e-05, "loss": 1.2736, "step": 2293 }, { "epoch": 0.8540680154972019, "grad_norm": 0.17307497560977936, "learning_rate": 1.987659806580238e-05, "loss": 1.2549, "step": 2294 }, { "epoch": 0.854440320647811, "grad_norm": 0.17321421205997467, "learning_rate": 1.9876407656689883e-05, "loss": 1.2634, "step": 2295 }, { "epoch": 0.85481262579842, "grad_norm": 0.17435167729854584, "learning_rate": 1.987621710170346e-05, "loss": 1.2443, "step": 2296 }, { "epoch": 0.8551849309490291, "grad_norm": 0.1697143167257309, "learning_rate": 1.9876026400845933e-05, "loss": 1.2621, "step": 2297 }, { "epoch": 0.8555572360996382, "grad_norm": 0.18251660466194153, "learning_rate": 1.9875835554120114e-05, "loss": 1.2588, "step": 2298 }, { "epoch": 0.8559295412502472, "grad_norm": 0.182033970952034, "learning_rate": 1.9875644561528824e-05, "loss": 1.2656, "step": 2299 }, { "epoch": 0.8563018464008563, "grad_norm": 0.19597557187080383, "learning_rate": 1.9875453423074883e-05, "loss": 1.2545, "step": 2300 }, { "epoch": 0.8566741515514654, "grad_norm": 0.17287831008434296, "learning_rate": 1.9875262138761116e-05, "loss": 1.2662, "step": 2301 }, { "epoch": 0.8570464567020745, "grad_norm": 0.18034107983112335, "learning_rate": 1.987507070859035e-05, "loss": 1.2561, "step": 2302 }, { "epoch": 0.8574187618526835, "grad_norm": 0.1734304428100586, "learning_rate": 1.98748791325654e-05, "loss": 1.2659, "step": 2303 }, { "epoch": 0.8577910670032926, "grad_norm": 0.18945597112178802, "learning_rate": 1.9874687410689114e-05, "loss": 1.265, "step": 2304 }, { "epoch": 0.8581633721539016, "grad_norm": 0.1864440143108368, "learning_rate": 1.9874495542964308e-05, "loss": 1.2634, "step": 2305 }, { "epoch": 0.8585356773045107, "grad_norm": 0.1847706139087677, "learning_rate": 1.987430352939383e-05, "loss": 1.2572, "step": 2306 }, { "epoch": 0.8589079824551198, "grad_norm": 0.18453647196292877, "learning_rate": 1.98741113699805e-05, "loss": 1.2501, "step": 2307 }, { "epoch": 0.8592802876057288, "grad_norm": 0.17989476025104523, "learning_rate": 1.9873919064727173e-05, "loss": 1.2589, "step": 2308 }, { "epoch": 0.8596525927563379, "grad_norm": 0.19050173461437225, "learning_rate": 1.9873726613636678e-05, "loss": 1.2489, "step": 2309 }, { "epoch": 0.860024897906947, "grad_norm": 0.17572608590126038, "learning_rate": 1.987353401671186e-05, "loss": 1.2375, "step": 2310 }, { "epoch": 0.8603972030575561, "grad_norm": 0.17381906509399414, "learning_rate": 1.987334127395556e-05, "loss": 1.2631, "step": 2311 }, { "epoch": 0.8607695082081651, "grad_norm": 0.17476747930049896, "learning_rate": 1.9873148385370635e-05, "loss": 1.2418, "step": 2312 }, { "epoch": 0.8611418133587742, "grad_norm": 0.1777612864971161, "learning_rate": 1.9872955350959927e-05, "loss": 1.2502, "step": 2313 }, { "epoch": 0.8615141185093833, "grad_norm": 0.17522254586219788, "learning_rate": 1.9872762170726284e-05, "loss": 1.2653, "step": 2314 }, { "epoch": 0.8618864236599924, "grad_norm": 0.17853045463562012, "learning_rate": 1.9872568844672567e-05, "loss": 1.2536, "step": 2315 }, { "epoch": 0.8622587288106014, "grad_norm": 0.1815364956855774, "learning_rate": 1.9872375372801627e-05, "loss": 1.2518, "step": 2316 }, { "epoch": 0.8626310339612104, "grad_norm": 0.1911124587059021, "learning_rate": 1.9872181755116324e-05, "loss": 1.2583, "step": 2317 }, { "epoch": 0.8630033391118195, "grad_norm": 0.1666967123746872, "learning_rate": 1.9871987991619516e-05, "loss": 1.2509, "step": 2318 }, { "epoch": 0.8633756442624286, "grad_norm": 0.19720950722694397, "learning_rate": 1.9871794082314064e-05, "loss": 1.2615, "step": 2319 }, { "epoch": 0.8637479494130377, "grad_norm": 0.18802502751350403, "learning_rate": 1.987160002720283e-05, "loss": 1.2608, "step": 2320 }, { "epoch": 0.8641202545636467, "grad_norm": 0.18227264285087585, "learning_rate": 1.9871405826288685e-05, "loss": 1.2452, "step": 2321 }, { "epoch": 0.8644925597142558, "grad_norm": 0.1774458885192871, "learning_rate": 1.9871211479574497e-05, "loss": 1.2562, "step": 2322 }, { "epoch": 0.8648648648648649, "grad_norm": 0.1868571639060974, "learning_rate": 1.9871016987063133e-05, "loss": 1.2485, "step": 2323 }, { "epoch": 0.865237170015474, "grad_norm": 0.18438728153705597, "learning_rate": 1.987082234875747e-05, "loss": 1.2437, "step": 2324 }, { "epoch": 0.865609475166083, "grad_norm": 0.17169415950775146, "learning_rate": 1.987062756466038e-05, "loss": 1.2487, "step": 2325 }, { "epoch": 0.865981780316692, "grad_norm": 0.1830715835094452, "learning_rate": 1.9870432634774737e-05, "loss": 1.2454, "step": 2326 }, { "epoch": 0.8663540854673011, "grad_norm": 0.18534304201602936, "learning_rate": 1.9870237559103426e-05, "loss": 1.2691, "step": 2327 }, { "epoch": 0.8667263906179102, "grad_norm": 0.17846928536891937, "learning_rate": 1.9870042337649328e-05, "loss": 1.2524, "step": 2328 }, { "epoch": 0.8670986957685193, "grad_norm": 0.17754697799682617, "learning_rate": 1.9869846970415317e-05, "loss": 1.2456, "step": 2329 }, { "epoch": 0.8674710009191283, "grad_norm": 0.1744270771741867, "learning_rate": 1.9869651457404293e-05, "loss": 1.2393, "step": 2330 }, { "epoch": 0.8678433060697374, "grad_norm": 0.18277184665203094, "learning_rate": 1.9869455798619133e-05, "loss": 1.2672, "step": 2331 }, { "epoch": 0.8682156112203465, "grad_norm": 0.1796514391899109, "learning_rate": 1.986925999406273e-05, "loss": 1.247, "step": 2332 }, { "epoch": 0.8685879163709556, "grad_norm": 0.17978057265281677, "learning_rate": 1.9869064043737977e-05, "loss": 1.258, "step": 2333 }, { "epoch": 0.8689602215215646, "grad_norm": 0.1840755194425583, "learning_rate": 1.9868867947647768e-05, "loss": 1.2558, "step": 2334 }, { "epoch": 0.8693325266721736, "grad_norm": 0.18063770234584808, "learning_rate": 1.9868671705794997e-05, "loss": 1.2623, "step": 2335 }, { "epoch": 0.8697048318227828, "grad_norm": 0.16667230427265167, "learning_rate": 1.9868475318182566e-05, "loss": 1.2476, "step": 2336 }, { "epoch": 0.8700771369733918, "grad_norm": 0.17838679254055023, "learning_rate": 1.9868278784813374e-05, "loss": 1.2663, "step": 2337 }, { "epoch": 0.8704494421240009, "grad_norm": 0.1771472841501236, "learning_rate": 1.9868082105690323e-05, "loss": 1.2516, "step": 2338 }, { "epoch": 0.8708217472746099, "grad_norm": 0.18569402396678925, "learning_rate": 1.9867885280816317e-05, "loss": 1.2496, "step": 2339 }, { "epoch": 0.871194052425219, "grad_norm": 0.17394711077213287, "learning_rate": 1.986768831019427e-05, "loss": 1.2457, "step": 2340 }, { "epoch": 0.8715663575758281, "grad_norm": 0.16605480015277863, "learning_rate": 1.986749119382708e-05, "loss": 1.2542, "step": 2341 }, { "epoch": 0.8719386627264372, "grad_norm": 0.17628324031829834, "learning_rate": 1.9867293931717664e-05, "loss": 1.2544, "step": 2342 }, { "epoch": 0.8723109678770462, "grad_norm": 0.18517057597637177, "learning_rate": 1.986709652386894e-05, "loss": 1.2576, "step": 2343 }, { "epoch": 0.8726832730276552, "grad_norm": 0.17365185916423798, "learning_rate": 1.9866898970283816e-05, "loss": 1.2472, "step": 2344 }, { "epoch": 0.8730555781782644, "grad_norm": 0.17242515087127686, "learning_rate": 1.9866701270965217e-05, "loss": 1.2536, "step": 2345 }, { "epoch": 0.8734278833288734, "grad_norm": 0.1831977367401123, "learning_rate": 1.986650342591606e-05, "loss": 1.2538, "step": 2346 }, { "epoch": 0.8738001884794825, "grad_norm": 0.1806264966726303, "learning_rate": 1.986630543513926e-05, "loss": 1.2417, "step": 2347 }, { "epoch": 0.8741724936300915, "grad_norm": 0.1779744178056717, "learning_rate": 1.9866107298637754e-05, "loss": 1.2577, "step": 2348 }, { "epoch": 0.8745447987807007, "grad_norm": 0.18145568668842316, "learning_rate": 1.9865909016414462e-05, "loss": 1.2498, "step": 2349 }, { "epoch": 0.8749171039313097, "grad_norm": 0.1738026887178421, "learning_rate": 1.9865710588472307e-05, "loss": 1.2482, "step": 2350 }, { "epoch": 0.8752894090819188, "grad_norm": 0.17503541707992554, "learning_rate": 1.986551201481423e-05, "loss": 1.2433, "step": 2351 }, { "epoch": 0.8756617142325278, "grad_norm": 0.18008525669574738, "learning_rate": 1.9865313295443164e-05, "loss": 1.2692, "step": 2352 }, { "epoch": 0.8760340193831369, "grad_norm": 0.17630065977573395, "learning_rate": 1.9865114430362037e-05, "loss": 1.256, "step": 2353 }, { "epoch": 0.876406324533746, "grad_norm": 0.16775432229042053, "learning_rate": 1.9864915419573787e-05, "loss": 1.2493, "step": 2354 }, { "epoch": 0.876778629684355, "grad_norm": 0.17783991992473602, "learning_rate": 1.9864716263081356e-05, "loss": 1.2344, "step": 2355 }, { "epoch": 0.8771509348349641, "grad_norm": 0.1752803474664688, "learning_rate": 1.9864516960887684e-05, "loss": 1.2532, "step": 2356 }, { "epoch": 0.8775232399855731, "grad_norm": 0.17517109215259552, "learning_rate": 1.9864317512995718e-05, "loss": 1.2508, "step": 2357 }, { "epoch": 0.8778955451361823, "grad_norm": 0.19087643921375275, "learning_rate": 1.9864117919408397e-05, "loss": 1.2517, "step": 2358 }, { "epoch": 0.8782678502867913, "grad_norm": 0.17448683083057404, "learning_rate": 1.986391818012868e-05, "loss": 1.2431, "step": 2359 }, { "epoch": 0.8786401554374004, "grad_norm": 0.1711607575416565, "learning_rate": 1.9863718295159503e-05, "loss": 1.257, "step": 2360 }, { "epoch": 0.8790124605880094, "grad_norm": 0.1857292354106903, "learning_rate": 1.9863518264503832e-05, "loss": 1.2477, "step": 2361 }, { "epoch": 0.8793847657386186, "grad_norm": 0.18781475722789764, "learning_rate": 1.9863318088164613e-05, "loss": 1.2672, "step": 2362 }, { "epoch": 0.8797570708892276, "grad_norm": 0.16976489126682281, "learning_rate": 1.9863117766144807e-05, "loss": 1.2528, "step": 2363 }, { "epoch": 0.8801293760398367, "grad_norm": 0.17341898381710052, "learning_rate": 1.9862917298447365e-05, "loss": 1.2625, "step": 2364 }, { "epoch": 0.8805016811904457, "grad_norm": 0.18177741765975952, "learning_rate": 1.986271668507526e-05, "loss": 1.2602, "step": 2365 }, { "epoch": 0.8808739863410547, "grad_norm": 0.1771697849035263, "learning_rate": 1.9862515926031444e-05, "loss": 1.2413, "step": 2366 }, { "epoch": 0.8812462914916639, "grad_norm": 0.17161764204502106, "learning_rate": 1.9862315021318886e-05, "loss": 1.2514, "step": 2367 }, { "epoch": 0.8816185966422729, "grad_norm": 0.18902365863323212, "learning_rate": 1.986211397094056e-05, "loss": 1.2348, "step": 2368 }, { "epoch": 0.881990901792882, "grad_norm": 0.17638647556304932, "learning_rate": 1.9861912774899425e-05, "loss": 1.2421, "step": 2369 }, { "epoch": 0.882363206943491, "grad_norm": 0.17760038375854492, "learning_rate": 1.9861711433198457e-05, "loss": 1.2574, "step": 2370 }, { "epoch": 0.8827355120941002, "grad_norm": 0.17845885455608368, "learning_rate": 1.9861509945840632e-05, "loss": 1.252, "step": 2371 }, { "epoch": 0.8831078172447092, "grad_norm": 0.17632153630256653, "learning_rate": 1.9861308312828923e-05, "loss": 1.2648, "step": 2372 }, { "epoch": 0.8834801223953183, "grad_norm": 0.17382317781448364, "learning_rate": 1.9861106534166307e-05, "loss": 1.2574, "step": 2373 }, { "epoch": 0.8838524275459273, "grad_norm": 0.18132738769054413, "learning_rate": 1.986090460985577e-05, "loss": 1.2546, "step": 2374 }, { "epoch": 0.8842247326965365, "grad_norm": 0.17880843579769135, "learning_rate": 1.9860702539900288e-05, "loss": 1.2542, "step": 2375 }, { "epoch": 0.8845970378471455, "grad_norm": 0.17678992450237274, "learning_rate": 1.9860500324302848e-05, "loss": 1.258, "step": 2376 }, { "epoch": 0.8849693429977545, "grad_norm": 0.17159947752952576, "learning_rate": 1.986029796306644e-05, "loss": 1.2545, "step": 2377 }, { "epoch": 0.8853416481483636, "grad_norm": 0.17294186353683472, "learning_rate": 1.9860095456194045e-05, "loss": 1.2572, "step": 2378 }, { "epoch": 0.8857139532989726, "grad_norm": 0.17959366738796234, "learning_rate": 1.9859892803688666e-05, "loss": 1.2406, "step": 2379 }, { "epoch": 0.8860862584495818, "grad_norm": 0.1762249618768692, "learning_rate": 1.9859690005553282e-05, "loss": 1.2681, "step": 2380 }, { "epoch": 0.8864585636001908, "grad_norm": 0.17363481223583221, "learning_rate": 1.98594870617909e-05, "loss": 1.2502, "step": 2381 }, { "epoch": 0.8868308687507999, "grad_norm": 0.18337921798229218, "learning_rate": 1.985928397240451e-05, "loss": 1.2509, "step": 2382 }, { "epoch": 0.8872031739014089, "grad_norm": 0.17734134197235107, "learning_rate": 1.985908073739712e-05, "loss": 1.2543, "step": 2383 }, { "epoch": 0.8875754790520181, "grad_norm": 0.1737191379070282, "learning_rate": 1.9858877356771722e-05, "loss": 1.2543, "step": 2384 }, { "epoch": 0.8879477842026271, "grad_norm": 0.18461737036705017, "learning_rate": 1.9858673830531322e-05, "loss": 1.2598, "step": 2385 }, { "epoch": 0.8883200893532361, "grad_norm": 0.1724756807088852, "learning_rate": 1.9858470158678932e-05, "loss": 1.2436, "step": 2386 }, { "epoch": 0.8886923945038452, "grad_norm": 0.16471019387245178, "learning_rate": 1.9858266341217556e-05, "loss": 1.2428, "step": 2387 }, { "epoch": 0.8890646996544542, "grad_norm": 0.1689532995223999, "learning_rate": 1.9858062378150204e-05, "loss": 1.2496, "step": 2388 }, { "epoch": 0.8894370048050634, "grad_norm": 0.17168131470680237, "learning_rate": 1.9857858269479887e-05, "loss": 1.2495, "step": 2389 }, { "epoch": 0.8898093099556724, "grad_norm": 0.18134863674640656, "learning_rate": 1.9857654015209627e-05, "loss": 1.24, "step": 2390 }, { "epoch": 0.8901816151062815, "grad_norm": 0.16769009828567505, "learning_rate": 1.985744961534243e-05, "loss": 1.244, "step": 2391 }, { "epoch": 0.8905539202568905, "grad_norm": 0.16388821601867676, "learning_rate": 1.9857245069881326e-05, "loss": 1.2531, "step": 2392 }, { "epoch": 0.8909262254074997, "grad_norm": 0.17608126997947693, "learning_rate": 1.985704037882933e-05, "loss": 1.2432, "step": 2393 }, { "epoch": 0.8912985305581087, "grad_norm": 0.17434832453727722, "learning_rate": 1.9856835542189464e-05, "loss": 1.2555, "step": 2394 }, { "epoch": 0.8916708357087177, "grad_norm": 0.16499166190624237, "learning_rate": 1.9856630559964758e-05, "loss": 1.2392, "step": 2395 }, { "epoch": 0.8920431408593268, "grad_norm": 0.18076403439044952, "learning_rate": 1.9856425432158236e-05, "loss": 1.2435, "step": 2396 }, { "epoch": 0.8924154460099359, "grad_norm": 0.17794503271579742, "learning_rate": 1.9856220158772927e-05, "loss": 1.2558, "step": 2397 }, { "epoch": 0.892787751160545, "grad_norm": 0.17728637158870697, "learning_rate": 1.9856014739811867e-05, "loss": 1.2692, "step": 2398 }, { "epoch": 0.893160056311154, "grad_norm": 0.17668506503105164, "learning_rate": 1.9855809175278088e-05, "loss": 1.2552, "step": 2399 }, { "epoch": 0.8935323614617631, "grad_norm": 0.1653260886669159, "learning_rate": 1.9855603465174623e-05, "loss": 1.2485, "step": 2400 }, { "epoch": 0.8939046666123721, "grad_norm": 0.17724184691905975, "learning_rate": 1.9855397609504517e-05, "loss": 1.2513, "step": 2401 }, { "epoch": 0.8942769717629813, "grad_norm": 0.1668618619441986, "learning_rate": 1.9855191608270807e-05, "loss": 1.2387, "step": 2402 }, { "epoch": 0.8946492769135903, "grad_norm": 0.16532456874847412, "learning_rate": 1.9854985461476534e-05, "loss": 1.2431, "step": 2403 }, { "epoch": 0.8950215820641994, "grad_norm": 0.17286598682403564, "learning_rate": 1.9854779169124745e-05, "loss": 1.2322, "step": 2404 }, { "epoch": 0.8953938872148084, "grad_norm": 0.17562846839427948, "learning_rate": 1.9854572731218483e-05, "loss": 1.2538, "step": 2405 }, { "epoch": 0.8957661923654175, "grad_norm": 0.1721685379743576, "learning_rate": 1.9854366147760803e-05, "loss": 1.2421, "step": 2406 }, { "epoch": 0.8961384975160266, "grad_norm": 0.16794408857822418, "learning_rate": 1.9854159418754754e-05, "loss": 1.2622, "step": 2407 }, { "epoch": 0.8965108026666356, "grad_norm": 0.17553912103176117, "learning_rate": 1.9853952544203387e-05, "loss": 1.2526, "step": 2408 }, { "epoch": 0.8968831078172447, "grad_norm": 0.17011193931102753, "learning_rate": 1.985374552410976e-05, "loss": 1.2588, "step": 2409 }, { "epoch": 0.8972554129678538, "grad_norm": 0.17169992625713348, "learning_rate": 1.9853538358476933e-05, "loss": 1.2377, "step": 2410 }, { "epoch": 0.8976277181184629, "grad_norm": 0.1703435331583023, "learning_rate": 1.985333104730796e-05, "loss": 1.2366, "step": 2411 }, { "epoch": 0.8980000232690719, "grad_norm": 0.17589643597602844, "learning_rate": 1.9853123590605904e-05, "loss": 1.2371, "step": 2412 }, { "epoch": 0.898372328419681, "grad_norm": 0.18404798209667206, "learning_rate": 1.9852915988373834e-05, "loss": 1.2483, "step": 2413 }, { "epoch": 0.89874463357029, "grad_norm": 0.18587037920951843, "learning_rate": 1.9852708240614812e-05, "loss": 1.2575, "step": 2414 }, { "epoch": 0.8991169387208992, "grad_norm": 0.16864776611328125, "learning_rate": 1.9852500347331908e-05, "loss": 1.2358, "step": 2415 }, { "epoch": 0.8994892438715082, "grad_norm": 0.17366687953472137, "learning_rate": 1.985229230852819e-05, "loss": 1.2397, "step": 2416 }, { "epoch": 0.8998615490221172, "grad_norm": 0.1771089732646942, "learning_rate": 1.9852084124206735e-05, "loss": 1.2628, "step": 2417 }, { "epoch": 0.9002338541727263, "grad_norm": 0.16770784556865692, "learning_rate": 1.9851875794370616e-05, "loss": 1.2384, "step": 2418 }, { "epoch": 0.9006061593233354, "grad_norm": 0.17862196266651154, "learning_rate": 1.985166731902291e-05, "loss": 1.2611, "step": 2419 }, { "epoch": 0.9009784644739445, "grad_norm": 0.17286571860313416, "learning_rate": 1.9851458698166693e-05, "loss": 1.2643, "step": 2420 }, { "epoch": 0.9013507696245535, "grad_norm": 0.17281019687652588, "learning_rate": 1.9851249931805053e-05, "loss": 1.2539, "step": 2421 }, { "epoch": 0.9017230747751626, "grad_norm": 0.1699199378490448, "learning_rate": 1.985104101994107e-05, "loss": 1.2567, "step": 2422 }, { "epoch": 0.9020953799257717, "grad_norm": 0.17170144617557526, "learning_rate": 1.9850831962577824e-05, "loss": 1.2658, "step": 2423 }, { "epoch": 0.9024676850763808, "grad_norm": 0.17239661514759064, "learning_rate": 1.9850622759718415e-05, "loss": 1.2399, "step": 2424 }, { "epoch": 0.9028399902269898, "grad_norm": 0.17295676469802856, "learning_rate": 1.9850413411365923e-05, "loss": 1.2494, "step": 2425 }, { "epoch": 0.9032122953775988, "grad_norm": 0.17475001513957977, "learning_rate": 1.985020391752344e-05, "loss": 1.2487, "step": 2426 }, { "epoch": 0.9035846005282079, "grad_norm": 0.16339634358882904, "learning_rate": 1.9849994278194068e-05, "loss": 1.2435, "step": 2427 }, { "epoch": 0.903956905678817, "grad_norm": 0.1701515167951584, "learning_rate": 1.9849784493380897e-05, "loss": 1.2499, "step": 2428 }, { "epoch": 0.9043292108294261, "grad_norm": 0.16852952539920807, "learning_rate": 1.9849574563087025e-05, "loss": 1.242, "step": 2429 }, { "epoch": 0.9047015159800351, "grad_norm": 0.1811753362417221, "learning_rate": 1.984936448731556e-05, "loss": 1.2504, "step": 2430 }, { "epoch": 0.9050738211306442, "grad_norm": 0.16984617710113525, "learning_rate": 1.9849154266069597e-05, "loss": 1.2338, "step": 2431 }, { "epoch": 0.9054461262812533, "grad_norm": 0.18032081425189972, "learning_rate": 1.984894389935224e-05, "loss": 1.2283, "step": 2432 }, { "epoch": 0.9058184314318624, "grad_norm": 0.1682289093732834, "learning_rate": 1.9848733387166606e-05, "loss": 1.2465, "step": 2433 }, { "epoch": 0.9061907365824714, "grad_norm": 0.1777002066373825, "learning_rate": 1.9848522729515794e-05, "loss": 1.2515, "step": 2434 }, { "epoch": 0.9065630417330804, "grad_norm": 0.17563197016716003, "learning_rate": 1.984831192640292e-05, "loss": 1.2523, "step": 2435 }, { "epoch": 0.9069353468836896, "grad_norm": 0.17422862350940704, "learning_rate": 1.9848100977831098e-05, "loss": 1.2584, "step": 2436 }, { "epoch": 0.9073076520342986, "grad_norm": 0.1725662648677826, "learning_rate": 1.984788988380344e-05, "loss": 1.2468, "step": 2437 }, { "epoch": 0.9076799571849077, "grad_norm": 0.16899681091308594, "learning_rate": 1.9847678644323068e-05, "loss": 1.2547, "step": 2438 }, { "epoch": 0.9080522623355167, "grad_norm": 0.1767120510339737, "learning_rate": 1.9847467259393102e-05, "loss": 1.2531, "step": 2439 }, { "epoch": 0.9084245674861258, "grad_norm": 0.1795816719532013, "learning_rate": 1.984725572901666e-05, "loss": 1.2519, "step": 2440 }, { "epoch": 0.9087968726367349, "grad_norm": 0.1791762113571167, "learning_rate": 1.984704405319687e-05, "loss": 1.2787, "step": 2441 }, { "epoch": 0.909169177787344, "grad_norm": 0.1724022626876831, "learning_rate": 1.9846832231936857e-05, "loss": 1.259, "step": 2442 }, { "epoch": 0.909541482937953, "grad_norm": 0.182962566614151, "learning_rate": 1.984662026523975e-05, "loss": 1.2568, "step": 2443 }, { "epoch": 0.909913788088562, "grad_norm": 0.17582541704177856, "learning_rate": 1.984640815310868e-05, "loss": 1.237, "step": 2444 }, { "epoch": 0.9102860932391712, "grad_norm": 0.17969557642936707, "learning_rate": 1.984619589554678e-05, "loss": 1.2662, "step": 2445 }, { "epoch": 0.9106583983897802, "grad_norm": 0.17635543644428253, "learning_rate": 1.9845983492557183e-05, "loss": 1.2597, "step": 2446 }, { "epoch": 0.9110307035403893, "grad_norm": 0.1814170926809311, "learning_rate": 1.984577094414303e-05, "loss": 1.243, "step": 2447 }, { "epoch": 0.9114030086909983, "grad_norm": 0.16915488243103027, "learning_rate": 1.984555825030746e-05, "loss": 1.2477, "step": 2448 }, { "epoch": 0.9117753138416075, "grad_norm": 0.1874271184206009, "learning_rate": 1.9845345411053608e-05, "loss": 1.2594, "step": 2449 }, { "epoch": 0.9121476189922165, "grad_norm": 0.17666561901569366, "learning_rate": 1.984513242638462e-05, "loss": 1.2704, "step": 2450 }, { "epoch": 0.9125199241428256, "grad_norm": 0.16944633424282074, "learning_rate": 1.9844919296303647e-05, "loss": 1.2457, "step": 2451 }, { "epoch": 0.9128922292934346, "grad_norm": 0.18913982808589935, "learning_rate": 1.9844706020813835e-05, "loss": 1.242, "step": 2452 }, { "epoch": 0.9132645344440437, "grad_norm": 0.17225903272628784, "learning_rate": 1.9844492599918333e-05, "loss": 1.2557, "step": 2453 }, { "epoch": 0.9136368395946528, "grad_norm": 0.17968986928462982, "learning_rate": 1.984427903362029e-05, "loss": 1.2457, "step": 2454 }, { "epoch": 0.9140091447452618, "grad_norm": 0.17983782291412354, "learning_rate": 1.9844065321922867e-05, "loss": 1.2449, "step": 2455 }, { "epoch": 0.9143814498958709, "grad_norm": 0.17764170467853546, "learning_rate": 1.9843851464829216e-05, "loss": 1.241, "step": 2456 }, { "epoch": 0.9147537550464799, "grad_norm": 0.17187097668647766, "learning_rate": 1.9843637462342498e-05, "loss": 1.2388, "step": 2457 }, { "epoch": 0.9151260601970891, "grad_norm": 0.18232551217079163, "learning_rate": 1.984342331446587e-05, "loss": 1.2553, "step": 2458 }, { "epoch": 0.9154983653476981, "grad_norm": 0.19238729774951935, "learning_rate": 1.9843209021202496e-05, "loss": 1.2357, "step": 2459 }, { "epoch": 0.9158706704983072, "grad_norm": 0.1838630735874176, "learning_rate": 1.9842994582555546e-05, "loss": 1.2509, "step": 2460 }, { "epoch": 0.9162429756489162, "grad_norm": 0.17731057107448578, "learning_rate": 1.984277999852818e-05, "loss": 1.2439, "step": 2461 }, { "epoch": 0.9166152807995253, "grad_norm": 0.1820240616798401, "learning_rate": 1.9842565269123577e-05, "loss": 1.2467, "step": 2462 }, { "epoch": 0.9169875859501344, "grad_norm": 0.1758500039577484, "learning_rate": 1.9842350394344898e-05, "loss": 1.2482, "step": 2463 }, { "epoch": 0.9173598911007435, "grad_norm": 0.18491673469543457, "learning_rate": 1.984213537419532e-05, "loss": 1.2456, "step": 2464 }, { "epoch": 0.9177321962513525, "grad_norm": 0.17699192464351654, "learning_rate": 1.9841920208678024e-05, "loss": 1.2429, "step": 2465 }, { "epoch": 0.9181045014019615, "grad_norm": 0.17381340265274048, "learning_rate": 1.9841704897796185e-05, "loss": 1.2492, "step": 2466 }, { "epoch": 0.9184768065525707, "grad_norm": 0.1726997047662735, "learning_rate": 1.984148944155298e-05, "loss": 1.2569, "step": 2467 }, { "epoch": 0.9188491117031797, "grad_norm": 0.18082614243030548, "learning_rate": 1.9841273839951595e-05, "loss": 1.2426, "step": 2468 }, { "epoch": 0.9192214168537888, "grad_norm": 0.17732475697994232, "learning_rate": 1.984105809299521e-05, "loss": 1.2449, "step": 2469 }, { "epoch": 0.9195937220043978, "grad_norm": 0.17159555852413177, "learning_rate": 1.9840842200687014e-05, "loss": 1.2588, "step": 2470 }, { "epoch": 0.919966027155007, "grad_norm": 0.17913275957107544, "learning_rate": 1.9840626163030202e-05, "loss": 1.2564, "step": 2471 }, { "epoch": 0.920338332305616, "grad_norm": 0.17932996153831482, "learning_rate": 1.9840409980027954e-05, "loss": 1.2485, "step": 2472 }, { "epoch": 0.9207106374562251, "grad_norm": 0.17403729259967804, "learning_rate": 1.984019365168347e-05, "loss": 1.2463, "step": 2473 }, { "epoch": 0.9210829426068341, "grad_norm": 0.18577389419078827, "learning_rate": 1.9839977177999942e-05, "loss": 1.2486, "step": 2474 }, { "epoch": 0.9214552477574431, "grad_norm": 0.1704840064048767, "learning_rate": 1.9839760558980572e-05, "loss": 1.2459, "step": 2475 }, { "epoch": 0.9218275529080523, "grad_norm": 0.18757209181785583, "learning_rate": 1.9839543794628553e-05, "loss": 1.2486, "step": 2476 }, { "epoch": 0.9221998580586613, "grad_norm": 0.18660812079906464, "learning_rate": 1.9839326884947093e-05, "loss": 1.2705, "step": 2477 }, { "epoch": 0.9225721632092704, "grad_norm": 0.16449618339538574, "learning_rate": 1.9839109829939388e-05, "loss": 1.24, "step": 2478 }, { "epoch": 0.9229444683598794, "grad_norm": 0.16529710590839386, "learning_rate": 1.9838892629608652e-05, "loss": 1.2578, "step": 2479 }, { "epoch": 0.9233167735104886, "grad_norm": 0.17828698456287384, "learning_rate": 1.9838675283958087e-05, "loss": 1.256, "step": 2480 }, { "epoch": 0.9236890786610976, "grad_norm": 0.17406156659126282, "learning_rate": 1.9838457792990902e-05, "loss": 1.2568, "step": 2481 }, { "epoch": 0.9240613838117067, "grad_norm": 0.1763119399547577, "learning_rate": 1.983824015671032e-05, "loss": 1.2539, "step": 2482 }, { "epoch": 0.9244336889623157, "grad_norm": 0.1707441657781601, "learning_rate": 1.9838022375119544e-05, "loss": 1.2391, "step": 2483 }, { "epoch": 0.9248059941129249, "grad_norm": 0.1721871942281723, "learning_rate": 1.9837804448221798e-05, "loss": 1.2559, "step": 2484 }, { "epoch": 0.9251782992635339, "grad_norm": 0.17778955399990082, "learning_rate": 1.9837586376020293e-05, "loss": 1.2563, "step": 2485 }, { "epoch": 0.9255506044141429, "grad_norm": 0.17989608645439148, "learning_rate": 1.983736815851826e-05, "loss": 1.2358, "step": 2486 }, { "epoch": 0.925922909564752, "grad_norm": 0.17223800718784332, "learning_rate": 1.9837149795718913e-05, "loss": 1.2619, "step": 2487 }, { "epoch": 0.926295214715361, "grad_norm": 0.17492277920246124, "learning_rate": 1.983693128762548e-05, "loss": 1.24, "step": 2488 }, { "epoch": 0.9266675198659702, "grad_norm": 0.17337973415851593, "learning_rate": 1.9836712634241194e-05, "loss": 1.2582, "step": 2489 }, { "epoch": 0.9270398250165792, "grad_norm": 0.16389819979667664, "learning_rate": 1.9836493835569278e-05, "loss": 1.2379, "step": 2490 }, { "epoch": 0.9274121301671883, "grad_norm": 0.17160442471504211, "learning_rate": 1.9836274891612963e-05, "loss": 1.2405, "step": 2491 }, { "epoch": 0.9277844353177973, "grad_norm": 0.1783936619758606, "learning_rate": 1.9836055802375488e-05, "loss": 1.2555, "step": 2492 }, { "epoch": 0.9281567404684065, "grad_norm": 0.17412415146827698, "learning_rate": 1.9835836567860082e-05, "loss": 1.2471, "step": 2493 }, { "epoch": 0.9285290456190155, "grad_norm": 0.175662100315094, "learning_rate": 1.983561718806999e-05, "loss": 1.2335, "step": 2494 }, { "epoch": 0.9289013507696245, "grad_norm": 0.1671876162290573, "learning_rate": 1.983539766300845e-05, "loss": 1.2543, "step": 2495 }, { "epoch": 0.9292736559202336, "grad_norm": 0.16467541456222534, "learning_rate": 1.9835177992678704e-05, "loss": 1.2454, "step": 2496 }, { "epoch": 0.9296459610708427, "grad_norm": 0.17586293816566467, "learning_rate": 1.9834958177083995e-05, "loss": 1.2568, "step": 2497 }, { "epoch": 0.9300182662214518, "grad_norm": 0.170370951294899, "learning_rate": 1.983473821622757e-05, "loss": 1.2479, "step": 2498 }, { "epoch": 0.9303905713720608, "grad_norm": 0.17328549921512604, "learning_rate": 1.983451811011268e-05, "loss": 1.2398, "step": 2499 }, { "epoch": 0.9307628765226699, "grad_norm": 0.18288347125053406, "learning_rate": 1.9834297858742574e-05, "loss": 1.2438, "step": 2500 }, { "epoch": 0.9307628765226699, "eval_loss": 1.3256843090057373, "eval_runtime": 16.4567, "eval_samples_per_second": 105.368, "eval_steps_per_second": 5.287, "step": 2500 }, { "epoch": 0.9311351816732789, "grad_norm": 0.17619635164737701, "learning_rate": 1.9834077462120506e-05, "loss": 1.2541, "step": 2501 }, { "epoch": 0.9315074868238881, "grad_norm": 0.1839938461780548, "learning_rate": 1.9833856920249733e-05, "loss": 1.2747, "step": 2502 }, { "epoch": 0.9318797919744971, "grad_norm": 0.18233221769332886, "learning_rate": 1.9833636233133507e-05, "loss": 1.2484, "step": 2503 }, { "epoch": 0.9322520971251061, "grad_norm": 0.1671859472990036, "learning_rate": 1.9833415400775092e-05, "loss": 1.2481, "step": 2504 }, { "epoch": 0.9326244022757152, "grad_norm": 0.17161886394023895, "learning_rate": 1.9833194423177754e-05, "loss": 1.2584, "step": 2505 }, { "epoch": 0.9329967074263243, "grad_norm": 0.18463842570781708, "learning_rate": 1.9832973300344745e-05, "loss": 1.257, "step": 2506 }, { "epoch": 0.9333690125769334, "grad_norm": 0.17637981474399567, "learning_rate": 1.983275203227934e-05, "loss": 1.2577, "step": 2507 }, { "epoch": 0.9337413177275424, "grad_norm": 0.17521126568317413, "learning_rate": 1.9832530618984802e-05, "loss": 1.2564, "step": 2508 }, { "epoch": 0.9341136228781515, "grad_norm": 0.17385391891002655, "learning_rate": 1.9832309060464408e-05, "loss": 1.245, "step": 2509 }, { "epoch": 0.9344859280287606, "grad_norm": 0.16838274896144867, "learning_rate": 1.9832087356721424e-05, "loss": 1.2586, "step": 2510 }, { "epoch": 0.9348582331793697, "grad_norm": 0.17618712782859802, "learning_rate": 1.9831865507759125e-05, "loss": 1.2478, "step": 2511 }, { "epoch": 0.9352305383299787, "grad_norm": 0.17693477869033813, "learning_rate": 1.983164351358079e-05, "loss": 1.2573, "step": 2512 }, { "epoch": 0.9356028434805878, "grad_norm": 0.17031066119670868, "learning_rate": 1.9831421374189702e-05, "loss": 1.2538, "step": 2513 }, { "epoch": 0.9359751486311968, "grad_norm": 0.1740039438009262, "learning_rate": 1.983119908958913e-05, "loss": 1.2496, "step": 2514 }, { "epoch": 0.936347453781806, "grad_norm": 0.18025629222393036, "learning_rate": 1.983097665978237e-05, "loss": 1.2503, "step": 2515 }, { "epoch": 0.936719758932415, "grad_norm": 0.179831400513649, "learning_rate": 1.98307540847727e-05, "loss": 1.2372, "step": 2516 }, { "epoch": 0.937092064083024, "grad_norm": 0.17471134662628174, "learning_rate": 1.983053136456341e-05, "loss": 1.2489, "step": 2517 }, { "epoch": 0.9374643692336331, "grad_norm": 0.17015781998634338, "learning_rate": 1.9830308499157787e-05, "loss": 1.2523, "step": 2518 }, { "epoch": 0.9378366743842422, "grad_norm": 0.17395399510860443, "learning_rate": 1.9830085488559128e-05, "loss": 1.2493, "step": 2519 }, { "epoch": 0.9382089795348513, "grad_norm": 0.1732756495475769, "learning_rate": 1.982986233277072e-05, "loss": 1.2487, "step": 2520 }, { "epoch": 0.9385812846854603, "grad_norm": 0.1738785058259964, "learning_rate": 1.9829639031795862e-05, "loss": 1.254, "step": 2521 }, { "epoch": 0.9389535898360694, "grad_norm": 0.17151622474193573, "learning_rate": 1.9829415585637853e-05, "loss": 1.2453, "step": 2522 }, { "epoch": 0.9393258949866785, "grad_norm": 0.16740648448467255, "learning_rate": 1.982919199429999e-05, "loss": 1.2497, "step": 2523 }, { "epoch": 0.9396982001372876, "grad_norm": 0.1763676106929779, "learning_rate": 1.9828968257785582e-05, "loss": 1.2484, "step": 2524 }, { "epoch": 0.9400705052878966, "grad_norm": 0.17714397609233856, "learning_rate": 1.982874437609793e-05, "loss": 1.2431, "step": 2525 }, { "epoch": 0.9404428104385056, "grad_norm": 0.16998568177223206, "learning_rate": 1.982852034924034e-05, "loss": 1.2378, "step": 2526 }, { "epoch": 0.9408151155891147, "grad_norm": 0.1771582067012787, "learning_rate": 1.9828296177216118e-05, "loss": 1.2533, "step": 2527 }, { "epoch": 0.9411874207397238, "grad_norm": 0.18139605224132538, "learning_rate": 1.9828071860028582e-05, "loss": 1.2489, "step": 2528 }, { "epoch": 0.9415597258903329, "grad_norm": 0.17635215818881989, "learning_rate": 1.982784739768104e-05, "loss": 1.2437, "step": 2529 }, { "epoch": 0.9419320310409419, "grad_norm": 0.17219685018062592, "learning_rate": 1.982762279017681e-05, "loss": 1.2348, "step": 2530 }, { "epoch": 0.942304336191551, "grad_norm": 0.18018695712089539, "learning_rate": 1.9827398037519203e-05, "loss": 1.2448, "step": 2531 }, { "epoch": 0.9426766413421601, "grad_norm": 0.17004595696926117, "learning_rate": 1.9827173139711547e-05, "loss": 1.2489, "step": 2532 }, { "epoch": 0.9430489464927692, "grad_norm": 0.17758417129516602, "learning_rate": 1.9826948096757163e-05, "loss": 1.2598, "step": 2533 }, { "epoch": 0.9434212516433782, "grad_norm": 0.18373461067676544, "learning_rate": 1.982672290865937e-05, "loss": 1.2601, "step": 2534 }, { "epoch": 0.9437935567939872, "grad_norm": 0.17354753613471985, "learning_rate": 1.9826497575421498e-05, "loss": 1.2353, "step": 2535 }, { "epoch": 0.9441658619445963, "grad_norm": 0.179754376411438, "learning_rate": 1.982627209704687e-05, "loss": 1.2466, "step": 2536 }, { "epoch": 0.9445381670952054, "grad_norm": 0.16391302645206451, "learning_rate": 1.9826046473538823e-05, "loss": 1.2386, "step": 2537 }, { "epoch": 0.9449104722458145, "grad_norm": 0.1725778728723526, "learning_rate": 1.9825820704900684e-05, "loss": 1.2573, "step": 2538 }, { "epoch": 0.9452827773964235, "grad_norm": 0.1723441183567047, "learning_rate": 1.9825594791135792e-05, "loss": 1.2355, "step": 2539 }, { "epoch": 0.9456550825470326, "grad_norm": 0.16522756218910217, "learning_rate": 1.982536873224748e-05, "loss": 1.2386, "step": 2540 }, { "epoch": 0.9460273876976417, "grad_norm": 0.1779457926750183, "learning_rate": 1.982514252823909e-05, "loss": 1.2522, "step": 2541 }, { "epoch": 0.9463996928482508, "grad_norm": 0.17980381846427917, "learning_rate": 1.982491617911396e-05, "loss": 1.2639, "step": 2542 }, { "epoch": 0.9467719979988598, "grad_norm": 0.17201921343803406, "learning_rate": 1.9824689684875435e-05, "loss": 1.2428, "step": 2543 }, { "epoch": 0.9471443031494688, "grad_norm": 0.16897398233413696, "learning_rate": 1.9824463045526857e-05, "loss": 1.2488, "step": 2544 }, { "epoch": 0.947516608300078, "grad_norm": 0.1801498979330063, "learning_rate": 1.982423626107158e-05, "loss": 1.2434, "step": 2545 }, { "epoch": 0.947888913450687, "grad_norm": 0.1762707233428955, "learning_rate": 1.982400933151295e-05, "loss": 1.2467, "step": 2546 }, { "epoch": 0.9482612186012961, "grad_norm": 0.1682814210653305, "learning_rate": 1.982378225685432e-05, "loss": 1.2472, "step": 2547 }, { "epoch": 0.9486335237519051, "grad_norm": 0.17276160418987274, "learning_rate": 1.982355503709904e-05, "loss": 1.2439, "step": 2548 }, { "epoch": 0.9490058289025142, "grad_norm": 0.16808001697063446, "learning_rate": 1.982332767225047e-05, "loss": 1.245, "step": 2549 }, { "epoch": 0.9493781340531233, "grad_norm": 0.17079442739486694, "learning_rate": 1.9823100162311967e-05, "loss": 1.2439, "step": 2550 }, { "epoch": 0.9497504392037324, "grad_norm": 0.17456983029842377, "learning_rate": 1.982287250728689e-05, "loss": 1.2551, "step": 2551 }, { "epoch": 0.9501227443543414, "grad_norm": 0.1688481867313385, "learning_rate": 1.98226447071786e-05, "loss": 1.2487, "step": 2552 }, { "epoch": 0.9504950495049505, "grad_norm": 0.1739954799413681, "learning_rate": 1.982241676199047e-05, "loss": 1.2556, "step": 2553 }, { "epoch": 0.9508673546555596, "grad_norm": 0.1756342649459839, "learning_rate": 1.9822188671725854e-05, "loss": 1.2437, "step": 2554 }, { "epoch": 0.9512396598061686, "grad_norm": 0.17892800271511078, "learning_rate": 1.9821960436388134e-05, "loss": 1.2401, "step": 2555 }, { "epoch": 0.9516119649567777, "grad_norm": 0.17654842138290405, "learning_rate": 1.9821732055980673e-05, "loss": 1.2459, "step": 2556 }, { "epoch": 0.9519842701073867, "grad_norm": 0.18467935919761658, "learning_rate": 1.9821503530506843e-05, "loss": 1.2376, "step": 2557 }, { "epoch": 0.9523565752579959, "grad_norm": 0.1839521825313568, "learning_rate": 1.9821274859970025e-05, "loss": 1.2529, "step": 2558 }, { "epoch": 0.9527288804086049, "grad_norm": 0.17149806022644043, "learning_rate": 1.982104604437359e-05, "loss": 1.2576, "step": 2559 }, { "epoch": 0.953101185559214, "grad_norm": 0.17172785103321075, "learning_rate": 1.9820817083720928e-05, "loss": 1.2464, "step": 2560 }, { "epoch": 0.953473490709823, "grad_norm": 0.17375504970550537, "learning_rate": 1.9820587978015407e-05, "loss": 1.2578, "step": 2561 }, { "epoch": 0.953845795860432, "grad_norm": 0.17615382373332977, "learning_rate": 1.982035872726042e-05, "loss": 1.2555, "step": 2562 }, { "epoch": 0.9542181010110412, "grad_norm": 0.18444810807704926, "learning_rate": 1.9820129331459354e-05, "loss": 1.2337, "step": 2563 }, { "epoch": 0.9545904061616503, "grad_norm": 0.1793602854013443, "learning_rate": 1.981989979061559e-05, "loss": 1.2504, "step": 2564 }, { "epoch": 0.9549627113122593, "grad_norm": 0.16998952627182007, "learning_rate": 1.9819670104732528e-05, "loss": 1.2424, "step": 2565 }, { "epoch": 0.9553350164628683, "grad_norm": 0.16896101832389832, "learning_rate": 1.981944027381355e-05, "loss": 1.239, "step": 2566 }, { "epoch": 0.9557073216134775, "grad_norm": 0.16461212933063507, "learning_rate": 1.9819210297862055e-05, "loss": 1.245, "step": 2567 }, { "epoch": 0.9560796267640865, "grad_norm": 0.1697378307580948, "learning_rate": 1.981898017688144e-05, "loss": 1.2506, "step": 2568 }, { "epoch": 0.9564519319146956, "grad_norm": 0.17256854474544525, "learning_rate": 1.981874991087511e-05, "loss": 1.2113, "step": 2569 }, { "epoch": 0.9568242370653046, "grad_norm": 0.1716059297323227, "learning_rate": 1.9818519499846457e-05, "loss": 1.2552, "step": 2570 }, { "epoch": 0.9571965422159138, "grad_norm": 0.16681870818138123, "learning_rate": 1.9818288943798882e-05, "loss": 1.2412, "step": 2571 }, { "epoch": 0.9575688473665228, "grad_norm": 0.1805577576160431, "learning_rate": 1.98180582427358e-05, "loss": 1.2393, "step": 2572 }, { "epoch": 0.9579411525171319, "grad_norm": 0.16774867475032806, "learning_rate": 1.9817827396660615e-05, "loss": 1.2327, "step": 2573 }, { "epoch": 0.9583134576677409, "grad_norm": 0.1805572658777237, "learning_rate": 1.9817596405576733e-05, "loss": 1.2445, "step": 2574 }, { "epoch": 0.9586857628183499, "grad_norm": 0.16827546060085297, "learning_rate": 1.981736526948757e-05, "loss": 1.2472, "step": 2575 }, { "epoch": 0.9590580679689591, "grad_norm": 0.184850811958313, "learning_rate": 1.9817133988396536e-05, "loss": 1.2344, "step": 2576 }, { "epoch": 0.9594303731195681, "grad_norm": 0.17866064608097076, "learning_rate": 1.981690256230705e-05, "loss": 1.247, "step": 2577 }, { "epoch": 0.9598026782701772, "grad_norm": 0.17331889271736145, "learning_rate": 1.981667099122253e-05, "loss": 1.2627, "step": 2578 }, { "epoch": 0.9601749834207862, "grad_norm": 0.17190296947956085, "learning_rate": 1.9816439275146394e-05, "loss": 1.2344, "step": 2579 }, { "epoch": 0.9605472885713954, "grad_norm": 0.1726665496826172, "learning_rate": 1.981620741408207e-05, "loss": 1.247, "step": 2580 }, { "epoch": 0.9609195937220044, "grad_norm": 0.1653529554605484, "learning_rate": 1.9815975408032972e-05, "loss": 1.2472, "step": 2581 }, { "epoch": 0.9612918988726135, "grad_norm": 0.18165260553359985, "learning_rate": 1.9815743257002537e-05, "loss": 1.2457, "step": 2582 }, { "epoch": 0.9616642040232225, "grad_norm": 0.18499329686164856, "learning_rate": 1.981551096099419e-05, "loss": 1.251, "step": 2583 }, { "epoch": 0.9620365091738317, "grad_norm": 0.17018747329711914, "learning_rate": 1.9815278520011364e-05, "loss": 1.2342, "step": 2584 }, { "epoch": 0.9624088143244407, "grad_norm": 0.17789021134376526, "learning_rate": 1.9815045934057487e-05, "loss": 1.2406, "step": 2585 }, { "epoch": 0.9627811194750497, "grad_norm": 0.1729610115289688, "learning_rate": 1.9814813203135998e-05, "loss": 1.2466, "step": 2586 }, { "epoch": 0.9631534246256588, "grad_norm": 0.17735299468040466, "learning_rate": 1.9814580327250336e-05, "loss": 1.2406, "step": 2587 }, { "epoch": 0.9635257297762678, "grad_norm": 0.1712779998779297, "learning_rate": 1.9814347306403936e-05, "loss": 1.2408, "step": 2588 }, { "epoch": 0.963898034926877, "grad_norm": 0.18011993169784546, "learning_rate": 1.981411414060024e-05, "loss": 1.2578, "step": 2589 }, { "epoch": 0.964270340077486, "grad_norm": 0.17510679364204407, "learning_rate": 1.9813880829842704e-05, "loss": 1.2324, "step": 2590 }, { "epoch": 0.9646426452280951, "grad_norm": 0.17391736805438995, "learning_rate": 1.9813647374134756e-05, "loss": 1.2433, "step": 2591 }, { "epoch": 0.9650149503787041, "grad_norm": 0.17409949004650116, "learning_rate": 1.9813413773479853e-05, "loss": 1.2542, "step": 2592 }, { "epoch": 0.9653872555293133, "grad_norm": 0.17249634861946106, "learning_rate": 1.9813180027881445e-05, "loss": 1.231, "step": 2593 }, { "epoch": 0.9657595606799223, "grad_norm": 0.18046285212039948, "learning_rate": 1.9812946137342984e-05, "loss": 1.2408, "step": 2594 }, { "epoch": 0.9661318658305313, "grad_norm": 0.16898088157176971, "learning_rate": 1.9812712101867923e-05, "loss": 1.2426, "step": 2595 }, { "epoch": 0.9665041709811404, "grad_norm": 0.17144586145877838, "learning_rate": 1.9812477921459724e-05, "loss": 1.2358, "step": 2596 }, { "epoch": 0.9668764761317495, "grad_norm": 0.17590296268463135, "learning_rate": 1.981224359612184e-05, "loss": 1.231, "step": 2597 }, { "epoch": 0.9672487812823586, "grad_norm": 0.1714339703321457, "learning_rate": 1.981200912585773e-05, "loss": 1.2307, "step": 2598 }, { "epoch": 0.9676210864329676, "grad_norm": 0.17280226945877075, "learning_rate": 1.9811774510670866e-05, "loss": 1.2432, "step": 2599 }, { "epoch": 0.9679933915835767, "grad_norm": 0.19417522847652435, "learning_rate": 1.9811539750564702e-05, "loss": 1.2329, "step": 2600 }, { "epoch": 0.9683656967341857, "grad_norm": 0.18655581772327423, "learning_rate": 1.9811304845542717e-05, "loss": 1.2364, "step": 2601 }, { "epoch": 0.9687380018847949, "grad_norm": 0.1713135540485382, "learning_rate": 1.9811069795608377e-05, "loss": 1.266, "step": 2602 }, { "epoch": 0.9691103070354039, "grad_norm": 0.18093515932559967, "learning_rate": 1.9810834600765148e-05, "loss": 1.2466, "step": 2603 }, { "epoch": 0.969482612186013, "grad_norm": 0.17053551971912384, "learning_rate": 1.9810599261016506e-05, "loss": 1.2532, "step": 2604 }, { "epoch": 0.969854917336622, "grad_norm": 0.1664905697107315, "learning_rate": 1.9810363776365932e-05, "loss": 1.232, "step": 2605 }, { "epoch": 0.9702272224872311, "grad_norm": 0.1823788583278656, "learning_rate": 1.98101281468169e-05, "loss": 1.255, "step": 2606 }, { "epoch": 0.9705995276378402, "grad_norm": 0.17830835282802582, "learning_rate": 1.980989237237289e-05, "loss": 1.2522, "step": 2607 }, { "epoch": 0.9709718327884492, "grad_norm": 0.19234777987003326, "learning_rate": 1.980965645303739e-05, "loss": 1.2574, "step": 2608 }, { "epoch": 0.9713441379390583, "grad_norm": 0.18148033320903778, "learning_rate": 1.9809420388813874e-05, "loss": 1.2511, "step": 2609 }, { "epoch": 0.9717164430896673, "grad_norm": 0.1736968457698822, "learning_rate": 1.9809184179705835e-05, "loss": 1.2458, "step": 2610 }, { "epoch": 0.9720887482402765, "grad_norm": 0.17815859615802765, "learning_rate": 1.9808947825716768e-05, "loss": 1.2548, "step": 2611 }, { "epoch": 0.9724610533908855, "grad_norm": 0.1661943793296814, "learning_rate": 1.980871132685015e-05, "loss": 1.2295, "step": 2612 }, { "epoch": 0.9728333585414946, "grad_norm": 0.16622133553028107, "learning_rate": 1.980847468310948e-05, "loss": 1.2403, "step": 2613 }, { "epoch": 0.9732056636921036, "grad_norm": 0.17739859223365784, "learning_rate": 1.980823789449826e-05, "loss": 1.2407, "step": 2614 }, { "epoch": 0.9735779688427127, "grad_norm": 0.17749415338039398, "learning_rate": 1.980800096101998e-05, "loss": 1.236, "step": 2615 }, { "epoch": 0.9739502739933218, "grad_norm": 0.18621912598609924, "learning_rate": 1.9807763882678143e-05, "loss": 1.2393, "step": 2616 }, { "epoch": 0.9743225791439308, "grad_norm": 0.1727140098810196, "learning_rate": 1.9807526659476245e-05, "loss": 1.2331, "step": 2617 }, { "epoch": 0.9746948842945399, "grad_norm": 0.17797011137008667, "learning_rate": 1.9807289291417795e-05, "loss": 1.2603, "step": 2618 }, { "epoch": 0.975067189445149, "grad_norm": 0.18820150196552277, "learning_rate": 1.98070517785063e-05, "loss": 1.2385, "step": 2619 }, { "epoch": 0.9754394945957581, "grad_norm": 0.17448899149894714, "learning_rate": 1.9806814120745265e-05, "loss": 1.2361, "step": 2620 }, { "epoch": 0.9758117997463671, "grad_norm": 0.1788555085659027, "learning_rate": 1.9806576318138194e-05, "loss": 1.2421, "step": 2621 }, { "epoch": 0.9761841048969762, "grad_norm": 0.1790677309036255, "learning_rate": 1.9806338370688615e-05, "loss": 1.2379, "step": 2622 }, { "epoch": 0.9765564100475852, "grad_norm": 0.17613591253757477, "learning_rate": 1.980610027840003e-05, "loss": 1.2557, "step": 2623 }, { "epoch": 0.9769287151981944, "grad_norm": 0.19133977591991425, "learning_rate": 1.9805862041275962e-05, "loss": 1.2616, "step": 2624 }, { "epoch": 0.9773010203488034, "grad_norm": 0.18040066957473755, "learning_rate": 1.9805623659319924e-05, "loss": 1.2437, "step": 2625 }, { "epoch": 0.9776733254994124, "grad_norm": 0.17554713785648346, "learning_rate": 1.980538513253544e-05, "loss": 1.2483, "step": 2626 }, { "epoch": 0.9780456306500215, "grad_norm": 0.17956022918224335, "learning_rate": 1.9805146460926033e-05, "loss": 1.2492, "step": 2627 }, { "epoch": 0.9784179358006306, "grad_norm": 0.18192705512046814, "learning_rate": 1.980490764449523e-05, "loss": 1.2439, "step": 2628 }, { "epoch": 0.9787902409512397, "grad_norm": 0.1905398815870285, "learning_rate": 1.9804668683246556e-05, "loss": 1.2384, "step": 2629 }, { "epoch": 0.9791625461018487, "grad_norm": 0.17728392779827118, "learning_rate": 1.980442957718354e-05, "loss": 1.2492, "step": 2630 }, { "epoch": 0.9795348512524578, "grad_norm": 0.17330513894557953, "learning_rate": 1.9804190326309714e-05, "loss": 1.2312, "step": 2631 }, { "epoch": 0.9799071564030669, "grad_norm": 0.17659252882003784, "learning_rate": 1.9803950930628616e-05, "loss": 1.2429, "step": 2632 }, { "epoch": 0.980279461553676, "grad_norm": 0.18615534901618958, "learning_rate": 1.9803711390143774e-05, "loss": 1.2442, "step": 2633 }, { "epoch": 0.980651766704285, "grad_norm": 0.18511880934238434, "learning_rate": 1.9803471704858733e-05, "loss": 1.2489, "step": 2634 }, { "epoch": 0.981024071854894, "grad_norm": 0.1814030408859253, "learning_rate": 1.9803231874777025e-05, "loss": 1.2361, "step": 2635 }, { "epoch": 0.9813963770055031, "grad_norm": 0.1738506257534027, "learning_rate": 1.98029918999022e-05, "loss": 1.2462, "step": 2636 }, { "epoch": 0.9817686821561122, "grad_norm": 0.1846051663160324, "learning_rate": 1.98027517802378e-05, "loss": 1.2436, "step": 2637 }, { "epoch": 0.9821409873067213, "grad_norm": 0.19149626791477203, "learning_rate": 1.9802511515787373e-05, "loss": 1.2403, "step": 2638 }, { "epoch": 0.9825132924573303, "grad_norm": 0.17990732192993164, "learning_rate": 1.9802271106554464e-05, "loss": 1.2479, "step": 2639 }, { "epoch": 0.9828855976079394, "grad_norm": 0.18959732353687286, "learning_rate": 1.9802030552542627e-05, "loss": 1.238, "step": 2640 }, { "epoch": 0.9832579027585485, "grad_norm": 0.1863657683134079, "learning_rate": 1.9801789853755415e-05, "loss": 1.2615, "step": 2641 }, { "epoch": 0.9836302079091576, "grad_norm": 0.16739057004451752, "learning_rate": 1.980154901019638e-05, "loss": 1.2371, "step": 2642 }, { "epoch": 0.9840025130597666, "grad_norm": 0.1758878231048584, "learning_rate": 1.9801308021869084e-05, "loss": 1.2301, "step": 2643 }, { "epoch": 0.9843748182103756, "grad_norm": 0.18291743099689484, "learning_rate": 1.9801066888777082e-05, "loss": 1.2375, "step": 2644 }, { "epoch": 0.9847471233609848, "grad_norm": 0.18052415549755096, "learning_rate": 1.9800825610923937e-05, "loss": 1.2571, "step": 2645 }, { "epoch": 0.9851194285115938, "grad_norm": 0.17342287302017212, "learning_rate": 1.980058418831321e-05, "loss": 1.2584, "step": 2646 }, { "epoch": 0.9854917336622029, "grad_norm": 0.17909403145313263, "learning_rate": 1.9800342620948475e-05, "loss": 1.2307, "step": 2647 }, { "epoch": 0.9858640388128119, "grad_norm": 0.17488320171833038, "learning_rate": 1.980010090883329e-05, "loss": 1.2422, "step": 2648 }, { "epoch": 0.986236343963421, "grad_norm": 0.17006643116474152, "learning_rate": 1.9799859051971232e-05, "loss": 1.2405, "step": 2649 }, { "epoch": 0.9866086491140301, "grad_norm": 0.18193192780017853, "learning_rate": 1.979961705036587e-05, "loss": 1.2491, "step": 2650 }, { "epoch": 0.9869809542646392, "grad_norm": 0.18248356878757477, "learning_rate": 1.979937490402078e-05, "loss": 1.2499, "step": 2651 }, { "epoch": 0.9873532594152482, "grad_norm": 0.17710177600383759, "learning_rate": 1.9799132612939535e-05, "loss": 1.2346, "step": 2652 }, { "epoch": 0.9877255645658573, "grad_norm": 0.17549732327461243, "learning_rate": 1.979889017712572e-05, "loss": 1.2341, "step": 2653 }, { "epoch": 0.9880978697164664, "grad_norm": 0.17912618815898895, "learning_rate": 1.979864759658291e-05, "loss": 1.2562, "step": 2654 }, { "epoch": 0.9884701748670754, "grad_norm": 0.17686498165130615, "learning_rate": 1.979840487131469e-05, "loss": 1.2422, "step": 2655 }, { "epoch": 0.9888424800176845, "grad_norm": 0.1823192685842514, "learning_rate": 1.9798162001324647e-05, "loss": 1.2304, "step": 2656 }, { "epoch": 0.9892147851682935, "grad_norm": 0.17622555792331696, "learning_rate": 1.9797918986616362e-05, "loss": 1.2241, "step": 2657 }, { "epoch": 0.9895870903189027, "grad_norm": 0.1805817037820816, "learning_rate": 1.979767582719343e-05, "loss": 1.241, "step": 2658 }, { "epoch": 0.9899593954695117, "grad_norm": 0.1678328961133957, "learning_rate": 1.9797432523059442e-05, "loss": 1.2248, "step": 2659 }, { "epoch": 0.9903317006201208, "grad_norm": 0.17134131491184235, "learning_rate": 1.9797189074217993e-05, "loss": 1.2415, "step": 2660 }, { "epoch": 0.9907040057707298, "grad_norm": 0.1789311319589615, "learning_rate": 1.979694548067267e-05, "loss": 1.2358, "step": 2661 }, { "epoch": 0.9910763109213389, "grad_norm": 0.18833468854427338, "learning_rate": 1.979670174242708e-05, "loss": 1.2339, "step": 2662 }, { "epoch": 0.991448616071948, "grad_norm": 0.18006418645381927, "learning_rate": 1.9796457859484825e-05, "loss": 1.2322, "step": 2663 }, { "epoch": 0.991820921222557, "grad_norm": 0.1705029308795929, "learning_rate": 1.9796213831849496e-05, "loss": 1.2473, "step": 2664 }, { "epoch": 0.9921932263731661, "grad_norm": 0.16797584295272827, "learning_rate": 1.9795969659524705e-05, "loss": 1.2422, "step": 2665 }, { "epoch": 0.9925655315237751, "grad_norm": 0.19237132370471954, "learning_rate": 1.9795725342514055e-05, "loss": 1.2567, "step": 2666 }, { "epoch": 0.9929378366743843, "grad_norm": 0.1747465878725052, "learning_rate": 1.9795480880821162e-05, "loss": 1.2496, "step": 2667 }, { "epoch": 0.9933101418249933, "grad_norm": 0.17807847261428833, "learning_rate": 1.9795236274449627e-05, "loss": 1.2463, "step": 2668 }, { "epoch": 0.9936824469756024, "grad_norm": 0.17453856766223907, "learning_rate": 1.979499152340307e-05, "loss": 1.2433, "step": 2669 }, { "epoch": 0.9940547521262114, "grad_norm": 0.1748945713043213, "learning_rate": 1.9794746627685097e-05, "loss": 1.2363, "step": 2670 }, { "epoch": 0.9944270572768205, "grad_norm": 0.1771155595779419, "learning_rate": 1.9794501587299338e-05, "loss": 1.236, "step": 2671 }, { "epoch": 0.9947993624274296, "grad_norm": 0.17215952277183533, "learning_rate": 1.9794256402249398e-05, "loss": 1.233, "step": 2672 }, { "epoch": 0.9951716675780387, "grad_norm": 0.1868884265422821, "learning_rate": 1.979401107253891e-05, "loss": 1.2446, "step": 2673 }, { "epoch": 0.9955439727286477, "grad_norm": 0.18049085140228271, "learning_rate": 1.9793765598171494e-05, "loss": 1.2319, "step": 2674 }, { "epoch": 0.9959162778792567, "grad_norm": 0.16808654367923737, "learning_rate": 1.9793519979150773e-05, "loss": 1.2382, "step": 2675 }, { "epoch": 0.9962885830298659, "grad_norm": 0.18061494827270508, "learning_rate": 1.9793274215480375e-05, "loss": 1.2547, "step": 2676 }, { "epoch": 0.9966608881804749, "grad_norm": 0.16883356869220734, "learning_rate": 1.9793028307163937e-05, "loss": 1.2517, "step": 2677 }, { "epoch": 0.997033193331084, "grad_norm": 0.1701226681470871, "learning_rate": 1.979278225420508e-05, "loss": 1.2401, "step": 2678 }, { "epoch": 0.997405498481693, "grad_norm": 0.17168228328227997, "learning_rate": 1.9792536056607448e-05, "loss": 1.2341, "step": 2679 }, { "epoch": 0.9977778036323022, "grad_norm": 0.1792803853750229, "learning_rate": 1.979228971437467e-05, "loss": 1.2353, "step": 2680 }, { "epoch": 0.9981501087829112, "grad_norm": 0.16425569355487823, "learning_rate": 1.9792043227510387e-05, "loss": 1.239, "step": 2681 }, { "epoch": 0.9985224139335203, "grad_norm": 0.17416253685951233, "learning_rate": 1.979179659601824e-05, "loss": 1.2426, "step": 2682 }, { "epoch": 0.9988947190841293, "grad_norm": 0.16776736080646515, "learning_rate": 1.9791549819901875e-05, "loss": 1.2377, "step": 2683 }, { "epoch": 0.9992670242347383, "grad_norm": 0.16794565320014954, "learning_rate": 1.9791302899164932e-05, "loss": 1.2406, "step": 2684 }, { "epoch": 0.9996393293853475, "grad_norm": 0.1647026389837265, "learning_rate": 1.9791055833811056e-05, "loss": 1.2427, "step": 2685 }, { "epoch": 1.0000116345359564, "grad_norm": 0.19015128910541534, "learning_rate": 1.9790808623843905e-05, "loss": 1.2504, "step": 2686 }, { "epoch": 1.0003839396865657, "grad_norm": 0.1643396019935608, "learning_rate": 1.9790561269267122e-05, "loss": 1.2387, "step": 2687 }, { "epoch": 1.0007562448371747, "grad_norm": 0.178327739238739, "learning_rate": 1.9790313770084363e-05, "loss": 1.2417, "step": 2688 }, { "epoch": 1.0011285499877838, "grad_norm": 0.18090787529945374, "learning_rate": 1.9790066126299286e-05, "loss": 1.2458, "step": 2689 }, { "epoch": 1.0015008551383928, "grad_norm": 0.16783635318279266, "learning_rate": 1.978981833791555e-05, "loss": 1.2481, "step": 2690 }, { "epoch": 1.0018731602890019, "grad_norm": 0.1700548678636551, "learning_rate": 1.9789570404936805e-05, "loss": 1.2429, "step": 2691 }, { "epoch": 1.002245465439611, "grad_norm": 0.1691717803478241, "learning_rate": 1.9789322327366722e-05, "loss": 1.245, "step": 2692 }, { "epoch": 1.00261777059022, "grad_norm": 0.16553466022014618, "learning_rate": 1.9789074105208962e-05, "loss": 1.2288, "step": 2693 }, { "epoch": 1.002990075740829, "grad_norm": 0.17240577936172485, "learning_rate": 1.9788825738467194e-05, "loss": 1.2467, "step": 2694 }, { "epoch": 1.0033623808914383, "grad_norm": 0.17340059578418732, "learning_rate": 1.9788577227145084e-05, "loss": 1.2392, "step": 2695 }, { "epoch": 1.0037346860420473, "grad_norm": 0.1701386272907257, "learning_rate": 1.97883285712463e-05, "loss": 1.215, "step": 2696 }, { "epoch": 1.0041069911926563, "grad_norm": 0.17179681360721588, "learning_rate": 1.9788079770774517e-05, "loss": 1.2226, "step": 2697 }, { "epoch": 1.0044792963432654, "grad_norm": 0.1747562438249588, "learning_rate": 1.9787830825733415e-05, "loss": 1.2348, "step": 2698 }, { "epoch": 1.0048516014938744, "grad_norm": 0.1735781580209732, "learning_rate": 1.9787581736126663e-05, "loss": 1.2438, "step": 2699 }, { "epoch": 1.0052239066444835, "grad_norm": 0.16927596926689148, "learning_rate": 1.9787332501957942e-05, "loss": 1.2388, "step": 2700 }, { "epoch": 1.0055962117950925, "grad_norm": 0.17900878190994263, "learning_rate": 1.9787083123230933e-05, "loss": 1.2479, "step": 2701 }, { "epoch": 1.0059685169457016, "grad_norm": 0.1668340116739273, "learning_rate": 1.9786833599949325e-05, "loss": 1.2278, "step": 2702 }, { "epoch": 1.0063408220963106, "grad_norm": 0.17158925533294678, "learning_rate": 1.9786583932116795e-05, "loss": 1.2364, "step": 2703 }, { "epoch": 1.0067131272469199, "grad_norm": 0.17500467598438263, "learning_rate": 1.9786334119737035e-05, "loss": 1.2199, "step": 2704 }, { "epoch": 1.007085432397529, "grad_norm": 0.1702401041984558, "learning_rate": 1.9786084162813735e-05, "loss": 1.252, "step": 2705 }, { "epoch": 1.007457737548138, "grad_norm": 0.1826189160346985, "learning_rate": 1.9785834061350585e-05, "loss": 1.2441, "step": 2706 }, { "epoch": 1.007830042698747, "grad_norm": 0.16807404160499573, "learning_rate": 1.9785583815351285e-05, "loss": 1.2245, "step": 2707 }, { "epoch": 1.008202347849356, "grad_norm": 0.18120057880878448, "learning_rate": 1.978533342481952e-05, "loss": 1.2484, "step": 2708 }, { "epoch": 1.008574652999965, "grad_norm": 0.16986186802387238, "learning_rate": 1.9785082889759e-05, "loss": 1.2615, "step": 2709 }, { "epoch": 1.0089469581505741, "grad_norm": 0.18247941136360168, "learning_rate": 1.9784832210173413e-05, "loss": 1.2235, "step": 2710 }, { "epoch": 1.0093192633011832, "grad_norm": 0.177805557847023, "learning_rate": 1.9784581386066472e-05, "loss": 1.2425, "step": 2711 }, { "epoch": 1.0096915684517922, "grad_norm": 0.1815112829208374, "learning_rate": 1.978433041744188e-05, "loss": 1.2465, "step": 2712 }, { "epoch": 1.0100638736024015, "grad_norm": 0.189849853515625, "learning_rate": 1.9784079304303337e-05, "loss": 1.2374, "step": 2713 }, { "epoch": 1.0104361787530105, "grad_norm": 0.16993628442287445, "learning_rate": 1.978382804665456e-05, "loss": 1.2462, "step": 2714 }, { "epoch": 1.0108084839036195, "grad_norm": 0.1785373091697693, "learning_rate": 1.9783576644499257e-05, "loss": 1.2423, "step": 2715 }, { "epoch": 1.0111807890542286, "grad_norm": 0.19358791410923004, "learning_rate": 1.978332509784114e-05, "loss": 1.2281, "step": 2716 }, { "epoch": 1.0115530942048376, "grad_norm": 0.16849210858345032, "learning_rate": 1.9783073406683926e-05, "loss": 1.2245, "step": 2717 }, { "epoch": 1.0119253993554467, "grad_norm": 0.1804894357919693, "learning_rate": 1.978282157103133e-05, "loss": 1.2395, "step": 2718 }, { "epoch": 1.0122977045060557, "grad_norm": 0.16764596104621887, "learning_rate": 1.9782569590887075e-05, "loss": 1.2343, "step": 2719 }, { "epoch": 1.0126700096566648, "grad_norm": 0.17664223909378052, "learning_rate": 1.978231746625488e-05, "loss": 1.2432, "step": 2720 }, { "epoch": 1.0130423148072738, "grad_norm": 0.17679232358932495, "learning_rate": 1.978206519713847e-05, "loss": 1.2355, "step": 2721 }, { "epoch": 1.013414619957883, "grad_norm": 0.17845486104488373, "learning_rate": 1.9781812783541574e-05, "loss": 1.2529, "step": 2722 }, { "epoch": 1.013786925108492, "grad_norm": 0.17717497050762177, "learning_rate": 1.9781560225467913e-05, "loss": 1.2383, "step": 2723 }, { "epoch": 1.0141592302591012, "grad_norm": 0.17647191882133484, "learning_rate": 1.9781307522921224e-05, "loss": 1.2315, "step": 2724 }, { "epoch": 1.0145315354097102, "grad_norm": 0.177278071641922, "learning_rate": 1.9781054675905235e-05, "loss": 1.2385, "step": 2725 }, { "epoch": 1.0149038405603192, "grad_norm": 0.18225128948688507, "learning_rate": 1.9780801684423684e-05, "loss": 1.2356, "step": 2726 }, { "epoch": 1.0152761457109283, "grad_norm": 0.1841052770614624, "learning_rate": 1.9780548548480304e-05, "loss": 1.2404, "step": 2727 }, { "epoch": 1.0156484508615373, "grad_norm": 0.1847550868988037, "learning_rate": 1.9780295268078834e-05, "loss": 1.2343, "step": 2728 }, { "epoch": 1.0160207560121464, "grad_norm": 0.18073725700378418, "learning_rate": 1.9780041843223023e-05, "loss": 1.2507, "step": 2729 }, { "epoch": 1.0163930611627556, "grad_norm": 0.17755703628063202, "learning_rate": 1.97797882739166e-05, "loss": 1.2337, "step": 2730 }, { "epoch": 1.0167653663133647, "grad_norm": 0.1812552660703659, "learning_rate": 1.9779534560163324e-05, "loss": 1.2212, "step": 2731 }, { "epoch": 1.0171376714639737, "grad_norm": 0.18241752684116364, "learning_rate": 1.9779280701966935e-05, "loss": 1.2501, "step": 2732 }, { "epoch": 1.0175099766145828, "grad_norm": 0.1742631196975708, "learning_rate": 1.9779026699331183e-05, "loss": 1.2476, "step": 2733 }, { "epoch": 1.0178822817651918, "grad_norm": 0.17878419160842896, "learning_rate": 1.9778772552259818e-05, "loss": 1.2375, "step": 2734 }, { "epoch": 1.0182545869158008, "grad_norm": 0.18265032768249512, "learning_rate": 1.9778518260756602e-05, "loss": 1.2443, "step": 2735 }, { "epoch": 1.0186268920664099, "grad_norm": 0.17354628443717957, "learning_rate": 1.977826382482528e-05, "loss": 1.2458, "step": 2736 }, { "epoch": 1.018999197217019, "grad_norm": 0.17573866248130798, "learning_rate": 1.9778009244469617e-05, "loss": 1.2469, "step": 2737 }, { "epoch": 1.019371502367628, "grad_norm": 0.18138283491134644, "learning_rate": 1.977775451969337e-05, "loss": 1.2296, "step": 2738 }, { "epoch": 1.0197438075182372, "grad_norm": 0.18564942479133606, "learning_rate": 1.9777499650500303e-05, "loss": 1.2492, "step": 2739 }, { "epoch": 1.0201161126688463, "grad_norm": 0.18133315443992615, "learning_rate": 1.977724463689418e-05, "loss": 1.2432, "step": 2740 }, { "epoch": 1.0204884178194553, "grad_norm": 0.18659467995166779, "learning_rate": 1.9776989478878764e-05, "loss": 1.258, "step": 2741 }, { "epoch": 1.0208607229700644, "grad_norm": 0.2271583080291748, "learning_rate": 1.9776734176457833e-05, "loss": 1.236, "step": 2742 }, { "epoch": 1.0212330281206734, "grad_norm": 0.1641337126493454, "learning_rate": 1.9776478729635146e-05, "loss": 1.2299, "step": 2743 }, { "epoch": 1.0216053332712824, "grad_norm": 0.18119634687900543, "learning_rate": 1.9776223138414486e-05, "loss": 1.2252, "step": 2744 }, { "epoch": 1.0219776384218915, "grad_norm": 0.17801545560359955, "learning_rate": 1.977596740279962e-05, "loss": 1.2272, "step": 2745 }, { "epoch": 1.0223499435725005, "grad_norm": 0.18631921708583832, "learning_rate": 1.9775711522794333e-05, "loss": 1.252, "step": 2746 }, { "epoch": 1.0227222487231096, "grad_norm": 0.18302686512470245, "learning_rate": 1.97754554984024e-05, "loss": 1.2308, "step": 2747 }, { "epoch": 1.0230945538737188, "grad_norm": 0.178566575050354, "learning_rate": 1.97751993296276e-05, "loss": 1.2492, "step": 2748 }, { "epoch": 1.0234668590243279, "grad_norm": 0.18399080634117126, "learning_rate": 1.977494301647372e-05, "loss": 1.2451, "step": 2749 }, { "epoch": 1.023839164174937, "grad_norm": 0.18315501511096954, "learning_rate": 1.9774686558944544e-05, "loss": 1.2505, "step": 2750 }, { "epoch": 1.024211469325546, "grad_norm": 0.184551402926445, "learning_rate": 1.9774429957043866e-05, "loss": 1.2294, "step": 2751 }, { "epoch": 1.024583774476155, "grad_norm": 0.45721930265426636, "learning_rate": 1.9774173210775466e-05, "loss": 1.2194, "step": 2752 }, { "epoch": 1.024956079626764, "grad_norm": 0.1807864010334015, "learning_rate": 1.9773916320143144e-05, "loss": 1.249, "step": 2753 }, { "epoch": 1.025328384777373, "grad_norm": 0.18623942136764526, "learning_rate": 1.977365928515069e-05, "loss": 1.2281, "step": 2754 }, { "epoch": 1.0257006899279821, "grad_norm": 0.18158674240112305, "learning_rate": 1.97734021058019e-05, "loss": 1.2333, "step": 2755 }, { "epoch": 1.0260729950785914, "grad_norm": 0.1756281554698944, "learning_rate": 1.9773144782100576e-05, "loss": 1.2503, "step": 2756 }, { "epoch": 1.0264453002292004, "grad_norm": 0.1774439960718155, "learning_rate": 1.9772887314050516e-05, "loss": 1.2371, "step": 2757 }, { "epoch": 1.0268176053798095, "grad_norm": 0.17435887455940247, "learning_rate": 1.9772629701655524e-05, "loss": 1.2379, "step": 2758 }, { "epoch": 1.0271899105304185, "grad_norm": 0.18114317953586578, "learning_rate": 1.9772371944919406e-05, "loss": 1.2389, "step": 2759 }, { "epoch": 1.0275622156810276, "grad_norm": 0.1751965880393982, "learning_rate": 1.9772114043845968e-05, "loss": 1.2537, "step": 2760 }, { "epoch": 1.0279345208316366, "grad_norm": 0.18120123445987701, "learning_rate": 1.977185599843902e-05, "loss": 1.2308, "step": 2761 }, { "epoch": 1.0283068259822457, "grad_norm": 0.18496285378932953, "learning_rate": 1.9771597808702366e-05, "loss": 1.2399, "step": 2762 }, { "epoch": 1.0286791311328547, "grad_norm": 0.17509673535823822, "learning_rate": 1.9771339474639833e-05, "loss": 1.2205, "step": 2763 }, { "epoch": 1.0290514362834637, "grad_norm": 0.17925499379634857, "learning_rate": 1.9771080996255226e-05, "loss": 1.2414, "step": 2764 }, { "epoch": 1.029423741434073, "grad_norm": 0.17115138471126556, "learning_rate": 1.9770822373552362e-05, "loss": 1.2282, "step": 2765 }, { "epoch": 1.029796046584682, "grad_norm": 0.1737300306558609, "learning_rate": 1.9770563606535068e-05, "loss": 1.2417, "step": 2766 }, { "epoch": 1.030168351735291, "grad_norm": 0.17681419849395752, "learning_rate": 1.9770304695207164e-05, "loss": 1.2274, "step": 2767 }, { "epoch": 1.0305406568859001, "grad_norm": 0.1788710057735443, "learning_rate": 1.9770045639572473e-05, "loss": 1.2319, "step": 2768 }, { "epoch": 1.0309129620365092, "grad_norm": 0.17474333941936493, "learning_rate": 1.976978643963482e-05, "loss": 1.2386, "step": 2769 }, { "epoch": 1.0312852671871182, "grad_norm": 0.18028199672698975, "learning_rate": 1.9769527095398033e-05, "loss": 1.2426, "step": 2770 }, { "epoch": 1.0316575723377273, "grad_norm": 0.17821404337882996, "learning_rate": 1.9769267606865944e-05, "loss": 1.256, "step": 2771 }, { "epoch": 1.0320298774883363, "grad_norm": 0.17664535343647003, "learning_rate": 1.976900797404239e-05, "loss": 1.2397, "step": 2772 }, { "epoch": 1.0324021826389453, "grad_norm": 0.17644058167934418, "learning_rate": 1.9768748196931197e-05, "loss": 1.259, "step": 2773 }, { "epoch": 1.0327744877895546, "grad_norm": 0.18605953454971313, "learning_rate": 1.976848827553621e-05, "loss": 1.2336, "step": 2774 }, { "epoch": 1.0331467929401636, "grad_norm": 0.17775902152061462, "learning_rate": 1.9768228209861257e-05, "loss": 1.2309, "step": 2775 }, { "epoch": 1.0335190980907727, "grad_norm": 0.18057388067245483, "learning_rate": 1.976796799991019e-05, "loss": 1.231, "step": 2776 }, { "epoch": 1.0338914032413817, "grad_norm": 0.17607726156711578, "learning_rate": 1.9767707645686852e-05, "loss": 1.2189, "step": 2777 }, { "epoch": 1.0342637083919908, "grad_norm": 0.19016438722610474, "learning_rate": 1.9767447147195083e-05, "loss": 1.2416, "step": 2778 }, { "epoch": 1.0346360135425998, "grad_norm": 0.1789761781692505, "learning_rate": 1.976718650443873e-05, "loss": 1.2498, "step": 2779 }, { "epoch": 1.0350083186932089, "grad_norm": 0.1939614713191986, "learning_rate": 1.9766925717421647e-05, "loss": 1.2518, "step": 2780 }, { "epoch": 1.035380623843818, "grad_norm": 0.1895962953567505, "learning_rate": 1.9766664786147687e-05, "loss": 1.2613, "step": 2781 }, { "epoch": 1.0357529289944272, "grad_norm": 0.21478445827960968, "learning_rate": 1.97664037106207e-05, "loss": 1.235, "step": 2782 }, { "epoch": 1.0361252341450362, "grad_norm": 0.17898200452327728, "learning_rate": 1.976614249084454e-05, "loss": 1.2397, "step": 2783 }, { "epoch": 1.0364975392956453, "grad_norm": 0.18122392892837524, "learning_rate": 1.976588112682307e-05, "loss": 1.2253, "step": 2784 }, { "epoch": 1.0368698444462543, "grad_norm": 0.17120665311813354, "learning_rate": 1.9765619618560146e-05, "loss": 1.2428, "step": 2785 }, { "epoch": 1.0372421495968633, "grad_norm": 0.1834287941455841, "learning_rate": 1.9765357966059638e-05, "loss": 1.2253, "step": 2786 }, { "epoch": 1.0376144547474724, "grad_norm": 0.1814526915550232, "learning_rate": 1.9765096169325404e-05, "loss": 1.242, "step": 2787 }, { "epoch": 1.0379867598980814, "grad_norm": 0.18494722247123718, "learning_rate": 1.9764834228361313e-05, "loss": 1.2501, "step": 2788 }, { "epoch": 1.0383590650486905, "grad_norm": 0.17375454306602478, "learning_rate": 1.9764572143171232e-05, "loss": 1.2292, "step": 2789 }, { "epoch": 1.0387313701992995, "grad_norm": 0.18002989888191223, "learning_rate": 1.9764309913759033e-05, "loss": 1.2331, "step": 2790 }, { "epoch": 1.0391036753499088, "grad_norm": 0.18305903673171997, "learning_rate": 1.976404754012859e-05, "loss": 1.2479, "step": 2791 }, { "epoch": 1.0394759805005178, "grad_norm": 0.18125832080841064, "learning_rate": 1.9763785022283778e-05, "loss": 1.2238, "step": 2792 }, { "epoch": 1.0398482856511269, "grad_norm": 0.1753983050584793, "learning_rate": 1.976352236022847e-05, "loss": 1.2198, "step": 2793 }, { "epoch": 1.040220590801736, "grad_norm": 0.17953644692897797, "learning_rate": 1.9763259553966553e-05, "loss": 1.2477, "step": 2794 }, { "epoch": 1.040592895952345, "grad_norm": 0.17996378242969513, "learning_rate": 1.9762996603501908e-05, "loss": 1.2696, "step": 2795 }, { "epoch": 1.040965201102954, "grad_norm": 0.17698989808559418, "learning_rate": 1.976273350883841e-05, "loss": 1.248, "step": 2796 }, { "epoch": 1.041337506253563, "grad_norm": 0.17306332290172577, "learning_rate": 1.9762470269979955e-05, "loss": 1.2555, "step": 2797 }, { "epoch": 1.041709811404172, "grad_norm": 0.17857332527637482, "learning_rate": 1.9762206886930423e-05, "loss": 1.2386, "step": 2798 }, { "epoch": 1.0420821165547811, "grad_norm": 0.18211491405963898, "learning_rate": 1.9761943359693712e-05, "loss": 1.2458, "step": 2799 }, { "epoch": 1.0424544217053904, "grad_norm": 0.1814602017402649, "learning_rate": 1.9761679688273708e-05, "loss": 1.2442, "step": 2800 }, { "epoch": 1.0428267268559994, "grad_norm": 0.1816301792860031, "learning_rate": 1.976141587267431e-05, "loss": 1.2377, "step": 2801 }, { "epoch": 1.0431990320066085, "grad_norm": 0.1897687017917633, "learning_rate": 1.976115191289941e-05, "loss": 1.2532, "step": 2802 }, { "epoch": 1.0435713371572175, "grad_norm": 0.174024760723114, "learning_rate": 1.9760887808952908e-05, "loss": 1.223, "step": 2803 }, { "epoch": 1.0439436423078265, "grad_norm": 0.19265376031398773, "learning_rate": 1.9760623560838707e-05, "loss": 1.2442, "step": 2804 }, { "epoch": 1.0443159474584356, "grad_norm": 0.17987599968910217, "learning_rate": 1.9760359168560708e-05, "loss": 1.2395, "step": 2805 }, { "epoch": 1.0446882526090446, "grad_norm": 0.1701204478740692, "learning_rate": 1.976009463212282e-05, "loss": 1.2306, "step": 2806 }, { "epoch": 1.0450605577596537, "grad_norm": 0.1814124584197998, "learning_rate": 1.975982995152894e-05, "loss": 1.2451, "step": 2807 }, { "epoch": 1.0454328629102627, "grad_norm": 0.181508406996727, "learning_rate": 1.9759565126782988e-05, "loss": 1.2415, "step": 2808 }, { "epoch": 1.045805168060872, "grad_norm": 0.167762890458107, "learning_rate": 1.975930015788887e-05, "loss": 1.2343, "step": 2809 }, { "epoch": 1.046177473211481, "grad_norm": 0.17831118404865265, "learning_rate": 1.9759035044850504e-05, "loss": 1.2163, "step": 2810 }, { "epoch": 1.04654977836209, "grad_norm": 0.18151675164699554, "learning_rate": 1.9758769787671804e-05, "loss": 1.2255, "step": 2811 }, { "epoch": 1.046922083512699, "grad_norm": 0.18939217925071716, "learning_rate": 1.9758504386356682e-05, "loss": 1.2381, "step": 2812 }, { "epoch": 1.0472943886633082, "grad_norm": 0.1697840541601181, "learning_rate": 1.9758238840909066e-05, "loss": 1.2406, "step": 2813 }, { "epoch": 1.0476666938139172, "grad_norm": 0.17666485905647278, "learning_rate": 1.975797315133287e-05, "loss": 1.2312, "step": 2814 }, { "epoch": 1.0480389989645262, "grad_norm": 0.1959092617034912, "learning_rate": 1.975770731763203e-05, "loss": 1.2444, "step": 2815 }, { "epoch": 1.0484113041151353, "grad_norm": 0.17238794267177582, "learning_rate": 1.9757441339810462e-05, "loss": 1.227, "step": 2816 }, { "epoch": 1.0487836092657445, "grad_norm": 0.17401093244552612, "learning_rate": 1.9757175217872096e-05, "loss": 1.2475, "step": 2817 }, { "epoch": 1.0491559144163536, "grad_norm": 0.1765962839126587, "learning_rate": 1.9756908951820866e-05, "loss": 1.2348, "step": 2818 }, { "epoch": 1.0495282195669626, "grad_norm": 0.17260397970676422, "learning_rate": 1.9756642541660702e-05, "loss": 1.2509, "step": 2819 }, { "epoch": 1.0499005247175717, "grad_norm": 0.16606584191322327, "learning_rate": 1.975637598739554e-05, "loss": 1.2384, "step": 2820 }, { "epoch": 1.0502728298681807, "grad_norm": 0.17037144303321838, "learning_rate": 1.975610928902932e-05, "loss": 1.2382, "step": 2821 }, { "epoch": 1.0506451350187898, "grad_norm": 0.1816329061985016, "learning_rate": 1.9755842446565975e-05, "loss": 1.2329, "step": 2822 }, { "epoch": 1.0510174401693988, "grad_norm": 0.18435317277908325, "learning_rate": 1.975557546000945e-05, "loss": 1.233, "step": 2823 }, { "epoch": 1.0513897453200078, "grad_norm": 0.16666759550571442, "learning_rate": 1.975530832936369e-05, "loss": 1.2345, "step": 2824 }, { "epoch": 1.0517620504706169, "grad_norm": 0.18911749124526978, "learning_rate": 1.9755041054632634e-05, "loss": 1.2416, "step": 2825 }, { "epoch": 1.0521343556212261, "grad_norm": 0.17855370044708252, "learning_rate": 1.9754773635820236e-05, "loss": 1.246, "step": 2826 }, { "epoch": 1.0525066607718352, "grad_norm": 0.17206569015979767, "learning_rate": 1.9754506072930443e-05, "loss": 1.2326, "step": 2827 }, { "epoch": 1.0528789659224442, "grad_norm": 0.180022731423378, "learning_rate": 1.9754238365967207e-05, "loss": 1.2389, "step": 2828 }, { "epoch": 1.0532512710730533, "grad_norm": 0.17861510813236237, "learning_rate": 1.9753970514934485e-05, "loss": 1.2355, "step": 2829 }, { "epoch": 1.0536235762236623, "grad_norm": 0.17671099305152893, "learning_rate": 1.9753702519836228e-05, "loss": 1.2487, "step": 2830 }, { "epoch": 1.0539958813742714, "grad_norm": 0.17241548001766205, "learning_rate": 1.97534343806764e-05, "loss": 1.2386, "step": 2831 }, { "epoch": 1.0543681865248804, "grad_norm": 0.1827198565006256, "learning_rate": 1.9753166097458957e-05, "loss": 1.2326, "step": 2832 }, { "epoch": 1.0547404916754894, "grad_norm": 0.17715848982334137, "learning_rate": 1.975289767018786e-05, "loss": 1.2346, "step": 2833 }, { "epoch": 1.0551127968260985, "grad_norm": 0.17723973095417023, "learning_rate": 1.975262909886708e-05, "loss": 1.2426, "step": 2834 }, { "epoch": 1.0554851019767078, "grad_norm": 0.1761004477739334, "learning_rate": 1.975236038350058e-05, "loss": 1.2411, "step": 2835 }, { "epoch": 1.0558574071273168, "grad_norm": 0.18172328174114227, "learning_rate": 1.9752091524092324e-05, "loss": 1.2237, "step": 2836 }, { "epoch": 1.0562297122779258, "grad_norm": 0.16737525165081024, "learning_rate": 1.9751822520646297e-05, "loss": 1.2503, "step": 2837 }, { "epoch": 1.0566020174285349, "grad_norm": 0.1774168312549591, "learning_rate": 1.9751553373166454e-05, "loss": 1.2195, "step": 2838 }, { "epoch": 1.056974322579144, "grad_norm": 0.17864681780338287, "learning_rate": 1.9751284081656786e-05, "loss": 1.2407, "step": 2839 }, { "epoch": 1.057346627729753, "grad_norm": 0.1730041354894638, "learning_rate": 1.975101464612126e-05, "loss": 1.2441, "step": 2840 }, { "epoch": 1.057718932880362, "grad_norm": 0.17690402269363403, "learning_rate": 1.9750745066563864e-05, "loss": 1.2445, "step": 2841 }, { "epoch": 1.058091238030971, "grad_norm": 0.1674228459596634, "learning_rate": 1.9750475342988572e-05, "loss": 1.2313, "step": 2842 }, { "epoch": 1.0584635431815803, "grad_norm": 0.1728244572877884, "learning_rate": 1.9750205475399373e-05, "loss": 1.2261, "step": 2843 }, { "epoch": 1.0588358483321894, "grad_norm": 0.1766444444656372, "learning_rate": 1.974993546380025e-05, "loss": 1.2329, "step": 2844 }, { "epoch": 1.0592081534827984, "grad_norm": 0.17906500399112701, "learning_rate": 1.9749665308195194e-05, "loss": 1.237, "step": 2845 }, { "epoch": 1.0595804586334074, "grad_norm": 0.17800427973270416, "learning_rate": 1.974939500858819e-05, "loss": 1.2405, "step": 2846 }, { "epoch": 1.0599527637840165, "grad_norm": 0.17567503452301025, "learning_rate": 1.9749124564983237e-05, "loss": 1.241, "step": 2847 }, { "epoch": 1.0603250689346255, "grad_norm": 0.172315314412117, "learning_rate": 1.9748853977384326e-05, "loss": 1.2331, "step": 2848 }, { "epoch": 1.0606973740852346, "grad_norm": 0.17983998358249664, "learning_rate": 1.974858324579545e-05, "loss": 1.2449, "step": 2849 }, { "epoch": 1.0610696792358436, "grad_norm": 0.17935794591903687, "learning_rate": 1.9748312370220613e-05, "loss": 1.2357, "step": 2850 }, { "epoch": 1.0614419843864527, "grad_norm": 0.1763681173324585, "learning_rate": 1.9748041350663817e-05, "loss": 1.2498, "step": 2851 }, { "epoch": 1.061814289537062, "grad_norm": 0.17987823486328125, "learning_rate": 1.9747770187129055e-05, "loss": 1.2495, "step": 2852 }, { "epoch": 1.062186594687671, "grad_norm": 0.17571550607681274, "learning_rate": 1.9747498879620342e-05, "loss": 1.2263, "step": 2853 }, { "epoch": 1.06255889983828, "grad_norm": 0.17935210466384888, "learning_rate": 1.9747227428141687e-05, "loss": 1.2317, "step": 2854 }, { "epoch": 1.062931204988889, "grad_norm": 0.16637402772903442, "learning_rate": 1.9746955832697094e-05, "loss": 1.2329, "step": 2855 }, { "epoch": 1.063303510139498, "grad_norm": 0.17255236208438873, "learning_rate": 1.974668409329057e-05, "loss": 1.2353, "step": 2856 }, { "epoch": 1.0636758152901071, "grad_norm": 0.17514565587043762, "learning_rate": 1.9746412209926132e-05, "loss": 1.2303, "step": 2857 }, { "epoch": 1.0640481204407162, "grad_norm": 0.17487487196922302, "learning_rate": 1.9746140182607805e-05, "loss": 1.2199, "step": 2858 }, { "epoch": 1.0644204255913252, "grad_norm": 0.16520456969738007, "learning_rate": 1.9745868011339592e-05, "loss": 1.2332, "step": 2859 }, { "epoch": 1.0647927307419343, "grad_norm": 0.1774507462978363, "learning_rate": 1.9745595696125526e-05, "loss": 1.2407, "step": 2860 }, { "epoch": 1.0651650358925435, "grad_norm": 0.16631199419498444, "learning_rate": 1.974532323696962e-05, "loss": 1.2345, "step": 2861 }, { "epoch": 1.0655373410431526, "grad_norm": 0.17234042286872864, "learning_rate": 1.97450506338759e-05, "loss": 1.2222, "step": 2862 }, { "epoch": 1.0659096461937616, "grad_norm": 0.18009594082832336, "learning_rate": 1.97447778868484e-05, "loss": 1.2411, "step": 2863 }, { "epoch": 1.0662819513443706, "grad_norm": 0.1718275547027588, "learning_rate": 1.9744504995891135e-05, "loss": 1.2352, "step": 2864 }, { "epoch": 1.0666542564949797, "grad_norm": 0.16826453804969788, "learning_rate": 1.9744231961008147e-05, "loss": 1.2314, "step": 2865 }, { "epoch": 1.0670265616455887, "grad_norm": 0.17893622815608978, "learning_rate": 1.9743958782203462e-05, "loss": 1.2282, "step": 2866 }, { "epoch": 1.0673988667961978, "grad_norm": 0.17840996384620667, "learning_rate": 1.9743685459481118e-05, "loss": 1.2473, "step": 2867 }, { "epoch": 1.0677711719468068, "grad_norm": 0.1669299453496933, "learning_rate": 1.974341199284515e-05, "loss": 1.2345, "step": 2868 }, { "epoch": 1.0681434770974159, "grad_norm": 0.1682414710521698, "learning_rate": 1.97431383822996e-05, "loss": 1.2212, "step": 2869 }, { "epoch": 1.0685157822480251, "grad_norm": 0.17519889771938324, "learning_rate": 1.974286462784851e-05, "loss": 1.2377, "step": 2870 }, { "epoch": 1.0688880873986342, "grad_norm": 0.16797272861003876, "learning_rate": 1.9742590729495917e-05, "loss": 1.2078, "step": 2871 }, { "epoch": 1.0692603925492432, "grad_norm": 0.17429950833320618, "learning_rate": 1.9742316687245873e-05, "loss": 1.2334, "step": 2872 }, { "epoch": 1.0696326976998523, "grad_norm": 0.1742522120475769, "learning_rate": 1.974204250110242e-05, "loss": 1.2474, "step": 2873 }, { "epoch": 1.0700050028504613, "grad_norm": 0.17582035064697266, "learning_rate": 1.9741768171069614e-05, "loss": 1.241, "step": 2874 }, { "epoch": 1.0703773080010703, "grad_norm": 0.17079758644104004, "learning_rate": 1.9741493697151502e-05, "loss": 1.2281, "step": 2875 }, { "epoch": 1.0707496131516794, "grad_norm": 0.17140960693359375, "learning_rate": 1.9741219079352142e-05, "loss": 1.2393, "step": 2876 }, { "epoch": 1.0711219183022884, "grad_norm": 0.1833873987197876, "learning_rate": 1.9740944317675583e-05, "loss": 1.2488, "step": 2877 }, { "epoch": 1.0714942234528975, "grad_norm": 0.22053757309913635, "learning_rate": 1.974066941212589e-05, "loss": 1.2451, "step": 2878 }, { "epoch": 1.0718665286035067, "grad_norm": 0.1896497905254364, "learning_rate": 1.974039436270712e-05, "loss": 1.2203, "step": 2879 }, { "epoch": 1.0722388337541158, "grad_norm": 0.17092444002628326, "learning_rate": 1.9740119169423337e-05, "loss": 1.234, "step": 2880 }, { "epoch": 1.0726111389047248, "grad_norm": 0.1865914762020111, "learning_rate": 1.9739843832278604e-05, "loss": 1.2252, "step": 2881 }, { "epoch": 1.0729834440553339, "grad_norm": 0.1847282201051712, "learning_rate": 1.973956835127699e-05, "loss": 1.2307, "step": 2882 }, { "epoch": 1.073355749205943, "grad_norm": 0.16904793679714203, "learning_rate": 1.9739292726422565e-05, "loss": 1.2399, "step": 2883 }, { "epoch": 1.073728054356552, "grad_norm": 0.17725877463817596, "learning_rate": 1.9739016957719393e-05, "loss": 1.2425, "step": 2884 }, { "epoch": 1.074100359507161, "grad_norm": 0.1759733408689499, "learning_rate": 1.9738741045171556e-05, "loss": 1.2151, "step": 2885 }, { "epoch": 1.07447266465777, "grad_norm": 0.17010413110256195, "learning_rate": 1.973846498878312e-05, "loss": 1.2503, "step": 2886 }, { "epoch": 1.0748449698083793, "grad_norm": 0.17372602224349976, "learning_rate": 1.9738188788558174e-05, "loss": 1.2469, "step": 2887 }, { "epoch": 1.0752172749589883, "grad_norm": 0.17600518465042114, "learning_rate": 1.9737912444500786e-05, "loss": 1.2301, "step": 2888 }, { "epoch": 1.0755895801095974, "grad_norm": 0.17310109734535217, "learning_rate": 1.9737635956615044e-05, "loss": 1.2356, "step": 2889 }, { "epoch": 1.0759618852602064, "grad_norm": 0.1836235374212265, "learning_rate": 1.973735932490503e-05, "loss": 1.2501, "step": 2890 }, { "epoch": 1.0763341904108155, "grad_norm": 0.16939985752105713, "learning_rate": 1.9737082549374828e-05, "loss": 1.2273, "step": 2891 }, { "epoch": 1.0767064955614245, "grad_norm": 0.1679062396287918, "learning_rate": 1.973680563002853e-05, "loss": 1.2343, "step": 2892 }, { "epoch": 1.0770788007120335, "grad_norm": 0.1816723644733429, "learning_rate": 1.9736528566870223e-05, "loss": 1.2332, "step": 2893 }, { "epoch": 1.0774511058626426, "grad_norm": 0.17142869532108307, "learning_rate": 1.9736251359904003e-05, "loss": 1.2289, "step": 2894 }, { "epoch": 1.0778234110132519, "grad_norm": 0.17428475618362427, "learning_rate": 1.9735974009133957e-05, "loss": 1.2244, "step": 2895 }, { "epoch": 1.078195716163861, "grad_norm": 0.16701877117156982, "learning_rate": 1.9735696514564188e-05, "loss": 1.211, "step": 2896 }, { "epoch": 1.07856802131447, "grad_norm": 0.17782089114189148, "learning_rate": 1.9735418876198792e-05, "loss": 1.2257, "step": 2897 }, { "epoch": 1.078940326465079, "grad_norm": 0.1718931496143341, "learning_rate": 1.973514109404187e-05, "loss": 1.2393, "step": 2898 }, { "epoch": 1.079312631615688, "grad_norm": 0.16768048703670502, "learning_rate": 1.9734863168097526e-05, "loss": 1.2331, "step": 2899 }, { "epoch": 1.079684936766297, "grad_norm": 0.16998796164989471, "learning_rate": 1.973458509836986e-05, "loss": 1.226, "step": 2900 }, { "epoch": 1.080057241916906, "grad_norm": 0.1749204695224762, "learning_rate": 1.973430688486299e-05, "loss": 1.2437, "step": 2901 }, { "epoch": 1.0804295470675152, "grad_norm": 0.17514778673648834, "learning_rate": 1.973402852758101e-05, "loss": 1.2397, "step": 2902 }, { "epoch": 1.0808018522181242, "grad_norm": 0.1695886254310608, "learning_rate": 1.9733750026528046e-05, "loss": 1.2261, "step": 2903 }, { "epoch": 1.0811741573687335, "grad_norm": 0.16834037005901337, "learning_rate": 1.9733471381708202e-05, "loss": 1.2264, "step": 2904 }, { "epoch": 1.0815464625193425, "grad_norm": 0.17187410593032837, "learning_rate": 1.97331925931256e-05, "loss": 1.229, "step": 2905 }, { "epoch": 1.0819187676699515, "grad_norm": 0.17803004384040833, "learning_rate": 1.973291366078435e-05, "loss": 1.237, "step": 2906 }, { "epoch": 1.0822910728205606, "grad_norm": 0.1769217699766159, "learning_rate": 1.973263458468858e-05, "loss": 1.2232, "step": 2907 }, { "epoch": 1.0826633779711696, "grad_norm": 0.1921815276145935, "learning_rate": 1.9732355364842404e-05, "loss": 1.2533, "step": 2908 }, { "epoch": 1.0830356831217787, "grad_norm": 0.17243653535842896, "learning_rate": 1.973207600124995e-05, "loss": 1.2333, "step": 2909 }, { "epoch": 1.0834079882723877, "grad_norm": 0.1771405041217804, "learning_rate": 1.9731796493915346e-05, "loss": 1.2261, "step": 2910 }, { "epoch": 1.0837802934229968, "grad_norm": 0.17933285236358643, "learning_rate": 1.973151684284272e-05, "loss": 1.243, "step": 2911 }, { "epoch": 1.0841525985736058, "grad_norm": 0.16889271140098572, "learning_rate": 1.9731237048036197e-05, "loss": 1.227, "step": 2912 }, { "epoch": 1.084524903724215, "grad_norm": 0.17501141130924225, "learning_rate": 1.9730957109499917e-05, "loss": 1.2439, "step": 2913 }, { "epoch": 1.084897208874824, "grad_norm": 0.18403035402297974, "learning_rate": 1.973067702723801e-05, "loss": 1.2383, "step": 2914 }, { "epoch": 1.0852695140254331, "grad_norm": 0.169046550989151, "learning_rate": 1.9730396801254614e-05, "loss": 1.2363, "step": 2915 }, { "epoch": 1.0856418191760422, "grad_norm": 0.1726294606924057, "learning_rate": 1.973011643155387e-05, "loss": 1.2531, "step": 2916 }, { "epoch": 1.0860141243266512, "grad_norm": 0.1682724505662918, "learning_rate": 1.9729835918139914e-05, "loss": 1.2225, "step": 2917 }, { "epoch": 1.0863864294772603, "grad_norm": 0.2326967865228653, "learning_rate": 1.9729555261016894e-05, "loss": 1.2361, "step": 2918 }, { "epoch": 1.0867587346278693, "grad_norm": 0.1621241271495819, "learning_rate": 1.972927446018895e-05, "loss": 1.2233, "step": 2919 }, { "epoch": 1.0871310397784784, "grad_norm": 0.1713317185640335, "learning_rate": 1.9728993515660236e-05, "loss": 1.2284, "step": 2920 }, { "epoch": 1.0875033449290874, "grad_norm": 0.1748856008052826, "learning_rate": 1.9728712427434898e-05, "loss": 1.2337, "step": 2921 }, { "epoch": 1.0878756500796967, "grad_norm": 0.17016327381134033, "learning_rate": 1.972843119551709e-05, "loss": 1.2292, "step": 2922 }, { "epoch": 1.0882479552303057, "grad_norm": 0.17471212148666382, "learning_rate": 1.9728149819910958e-05, "loss": 1.231, "step": 2923 }, { "epoch": 1.0886202603809148, "grad_norm": 0.16408810019493103, "learning_rate": 1.972786830062067e-05, "loss": 1.2305, "step": 2924 }, { "epoch": 1.0889925655315238, "grad_norm": 0.17399826645851135, "learning_rate": 1.9727586637650373e-05, "loss": 1.2377, "step": 2925 }, { "epoch": 1.0893648706821328, "grad_norm": 0.18122164905071259, "learning_rate": 1.9727304831004232e-05, "loss": 1.2378, "step": 2926 }, { "epoch": 1.0897371758327419, "grad_norm": 0.17349712550640106, "learning_rate": 1.9727022880686413e-05, "loss": 1.2237, "step": 2927 }, { "epoch": 1.090109480983351, "grad_norm": 0.18894222378730774, "learning_rate": 1.9726740786701075e-05, "loss": 1.2254, "step": 2928 }, { "epoch": 1.09048178613396, "grad_norm": 0.184165820479393, "learning_rate": 1.9726458549052384e-05, "loss": 1.2344, "step": 2929 }, { "epoch": 1.090854091284569, "grad_norm": 0.18154264986515045, "learning_rate": 1.9726176167744513e-05, "loss": 1.225, "step": 2930 }, { "epoch": 1.0912263964351783, "grad_norm": 0.17851291596889496, "learning_rate": 1.972589364278163e-05, "loss": 1.237, "step": 2931 }, { "epoch": 1.0915987015857873, "grad_norm": 0.17904578149318695, "learning_rate": 1.9725610974167907e-05, "loss": 1.2378, "step": 2932 }, { "epoch": 1.0919710067363964, "grad_norm": 0.16910214722156525, "learning_rate": 1.972532816190752e-05, "loss": 1.2342, "step": 2933 }, { "epoch": 1.0923433118870054, "grad_norm": 0.17645606398582458, "learning_rate": 1.972504520600465e-05, "loss": 1.23, "step": 2934 }, { "epoch": 1.0927156170376144, "grad_norm": 0.16807736456394196, "learning_rate": 1.9724762106463467e-05, "loss": 1.2305, "step": 2935 }, { "epoch": 1.0930879221882235, "grad_norm": 0.20768019556999207, "learning_rate": 1.972447886328816e-05, "loss": 1.2291, "step": 2936 }, { "epoch": 1.0934602273388325, "grad_norm": 0.1863759607076645, "learning_rate": 1.9724195476482914e-05, "loss": 1.2371, "step": 2937 }, { "epoch": 1.0938325324894416, "grad_norm": 0.17857873439788818, "learning_rate": 1.9723911946051905e-05, "loss": 1.2179, "step": 2938 }, { "epoch": 1.0942048376400506, "grad_norm": 0.17775695025920868, "learning_rate": 1.972362827199933e-05, "loss": 1.2498, "step": 2939 }, { "epoch": 1.0945771427906599, "grad_norm": 0.1699213981628418, "learning_rate": 1.9723344454329376e-05, "loss": 1.2439, "step": 2940 }, { "epoch": 1.094949447941269, "grad_norm": 0.1817527562379837, "learning_rate": 1.9723060493046235e-05, "loss": 1.2276, "step": 2941 }, { "epoch": 1.095321753091878, "grad_norm": 0.17052580416202545, "learning_rate": 1.97227763881541e-05, "loss": 1.2161, "step": 2942 }, { "epoch": 1.095694058242487, "grad_norm": 0.17107853293418884, "learning_rate": 1.972249213965717e-05, "loss": 1.2265, "step": 2943 }, { "epoch": 1.096066363393096, "grad_norm": 0.17966510355472565, "learning_rate": 1.9722207747559636e-05, "loss": 1.2304, "step": 2944 }, { "epoch": 1.096438668543705, "grad_norm": 0.1866060048341751, "learning_rate": 1.972192321186571e-05, "loss": 1.2266, "step": 2945 }, { "epoch": 1.0968109736943141, "grad_norm": 0.17971013486385345, "learning_rate": 1.9721638532579584e-05, "loss": 1.2333, "step": 2946 }, { "epoch": 1.0971832788449232, "grad_norm": 0.17302943766117096, "learning_rate": 1.972135370970547e-05, "loss": 1.2367, "step": 2947 }, { "epoch": 1.0975555839955324, "grad_norm": 0.17640618979930878, "learning_rate": 1.9721068743247568e-05, "loss": 1.2385, "step": 2948 }, { "epoch": 1.0979278891461415, "grad_norm": 0.17200317978858948, "learning_rate": 1.972078363321009e-05, "loss": 1.2277, "step": 2949 }, { "epoch": 1.0983001942967505, "grad_norm": 0.1816646158695221, "learning_rate": 1.9720498379597256e-05, "loss": 1.2319, "step": 2950 }, { "epoch": 1.0986724994473596, "grad_norm": 0.17960673570632935, "learning_rate": 1.9720212982413262e-05, "loss": 1.2235, "step": 2951 }, { "epoch": 1.0990448045979686, "grad_norm": 0.17489486932754517, "learning_rate": 1.971992744166234e-05, "loss": 1.2377, "step": 2952 }, { "epoch": 1.0994171097485776, "grad_norm": 0.17041970789432526, "learning_rate": 1.9719641757348694e-05, "loss": 1.226, "step": 2953 }, { "epoch": 1.0997894148991867, "grad_norm": 0.18417948484420776, "learning_rate": 1.9719355929476548e-05, "loss": 1.237, "step": 2954 }, { "epoch": 1.1001617200497957, "grad_norm": 0.17019295692443848, "learning_rate": 1.971906995805013e-05, "loss": 1.225, "step": 2955 }, { "epoch": 1.100534025200405, "grad_norm": 0.17955681681632996, "learning_rate": 1.9718783843073653e-05, "loss": 1.2361, "step": 2956 }, { "epoch": 1.100906330351014, "grad_norm": 0.3305249810218811, "learning_rate": 1.9718497584551355e-05, "loss": 1.2538, "step": 2957 }, { "epoch": 1.101278635501623, "grad_norm": 0.18145957589149475, "learning_rate": 1.9718211182487455e-05, "loss": 1.2218, "step": 2958 }, { "epoch": 1.1016509406522321, "grad_norm": 0.17651425302028656, "learning_rate": 1.9717924636886186e-05, "loss": 1.2455, "step": 2959 }, { "epoch": 1.1020232458028412, "grad_norm": 0.17769207060337067, "learning_rate": 1.971763794775178e-05, "loss": 1.2381, "step": 2960 }, { "epoch": 1.1023955509534502, "grad_norm": 0.17026957869529724, "learning_rate": 1.971735111508847e-05, "loss": 1.2176, "step": 2961 }, { "epoch": 1.1027678561040593, "grad_norm": 0.17879103124141693, "learning_rate": 1.9717064138900494e-05, "loss": 1.2371, "step": 2962 }, { "epoch": 1.1031401612546683, "grad_norm": 0.16946665942668915, "learning_rate": 1.9716777019192087e-05, "loss": 1.2431, "step": 2963 }, { "epoch": 1.1035124664052773, "grad_norm": 0.16764768958091736, "learning_rate": 1.97164897559675e-05, "loss": 1.2293, "step": 2964 }, { "epoch": 1.1038847715558866, "grad_norm": 0.1787741482257843, "learning_rate": 1.9716202349230967e-05, "loss": 1.2273, "step": 2965 }, { "epoch": 1.1042570767064956, "grad_norm": 0.18097257614135742, "learning_rate": 1.971591479898673e-05, "loss": 1.2405, "step": 2966 }, { "epoch": 1.1046293818571047, "grad_norm": 0.16577765345573425, "learning_rate": 1.9715627105239048e-05, "loss": 1.2213, "step": 2967 }, { "epoch": 1.1050016870077137, "grad_norm": 0.17236430943012238, "learning_rate": 1.9715339267992162e-05, "loss": 1.2358, "step": 2968 }, { "epoch": 1.1053739921583228, "grad_norm": 0.1793234795331955, "learning_rate": 1.9715051287250322e-05, "loss": 1.2465, "step": 2969 }, { "epoch": 1.1057462973089318, "grad_norm": 0.17169257998466492, "learning_rate": 1.9714763163017788e-05, "loss": 1.2308, "step": 2970 }, { "epoch": 1.1061186024595409, "grad_norm": 0.16924230754375458, "learning_rate": 1.9714474895298807e-05, "loss": 1.2452, "step": 2971 }, { "epoch": 1.10649090761015, "grad_norm": 0.17970192432403564, "learning_rate": 1.9714186484097646e-05, "loss": 1.2216, "step": 2972 }, { "epoch": 1.106863212760759, "grad_norm": 0.17152759432792664, "learning_rate": 1.9713897929418556e-05, "loss": 1.2237, "step": 2973 }, { "epoch": 1.1072355179113682, "grad_norm": 0.18560026586055756, "learning_rate": 1.9713609231265807e-05, "loss": 1.2257, "step": 2974 }, { "epoch": 1.1076078230619772, "grad_norm": 0.17931891977787018, "learning_rate": 1.9713320389643658e-05, "loss": 1.2455, "step": 2975 }, { "epoch": 1.1079801282125863, "grad_norm": 0.17655859887599945, "learning_rate": 1.9713031404556377e-05, "loss": 1.2415, "step": 2976 }, { "epoch": 1.1083524333631953, "grad_norm": 0.17913676798343658, "learning_rate": 1.971274227600823e-05, "loss": 1.2298, "step": 2977 }, { "epoch": 1.1087247385138044, "grad_norm": 0.17090395092964172, "learning_rate": 1.971245300400349e-05, "loss": 1.2242, "step": 2978 }, { "epoch": 1.1090970436644134, "grad_norm": 0.17184513807296753, "learning_rate": 1.9712163588546426e-05, "loss": 1.2221, "step": 2979 }, { "epoch": 1.1094693488150225, "grad_norm": 0.17693102359771729, "learning_rate": 1.971187402964132e-05, "loss": 1.2345, "step": 2980 }, { "epoch": 1.1098416539656315, "grad_norm": 0.17723120748996735, "learning_rate": 1.9711584327292442e-05, "loss": 1.2168, "step": 2981 }, { "epoch": 1.1102139591162405, "grad_norm": 0.1746647208929062, "learning_rate": 1.9711294481504074e-05, "loss": 1.2297, "step": 2982 }, { "epoch": 1.1105862642668498, "grad_norm": 0.17656634747982025, "learning_rate": 1.9711004492280495e-05, "loss": 1.2339, "step": 2983 }, { "epoch": 1.1109585694174589, "grad_norm": 0.1806725114583969, "learning_rate": 1.9710714359625987e-05, "loss": 1.2305, "step": 2984 }, { "epoch": 1.111330874568068, "grad_norm": 0.17355605959892273, "learning_rate": 1.9710424083544837e-05, "loss": 1.2275, "step": 2985 }, { "epoch": 1.111703179718677, "grad_norm": 0.17555807530879974, "learning_rate": 1.9710133664041338e-05, "loss": 1.2219, "step": 2986 }, { "epoch": 1.112075484869286, "grad_norm": 0.17484989762306213, "learning_rate": 1.9709843101119772e-05, "loss": 1.236, "step": 2987 }, { "epoch": 1.112447790019895, "grad_norm": 0.1751967966556549, "learning_rate": 1.970955239478443e-05, "loss": 1.2551, "step": 2988 }, { "epoch": 1.112820095170504, "grad_norm": 0.1683066189289093, "learning_rate": 1.9709261545039614e-05, "loss": 1.2307, "step": 2989 }, { "epoch": 1.113192400321113, "grad_norm": 0.17595693469047546, "learning_rate": 1.970897055188961e-05, "loss": 1.247, "step": 2990 }, { "epoch": 1.1135647054717221, "grad_norm": 0.17412763833999634, "learning_rate": 1.9708679415338722e-05, "loss": 1.2353, "step": 2991 }, { "epoch": 1.1139370106223314, "grad_norm": 0.17140118777751923, "learning_rate": 1.9708388135391247e-05, "loss": 1.2259, "step": 2992 }, { "epoch": 1.1143093157729405, "grad_norm": 0.17357899248600006, "learning_rate": 1.970809671205149e-05, "loss": 1.2314, "step": 2993 }, { "epoch": 1.1146816209235495, "grad_norm": 0.16457659006118774, "learning_rate": 1.970780514532375e-05, "loss": 1.2371, "step": 2994 }, { "epoch": 1.1150539260741585, "grad_norm": 0.17428340017795563, "learning_rate": 1.970751343521234e-05, "loss": 1.227, "step": 2995 }, { "epoch": 1.1154262312247676, "grad_norm": 0.18031379580497742, "learning_rate": 1.9707221581721568e-05, "loss": 1.2366, "step": 2996 }, { "epoch": 1.1157985363753766, "grad_norm": 0.16890005767345428, "learning_rate": 1.9706929584855737e-05, "loss": 1.2296, "step": 2997 }, { "epoch": 1.1161708415259857, "grad_norm": 0.16828297078609467, "learning_rate": 1.970663744461917e-05, "loss": 1.2273, "step": 2998 }, { "epoch": 1.1165431466765947, "grad_norm": 0.17010074853897095, "learning_rate": 1.9706345161016177e-05, "loss": 1.2263, "step": 2999 }, { "epoch": 1.116915451827204, "grad_norm": 0.1739543378353119, "learning_rate": 1.970605273405107e-05, "loss": 1.2333, "step": 3000 }, { "epoch": 1.116915451827204, "eval_loss": 1.324150562286377, "eval_runtime": 16.2397, "eval_samples_per_second": 106.776, "eval_steps_per_second": 5.357, "step": 3000 }, { "epoch": 1.117287756977813, "grad_norm": 0.1825607717037201, "learning_rate": 1.970576016372818e-05, "loss": 1.2399, "step": 3001 }, { "epoch": 1.117660062128422, "grad_norm": 0.17324824631214142, "learning_rate": 1.970546745005182e-05, "loss": 1.2294, "step": 3002 }, { "epoch": 1.118032367279031, "grad_norm": 0.17063994705677032, "learning_rate": 1.970517459302631e-05, "loss": 1.2403, "step": 3003 }, { "epoch": 1.1184046724296401, "grad_norm": 0.16882003843784332, "learning_rate": 1.9704881592655983e-05, "loss": 1.2178, "step": 3004 }, { "epoch": 1.1187769775802492, "grad_norm": 0.17386655509471893, "learning_rate": 1.9704588448945167e-05, "loss": 1.2296, "step": 3005 }, { "epoch": 1.1191492827308582, "grad_norm": 0.16440868377685547, "learning_rate": 1.9704295161898185e-05, "loss": 1.2355, "step": 3006 }, { "epoch": 1.1195215878814673, "grad_norm": 0.17162500321865082, "learning_rate": 1.9704001731519374e-05, "loss": 1.2373, "step": 3007 }, { "epoch": 1.1198938930320763, "grad_norm": 0.1756156086921692, "learning_rate": 1.9703708157813068e-05, "loss": 1.2501, "step": 3008 }, { "epoch": 1.1202661981826856, "grad_norm": 0.17049741744995117, "learning_rate": 1.9703414440783596e-05, "loss": 1.2334, "step": 3009 }, { "epoch": 1.1206385033332946, "grad_norm": 0.17266800999641418, "learning_rate": 1.9703120580435303e-05, "loss": 1.2251, "step": 3010 }, { "epoch": 1.1210108084839037, "grad_norm": 0.17048951983451843, "learning_rate": 1.9702826576772535e-05, "loss": 1.2344, "step": 3011 }, { "epoch": 1.1213831136345127, "grad_norm": 0.17931020259857178, "learning_rate": 1.9702532429799622e-05, "loss": 1.2263, "step": 3012 }, { "epoch": 1.1217554187851217, "grad_norm": 0.16861797869205475, "learning_rate": 1.9702238139520912e-05, "loss": 1.2332, "step": 3013 }, { "epoch": 1.1221277239357308, "grad_norm": 0.1716170459985733, "learning_rate": 1.9701943705940758e-05, "loss": 1.2288, "step": 3014 }, { "epoch": 1.1225000290863398, "grad_norm": 0.1761639565229416, "learning_rate": 1.9701649129063503e-05, "loss": 1.229, "step": 3015 }, { "epoch": 1.1228723342369489, "grad_norm": 0.17630340158939362, "learning_rate": 1.9701354408893495e-05, "loss": 1.2453, "step": 3016 }, { "epoch": 1.1232446393875581, "grad_norm": 0.17522114515304565, "learning_rate": 1.9701059545435094e-05, "loss": 1.2242, "step": 3017 }, { "epoch": 1.1236169445381672, "grad_norm": 0.17642271518707275, "learning_rate": 1.9700764538692657e-05, "loss": 1.2354, "step": 3018 }, { "epoch": 1.1239892496887762, "grad_norm": 0.17362841963768005, "learning_rate": 1.970046938867053e-05, "loss": 1.2199, "step": 3019 }, { "epoch": 1.1243615548393853, "grad_norm": 0.17298756539821625, "learning_rate": 1.970017409537308e-05, "loss": 1.2324, "step": 3020 }, { "epoch": 1.1247338599899943, "grad_norm": 0.17711667716503143, "learning_rate": 1.9699878658804673e-05, "loss": 1.2111, "step": 3021 }, { "epoch": 1.1251061651406034, "grad_norm": 0.1722431480884552, "learning_rate": 1.969958307896966e-05, "loss": 1.2308, "step": 3022 }, { "epoch": 1.1254784702912124, "grad_norm": 0.17593185603618622, "learning_rate": 1.969928735587242e-05, "loss": 1.2335, "step": 3023 }, { "epoch": 1.1258507754418214, "grad_norm": 0.17119091749191284, "learning_rate": 1.969899148951731e-05, "loss": 1.2294, "step": 3024 }, { "epoch": 1.1262230805924305, "grad_norm": 0.1774984896183014, "learning_rate": 1.9698695479908706e-05, "loss": 1.2279, "step": 3025 }, { "epoch": 1.1265953857430397, "grad_norm": 0.17215633392333984, "learning_rate": 1.9698399327050976e-05, "loss": 1.227, "step": 3026 }, { "epoch": 1.1269676908936488, "grad_norm": 0.1741963028907776, "learning_rate": 1.96981030309485e-05, "loss": 1.2408, "step": 3027 }, { "epoch": 1.1273399960442578, "grad_norm": 0.1749897301197052, "learning_rate": 1.969780659160565e-05, "loss": 1.2416, "step": 3028 }, { "epoch": 1.1277123011948669, "grad_norm": 0.16878561675548553, "learning_rate": 1.9697510009026803e-05, "loss": 1.2395, "step": 3029 }, { "epoch": 1.128084606345476, "grad_norm": 0.1697298288345337, "learning_rate": 1.9697213283216342e-05, "loss": 1.2274, "step": 3030 }, { "epoch": 1.128456911496085, "grad_norm": 0.166582852602005, "learning_rate": 1.9696916414178652e-05, "loss": 1.2283, "step": 3031 }, { "epoch": 1.128829216646694, "grad_norm": 0.1713571697473526, "learning_rate": 1.9696619401918112e-05, "loss": 1.218, "step": 3032 }, { "epoch": 1.129201521797303, "grad_norm": 0.1708502322435379, "learning_rate": 1.9696322246439113e-05, "loss": 1.2272, "step": 3033 }, { "epoch": 1.129573826947912, "grad_norm": 0.16899080574512482, "learning_rate": 1.9696024947746047e-05, "loss": 1.2354, "step": 3034 }, { "epoch": 1.1299461320985213, "grad_norm": 0.16772188246250153, "learning_rate": 1.9695727505843298e-05, "loss": 1.2415, "step": 3035 }, { "epoch": 1.1303184372491304, "grad_norm": 0.17296922206878662, "learning_rate": 1.969542992073526e-05, "loss": 1.2145, "step": 3036 }, { "epoch": 1.1306907423997394, "grad_norm": 0.16814911365509033, "learning_rate": 1.9695132192426334e-05, "loss": 1.2418, "step": 3037 }, { "epoch": 1.1310630475503485, "grad_norm": 0.16937977075576782, "learning_rate": 1.9694834320920912e-05, "loss": 1.2245, "step": 3038 }, { "epoch": 1.1314353527009575, "grad_norm": 0.17156411707401276, "learning_rate": 1.9694536306223394e-05, "loss": 1.2336, "step": 3039 }, { "epoch": 1.1318076578515666, "grad_norm": 0.16524691879749298, "learning_rate": 1.9694238148338186e-05, "loss": 1.2354, "step": 3040 }, { "epoch": 1.1321799630021756, "grad_norm": 0.1789710819721222, "learning_rate": 1.9693939847269688e-05, "loss": 1.2243, "step": 3041 }, { "epoch": 1.1325522681527846, "grad_norm": 0.16184090077877045, "learning_rate": 1.9693641403022308e-05, "loss": 1.2194, "step": 3042 }, { "epoch": 1.1329245733033937, "grad_norm": 0.17413978278636932, "learning_rate": 1.969334281560045e-05, "loss": 1.238, "step": 3043 }, { "epoch": 1.133296878454003, "grad_norm": 0.17365868389606476, "learning_rate": 1.969304408500853e-05, "loss": 1.2253, "step": 3044 }, { "epoch": 1.133669183604612, "grad_norm": 0.16915839910507202, "learning_rate": 1.9692745211250957e-05, "loss": 1.2234, "step": 3045 }, { "epoch": 1.134041488755221, "grad_norm": 0.17093431949615479, "learning_rate": 1.9692446194332144e-05, "loss": 1.2275, "step": 3046 }, { "epoch": 1.13441379390583, "grad_norm": 0.1693648397922516, "learning_rate": 1.969214703425651e-05, "loss": 1.2381, "step": 3047 }, { "epoch": 1.1347860990564391, "grad_norm": 0.16665521264076233, "learning_rate": 1.969184773102847e-05, "loss": 1.2276, "step": 3048 }, { "epoch": 1.1351584042070482, "grad_norm": 0.17075350880622864, "learning_rate": 1.969154828465245e-05, "loss": 1.2314, "step": 3049 }, { "epoch": 1.1355307093576572, "grad_norm": 0.18020687997341156, "learning_rate": 1.969124869513287e-05, "loss": 1.2448, "step": 3050 }, { "epoch": 1.1359030145082663, "grad_norm": 0.17107802629470825, "learning_rate": 1.9690948962474152e-05, "loss": 1.2441, "step": 3051 }, { "epoch": 1.1362753196588753, "grad_norm": 0.1705516129732132, "learning_rate": 1.969064908668073e-05, "loss": 1.2429, "step": 3052 }, { "epoch": 1.1366476248094846, "grad_norm": 0.16781964898109436, "learning_rate": 1.9690349067757026e-05, "loss": 1.2203, "step": 3053 }, { "epoch": 1.1370199299600936, "grad_norm": 0.17965157330036163, "learning_rate": 1.9690048905707476e-05, "loss": 1.2352, "step": 3054 }, { "epoch": 1.1373922351107026, "grad_norm": 0.16421709954738617, "learning_rate": 1.9689748600536514e-05, "loss": 1.2138, "step": 3055 }, { "epoch": 1.1377645402613117, "grad_norm": 0.17832623422145844, "learning_rate": 1.9689448152248568e-05, "loss": 1.2288, "step": 3056 }, { "epoch": 1.1381368454119207, "grad_norm": 0.16908565163612366, "learning_rate": 1.9689147560848086e-05, "loss": 1.2158, "step": 3057 }, { "epoch": 1.1385091505625298, "grad_norm": 0.18035708367824554, "learning_rate": 1.9688846826339498e-05, "loss": 1.2292, "step": 3058 }, { "epoch": 1.1388814557131388, "grad_norm": 0.18107391893863678, "learning_rate": 1.9688545948727255e-05, "loss": 1.2215, "step": 3059 }, { "epoch": 1.1392537608637479, "grad_norm": 0.17275120317935944, "learning_rate": 1.9688244928015795e-05, "loss": 1.2262, "step": 3060 }, { "epoch": 1.139626066014357, "grad_norm": 0.17838861048221588, "learning_rate": 1.9687943764209564e-05, "loss": 1.2246, "step": 3061 }, { "epoch": 1.1399983711649662, "grad_norm": 0.17901575565338135, "learning_rate": 1.968764245731301e-05, "loss": 1.2299, "step": 3062 }, { "epoch": 1.1403706763155752, "grad_norm": 0.17386050522327423, "learning_rate": 1.9687341007330588e-05, "loss": 1.2214, "step": 3063 }, { "epoch": 1.1407429814661842, "grad_norm": 0.17698942124843597, "learning_rate": 1.9687039414266745e-05, "loss": 1.2363, "step": 3064 }, { "epoch": 1.1411152866167933, "grad_norm": 0.16935193538665771, "learning_rate": 1.968673767812594e-05, "loss": 1.2189, "step": 3065 }, { "epoch": 1.1414875917674023, "grad_norm": 0.18296149373054504, "learning_rate": 1.9686435798912624e-05, "loss": 1.2252, "step": 3066 }, { "epoch": 1.1418598969180114, "grad_norm": 0.17983867228031158, "learning_rate": 1.9686133776631263e-05, "loss": 1.2338, "step": 3067 }, { "epoch": 1.1422322020686204, "grad_norm": 0.17501530051231384, "learning_rate": 1.9685831611286312e-05, "loss": 1.2168, "step": 3068 }, { "epoch": 1.1426045072192297, "grad_norm": 0.17005561292171478, "learning_rate": 1.9685529302882237e-05, "loss": 1.2367, "step": 3069 }, { "epoch": 1.1429768123698387, "grad_norm": 0.1763869673013687, "learning_rate": 1.9685226851423502e-05, "loss": 1.2142, "step": 3070 }, { "epoch": 1.1433491175204478, "grad_norm": 0.17087772488594055, "learning_rate": 1.9684924256914574e-05, "loss": 1.2228, "step": 3071 }, { "epoch": 1.1437214226710568, "grad_norm": 0.167043536901474, "learning_rate": 1.968462151935992e-05, "loss": 1.2148, "step": 3072 }, { "epoch": 1.1440937278216659, "grad_norm": 0.17006975412368774, "learning_rate": 1.968431863876402e-05, "loss": 1.2376, "step": 3073 }, { "epoch": 1.144466032972275, "grad_norm": 0.175192192196846, "learning_rate": 1.9684015615131336e-05, "loss": 1.227, "step": 3074 }, { "epoch": 1.144838338122884, "grad_norm": 0.1716625988483429, "learning_rate": 1.968371244846635e-05, "loss": 1.2399, "step": 3075 }, { "epoch": 1.145210643273493, "grad_norm": 0.17771419882774353, "learning_rate": 1.9683409138773538e-05, "loss": 1.2311, "step": 3076 }, { "epoch": 1.145582948424102, "grad_norm": 0.17725834250450134, "learning_rate": 1.9683105686057383e-05, "loss": 1.2195, "step": 3077 }, { "epoch": 1.1459552535747113, "grad_norm": 0.17201553285121918, "learning_rate": 1.9682802090322365e-05, "loss": 1.2308, "step": 3078 }, { "epoch": 1.1463275587253203, "grad_norm": 0.17508849501609802, "learning_rate": 1.968249835157297e-05, "loss": 1.2278, "step": 3079 }, { "epoch": 1.1466998638759294, "grad_norm": 0.16712623834609985, "learning_rate": 1.9682194469813675e-05, "loss": 1.233, "step": 3080 }, { "epoch": 1.1470721690265384, "grad_norm": 0.1693454533815384, "learning_rate": 1.968189044504898e-05, "loss": 1.2292, "step": 3081 }, { "epoch": 1.1474444741771475, "grad_norm": 0.17314909398555756, "learning_rate": 1.968158627728337e-05, "loss": 1.2371, "step": 3082 }, { "epoch": 1.1478167793277565, "grad_norm": 0.1748846024274826, "learning_rate": 1.968128196652134e-05, "loss": 1.2337, "step": 3083 }, { "epoch": 1.1481890844783655, "grad_norm": 0.16931162774562836, "learning_rate": 1.968097751276738e-05, "loss": 1.2321, "step": 3084 }, { "epoch": 1.1485613896289746, "grad_norm": 0.17732059955596924, "learning_rate": 1.9680672916025993e-05, "loss": 1.2152, "step": 3085 }, { "epoch": 1.1489336947795836, "grad_norm": 0.17309656739234924, "learning_rate": 1.9680368176301673e-05, "loss": 1.235, "step": 3086 }, { "epoch": 1.149305999930193, "grad_norm": 0.17718668282032013, "learning_rate": 1.968006329359892e-05, "loss": 1.2288, "step": 3087 }, { "epoch": 1.149678305080802, "grad_norm": 0.17663928866386414, "learning_rate": 1.9679758267922243e-05, "loss": 1.2314, "step": 3088 }, { "epoch": 1.150050610231411, "grad_norm": 0.16871769726276398, "learning_rate": 1.9679453099276142e-05, "loss": 1.2292, "step": 3089 }, { "epoch": 1.15042291538202, "grad_norm": 0.177023783326149, "learning_rate": 1.9679147787665128e-05, "loss": 1.211, "step": 3090 }, { "epoch": 1.150795220532629, "grad_norm": 0.17566607892513275, "learning_rate": 1.9678842333093708e-05, "loss": 1.2255, "step": 3091 }, { "epoch": 1.151167525683238, "grad_norm": 0.17927557229995728, "learning_rate": 1.9678536735566393e-05, "loss": 1.2353, "step": 3092 }, { "epoch": 1.1515398308338471, "grad_norm": 0.15841099619865417, "learning_rate": 1.9678230995087696e-05, "loss": 1.2296, "step": 3093 }, { "epoch": 1.1519121359844562, "grad_norm": 0.17372049391269684, "learning_rate": 1.9677925111662136e-05, "loss": 1.2199, "step": 3094 }, { "epoch": 1.1522844411350652, "grad_norm": 0.17242653667926788, "learning_rate": 1.9677619085294234e-05, "loss": 1.223, "step": 3095 }, { "epoch": 1.1526567462856745, "grad_norm": 0.18381370604038239, "learning_rate": 1.96773129159885e-05, "loss": 1.2372, "step": 3096 }, { "epoch": 1.1530290514362835, "grad_norm": 0.1743118166923523, "learning_rate": 1.9677006603749463e-05, "loss": 1.2133, "step": 3097 }, { "epoch": 1.1534013565868926, "grad_norm": 0.18232518434524536, "learning_rate": 1.967670014858165e-05, "loss": 1.2328, "step": 3098 }, { "epoch": 1.1537736617375016, "grad_norm": 0.17845921218395233, "learning_rate": 1.9676393550489576e-05, "loss": 1.2349, "step": 3099 }, { "epoch": 1.1541459668881107, "grad_norm": 0.18814215064048767, "learning_rate": 1.9676086809477778e-05, "loss": 1.2375, "step": 3100 }, { "epoch": 1.1545182720387197, "grad_norm": 0.17665289342403412, "learning_rate": 1.9675779925550785e-05, "loss": 1.216, "step": 3101 }, { "epoch": 1.1548905771893287, "grad_norm": 0.1747797727584839, "learning_rate": 1.9675472898713136e-05, "loss": 1.2291, "step": 3102 }, { "epoch": 1.1552628823399378, "grad_norm": 0.17490416765213013, "learning_rate": 1.9675165728969353e-05, "loss": 1.2187, "step": 3103 }, { "epoch": 1.1556351874905468, "grad_norm": 0.17718598246574402, "learning_rate": 1.967485841632398e-05, "loss": 1.2374, "step": 3104 }, { "epoch": 1.156007492641156, "grad_norm": 0.17890246212482452, "learning_rate": 1.967455096078156e-05, "loss": 1.2184, "step": 3105 }, { "epoch": 1.1563797977917651, "grad_norm": 0.17774467170238495, "learning_rate": 1.9674243362346624e-05, "loss": 1.2251, "step": 3106 }, { "epoch": 1.1567521029423742, "grad_norm": 0.1743806153535843, "learning_rate": 1.9673935621023724e-05, "loss": 1.2357, "step": 3107 }, { "epoch": 1.1571244080929832, "grad_norm": 0.1773483157157898, "learning_rate": 1.96736277368174e-05, "loss": 1.2338, "step": 3108 }, { "epoch": 1.1574967132435923, "grad_norm": 0.171565443277359, "learning_rate": 1.9673319709732205e-05, "loss": 1.2412, "step": 3109 }, { "epoch": 1.1578690183942013, "grad_norm": 0.16667889058589935, "learning_rate": 1.967301153977268e-05, "loss": 1.227, "step": 3110 }, { "epoch": 1.1582413235448104, "grad_norm": 0.18110859394073486, "learning_rate": 1.9672703226943383e-05, "loss": 1.247, "step": 3111 }, { "epoch": 1.1586136286954194, "grad_norm": 0.18011483550071716, "learning_rate": 1.9672394771248867e-05, "loss": 1.23, "step": 3112 }, { "epoch": 1.1589859338460284, "grad_norm": 0.17470338940620422, "learning_rate": 1.967208617269369e-05, "loss": 1.2272, "step": 3113 }, { "epoch": 1.1593582389966377, "grad_norm": 0.16720262169837952, "learning_rate": 1.9671777431282404e-05, "loss": 1.2286, "step": 3114 }, { "epoch": 1.1597305441472467, "grad_norm": 0.16974225640296936, "learning_rate": 1.9671468547019575e-05, "loss": 1.2358, "step": 3115 }, { "epoch": 1.1601028492978558, "grad_norm": 0.17322564125061035, "learning_rate": 1.9671159519909758e-05, "loss": 1.2196, "step": 3116 }, { "epoch": 1.1604751544484648, "grad_norm": 0.17480473220348358, "learning_rate": 1.9670850349957525e-05, "loss": 1.2199, "step": 3117 }, { "epoch": 1.1608474595990739, "grad_norm": 0.17341922223567963, "learning_rate": 1.967054103716744e-05, "loss": 1.2197, "step": 3118 }, { "epoch": 1.161219764749683, "grad_norm": 0.16872482001781464, "learning_rate": 1.9670231581544068e-05, "loss": 1.23, "step": 3119 }, { "epoch": 1.161592069900292, "grad_norm": 0.16933618485927582, "learning_rate": 1.9669921983091985e-05, "loss": 1.2202, "step": 3120 }, { "epoch": 1.161964375050901, "grad_norm": 0.164505273103714, "learning_rate": 1.966961224181576e-05, "loss": 1.2398, "step": 3121 }, { "epoch": 1.16233668020151, "grad_norm": 0.1745142638683319, "learning_rate": 1.9669302357719968e-05, "loss": 1.2223, "step": 3122 }, { "epoch": 1.1627089853521193, "grad_norm": 0.17189304530620575, "learning_rate": 1.9668992330809187e-05, "loss": 1.234, "step": 3123 }, { "epoch": 1.1630812905027283, "grad_norm": 0.16852304339408875, "learning_rate": 1.9668682161088e-05, "loss": 1.219, "step": 3124 }, { "epoch": 1.1634535956533374, "grad_norm": 0.17335280776023865, "learning_rate": 1.966837184856098e-05, "loss": 1.2107, "step": 3125 }, { "epoch": 1.1638259008039464, "grad_norm": 0.17474551498889923, "learning_rate": 1.9668061393232717e-05, "loss": 1.2285, "step": 3126 }, { "epoch": 1.1641982059545555, "grad_norm": 0.16944244503974915, "learning_rate": 1.9667750795107793e-05, "loss": 1.2281, "step": 3127 }, { "epoch": 1.1645705111051645, "grad_norm": 0.1715431809425354, "learning_rate": 1.9667440054190793e-05, "loss": 1.2402, "step": 3128 }, { "epoch": 1.1649428162557736, "grad_norm": 0.1723116636276245, "learning_rate": 1.9667129170486315e-05, "loss": 1.2295, "step": 3129 }, { "epoch": 1.1653151214063828, "grad_norm": 0.172069251537323, "learning_rate": 1.9666818143998944e-05, "loss": 1.2467, "step": 3130 }, { "epoch": 1.1656874265569919, "grad_norm": 0.16813892126083374, "learning_rate": 1.9666506974733276e-05, "loss": 1.2321, "step": 3131 }, { "epoch": 1.166059731707601, "grad_norm": 0.17631420493125916, "learning_rate": 1.9666195662693907e-05, "loss": 1.2281, "step": 3132 }, { "epoch": 1.16643203685821, "grad_norm": 0.1704123616218567, "learning_rate": 1.9665884207885437e-05, "loss": 1.2453, "step": 3133 }, { "epoch": 1.166804342008819, "grad_norm": 0.17325717210769653, "learning_rate": 1.966557261031246e-05, "loss": 1.2172, "step": 3134 }, { "epoch": 1.167176647159428, "grad_norm": 0.16483458876609802, "learning_rate": 1.9665260869979585e-05, "loss": 1.2209, "step": 3135 }, { "epoch": 1.167548952310037, "grad_norm": 0.17396704852581024, "learning_rate": 1.966494898689141e-05, "loss": 1.2164, "step": 3136 }, { "epoch": 1.1679212574606461, "grad_norm": 0.18191742897033691, "learning_rate": 1.9664636961052547e-05, "loss": 1.2402, "step": 3137 }, { "epoch": 1.1682935626112552, "grad_norm": 0.18091581761837006, "learning_rate": 1.9664324792467603e-05, "loss": 1.2276, "step": 3138 }, { "epoch": 1.1686658677618644, "grad_norm": 0.17447642982006073, "learning_rate": 1.9664012481141185e-05, "loss": 1.2262, "step": 3139 }, { "epoch": 1.1690381729124735, "grad_norm": 0.17776253819465637, "learning_rate": 1.9663700027077915e-05, "loss": 1.2252, "step": 3140 }, { "epoch": 1.1694104780630825, "grad_norm": 0.16708004474639893, "learning_rate": 1.96633874302824e-05, "loss": 1.2365, "step": 3141 }, { "epoch": 1.1697827832136916, "grad_norm": 0.17193494737148285, "learning_rate": 1.9663074690759255e-05, "loss": 1.2224, "step": 3142 }, { "epoch": 1.1701550883643006, "grad_norm": 0.16979169845581055, "learning_rate": 1.9662761808513105e-05, "loss": 1.2222, "step": 3143 }, { "epoch": 1.1705273935149096, "grad_norm": 0.17003171145915985, "learning_rate": 1.9662448783548575e-05, "loss": 1.222, "step": 3144 }, { "epoch": 1.1708996986655187, "grad_norm": 0.17789965867996216, "learning_rate": 1.966213561587028e-05, "loss": 1.2306, "step": 3145 }, { "epoch": 1.1712720038161277, "grad_norm": 0.17377351224422455, "learning_rate": 1.9661822305482845e-05, "loss": 1.2268, "step": 3146 }, { "epoch": 1.1716443089667368, "grad_norm": 0.17414116859436035, "learning_rate": 1.96615088523909e-05, "loss": 1.2161, "step": 3147 }, { "epoch": 1.172016614117346, "grad_norm": 0.17168028652668, "learning_rate": 1.966119525659908e-05, "loss": 1.2287, "step": 3148 }, { "epoch": 1.172388919267955, "grad_norm": 0.17371198534965515, "learning_rate": 1.966088151811201e-05, "loss": 1.2552, "step": 3149 }, { "epoch": 1.1727612244185641, "grad_norm": 0.17159205675125122, "learning_rate": 1.966056763693433e-05, "loss": 1.2243, "step": 3150 }, { "epoch": 1.1731335295691732, "grad_norm": 0.1716485470533371, "learning_rate": 1.9660253613070667e-05, "loss": 1.2413, "step": 3151 }, { "epoch": 1.1735058347197822, "grad_norm": 0.17491815984249115, "learning_rate": 1.9659939446525668e-05, "loss": 1.2377, "step": 3152 }, { "epoch": 1.1738781398703912, "grad_norm": 0.166838139295578, "learning_rate": 1.965962513730397e-05, "loss": 1.2133, "step": 3153 }, { "epoch": 1.1742504450210003, "grad_norm": 0.16914509236812592, "learning_rate": 1.9659310685410212e-05, "loss": 1.2264, "step": 3154 }, { "epoch": 1.1746227501716093, "grad_norm": 0.17503736913204193, "learning_rate": 1.9658996090849042e-05, "loss": 1.2341, "step": 3155 }, { "epoch": 1.1749950553222184, "grad_norm": 0.17667143046855927, "learning_rate": 1.9658681353625105e-05, "loss": 1.231, "step": 3156 }, { "epoch": 1.1753673604728276, "grad_norm": 0.17044112086296082, "learning_rate": 1.9658366473743052e-05, "loss": 1.2339, "step": 3157 }, { "epoch": 1.1757396656234367, "grad_norm": 0.1663392037153244, "learning_rate": 1.9658051451207536e-05, "loss": 1.2352, "step": 3158 }, { "epoch": 1.1761119707740457, "grad_norm": 0.18989932537078857, "learning_rate": 1.96577362860232e-05, "loss": 1.2266, "step": 3159 }, { "epoch": 1.1764842759246548, "grad_norm": 0.18411101400852203, "learning_rate": 1.965742097819471e-05, "loss": 1.2343, "step": 3160 }, { "epoch": 1.1768565810752638, "grad_norm": 0.17134809494018555, "learning_rate": 1.9657105527726716e-05, "loss": 1.2191, "step": 3161 }, { "epoch": 1.1772288862258729, "grad_norm": 0.20970846712589264, "learning_rate": 1.965678993462388e-05, "loss": 1.2202, "step": 3162 }, { "epoch": 1.177601191376482, "grad_norm": 0.20733484625816345, "learning_rate": 1.9656474198890864e-05, "loss": 1.2263, "step": 3163 }, { "epoch": 1.177973496527091, "grad_norm": 0.18368762731552124, "learning_rate": 1.965615832053233e-05, "loss": 1.2363, "step": 3164 }, { "epoch": 1.1783458016777, "grad_norm": 0.17400404810905457, "learning_rate": 1.9655842299552938e-05, "loss": 1.2301, "step": 3165 }, { "epoch": 1.1787181068283092, "grad_norm": 0.19013762474060059, "learning_rate": 1.9655526135957366e-05, "loss": 1.2316, "step": 3166 }, { "epoch": 1.1790904119789183, "grad_norm": 0.17968544363975525, "learning_rate": 1.965520982975028e-05, "loss": 1.2308, "step": 3167 }, { "epoch": 1.1794627171295273, "grad_norm": 0.1678808182477951, "learning_rate": 1.965489338093635e-05, "loss": 1.2178, "step": 3168 }, { "epoch": 1.1798350222801364, "grad_norm": 0.18025662004947662, "learning_rate": 1.9654576789520248e-05, "loss": 1.2261, "step": 3169 }, { "epoch": 1.1802073274307454, "grad_norm": 0.1803680807352066, "learning_rate": 1.965426005550665e-05, "loss": 1.2291, "step": 3170 }, { "epoch": 1.1805796325813545, "grad_norm": 0.1786152571439743, "learning_rate": 1.9653943178900246e-05, "loss": 1.2508, "step": 3171 }, { "epoch": 1.1809519377319635, "grad_norm": 0.17993713915348053, "learning_rate": 1.96536261597057e-05, "loss": 1.224, "step": 3172 }, { "epoch": 1.1813242428825725, "grad_norm": 0.18419890105724335, "learning_rate": 1.9653308997927705e-05, "loss": 1.236, "step": 3173 }, { "epoch": 1.1816965480331816, "grad_norm": 0.17795118689537048, "learning_rate": 1.9652991693570938e-05, "loss": 1.2217, "step": 3174 }, { "epoch": 1.1820688531837908, "grad_norm": 0.17739805579185486, "learning_rate": 1.965267424664009e-05, "loss": 1.2336, "step": 3175 }, { "epoch": 1.1824411583343999, "grad_norm": 0.17250627279281616, "learning_rate": 1.9652356657139854e-05, "loss": 1.2117, "step": 3176 }, { "epoch": 1.182813463485009, "grad_norm": 0.1707741916179657, "learning_rate": 1.9652038925074916e-05, "loss": 1.2457, "step": 3177 }, { "epoch": 1.183185768635618, "grad_norm": 0.17577117681503296, "learning_rate": 1.9651721050449964e-05, "loss": 1.2213, "step": 3178 }, { "epoch": 1.183558073786227, "grad_norm": 0.1711239516735077, "learning_rate": 1.96514030332697e-05, "loss": 1.2081, "step": 3179 }, { "epoch": 1.183930378936836, "grad_norm": 0.16911734640598297, "learning_rate": 1.9651084873538816e-05, "loss": 1.2082, "step": 3180 }, { "epoch": 1.184302684087445, "grad_norm": 0.1710880696773529, "learning_rate": 1.965076657126202e-05, "loss": 1.2193, "step": 3181 }, { "epoch": 1.1846749892380541, "grad_norm": 0.1759801208972931, "learning_rate": 1.9650448126444003e-05, "loss": 1.2261, "step": 3182 }, { "epoch": 1.1850472943886632, "grad_norm": 0.17261157929897308, "learning_rate": 1.9650129539089477e-05, "loss": 1.2322, "step": 3183 }, { "epoch": 1.1854195995392725, "grad_norm": 0.17363622784614563, "learning_rate": 1.9649810809203138e-05, "loss": 1.2263, "step": 3184 }, { "epoch": 1.1857919046898815, "grad_norm": 0.17991025745868683, "learning_rate": 1.9649491936789702e-05, "loss": 1.224, "step": 3185 }, { "epoch": 1.1861642098404905, "grad_norm": 0.17815767228603363, "learning_rate": 1.9649172921853873e-05, "loss": 1.2289, "step": 3186 }, { "epoch": 1.1865365149910996, "grad_norm": 0.1795058697462082, "learning_rate": 1.964885376440037e-05, "loss": 1.2335, "step": 3187 }, { "epoch": 1.1869088201417086, "grad_norm": 0.16902460157871246, "learning_rate": 1.9648534464433897e-05, "loss": 1.2369, "step": 3188 }, { "epoch": 1.1872811252923177, "grad_norm": 0.19541476666927338, "learning_rate": 1.964821502195918e-05, "loss": 1.2325, "step": 3189 }, { "epoch": 1.1876534304429267, "grad_norm": 0.1734236776828766, "learning_rate": 1.964789543698093e-05, "loss": 1.2162, "step": 3190 }, { "epoch": 1.188025735593536, "grad_norm": 0.26567402482032776, "learning_rate": 1.9647575709503873e-05, "loss": 1.2229, "step": 3191 }, { "epoch": 1.188398040744145, "grad_norm": 0.1733405888080597, "learning_rate": 1.9647255839532726e-05, "loss": 1.2366, "step": 3192 }, { "epoch": 1.188770345894754, "grad_norm": 0.17245125770568848, "learning_rate": 1.9646935827072215e-05, "loss": 1.2173, "step": 3193 }, { "epoch": 1.189142651045363, "grad_norm": 0.16756393015384674, "learning_rate": 1.9646615672127068e-05, "loss": 1.2291, "step": 3194 }, { "epoch": 1.1895149561959721, "grad_norm": 0.16170787811279297, "learning_rate": 1.9646295374702014e-05, "loss": 1.2338, "step": 3195 }, { "epoch": 1.1898872613465812, "grad_norm": 0.1687183678150177, "learning_rate": 1.964597493480178e-05, "loss": 1.218, "step": 3196 }, { "epoch": 1.1902595664971902, "grad_norm": 0.16018569469451904, "learning_rate": 1.9645654352431105e-05, "loss": 1.2158, "step": 3197 }, { "epoch": 1.1906318716477993, "grad_norm": 0.1713458150625229, "learning_rate": 1.9645333627594717e-05, "loss": 1.2331, "step": 3198 }, { "epoch": 1.1910041767984083, "grad_norm": 0.1839626431465149, "learning_rate": 1.9645012760297358e-05, "loss": 1.2246, "step": 3199 }, { "epoch": 1.1913764819490176, "grad_norm": 0.16510577499866486, "learning_rate": 1.964469175054377e-05, "loss": 1.2169, "step": 3200 }, { "epoch": 1.1917487870996266, "grad_norm": 0.16351880133152008, "learning_rate": 1.9644370598338686e-05, "loss": 1.237, "step": 3201 }, { "epoch": 1.1921210922502357, "grad_norm": 0.16644950211048126, "learning_rate": 1.9644049303686852e-05, "loss": 1.2395, "step": 3202 }, { "epoch": 1.1924933974008447, "grad_norm": 0.17122626304626465, "learning_rate": 1.9643727866593015e-05, "loss": 1.2404, "step": 3203 }, { "epoch": 1.1928657025514537, "grad_norm": 0.1687421053647995, "learning_rate": 1.9643406287061924e-05, "loss": 1.227, "step": 3204 }, { "epoch": 1.1932380077020628, "grad_norm": 0.17786358296871185, "learning_rate": 1.964308456509833e-05, "loss": 1.2271, "step": 3205 }, { "epoch": 1.1936103128526718, "grad_norm": 0.16544969379901886, "learning_rate": 1.964276270070698e-05, "loss": 1.2288, "step": 3206 }, { "epoch": 1.1939826180032809, "grad_norm": 0.1842457503080368, "learning_rate": 1.964244069389263e-05, "loss": 1.2244, "step": 3207 }, { "epoch": 1.19435492315389, "grad_norm": 0.16619916260242462, "learning_rate": 1.9642118544660036e-05, "loss": 1.2357, "step": 3208 }, { "epoch": 1.1947272283044992, "grad_norm": 0.1687583178281784, "learning_rate": 1.9641796253013957e-05, "loss": 1.2182, "step": 3209 }, { "epoch": 1.1950995334551082, "grad_norm": 0.17150536179542542, "learning_rate": 1.9641473818959153e-05, "loss": 1.237, "step": 3210 }, { "epoch": 1.1954718386057173, "grad_norm": 0.16135509312152863, "learning_rate": 1.9641151242500383e-05, "loss": 1.2405, "step": 3211 }, { "epoch": 1.1958441437563263, "grad_norm": 0.17465688288211823, "learning_rate": 1.9640828523642415e-05, "loss": 1.2145, "step": 3212 }, { "epoch": 1.1962164489069353, "grad_norm": 0.17879217863082886, "learning_rate": 1.9640505662390017e-05, "loss": 1.223, "step": 3213 }, { "epoch": 1.1965887540575444, "grad_norm": 0.1738523691892624, "learning_rate": 1.9640182658747955e-05, "loss": 1.2235, "step": 3214 }, { "epoch": 1.1969610592081534, "grad_norm": 0.17595960199832916, "learning_rate": 1.9639859512721e-05, "loss": 1.2419, "step": 3215 }, { "epoch": 1.1973333643587625, "grad_norm": 0.17298002541065216, "learning_rate": 1.963953622431392e-05, "loss": 1.226, "step": 3216 }, { "epoch": 1.1977056695093715, "grad_norm": 0.169357568025589, "learning_rate": 1.96392127935315e-05, "loss": 1.2276, "step": 3217 }, { "epoch": 1.1980779746599808, "grad_norm": 0.17965613305568695, "learning_rate": 1.963888922037851e-05, "loss": 1.2232, "step": 3218 }, { "epoch": 1.1984502798105898, "grad_norm": 0.16693131625652313, "learning_rate": 1.9638565504859734e-05, "loss": 1.2187, "step": 3219 }, { "epoch": 1.1988225849611989, "grad_norm": 0.16887417435646057, "learning_rate": 1.9638241646979947e-05, "loss": 1.2175, "step": 3220 }, { "epoch": 1.199194890111808, "grad_norm": 0.17603495717048645, "learning_rate": 1.9637917646743937e-05, "loss": 1.2293, "step": 3221 }, { "epoch": 1.199567195262417, "grad_norm": 0.1772565096616745, "learning_rate": 1.963759350415649e-05, "loss": 1.2246, "step": 3222 }, { "epoch": 1.199939500413026, "grad_norm": 0.16490373015403748, "learning_rate": 1.963726921922239e-05, "loss": 1.2226, "step": 3223 }, { "epoch": 1.200311805563635, "grad_norm": 0.17405438423156738, "learning_rate": 1.963694479194643e-05, "loss": 1.2418, "step": 3224 }, { "epoch": 1.200684110714244, "grad_norm": 0.1792418658733368, "learning_rate": 1.9636620222333398e-05, "loss": 1.2288, "step": 3225 }, { "epoch": 1.2010564158648531, "grad_norm": 0.17188262939453125, "learning_rate": 1.963629551038809e-05, "loss": 1.2226, "step": 3226 }, { "epoch": 1.2014287210154624, "grad_norm": 0.1634385883808136, "learning_rate": 1.9635970656115303e-05, "loss": 1.2126, "step": 3227 }, { "epoch": 1.2018010261660714, "grad_norm": 0.17879730463027954, "learning_rate": 1.9635645659519835e-05, "loss": 1.226, "step": 3228 }, { "epoch": 1.2021733313166805, "grad_norm": 0.19251345098018646, "learning_rate": 1.9635320520606483e-05, "loss": 1.228, "step": 3229 }, { "epoch": 1.2025456364672895, "grad_norm": 0.16735433042049408, "learning_rate": 1.9634995239380056e-05, "loss": 1.2525, "step": 3230 }, { "epoch": 1.2029179416178986, "grad_norm": 0.17983882129192352, "learning_rate": 1.9634669815845352e-05, "loss": 1.2331, "step": 3231 }, { "epoch": 1.2032902467685076, "grad_norm": 0.17625576257705688, "learning_rate": 1.9634344250007175e-05, "loss": 1.2252, "step": 3232 }, { "epoch": 1.2036625519191166, "grad_norm": 0.1711379885673523, "learning_rate": 1.9634018541870342e-05, "loss": 1.2268, "step": 3233 }, { "epoch": 1.2040348570697257, "grad_norm": 0.17250944674015045, "learning_rate": 1.9633692691439662e-05, "loss": 1.214, "step": 3234 }, { "epoch": 1.2044071622203347, "grad_norm": 0.17136184871196747, "learning_rate": 1.963336669871994e-05, "loss": 1.2315, "step": 3235 }, { "epoch": 1.204779467370944, "grad_norm": 0.17495617270469666, "learning_rate": 1.9633040563716e-05, "loss": 1.2198, "step": 3236 }, { "epoch": 1.205151772521553, "grad_norm": 0.16628341376781464, "learning_rate": 1.9632714286432656e-05, "loss": 1.2302, "step": 3237 }, { "epoch": 1.205524077672162, "grad_norm": 0.17815084755420685, "learning_rate": 1.963238786687472e-05, "loss": 1.2172, "step": 3238 }, { "epoch": 1.2058963828227711, "grad_norm": 0.18150483071804047, "learning_rate": 1.9632061305047028e-05, "loss": 1.2399, "step": 3239 }, { "epoch": 1.2062686879733802, "grad_norm": 0.16711880266666412, "learning_rate": 1.9631734600954392e-05, "loss": 1.2162, "step": 3240 }, { "epoch": 1.2066409931239892, "grad_norm": 0.16697241365909576, "learning_rate": 1.963140775460164e-05, "loss": 1.2246, "step": 3241 }, { "epoch": 1.2070132982745982, "grad_norm": 0.17977376282215118, "learning_rate": 1.96310807659936e-05, "loss": 1.2305, "step": 3242 }, { "epoch": 1.2073856034252075, "grad_norm": 0.1734996736049652, "learning_rate": 1.9630753635135102e-05, "loss": 1.229, "step": 3243 }, { "epoch": 1.2077579085758163, "grad_norm": 0.17894618213176727, "learning_rate": 1.9630426362030978e-05, "loss": 1.2304, "step": 3244 }, { "epoch": 1.2081302137264256, "grad_norm": 0.17757487297058105, "learning_rate": 1.963009894668606e-05, "loss": 1.2192, "step": 3245 }, { "epoch": 1.2085025188770346, "grad_norm": 0.17967943847179413, "learning_rate": 1.9629771389105185e-05, "loss": 1.2279, "step": 3246 }, { "epoch": 1.2088748240276437, "grad_norm": 0.18884268403053284, "learning_rate": 1.962944368929319e-05, "loss": 1.2305, "step": 3247 }, { "epoch": 1.2092471291782527, "grad_norm": 0.17970938980579376, "learning_rate": 1.9629115847254916e-05, "loss": 1.2085, "step": 3248 }, { "epoch": 1.2096194343288618, "grad_norm": 0.16718293726444244, "learning_rate": 1.9628787862995207e-05, "loss": 1.2305, "step": 3249 }, { "epoch": 1.2099917394794708, "grad_norm": 0.17763985693454742, "learning_rate": 1.9628459736518907e-05, "loss": 1.2271, "step": 3250 }, { "epoch": 1.2103640446300798, "grad_norm": 0.1723608523607254, "learning_rate": 1.9628131467830856e-05, "loss": 1.2224, "step": 3251 }, { "epoch": 1.2107363497806891, "grad_norm": 0.16534040868282318, "learning_rate": 1.9627803056935912e-05, "loss": 1.2223, "step": 3252 }, { "epoch": 1.2111086549312982, "grad_norm": 0.1700848489999771, "learning_rate": 1.9627474503838918e-05, "loss": 1.2291, "step": 3253 }, { "epoch": 1.2114809600819072, "grad_norm": 0.18612729012966156, "learning_rate": 1.9627145808544733e-05, "loss": 1.2119, "step": 3254 }, { "epoch": 1.2118532652325162, "grad_norm": 0.16971179842948914, "learning_rate": 1.9626816971058205e-05, "loss": 1.2249, "step": 3255 }, { "epoch": 1.2122255703831253, "grad_norm": 0.1748899221420288, "learning_rate": 1.9626487991384194e-05, "loss": 1.2136, "step": 3256 }, { "epoch": 1.2125978755337343, "grad_norm": 0.17875608801841736, "learning_rate": 1.9626158869527564e-05, "loss": 1.2189, "step": 3257 }, { "epoch": 1.2129701806843434, "grad_norm": 0.17736589908599854, "learning_rate": 1.962582960549317e-05, "loss": 1.2349, "step": 3258 }, { "epoch": 1.2133424858349524, "grad_norm": 0.17571696639060974, "learning_rate": 1.9625500199285874e-05, "loss": 1.2216, "step": 3259 }, { "epoch": 1.2137147909855615, "grad_norm": 0.1696479171514511, "learning_rate": 1.9625170650910547e-05, "loss": 1.2141, "step": 3260 }, { "epoch": 1.2140870961361707, "grad_norm": 0.17148007452487946, "learning_rate": 1.9624840960372053e-05, "loss": 1.2426, "step": 3261 }, { "epoch": 1.2144594012867798, "grad_norm": 0.17675496637821198, "learning_rate": 1.962451112767526e-05, "loss": 1.2278, "step": 3262 }, { "epoch": 1.2148317064373888, "grad_norm": 0.17009492218494415, "learning_rate": 1.9624181152825044e-05, "loss": 1.2285, "step": 3263 }, { "epoch": 1.2152040115879978, "grad_norm": 0.1749381422996521, "learning_rate": 1.9623851035826274e-05, "loss": 1.2182, "step": 3264 }, { "epoch": 1.2155763167386069, "grad_norm": 0.17395372688770294, "learning_rate": 1.962352077668383e-05, "loss": 1.2366, "step": 3265 }, { "epoch": 1.215948621889216, "grad_norm": 0.17901504039764404, "learning_rate": 1.962319037540259e-05, "loss": 1.2295, "step": 3266 }, { "epoch": 1.216320927039825, "grad_norm": 0.17020705342292786, "learning_rate": 1.962285983198743e-05, "loss": 1.239, "step": 3267 }, { "epoch": 1.216693232190434, "grad_norm": 0.17261864244937897, "learning_rate": 1.9622529146443235e-05, "loss": 1.2242, "step": 3268 }, { "epoch": 1.217065537341043, "grad_norm": 0.17174561321735382, "learning_rate": 1.9622198318774884e-05, "loss": 1.2337, "step": 3269 }, { "epoch": 1.2174378424916523, "grad_norm": 0.1685078740119934, "learning_rate": 1.9621867348987273e-05, "loss": 1.2328, "step": 3270 }, { "epoch": 1.2178101476422614, "grad_norm": 0.17111830413341522, "learning_rate": 1.9621536237085285e-05, "loss": 1.2164, "step": 3271 }, { "epoch": 1.2181824527928704, "grad_norm": 0.1732606589794159, "learning_rate": 1.9621204983073806e-05, "loss": 1.2314, "step": 3272 }, { "epoch": 1.2185547579434794, "grad_norm": 0.16949288547039032, "learning_rate": 1.9620873586957735e-05, "loss": 1.2231, "step": 3273 }, { "epoch": 1.2189270630940885, "grad_norm": 0.17130224406719208, "learning_rate": 1.962054204874197e-05, "loss": 1.2315, "step": 3274 }, { "epoch": 1.2192993682446975, "grad_norm": 0.16252587735652924, "learning_rate": 1.96202103684314e-05, "loss": 1.2077, "step": 3275 }, { "epoch": 1.2196716733953066, "grad_norm": 0.1738114356994629, "learning_rate": 1.961987854603092e-05, "loss": 1.2344, "step": 3276 }, { "epoch": 1.2200439785459156, "grad_norm": 0.16991567611694336, "learning_rate": 1.9619546581545445e-05, "loss": 1.2061, "step": 3277 }, { "epoch": 1.2204162836965247, "grad_norm": 0.17030175030231476, "learning_rate": 1.961921447497987e-05, "loss": 1.2193, "step": 3278 }, { "epoch": 1.220788588847134, "grad_norm": 0.16493943333625793, "learning_rate": 1.9618882226339096e-05, "loss": 1.2283, "step": 3279 }, { "epoch": 1.221160893997743, "grad_norm": 0.17286229133605957, "learning_rate": 1.961854983562804e-05, "loss": 1.2392, "step": 3280 }, { "epoch": 1.221533199148352, "grad_norm": 0.1747228503227234, "learning_rate": 1.9618217302851607e-05, "loss": 1.2216, "step": 3281 }, { "epoch": 1.221905504298961, "grad_norm": 0.17101289331912994, "learning_rate": 1.9617884628014707e-05, "loss": 1.2203, "step": 3282 }, { "epoch": 1.22227780944957, "grad_norm": 0.178693488240242, "learning_rate": 1.9617551811122253e-05, "loss": 1.2378, "step": 3283 }, { "epoch": 1.2226501146001791, "grad_norm": 0.17429909110069275, "learning_rate": 1.9617218852179162e-05, "loss": 1.2383, "step": 3284 }, { "epoch": 1.2230224197507882, "grad_norm": 0.17724157869815826, "learning_rate": 1.9616885751190354e-05, "loss": 1.2176, "step": 3285 }, { "epoch": 1.2233947249013972, "grad_norm": 0.1739131063222885, "learning_rate": 1.9616552508160747e-05, "loss": 1.2302, "step": 3286 }, { "epoch": 1.2237670300520063, "grad_norm": 0.1718931496143341, "learning_rate": 1.9616219123095265e-05, "loss": 1.2336, "step": 3287 }, { "epoch": 1.2241393352026155, "grad_norm": 0.17403414845466614, "learning_rate": 1.9615885595998825e-05, "loss": 1.2193, "step": 3288 }, { "epoch": 1.2245116403532246, "grad_norm": 0.16218788921833038, "learning_rate": 1.9615551926876358e-05, "loss": 1.2239, "step": 3289 }, { "epoch": 1.2248839455038336, "grad_norm": 0.17476886510849, "learning_rate": 1.9615218115732796e-05, "loss": 1.2332, "step": 3290 }, { "epoch": 1.2252562506544427, "grad_norm": 0.18384511768817902, "learning_rate": 1.9614884162573067e-05, "loss": 1.2243, "step": 3291 }, { "epoch": 1.2256285558050517, "grad_norm": 0.174949049949646, "learning_rate": 1.96145500674021e-05, "loss": 1.2188, "step": 3292 }, { "epoch": 1.2260008609556607, "grad_norm": 0.18955911695957184, "learning_rate": 1.9614215830224832e-05, "loss": 1.2269, "step": 3293 }, { "epoch": 1.2263731661062698, "grad_norm": 0.17960596084594727, "learning_rate": 1.96138814510462e-05, "loss": 1.2313, "step": 3294 }, { "epoch": 1.2267454712568788, "grad_norm": 0.18217206001281738, "learning_rate": 1.961354692987114e-05, "loss": 1.2345, "step": 3295 }, { "epoch": 1.2271177764074879, "grad_norm": 0.1756083369255066, "learning_rate": 1.9613212266704597e-05, "loss": 1.2195, "step": 3296 }, { "epoch": 1.2274900815580971, "grad_norm": 0.16914978623390198, "learning_rate": 1.9612877461551516e-05, "loss": 1.2131, "step": 3297 }, { "epoch": 1.2278623867087062, "grad_norm": 0.1753888875246048, "learning_rate": 1.9612542514416835e-05, "loss": 1.24, "step": 3298 }, { "epoch": 1.2282346918593152, "grad_norm": 0.17672811448574066, "learning_rate": 1.96122074253055e-05, "loss": 1.222, "step": 3299 }, { "epoch": 1.2286069970099243, "grad_norm": 0.17020326852798462, "learning_rate": 1.961187219422247e-05, "loss": 1.2209, "step": 3300 }, { "epoch": 1.2289793021605333, "grad_norm": 0.17728908360004425, "learning_rate": 1.961153682117269e-05, "loss": 1.2284, "step": 3301 }, { "epoch": 1.2293516073111423, "grad_norm": 0.18490946292877197, "learning_rate": 1.9611201306161115e-05, "loss": 1.221, "step": 3302 }, { "epoch": 1.2297239124617514, "grad_norm": 0.17374534904956818, "learning_rate": 1.9610865649192695e-05, "loss": 1.2393, "step": 3303 }, { "epoch": 1.2300962176123607, "grad_norm": 0.17610304057598114, "learning_rate": 1.96105298502724e-05, "loss": 1.2163, "step": 3304 }, { "epoch": 1.2304685227629695, "grad_norm": 0.17196281254291534, "learning_rate": 1.961019390940518e-05, "loss": 1.2256, "step": 3305 }, { "epoch": 1.2308408279135787, "grad_norm": 0.16787520051002502, "learning_rate": 1.9609857826595996e-05, "loss": 1.2373, "step": 3306 }, { "epoch": 1.2312131330641878, "grad_norm": 0.17314410209655762, "learning_rate": 1.9609521601849815e-05, "loss": 1.2213, "step": 3307 }, { "epoch": 1.2315854382147968, "grad_norm": 0.1651289165019989, "learning_rate": 1.9609185235171604e-05, "loss": 1.2163, "step": 3308 }, { "epoch": 1.2319577433654059, "grad_norm": 0.15858665108680725, "learning_rate": 1.9608848726566328e-05, "loss": 1.21, "step": 3309 }, { "epoch": 1.232330048516015, "grad_norm": 0.1707463413476944, "learning_rate": 1.9608512076038964e-05, "loss": 1.216, "step": 3310 }, { "epoch": 1.232702353666624, "grad_norm": 0.16724033653736115, "learning_rate": 1.9608175283594476e-05, "loss": 1.2289, "step": 3311 }, { "epoch": 1.233074658817233, "grad_norm": 0.17127692699432373, "learning_rate": 1.9607838349237847e-05, "loss": 1.2233, "step": 3312 }, { "epoch": 1.2334469639678423, "grad_norm": 0.17231450974941254, "learning_rate": 1.9607501272974044e-05, "loss": 1.2362, "step": 3313 }, { "epoch": 1.2338192691184513, "grad_norm": 0.1658644676208496, "learning_rate": 1.960716405480805e-05, "loss": 1.2242, "step": 3314 }, { "epoch": 1.2341915742690603, "grad_norm": 0.1748226135969162, "learning_rate": 1.9606826694744847e-05, "loss": 1.2307, "step": 3315 }, { "epoch": 1.2345638794196694, "grad_norm": 0.1692144274711609, "learning_rate": 1.9606489192789418e-05, "loss": 1.2219, "step": 3316 }, { "epoch": 1.2349361845702784, "grad_norm": 0.1659345030784607, "learning_rate": 1.9606151548946744e-05, "loss": 1.225, "step": 3317 }, { "epoch": 1.2353084897208875, "grad_norm": 0.17193017899990082, "learning_rate": 1.9605813763221816e-05, "loss": 1.2318, "step": 3318 }, { "epoch": 1.2356807948714965, "grad_norm": 0.17747105658054352, "learning_rate": 1.960547583561962e-05, "loss": 1.2413, "step": 3319 }, { "epoch": 1.2360531000221056, "grad_norm": 0.1722479611635208, "learning_rate": 1.9605137766145154e-05, "loss": 1.2255, "step": 3320 }, { "epoch": 1.2364254051727146, "grad_norm": 0.16848038136959076, "learning_rate": 1.96047995548034e-05, "loss": 1.2384, "step": 3321 }, { "epoch": 1.2367977103233239, "grad_norm": 0.16798731684684753, "learning_rate": 1.960446120159936e-05, "loss": 1.222, "step": 3322 }, { "epoch": 1.237170015473933, "grad_norm": 0.1701812744140625, "learning_rate": 1.9604122706538033e-05, "loss": 1.2218, "step": 3323 }, { "epoch": 1.237542320624542, "grad_norm": 0.1743394136428833, "learning_rate": 1.9603784069624417e-05, "loss": 1.2381, "step": 3324 }, { "epoch": 1.237914625775151, "grad_norm": 0.16606199741363525, "learning_rate": 1.960344529086351e-05, "loss": 1.2241, "step": 3325 }, { "epoch": 1.23828693092576, "grad_norm": 0.17570117115974426, "learning_rate": 1.960310637026032e-05, "loss": 1.2356, "step": 3326 }, { "epoch": 1.238659236076369, "grad_norm": 0.17124217748641968, "learning_rate": 1.960276730781985e-05, "loss": 1.2422, "step": 3327 }, { "epoch": 1.2390315412269781, "grad_norm": 0.21202170848846436, "learning_rate": 1.9602428103547112e-05, "loss": 1.2334, "step": 3328 }, { "epoch": 1.2394038463775872, "grad_norm": 0.17141880095005035, "learning_rate": 1.9602088757447114e-05, "loss": 1.2254, "step": 3329 }, { "epoch": 1.2397761515281962, "grad_norm": 0.18297582864761353, "learning_rate": 1.9601749269524867e-05, "loss": 1.2451, "step": 3330 }, { "epoch": 1.2401484566788055, "grad_norm": 0.1688864827156067, "learning_rate": 1.9601409639785384e-05, "loss": 1.2194, "step": 3331 }, { "epoch": 1.2405207618294145, "grad_norm": 0.1724138855934143, "learning_rate": 1.9601069868233687e-05, "loss": 1.2278, "step": 3332 }, { "epoch": 1.2408930669800236, "grad_norm": 0.174929678440094, "learning_rate": 1.9600729954874786e-05, "loss": 1.2341, "step": 3333 }, { "epoch": 1.2412653721306326, "grad_norm": 0.17739325761795044, "learning_rate": 1.9600389899713707e-05, "loss": 1.2258, "step": 3334 }, { "epoch": 1.2416376772812416, "grad_norm": 0.1830214411020279, "learning_rate": 1.9600049702755473e-05, "loss": 1.2313, "step": 3335 }, { "epoch": 1.2420099824318507, "grad_norm": 0.16760197281837463, "learning_rate": 1.9599709364005107e-05, "loss": 1.2235, "step": 3336 }, { "epoch": 1.2423822875824597, "grad_norm": 0.16950567066669464, "learning_rate": 1.959936888346764e-05, "loss": 1.2124, "step": 3337 }, { "epoch": 1.2427545927330688, "grad_norm": 0.1735774725675583, "learning_rate": 1.959902826114809e-05, "loss": 1.213, "step": 3338 }, { "epoch": 1.2431268978836778, "grad_norm": 0.17555302381515503, "learning_rate": 1.9598687497051497e-05, "loss": 1.2144, "step": 3339 }, { "epoch": 1.243499203034287, "grad_norm": 0.17770478129386902, "learning_rate": 1.9598346591182896e-05, "loss": 1.2089, "step": 3340 }, { "epoch": 1.2438715081848961, "grad_norm": 0.16483311355113983, "learning_rate": 1.9598005543547315e-05, "loss": 1.2113, "step": 3341 }, { "epoch": 1.2442438133355052, "grad_norm": 0.17812606692314148, "learning_rate": 1.9597664354149793e-05, "loss": 1.2189, "step": 3342 }, { "epoch": 1.2446161184861142, "grad_norm": 0.17400455474853516, "learning_rate": 1.9597323022995375e-05, "loss": 1.2183, "step": 3343 }, { "epoch": 1.2449884236367232, "grad_norm": 0.17530007660388947, "learning_rate": 1.95969815500891e-05, "loss": 1.2222, "step": 3344 }, { "epoch": 1.2453607287873323, "grad_norm": 0.16525647044181824, "learning_rate": 1.9596639935436005e-05, "loss": 1.2257, "step": 3345 }, { "epoch": 1.2457330339379413, "grad_norm": 0.1811496466398239, "learning_rate": 1.959629817904114e-05, "loss": 1.2295, "step": 3346 }, { "epoch": 1.2461053390885504, "grad_norm": 0.16356612741947174, "learning_rate": 1.9595956280909552e-05, "loss": 1.2251, "step": 3347 }, { "epoch": 1.2464776442391594, "grad_norm": 0.1709991693496704, "learning_rate": 1.9595614241046296e-05, "loss": 1.2053, "step": 3348 }, { "epoch": 1.2468499493897687, "grad_norm": 0.16833190619945526, "learning_rate": 1.959527205945642e-05, "loss": 1.2216, "step": 3349 }, { "epoch": 1.2472222545403777, "grad_norm": 0.1702100783586502, "learning_rate": 1.9594929736144978e-05, "loss": 1.2207, "step": 3350 }, { "epoch": 1.2475945596909868, "grad_norm": 0.1710902601480484, "learning_rate": 1.9594587271117023e-05, "loss": 1.2166, "step": 3351 }, { "epoch": 1.2479668648415958, "grad_norm": 0.17266355454921722, "learning_rate": 1.9594244664377617e-05, "loss": 1.2489, "step": 3352 }, { "epoch": 1.2483391699922048, "grad_norm": 0.16496752202510834, "learning_rate": 1.959390191593182e-05, "loss": 1.2185, "step": 3353 }, { "epoch": 1.2487114751428139, "grad_norm": 0.16642600297927856, "learning_rate": 1.9593559025784692e-05, "loss": 1.2326, "step": 3354 }, { "epoch": 1.249083780293423, "grad_norm": 0.17854921519756317, "learning_rate": 1.95932159939413e-05, "loss": 1.2324, "step": 3355 }, { "epoch": 1.249456085444032, "grad_norm": 0.17069010436534882, "learning_rate": 1.959287282040671e-05, "loss": 1.2088, "step": 3356 }, { "epoch": 1.249828390594641, "grad_norm": 0.18059507012367249, "learning_rate": 1.9592529505185993e-05, "loss": 1.2264, "step": 3357 }, { "epoch": 1.2502006957452503, "grad_norm": 0.173674538731575, "learning_rate": 1.9592186048284216e-05, "loss": 1.2263, "step": 3358 }, { "epoch": 1.2505730008958593, "grad_norm": 0.16623902320861816, "learning_rate": 1.9591842449706454e-05, "loss": 1.2264, "step": 3359 }, { "epoch": 1.2509453060464684, "grad_norm": 0.1817052662372589, "learning_rate": 1.9591498709457776e-05, "loss": 1.2062, "step": 3360 }, { "epoch": 1.2513176111970774, "grad_norm": 0.17939196527004242, "learning_rate": 1.959115482754327e-05, "loss": 1.2464, "step": 3361 }, { "epoch": 1.2516899163476864, "grad_norm": 0.17410658299922943, "learning_rate": 1.9590810803968003e-05, "loss": 1.2175, "step": 3362 }, { "epoch": 1.2520622214982955, "grad_norm": 0.19902339577674866, "learning_rate": 1.9590466638737068e-05, "loss": 1.2195, "step": 3363 }, { "epoch": 1.2524345266489045, "grad_norm": 0.1737726330757141, "learning_rate": 1.9590122331855543e-05, "loss": 1.2097, "step": 3364 }, { "epoch": 1.2528068317995138, "grad_norm": 0.17546404898166656, "learning_rate": 1.9589777883328506e-05, "loss": 1.2249, "step": 3365 }, { "epoch": 1.2531791369501226, "grad_norm": 0.1736469715833664, "learning_rate": 1.9589433293161057e-05, "loss": 1.2135, "step": 3366 }, { "epoch": 1.2535514421007319, "grad_norm": 0.17716355621814728, "learning_rate": 1.9589088561358278e-05, "loss": 1.2203, "step": 3367 }, { "epoch": 1.253923747251341, "grad_norm": 0.18464305996894836, "learning_rate": 1.9588743687925264e-05, "loss": 1.2134, "step": 3368 }, { "epoch": 1.25429605240195, "grad_norm": 0.17414681613445282, "learning_rate": 1.9588398672867108e-05, "loss": 1.2212, "step": 3369 }, { "epoch": 1.254668357552559, "grad_norm": 0.17853331565856934, "learning_rate": 1.9588053516188906e-05, "loss": 1.2398, "step": 3370 }, { "epoch": 1.255040662703168, "grad_norm": 0.17003083229064941, "learning_rate": 1.958770821789575e-05, "loss": 1.2234, "step": 3371 }, { "epoch": 1.255412967853777, "grad_norm": 0.18236912786960602, "learning_rate": 1.9587362777992753e-05, "loss": 1.2246, "step": 3372 }, { "epoch": 1.2557852730043861, "grad_norm": 0.18175008893013, "learning_rate": 1.9587017196485007e-05, "loss": 1.2313, "step": 3373 }, { "epoch": 1.2561575781549954, "grad_norm": 0.18152983486652374, "learning_rate": 1.9586671473377614e-05, "loss": 1.222, "step": 3374 }, { "epoch": 1.2565298833056042, "grad_norm": 0.1859092116355896, "learning_rate": 1.9586325608675688e-05, "loss": 1.231, "step": 3375 }, { "epoch": 1.2569021884562135, "grad_norm": 0.19084875285625458, "learning_rate": 1.9585979602384334e-05, "loss": 1.2234, "step": 3376 }, { "epoch": 1.2572744936068225, "grad_norm": 0.18165820837020874, "learning_rate": 1.9585633454508665e-05, "loss": 1.225, "step": 3377 }, { "epoch": 1.2576467987574316, "grad_norm": 0.16506415605545044, "learning_rate": 1.958528716505379e-05, "loss": 1.2329, "step": 3378 }, { "epoch": 1.2580191039080406, "grad_norm": 0.17367565631866455, "learning_rate": 1.9584940734024826e-05, "loss": 1.2263, "step": 3379 }, { "epoch": 1.2583914090586497, "grad_norm": 0.16884395480155945, "learning_rate": 1.9584594161426888e-05, "loss": 1.2087, "step": 3380 }, { "epoch": 1.2587637142092587, "grad_norm": 0.18630245327949524, "learning_rate": 1.9584247447265095e-05, "loss": 1.2279, "step": 3381 }, { "epoch": 1.2591360193598677, "grad_norm": 0.16664794087409973, "learning_rate": 1.958390059154457e-05, "loss": 1.2317, "step": 3382 }, { "epoch": 1.259508324510477, "grad_norm": 0.17144736647605896, "learning_rate": 1.9583553594270433e-05, "loss": 1.2157, "step": 3383 }, { "epoch": 1.259880629661086, "grad_norm": 0.17621219158172607, "learning_rate": 1.9583206455447812e-05, "loss": 1.2179, "step": 3384 }, { "epoch": 1.260252934811695, "grad_norm": 0.17258892953395844, "learning_rate": 1.9582859175081835e-05, "loss": 1.2349, "step": 3385 }, { "epoch": 1.2606252399623041, "grad_norm": 0.16975384950637817, "learning_rate": 1.9582511753177625e-05, "loss": 1.2155, "step": 3386 }, { "epoch": 1.2609975451129132, "grad_norm": 0.171031653881073, "learning_rate": 1.9582164189740322e-05, "loss": 1.2121, "step": 3387 }, { "epoch": 1.2613698502635222, "grad_norm": 0.16449801623821259, "learning_rate": 1.9581816484775055e-05, "loss": 1.2154, "step": 3388 }, { "epoch": 1.2617421554141313, "grad_norm": 0.17565850913524628, "learning_rate": 1.9581468638286954e-05, "loss": 1.2266, "step": 3389 }, { "epoch": 1.2621144605647403, "grad_norm": 0.1752316802740097, "learning_rate": 1.9581120650281166e-05, "loss": 1.2237, "step": 3390 }, { "epoch": 1.2624867657153493, "grad_norm": 0.16294018924236298, "learning_rate": 1.958077252076283e-05, "loss": 1.2161, "step": 3391 }, { "epoch": 1.2628590708659586, "grad_norm": 0.16601049900054932, "learning_rate": 1.9580424249737085e-05, "loss": 1.2202, "step": 3392 }, { "epoch": 1.2632313760165677, "grad_norm": 0.17357154190540314, "learning_rate": 1.958007583720907e-05, "loss": 1.2107, "step": 3393 }, { "epoch": 1.2636036811671767, "grad_norm": 0.17299290001392365, "learning_rate": 1.957972728318394e-05, "loss": 1.2297, "step": 3394 }, { "epoch": 1.2639759863177857, "grad_norm": 0.1687079221010208, "learning_rate": 1.9579378587666838e-05, "loss": 1.2087, "step": 3395 }, { "epoch": 1.2643482914683948, "grad_norm": 0.16745884716510773, "learning_rate": 1.9579029750662918e-05, "loss": 1.2154, "step": 3396 }, { "epoch": 1.2647205966190038, "grad_norm": 0.17740416526794434, "learning_rate": 1.9578680772177327e-05, "loss": 1.2304, "step": 3397 }, { "epoch": 1.2650929017696129, "grad_norm": 0.16838476061820984, "learning_rate": 1.9578331652215224e-05, "loss": 1.2315, "step": 3398 }, { "epoch": 1.265465206920222, "grad_norm": 0.1673501580953598, "learning_rate": 1.9577982390781766e-05, "loss": 1.2072, "step": 3399 }, { "epoch": 1.265837512070831, "grad_norm": 0.1591964215040207, "learning_rate": 1.9577632987882103e-05, "loss": 1.2121, "step": 3400 }, { "epoch": 1.2662098172214402, "grad_norm": 0.1804029494524002, "learning_rate": 1.9577283443521403e-05, "loss": 1.225, "step": 3401 }, { "epoch": 1.2665821223720493, "grad_norm": 0.17372415959835052, "learning_rate": 1.957693375770483e-05, "loss": 1.212, "step": 3402 }, { "epoch": 1.2669544275226583, "grad_norm": 0.17061223089694977, "learning_rate": 1.9576583930437546e-05, "loss": 1.2165, "step": 3403 }, { "epoch": 1.2673267326732673, "grad_norm": 0.19941537082195282, "learning_rate": 1.9576233961724716e-05, "loss": 1.2322, "step": 3404 }, { "epoch": 1.2676990378238764, "grad_norm": 0.1921522617340088, "learning_rate": 1.9575883851571516e-05, "loss": 1.2069, "step": 3405 }, { "epoch": 1.2680713429744854, "grad_norm": 0.17495955526828766, "learning_rate": 1.957553359998311e-05, "loss": 1.227, "step": 3406 }, { "epoch": 1.2684436481250945, "grad_norm": 0.18259906768798828, "learning_rate": 1.9575183206964673e-05, "loss": 1.2292, "step": 3407 }, { "epoch": 1.2688159532757037, "grad_norm": 0.16548015177249908, "learning_rate": 1.9574832672521384e-05, "loss": 1.209, "step": 3408 }, { "epoch": 1.2691882584263126, "grad_norm": 0.17932672798633575, "learning_rate": 1.9574481996658412e-05, "loss": 1.2247, "step": 3409 }, { "epoch": 1.2695605635769218, "grad_norm": 0.16864344477653503, "learning_rate": 1.9574131179380945e-05, "loss": 1.209, "step": 3410 }, { "epoch": 1.2699328687275309, "grad_norm": 0.16725954413414001, "learning_rate": 1.957378022069416e-05, "loss": 1.2094, "step": 3411 }, { "epoch": 1.27030517387814, "grad_norm": 0.17758184671401978, "learning_rate": 1.9573429120603245e-05, "loss": 1.2177, "step": 3412 }, { "epoch": 1.270677479028749, "grad_norm": 0.17803123593330383, "learning_rate": 1.957307787911338e-05, "loss": 1.2244, "step": 3413 }, { "epoch": 1.271049784179358, "grad_norm": 0.1861790120601654, "learning_rate": 1.9572726496229754e-05, "loss": 1.2335, "step": 3414 }, { "epoch": 1.271422089329967, "grad_norm": 0.17091241478919983, "learning_rate": 1.9572374971957562e-05, "loss": 1.2198, "step": 3415 }, { "epoch": 1.271794394480576, "grad_norm": 0.16536393761634827, "learning_rate": 1.957202330630199e-05, "loss": 1.2227, "step": 3416 }, { "epoch": 1.2721666996311853, "grad_norm": 0.1737341731786728, "learning_rate": 1.9571671499268238e-05, "loss": 1.2294, "step": 3417 }, { "epoch": 1.2725390047817942, "grad_norm": 0.16454610228538513, "learning_rate": 1.9571319550861493e-05, "loss": 1.2099, "step": 3418 }, { "epoch": 1.2729113099324034, "grad_norm": 0.17201176285743713, "learning_rate": 1.957096746108696e-05, "loss": 1.23, "step": 3419 }, { "epoch": 1.2732836150830125, "grad_norm": 0.16498921811580658, "learning_rate": 1.9570615229949844e-05, "loss": 1.2081, "step": 3420 }, { "epoch": 1.2736559202336215, "grad_norm": 0.16812345385551453, "learning_rate": 1.9570262857455336e-05, "loss": 1.2207, "step": 3421 }, { "epoch": 1.2740282253842306, "grad_norm": 0.17763420939445496, "learning_rate": 1.956991034360865e-05, "loss": 1.233, "step": 3422 }, { "epoch": 1.2744005305348396, "grad_norm": 0.18109717965126038, "learning_rate": 1.9569557688414985e-05, "loss": 1.2304, "step": 3423 }, { "epoch": 1.2747728356854486, "grad_norm": 0.1638157218694687, "learning_rate": 1.9569204891879554e-05, "loss": 1.2216, "step": 3424 }, { "epoch": 1.2751451408360577, "grad_norm": 0.17872877418994904, "learning_rate": 1.956885195400757e-05, "loss": 1.215, "step": 3425 }, { "epoch": 1.275517445986667, "grad_norm": 0.1729113757610321, "learning_rate": 1.956849887480424e-05, "loss": 1.2073, "step": 3426 }, { "epoch": 1.2758897511372758, "grad_norm": 0.16412028670310974, "learning_rate": 1.9568145654274787e-05, "loss": 1.2321, "step": 3427 }, { "epoch": 1.276262056287885, "grad_norm": 0.17767532169818878, "learning_rate": 1.9567792292424417e-05, "loss": 1.2291, "step": 3428 }, { "epoch": 1.276634361438494, "grad_norm": 0.17381222546100616, "learning_rate": 1.956743878925836e-05, "loss": 1.2315, "step": 3429 }, { "epoch": 1.2770066665891031, "grad_norm": 0.17282582819461823, "learning_rate": 1.956708514478183e-05, "loss": 1.2208, "step": 3430 }, { "epoch": 1.2773789717397122, "grad_norm": 0.16768918931484222, "learning_rate": 1.9566731359000056e-05, "loss": 1.2107, "step": 3431 }, { "epoch": 1.2777512768903212, "grad_norm": 0.1659182608127594, "learning_rate": 1.9566377431918258e-05, "loss": 1.216, "step": 3432 }, { "epoch": 1.2781235820409302, "grad_norm": 0.1740458458662033, "learning_rate": 1.9566023363541664e-05, "loss": 1.2357, "step": 3433 }, { "epoch": 1.2784958871915393, "grad_norm": 0.167833611369133, "learning_rate": 1.956566915387551e-05, "loss": 1.2286, "step": 3434 }, { "epoch": 1.2788681923421485, "grad_norm": 0.17230744659900665, "learning_rate": 1.9565314802925017e-05, "loss": 1.2251, "step": 3435 }, { "epoch": 1.2792404974927574, "grad_norm": 0.16119259595870972, "learning_rate": 1.9564960310695426e-05, "loss": 1.2184, "step": 3436 }, { "epoch": 1.2796128026433666, "grad_norm": 0.15957672894001007, "learning_rate": 1.9564605677191975e-05, "loss": 1.2133, "step": 3437 }, { "epoch": 1.2799851077939757, "grad_norm": 0.16447031497955322, "learning_rate": 1.9564250902419895e-05, "loss": 1.2355, "step": 3438 }, { "epoch": 1.2803574129445847, "grad_norm": 0.17678163945674896, "learning_rate": 1.956389598638443e-05, "loss": 1.2152, "step": 3439 }, { "epoch": 1.2807297180951938, "grad_norm": 0.18602001667022705, "learning_rate": 1.9563540929090825e-05, "loss": 1.2069, "step": 3440 }, { "epoch": 1.2811020232458028, "grad_norm": 0.16143512725830078, "learning_rate": 1.9563185730544316e-05, "loss": 1.2164, "step": 3441 }, { "epoch": 1.2814743283964118, "grad_norm": 0.17867274582386017, "learning_rate": 1.9562830390750157e-05, "loss": 1.2228, "step": 3442 }, { "epoch": 1.2818466335470209, "grad_norm": 0.16659069061279297, "learning_rate": 1.9562474909713592e-05, "loss": 1.2173, "step": 3443 }, { "epoch": 1.2822189386976302, "grad_norm": 0.17568208277225494, "learning_rate": 1.9562119287439874e-05, "loss": 1.2199, "step": 3444 }, { "epoch": 1.2825912438482392, "grad_norm": 0.18032683432102203, "learning_rate": 1.956176352393425e-05, "loss": 1.2246, "step": 3445 }, { "epoch": 1.2829635489988482, "grad_norm": 0.1706835776567459, "learning_rate": 1.956140761920198e-05, "loss": 1.2079, "step": 3446 }, { "epoch": 1.2833358541494573, "grad_norm": 0.17287270724773407, "learning_rate": 1.9561051573248325e-05, "loss": 1.2145, "step": 3447 }, { "epoch": 1.2837081593000663, "grad_norm": 0.1678660660982132, "learning_rate": 1.9560695386078536e-05, "loss": 1.219, "step": 3448 }, { "epoch": 1.2840804644506754, "grad_norm": 0.17408515512943268, "learning_rate": 1.956033905769787e-05, "loss": 1.2178, "step": 3449 }, { "epoch": 1.2844527696012844, "grad_norm": 0.1688593178987503, "learning_rate": 1.9559982588111604e-05, "loss": 1.2167, "step": 3450 }, { "epoch": 1.2848250747518934, "grad_norm": 0.16934001445770264, "learning_rate": 1.9559625977324992e-05, "loss": 1.2089, "step": 3451 }, { "epoch": 1.2851973799025025, "grad_norm": 0.17669974267482758, "learning_rate": 1.9559269225343303e-05, "loss": 1.2276, "step": 3452 }, { "epoch": 1.2855696850531118, "grad_norm": 0.18074657022953033, "learning_rate": 1.955891233217181e-05, "loss": 1.2208, "step": 3453 }, { "epoch": 1.2859419902037208, "grad_norm": 0.16438442468643188, "learning_rate": 1.955855529781578e-05, "loss": 1.2096, "step": 3454 }, { "epoch": 1.2863142953543298, "grad_norm": 0.20270679891109467, "learning_rate": 1.9558198122280488e-05, "loss": 1.2206, "step": 3455 }, { "epoch": 1.2866866005049389, "grad_norm": 0.18252705037593842, "learning_rate": 1.955784080557121e-05, "loss": 1.2188, "step": 3456 }, { "epoch": 1.287058905655548, "grad_norm": 0.17641377449035645, "learning_rate": 1.9557483347693226e-05, "loss": 1.2156, "step": 3457 }, { "epoch": 1.287431210806157, "grad_norm": 0.16303670406341553, "learning_rate": 1.955712574865181e-05, "loss": 1.2378, "step": 3458 }, { "epoch": 1.287803515956766, "grad_norm": 0.20156008005142212, "learning_rate": 1.9556768008452245e-05, "loss": 1.2284, "step": 3459 }, { "epoch": 1.288175821107375, "grad_norm": 0.16495651006698608, "learning_rate": 1.9556410127099817e-05, "loss": 1.2097, "step": 3460 }, { "epoch": 1.288548126257984, "grad_norm": 0.1750713586807251, "learning_rate": 1.9556052104599813e-05, "loss": 1.2175, "step": 3461 }, { "epoch": 1.2889204314085934, "grad_norm": 0.17808473110198975, "learning_rate": 1.9555693940957518e-05, "loss": 1.2354, "step": 3462 }, { "epoch": 1.2892927365592024, "grad_norm": 0.1678575724363327, "learning_rate": 1.955533563617822e-05, "loss": 1.2293, "step": 3463 }, { "epoch": 1.2896650417098114, "grad_norm": 0.18347376585006714, "learning_rate": 1.955497719026722e-05, "loss": 1.2276, "step": 3464 }, { "epoch": 1.2900373468604205, "grad_norm": 0.17216694355010986, "learning_rate": 1.9554618603229804e-05, "loss": 1.2365, "step": 3465 }, { "epoch": 1.2904096520110295, "grad_norm": 0.16860246658325195, "learning_rate": 1.9554259875071274e-05, "loss": 1.2087, "step": 3466 }, { "epoch": 1.2907819571616386, "grad_norm": 0.1744735687971115, "learning_rate": 1.955390100579692e-05, "loss": 1.2064, "step": 3467 }, { "epoch": 1.2911542623122476, "grad_norm": 0.16828221082687378, "learning_rate": 1.955354199541205e-05, "loss": 1.2235, "step": 3468 }, { "epoch": 1.2915265674628569, "grad_norm": 0.19916963577270508, "learning_rate": 1.9553182843921963e-05, "loss": 1.2269, "step": 3469 }, { "epoch": 1.2918988726134657, "grad_norm": 0.17060431838035583, "learning_rate": 1.9552823551331966e-05, "loss": 1.2064, "step": 3470 }, { "epoch": 1.292271177764075, "grad_norm": 0.17713354527950287, "learning_rate": 1.9552464117647365e-05, "loss": 1.2306, "step": 3471 }, { "epoch": 1.292643482914684, "grad_norm": 0.17107143998146057, "learning_rate": 1.955210454287347e-05, "loss": 1.2413, "step": 3472 }, { "epoch": 1.293015788065293, "grad_norm": 0.17678503692150116, "learning_rate": 1.955174482701559e-05, "loss": 1.2145, "step": 3473 }, { "epoch": 1.293388093215902, "grad_norm": 0.17001782357692719, "learning_rate": 1.955138497007904e-05, "loss": 1.2357, "step": 3474 }, { "epoch": 1.2937603983665111, "grad_norm": 0.1619400829076767, "learning_rate": 1.9551024972069127e-05, "loss": 1.2237, "step": 3475 }, { "epoch": 1.2941327035171202, "grad_norm": 0.1685052067041397, "learning_rate": 1.9550664832991178e-05, "loss": 1.2217, "step": 3476 }, { "epoch": 1.2945050086677292, "grad_norm": 0.1692323237657547, "learning_rate": 1.9550304552850506e-05, "loss": 1.2116, "step": 3477 }, { "epoch": 1.2948773138183385, "grad_norm": 0.1704968959093094, "learning_rate": 1.954994413165244e-05, "loss": 1.2236, "step": 3478 }, { "epoch": 1.2952496189689473, "grad_norm": 0.17128580808639526, "learning_rate": 1.9549583569402297e-05, "loss": 1.2362, "step": 3479 }, { "epoch": 1.2956219241195566, "grad_norm": 0.16995707154273987, "learning_rate": 1.95492228661054e-05, "loss": 1.2178, "step": 3480 }, { "epoch": 1.2959942292701656, "grad_norm": 0.16950835287570953, "learning_rate": 1.9548862021767084e-05, "loss": 1.2147, "step": 3481 }, { "epoch": 1.2963665344207747, "grad_norm": 0.1846836656332016, "learning_rate": 1.9548501036392676e-05, "loss": 1.215, "step": 3482 }, { "epoch": 1.2967388395713837, "grad_norm": 0.16860432922840118, "learning_rate": 1.954813990998751e-05, "loss": 1.2209, "step": 3483 }, { "epoch": 1.2971111447219927, "grad_norm": 0.162691131234169, "learning_rate": 1.9547778642556913e-05, "loss": 1.1976, "step": 3484 }, { "epoch": 1.2974834498726018, "grad_norm": 0.17532385885715485, "learning_rate": 1.954741723410622e-05, "loss": 1.226, "step": 3485 }, { "epoch": 1.2978557550232108, "grad_norm": 0.18503205478191376, "learning_rate": 1.954705568464078e-05, "loss": 1.2223, "step": 3486 }, { "epoch": 1.29822806017382, "grad_norm": 0.1707049161195755, "learning_rate": 1.9546693994165922e-05, "loss": 1.2179, "step": 3487 }, { "epoch": 1.298600365324429, "grad_norm": 0.17060478031635284, "learning_rate": 1.9546332162687e-05, "loss": 1.2217, "step": 3488 }, { "epoch": 1.2989726704750382, "grad_norm": 0.17310044169425964, "learning_rate": 1.9545970190209346e-05, "loss": 1.221, "step": 3489 }, { "epoch": 1.2993449756256472, "grad_norm": 0.19877584278583527, "learning_rate": 1.9545608076738312e-05, "loss": 1.2391, "step": 3490 }, { "epoch": 1.2997172807762563, "grad_norm": 0.16506221890449524, "learning_rate": 1.9545245822279243e-05, "loss": 1.2112, "step": 3491 }, { "epoch": 1.3000895859268653, "grad_norm": 0.18883390724658966, "learning_rate": 1.9544883426837497e-05, "loss": 1.2221, "step": 3492 }, { "epoch": 1.3004618910774743, "grad_norm": 0.18333663046360016, "learning_rate": 1.954452089041842e-05, "loss": 1.231, "step": 3493 }, { "epoch": 1.3008341962280834, "grad_norm": 0.17763221263885498, "learning_rate": 1.954415821302737e-05, "loss": 1.2195, "step": 3494 }, { "epoch": 1.3012065013786924, "grad_norm": 0.1757153868675232, "learning_rate": 1.9543795394669696e-05, "loss": 1.2183, "step": 3495 }, { "epoch": 1.3015788065293017, "grad_norm": 0.1739683747291565, "learning_rate": 1.954343243535077e-05, "loss": 1.2039, "step": 3496 }, { "epoch": 1.3019511116799105, "grad_norm": 0.17304451763629913, "learning_rate": 1.9543069335075945e-05, "loss": 1.2128, "step": 3497 }, { "epoch": 1.3023234168305198, "grad_norm": 0.17375996708869934, "learning_rate": 1.9542706093850585e-05, "loss": 1.2332, "step": 3498 }, { "epoch": 1.3026957219811288, "grad_norm": 0.18385480344295502, "learning_rate": 1.9542342711680053e-05, "loss": 1.2246, "step": 3499 }, { "epoch": 1.3030680271317379, "grad_norm": 0.17756296694278717, "learning_rate": 1.9541979188569717e-05, "loss": 1.2084, "step": 3500 }, { "epoch": 1.3030680271317379, "eval_loss": 1.3167004585266113, "eval_runtime": 16.5219, "eval_samples_per_second": 104.951, "eval_steps_per_second": 5.266, "step": 3500 }, { "epoch": 1.303440332282347, "grad_norm": 0.18115080893039703, "learning_rate": 1.9541615524524946e-05, "loss": 1.2279, "step": 3501 }, { "epoch": 1.303812637432956, "grad_norm": 0.18096600472927094, "learning_rate": 1.9541251719551116e-05, "loss": 1.238, "step": 3502 }, { "epoch": 1.304184942583565, "grad_norm": 0.18809261918067932, "learning_rate": 1.9540887773653594e-05, "loss": 1.2256, "step": 3503 }, { "epoch": 1.304557247734174, "grad_norm": 0.19462642073631287, "learning_rate": 1.954052368683776e-05, "loss": 1.2191, "step": 3504 }, { "epoch": 1.3049295528847833, "grad_norm": 0.17324475944042206, "learning_rate": 1.9540159459108992e-05, "loss": 1.218, "step": 3505 }, { "epoch": 1.3053018580353923, "grad_norm": 0.18626105785369873, "learning_rate": 1.9539795090472665e-05, "loss": 1.2095, "step": 3506 }, { "epoch": 1.3056741631860014, "grad_norm": 0.17831236124038696, "learning_rate": 1.9539430580934162e-05, "loss": 1.2298, "step": 3507 }, { "epoch": 1.3060464683366104, "grad_norm": 0.17475959658622742, "learning_rate": 1.953906593049887e-05, "loss": 1.2262, "step": 3508 }, { "epoch": 1.3064187734872195, "grad_norm": 0.1782606840133667, "learning_rate": 1.9538701139172174e-05, "loss": 1.2196, "step": 3509 }, { "epoch": 1.3067910786378285, "grad_norm": 0.181804358959198, "learning_rate": 1.9538336206959457e-05, "loss": 1.2171, "step": 3510 }, { "epoch": 1.3071633837884375, "grad_norm": 0.17738310992717743, "learning_rate": 1.9537971133866116e-05, "loss": 1.2304, "step": 3511 }, { "epoch": 1.3075356889390466, "grad_norm": 0.16684716939926147, "learning_rate": 1.953760591989754e-05, "loss": 1.203, "step": 3512 }, { "epoch": 1.3079079940896556, "grad_norm": 0.17668993771076202, "learning_rate": 1.953724056505912e-05, "loss": 1.2144, "step": 3513 }, { "epoch": 1.308280299240265, "grad_norm": 0.17957065999507904, "learning_rate": 1.953687506935626e-05, "loss": 1.2201, "step": 3514 }, { "epoch": 1.308652604390874, "grad_norm": 0.16784153878688812, "learning_rate": 1.953650943279435e-05, "loss": 1.2145, "step": 3515 }, { "epoch": 1.309024909541483, "grad_norm": 0.17085857689380646, "learning_rate": 1.9536143655378795e-05, "loss": 1.2283, "step": 3516 }, { "epoch": 1.309397214692092, "grad_norm": 0.1773957908153534, "learning_rate": 1.9535777737114997e-05, "loss": 1.2117, "step": 3517 }, { "epoch": 1.309769519842701, "grad_norm": 0.1848251074552536, "learning_rate": 1.953541167800836e-05, "loss": 1.2149, "step": 3518 }, { "epoch": 1.3101418249933101, "grad_norm": 0.1742524355649948, "learning_rate": 1.9535045478064293e-05, "loss": 1.2426, "step": 3519 }, { "epoch": 1.3105141301439192, "grad_norm": 0.1685061901807785, "learning_rate": 1.95346791372882e-05, "loss": 1.2167, "step": 3520 }, { "epoch": 1.3108864352945282, "grad_norm": 0.15838715434074402, "learning_rate": 1.9534312655685497e-05, "loss": 1.2213, "step": 3521 }, { "epoch": 1.3112587404451372, "grad_norm": 0.1682320386171341, "learning_rate": 1.9533946033261593e-05, "loss": 1.2267, "step": 3522 }, { "epoch": 1.3116310455957465, "grad_norm": 0.17389652132987976, "learning_rate": 1.9533579270021904e-05, "loss": 1.2222, "step": 3523 }, { "epoch": 1.3120033507463555, "grad_norm": 0.17018966376781464, "learning_rate": 1.9533212365971844e-05, "loss": 1.2272, "step": 3524 }, { "epoch": 1.3123756558969646, "grad_norm": 0.16955389082431793, "learning_rate": 1.953284532111684e-05, "loss": 1.2204, "step": 3525 }, { "epoch": 1.3127479610475736, "grad_norm": 0.16709072887897491, "learning_rate": 1.9532478135462313e-05, "loss": 1.222, "step": 3526 }, { "epoch": 1.3131202661981827, "grad_norm": 0.1752871423959732, "learning_rate": 1.9532110809013676e-05, "loss": 1.22, "step": 3527 }, { "epoch": 1.3134925713487917, "grad_norm": 0.1721472591161728, "learning_rate": 1.953174334177636e-05, "loss": 1.2195, "step": 3528 }, { "epoch": 1.3138648764994008, "grad_norm": 0.17040152847766876, "learning_rate": 1.9531375733755795e-05, "loss": 1.224, "step": 3529 }, { "epoch": 1.31423718165001, "grad_norm": 0.17984364926815033, "learning_rate": 1.9531007984957408e-05, "loss": 1.2178, "step": 3530 }, { "epoch": 1.3146094868006188, "grad_norm": 0.16760718822479248, "learning_rate": 1.953064009538663e-05, "loss": 1.2096, "step": 3531 }, { "epoch": 1.314981791951228, "grad_norm": 0.1705697625875473, "learning_rate": 1.9530272065048903e-05, "loss": 1.2189, "step": 3532 }, { "epoch": 1.3153540971018371, "grad_norm": 0.17441794276237488, "learning_rate": 1.9529903893949647e-05, "loss": 1.2218, "step": 3533 }, { "epoch": 1.3157264022524462, "grad_norm": 0.17177151143550873, "learning_rate": 1.9529535582094315e-05, "loss": 1.219, "step": 3534 }, { "epoch": 1.3160987074030552, "grad_norm": 0.18386155366897583, "learning_rate": 1.9529167129488335e-05, "loss": 1.2187, "step": 3535 }, { "epoch": 1.3164710125536643, "grad_norm": 0.17924180626869202, "learning_rate": 1.9528798536137157e-05, "loss": 1.2303, "step": 3536 }, { "epoch": 1.3168433177042733, "grad_norm": 0.16562217473983765, "learning_rate": 1.9528429802046225e-05, "loss": 1.2155, "step": 3537 }, { "epoch": 1.3172156228548824, "grad_norm": 0.1981254667043686, "learning_rate": 1.952806092722098e-05, "loss": 1.203, "step": 3538 }, { "epoch": 1.3175879280054916, "grad_norm": 0.17058147490024567, "learning_rate": 1.9527691911666875e-05, "loss": 1.2058, "step": 3539 }, { "epoch": 1.3179602331561004, "grad_norm": 0.1780320107936859, "learning_rate": 1.9527322755389355e-05, "loss": 1.2242, "step": 3540 }, { "epoch": 1.3183325383067097, "grad_norm": 0.17311114072799683, "learning_rate": 1.9526953458393878e-05, "loss": 1.2142, "step": 3541 }, { "epoch": 1.3187048434573188, "grad_norm": 0.1679919809103012, "learning_rate": 1.9526584020685896e-05, "loss": 1.213, "step": 3542 }, { "epoch": 1.3190771486079278, "grad_norm": 0.17678605020046234, "learning_rate": 1.9526214442270865e-05, "loss": 1.2275, "step": 3543 }, { "epoch": 1.3194494537585368, "grad_norm": 0.16861461102962494, "learning_rate": 1.9525844723154246e-05, "loss": 1.2262, "step": 3544 }, { "epoch": 1.3198217589091459, "grad_norm": 0.18046718835830688, "learning_rate": 1.95254748633415e-05, "loss": 1.2258, "step": 3545 }, { "epoch": 1.320194064059755, "grad_norm": 0.16833364963531494, "learning_rate": 1.9525104862838085e-05, "loss": 1.2195, "step": 3546 }, { "epoch": 1.320566369210364, "grad_norm": 0.16752612590789795, "learning_rate": 1.952473472164947e-05, "loss": 1.2227, "step": 3547 }, { "epoch": 1.3209386743609732, "grad_norm": 0.16177985072135925, "learning_rate": 1.952436443978112e-05, "loss": 1.2036, "step": 3548 }, { "epoch": 1.321310979511582, "grad_norm": 0.1741490513086319, "learning_rate": 1.9523994017238505e-05, "loss": 1.2292, "step": 3549 }, { "epoch": 1.3216832846621913, "grad_norm": 0.17161637544631958, "learning_rate": 1.9523623454027095e-05, "loss": 1.2195, "step": 3550 }, { "epoch": 1.3220555898128004, "grad_norm": 0.16780497133731842, "learning_rate": 1.9523252750152367e-05, "loss": 1.2235, "step": 3551 }, { "epoch": 1.3224278949634094, "grad_norm": 0.17077326774597168, "learning_rate": 1.9522881905619794e-05, "loss": 1.2144, "step": 3552 }, { "epoch": 1.3228002001140184, "grad_norm": 0.16986922919750214, "learning_rate": 1.9522510920434853e-05, "loss": 1.2222, "step": 3553 }, { "epoch": 1.3231725052646275, "grad_norm": 0.17960046231746674, "learning_rate": 1.9522139794603018e-05, "loss": 1.2397, "step": 3554 }, { "epoch": 1.3235448104152365, "grad_norm": 0.17524504661560059, "learning_rate": 1.9521768528129782e-05, "loss": 1.2136, "step": 3555 }, { "epoch": 1.3239171155658456, "grad_norm": 0.16311313211917877, "learning_rate": 1.952139712102062e-05, "loss": 1.2182, "step": 3556 }, { "epoch": 1.3242894207164548, "grad_norm": 0.2031218409538269, "learning_rate": 1.952102557328102e-05, "loss": 1.211, "step": 3557 }, { "epoch": 1.3246617258670637, "grad_norm": 0.18743805587291718, "learning_rate": 1.952065388491647e-05, "loss": 1.2244, "step": 3558 }, { "epoch": 1.325034031017673, "grad_norm": 0.17010486125946045, "learning_rate": 1.952028205593246e-05, "loss": 1.2096, "step": 3559 }, { "epoch": 1.325406336168282, "grad_norm": 0.16542263329029083, "learning_rate": 1.951991008633448e-05, "loss": 1.233, "step": 3560 }, { "epoch": 1.325778641318891, "grad_norm": 0.1798142045736313, "learning_rate": 1.9519537976128025e-05, "loss": 1.2222, "step": 3561 }, { "epoch": 1.3261509464695, "grad_norm": 0.16815944015979767, "learning_rate": 1.9519165725318594e-05, "loss": 1.2215, "step": 3562 }, { "epoch": 1.326523251620109, "grad_norm": 0.16989260911941528, "learning_rate": 1.951879333391168e-05, "loss": 1.2323, "step": 3563 }, { "epoch": 1.3268955567707181, "grad_norm": 0.17635375261306763, "learning_rate": 1.9518420801912787e-05, "loss": 1.2118, "step": 3564 }, { "epoch": 1.3272678619213272, "grad_norm": 0.1696915477514267, "learning_rate": 1.951804812932742e-05, "loss": 1.2136, "step": 3565 }, { "epoch": 1.3276401670719364, "grad_norm": 0.17209488153457642, "learning_rate": 1.9517675316161074e-05, "loss": 1.2209, "step": 3566 }, { "epoch": 1.3280124722225455, "grad_norm": 0.1687251627445221, "learning_rate": 1.951730236241926e-05, "loss": 1.2153, "step": 3567 }, { "epoch": 1.3283847773731545, "grad_norm": 0.17375510931015015, "learning_rate": 1.9516929268107492e-05, "loss": 1.2257, "step": 3568 }, { "epoch": 1.3287570825237636, "grad_norm": 0.17510513961315155, "learning_rate": 1.9516556033231276e-05, "loss": 1.2204, "step": 3569 }, { "epoch": 1.3291293876743726, "grad_norm": 0.16425535082817078, "learning_rate": 1.9516182657796123e-05, "loss": 1.2278, "step": 3570 }, { "epoch": 1.3295016928249817, "grad_norm": 0.17641210556030273, "learning_rate": 1.9515809141807547e-05, "loss": 1.2086, "step": 3571 }, { "epoch": 1.3298739979755907, "grad_norm": 0.1740121841430664, "learning_rate": 1.951543548527107e-05, "loss": 1.2131, "step": 3572 }, { "epoch": 1.3302463031261997, "grad_norm": 0.16927821934223175, "learning_rate": 1.951506168819221e-05, "loss": 1.2278, "step": 3573 }, { "epoch": 1.3306186082768088, "grad_norm": 0.17070408165454865, "learning_rate": 1.9514687750576483e-05, "loss": 1.2097, "step": 3574 }, { "epoch": 1.330990913427418, "grad_norm": 0.17762641608715057, "learning_rate": 1.9514313672429414e-05, "loss": 1.2277, "step": 3575 }, { "epoch": 1.331363218578027, "grad_norm": 0.1731291264295578, "learning_rate": 1.951393945375653e-05, "loss": 1.2455, "step": 3576 }, { "epoch": 1.3317355237286361, "grad_norm": 0.17669746279716492, "learning_rate": 1.9513565094563358e-05, "loss": 1.2225, "step": 3577 }, { "epoch": 1.3321078288792452, "grad_norm": 0.16807028651237488, "learning_rate": 1.9513190594855427e-05, "loss": 1.2223, "step": 3578 }, { "epoch": 1.3324801340298542, "grad_norm": 0.16439288854599, "learning_rate": 1.9512815954638266e-05, "loss": 1.2235, "step": 3579 }, { "epoch": 1.3328524391804633, "grad_norm": 0.17088323831558228, "learning_rate": 1.9512441173917415e-05, "loss": 1.2066, "step": 3580 }, { "epoch": 1.3332247443310723, "grad_norm": 0.1691887080669403, "learning_rate": 1.95120662526984e-05, "loss": 1.2274, "step": 3581 }, { "epoch": 1.3335970494816813, "grad_norm": 0.17315596342086792, "learning_rate": 1.9511691190986767e-05, "loss": 1.2285, "step": 3582 }, { "epoch": 1.3339693546322904, "grad_norm": 0.1639329493045807, "learning_rate": 1.9511315988788046e-05, "loss": 1.2236, "step": 3583 }, { "epoch": 1.3343416597828996, "grad_norm": 0.16537517309188843, "learning_rate": 1.951094064610779e-05, "loss": 1.2046, "step": 3584 }, { "epoch": 1.3347139649335087, "grad_norm": 0.173024520277977, "learning_rate": 1.9510565162951538e-05, "loss": 1.2142, "step": 3585 }, { "epoch": 1.3350862700841177, "grad_norm": 0.1649821549654007, "learning_rate": 1.9510189539324832e-05, "loss": 1.225, "step": 3586 }, { "epoch": 1.3354585752347268, "grad_norm": 0.1695072054862976, "learning_rate": 1.9509813775233227e-05, "loss": 1.2062, "step": 3587 }, { "epoch": 1.3358308803853358, "grad_norm": 0.16297145187854767, "learning_rate": 1.9509437870682268e-05, "loss": 1.2252, "step": 3588 }, { "epoch": 1.3362031855359449, "grad_norm": 0.1728506088256836, "learning_rate": 1.9509061825677508e-05, "loss": 1.2158, "step": 3589 }, { "epoch": 1.336575490686554, "grad_norm": 0.16616937518119812, "learning_rate": 1.95086856402245e-05, "loss": 1.2066, "step": 3590 }, { "epoch": 1.3369477958371632, "grad_norm": 0.16450245678424835, "learning_rate": 1.9508309314328803e-05, "loss": 1.2085, "step": 3591 }, { "epoch": 1.337320100987772, "grad_norm": 0.17758624255657196, "learning_rate": 1.9507932847995974e-05, "loss": 1.2121, "step": 3592 }, { "epoch": 1.3376924061383813, "grad_norm": 0.17107117176055908, "learning_rate": 1.9507556241231574e-05, "loss": 1.2223, "step": 3593 }, { "epoch": 1.3380647112889903, "grad_norm": 0.17417475581169128, "learning_rate": 1.9507179494041166e-05, "loss": 1.2306, "step": 3594 }, { "epoch": 1.3384370164395993, "grad_norm": 0.16709604859352112, "learning_rate": 1.9506802606430314e-05, "loss": 1.2262, "step": 3595 }, { "epoch": 1.3388093215902084, "grad_norm": 0.1753034144639969, "learning_rate": 1.950642557840458e-05, "loss": 1.2329, "step": 3596 }, { "epoch": 1.3391816267408174, "grad_norm": 0.16027675569057465, "learning_rate": 1.950604840996954e-05, "loss": 1.2315, "step": 3597 }, { "epoch": 1.3395539318914265, "grad_norm": 0.17138755321502686, "learning_rate": 1.950567110113076e-05, "loss": 1.2045, "step": 3598 }, { "epoch": 1.3399262370420355, "grad_norm": 0.1684652864933014, "learning_rate": 1.9505293651893817e-05, "loss": 1.2353, "step": 3599 }, { "epoch": 1.3402985421926448, "grad_norm": 0.178995743393898, "learning_rate": 1.9504916062264285e-05, "loss": 1.2311, "step": 3600 }, { "epoch": 1.3406708473432536, "grad_norm": 0.17695268988609314, "learning_rate": 1.950453833224773e-05, "loss": 1.2295, "step": 3601 }, { "epoch": 1.3410431524938629, "grad_norm": 0.16872268915176392, "learning_rate": 1.950416046184975e-05, "loss": 1.2149, "step": 3602 }, { "epoch": 1.341415457644472, "grad_norm": 0.1644962728023529, "learning_rate": 1.9503782451075912e-05, "loss": 1.2012, "step": 3603 }, { "epoch": 1.341787762795081, "grad_norm": 0.16995151340961456, "learning_rate": 1.9503404299931806e-05, "loss": 1.2115, "step": 3604 }, { "epoch": 1.34216006794569, "grad_norm": 0.1838231086730957, "learning_rate": 1.950302600842301e-05, "loss": 1.2096, "step": 3605 }, { "epoch": 1.342532373096299, "grad_norm": 0.1684899926185608, "learning_rate": 1.950264757655512e-05, "loss": 1.2167, "step": 3606 }, { "epoch": 1.342904678246908, "grad_norm": 0.17310784757137299, "learning_rate": 1.9502269004333722e-05, "loss": 1.2252, "step": 3607 }, { "epoch": 1.343276983397517, "grad_norm": 0.17696700990200043, "learning_rate": 1.950189029176441e-05, "loss": 1.2212, "step": 3608 }, { "epoch": 1.3436492885481264, "grad_norm": 0.16353021562099457, "learning_rate": 1.9501511438852767e-05, "loss": 1.2122, "step": 3609 }, { "epoch": 1.3440215936987352, "grad_norm": 0.1800825595855713, "learning_rate": 1.95011324456044e-05, "loss": 1.222, "step": 3610 }, { "epoch": 1.3443938988493445, "grad_norm": 0.17191185057163239, "learning_rate": 1.9500753312024904e-05, "loss": 1.2084, "step": 3611 }, { "epoch": 1.3447662039999535, "grad_norm": 0.16857829689979553, "learning_rate": 1.9500374038119877e-05, "loss": 1.219, "step": 3612 }, { "epoch": 1.3451385091505625, "grad_norm": 0.16424717009067535, "learning_rate": 1.949999462389492e-05, "loss": 1.2135, "step": 3613 }, { "epoch": 1.3455108143011716, "grad_norm": 0.1720798909664154, "learning_rate": 1.9499615069355644e-05, "loss": 1.2162, "step": 3614 }, { "epoch": 1.3458831194517806, "grad_norm": 0.1672721654176712, "learning_rate": 1.9499235374507646e-05, "loss": 1.2197, "step": 3615 }, { "epoch": 1.3462554246023897, "grad_norm": 0.17355121672153473, "learning_rate": 1.9498855539356534e-05, "loss": 1.2289, "step": 3616 }, { "epoch": 1.3466277297529987, "grad_norm": 0.17053773999214172, "learning_rate": 1.9498475563907927e-05, "loss": 1.2154, "step": 3617 }, { "epoch": 1.347000034903608, "grad_norm": 0.17740602791309357, "learning_rate": 1.9498095448167435e-05, "loss": 1.2198, "step": 3618 }, { "epoch": 1.3473723400542168, "grad_norm": 0.16917094588279724, "learning_rate": 1.949771519214066e-05, "loss": 1.2174, "step": 3619 }, { "epoch": 1.347744645204826, "grad_norm": 0.20187613368034363, "learning_rate": 1.9497334795833235e-05, "loss": 1.213, "step": 3620 }, { "epoch": 1.348116950355435, "grad_norm": 0.1762245148420334, "learning_rate": 1.949695425925077e-05, "loss": 1.1968, "step": 3621 }, { "epoch": 1.3484892555060441, "grad_norm": 0.17733825743198395, "learning_rate": 1.9496573582398884e-05, "loss": 1.2316, "step": 3622 }, { "epoch": 1.3488615606566532, "grad_norm": 0.16012229025363922, "learning_rate": 1.9496192765283205e-05, "loss": 1.2156, "step": 3623 }, { "epoch": 1.3492338658072622, "grad_norm": 0.17193962633609772, "learning_rate": 1.9495811807909353e-05, "loss": 1.229, "step": 3624 }, { "epoch": 1.3496061709578713, "grad_norm": 0.17869403958320618, "learning_rate": 1.9495430710282956e-05, "loss": 1.2101, "step": 3625 }, { "epoch": 1.3499784761084803, "grad_norm": 0.1690213680267334, "learning_rate": 1.9495049472409644e-05, "loss": 1.2194, "step": 3626 }, { "epoch": 1.3503507812590896, "grad_norm": 0.16633479297161102, "learning_rate": 1.9494668094295046e-05, "loss": 1.2069, "step": 3627 }, { "epoch": 1.3507230864096986, "grad_norm": 0.17453880608081818, "learning_rate": 1.94942865759448e-05, "loss": 1.2156, "step": 3628 }, { "epoch": 1.3510953915603077, "grad_norm": 0.176530122756958, "learning_rate": 1.9493904917364533e-05, "loss": 1.201, "step": 3629 }, { "epoch": 1.3514676967109167, "grad_norm": 0.1691957712173462, "learning_rate": 1.9493523118559888e-05, "loss": 1.2361, "step": 3630 }, { "epoch": 1.3518400018615258, "grad_norm": 0.16371867060661316, "learning_rate": 1.9493141179536504e-05, "loss": 1.2186, "step": 3631 }, { "epoch": 1.3522123070121348, "grad_norm": 0.17232230305671692, "learning_rate": 1.949275910030002e-05, "loss": 1.2164, "step": 3632 }, { "epoch": 1.3525846121627438, "grad_norm": 0.17034488916397095, "learning_rate": 1.9492376880856075e-05, "loss": 1.2146, "step": 3633 }, { "epoch": 1.3529569173133529, "grad_norm": 0.16732533276081085, "learning_rate": 1.9491994521210326e-05, "loss": 1.2249, "step": 3634 }, { "epoch": 1.353329222463962, "grad_norm": 0.16528116166591644, "learning_rate": 1.949161202136841e-05, "loss": 1.2163, "step": 3635 }, { "epoch": 1.3537015276145712, "grad_norm": 0.1649235486984253, "learning_rate": 1.9491229381335978e-05, "loss": 1.2159, "step": 3636 }, { "epoch": 1.3540738327651802, "grad_norm": 0.16926534473896027, "learning_rate": 1.9490846601118685e-05, "loss": 1.211, "step": 3637 }, { "epoch": 1.3544461379157893, "grad_norm": 0.17798763513565063, "learning_rate": 1.9490463680722183e-05, "loss": 1.2198, "step": 3638 }, { "epoch": 1.3548184430663983, "grad_norm": 0.1720110923051834, "learning_rate": 1.9490080620152133e-05, "loss": 1.2144, "step": 3639 }, { "epoch": 1.3551907482170074, "grad_norm": 0.16815277934074402, "learning_rate": 1.948969741941418e-05, "loss": 1.2294, "step": 3640 }, { "epoch": 1.3555630533676164, "grad_norm": 0.1696767359972, "learning_rate": 1.9489314078514e-05, "loss": 1.2161, "step": 3641 }, { "epoch": 1.3559353585182254, "grad_norm": 0.17275430262088776, "learning_rate": 1.9488930597457242e-05, "loss": 1.2219, "step": 3642 }, { "epoch": 1.3563076636688347, "grad_norm": 0.17057517170906067, "learning_rate": 1.9488546976249572e-05, "loss": 1.2106, "step": 3643 }, { "epoch": 1.3566799688194435, "grad_norm": 0.16843107342720032, "learning_rate": 1.9488163214896666e-05, "loss": 1.2274, "step": 3644 }, { "epoch": 1.3570522739700528, "grad_norm": 0.1690577119588852, "learning_rate": 1.948777931340418e-05, "loss": 1.2187, "step": 3645 }, { "epoch": 1.3574245791206618, "grad_norm": 0.17254535853862762, "learning_rate": 1.9487395271777787e-05, "loss": 1.2228, "step": 3646 }, { "epoch": 1.3577968842712709, "grad_norm": 0.17045117914676666, "learning_rate": 1.9487011090023165e-05, "loss": 1.195, "step": 3647 }, { "epoch": 1.35816918942188, "grad_norm": 0.17185580730438232, "learning_rate": 1.9486626768145986e-05, "loss": 1.2162, "step": 3648 }, { "epoch": 1.358541494572489, "grad_norm": 0.1752527505159378, "learning_rate": 1.948624230615192e-05, "loss": 1.2336, "step": 3649 }, { "epoch": 1.358913799723098, "grad_norm": 0.1720474511384964, "learning_rate": 1.9485857704046652e-05, "loss": 1.2268, "step": 3650 }, { "epoch": 1.359286104873707, "grad_norm": 0.1814902275800705, "learning_rate": 1.9485472961835865e-05, "loss": 1.2179, "step": 3651 }, { "epoch": 1.3596584100243163, "grad_norm": 0.16393816471099854, "learning_rate": 1.948508807952523e-05, "loss": 1.2179, "step": 3652 }, { "epoch": 1.3600307151749251, "grad_norm": 0.17267872393131256, "learning_rate": 1.9484703057120444e-05, "loss": 1.2152, "step": 3653 }, { "epoch": 1.3604030203255344, "grad_norm": 0.1774880588054657, "learning_rate": 1.948431789462719e-05, "loss": 1.2323, "step": 3654 }, { "epoch": 1.3607753254761434, "grad_norm": 0.17601603269577026, "learning_rate": 1.9483932592051154e-05, "loss": 1.2211, "step": 3655 }, { "epoch": 1.3611476306267525, "grad_norm": 0.17215831577777863, "learning_rate": 1.948354714939803e-05, "loss": 1.2171, "step": 3656 }, { "epoch": 1.3615199357773615, "grad_norm": 0.16573548316955566, "learning_rate": 1.948316156667351e-05, "loss": 1.2386, "step": 3657 }, { "epoch": 1.3618922409279706, "grad_norm": 0.17070265114307404, "learning_rate": 1.9482775843883287e-05, "loss": 1.2306, "step": 3658 }, { "epoch": 1.3622645460785796, "grad_norm": 0.17669783532619476, "learning_rate": 1.948238998103306e-05, "loss": 1.2186, "step": 3659 }, { "epoch": 1.3626368512291887, "grad_norm": 0.17972193658351898, "learning_rate": 1.948200397812853e-05, "loss": 1.2208, "step": 3660 }, { "epoch": 1.363009156379798, "grad_norm": 0.16259367763996124, "learning_rate": 1.9481617835175394e-05, "loss": 1.1995, "step": 3661 }, { "epoch": 1.3633814615304067, "grad_norm": 0.16560673713684082, "learning_rate": 1.948123155217936e-05, "loss": 1.2124, "step": 3662 }, { "epoch": 1.363753766681016, "grad_norm": 0.17198215425014496, "learning_rate": 1.948084512914613e-05, "loss": 1.223, "step": 3663 }, { "epoch": 1.364126071831625, "grad_norm": 0.16872192919254303, "learning_rate": 1.948045856608141e-05, "loss": 1.213, "step": 3664 }, { "epoch": 1.364498376982234, "grad_norm": 0.17418809235095978, "learning_rate": 1.9480071862990917e-05, "loss": 1.2059, "step": 3665 }, { "epoch": 1.3648706821328431, "grad_norm": 0.1725587695837021, "learning_rate": 1.9479685019880356e-05, "loss": 1.2234, "step": 3666 }, { "epoch": 1.3652429872834522, "grad_norm": 0.16984142363071442, "learning_rate": 1.947929803675544e-05, "loss": 1.2106, "step": 3667 }, { "epoch": 1.3656152924340612, "grad_norm": 0.1837463080883026, "learning_rate": 1.9478910913621888e-05, "loss": 1.2143, "step": 3668 }, { "epoch": 1.3659875975846703, "grad_norm": 0.1684904843568802, "learning_rate": 1.9478523650485416e-05, "loss": 1.2236, "step": 3669 }, { "epoch": 1.3663599027352795, "grad_norm": 0.1728762686252594, "learning_rate": 1.9478136247351745e-05, "loss": 1.205, "step": 3670 }, { "epoch": 1.3667322078858883, "grad_norm": 0.16816040873527527, "learning_rate": 1.9477748704226597e-05, "loss": 1.2231, "step": 3671 }, { "epoch": 1.3671045130364976, "grad_norm": 0.17051219940185547, "learning_rate": 1.9477361021115695e-05, "loss": 1.2277, "step": 3672 }, { "epoch": 1.3674768181871066, "grad_norm": 0.16704906523227692, "learning_rate": 1.9476973198024766e-05, "loss": 1.2166, "step": 3673 }, { "epoch": 1.3678491233377157, "grad_norm": 0.16872693598270416, "learning_rate": 1.9476585234959538e-05, "loss": 1.2136, "step": 3674 }, { "epoch": 1.3682214284883247, "grad_norm": 0.16978906095027924, "learning_rate": 1.947619713192574e-05, "loss": 1.209, "step": 3675 }, { "epoch": 1.3685937336389338, "grad_norm": 0.16473500430583954, "learning_rate": 1.9475808888929104e-05, "loss": 1.2284, "step": 3676 }, { "epoch": 1.3689660387895428, "grad_norm": 0.16459238529205322, "learning_rate": 1.9475420505975366e-05, "loss": 1.2103, "step": 3677 }, { "epoch": 1.3693383439401519, "grad_norm": 0.1704069972038269, "learning_rate": 1.9475031983070264e-05, "loss": 1.2207, "step": 3678 }, { "epoch": 1.3697106490907611, "grad_norm": 0.18187867105007172, "learning_rate": 1.9474643320219534e-05, "loss": 1.1893, "step": 3679 }, { "epoch": 1.3700829542413702, "grad_norm": 0.1664721667766571, "learning_rate": 1.9474254517428912e-05, "loss": 1.23, "step": 3680 }, { "epoch": 1.3704552593919792, "grad_norm": 0.17463192343711853, "learning_rate": 1.947386557470415e-05, "loss": 1.2172, "step": 3681 }, { "epoch": 1.3708275645425883, "grad_norm": 0.17818517982959747, "learning_rate": 1.9473476492050984e-05, "loss": 1.2206, "step": 3682 }, { "epoch": 1.3711998696931973, "grad_norm": 0.1713617891073227, "learning_rate": 1.947308726947517e-05, "loss": 1.2358, "step": 3683 }, { "epoch": 1.3715721748438063, "grad_norm": 0.16868345439434052, "learning_rate": 1.947269790698245e-05, "loss": 1.2138, "step": 3684 }, { "epoch": 1.3719444799944154, "grad_norm": 0.17152869701385498, "learning_rate": 1.9472308404578574e-05, "loss": 1.2142, "step": 3685 }, { "epoch": 1.3723167851450244, "grad_norm": 0.16501954197883606, "learning_rate": 1.9471918762269298e-05, "loss": 1.2098, "step": 3686 }, { "epoch": 1.3726890902956335, "grad_norm": 0.1698669195175171, "learning_rate": 1.9471528980060378e-05, "loss": 1.2238, "step": 3687 }, { "epoch": 1.3730613954462427, "grad_norm": 0.17526181042194366, "learning_rate": 1.9471139057957566e-05, "loss": 1.226, "step": 3688 }, { "epoch": 1.3734337005968518, "grad_norm": 0.1682656854391098, "learning_rate": 1.947074899596663e-05, "loss": 1.2115, "step": 3689 }, { "epoch": 1.3738060057474608, "grad_norm": 0.16698940098285675, "learning_rate": 1.947035879409332e-05, "loss": 1.1977, "step": 3690 }, { "epoch": 1.3741783108980699, "grad_norm": 0.17004626989364624, "learning_rate": 1.9469968452343408e-05, "loss": 1.2197, "step": 3691 }, { "epoch": 1.374550616048679, "grad_norm": 0.17727632820606232, "learning_rate": 1.9469577970722656e-05, "loss": 1.2242, "step": 3692 }, { "epoch": 1.374922921199288, "grad_norm": 0.16782982647418976, "learning_rate": 1.9469187349236832e-05, "loss": 1.2111, "step": 3693 }, { "epoch": 1.375295226349897, "grad_norm": 0.17160236835479736, "learning_rate": 1.9468796587891706e-05, "loss": 1.2168, "step": 3694 }, { "epoch": 1.375667531500506, "grad_norm": 0.16954834759235382, "learning_rate": 1.9468405686693044e-05, "loss": 1.2077, "step": 3695 }, { "epoch": 1.376039836651115, "grad_norm": 0.168158158659935, "learning_rate": 1.9468014645646628e-05, "loss": 1.2249, "step": 3696 }, { "epoch": 1.3764121418017243, "grad_norm": 0.16404445469379425, "learning_rate": 1.946762346475823e-05, "loss": 1.2158, "step": 3697 }, { "epoch": 1.3767844469523334, "grad_norm": 0.1699007749557495, "learning_rate": 1.946723214403363e-05, "loss": 1.2043, "step": 3698 }, { "epoch": 1.3771567521029424, "grad_norm": 0.17556270956993103, "learning_rate": 1.9466840683478597e-05, "loss": 1.2167, "step": 3699 }, { "epoch": 1.3775290572535515, "grad_norm": 0.1715071052312851, "learning_rate": 1.9466449083098927e-05, "loss": 1.2228, "step": 3700 }, { "epoch": 1.3779013624041605, "grad_norm": 0.1757829487323761, "learning_rate": 1.94660573429004e-05, "loss": 1.232, "step": 3701 }, { "epoch": 1.3782736675547695, "grad_norm": 0.1744741052389145, "learning_rate": 1.9465665462888798e-05, "loss": 1.2026, "step": 3702 }, { "epoch": 1.3786459727053786, "grad_norm": 0.17323531210422516, "learning_rate": 1.9465273443069908e-05, "loss": 1.2306, "step": 3703 }, { "epoch": 1.3790182778559879, "grad_norm": 0.16859839856624603, "learning_rate": 1.9464881283449525e-05, "loss": 1.2169, "step": 3704 }, { "epoch": 1.3793905830065967, "grad_norm": 0.1696728765964508, "learning_rate": 1.9464488984033442e-05, "loss": 1.2084, "step": 3705 }, { "epoch": 1.379762888157206, "grad_norm": 0.16950103640556335, "learning_rate": 1.946409654482745e-05, "loss": 1.2233, "step": 3706 }, { "epoch": 1.380135193307815, "grad_norm": 0.17010366916656494, "learning_rate": 1.9463703965837344e-05, "loss": 1.2042, "step": 3707 }, { "epoch": 1.380507498458424, "grad_norm": 0.16800427436828613, "learning_rate": 1.9463311247068927e-05, "loss": 1.2012, "step": 3708 }, { "epoch": 1.380879803609033, "grad_norm": 0.16539591550827026, "learning_rate": 1.9462918388527994e-05, "loss": 1.2236, "step": 3709 }, { "epoch": 1.381252108759642, "grad_norm": 0.17388111352920532, "learning_rate": 1.946252539022035e-05, "loss": 1.2115, "step": 3710 }, { "epoch": 1.3816244139102511, "grad_norm": 0.17521414160728455, "learning_rate": 1.94621322521518e-05, "loss": 1.2319, "step": 3711 }, { "epoch": 1.3819967190608602, "grad_norm": 0.16813969612121582, "learning_rate": 1.9461738974328156e-05, "loss": 1.2158, "step": 3712 }, { "epoch": 1.3823690242114695, "grad_norm": 0.16460061073303223, "learning_rate": 1.9461345556755216e-05, "loss": 1.2084, "step": 3713 }, { "epoch": 1.3827413293620783, "grad_norm": 0.18938884139060974, "learning_rate": 1.9460951999438798e-05, "loss": 1.2169, "step": 3714 }, { "epoch": 1.3831136345126875, "grad_norm": 0.19839580357074738, "learning_rate": 1.946055830238471e-05, "loss": 1.221, "step": 3715 }, { "epoch": 1.3834859396632966, "grad_norm": 0.1847252994775772, "learning_rate": 1.9460164465598773e-05, "loss": 1.21, "step": 3716 }, { "epoch": 1.3838582448139056, "grad_norm": 0.1856468766927719, "learning_rate": 1.94597704890868e-05, "loss": 1.222, "step": 3717 }, { "epoch": 1.3842305499645147, "grad_norm": 0.1739731878042221, "learning_rate": 1.945937637285461e-05, "loss": 1.1989, "step": 3718 }, { "epoch": 1.3846028551151237, "grad_norm": 0.16499097645282745, "learning_rate": 1.945898211690802e-05, "loss": 1.2297, "step": 3719 }, { "epoch": 1.3849751602657328, "grad_norm": 0.17512036859989166, "learning_rate": 1.9458587721252862e-05, "loss": 1.2238, "step": 3720 }, { "epoch": 1.3853474654163418, "grad_norm": 0.1747319996356964, "learning_rate": 1.9458193185894957e-05, "loss": 1.214, "step": 3721 }, { "epoch": 1.385719770566951, "grad_norm": 0.17503361403942108, "learning_rate": 1.9457798510840132e-05, "loss": 1.225, "step": 3722 }, { "epoch": 1.3860920757175599, "grad_norm": 0.16246047616004944, "learning_rate": 1.9457403696094216e-05, "loss": 1.2121, "step": 3723 }, { "epoch": 1.3864643808681691, "grad_norm": 0.17145591974258423, "learning_rate": 1.945700874166304e-05, "loss": 1.1996, "step": 3724 }, { "epoch": 1.3868366860187782, "grad_norm": 0.17188484966754913, "learning_rate": 1.945661364755244e-05, "loss": 1.217, "step": 3725 }, { "epoch": 1.3872089911693872, "grad_norm": 0.17199209332466125, "learning_rate": 1.945621841376825e-05, "loss": 1.2215, "step": 3726 }, { "epoch": 1.3875812963199963, "grad_norm": 0.17529122531414032, "learning_rate": 1.945582304031631e-05, "loss": 1.2076, "step": 3727 }, { "epoch": 1.3879536014706053, "grad_norm": 0.16724155843257904, "learning_rate": 1.945542752720245e-05, "loss": 1.2258, "step": 3728 }, { "epoch": 1.3883259066212144, "grad_norm": 0.16579319536685944, "learning_rate": 1.9455031874432526e-05, "loss": 1.2206, "step": 3729 }, { "epoch": 1.3886982117718234, "grad_norm": 0.17617808282375336, "learning_rate": 1.9454636082012373e-05, "loss": 1.2258, "step": 3730 }, { "epoch": 1.3890705169224327, "grad_norm": 0.17041444778442383, "learning_rate": 1.9454240149947834e-05, "loss": 1.2164, "step": 3731 }, { "epoch": 1.3894428220730415, "grad_norm": 0.17237654328346252, "learning_rate": 1.9453844078244767e-05, "loss": 1.2238, "step": 3732 }, { "epoch": 1.3898151272236507, "grad_norm": 0.17883872985839844, "learning_rate": 1.9453447866909013e-05, "loss": 1.2267, "step": 3733 }, { "epoch": 1.3901874323742598, "grad_norm": 0.1674152910709381, "learning_rate": 1.9453051515946428e-05, "loss": 1.2181, "step": 3734 }, { "epoch": 1.3905597375248688, "grad_norm": 0.16891363263130188, "learning_rate": 1.9452655025362865e-05, "loss": 1.2301, "step": 3735 }, { "epoch": 1.3909320426754779, "grad_norm": 0.16902455687522888, "learning_rate": 1.9452258395164182e-05, "loss": 1.2097, "step": 3736 }, { "epoch": 1.391304347826087, "grad_norm": 0.1644563525915146, "learning_rate": 1.9451861625356235e-05, "loss": 1.2291, "step": 3737 }, { "epoch": 1.391676652976696, "grad_norm": 0.1637285202741623, "learning_rate": 1.9451464715944885e-05, "loss": 1.2201, "step": 3738 }, { "epoch": 1.392048958127305, "grad_norm": 0.1716940551996231, "learning_rate": 1.945106766693599e-05, "loss": 1.2151, "step": 3739 }, { "epoch": 1.3924212632779143, "grad_norm": 0.17560310661792755, "learning_rate": 1.9450670478335424e-05, "loss": 1.2121, "step": 3740 }, { "epoch": 1.3927935684285233, "grad_norm": 0.164300337433815, "learning_rate": 1.9450273150149047e-05, "loss": 1.2194, "step": 3741 }, { "epoch": 1.3931658735791324, "grad_norm": 0.17114891111850739, "learning_rate": 1.944987568238273e-05, "loss": 1.201, "step": 3742 }, { "epoch": 1.3935381787297414, "grad_norm": 0.1734459102153778, "learning_rate": 1.944947807504234e-05, "loss": 1.221, "step": 3743 }, { "epoch": 1.3939104838803504, "grad_norm": 0.16712883114814758, "learning_rate": 1.944908032813375e-05, "loss": 1.2168, "step": 3744 }, { "epoch": 1.3942827890309595, "grad_norm": 0.17574042081832886, "learning_rate": 1.944868244166284e-05, "loss": 1.2226, "step": 3745 }, { "epoch": 1.3946550941815685, "grad_norm": 0.167069673538208, "learning_rate": 1.944828441563548e-05, "loss": 1.1964, "step": 3746 }, { "epoch": 1.3950273993321776, "grad_norm": 0.1734757423400879, "learning_rate": 1.9447886250057556e-05, "loss": 1.2265, "step": 3747 }, { "epoch": 1.3953997044827866, "grad_norm": 0.1713225245475769, "learning_rate": 1.944748794493494e-05, "loss": 1.1987, "step": 3748 }, { "epoch": 1.3957720096333959, "grad_norm": 0.18104128539562225, "learning_rate": 1.9447089500273524e-05, "loss": 1.212, "step": 3749 }, { "epoch": 1.396144314784005, "grad_norm": 0.17931242287158966, "learning_rate": 1.944669091607919e-05, "loss": 1.218, "step": 3750 }, { "epoch": 1.396516619934614, "grad_norm": 0.1705280989408493, "learning_rate": 1.944629219235782e-05, "loss": 1.2186, "step": 3751 }, { "epoch": 1.396888925085223, "grad_norm": 0.1851729452610016, "learning_rate": 1.944589332911531e-05, "loss": 1.2086, "step": 3752 }, { "epoch": 1.397261230235832, "grad_norm": 0.17898760735988617, "learning_rate": 1.9445494326357548e-05, "loss": 1.2299, "step": 3753 }, { "epoch": 1.397633535386441, "grad_norm": 0.17653051018714905, "learning_rate": 1.9445095184090428e-05, "loss": 1.2135, "step": 3754 }, { "epoch": 1.3980058405370501, "grad_norm": 0.17303214967250824, "learning_rate": 1.9444695902319845e-05, "loss": 1.2214, "step": 3755 }, { "epoch": 1.3983781456876592, "grad_norm": 0.16854806244373322, "learning_rate": 1.9444296481051697e-05, "loss": 1.1992, "step": 3756 }, { "epoch": 1.3987504508382682, "grad_norm": 0.17003561556339264, "learning_rate": 1.944389692029188e-05, "loss": 1.2143, "step": 3757 }, { "epoch": 1.3991227559888775, "grad_norm": 0.17145571112632751, "learning_rate": 1.9443497220046298e-05, "loss": 1.2136, "step": 3758 }, { "epoch": 1.3994950611394865, "grad_norm": 0.1773233562707901, "learning_rate": 1.9443097380320855e-05, "loss": 1.1977, "step": 3759 }, { "epoch": 1.3998673662900956, "grad_norm": 0.1766585260629654, "learning_rate": 1.944269740112146e-05, "loss": 1.2133, "step": 3760 }, { "epoch": 1.4002396714407046, "grad_norm": 0.18044057488441467, "learning_rate": 1.9442297282454012e-05, "loss": 1.206, "step": 3761 }, { "epoch": 1.4006119765913136, "grad_norm": 0.1800515502691269, "learning_rate": 1.9441897024324428e-05, "loss": 1.2115, "step": 3762 }, { "epoch": 1.4009842817419227, "grad_norm": 0.17421506345272064, "learning_rate": 1.944149662673862e-05, "loss": 1.2186, "step": 3763 }, { "epoch": 1.4013565868925317, "grad_norm": 0.17194457352161407, "learning_rate": 1.9441096089702495e-05, "loss": 1.2109, "step": 3764 }, { "epoch": 1.401728892043141, "grad_norm": 0.1739378422498703, "learning_rate": 1.9440695413221975e-05, "loss": 1.2066, "step": 3765 }, { "epoch": 1.4021011971937498, "grad_norm": 0.17094124853610992, "learning_rate": 1.9440294597302977e-05, "loss": 1.217, "step": 3766 }, { "epoch": 1.402473502344359, "grad_norm": 0.16742725670337677, "learning_rate": 1.9439893641951418e-05, "loss": 1.2153, "step": 3767 }, { "epoch": 1.4028458074949681, "grad_norm": 0.17719261348247528, "learning_rate": 1.9439492547173225e-05, "loss": 1.2097, "step": 3768 }, { "epoch": 1.4032181126455772, "grad_norm": 0.16668783128261566, "learning_rate": 1.9439091312974317e-05, "loss": 1.2193, "step": 3769 }, { "epoch": 1.4035904177961862, "grad_norm": 0.17551898956298828, "learning_rate": 1.9438689939360627e-05, "loss": 1.2115, "step": 3770 }, { "epoch": 1.4039627229467952, "grad_norm": 0.18291716277599335, "learning_rate": 1.9438288426338073e-05, "loss": 1.2085, "step": 3771 }, { "epoch": 1.4043350280974043, "grad_norm": 0.16662460565567017, "learning_rate": 1.9437886773912595e-05, "loss": 1.2078, "step": 3772 }, { "epoch": 1.4047073332480133, "grad_norm": 0.17178334295749664, "learning_rate": 1.9437484982090122e-05, "loss": 1.2195, "step": 3773 }, { "epoch": 1.4050796383986226, "grad_norm": 0.17247992753982544, "learning_rate": 1.9437083050876588e-05, "loss": 1.2026, "step": 3774 }, { "epoch": 1.4054519435492314, "grad_norm": 0.17090709507465363, "learning_rate": 1.943668098027793e-05, "loss": 1.2013, "step": 3775 }, { "epoch": 1.4058242486998407, "grad_norm": 0.1711760014295578, "learning_rate": 1.9436278770300082e-05, "loss": 1.2116, "step": 3776 }, { "epoch": 1.4061965538504497, "grad_norm": 0.17857705056667328, "learning_rate": 1.943587642094899e-05, "loss": 1.2237, "step": 3777 }, { "epoch": 1.4065688590010588, "grad_norm": 0.18631206452846527, "learning_rate": 1.9435473932230597e-05, "loss": 1.2378, "step": 3778 }, { "epoch": 1.4069411641516678, "grad_norm": 0.209711492061615, "learning_rate": 1.9435071304150846e-05, "loss": 1.2207, "step": 3779 }, { "epoch": 1.4073134693022769, "grad_norm": 0.2716389298439026, "learning_rate": 1.9434668536715686e-05, "loss": 1.2311, "step": 3780 }, { "epoch": 1.407685774452886, "grad_norm": 0.2101835310459137, "learning_rate": 1.9434265629931063e-05, "loss": 1.2202, "step": 3781 }, { "epoch": 1.408058079603495, "grad_norm": 0.1864628791809082, "learning_rate": 1.9433862583802927e-05, "loss": 1.2012, "step": 3782 }, { "epoch": 1.4084303847541042, "grad_norm": 0.1653883457183838, "learning_rate": 1.9433459398337234e-05, "loss": 1.2191, "step": 3783 }, { "epoch": 1.408802689904713, "grad_norm": 0.17915792763233185, "learning_rate": 1.9433056073539934e-05, "loss": 1.2218, "step": 3784 }, { "epoch": 1.4091749950553223, "grad_norm": 0.20036007463932037, "learning_rate": 1.9432652609416993e-05, "loss": 1.2227, "step": 3785 }, { "epoch": 1.4095473002059313, "grad_norm": 0.18710802495479584, "learning_rate": 1.943224900597436e-05, "loss": 1.2126, "step": 3786 }, { "epoch": 1.4099196053565404, "grad_norm": 0.18362727761268616, "learning_rate": 1.9431845263218005e-05, "loss": 1.219, "step": 3787 }, { "epoch": 1.4102919105071494, "grad_norm": 0.18792852759361267, "learning_rate": 1.943144138115389e-05, "loss": 1.2113, "step": 3788 }, { "epoch": 1.4106642156577585, "grad_norm": 0.1898011863231659, "learning_rate": 1.943103735978797e-05, "loss": 1.2172, "step": 3789 }, { "epoch": 1.4110365208083675, "grad_norm": 0.1731170117855072, "learning_rate": 1.9430633199126225e-05, "loss": 1.2306, "step": 3790 }, { "epoch": 1.4114088259589765, "grad_norm": 0.1677362322807312, "learning_rate": 1.9430228899174617e-05, "loss": 1.2239, "step": 3791 }, { "epoch": 1.4117811311095858, "grad_norm": 0.16974781453609467, "learning_rate": 1.9429824459939125e-05, "loss": 1.2035, "step": 3792 }, { "epoch": 1.4121534362601946, "grad_norm": 0.19337321817874908, "learning_rate": 1.9429419881425713e-05, "loss": 1.2091, "step": 3793 }, { "epoch": 1.412525741410804, "grad_norm": 0.1791197508573532, "learning_rate": 1.9429015163640363e-05, "loss": 1.2148, "step": 3794 }, { "epoch": 1.412898046561413, "grad_norm": 0.17587502300739288, "learning_rate": 1.9428610306589047e-05, "loss": 1.2287, "step": 3795 }, { "epoch": 1.413270351712022, "grad_norm": 0.17841365933418274, "learning_rate": 1.9428205310277752e-05, "loss": 1.2108, "step": 3796 }, { "epoch": 1.413642656862631, "grad_norm": 0.17350056767463684, "learning_rate": 1.9427800174712455e-05, "loss": 1.2035, "step": 3797 }, { "epoch": 1.41401496201324, "grad_norm": 0.1747625321149826, "learning_rate": 1.942739489989914e-05, "loss": 1.2333, "step": 3798 }, { "epoch": 1.414387267163849, "grad_norm": 0.17848855257034302, "learning_rate": 1.9426989485843796e-05, "loss": 1.2028, "step": 3799 }, { "epoch": 1.4147595723144581, "grad_norm": 0.17129814624786377, "learning_rate": 1.942658393255241e-05, "loss": 1.228, "step": 3800 }, { "epoch": 1.4151318774650674, "grad_norm": 0.1709514707326889, "learning_rate": 1.942617824003097e-05, "loss": 1.2262, "step": 3801 }, { "epoch": 1.4155041826156765, "grad_norm": 0.16819478571414948, "learning_rate": 1.942577240828547e-05, "loss": 1.1978, "step": 3802 }, { "epoch": 1.4158764877662855, "grad_norm": 0.17637069523334503, "learning_rate": 1.94253664373219e-05, "loss": 1.2166, "step": 3803 }, { "epoch": 1.4162487929168945, "grad_norm": 0.1750773936510086, "learning_rate": 1.942496032714626e-05, "loss": 1.2239, "step": 3804 }, { "epoch": 1.4166210980675036, "grad_norm": 0.16642986238002777, "learning_rate": 1.9424554077764548e-05, "loss": 1.2061, "step": 3805 }, { "epoch": 1.4169934032181126, "grad_norm": 0.16567951440811157, "learning_rate": 1.9424147689182765e-05, "loss": 1.2207, "step": 3806 }, { "epoch": 1.4173657083687217, "grad_norm": 0.17164906859397888, "learning_rate": 1.942374116140691e-05, "loss": 1.2136, "step": 3807 }, { "epoch": 1.4177380135193307, "grad_norm": 0.1757207065820694, "learning_rate": 1.942333449444299e-05, "loss": 1.2229, "step": 3808 }, { "epoch": 1.4181103186699398, "grad_norm": 0.1720336526632309, "learning_rate": 1.9422927688297012e-05, "loss": 1.2173, "step": 3809 }, { "epoch": 1.418482623820549, "grad_norm": 0.16701281070709229, "learning_rate": 1.9422520742974984e-05, "loss": 1.2145, "step": 3810 }, { "epoch": 1.418854928971158, "grad_norm": 0.16450679302215576, "learning_rate": 1.9422113658482912e-05, "loss": 1.2042, "step": 3811 }, { "epoch": 1.419227234121767, "grad_norm": 0.16200435161590576, "learning_rate": 1.942170643482682e-05, "loss": 1.2152, "step": 3812 }, { "epoch": 1.4195995392723761, "grad_norm": 0.1700204312801361, "learning_rate": 1.9421299072012705e-05, "loss": 1.2209, "step": 3813 }, { "epoch": 1.4199718444229852, "grad_norm": 0.1691514402627945, "learning_rate": 1.94208915700466e-05, "loss": 1.2218, "step": 3814 }, { "epoch": 1.4203441495735942, "grad_norm": 0.17180638015270233, "learning_rate": 1.942048392893452e-05, "loss": 1.2112, "step": 3815 }, { "epoch": 1.4207164547242033, "grad_norm": 0.17067848145961761, "learning_rate": 1.942007614868248e-05, "loss": 1.2182, "step": 3816 }, { "epoch": 1.4210887598748123, "grad_norm": 0.17079667747020721, "learning_rate": 1.9419668229296507e-05, "loss": 1.2248, "step": 3817 }, { "epoch": 1.4214610650254214, "grad_norm": 0.16801148653030396, "learning_rate": 1.9419260170782624e-05, "loss": 1.217, "step": 3818 }, { "epoch": 1.4218333701760306, "grad_norm": 0.16779641807079315, "learning_rate": 1.9418851973146864e-05, "loss": 1.2039, "step": 3819 }, { "epoch": 1.4222056753266397, "grad_norm": 0.17368653416633606, "learning_rate": 1.941844363639525e-05, "loss": 1.2103, "step": 3820 }, { "epoch": 1.4225779804772487, "grad_norm": 0.1701452136039734, "learning_rate": 1.9418035160533813e-05, "loss": 1.2069, "step": 3821 }, { "epoch": 1.4229502856278577, "grad_norm": 0.17342039942741394, "learning_rate": 1.9417626545568588e-05, "loss": 1.222, "step": 3822 }, { "epoch": 1.4233225907784668, "grad_norm": 0.173239603638649, "learning_rate": 1.941721779150561e-05, "loss": 1.2248, "step": 3823 }, { "epoch": 1.4236948959290758, "grad_norm": 0.16423793137073517, "learning_rate": 1.9416808898350915e-05, "loss": 1.2084, "step": 3824 }, { "epoch": 1.4240672010796849, "grad_norm": 0.16744939982891083, "learning_rate": 1.9416399866110545e-05, "loss": 1.2066, "step": 3825 }, { "epoch": 1.4244395062302941, "grad_norm": 0.17226476967334747, "learning_rate": 1.941599069479054e-05, "loss": 1.2074, "step": 3826 }, { "epoch": 1.424811811380903, "grad_norm": 0.16691239178180695, "learning_rate": 1.9415581384396944e-05, "loss": 1.2157, "step": 3827 }, { "epoch": 1.4251841165315122, "grad_norm": 0.16759438812732697, "learning_rate": 1.9415171934935798e-05, "loss": 1.2214, "step": 3828 }, { "epoch": 1.4255564216821213, "grad_norm": 0.1606573760509491, "learning_rate": 1.9414762346413157e-05, "loss": 1.202, "step": 3829 }, { "epoch": 1.4259287268327303, "grad_norm": 0.16710183024406433, "learning_rate": 1.9414352618835065e-05, "loss": 1.226, "step": 3830 }, { "epoch": 1.4263010319833394, "grad_norm": 0.16753865778446198, "learning_rate": 1.9413942752207577e-05, "loss": 1.2131, "step": 3831 }, { "epoch": 1.4266733371339484, "grad_norm": 0.17130768299102783, "learning_rate": 1.9413532746536744e-05, "loss": 1.2127, "step": 3832 }, { "epoch": 1.4270456422845574, "grad_norm": 0.1705525517463684, "learning_rate": 1.9413122601828624e-05, "loss": 1.2116, "step": 3833 }, { "epoch": 1.4274179474351665, "grad_norm": 0.16685357689857483, "learning_rate": 1.941271231808928e-05, "loss": 1.2101, "step": 3834 }, { "epoch": 1.4277902525857757, "grad_norm": 0.16948597133159637, "learning_rate": 1.9412301895324755e-05, "loss": 1.2146, "step": 3835 }, { "epoch": 1.4281625577363846, "grad_norm": 0.16796061396598816, "learning_rate": 1.941189133354113e-05, "loss": 1.2168, "step": 3836 }, { "epoch": 1.4285348628869938, "grad_norm": 0.1637044996023178, "learning_rate": 1.9411480632744454e-05, "loss": 1.2151, "step": 3837 }, { "epoch": 1.4289071680376029, "grad_norm": 0.16353152692317963, "learning_rate": 1.9411069792940803e-05, "loss": 1.2055, "step": 3838 }, { "epoch": 1.429279473188212, "grad_norm": 0.17246000468730927, "learning_rate": 1.9410658814136243e-05, "loss": 1.2039, "step": 3839 }, { "epoch": 1.429651778338821, "grad_norm": 0.17234675586223602, "learning_rate": 1.9410247696336842e-05, "loss": 1.2052, "step": 3840 }, { "epoch": 1.43002408348943, "grad_norm": 0.1646290272474289, "learning_rate": 1.940983643954867e-05, "loss": 1.203, "step": 3841 }, { "epoch": 1.430396388640039, "grad_norm": 0.17096149921417236, "learning_rate": 1.9409425043777806e-05, "loss": 1.2062, "step": 3842 }, { "epoch": 1.430768693790648, "grad_norm": 0.16843508183956146, "learning_rate": 1.9409013509030327e-05, "loss": 1.2088, "step": 3843 }, { "epoch": 1.4311409989412573, "grad_norm": 0.17243371903896332, "learning_rate": 1.940860183531231e-05, "loss": 1.2183, "step": 3844 }, { "epoch": 1.4315133040918662, "grad_norm": 0.16187606751918793, "learning_rate": 1.940819002262983e-05, "loss": 1.207, "step": 3845 }, { "epoch": 1.4318856092424754, "grad_norm": 0.17250527441501617, "learning_rate": 1.9407778070988978e-05, "loss": 1.2233, "step": 3846 }, { "epoch": 1.4322579143930845, "grad_norm": 0.17004723846912384, "learning_rate": 1.9407365980395833e-05, "loss": 1.2143, "step": 3847 }, { "epoch": 1.4326302195436935, "grad_norm": 0.16374097764492035, "learning_rate": 1.940695375085648e-05, "loss": 1.2106, "step": 3848 }, { "epoch": 1.4330025246943026, "grad_norm": 0.17270928621292114, "learning_rate": 1.9406541382377012e-05, "loss": 1.2206, "step": 3849 }, { "epoch": 1.4333748298449116, "grad_norm": 0.17662329971790314, "learning_rate": 1.940612887496352e-05, "loss": 1.2189, "step": 3850 }, { "epoch": 1.4337471349955206, "grad_norm": 0.1634707897901535, "learning_rate": 1.9405716228622094e-05, "loss": 1.2006, "step": 3851 }, { "epoch": 1.4341194401461297, "grad_norm": 0.16489939391613007, "learning_rate": 1.9405303443358827e-05, "loss": 1.2142, "step": 3852 }, { "epoch": 1.434491745296739, "grad_norm": 0.17095866799354553, "learning_rate": 1.940489051917982e-05, "loss": 1.2122, "step": 3853 }, { "epoch": 1.4348640504473478, "grad_norm": 0.16927289962768555, "learning_rate": 1.940447745609117e-05, "loss": 1.2124, "step": 3854 }, { "epoch": 1.435236355597957, "grad_norm": 0.17228350043296814, "learning_rate": 1.940406425409898e-05, "loss": 1.2221, "step": 3855 }, { "epoch": 1.435608660748566, "grad_norm": 0.1686760038137436, "learning_rate": 1.940365091320935e-05, "loss": 1.2281, "step": 3856 }, { "epoch": 1.4359809658991751, "grad_norm": 0.16984900832176208, "learning_rate": 1.9403237433428384e-05, "loss": 1.2348, "step": 3857 }, { "epoch": 1.4363532710497842, "grad_norm": 0.16201676428318024, "learning_rate": 1.9402823814762194e-05, "loss": 1.2072, "step": 3858 }, { "epoch": 1.4367255762003932, "grad_norm": 0.17006635665893555, "learning_rate": 1.9402410057216886e-05, "loss": 1.2149, "step": 3859 }, { "epoch": 1.4370978813510022, "grad_norm": 0.17508037388324738, "learning_rate": 1.9401996160798574e-05, "loss": 1.2096, "step": 3860 }, { "epoch": 1.4374701865016113, "grad_norm": 0.1723199486732483, "learning_rate": 1.9401582125513364e-05, "loss": 1.2175, "step": 3861 }, { "epoch": 1.4378424916522206, "grad_norm": 0.16605544090270996, "learning_rate": 1.9401167951367375e-05, "loss": 1.2092, "step": 3862 }, { "epoch": 1.4382147968028296, "grad_norm": 0.16768351197242737, "learning_rate": 1.940075363836673e-05, "loss": 1.229, "step": 3863 }, { "epoch": 1.4385871019534386, "grad_norm": 0.17820565402507782, "learning_rate": 1.9400339186517544e-05, "loss": 1.2276, "step": 3864 }, { "epoch": 1.4389594071040477, "grad_norm": 0.17079061269760132, "learning_rate": 1.9399924595825936e-05, "loss": 1.2158, "step": 3865 }, { "epoch": 1.4393317122546567, "grad_norm": 0.16866520047187805, "learning_rate": 1.939950986629803e-05, "loss": 1.1979, "step": 3866 }, { "epoch": 1.4397040174052658, "grad_norm": 0.16329576075077057, "learning_rate": 1.9399094997939957e-05, "loss": 1.2184, "step": 3867 }, { "epoch": 1.4400763225558748, "grad_norm": 0.17052018642425537, "learning_rate": 1.9398679990757837e-05, "loss": 1.2144, "step": 3868 }, { "epoch": 1.4404486277064839, "grad_norm": 0.16311021149158478, "learning_rate": 1.9398264844757805e-05, "loss": 1.206, "step": 3869 }, { "epoch": 1.440820932857093, "grad_norm": 0.17270159721374512, "learning_rate": 1.9397849559945993e-05, "loss": 1.2159, "step": 3870 }, { "epoch": 1.4411932380077022, "grad_norm": 0.17571993172168732, "learning_rate": 1.939743413632853e-05, "loss": 1.202, "step": 3871 }, { "epoch": 1.4415655431583112, "grad_norm": 0.16022472083568573, "learning_rate": 1.9397018573911558e-05, "loss": 1.2048, "step": 3872 }, { "epoch": 1.4419378483089202, "grad_norm": 0.1789269596338272, "learning_rate": 1.9396602872701205e-05, "loss": 1.2145, "step": 3873 }, { "epoch": 1.4423101534595293, "grad_norm": 0.18328560888767242, "learning_rate": 1.9396187032703624e-05, "loss": 1.2088, "step": 3874 }, { "epoch": 1.4426824586101383, "grad_norm": 0.16325511038303375, "learning_rate": 1.939577105392495e-05, "loss": 1.2033, "step": 3875 }, { "epoch": 1.4430547637607474, "grad_norm": 0.16945096850395203, "learning_rate": 1.9395354936371323e-05, "loss": 1.2225, "step": 3876 }, { "epoch": 1.4434270689113564, "grad_norm": 0.16546650230884552, "learning_rate": 1.9394938680048893e-05, "loss": 1.2079, "step": 3877 }, { "epoch": 1.4437993740619655, "grad_norm": 0.1665334403514862, "learning_rate": 1.9394522284963814e-05, "loss": 1.2367, "step": 3878 }, { "epoch": 1.4441716792125745, "grad_norm": 0.1650296449661255, "learning_rate": 1.9394105751122226e-05, "loss": 1.2172, "step": 3879 }, { "epoch": 1.4445439843631838, "grad_norm": 0.16569402813911438, "learning_rate": 1.9393689078530285e-05, "loss": 1.2179, "step": 3880 }, { "epoch": 1.4449162895137928, "grad_norm": 0.1602257788181305, "learning_rate": 1.9393272267194144e-05, "loss": 1.2111, "step": 3881 }, { "epoch": 1.4452885946644018, "grad_norm": 0.16653552651405334, "learning_rate": 1.9392855317119966e-05, "loss": 1.2216, "step": 3882 }, { "epoch": 1.445660899815011, "grad_norm": 0.16940967738628387, "learning_rate": 1.93924382283139e-05, "loss": 1.2129, "step": 3883 }, { "epoch": 1.44603320496562, "grad_norm": 0.1640087068080902, "learning_rate": 1.9392021000782114e-05, "loss": 1.2136, "step": 3884 }, { "epoch": 1.446405510116229, "grad_norm": 0.15895035862922668, "learning_rate": 1.939160363453077e-05, "loss": 1.197, "step": 3885 }, { "epoch": 1.446777815266838, "grad_norm": 0.16390269994735718, "learning_rate": 1.9391186129566025e-05, "loss": 1.2022, "step": 3886 }, { "epoch": 1.4471501204174473, "grad_norm": 0.16444234549999237, "learning_rate": 1.939076848589405e-05, "loss": 1.2089, "step": 3887 }, { "epoch": 1.447522425568056, "grad_norm": 0.16005566716194153, "learning_rate": 1.9390350703521015e-05, "loss": 1.1956, "step": 3888 }, { "epoch": 1.4478947307186654, "grad_norm": 0.16827602684497833, "learning_rate": 1.938993278245309e-05, "loss": 1.2111, "step": 3889 }, { "epoch": 1.4482670358692744, "grad_norm": 0.16684898734092712, "learning_rate": 1.938951472269645e-05, "loss": 1.2087, "step": 3890 }, { "epoch": 1.4486393410198835, "grad_norm": 0.1641646921634674, "learning_rate": 1.9389096524257263e-05, "loss": 1.2132, "step": 3891 }, { "epoch": 1.4490116461704925, "grad_norm": 0.17110654711723328, "learning_rate": 1.938867818714171e-05, "loss": 1.2203, "step": 3892 }, { "epoch": 1.4493839513211015, "grad_norm": 0.15883228182792664, "learning_rate": 1.938825971135597e-05, "loss": 1.2096, "step": 3893 }, { "epoch": 1.4497562564717106, "grad_norm": 0.1638491004705429, "learning_rate": 1.938784109690622e-05, "loss": 1.2125, "step": 3894 }, { "epoch": 1.4501285616223196, "grad_norm": 0.16501039266586304, "learning_rate": 1.938742234379865e-05, "loss": 1.2235, "step": 3895 }, { "epoch": 1.4505008667729289, "grad_norm": 0.16907702386379242, "learning_rate": 1.938700345203944e-05, "loss": 1.2204, "step": 3896 }, { "epoch": 1.4508731719235377, "grad_norm": 0.16508372128009796, "learning_rate": 1.938658442163478e-05, "loss": 1.2152, "step": 3897 }, { "epoch": 1.451245477074147, "grad_norm": 0.1664949506521225, "learning_rate": 1.9386165252590854e-05, "loss": 1.2177, "step": 3898 }, { "epoch": 1.451617782224756, "grad_norm": 0.1608046591281891, "learning_rate": 1.9385745944913858e-05, "loss": 1.2062, "step": 3899 }, { "epoch": 1.451990087375365, "grad_norm": 0.16553208231925964, "learning_rate": 1.9385326498609983e-05, "loss": 1.2204, "step": 3900 }, { "epoch": 1.452362392525974, "grad_norm": 0.16579049825668335, "learning_rate": 1.9384906913685426e-05, "loss": 1.2166, "step": 3901 }, { "epoch": 1.4527346976765831, "grad_norm": 0.17063631117343903, "learning_rate": 1.9384487190146383e-05, "loss": 1.2075, "step": 3902 }, { "epoch": 1.4531070028271922, "grad_norm": 0.16345682740211487, "learning_rate": 1.938406732799905e-05, "loss": 1.2176, "step": 3903 }, { "epoch": 1.4534793079778012, "grad_norm": 0.16839192807674408, "learning_rate": 1.9383647327249635e-05, "loss": 1.223, "step": 3904 }, { "epoch": 1.4538516131284105, "grad_norm": 0.1658104509115219, "learning_rate": 1.9383227187904334e-05, "loss": 1.2142, "step": 3905 }, { "epoch": 1.4542239182790193, "grad_norm": 0.16820955276489258, "learning_rate": 1.9382806909969362e-05, "loss": 1.2302, "step": 3906 }, { "epoch": 1.4545962234296286, "grad_norm": 0.16950318217277527, "learning_rate": 1.9382386493450913e-05, "loss": 1.2091, "step": 3907 }, { "epoch": 1.4549685285802376, "grad_norm": 0.17140018939971924, "learning_rate": 1.938196593835521e-05, "loss": 1.2267, "step": 3908 }, { "epoch": 1.4553408337308467, "grad_norm": 0.17573173344135284, "learning_rate": 1.938154524468846e-05, "loss": 1.211, "step": 3909 }, { "epoch": 1.4557131388814557, "grad_norm": 0.16952942311763763, "learning_rate": 1.938112441245687e-05, "loss": 1.2111, "step": 3910 }, { "epoch": 1.4560854440320647, "grad_norm": 0.17475372552871704, "learning_rate": 1.9380703441666666e-05, "loss": 1.1979, "step": 3911 }, { "epoch": 1.4564577491826738, "grad_norm": 0.16706113517284393, "learning_rate": 1.9380282332324055e-05, "loss": 1.2209, "step": 3912 }, { "epoch": 1.4568300543332828, "grad_norm": 0.1862281709909439, "learning_rate": 1.937986108443527e-05, "loss": 1.2208, "step": 3913 }, { "epoch": 1.457202359483892, "grad_norm": 0.17388898134231567, "learning_rate": 1.9379439698006522e-05, "loss": 1.2103, "step": 3914 }, { "epoch": 1.457574664634501, "grad_norm": 0.1705702692270279, "learning_rate": 1.9379018173044038e-05, "loss": 1.2244, "step": 3915 }, { "epoch": 1.4579469697851102, "grad_norm": 0.16031381487846375, "learning_rate": 1.9378596509554045e-05, "loss": 1.2021, "step": 3916 }, { "epoch": 1.4583192749357192, "grad_norm": 0.17074525356292725, "learning_rate": 1.937817470754277e-05, "loss": 1.2091, "step": 3917 }, { "epoch": 1.4586915800863283, "grad_norm": 0.1819746345281601, "learning_rate": 1.9377752767016443e-05, "loss": 1.2182, "step": 3918 }, { "epoch": 1.4590638852369373, "grad_norm": 0.16856756806373596, "learning_rate": 1.9377330687981295e-05, "loss": 1.2114, "step": 3919 }, { "epoch": 1.4594361903875464, "grad_norm": 0.16812150180339813, "learning_rate": 1.9376908470443562e-05, "loss": 1.2114, "step": 3920 }, { "epoch": 1.4598084955381554, "grad_norm": 0.17135708034038544, "learning_rate": 1.937648611440948e-05, "loss": 1.2128, "step": 3921 }, { "epoch": 1.4601808006887644, "grad_norm": 0.17878791689872742, "learning_rate": 1.9376063619885285e-05, "loss": 1.2219, "step": 3922 }, { "epoch": 1.4605531058393737, "grad_norm": 0.162892147898674, "learning_rate": 1.937564098687722e-05, "loss": 1.2149, "step": 3923 }, { "epoch": 1.4609254109899827, "grad_norm": 0.16608810424804688, "learning_rate": 1.9375218215391527e-05, "loss": 1.1958, "step": 3924 }, { "epoch": 1.4612977161405918, "grad_norm": 0.17034226655960083, "learning_rate": 1.9374795305434446e-05, "loss": 1.193, "step": 3925 }, { "epoch": 1.4616700212912008, "grad_norm": 0.15888184309005737, "learning_rate": 1.937437225701223e-05, "loss": 1.2113, "step": 3926 }, { "epoch": 1.4620423264418099, "grad_norm": 0.17904046177864075, "learning_rate": 1.937394907013112e-05, "loss": 1.2128, "step": 3927 }, { "epoch": 1.462414631592419, "grad_norm": 0.16831357777118683, "learning_rate": 1.9373525744797377e-05, "loss": 1.2023, "step": 3928 }, { "epoch": 1.462786936743028, "grad_norm": 0.17215365171432495, "learning_rate": 1.9373102281017246e-05, "loss": 1.21, "step": 3929 }, { "epoch": 1.463159241893637, "grad_norm": 0.16621056199073792, "learning_rate": 1.937267867879698e-05, "loss": 1.2021, "step": 3930 }, { "epoch": 1.463531547044246, "grad_norm": 0.17535530030727386, "learning_rate": 1.9372254938142835e-05, "loss": 1.2345, "step": 3931 }, { "epoch": 1.4639038521948553, "grad_norm": 0.16179418563842773, "learning_rate": 1.9371831059061078e-05, "loss": 1.2097, "step": 3932 }, { "epoch": 1.4642761573454643, "grad_norm": 0.17177635431289673, "learning_rate": 1.937140704155796e-05, "loss": 1.2181, "step": 3933 }, { "epoch": 1.4646484624960734, "grad_norm": 0.17407308518886566, "learning_rate": 1.9370982885639752e-05, "loss": 1.215, "step": 3934 }, { "epoch": 1.4650207676466824, "grad_norm": 0.1671421378850937, "learning_rate": 1.937055859131271e-05, "loss": 1.2185, "step": 3935 }, { "epoch": 1.4653930727972915, "grad_norm": 0.16444651782512665, "learning_rate": 1.937013415858311e-05, "loss": 1.2247, "step": 3936 }, { "epoch": 1.4657653779479005, "grad_norm": 0.1708529144525528, "learning_rate": 1.9369709587457217e-05, "loss": 1.1963, "step": 3937 }, { "epoch": 1.4661376830985096, "grad_norm": 0.17232173681259155, "learning_rate": 1.93692848779413e-05, "loss": 1.2015, "step": 3938 }, { "epoch": 1.4665099882491186, "grad_norm": 0.16869337856769562, "learning_rate": 1.936886003004163e-05, "loss": 1.2328, "step": 3939 }, { "epoch": 1.4668822933997276, "grad_norm": 0.17547161877155304, "learning_rate": 1.9368435043764493e-05, "loss": 1.2094, "step": 3940 }, { "epoch": 1.467254598550337, "grad_norm": 0.1809409260749817, "learning_rate": 1.9368009919116152e-05, "loss": 1.2065, "step": 3941 }, { "epoch": 1.467626903700946, "grad_norm": 0.1913343220949173, "learning_rate": 1.9367584656102895e-05, "loss": 1.2309, "step": 3942 }, { "epoch": 1.467999208851555, "grad_norm": 0.16980275511741638, "learning_rate": 1.9367159254731e-05, "loss": 1.2062, "step": 3943 }, { "epoch": 1.468371514002164, "grad_norm": 0.17768965661525726, "learning_rate": 1.936673371500675e-05, "loss": 1.2134, "step": 3944 }, { "epoch": 1.468743819152773, "grad_norm": 0.17502190172672272, "learning_rate": 1.9366308036936433e-05, "loss": 1.2255, "step": 3945 }, { "epoch": 1.4691161243033821, "grad_norm": 0.176423579454422, "learning_rate": 1.936588222052633e-05, "loss": 1.2131, "step": 3946 }, { "epoch": 1.4694884294539912, "grad_norm": 0.18307115137577057, "learning_rate": 1.936545626578274e-05, "loss": 1.2093, "step": 3947 }, { "epoch": 1.4698607346046004, "grad_norm": 0.16688889265060425, "learning_rate": 1.9365030172711946e-05, "loss": 1.2074, "step": 3948 }, { "epoch": 1.4702330397552092, "grad_norm": 0.16726486384868622, "learning_rate": 1.9364603941320243e-05, "loss": 1.1952, "step": 3949 }, { "epoch": 1.4706053449058185, "grad_norm": 0.1644599884748459, "learning_rate": 1.9364177571613927e-05, "loss": 1.2164, "step": 3950 }, { "epoch": 1.4709776500564276, "grad_norm": 0.17639389634132385, "learning_rate": 1.9363751063599298e-05, "loss": 1.2219, "step": 3951 }, { "epoch": 1.4713499552070366, "grad_norm": 0.1668757051229477, "learning_rate": 1.936332441728265e-05, "loss": 1.2021, "step": 3952 }, { "epoch": 1.4717222603576456, "grad_norm": 0.17846433818340302, "learning_rate": 1.9362897632670293e-05, "loss": 1.2116, "step": 3953 }, { "epoch": 1.4720945655082547, "grad_norm": 0.18401432037353516, "learning_rate": 1.936247070976852e-05, "loss": 1.2167, "step": 3954 }, { "epoch": 1.4724668706588637, "grad_norm": 0.16676369309425354, "learning_rate": 1.9362043648583647e-05, "loss": 1.2144, "step": 3955 }, { "epoch": 1.4728391758094728, "grad_norm": 0.17086462676525116, "learning_rate": 1.9361616449121973e-05, "loss": 1.2163, "step": 3956 }, { "epoch": 1.473211480960082, "grad_norm": 0.1699550896883011, "learning_rate": 1.9361189111389817e-05, "loss": 1.2083, "step": 3957 }, { "epoch": 1.4735837861106909, "grad_norm": 0.16862007975578308, "learning_rate": 1.936076163539348e-05, "loss": 1.2129, "step": 3958 }, { "epoch": 1.4739560912613001, "grad_norm": 0.168454110622406, "learning_rate": 1.936033402113928e-05, "loss": 1.2052, "step": 3959 }, { "epoch": 1.4743283964119092, "grad_norm": 0.17576009035110474, "learning_rate": 1.9359906268633542e-05, "loss": 1.1988, "step": 3960 }, { "epoch": 1.4747007015625182, "grad_norm": 0.17533783614635468, "learning_rate": 1.9359478377882567e-05, "loss": 1.2165, "step": 3961 }, { "epoch": 1.4750730067131272, "grad_norm": 0.18363003432750702, "learning_rate": 1.935905034889269e-05, "loss": 1.2095, "step": 3962 }, { "epoch": 1.4754453118637363, "grad_norm": 0.1739223450422287, "learning_rate": 1.9358622181670225e-05, "loss": 1.2043, "step": 3963 }, { "epoch": 1.4758176170143453, "grad_norm": 0.1666637510061264, "learning_rate": 1.9358193876221497e-05, "loss": 1.2036, "step": 3964 }, { "epoch": 1.4761899221649544, "grad_norm": 0.18564556539058685, "learning_rate": 1.935776543255283e-05, "loss": 1.2267, "step": 3965 }, { "epoch": 1.4765622273155636, "grad_norm": 0.16822317242622375, "learning_rate": 1.935733685067056e-05, "loss": 1.2294, "step": 3966 }, { "epoch": 1.4769345324661725, "grad_norm": 0.16911527514457703, "learning_rate": 1.9356908130581008e-05, "loss": 1.2128, "step": 3967 }, { "epoch": 1.4773068376167817, "grad_norm": 0.19350941479206085, "learning_rate": 1.9356479272290514e-05, "loss": 1.2107, "step": 3968 }, { "epoch": 1.4776791427673908, "grad_norm": 0.1877276450395584, "learning_rate": 1.9356050275805406e-05, "loss": 1.2059, "step": 3969 }, { "epoch": 1.4780514479179998, "grad_norm": 0.17434152960777283, "learning_rate": 1.9355621141132022e-05, "loss": 1.2185, "step": 3970 }, { "epoch": 1.4784237530686088, "grad_norm": 0.19158223271369934, "learning_rate": 1.9355191868276702e-05, "loss": 1.2107, "step": 3971 }, { "epoch": 1.478796058219218, "grad_norm": 0.17890071868896484, "learning_rate": 1.9354762457245782e-05, "loss": 1.205, "step": 3972 }, { "epoch": 1.479168363369827, "grad_norm": 0.16865649819374084, "learning_rate": 1.935433290804561e-05, "loss": 1.2112, "step": 3973 }, { "epoch": 1.479540668520436, "grad_norm": 0.16355697810649872, "learning_rate": 1.935390322068253e-05, "loss": 1.2194, "step": 3974 }, { "epoch": 1.4799129736710452, "grad_norm": 0.169992133975029, "learning_rate": 1.9353473395162882e-05, "loss": 1.2088, "step": 3975 }, { "epoch": 1.480285278821654, "grad_norm": 0.16917189955711365, "learning_rate": 1.9353043431493024e-05, "loss": 1.2149, "step": 3976 }, { "epoch": 1.4806575839722633, "grad_norm": 0.16801759600639343, "learning_rate": 1.9352613329679298e-05, "loss": 1.2251, "step": 3977 }, { "epoch": 1.4810298891228724, "grad_norm": 0.16183248162269592, "learning_rate": 1.935218308972806e-05, "loss": 1.2019, "step": 3978 }, { "epoch": 1.4814021942734814, "grad_norm": 0.16628491878509521, "learning_rate": 1.935175271164567e-05, "loss": 1.2186, "step": 3979 }, { "epoch": 1.4817744994240905, "grad_norm": 0.167390838265419, "learning_rate": 1.9351322195438472e-05, "loss": 1.2196, "step": 3980 }, { "epoch": 1.4821468045746995, "grad_norm": 0.17368678748607635, "learning_rate": 1.9350891541112836e-05, "loss": 1.2114, "step": 3981 }, { "epoch": 1.4825191097253085, "grad_norm": 0.17052781581878662, "learning_rate": 1.9350460748675117e-05, "loss": 1.2013, "step": 3982 }, { "epoch": 1.4828914148759176, "grad_norm": 0.16377271711826324, "learning_rate": 1.935002981813168e-05, "loss": 1.2114, "step": 3983 }, { "epoch": 1.4832637200265268, "grad_norm": 0.1672280728816986, "learning_rate": 1.934959874948889e-05, "loss": 1.2214, "step": 3984 }, { "epoch": 1.4836360251771359, "grad_norm": 0.16582052409648895, "learning_rate": 1.9349167542753116e-05, "loss": 1.2097, "step": 3985 }, { "epoch": 1.484008330327745, "grad_norm": 0.15691331028938293, "learning_rate": 1.934873619793072e-05, "loss": 1.2001, "step": 3986 }, { "epoch": 1.484380635478354, "grad_norm": 0.16252057254314423, "learning_rate": 1.934830471502808e-05, "loss": 1.2105, "step": 3987 }, { "epoch": 1.484752940628963, "grad_norm": 0.18076485395431519, "learning_rate": 1.9347873094051565e-05, "loss": 1.1981, "step": 3988 }, { "epoch": 1.485125245779572, "grad_norm": 0.18029817938804626, "learning_rate": 1.9347441335007547e-05, "loss": 1.2136, "step": 3989 }, { "epoch": 1.485497550930181, "grad_norm": 0.17143037915229797, "learning_rate": 1.9347009437902414e-05, "loss": 1.1981, "step": 3990 }, { "epoch": 1.4858698560807901, "grad_norm": 0.1707201451063156, "learning_rate": 1.9346577402742532e-05, "loss": 1.2137, "step": 3991 }, { "epoch": 1.4862421612313992, "grad_norm": 0.17134663462638855, "learning_rate": 1.9346145229534295e-05, "loss": 1.2097, "step": 3992 }, { "epoch": 1.4866144663820084, "grad_norm": 0.18079330027103424, "learning_rate": 1.9345712918284074e-05, "loss": 1.2135, "step": 3993 }, { "epoch": 1.4869867715326175, "grad_norm": 0.16749143600463867, "learning_rate": 1.934528046899826e-05, "loss": 1.2145, "step": 3994 }, { "epoch": 1.4873590766832265, "grad_norm": 0.17483735084533691, "learning_rate": 1.9344847881683242e-05, "loss": 1.2164, "step": 3995 }, { "epoch": 1.4877313818338356, "grad_norm": 0.17424844205379486, "learning_rate": 1.9344415156345407e-05, "loss": 1.2007, "step": 3996 }, { "epoch": 1.4881036869844446, "grad_norm": 0.18237757682800293, "learning_rate": 1.9343982292991147e-05, "loss": 1.1958, "step": 3997 }, { "epoch": 1.4884759921350537, "grad_norm": 0.2026931643486023, "learning_rate": 1.9343549291626853e-05, "loss": 1.2126, "step": 3998 }, { "epoch": 1.4888482972856627, "grad_norm": 0.2018553465604782, "learning_rate": 1.9343116152258924e-05, "loss": 1.2323, "step": 3999 }, { "epoch": 1.489220602436272, "grad_norm": 0.17525236308574677, "learning_rate": 1.9342682874893756e-05, "loss": 1.2227, "step": 4000 }, { "epoch": 1.489220602436272, "eval_loss": 1.3177570104599, "eval_runtime": 16.441, "eval_samples_per_second": 105.468, "eval_steps_per_second": 5.292, "step": 4000 }, { "epoch": 1.4895929075868808, "grad_norm": 0.20407749712467194, "learning_rate": 1.9342249459537746e-05, "loss": 1.2125, "step": 4001 }, { "epoch": 1.48996521273749, "grad_norm": 0.1680436134338379, "learning_rate": 1.93418159061973e-05, "loss": 1.208, "step": 4002 }, { "epoch": 1.490337517888099, "grad_norm": 0.16656237840652466, "learning_rate": 1.934138221487882e-05, "loss": 1.2182, "step": 4003 }, { "epoch": 1.4907098230387081, "grad_norm": 0.17523828148841858, "learning_rate": 1.9340948385588708e-05, "loss": 1.2013, "step": 4004 }, { "epoch": 1.4910821281893172, "grad_norm": 0.17147956788539886, "learning_rate": 1.9340514418333375e-05, "loss": 1.221, "step": 4005 }, { "epoch": 1.4914544333399262, "grad_norm": 0.16101637482643127, "learning_rate": 1.934008031311923e-05, "loss": 1.2107, "step": 4006 }, { "epoch": 1.4918267384905353, "grad_norm": 0.17050312459468842, "learning_rate": 1.933964606995269e-05, "loss": 1.216, "step": 4007 }, { "epoch": 1.4921990436411443, "grad_norm": 0.17762567102909088, "learning_rate": 1.933921168884016e-05, "loss": 1.219, "step": 4008 }, { "epoch": 1.4925713487917536, "grad_norm": 0.179819718003273, "learning_rate": 1.9338777169788058e-05, "loss": 1.2271, "step": 4009 }, { "epoch": 1.4929436539423624, "grad_norm": 0.17511019110679626, "learning_rate": 1.9338342512802805e-05, "loss": 1.2128, "step": 4010 }, { "epoch": 1.4933159590929717, "grad_norm": 0.17348961532115936, "learning_rate": 1.9337907717890817e-05, "loss": 1.2215, "step": 4011 }, { "epoch": 1.4936882642435807, "grad_norm": 0.16590604186058044, "learning_rate": 1.933747278505852e-05, "loss": 1.2048, "step": 4012 }, { "epoch": 1.4940605693941897, "grad_norm": 0.16932128369808197, "learning_rate": 1.9337037714312337e-05, "loss": 1.22, "step": 4013 }, { "epoch": 1.4944328745447988, "grad_norm": 0.18030394613742828, "learning_rate": 1.933660250565869e-05, "loss": 1.2031, "step": 4014 }, { "epoch": 1.4948051796954078, "grad_norm": 0.16804668307304382, "learning_rate": 1.9336167159104012e-05, "loss": 1.2167, "step": 4015 }, { "epoch": 1.4951774848460169, "grad_norm": 0.17750367522239685, "learning_rate": 1.9335731674654732e-05, "loss": 1.2102, "step": 4016 }, { "epoch": 1.495549789996626, "grad_norm": 0.1765013486146927, "learning_rate": 1.9335296052317278e-05, "loss": 1.2192, "step": 4017 }, { "epoch": 1.4959220951472352, "grad_norm": 0.17468391358852386, "learning_rate": 1.933486029209809e-05, "loss": 1.2093, "step": 4018 }, { "epoch": 1.496294400297844, "grad_norm": 0.16882555186748505, "learning_rate": 1.93344243940036e-05, "loss": 1.2092, "step": 4019 }, { "epoch": 1.4966667054484533, "grad_norm": 0.16576802730560303, "learning_rate": 1.9333988358040246e-05, "loss": 1.2021, "step": 4020 }, { "epoch": 1.4970390105990623, "grad_norm": 0.17148758471012115, "learning_rate": 1.9333552184214473e-05, "loss": 1.2202, "step": 4021 }, { "epoch": 1.4974113157496713, "grad_norm": 0.16886599361896515, "learning_rate": 1.933311587253272e-05, "loss": 1.2053, "step": 4022 }, { "epoch": 1.4977836209002804, "grad_norm": 0.16921773552894592, "learning_rate": 1.9332679423001428e-05, "loss": 1.2064, "step": 4023 }, { "epoch": 1.4981559260508894, "grad_norm": 0.16829746961593628, "learning_rate": 1.9332242835627048e-05, "loss": 1.2097, "step": 4024 }, { "epoch": 1.4985282312014985, "grad_norm": 0.16895058751106262, "learning_rate": 1.9331806110416027e-05, "loss": 1.2116, "step": 4025 }, { "epoch": 1.4989005363521075, "grad_norm": 0.17444762587547302, "learning_rate": 1.9331369247374815e-05, "loss": 1.1994, "step": 4026 }, { "epoch": 1.4992728415027168, "grad_norm": 0.17568188905715942, "learning_rate": 1.9330932246509867e-05, "loss": 1.1921, "step": 4027 }, { "epoch": 1.4996451466533256, "grad_norm": 0.164281964302063, "learning_rate": 1.9330495107827633e-05, "loss": 1.221, "step": 4028 }, { "epoch": 1.5000174518039349, "grad_norm": 0.1712205559015274, "learning_rate": 1.9330057831334573e-05, "loss": 1.2055, "step": 4029 }, { "epoch": 1.500389756954544, "grad_norm": 0.17232446372509003, "learning_rate": 1.9329620417037143e-05, "loss": 1.2094, "step": 4030 }, { "epoch": 1.500762062105153, "grad_norm": 0.16828641295433044, "learning_rate": 1.9329182864941808e-05, "loss": 1.2161, "step": 4031 }, { "epoch": 1.501134367255762, "grad_norm": 0.1642870008945465, "learning_rate": 1.9328745175055025e-05, "loss": 1.2076, "step": 4032 }, { "epoch": 1.501506672406371, "grad_norm": 0.16758432984352112, "learning_rate": 1.932830734738326e-05, "loss": 1.2149, "step": 4033 }, { "epoch": 1.5018789775569803, "grad_norm": 0.16760718822479248, "learning_rate": 1.9327869381932984e-05, "loss": 1.221, "step": 4034 }, { "epoch": 1.5022512827075891, "grad_norm": 0.16775357723236084, "learning_rate": 1.932743127871066e-05, "loss": 1.2106, "step": 4035 }, { "epoch": 1.5026235878581984, "grad_norm": 0.16413766145706177, "learning_rate": 1.9326993037722762e-05, "loss": 1.2133, "step": 4036 }, { "epoch": 1.5029958930088072, "grad_norm": 0.17524518072605133, "learning_rate": 1.9326554658975766e-05, "loss": 1.2083, "step": 4037 }, { "epoch": 1.5033681981594165, "grad_norm": 0.17292781174182892, "learning_rate": 1.9326116142476137e-05, "loss": 1.2126, "step": 4038 }, { "epoch": 1.5037405033100255, "grad_norm": 0.17386804521083832, "learning_rate": 1.9325677488230364e-05, "loss": 1.2151, "step": 4039 }, { "epoch": 1.5041128084606346, "grad_norm": 0.1756100207567215, "learning_rate": 1.9325238696244914e-05, "loss": 1.2122, "step": 4040 }, { "epoch": 1.5044851136112436, "grad_norm": 0.16365545988082886, "learning_rate": 1.9324799766526276e-05, "loss": 1.2159, "step": 4041 }, { "epoch": 1.5048574187618526, "grad_norm": 0.1651315987110138, "learning_rate": 1.932436069908093e-05, "loss": 1.2278, "step": 4042 }, { "epoch": 1.505229723912462, "grad_norm": 0.1678977757692337, "learning_rate": 1.9323921493915364e-05, "loss": 1.2131, "step": 4043 }, { "epoch": 1.5056020290630707, "grad_norm": 0.16497032344341278, "learning_rate": 1.932348215103606e-05, "loss": 1.2105, "step": 4044 }, { "epoch": 1.50597433421368, "grad_norm": 0.1694755107164383, "learning_rate": 1.932304267044951e-05, "loss": 1.2084, "step": 4045 }, { "epoch": 1.5063466393642888, "grad_norm": 0.16757144033908844, "learning_rate": 1.932260305216221e-05, "loss": 1.2132, "step": 4046 }, { "epoch": 1.506718944514898, "grad_norm": 0.16843421757221222, "learning_rate": 1.932216329618064e-05, "loss": 1.2041, "step": 4047 }, { "epoch": 1.5070912496655071, "grad_norm": 0.17438024282455444, "learning_rate": 1.9321723402511308e-05, "loss": 1.2206, "step": 4048 }, { "epoch": 1.5074635548161162, "grad_norm": 0.17273013293743134, "learning_rate": 1.9321283371160704e-05, "loss": 1.2153, "step": 4049 }, { "epoch": 1.5078358599667252, "grad_norm": 0.16855518519878387, "learning_rate": 1.9320843202135333e-05, "loss": 1.2327, "step": 4050 }, { "epoch": 1.5082081651173342, "grad_norm": 0.17092865705490112, "learning_rate": 1.932040289544169e-05, "loss": 1.2183, "step": 4051 }, { "epoch": 1.5085804702679435, "grad_norm": 0.16333964467048645, "learning_rate": 1.9319962451086282e-05, "loss": 1.2078, "step": 4052 }, { "epoch": 1.5089527754185523, "grad_norm": 0.16895681619644165, "learning_rate": 1.9319521869075612e-05, "loss": 1.2129, "step": 4053 }, { "epoch": 1.5093250805691616, "grad_norm": 0.17089246213436127, "learning_rate": 1.931908114941619e-05, "loss": 1.1971, "step": 4054 }, { "epoch": 1.5096973857197704, "grad_norm": 0.1670157015323639, "learning_rate": 1.9318640292114526e-05, "loss": 1.2041, "step": 4055 }, { "epoch": 1.5100696908703797, "grad_norm": 0.16973476111888885, "learning_rate": 1.9318199297177127e-05, "loss": 1.2034, "step": 4056 }, { "epoch": 1.5104419960209887, "grad_norm": 0.173712819814682, "learning_rate": 1.931775816461051e-05, "loss": 1.2121, "step": 4057 }, { "epoch": 1.5108143011715978, "grad_norm": 0.1701354682445526, "learning_rate": 1.931731689442119e-05, "loss": 1.2193, "step": 4058 }, { "epoch": 1.5111866063222068, "grad_norm": 0.16733410954475403, "learning_rate": 1.9316875486615684e-05, "loss": 1.2152, "step": 4059 }, { "epoch": 1.5115589114728158, "grad_norm": 0.17362013459205627, "learning_rate": 1.931643394120051e-05, "loss": 1.2194, "step": 4060 }, { "epoch": 1.5119312166234251, "grad_norm": 0.1801014095544815, "learning_rate": 1.9315992258182196e-05, "loss": 1.2112, "step": 4061 }, { "epoch": 1.512303521774034, "grad_norm": 0.17193296551704407, "learning_rate": 1.9315550437567258e-05, "loss": 1.2177, "step": 4062 }, { "epoch": 1.5126758269246432, "grad_norm": 0.17398405075073242, "learning_rate": 1.9315108479362226e-05, "loss": 1.2154, "step": 4063 }, { "epoch": 1.513048132075252, "grad_norm": 0.18025220930576324, "learning_rate": 1.9314666383573622e-05, "loss": 1.2044, "step": 4064 }, { "epoch": 1.5134204372258613, "grad_norm": 0.15889345109462738, "learning_rate": 1.9314224150207986e-05, "loss": 1.2138, "step": 4065 }, { "epoch": 1.5137927423764703, "grad_norm": 0.1648905724287033, "learning_rate": 1.9313781779271842e-05, "loss": 1.2124, "step": 4066 }, { "epoch": 1.5141650475270794, "grad_norm": 0.1780308037996292, "learning_rate": 1.9313339270771724e-05, "loss": 1.203, "step": 4067 }, { "epoch": 1.5145373526776884, "grad_norm": 0.16824577748775482, "learning_rate": 1.931289662471417e-05, "loss": 1.2033, "step": 4068 }, { "epoch": 1.5149096578282975, "grad_norm": 0.1748679131269455, "learning_rate": 1.9312453841105716e-05, "loss": 1.2034, "step": 4069 }, { "epoch": 1.5152819629789067, "grad_norm": 0.16885723173618317, "learning_rate": 1.9312010919952907e-05, "loss": 1.2147, "step": 4070 }, { "epoch": 1.5156542681295155, "grad_norm": 0.16560660302639008, "learning_rate": 1.931156786126228e-05, "loss": 1.219, "step": 4071 }, { "epoch": 1.5160265732801248, "grad_norm": 0.16653484106063843, "learning_rate": 1.9311124665040378e-05, "loss": 1.2258, "step": 4072 }, { "epoch": 1.5163988784307338, "grad_norm": 0.16899293661117554, "learning_rate": 1.931068133129375e-05, "loss": 1.208, "step": 4073 }, { "epoch": 1.5167711835813429, "grad_norm": 0.17353177070617676, "learning_rate": 1.931023786002894e-05, "loss": 1.196, "step": 4074 }, { "epoch": 1.517143488731952, "grad_norm": 0.16658303141593933, "learning_rate": 1.9309794251252506e-05, "loss": 1.2167, "step": 4075 }, { "epoch": 1.517515793882561, "grad_norm": 0.17217494547367096, "learning_rate": 1.9309350504970996e-05, "loss": 1.2224, "step": 4076 }, { "epoch": 1.51788809903317, "grad_norm": 0.1716787964105606, "learning_rate": 1.930890662119096e-05, "loss": 1.214, "step": 4077 }, { "epoch": 1.518260404183779, "grad_norm": 0.16615904867649078, "learning_rate": 1.9308462599918955e-05, "loss": 1.2204, "step": 4078 }, { "epoch": 1.5186327093343883, "grad_norm": 0.17430458962917328, "learning_rate": 1.9308018441161547e-05, "loss": 1.2237, "step": 4079 }, { "epoch": 1.5190050144849971, "grad_norm": 0.17217344045639038, "learning_rate": 1.9307574144925288e-05, "loss": 1.22, "step": 4080 }, { "epoch": 1.5193773196356064, "grad_norm": 0.1643909215927124, "learning_rate": 1.930712971121674e-05, "loss": 1.2008, "step": 4081 }, { "epoch": 1.5197496247862154, "grad_norm": 0.17074188590049744, "learning_rate": 1.9306685140042476e-05, "loss": 1.2294, "step": 4082 }, { "epoch": 1.5201219299368245, "grad_norm": 0.16520312428474426, "learning_rate": 1.9306240431409056e-05, "loss": 1.2173, "step": 4083 }, { "epoch": 1.5204942350874335, "grad_norm": 0.16149955987930298, "learning_rate": 1.9305795585323043e-05, "loss": 1.2173, "step": 4084 }, { "epoch": 1.5208665402380426, "grad_norm": 0.17141614854335785, "learning_rate": 1.930535060179102e-05, "loss": 1.2022, "step": 4085 }, { "epoch": 1.5212388453886518, "grad_norm": 0.16703253984451294, "learning_rate": 1.930490548081955e-05, "loss": 1.2247, "step": 4086 }, { "epoch": 1.5216111505392607, "grad_norm": 0.16546790301799774, "learning_rate": 1.9304460222415207e-05, "loss": 1.2045, "step": 4087 }, { "epoch": 1.52198345568987, "grad_norm": 0.16973921656608582, "learning_rate": 1.9304014826584578e-05, "loss": 1.226, "step": 4088 }, { "epoch": 1.5223557608404787, "grad_norm": 0.16389979422092438, "learning_rate": 1.9303569293334226e-05, "loss": 1.2052, "step": 4089 }, { "epoch": 1.522728065991088, "grad_norm": 0.16464626789093018, "learning_rate": 1.9303123622670743e-05, "loss": 1.2047, "step": 4090 }, { "epoch": 1.523100371141697, "grad_norm": 0.1665724664926529, "learning_rate": 1.9302677814600707e-05, "loss": 1.2025, "step": 4091 }, { "epoch": 1.523472676292306, "grad_norm": 0.18270178139209747, "learning_rate": 1.9302231869130703e-05, "loss": 1.2181, "step": 4092 }, { "epoch": 1.5238449814429151, "grad_norm": 0.1672508865594864, "learning_rate": 1.9301785786267323e-05, "loss": 1.2081, "step": 4093 }, { "epoch": 1.5242172865935242, "grad_norm": 0.16370201110839844, "learning_rate": 1.9301339566017144e-05, "loss": 1.2036, "step": 4094 }, { "epoch": 1.5245895917441334, "grad_norm": 0.17350997030735016, "learning_rate": 1.930089320838677e-05, "loss": 1.2072, "step": 4095 }, { "epoch": 1.5249618968947423, "grad_norm": 0.17266206443309784, "learning_rate": 1.930044671338278e-05, "loss": 1.2169, "step": 4096 }, { "epoch": 1.5253342020453515, "grad_norm": 0.16876213252544403, "learning_rate": 1.9300000081011778e-05, "loss": 1.2076, "step": 4097 }, { "epoch": 1.5257065071959603, "grad_norm": 0.17879076302051544, "learning_rate": 1.9299553311280358e-05, "loss": 1.2003, "step": 4098 }, { "epoch": 1.5260788123465696, "grad_norm": 0.16374212503433228, "learning_rate": 1.9299106404195123e-05, "loss": 1.2203, "step": 4099 }, { "epoch": 1.5264511174971787, "grad_norm": 0.17050255835056305, "learning_rate": 1.929865935976267e-05, "loss": 1.2112, "step": 4100 }, { "epoch": 1.5268234226477877, "grad_norm": 0.17496921122074127, "learning_rate": 1.92982121779896e-05, "loss": 1.2123, "step": 4101 }, { "epoch": 1.5271957277983967, "grad_norm": 0.17470447719097137, "learning_rate": 1.9297764858882516e-05, "loss": 1.2115, "step": 4102 }, { "epoch": 1.5275680329490058, "grad_norm": 0.166524276137352, "learning_rate": 1.9297317402448032e-05, "loss": 1.2235, "step": 4103 }, { "epoch": 1.527940338099615, "grad_norm": 0.1667315661907196, "learning_rate": 1.929686980869275e-05, "loss": 1.2164, "step": 4104 }, { "epoch": 1.5283126432502239, "grad_norm": 0.17172963917255402, "learning_rate": 1.9296422077623293e-05, "loss": 1.2133, "step": 4105 }, { "epoch": 1.5286849484008331, "grad_norm": 0.16795189678668976, "learning_rate": 1.9295974209246257e-05, "loss": 1.2076, "step": 4106 }, { "epoch": 1.529057253551442, "grad_norm": 0.1728687286376953, "learning_rate": 1.9295526203568268e-05, "loss": 1.214, "step": 4107 }, { "epoch": 1.5294295587020512, "grad_norm": 0.16910706460475922, "learning_rate": 1.9295078060595938e-05, "loss": 1.2089, "step": 4108 }, { "epoch": 1.5298018638526603, "grad_norm": 0.16839970648288727, "learning_rate": 1.9294629780335895e-05, "loss": 1.2014, "step": 4109 }, { "epoch": 1.5301741690032693, "grad_norm": 0.17429544031620026, "learning_rate": 1.9294181362794745e-05, "loss": 1.2086, "step": 4110 }, { "epoch": 1.5305464741538783, "grad_norm": 0.16927529871463776, "learning_rate": 1.9293732807979127e-05, "loss": 1.222, "step": 4111 }, { "epoch": 1.5309187793044874, "grad_norm": 0.17815326154232025, "learning_rate": 1.9293284115895656e-05, "loss": 1.2107, "step": 4112 }, { "epoch": 1.5312910844550967, "grad_norm": 0.17721591889858246, "learning_rate": 1.929283528655096e-05, "loss": 1.2062, "step": 4113 }, { "epoch": 1.5316633896057055, "grad_norm": 0.16361045837402344, "learning_rate": 1.929238631995167e-05, "loss": 1.2002, "step": 4114 }, { "epoch": 1.5320356947563147, "grad_norm": 0.17079226672649384, "learning_rate": 1.929193721610442e-05, "loss": 1.1955, "step": 4115 }, { "epoch": 1.5324079999069236, "grad_norm": 0.17936839163303375, "learning_rate": 1.9291487975015835e-05, "loss": 1.2175, "step": 4116 }, { "epoch": 1.5327803050575328, "grad_norm": 0.17177017033100128, "learning_rate": 1.9291038596692562e-05, "loss": 1.2078, "step": 4117 }, { "epoch": 1.5331526102081419, "grad_norm": 0.1675311028957367, "learning_rate": 1.929058908114123e-05, "loss": 1.2289, "step": 4118 }, { "epoch": 1.533524915358751, "grad_norm": 0.18256328999996185, "learning_rate": 1.9290139428368482e-05, "loss": 1.2118, "step": 4119 }, { "epoch": 1.53389722050936, "grad_norm": 0.16636022925376892, "learning_rate": 1.9289689638380956e-05, "loss": 1.2061, "step": 4120 }, { "epoch": 1.534269525659969, "grad_norm": 0.22456076741218567, "learning_rate": 1.9289239711185293e-05, "loss": 1.2134, "step": 4121 }, { "epoch": 1.5346418308105783, "grad_norm": 0.19078871607780457, "learning_rate": 1.928878964678815e-05, "loss": 1.2185, "step": 4122 }, { "epoch": 1.535014135961187, "grad_norm": 0.18136771023273468, "learning_rate": 1.928833944519616e-05, "loss": 1.2092, "step": 4123 }, { "epoch": 1.5353864411117963, "grad_norm": 0.17033730447292328, "learning_rate": 1.9287889106415983e-05, "loss": 1.2218, "step": 4124 }, { "epoch": 1.5357587462624052, "grad_norm": 0.17553742229938507, "learning_rate": 1.9287438630454268e-05, "loss": 1.2242, "step": 4125 }, { "epoch": 1.5361310514130144, "grad_norm": 0.18240036070346832, "learning_rate": 1.9286988017317664e-05, "loss": 1.2029, "step": 4126 }, { "epoch": 1.5365033565636235, "grad_norm": 0.17878971993923187, "learning_rate": 1.928653726701283e-05, "loss": 1.2057, "step": 4127 }, { "epoch": 1.5368756617142325, "grad_norm": 0.16980788111686707, "learning_rate": 1.9286086379546427e-05, "loss": 1.2071, "step": 4128 }, { "epoch": 1.5372479668648416, "grad_norm": 0.17709119617938995, "learning_rate": 1.9285635354925107e-05, "loss": 1.1857, "step": 4129 }, { "epoch": 1.5376202720154506, "grad_norm": 0.1764722466468811, "learning_rate": 1.9285184193155536e-05, "loss": 1.2124, "step": 4130 }, { "epoch": 1.5379925771660599, "grad_norm": 0.18060049414634705, "learning_rate": 1.9284732894244378e-05, "loss": 1.2073, "step": 4131 }, { "epoch": 1.5383648823166687, "grad_norm": 0.16817817091941833, "learning_rate": 1.92842814581983e-05, "loss": 1.2128, "step": 4132 }, { "epoch": 1.538737187467278, "grad_norm": 0.1727471500635147, "learning_rate": 1.9283829885023967e-05, "loss": 1.1974, "step": 4133 }, { "epoch": 1.539109492617887, "grad_norm": 0.1839704066514969, "learning_rate": 1.9283378174728046e-05, "loss": 1.2003, "step": 4134 }, { "epoch": 1.539481797768496, "grad_norm": 0.17027804255485535, "learning_rate": 1.9282926327317213e-05, "loss": 1.2135, "step": 4135 }, { "epoch": 1.539854102919105, "grad_norm": 0.1670842319726944, "learning_rate": 1.9282474342798143e-05, "loss": 1.2023, "step": 4136 }, { "epoch": 1.5402264080697141, "grad_norm": 0.1733679473400116, "learning_rate": 1.9282022221177507e-05, "loss": 1.2102, "step": 4137 }, { "epoch": 1.5405987132203232, "grad_norm": 0.17217591404914856, "learning_rate": 1.9281569962461986e-05, "loss": 1.2237, "step": 4138 }, { "epoch": 1.5409710183709322, "grad_norm": 0.174224391579628, "learning_rate": 1.928111756665826e-05, "loss": 1.2072, "step": 4139 }, { "epoch": 1.5413433235215415, "grad_norm": 0.17632028460502625, "learning_rate": 1.928066503377301e-05, "loss": 1.2182, "step": 4140 }, { "epoch": 1.5417156286721503, "grad_norm": 0.17433500289916992, "learning_rate": 1.9280212363812918e-05, "loss": 1.2297, "step": 4141 }, { "epoch": 1.5420879338227595, "grad_norm": 0.1642346978187561, "learning_rate": 1.9279759556784673e-05, "loss": 1.2103, "step": 4142 }, { "epoch": 1.5424602389733686, "grad_norm": 0.172978937625885, "learning_rate": 1.9279306612694963e-05, "loss": 1.2225, "step": 4143 }, { "epoch": 1.5428325441239776, "grad_norm": 0.17284728586673737, "learning_rate": 1.9278853531550475e-05, "loss": 1.2209, "step": 4144 }, { "epoch": 1.5432048492745867, "grad_norm": 0.16585254669189453, "learning_rate": 1.9278400313357902e-05, "loss": 1.2256, "step": 4145 }, { "epoch": 1.5435771544251957, "grad_norm": 0.1627526581287384, "learning_rate": 1.927794695812394e-05, "loss": 1.2172, "step": 4146 }, { "epoch": 1.543949459575805, "grad_norm": 0.17496748268604279, "learning_rate": 1.9277493465855287e-05, "loss": 1.197, "step": 4147 }, { "epoch": 1.5443217647264138, "grad_norm": 0.1616668850183487, "learning_rate": 1.9277039836558635e-05, "loss": 1.2088, "step": 4148 }, { "epoch": 1.544694069877023, "grad_norm": 0.16618521511554718, "learning_rate": 1.9276586070240684e-05, "loss": 1.215, "step": 4149 }, { "epoch": 1.5450663750276319, "grad_norm": 0.16764888167381287, "learning_rate": 1.927613216690814e-05, "loss": 1.2012, "step": 4150 }, { "epoch": 1.5454386801782412, "grad_norm": 0.166963592171669, "learning_rate": 1.9275678126567707e-05, "loss": 1.2234, "step": 4151 }, { "epoch": 1.5458109853288502, "grad_norm": 0.166300967335701, "learning_rate": 1.927522394922609e-05, "loss": 1.2119, "step": 4152 }, { "epoch": 1.5461832904794592, "grad_norm": 0.16552546620368958, "learning_rate": 1.9274769634889997e-05, "loss": 1.1968, "step": 4153 }, { "epoch": 1.5465555956300683, "grad_norm": 0.1656341403722763, "learning_rate": 1.927431518356614e-05, "loss": 1.2084, "step": 4154 }, { "epoch": 1.5469279007806773, "grad_norm": 0.16084742546081543, "learning_rate": 1.9273860595261232e-05, "loss": 1.2007, "step": 4155 }, { "epoch": 1.5473002059312866, "grad_norm": 0.16208963096141815, "learning_rate": 1.927340586998198e-05, "loss": 1.2251, "step": 4156 }, { "epoch": 1.5476725110818954, "grad_norm": 0.16170455515384674, "learning_rate": 1.9272951007735108e-05, "loss": 1.2208, "step": 4157 }, { "epoch": 1.5480448162325047, "grad_norm": 0.159234881401062, "learning_rate": 1.927249600852733e-05, "loss": 1.2103, "step": 4158 }, { "epoch": 1.5484171213831135, "grad_norm": 0.17041054368019104, "learning_rate": 1.927204087236537e-05, "loss": 1.2171, "step": 4159 }, { "epoch": 1.5487894265337228, "grad_norm": 0.16318999230861664, "learning_rate": 1.9271585599255945e-05, "loss": 1.2101, "step": 4160 }, { "epoch": 1.5491617316843318, "grad_norm": 0.15880700945854187, "learning_rate": 1.9271130189205786e-05, "loss": 1.1997, "step": 4161 }, { "epoch": 1.5495340368349408, "grad_norm": 0.16643404960632324, "learning_rate": 1.9270674642221614e-05, "loss": 1.193, "step": 4162 }, { "epoch": 1.5499063419855499, "grad_norm": 0.17623448371887207, "learning_rate": 1.927021895831016e-05, "loss": 1.2188, "step": 4163 }, { "epoch": 1.550278647136159, "grad_norm": 0.1635187864303589, "learning_rate": 1.9269763137478152e-05, "loss": 1.1997, "step": 4164 }, { "epoch": 1.5506509522867682, "grad_norm": 0.16812646389007568, "learning_rate": 1.9269307179732325e-05, "loss": 1.2083, "step": 4165 }, { "epoch": 1.551023257437377, "grad_norm": 0.164559006690979, "learning_rate": 1.9268851085079415e-05, "loss": 1.1901, "step": 4166 }, { "epoch": 1.5513955625879863, "grad_norm": 0.1598326563835144, "learning_rate": 1.9268394853526156e-05, "loss": 1.198, "step": 4167 }, { "epoch": 1.551767867738595, "grad_norm": 0.16687574982643127, "learning_rate": 1.9267938485079285e-05, "loss": 1.2133, "step": 4168 }, { "epoch": 1.5521401728892044, "grad_norm": 0.16783174872398376, "learning_rate": 1.9267481979745544e-05, "loss": 1.2217, "step": 4169 }, { "epoch": 1.5525124780398134, "grad_norm": 0.1639779806137085, "learning_rate": 1.9267025337531678e-05, "loss": 1.2243, "step": 4170 }, { "epoch": 1.5528847831904224, "grad_norm": 0.16327700018882751, "learning_rate": 1.9266568558444426e-05, "loss": 1.2213, "step": 4171 }, { "epoch": 1.5532570883410315, "grad_norm": 0.16662442684173584, "learning_rate": 1.926611164249054e-05, "loss": 1.198, "step": 4172 }, { "epoch": 1.5536293934916405, "grad_norm": 0.1651596873998642, "learning_rate": 1.9265654589676767e-05, "loss": 1.2113, "step": 4173 }, { "epoch": 1.5540016986422498, "grad_norm": 0.16614238917827606, "learning_rate": 1.9265197400009854e-05, "loss": 1.2092, "step": 4174 }, { "epoch": 1.5543740037928586, "grad_norm": 0.1736796498298645, "learning_rate": 1.926474007349656e-05, "loss": 1.2147, "step": 4175 }, { "epoch": 1.5547463089434679, "grad_norm": 0.1689198911190033, "learning_rate": 1.9264282610143638e-05, "loss": 1.2119, "step": 4176 }, { "epoch": 1.5551186140940767, "grad_norm": 0.16373711824417114, "learning_rate": 1.926382500995784e-05, "loss": 1.2077, "step": 4177 }, { "epoch": 1.555490919244686, "grad_norm": 0.1627718210220337, "learning_rate": 1.9263367272945927e-05, "loss": 1.2128, "step": 4178 }, { "epoch": 1.555863224395295, "grad_norm": 0.16306252777576447, "learning_rate": 1.9262909399114663e-05, "loss": 1.218, "step": 4179 }, { "epoch": 1.556235529545904, "grad_norm": 0.16775202751159668, "learning_rate": 1.9262451388470807e-05, "loss": 1.2039, "step": 4180 }, { "epoch": 1.556607834696513, "grad_norm": 0.1653129607439041, "learning_rate": 1.926199324102113e-05, "loss": 1.2081, "step": 4181 }, { "epoch": 1.5569801398471221, "grad_norm": 0.1624881476163864, "learning_rate": 1.9261534956772386e-05, "loss": 1.2163, "step": 4182 }, { "epoch": 1.5573524449977314, "grad_norm": 0.16071651875972748, "learning_rate": 1.9261076535731356e-05, "loss": 1.2008, "step": 4183 }, { "epoch": 1.5577247501483402, "grad_norm": 0.17238472402095795, "learning_rate": 1.926061797790481e-05, "loss": 1.221, "step": 4184 }, { "epoch": 1.5580970552989495, "grad_norm": 0.17098172008991241, "learning_rate": 1.9260159283299514e-05, "loss": 1.206, "step": 4185 }, { "epoch": 1.5584693604495583, "grad_norm": 0.164502814412117, "learning_rate": 1.9259700451922247e-05, "loss": 1.2157, "step": 4186 }, { "epoch": 1.5588416656001676, "grad_norm": 0.17409628629684448, "learning_rate": 1.9259241483779787e-05, "loss": 1.226, "step": 4187 }, { "epoch": 1.5592139707507766, "grad_norm": 0.17351455986499786, "learning_rate": 1.9258782378878908e-05, "loss": 1.2167, "step": 4188 }, { "epoch": 1.5595862759013857, "grad_norm": 0.16940516233444214, "learning_rate": 1.92583231372264e-05, "loss": 1.2044, "step": 4189 }, { "epoch": 1.5599585810519947, "grad_norm": 0.16605758666992188, "learning_rate": 1.9257863758829038e-05, "loss": 1.2009, "step": 4190 }, { "epoch": 1.5603308862026037, "grad_norm": 0.17315179109573364, "learning_rate": 1.9257404243693606e-05, "loss": 1.2139, "step": 4191 }, { "epoch": 1.560703191353213, "grad_norm": 0.1688748598098755, "learning_rate": 1.9256944591826893e-05, "loss": 1.2212, "step": 4192 }, { "epoch": 1.5610754965038218, "grad_norm": 0.16964520514011383, "learning_rate": 1.9256484803235696e-05, "loss": 1.2092, "step": 4193 }, { "epoch": 1.561447801654431, "grad_norm": 0.16854505240917206, "learning_rate": 1.9256024877926796e-05, "loss": 1.2015, "step": 4194 }, { "epoch": 1.5618201068050401, "grad_norm": 0.168868288397789, "learning_rate": 1.9255564815906988e-05, "loss": 1.2013, "step": 4195 }, { "epoch": 1.5621924119556492, "grad_norm": 0.17103056609630585, "learning_rate": 1.9255104617183068e-05, "loss": 1.211, "step": 4196 }, { "epoch": 1.5625647171062582, "grad_norm": 0.1685587465763092, "learning_rate": 1.9254644281761838e-05, "loss": 1.1984, "step": 4197 }, { "epoch": 1.5629370222568673, "grad_norm": 0.17588046193122864, "learning_rate": 1.925418380965009e-05, "loss": 1.2144, "step": 4198 }, { "epoch": 1.5633093274074763, "grad_norm": 0.16657619178295135, "learning_rate": 1.925372320085463e-05, "loss": 1.2011, "step": 4199 }, { "epoch": 1.5636816325580853, "grad_norm": 0.16856853663921356, "learning_rate": 1.9253262455382256e-05, "loss": 1.2057, "step": 4200 }, { "epoch": 1.5640539377086946, "grad_norm": 0.16726350784301758, "learning_rate": 1.925280157323978e-05, "loss": 1.2062, "step": 4201 }, { "epoch": 1.5644262428593034, "grad_norm": 0.18542660772800446, "learning_rate": 1.9252340554434003e-05, "loss": 1.21, "step": 4202 }, { "epoch": 1.5647985480099127, "grad_norm": 0.17337799072265625, "learning_rate": 1.9251879398971733e-05, "loss": 1.203, "step": 4203 }, { "epoch": 1.5651708531605217, "grad_norm": 0.1717308908700943, "learning_rate": 1.925141810685979e-05, "loss": 1.22, "step": 4204 }, { "epoch": 1.5655431583111308, "grad_norm": 0.1700487732887268, "learning_rate": 1.925095667810498e-05, "loss": 1.2018, "step": 4205 }, { "epoch": 1.5659154634617398, "grad_norm": 0.1768191158771515, "learning_rate": 1.9250495112714123e-05, "loss": 1.1853, "step": 4206 }, { "epoch": 1.5662877686123489, "grad_norm": 0.17097093164920807, "learning_rate": 1.9250033410694032e-05, "loss": 1.2047, "step": 4207 }, { "epoch": 1.5666600737629581, "grad_norm": 0.1712087094783783, "learning_rate": 1.9249571572051528e-05, "loss": 1.2028, "step": 4208 }, { "epoch": 1.567032378913567, "grad_norm": 0.16526564955711365, "learning_rate": 1.924910959679343e-05, "loss": 1.205, "step": 4209 }, { "epoch": 1.5674046840641762, "grad_norm": 0.16260464489459991, "learning_rate": 1.9248647484926568e-05, "loss": 1.201, "step": 4210 }, { "epoch": 1.567776989214785, "grad_norm": 0.1643940955400467, "learning_rate": 1.924818523645776e-05, "loss": 1.2147, "step": 4211 }, { "epoch": 1.5681492943653943, "grad_norm": 0.17409846186637878, "learning_rate": 1.9247722851393838e-05, "loss": 1.2043, "step": 4212 }, { "epoch": 1.5685215995160033, "grad_norm": 0.16997475922107697, "learning_rate": 1.924726032974163e-05, "loss": 1.2171, "step": 4213 }, { "epoch": 1.5688939046666124, "grad_norm": 0.171822652220726, "learning_rate": 1.9246797671507966e-05, "loss": 1.2126, "step": 4214 }, { "epoch": 1.5692662098172214, "grad_norm": 0.1666400134563446, "learning_rate": 1.9246334876699682e-05, "loss": 1.2152, "step": 4215 }, { "epoch": 1.5696385149678305, "grad_norm": 0.17441390454769135, "learning_rate": 1.924587194532361e-05, "loss": 1.2002, "step": 4216 }, { "epoch": 1.5700108201184397, "grad_norm": 0.18336564302444458, "learning_rate": 1.9245408877386592e-05, "loss": 1.2119, "step": 4217 }, { "epoch": 1.5703831252690486, "grad_norm": 0.16609658300876617, "learning_rate": 1.9244945672895464e-05, "loss": 1.2084, "step": 4218 }, { "epoch": 1.5707554304196578, "grad_norm": 0.17070931196212769, "learning_rate": 1.924448233185707e-05, "loss": 1.2045, "step": 4219 }, { "epoch": 1.5711277355702666, "grad_norm": 0.16091009974479675, "learning_rate": 1.924401885427825e-05, "loss": 1.1902, "step": 4220 }, { "epoch": 1.571500040720876, "grad_norm": 0.1693873405456543, "learning_rate": 1.9243555240165855e-05, "loss": 1.2213, "step": 4221 }, { "epoch": 1.571872345871485, "grad_norm": 0.17035622894763947, "learning_rate": 1.9243091489526728e-05, "loss": 1.2, "step": 4222 }, { "epoch": 1.572244651022094, "grad_norm": 0.17594295740127563, "learning_rate": 1.9242627602367716e-05, "loss": 1.2161, "step": 4223 }, { "epoch": 1.572616956172703, "grad_norm": 0.16813448071479797, "learning_rate": 1.924216357869568e-05, "loss": 1.2029, "step": 4224 }, { "epoch": 1.572989261323312, "grad_norm": 0.18052363395690918, "learning_rate": 1.924169941851746e-05, "loss": 1.2017, "step": 4225 }, { "epoch": 1.5733615664739213, "grad_norm": 0.18110094964504242, "learning_rate": 1.9241235121839927e-05, "loss": 1.1987, "step": 4226 }, { "epoch": 1.5737338716245302, "grad_norm": 0.17271529138088226, "learning_rate": 1.9240770688669928e-05, "loss": 1.2058, "step": 4227 }, { "epoch": 1.5741061767751394, "grad_norm": 0.1713894158601761, "learning_rate": 1.924030611901433e-05, "loss": 1.2011, "step": 4228 }, { "epoch": 1.5744784819257482, "grad_norm": 0.17483104765415192, "learning_rate": 1.9239841412879983e-05, "loss": 1.2166, "step": 4229 }, { "epoch": 1.5748507870763575, "grad_norm": 0.17396202683448792, "learning_rate": 1.9239376570273763e-05, "loss": 1.2059, "step": 4230 }, { "epoch": 1.5752230922269665, "grad_norm": 0.16693198680877686, "learning_rate": 1.923891159120253e-05, "loss": 1.2113, "step": 4231 }, { "epoch": 1.5755953973775756, "grad_norm": 0.17103157937526703, "learning_rate": 1.9238446475673155e-05, "loss": 1.2166, "step": 4232 }, { "epoch": 1.5759677025281846, "grad_norm": 0.1680799126625061, "learning_rate": 1.92379812236925e-05, "loss": 1.2189, "step": 4233 }, { "epoch": 1.5763400076787937, "grad_norm": 0.18774433434009552, "learning_rate": 1.9237515835267447e-05, "loss": 1.2121, "step": 4234 }, { "epoch": 1.576712312829403, "grad_norm": 0.16608703136444092, "learning_rate": 1.9237050310404862e-05, "loss": 1.1935, "step": 4235 }, { "epoch": 1.5770846179800118, "grad_norm": 0.17058461904525757, "learning_rate": 1.9236584649111628e-05, "loss": 1.2014, "step": 4236 }, { "epoch": 1.577456923130621, "grad_norm": 0.16331979632377625, "learning_rate": 1.923611885139461e-05, "loss": 1.2044, "step": 4237 }, { "epoch": 1.5778292282812298, "grad_norm": 0.17175064980983734, "learning_rate": 1.9235652917260705e-05, "loss": 1.2032, "step": 4238 }, { "epoch": 1.578201533431839, "grad_norm": 0.1720954328775406, "learning_rate": 1.9235186846716784e-05, "loss": 1.2092, "step": 4239 }, { "epoch": 1.5785738385824482, "grad_norm": 0.16295336186885834, "learning_rate": 1.923472063976973e-05, "loss": 1.1955, "step": 4240 }, { "epoch": 1.5789461437330572, "grad_norm": 0.16375908255577087, "learning_rate": 1.923425429642643e-05, "loss": 1.1909, "step": 4241 }, { "epoch": 1.5793184488836662, "grad_norm": 0.1621014028787613, "learning_rate": 1.923378781669378e-05, "loss": 1.201, "step": 4242 }, { "epoch": 1.5796907540342753, "grad_norm": 0.1648714393377304, "learning_rate": 1.9233321200578657e-05, "loss": 1.2119, "step": 4243 }, { "epoch": 1.5800630591848845, "grad_norm": 0.16513188183307648, "learning_rate": 1.9232854448087962e-05, "loss": 1.2129, "step": 4244 }, { "epoch": 1.5804353643354934, "grad_norm": 0.1719852089881897, "learning_rate": 1.9232387559228587e-05, "loss": 1.2095, "step": 4245 }, { "epoch": 1.5808076694861026, "grad_norm": 0.16509710252285004, "learning_rate": 1.9231920534007428e-05, "loss": 1.2118, "step": 4246 }, { "epoch": 1.5811799746367114, "grad_norm": 0.16327592730522156, "learning_rate": 1.923145337243138e-05, "loss": 1.2081, "step": 4247 }, { "epoch": 1.5815522797873207, "grad_norm": 0.17513926327228546, "learning_rate": 1.9230986074507347e-05, "loss": 1.2215, "step": 4248 }, { "epoch": 1.5819245849379298, "grad_norm": 0.17169274389743805, "learning_rate": 1.9230518640242228e-05, "loss": 1.2164, "step": 4249 }, { "epoch": 1.5822968900885388, "grad_norm": 0.17580832540988922, "learning_rate": 1.9230051069642927e-05, "loss": 1.2066, "step": 4250 }, { "epoch": 1.5826691952391478, "grad_norm": 0.16311529278755188, "learning_rate": 1.922958336271635e-05, "loss": 1.2061, "step": 4251 }, { "epoch": 1.5830415003897569, "grad_norm": 0.18009954690933228, "learning_rate": 1.922911551946941e-05, "loss": 1.2161, "step": 4252 }, { "epoch": 1.5834138055403661, "grad_norm": 0.17402635514736176, "learning_rate": 1.922864753990901e-05, "loss": 1.228, "step": 4253 }, { "epoch": 1.583786110690975, "grad_norm": 0.17226436734199524, "learning_rate": 1.9228179424042064e-05, "loss": 1.209, "step": 4254 }, { "epoch": 1.5841584158415842, "grad_norm": 0.1807158887386322, "learning_rate": 1.9227711171875486e-05, "loss": 1.202, "step": 4255 }, { "epoch": 1.5845307209921933, "grad_norm": 0.16312603652477264, "learning_rate": 1.92272427834162e-05, "loss": 1.2014, "step": 4256 }, { "epoch": 1.5849030261428023, "grad_norm": 0.16579662263393402, "learning_rate": 1.9226774258671112e-05, "loss": 1.208, "step": 4257 }, { "epoch": 1.5852753312934114, "grad_norm": 0.1716030240058899, "learning_rate": 1.9226305597647145e-05, "loss": 1.2052, "step": 4258 }, { "epoch": 1.5856476364440204, "grad_norm": 0.16442036628723145, "learning_rate": 1.922583680035123e-05, "loss": 1.2109, "step": 4259 }, { "epoch": 1.5860199415946294, "grad_norm": 0.15719719231128693, "learning_rate": 1.922536786679028e-05, "loss": 1.2087, "step": 4260 }, { "epoch": 1.5863922467452385, "grad_norm": 0.16500256955623627, "learning_rate": 1.9224898796971224e-05, "loss": 1.2206, "step": 4261 }, { "epoch": 1.5867645518958478, "grad_norm": 0.17196062207221985, "learning_rate": 1.9224429590900997e-05, "loss": 1.206, "step": 4262 }, { "epoch": 1.5871368570464566, "grad_norm": 0.17409005761146545, "learning_rate": 1.9223960248586523e-05, "loss": 1.2111, "step": 4263 }, { "epoch": 1.5875091621970658, "grad_norm": 0.17429877817630768, "learning_rate": 1.922349077003473e-05, "loss": 1.2316, "step": 4264 }, { "epoch": 1.5878814673476749, "grad_norm": 0.1687888205051422, "learning_rate": 1.922302115525256e-05, "loss": 1.2126, "step": 4265 }, { "epoch": 1.588253772498284, "grad_norm": 0.1785813719034195, "learning_rate": 1.922255140424695e-05, "loss": 1.2094, "step": 4266 }, { "epoch": 1.588626077648893, "grad_norm": 0.1841798722743988, "learning_rate": 1.922208151702483e-05, "loss": 1.2105, "step": 4267 }, { "epoch": 1.588998382799502, "grad_norm": 0.16933433711528778, "learning_rate": 1.9221611493593145e-05, "loss": 1.2078, "step": 4268 }, { "epoch": 1.5893706879501113, "grad_norm": 0.21182604134082794, "learning_rate": 1.9221141333958837e-05, "loss": 1.2086, "step": 4269 }, { "epoch": 1.58974299310072, "grad_norm": 0.18182890117168427, "learning_rate": 1.922067103812885e-05, "loss": 1.2054, "step": 4270 }, { "epoch": 1.5901152982513294, "grad_norm": 0.18493826687335968, "learning_rate": 1.9220200606110132e-05, "loss": 1.2018, "step": 4271 }, { "epoch": 1.5904876034019382, "grad_norm": 0.17452014982700348, "learning_rate": 1.921973003790963e-05, "loss": 1.2287, "step": 4272 }, { "epoch": 1.5908599085525474, "grad_norm": 0.16467060148715973, "learning_rate": 1.9219259333534292e-05, "loss": 1.2003, "step": 4273 }, { "epoch": 1.5912322137031565, "grad_norm": 0.18403173983097076, "learning_rate": 1.9218788492991075e-05, "loss": 1.228, "step": 4274 }, { "epoch": 1.5916045188537655, "grad_norm": 0.18004946410655975, "learning_rate": 1.921831751628693e-05, "loss": 1.202, "step": 4275 }, { "epoch": 1.5919768240043746, "grad_norm": 0.16663858294487, "learning_rate": 1.921784640342881e-05, "loss": 1.2187, "step": 4276 }, { "epoch": 1.5923491291549836, "grad_norm": 0.17807120084762573, "learning_rate": 1.921737515442368e-05, "loss": 1.2171, "step": 4277 }, { "epoch": 1.5927214343055929, "grad_norm": 0.18053670227527618, "learning_rate": 1.9216903769278498e-05, "loss": 1.2175, "step": 4278 }, { "epoch": 1.5930937394562017, "grad_norm": 0.17122212052345276, "learning_rate": 1.9216432248000224e-05, "loss": 1.2032, "step": 4279 }, { "epoch": 1.593466044606811, "grad_norm": 0.16505327820777893, "learning_rate": 1.9215960590595824e-05, "loss": 1.2039, "step": 4280 }, { "epoch": 1.5938383497574198, "grad_norm": 0.18405233323574066, "learning_rate": 1.9215488797072267e-05, "loss": 1.212, "step": 4281 }, { "epoch": 1.594210654908029, "grad_norm": 0.17196938395500183, "learning_rate": 1.9215016867436516e-05, "loss": 1.2118, "step": 4282 }, { "epoch": 1.594582960058638, "grad_norm": 0.16480191051959991, "learning_rate": 1.9214544801695547e-05, "loss": 1.2089, "step": 4283 }, { "epoch": 1.5949552652092471, "grad_norm": 0.16730982065200806, "learning_rate": 1.9214072599856326e-05, "loss": 1.2027, "step": 4284 }, { "epoch": 1.5953275703598562, "grad_norm": 0.17092877626419067, "learning_rate": 1.9213600261925832e-05, "loss": 1.2149, "step": 4285 }, { "epoch": 1.5956998755104652, "grad_norm": 0.1654144823551178, "learning_rate": 1.9213127787911045e-05, "loss": 1.2017, "step": 4286 }, { "epoch": 1.5960721806610745, "grad_norm": 0.17047090828418732, "learning_rate": 1.9212655177818935e-05, "loss": 1.1947, "step": 4287 }, { "epoch": 1.5964444858116833, "grad_norm": 0.17292653024196625, "learning_rate": 1.9212182431656487e-05, "loss": 1.2226, "step": 4288 }, { "epoch": 1.5968167909622926, "grad_norm": 0.16899065673351288, "learning_rate": 1.9211709549430678e-05, "loss": 1.196, "step": 4289 }, { "epoch": 1.5971890961129014, "grad_norm": 0.16645705699920654, "learning_rate": 1.92112365311485e-05, "loss": 1.2066, "step": 4290 }, { "epoch": 1.5975614012635106, "grad_norm": 0.1680358499288559, "learning_rate": 1.921076337681694e-05, "loss": 1.2134, "step": 4291 }, { "epoch": 1.5979337064141197, "grad_norm": 0.16596710681915283, "learning_rate": 1.9210290086442983e-05, "loss": 1.2085, "step": 4292 }, { "epoch": 1.5983060115647287, "grad_norm": 0.1577194184064865, "learning_rate": 1.9209816660033613e-05, "loss": 1.1921, "step": 4293 }, { "epoch": 1.5986783167153378, "grad_norm": 0.1676270067691803, "learning_rate": 1.9209343097595834e-05, "loss": 1.2075, "step": 4294 }, { "epoch": 1.5990506218659468, "grad_norm": 0.16375909745693207, "learning_rate": 1.9208869399136633e-05, "loss": 1.2086, "step": 4295 }, { "epoch": 1.599422927016556, "grad_norm": 0.16282792389392853, "learning_rate": 1.9208395564663012e-05, "loss": 1.2115, "step": 4296 }, { "epoch": 1.599795232167165, "grad_norm": 0.16538910567760468, "learning_rate": 1.9207921594181964e-05, "loss": 1.2063, "step": 4297 }, { "epoch": 1.6001675373177742, "grad_norm": 0.1656702756881714, "learning_rate": 1.9207447487700494e-05, "loss": 1.2123, "step": 4298 }, { "epoch": 1.600539842468383, "grad_norm": 0.16824668645858765, "learning_rate": 1.92069732452256e-05, "loss": 1.2085, "step": 4299 }, { "epoch": 1.6009121476189923, "grad_norm": 0.16301900148391724, "learning_rate": 1.920649886676429e-05, "loss": 1.2227, "step": 4300 }, { "epoch": 1.6012844527696013, "grad_norm": 0.173927441239357, "learning_rate": 1.920602435232357e-05, "loss": 1.2114, "step": 4301 }, { "epoch": 1.6016567579202103, "grad_norm": 0.17178916931152344, "learning_rate": 1.9205549701910445e-05, "loss": 1.2076, "step": 4302 }, { "epoch": 1.6020290630708194, "grad_norm": 0.16292187571525574, "learning_rate": 1.920507491553193e-05, "loss": 1.2231, "step": 4303 }, { "epoch": 1.6024013682214284, "grad_norm": 0.1758165806531906, "learning_rate": 1.9204599993195038e-05, "loss": 1.2088, "step": 4304 }, { "epoch": 1.6027736733720377, "grad_norm": 0.1699971705675125, "learning_rate": 1.920412493490678e-05, "loss": 1.2018, "step": 4305 }, { "epoch": 1.6031459785226465, "grad_norm": 0.16813789308071136, "learning_rate": 1.920364974067418e-05, "loss": 1.2054, "step": 4306 }, { "epoch": 1.6035182836732558, "grad_norm": 0.16365012526512146, "learning_rate": 1.9203174410504243e-05, "loss": 1.2084, "step": 4307 }, { "epoch": 1.6038905888238648, "grad_norm": 0.16683803498744965, "learning_rate": 1.9202698944404002e-05, "loss": 1.2064, "step": 4308 }, { "epoch": 1.6042628939744739, "grad_norm": 0.16585657000541687, "learning_rate": 1.9202223342380475e-05, "loss": 1.2156, "step": 4309 }, { "epoch": 1.604635199125083, "grad_norm": 0.16558226943016052, "learning_rate": 1.9201747604440686e-05, "loss": 1.2181, "step": 4310 }, { "epoch": 1.605007504275692, "grad_norm": 0.1677953600883484, "learning_rate": 1.920127173059166e-05, "loss": 1.2002, "step": 4311 }, { "epoch": 1.605379809426301, "grad_norm": 0.16227871179580688, "learning_rate": 1.920079572084043e-05, "loss": 1.201, "step": 4312 }, { "epoch": 1.60575211457691, "grad_norm": 0.16449685394763947, "learning_rate": 1.9200319575194025e-05, "loss": 1.2128, "step": 4313 }, { "epoch": 1.6061244197275193, "grad_norm": 0.1711970865726471, "learning_rate": 1.919984329365948e-05, "loss": 1.2181, "step": 4314 }, { "epoch": 1.6064967248781281, "grad_norm": 0.16856355965137482, "learning_rate": 1.919936687624382e-05, "loss": 1.2065, "step": 4315 }, { "epoch": 1.6068690300287374, "grad_norm": 0.1796281933784485, "learning_rate": 1.9198890322954092e-05, "loss": 1.1957, "step": 4316 }, { "epoch": 1.6072413351793464, "grad_norm": 0.17579133808612823, "learning_rate": 1.9198413633797334e-05, "loss": 1.2143, "step": 4317 }, { "epoch": 1.6076136403299555, "grad_norm": 0.16579852998256683, "learning_rate": 1.919793680878058e-05, "loss": 1.2078, "step": 4318 }, { "epoch": 1.6079859454805645, "grad_norm": 0.16327063739299774, "learning_rate": 1.9197459847910878e-05, "loss": 1.2, "step": 4319 }, { "epoch": 1.6083582506311735, "grad_norm": 0.16912123560905457, "learning_rate": 1.9196982751195272e-05, "loss": 1.2125, "step": 4320 }, { "epoch": 1.6087305557817826, "grad_norm": 0.16993340849876404, "learning_rate": 1.9196505518640807e-05, "loss": 1.2124, "step": 4321 }, { "epoch": 1.6091028609323916, "grad_norm": 0.16973137855529785, "learning_rate": 1.9196028150254535e-05, "loss": 1.2082, "step": 4322 }, { "epoch": 1.609475166083001, "grad_norm": 0.1582946479320526, "learning_rate": 1.91955506460435e-05, "loss": 1.2028, "step": 4323 }, { "epoch": 1.6098474712336097, "grad_norm": 0.17046526074409485, "learning_rate": 1.9195073006014762e-05, "loss": 1.2113, "step": 4324 }, { "epoch": 1.610219776384219, "grad_norm": 0.17357958853244781, "learning_rate": 1.9194595230175373e-05, "loss": 1.2014, "step": 4325 }, { "epoch": 1.610592081534828, "grad_norm": 0.17264226078987122, "learning_rate": 1.9194117318532387e-05, "loss": 1.1983, "step": 4326 }, { "epoch": 1.610964386685437, "grad_norm": 0.1646987348794937, "learning_rate": 1.9193639271092866e-05, "loss": 1.2043, "step": 4327 }, { "epoch": 1.611336691836046, "grad_norm": 0.17559926211833954, "learning_rate": 1.919316108786387e-05, "loss": 1.2055, "step": 4328 }, { "epoch": 1.6117089969866552, "grad_norm": 0.1712273806333542, "learning_rate": 1.9192682768852464e-05, "loss": 1.2072, "step": 4329 }, { "epoch": 1.6120813021372644, "grad_norm": 0.16717827320098877, "learning_rate": 1.919220431406571e-05, "loss": 1.2085, "step": 4330 }, { "epoch": 1.6124536072878732, "grad_norm": 0.1609518676996231, "learning_rate": 1.919172572351067e-05, "loss": 1.2075, "step": 4331 }, { "epoch": 1.6128259124384825, "grad_norm": 0.1639975607395172, "learning_rate": 1.9191246997194426e-05, "loss": 1.2182, "step": 4332 }, { "epoch": 1.6131982175890913, "grad_norm": 0.16888481378555298, "learning_rate": 1.9190768135124034e-05, "loss": 1.1964, "step": 4333 }, { "epoch": 1.6135705227397006, "grad_norm": 0.17243389785289764, "learning_rate": 1.9190289137306577e-05, "loss": 1.2095, "step": 4334 }, { "epoch": 1.6139428278903096, "grad_norm": 0.16882330179214478, "learning_rate": 1.9189810003749125e-05, "loss": 1.2119, "step": 4335 }, { "epoch": 1.6143151330409187, "grad_norm": 0.16220669448375702, "learning_rate": 1.9189330734458757e-05, "loss": 1.2105, "step": 4336 }, { "epoch": 1.6146874381915277, "grad_norm": 0.16079674661159515, "learning_rate": 1.918885132944255e-05, "loss": 1.2148, "step": 4337 }, { "epoch": 1.6150597433421368, "grad_norm": 0.17351467907428741, "learning_rate": 1.9188371788707585e-05, "loss": 1.2062, "step": 4338 }, { "epoch": 1.615432048492746, "grad_norm": 0.1602046936750412, "learning_rate": 1.9187892112260944e-05, "loss": 1.203, "step": 4339 }, { "epoch": 1.6158043536433548, "grad_norm": 0.1660042405128479, "learning_rate": 1.9187412300109714e-05, "loss": 1.2168, "step": 4340 }, { "epoch": 1.616176658793964, "grad_norm": 0.16898569464683533, "learning_rate": 1.9186932352260984e-05, "loss": 1.2069, "step": 4341 }, { "epoch": 1.616548963944573, "grad_norm": 0.17615294456481934, "learning_rate": 1.9186452268721838e-05, "loss": 1.2044, "step": 4342 }, { "epoch": 1.6169212690951822, "grad_norm": 0.17343638837337494, "learning_rate": 1.9185972049499368e-05, "loss": 1.2107, "step": 4343 }, { "epoch": 1.6172935742457912, "grad_norm": 0.15935227274894714, "learning_rate": 1.9185491694600668e-05, "loss": 1.2153, "step": 4344 }, { "epoch": 1.6176658793964003, "grad_norm": 0.1717858910560608, "learning_rate": 1.9185011204032832e-05, "loss": 1.211, "step": 4345 }, { "epoch": 1.6180381845470093, "grad_norm": 0.17080077528953552, "learning_rate": 1.9184530577802953e-05, "loss": 1.1987, "step": 4346 }, { "epoch": 1.6184104896976184, "grad_norm": 0.1667952537536621, "learning_rate": 1.918404981591814e-05, "loss": 1.2066, "step": 4347 }, { "epoch": 1.6187827948482276, "grad_norm": 0.17135196924209595, "learning_rate": 1.9183568918385484e-05, "loss": 1.1975, "step": 4348 }, { "epoch": 1.6191550999988364, "grad_norm": 0.16429933905601501, "learning_rate": 1.918308788521209e-05, "loss": 1.1992, "step": 4349 }, { "epoch": 1.6195274051494457, "grad_norm": 0.16419640183448792, "learning_rate": 1.918260671640507e-05, "loss": 1.1938, "step": 4350 }, { "epoch": 1.6198997103000545, "grad_norm": 0.1755063682794571, "learning_rate": 1.9182125411971522e-05, "loss": 1.2272, "step": 4351 }, { "epoch": 1.6202720154506638, "grad_norm": 0.17690351605415344, "learning_rate": 1.9181643971918557e-05, "loss": 1.1957, "step": 4352 }, { "epoch": 1.6206443206012728, "grad_norm": 0.16387419402599335, "learning_rate": 1.9181162396253286e-05, "loss": 1.1932, "step": 4353 }, { "epoch": 1.6210166257518819, "grad_norm": 0.1648489236831665, "learning_rate": 1.918068068498282e-05, "loss": 1.2002, "step": 4354 }, { "epoch": 1.621388930902491, "grad_norm": 0.171859472990036, "learning_rate": 1.9180198838114284e-05, "loss": 1.1869, "step": 4355 }, { "epoch": 1.6217612360531, "grad_norm": 0.1747763305902481, "learning_rate": 1.9179716855654783e-05, "loss": 1.2189, "step": 4356 }, { "epoch": 1.6221335412037092, "grad_norm": 0.1828392744064331, "learning_rate": 1.917923473761144e-05, "loss": 1.1953, "step": 4357 }, { "epoch": 1.622505846354318, "grad_norm": 0.16498731076717377, "learning_rate": 1.917875248399138e-05, "loss": 1.2088, "step": 4358 }, { "epoch": 1.6228781515049273, "grad_norm": 0.17804205417633057, "learning_rate": 1.9178270094801713e-05, "loss": 1.2074, "step": 4359 }, { "epoch": 1.6232504566555361, "grad_norm": 0.1724364459514618, "learning_rate": 1.917778757004958e-05, "loss": 1.2236, "step": 4360 }, { "epoch": 1.6236227618061454, "grad_norm": 0.17162750661373138, "learning_rate": 1.91773049097421e-05, "loss": 1.2016, "step": 4361 }, { "epoch": 1.6239950669567544, "grad_norm": 0.17239919304847717, "learning_rate": 1.91768221138864e-05, "loss": 1.1871, "step": 4362 }, { "epoch": 1.6243673721073635, "grad_norm": 0.1668175607919693, "learning_rate": 1.9176339182489614e-05, "loss": 1.1971, "step": 4363 }, { "epoch": 1.6247396772579725, "grad_norm": 0.1689068228006363, "learning_rate": 1.9175856115558876e-05, "loss": 1.2086, "step": 4364 }, { "epoch": 1.6251119824085816, "grad_norm": 0.16748180985450745, "learning_rate": 1.9175372913101317e-05, "loss": 1.2124, "step": 4365 }, { "epoch": 1.6254842875591908, "grad_norm": 0.16516231000423431, "learning_rate": 1.9174889575124077e-05, "loss": 1.1986, "step": 4366 }, { "epoch": 1.6258565927097997, "grad_norm": 0.16919684410095215, "learning_rate": 1.9174406101634294e-05, "loss": 1.2042, "step": 4367 }, { "epoch": 1.626228897860409, "grad_norm": 0.1688244789838791, "learning_rate": 1.917392249263911e-05, "loss": 1.1963, "step": 4368 }, { "epoch": 1.626601203011018, "grad_norm": 0.17357051372528076, "learning_rate": 1.917343874814566e-05, "loss": 1.2133, "step": 4369 }, { "epoch": 1.626973508161627, "grad_norm": 0.16557662189006805, "learning_rate": 1.9172954868161098e-05, "loss": 1.1971, "step": 4370 }, { "epoch": 1.627345813312236, "grad_norm": 0.16236409544944763, "learning_rate": 1.9172470852692572e-05, "loss": 1.1992, "step": 4371 }, { "epoch": 1.627718118462845, "grad_norm": 0.16975420713424683, "learning_rate": 1.9171986701747227e-05, "loss": 1.1992, "step": 4372 }, { "epoch": 1.6280904236134541, "grad_norm": 0.1642945110797882, "learning_rate": 1.9171502415332214e-05, "loss": 1.2119, "step": 4373 }, { "epoch": 1.6284627287640632, "grad_norm": 0.15670865774154663, "learning_rate": 1.9171017993454684e-05, "loss": 1.1994, "step": 4374 }, { "epoch": 1.6288350339146724, "grad_norm": 0.16495923697948456, "learning_rate": 1.9170533436121793e-05, "loss": 1.2089, "step": 4375 }, { "epoch": 1.6292073390652813, "grad_norm": 0.16799496114253998, "learning_rate": 1.9170048743340698e-05, "loss": 1.1993, "step": 4376 }, { "epoch": 1.6295796442158905, "grad_norm": 0.1674148440361023, "learning_rate": 1.9169563915118562e-05, "loss": 1.2032, "step": 4377 }, { "epoch": 1.6299519493664996, "grad_norm": 0.16675646603107452, "learning_rate": 1.9169078951462537e-05, "loss": 1.2214, "step": 4378 }, { "epoch": 1.6303242545171086, "grad_norm": 0.16630315780639648, "learning_rate": 1.9168593852379798e-05, "loss": 1.1958, "step": 4379 }, { "epoch": 1.6306965596677176, "grad_norm": 0.16184255480766296, "learning_rate": 1.91681086178775e-05, "loss": 1.2152, "step": 4380 }, { "epoch": 1.6310688648183267, "grad_norm": 0.1670718640089035, "learning_rate": 1.9167623247962816e-05, "loss": 1.2088, "step": 4381 }, { "epoch": 1.6314411699689357, "grad_norm": 0.1772667020559311, "learning_rate": 1.916713774264291e-05, "loss": 1.2204, "step": 4382 }, { "epoch": 1.6318134751195448, "grad_norm": 0.16231748461723328, "learning_rate": 1.916665210192495e-05, "loss": 1.1968, "step": 4383 }, { "epoch": 1.632185780270154, "grad_norm": 0.1662776619195938, "learning_rate": 1.916616632581612e-05, "loss": 1.1995, "step": 4384 }, { "epoch": 1.6325580854207629, "grad_norm": 0.16376206278800964, "learning_rate": 1.9165680414323585e-05, "loss": 1.207, "step": 4385 }, { "epoch": 1.6329303905713721, "grad_norm": 0.16616666316986084, "learning_rate": 1.916519436745453e-05, "loss": 1.2061, "step": 4386 }, { "epoch": 1.6333026957219812, "grad_norm": 0.16720803081989288, "learning_rate": 1.9164708185216122e-05, "loss": 1.2103, "step": 4387 }, { "epoch": 1.6336750008725902, "grad_norm": 0.16905055940151215, "learning_rate": 1.9164221867615556e-05, "loss": 1.2003, "step": 4388 }, { "epoch": 1.6340473060231993, "grad_norm": 0.17031989991664886, "learning_rate": 1.9163735414660005e-05, "loss": 1.1962, "step": 4389 }, { "epoch": 1.6344196111738083, "grad_norm": 0.17323459684848785, "learning_rate": 1.9163248826356657e-05, "loss": 1.2219, "step": 4390 }, { "epoch": 1.6347919163244176, "grad_norm": 0.17302441596984863, "learning_rate": 1.91627621027127e-05, "loss": 1.2199, "step": 4391 }, { "epoch": 1.6351642214750264, "grad_norm": 0.19569987058639526, "learning_rate": 1.916227524373532e-05, "loss": 1.2103, "step": 4392 }, { "epoch": 1.6355365266256356, "grad_norm": 0.16403131186962128, "learning_rate": 1.916178824943171e-05, "loss": 1.1942, "step": 4393 }, { "epoch": 1.6359088317762445, "grad_norm": 0.17407037317752838, "learning_rate": 1.9161301119809065e-05, "loss": 1.2064, "step": 4394 }, { "epoch": 1.6362811369268537, "grad_norm": 0.19399769604206085, "learning_rate": 1.916081385487458e-05, "loss": 1.2097, "step": 4395 }, { "epoch": 1.6366534420774628, "grad_norm": 0.1640399843454361, "learning_rate": 1.9160326454635442e-05, "loss": 1.208, "step": 4396 }, { "epoch": 1.6370257472280718, "grad_norm": 0.1667345017194748, "learning_rate": 1.9159838919098862e-05, "loss": 1.1953, "step": 4397 }, { "epoch": 1.6373980523786809, "grad_norm": 0.16879738867282867, "learning_rate": 1.9159351248272032e-05, "loss": 1.2016, "step": 4398 }, { "epoch": 1.63777035752929, "grad_norm": 0.19763416051864624, "learning_rate": 1.9158863442162162e-05, "loss": 1.2099, "step": 4399 }, { "epoch": 1.6381426626798992, "grad_norm": 0.1749972701072693, "learning_rate": 1.9158375500776454e-05, "loss": 1.2072, "step": 4400 }, { "epoch": 1.638514967830508, "grad_norm": 0.17299896478652954, "learning_rate": 1.9157887424122112e-05, "loss": 1.2046, "step": 4401 }, { "epoch": 1.6388872729811172, "grad_norm": 0.16272982954978943, "learning_rate": 1.915739921220635e-05, "loss": 1.2075, "step": 4402 }, { "epoch": 1.639259578131726, "grad_norm": 0.186982199549675, "learning_rate": 1.9156910865036375e-05, "loss": 1.2091, "step": 4403 }, { "epoch": 1.6396318832823353, "grad_norm": 0.1684504896402359, "learning_rate": 1.91564223826194e-05, "loss": 1.212, "step": 4404 }, { "epoch": 1.6400041884329444, "grad_norm": 0.17304614186286926, "learning_rate": 1.9155933764962645e-05, "loss": 1.2291, "step": 4405 }, { "epoch": 1.6403764935835534, "grad_norm": 0.1625901609659195, "learning_rate": 1.915544501207332e-05, "loss": 1.2118, "step": 4406 }, { "epoch": 1.6407487987341625, "grad_norm": 0.17311595380306244, "learning_rate": 1.915495612395865e-05, "loss": 1.2139, "step": 4407 }, { "epoch": 1.6411211038847715, "grad_norm": 0.17164155840873718, "learning_rate": 1.9154467100625848e-05, "loss": 1.2048, "step": 4408 }, { "epoch": 1.6414934090353808, "grad_norm": 0.169975146651268, "learning_rate": 1.9153977942082143e-05, "loss": 1.2059, "step": 4409 }, { "epoch": 1.6418657141859896, "grad_norm": 0.17380517721176147, "learning_rate": 1.915348864833476e-05, "loss": 1.2203, "step": 4410 }, { "epoch": 1.6422380193365989, "grad_norm": 0.16856248676776886, "learning_rate": 1.9152999219390924e-05, "loss": 1.2108, "step": 4411 }, { "epoch": 1.6426103244872077, "grad_norm": 0.17079463601112366, "learning_rate": 1.915250965525786e-05, "loss": 1.207, "step": 4412 }, { "epoch": 1.642982629637817, "grad_norm": 0.16879814863204956, "learning_rate": 1.9152019955942808e-05, "loss": 1.2071, "step": 4413 }, { "epoch": 1.643354934788426, "grad_norm": 0.17056001722812653, "learning_rate": 1.915153012145299e-05, "loss": 1.2001, "step": 4414 }, { "epoch": 1.643727239939035, "grad_norm": 0.1714567393064499, "learning_rate": 1.915104015179565e-05, "loss": 1.2208, "step": 4415 }, { "epoch": 1.644099545089644, "grad_norm": 0.16902725398540497, "learning_rate": 1.9150550046978022e-05, "loss": 1.2072, "step": 4416 }, { "epoch": 1.644471850240253, "grad_norm": 0.1708216667175293, "learning_rate": 1.9150059807007343e-05, "loss": 1.204, "step": 4417 }, { "epoch": 1.6448441553908624, "grad_norm": 0.17180079221725464, "learning_rate": 1.9149569431890854e-05, "loss": 1.2099, "step": 4418 }, { "epoch": 1.6452164605414712, "grad_norm": 0.17007555067539215, "learning_rate": 1.91490789216358e-05, "loss": 1.217, "step": 4419 }, { "epoch": 1.6455887656920805, "grad_norm": 0.16884230077266693, "learning_rate": 1.9148588276249423e-05, "loss": 1.2053, "step": 4420 }, { "epoch": 1.6459610708426893, "grad_norm": 0.16880488395690918, "learning_rate": 1.9148097495738974e-05, "loss": 1.1978, "step": 4421 }, { "epoch": 1.6463333759932985, "grad_norm": 0.16282446682453156, "learning_rate": 1.9147606580111696e-05, "loss": 1.2012, "step": 4422 }, { "epoch": 1.6467056811439076, "grad_norm": 0.17796848714351654, "learning_rate": 1.9147115529374846e-05, "loss": 1.2069, "step": 4423 }, { "epoch": 1.6470779862945166, "grad_norm": 0.16863879561424255, "learning_rate": 1.914662434353567e-05, "loss": 1.2175, "step": 4424 }, { "epoch": 1.6474502914451257, "grad_norm": 0.1625177413225174, "learning_rate": 1.914613302260143e-05, "loss": 1.1944, "step": 4425 }, { "epoch": 1.6478225965957347, "grad_norm": 0.1714029610157013, "learning_rate": 1.9145641566579377e-05, "loss": 1.2129, "step": 4426 }, { "epoch": 1.648194901746344, "grad_norm": 0.16849827766418457, "learning_rate": 1.914514997547677e-05, "loss": 1.2187, "step": 4427 }, { "epoch": 1.6485672068969528, "grad_norm": 0.16397565603256226, "learning_rate": 1.9144658249300877e-05, "loss": 1.1988, "step": 4428 }, { "epoch": 1.648939512047562, "grad_norm": 0.16494201123714447, "learning_rate": 1.9144166388058952e-05, "loss": 1.2045, "step": 4429 }, { "epoch": 1.649311817198171, "grad_norm": 0.16765427589416504, "learning_rate": 1.9143674391758264e-05, "loss": 1.1967, "step": 4430 }, { "epoch": 1.6496841223487801, "grad_norm": 0.16366161406040192, "learning_rate": 1.914318226040608e-05, "loss": 1.1777, "step": 4431 }, { "epoch": 1.6500564274993892, "grad_norm": 0.16279412806034088, "learning_rate": 1.9142689994009666e-05, "loss": 1.2158, "step": 4432 }, { "epoch": 1.6504287326499982, "grad_norm": 0.16934585571289062, "learning_rate": 1.9142197592576294e-05, "loss": 1.2082, "step": 4433 }, { "epoch": 1.6508010378006073, "grad_norm": 0.17340950667858124, "learning_rate": 1.914170505611324e-05, "loss": 1.2204, "step": 4434 }, { "epoch": 1.6511733429512163, "grad_norm": 0.16891925036907196, "learning_rate": 1.9141212384627777e-05, "loss": 1.2084, "step": 4435 }, { "epoch": 1.6515456481018256, "grad_norm": 0.1604902744293213, "learning_rate": 1.914071957812718e-05, "loss": 1.1887, "step": 4436 }, { "epoch": 1.6519179532524344, "grad_norm": 0.17385107278823853, "learning_rate": 1.9140226636618726e-05, "loss": 1.2132, "step": 4437 }, { "epoch": 1.6522902584030437, "grad_norm": 0.17028431594371796, "learning_rate": 1.91397335601097e-05, "loss": 1.1935, "step": 4438 }, { "epoch": 1.6526625635536527, "grad_norm": 0.17365415394306183, "learning_rate": 1.913924034860738e-05, "loss": 1.2097, "step": 4439 }, { "epoch": 1.6530348687042618, "grad_norm": 0.1641881912946701, "learning_rate": 1.913874700211906e-05, "loss": 1.2043, "step": 4440 }, { "epoch": 1.6534071738548708, "grad_norm": 0.17010696232318878, "learning_rate": 1.9138253520652014e-05, "loss": 1.1914, "step": 4441 }, { "epoch": 1.6537794790054798, "grad_norm": 0.16985437273979187, "learning_rate": 1.913775990421354e-05, "loss": 1.2232, "step": 4442 }, { "epoch": 1.654151784156089, "grad_norm": 0.16199250519275665, "learning_rate": 1.9137266152810925e-05, "loss": 1.1949, "step": 4443 }, { "epoch": 1.654524089306698, "grad_norm": 0.16791431605815887, "learning_rate": 1.9136772266451462e-05, "loss": 1.2102, "step": 4444 }, { "epoch": 1.6548963944573072, "grad_norm": 0.16607221961021423, "learning_rate": 1.9136278245142446e-05, "loss": 1.205, "step": 4445 }, { "epoch": 1.655268699607916, "grad_norm": 0.16292716562747955, "learning_rate": 1.9135784088891175e-05, "loss": 1.2006, "step": 4446 }, { "epoch": 1.6556410047585253, "grad_norm": 0.1653973013162613, "learning_rate": 1.9135289797704946e-05, "loss": 1.191, "step": 4447 }, { "epoch": 1.6560133099091343, "grad_norm": 0.17572778463363647, "learning_rate": 1.913479537159106e-05, "loss": 1.2133, "step": 4448 }, { "epoch": 1.6563856150597434, "grad_norm": 0.17195762693881989, "learning_rate": 1.913430081055682e-05, "loss": 1.1863, "step": 4449 }, { "epoch": 1.6567579202103524, "grad_norm": 0.17644944787025452, "learning_rate": 1.9133806114609527e-05, "loss": 1.1969, "step": 4450 }, { "epoch": 1.6571302253609614, "grad_norm": 0.16783183813095093, "learning_rate": 1.9133311283756493e-05, "loss": 1.2021, "step": 4451 }, { "epoch": 1.6575025305115707, "grad_norm": 0.17917399108409882, "learning_rate": 1.9132816318005026e-05, "loss": 1.2142, "step": 4452 }, { "epoch": 1.6578748356621795, "grad_norm": 0.16448262333869934, "learning_rate": 1.9132321217362434e-05, "loss": 1.205, "step": 4453 }, { "epoch": 1.6582471408127888, "grad_norm": 0.1729598045349121, "learning_rate": 1.913182598183603e-05, "loss": 1.202, "step": 4454 }, { "epoch": 1.6586194459633976, "grad_norm": 0.1698109656572342, "learning_rate": 1.913133061143313e-05, "loss": 1.1936, "step": 4455 }, { "epoch": 1.6589917511140069, "grad_norm": 0.1744636744260788, "learning_rate": 1.913083510616105e-05, "loss": 1.1998, "step": 4456 }, { "epoch": 1.659364056264616, "grad_norm": 0.1762547791004181, "learning_rate": 1.9130339466027108e-05, "loss": 1.2128, "step": 4457 }, { "epoch": 1.659736361415225, "grad_norm": 0.16493569314479828, "learning_rate": 1.9129843691038625e-05, "loss": 1.205, "step": 4458 }, { "epoch": 1.660108666565834, "grad_norm": 0.16134680807590485, "learning_rate": 1.9129347781202924e-05, "loss": 1.1999, "step": 4459 }, { "epoch": 1.660480971716443, "grad_norm": 0.16520054638385773, "learning_rate": 1.912885173652733e-05, "loss": 1.2073, "step": 4460 }, { "epoch": 1.6608532768670523, "grad_norm": 0.16811229288578033, "learning_rate": 1.9128355557019168e-05, "loss": 1.2041, "step": 4461 }, { "epoch": 1.6612255820176611, "grad_norm": 0.16674669086933136, "learning_rate": 1.912785924268577e-05, "loss": 1.212, "step": 4462 }, { "epoch": 1.6615978871682704, "grad_norm": 0.17780810594558716, "learning_rate": 1.912736279353446e-05, "loss": 1.2236, "step": 4463 }, { "epoch": 1.6619701923188792, "grad_norm": 0.18256306648254395, "learning_rate": 1.9126866209572575e-05, "loss": 1.2028, "step": 4464 }, { "epoch": 1.6623424974694885, "grad_norm": 0.18052802979946136, "learning_rate": 1.912636949080745e-05, "loss": 1.215, "step": 4465 }, { "epoch": 1.6627148026200975, "grad_norm": 0.16654305160045624, "learning_rate": 1.912587263724642e-05, "loss": 1.2013, "step": 4466 }, { "epoch": 1.6630871077707066, "grad_norm": 0.17217010259628296, "learning_rate": 1.9125375648896823e-05, "loss": 1.218, "step": 4467 }, { "epoch": 1.6634594129213156, "grad_norm": 0.17608584463596344, "learning_rate": 1.9124878525766002e-05, "loss": 1.2081, "step": 4468 }, { "epoch": 1.6638317180719246, "grad_norm": 0.16333593428134918, "learning_rate": 1.9124381267861295e-05, "loss": 1.2036, "step": 4469 }, { "epoch": 1.664204023222534, "grad_norm": 0.1907689869403839, "learning_rate": 1.9123883875190052e-05, "loss": 1.202, "step": 4470 }, { "epoch": 1.6645763283731427, "grad_norm": 0.18991775810718536, "learning_rate": 1.9123386347759614e-05, "loss": 1.2024, "step": 4471 }, { "epoch": 1.664948633523752, "grad_norm": 0.17199549078941345, "learning_rate": 1.9122888685577337e-05, "loss": 1.2002, "step": 4472 }, { "epoch": 1.6653209386743608, "grad_norm": 0.20347337424755096, "learning_rate": 1.9122390888650564e-05, "loss": 1.2212, "step": 4473 }, { "epoch": 1.66569324382497, "grad_norm": 0.16607064008712769, "learning_rate": 1.912189295698665e-05, "loss": 1.2063, "step": 4474 }, { "epoch": 1.6660655489755791, "grad_norm": 0.16356223821640015, "learning_rate": 1.9121394890592948e-05, "loss": 1.2087, "step": 4475 }, { "epoch": 1.6664378541261882, "grad_norm": 0.1626313030719757, "learning_rate": 1.9120896689476817e-05, "loss": 1.1938, "step": 4476 }, { "epoch": 1.6668101592767972, "grad_norm": 0.17197953164577484, "learning_rate": 1.9120398353645615e-05, "loss": 1.2206, "step": 4477 }, { "epoch": 1.6671824644274063, "grad_norm": 0.16226613521575928, "learning_rate": 1.9119899883106702e-05, "loss": 1.2073, "step": 4478 }, { "epoch": 1.6675547695780155, "grad_norm": 0.16794995963573456, "learning_rate": 1.911940127786744e-05, "loss": 1.2096, "step": 4479 }, { "epoch": 1.6679270747286243, "grad_norm": 0.17185860872268677, "learning_rate": 1.911890253793519e-05, "loss": 1.1917, "step": 4480 }, { "epoch": 1.6682993798792336, "grad_norm": 0.16930022835731506, "learning_rate": 1.9118403663317323e-05, "loss": 1.2207, "step": 4481 }, { "epoch": 1.6686716850298424, "grad_norm": 0.16357828676700592, "learning_rate": 1.911790465402121e-05, "loss": 1.2031, "step": 4482 }, { "epoch": 1.6690439901804517, "grad_norm": 0.16401416063308716, "learning_rate": 1.9117405510054216e-05, "loss": 1.1962, "step": 4483 }, { "epoch": 1.6694162953310607, "grad_norm": 0.1795455366373062, "learning_rate": 1.9116906231423712e-05, "loss": 1.2093, "step": 4484 }, { "epoch": 1.6697886004816698, "grad_norm": 0.17180180549621582, "learning_rate": 1.9116406818137077e-05, "loss": 1.2082, "step": 4485 }, { "epoch": 1.6701609056322788, "grad_norm": 0.16501714289188385, "learning_rate": 1.9115907270201684e-05, "loss": 1.1998, "step": 4486 }, { "epoch": 1.6705332107828879, "grad_norm": 0.16321603953838348, "learning_rate": 1.9115407587624915e-05, "loss": 1.2053, "step": 4487 }, { "epoch": 1.6709055159334971, "grad_norm": 0.16696272790431976, "learning_rate": 1.911490777041415e-05, "loss": 1.2012, "step": 4488 }, { "epoch": 1.671277821084106, "grad_norm": 0.16422761976718903, "learning_rate": 1.9114407818576767e-05, "loss": 1.1892, "step": 4489 }, { "epoch": 1.6716501262347152, "grad_norm": 0.17487448453903198, "learning_rate": 1.911390773212015e-05, "loss": 1.2041, "step": 4490 }, { "epoch": 1.6720224313853242, "grad_norm": 0.16392962634563446, "learning_rate": 1.911340751105169e-05, "loss": 1.2164, "step": 4491 }, { "epoch": 1.6723947365359333, "grad_norm": 0.1618296504020691, "learning_rate": 1.9112907155378772e-05, "loss": 1.2109, "step": 4492 }, { "epoch": 1.6727670416865423, "grad_norm": 0.17908450961112976, "learning_rate": 1.911240666510879e-05, "loss": 1.2248, "step": 4493 }, { "epoch": 1.6731393468371514, "grad_norm": 0.17229048907756805, "learning_rate": 1.9111906040249134e-05, "loss": 1.2005, "step": 4494 }, { "epoch": 1.6735116519877604, "grad_norm": 0.16625456511974335, "learning_rate": 1.9111405280807192e-05, "loss": 1.1994, "step": 4495 }, { "epoch": 1.6738839571383695, "grad_norm": 0.18167874217033386, "learning_rate": 1.911090438679037e-05, "loss": 1.1959, "step": 4496 }, { "epoch": 1.6742562622889787, "grad_norm": 0.16596947610378265, "learning_rate": 1.911040335820606e-05, "loss": 1.1882, "step": 4497 }, { "epoch": 1.6746285674395875, "grad_norm": 0.17403769493103027, "learning_rate": 1.9109902195061666e-05, "loss": 1.2009, "step": 4498 }, { "epoch": 1.6750008725901968, "grad_norm": 0.1789177507162094, "learning_rate": 1.9109400897364584e-05, "loss": 1.2039, "step": 4499 }, { "epoch": 1.6753731777408059, "grad_norm": 0.1641203761100769, "learning_rate": 1.9108899465122227e-05, "loss": 1.2151, "step": 4500 }, { "epoch": 1.6753731777408059, "eval_loss": 1.3091740608215332, "eval_runtime": 17.081, "eval_samples_per_second": 101.517, "eval_steps_per_second": 5.093, "step": 4500 }, { "epoch": 1.675745482891415, "grad_norm": 0.17204515635967255, "learning_rate": 1.9108397898342e-05, "loss": 1.2264, "step": 4501 }, { "epoch": 1.676117788042024, "grad_norm": 0.16915835440158844, "learning_rate": 1.9107896197031298e-05, "loss": 1.2102, "step": 4502 }, { "epoch": 1.676490093192633, "grad_norm": 0.22267740964889526, "learning_rate": 1.9107394361197545e-05, "loss": 1.2142, "step": 4503 }, { "epoch": 1.6768623983432422, "grad_norm": 0.16702081263065338, "learning_rate": 1.9106892390848154e-05, "loss": 1.1892, "step": 4504 }, { "epoch": 1.677234703493851, "grad_norm": 0.17088332772254944, "learning_rate": 1.9106390285990527e-05, "loss": 1.2173, "step": 4505 }, { "epoch": 1.6776070086444603, "grad_norm": 0.16930538415908813, "learning_rate": 1.9105888046632088e-05, "loss": 1.1886, "step": 4506 }, { "epoch": 1.6779793137950691, "grad_norm": 0.18196465075016022, "learning_rate": 1.9105385672780256e-05, "loss": 1.1952, "step": 4507 }, { "epoch": 1.6783516189456784, "grad_norm": 0.17563000321388245, "learning_rate": 1.910488316444245e-05, "loss": 1.2135, "step": 4508 }, { "epoch": 1.6787239240962875, "grad_norm": 0.17271769046783447, "learning_rate": 1.910438052162609e-05, "loss": 1.2188, "step": 4509 }, { "epoch": 1.6790962292468965, "grad_norm": 0.21126636862754822, "learning_rate": 1.91038777443386e-05, "loss": 1.2002, "step": 4510 }, { "epoch": 1.6794685343975055, "grad_norm": 0.24127759039402008, "learning_rate": 1.9103374832587406e-05, "loss": 1.1998, "step": 4511 }, { "epoch": 1.6798408395481146, "grad_norm": 0.1943320333957672, "learning_rate": 1.910287178637994e-05, "loss": 1.2146, "step": 4512 }, { "epoch": 1.6802131446987238, "grad_norm": 0.18076887726783752, "learning_rate": 1.9102368605723626e-05, "loss": 1.1915, "step": 4513 }, { "epoch": 1.6805854498493327, "grad_norm": 0.2074926644563675, "learning_rate": 1.9101865290625903e-05, "loss": 1.1936, "step": 4514 }, { "epoch": 1.680957754999942, "grad_norm": 0.17887428402900696, "learning_rate": 1.91013618410942e-05, "loss": 1.2016, "step": 4515 }, { "epoch": 1.6813300601505508, "grad_norm": 0.1693534553050995, "learning_rate": 1.910085825713595e-05, "loss": 1.2022, "step": 4516 }, { "epoch": 1.68170236530116, "grad_norm": 0.1669137179851532, "learning_rate": 1.9100354538758598e-05, "loss": 1.2027, "step": 4517 }, { "epoch": 1.682074670451769, "grad_norm": 0.1691608726978302, "learning_rate": 1.9099850685969578e-05, "loss": 1.2078, "step": 4518 }, { "epoch": 1.682446975602378, "grad_norm": 0.16599372029304504, "learning_rate": 1.9099346698776338e-05, "loss": 1.185, "step": 4519 }, { "epoch": 1.6828192807529871, "grad_norm": 0.164974182844162, "learning_rate": 1.9098842577186315e-05, "loss": 1.2028, "step": 4520 }, { "epoch": 1.6831915859035962, "grad_norm": 0.1712837517261505, "learning_rate": 1.909833832120696e-05, "loss": 1.215, "step": 4521 }, { "epoch": 1.6835638910542055, "grad_norm": 0.16040773689746857, "learning_rate": 1.9097833930845718e-05, "loss": 1.2097, "step": 4522 }, { "epoch": 1.6839361962048143, "grad_norm": 0.163836270570755, "learning_rate": 1.9097329406110038e-05, "loss": 1.2142, "step": 4523 }, { "epoch": 1.6843085013554235, "grad_norm": 0.16300654411315918, "learning_rate": 1.9096824747007378e-05, "loss": 1.203, "step": 4524 }, { "epoch": 1.6846808065060324, "grad_norm": 0.17333677411079407, "learning_rate": 1.9096319953545186e-05, "loss": 1.2009, "step": 4525 }, { "epoch": 1.6850531116566416, "grad_norm": 0.1657482534646988, "learning_rate": 1.9095815025730918e-05, "loss": 1.224, "step": 4526 }, { "epoch": 1.6854254168072507, "grad_norm": 0.17389807105064392, "learning_rate": 1.9095309963572034e-05, "loss": 1.208, "step": 4527 }, { "epoch": 1.6857977219578597, "grad_norm": 0.16647979617118835, "learning_rate": 1.909480476707599e-05, "loss": 1.1943, "step": 4528 }, { "epoch": 1.6861700271084687, "grad_norm": 0.16131357848644257, "learning_rate": 1.9094299436250254e-05, "loss": 1.1994, "step": 4529 }, { "epoch": 1.6865423322590778, "grad_norm": 0.16403862833976746, "learning_rate": 1.9093793971102282e-05, "loss": 1.1896, "step": 4530 }, { "epoch": 1.686914637409687, "grad_norm": 0.16016656160354614, "learning_rate": 1.9093288371639547e-05, "loss": 1.2013, "step": 4531 }, { "epoch": 1.6872869425602959, "grad_norm": 0.1598949134349823, "learning_rate": 1.9092782637869513e-05, "loss": 1.1843, "step": 4532 }, { "epoch": 1.6876592477109051, "grad_norm": 0.16211171448230743, "learning_rate": 1.909227676979965e-05, "loss": 1.1962, "step": 4533 }, { "epoch": 1.688031552861514, "grad_norm": 0.1622130423784256, "learning_rate": 1.9091770767437428e-05, "loss": 1.1995, "step": 4534 }, { "epoch": 1.6884038580121232, "grad_norm": 0.1642407476902008, "learning_rate": 1.9091264630790324e-05, "loss": 1.1968, "step": 4535 }, { "epoch": 1.6887761631627323, "grad_norm": 0.16541729867458344, "learning_rate": 1.9090758359865812e-05, "loss": 1.2072, "step": 4536 }, { "epoch": 1.6891484683133413, "grad_norm": 0.16923236846923828, "learning_rate": 1.9090251954671372e-05, "loss": 1.2137, "step": 4537 }, { "epoch": 1.6895207734639504, "grad_norm": 0.16798345744609833, "learning_rate": 1.9089745415214474e-05, "loss": 1.1978, "step": 4538 }, { "epoch": 1.6898930786145594, "grad_norm": 0.1650944948196411, "learning_rate": 1.9089238741502614e-05, "loss": 1.2105, "step": 4539 }, { "epoch": 1.6902653837651687, "grad_norm": 0.17048072814941406, "learning_rate": 1.9088731933543262e-05, "loss": 1.208, "step": 4540 }, { "epoch": 1.6906376889157775, "grad_norm": 0.15930227935314178, "learning_rate": 1.9088224991343916e-05, "loss": 1.217, "step": 4541 }, { "epoch": 1.6910099940663867, "grad_norm": 0.163492351770401, "learning_rate": 1.9087717914912054e-05, "loss": 1.2102, "step": 4542 }, { "epoch": 1.6913822992169956, "grad_norm": 0.16682982444763184, "learning_rate": 1.908721070425517e-05, "loss": 1.1899, "step": 4543 }, { "epoch": 1.6917546043676048, "grad_norm": 0.16626925766468048, "learning_rate": 1.908670335938075e-05, "loss": 1.1931, "step": 4544 }, { "epoch": 1.6921269095182139, "grad_norm": 0.1644056737422943, "learning_rate": 1.9086195880296294e-05, "loss": 1.2129, "step": 4545 }, { "epoch": 1.692499214668823, "grad_norm": 0.16008804738521576, "learning_rate": 1.9085688267009298e-05, "loss": 1.2037, "step": 4546 }, { "epoch": 1.692871519819432, "grad_norm": 0.16731880605220795, "learning_rate": 1.9085180519527252e-05, "loss": 1.1954, "step": 4547 }, { "epoch": 1.693243824970041, "grad_norm": 0.1611679047346115, "learning_rate": 1.9084672637857663e-05, "loss": 1.2074, "step": 4548 }, { "epoch": 1.6936161301206503, "grad_norm": 0.16875462234020233, "learning_rate": 1.908416462200803e-05, "loss": 1.2177, "step": 4549 }, { "epoch": 1.693988435271259, "grad_norm": 0.15939076244831085, "learning_rate": 1.9083656471985855e-05, "loss": 1.2001, "step": 4550 }, { "epoch": 1.6943607404218683, "grad_norm": 0.16312842071056366, "learning_rate": 1.908314818779864e-05, "loss": 1.1995, "step": 4551 }, { "epoch": 1.6947330455724774, "grad_norm": 0.16439701616764069, "learning_rate": 1.90826397694539e-05, "loss": 1.2002, "step": 4552 }, { "epoch": 1.6951053507230864, "grad_norm": 0.16645435988903046, "learning_rate": 1.9082131216959137e-05, "loss": 1.207, "step": 4553 }, { "epoch": 1.6954776558736955, "grad_norm": 0.1708313226699829, "learning_rate": 1.9081622530321874e-05, "loss": 1.2047, "step": 4554 }, { "epoch": 1.6958499610243045, "grad_norm": 0.16160647571086884, "learning_rate": 1.908111370954961e-05, "loss": 1.2187, "step": 4555 }, { "epoch": 1.6962222661749136, "grad_norm": 0.15889696776866913, "learning_rate": 1.9080604754649865e-05, "loss": 1.2136, "step": 4556 }, { "epoch": 1.6965945713255226, "grad_norm": 0.1612309068441391, "learning_rate": 1.908009566563016e-05, "loss": 1.1969, "step": 4557 }, { "epoch": 1.6969668764761319, "grad_norm": 0.16933639347553253, "learning_rate": 1.9079586442498016e-05, "loss": 1.2167, "step": 4558 }, { "epoch": 1.6973391816267407, "grad_norm": 0.1634131371974945, "learning_rate": 1.9079077085260943e-05, "loss": 1.216, "step": 4559 }, { "epoch": 1.69771148677735, "grad_norm": 0.16775217652320862, "learning_rate": 1.9078567593926472e-05, "loss": 1.2064, "step": 4560 }, { "epoch": 1.698083791927959, "grad_norm": 0.164699524641037, "learning_rate": 1.9078057968502132e-05, "loss": 1.2052, "step": 4561 }, { "epoch": 1.698456097078568, "grad_norm": 0.1709076166152954, "learning_rate": 1.9077548208995442e-05, "loss": 1.2005, "step": 4562 }, { "epoch": 1.698828402229177, "grad_norm": 0.1730164885520935, "learning_rate": 1.907703831541394e-05, "loss": 1.2262, "step": 4563 }, { "epoch": 1.6992007073797861, "grad_norm": 0.1677834689617157, "learning_rate": 1.9076528287765145e-05, "loss": 1.1978, "step": 4564 }, { "epoch": 1.6995730125303954, "grad_norm": 0.16488604247570038, "learning_rate": 1.90760181260566e-05, "loss": 1.1987, "step": 4565 }, { "epoch": 1.6999453176810042, "grad_norm": 0.18518555164337158, "learning_rate": 1.9075507830295837e-05, "loss": 1.1877, "step": 4566 }, { "epoch": 1.7003176228316135, "grad_norm": 0.16280607879161835, "learning_rate": 1.9074997400490392e-05, "loss": 1.2053, "step": 4567 }, { "epoch": 1.7006899279822223, "grad_norm": 0.16857080161571503, "learning_rate": 1.90744868366478e-05, "loss": 1.2054, "step": 4568 }, { "epoch": 1.7010622331328316, "grad_norm": 0.16795577108860016, "learning_rate": 1.9073976138775613e-05, "loss": 1.2102, "step": 4569 }, { "epoch": 1.7014345382834406, "grad_norm": 0.17121511697769165, "learning_rate": 1.907346530688137e-05, "loss": 1.1977, "step": 4570 }, { "epoch": 1.7018068434340496, "grad_norm": 0.1754191517829895, "learning_rate": 1.907295434097261e-05, "loss": 1.2035, "step": 4571 }, { "epoch": 1.7021791485846587, "grad_norm": 0.1667300909757614, "learning_rate": 1.9072443241056884e-05, "loss": 1.2076, "step": 4572 }, { "epoch": 1.7025514537352677, "grad_norm": 0.19166409969329834, "learning_rate": 1.9071932007141742e-05, "loss": 1.207, "step": 4573 }, { "epoch": 1.702923758885877, "grad_norm": 0.16179704666137695, "learning_rate": 1.907142063923473e-05, "loss": 1.1928, "step": 4574 }, { "epoch": 1.7032960640364858, "grad_norm": 0.17530782520771027, "learning_rate": 1.907090913734341e-05, "loss": 1.2057, "step": 4575 }, { "epoch": 1.703668369187095, "grad_norm": 0.16238301992416382, "learning_rate": 1.9070397501475327e-05, "loss": 1.1965, "step": 4576 }, { "epoch": 1.704040674337704, "grad_norm": 0.18561150133609772, "learning_rate": 1.9069885731638045e-05, "loss": 1.2201, "step": 4577 }, { "epoch": 1.7044129794883132, "grad_norm": 0.17264258861541748, "learning_rate": 1.9069373827839117e-05, "loss": 1.1941, "step": 4578 }, { "epoch": 1.7047852846389222, "grad_norm": 0.16849012672901154, "learning_rate": 1.906886179008611e-05, "loss": 1.2056, "step": 4579 }, { "epoch": 1.7051575897895312, "grad_norm": 0.16359888017177582, "learning_rate": 1.906834961838658e-05, "loss": 1.2024, "step": 4580 }, { "epoch": 1.7055298949401403, "grad_norm": 0.18342570960521698, "learning_rate": 1.9067837312748097e-05, "loss": 1.1985, "step": 4581 }, { "epoch": 1.7059022000907493, "grad_norm": 0.16952671110630035, "learning_rate": 1.9067324873178227e-05, "loss": 1.2096, "step": 4582 }, { "epoch": 1.7062745052413586, "grad_norm": 0.16725878417491913, "learning_rate": 1.9066812299684537e-05, "loss": 1.2007, "step": 4583 }, { "epoch": 1.7066468103919674, "grad_norm": 0.1666475087404251, "learning_rate": 1.9066299592274596e-05, "loss": 1.2142, "step": 4584 }, { "epoch": 1.7070191155425767, "grad_norm": 0.17125262320041656, "learning_rate": 1.9065786750955983e-05, "loss": 1.2146, "step": 4585 }, { "epoch": 1.7073914206931855, "grad_norm": 0.16806922852993011, "learning_rate": 1.9065273775736264e-05, "loss": 1.1995, "step": 4586 }, { "epoch": 1.7077637258437948, "grad_norm": 0.16687040030956268, "learning_rate": 1.9064760666623025e-05, "loss": 1.2051, "step": 4587 }, { "epoch": 1.7081360309944038, "grad_norm": 0.1716591715812683, "learning_rate": 1.9064247423623838e-05, "loss": 1.204, "step": 4588 }, { "epoch": 1.7085083361450129, "grad_norm": 0.1665405035018921, "learning_rate": 1.9063734046746286e-05, "loss": 1.1918, "step": 4589 }, { "epoch": 1.708880641295622, "grad_norm": 0.17192888259887695, "learning_rate": 1.906322053599795e-05, "loss": 1.2096, "step": 4590 }, { "epoch": 1.709252946446231, "grad_norm": 0.16220897436141968, "learning_rate": 1.9062706891386414e-05, "loss": 1.1828, "step": 4591 }, { "epoch": 1.7096252515968402, "grad_norm": 0.17330990731716156, "learning_rate": 1.9062193112919266e-05, "loss": 1.1909, "step": 4592 }, { "epoch": 1.709997556747449, "grad_norm": 0.17249375581741333, "learning_rate": 1.9061679200604097e-05, "loss": 1.1908, "step": 4593 }, { "epoch": 1.7103698618980583, "grad_norm": 0.1671372950077057, "learning_rate": 1.9061165154448496e-05, "loss": 1.2069, "step": 4594 }, { "epoch": 1.710742167048667, "grad_norm": 0.17133872210979462, "learning_rate": 1.906065097446005e-05, "loss": 1.1977, "step": 4595 }, { "epoch": 1.7111144721992764, "grad_norm": 0.18286733329296112, "learning_rate": 1.9060136660646362e-05, "loss": 1.2198, "step": 4596 }, { "epoch": 1.7114867773498854, "grad_norm": 0.17125152051448822, "learning_rate": 1.9059622213015023e-05, "loss": 1.2111, "step": 4597 }, { "epoch": 1.7118590825004945, "grad_norm": 0.16331323981285095, "learning_rate": 1.905910763157363e-05, "loss": 1.212, "step": 4598 }, { "epoch": 1.7122313876511035, "grad_norm": 0.17022307217121124, "learning_rate": 1.905859291632979e-05, "loss": 1.2115, "step": 4599 }, { "epoch": 1.7126036928017125, "grad_norm": 0.17252519726753235, "learning_rate": 1.9058078067291095e-05, "loss": 1.2075, "step": 4600 }, { "epoch": 1.7129759979523218, "grad_norm": 0.19809845089912415, "learning_rate": 1.9057563084465157e-05, "loss": 1.1885, "step": 4601 }, { "epoch": 1.7133483031029306, "grad_norm": 0.1879017949104309, "learning_rate": 1.9057047967859584e-05, "loss": 1.2113, "step": 4602 }, { "epoch": 1.71372060825354, "grad_norm": 0.17755410075187683, "learning_rate": 1.905653271748198e-05, "loss": 1.205, "step": 4603 }, { "epoch": 1.7140929134041487, "grad_norm": 0.24146291613578796, "learning_rate": 1.905601733333996e-05, "loss": 1.1976, "step": 4604 }, { "epoch": 1.714465218554758, "grad_norm": 0.2119484394788742, "learning_rate": 1.9055501815441126e-05, "loss": 1.1983, "step": 4605 }, { "epoch": 1.714837523705367, "grad_norm": 0.18424339592456818, "learning_rate": 1.90549861637931e-05, "loss": 1.2067, "step": 4606 }, { "epoch": 1.715209828855976, "grad_norm": 0.16043433547019958, "learning_rate": 1.9054470378403495e-05, "loss": 1.2139, "step": 4607 }, { "epoch": 1.715582134006585, "grad_norm": 0.19521519541740417, "learning_rate": 1.9053954459279934e-05, "loss": 1.2078, "step": 4608 }, { "epoch": 1.7159544391571941, "grad_norm": 0.1905895173549652, "learning_rate": 1.905343840643003e-05, "loss": 1.1947, "step": 4609 }, { "epoch": 1.7163267443078034, "grad_norm": 0.17229844629764557, "learning_rate": 1.9052922219861413e-05, "loss": 1.2018, "step": 4610 }, { "epoch": 1.7166990494584122, "grad_norm": 0.16754981875419617, "learning_rate": 1.90524058995817e-05, "loss": 1.2105, "step": 4611 }, { "epoch": 1.7170713546090215, "grad_norm": 0.17675986886024475, "learning_rate": 1.9051889445598524e-05, "loss": 1.196, "step": 4612 }, { "epoch": 1.7174436597596305, "grad_norm": 0.1735907346010208, "learning_rate": 1.9051372857919505e-05, "loss": 1.2019, "step": 4613 }, { "epoch": 1.7178159649102396, "grad_norm": 0.16587558388710022, "learning_rate": 1.905085613655228e-05, "loss": 1.2019, "step": 4614 }, { "epoch": 1.7181882700608486, "grad_norm": 0.17289356887340546, "learning_rate": 1.9050339281504474e-05, "loss": 1.1919, "step": 4615 }, { "epoch": 1.7185605752114577, "grad_norm": 0.1681814342737198, "learning_rate": 1.904982229278373e-05, "loss": 1.1936, "step": 4616 }, { "epoch": 1.7189328803620667, "grad_norm": 0.15795257687568665, "learning_rate": 1.9049305170397673e-05, "loss": 1.203, "step": 4617 }, { "epoch": 1.7193051855126757, "grad_norm": 0.16355076432228088, "learning_rate": 1.904878791435395e-05, "loss": 1.2002, "step": 4618 }, { "epoch": 1.719677490663285, "grad_norm": 0.16879984736442566, "learning_rate": 1.9048270524660197e-05, "loss": 1.2025, "step": 4619 }, { "epoch": 1.7200497958138938, "grad_norm": 0.1663307398557663, "learning_rate": 1.9047753001324057e-05, "loss": 1.1998, "step": 4620 }, { "epoch": 1.720422100964503, "grad_norm": 0.16687534749507904, "learning_rate": 1.9047235344353173e-05, "loss": 1.2093, "step": 4621 }, { "epoch": 1.7207944061151121, "grad_norm": 0.1707238256931305, "learning_rate": 1.9046717553755187e-05, "loss": 1.2089, "step": 4622 }, { "epoch": 1.7211667112657212, "grad_norm": 0.16789813339710236, "learning_rate": 1.9046199629537754e-05, "loss": 1.2204, "step": 4623 }, { "epoch": 1.7215390164163302, "grad_norm": 0.16236470639705658, "learning_rate": 1.9045681571708517e-05, "loss": 1.2088, "step": 4624 }, { "epoch": 1.7219113215669393, "grad_norm": 0.1650935858488083, "learning_rate": 1.9045163380275134e-05, "loss": 1.2009, "step": 4625 }, { "epoch": 1.7222836267175485, "grad_norm": 0.17150568962097168, "learning_rate": 1.9044645055245254e-05, "loss": 1.2058, "step": 4626 }, { "epoch": 1.7226559318681574, "grad_norm": 0.15954706072807312, "learning_rate": 1.9044126596626536e-05, "loss": 1.1959, "step": 4627 }, { "epoch": 1.7230282370187666, "grad_norm": 0.16907548904418945, "learning_rate": 1.9043608004426635e-05, "loss": 1.2186, "step": 4628 }, { "epoch": 1.7234005421693754, "grad_norm": 0.16456788778305054, "learning_rate": 1.904308927865321e-05, "loss": 1.2052, "step": 4629 }, { "epoch": 1.7237728473199847, "grad_norm": 0.16648238897323608, "learning_rate": 1.9042570419313927e-05, "loss": 1.1964, "step": 4630 }, { "epoch": 1.7241451524705937, "grad_norm": 0.16598086059093475, "learning_rate": 1.904205142641644e-05, "loss": 1.1976, "step": 4631 }, { "epoch": 1.7245174576212028, "grad_norm": 0.16780155897140503, "learning_rate": 1.9041532299968426e-05, "loss": 1.202, "step": 4632 }, { "epoch": 1.7248897627718118, "grad_norm": 0.1815885603427887, "learning_rate": 1.9041013039977548e-05, "loss": 1.2154, "step": 4633 }, { "epoch": 1.7252620679224209, "grad_norm": 0.1682657152414322, "learning_rate": 1.9040493646451472e-05, "loss": 1.2037, "step": 4634 }, { "epoch": 1.7256343730730301, "grad_norm": 0.16929581761360168, "learning_rate": 1.9039974119397872e-05, "loss": 1.2131, "step": 4635 }, { "epoch": 1.726006678223639, "grad_norm": 0.15777097642421722, "learning_rate": 1.9039454458824426e-05, "loss": 1.1929, "step": 4636 }, { "epoch": 1.7263789833742482, "grad_norm": 0.16508424282073975, "learning_rate": 1.90389346647388e-05, "loss": 1.2151, "step": 4637 }, { "epoch": 1.726751288524857, "grad_norm": 0.16090057790279388, "learning_rate": 1.903841473714868e-05, "loss": 1.1994, "step": 4638 }, { "epoch": 1.7271235936754663, "grad_norm": 0.16870178282260895, "learning_rate": 1.903789467606174e-05, "loss": 1.1991, "step": 4639 }, { "epoch": 1.7274958988260753, "grad_norm": 0.17125073075294495, "learning_rate": 1.903737448148566e-05, "loss": 1.2113, "step": 4640 }, { "epoch": 1.7278682039766844, "grad_norm": 0.17486359179019928, "learning_rate": 1.903685415342813e-05, "loss": 1.2241, "step": 4641 }, { "epoch": 1.7282405091272934, "grad_norm": 0.16174235939979553, "learning_rate": 1.903633369189683e-05, "loss": 1.2012, "step": 4642 }, { "epoch": 1.7286128142779025, "grad_norm": 0.1670721024274826, "learning_rate": 1.9035813096899448e-05, "loss": 1.2032, "step": 4643 }, { "epoch": 1.7289851194285117, "grad_norm": 0.1667913943529129, "learning_rate": 1.9035292368443674e-05, "loss": 1.1978, "step": 4644 }, { "epoch": 1.7293574245791206, "grad_norm": 0.16571226716041565, "learning_rate": 1.90347715065372e-05, "loss": 1.2055, "step": 4645 }, { "epoch": 1.7297297297297298, "grad_norm": 0.16005724668502808, "learning_rate": 1.9034250511187716e-05, "loss": 1.1902, "step": 4646 }, { "epoch": 1.7301020348803386, "grad_norm": 0.1715211570262909, "learning_rate": 1.903372938240292e-05, "loss": 1.2029, "step": 4647 }, { "epoch": 1.730474340030948, "grad_norm": 0.16407069563865662, "learning_rate": 1.9033208120190507e-05, "loss": 1.1987, "step": 4648 }, { "epoch": 1.730846645181557, "grad_norm": 0.16125743091106415, "learning_rate": 1.9032686724558177e-05, "loss": 1.1941, "step": 4649 }, { "epoch": 1.731218950332166, "grad_norm": 0.17684829235076904, "learning_rate": 1.9032165195513634e-05, "loss": 1.2181, "step": 4650 }, { "epoch": 1.731591255482775, "grad_norm": 0.1644441783428192, "learning_rate": 1.9031643533064573e-05, "loss": 1.2018, "step": 4651 }, { "epoch": 1.731963560633384, "grad_norm": 0.1682443469762802, "learning_rate": 1.9031121737218706e-05, "loss": 1.197, "step": 4652 }, { "epoch": 1.7323358657839933, "grad_norm": 0.16315114498138428, "learning_rate": 1.9030599807983737e-05, "loss": 1.2067, "step": 4653 }, { "epoch": 1.7327081709346022, "grad_norm": 0.16800329089164734, "learning_rate": 1.9030077745367377e-05, "loss": 1.2031, "step": 4654 }, { "epoch": 1.7330804760852114, "grad_norm": 0.16758573055267334, "learning_rate": 1.9029555549377335e-05, "loss": 1.1791, "step": 4655 }, { "epoch": 1.7334527812358203, "grad_norm": 0.1693509966135025, "learning_rate": 1.9029033220021325e-05, "loss": 1.2177, "step": 4656 }, { "epoch": 1.7338250863864295, "grad_norm": 0.15949083864688873, "learning_rate": 1.902851075730706e-05, "loss": 1.2019, "step": 4657 }, { "epoch": 1.7341973915370386, "grad_norm": 0.16945861279964447, "learning_rate": 1.9027988161242258e-05, "loss": 1.1955, "step": 4658 }, { "epoch": 1.7345696966876476, "grad_norm": 0.16500024497509003, "learning_rate": 1.9027465431834637e-05, "loss": 1.2125, "step": 4659 }, { "epoch": 1.7349420018382566, "grad_norm": 0.1751050353050232, "learning_rate": 1.9026942569091917e-05, "loss": 1.1917, "step": 4660 }, { "epoch": 1.7353143069888657, "grad_norm": 0.16122271120548248, "learning_rate": 1.902641957302182e-05, "loss": 1.1966, "step": 4661 }, { "epoch": 1.735686612139475, "grad_norm": 0.16455887258052826, "learning_rate": 1.9025896443632076e-05, "loss": 1.1993, "step": 4662 }, { "epoch": 1.7360589172900838, "grad_norm": 0.16358114778995514, "learning_rate": 1.902537318093041e-05, "loss": 1.2037, "step": 4663 }, { "epoch": 1.736431222440693, "grad_norm": 0.16991184651851654, "learning_rate": 1.9024849784924546e-05, "loss": 1.2098, "step": 4664 }, { "epoch": 1.7368035275913019, "grad_norm": 0.1610153317451477, "learning_rate": 1.9024326255622215e-05, "loss": 1.2134, "step": 4665 }, { "epoch": 1.7371758327419111, "grad_norm": 0.16271455585956573, "learning_rate": 1.9023802593031156e-05, "loss": 1.2164, "step": 4666 }, { "epoch": 1.7375481378925202, "grad_norm": 0.16444487869739532, "learning_rate": 1.9023278797159096e-05, "loss": 1.2124, "step": 4667 }, { "epoch": 1.7379204430431292, "grad_norm": 0.16799621284008026, "learning_rate": 1.9022754868013775e-05, "loss": 1.2078, "step": 4668 }, { "epoch": 1.7382927481937382, "grad_norm": 0.16734634339809418, "learning_rate": 1.902223080560293e-05, "loss": 1.2088, "step": 4669 }, { "epoch": 1.7386650533443473, "grad_norm": 0.16235774755477905, "learning_rate": 1.9021706609934305e-05, "loss": 1.2093, "step": 4670 }, { "epoch": 1.7390373584949566, "grad_norm": 0.1678633987903595, "learning_rate": 1.9021182281015636e-05, "loss": 1.2058, "step": 4671 }, { "epoch": 1.7394096636455654, "grad_norm": 0.17294074594974518, "learning_rate": 1.9020657818854673e-05, "loss": 1.1982, "step": 4672 }, { "epoch": 1.7397819687961746, "grad_norm": 0.17070721089839935, "learning_rate": 1.902013322345916e-05, "loss": 1.1967, "step": 4673 }, { "epoch": 1.7401542739467837, "grad_norm": 0.16906553506851196, "learning_rate": 1.9019608494836843e-05, "loss": 1.2101, "step": 4674 }, { "epoch": 1.7405265790973927, "grad_norm": 0.17362748086452484, "learning_rate": 1.9019083632995476e-05, "loss": 1.2191, "step": 4675 }, { "epoch": 1.7408988842480018, "grad_norm": 0.17202383279800415, "learning_rate": 1.9018558637942813e-05, "loss": 1.2034, "step": 4676 }, { "epoch": 1.7412711893986108, "grad_norm": 0.16142207384109497, "learning_rate": 1.9018033509686603e-05, "loss": 1.1993, "step": 4677 }, { "epoch": 1.7416434945492199, "grad_norm": 0.16045573353767395, "learning_rate": 1.9017508248234603e-05, "loss": 1.2045, "step": 4678 }, { "epoch": 1.742015799699829, "grad_norm": 0.16805481910705566, "learning_rate": 1.901698285359457e-05, "loss": 1.1967, "step": 4679 }, { "epoch": 1.7423881048504382, "grad_norm": 0.16612930595874786, "learning_rate": 1.9016457325774268e-05, "loss": 1.1955, "step": 4680 }, { "epoch": 1.742760410001047, "grad_norm": 0.16055123507976532, "learning_rate": 1.901593166478146e-05, "loss": 1.1979, "step": 4681 }, { "epoch": 1.7431327151516562, "grad_norm": 0.16279493272304535, "learning_rate": 1.90154058706239e-05, "loss": 1.2181, "step": 4682 }, { "epoch": 1.7435050203022653, "grad_norm": 0.18220022320747375, "learning_rate": 1.9014879943309367e-05, "loss": 1.2013, "step": 4683 }, { "epoch": 1.7438773254528743, "grad_norm": 0.17012886703014374, "learning_rate": 1.9014353882845626e-05, "loss": 1.1997, "step": 4684 }, { "epoch": 1.7442496306034834, "grad_norm": 0.16687701642513275, "learning_rate": 1.9013827689240434e-05, "loss": 1.1968, "step": 4685 }, { "epoch": 1.7446219357540924, "grad_norm": 0.16534671187400818, "learning_rate": 1.9013301362501583e-05, "loss": 1.1906, "step": 4686 }, { "epoch": 1.7449942409047017, "grad_norm": 0.16701330244541168, "learning_rate": 1.901277490263683e-05, "loss": 1.1929, "step": 4687 }, { "epoch": 1.7453665460553105, "grad_norm": 0.17027342319488525, "learning_rate": 1.901224830965396e-05, "loss": 1.2095, "step": 4688 }, { "epoch": 1.7457388512059198, "grad_norm": 0.17393416166305542, "learning_rate": 1.9011721583560747e-05, "loss": 1.2137, "step": 4689 }, { "epoch": 1.7461111563565286, "grad_norm": 0.15819013118743896, "learning_rate": 1.901119472436497e-05, "loss": 1.2161, "step": 4690 }, { "epoch": 1.7464834615071378, "grad_norm": 0.17175039649009705, "learning_rate": 1.9010667732074415e-05, "loss": 1.2104, "step": 4691 }, { "epoch": 1.746855766657747, "grad_norm": 0.16250735521316528, "learning_rate": 1.9010140606696865e-05, "loss": 1.1984, "step": 4692 }, { "epoch": 1.747228071808356, "grad_norm": 0.16283249855041504, "learning_rate": 1.90096133482401e-05, "loss": 1.1991, "step": 4693 }, { "epoch": 1.747600376958965, "grad_norm": 0.16619625687599182, "learning_rate": 1.9009085956711916e-05, "loss": 1.2008, "step": 4694 }, { "epoch": 1.747972682109574, "grad_norm": 0.17469939589500427, "learning_rate": 1.9008558432120094e-05, "loss": 1.2038, "step": 4695 }, { "epoch": 1.7483449872601833, "grad_norm": 0.1646946370601654, "learning_rate": 1.900803077447243e-05, "loss": 1.1974, "step": 4696 }, { "epoch": 1.748717292410792, "grad_norm": 0.17054390907287598, "learning_rate": 1.9007502983776712e-05, "loss": 1.2065, "step": 4697 }, { "epoch": 1.7490895975614014, "grad_norm": 0.1633417159318924, "learning_rate": 1.9006975060040746e-05, "loss": 1.2084, "step": 4698 }, { "epoch": 1.7494619027120102, "grad_norm": 0.18991155922412872, "learning_rate": 1.9006447003272322e-05, "loss": 1.1942, "step": 4699 }, { "epoch": 1.7498342078626195, "grad_norm": 0.16292832791805267, "learning_rate": 1.900591881347924e-05, "loss": 1.1923, "step": 4700 }, { "epoch": 1.7502065130132285, "grad_norm": 0.17191947996616364, "learning_rate": 1.9005390490669305e-05, "loss": 1.2142, "step": 4701 }, { "epoch": 1.7505788181638375, "grad_norm": 0.17528562247753143, "learning_rate": 1.9004862034850314e-05, "loss": 1.2203, "step": 4702 }, { "epoch": 1.7509511233144466, "grad_norm": 0.17579975724220276, "learning_rate": 1.900433344603008e-05, "loss": 1.2078, "step": 4703 }, { "epoch": 1.7513234284650556, "grad_norm": 0.16660520434379578, "learning_rate": 1.9003804724216402e-05, "loss": 1.2123, "step": 4704 }, { "epoch": 1.7516957336156649, "grad_norm": 0.16542848944664001, "learning_rate": 1.90032758694171e-05, "loss": 1.1933, "step": 4705 }, { "epoch": 1.7520680387662737, "grad_norm": 0.16875770688056946, "learning_rate": 1.9002746881639972e-05, "loss": 1.1999, "step": 4706 }, { "epoch": 1.752440343916883, "grad_norm": 0.1760881096124649, "learning_rate": 1.900221776089284e-05, "loss": 1.1932, "step": 4707 }, { "epoch": 1.7528126490674918, "grad_norm": 0.17353412508964539, "learning_rate": 1.9001688507183517e-05, "loss": 1.2093, "step": 4708 }, { "epoch": 1.753184954218101, "grad_norm": 0.16783374547958374, "learning_rate": 1.9001159120519817e-05, "loss": 1.2056, "step": 4709 }, { "epoch": 1.75355725936871, "grad_norm": 0.1809961050748825, "learning_rate": 1.9000629600909562e-05, "loss": 1.21, "step": 4710 }, { "epoch": 1.7539295645193191, "grad_norm": 0.1838044971227646, "learning_rate": 1.9000099948360577e-05, "loss": 1.1983, "step": 4711 }, { "epoch": 1.7543018696699282, "grad_norm": 0.18065232038497925, "learning_rate": 1.8999570162880676e-05, "loss": 1.1836, "step": 4712 }, { "epoch": 1.7546741748205372, "grad_norm": 0.17441041767597198, "learning_rate": 1.899904024447769e-05, "loss": 1.1929, "step": 4713 }, { "epoch": 1.7550464799711465, "grad_norm": 0.16926120221614838, "learning_rate": 1.8998510193159445e-05, "loss": 1.2176, "step": 4714 }, { "epoch": 1.7554187851217553, "grad_norm": 0.168484628200531, "learning_rate": 1.8997980008933767e-05, "loss": 1.2068, "step": 4715 }, { "epoch": 1.7557910902723646, "grad_norm": 0.1681792289018631, "learning_rate": 1.899744969180849e-05, "loss": 1.1982, "step": 4716 }, { "epoch": 1.7561633954229734, "grad_norm": 0.18074312806129456, "learning_rate": 1.8996919241791446e-05, "loss": 1.2009, "step": 4717 }, { "epoch": 1.7565357005735827, "grad_norm": 0.17582470178604126, "learning_rate": 1.899638865889047e-05, "loss": 1.2103, "step": 4718 }, { "epoch": 1.7569080057241917, "grad_norm": 0.17479169368743896, "learning_rate": 1.89958579431134e-05, "loss": 1.1962, "step": 4719 }, { "epoch": 1.7572803108748007, "grad_norm": 0.21927672624588013, "learning_rate": 1.899532709446807e-05, "loss": 1.1914, "step": 4720 }, { "epoch": 1.7576526160254098, "grad_norm": 0.18427489697933197, "learning_rate": 1.8994796112962325e-05, "loss": 1.189, "step": 4721 }, { "epoch": 1.7580249211760188, "grad_norm": 0.17045900225639343, "learning_rate": 1.8994264998604005e-05, "loss": 1.192, "step": 4722 }, { "epoch": 1.758397226326628, "grad_norm": 0.16136103868484497, "learning_rate": 1.8993733751400953e-05, "loss": 1.2078, "step": 4723 }, { "epoch": 1.758769531477237, "grad_norm": 0.17973823845386505, "learning_rate": 1.899320237136102e-05, "loss": 1.2029, "step": 4724 }, { "epoch": 1.7591418366278462, "grad_norm": 0.1666877269744873, "learning_rate": 1.8992670858492053e-05, "loss": 1.1991, "step": 4725 }, { "epoch": 1.7595141417784552, "grad_norm": 0.16305620968341827, "learning_rate": 1.89921392128019e-05, "loss": 1.1932, "step": 4726 }, { "epoch": 1.7598864469290643, "grad_norm": 0.1701706200838089, "learning_rate": 1.899160743429842e-05, "loss": 1.2117, "step": 4727 }, { "epoch": 1.7602587520796733, "grad_norm": 0.17170703411102295, "learning_rate": 1.8991075522989458e-05, "loss": 1.1967, "step": 4728 }, { "epoch": 1.7606310572302823, "grad_norm": 0.15608349442481995, "learning_rate": 1.8990543478882875e-05, "loss": 1.1778, "step": 4729 }, { "epoch": 1.7610033623808914, "grad_norm": 0.17074252665042877, "learning_rate": 1.899001130198653e-05, "loss": 1.1975, "step": 4730 }, { "epoch": 1.7613756675315004, "grad_norm": 0.168598935008049, "learning_rate": 1.898947899230828e-05, "loss": 1.2009, "step": 4731 }, { "epoch": 1.7617479726821097, "grad_norm": 0.16988085210323334, "learning_rate": 1.8988946549855994e-05, "loss": 1.2064, "step": 4732 }, { "epoch": 1.7621202778327185, "grad_norm": 0.16802845895290375, "learning_rate": 1.8988413974637527e-05, "loss": 1.1968, "step": 4733 }, { "epoch": 1.7624925829833278, "grad_norm": 0.16767756640911102, "learning_rate": 1.8987881266660754e-05, "loss": 1.2002, "step": 4734 }, { "epoch": 1.7628648881339368, "grad_norm": 0.16321119666099548, "learning_rate": 1.8987348425933535e-05, "loss": 1.2016, "step": 4735 }, { "epoch": 1.7632371932845459, "grad_norm": 0.15938079357147217, "learning_rate": 1.8986815452463747e-05, "loss": 1.2024, "step": 4736 }, { "epoch": 1.763609498435155, "grad_norm": 0.164838969707489, "learning_rate": 1.8986282346259255e-05, "loss": 1.1904, "step": 4737 }, { "epoch": 1.763981803585764, "grad_norm": 0.16808170080184937, "learning_rate": 1.898574910732794e-05, "loss": 1.2027, "step": 4738 }, { "epoch": 1.764354108736373, "grad_norm": 0.16249272227287292, "learning_rate": 1.8985215735677673e-05, "loss": 1.2025, "step": 4739 }, { "epoch": 1.764726413886982, "grad_norm": 0.16439345479011536, "learning_rate": 1.8984682231316335e-05, "loss": 1.2028, "step": 4740 }, { "epoch": 1.7650987190375913, "grad_norm": 0.16751708090305328, "learning_rate": 1.89841485942518e-05, "loss": 1.1973, "step": 4741 }, { "epoch": 1.7654710241882001, "grad_norm": 0.15885969996452332, "learning_rate": 1.8983614824491958e-05, "loss": 1.1985, "step": 4742 }, { "epoch": 1.7658433293388094, "grad_norm": 0.15906503796577454, "learning_rate": 1.8983080922044687e-05, "loss": 1.2051, "step": 4743 }, { "epoch": 1.7662156344894184, "grad_norm": 0.16614754498004913, "learning_rate": 1.8982546886917878e-05, "loss": 1.2041, "step": 4744 }, { "epoch": 1.7665879396400275, "grad_norm": 0.16927511990070343, "learning_rate": 1.8982012719119414e-05, "loss": 1.2092, "step": 4745 }, { "epoch": 1.7669602447906365, "grad_norm": 0.16667069494724274, "learning_rate": 1.8981478418657185e-05, "loss": 1.2005, "step": 4746 }, { "epoch": 1.7673325499412456, "grad_norm": 0.15757103264331818, "learning_rate": 1.898094398553908e-05, "loss": 1.2002, "step": 4747 }, { "epoch": 1.7677048550918548, "grad_norm": 0.1850326508283615, "learning_rate": 1.8980409419772998e-05, "loss": 1.2049, "step": 4748 }, { "epoch": 1.7680771602424636, "grad_norm": 0.1695263683795929, "learning_rate": 1.8979874721366834e-05, "loss": 1.1879, "step": 4749 }, { "epoch": 1.768449465393073, "grad_norm": 0.17270156741142273, "learning_rate": 1.8979339890328484e-05, "loss": 1.2006, "step": 4750 }, { "epoch": 1.7688217705436817, "grad_norm": 0.18274691700935364, "learning_rate": 1.8978804926665848e-05, "loss": 1.2109, "step": 4751 }, { "epoch": 1.769194075694291, "grad_norm": 0.16077204048633575, "learning_rate": 1.8978269830386825e-05, "loss": 1.1919, "step": 4752 }, { "epoch": 1.7695663808449, "grad_norm": 0.1679827719926834, "learning_rate": 1.8977734601499322e-05, "loss": 1.2057, "step": 4753 }, { "epoch": 1.769938685995509, "grad_norm": 0.1700560301542282, "learning_rate": 1.8977199240011237e-05, "loss": 1.2149, "step": 4754 }, { "epoch": 1.7703109911461181, "grad_norm": 0.17137514054775238, "learning_rate": 1.8976663745930488e-05, "loss": 1.2163, "step": 4755 }, { "epoch": 1.7706832962967272, "grad_norm": 0.16699162125587463, "learning_rate": 1.897612811926498e-05, "loss": 1.1973, "step": 4756 }, { "epoch": 1.7710556014473364, "grad_norm": 0.1748683899641037, "learning_rate": 1.8975592360022616e-05, "loss": 1.2049, "step": 4757 }, { "epoch": 1.7714279065979452, "grad_norm": 0.17756815254688263, "learning_rate": 1.8975056468211323e-05, "loss": 1.1937, "step": 4758 }, { "epoch": 1.7718002117485545, "grad_norm": 0.16323328018188477, "learning_rate": 1.8974520443839007e-05, "loss": 1.2013, "step": 4759 }, { "epoch": 1.7721725168991633, "grad_norm": 0.19412948191165924, "learning_rate": 1.8973984286913584e-05, "loss": 1.2092, "step": 4760 }, { "epoch": 1.7725448220497726, "grad_norm": 0.1670580357313156, "learning_rate": 1.897344799744298e-05, "loss": 1.2202, "step": 4761 }, { "epoch": 1.7729171272003816, "grad_norm": 0.17248012125492096, "learning_rate": 1.8972911575435112e-05, "loss": 1.1881, "step": 4762 }, { "epoch": 1.7732894323509907, "grad_norm": 0.17886854708194733, "learning_rate": 1.8972375020897905e-05, "loss": 1.2028, "step": 4763 }, { "epoch": 1.7736617375015997, "grad_norm": 0.1719491332769394, "learning_rate": 1.897183833383928e-05, "loss": 1.2057, "step": 4764 }, { "epoch": 1.7740340426522088, "grad_norm": 0.18928195536136627, "learning_rate": 1.8971301514267162e-05, "loss": 1.1843, "step": 4765 }, { "epoch": 1.774406347802818, "grad_norm": 0.16735054552555084, "learning_rate": 1.897076456218949e-05, "loss": 1.1963, "step": 4766 }, { "epoch": 1.7747786529534268, "grad_norm": 0.16620579361915588, "learning_rate": 1.897022747761418e-05, "loss": 1.2001, "step": 4767 }, { "epoch": 1.7751509581040361, "grad_norm": 0.16270288825035095, "learning_rate": 1.8969690260549183e-05, "loss": 1.2023, "step": 4768 }, { "epoch": 1.775523263254645, "grad_norm": 0.17014147341251373, "learning_rate": 1.8969152911002417e-05, "loss": 1.1932, "step": 4769 }, { "epoch": 1.7758955684052542, "grad_norm": 0.1710130274295807, "learning_rate": 1.896861542898183e-05, "loss": 1.2129, "step": 4770 }, { "epoch": 1.7762678735558632, "grad_norm": 0.16842330992221832, "learning_rate": 1.8968077814495355e-05, "loss": 1.195, "step": 4771 }, { "epoch": 1.7766401787064723, "grad_norm": 0.16350851953029633, "learning_rate": 1.896754006755093e-05, "loss": 1.1914, "step": 4772 }, { "epoch": 1.7770124838570813, "grad_norm": 0.18311436474323273, "learning_rate": 1.8967002188156503e-05, "loss": 1.1927, "step": 4773 }, { "epoch": 1.7773847890076904, "grad_norm": 0.16735710203647614, "learning_rate": 1.8966464176320015e-05, "loss": 1.2019, "step": 4774 }, { "epoch": 1.7777570941582996, "grad_norm": 0.16617412865161896, "learning_rate": 1.8965926032049418e-05, "loss": 1.2009, "step": 4775 }, { "epoch": 1.7781293993089085, "grad_norm": 0.1925051510334015, "learning_rate": 1.896538775535265e-05, "loss": 1.1979, "step": 4776 }, { "epoch": 1.7785017044595177, "grad_norm": 0.17179760336875916, "learning_rate": 1.8964849346237676e-05, "loss": 1.1963, "step": 4777 }, { "epoch": 1.7788740096101265, "grad_norm": 0.172499880194664, "learning_rate": 1.8964310804712435e-05, "loss": 1.1916, "step": 4778 }, { "epoch": 1.7792463147607358, "grad_norm": 0.1798262745141983, "learning_rate": 1.8963772130784882e-05, "loss": 1.2, "step": 4779 }, { "epoch": 1.7796186199113448, "grad_norm": 0.16507485508918762, "learning_rate": 1.8963233324462982e-05, "loss": 1.1805, "step": 4780 }, { "epoch": 1.7799909250619539, "grad_norm": 0.1641186773777008, "learning_rate": 1.896269438575469e-05, "loss": 1.1815, "step": 4781 }, { "epoch": 1.780363230212563, "grad_norm": 0.16876539587974548, "learning_rate": 1.896215531466796e-05, "loss": 1.2031, "step": 4782 }, { "epoch": 1.780735535363172, "grad_norm": 0.17245730757713318, "learning_rate": 1.896161611121076e-05, "loss": 1.1912, "step": 4783 }, { "epoch": 1.7811078405137812, "grad_norm": 0.1740397810935974, "learning_rate": 1.896107677539105e-05, "loss": 1.2029, "step": 4784 }, { "epoch": 1.78148014566439, "grad_norm": 0.1714610755443573, "learning_rate": 1.8960537307216804e-05, "loss": 1.208, "step": 4785 }, { "epoch": 1.7818524508149993, "grad_norm": 0.17852343618869781, "learning_rate": 1.895999770669598e-05, "loss": 1.2083, "step": 4786 }, { "epoch": 1.7822247559656084, "grad_norm": 0.16778838634490967, "learning_rate": 1.8959457973836554e-05, "loss": 1.2066, "step": 4787 }, { "epoch": 1.7825970611162174, "grad_norm": 0.1763961762189865, "learning_rate": 1.8958918108646495e-05, "loss": 1.1975, "step": 4788 }, { "epoch": 1.7829693662668264, "grad_norm": 0.16524644196033478, "learning_rate": 1.8958378111133777e-05, "loss": 1.2086, "step": 4789 }, { "epoch": 1.7833416714174355, "grad_norm": 0.17384812235832214, "learning_rate": 1.895783798130638e-05, "loss": 1.1881, "step": 4790 }, { "epoch": 1.7837139765680445, "grad_norm": 0.16518385708332062, "learning_rate": 1.8957297719172278e-05, "loss": 1.1994, "step": 4791 }, { "epoch": 1.7840862817186536, "grad_norm": 0.16743482649326324, "learning_rate": 1.8956757324739445e-05, "loss": 1.199, "step": 4792 }, { "epoch": 1.7844585868692628, "grad_norm": 0.1630256474018097, "learning_rate": 1.8956216798015873e-05, "loss": 1.2051, "step": 4793 }, { "epoch": 1.7848308920198717, "grad_norm": 0.16784405708312988, "learning_rate": 1.895567613900954e-05, "loss": 1.2047, "step": 4794 }, { "epoch": 1.785203197170481, "grad_norm": 0.16349439322948456, "learning_rate": 1.8955135347728434e-05, "loss": 1.1983, "step": 4795 }, { "epoch": 1.78557550232109, "grad_norm": 0.16735652089118958, "learning_rate": 1.895459442418054e-05, "loss": 1.1949, "step": 4796 }, { "epoch": 1.785947807471699, "grad_norm": 0.1649763435125351, "learning_rate": 1.8954053368373846e-05, "loss": 1.2026, "step": 4797 }, { "epoch": 1.786320112622308, "grad_norm": 0.1692332774400711, "learning_rate": 1.895351218031635e-05, "loss": 1.2029, "step": 4798 }, { "epoch": 1.786692417772917, "grad_norm": 0.1696692705154419, "learning_rate": 1.895297086001604e-05, "loss": 1.1935, "step": 4799 }, { "epoch": 1.7870647229235264, "grad_norm": 0.1772766262292862, "learning_rate": 1.8952429407480908e-05, "loss": 1.1929, "step": 4800 }, { "epoch": 1.7874370280741352, "grad_norm": 0.17916709184646606, "learning_rate": 1.895188782271896e-05, "loss": 1.2017, "step": 4801 }, { "epoch": 1.7878093332247444, "grad_norm": 0.16637340188026428, "learning_rate": 1.8951346105738188e-05, "loss": 1.1952, "step": 4802 }, { "epoch": 1.7881816383753533, "grad_norm": 0.20684456825256348, "learning_rate": 1.8950804256546597e-05, "loss": 1.2116, "step": 4803 }, { "epoch": 1.7885539435259625, "grad_norm": 0.17692846059799194, "learning_rate": 1.8950262275152188e-05, "loss": 1.1984, "step": 4804 }, { "epoch": 1.7889262486765716, "grad_norm": 0.17448139190673828, "learning_rate": 1.8949720161562967e-05, "loss": 1.1976, "step": 4805 }, { "epoch": 1.7892985538271806, "grad_norm": 0.17009678483009338, "learning_rate": 1.8949177915786942e-05, "loss": 1.1925, "step": 4806 }, { "epoch": 1.7896708589777897, "grad_norm": 0.17584814131259918, "learning_rate": 1.894863553783212e-05, "loss": 1.1907, "step": 4807 }, { "epoch": 1.7900431641283987, "grad_norm": 0.17380961775779724, "learning_rate": 1.8948093027706512e-05, "loss": 1.2074, "step": 4808 }, { "epoch": 1.790415469279008, "grad_norm": 0.1762208640575409, "learning_rate": 1.8947550385418136e-05, "loss": 1.214, "step": 4809 }, { "epoch": 1.7907877744296168, "grad_norm": 0.16765017807483673, "learning_rate": 1.8947007610974998e-05, "loss": 1.2002, "step": 4810 }, { "epoch": 1.791160079580226, "grad_norm": 0.197531059384346, "learning_rate": 1.894646470438512e-05, "loss": 1.1988, "step": 4811 }, { "epoch": 1.7915323847308349, "grad_norm": 0.17233921587467194, "learning_rate": 1.894592166565652e-05, "loss": 1.1891, "step": 4812 }, { "epoch": 1.7919046898814441, "grad_norm": 0.1792616844177246, "learning_rate": 1.8945378494797216e-05, "loss": 1.1974, "step": 4813 }, { "epoch": 1.7922769950320532, "grad_norm": 0.16458769142627716, "learning_rate": 1.8944835191815233e-05, "loss": 1.2007, "step": 4814 }, { "epoch": 1.7926493001826622, "grad_norm": 0.20892377197742462, "learning_rate": 1.89442917567186e-05, "loss": 1.1936, "step": 4815 }, { "epoch": 1.7930216053332713, "grad_norm": 0.17339985072612762, "learning_rate": 1.894374818951534e-05, "loss": 1.1883, "step": 4816 }, { "epoch": 1.7933939104838803, "grad_norm": 0.17067044973373413, "learning_rate": 1.8943204490213474e-05, "loss": 1.2031, "step": 4817 }, { "epoch": 1.7937662156344896, "grad_norm": 0.15906625986099243, "learning_rate": 1.8942660658821045e-05, "loss": 1.2074, "step": 4818 }, { "epoch": 1.7941385207850984, "grad_norm": 0.1729053556919098, "learning_rate": 1.8942116695346073e-05, "loss": 1.1915, "step": 4819 }, { "epoch": 1.7945108259357077, "grad_norm": 0.16248184442520142, "learning_rate": 1.8941572599796602e-05, "loss": 1.1905, "step": 4820 }, { "epoch": 1.7948831310863165, "grad_norm": 0.17368124425411224, "learning_rate": 1.8941028372180667e-05, "loss": 1.1896, "step": 4821 }, { "epoch": 1.7952554362369257, "grad_norm": 0.1708829700946808, "learning_rate": 1.8940484012506298e-05, "loss": 1.2013, "step": 4822 }, { "epoch": 1.7956277413875348, "grad_norm": 0.15987025201320648, "learning_rate": 1.8939939520781546e-05, "loss": 1.22, "step": 4823 }, { "epoch": 1.7960000465381438, "grad_norm": 0.16917409002780914, "learning_rate": 1.8939394897014448e-05, "loss": 1.1861, "step": 4824 }, { "epoch": 1.7963723516887529, "grad_norm": 0.1673855185508728, "learning_rate": 1.893885014121305e-05, "loss": 1.202, "step": 4825 }, { "epoch": 1.796744656839362, "grad_norm": 0.1666531264781952, "learning_rate": 1.8938305253385395e-05, "loss": 1.2077, "step": 4826 }, { "epoch": 1.7971169619899712, "grad_norm": 0.16617925465106964, "learning_rate": 1.893776023353953e-05, "loss": 1.2171, "step": 4827 }, { "epoch": 1.79748926714058, "grad_norm": 0.16386044025421143, "learning_rate": 1.893721508168351e-05, "loss": 1.1957, "step": 4828 }, { "epoch": 1.7978615722911893, "grad_norm": 0.19208215177059174, "learning_rate": 1.8936669797825384e-05, "loss": 1.2114, "step": 4829 }, { "epoch": 1.798233877441798, "grad_norm": 0.16761602461338043, "learning_rate": 1.8936124381973203e-05, "loss": 1.194, "step": 4830 }, { "epoch": 1.7986061825924073, "grad_norm": 0.1722363531589508, "learning_rate": 1.893557883413503e-05, "loss": 1.1789, "step": 4831 }, { "epoch": 1.7989784877430164, "grad_norm": 0.1653546690940857, "learning_rate": 1.8935033154318914e-05, "loss": 1.2039, "step": 4832 }, { "epoch": 1.7993507928936254, "grad_norm": 0.16508762538433075, "learning_rate": 1.8934487342532925e-05, "loss": 1.1938, "step": 4833 }, { "epoch": 1.7997230980442345, "grad_norm": 0.17712639272212982, "learning_rate": 1.893394139878511e-05, "loss": 1.1873, "step": 4834 }, { "epoch": 1.8000954031948435, "grad_norm": 0.17879736423492432, "learning_rate": 1.8933395323083547e-05, "loss": 1.2158, "step": 4835 }, { "epoch": 1.8004677083454528, "grad_norm": 0.1629367172718048, "learning_rate": 1.8932849115436296e-05, "loss": 1.1977, "step": 4836 }, { "epoch": 1.8008400134960616, "grad_norm": 0.16029439866542816, "learning_rate": 1.8932302775851423e-05, "loss": 1.2091, "step": 4837 }, { "epoch": 1.8012123186466709, "grad_norm": 0.17447559535503387, "learning_rate": 1.8931756304337e-05, "loss": 1.2146, "step": 4838 }, { "epoch": 1.8015846237972797, "grad_norm": 0.16988350450992584, "learning_rate": 1.8931209700901096e-05, "loss": 1.198, "step": 4839 }, { "epoch": 1.801956928947889, "grad_norm": 0.1667841076850891, "learning_rate": 1.8930662965551784e-05, "loss": 1.1971, "step": 4840 }, { "epoch": 1.802329234098498, "grad_norm": 0.17226542532444, "learning_rate": 1.893011609829714e-05, "loss": 1.1918, "step": 4841 }, { "epoch": 1.802701539249107, "grad_norm": 0.16706976294517517, "learning_rate": 1.892956909914524e-05, "loss": 1.2056, "step": 4842 }, { "epoch": 1.803073844399716, "grad_norm": 0.1757022887468338, "learning_rate": 1.892902196810417e-05, "loss": 1.211, "step": 4843 }, { "epoch": 1.8034461495503251, "grad_norm": 0.17053164541721344, "learning_rate": 1.8928474705182002e-05, "loss": 1.219, "step": 4844 }, { "epoch": 1.8038184547009344, "grad_norm": 0.1679467409849167, "learning_rate": 1.8927927310386827e-05, "loss": 1.1871, "step": 4845 }, { "epoch": 1.8041907598515432, "grad_norm": 0.18097002804279327, "learning_rate": 1.892737978372672e-05, "loss": 1.2076, "step": 4846 }, { "epoch": 1.8045630650021525, "grad_norm": 0.16347897052764893, "learning_rate": 1.8926832125209776e-05, "loss": 1.1959, "step": 4847 }, { "epoch": 1.8049353701527615, "grad_norm": 0.16471904516220093, "learning_rate": 1.8926284334844086e-05, "loss": 1.1894, "step": 4848 }, { "epoch": 1.8053076753033706, "grad_norm": 0.17791272699832916, "learning_rate": 1.8925736412637734e-05, "loss": 1.1922, "step": 4849 }, { "epoch": 1.8056799804539796, "grad_norm": 0.16574202477931976, "learning_rate": 1.8925188358598815e-05, "loss": 1.2123, "step": 4850 }, { "epoch": 1.8060522856045886, "grad_norm": 0.16811169683933258, "learning_rate": 1.8924640172735423e-05, "loss": 1.2018, "step": 4851 }, { "epoch": 1.8064245907551977, "grad_norm": 0.1643866002559662, "learning_rate": 1.8924091855055656e-05, "loss": 1.1997, "step": 4852 }, { "epoch": 1.8067968959058067, "grad_norm": 0.1829267144203186, "learning_rate": 1.8923543405567612e-05, "loss": 1.1984, "step": 4853 }, { "epoch": 1.807169201056416, "grad_norm": 0.17171823978424072, "learning_rate": 1.8922994824279394e-05, "loss": 1.2109, "step": 4854 }, { "epoch": 1.8075415062070248, "grad_norm": 0.1727183312177658, "learning_rate": 1.89224461111991e-05, "loss": 1.1947, "step": 4855 }, { "epoch": 1.807913811357634, "grad_norm": 0.23297062516212463, "learning_rate": 1.8921897266334837e-05, "loss": 1.207, "step": 4856 }, { "epoch": 1.8082861165082431, "grad_norm": 0.1721329540014267, "learning_rate": 1.8921348289694713e-05, "loss": 1.1893, "step": 4857 }, { "epoch": 1.8086584216588522, "grad_norm": 0.17270666360855103, "learning_rate": 1.8920799181286837e-05, "loss": 1.2045, "step": 4858 }, { "epoch": 1.8090307268094612, "grad_norm": 0.16383390128612518, "learning_rate": 1.8920249941119313e-05, "loss": 1.1815, "step": 4859 }, { "epoch": 1.8094030319600702, "grad_norm": 0.16056787967681885, "learning_rate": 1.891970056920026e-05, "loss": 1.215, "step": 4860 }, { "epoch": 1.8097753371106795, "grad_norm": 0.1632789522409439, "learning_rate": 1.8919151065537788e-05, "loss": 1.2041, "step": 4861 }, { "epoch": 1.8101476422612883, "grad_norm": 0.17091669142246246, "learning_rate": 1.8918601430140012e-05, "loss": 1.2226, "step": 4862 }, { "epoch": 1.8105199474118976, "grad_norm": 0.16258689761161804, "learning_rate": 1.8918051663015057e-05, "loss": 1.1869, "step": 4863 }, { "epoch": 1.8108922525625064, "grad_norm": 0.1618296056985855, "learning_rate": 1.8917501764171034e-05, "loss": 1.211, "step": 4864 }, { "epoch": 1.8112645577131157, "grad_norm": 0.1654951572418213, "learning_rate": 1.8916951733616074e-05, "loss": 1.2002, "step": 4865 }, { "epoch": 1.8116368628637247, "grad_norm": 0.1661575883626938, "learning_rate": 1.8916401571358292e-05, "loss": 1.2041, "step": 4866 }, { "epoch": 1.8120091680143338, "grad_norm": 0.16568347811698914, "learning_rate": 1.8915851277405823e-05, "loss": 1.2076, "step": 4867 }, { "epoch": 1.8123814731649428, "grad_norm": 0.1610947996377945, "learning_rate": 1.8915300851766786e-05, "loss": 1.194, "step": 4868 }, { "epoch": 1.8127537783155518, "grad_norm": 0.16845589876174927, "learning_rate": 1.8914750294449317e-05, "loss": 1.1927, "step": 4869 }, { "epoch": 1.813126083466161, "grad_norm": 0.16183564066886902, "learning_rate": 1.8914199605461546e-05, "loss": 1.2055, "step": 4870 }, { "epoch": 1.81349838861677, "grad_norm": 0.1744004338979721, "learning_rate": 1.8913648784811607e-05, "loss": 1.2018, "step": 4871 }, { "epoch": 1.8138706937673792, "grad_norm": 0.1690053641796112, "learning_rate": 1.8913097832507632e-05, "loss": 1.199, "step": 4872 }, { "epoch": 1.814242998917988, "grad_norm": 0.16530083119869232, "learning_rate": 1.8912546748557762e-05, "loss": 1.1827, "step": 4873 }, { "epoch": 1.8146153040685973, "grad_norm": 0.17327569425106049, "learning_rate": 1.891199553297014e-05, "loss": 1.1972, "step": 4874 }, { "epoch": 1.8149876092192063, "grad_norm": 0.163415789604187, "learning_rate": 1.89114441857529e-05, "loss": 1.1924, "step": 4875 }, { "epoch": 1.8153599143698154, "grad_norm": 0.16614532470703125, "learning_rate": 1.891089270691419e-05, "loss": 1.1986, "step": 4876 }, { "epoch": 1.8157322195204244, "grad_norm": 0.1664610356092453, "learning_rate": 1.891034109646215e-05, "loss": 1.2012, "step": 4877 }, { "epoch": 1.8161045246710334, "grad_norm": 0.1688005030155182, "learning_rate": 1.8909789354404934e-05, "loss": 1.1825, "step": 4878 }, { "epoch": 1.8164768298216427, "grad_norm": 0.16336117684841156, "learning_rate": 1.890923748075069e-05, "loss": 1.1941, "step": 4879 }, { "epoch": 1.8168491349722515, "grad_norm": 0.17047706246376038, "learning_rate": 1.8908685475507566e-05, "loss": 1.1988, "step": 4880 }, { "epoch": 1.8172214401228608, "grad_norm": 0.16434811055660248, "learning_rate": 1.8908133338683715e-05, "loss": 1.204, "step": 4881 }, { "epoch": 1.8175937452734696, "grad_norm": 0.21952207386493683, "learning_rate": 1.8907581070287295e-05, "loss": 1.1928, "step": 4882 }, { "epoch": 1.8179660504240789, "grad_norm": 0.1890069991350174, "learning_rate": 1.8907028670326462e-05, "loss": 1.2033, "step": 4883 }, { "epoch": 1.818338355574688, "grad_norm": 0.17524456977844238, "learning_rate": 1.8906476138809374e-05, "loss": 1.1934, "step": 4884 }, { "epoch": 1.818710660725297, "grad_norm": 0.17356589436531067, "learning_rate": 1.890592347574419e-05, "loss": 1.1977, "step": 4885 }, { "epoch": 1.819082965875906, "grad_norm": 0.18048810958862305, "learning_rate": 1.8905370681139083e-05, "loss": 1.2109, "step": 4886 }, { "epoch": 1.819455271026515, "grad_norm": 0.16388888657093048, "learning_rate": 1.89048177550022e-05, "loss": 1.1987, "step": 4887 }, { "epoch": 1.8198275761771243, "grad_norm": 0.1702636033296585, "learning_rate": 1.8904264697341723e-05, "loss": 1.2081, "step": 4888 }, { "epoch": 1.8201998813277331, "grad_norm": 0.16600562632083893, "learning_rate": 1.8903711508165816e-05, "loss": 1.1876, "step": 4889 }, { "epoch": 1.8205721864783424, "grad_norm": 0.17300425469875336, "learning_rate": 1.8903158187482646e-05, "loss": 1.1927, "step": 4890 }, { "epoch": 1.8209444916289512, "grad_norm": 0.18208463490009308, "learning_rate": 1.890260473530039e-05, "loss": 1.1933, "step": 4891 }, { "epoch": 1.8213167967795605, "grad_norm": 0.16697552800178528, "learning_rate": 1.890205115162722e-05, "loss": 1.209, "step": 4892 }, { "epoch": 1.8216891019301695, "grad_norm": 0.16606757044792175, "learning_rate": 1.8901497436471314e-05, "loss": 1.2104, "step": 4893 }, { "epoch": 1.8220614070807786, "grad_norm": 0.16475386917591095, "learning_rate": 1.890094358984085e-05, "loss": 1.2086, "step": 4894 }, { "epoch": 1.8224337122313876, "grad_norm": 0.16822832822799683, "learning_rate": 1.890038961174401e-05, "loss": 1.2009, "step": 4895 }, { "epoch": 1.8228060173819967, "grad_norm": 0.1646614372730255, "learning_rate": 1.889983550218897e-05, "loss": 1.1945, "step": 4896 }, { "epoch": 1.823178322532606, "grad_norm": 0.16594818234443665, "learning_rate": 1.8899281261183916e-05, "loss": 1.2005, "step": 4897 }, { "epoch": 1.8235506276832147, "grad_norm": 0.15717746317386627, "learning_rate": 1.889872688873704e-05, "loss": 1.1948, "step": 4898 }, { "epoch": 1.823922932833824, "grad_norm": 0.16720950603485107, "learning_rate": 1.8898172384856526e-05, "loss": 1.1799, "step": 4899 }, { "epoch": 1.8242952379844328, "grad_norm": 0.17484848201274872, "learning_rate": 1.8897617749550565e-05, "loss": 1.1892, "step": 4900 }, { "epoch": 1.824667543135042, "grad_norm": 0.16109175980091095, "learning_rate": 1.8897062982827347e-05, "loss": 1.1825, "step": 4901 }, { "epoch": 1.8250398482856511, "grad_norm": 0.1625639796257019, "learning_rate": 1.8896508084695068e-05, "loss": 1.201, "step": 4902 }, { "epoch": 1.8254121534362602, "grad_norm": 0.16512739658355713, "learning_rate": 1.889595305516192e-05, "loss": 1.2067, "step": 4903 }, { "epoch": 1.8257844585868692, "grad_norm": 0.15775848925113678, "learning_rate": 1.889539789423611e-05, "loss": 1.1929, "step": 4904 }, { "epoch": 1.8261567637374783, "grad_norm": 0.1584419310092926, "learning_rate": 1.8894842601925823e-05, "loss": 1.1877, "step": 4905 }, { "epoch": 1.8265290688880875, "grad_norm": 0.16281954944133759, "learning_rate": 1.8894287178239274e-05, "loss": 1.1972, "step": 4906 }, { "epoch": 1.8269013740386963, "grad_norm": 0.16380088031291962, "learning_rate": 1.889373162318466e-05, "loss": 1.18, "step": 4907 }, { "epoch": 1.8272736791893056, "grad_norm": 0.17226645350456238, "learning_rate": 1.8893175936770188e-05, "loss": 1.2165, "step": 4908 }, { "epoch": 1.8276459843399147, "grad_norm": 0.16510790586471558, "learning_rate": 1.8892620119004067e-05, "loss": 1.2002, "step": 4909 }, { "epoch": 1.8280182894905237, "grad_norm": 0.18165461719036102, "learning_rate": 1.8892064169894504e-05, "loss": 1.1977, "step": 4910 }, { "epoch": 1.8283905946411327, "grad_norm": 0.2052212953567505, "learning_rate": 1.889150808944971e-05, "loss": 1.1974, "step": 4911 }, { "epoch": 1.8287628997917418, "grad_norm": 0.18859870731830597, "learning_rate": 1.8890951877677903e-05, "loss": 1.1985, "step": 4912 }, { "epoch": 1.8291352049423508, "grad_norm": 0.16892032325267792, "learning_rate": 1.889039553458729e-05, "loss": 1.2059, "step": 4913 }, { "epoch": 1.8295075100929599, "grad_norm": 0.16577079892158508, "learning_rate": 1.8889839060186095e-05, "loss": 1.1789, "step": 4914 }, { "epoch": 1.8298798152435691, "grad_norm": 0.18138210475444794, "learning_rate": 1.8889282454482538e-05, "loss": 1.2013, "step": 4915 }, { "epoch": 1.830252120394178, "grad_norm": 0.17776082456111908, "learning_rate": 1.8888725717484834e-05, "loss": 1.2137, "step": 4916 }, { "epoch": 1.8306244255447872, "grad_norm": 0.1690797209739685, "learning_rate": 1.888816884920121e-05, "loss": 1.2019, "step": 4917 }, { "epoch": 1.8309967306953963, "grad_norm": 0.17596592009067535, "learning_rate": 1.888761184963989e-05, "loss": 1.209, "step": 4918 }, { "epoch": 1.8313690358460053, "grad_norm": 0.17080815136432648, "learning_rate": 1.88870547188091e-05, "loss": 1.1965, "step": 4919 }, { "epoch": 1.8317413409966143, "grad_norm": 0.21370607614517212, "learning_rate": 1.8886497456717073e-05, "loss": 1.1991, "step": 4920 }, { "epoch": 1.8321136461472234, "grad_norm": 0.18233619630336761, "learning_rate": 1.888594006337203e-05, "loss": 1.2039, "step": 4921 }, { "epoch": 1.8324859512978326, "grad_norm": 0.1737797111272812, "learning_rate": 1.8885382538782213e-05, "loss": 1.1976, "step": 4922 }, { "epoch": 1.8328582564484415, "grad_norm": 0.16964221000671387, "learning_rate": 1.8884824882955853e-05, "loss": 1.1955, "step": 4923 }, { "epoch": 1.8332305615990507, "grad_norm": 0.18157334625720978, "learning_rate": 1.888426709590119e-05, "loss": 1.1951, "step": 4924 }, { "epoch": 1.8336028667496596, "grad_norm": 0.1712358593940735, "learning_rate": 1.8883709177626456e-05, "loss": 1.1945, "step": 4925 }, { "epoch": 1.8339751719002688, "grad_norm": 0.16328178346157074, "learning_rate": 1.8883151128139898e-05, "loss": 1.194, "step": 4926 }, { "epoch": 1.8343474770508779, "grad_norm": 0.16288742423057556, "learning_rate": 1.8882592947449753e-05, "loss": 1.1876, "step": 4927 }, { "epoch": 1.834719782201487, "grad_norm": 0.1691894680261612, "learning_rate": 1.8882034635564266e-05, "loss": 1.2148, "step": 4928 }, { "epoch": 1.835092087352096, "grad_norm": 0.16904255747795105, "learning_rate": 1.888147619249169e-05, "loss": 1.1911, "step": 4929 }, { "epoch": 1.835464392502705, "grad_norm": 0.1638314574956894, "learning_rate": 1.8880917618240265e-05, "loss": 1.1922, "step": 4930 }, { "epoch": 1.8358366976533143, "grad_norm": 0.17079859972000122, "learning_rate": 1.888035891281824e-05, "loss": 1.1861, "step": 4931 }, { "epoch": 1.836209002803923, "grad_norm": 0.1680002361536026, "learning_rate": 1.8879800076233875e-05, "loss": 1.2024, "step": 4932 }, { "epoch": 1.8365813079545323, "grad_norm": 0.15936627984046936, "learning_rate": 1.8879241108495423e-05, "loss": 1.1981, "step": 4933 }, { "epoch": 1.8369536131051412, "grad_norm": 0.16010630130767822, "learning_rate": 1.8878682009611134e-05, "loss": 1.1983, "step": 4934 }, { "epoch": 1.8373259182557504, "grad_norm": 0.16550739109516144, "learning_rate": 1.887812277958927e-05, "loss": 1.2154, "step": 4935 }, { "epoch": 1.8376982234063595, "grad_norm": 0.16892309486865997, "learning_rate": 1.887756341843809e-05, "loss": 1.2044, "step": 4936 }, { "epoch": 1.8380705285569685, "grad_norm": 0.16828759014606476, "learning_rate": 1.8877003926165852e-05, "loss": 1.2074, "step": 4937 }, { "epoch": 1.8384428337075776, "grad_norm": 0.16481132805347443, "learning_rate": 1.8876444302780826e-05, "loss": 1.2012, "step": 4938 }, { "epoch": 1.8388151388581866, "grad_norm": 0.1642160415649414, "learning_rate": 1.8875884548291274e-05, "loss": 1.195, "step": 4939 }, { "epoch": 1.8391874440087959, "grad_norm": 0.15789036452770233, "learning_rate": 1.8875324662705467e-05, "loss": 1.2151, "step": 4940 }, { "epoch": 1.8395597491594047, "grad_norm": 0.1689959466457367, "learning_rate": 1.8874764646031665e-05, "loss": 1.1944, "step": 4941 }, { "epoch": 1.839932054310014, "grad_norm": 0.17505736649036407, "learning_rate": 1.8874204498278153e-05, "loss": 1.2001, "step": 4942 }, { "epoch": 1.8403043594606228, "grad_norm": 0.1680225431919098, "learning_rate": 1.8873644219453194e-05, "loss": 1.2081, "step": 4943 }, { "epoch": 1.840676664611232, "grad_norm": 0.17328615486621857, "learning_rate": 1.887308380956507e-05, "loss": 1.2141, "step": 4944 }, { "epoch": 1.841048969761841, "grad_norm": 0.16859564185142517, "learning_rate": 1.887252326862205e-05, "loss": 1.1836, "step": 4945 }, { "epoch": 1.8414212749124501, "grad_norm": 0.17477625608444214, "learning_rate": 1.887196259663242e-05, "loss": 1.2045, "step": 4946 }, { "epoch": 1.8417935800630592, "grad_norm": 0.16840267181396484, "learning_rate": 1.8871401793604463e-05, "loss": 1.2002, "step": 4947 }, { "epoch": 1.8421658852136682, "grad_norm": 0.16595827043056488, "learning_rate": 1.8870840859546455e-05, "loss": 1.1985, "step": 4948 }, { "epoch": 1.8425381903642775, "grad_norm": 0.17932920157909393, "learning_rate": 1.8870279794466686e-05, "loss": 1.1966, "step": 4949 }, { "epoch": 1.8429104955148863, "grad_norm": 0.17470277845859528, "learning_rate": 1.8869718598373438e-05, "loss": 1.1887, "step": 4950 }, { "epoch": 1.8432828006654955, "grad_norm": 0.16657747328281403, "learning_rate": 1.8869157271275008e-05, "loss": 1.191, "step": 4951 }, { "epoch": 1.8436551058161044, "grad_norm": 0.1673271358013153, "learning_rate": 1.8868595813179677e-05, "loss": 1.192, "step": 4952 }, { "epoch": 1.8440274109667136, "grad_norm": 0.1705188900232315, "learning_rate": 1.8868034224095742e-05, "loss": 1.2007, "step": 4953 }, { "epoch": 1.8443997161173227, "grad_norm": 0.1590953916311264, "learning_rate": 1.88674725040315e-05, "loss": 1.2006, "step": 4954 }, { "epoch": 1.8447720212679317, "grad_norm": 0.16473130881786346, "learning_rate": 1.8866910652995244e-05, "loss": 1.2069, "step": 4955 }, { "epoch": 1.8451443264185408, "grad_norm": 0.16664549708366394, "learning_rate": 1.886634867099528e-05, "loss": 1.1985, "step": 4956 }, { "epoch": 1.8455166315691498, "grad_norm": 0.1601409912109375, "learning_rate": 1.8865786558039895e-05, "loss": 1.1817, "step": 4957 }, { "epoch": 1.845888936719759, "grad_norm": 0.16703009605407715, "learning_rate": 1.8865224314137404e-05, "loss": 1.1998, "step": 4958 }, { "epoch": 1.8462612418703679, "grad_norm": 0.16715413331985474, "learning_rate": 1.88646619392961e-05, "loss": 1.1901, "step": 4959 }, { "epoch": 1.8466335470209772, "grad_norm": 0.18171849846839905, "learning_rate": 1.8864099433524302e-05, "loss": 1.2008, "step": 4960 }, { "epoch": 1.847005852171586, "grad_norm": 0.20484322309494019, "learning_rate": 1.886353679683031e-05, "loss": 1.1864, "step": 4961 }, { "epoch": 1.8473781573221952, "grad_norm": 0.19610385596752167, "learning_rate": 1.8862974029222438e-05, "loss": 1.2133, "step": 4962 }, { "epoch": 1.8477504624728043, "grad_norm": 0.18955834209918976, "learning_rate": 1.8862411130708992e-05, "loss": 1.2175, "step": 4963 }, { "epoch": 1.8481227676234133, "grad_norm": 0.2106134295463562, "learning_rate": 1.8861848101298287e-05, "loss": 1.2053, "step": 4964 }, { "epoch": 1.8484950727740224, "grad_norm": 0.16345474123954773, "learning_rate": 1.8861284940998647e-05, "loss": 1.2037, "step": 4965 }, { "epoch": 1.8488673779246314, "grad_norm": 0.16316930949687958, "learning_rate": 1.8860721649818383e-05, "loss": 1.197, "step": 4966 }, { "epoch": 1.8492396830752407, "grad_norm": 0.16572824120521545, "learning_rate": 1.8860158227765816e-05, "loss": 1.1979, "step": 4967 }, { "epoch": 1.8496119882258495, "grad_norm": 0.16799062490463257, "learning_rate": 1.8859594674849267e-05, "loss": 1.1983, "step": 4968 }, { "epoch": 1.8499842933764588, "grad_norm": 0.15991763770580292, "learning_rate": 1.8859030991077062e-05, "loss": 1.2133, "step": 4969 }, { "epoch": 1.8503565985270678, "grad_norm": 0.1664821207523346, "learning_rate": 1.8858467176457528e-05, "loss": 1.2058, "step": 4970 }, { "epoch": 1.8507289036776768, "grad_norm": 0.16562390327453613, "learning_rate": 1.8857903230998986e-05, "loss": 1.2027, "step": 4971 }, { "epoch": 1.8511012088282859, "grad_norm": 0.16051267087459564, "learning_rate": 1.885733915470977e-05, "loss": 1.1803, "step": 4972 }, { "epoch": 1.851473513978895, "grad_norm": 0.1623574048280716, "learning_rate": 1.8856774947598212e-05, "loss": 1.1947, "step": 4973 }, { "epoch": 1.851845819129504, "grad_norm": 0.16899898648262024, "learning_rate": 1.8856210609672643e-05, "loss": 1.2068, "step": 4974 }, { "epoch": 1.852218124280113, "grad_norm": 0.16852141916751862, "learning_rate": 1.88556461409414e-05, "loss": 1.1886, "step": 4975 }, { "epoch": 1.8525904294307223, "grad_norm": 0.16997982561588287, "learning_rate": 1.8855081541412814e-05, "loss": 1.1986, "step": 4976 }, { "epoch": 1.852962734581331, "grad_norm": 0.17644338309764862, "learning_rate": 1.8854516811095234e-05, "loss": 1.1934, "step": 4977 }, { "epoch": 1.8533350397319404, "grad_norm": 0.18668413162231445, "learning_rate": 1.8853951949996997e-05, "loss": 1.1824, "step": 4978 }, { "epoch": 1.8537073448825494, "grad_norm": 0.16832074522972107, "learning_rate": 1.8853386958126444e-05, "loss": 1.1913, "step": 4979 }, { "epoch": 1.8540796500331584, "grad_norm": 0.18081939220428467, "learning_rate": 1.885282183549192e-05, "loss": 1.1971, "step": 4980 }, { "epoch": 1.8544519551837675, "grad_norm": 0.17961086332798004, "learning_rate": 1.8852256582101772e-05, "loss": 1.206, "step": 4981 }, { "epoch": 1.8548242603343765, "grad_norm": 0.16785657405853271, "learning_rate": 1.8851691197964356e-05, "loss": 1.1924, "step": 4982 }, { "epoch": 1.8551965654849858, "grad_norm": 0.17671982944011688, "learning_rate": 1.885112568308801e-05, "loss": 1.1868, "step": 4983 }, { "epoch": 1.8555688706355946, "grad_norm": 0.16151179373264313, "learning_rate": 1.8850560037481095e-05, "loss": 1.1999, "step": 4984 }, { "epoch": 1.8559411757862039, "grad_norm": 0.18711332976818085, "learning_rate": 1.8849994261151968e-05, "loss": 1.1959, "step": 4985 }, { "epoch": 1.8563134809368127, "grad_norm": 0.16686466336250305, "learning_rate": 1.8849428354108977e-05, "loss": 1.2055, "step": 4986 }, { "epoch": 1.856685786087422, "grad_norm": 0.17078348994255066, "learning_rate": 1.8848862316360485e-05, "loss": 1.2076, "step": 4987 }, { "epoch": 1.857058091238031, "grad_norm": 0.18692351877689362, "learning_rate": 1.884829614791485e-05, "loss": 1.2099, "step": 4988 }, { "epoch": 1.85743039638864, "grad_norm": 0.16382160782814026, "learning_rate": 1.884772984878044e-05, "loss": 1.2002, "step": 4989 }, { "epoch": 1.857802701539249, "grad_norm": 0.16507616639137268, "learning_rate": 1.8847163418965613e-05, "loss": 1.2038, "step": 4990 }, { "epoch": 1.8581750066898581, "grad_norm": 0.1630743145942688, "learning_rate": 1.884659685847874e-05, "loss": 1.2044, "step": 4991 }, { "epoch": 1.8585473118404674, "grad_norm": 0.18251408636569977, "learning_rate": 1.884603016732818e-05, "loss": 1.1941, "step": 4992 }, { "epoch": 1.8589196169910762, "grad_norm": 0.1670679897069931, "learning_rate": 1.8845463345522317e-05, "loss": 1.1857, "step": 4993 }, { "epoch": 1.8592919221416855, "grad_norm": 0.17508168518543243, "learning_rate": 1.8844896393069514e-05, "loss": 1.1886, "step": 4994 }, { "epoch": 1.8596642272922943, "grad_norm": 0.16918566823005676, "learning_rate": 1.8844329309978146e-05, "loss": 1.2011, "step": 4995 }, { "epoch": 1.8600365324429036, "grad_norm": 0.18971340358257294, "learning_rate": 1.8843762096256587e-05, "loss": 1.2074, "step": 4996 }, { "epoch": 1.8604088375935126, "grad_norm": 0.1661243587732315, "learning_rate": 1.8843194751913217e-05, "loss": 1.194, "step": 4997 }, { "epoch": 1.8607811427441217, "grad_norm": 0.1802155077457428, "learning_rate": 1.8842627276956418e-05, "loss": 1.1916, "step": 4998 }, { "epoch": 1.8611534478947307, "grad_norm": 0.16783170402050018, "learning_rate": 1.884205967139457e-05, "loss": 1.1981, "step": 4999 }, { "epoch": 1.8615257530453397, "grad_norm": 0.19096462428569794, "learning_rate": 1.884149193523605e-05, "loss": 1.2114, "step": 5000 }, { "epoch": 1.8615257530453397, "eval_loss": 1.3059989213943481, "eval_runtime": 16.6737, "eval_samples_per_second": 103.996, "eval_steps_per_second": 5.218, "step": 5000 }, { "epoch": 1.861898058195949, "grad_norm": 0.16396373510360718, "learning_rate": 1.884092406848925e-05, "loss": 1.2119, "step": 5001 }, { "epoch": 1.8622703633465578, "grad_norm": 0.17009267210960388, "learning_rate": 1.8840356071162565e-05, "loss": 1.1872, "step": 5002 }, { "epoch": 1.862642668497167, "grad_norm": 0.16647803783416748, "learning_rate": 1.8839787943264367e-05, "loss": 1.199, "step": 5003 }, { "epoch": 1.863014973647776, "grad_norm": 0.17351815104484558, "learning_rate": 1.8839219684803057e-05, "loss": 1.2002, "step": 5004 }, { "epoch": 1.8633872787983852, "grad_norm": 0.17014597356319427, "learning_rate": 1.8838651295787028e-05, "loss": 1.205, "step": 5005 }, { "epoch": 1.8637595839489942, "grad_norm": 0.17426498234272003, "learning_rate": 1.8838082776224675e-05, "loss": 1.2107, "step": 5006 }, { "epoch": 1.8641318890996033, "grad_norm": 0.17104274034500122, "learning_rate": 1.883751412612439e-05, "loss": 1.2051, "step": 5007 }, { "epoch": 1.8645041942502123, "grad_norm": 0.16714368760585785, "learning_rate": 1.8836945345494584e-05, "loss": 1.1975, "step": 5008 }, { "epoch": 1.8648764994008213, "grad_norm": 0.1665792465209961, "learning_rate": 1.8836376434343644e-05, "loss": 1.2, "step": 5009 }, { "epoch": 1.8652488045514306, "grad_norm": 0.16620507836341858, "learning_rate": 1.8835807392679978e-05, "loss": 1.1974, "step": 5010 }, { "epoch": 1.8656211097020394, "grad_norm": 0.16727782785892487, "learning_rate": 1.8835238220511997e-05, "loss": 1.2209, "step": 5011 }, { "epoch": 1.8659934148526487, "grad_norm": 0.17242635786533356, "learning_rate": 1.8834668917848097e-05, "loss": 1.2057, "step": 5012 }, { "epoch": 1.8663657200032575, "grad_norm": 0.16240327060222626, "learning_rate": 1.883409948469669e-05, "loss": 1.1868, "step": 5013 }, { "epoch": 1.8667380251538668, "grad_norm": 0.16446274518966675, "learning_rate": 1.8833529921066193e-05, "loss": 1.1848, "step": 5014 }, { "epoch": 1.8671103303044758, "grad_norm": 0.16615699231624603, "learning_rate": 1.883296022696501e-05, "loss": 1.1968, "step": 5015 }, { "epoch": 1.8674826354550849, "grad_norm": 0.15990613400936127, "learning_rate": 1.883239040240156e-05, "loss": 1.1975, "step": 5016 }, { "epoch": 1.867854940605694, "grad_norm": 0.1651614010334015, "learning_rate": 1.8831820447384256e-05, "loss": 1.1967, "step": 5017 }, { "epoch": 1.868227245756303, "grad_norm": 0.16867800056934357, "learning_rate": 1.8831250361921522e-05, "loss": 1.2167, "step": 5018 }, { "epoch": 1.8685995509069122, "grad_norm": 0.1658807247877121, "learning_rate": 1.8830680146021773e-05, "loss": 1.1987, "step": 5019 }, { "epoch": 1.868971856057521, "grad_norm": 0.1625361442565918, "learning_rate": 1.8830109799693434e-05, "loss": 1.2042, "step": 5020 }, { "epoch": 1.8693441612081303, "grad_norm": 0.47724783420562744, "learning_rate": 1.882953932294492e-05, "loss": 1.2001, "step": 5021 }, { "epoch": 1.8697164663587391, "grad_norm": 0.1839105486869812, "learning_rate": 1.882896871578467e-05, "loss": 1.1972, "step": 5022 }, { "epoch": 1.8700887715093484, "grad_norm": 0.22842104732990265, "learning_rate": 1.8828397978221108e-05, "loss": 1.1797, "step": 5023 }, { "epoch": 1.8704610766599574, "grad_norm": 0.18718920648097992, "learning_rate": 1.882782711026266e-05, "loss": 1.1971, "step": 5024 }, { "epoch": 1.8708333818105665, "grad_norm": 0.17810329794883728, "learning_rate": 1.8827256111917757e-05, "loss": 1.1894, "step": 5025 }, { "epoch": 1.8712056869611755, "grad_norm": 0.16336943209171295, "learning_rate": 1.882668498319484e-05, "loss": 1.2008, "step": 5026 }, { "epoch": 1.8715779921117845, "grad_norm": 0.18069854378700256, "learning_rate": 1.882611372410234e-05, "loss": 1.198, "step": 5027 }, { "epoch": 1.8719502972623938, "grad_norm": 0.17028245329856873, "learning_rate": 1.8825542334648687e-05, "loss": 1.1885, "step": 5028 }, { "epoch": 1.8723226024130026, "grad_norm": 0.16441023349761963, "learning_rate": 1.8824970814842332e-05, "loss": 1.1954, "step": 5029 }, { "epoch": 1.872694907563612, "grad_norm": 0.16318555176258087, "learning_rate": 1.8824399164691712e-05, "loss": 1.1845, "step": 5030 }, { "epoch": 1.873067212714221, "grad_norm": 0.16445982456207275, "learning_rate": 1.882382738420527e-05, "loss": 1.1954, "step": 5031 }, { "epoch": 1.87343951786483, "grad_norm": 0.16694188117980957, "learning_rate": 1.8823255473391454e-05, "loss": 1.2076, "step": 5032 }, { "epoch": 1.873811823015439, "grad_norm": 0.17121298611164093, "learning_rate": 1.8822683432258703e-05, "loss": 1.2013, "step": 5033 }, { "epoch": 1.874184128166048, "grad_norm": 0.16623573005199432, "learning_rate": 1.8822111260815475e-05, "loss": 1.1941, "step": 5034 }, { "epoch": 1.8745564333166571, "grad_norm": 0.16473905742168427, "learning_rate": 1.882153895907022e-05, "loss": 1.1876, "step": 5035 }, { "epoch": 1.8749287384672662, "grad_norm": 0.1598900854587555, "learning_rate": 1.8820966527031383e-05, "loss": 1.2061, "step": 5036 }, { "epoch": 1.8753010436178754, "grad_norm": 0.16518044471740723, "learning_rate": 1.8820393964707424e-05, "loss": 1.1909, "step": 5037 }, { "epoch": 1.8756733487684842, "grad_norm": 0.1635928601026535, "learning_rate": 1.8819821272106803e-05, "loss": 1.1977, "step": 5038 }, { "epoch": 1.8760456539190935, "grad_norm": 0.16230987012386322, "learning_rate": 1.8819248449237973e-05, "loss": 1.1884, "step": 5039 }, { "epoch": 1.8764179590697025, "grad_norm": 0.16847163438796997, "learning_rate": 1.8818675496109398e-05, "loss": 1.1951, "step": 5040 }, { "epoch": 1.8767902642203116, "grad_norm": 0.16432876884937286, "learning_rate": 1.8818102412729537e-05, "loss": 1.1996, "step": 5041 }, { "epoch": 1.8771625693709206, "grad_norm": 0.16060929000377655, "learning_rate": 1.8817529199106858e-05, "loss": 1.1812, "step": 5042 }, { "epoch": 1.8775348745215297, "grad_norm": 0.15831544995307922, "learning_rate": 1.8816955855249827e-05, "loss": 1.2125, "step": 5043 }, { "epoch": 1.877907179672139, "grad_norm": 0.1665830910205841, "learning_rate": 1.8816382381166912e-05, "loss": 1.1976, "step": 5044 }, { "epoch": 1.8782794848227478, "grad_norm": 0.16302713751792908, "learning_rate": 1.8815808776866583e-05, "loss": 1.2007, "step": 5045 }, { "epoch": 1.878651789973357, "grad_norm": 0.16040857136249542, "learning_rate": 1.881523504235731e-05, "loss": 1.2007, "step": 5046 }, { "epoch": 1.8790240951239658, "grad_norm": 0.16229379177093506, "learning_rate": 1.8814661177647567e-05, "loss": 1.2095, "step": 5047 }, { "epoch": 1.879396400274575, "grad_norm": 0.15823791921138763, "learning_rate": 1.8814087182745835e-05, "loss": 1.2027, "step": 5048 }, { "epoch": 1.8797687054251841, "grad_norm": 0.16429072618484497, "learning_rate": 1.8813513057660586e-05, "loss": 1.1974, "step": 5049 }, { "epoch": 1.8801410105757932, "grad_norm": 0.16185057163238525, "learning_rate": 1.8812938802400303e-05, "loss": 1.2046, "step": 5050 }, { "epoch": 1.8805133157264022, "grad_norm": 0.16304458677768707, "learning_rate": 1.8812364416973467e-05, "loss": 1.1941, "step": 5051 }, { "epoch": 1.8808856208770113, "grad_norm": 0.1603989452123642, "learning_rate": 1.881178990138856e-05, "loss": 1.2074, "step": 5052 }, { "epoch": 1.8812579260276205, "grad_norm": 0.16219423711299896, "learning_rate": 1.8811215255654074e-05, "loss": 1.1982, "step": 5053 }, { "epoch": 1.8816302311782294, "grad_norm": 0.16032250225543976, "learning_rate": 1.8810640479778488e-05, "loss": 1.1879, "step": 5054 }, { "epoch": 1.8820025363288386, "grad_norm": 0.16628342866897583, "learning_rate": 1.88100655737703e-05, "loss": 1.201, "step": 5055 }, { "epoch": 1.8823748414794474, "grad_norm": 0.15982067584991455, "learning_rate": 1.8809490537637988e-05, "loss": 1.1874, "step": 5056 }, { "epoch": 1.8827471466300567, "grad_norm": 0.16088199615478516, "learning_rate": 1.880891537139006e-05, "loss": 1.1981, "step": 5057 }, { "epoch": 1.8831194517806658, "grad_norm": 0.16720576584339142, "learning_rate": 1.8808340075035e-05, "loss": 1.1988, "step": 5058 }, { "epoch": 1.8834917569312748, "grad_norm": 0.1663171947002411, "learning_rate": 1.880776464858131e-05, "loss": 1.2205, "step": 5059 }, { "epoch": 1.8838640620818838, "grad_norm": 0.15910829603672028, "learning_rate": 1.8807189092037494e-05, "loss": 1.198, "step": 5060 }, { "epoch": 1.8842363672324929, "grad_norm": 0.17297664284706116, "learning_rate": 1.8806613405412045e-05, "loss": 1.1931, "step": 5061 }, { "epoch": 1.8846086723831021, "grad_norm": 0.18729183077812195, "learning_rate": 1.880603758871347e-05, "loss": 1.2086, "step": 5062 }, { "epoch": 1.884980977533711, "grad_norm": 0.17871494591236115, "learning_rate": 1.880546164195027e-05, "loss": 1.202, "step": 5063 }, { "epoch": 1.8853532826843202, "grad_norm": 0.16181787848472595, "learning_rate": 1.8804885565130956e-05, "loss": 1.1903, "step": 5064 }, { "epoch": 1.885725587834929, "grad_norm": 0.16055096685886383, "learning_rate": 1.8804309358264034e-05, "loss": 1.1956, "step": 5065 }, { "epoch": 1.8860978929855383, "grad_norm": 0.16259528696537018, "learning_rate": 1.8803733021358015e-05, "loss": 1.1927, "step": 5066 }, { "epoch": 1.8864701981361474, "grad_norm": 0.16635549068450928, "learning_rate": 1.880315655442141e-05, "loss": 1.1886, "step": 5067 }, { "epoch": 1.8868425032867564, "grad_norm": 0.1678260862827301, "learning_rate": 1.8802579957462738e-05, "loss": 1.216, "step": 5068 }, { "epoch": 1.8872148084373654, "grad_norm": 0.1632586270570755, "learning_rate": 1.880200323049051e-05, "loss": 1.1899, "step": 5069 }, { "epoch": 1.8875871135879745, "grad_norm": 0.17265328764915466, "learning_rate": 1.880142637351325e-05, "loss": 1.1961, "step": 5070 }, { "epoch": 1.8879594187385838, "grad_norm": 0.19594860076904297, "learning_rate": 1.8800849386539476e-05, "loss": 1.2123, "step": 5071 }, { "epoch": 1.8883317238891926, "grad_norm": 0.1857958883047104, "learning_rate": 1.8800272269577706e-05, "loss": 1.1959, "step": 5072 }, { "epoch": 1.8887040290398018, "grad_norm": 0.17190240323543549, "learning_rate": 1.8799695022636466e-05, "loss": 1.1803, "step": 5073 }, { "epoch": 1.8890763341904107, "grad_norm": 0.2080855667591095, "learning_rate": 1.8799117645724282e-05, "loss": 1.1875, "step": 5074 }, { "epoch": 1.88944863934102, "grad_norm": 0.18487951159477234, "learning_rate": 1.8798540138849685e-05, "loss": 1.2109, "step": 5075 }, { "epoch": 1.889820944491629, "grad_norm": 0.17496329545974731, "learning_rate": 1.8797962502021203e-05, "loss": 1.196, "step": 5076 }, { "epoch": 1.890193249642238, "grad_norm": 0.16391214728355408, "learning_rate": 1.8797384735247367e-05, "loss": 1.2139, "step": 5077 }, { "epoch": 1.890565554792847, "grad_norm": 0.1845993846654892, "learning_rate": 1.8796806838536708e-05, "loss": 1.208, "step": 5078 }, { "epoch": 1.890937859943456, "grad_norm": 0.17227643728256226, "learning_rate": 1.8796228811897764e-05, "loss": 1.1862, "step": 5079 }, { "epoch": 1.8913101650940654, "grad_norm": 0.16078023612499237, "learning_rate": 1.8795650655339075e-05, "loss": 1.1944, "step": 5080 }, { "epoch": 1.8916824702446742, "grad_norm": 0.1699201762676239, "learning_rate": 1.8795072368869176e-05, "loss": 1.2018, "step": 5081 }, { "epoch": 1.8920547753952834, "grad_norm": 0.17085964977741241, "learning_rate": 1.879449395249661e-05, "loss": 1.1928, "step": 5082 }, { "epoch": 1.8924270805458925, "grad_norm": 0.19768460094928741, "learning_rate": 1.8793915406229924e-05, "loss": 1.2065, "step": 5083 }, { "epoch": 1.8927993856965015, "grad_norm": 0.1722254604101181, "learning_rate": 1.8793336730077657e-05, "loss": 1.1908, "step": 5084 }, { "epoch": 1.8931716908471106, "grad_norm": 0.17522525787353516, "learning_rate": 1.8792757924048354e-05, "loss": 1.1937, "step": 5085 }, { "epoch": 1.8935439959977196, "grad_norm": 0.17701640725135803, "learning_rate": 1.8792178988150574e-05, "loss": 1.194, "step": 5086 }, { "epoch": 1.8939163011483287, "grad_norm": 0.17206698656082153, "learning_rate": 1.879159992239286e-05, "loss": 1.2049, "step": 5087 }, { "epoch": 1.8942886062989377, "grad_norm": 0.1732310801744461, "learning_rate": 1.8791020726783767e-05, "loss": 1.2098, "step": 5088 }, { "epoch": 1.894660911449547, "grad_norm": 0.1696704626083374, "learning_rate": 1.8790441401331848e-05, "loss": 1.1892, "step": 5089 }, { "epoch": 1.8950332166001558, "grad_norm": 0.17553074657917023, "learning_rate": 1.8789861946045662e-05, "loss": 1.1939, "step": 5090 }, { "epoch": 1.895405521750765, "grad_norm": 0.16884693503379822, "learning_rate": 1.8789282360933767e-05, "loss": 1.1892, "step": 5091 }, { "epoch": 1.895777826901374, "grad_norm": 0.17038117349147797, "learning_rate": 1.8788702646004725e-05, "loss": 1.2057, "step": 5092 }, { "epoch": 1.8961501320519831, "grad_norm": 0.16676990687847137, "learning_rate": 1.8788122801267094e-05, "loss": 1.212, "step": 5093 }, { "epoch": 1.8965224372025922, "grad_norm": 0.1800990104675293, "learning_rate": 1.878754282672944e-05, "loss": 1.2153, "step": 5094 }, { "epoch": 1.8968947423532012, "grad_norm": 0.20043106377124786, "learning_rate": 1.8786962722400334e-05, "loss": 1.1969, "step": 5095 }, { "epoch": 1.8972670475038103, "grad_norm": 0.1726304590702057, "learning_rate": 1.878638248828834e-05, "loss": 1.2091, "step": 5096 }, { "epoch": 1.8976393526544193, "grad_norm": 0.17626087367534637, "learning_rate": 1.8785802124402022e-05, "loss": 1.1895, "step": 5097 }, { "epoch": 1.8980116578050286, "grad_norm": 0.19044530391693115, "learning_rate": 1.8785221630749964e-05, "loss": 1.2022, "step": 5098 }, { "epoch": 1.8983839629556374, "grad_norm": 0.17916062474250793, "learning_rate": 1.878464100734073e-05, "loss": 1.2064, "step": 5099 }, { "epoch": 1.8987562681062466, "grad_norm": 0.17990046739578247, "learning_rate": 1.8784060254182904e-05, "loss": 1.1868, "step": 5100 }, { "epoch": 1.8991285732568557, "grad_norm": 0.1693127304315567, "learning_rate": 1.8783479371285054e-05, "loss": 1.2112, "step": 5101 }, { "epoch": 1.8995008784074647, "grad_norm": 0.1721697747707367, "learning_rate": 1.8782898358655767e-05, "loss": 1.1971, "step": 5102 }, { "epoch": 1.8998731835580738, "grad_norm": 0.1762087643146515, "learning_rate": 1.8782317216303624e-05, "loss": 1.1838, "step": 5103 }, { "epoch": 1.9002454887086828, "grad_norm": 0.2079368680715561, "learning_rate": 1.8781735944237204e-05, "loss": 1.1848, "step": 5104 }, { "epoch": 1.900617793859292, "grad_norm": 0.18409818410873413, "learning_rate": 1.87811545424651e-05, "loss": 1.1941, "step": 5105 }, { "epoch": 1.900990099009901, "grad_norm": 0.1844477504491806, "learning_rate": 1.878057301099589e-05, "loss": 1.1866, "step": 5106 }, { "epoch": 1.9013624041605102, "grad_norm": 0.19511154294013977, "learning_rate": 1.877999134983817e-05, "loss": 1.2066, "step": 5107 }, { "epoch": 1.901734709311119, "grad_norm": 0.16205884516239166, "learning_rate": 1.877940955900053e-05, "loss": 1.1894, "step": 5108 }, { "epoch": 1.9021070144617283, "grad_norm": 0.1605387181043625, "learning_rate": 1.8778827638491556e-05, "loss": 1.2014, "step": 5109 }, { "epoch": 1.9024793196123373, "grad_norm": 0.15819790959358215, "learning_rate": 1.877824558831985e-05, "loss": 1.2027, "step": 5110 }, { "epoch": 1.9028516247629463, "grad_norm": 0.1695229858160019, "learning_rate": 1.8777663408494013e-05, "loss": 1.181, "step": 5111 }, { "epoch": 1.9032239299135554, "grad_norm": 0.18196141719818115, "learning_rate": 1.8777081099022636e-05, "loss": 1.1962, "step": 5112 }, { "epoch": 1.9035962350641644, "grad_norm": 0.18630284070968628, "learning_rate": 1.877649865991432e-05, "loss": 1.2019, "step": 5113 }, { "epoch": 1.9039685402147737, "grad_norm": 0.16647082567214966, "learning_rate": 1.8775916091177674e-05, "loss": 1.1968, "step": 5114 }, { "epoch": 1.9043408453653825, "grad_norm": 0.20907706022262573, "learning_rate": 1.8775333392821294e-05, "loss": 1.1929, "step": 5115 }, { "epoch": 1.9047131505159918, "grad_norm": 0.19712580740451813, "learning_rate": 1.877475056485379e-05, "loss": 1.2029, "step": 5116 }, { "epoch": 1.9050854556666006, "grad_norm": 0.17977239191532135, "learning_rate": 1.8774167607283772e-05, "loss": 1.198, "step": 5117 }, { "epoch": 1.9054577608172099, "grad_norm": 0.1739930659532547, "learning_rate": 1.8773584520119848e-05, "loss": 1.1902, "step": 5118 }, { "epoch": 1.905830065967819, "grad_norm": 0.21335825324058533, "learning_rate": 1.8773001303370634e-05, "loss": 1.1862, "step": 5119 }, { "epoch": 1.906202371118428, "grad_norm": 0.15986791253089905, "learning_rate": 1.8772417957044743e-05, "loss": 1.2015, "step": 5120 }, { "epoch": 1.906574676269037, "grad_norm": 0.1675948053598404, "learning_rate": 1.8771834481150782e-05, "loss": 1.1985, "step": 5121 }, { "epoch": 1.906946981419646, "grad_norm": 0.16435672342777252, "learning_rate": 1.8771250875697383e-05, "loss": 1.2114, "step": 5122 }, { "epoch": 1.9073192865702553, "grad_norm": 0.16337592899799347, "learning_rate": 1.8770667140693155e-05, "loss": 1.2001, "step": 5123 }, { "epoch": 1.907691591720864, "grad_norm": 0.1674809753894806, "learning_rate": 1.8770083276146726e-05, "loss": 1.1977, "step": 5124 }, { "epoch": 1.9080638968714734, "grad_norm": 0.17421855032444, "learning_rate": 1.8769499282066716e-05, "loss": 1.1829, "step": 5125 }, { "epoch": 1.9084362020220822, "grad_norm": 0.16936717927455902, "learning_rate": 1.8768915158461755e-05, "loss": 1.2072, "step": 5126 }, { "epoch": 1.9088085071726915, "grad_norm": 0.16133807599544525, "learning_rate": 1.8768330905340462e-05, "loss": 1.2025, "step": 5127 }, { "epoch": 1.9091808123233005, "grad_norm": 0.1636780947446823, "learning_rate": 1.8767746522711478e-05, "loss": 1.2099, "step": 5128 }, { "epoch": 1.9095531174739095, "grad_norm": 0.17051872611045837, "learning_rate": 1.876716201058342e-05, "loss": 1.1913, "step": 5129 }, { "epoch": 1.9099254226245186, "grad_norm": 0.16685360670089722, "learning_rate": 1.8766577368964937e-05, "loss": 1.1992, "step": 5130 }, { "epoch": 1.9102977277751276, "grad_norm": 0.1661144345998764, "learning_rate": 1.8765992597864654e-05, "loss": 1.1953, "step": 5131 }, { "epoch": 1.910670032925737, "grad_norm": 0.16702596843242645, "learning_rate": 1.8765407697291208e-05, "loss": 1.2023, "step": 5132 }, { "epoch": 1.9110423380763457, "grad_norm": 0.16298018395900726, "learning_rate": 1.8764822667253244e-05, "loss": 1.203, "step": 5133 }, { "epoch": 1.911414643226955, "grad_norm": 0.16323602199554443, "learning_rate": 1.87642375077594e-05, "loss": 1.1987, "step": 5134 }, { "epoch": 1.9117869483775638, "grad_norm": 0.1665439009666443, "learning_rate": 1.8763652218818314e-05, "loss": 1.1812, "step": 5135 }, { "epoch": 1.912159253528173, "grad_norm": 0.16453301906585693, "learning_rate": 1.8763066800438638e-05, "loss": 1.2071, "step": 5136 }, { "epoch": 1.912531558678782, "grad_norm": 0.1619269996881485, "learning_rate": 1.8762481252629013e-05, "loss": 1.2125, "step": 5137 }, { "epoch": 1.9129038638293911, "grad_norm": 0.16248470544815063, "learning_rate": 1.8761895575398094e-05, "loss": 1.209, "step": 5138 }, { "epoch": 1.9132761689800002, "grad_norm": 0.16813617944717407, "learning_rate": 1.876130976875452e-05, "loss": 1.1875, "step": 5139 }, { "epoch": 1.9136484741306092, "grad_norm": 0.16750268638134003, "learning_rate": 1.8760723832706955e-05, "loss": 1.1921, "step": 5140 }, { "epoch": 1.9140207792812185, "grad_norm": 0.16163413226604462, "learning_rate": 1.8760137767264048e-05, "loss": 1.18, "step": 5141 }, { "epoch": 1.9143930844318273, "grad_norm": 0.16053326427936554, "learning_rate": 1.875955157243446e-05, "loss": 1.1977, "step": 5142 }, { "epoch": 1.9147653895824366, "grad_norm": 0.16916437447071075, "learning_rate": 1.8758965248226836e-05, "loss": 1.2144, "step": 5143 }, { "epoch": 1.9151376947330456, "grad_norm": 0.1655954271554947, "learning_rate": 1.875837879464985e-05, "loss": 1.2031, "step": 5144 }, { "epoch": 1.9155099998836547, "grad_norm": 0.16790400445461273, "learning_rate": 1.875779221171216e-05, "loss": 1.1969, "step": 5145 }, { "epoch": 1.9158823050342637, "grad_norm": 0.16594092547893524, "learning_rate": 1.8757205499422428e-05, "loss": 1.191, "step": 5146 }, { "epoch": 1.9162546101848728, "grad_norm": 0.16297580301761627, "learning_rate": 1.8756618657789322e-05, "loss": 1.2073, "step": 5147 }, { "epoch": 1.9166269153354818, "grad_norm": 0.16412940621376038, "learning_rate": 1.8756031686821506e-05, "loss": 1.1907, "step": 5148 }, { "epoch": 1.9169992204860908, "grad_norm": 0.16388413310050964, "learning_rate": 1.875544458652765e-05, "loss": 1.182, "step": 5149 }, { "epoch": 1.9173715256367, "grad_norm": 0.16367630660533905, "learning_rate": 1.875485735691643e-05, "loss": 1.2072, "step": 5150 }, { "epoch": 1.917743830787309, "grad_norm": 0.16208967566490173, "learning_rate": 1.8754269997996512e-05, "loss": 1.1871, "step": 5151 }, { "epoch": 1.9181161359379182, "grad_norm": 0.1770969182252884, "learning_rate": 1.875368250977658e-05, "loss": 1.2146, "step": 5152 }, { "epoch": 1.9184884410885272, "grad_norm": 0.17094440758228302, "learning_rate": 1.8753094892265308e-05, "loss": 1.1936, "step": 5153 }, { "epoch": 1.9188607462391363, "grad_norm": 0.16425853967666626, "learning_rate": 1.875250714547137e-05, "loss": 1.1972, "step": 5154 }, { "epoch": 1.9192330513897453, "grad_norm": 0.16301047801971436, "learning_rate": 1.875191926940345e-05, "loss": 1.1832, "step": 5155 }, { "epoch": 1.9196053565403544, "grad_norm": 0.16729892790317535, "learning_rate": 1.8751331264070232e-05, "loss": 1.2042, "step": 5156 }, { "epoch": 1.9199776616909636, "grad_norm": 0.16945315897464752, "learning_rate": 1.87507431294804e-05, "loss": 1.1815, "step": 5157 }, { "epoch": 1.9203499668415724, "grad_norm": 0.15983742475509644, "learning_rate": 1.8750154865642644e-05, "loss": 1.1798, "step": 5158 }, { "epoch": 1.9207222719921817, "grad_norm": 0.1647757738828659, "learning_rate": 1.874956647256565e-05, "loss": 1.2057, "step": 5159 }, { "epoch": 1.9210945771427905, "grad_norm": 0.16134053468704224, "learning_rate": 1.8748977950258105e-05, "loss": 1.2005, "step": 5160 }, { "epoch": 1.9214668822933998, "grad_norm": 0.16527298092842102, "learning_rate": 1.8748389298728708e-05, "loss": 1.2001, "step": 5161 }, { "epoch": 1.9218391874440088, "grad_norm": 0.15893900394439697, "learning_rate": 1.8747800517986147e-05, "loss": 1.2154, "step": 5162 }, { "epoch": 1.9222114925946179, "grad_norm": 0.16622401773929596, "learning_rate": 1.8747211608039124e-05, "loss": 1.1916, "step": 5163 }, { "epoch": 1.922583797745227, "grad_norm": 0.16279946267604828, "learning_rate": 1.8746622568896334e-05, "loss": 1.1944, "step": 5164 }, { "epoch": 1.922956102895836, "grad_norm": 0.16495847702026367, "learning_rate": 1.874603340056648e-05, "loss": 1.2027, "step": 5165 }, { "epoch": 1.9233284080464452, "grad_norm": 0.156333789229393, "learning_rate": 1.8745444103058257e-05, "loss": 1.1909, "step": 5166 }, { "epoch": 1.923700713197054, "grad_norm": 0.16494396328926086, "learning_rate": 1.8744854676380374e-05, "loss": 1.193, "step": 5167 }, { "epoch": 1.9240730183476633, "grad_norm": 0.1679079830646515, "learning_rate": 1.8744265120541537e-05, "loss": 1.2151, "step": 5168 }, { "epoch": 1.9244453234982721, "grad_norm": 0.16192030906677246, "learning_rate": 1.8743675435550453e-05, "loss": 1.2061, "step": 5169 }, { "epoch": 1.9248176286488814, "grad_norm": 0.16086839139461517, "learning_rate": 1.8743085621415834e-05, "loss": 1.1853, "step": 5170 }, { "epoch": 1.9251899337994904, "grad_norm": 0.16591200232505798, "learning_rate": 1.8742495678146384e-05, "loss": 1.1983, "step": 5171 }, { "epoch": 1.9255622389500995, "grad_norm": 0.16515636444091797, "learning_rate": 1.8741905605750826e-05, "loss": 1.1886, "step": 5172 }, { "epoch": 1.9259345441007085, "grad_norm": 0.16431131958961487, "learning_rate": 1.874131540423787e-05, "loss": 1.2073, "step": 5173 }, { "epoch": 1.9263068492513176, "grad_norm": 0.16539840400218964, "learning_rate": 1.874072507361623e-05, "loss": 1.1931, "step": 5174 }, { "epoch": 1.9266791544019268, "grad_norm": 0.1654292792081833, "learning_rate": 1.8740134613894633e-05, "loss": 1.2022, "step": 5175 }, { "epoch": 1.9270514595525357, "grad_norm": 0.16364841163158417, "learning_rate": 1.8739544025081797e-05, "loss": 1.1803, "step": 5176 }, { "epoch": 1.927423764703145, "grad_norm": 0.16501003503799438, "learning_rate": 1.8738953307186442e-05, "loss": 1.1918, "step": 5177 }, { "epoch": 1.9277960698537537, "grad_norm": 0.15340885519981384, "learning_rate": 1.8738362460217296e-05, "loss": 1.1955, "step": 5178 }, { "epoch": 1.928168375004363, "grad_norm": 0.16795547306537628, "learning_rate": 1.8737771484183084e-05, "loss": 1.2006, "step": 5179 }, { "epoch": 1.928540680154972, "grad_norm": 0.16092796623706818, "learning_rate": 1.8737180379092536e-05, "loss": 1.192, "step": 5180 }, { "epoch": 1.928912985305581, "grad_norm": 0.1616433709859848, "learning_rate": 1.8736589144954384e-05, "loss": 1.1703, "step": 5181 }, { "epoch": 1.9292852904561901, "grad_norm": 0.16441045701503754, "learning_rate": 1.8735997781777356e-05, "loss": 1.1818, "step": 5182 }, { "epoch": 1.9296575956067992, "grad_norm": 0.16628822684288025, "learning_rate": 1.8735406289570193e-05, "loss": 1.1943, "step": 5183 }, { "epoch": 1.9300299007574084, "grad_norm": 0.15827049314975739, "learning_rate": 1.873481466834162e-05, "loss": 1.1896, "step": 5184 }, { "epoch": 1.9304022059080173, "grad_norm": 0.16958533227443695, "learning_rate": 1.873422291810039e-05, "loss": 1.1987, "step": 5185 }, { "epoch": 1.9307745110586265, "grad_norm": 0.1640082448720932, "learning_rate": 1.8733631038855232e-05, "loss": 1.2068, "step": 5186 }, { "epoch": 1.9311468162092353, "grad_norm": 0.16046150028705597, "learning_rate": 1.8733039030614893e-05, "loss": 1.1939, "step": 5187 }, { "epoch": 1.9315191213598446, "grad_norm": 0.15839648246765137, "learning_rate": 1.8732446893388116e-05, "loss": 1.1919, "step": 5188 }, { "epoch": 1.9318914265104536, "grad_norm": 0.15910884737968445, "learning_rate": 1.8731854627183642e-05, "loss": 1.2, "step": 5189 }, { "epoch": 1.9322637316610627, "grad_norm": 0.17085225880146027, "learning_rate": 1.8731262232010226e-05, "loss": 1.1909, "step": 5190 }, { "epoch": 1.9326360368116717, "grad_norm": 0.16858842968940735, "learning_rate": 1.8730669707876617e-05, "loss": 1.2053, "step": 5191 }, { "epoch": 1.9330083419622808, "grad_norm": 0.16198118031024933, "learning_rate": 1.8730077054791562e-05, "loss": 1.1851, "step": 5192 }, { "epoch": 1.93338064711289, "grad_norm": 0.16233724355697632, "learning_rate": 1.8729484272763817e-05, "loss": 1.1811, "step": 5193 }, { "epoch": 1.9337529522634989, "grad_norm": 0.1625370979309082, "learning_rate": 1.8728891361802136e-05, "loss": 1.1946, "step": 5194 }, { "epoch": 1.9341252574141081, "grad_norm": 0.16381089389324188, "learning_rate": 1.872829832191528e-05, "loss": 1.1884, "step": 5195 }, { "epoch": 1.934497562564717, "grad_norm": 0.15602105855941772, "learning_rate": 1.8727705153112007e-05, "loss": 1.192, "step": 5196 }, { "epoch": 1.9348698677153262, "grad_norm": 0.1663997620344162, "learning_rate": 1.8727111855401073e-05, "loss": 1.2075, "step": 5197 }, { "epoch": 1.9352421728659353, "grad_norm": 0.16173984110355377, "learning_rate": 1.8726518428791246e-05, "loss": 1.1854, "step": 5198 }, { "epoch": 1.9356144780165443, "grad_norm": 0.16155578196048737, "learning_rate": 1.872592487329129e-05, "loss": 1.1947, "step": 5199 }, { "epoch": 1.9359867831671533, "grad_norm": 0.1606961041688919, "learning_rate": 1.872533118890997e-05, "loss": 1.1849, "step": 5200 }, { "epoch": 1.9363590883177624, "grad_norm": 0.15817643702030182, "learning_rate": 1.8724737375656054e-05, "loss": 1.1949, "step": 5201 }, { "epoch": 1.9367313934683716, "grad_norm": 0.16562969982624054, "learning_rate": 1.8724143433538317e-05, "loss": 1.1889, "step": 5202 }, { "epoch": 1.9371036986189805, "grad_norm": 0.1643010526895523, "learning_rate": 1.872354936256553e-05, "loss": 1.1853, "step": 5203 }, { "epoch": 1.9374760037695897, "grad_norm": 0.16269844770431519, "learning_rate": 1.8722955162746465e-05, "loss": 1.207, "step": 5204 }, { "epoch": 1.9378483089201988, "grad_norm": 0.15886783599853516, "learning_rate": 1.87223608340899e-05, "loss": 1.1989, "step": 5205 }, { "epoch": 1.9382206140708078, "grad_norm": 0.1662435382604599, "learning_rate": 1.8721766376604612e-05, "loss": 1.1928, "step": 5206 }, { "epoch": 1.9385929192214169, "grad_norm": 0.16843284666538239, "learning_rate": 1.8721171790299385e-05, "loss": 1.1914, "step": 5207 }, { "epoch": 1.938965224372026, "grad_norm": 0.16134469211101532, "learning_rate": 1.8720577075182992e-05, "loss": 1.1829, "step": 5208 }, { "epoch": 1.939337529522635, "grad_norm": 0.16114822030067444, "learning_rate": 1.8719982231264228e-05, "loss": 1.2182, "step": 5209 }, { "epoch": 1.939709834673244, "grad_norm": 0.15821397304534912, "learning_rate": 1.8719387258551874e-05, "loss": 1.1835, "step": 5210 }, { "epoch": 1.9400821398238532, "grad_norm": 0.16827209293842316, "learning_rate": 1.8718792157054714e-05, "loss": 1.1927, "step": 5211 }, { "epoch": 1.940454444974462, "grad_norm": 0.16307982802391052, "learning_rate": 1.871819692678154e-05, "loss": 1.1895, "step": 5212 }, { "epoch": 1.9408267501250713, "grad_norm": 0.16273893415927887, "learning_rate": 1.8717601567741147e-05, "loss": 1.2002, "step": 5213 }, { "epoch": 1.9411990552756804, "grad_norm": 0.16210372745990753, "learning_rate": 1.871700607994233e-05, "loss": 1.2093, "step": 5214 }, { "epoch": 1.9415713604262894, "grad_norm": 0.16374051570892334, "learning_rate": 1.8716410463393873e-05, "loss": 1.2023, "step": 5215 }, { "epoch": 1.9419436655768985, "grad_norm": 0.16489435732364655, "learning_rate": 1.8715814718104585e-05, "loss": 1.201, "step": 5216 }, { "epoch": 1.9423159707275075, "grad_norm": 0.15813589096069336, "learning_rate": 1.871521884408326e-05, "loss": 1.1857, "step": 5217 }, { "epoch": 1.9426882758781168, "grad_norm": 0.16261295974254608, "learning_rate": 1.8714622841338696e-05, "loss": 1.1985, "step": 5218 }, { "epoch": 1.9430605810287256, "grad_norm": 0.15933023393154144, "learning_rate": 1.8714026709879704e-05, "loss": 1.2005, "step": 5219 }, { "epoch": 1.9434328861793349, "grad_norm": 0.1632496863603592, "learning_rate": 1.8713430449715086e-05, "loss": 1.1946, "step": 5220 }, { "epoch": 1.9438051913299437, "grad_norm": 0.16615080833435059, "learning_rate": 1.8712834060853643e-05, "loss": 1.2015, "step": 5221 }, { "epoch": 1.944177496480553, "grad_norm": 0.1673406958580017, "learning_rate": 1.871223754330419e-05, "loss": 1.1935, "step": 5222 }, { "epoch": 1.944549801631162, "grad_norm": 0.17133355140686035, "learning_rate": 1.871164089707553e-05, "loss": 1.198, "step": 5223 }, { "epoch": 1.944922106781771, "grad_norm": 0.16196174919605255, "learning_rate": 1.8711044122176484e-05, "loss": 1.1964, "step": 5224 }, { "epoch": 1.94529441193238, "grad_norm": 0.16071192920207977, "learning_rate": 1.8710447218615865e-05, "loss": 1.2009, "step": 5225 }, { "epoch": 1.945666717082989, "grad_norm": 0.17128324508666992, "learning_rate": 1.8709850186402487e-05, "loss": 1.193, "step": 5226 }, { "epoch": 1.9460390222335984, "grad_norm": 0.16837619245052338, "learning_rate": 1.8709253025545167e-05, "loss": 1.1982, "step": 5227 }, { "epoch": 1.9464113273842072, "grad_norm": 0.16107183694839478, "learning_rate": 1.8708655736052725e-05, "loss": 1.1822, "step": 5228 }, { "epoch": 1.9467836325348165, "grad_norm": 0.16632641851902008, "learning_rate": 1.8708058317933986e-05, "loss": 1.1895, "step": 5229 }, { "epoch": 1.9471559376854253, "grad_norm": 0.16798974573612213, "learning_rate": 1.8707460771197773e-05, "loss": 1.2121, "step": 5230 }, { "epoch": 1.9475282428360345, "grad_norm": 0.16306382417678833, "learning_rate": 1.870686309585291e-05, "loss": 1.1866, "step": 5231 }, { "epoch": 1.9479005479866436, "grad_norm": 0.1670602411031723, "learning_rate": 1.8706265291908226e-05, "loss": 1.1971, "step": 5232 }, { "epoch": 1.9482728531372526, "grad_norm": 0.1586066037416458, "learning_rate": 1.870566735937255e-05, "loss": 1.1894, "step": 5233 }, { "epoch": 1.9486451582878617, "grad_norm": 0.16661204397678375, "learning_rate": 1.870506929825471e-05, "loss": 1.2033, "step": 5234 }, { "epoch": 1.9490174634384707, "grad_norm": 0.16679902374744415, "learning_rate": 1.870447110856355e-05, "loss": 1.2003, "step": 5235 }, { "epoch": 1.94938976858908, "grad_norm": 0.1620539128780365, "learning_rate": 1.8703872790307892e-05, "loss": 1.1742, "step": 5236 }, { "epoch": 1.9497620737396888, "grad_norm": 0.16605715453624725, "learning_rate": 1.870327434349658e-05, "loss": 1.1964, "step": 5237 }, { "epoch": 1.950134378890298, "grad_norm": 0.1599631905555725, "learning_rate": 1.8702675768138453e-05, "loss": 1.1964, "step": 5238 }, { "epoch": 1.9505066840409069, "grad_norm": 0.1627977043390274, "learning_rate": 1.870207706424235e-05, "loss": 1.1941, "step": 5239 }, { "epoch": 1.9508789891915161, "grad_norm": 0.16092990338802338, "learning_rate": 1.8701478231817116e-05, "loss": 1.2014, "step": 5240 }, { "epoch": 1.9512512943421252, "grad_norm": 0.16339315474033356, "learning_rate": 1.8700879270871594e-05, "loss": 1.1798, "step": 5241 }, { "epoch": 1.9516235994927342, "grad_norm": 0.16573093831539154, "learning_rate": 1.8700280181414633e-05, "loss": 1.1786, "step": 5242 }, { "epoch": 1.9519959046433433, "grad_norm": 0.16669802367687225, "learning_rate": 1.8699680963455076e-05, "loss": 1.202, "step": 5243 }, { "epoch": 1.9523682097939523, "grad_norm": 0.16122578084468842, "learning_rate": 1.8699081617001784e-05, "loss": 1.1927, "step": 5244 }, { "epoch": 1.9527405149445616, "grad_norm": 0.1680309772491455, "learning_rate": 1.8698482142063595e-05, "loss": 1.1934, "step": 5245 }, { "epoch": 1.9531128200951704, "grad_norm": 0.16212962567806244, "learning_rate": 1.8697882538649373e-05, "loss": 1.1842, "step": 5246 }, { "epoch": 1.9534851252457797, "grad_norm": 0.16280242800712585, "learning_rate": 1.8697282806767974e-05, "loss": 1.2018, "step": 5247 }, { "epoch": 1.9538574303963885, "grad_norm": 0.15737560391426086, "learning_rate": 1.8696682946428253e-05, "loss": 1.2001, "step": 5248 }, { "epoch": 1.9542297355469977, "grad_norm": 0.1623457968235016, "learning_rate": 1.8696082957639068e-05, "loss": 1.1931, "step": 5249 }, { "epoch": 1.9546020406976068, "grad_norm": 0.15832823514938354, "learning_rate": 1.8695482840409287e-05, "loss": 1.2063, "step": 5250 }, { "epoch": 1.9549743458482158, "grad_norm": 0.15807972848415375, "learning_rate": 1.869488259474777e-05, "loss": 1.1896, "step": 5251 }, { "epoch": 1.9553466509988249, "grad_norm": 0.16206707060337067, "learning_rate": 1.869428222066338e-05, "loss": 1.2002, "step": 5252 }, { "epoch": 1.955718956149434, "grad_norm": 0.16285400092601776, "learning_rate": 1.8693681718164987e-05, "loss": 1.2033, "step": 5253 }, { "epoch": 1.9560912613000432, "grad_norm": 0.1658763289451599, "learning_rate": 1.8693081087261463e-05, "loss": 1.1955, "step": 5254 }, { "epoch": 1.956463566450652, "grad_norm": 0.15834468603134155, "learning_rate": 1.8692480327961673e-05, "loss": 1.1961, "step": 5255 }, { "epoch": 1.9568358716012613, "grad_norm": 0.15797977149486542, "learning_rate": 1.8691879440274498e-05, "loss": 1.1883, "step": 5256 }, { "epoch": 1.95720817675187, "grad_norm": 0.16849857568740845, "learning_rate": 1.869127842420881e-05, "loss": 1.1909, "step": 5257 }, { "epoch": 1.9575804819024794, "grad_norm": 0.16109803318977356, "learning_rate": 1.8690677279773482e-05, "loss": 1.1815, "step": 5258 }, { "epoch": 1.9579527870530884, "grad_norm": 0.15917837619781494, "learning_rate": 1.86900760069774e-05, "loss": 1.2017, "step": 5259 }, { "epoch": 1.9583250922036974, "grad_norm": 0.1662922352552414, "learning_rate": 1.8689474605829436e-05, "loss": 1.2067, "step": 5260 }, { "epoch": 1.9586973973543065, "grad_norm": 0.17103338241577148, "learning_rate": 1.868887307633848e-05, "loss": 1.196, "step": 5261 }, { "epoch": 1.9590697025049155, "grad_norm": 0.1583465188741684, "learning_rate": 1.8688271418513415e-05, "loss": 1.1932, "step": 5262 }, { "epoch": 1.9594420076555248, "grad_norm": 0.1627221256494522, "learning_rate": 1.8687669632363122e-05, "loss": 1.1863, "step": 5263 }, { "epoch": 1.9598143128061336, "grad_norm": 0.1791868507862091, "learning_rate": 1.86870677178965e-05, "loss": 1.2089, "step": 5264 }, { "epoch": 1.9601866179567429, "grad_norm": 0.17789161205291748, "learning_rate": 1.8686465675122426e-05, "loss": 1.1899, "step": 5265 }, { "epoch": 1.960558923107352, "grad_norm": 0.16588279604911804, "learning_rate": 1.8685863504049804e-05, "loss": 1.1984, "step": 5266 }, { "epoch": 1.960931228257961, "grad_norm": 0.16590848565101624, "learning_rate": 1.8685261204687523e-05, "loss": 1.2052, "step": 5267 }, { "epoch": 1.96130353340857, "grad_norm": 0.22419647872447968, "learning_rate": 1.8684658777044478e-05, "loss": 1.1945, "step": 5268 }, { "epoch": 1.961675838559179, "grad_norm": 0.20315192639827728, "learning_rate": 1.8684056221129568e-05, "loss": 1.2025, "step": 5269 }, { "epoch": 1.962048143709788, "grad_norm": 0.17711172997951508, "learning_rate": 1.868345353695169e-05, "loss": 1.1777, "step": 5270 }, { "epoch": 1.9624204488603971, "grad_norm": 0.16664499044418335, "learning_rate": 1.8682850724519752e-05, "loss": 1.1837, "step": 5271 }, { "epoch": 1.9627927540110064, "grad_norm": 0.1918790340423584, "learning_rate": 1.8682247783842654e-05, "loss": 1.1987, "step": 5272 }, { "epoch": 1.9631650591616152, "grad_norm": 0.17362017929553986, "learning_rate": 1.86816447149293e-05, "loss": 1.2091, "step": 5273 }, { "epoch": 1.9635373643122245, "grad_norm": 0.1695503294467926, "learning_rate": 1.8681041517788596e-05, "loss": 1.1945, "step": 5274 }, { "epoch": 1.9639096694628335, "grad_norm": 0.17335741221904755, "learning_rate": 1.8680438192429455e-05, "loss": 1.185, "step": 5275 }, { "epoch": 1.9642819746134426, "grad_norm": 0.17371761798858643, "learning_rate": 1.867983473886079e-05, "loss": 1.1801, "step": 5276 }, { "epoch": 1.9646542797640516, "grad_norm": 0.1588549017906189, "learning_rate": 1.8679231157091507e-05, "loss": 1.1924, "step": 5277 }, { "epoch": 1.9650265849146606, "grad_norm": 0.16421490907669067, "learning_rate": 1.8678627447130524e-05, "loss": 1.1931, "step": 5278 }, { "epoch": 1.96539889006527, "grad_norm": 0.17214372754096985, "learning_rate": 1.8678023608986758e-05, "loss": 1.1904, "step": 5279 }, { "epoch": 1.9657711952158787, "grad_norm": 0.18433736264705658, "learning_rate": 1.867741964266913e-05, "loss": 1.1974, "step": 5280 }, { "epoch": 1.966143500366488, "grad_norm": 0.15704011917114258, "learning_rate": 1.867681554818656e-05, "loss": 1.1743, "step": 5281 }, { "epoch": 1.9665158055170968, "grad_norm": 0.16321733593940735, "learning_rate": 1.8676211325547965e-05, "loss": 1.204, "step": 5282 }, { "epoch": 1.966888110667706, "grad_norm": 0.17481832206249237, "learning_rate": 1.8675606974762274e-05, "loss": 1.1891, "step": 5283 }, { "epoch": 1.9672604158183151, "grad_norm": 0.16488084197044373, "learning_rate": 1.8675002495838415e-05, "loss": 1.2052, "step": 5284 }, { "epoch": 1.9676327209689242, "grad_norm": 0.15766631066799164, "learning_rate": 1.8674397888785312e-05, "loss": 1.1902, "step": 5285 }, { "epoch": 1.9680050261195332, "grad_norm": 0.17218059301376343, "learning_rate": 1.8673793153611893e-05, "loss": 1.1857, "step": 5286 }, { "epoch": 1.9683773312701422, "grad_norm": 0.15942028164863586, "learning_rate": 1.86731882903271e-05, "loss": 1.1842, "step": 5287 }, { "epoch": 1.9687496364207515, "grad_norm": 0.16559404134750366, "learning_rate": 1.8672583298939857e-05, "loss": 1.1907, "step": 5288 }, { "epoch": 1.9691219415713603, "grad_norm": 0.16306178271770477, "learning_rate": 1.8671978179459102e-05, "loss": 1.1881, "step": 5289 }, { "epoch": 1.9694942467219696, "grad_norm": 0.16512851417064667, "learning_rate": 1.8671372931893775e-05, "loss": 1.2026, "step": 5290 }, { "epoch": 1.9698665518725784, "grad_norm": 0.1704392433166504, "learning_rate": 1.8670767556252812e-05, "loss": 1.207, "step": 5291 }, { "epoch": 1.9702388570231877, "grad_norm": 0.160672128200531, "learning_rate": 1.867016205254516e-05, "loss": 1.196, "step": 5292 }, { "epoch": 1.9706111621737967, "grad_norm": 0.16421304643154144, "learning_rate": 1.866955642077976e-05, "loss": 1.1896, "step": 5293 }, { "epoch": 1.9709834673244058, "grad_norm": 0.1669030487537384, "learning_rate": 1.866895066096555e-05, "loss": 1.1982, "step": 5294 }, { "epoch": 1.9713557724750148, "grad_norm": 0.171576127409935, "learning_rate": 1.8668344773111483e-05, "loss": 1.1997, "step": 5295 }, { "epoch": 1.9717280776256239, "grad_norm": 0.16728414595127106, "learning_rate": 1.866773875722651e-05, "loss": 1.1875, "step": 5296 }, { "epoch": 1.9721003827762331, "grad_norm": 0.15906740725040436, "learning_rate": 1.866713261331958e-05, "loss": 1.1797, "step": 5297 }, { "epoch": 1.972472687926842, "grad_norm": 0.16088546812534332, "learning_rate": 1.8666526341399644e-05, "loss": 1.1836, "step": 5298 }, { "epoch": 1.9728449930774512, "grad_norm": 0.16941684484481812, "learning_rate": 1.866591994147566e-05, "loss": 1.1921, "step": 5299 }, { "epoch": 1.97321729822806, "grad_norm": 0.16476041078567505, "learning_rate": 1.866531341355658e-05, "loss": 1.1841, "step": 5300 }, { "epoch": 1.9735896033786693, "grad_norm": 0.15881778299808502, "learning_rate": 1.8664706757651365e-05, "loss": 1.1853, "step": 5301 }, { "epoch": 1.9739619085292783, "grad_norm": 0.164139986038208, "learning_rate": 1.8664099973768975e-05, "loss": 1.1816, "step": 5302 }, { "epoch": 1.9743342136798874, "grad_norm": 0.16207018494606018, "learning_rate": 1.8663493061918375e-05, "loss": 1.1838, "step": 5303 }, { "epoch": 1.9747065188304964, "grad_norm": 0.17509856820106506, "learning_rate": 1.8662886022108524e-05, "loss": 1.2028, "step": 5304 }, { "epoch": 1.9750788239811055, "grad_norm": 0.17713604867458344, "learning_rate": 1.8662278854348387e-05, "loss": 1.2028, "step": 5305 }, { "epoch": 1.9754511291317147, "grad_norm": 0.17039844393730164, "learning_rate": 1.8661671558646938e-05, "loss": 1.1883, "step": 5306 }, { "epoch": 1.9758234342823235, "grad_norm": 0.17008256912231445, "learning_rate": 1.8661064135013143e-05, "loss": 1.1971, "step": 5307 }, { "epoch": 1.9761957394329328, "grad_norm": 0.20797152817249298, "learning_rate": 1.8660456583455974e-05, "loss": 1.1907, "step": 5308 }, { "epoch": 1.9765680445835416, "grad_norm": 0.20216019451618195, "learning_rate": 1.865984890398441e-05, "loss": 1.2112, "step": 5309 }, { "epoch": 1.976940349734151, "grad_norm": 0.18128228187561035, "learning_rate": 1.8659241096607416e-05, "loss": 1.1903, "step": 5310 }, { "epoch": 1.97731265488476, "grad_norm": 0.1619497686624527, "learning_rate": 1.8658633161333974e-05, "loss": 1.2063, "step": 5311 }, { "epoch": 1.977684960035369, "grad_norm": 0.1790294498205185, "learning_rate": 1.865802509817306e-05, "loss": 1.197, "step": 5312 }, { "epoch": 1.978057265185978, "grad_norm": 0.18129126727581024, "learning_rate": 1.8657416907133668e-05, "loss": 1.1738, "step": 5313 }, { "epoch": 1.978429570336587, "grad_norm": 0.16873174905776978, "learning_rate": 1.8656808588224767e-05, "loss": 1.1904, "step": 5314 }, { "epoch": 1.9788018754871963, "grad_norm": 0.1654161512851715, "learning_rate": 1.8656200141455345e-05, "loss": 1.1814, "step": 5315 }, { "epoch": 1.9791741806378051, "grad_norm": 0.1696605384349823, "learning_rate": 1.865559156683439e-05, "loss": 1.1972, "step": 5316 }, { "epoch": 1.9795464857884144, "grad_norm": 0.17438465356826782, "learning_rate": 1.8654982864370893e-05, "loss": 1.2101, "step": 5317 }, { "epoch": 1.9799187909390232, "grad_norm": 0.16162490844726562, "learning_rate": 1.8654374034073843e-05, "loss": 1.1928, "step": 5318 }, { "epoch": 1.9802910960896325, "grad_norm": 0.16213704645633698, "learning_rate": 1.865376507595223e-05, "loss": 1.1729, "step": 5319 }, { "epoch": 1.9806634012402415, "grad_norm": 0.19320641458034515, "learning_rate": 1.865315599001505e-05, "loss": 1.1971, "step": 5320 }, { "epoch": 1.9810357063908506, "grad_norm": 0.16562476754188538, "learning_rate": 1.8652546776271297e-05, "loss": 1.1873, "step": 5321 }, { "epoch": 1.9814080115414596, "grad_norm": 0.16413205862045288, "learning_rate": 1.8651937434729975e-05, "loss": 1.1935, "step": 5322 }, { "epoch": 1.9817803166920687, "grad_norm": 0.17290954291820526, "learning_rate": 1.865132796540008e-05, "loss": 1.1915, "step": 5323 }, { "epoch": 1.982152621842678, "grad_norm": 0.17080329358577728, "learning_rate": 1.8650718368290613e-05, "loss": 1.1958, "step": 5324 }, { "epoch": 1.9825249269932868, "grad_norm": 0.1644285023212433, "learning_rate": 1.865010864341058e-05, "loss": 1.1918, "step": 5325 }, { "epoch": 1.982897232143896, "grad_norm": 0.1660122573375702, "learning_rate": 1.864949879076898e-05, "loss": 1.1808, "step": 5326 }, { "epoch": 1.983269537294505, "grad_norm": 0.16145946085453033, "learning_rate": 1.864888881037483e-05, "loss": 1.2083, "step": 5327 }, { "epoch": 1.983641842445114, "grad_norm": 0.16719859838485718, "learning_rate": 1.8648278702237133e-05, "loss": 1.1856, "step": 5328 }, { "epoch": 1.9840141475957231, "grad_norm": 0.16676342487335205, "learning_rate": 1.8647668466364903e-05, "loss": 1.2016, "step": 5329 }, { "epoch": 1.9843864527463322, "grad_norm": 0.16827064752578735, "learning_rate": 1.864705810276715e-05, "loss": 1.1768, "step": 5330 }, { "epoch": 1.9847587578969412, "grad_norm": 0.1711677461862564, "learning_rate": 1.8646447611452895e-05, "loss": 1.1931, "step": 5331 }, { "epoch": 1.9851310630475503, "grad_norm": 0.16668665409088135, "learning_rate": 1.8645836992431152e-05, "loss": 1.2098, "step": 5332 }, { "epoch": 1.9855033681981595, "grad_norm": 0.1718050241470337, "learning_rate": 1.8645226245710936e-05, "loss": 1.2019, "step": 5333 }, { "epoch": 1.9858756733487684, "grad_norm": 0.1724565029144287, "learning_rate": 1.8644615371301275e-05, "loss": 1.1771, "step": 5334 }, { "epoch": 1.9862479784993776, "grad_norm": 0.1688944697380066, "learning_rate": 1.8644004369211183e-05, "loss": 1.1902, "step": 5335 }, { "epoch": 1.9866202836499867, "grad_norm": 0.17193703353405, "learning_rate": 1.864339323944969e-05, "loss": 1.1893, "step": 5336 }, { "epoch": 1.9869925888005957, "grad_norm": 0.166243776679039, "learning_rate": 1.8642781982025827e-05, "loss": 1.203, "step": 5337 }, { "epoch": 1.9873648939512047, "grad_norm": 0.1623961478471756, "learning_rate": 1.864217059694861e-05, "loss": 1.1703, "step": 5338 }, { "epoch": 1.9877371991018138, "grad_norm": 0.1668027937412262, "learning_rate": 1.8641559084227078e-05, "loss": 1.1886, "step": 5339 }, { "epoch": 1.988109504252423, "grad_norm": 0.16468320786952972, "learning_rate": 1.864094744387026e-05, "loss": 1.2047, "step": 5340 }, { "epoch": 1.9884818094030319, "grad_norm": 0.16658997535705566, "learning_rate": 1.864033567588719e-05, "loss": 1.1944, "step": 5341 }, { "epoch": 1.9888541145536411, "grad_norm": 0.16415011882781982, "learning_rate": 1.8639723780286903e-05, "loss": 1.1896, "step": 5342 }, { "epoch": 1.98922641970425, "grad_norm": 0.16742385923862457, "learning_rate": 1.8639111757078444e-05, "loss": 1.2031, "step": 5343 }, { "epoch": 1.9895987248548592, "grad_norm": 0.16341787576675415, "learning_rate": 1.863849960627084e-05, "loss": 1.1822, "step": 5344 }, { "epoch": 1.9899710300054683, "grad_norm": 0.16611303389072418, "learning_rate": 1.863788732787314e-05, "loss": 1.1853, "step": 5345 }, { "epoch": 1.9903433351560773, "grad_norm": 0.17013627290725708, "learning_rate": 1.863727492189439e-05, "loss": 1.1891, "step": 5346 }, { "epoch": 1.9907156403066864, "grad_norm": 0.17224200069904327, "learning_rate": 1.863666238834363e-05, "loss": 1.1946, "step": 5347 }, { "epoch": 1.9910879454572954, "grad_norm": 0.1723671704530716, "learning_rate": 1.8636049727229908e-05, "loss": 1.1886, "step": 5348 }, { "epoch": 1.9914602506079047, "grad_norm": 0.1694638729095459, "learning_rate": 1.8635436938562273e-05, "loss": 1.185, "step": 5349 }, { "epoch": 1.9918325557585135, "grad_norm": 0.1634708046913147, "learning_rate": 1.8634824022349773e-05, "loss": 1.201, "step": 5350 }, { "epoch": 1.9922048609091227, "grad_norm": 0.16697536408901215, "learning_rate": 1.863421097860147e-05, "loss": 1.1841, "step": 5351 }, { "epoch": 1.9925771660597316, "grad_norm": 0.16353781521320343, "learning_rate": 1.863359780732641e-05, "loss": 1.189, "step": 5352 }, { "epoch": 1.9929494712103408, "grad_norm": 0.17699691653251648, "learning_rate": 1.8632984508533654e-05, "loss": 1.1828, "step": 5353 }, { "epoch": 1.9933217763609499, "grad_norm": 0.19866538047790527, "learning_rate": 1.8632371082232254e-05, "loss": 1.2084, "step": 5354 }, { "epoch": 1.993694081511559, "grad_norm": 0.16737166047096252, "learning_rate": 1.863175752843128e-05, "loss": 1.204, "step": 5355 }, { "epoch": 1.994066386662168, "grad_norm": 0.1831938922405243, "learning_rate": 1.8631143847139785e-05, "loss": 1.1924, "step": 5356 }, { "epoch": 1.994438691812777, "grad_norm": 0.17462186515331268, "learning_rate": 1.863053003836684e-05, "loss": 1.2058, "step": 5357 }, { "epoch": 1.9948109969633863, "grad_norm": 0.175477534532547, "learning_rate": 1.8629916102121507e-05, "loss": 1.1933, "step": 5358 }, { "epoch": 1.995183302113995, "grad_norm": 0.22154709696769714, "learning_rate": 1.8629302038412856e-05, "loss": 1.1926, "step": 5359 }, { "epoch": 1.9955556072646043, "grad_norm": 0.16697679460048676, "learning_rate": 1.8628687847249955e-05, "loss": 1.1954, "step": 5360 }, { "epoch": 1.9959279124152132, "grad_norm": 0.17017914354801178, "learning_rate": 1.8628073528641873e-05, "loss": 1.203, "step": 5361 }, { "epoch": 1.9963002175658224, "grad_norm": 0.1623312085866928, "learning_rate": 1.862745908259769e-05, "loss": 1.1899, "step": 5362 }, { "epoch": 1.9966725227164315, "grad_norm": 0.16061881184577942, "learning_rate": 1.8626844509126476e-05, "loss": 1.1995, "step": 5363 }, { "epoch": 1.9970448278670405, "grad_norm": 0.18128757178783417, "learning_rate": 1.862622980823731e-05, "loss": 1.2044, "step": 5364 }, { "epoch": 1.9974171330176496, "grad_norm": 0.15951292216777802, "learning_rate": 1.8625614979939273e-05, "loss": 1.1834, "step": 5365 }, { "epoch": 1.9977894381682586, "grad_norm": 0.15819518268108368, "learning_rate": 1.8625000024241445e-05, "loss": 1.1907, "step": 5366 }, { "epoch": 1.9981617433188679, "grad_norm": 0.1662832796573639, "learning_rate": 1.86243849411529e-05, "loss": 1.1928, "step": 5367 }, { "epoch": 1.9985340484694767, "grad_norm": 0.16362832486629486, "learning_rate": 1.8623769730682738e-05, "loss": 1.1923, "step": 5368 }, { "epoch": 1.998906353620086, "grad_norm": 0.16274386644363403, "learning_rate": 1.8623154392840036e-05, "loss": 1.1892, "step": 5369 }, { "epoch": 1.9992786587706948, "grad_norm": 0.2034921646118164, "learning_rate": 1.8622538927633886e-05, "loss": 1.1806, "step": 5370 }, { "epoch": 1.999650963921304, "grad_norm": 0.1652044951915741, "learning_rate": 1.8621923335073378e-05, "loss": 1.2014, "step": 5371 }, { "epoch": 2.000023269071913, "grad_norm": 0.16666065156459808, "learning_rate": 1.86213076151676e-05, "loss": 1.1979, "step": 5372 }, { "epoch": 2.000395574222522, "grad_norm": 0.16674093902111053, "learning_rate": 1.8620691767925655e-05, "loss": 1.1733, "step": 5373 }, { "epoch": 2.0007678793731314, "grad_norm": 0.16234448552131653, "learning_rate": 1.862007579335663e-05, "loss": 1.1845, "step": 5374 }, { "epoch": 2.00114018452374, "grad_norm": 0.16867627203464508, "learning_rate": 1.8619459691469625e-05, "loss": 1.1914, "step": 5375 }, { "epoch": 2.0015124896743495, "grad_norm": 0.17261743545532227, "learning_rate": 1.8618843462273743e-05, "loss": 1.1808, "step": 5376 }, { "epoch": 2.0018847948249583, "grad_norm": 0.16891492903232574, "learning_rate": 1.8618227105778086e-05, "loss": 1.1928, "step": 5377 }, { "epoch": 2.0022570999755676, "grad_norm": 0.17338630557060242, "learning_rate": 1.8617610621991753e-05, "loss": 1.181, "step": 5378 }, { "epoch": 2.0026294051261764, "grad_norm": 0.15824103355407715, "learning_rate": 1.861699401092385e-05, "loss": 1.1842, "step": 5379 }, { "epoch": 2.0030017102767856, "grad_norm": 0.19989395141601562, "learning_rate": 1.861637727258349e-05, "loss": 1.1842, "step": 5380 }, { "epoch": 2.0033740154273945, "grad_norm": 0.1691095381975174, "learning_rate": 1.8615760406979778e-05, "loss": 1.1976, "step": 5381 }, { "epoch": 2.0037463205780037, "grad_norm": 0.17154785990715027, "learning_rate": 1.8615143414121823e-05, "loss": 1.1795, "step": 5382 }, { "epoch": 2.004118625728613, "grad_norm": 0.16465583443641663, "learning_rate": 1.8614526294018743e-05, "loss": 1.1805, "step": 5383 }, { "epoch": 2.004490930879222, "grad_norm": 0.16546522080898285, "learning_rate": 1.8613909046679646e-05, "loss": 1.1785, "step": 5384 }, { "epoch": 2.004863236029831, "grad_norm": 0.1838424950838089, "learning_rate": 1.861329167211366e-05, "loss": 1.1865, "step": 5385 }, { "epoch": 2.00523554118044, "grad_norm": 0.16737298667430878, "learning_rate": 1.861267417032989e-05, "loss": 1.1896, "step": 5386 }, { "epoch": 2.005607846331049, "grad_norm": 0.16740891337394714, "learning_rate": 1.8612056541337466e-05, "loss": 1.1955, "step": 5387 }, { "epoch": 2.005980151481658, "grad_norm": 0.1717260479927063, "learning_rate": 1.8611438785145508e-05, "loss": 1.1835, "step": 5388 }, { "epoch": 2.0063524566322672, "grad_norm": 0.16396482288837433, "learning_rate": 1.8610820901763137e-05, "loss": 1.1755, "step": 5389 }, { "epoch": 2.0067247617828765, "grad_norm": 0.16121700406074524, "learning_rate": 1.8610202891199484e-05, "loss": 1.1849, "step": 5390 }, { "epoch": 2.0070970669334853, "grad_norm": 0.18463963270187378, "learning_rate": 1.8609584753463676e-05, "loss": 1.1814, "step": 5391 }, { "epoch": 2.0074693720840946, "grad_norm": 0.16385917365550995, "learning_rate": 1.8608966488564838e-05, "loss": 1.1919, "step": 5392 }, { "epoch": 2.0078416772347034, "grad_norm": 0.1689513474702835, "learning_rate": 1.8608348096512107e-05, "loss": 1.2054, "step": 5393 }, { "epoch": 2.0082139823853127, "grad_norm": 0.16833508014678955, "learning_rate": 1.8607729577314615e-05, "loss": 1.1865, "step": 5394 }, { "epoch": 2.0085862875359215, "grad_norm": 0.16249039769172668, "learning_rate": 1.8607110930981496e-05, "loss": 1.1882, "step": 5395 }, { "epoch": 2.0089585926865308, "grad_norm": 0.15944033861160278, "learning_rate": 1.860649215752189e-05, "loss": 1.1781, "step": 5396 }, { "epoch": 2.0093308978371396, "grad_norm": 0.16826555132865906, "learning_rate": 1.8605873256944934e-05, "loss": 1.1748, "step": 5397 }, { "epoch": 2.009703202987749, "grad_norm": 0.1765327900648117, "learning_rate": 1.860525422925977e-05, "loss": 1.1894, "step": 5398 }, { "epoch": 2.010075508138358, "grad_norm": 0.1757960021495819, "learning_rate": 1.8604635074475542e-05, "loss": 1.1952, "step": 5399 }, { "epoch": 2.010447813288967, "grad_norm": 0.17114783823490143, "learning_rate": 1.8604015792601395e-05, "loss": 1.1784, "step": 5400 }, { "epoch": 2.010820118439576, "grad_norm": 0.16381992399692535, "learning_rate": 1.860339638364647e-05, "loss": 1.1807, "step": 5401 }, { "epoch": 2.011192423590185, "grad_norm": 0.1801488697528839, "learning_rate": 1.8602776847619926e-05, "loss": 1.2051, "step": 5402 }, { "epoch": 2.0115647287407943, "grad_norm": 0.17057554423809052, "learning_rate": 1.8602157184530907e-05, "loss": 1.1815, "step": 5403 }, { "epoch": 2.011937033891403, "grad_norm": 0.175872340798378, "learning_rate": 1.8601537394388565e-05, "loss": 1.199, "step": 5404 }, { "epoch": 2.0123093390420124, "grad_norm": 0.16850432753562927, "learning_rate": 1.8600917477202055e-05, "loss": 1.1914, "step": 5405 }, { "epoch": 2.012681644192621, "grad_norm": 0.1771266758441925, "learning_rate": 1.8600297432980533e-05, "loss": 1.1708, "step": 5406 }, { "epoch": 2.0130539493432305, "grad_norm": 0.17538318037986755, "learning_rate": 1.859967726173316e-05, "loss": 1.1903, "step": 5407 }, { "epoch": 2.0134262544938397, "grad_norm": 0.1645265519618988, "learning_rate": 1.859905696346909e-05, "loss": 1.1905, "step": 5408 }, { "epoch": 2.0137985596444485, "grad_norm": 0.16865003108978271, "learning_rate": 1.8598436538197494e-05, "loss": 1.1704, "step": 5409 }, { "epoch": 2.014170864795058, "grad_norm": 0.1677810251712799, "learning_rate": 1.8597815985927524e-05, "loss": 1.2029, "step": 5410 }, { "epoch": 2.0145431699456666, "grad_norm": 0.1736694723367691, "learning_rate": 1.8597195306668355e-05, "loss": 1.1831, "step": 5411 }, { "epoch": 2.014915475096276, "grad_norm": 0.17751945555210114, "learning_rate": 1.859657450042915e-05, "loss": 1.19, "step": 5412 }, { "epoch": 2.0152877802468847, "grad_norm": 0.17150509357452393, "learning_rate": 1.8595953567219077e-05, "loss": 1.2011, "step": 5413 }, { "epoch": 2.015660085397494, "grad_norm": 0.16916759312152863, "learning_rate": 1.859533250704731e-05, "loss": 1.1708, "step": 5414 }, { "epoch": 2.016032390548103, "grad_norm": 0.1699335128068924, "learning_rate": 1.8594711319923026e-05, "loss": 1.1795, "step": 5415 }, { "epoch": 2.016404695698712, "grad_norm": 0.18379724025726318, "learning_rate": 1.859409000585539e-05, "loss": 1.1816, "step": 5416 }, { "epoch": 2.0167770008493213, "grad_norm": 0.16987095773220062, "learning_rate": 1.8593468564853587e-05, "loss": 1.1684, "step": 5417 }, { "epoch": 2.01714930599993, "grad_norm": 0.177865669131279, "learning_rate": 1.8592846996926793e-05, "loss": 1.1965, "step": 5418 }, { "epoch": 2.0175216111505394, "grad_norm": 0.16508503258228302, "learning_rate": 1.8592225302084187e-05, "loss": 1.1869, "step": 5419 }, { "epoch": 2.0178939163011482, "grad_norm": 0.21210977435112, "learning_rate": 1.8591603480334947e-05, "loss": 1.1911, "step": 5420 }, { "epoch": 2.0182662214517575, "grad_norm": 0.16533805429935455, "learning_rate": 1.859098153168827e-05, "loss": 1.1882, "step": 5421 }, { "epoch": 2.0186385266023663, "grad_norm": 0.17425382137298584, "learning_rate": 1.8590359456153333e-05, "loss": 1.1772, "step": 5422 }, { "epoch": 2.0190108317529756, "grad_norm": 0.16604116559028625, "learning_rate": 1.8589737253739325e-05, "loss": 1.1949, "step": 5423 }, { "epoch": 2.0193831369035844, "grad_norm": 0.16940464079380035, "learning_rate": 1.8589114924455438e-05, "loss": 1.1811, "step": 5424 }, { "epoch": 2.0197554420541937, "grad_norm": 0.17113879323005676, "learning_rate": 1.858849246831086e-05, "loss": 1.1798, "step": 5425 }, { "epoch": 2.020127747204803, "grad_norm": 0.1643594354391098, "learning_rate": 1.8587869885314788e-05, "loss": 1.1811, "step": 5426 }, { "epoch": 2.0205000523554117, "grad_norm": 0.16450834274291992, "learning_rate": 1.858724717547642e-05, "loss": 1.1875, "step": 5427 }, { "epoch": 2.020872357506021, "grad_norm": 0.1718573272228241, "learning_rate": 1.8586624338804947e-05, "loss": 1.1845, "step": 5428 }, { "epoch": 2.02124466265663, "grad_norm": 0.16138458251953125, "learning_rate": 1.8586001375309576e-05, "loss": 1.1766, "step": 5429 }, { "epoch": 2.021616967807239, "grad_norm": 0.20273058116436005, "learning_rate": 1.85853782849995e-05, "loss": 1.1965, "step": 5430 }, { "epoch": 2.021989272957848, "grad_norm": 0.17914332449436188, "learning_rate": 1.8584755067883923e-05, "loss": 1.1734, "step": 5431 }, { "epoch": 2.022361578108457, "grad_norm": 0.17309610545635223, "learning_rate": 1.8584131723972055e-05, "loss": 1.1775, "step": 5432 }, { "epoch": 2.022733883259066, "grad_norm": 0.18708333373069763, "learning_rate": 1.8583508253273098e-05, "loss": 1.195, "step": 5433 }, { "epoch": 2.0231061884096753, "grad_norm": 0.16678020358085632, "learning_rate": 1.8582884655796266e-05, "loss": 1.1987, "step": 5434 }, { "epoch": 2.0234784935602845, "grad_norm": 0.16107121109962463, "learning_rate": 1.8582260931550766e-05, "loss": 1.1866, "step": 5435 }, { "epoch": 2.0238507987108934, "grad_norm": 0.16838499903678894, "learning_rate": 1.8581637080545813e-05, "loss": 1.1874, "step": 5436 }, { "epoch": 2.0242231038615026, "grad_norm": 0.16616487503051758, "learning_rate": 1.8581013102790612e-05, "loss": 1.1863, "step": 5437 }, { "epoch": 2.0245954090121114, "grad_norm": 0.18162907660007477, "learning_rate": 1.8580388998294393e-05, "loss": 1.1745, "step": 5438 }, { "epoch": 2.0249677141627207, "grad_norm": 0.16428324580192566, "learning_rate": 1.8579764767066362e-05, "loss": 1.1795, "step": 5439 }, { "epoch": 2.0253400193133295, "grad_norm": 0.17782078683376312, "learning_rate": 1.8579140409115744e-05, "loss": 1.1999, "step": 5440 }, { "epoch": 2.025712324463939, "grad_norm": 0.16830138862133026, "learning_rate": 1.8578515924451765e-05, "loss": 1.173, "step": 5441 }, { "epoch": 2.0260846296145476, "grad_norm": 0.19714663922786713, "learning_rate": 1.8577891313083637e-05, "loss": 1.2026, "step": 5442 }, { "epoch": 2.026456934765157, "grad_norm": 0.18386825919151306, "learning_rate": 1.8577266575020598e-05, "loss": 1.1741, "step": 5443 }, { "epoch": 2.026829239915766, "grad_norm": 0.18373322486877441, "learning_rate": 1.857664171027187e-05, "loss": 1.171, "step": 5444 }, { "epoch": 2.027201545066375, "grad_norm": 0.23901142179965973, "learning_rate": 1.8576016718846678e-05, "loss": 1.1837, "step": 5445 }, { "epoch": 2.027573850216984, "grad_norm": 0.16936446726322174, "learning_rate": 1.8575391600754266e-05, "loss": 1.1687, "step": 5446 }, { "epoch": 2.027946155367593, "grad_norm": 0.17396526038646698, "learning_rate": 1.857476635600385e-05, "loss": 1.1989, "step": 5447 }, { "epoch": 2.0283184605182023, "grad_norm": 0.17292600870132446, "learning_rate": 1.8574140984604672e-05, "loss": 1.1754, "step": 5448 }, { "epoch": 2.028690765668811, "grad_norm": 0.1673480123281479, "learning_rate": 1.8573515486565976e-05, "loss": 1.1891, "step": 5449 }, { "epoch": 2.0290630708194204, "grad_norm": 0.16834761202335358, "learning_rate": 1.8572889861896993e-05, "loss": 1.1937, "step": 5450 }, { "epoch": 2.0294353759700297, "grad_norm": 0.16884836554527283, "learning_rate": 1.857226411060696e-05, "loss": 1.1812, "step": 5451 }, { "epoch": 2.0298076811206385, "grad_norm": 0.16621071100234985, "learning_rate": 1.857163823270513e-05, "loss": 1.1886, "step": 5452 }, { "epoch": 2.0301799862712477, "grad_norm": 0.16866537928581238, "learning_rate": 1.8571012228200737e-05, "loss": 1.1831, "step": 5453 }, { "epoch": 2.0305522914218566, "grad_norm": 0.16921500861644745, "learning_rate": 1.8570386097103033e-05, "loss": 1.1753, "step": 5454 }, { "epoch": 2.030924596572466, "grad_norm": 0.1673291027545929, "learning_rate": 1.8569759839421263e-05, "loss": 1.1827, "step": 5455 }, { "epoch": 2.0312969017230746, "grad_norm": 0.1631641387939453, "learning_rate": 1.856913345516468e-05, "loss": 1.1886, "step": 5456 }, { "epoch": 2.031669206873684, "grad_norm": 0.16557154059410095, "learning_rate": 1.8568506944342535e-05, "loss": 1.1644, "step": 5457 }, { "epoch": 2.0320415120242927, "grad_norm": 0.1667705774307251, "learning_rate": 1.8567880306964077e-05, "loss": 1.1846, "step": 5458 }, { "epoch": 2.032413817174902, "grad_norm": 0.16573351621627808, "learning_rate": 1.8567253543038564e-05, "loss": 1.1714, "step": 5459 }, { "epoch": 2.0327861223255113, "grad_norm": 0.17770934104919434, "learning_rate": 1.8566626652575257e-05, "loss": 1.1844, "step": 5460 }, { "epoch": 2.03315842747612, "grad_norm": 0.1648428589105606, "learning_rate": 1.856599963558341e-05, "loss": 1.197, "step": 5461 }, { "epoch": 2.0335307326267293, "grad_norm": 0.16865648329257965, "learning_rate": 1.8565372492072288e-05, "loss": 1.2002, "step": 5462 }, { "epoch": 2.033903037777338, "grad_norm": 0.1726176142692566, "learning_rate": 1.856474522205115e-05, "loss": 1.1789, "step": 5463 }, { "epoch": 2.0342753429279474, "grad_norm": 0.1659863144159317, "learning_rate": 1.856411782552926e-05, "loss": 1.1927, "step": 5464 }, { "epoch": 2.0346476480785562, "grad_norm": 0.17737741768360138, "learning_rate": 1.856349030251589e-05, "loss": 1.1853, "step": 5465 }, { "epoch": 2.0350199532291655, "grad_norm": 0.1588411182165146, "learning_rate": 1.8562862653020306e-05, "loss": 1.1897, "step": 5466 }, { "epoch": 2.0353922583797743, "grad_norm": 0.200198695063591, "learning_rate": 1.8562234877051778e-05, "loss": 1.1845, "step": 5467 }, { "epoch": 2.0357645635303836, "grad_norm": 0.18898078799247742, "learning_rate": 1.8561606974619577e-05, "loss": 1.1717, "step": 5468 }, { "epoch": 2.036136868680993, "grad_norm": 0.17676281929016113, "learning_rate": 1.856097894573298e-05, "loss": 1.1838, "step": 5469 }, { "epoch": 2.0365091738316017, "grad_norm": 0.23812207579612732, "learning_rate": 1.856035079040126e-05, "loss": 1.1728, "step": 5470 }, { "epoch": 2.036881478982211, "grad_norm": 0.17304615676403046, "learning_rate": 1.8559722508633698e-05, "loss": 1.1913, "step": 5471 }, { "epoch": 2.0372537841328198, "grad_norm": 0.17756541073322296, "learning_rate": 1.8559094100439568e-05, "loss": 1.1787, "step": 5472 }, { "epoch": 2.037626089283429, "grad_norm": 0.16720567643642426, "learning_rate": 1.8558465565828156e-05, "loss": 1.1778, "step": 5473 }, { "epoch": 2.037998394434038, "grad_norm": 0.19061878323554993, "learning_rate": 1.855783690480875e-05, "loss": 1.1798, "step": 5474 }, { "epoch": 2.038370699584647, "grad_norm": 0.16539441049098969, "learning_rate": 1.8557208117390626e-05, "loss": 1.1939, "step": 5475 }, { "epoch": 2.038743004735256, "grad_norm": 0.16110876202583313, "learning_rate": 1.8556579203583075e-05, "loss": 1.1937, "step": 5476 }, { "epoch": 2.039115309885865, "grad_norm": 0.16478274762630463, "learning_rate": 1.8555950163395383e-05, "loss": 1.189, "step": 5477 }, { "epoch": 2.0394876150364745, "grad_norm": 0.16724629700183868, "learning_rate": 1.855532099683685e-05, "loss": 1.1846, "step": 5478 }, { "epoch": 2.0398599201870833, "grad_norm": 0.1901393085718155, "learning_rate": 1.855469170391676e-05, "loss": 1.1695, "step": 5479 }, { "epoch": 2.0402322253376926, "grad_norm": 0.16991356015205383, "learning_rate": 1.855406228464441e-05, "loss": 1.2068, "step": 5480 }, { "epoch": 2.0406045304883014, "grad_norm": 0.1755204200744629, "learning_rate": 1.85534327390291e-05, "loss": 1.178, "step": 5481 }, { "epoch": 2.0409768356389106, "grad_norm": 0.17370069026947021, "learning_rate": 1.855280306708012e-05, "loss": 1.1727, "step": 5482 }, { "epoch": 2.0413491407895195, "grad_norm": 0.17671510577201843, "learning_rate": 1.8552173268806778e-05, "loss": 1.1941, "step": 5483 }, { "epoch": 2.0417214459401287, "grad_norm": 0.17848750948905945, "learning_rate": 1.8551543344218372e-05, "loss": 1.169, "step": 5484 }, { "epoch": 2.0420937510907375, "grad_norm": 0.18597623705863953, "learning_rate": 1.855091329332421e-05, "loss": 1.1696, "step": 5485 }, { "epoch": 2.042466056241347, "grad_norm": 0.19254527986049652, "learning_rate": 1.8550283116133595e-05, "loss": 1.1791, "step": 5486 }, { "epoch": 2.042838361391956, "grad_norm": 0.17649273574352264, "learning_rate": 1.8549652812655836e-05, "loss": 1.1929, "step": 5487 }, { "epoch": 2.043210666542565, "grad_norm": 0.16401410102844238, "learning_rate": 1.854902238290024e-05, "loss": 1.1727, "step": 5488 }, { "epoch": 2.043582971693174, "grad_norm": 0.16937775909900665, "learning_rate": 1.854839182687612e-05, "loss": 1.1872, "step": 5489 }, { "epoch": 2.043955276843783, "grad_norm": 0.16704395413398743, "learning_rate": 1.854776114459279e-05, "loss": 1.1857, "step": 5490 }, { "epoch": 2.0443275819943922, "grad_norm": 0.16636380553245544, "learning_rate": 1.8547130336059562e-05, "loss": 1.1877, "step": 5491 }, { "epoch": 2.044699887145001, "grad_norm": 0.17131738364696503, "learning_rate": 1.8546499401285755e-05, "loss": 1.1872, "step": 5492 }, { "epoch": 2.0450721922956103, "grad_norm": 0.1786455512046814, "learning_rate": 1.854586834028069e-05, "loss": 1.1894, "step": 5493 }, { "epoch": 2.045444497446219, "grad_norm": 0.17036451399326324, "learning_rate": 1.8545237153053688e-05, "loss": 1.1865, "step": 5494 }, { "epoch": 2.0458168025968284, "grad_norm": 0.17519360780715942, "learning_rate": 1.8544605839614066e-05, "loss": 1.1922, "step": 5495 }, { "epoch": 2.0461891077474377, "grad_norm": 0.17141428589820862, "learning_rate": 1.8543974399971153e-05, "loss": 1.1991, "step": 5496 }, { "epoch": 2.0465614128980465, "grad_norm": 0.17702911794185638, "learning_rate": 1.8543342834134276e-05, "loss": 1.1764, "step": 5497 }, { "epoch": 2.0469337180486558, "grad_norm": 0.16847530007362366, "learning_rate": 1.854271114211276e-05, "loss": 1.1831, "step": 5498 }, { "epoch": 2.0473060231992646, "grad_norm": 0.17060035467147827, "learning_rate": 1.8542079323915935e-05, "loss": 1.2039, "step": 5499 }, { "epoch": 2.047678328349874, "grad_norm": 0.1642797440290451, "learning_rate": 1.8541447379553136e-05, "loss": 1.1645, "step": 5500 }, { "epoch": 2.047678328349874, "eval_loss": 1.3067960739135742, "eval_runtime": 17.3329, "eval_samples_per_second": 100.041, "eval_steps_per_second": 5.019, "step": 5500 }, { "epoch": 2.0480506335004827, "grad_norm": 0.2160232812166214, "learning_rate": 1.8540815309033697e-05, "loss": 1.2004, "step": 5501 }, { "epoch": 2.048422938651092, "grad_norm": 0.1832730919122696, "learning_rate": 1.854018311236695e-05, "loss": 1.1862, "step": 5502 }, { "epoch": 2.0487952438017007, "grad_norm": 0.18487650156021118, "learning_rate": 1.8539550789562234e-05, "loss": 1.2079, "step": 5503 }, { "epoch": 2.04916754895231, "grad_norm": 0.16732271015644073, "learning_rate": 1.853891834062889e-05, "loss": 1.1865, "step": 5504 }, { "epoch": 2.0495398541029193, "grad_norm": 0.21880120038986206, "learning_rate": 1.853828576557626e-05, "loss": 1.1793, "step": 5505 }, { "epoch": 2.049912159253528, "grad_norm": 0.16671521961688995, "learning_rate": 1.853765306441368e-05, "loss": 1.1817, "step": 5506 }, { "epoch": 2.0502844644041374, "grad_norm": 0.17051751911640167, "learning_rate": 1.8537020237150503e-05, "loss": 1.1646, "step": 5507 }, { "epoch": 2.050656769554746, "grad_norm": 0.16859528422355652, "learning_rate": 1.853638728379607e-05, "loss": 1.1764, "step": 5508 }, { "epoch": 2.0510290747053554, "grad_norm": 0.16496269404888153, "learning_rate": 1.8535754204359737e-05, "loss": 1.183, "step": 5509 }, { "epoch": 2.0514013798559643, "grad_norm": 0.16721811890602112, "learning_rate": 1.853512099885085e-05, "loss": 1.2113, "step": 5510 }, { "epoch": 2.0517736850065735, "grad_norm": 0.17551200091838837, "learning_rate": 1.8534487667278757e-05, "loss": 1.177, "step": 5511 }, { "epoch": 2.052145990157183, "grad_norm": 0.16415713727474213, "learning_rate": 1.853385420965282e-05, "loss": 1.1782, "step": 5512 }, { "epoch": 2.0525182953077916, "grad_norm": 0.16708432137966156, "learning_rate": 1.8533220625982392e-05, "loss": 1.2093, "step": 5513 }, { "epoch": 2.052890600458401, "grad_norm": 0.16967132687568665, "learning_rate": 1.8532586916276828e-05, "loss": 1.18, "step": 5514 }, { "epoch": 2.0532629056090097, "grad_norm": 0.17362266778945923, "learning_rate": 1.8531953080545494e-05, "loss": 1.1906, "step": 5515 }, { "epoch": 2.053635210759619, "grad_norm": 0.1716122329235077, "learning_rate": 1.853131911879775e-05, "loss": 1.1809, "step": 5516 }, { "epoch": 2.054007515910228, "grad_norm": 0.1693599373102188, "learning_rate": 1.8530685031042952e-05, "loss": 1.1829, "step": 5517 }, { "epoch": 2.054379821060837, "grad_norm": 0.1685894876718521, "learning_rate": 1.8530050817290477e-05, "loss": 1.1855, "step": 5518 }, { "epoch": 2.054752126211446, "grad_norm": 0.18109537661075592, "learning_rate": 1.852941647754968e-05, "loss": 1.1782, "step": 5519 }, { "epoch": 2.055124431362055, "grad_norm": 0.17635756731033325, "learning_rate": 1.8528782011829945e-05, "loss": 1.1857, "step": 5520 }, { "epoch": 2.0554967365126644, "grad_norm": 0.1745537966489792, "learning_rate": 1.852814742014063e-05, "loss": 1.1963, "step": 5521 }, { "epoch": 2.0558690416632732, "grad_norm": 0.17796571552753448, "learning_rate": 1.8527512702491116e-05, "loss": 1.1818, "step": 5522 }, { "epoch": 2.0562413468138825, "grad_norm": 0.17338848114013672, "learning_rate": 1.852687785889077e-05, "loss": 1.1864, "step": 5523 }, { "epoch": 2.0566136519644913, "grad_norm": 0.18581651151180267, "learning_rate": 1.8526242889348976e-05, "loss": 1.1952, "step": 5524 }, { "epoch": 2.0569859571151006, "grad_norm": 0.1682850867509842, "learning_rate": 1.852560779387511e-05, "loss": 1.1846, "step": 5525 }, { "epoch": 2.0573582622657094, "grad_norm": 0.19426992535591125, "learning_rate": 1.8524972572478554e-05, "loss": 1.1823, "step": 5526 }, { "epoch": 2.0577305674163187, "grad_norm": 0.17708083987236023, "learning_rate": 1.8524337225168682e-05, "loss": 1.1776, "step": 5527 }, { "epoch": 2.0581028725669275, "grad_norm": 0.1654316484928131, "learning_rate": 1.852370175195489e-05, "loss": 1.1824, "step": 5528 }, { "epoch": 2.0584751777175367, "grad_norm": 0.20494744181632996, "learning_rate": 1.8523066152846552e-05, "loss": 1.1866, "step": 5529 }, { "epoch": 2.058847482868146, "grad_norm": 0.17771178483963013, "learning_rate": 1.852243042785307e-05, "loss": 1.1918, "step": 5530 }, { "epoch": 2.059219788018755, "grad_norm": 0.17502543330192566, "learning_rate": 1.852179457698382e-05, "loss": 1.1821, "step": 5531 }, { "epoch": 2.059592093169364, "grad_norm": 0.1928001195192337, "learning_rate": 1.85211586002482e-05, "loss": 1.1901, "step": 5532 }, { "epoch": 2.059964398319973, "grad_norm": 0.16364991664886475, "learning_rate": 1.85205224976556e-05, "loss": 1.1747, "step": 5533 }, { "epoch": 2.060336703470582, "grad_norm": 0.1756642460823059, "learning_rate": 1.851988626921542e-05, "loss": 1.1723, "step": 5534 }, { "epoch": 2.060709008621191, "grad_norm": 0.19008730351924896, "learning_rate": 1.8519249914937056e-05, "loss": 1.1996, "step": 5535 }, { "epoch": 2.0610813137718003, "grad_norm": 0.17250213027000427, "learning_rate": 1.85186134348299e-05, "loss": 1.1882, "step": 5536 }, { "epoch": 2.061453618922409, "grad_norm": 0.1646471917629242, "learning_rate": 1.8517976828903365e-05, "loss": 1.1918, "step": 5537 }, { "epoch": 2.0618259240730183, "grad_norm": 0.1910063773393631, "learning_rate": 1.851734009716684e-05, "loss": 1.187, "step": 5538 }, { "epoch": 2.0621982292236276, "grad_norm": 0.19279128313064575, "learning_rate": 1.8516703239629744e-05, "loss": 1.1942, "step": 5539 }, { "epoch": 2.0625705343742364, "grad_norm": 0.16666115820407867, "learning_rate": 1.8516066256301468e-05, "loss": 1.1883, "step": 5540 }, { "epoch": 2.0629428395248457, "grad_norm": 0.20289574563503265, "learning_rate": 1.8515429147191434e-05, "loss": 1.1917, "step": 5541 }, { "epoch": 2.0633151446754545, "grad_norm": 0.18597279489040375, "learning_rate": 1.851479191230904e-05, "loss": 1.1754, "step": 5542 }, { "epoch": 2.063687449826064, "grad_norm": 0.17310966551303864, "learning_rate": 1.851415455166371e-05, "loss": 1.1892, "step": 5543 }, { "epoch": 2.0640597549766726, "grad_norm": 0.20302632451057434, "learning_rate": 1.851351706526485e-05, "loss": 1.1921, "step": 5544 }, { "epoch": 2.064432060127282, "grad_norm": 0.17062672972679138, "learning_rate": 1.8512879453121874e-05, "loss": 1.1821, "step": 5545 }, { "epoch": 2.0648043652778907, "grad_norm": 0.16479294002056122, "learning_rate": 1.8512241715244203e-05, "loss": 1.1813, "step": 5546 }, { "epoch": 2.0651766704285, "grad_norm": 0.17303155362606049, "learning_rate": 1.8511603851641256e-05, "loss": 1.1988, "step": 5547 }, { "epoch": 2.065548975579109, "grad_norm": 0.17052410542964935, "learning_rate": 1.8510965862322455e-05, "loss": 1.1997, "step": 5548 }, { "epoch": 2.065921280729718, "grad_norm": 0.15899476408958435, "learning_rate": 1.8510327747297225e-05, "loss": 1.1912, "step": 5549 }, { "epoch": 2.0662935858803273, "grad_norm": 0.18540219962596893, "learning_rate": 1.8509689506574986e-05, "loss": 1.198, "step": 5550 }, { "epoch": 2.066665891030936, "grad_norm": 0.1953384429216385, "learning_rate": 1.8509051140165167e-05, "loss": 1.1713, "step": 5551 }, { "epoch": 2.0670381961815454, "grad_norm": 0.1603497862815857, "learning_rate": 1.85084126480772e-05, "loss": 1.1924, "step": 5552 }, { "epoch": 2.067410501332154, "grad_norm": 0.3583022654056549, "learning_rate": 1.8507774030320508e-05, "loss": 1.1836, "step": 5553 }, { "epoch": 2.0677828064827635, "grad_norm": 0.1765451580286026, "learning_rate": 1.8507135286904527e-05, "loss": 1.1841, "step": 5554 }, { "epoch": 2.0681551116333723, "grad_norm": 0.17876610159873962, "learning_rate": 1.8506496417838695e-05, "loss": 1.1916, "step": 5555 }, { "epoch": 2.0685274167839816, "grad_norm": 0.17126069962978363, "learning_rate": 1.8505857423132447e-05, "loss": 1.1818, "step": 5556 }, { "epoch": 2.068899721934591, "grad_norm": 0.1569199413061142, "learning_rate": 1.8505218302795213e-05, "loss": 1.193, "step": 5557 }, { "epoch": 2.0692720270851996, "grad_norm": 0.17595240473747253, "learning_rate": 1.8504579056836437e-05, "loss": 1.1999, "step": 5558 }, { "epoch": 2.069644332235809, "grad_norm": 0.18248499929904938, "learning_rate": 1.850393968526557e-05, "loss": 1.1813, "step": 5559 }, { "epoch": 2.0700166373864177, "grad_norm": 0.17279589176177979, "learning_rate": 1.850330018809204e-05, "loss": 1.1837, "step": 5560 }, { "epoch": 2.070388942537027, "grad_norm": 0.16098688542842865, "learning_rate": 1.8502660565325302e-05, "loss": 1.1816, "step": 5561 }, { "epoch": 2.070761247687636, "grad_norm": 0.16793614625930786, "learning_rate": 1.8502020816974806e-05, "loss": 1.1893, "step": 5562 }, { "epoch": 2.071133552838245, "grad_norm": 0.17572426795959473, "learning_rate": 1.850138094304999e-05, "loss": 1.182, "step": 5563 }, { "epoch": 2.0715058579888543, "grad_norm": 0.1719566136598587, "learning_rate": 1.850074094356031e-05, "loss": 1.191, "step": 5564 }, { "epoch": 2.071878163139463, "grad_norm": 0.16841794550418854, "learning_rate": 1.8500100818515224e-05, "loss": 1.1896, "step": 5565 }, { "epoch": 2.0722504682900724, "grad_norm": 0.16912922263145447, "learning_rate": 1.8499460567924182e-05, "loss": 1.1908, "step": 5566 }, { "epoch": 2.0726227734406812, "grad_norm": 0.1644641011953354, "learning_rate": 1.849882019179664e-05, "loss": 1.1758, "step": 5567 }, { "epoch": 2.0729950785912905, "grad_norm": 0.16581396758556366, "learning_rate": 1.8498179690142057e-05, "loss": 1.1841, "step": 5568 }, { "epoch": 2.0733673837418993, "grad_norm": 0.165495365858078, "learning_rate": 1.8497539062969893e-05, "loss": 1.1847, "step": 5569 }, { "epoch": 2.0737396888925086, "grad_norm": 0.16472892463207245, "learning_rate": 1.849689831028961e-05, "loss": 1.1978, "step": 5570 }, { "epoch": 2.0741119940431174, "grad_norm": 0.1632337123155594, "learning_rate": 1.8496257432110673e-05, "loss": 1.1641, "step": 5571 }, { "epoch": 2.0744842991937267, "grad_norm": 0.16693541407585144, "learning_rate": 1.8495616428442546e-05, "loss": 1.178, "step": 5572 }, { "epoch": 2.074856604344336, "grad_norm": 0.17431595921516418, "learning_rate": 1.84949752992947e-05, "loss": 1.1958, "step": 5573 }, { "epoch": 2.0752289094949448, "grad_norm": 0.1731480062007904, "learning_rate": 1.84943340446766e-05, "loss": 1.1769, "step": 5574 }, { "epoch": 2.075601214645554, "grad_norm": 0.16712194681167603, "learning_rate": 1.849369266459772e-05, "loss": 1.1871, "step": 5575 }, { "epoch": 2.075973519796163, "grad_norm": 0.16776369512081146, "learning_rate": 1.849305115906753e-05, "loss": 1.1734, "step": 5576 }, { "epoch": 2.076345824946772, "grad_norm": 0.17154593765735626, "learning_rate": 1.849240952809551e-05, "loss": 1.1959, "step": 5577 }, { "epoch": 2.076718130097381, "grad_norm": 0.16923579573631287, "learning_rate": 1.8491767771691133e-05, "loss": 1.1937, "step": 5578 }, { "epoch": 2.07709043524799, "grad_norm": 0.17263515293598175, "learning_rate": 1.849112588986388e-05, "loss": 1.1912, "step": 5579 }, { "epoch": 2.077462740398599, "grad_norm": 0.16480253636837006, "learning_rate": 1.849048388262323e-05, "loss": 1.1821, "step": 5580 }, { "epoch": 2.0778350455492083, "grad_norm": 0.16835813224315643, "learning_rate": 1.8489841749978668e-05, "loss": 1.1754, "step": 5581 }, { "epoch": 2.0782073506998175, "grad_norm": 0.17042113840579987, "learning_rate": 1.848919949193967e-05, "loss": 1.1767, "step": 5582 }, { "epoch": 2.0785796558504264, "grad_norm": 0.17123256623744965, "learning_rate": 1.8488557108515736e-05, "loss": 1.1778, "step": 5583 }, { "epoch": 2.0789519610010356, "grad_norm": 0.16714586317539215, "learning_rate": 1.848791459971634e-05, "loss": 1.183, "step": 5584 }, { "epoch": 2.0793242661516445, "grad_norm": 0.16388732194900513, "learning_rate": 1.848727196555098e-05, "loss": 1.1881, "step": 5585 }, { "epoch": 2.0796965713022537, "grad_norm": 0.1711377501487732, "learning_rate": 1.8486629206029146e-05, "loss": 1.1975, "step": 5586 }, { "epoch": 2.0800688764528625, "grad_norm": 0.16601723432540894, "learning_rate": 1.8485986321160335e-05, "loss": 1.1854, "step": 5587 }, { "epoch": 2.080441181603472, "grad_norm": 0.16255028545856476, "learning_rate": 1.8485343310954033e-05, "loss": 1.1762, "step": 5588 }, { "epoch": 2.0808134867540806, "grad_norm": 0.1604170799255371, "learning_rate": 1.8484700175419747e-05, "loss": 1.1835, "step": 5589 }, { "epoch": 2.08118579190469, "grad_norm": 0.16844023764133453, "learning_rate": 1.8484056914566967e-05, "loss": 1.189, "step": 5590 }, { "epoch": 2.081558097055299, "grad_norm": 0.1678459644317627, "learning_rate": 1.84834135284052e-05, "loss": 1.1828, "step": 5591 }, { "epoch": 2.081930402205908, "grad_norm": 0.16233506798744202, "learning_rate": 1.8482770016943952e-05, "loss": 1.1749, "step": 5592 }, { "epoch": 2.0823027073565172, "grad_norm": 0.16579484939575195, "learning_rate": 1.848212638019272e-05, "loss": 1.1794, "step": 5593 }, { "epoch": 2.082675012507126, "grad_norm": 0.16897903382778168, "learning_rate": 1.8481482618161016e-05, "loss": 1.2122, "step": 5594 }, { "epoch": 2.0830473176577353, "grad_norm": 0.16713303327560425, "learning_rate": 1.848083873085834e-05, "loss": 1.1682, "step": 5595 }, { "epoch": 2.083419622808344, "grad_norm": 0.1653788685798645, "learning_rate": 1.8480194718294212e-05, "loss": 1.183, "step": 5596 }, { "epoch": 2.0837919279589534, "grad_norm": 0.15681540966033936, "learning_rate": 1.847955058047814e-05, "loss": 1.1812, "step": 5597 }, { "epoch": 2.0841642331095622, "grad_norm": 0.16332903504371643, "learning_rate": 1.8478906317419644e-05, "loss": 1.1776, "step": 5598 }, { "epoch": 2.0845365382601715, "grad_norm": 0.16454966366291046, "learning_rate": 1.8478261929128226e-05, "loss": 1.1807, "step": 5599 }, { "epoch": 2.0849088434107808, "grad_norm": 0.16205929219722748, "learning_rate": 1.8477617415613413e-05, "loss": 1.1902, "step": 5600 }, { "epoch": 2.0852811485613896, "grad_norm": 0.16746239364147186, "learning_rate": 1.8476972776884724e-05, "loss": 1.2076, "step": 5601 }, { "epoch": 2.085653453711999, "grad_norm": 0.1641511619091034, "learning_rate": 1.8476328012951677e-05, "loss": 1.1847, "step": 5602 }, { "epoch": 2.0860257588626077, "grad_norm": 0.16240178048610687, "learning_rate": 1.84756831238238e-05, "loss": 1.1733, "step": 5603 }, { "epoch": 2.086398064013217, "grad_norm": 0.16634000837802887, "learning_rate": 1.8475038109510612e-05, "loss": 1.1749, "step": 5604 }, { "epoch": 2.0867703691638257, "grad_norm": 0.17073014378547668, "learning_rate": 1.847439297002165e-05, "loss": 1.1751, "step": 5605 }, { "epoch": 2.087142674314435, "grad_norm": 0.15842556953430176, "learning_rate": 1.8473747705366427e-05, "loss": 1.1722, "step": 5606 }, { "epoch": 2.087514979465044, "grad_norm": 0.15961557626724243, "learning_rate": 1.8473102315554484e-05, "loss": 1.1957, "step": 5607 }, { "epoch": 2.087887284615653, "grad_norm": 0.1633174568414688, "learning_rate": 1.8472456800595355e-05, "loss": 1.1652, "step": 5608 }, { "epoch": 2.0882595897662624, "grad_norm": 0.17014336585998535, "learning_rate": 1.847181116049857e-05, "loss": 1.1878, "step": 5609 }, { "epoch": 2.088631894916871, "grad_norm": 0.16674593091011047, "learning_rate": 1.8471165395273666e-05, "loss": 1.1844, "step": 5610 }, { "epoch": 2.0890042000674804, "grad_norm": 0.16320446133613586, "learning_rate": 1.8470519504930178e-05, "loss": 1.1932, "step": 5611 }, { "epoch": 2.0893765052180893, "grad_norm": 0.1698993742465973, "learning_rate": 1.846987348947765e-05, "loss": 1.1791, "step": 5612 }, { "epoch": 2.0897488103686985, "grad_norm": 0.16859601438045502, "learning_rate": 1.8469227348925624e-05, "loss": 1.1977, "step": 5613 }, { "epoch": 2.0901211155193073, "grad_norm": 0.16423748433589935, "learning_rate": 1.846858108328364e-05, "loss": 1.1899, "step": 5614 }, { "epoch": 2.0904934206699166, "grad_norm": 0.15783381462097168, "learning_rate": 1.8467934692561242e-05, "loss": 1.1945, "step": 5615 }, { "epoch": 2.0908657258205254, "grad_norm": 0.16708320379257202, "learning_rate": 1.8467288176767986e-05, "loss": 1.1789, "step": 5616 }, { "epoch": 2.0912380309711347, "grad_norm": 0.16855353116989136, "learning_rate": 1.846664153591341e-05, "loss": 1.1807, "step": 5617 }, { "epoch": 2.091610336121744, "grad_norm": 0.17245307564735413, "learning_rate": 1.846599477000707e-05, "loss": 1.1927, "step": 5618 }, { "epoch": 2.091982641272353, "grad_norm": 0.1593155562877655, "learning_rate": 1.8465347879058524e-05, "loss": 1.1677, "step": 5619 }, { "epoch": 2.092354946422962, "grad_norm": 0.16127386689186096, "learning_rate": 1.8464700863077313e-05, "loss": 1.1698, "step": 5620 }, { "epoch": 2.092727251573571, "grad_norm": 0.16885069012641907, "learning_rate": 1.846405372207301e-05, "loss": 1.1863, "step": 5621 }, { "epoch": 2.09309955672418, "grad_norm": 0.16606591641902924, "learning_rate": 1.846340645605516e-05, "loss": 1.1731, "step": 5622 }, { "epoch": 2.093471861874789, "grad_norm": 0.16918626427650452, "learning_rate": 1.8462759065033326e-05, "loss": 1.1936, "step": 5623 }, { "epoch": 2.093844167025398, "grad_norm": 0.16348494589328766, "learning_rate": 1.8462111549017074e-05, "loss": 1.1815, "step": 5624 }, { "epoch": 2.094216472176007, "grad_norm": 0.16731733083724976, "learning_rate": 1.8461463908015967e-05, "loss": 1.2031, "step": 5625 }, { "epoch": 2.0945887773266163, "grad_norm": 0.1636827141046524, "learning_rate": 1.8460816142039566e-05, "loss": 1.1862, "step": 5626 }, { "epoch": 2.0949610824772256, "grad_norm": 0.1637611985206604, "learning_rate": 1.8460168251097444e-05, "loss": 1.1915, "step": 5627 }, { "epoch": 2.0953333876278344, "grad_norm": 0.16247594356536865, "learning_rate": 1.8459520235199165e-05, "loss": 1.1826, "step": 5628 }, { "epoch": 2.0957056927784437, "grad_norm": 0.1628650724887848, "learning_rate": 1.8458872094354307e-05, "loss": 1.1848, "step": 5629 }, { "epoch": 2.0960779979290525, "grad_norm": 0.16687503457069397, "learning_rate": 1.8458223828572435e-05, "loss": 1.1787, "step": 5630 }, { "epoch": 2.0964503030796617, "grad_norm": 0.16528525948524475, "learning_rate": 1.8457575437863134e-05, "loss": 1.1788, "step": 5631 }, { "epoch": 2.0968226082302706, "grad_norm": 0.16622121632099152, "learning_rate": 1.845692692223597e-05, "loss": 1.1934, "step": 5632 }, { "epoch": 2.09719491338088, "grad_norm": 0.1611912101507187, "learning_rate": 1.8456278281700527e-05, "loss": 1.1901, "step": 5633 }, { "epoch": 2.097567218531489, "grad_norm": 0.1634882390499115, "learning_rate": 1.845562951626638e-05, "loss": 1.1897, "step": 5634 }, { "epoch": 2.097939523682098, "grad_norm": 0.1702599674463272, "learning_rate": 1.8454980625943122e-05, "loss": 1.2001, "step": 5635 }, { "epoch": 2.098311828832707, "grad_norm": 0.15951082110404968, "learning_rate": 1.845433161074033e-05, "loss": 1.1773, "step": 5636 }, { "epoch": 2.098684133983316, "grad_norm": 0.16114339232444763, "learning_rate": 1.845368247066759e-05, "loss": 1.1853, "step": 5637 }, { "epoch": 2.0990564391339253, "grad_norm": 0.17119161784648895, "learning_rate": 1.845303320573449e-05, "loss": 1.1879, "step": 5638 }, { "epoch": 2.099428744284534, "grad_norm": 0.1637708991765976, "learning_rate": 1.8452383815950616e-05, "loss": 1.1693, "step": 5639 }, { "epoch": 2.0998010494351433, "grad_norm": 0.16330254077911377, "learning_rate": 1.845173430132557e-05, "loss": 1.1775, "step": 5640 }, { "epoch": 2.100173354585752, "grad_norm": 0.16474707424640656, "learning_rate": 1.8451084661868936e-05, "loss": 1.166, "step": 5641 }, { "epoch": 2.1005456597363614, "grad_norm": 0.16186347603797913, "learning_rate": 1.845043489759031e-05, "loss": 1.1778, "step": 5642 }, { "epoch": 2.1009179648869707, "grad_norm": 0.1683456301689148, "learning_rate": 1.844978500849929e-05, "loss": 1.1905, "step": 5643 }, { "epoch": 2.1012902700375795, "grad_norm": 0.15968085825443268, "learning_rate": 1.8449134994605483e-05, "loss": 1.1779, "step": 5644 }, { "epoch": 2.1016625751881888, "grad_norm": 0.15909917652606964, "learning_rate": 1.8448484855918476e-05, "loss": 1.1993, "step": 5645 }, { "epoch": 2.1020348803387976, "grad_norm": 0.16778358817100525, "learning_rate": 1.844783459244788e-05, "loss": 1.2041, "step": 5646 }, { "epoch": 2.102407185489407, "grad_norm": 0.1642913818359375, "learning_rate": 1.8447184204203297e-05, "loss": 1.1888, "step": 5647 }, { "epoch": 2.1027794906400157, "grad_norm": 0.1615377813577652, "learning_rate": 1.8446533691194332e-05, "loss": 1.1834, "step": 5648 }, { "epoch": 2.103151795790625, "grad_norm": 0.16255392134189606, "learning_rate": 1.8445883053430597e-05, "loss": 1.167, "step": 5649 }, { "epoch": 2.1035241009412338, "grad_norm": 0.1646036058664322, "learning_rate": 1.84452322909217e-05, "loss": 1.1865, "step": 5650 }, { "epoch": 2.103896406091843, "grad_norm": 0.17090143263339996, "learning_rate": 1.844458140367725e-05, "loss": 1.1907, "step": 5651 }, { "epoch": 2.1042687112424523, "grad_norm": 0.17261987924575806, "learning_rate": 1.8443930391706863e-05, "loss": 1.194, "step": 5652 }, { "epoch": 2.104641016393061, "grad_norm": 0.166230246424675, "learning_rate": 1.8443279255020153e-05, "loss": 1.188, "step": 5653 }, { "epoch": 2.1050133215436704, "grad_norm": 0.168045312166214, "learning_rate": 1.844262799362674e-05, "loss": 1.1929, "step": 5654 }, { "epoch": 2.105385626694279, "grad_norm": 0.1657107025384903, "learning_rate": 1.844197660753624e-05, "loss": 1.1787, "step": 5655 }, { "epoch": 2.1057579318448885, "grad_norm": 0.1633072793483734, "learning_rate": 1.8441325096758275e-05, "loss": 1.1949, "step": 5656 }, { "epoch": 2.1061302369954973, "grad_norm": 0.16375941038131714, "learning_rate": 1.844067346130247e-05, "loss": 1.1742, "step": 5657 }, { "epoch": 2.1065025421461065, "grad_norm": 0.16451074182987213, "learning_rate": 1.8440021701178445e-05, "loss": 1.1873, "step": 5658 }, { "epoch": 2.1068748472967154, "grad_norm": 0.1651698499917984, "learning_rate": 1.843936981639583e-05, "loss": 1.1857, "step": 5659 }, { "epoch": 2.1072471524473246, "grad_norm": 0.1603275090456009, "learning_rate": 1.8438717806964254e-05, "loss": 1.1923, "step": 5660 }, { "epoch": 2.107619457597934, "grad_norm": 0.16892802715301514, "learning_rate": 1.8438065672893348e-05, "loss": 1.1954, "step": 5661 }, { "epoch": 2.1079917627485427, "grad_norm": 0.1669367104768753, "learning_rate": 1.8437413414192734e-05, "loss": 1.19, "step": 5662 }, { "epoch": 2.108364067899152, "grad_norm": 0.16005094349384308, "learning_rate": 1.843676103087206e-05, "loss": 1.1905, "step": 5663 }, { "epoch": 2.108736373049761, "grad_norm": 0.16192243993282318, "learning_rate": 1.8436108522940953e-05, "loss": 1.1865, "step": 5664 }, { "epoch": 2.10910867820037, "grad_norm": 0.16024897992610931, "learning_rate": 1.8435455890409054e-05, "loss": 1.1641, "step": 5665 }, { "epoch": 2.109480983350979, "grad_norm": 0.1648632138967514, "learning_rate": 1.8434803133285998e-05, "loss": 1.1792, "step": 5666 }, { "epoch": 2.109853288501588, "grad_norm": 0.15843743085861206, "learning_rate": 1.843415025158143e-05, "loss": 1.1728, "step": 5667 }, { "epoch": 2.110225593652197, "grad_norm": 0.1614643931388855, "learning_rate": 1.843349724530499e-05, "loss": 1.1726, "step": 5668 }, { "epoch": 2.1105978988028062, "grad_norm": 0.16818486154079437, "learning_rate": 1.843284411446633e-05, "loss": 1.1832, "step": 5669 }, { "epoch": 2.1109702039534155, "grad_norm": 0.16501553356647491, "learning_rate": 1.8432190859075088e-05, "loss": 1.1941, "step": 5670 }, { "epoch": 2.1113425091040243, "grad_norm": 0.16194112598896027, "learning_rate": 1.8431537479140916e-05, "loss": 1.1829, "step": 5671 }, { "epoch": 2.1117148142546336, "grad_norm": 0.1625380516052246, "learning_rate": 1.8430883974673467e-05, "loss": 1.1866, "step": 5672 }, { "epoch": 2.1120871194052424, "grad_norm": 0.16628055274486542, "learning_rate": 1.8430230345682393e-05, "loss": 1.167, "step": 5673 }, { "epoch": 2.1124594245558517, "grad_norm": 0.16523656249046326, "learning_rate": 1.842957659217734e-05, "loss": 1.1693, "step": 5674 }, { "epoch": 2.1128317297064605, "grad_norm": 0.16235126554965973, "learning_rate": 1.842892271416797e-05, "loss": 1.1814, "step": 5675 }, { "epoch": 2.1132040348570698, "grad_norm": 0.16224858164787292, "learning_rate": 1.8428268711663943e-05, "loss": 1.1778, "step": 5676 }, { "epoch": 2.113576340007679, "grad_norm": 0.1565116047859192, "learning_rate": 1.842761458467492e-05, "loss": 1.1834, "step": 5677 }, { "epoch": 2.113948645158288, "grad_norm": 0.1609666496515274, "learning_rate": 1.842696033321055e-05, "loss": 1.188, "step": 5678 }, { "epoch": 2.114320950308897, "grad_norm": 0.17431986331939697, "learning_rate": 1.842630595728051e-05, "loss": 1.2025, "step": 5679 }, { "epoch": 2.114693255459506, "grad_norm": 0.16619716584682465, "learning_rate": 1.842565145689446e-05, "loss": 1.1908, "step": 5680 }, { "epoch": 2.115065560610115, "grad_norm": 0.16613732278347015, "learning_rate": 1.8424996832062066e-05, "loss": 1.1826, "step": 5681 }, { "epoch": 2.115437865760724, "grad_norm": 0.16963297128677368, "learning_rate": 1.8424342082792996e-05, "loss": 1.1904, "step": 5682 }, { "epoch": 2.1158101709113333, "grad_norm": 0.15998825430870056, "learning_rate": 1.8423687209096926e-05, "loss": 1.1873, "step": 5683 }, { "epoch": 2.116182476061942, "grad_norm": 0.15529409050941467, "learning_rate": 1.842303221098352e-05, "loss": 1.1879, "step": 5684 }, { "epoch": 2.1165547812125514, "grad_norm": 0.16346554458141327, "learning_rate": 1.8422377088462456e-05, "loss": 1.1873, "step": 5685 }, { "epoch": 2.1169270863631606, "grad_norm": 0.16552503407001495, "learning_rate": 1.8421721841543412e-05, "loss": 1.1971, "step": 5686 }, { "epoch": 2.1172993915137694, "grad_norm": 0.16993537545204163, "learning_rate": 1.842106647023607e-05, "loss": 1.2047, "step": 5687 }, { "epoch": 2.1176716966643787, "grad_norm": 0.16615287959575653, "learning_rate": 1.8420410974550103e-05, "loss": 1.1826, "step": 5688 }, { "epoch": 2.1180440018149875, "grad_norm": 0.16618463397026062, "learning_rate": 1.8419755354495194e-05, "loss": 1.1757, "step": 5689 }, { "epoch": 2.118416306965597, "grad_norm": 0.16526402533054352, "learning_rate": 1.8419099610081025e-05, "loss": 1.1833, "step": 5690 }, { "epoch": 2.1187886121162056, "grad_norm": 0.1653960645198822, "learning_rate": 1.841844374131728e-05, "loss": 1.176, "step": 5691 }, { "epoch": 2.119160917266815, "grad_norm": 0.16341429948806763, "learning_rate": 1.8417787748213655e-05, "loss": 1.1807, "step": 5692 }, { "epoch": 2.1195332224174237, "grad_norm": 0.15940816700458527, "learning_rate": 1.8417131630779834e-05, "loss": 1.1814, "step": 5693 }, { "epoch": 2.119905527568033, "grad_norm": 0.1656249612569809, "learning_rate": 1.84164753890255e-05, "loss": 1.1895, "step": 5694 }, { "epoch": 2.1202778327186422, "grad_norm": 0.1686115264892578, "learning_rate": 1.841581902296036e-05, "loss": 1.1888, "step": 5695 }, { "epoch": 2.120650137869251, "grad_norm": 0.1635192185640335, "learning_rate": 1.8415162532594096e-05, "loss": 1.183, "step": 5696 }, { "epoch": 2.1210224430198603, "grad_norm": 0.15994298458099365, "learning_rate": 1.841450591793641e-05, "loss": 1.1839, "step": 5697 }, { "epoch": 2.121394748170469, "grad_norm": 0.16956914961338043, "learning_rate": 1.8413849178997002e-05, "loss": 1.1802, "step": 5698 }, { "epoch": 2.1217670533210784, "grad_norm": 0.16558878123760223, "learning_rate": 1.841319231578557e-05, "loss": 1.2054, "step": 5699 }, { "epoch": 2.122139358471687, "grad_norm": 0.16168220341205597, "learning_rate": 1.8412535328311813e-05, "loss": 1.1613, "step": 5700 }, { "epoch": 2.1225116636222965, "grad_norm": 0.16287264227867126, "learning_rate": 1.8411878216585436e-05, "loss": 1.1889, "step": 5701 }, { "epoch": 2.1228839687729053, "grad_norm": 0.1619829684495926, "learning_rate": 1.841122098061615e-05, "loss": 1.1877, "step": 5702 }, { "epoch": 2.1232562739235146, "grad_norm": 0.16612021625041962, "learning_rate": 1.8410563620413658e-05, "loss": 1.1818, "step": 5703 }, { "epoch": 2.123628579074124, "grad_norm": 0.16354532539844513, "learning_rate": 1.8409906135987668e-05, "loss": 1.1882, "step": 5704 }, { "epoch": 2.1240008842247327, "grad_norm": 0.1652521938085556, "learning_rate": 1.8409248527347888e-05, "loss": 1.1854, "step": 5705 }, { "epoch": 2.124373189375342, "grad_norm": 0.1664263755083084, "learning_rate": 1.840859079450404e-05, "loss": 1.1875, "step": 5706 }, { "epoch": 2.1247454945259507, "grad_norm": 0.16455532610416412, "learning_rate": 1.8407932937465835e-05, "loss": 1.1903, "step": 5707 }, { "epoch": 2.12511779967656, "grad_norm": 0.1682821661233902, "learning_rate": 1.8407274956242983e-05, "loss": 1.1965, "step": 5708 }, { "epoch": 2.125490104827169, "grad_norm": 0.16767260432243347, "learning_rate": 1.840661685084521e-05, "loss": 1.178, "step": 5709 }, { "epoch": 2.125862409977778, "grad_norm": 0.16613049805164337, "learning_rate": 1.8405958621282232e-05, "loss": 1.172, "step": 5710 }, { "epoch": 2.126234715128387, "grad_norm": 0.16709165275096893, "learning_rate": 1.8405300267563774e-05, "loss": 1.1834, "step": 5711 }, { "epoch": 2.126607020278996, "grad_norm": 0.16999532282352448, "learning_rate": 1.840464178969956e-05, "loss": 1.1879, "step": 5712 }, { "epoch": 2.1269793254296054, "grad_norm": 0.1657310575246811, "learning_rate": 1.840398318769931e-05, "loss": 1.1959, "step": 5713 }, { "epoch": 2.1273516305802143, "grad_norm": 0.16610091924667358, "learning_rate": 1.8403324461572762e-05, "loss": 1.1929, "step": 5714 }, { "epoch": 2.1277239357308235, "grad_norm": 0.16511550545692444, "learning_rate": 1.8402665611329635e-05, "loss": 1.1819, "step": 5715 }, { "epoch": 2.1280962408814323, "grad_norm": 0.1685413420200348, "learning_rate": 1.8402006636979667e-05, "loss": 1.1871, "step": 5716 }, { "epoch": 2.1284685460320416, "grad_norm": 0.1703380048274994, "learning_rate": 1.8401347538532585e-05, "loss": 1.1795, "step": 5717 }, { "epoch": 2.1288408511826504, "grad_norm": 0.16510550677776337, "learning_rate": 1.8400688315998128e-05, "loss": 1.1991, "step": 5718 }, { "epoch": 2.1292131563332597, "grad_norm": 0.15952372550964355, "learning_rate": 1.8400028969386033e-05, "loss": 1.1844, "step": 5719 }, { "epoch": 2.1295854614838685, "grad_norm": 0.1626126617193222, "learning_rate": 1.8399369498706034e-05, "loss": 1.189, "step": 5720 }, { "epoch": 2.129957766634478, "grad_norm": 0.1677938997745514, "learning_rate": 1.8398709903967878e-05, "loss": 1.1905, "step": 5721 }, { "epoch": 2.130330071785087, "grad_norm": 0.16606491804122925, "learning_rate": 1.8398050185181303e-05, "loss": 1.1932, "step": 5722 }, { "epoch": 2.130702376935696, "grad_norm": 0.16483409702777863, "learning_rate": 1.8397390342356054e-05, "loss": 1.1819, "step": 5723 }, { "epoch": 2.131074682086305, "grad_norm": 0.15964972972869873, "learning_rate": 1.8396730375501877e-05, "loss": 1.1887, "step": 5724 }, { "epoch": 2.131446987236914, "grad_norm": 0.1669287085533142, "learning_rate": 1.8396070284628518e-05, "loss": 1.1811, "step": 5725 }, { "epoch": 2.131819292387523, "grad_norm": 0.16600339114665985, "learning_rate": 1.839541006974573e-05, "loss": 1.187, "step": 5726 }, { "epoch": 2.132191597538132, "grad_norm": 0.16649706661701202, "learning_rate": 1.839474973086326e-05, "loss": 1.1863, "step": 5727 }, { "epoch": 2.1325639026887413, "grad_norm": 0.16561785340309143, "learning_rate": 1.839408926799086e-05, "loss": 1.1911, "step": 5728 }, { "epoch": 2.13293620783935, "grad_norm": 0.16608673334121704, "learning_rate": 1.8393428681138298e-05, "loss": 1.1734, "step": 5729 }, { "epoch": 2.1333085129899594, "grad_norm": 0.16552896797657013, "learning_rate": 1.8392767970315314e-05, "loss": 1.1823, "step": 5730 }, { "epoch": 2.1336808181405686, "grad_norm": 0.16509748995304108, "learning_rate": 1.8392107135531674e-05, "loss": 1.1801, "step": 5731 }, { "epoch": 2.1340531232911775, "grad_norm": 0.16142834722995758, "learning_rate": 1.839144617679714e-05, "loss": 1.174, "step": 5732 }, { "epoch": 2.1344254284417867, "grad_norm": 0.17052648961544037, "learning_rate": 1.839078509412147e-05, "loss": 1.1891, "step": 5733 }, { "epoch": 2.1347977335923956, "grad_norm": 0.15803445875644684, "learning_rate": 1.8390123887514436e-05, "loss": 1.1954, "step": 5734 }, { "epoch": 2.135170038743005, "grad_norm": 0.16502295434474945, "learning_rate": 1.8389462556985793e-05, "loss": 1.1777, "step": 5735 }, { "epoch": 2.1355423438936136, "grad_norm": 0.1639169305562973, "learning_rate": 1.838880110254532e-05, "loss": 1.1857, "step": 5736 }, { "epoch": 2.135914649044223, "grad_norm": 0.16261015832424164, "learning_rate": 1.8388139524202776e-05, "loss": 1.1836, "step": 5737 }, { "epoch": 2.1362869541948317, "grad_norm": 0.1698441356420517, "learning_rate": 1.8387477821967938e-05, "loss": 1.1822, "step": 5738 }, { "epoch": 2.136659259345441, "grad_norm": 0.15911483764648438, "learning_rate": 1.8386815995850584e-05, "loss": 1.1733, "step": 5739 }, { "epoch": 2.1370315644960503, "grad_norm": 0.16580499708652496, "learning_rate": 1.838615404586048e-05, "loss": 1.1809, "step": 5740 }, { "epoch": 2.137403869646659, "grad_norm": 0.16274124383926392, "learning_rate": 1.8385491972007408e-05, "loss": 1.1737, "step": 5741 }, { "epoch": 2.1377761747972683, "grad_norm": 0.1641577035188675, "learning_rate": 1.8384829774301145e-05, "loss": 1.1926, "step": 5742 }, { "epoch": 2.138148479947877, "grad_norm": 0.159241184592247, "learning_rate": 1.8384167452751473e-05, "loss": 1.1794, "step": 5743 }, { "epoch": 2.1385207850984864, "grad_norm": 0.17162783443927765, "learning_rate": 1.8383505007368175e-05, "loss": 1.1891, "step": 5744 }, { "epoch": 2.1388930902490952, "grad_norm": 0.17057040333747864, "learning_rate": 1.8382842438161034e-05, "loss": 1.1981, "step": 5745 }, { "epoch": 2.1392653953997045, "grad_norm": 0.16203658282756805, "learning_rate": 1.8382179745139835e-05, "loss": 1.1763, "step": 5746 }, { "epoch": 2.1396377005503133, "grad_norm": 0.16306722164154053, "learning_rate": 1.838151692831437e-05, "loss": 1.1903, "step": 5747 }, { "epoch": 2.1400100057009226, "grad_norm": 0.16811603307724, "learning_rate": 1.838085398769442e-05, "loss": 1.1941, "step": 5748 }, { "epoch": 2.140382310851532, "grad_norm": 0.1626274734735489, "learning_rate": 1.8380190923289785e-05, "loss": 1.1709, "step": 5749 }, { "epoch": 2.1407546160021407, "grad_norm": 0.16186201572418213, "learning_rate": 1.837952773511026e-05, "loss": 1.1861, "step": 5750 }, { "epoch": 2.14112692115275, "grad_norm": 0.1650025099515915, "learning_rate": 1.8378864423165632e-05, "loss": 1.18, "step": 5751 }, { "epoch": 2.1414992263033588, "grad_norm": 0.15938428044319153, "learning_rate": 1.8378200987465704e-05, "loss": 1.2017, "step": 5752 }, { "epoch": 2.141871531453968, "grad_norm": 0.16518081724643707, "learning_rate": 1.8377537428020273e-05, "loss": 1.1805, "step": 5753 }, { "epoch": 2.142243836604577, "grad_norm": 0.16796717047691345, "learning_rate": 1.837687374483914e-05, "loss": 1.1798, "step": 5754 }, { "epoch": 2.142616141755186, "grad_norm": 0.1624600887298584, "learning_rate": 1.837620993793211e-05, "loss": 1.1888, "step": 5755 }, { "epoch": 2.142988446905795, "grad_norm": 0.1630149781703949, "learning_rate": 1.837554600730898e-05, "loss": 1.1848, "step": 5756 }, { "epoch": 2.143360752056404, "grad_norm": 0.16488610208034515, "learning_rate": 1.837488195297956e-05, "loss": 1.1831, "step": 5757 }, { "epoch": 2.1437330572070135, "grad_norm": 0.1614954173564911, "learning_rate": 1.8374217774953663e-05, "loss": 1.1871, "step": 5758 }, { "epoch": 2.1441053623576223, "grad_norm": 0.16131506860256195, "learning_rate": 1.8373553473241097e-05, "loss": 1.1734, "step": 5759 }, { "epoch": 2.1444776675082315, "grad_norm": 0.16610832512378693, "learning_rate": 1.837288904785167e-05, "loss": 1.1945, "step": 5760 }, { "epoch": 2.1448499726588404, "grad_norm": 0.16947484016418457, "learning_rate": 1.8372224498795198e-05, "loss": 1.1775, "step": 5761 }, { "epoch": 2.1452222778094496, "grad_norm": 0.17164815962314606, "learning_rate": 1.8371559826081492e-05, "loss": 1.1955, "step": 5762 }, { "epoch": 2.1455945829600584, "grad_norm": 0.16294875741004944, "learning_rate": 1.8370895029720374e-05, "loss": 1.1761, "step": 5763 }, { "epoch": 2.1459668881106677, "grad_norm": 0.16515691578388214, "learning_rate": 1.8370230109721664e-05, "loss": 1.1913, "step": 5764 }, { "epoch": 2.146339193261277, "grad_norm": 0.16761860251426697, "learning_rate": 1.836956506609518e-05, "loss": 1.183, "step": 5765 }, { "epoch": 2.146711498411886, "grad_norm": 0.16894914209842682, "learning_rate": 1.8368899898850744e-05, "loss": 1.1825, "step": 5766 }, { "epoch": 2.147083803562495, "grad_norm": 0.159805029630661, "learning_rate": 1.836823460799818e-05, "loss": 1.1854, "step": 5767 }, { "epoch": 2.147456108713104, "grad_norm": 0.16420911252498627, "learning_rate": 1.836756919354732e-05, "loss": 1.1941, "step": 5768 }, { "epoch": 2.147828413863713, "grad_norm": 0.1619480699300766, "learning_rate": 1.8366903655507987e-05, "loss": 1.1958, "step": 5769 }, { "epoch": 2.148200719014322, "grad_norm": 0.16295601427555084, "learning_rate": 1.836623799389001e-05, "loss": 1.204, "step": 5770 }, { "epoch": 2.1485730241649312, "grad_norm": 0.1661759316921234, "learning_rate": 1.8365572208703225e-05, "loss": 1.2004, "step": 5771 }, { "epoch": 2.14894532931554, "grad_norm": 0.1644112765789032, "learning_rate": 1.8364906299957464e-05, "loss": 1.1852, "step": 5772 }, { "epoch": 2.1493176344661493, "grad_norm": 0.16289396584033966, "learning_rate": 1.836424026766256e-05, "loss": 1.1738, "step": 5773 }, { "epoch": 2.1496899396167586, "grad_norm": 0.16191107034683228, "learning_rate": 1.836357411182835e-05, "loss": 1.1809, "step": 5774 }, { "epoch": 2.1500622447673674, "grad_norm": 0.16520683467388153, "learning_rate": 1.8362907832464678e-05, "loss": 1.179, "step": 5775 }, { "epoch": 2.1504345499179767, "grad_norm": 0.16534850001335144, "learning_rate": 1.8362241429581386e-05, "loss": 1.1973, "step": 5776 }, { "epoch": 2.1508068550685855, "grad_norm": 0.16141214966773987, "learning_rate": 1.8361574903188307e-05, "loss": 1.1926, "step": 5777 }, { "epoch": 2.1511791602191948, "grad_norm": 0.15976232290267944, "learning_rate": 1.8360908253295293e-05, "loss": 1.1813, "step": 5778 }, { "epoch": 2.1515514653698036, "grad_norm": 0.16501028835773468, "learning_rate": 1.836024147991219e-05, "loss": 1.1883, "step": 5779 }, { "epoch": 2.151923770520413, "grad_norm": 0.16739144921302795, "learning_rate": 1.8359574583048846e-05, "loss": 1.1884, "step": 5780 }, { "epoch": 2.1522960756710217, "grad_norm": 0.1655547022819519, "learning_rate": 1.8358907562715104e-05, "loss": 1.2032, "step": 5781 }, { "epoch": 2.152668380821631, "grad_norm": 0.16336333751678467, "learning_rate": 1.835824041892083e-05, "loss": 1.1947, "step": 5782 }, { "epoch": 2.15304068597224, "grad_norm": 0.16573452949523926, "learning_rate": 1.8357573151675864e-05, "loss": 1.1751, "step": 5783 }, { "epoch": 2.153412991122849, "grad_norm": 0.1619141399860382, "learning_rate": 1.835690576099007e-05, "loss": 1.1685, "step": 5784 }, { "epoch": 2.1537852962734583, "grad_norm": 0.16914691030979156, "learning_rate": 1.8356238246873302e-05, "loss": 1.1747, "step": 5785 }, { "epoch": 2.154157601424067, "grad_norm": 0.16027076542377472, "learning_rate": 1.8355570609335416e-05, "loss": 1.1737, "step": 5786 }, { "epoch": 2.1545299065746764, "grad_norm": 0.15901793539524078, "learning_rate": 1.835490284838628e-05, "loss": 1.1896, "step": 5787 }, { "epoch": 2.154902211725285, "grad_norm": 0.17148235440254211, "learning_rate": 1.8354234964035754e-05, "loss": 1.192, "step": 5788 }, { "epoch": 2.1552745168758944, "grad_norm": 0.16743379831314087, "learning_rate": 1.83535669562937e-05, "loss": 1.1747, "step": 5789 }, { "epoch": 2.1556468220265037, "grad_norm": 0.1674523502588272, "learning_rate": 1.8352898825169986e-05, "loss": 1.1853, "step": 5790 }, { "epoch": 2.1560191271771125, "grad_norm": 0.15756765007972717, "learning_rate": 1.835223057067448e-05, "loss": 1.1702, "step": 5791 }, { "epoch": 2.156391432327722, "grad_norm": 0.16530895233154297, "learning_rate": 1.8351562192817054e-05, "loss": 1.1814, "step": 5792 }, { "epoch": 2.1567637374783306, "grad_norm": 0.1682775914669037, "learning_rate": 1.835089369160758e-05, "loss": 1.1956, "step": 5793 }, { "epoch": 2.15713604262894, "grad_norm": 0.1804119050502777, "learning_rate": 1.8350225067055927e-05, "loss": 1.175, "step": 5794 }, { "epoch": 2.1575083477795487, "grad_norm": 0.1660710573196411, "learning_rate": 1.8349556319171977e-05, "loss": 1.1795, "step": 5795 }, { "epoch": 2.157880652930158, "grad_norm": 0.16514445841312408, "learning_rate": 1.83488874479656e-05, "loss": 1.1923, "step": 5796 }, { "epoch": 2.158252958080767, "grad_norm": 0.1635090708732605, "learning_rate": 1.8348218453446685e-05, "loss": 1.1853, "step": 5797 }, { "epoch": 2.158625263231376, "grad_norm": 0.17032501101493835, "learning_rate": 1.8347549335625104e-05, "loss": 1.1996, "step": 5798 }, { "epoch": 2.1589975683819853, "grad_norm": 0.17020241916179657, "learning_rate": 1.834688009451074e-05, "loss": 1.1841, "step": 5799 }, { "epoch": 2.159369873532594, "grad_norm": 0.1642071157693863, "learning_rate": 1.8346210730113484e-05, "loss": 1.195, "step": 5800 }, { "epoch": 2.1597421786832034, "grad_norm": 0.16940705478191376, "learning_rate": 1.8345541242443223e-05, "loss": 1.1909, "step": 5801 }, { "epoch": 2.160114483833812, "grad_norm": 0.15778762102127075, "learning_rate": 1.8344871631509837e-05, "loss": 1.1872, "step": 5802 }, { "epoch": 2.1604867889844215, "grad_norm": 0.1702834963798523, "learning_rate": 1.834420189732322e-05, "loss": 1.1847, "step": 5803 }, { "epoch": 2.1608590941350303, "grad_norm": 0.16944074630737305, "learning_rate": 1.834353203989327e-05, "loss": 1.1858, "step": 5804 }, { "epoch": 2.1612313992856396, "grad_norm": 0.17070455849170685, "learning_rate": 1.834286205922987e-05, "loss": 1.1943, "step": 5805 }, { "epoch": 2.1616037044362484, "grad_norm": 0.16586799919605255, "learning_rate": 1.8342191955342926e-05, "loss": 1.1877, "step": 5806 }, { "epoch": 2.1619760095868576, "grad_norm": 0.17121802270412445, "learning_rate": 1.8341521728242324e-05, "loss": 1.1779, "step": 5807 }, { "epoch": 2.162348314737467, "grad_norm": 0.16471904516220093, "learning_rate": 1.8340851377937975e-05, "loss": 1.1963, "step": 5808 }, { "epoch": 2.1627206198880757, "grad_norm": 0.16698746383190155, "learning_rate": 1.834018090443977e-05, "loss": 1.1695, "step": 5809 }, { "epoch": 2.163092925038685, "grad_norm": 0.16752393543720245, "learning_rate": 1.833951030775762e-05, "loss": 1.1872, "step": 5810 }, { "epoch": 2.163465230189294, "grad_norm": 0.16329661011695862, "learning_rate": 1.8338839587901426e-05, "loss": 1.1855, "step": 5811 }, { "epoch": 2.163837535339903, "grad_norm": 0.16880545020103455, "learning_rate": 1.833816874488109e-05, "loss": 1.1856, "step": 5812 }, { "epoch": 2.164209840490512, "grad_norm": 0.16855594515800476, "learning_rate": 1.833749777870653e-05, "loss": 1.1756, "step": 5813 }, { "epoch": 2.164582145641121, "grad_norm": 0.16726450622081757, "learning_rate": 1.833682668938765e-05, "loss": 1.1606, "step": 5814 }, { "epoch": 2.16495445079173, "grad_norm": 0.16840289533138275, "learning_rate": 1.8336155476934365e-05, "loss": 1.1811, "step": 5815 }, { "epoch": 2.1653267559423393, "grad_norm": 0.16863827407360077, "learning_rate": 1.8335484141356582e-05, "loss": 1.1907, "step": 5816 }, { "epoch": 2.1656990610929485, "grad_norm": 0.16578540205955505, "learning_rate": 1.8334812682664224e-05, "loss": 1.1792, "step": 5817 }, { "epoch": 2.1660713662435573, "grad_norm": 0.16219614446163177, "learning_rate": 1.8334141100867208e-05, "loss": 1.1759, "step": 5818 }, { "epoch": 2.1664436713941666, "grad_norm": 0.16222938895225525, "learning_rate": 1.8333469395975446e-05, "loss": 1.1734, "step": 5819 }, { "epoch": 2.1668159765447754, "grad_norm": 0.17056572437286377, "learning_rate": 1.8332797567998865e-05, "loss": 1.1854, "step": 5820 }, { "epoch": 2.1671882816953847, "grad_norm": 0.1574883610010147, "learning_rate": 1.833212561694739e-05, "loss": 1.1951, "step": 5821 }, { "epoch": 2.1675605868459935, "grad_norm": 0.1614379733800888, "learning_rate": 1.833145354283094e-05, "loss": 1.1806, "step": 5822 }, { "epoch": 2.1679328919966028, "grad_norm": 0.17167092859745026, "learning_rate": 1.8330781345659447e-05, "loss": 1.1928, "step": 5823 }, { "epoch": 2.1683051971472116, "grad_norm": 0.17068855464458466, "learning_rate": 1.8330109025442834e-05, "loss": 1.184, "step": 5824 }, { "epoch": 2.168677502297821, "grad_norm": 0.1654396653175354, "learning_rate": 1.832943658219103e-05, "loss": 1.1839, "step": 5825 }, { "epoch": 2.16904980744843, "grad_norm": 0.16501565277576447, "learning_rate": 1.832876401591398e-05, "loss": 1.1828, "step": 5826 }, { "epoch": 2.169422112599039, "grad_norm": 0.16627371311187744, "learning_rate": 1.8328091326621597e-05, "loss": 1.1904, "step": 5827 }, { "epoch": 2.169794417749648, "grad_norm": 0.15978330373764038, "learning_rate": 1.8327418514323833e-05, "loss": 1.1706, "step": 5828 }, { "epoch": 2.170166722900257, "grad_norm": 0.16594457626342773, "learning_rate": 1.8326745579030623e-05, "loss": 1.2023, "step": 5829 }, { "epoch": 2.1705390280508663, "grad_norm": 0.17066100239753723, "learning_rate": 1.83260725207519e-05, "loss": 1.1809, "step": 5830 }, { "epoch": 2.170911333201475, "grad_norm": 0.16543124616146088, "learning_rate": 1.8325399339497608e-05, "loss": 1.1749, "step": 5831 }, { "epoch": 2.1712836383520844, "grad_norm": 0.16617171466350555, "learning_rate": 1.8324726035277694e-05, "loss": 1.1901, "step": 5832 }, { "epoch": 2.171655943502693, "grad_norm": 0.17173443734645844, "learning_rate": 1.8324052608102095e-05, "loss": 1.1847, "step": 5833 }, { "epoch": 2.1720282486533025, "grad_norm": 0.1652347892522812, "learning_rate": 1.832337905798076e-05, "loss": 1.1766, "step": 5834 }, { "epoch": 2.1724005538039117, "grad_norm": 0.16114427149295807, "learning_rate": 1.8322705384923644e-05, "loss": 1.1835, "step": 5835 }, { "epoch": 2.1727728589545205, "grad_norm": 0.17017604410648346, "learning_rate": 1.8322031588940687e-05, "loss": 1.1778, "step": 5836 }, { "epoch": 2.17314516410513, "grad_norm": 0.16408562660217285, "learning_rate": 1.8321357670041848e-05, "loss": 1.1863, "step": 5837 }, { "epoch": 2.1735174692557386, "grad_norm": 0.1657814234495163, "learning_rate": 1.832068362823708e-05, "loss": 1.1847, "step": 5838 }, { "epoch": 2.173889774406348, "grad_norm": 0.15966399013996124, "learning_rate": 1.8320009463536333e-05, "loss": 1.1818, "step": 5839 }, { "epoch": 2.1742620795569567, "grad_norm": 0.15668559074401855, "learning_rate": 1.831933517594957e-05, "loss": 1.1798, "step": 5840 }, { "epoch": 2.174634384707566, "grad_norm": 0.1669110357761383, "learning_rate": 1.831866076548675e-05, "loss": 1.2053, "step": 5841 }, { "epoch": 2.175006689858175, "grad_norm": 0.15878044068813324, "learning_rate": 1.831798623215783e-05, "loss": 1.1697, "step": 5842 }, { "epoch": 2.175378995008784, "grad_norm": 0.16765941679477692, "learning_rate": 1.8317311575972777e-05, "loss": 1.1806, "step": 5843 }, { "epoch": 2.1757513001593933, "grad_norm": 0.1708860546350479, "learning_rate": 1.8316636796941555e-05, "loss": 1.2002, "step": 5844 }, { "epoch": 2.176123605310002, "grad_norm": 0.16999541223049164, "learning_rate": 1.8315961895074127e-05, "loss": 1.1778, "step": 5845 }, { "epoch": 2.1764959104606114, "grad_norm": 0.1659235954284668, "learning_rate": 1.8315286870380468e-05, "loss": 1.1821, "step": 5846 }, { "epoch": 2.1768682156112202, "grad_norm": 0.15911462903022766, "learning_rate": 1.831461172287054e-05, "loss": 1.197, "step": 5847 }, { "epoch": 2.1772405207618295, "grad_norm": 0.15904010832309723, "learning_rate": 1.8313936452554318e-05, "loss": 1.1777, "step": 5848 }, { "epoch": 2.1776128259124383, "grad_norm": 0.1634838581085205, "learning_rate": 1.8313261059441777e-05, "loss": 1.1752, "step": 5849 }, { "epoch": 2.1779851310630476, "grad_norm": 0.16560928523540497, "learning_rate": 1.8312585543542893e-05, "loss": 1.1859, "step": 5850 }, { "epoch": 2.1783574362136564, "grad_norm": 0.16026043891906738, "learning_rate": 1.8311909904867643e-05, "loss": 1.1715, "step": 5851 }, { "epoch": 2.1787297413642657, "grad_norm": 0.1598140001296997, "learning_rate": 1.8311234143426003e-05, "loss": 1.1755, "step": 5852 }, { "epoch": 2.179102046514875, "grad_norm": 0.1634136587381363, "learning_rate": 1.831055825922796e-05, "loss": 1.1806, "step": 5853 }, { "epoch": 2.1794743516654838, "grad_norm": 0.16830392181873322, "learning_rate": 1.830988225228349e-05, "loss": 1.1638, "step": 5854 }, { "epoch": 2.179846656816093, "grad_norm": 0.16452953219413757, "learning_rate": 1.8309206122602582e-05, "loss": 1.184, "step": 5855 }, { "epoch": 2.180218961966702, "grad_norm": 0.16308681666851044, "learning_rate": 1.830852987019522e-05, "loss": 1.1898, "step": 5856 }, { "epoch": 2.180591267117311, "grad_norm": 0.1646089106798172, "learning_rate": 1.8307853495071394e-05, "loss": 1.1717, "step": 5857 }, { "epoch": 2.18096357226792, "grad_norm": 0.16182439029216766, "learning_rate": 1.8307176997241092e-05, "loss": 1.1833, "step": 5858 }, { "epoch": 2.181335877418529, "grad_norm": 0.16360101103782654, "learning_rate": 1.8306500376714307e-05, "loss": 1.1876, "step": 5859 }, { "epoch": 2.181708182569138, "grad_norm": 0.1654779016971588, "learning_rate": 1.8305823633501033e-05, "loss": 1.1686, "step": 5860 }, { "epoch": 2.1820804877197473, "grad_norm": 0.16477808356285095, "learning_rate": 1.8305146767611267e-05, "loss": 1.1878, "step": 5861 }, { "epoch": 2.1824527928703565, "grad_norm": 0.16140249371528625, "learning_rate": 1.8304469779055e-05, "loss": 1.1852, "step": 5862 }, { "epoch": 2.1828250980209654, "grad_norm": 0.1675121784210205, "learning_rate": 1.830379266784224e-05, "loss": 1.1675, "step": 5863 }, { "epoch": 2.1831974031715746, "grad_norm": 0.1668505072593689, "learning_rate": 1.8303115433982984e-05, "loss": 1.1938, "step": 5864 }, { "epoch": 2.1835697083221834, "grad_norm": 0.16749152541160583, "learning_rate": 1.830243807748723e-05, "loss": 1.1722, "step": 5865 }, { "epoch": 2.1839420134727927, "grad_norm": 0.16352511942386627, "learning_rate": 1.830176059836499e-05, "loss": 1.1873, "step": 5866 }, { "epoch": 2.1843143186234015, "grad_norm": 0.16892147064208984, "learning_rate": 1.8301082996626268e-05, "loss": 1.1968, "step": 5867 }, { "epoch": 2.184686623774011, "grad_norm": 0.1627473384141922, "learning_rate": 1.830040527228107e-05, "loss": 1.185, "step": 5868 }, { "epoch": 2.1850589289246196, "grad_norm": 0.16038550436496735, "learning_rate": 1.8299727425339405e-05, "loss": 1.1742, "step": 5869 }, { "epoch": 2.185431234075229, "grad_norm": 0.1608399599790573, "learning_rate": 1.8299049455811285e-05, "loss": 1.1837, "step": 5870 }, { "epoch": 2.185803539225838, "grad_norm": 0.16327451169490814, "learning_rate": 1.829837136370673e-05, "loss": 1.1792, "step": 5871 }, { "epoch": 2.186175844376447, "grad_norm": 0.16703878343105316, "learning_rate": 1.829769314903575e-05, "loss": 1.2048, "step": 5872 }, { "epoch": 2.1865481495270562, "grad_norm": 0.16750751435756683, "learning_rate": 1.8297014811808363e-05, "loss": 1.1813, "step": 5873 }, { "epoch": 2.186920454677665, "grad_norm": 0.16518592834472656, "learning_rate": 1.8296336352034585e-05, "loss": 1.1782, "step": 5874 }, { "epoch": 2.1872927598282743, "grad_norm": 0.1652197390794754, "learning_rate": 1.829565776972444e-05, "loss": 1.1771, "step": 5875 }, { "epoch": 2.187665064978883, "grad_norm": 0.16577941179275513, "learning_rate": 1.829497906488795e-05, "loss": 1.1896, "step": 5876 }, { "epoch": 2.1880373701294924, "grad_norm": 0.16348230838775635, "learning_rate": 1.8294300237535145e-05, "loss": 1.185, "step": 5877 }, { "epoch": 2.188409675280101, "grad_norm": 0.16359402239322662, "learning_rate": 1.8293621287676043e-05, "loss": 1.1909, "step": 5878 }, { "epoch": 2.1887819804307105, "grad_norm": 0.16968275606632233, "learning_rate": 1.8292942215320675e-05, "loss": 1.1827, "step": 5879 }, { "epoch": 2.1891542855813197, "grad_norm": 0.16383975744247437, "learning_rate": 1.829226302047907e-05, "loss": 1.1892, "step": 5880 }, { "epoch": 2.1895265907319286, "grad_norm": 0.16417528688907623, "learning_rate": 1.8291583703161263e-05, "loss": 1.1879, "step": 5881 }, { "epoch": 2.189898895882538, "grad_norm": 0.16152073442935944, "learning_rate": 1.8290904263377284e-05, "loss": 1.1755, "step": 5882 }, { "epoch": 2.1902712010331467, "grad_norm": 0.16532781720161438, "learning_rate": 1.8290224701137165e-05, "loss": 1.1987, "step": 5883 }, { "epoch": 2.190643506183756, "grad_norm": 0.1655869036912918, "learning_rate": 1.8289545016450953e-05, "loss": 1.187, "step": 5884 }, { "epoch": 2.1910158113343647, "grad_norm": 0.16325949132442474, "learning_rate": 1.828886520932868e-05, "loss": 1.1926, "step": 5885 }, { "epoch": 2.191388116484974, "grad_norm": 0.1616744101047516, "learning_rate": 1.8288185279780388e-05, "loss": 1.1851, "step": 5886 }, { "epoch": 2.1917604216355833, "grad_norm": 0.15696094930171967, "learning_rate": 1.828750522781612e-05, "loss": 1.1899, "step": 5887 }, { "epoch": 2.192132726786192, "grad_norm": 0.15940669178962708, "learning_rate": 1.8286825053445916e-05, "loss": 1.1872, "step": 5888 }, { "epoch": 2.1925050319368014, "grad_norm": 0.16422483325004578, "learning_rate": 1.828614475667983e-05, "loss": 1.1823, "step": 5889 }, { "epoch": 2.19287733708741, "grad_norm": 0.16130252182483673, "learning_rate": 1.8285464337527906e-05, "loss": 1.1784, "step": 5890 }, { "epoch": 2.1932496422380194, "grad_norm": 0.1615862399339676, "learning_rate": 1.8284783796000193e-05, "loss": 1.185, "step": 5891 }, { "epoch": 2.1936219473886283, "grad_norm": 0.15964102745056152, "learning_rate": 1.8284103132106743e-05, "loss": 1.1796, "step": 5892 }, { "epoch": 2.1939942525392375, "grad_norm": 0.15634554624557495, "learning_rate": 1.828342234585761e-05, "loss": 1.1821, "step": 5893 }, { "epoch": 2.1943665576898463, "grad_norm": 0.1648419350385666, "learning_rate": 1.828274143726285e-05, "loss": 1.1926, "step": 5894 }, { "epoch": 2.1947388628404556, "grad_norm": 0.1677112877368927, "learning_rate": 1.8282060406332513e-05, "loss": 1.1815, "step": 5895 }, { "epoch": 2.195111167991065, "grad_norm": 0.16209574043750763, "learning_rate": 1.828137925307667e-05, "loss": 1.1888, "step": 5896 }, { "epoch": 2.1954834731416737, "grad_norm": 0.16768576204776764, "learning_rate": 1.828069797750537e-05, "loss": 1.1844, "step": 5897 }, { "epoch": 2.195855778292283, "grad_norm": 0.15827785432338715, "learning_rate": 1.8280016579628686e-05, "loss": 1.1645, "step": 5898 }, { "epoch": 2.1962280834428918, "grad_norm": 0.17181847989559174, "learning_rate": 1.8279335059456673e-05, "loss": 1.1777, "step": 5899 }, { "epoch": 2.196600388593501, "grad_norm": 0.16436168551445007, "learning_rate": 1.8278653416999402e-05, "loss": 1.1708, "step": 5900 }, { "epoch": 2.19697269374411, "grad_norm": 0.16275320947170258, "learning_rate": 1.827797165226694e-05, "loss": 1.1883, "step": 5901 }, { "epoch": 2.197344998894719, "grad_norm": 0.15981367230415344, "learning_rate": 1.8277289765269353e-05, "loss": 1.1709, "step": 5902 }, { "epoch": 2.197717304045328, "grad_norm": 0.17126475274562836, "learning_rate": 1.8276607756016722e-05, "loss": 1.1784, "step": 5903 }, { "epoch": 2.198089609195937, "grad_norm": 0.16240163147449493, "learning_rate": 1.827592562451911e-05, "loss": 1.1772, "step": 5904 }, { "epoch": 2.1984619143465465, "grad_norm": 0.16826151311397552, "learning_rate": 1.8275243370786594e-05, "loss": 1.1728, "step": 5905 }, { "epoch": 2.1988342194971553, "grad_norm": 0.1712343990802765, "learning_rate": 1.8274560994829256e-05, "loss": 1.1836, "step": 5906 }, { "epoch": 2.1992065246477646, "grad_norm": 0.166199192404747, "learning_rate": 1.827387849665717e-05, "loss": 1.1996, "step": 5907 }, { "epoch": 2.1995788297983734, "grad_norm": 0.16163288056850433, "learning_rate": 1.827319587628042e-05, "loss": 1.1775, "step": 5908 }, { "epoch": 2.1999511349489826, "grad_norm": 0.171270489692688, "learning_rate": 1.827251313370908e-05, "loss": 1.1866, "step": 5909 }, { "epoch": 2.2003234400995915, "grad_norm": 0.1645098328590393, "learning_rate": 1.8271830268953248e-05, "loss": 1.1814, "step": 5910 }, { "epoch": 2.2006957452502007, "grad_norm": 0.1669131964445114, "learning_rate": 1.8271147282022998e-05, "loss": 1.1928, "step": 5911 }, { "epoch": 2.20106805040081, "grad_norm": 0.17018841207027435, "learning_rate": 1.8270464172928423e-05, "loss": 1.1825, "step": 5912 }, { "epoch": 2.201440355551419, "grad_norm": 0.167917862534523, "learning_rate": 1.826978094167961e-05, "loss": 1.1882, "step": 5913 }, { "epoch": 2.201812660702028, "grad_norm": 0.16571766138076782, "learning_rate": 1.826909758828665e-05, "loss": 1.1726, "step": 5914 }, { "epoch": 2.202184965852637, "grad_norm": 0.16511200368404388, "learning_rate": 1.826841411275964e-05, "loss": 1.1786, "step": 5915 }, { "epoch": 2.202557271003246, "grad_norm": 0.16730502247810364, "learning_rate": 1.8267730515108674e-05, "loss": 1.1868, "step": 5916 }, { "epoch": 2.202929576153855, "grad_norm": 0.16795329749584198, "learning_rate": 1.8267046795343845e-05, "loss": 1.1757, "step": 5917 }, { "epoch": 2.2033018813044642, "grad_norm": 0.17142654955387115, "learning_rate": 1.8266362953475252e-05, "loss": 1.1918, "step": 5918 }, { "epoch": 2.203674186455073, "grad_norm": 0.1681024432182312, "learning_rate": 1.8265678989513e-05, "loss": 1.1676, "step": 5919 }, { "epoch": 2.2040464916056823, "grad_norm": 0.1688128411769867, "learning_rate": 1.8264994903467187e-05, "loss": 1.1888, "step": 5920 }, { "epoch": 2.2044187967562916, "grad_norm": 0.16253378987312317, "learning_rate": 1.8264310695347918e-05, "loss": 1.1793, "step": 5921 }, { "epoch": 2.2047911019069004, "grad_norm": 0.16472946107387543, "learning_rate": 1.8263626365165296e-05, "loss": 1.1855, "step": 5922 }, { "epoch": 2.2051634070575097, "grad_norm": 0.1619625687599182, "learning_rate": 1.8262941912929434e-05, "loss": 1.1799, "step": 5923 }, { "epoch": 2.2055357122081185, "grad_norm": 0.16878865659236908, "learning_rate": 1.826225733865044e-05, "loss": 1.1956, "step": 5924 }, { "epoch": 2.2059080173587278, "grad_norm": 0.16465233266353607, "learning_rate": 1.8261572642338418e-05, "loss": 1.1848, "step": 5925 }, { "epoch": 2.2062803225093366, "grad_norm": 0.1674007624387741, "learning_rate": 1.826088782400349e-05, "loss": 1.1781, "step": 5926 }, { "epoch": 2.206652627659946, "grad_norm": 0.16674551367759705, "learning_rate": 1.8260202883655773e-05, "loss": 1.1907, "step": 5927 }, { "epoch": 2.2070249328105547, "grad_norm": 0.16409821808338165, "learning_rate": 1.825951782130537e-05, "loss": 1.1699, "step": 5928 }, { "epoch": 2.207397237961164, "grad_norm": 0.16593073308467865, "learning_rate": 1.825883263696241e-05, "loss": 1.1953, "step": 5929 }, { "epoch": 2.207769543111773, "grad_norm": 0.16026705503463745, "learning_rate": 1.825814733063701e-05, "loss": 1.1621, "step": 5930 }, { "epoch": 2.208141848262382, "grad_norm": 0.16227182745933533, "learning_rate": 1.825746190233929e-05, "loss": 1.1973, "step": 5931 }, { "epoch": 2.2085141534129913, "grad_norm": 0.16535073518753052, "learning_rate": 1.8256776352079377e-05, "loss": 1.1731, "step": 5932 }, { "epoch": 2.2088864585636, "grad_norm": 0.16271446645259857, "learning_rate": 1.82560906798674e-05, "loss": 1.1718, "step": 5933 }, { "epoch": 2.2092587637142094, "grad_norm": 0.16261382400989532, "learning_rate": 1.8255404885713478e-05, "loss": 1.1795, "step": 5934 }, { "epoch": 2.209631068864818, "grad_norm": 0.16760127246379852, "learning_rate": 1.825471896962774e-05, "loss": 1.1688, "step": 5935 }, { "epoch": 2.2100033740154275, "grad_norm": 0.16549386084079742, "learning_rate": 1.8254032931620326e-05, "loss": 1.1906, "step": 5936 }, { "epoch": 2.2103756791660363, "grad_norm": 0.16376779973506927, "learning_rate": 1.8253346771701363e-05, "loss": 1.1764, "step": 5937 }, { "epoch": 2.2107479843166455, "grad_norm": 0.1606384664773941, "learning_rate": 1.8252660489880986e-05, "loss": 1.1735, "step": 5938 }, { "epoch": 2.211120289467255, "grad_norm": 0.16291670501232147, "learning_rate": 1.8251974086169332e-05, "loss": 1.178, "step": 5939 }, { "epoch": 2.2114925946178636, "grad_norm": 0.16411124169826508, "learning_rate": 1.8251287560576535e-05, "loss": 1.1845, "step": 5940 }, { "epoch": 2.211864899768473, "grad_norm": 0.15855379402637482, "learning_rate": 1.8250600913112743e-05, "loss": 1.1883, "step": 5941 }, { "epoch": 2.2122372049190817, "grad_norm": 0.15984077751636505, "learning_rate": 1.824991414378809e-05, "loss": 1.1848, "step": 5942 }, { "epoch": 2.212609510069691, "grad_norm": 0.16627465188503265, "learning_rate": 1.8249227252612725e-05, "loss": 1.1914, "step": 5943 }, { "epoch": 2.2129818152203, "grad_norm": 0.1621500700712204, "learning_rate": 1.824854023959679e-05, "loss": 1.1999, "step": 5944 }, { "epoch": 2.213354120370909, "grad_norm": 0.1592169553041458, "learning_rate": 1.8247853104750433e-05, "loss": 1.1838, "step": 5945 }, { "epoch": 2.213726425521518, "grad_norm": 0.16420015692710876, "learning_rate": 1.8247165848083805e-05, "loss": 1.1777, "step": 5946 }, { "epoch": 2.214098730672127, "grad_norm": 0.16197781264781952, "learning_rate": 1.8246478469607055e-05, "loss": 1.1815, "step": 5947 }, { "epoch": 2.2144710358227364, "grad_norm": 0.1656760275363922, "learning_rate": 1.8245790969330336e-05, "loss": 1.185, "step": 5948 }, { "epoch": 2.2148433409733452, "grad_norm": 0.16433504223823547, "learning_rate": 1.82451033472638e-05, "loss": 1.1878, "step": 5949 }, { "epoch": 2.2152156461239545, "grad_norm": 0.16674500703811646, "learning_rate": 1.8244415603417603e-05, "loss": 1.1901, "step": 5950 }, { "epoch": 2.2155879512745633, "grad_norm": 0.16727973520755768, "learning_rate": 1.824372773780191e-05, "loss": 1.1683, "step": 5951 }, { "epoch": 2.2159602564251726, "grad_norm": 0.16087013483047485, "learning_rate": 1.8243039750426872e-05, "loss": 1.1982, "step": 5952 }, { "epoch": 2.2163325615757814, "grad_norm": 0.1639922559261322, "learning_rate": 1.8242351641302657e-05, "loss": 1.1971, "step": 5953 }, { "epoch": 2.2167048667263907, "grad_norm": 0.16302375495433807, "learning_rate": 1.8241663410439424e-05, "loss": 1.1748, "step": 5954 }, { "epoch": 2.2170771718769995, "grad_norm": 0.16024988889694214, "learning_rate": 1.8240975057847338e-05, "loss": 1.1837, "step": 5955 }, { "epoch": 2.2174494770276088, "grad_norm": 0.16668042540550232, "learning_rate": 1.824028658353657e-05, "loss": 1.1922, "step": 5956 }, { "epoch": 2.217821782178218, "grad_norm": 0.1660262942314148, "learning_rate": 1.8239597987517284e-05, "loss": 1.1947, "step": 5957 }, { "epoch": 2.218194087328827, "grad_norm": 0.15916001796722412, "learning_rate": 1.8238909269799655e-05, "loss": 1.1774, "step": 5958 }, { "epoch": 2.218566392479436, "grad_norm": 0.16157934069633484, "learning_rate": 1.8238220430393855e-05, "loss": 1.1705, "step": 5959 }, { "epoch": 2.218938697630045, "grad_norm": 0.17225240170955658, "learning_rate": 1.8237531469310054e-05, "loss": 1.1772, "step": 5960 }, { "epoch": 2.219311002780654, "grad_norm": 0.16769270598888397, "learning_rate": 1.823684238655843e-05, "loss": 1.1843, "step": 5961 }, { "epoch": 2.219683307931263, "grad_norm": 0.18250252306461334, "learning_rate": 1.8236153182149158e-05, "loss": 1.1767, "step": 5962 }, { "epoch": 2.2200556130818723, "grad_norm": 0.16508784890174866, "learning_rate": 1.8235463856092423e-05, "loss": 1.178, "step": 5963 }, { "epoch": 2.220427918232481, "grad_norm": 0.16602812707424164, "learning_rate": 1.8234774408398405e-05, "loss": 1.1758, "step": 5964 }, { "epoch": 2.2208002233830904, "grad_norm": 0.16944126784801483, "learning_rate": 1.8234084839077283e-05, "loss": 1.1842, "step": 5965 }, { "epoch": 2.2211725285336996, "grad_norm": 0.1692361980676651, "learning_rate": 1.8233395148139246e-05, "loss": 1.2078, "step": 5966 }, { "epoch": 2.2215448336843084, "grad_norm": 0.175007626414299, "learning_rate": 1.823270533559448e-05, "loss": 1.1778, "step": 5967 }, { "epoch": 2.2219171388349177, "grad_norm": 0.16647957265377045, "learning_rate": 1.823201540145317e-05, "loss": 1.1834, "step": 5968 }, { "epoch": 2.2222894439855265, "grad_norm": 0.16229864954948425, "learning_rate": 1.8231325345725514e-05, "loss": 1.1748, "step": 5969 }, { "epoch": 2.222661749136136, "grad_norm": 0.16611957550048828, "learning_rate": 1.8230635168421694e-05, "loss": 1.1781, "step": 5970 }, { "epoch": 2.2230340542867446, "grad_norm": 0.17674987018108368, "learning_rate": 1.8229944869551915e-05, "loss": 1.1974, "step": 5971 }, { "epoch": 2.223406359437354, "grad_norm": 0.16097386181354523, "learning_rate": 1.8229254449126365e-05, "loss": 1.1807, "step": 5972 }, { "epoch": 2.2237786645879627, "grad_norm": 0.16559158265590668, "learning_rate": 1.822856390715524e-05, "loss": 1.1776, "step": 5973 }, { "epoch": 2.224150969738572, "grad_norm": 0.16879995167255402, "learning_rate": 1.8227873243648748e-05, "loss": 1.1829, "step": 5974 }, { "epoch": 2.2245232748891812, "grad_norm": 0.1685383915901184, "learning_rate": 1.8227182458617076e-05, "loss": 1.1909, "step": 5975 }, { "epoch": 2.22489558003979, "grad_norm": 0.16316437721252441, "learning_rate": 1.822649155207044e-05, "loss": 1.1669, "step": 5976 }, { "epoch": 2.2252678851903993, "grad_norm": 0.18334704637527466, "learning_rate": 1.822580052401904e-05, "loss": 1.1755, "step": 5977 }, { "epoch": 2.225640190341008, "grad_norm": 0.16785620152950287, "learning_rate": 1.8225109374473087e-05, "loss": 1.1818, "step": 5978 }, { "epoch": 2.2260124954916174, "grad_norm": 0.16493113338947296, "learning_rate": 1.822441810344278e-05, "loss": 1.2018, "step": 5979 }, { "epoch": 2.226384800642226, "grad_norm": 0.16159646213054657, "learning_rate": 1.822372671093833e-05, "loss": 1.1709, "step": 5980 }, { "epoch": 2.2267571057928355, "grad_norm": 0.16819043457508087, "learning_rate": 1.822303519696996e-05, "loss": 1.1913, "step": 5981 }, { "epoch": 2.2271294109434443, "grad_norm": 0.16642168164253235, "learning_rate": 1.8222343561547876e-05, "loss": 1.1787, "step": 5982 }, { "epoch": 2.2275017160940536, "grad_norm": 0.16357922554016113, "learning_rate": 1.8221651804682287e-05, "loss": 1.1764, "step": 5983 }, { "epoch": 2.227874021244663, "grad_norm": 0.16216666996479034, "learning_rate": 1.8220959926383422e-05, "loss": 1.18, "step": 5984 }, { "epoch": 2.2282463263952716, "grad_norm": 0.16932249069213867, "learning_rate": 1.8220267926661494e-05, "loss": 1.1882, "step": 5985 }, { "epoch": 2.228618631545881, "grad_norm": 0.16690555214881897, "learning_rate": 1.8219575805526723e-05, "loss": 1.1948, "step": 5986 }, { "epoch": 2.2289909366964897, "grad_norm": 0.16043587028980255, "learning_rate": 1.8218883562989335e-05, "loss": 1.1779, "step": 5987 }, { "epoch": 2.229363241847099, "grad_norm": 0.16727250814437866, "learning_rate": 1.8218191199059553e-05, "loss": 1.1728, "step": 5988 }, { "epoch": 2.229735546997708, "grad_norm": 0.17071470618247986, "learning_rate": 1.82174987137476e-05, "loss": 1.1852, "step": 5989 }, { "epoch": 2.230107852148317, "grad_norm": 0.17446862161159515, "learning_rate": 1.8216806107063705e-05, "loss": 1.1967, "step": 5990 }, { "epoch": 2.230480157298926, "grad_norm": 0.16440711915493011, "learning_rate": 1.8216113379018105e-05, "loss": 1.1864, "step": 5991 }, { "epoch": 2.230852462449535, "grad_norm": 0.16457001864910126, "learning_rate": 1.8215420529621025e-05, "loss": 1.1746, "step": 5992 }, { "epoch": 2.2312247676001444, "grad_norm": 0.16013966500759125, "learning_rate": 1.82147275588827e-05, "loss": 1.1715, "step": 5993 }, { "epoch": 2.2315970727507533, "grad_norm": 0.1729276180267334, "learning_rate": 1.821403446681336e-05, "loss": 1.2026, "step": 5994 }, { "epoch": 2.2319693779013625, "grad_norm": 0.16060994565486908, "learning_rate": 1.8213341253423248e-05, "loss": 1.1569, "step": 5995 }, { "epoch": 2.2323416830519713, "grad_norm": 0.16661281883716583, "learning_rate": 1.8212647918722605e-05, "loss": 1.1791, "step": 5996 }, { "epoch": 2.2327139882025806, "grad_norm": 0.16243353486061096, "learning_rate": 1.8211954462721663e-05, "loss": 1.1822, "step": 5997 }, { "epoch": 2.2330862933531894, "grad_norm": 0.16968528926372528, "learning_rate": 1.8211260885430672e-05, "loss": 1.1697, "step": 5998 }, { "epoch": 2.2334585985037987, "grad_norm": 0.16825567185878754, "learning_rate": 1.821056718685987e-05, "loss": 1.1784, "step": 5999 }, { "epoch": 2.233830903654408, "grad_norm": 0.1593291014432907, "learning_rate": 1.820987336701951e-05, "loss": 1.1793, "step": 6000 }, { "epoch": 2.233830903654408, "eval_loss": 1.3026354312896729, "eval_runtime": 16.489, "eval_samples_per_second": 105.161, "eval_steps_per_second": 5.276, "step": 6000 }, { "epoch": 2.2342032088050168, "grad_norm": 0.16981658339500427, "learning_rate": 1.8209179425919832e-05, "loss": 1.1754, "step": 6001 }, { "epoch": 2.234575513955626, "grad_norm": 0.16528399288654327, "learning_rate": 1.820848536357109e-05, "loss": 1.1896, "step": 6002 }, { "epoch": 2.234947819106235, "grad_norm": 0.16212095320224762, "learning_rate": 1.8207791179983535e-05, "loss": 1.1762, "step": 6003 }, { "epoch": 2.235320124256844, "grad_norm": 0.16362003982067108, "learning_rate": 1.8207096875167417e-05, "loss": 1.1724, "step": 6004 }, { "epoch": 2.235692429407453, "grad_norm": 0.16708360612392426, "learning_rate": 1.8206402449132997e-05, "loss": 1.1824, "step": 6005 }, { "epoch": 2.236064734558062, "grad_norm": 0.16234560310840607, "learning_rate": 1.8205707901890524e-05, "loss": 1.178, "step": 6006 }, { "epoch": 2.236437039708671, "grad_norm": 0.16618958115577698, "learning_rate": 1.8205013233450268e-05, "loss": 1.1672, "step": 6007 }, { "epoch": 2.2368093448592803, "grad_norm": 0.17038699984550476, "learning_rate": 1.8204318443822473e-05, "loss": 1.1991, "step": 6008 }, { "epoch": 2.2371816500098896, "grad_norm": 0.16078020632266998, "learning_rate": 1.8203623533017413e-05, "loss": 1.1836, "step": 6009 }, { "epoch": 2.2375539551604984, "grad_norm": 0.16053731739521027, "learning_rate": 1.8202928501045347e-05, "loss": 1.1889, "step": 6010 }, { "epoch": 2.2379262603111076, "grad_norm": 0.16496212780475616, "learning_rate": 1.820223334791654e-05, "loss": 1.1908, "step": 6011 }, { "epoch": 2.2382985654617165, "grad_norm": 0.16259627044200897, "learning_rate": 1.8201538073641264e-05, "loss": 1.1845, "step": 6012 }, { "epoch": 2.2386708706123257, "grad_norm": 0.16362962126731873, "learning_rate": 1.8200842678229786e-05, "loss": 1.1731, "step": 6013 }, { "epoch": 2.2390431757629345, "grad_norm": 0.16907905042171478, "learning_rate": 1.8200147161692373e-05, "loss": 1.1932, "step": 6014 }, { "epoch": 2.239415480913544, "grad_norm": 0.1673828512430191, "learning_rate": 1.8199451524039308e-05, "loss": 1.1848, "step": 6015 }, { "epoch": 2.2397877860641526, "grad_norm": 0.16971193253993988, "learning_rate": 1.819875576528085e-05, "loss": 1.1777, "step": 6016 }, { "epoch": 2.240160091214762, "grad_norm": 0.173600435256958, "learning_rate": 1.819805988542729e-05, "loss": 1.1947, "step": 6017 }, { "epoch": 2.240532396365371, "grad_norm": 0.1610695868730545, "learning_rate": 1.81973638844889e-05, "loss": 1.1796, "step": 6018 }, { "epoch": 2.24090470151598, "grad_norm": 0.16606231033802032, "learning_rate": 1.8196667762475953e-05, "loss": 1.1824, "step": 6019 }, { "epoch": 2.2412770066665892, "grad_norm": 0.16269895434379578, "learning_rate": 1.8195971519398744e-05, "loss": 1.1918, "step": 6020 }, { "epoch": 2.241649311817198, "grad_norm": 0.1672276109457016, "learning_rate": 1.8195275155267546e-05, "loss": 1.197, "step": 6021 }, { "epoch": 2.2420216169678073, "grad_norm": 0.16324323415756226, "learning_rate": 1.8194578670092654e-05, "loss": 1.1799, "step": 6022 }, { "epoch": 2.242393922118416, "grad_norm": 0.1610366255044937, "learning_rate": 1.8193882063884346e-05, "loss": 1.1787, "step": 6023 }, { "epoch": 2.2427662272690254, "grad_norm": 0.163301020860672, "learning_rate": 1.8193185336652912e-05, "loss": 1.1851, "step": 6024 }, { "epoch": 2.2431385324196347, "grad_norm": 0.16629861295223236, "learning_rate": 1.819248848840865e-05, "loss": 1.1963, "step": 6025 }, { "epoch": 2.2435108375702435, "grad_norm": 0.1662205159664154, "learning_rate": 1.819179151916184e-05, "loss": 1.189, "step": 6026 }, { "epoch": 2.2438831427208528, "grad_norm": 0.15960603952407837, "learning_rate": 1.819109442892279e-05, "loss": 1.1654, "step": 6027 }, { "epoch": 2.2442554478714616, "grad_norm": 0.16344155371189117, "learning_rate": 1.8190397217701785e-05, "loss": 1.1841, "step": 6028 }, { "epoch": 2.244627753022071, "grad_norm": 0.16831251978874207, "learning_rate": 1.8189699885509128e-05, "loss": 1.1615, "step": 6029 }, { "epoch": 2.2450000581726797, "grad_norm": 0.1629910171031952, "learning_rate": 1.818900243235512e-05, "loss": 1.1961, "step": 6030 }, { "epoch": 2.245372363323289, "grad_norm": 0.16238047182559967, "learning_rate": 1.818830485825006e-05, "loss": 1.1766, "step": 6031 }, { "epoch": 2.2457446684738978, "grad_norm": 0.1695476919412613, "learning_rate": 1.8187607163204246e-05, "loss": 1.173, "step": 6032 }, { "epoch": 2.246116973624507, "grad_norm": 0.16819293797016144, "learning_rate": 1.8186909347227992e-05, "loss": 1.1899, "step": 6033 }, { "epoch": 2.2464892787751163, "grad_norm": 0.16680686175823212, "learning_rate": 1.81862114103316e-05, "loss": 1.1771, "step": 6034 }, { "epoch": 2.246861583925725, "grad_norm": 0.1659124493598938, "learning_rate": 1.818551335252538e-05, "loss": 1.1693, "step": 6035 }, { "epoch": 2.2472338890763344, "grad_norm": 0.15798041224479675, "learning_rate": 1.818481517381964e-05, "loss": 1.1757, "step": 6036 }, { "epoch": 2.247606194226943, "grad_norm": 0.17841339111328125, "learning_rate": 1.8184116874224695e-05, "loss": 1.1828, "step": 6037 }, { "epoch": 2.2479784993775525, "grad_norm": 0.1704692840576172, "learning_rate": 1.818341845375086e-05, "loss": 1.1874, "step": 6038 }, { "epoch": 2.2483508045281613, "grad_norm": 0.16449272632598877, "learning_rate": 1.818271991240844e-05, "loss": 1.1822, "step": 6039 }, { "epoch": 2.2487231096787705, "grad_norm": 0.15764431655406952, "learning_rate": 1.818202125020777e-05, "loss": 1.1709, "step": 6040 }, { "epoch": 2.2490954148293794, "grad_norm": 0.18003995716571808, "learning_rate": 1.8181322467159153e-05, "loss": 1.1801, "step": 6041 }, { "epoch": 2.2494677199799886, "grad_norm": 0.1611638069152832, "learning_rate": 1.818062356327292e-05, "loss": 1.1702, "step": 6042 }, { "epoch": 2.249840025130598, "grad_norm": 0.17060458660125732, "learning_rate": 1.8179924538559385e-05, "loss": 1.1737, "step": 6043 }, { "epoch": 2.2502123302812067, "grad_norm": 0.1664137840270996, "learning_rate": 1.8179225393028883e-05, "loss": 1.1984, "step": 6044 }, { "epoch": 2.250584635431816, "grad_norm": 0.16480514407157898, "learning_rate": 1.8178526126691734e-05, "loss": 1.1802, "step": 6045 }, { "epoch": 2.250956940582425, "grad_norm": 0.1663130819797516, "learning_rate": 1.8177826739558268e-05, "loss": 1.1844, "step": 6046 }, { "epoch": 2.251329245733034, "grad_norm": 0.15989989042282104, "learning_rate": 1.8177127231638815e-05, "loss": 1.1773, "step": 6047 }, { "epoch": 2.251701550883643, "grad_norm": 0.1636444330215454, "learning_rate": 1.8176427602943705e-05, "loss": 1.1707, "step": 6048 }, { "epoch": 2.252073856034252, "grad_norm": 0.16399884223937988, "learning_rate": 1.817572785348327e-05, "loss": 1.1714, "step": 6049 }, { "epoch": 2.252446161184861, "grad_norm": 0.1703033149242401, "learning_rate": 1.817502798326785e-05, "loss": 1.199, "step": 6050 }, { "epoch": 2.2528184663354702, "grad_norm": 0.16354358196258545, "learning_rate": 1.817432799230778e-05, "loss": 1.1994, "step": 6051 }, { "epoch": 2.2531907714860795, "grad_norm": 0.16181591153144836, "learning_rate": 1.8173627880613394e-05, "loss": 1.1794, "step": 6052 }, { "epoch": 2.2535630766366883, "grad_norm": 0.1724006086587906, "learning_rate": 1.8172927648195043e-05, "loss": 1.1879, "step": 6053 }, { "epoch": 2.2539353817872976, "grad_norm": 0.16579163074493408, "learning_rate": 1.8172227295063062e-05, "loss": 1.1827, "step": 6054 }, { "epoch": 2.2543076869379064, "grad_norm": 0.16125434637069702, "learning_rate": 1.81715268212278e-05, "loss": 1.1758, "step": 6055 }, { "epoch": 2.2546799920885157, "grad_norm": 0.16942323744297028, "learning_rate": 1.8170826226699593e-05, "loss": 1.1849, "step": 6056 }, { "epoch": 2.2550522972391245, "grad_norm": 0.1689453423023224, "learning_rate": 1.81701255114888e-05, "loss": 1.1837, "step": 6057 }, { "epoch": 2.2554246023897337, "grad_norm": 0.1727559119462967, "learning_rate": 1.8169424675605766e-05, "loss": 1.1916, "step": 6058 }, { "epoch": 2.2557969075403426, "grad_norm": 0.17401790618896484, "learning_rate": 1.816872371906084e-05, "loss": 1.1693, "step": 6059 }, { "epoch": 2.256169212690952, "grad_norm": 0.16698595881462097, "learning_rate": 1.816802264186438e-05, "loss": 1.18, "step": 6060 }, { "epoch": 2.256541517841561, "grad_norm": 0.1707891970872879, "learning_rate": 1.816732144402673e-05, "loss": 1.1742, "step": 6061 }, { "epoch": 2.25691382299217, "grad_norm": 0.16598151624202728, "learning_rate": 1.8166620125558263e-05, "loss": 1.188, "step": 6062 }, { "epoch": 2.257286128142779, "grad_norm": 0.16978704929351807, "learning_rate": 1.816591868646933e-05, "loss": 1.1884, "step": 6063 }, { "epoch": 2.257658433293388, "grad_norm": 0.16990995407104492, "learning_rate": 1.8165217126770285e-05, "loss": 1.1826, "step": 6064 }, { "epoch": 2.2580307384439973, "grad_norm": 0.1661897599697113, "learning_rate": 1.81645154464715e-05, "loss": 1.1702, "step": 6065 }, { "epoch": 2.258403043594606, "grad_norm": 0.16440676152706146, "learning_rate": 1.816381364558333e-05, "loss": 1.1769, "step": 6066 }, { "epoch": 2.2587753487452154, "grad_norm": 0.18172229826450348, "learning_rate": 1.816311172411615e-05, "loss": 1.1747, "step": 6067 }, { "epoch": 2.259147653895824, "grad_norm": 0.16828443109989166, "learning_rate": 1.816240968208032e-05, "loss": 1.2007, "step": 6068 }, { "epoch": 2.2595199590464334, "grad_norm": 0.16745588183403015, "learning_rate": 1.816170751948621e-05, "loss": 1.1842, "step": 6069 }, { "epoch": 2.2598922641970427, "grad_norm": 0.16287997364997864, "learning_rate": 1.8161005236344193e-05, "loss": 1.1852, "step": 6070 }, { "epoch": 2.2602645693476515, "grad_norm": 0.1797943264245987, "learning_rate": 1.816030283266464e-05, "loss": 1.1838, "step": 6071 }, { "epoch": 2.260636874498261, "grad_norm": 0.1775754690170288, "learning_rate": 1.815960030845793e-05, "loss": 1.1877, "step": 6072 }, { "epoch": 2.2610091796488696, "grad_norm": 0.1645868569612503, "learning_rate": 1.815889766373443e-05, "loss": 1.184, "step": 6073 }, { "epoch": 2.261381484799479, "grad_norm": 0.17757607996463776, "learning_rate": 1.8158194898504526e-05, "loss": 1.1702, "step": 6074 }, { "epoch": 2.2617537899500877, "grad_norm": 0.16869355738162994, "learning_rate": 1.8157492012778598e-05, "loss": 1.1741, "step": 6075 }, { "epoch": 2.262126095100697, "grad_norm": 0.1690586805343628, "learning_rate": 1.8156789006567018e-05, "loss": 1.1676, "step": 6076 }, { "epoch": 2.2624984002513058, "grad_norm": 0.17510120570659637, "learning_rate": 1.815608587988018e-05, "loss": 1.1863, "step": 6077 }, { "epoch": 2.262870705401915, "grad_norm": 0.1718439757823944, "learning_rate": 1.8155382632728468e-05, "loss": 1.1847, "step": 6078 }, { "epoch": 2.2632430105525243, "grad_norm": 0.1764347404241562, "learning_rate": 1.8154679265122265e-05, "loss": 1.1627, "step": 6079 }, { "epoch": 2.263615315703133, "grad_norm": 0.1741929054260254, "learning_rate": 1.815397577707196e-05, "loss": 1.186, "step": 6080 }, { "epoch": 2.2639876208537424, "grad_norm": 0.162635937333107, "learning_rate": 1.8153272168587947e-05, "loss": 1.1809, "step": 6081 }, { "epoch": 2.264359926004351, "grad_norm": 0.16348296403884888, "learning_rate": 1.8152568439680612e-05, "loss": 1.179, "step": 6082 }, { "epoch": 2.2647322311549605, "grad_norm": 0.16939151287078857, "learning_rate": 1.8151864590360354e-05, "loss": 1.1605, "step": 6083 }, { "epoch": 2.2651045363055693, "grad_norm": 0.17128928005695343, "learning_rate": 1.815116062063757e-05, "loss": 1.1947, "step": 6084 }, { "epoch": 2.2654768414561786, "grad_norm": 0.1636468470096588, "learning_rate": 1.8150456530522652e-05, "loss": 1.181, "step": 6085 }, { "epoch": 2.2658491466067874, "grad_norm": 0.1675485074520111, "learning_rate": 1.8149752320026004e-05, "loss": 1.1772, "step": 6086 }, { "epoch": 2.2662214517573966, "grad_norm": 0.17299336194992065, "learning_rate": 1.8149047989158026e-05, "loss": 1.1853, "step": 6087 }, { "epoch": 2.266593756908006, "grad_norm": 0.16324491798877716, "learning_rate": 1.8148343537929118e-05, "loss": 1.1806, "step": 6088 }, { "epoch": 2.2669660620586147, "grad_norm": 0.16224730014801025, "learning_rate": 1.8147638966349687e-05, "loss": 1.1856, "step": 6089 }, { "epoch": 2.267338367209224, "grad_norm": 0.16687463223934174, "learning_rate": 1.8146934274430147e-05, "loss": 1.2029, "step": 6090 }, { "epoch": 2.267710672359833, "grad_norm": 0.16694878041744232, "learning_rate": 1.814622946218089e-05, "loss": 1.1644, "step": 6091 }, { "epoch": 2.268082977510442, "grad_norm": 0.1973126232624054, "learning_rate": 1.814552452961234e-05, "loss": 1.1787, "step": 6092 }, { "epoch": 2.268455282661051, "grad_norm": 0.20395469665527344, "learning_rate": 1.81448194767349e-05, "loss": 1.1916, "step": 6093 }, { "epoch": 2.26882758781166, "grad_norm": 0.1700964868068695, "learning_rate": 1.8144114303558993e-05, "loss": 1.1889, "step": 6094 }, { "epoch": 2.269199892962269, "grad_norm": 0.17254211008548737, "learning_rate": 1.8143409010095028e-05, "loss": 1.1644, "step": 6095 }, { "epoch": 2.2695721981128782, "grad_norm": 0.19276972115039825, "learning_rate": 1.814270359635342e-05, "loss": 1.1883, "step": 6096 }, { "epoch": 2.2699445032634875, "grad_norm": 0.1734069436788559, "learning_rate": 1.814199806234459e-05, "loss": 1.1843, "step": 6097 }, { "epoch": 2.2703168084140963, "grad_norm": 0.1637657731771469, "learning_rate": 1.8141292408078963e-05, "loss": 1.1871, "step": 6098 }, { "epoch": 2.2706891135647056, "grad_norm": 0.17510093748569489, "learning_rate": 1.814058663356696e-05, "loss": 1.1796, "step": 6099 }, { "epoch": 2.2710614187153144, "grad_norm": 0.16966238617897034, "learning_rate": 1.8139880738819e-05, "loss": 1.1683, "step": 6100 }, { "epoch": 2.2714337238659237, "grad_norm": 0.1668512225151062, "learning_rate": 1.8139174723845513e-05, "loss": 1.1993, "step": 6101 }, { "epoch": 2.2718060290165325, "grad_norm": 0.17251057922840118, "learning_rate": 1.8138468588656922e-05, "loss": 1.19, "step": 6102 }, { "epoch": 2.2721783341671418, "grad_norm": 0.1686212718486786, "learning_rate": 1.8137762333263667e-05, "loss": 1.1798, "step": 6103 }, { "epoch": 2.2725506393177506, "grad_norm": 0.16368193924427032, "learning_rate": 1.8137055957676172e-05, "loss": 1.168, "step": 6104 }, { "epoch": 2.27292294446836, "grad_norm": 0.1598677635192871, "learning_rate": 1.8136349461904866e-05, "loss": 1.1762, "step": 6105 }, { "epoch": 2.273295249618969, "grad_norm": 0.16657836735248566, "learning_rate": 1.8135642845960195e-05, "loss": 1.179, "step": 6106 }, { "epoch": 2.273667554769578, "grad_norm": 0.17238342761993408, "learning_rate": 1.8134936109852587e-05, "loss": 1.2088, "step": 6107 }, { "epoch": 2.274039859920187, "grad_norm": 0.17234353721141815, "learning_rate": 1.8134229253592485e-05, "loss": 1.1965, "step": 6108 }, { "epoch": 2.274412165070796, "grad_norm": 0.16358719766139984, "learning_rate": 1.8133522277190324e-05, "loss": 1.1843, "step": 6109 }, { "epoch": 2.2747844702214053, "grad_norm": 0.17148621380329132, "learning_rate": 1.8132815180656554e-05, "loss": 1.1863, "step": 6110 }, { "epoch": 2.275156775372014, "grad_norm": 0.17336560785770416, "learning_rate": 1.813210796400161e-05, "loss": 1.1859, "step": 6111 }, { "epoch": 2.2755290805226234, "grad_norm": 0.17160752415657043, "learning_rate": 1.813140062723594e-05, "loss": 1.1925, "step": 6112 }, { "epoch": 2.275901385673232, "grad_norm": 0.16290338337421417, "learning_rate": 1.8130693170369998e-05, "loss": 1.174, "step": 6113 }, { "epoch": 2.2762736908238415, "grad_norm": 0.16730904579162598, "learning_rate": 1.812998559341423e-05, "loss": 1.1699, "step": 6114 }, { "epoch": 2.2766459959744507, "grad_norm": 0.1715693324804306, "learning_rate": 1.8129277896379077e-05, "loss": 1.1877, "step": 6115 }, { "epoch": 2.2770183011250595, "grad_norm": 0.16124042868614197, "learning_rate": 1.812857007927501e-05, "loss": 1.1604, "step": 6116 }, { "epoch": 2.277390606275669, "grad_norm": 0.155757874250412, "learning_rate": 1.8127862142112463e-05, "loss": 1.1699, "step": 6117 }, { "epoch": 2.2777629114262776, "grad_norm": 0.16820941865444183, "learning_rate": 1.8127154084901906e-05, "loss": 1.1702, "step": 6118 }, { "epoch": 2.278135216576887, "grad_norm": 0.16459953784942627, "learning_rate": 1.8126445907653797e-05, "loss": 1.1873, "step": 6119 }, { "epoch": 2.2785075217274957, "grad_norm": 0.16601146757602692, "learning_rate": 1.8125737610378585e-05, "loss": 1.1809, "step": 6120 }, { "epoch": 2.278879826878105, "grad_norm": 0.16226966679096222, "learning_rate": 1.8125029193086743e-05, "loss": 1.1728, "step": 6121 }, { "epoch": 2.279252132028714, "grad_norm": 0.16306665539741516, "learning_rate": 1.812432065578873e-05, "loss": 1.1887, "step": 6122 }, { "epoch": 2.279624437179323, "grad_norm": 0.15871618688106537, "learning_rate": 1.812361199849501e-05, "loss": 1.1873, "step": 6123 }, { "epoch": 2.2799967423299323, "grad_norm": 0.17421315610408783, "learning_rate": 1.812290322121605e-05, "loss": 1.1949, "step": 6124 }, { "epoch": 2.280369047480541, "grad_norm": 0.16765080392360687, "learning_rate": 1.8122194323962317e-05, "loss": 1.1913, "step": 6125 }, { "epoch": 2.2807413526311504, "grad_norm": 0.15896672010421753, "learning_rate": 1.8121485306744286e-05, "loss": 1.1887, "step": 6126 }, { "epoch": 2.2811136577817592, "grad_norm": 0.16295817494392395, "learning_rate": 1.8120776169572427e-05, "loss": 1.1773, "step": 6127 }, { "epoch": 2.2814859629323685, "grad_norm": 0.16846707463264465, "learning_rate": 1.8120066912457216e-05, "loss": 1.1831, "step": 6128 }, { "epoch": 2.2818582680829778, "grad_norm": 0.1673799604177475, "learning_rate": 1.811935753540912e-05, "loss": 1.1824, "step": 6129 }, { "epoch": 2.2822305732335866, "grad_norm": 0.1708277016878128, "learning_rate": 1.8118648038438627e-05, "loss": 1.1776, "step": 6130 }, { "epoch": 2.2826028783841954, "grad_norm": 0.170358344912529, "learning_rate": 1.811793842155621e-05, "loss": 1.1843, "step": 6131 }, { "epoch": 2.2829751835348047, "grad_norm": 0.17161040008068085, "learning_rate": 1.8117228684772358e-05, "loss": 1.1901, "step": 6132 }, { "epoch": 2.283347488685414, "grad_norm": 0.16947078704833984, "learning_rate": 1.811651882809754e-05, "loss": 1.1788, "step": 6133 }, { "epoch": 2.2837197938360227, "grad_norm": 0.1724851131439209, "learning_rate": 1.8115808851542255e-05, "loss": 1.1903, "step": 6134 }, { "epoch": 2.284092098986632, "grad_norm": 0.17149105668067932, "learning_rate": 1.8115098755116974e-05, "loss": 1.1849, "step": 6135 }, { "epoch": 2.284464404137241, "grad_norm": 0.16904261708259583, "learning_rate": 1.81143885388322e-05, "loss": 1.18, "step": 6136 }, { "epoch": 2.28483670928785, "grad_norm": 0.16865547001361847, "learning_rate": 1.811367820269842e-05, "loss": 1.1773, "step": 6137 }, { "epoch": 2.2852090144384594, "grad_norm": 0.16392925381660461, "learning_rate": 1.811296774672611e-05, "loss": 1.1895, "step": 6138 }, { "epoch": 2.285581319589068, "grad_norm": 0.17182591557502747, "learning_rate": 1.8112257170925785e-05, "loss": 1.184, "step": 6139 }, { "epoch": 2.2859536247396774, "grad_norm": 0.16225169599056244, "learning_rate": 1.8111546475307927e-05, "loss": 1.1836, "step": 6140 }, { "epoch": 2.2863259298902863, "grad_norm": 0.16837048530578613, "learning_rate": 1.811083565988304e-05, "loss": 1.1813, "step": 6141 }, { "epoch": 2.2866982350408955, "grad_norm": 0.16455179452896118, "learning_rate": 1.8110124724661617e-05, "loss": 1.1837, "step": 6142 }, { "epoch": 2.2870705401915044, "grad_norm": 0.19278612732887268, "learning_rate": 1.810941366965416e-05, "loss": 1.177, "step": 6143 }, { "epoch": 2.2874428453421136, "grad_norm": 0.18453273177146912, "learning_rate": 1.8108702494871173e-05, "loss": 1.1796, "step": 6144 }, { "epoch": 2.2878151504927224, "grad_norm": 0.17446112632751465, "learning_rate": 1.8107991200323162e-05, "loss": 1.1892, "step": 6145 }, { "epoch": 2.2881874556433317, "grad_norm": 0.15920548141002655, "learning_rate": 1.8107279786020627e-05, "loss": 1.1722, "step": 6146 }, { "epoch": 2.288559760793941, "grad_norm": 0.21259953081607819, "learning_rate": 1.8106568251974077e-05, "loss": 1.1791, "step": 6147 }, { "epoch": 2.28893206594455, "grad_norm": 0.16296431422233582, "learning_rate": 1.8105856598194026e-05, "loss": 1.1778, "step": 6148 }, { "epoch": 2.289304371095159, "grad_norm": 0.1709262728691101, "learning_rate": 1.8105144824690977e-05, "loss": 1.179, "step": 6149 }, { "epoch": 2.289676676245768, "grad_norm": 0.163625106215477, "learning_rate": 1.8104432931475454e-05, "loss": 1.1787, "step": 6150 }, { "epoch": 2.290048981396377, "grad_norm": 0.16324341297149658, "learning_rate": 1.810372091855796e-05, "loss": 1.1714, "step": 6151 }, { "epoch": 2.290421286546986, "grad_norm": 0.16584549844264984, "learning_rate": 1.8103008785949015e-05, "loss": 1.173, "step": 6152 }, { "epoch": 2.290793591697595, "grad_norm": 0.17169234156608582, "learning_rate": 1.8102296533659146e-05, "loss": 1.1747, "step": 6153 }, { "epoch": 2.291165896848204, "grad_norm": 0.17495130002498627, "learning_rate": 1.810158416169886e-05, "loss": 1.1814, "step": 6154 }, { "epoch": 2.2915382019988133, "grad_norm": 0.16563518345355988, "learning_rate": 1.8100871670078687e-05, "loss": 1.1744, "step": 6155 }, { "epoch": 2.2919105071494226, "grad_norm": 0.16511280834674835, "learning_rate": 1.8100159058809146e-05, "loss": 1.184, "step": 6156 }, { "epoch": 2.2922828123000314, "grad_norm": 0.16747237741947174, "learning_rate": 1.8099446327900766e-05, "loss": 1.1883, "step": 6157 }, { "epoch": 2.2926551174506407, "grad_norm": 0.16100740432739258, "learning_rate": 1.809873347736407e-05, "loss": 1.1764, "step": 6158 }, { "epoch": 2.2930274226012495, "grad_norm": 0.16185326874256134, "learning_rate": 1.809802050720959e-05, "loss": 1.1846, "step": 6159 }, { "epoch": 2.2933997277518587, "grad_norm": 0.1648186296224594, "learning_rate": 1.8097307417447855e-05, "loss": 1.1802, "step": 6160 }, { "epoch": 2.2937720329024676, "grad_norm": 0.1759217381477356, "learning_rate": 1.8096594208089397e-05, "loss": 1.1826, "step": 6161 }, { "epoch": 2.294144338053077, "grad_norm": 0.1690814197063446, "learning_rate": 1.809588087914475e-05, "loss": 1.1606, "step": 6162 }, { "epoch": 2.2945166432036856, "grad_norm": 0.1603320837020874, "learning_rate": 1.8095167430624454e-05, "loss": 1.1739, "step": 6163 }, { "epoch": 2.294888948354295, "grad_norm": 0.16544067859649658, "learning_rate": 1.809445386253904e-05, "loss": 1.1809, "step": 6164 }, { "epoch": 2.295261253504904, "grad_norm": 0.16254539787769318, "learning_rate": 1.809374017489905e-05, "loss": 1.1904, "step": 6165 }, { "epoch": 2.295633558655513, "grad_norm": 0.16620919108390808, "learning_rate": 1.809302636771503e-05, "loss": 1.1834, "step": 6166 }, { "epoch": 2.2960058638061223, "grad_norm": 0.16282342374324799, "learning_rate": 1.809231244099751e-05, "loss": 1.1815, "step": 6167 }, { "epoch": 2.296378168956731, "grad_norm": 0.157831609249115, "learning_rate": 1.809159839475705e-05, "loss": 1.1681, "step": 6168 }, { "epoch": 2.2967504741073403, "grad_norm": 0.16010123491287231, "learning_rate": 1.809088422900419e-05, "loss": 1.1629, "step": 6169 }, { "epoch": 2.297122779257949, "grad_norm": 0.16694015264511108, "learning_rate": 1.8090169943749477e-05, "loss": 1.1824, "step": 6170 }, { "epoch": 2.2974950844085584, "grad_norm": 0.15992291271686554, "learning_rate": 1.8089455539003457e-05, "loss": 1.1718, "step": 6171 }, { "epoch": 2.2978673895591673, "grad_norm": 0.1654921919107437, "learning_rate": 1.808874101477669e-05, "loss": 1.1947, "step": 6172 }, { "epoch": 2.2982396947097765, "grad_norm": 0.1608392298221588, "learning_rate": 1.8088026371079728e-05, "loss": 1.1953, "step": 6173 }, { "epoch": 2.298611999860386, "grad_norm": 0.1624198704957962, "learning_rate": 1.8087311607923118e-05, "loss": 1.1819, "step": 6174 }, { "epoch": 2.2989843050109946, "grad_norm": 0.163405641913414, "learning_rate": 1.808659672531743e-05, "loss": 1.1877, "step": 6175 }, { "epoch": 2.299356610161604, "grad_norm": 0.16340793669223785, "learning_rate": 1.8085881723273215e-05, "loss": 1.1771, "step": 6176 }, { "epoch": 2.2997289153122127, "grad_norm": 0.16229262948036194, "learning_rate": 1.808516660180103e-05, "loss": 1.1862, "step": 6177 }, { "epoch": 2.300101220462822, "grad_norm": 0.15974633395671844, "learning_rate": 1.808445136091144e-05, "loss": 1.1828, "step": 6178 }, { "epoch": 2.3004735256134308, "grad_norm": 0.15938468277454376, "learning_rate": 1.8083736000615017e-05, "loss": 1.1667, "step": 6179 }, { "epoch": 2.30084583076404, "grad_norm": 0.1704174429178238, "learning_rate": 1.808302052092232e-05, "loss": 1.1785, "step": 6180 }, { "epoch": 2.301218135914649, "grad_norm": 0.15925031900405884, "learning_rate": 1.8082304921843913e-05, "loss": 1.1729, "step": 6181 }, { "epoch": 2.301590441065258, "grad_norm": 0.17216730117797852, "learning_rate": 1.8081589203390374e-05, "loss": 1.1874, "step": 6182 }, { "epoch": 2.3019627462158674, "grad_norm": 0.16319389641284943, "learning_rate": 1.8080873365572265e-05, "loss": 1.1853, "step": 6183 }, { "epoch": 2.302335051366476, "grad_norm": 0.16606274247169495, "learning_rate": 1.8080157408400167e-05, "loss": 1.1777, "step": 6184 }, { "epoch": 2.3027073565170855, "grad_norm": 0.16718682646751404, "learning_rate": 1.807944133188465e-05, "loss": 1.1713, "step": 6185 }, { "epoch": 2.3030796616676943, "grad_norm": 0.16058477759361267, "learning_rate": 1.8078725136036292e-05, "loss": 1.171, "step": 6186 }, { "epoch": 2.3034519668183036, "grad_norm": 0.1608082354068756, "learning_rate": 1.8078008820865667e-05, "loss": 1.1773, "step": 6187 }, { "epoch": 2.3038242719689124, "grad_norm": 0.16686289012432098, "learning_rate": 1.8077292386383364e-05, "loss": 1.1741, "step": 6188 }, { "epoch": 2.3041965771195216, "grad_norm": 0.16598117351531982, "learning_rate": 1.8076575832599957e-05, "loss": 1.1979, "step": 6189 }, { "epoch": 2.3045688822701305, "grad_norm": 0.16313214600086212, "learning_rate": 1.8075859159526033e-05, "loss": 1.1774, "step": 6190 }, { "epoch": 2.3049411874207397, "grad_norm": 0.16657821834087372, "learning_rate": 1.8075142367172175e-05, "loss": 1.1888, "step": 6191 }, { "epoch": 2.305313492571349, "grad_norm": 0.16192109882831573, "learning_rate": 1.8074425455548972e-05, "loss": 1.1688, "step": 6192 }, { "epoch": 2.305685797721958, "grad_norm": 0.16952742636203766, "learning_rate": 1.807370842466701e-05, "loss": 1.1592, "step": 6193 }, { "epoch": 2.306058102872567, "grad_norm": 0.15994684398174286, "learning_rate": 1.8072991274536883e-05, "loss": 1.1903, "step": 6194 }, { "epoch": 2.306430408023176, "grad_norm": 0.16636525094509125, "learning_rate": 1.807227400516918e-05, "loss": 1.1754, "step": 6195 }, { "epoch": 2.306802713173785, "grad_norm": 0.16642355918884277, "learning_rate": 1.8071556616574498e-05, "loss": 1.182, "step": 6196 }, { "epoch": 2.307175018324394, "grad_norm": 0.1568913608789444, "learning_rate": 1.807083910876343e-05, "loss": 1.1825, "step": 6197 }, { "epoch": 2.3075473234750032, "grad_norm": 0.16373074054718018, "learning_rate": 1.8070121481746576e-05, "loss": 1.1873, "step": 6198 }, { "epoch": 2.307919628625612, "grad_norm": 0.16669581830501556, "learning_rate": 1.8069403735534533e-05, "loss": 1.1804, "step": 6199 }, { "epoch": 2.3082919337762213, "grad_norm": 0.1655244082212448, "learning_rate": 1.8068685870137906e-05, "loss": 1.194, "step": 6200 }, { "epoch": 2.3086642389268306, "grad_norm": 0.16131706535816193, "learning_rate": 1.8067967885567292e-05, "loss": 1.172, "step": 6201 }, { "epoch": 2.3090365440774394, "grad_norm": 0.1645621359348297, "learning_rate": 1.80672497818333e-05, "loss": 1.1778, "step": 6202 }, { "epoch": 2.3094088492280487, "grad_norm": 0.1699766367673874, "learning_rate": 1.8066531558946537e-05, "loss": 1.1868, "step": 6203 }, { "epoch": 2.3097811543786575, "grad_norm": 0.16291096806526184, "learning_rate": 1.8065813216917604e-05, "loss": 1.1798, "step": 6204 }, { "epoch": 2.3101534595292668, "grad_norm": 0.16420194506645203, "learning_rate": 1.806509475575712e-05, "loss": 1.1724, "step": 6205 }, { "epoch": 2.3105257646798756, "grad_norm": 0.1601489931344986, "learning_rate": 1.806437617547569e-05, "loss": 1.1648, "step": 6206 }, { "epoch": 2.310898069830485, "grad_norm": 0.17182345688343048, "learning_rate": 1.806365747608393e-05, "loss": 1.1888, "step": 6207 }, { "epoch": 2.3112703749810937, "grad_norm": 0.16343766450881958, "learning_rate": 1.806293865759246e-05, "loss": 1.17, "step": 6208 }, { "epoch": 2.311642680131703, "grad_norm": 0.16800105571746826, "learning_rate": 1.806221972001189e-05, "loss": 1.1749, "step": 6209 }, { "epoch": 2.312014985282312, "grad_norm": 0.16388358175754547, "learning_rate": 1.806150066335284e-05, "loss": 1.1791, "step": 6210 }, { "epoch": 2.312387290432921, "grad_norm": 0.1601734161376953, "learning_rate": 1.8060781487625927e-05, "loss": 1.181, "step": 6211 }, { "epoch": 2.3127595955835303, "grad_norm": 0.16823667287826538, "learning_rate": 1.806006219284178e-05, "loss": 1.176, "step": 6212 }, { "epoch": 2.313131900734139, "grad_norm": 0.16742992401123047, "learning_rate": 1.805934277901102e-05, "loss": 1.175, "step": 6213 }, { "epoch": 2.3135042058847484, "grad_norm": 0.16113218665122986, "learning_rate": 1.8058623246144274e-05, "loss": 1.1971, "step": 6214 }, { "epoch": 2.313876511035357, "grad_norm": 0.1706179678440094, "learning_rate": 1.805790359425217e-05, "loss": 1.1806, "step": 6215 }, { "epoch": 2.3142488161859665, "grad_norm": 0.16922074556350708, "learning_rate": 1.8057183823345333e-05, "loss": 1.184, "step": 6216 }, { "epoch": 2.3146211213365753, "grad_norm": 0.16326811909675598, "learning_rate": 1.80564639334344e-05, "loss": 1.178, "step": 6217 }, { "epoch": 2.3149934264871845, "grad_norm": 0.1662980020046234, "learning_rate": 1.8055743924529996e-05, "loss": 1.1884, "step": 6218 }, { "epoch": 2.315365731637794, "grad_norm": 0.16735336184501648, "learning_rate": 1.805502379664276e-05, "loss": 1.1828, "step": 6219 }, { "epoch": 2.3157380367884026, "grad_norm": 0.16329142451286316, "learning_rate": 1.805430354978333e-05, "loss": 1.1834, "step": 6220 }, { "epoch": 2.316110341939012, "grad_norm": 0.16252319514751434, "learning_rate": 1.8053583183962342e-05, "loss": 1.1639, "step": 6221 }, { "epoch": 2.3164826470896207, "grad_norm": 0.16563460230827332, "learning_rate": 1.8052862699190435e-05, "loss": 1.1942, "step": 6222 }, { "epoch": 2.31685495224023, "grad_norm": 0.16497038304805756, "learning_rate": 1.8052142095478253e-05, "loss": 1.1846, "step": 6223 }, { "epoch": 2.317227257390839, "grad_norm": 0.1694660484790802, "learning_rate": 1.8051421372836438e-05, "loss": 1.1886, "step": 6224 }, { "epoch": 2.317599562541448, "grad_norm": 0.1612187922000885, "learning_rate": 1.8050700531275632e-05, "loss": 1.1802, "step": 6225 }, { "epoch": 2.317971867692057, "grad_norm": 0.166747584939003, "learning_rate": 1.8049979570806485e-05, "loss": 1.1878, "step": 6226 }, { "epoch": 2.318344172842666, "grad_norm": 0.17181634902954102, "learning_rate": 1.8049258491439644e-05, "loss": 1.1834, "step": 6227 }, { "epoch": 2.3187164779932754, "grad_norm": 0.16277888417243958, "learning_rate": 1.8048537293185763e-05, "loss": 1.1808, "step": 6228 }, { "epoch": 2.3190887831438842, "grad_norm": 0.16461539268493652, "learning_rate": 1.804781597605549e-05, "loss": 1.189, "step": 6229 }, { "epoch": 2.3194610882944935, "grad_norm": 0.1649829000234604, "learning_rate": 1.8047094540059478e-05, "loss": 1.1877, "step": 6230 }, { "epoch": 2.3198333934451023, "grad_norm": 0.16547247767448425, "learning_rate": 1.804637298520839e-05, "loss": 1.1789, "step": 6231 }, { "epoch": 2.3202056985957116, "grad_norm": 0.15987227857112885, "learning_rate": 1.804565131151287e-05, "loss": 1.1692, "step": 6232 }, { "epoch": 2.3205780037463204, "grad_norm": 0.16262219846248627, "learning_rate": 1.8044929518983592e-05, "loss": 1.1858, "step": 6233 }, { "epoch": 2.3209503088969297, "grad_norm": 0.16427870094776154, "learning_rate": 1.8044207607631206e-05, "loss": 1.18, "step": 6234 }, { "epoch": 2.3213226140475385, "grad_norm": 0.1622522473335266, "learning_rate": 1.804348557746638e-05, "loss": 1.1718, "step": 6235 }, { "epoch": 2.3216949191981477, "grad_norm": 0.15820512175559998, "learning_rate": 1.8042763428499777e-05, "loss": 1.1816, "step": 6236 }, { "epoch": 2.322067224348757, "grad_norm": 0.16606608033180237, "learning_rate": 1.8042041160742062e-05, "loss": 1.1708, "step": 6237 }, { "epoch": 2.322439529499366, "grad_norm": 0.17201752960681915, "learning_rate": 1.8041318774203908e-05, "loss": 1.1814, "step": 6238 }, { "epoch": 2.322811834649975, "grad_norm": 0.16523173451423645, "learning_rate": 1.8040596268895973e-05, "loss": 1.1717, "step": 6239 }, { "epoch": 2.323184139800584, "grad_norm": 0.16738300025463104, "learning_rate": 1.803987364482894e-05, "loss": 1.1809, "step": 6240 }, { "epoch": 2.323556444951193, "grad_norm": 0.1602403223514557, "learning_rate": 1.8039150902013478e-05, "loss": 1.1754, "step": 6241 }, { "epoch": 2.323928750101802, "grad_norm": 0.16947504878044128, "learning_rate": 1.803842804046026e-05, "loss": 1.1869, "step": 6242 }, { "epoch": 2.3243010552524113, "grad_norm": 0.16907165944576263, "learning_rate": 1.8037705060179965e-05, "loss": 1.1857, "step": 6243 }, { "epoch": 2.32467336040302, "grad_norm": 0.17978952825069427, "learning_rate": 1.8036981961183273e-05, "loss": 1.1801, "step": 6244 }, { "epoch": 2.3250456655536293, "grad_norm": 0.16338391602039337, "learning_rate": 1.803625874348086e-05, "loss": 1.1679, "step": 6245 }, { "epoch": 2.3254179707042386, "grad_norm": 0.17273187637329102, "learning_rate": 1.803553540708341e-05, "loss": 1.1749, "step": 6246 }, { "epoch": 2.3257902758548474, "grad_norm": 0.17095768451690674, "learning_rate": 1.8034811952001602e-05, "loss": 1.1663, "step": 6247 }, { "epoch": 2.3261625810054567, "grad_norm": 0.16900064051151276, "learning_rate": 1.803408837824613e-05, "loss": 1.1831, "step": 6248 }, { "epoch": 2.3265348861560655, "grad_norm": 0.17305012047290802, "learning_rate": 1.8033364685827677e-05, "loss": 1.1898, "step": 6249 }, { "epoch": 2.326907191306675, "grad_norm": 0.1679958999156952, "learning_rate": 1.8032640874756932e-05, "loss": 1.1713, "step": 6250 }, { "epoch": 2.327279496457284, "grad_norm": 0.1740444004535675, "learning_rate": 1.8031916945044586e-05, "loss": 1.194, "step": 6251 }, { "epoch": 2.327651801607893, "grad_norm": 0.1582871377468109, "learning_rate": 1.803119289670133e-05, "loss": 1.1891, "step": 6252 }, { "epoch": 2.3280241067585017, "grad_norm": 0.17062443494796753, "learning_rate": 1.8030468729737856e-05, "loss": 1.185, "step": 6253 }, { "epoch": 2.328396411909111, "grad_norm": 0.16043491661548615, "learning_rate": 1.802974444416487e-05, "loss": 1.1798, "step": 6254 }, { "epoch": 2.32876871705972, "grad_norm": 0.17315945029258728, "learning_rate": 1.8029020039993055e-05, "loss": 1.1855, "step": 6255 }, { "epoch": 2.329141022210329, "grad_norm": 0.16167216002941132, "learning_rate": 1.802829551723312e-05, "loss": 1.1834, "step": 6256 }, { "epoch": 2.3295133273609383, "grad_norm": 0.17070269584655762, "learning_rate": 1.8027570875895762e-05, "loss": 1.1805, "step": 6257 }, { "epoch": 2.329885632511547, "grad_norm": 0.1713981032371521, "learning_rate": 1.802684611599169e-05, "loss": 1.1893, "step": 6258 }, { "epoch": 2.3302579376621564, "grad_norm": 0.17081809043884277, "learning_rate": 1.80261212375316e-05, "loss": 1.1838, "step": 6259 }, { "epoch": 2.3306302428127657, "grad_norm": 0.16853542625904083, "learning_rate": 1.8025396240526208e-05, "loss": 1.1858, "step": 6260 }, { "epoch": 2.3310025479633745, "grad_norm": 0.1681629717350006, "learning_rate": 1.8024671124986218e-05, "loss": 1.177, "step": 6261 }, { "epoch": 2.3313748531139837, "grad_norm": 0.1574486941099167, "learning_rate": 1.8023945890922334e-05, "loss": 1.1778, "step": 6262 }, { "epoch": 2.3317471582645926, "grad_norm": 0.16776683926582336, "learning_rate": 1.8023220538345276e-05, "loss": 1.1888, "step": 6263 }, { "epoch": 2.332119463415202, "grad_norm": 0.16034848988056183, "learning_rate": 1.802249506726575e-05, "loss": 1.1826, "step": 6264 }, { "epoch": 2.3324917685658106, "grad_norm": 0.16641570627689362, "learning_rate": 1.8021769477694482e-05, "loss": 1.1863, "step": 6265 }, { "epoch": 2.33286407371642, "grad_norm": 0.16648726165294647, "learning_rate": 1.802104376964218e-05, "loss": 1.1586, "step": 6266 }, { "epoch": 2.3332363788670287, "grad_norm": 0.16356948018074036, "learning_rate": 1.8020317943119563e-05, "loss": 1.1854, "step": 6267 }, { "epoch": 2.333608684017638, "grad_norm": 0.16005860269069672, "learning_rate": 1.8019591998137355e-05, "loss": 1.1887, "step": 6268 }, { "epoch": 2.3339809891682473, "grad_norm": 0.1607915163040161, "learning_rate": 1.8018865934706277e-05, "loss": 1.1765, "step": 6269 }, { "epoch": 2.334353294318856, "grad_norm": 0.1606864184141159, "learning_rate": 1.801813975283705e-05, "loss": 1.182, "step": 6270 }, { "epoch": 2.3347255994694653, "grad_norm": 0.17700619995594025, "learning_rate": 1.80174134525404e-05, "loss": 1.1745, "step": 6271 }, { "epoch": 2.335097904620074, "grad_norm": 0.1743892878293991, "learning_rate": 1.801668703382706e-05, "loss": 1.174, "step": 6272 }, { "epoch": 2.3354702097706834, "grad_norm": 0.16445204615592957, "learning_rate": 1.8015960496707756e-05, "loss": 1.1716, "step": 6273 }, { "epoch": 2.3358425149212922, "grad_norm": 0.1924527883529663, "learning_rate": 1.8015233841193218e-05, "loss": 1.1676, "step": 6274 }, { "epoch": 2.3362148200719015, "grad_norm": 0.19679906964302063, "learning_rate": 1.8014507067294177e-05, "loss": 1.1849, "step": 6275 }, { "epoch": 2.3365871252225103, "grad_norm": 0.16511952877044678, "learning_rate": 1.8013780175021373e-05, "loss": 1.2015, "step": 6276 }, { "epoch": 2.3369594303731196, "grad_norm": 0.23659729957580566, "learning_rate": 1.8013053164385538e-05, "loss": 1.1712, "step": 6277 }, { "epoch": 2.337331735523729, "grad_norm": 0.19166865944862366, "learning_rate": 1.8012326035397407e-05, "loss": 1.1822, "step": 6278 }, { "epoch": 2.3377040406743377, "grad_norm": 0.1799168735742569, "learning_rate": 1.8011598788067728e-05, "loss": 1.1751, "step": 6279 }, { "epoch": 2.338076345824947, "grad_norm": 0.16195322573184967, "learning_rate": 1.8010871422407238e-05, "loss": 1.1822, "step": 6280 }, { "epoch": 2.3384486509755558, "grad_norm": 0.18496288359165192, "learning_rate": 1.8010143938426674e-05, "loss": 1.1882, "step": 6281 }, { "epoch": 2.338820956126165, "grad_norm": 0.16737127304077148, "learning_rate": 1.800941633613679e-05, "loss": 1.1897, "step": 6282 }, { "epoch": 2.339193261276774, "grad_norm": 0.16343113780021667, "learning_rate": 1.800868861554833e-05, "loss": 1.1761, "step": 6283 }, { "epoch": 2.339565566427383, "grad_norm": 0.16952070593833923, "learning_rate": 1.8007960776672043e-05, "loss": 1.1742, "step": 6284 }, { "epoch": 2.339937871577992, "grad_norm": 0.16918697953224182, "learning_rate": 1.8007232819518675e-05, "loss": 1.1809, "step": 6285 }, { "epoch": 2.340310176728601, "grad_norm": 0.16563785076141357, "learning_rate": 1.800650474409898e-05, "loss": 1.1709, "step": 6286 }, { "epoch": 2.3406824818792105, "grad_norm": 0.1735762506723404, "learning_rate": 1.8005776550423718e-05, "loss": 1.1761, "step": 6287 }, { "epoch": 2.3410547870298193, "grad_norm": 0.16570459306240082, "learning_rate": 1.8005048238503633e-05, "loss": 1.1778, "step": 6288 }, { "epoch": 2.3414270921804285, "grad_norm": 0.16612395644187927, "learning_rate": 1.800431980834949e-05, "loss": 1.1772, "step": 6289 }, { "epoch": 2.3417993973310374, "grad_norm": 0.16212446987628937, "learning_rate": 1.8003591259972047e-05, "loss": 1.1854, "step": 6290 }, { "epoch": 2.3421717024816466, "grad_norm": 0.1619550585746765, "learning_rate": 1.8002862593382063e-05, "loss": 1.1682, "step": 6291 }, { "epoch": 2.3425440076322555, "grad_norm": 0.16271120309829712, "learning_rate": 1.80021338085903e-05, "loss": 1.1646, "step": 6292 }, { "epoch": 2.3429163127828647, "grad_norm": 0.1650598645210266, "learning_rate": 1.8001404905607523e-05, "loss": 1.189, "step": 6293 }, { "epoch": 2.3432886179334735, "grad_norm": 0.16364504396915436, "learning_rate": 1.8000675884444495e-05, "loss": 1.1838, "step": 6294 }, { "epoch": 2.343660923084083, "grad_norm": 0.16529352962970734, "learning_rate": 1.7999946745111993e-05, "loss": 1.1908, "step": 6295 }, { "epoch": 2.344033228234692, "grad_norm": 0.1709616482257843, "learning_rate": 1.7999217487620773e-05, "loss": 1.1731, "step": 6296 }, { "epoch": 2.344405533385301, "grad_norm": 0.16353869438171387, "learning_rate": 1.7998488111981616e-05, "loss": 1.1748, "step": 6297 }, { "epoch": 2.34477783853591, "grad_norm": 0.16740430891513824, "learning_rate": 1.799775861820529e-05, "loss": 1.1674, "step": 6298 }, { "epoch": 2.345150143686519, "grad_norm": 0.1658317595720291, "learning_rate": 1.7997029006302572e-05, "loss": 1.1709, "step": 6299 }, { "epoch": 2.3455224488371282, "grad_norm": 0.1644759625196457, "learning_rate": 1.799629927628424e-05, "loss": 1.184, "step": 6300 }, { "epoch": 2.345894753987737, "grad_norm": 0.1665249615907669, "learning_rate": 1.7995569428161066e-05, "loss": 1.1826, "step": 6301 }, { "epoch": 2.3462670591383463, "grad_norm": 0.1632251888513565, "learning_rate": 1.7994839461943834e-05, "loss": 1.1614, "step": 6302 }, { "epoch": 2.346639364288955, "grad_norm": 0.16572819650173187, "learning_rate": 1.7994109377643326e-05, "loss": 1.1768, "step": 6303 }, { "epoch": 2.3470116694395644, "grad_norm": 0.16670545935630798, "learning_rate": 1.7993379175270323e-05, "loss": 1.1751, "step": 6304 }, { "epoch": 2.3473839745901737, "grad_norm": 0.16433420777320862, "learning_rate": 1.7992648854835607e-05, "loss": 1.1997, "step": 6305 }, { "epoch": 2.3477562797407825, "grad_norm": 0.16783735156059265, "learning_rate": 1.7991918416349977e-05, "loss": 1.1837, "step": 6306 }, { "epoch": 2.3481285848913918, "grad_norm": 0.166385218501091, "learning_rate": 1.799118785982421e-05, "loss": 1.196, "step": 6307 }, { "epoch": 2.3485008900420006, "grad_norm": 0.1664101630449295, "learning_rate": 1.79904571852691e-05, "loss": 1.1839, "step": 6308 }, { "epoch": 2.34887319519261, "grad_norm": 0.16921697556972504, "learning_rate": 1.7989726392695438e-05, "loss": 1.1826, "step": 6309 }, { "epoch": 2.3492455003432187, "grad_norm": 0.16790980100631714, "learning_rate": 1.798899548211402e-05, "loss": 1.1739, "step": 6310 }, { "epoch": 2.349617805493828, "grad_norm": 0.16884508728981018, "learning_rate": 1.798826445353564e-05, "loss": 1.1825, "step": 6311 }, { "epoch": 2.3499901106444367, "grad_norm": 0.16576768457889557, "learning_rate": 1.7987533306971093e-05, "loss": 1.1503, "step": 6312 }, { "epoch": 2.350362415795046, "grad_norm": 0.1681016981601715, "learning_rate": 1.798680204243118e-05, "loss": 1.1753, "step": 6313 }, { "epoch": 2.3507347209456553, "grad_norm": 0.17938603460788727, "learning_rate": 1.7986070659926705e-05, "loss": 1.1865, "step": 6314 }, { "epoch": 2.351107026096264, "grad_norm": 0.16416122019290924, "learning_rate": 1.798533915946847e-05, "loss": 1.1693, "step": 6315 }, { "epoch": 2.3514793312468734, "grad_norm": 0.1787365823984146, "learning_rate": 1.7984607541067272e-05, "loss": 1.174, "step": 6316 }, { "epoch": 2.351851636397482, "grad_norm": 0.16736140847206116, "learning_rate": 1.7983875804733925e-05, "loss": 1.1772, "step": 6317 }, { "epoch": 2.3522239415480914, "grad_norm": 0.17561602592468262, "learning_rate": 1.798314395047923e-05, "loss": 1.1793, "step": 6318 }, { "epoch": 2.3525962466987003, "grad_norm": 0.16405464708805084, "learning_rate": 1.7982411978314e-05, "loss": 1.1866, "step": 6319 }, { "epoch": 2.3529685518493095, "grad_norm": 0.1719312071800232, "learning_rate": 1.798167988824905e-05, "loss": 1.1743, "step": 6320 }, { "epoch": 2.3533408569999184, "grad_norm": 0.17220060527324677, "learning_rate": 1.7980947680295187e-05, "loss": 1.1719, "step": 6321 }, { "epoch": 2.3537131621505276, "grad_norm": 0.1724272072315216, "learning_rate": 1.7980215354463223e-05, "loss": 1.1777, "step": 6322 }, { "epoch": 2.354085467301137, "grad_norm": 0.17141182720661163, "learning_rate": 1.7979482910763984e-05, "loss": 1.1822, "step": 6323 }, { "epoch": 2.3544577724517457, "grad_norm": 0.18284828960895538, "learning_rate": 1.7978750349208284e-05, "loss": 1.1885, "step": 6324 }, { "epoch": 2.354830077602355, "grad_norm": 0.16586068272590637, "learning_rate": 1.797801766980694e-05, "loss": 1.1677, "step": 6325 }, { "epoch": 2.355202382752964, "grad_norm": 0.1680738776922226, "learning_rate": 1.7977284872570775e-05, "loss": 1.1888, "step": 6326 }, { "epoch": 2.355574687903573, "grad_norm": 0.16332261264324188, "learning_rate": 1.7976551957510614e-05, "loss": 1.1771, "step": 6327 }, { "epoch": 2.355946993054182, "grad_norm": 0.16689446568489075, "learning_rate": 1.797581892463728e-05, "loss": 1.1698, "step": 6328 }, { "epoch": 2.356319298204791, "grad_norm": 0.17073221504688263, "learning_rate": 1.7975085773961606e-05, "loss": 1.1931, "step": 6329 }, { "epoch": 2.3566916033554, "grad_norm": 0.16630737483501434, "learning_rate": 1.797435250549441e-05, "loss": 1.1698, "step": 6330 }, { "epoch": 2.357063908506009, "grad_norm": 0.19237372279167175, "learning_rate": 1.797361911924653e-05, "loss": 1.1976, "step": 6331 }, { "epoch": 2.3574362136566185, "grad_norm": 0.1919546127319336, "learning_rate": 1.7972885615228796e-05, "loss": 1.1915, "step": 6332 }, { "epoch": 2.3578085188072273, "grad_norm": 0.16632108390331268, "learning_rate": 1.797215199345204e-05, "loss": 1.187, "step": 6333 }, { "epoch": 2.3581808239578366, "grad_norm": 0.26524388790130615, "learning_rate": 1.7971418253927103e-05, "loss": 1.1899, "step": 6334 }, { "epoch": 2.3585531291084454, "grad_norm": 0.20467346906661987, "learning_rate": 1.7970684396664814e-05, "loss": 1.1828, "step": 6335 }, { "epoch": 2.3589254342590547, "grad_norm": 0.18540099263191223, "learning_rate": 1.7969950421676022e-05, "loss": 1.1699, "step": 6336 }, { "epoch": 2.3592977394096635, "grad_norm": 0.17078468203544617, "learning_rate": 1.796921632897156e-05, "loss": 1.1846, "step": 6337 }, { "epoch": 2.3596700445602727, "grad_norm": 0.19237884879112244, "learning_rate": 1.796848211856227e-05, "loss": 1.1847, "step": 6338 }, { "epoch": 2.3600423497108816, "grad_norm": 0.1824292242527008, "learning_rate": 1.7967747790459006e-05, "loss": 1.1699, "step": 6339 }, { "epoch": 2.360414654861491, "grad_norm": 0.17438608407974243, "learning_rate": 1.7967013344672602e-05, "loss": 1.1848, "step": 6340 }, { "epoch": 2.3607869600121, "grad_norm": 0.16449159383773804, "learning_rate": 1.7966278781213914e-05, "loss": 1.1741, "step": 6341 }, { "epoch": 2.361159265162709, "grad_norm": 0.17680013179779053, "learning_rate": 1.7965544100093785e-05, "loss": 1.1611, "step": 6342 }, { "epoch": 2.361531570313318, "grad_norm": 0.18226385116577148, "learning_rate": 1.796480930132307e-05, "loss": 1.1873, "step": 6343 }, { "epoch": 2.361903875463927, "grad_norm": 0.1675233244895935, "learning_rate": 1.7964074384912624e-05, "loss": 1.1864, "step": 6344 }, { "epoch": 2.3622761806145363, "grad_norm": 0.17054075002670288, "learning_rate": 1.7963339350873296e-05, "loss": 1.1863, "step": 6345 }, { "epoch": 2.362648485765145, "grad_norm": 0.1669616848230362, "learning_rate": 1.7962604199215946e-05, "loss": 1.1782, "step": 6346 }, { "epoch": 2.3630207909157543, "grad_norm": 0.1679152250289917, "learning_rate": 1.7961868929951432e-05, "loss": 1.1859, "step": 6347 }, { "epoch": 2.363393096066363, "grad_norm": 0.1678704470396042, "learning_rate": 1.7961133543090617e-05, "loss": 1.1759, "step": 6348 }, { "epoch": 2.3637654012169724, "grad_norm": 0.15599095821380615, "learning_rate": 1.7960398038644356e-05, "loss": 1.1686, "step": 6349 }, { "epoch": 2.3641377063675817, "grad_norm": 0.16793273389339447, "learning_rate": 1.7959662416623516e-05, "loss": 1.1858, "step": 6350 }, { "epoch": 2.3645100115181905, "grad_norm": 0.16774815320968628, "learning_rate": 1.795892667703896e-05, "loss": 1.1846, "step": 6351 }, { "epoch": 2.3648823166687998, "grad_norm": 0.16648389399051666, "learning_rate": 1.795819081990156e-05, "loss": 1.1811, "step": 6352 }, { "epoch": 2.3652546218194086, "grad_norm": 0.16627708077430725, "learning_rate": 1.795745484522218e-05, "loss": 1.1804, "step": 6353 }, { "epoch": 2.365626926970018, "grad_norm": 0.16126348078250885, "learning_rate": 1.795671875301169e-05, "loss": 1.1786, "step": 6354 }, { "epoch": 2.3659992321206267, "grad_norm": 0.16175805032253265, "learning_rate": 1.7955982543280965e-05, "loss": 1.1678, "step": 6355 }, { "epoch": 2.366371537271236, "grad_norm": 0.16892601549625397, "learning_rate": 1.7955246216040872e-05, "loss": 1.1723, "step": 6356 }, { "epoch": 2.3667438424218448, "grad_norm": 0.1636088341474533, "learning_rate": 1.79545097713023e-05, "loss": 1.1844, "step": 6357 }, { "epoch": 2.367116147572454, "grad_norm": 0.16723226010799408, "learning_rate": 1.795377320907611e-05, "loss": 1.1886, "step": 6358 }, { "epoch": 2.3674884527230633, "grad_norm": 0.17732274532318115, "learning_rate": 1.7953036529373194e-05, "loss": 1.1981, "step": 6359 }, { "epoch": 2.367860757873672, "grad_norm": 0.15954697132110596, "learning_rate": 1.795229973220443e-05, "loss": 1.1924, "step": 6360 }, { "epoch": 2.3682330630242814, "grad_norm": 0.1688387095928192, "learning_rate": 1.795156281758069e-05, "loss": 1.1782, "step": 6361 }, { "epoch": 2.36860536817489, "grad_norm": 0.16727487742900848, "learning_rate": 1.7950825785512873e-05, "loss": 1.1868, "step": 6362 }, { "epoch": 2.3689776733254995, "grad_norm": 0.1692897230386734, "learning_rate": 1.7950088636011853e-05, "loss": 1.1952, "step": 6363 }, { "epoch": 2.3693499784761083, "grad_norm": 0.15988357365131378, "learning_rate": 1.7949351369088526e-05, "loss": 1.1724, "step": 6364 }, { "epoch": 2.3697222836267176, "grad_norm": 0.16070933640003204, "learning_rate": 1.7948613984753777e-05, "loss": 1.1731, "step": 6365 }, { "epoch": 2.3700945887773264, "grad_norm": 0.16724231839179993, "learning_rate": 1.7947876483018498e-05, "loss": 1.1779, "step": 6366 }, { "epoch": 2.3704668939279356, "grad_norm": 0.16751320660114288, "learning_rate": 1.7947138863893582e-05, "loss": 1.1952, "step": 6367 }, { "epoch": 2.370839199078545, "grad_norm": 0.16353389620780945, "learning_rate": 1.7946401127389928e-05, "loss": 1.1899, "step": 6368 }, { "epoch": 2.3712115042291537, "grad_norm": 0.16663357615470886, "learning_rate": 1.7945663273518423e-05, "loss": 1.2004, "step": 6369 }, { "epoch": 2.371583809379763, "grad_norm": 0.1695907711982727, "learning_rate": 1.7944925302289972e-05, "loss": 1.158, "step": 6370 }, { "epoch": 2.371956114530372, "grad_norm": 0.16809728741645813, "learning_rate": 1.794418721371547e-05, "loss": 1.1689, "step": 6371 }, { "epoch": 2.372328419680981, "grad_norm": 0.1606246680021286, "learning_rate": 1.7943449007805824e-05, "loss": 1.1615, "step": 6372 }, { "epoch": 2.3727007248315903, "grad_norm": 0.1651766151189804, "learning_rate": 1.7942710684571934e-05, "loss": 1.183, "step": 6373 }, { "epoch": 2.373073029982199, "grad_norm": 0.16688674688339233, "learning_rate": 1.794197224402471e-05, "loss": 1.1735, "step": 6374 }, { "epoch": 2.3734453351328084, "grad_norm": 0.16931474208831787, "learning_rate": 1.7941233686175052e-05, "loss": 1.169, "step": 6375 }, { "epoch": 2.3738176402834172, "grad_norm": 0.16354526579380035, "learning_rate": 1.7940495011033866e-05, "loss": 1.1828, "step": 6376 }, { "epoch": 2.3741899454340265, "grad_norm": 0.16403120756149292, "learning_rate": 1.7939756218612072e-05, "loss": 1.1755, "step": 6377 }, { "epoch": 2.3745622505846353, "grad_norm": 0.16923734545707703, "learning_rate": 1.7939017308920575e-05, "loss": 1.1704, "step": 6378 }, { "epoch": 2.3749345557352446, "grad_norm": 0.16847112774848938, "learning_rate": 1.793827828197029e-05, "loss": 1.1739, "step": 6379 }, { "epoch": 2.3753068608858534, "grad_norm": 0.17019587755203247, "learning_rate": 1.7937539137772134e-05, "loss": 1.1815, "step": 6380 }, { "epoch": 2.3756791660364627, "grad_norm": 0.16363418102264404, "learning_rate": 1.7936799876337022e-05, "loss": 1.1707, "step": 6381 }, { "epoch": 2.376051471187072, "grad_norm": 0.16532693803310394, "learning_rate": 1.7936060497675875e-05, "loss": 1.174, "step": 6382 }, { "epoch": 2.3764237763376808, "grad_norm": 0.168051615357399, "learning_rate": 1.793532100179961e-05, "loss": 1.1609, "step": 6383 }, { "epoch": 2.37679608148829, "grad_norm": 0.16365954279899597, "learning_rate": 1.7934581388719158e-05, "loss": 1.1779, "step": 6384 }, { "epoch": 2.377168386638899, "grad_norm": 0.16921496391296387, "learning_rate": 1.7933841658445432e-05, "loss": 1.1967, "step": 6385 }, { "epoch": 2.377540691789508, "grad_norm": 0.16775216162204742, "learning_rate": 1.7933101810989363e-05, "loss": 1.1698, "step": 6386 }, { "epoch": 2.377912996940117, "grad_norm": 0.16097751259803772, "learning_rate": 1.793236184636188e-05, "loss": 1.1879, "step": 6387 }, { "epoch": 2.378285302090726, "grad_norm": 0.1658683568239212, "learning_rate": 1.7931621764573907e-05, "loss": 1.1815, "step": 6388 }, { "epoch": 2.378657607241335, "grad_norm": 0.16662012040615082, "learning_rate": 1.793088156563638e-05, "loss": 1.1767, "step": 6389 }, { "epoch": 2.3790299123919443, "grad_norm": 0.16364309191703796, "learning_rate": 1.7930141249560235e-05, "loss": 1.1758, "step": 6390 }, { "epoch": 2.3794022175425535, "grad_norm": 0.17829427123069763, "learning_rate": 1.7929400816356394e-05, "loss": 1.1832, "step": 6391 }, { "epoch": 2.3797745226931624, "grad_norm": 0.16619321703910828, "learning_rate": 1.7928660266035804e-05, "loss": 1.1796, "step": 6392 }, { "epoch": 2.3801468278437716, "grad_norm": 0.16665421426296234, "learning_rate": 1.79279195986094e-05, "loss": 1.1768, "step": 6393 }, { "epoch": 2.3805191329943804, "grad_norm": 0.17309972643852234, "learning_rate": 1.792717881408812e-05, "loss": 1.1811, "step": 6394 }, { "epoch": 2.3808914381449897, "grad_norm": 0.1681647002696991, "learning_rate": 1.7926437912482905e-05, "loss": 1.1738, "step": 6395 }, { "epoch": 2.3812637432955985, "grad_norm": 0.17703039944171906, "learning_rate": 1.79256968938047e-05, "loss": 1.1759, "step": 6396 }, { "epoch": 2.381636048446208, "grad_norm": 0.16433195769786835, "learning_rate": 1.792495575806445e-05, "loss": 1.1746, "step": 6397 }, { "epoch": 2.3820083535968166, "grad_norm": 0.1649874895811081, "learning_rate": 1.7924214505273102e-05, "loss": 1.1785, "step": 6398 }, { "epoch": 2.382380658747426, "grad_norm": 0.17650985717773438, "learning_rate": 1.79234731354416e-05, "loss": 1.1714, "step": 6399 }, { "epoch": 2.382752963898035, "grad_norm": 0.19608263671398163, "learning_rate": 1.7922731648580902e-05, "loss": 1.1805, "step": 6400 }, { "epoch": 2.383125269048644, "grad_norm": 0.18923339247703552, "learning_rate": 1.7921990044701952e-05, "loss": 1.1701, "step": 6401 }, { "epoch": 2.3834975741992532, "grad_norm": 0.1707700490951538, "learning_rate": 1.7921248323815703e-05, "loss": 1.1829, "step": 6402 }, { "epoch": 2.383869879349862, "grad_norm": 0.16345664858818054, "learning_rate": 1.7920506485933117e-05, "loss": 1.1881, "step": 6403 }, { "epoch": 2.3842421845004713, "grad_norm": 0.166402205824852, "learning_rate": 1.7919764531065147e-05, "loss": 1.1603, "step": 6404 }, { "epoch": 2.38461448965108, "grad_norm": 0.17079654335975647, "learning_rate": 1.7919022459222754e-05, "loss": 1.1882, "step": 6405 }, { "epoch": 2.3849867948016894, "grad_norm": 0.16907550394535065, "learning_rate": 1.791828027041689e-05, "loss": 1.1921, "step": 6406 }, { "epoch": 2.3853590999522982, "grad_norm": 0.15836122632026672, "learning_rate": 1.791753796465853e-05, "loss": 1.1894, "step": 6407 }, { "epoch": 2.3857314051029075, "grad_norm": 0.1576787382364273, "learning_rate": 1.791679554195863e-05, "loss": 1.1631, "step": 6408 }, { "epoch": 2.3861037102535168, "grad_norm": 0.1628400981426239, "learning_rate": 1.7916053002328152e-05, "loss": 1.1717, "step": 6409 }, { "epoch": 2.3864760154041256, "grad_norm": 0.16520801186561584, "learning_rate": 1.7915310345778072e-05, "loss": 1.1745, "step": 6410 }, { "epoch": 2.386848320554735, "grad_norm": 0.167642742395401, "learning_rate": 1.7914567572319352e-05, "loss": 1.1679, "step": 6411 }, { "epoch": 2.3872206257053437, "grad_norm": 0.16473732888698578, "learning_rate": 1.791382468196297e-05, "loss": 1.17, "step": 6412 }, { "epoch": 2.387592930855953, "grad_norm": 0.17128442227840424, "learning_rate": 1.791308167471989e-05, "loss": 1.1791, "step": 6413 }, { "epoch": 2.3879652360065617, "grad_norm": 0.17826206982135773, "learning_rate": 1.791233855060109e-05, "loss": 1.1615, "step": 6414 }, { "epoch": 2.388337541157171, "grad_norm": 0.24664366245269775, "learning_rate": 1.7911595309617552e-05, "loss": 1.1762, "step": 6415 }, { "epoch": 2.38870984630778, "grad_norm": 0.2372390776872635, "learning_rate": 1.7910851951780244e-05, "loss": 1.1696, "step": 6416 }, { "epoch": 2.389082151458389, "grad_norm": 0.18587949872016907, "learning_rate": 1.791010847710015e-05, "loss": 1.1784, "step": 6417 }, { "epoch": 2.3894544566089984, "grad_norm": 0.16316647827625275, "learning_rate": 1.7909364885588247e-05, "loss": 1.1808, "step": 6418 }, { "epoch": 2.389826761759607, "grad_norm": 0.19883513450622559, "learning_rate": 1.7908621177255523e-05, "loss": 1.1763, "step": 6419 }, { "epoch": 2.3901990669102164, "grad_norm": 0.1844921112060547, "learning_rate": 1.790787735211296e-05, "loss": 1.1791, "step": 6420 }, { "epoch": 2.3905713720608253, "grad_norm": 0.17219749093055725, "learning_rate": 1.7907133410171548e-05, "loss": 1.1735, "step": 6421 }, { "epoch": 2.3909436772114345, "grad_norm": 0.16659492254257202, "learning_rate": 1.790638935144227e-05, "loss": 1.1776, "step": 6422 }, { "epoch": 2.3913159823620433, "grad_norm": 0.1710551679134369, "learning_rate": 1.7905645175936116e-05, "loss": 1.18, "step": 6423 }, { "epoch": 2.3916882875126526, "grad_norm": 0.17188216745853424, "learning_rate": 1.7904900883664078e-05, "loss": 1.1724, "step": 6424 }, { "epoch": 2.3920605926632614, "grad_norm": 0.1644524782896042, "learning_rate": 1.790415647463715e-05, "loss": 1.1819, "step": 6425 }, { "epoch": 2.3924328978138707, "grad_norm": 0.16760697960853577, "learning_rate": 1.790341194886633e-05, "loss": 1.1836, "step": 6426 }, { "epoch": 2.39280520296448, "grad_norm": 0.16618818044662476, "learning_rate": 1.790266730636261e-05, "loss": 1.192, "step": 6427 }, { "epoch": 2.393177508115089, "grad_norm": 0.17228281497955322, "learning_rate": 1.790192254713699e-05, "loss": 1.1891, "step": 6428 }, { "epoch": 2.393549813265698, "grad_norm": 0.16444258391857147, "learning_rate": 1.7901177671200472e-05, "loss": 1.1802, "step": 6429 }, { "epoch": 2.393922118416307, "grad_norm": 0.16610148549079895, "learning_rate": 1.790043267856405e-05, "loss": 1.1583, "step": 6430 }, { "epoch": 2.394294423566916, "grad_norm": 0.16486294567584991, "learning_rate": 1.789968756923874e-05, "loss": 1.1835, "step": 6431 }, { "epoch": 2.394666728717525, "grad_norm": 0.1681571900844574, "learning_rate": 1.7898942343235535e-05, "loss": 1.1903, "step": 6432 }, { "epoch": 2.395039033868134, "grad_norm": 0.16249068081378937, "learning_rate": 1.7898197000565448e-05, "loss": 1.1848, "step": 6433 }, { "epoch": 2.395411339018743, "grad_norm": 0.16723275184631348, "learning_rate": 1.789745154123949e-05, "loss": 1.1678, "step": 6434 }, { "epoch": 2.3957836441693523, "grad_norm": 0.16536808013916016, "learning_rate": 1.7896705965268668e-05, "loss": 1.1744, "step": 6435 }, { "epoch": 2.3961559493199616, "grad_norm": 0.16059263050556183, "learning_rate": 1.7895960272663992e-05, "loss": 1.1798, "step": 6436 }, { "epoch": 2.3965282544705704, "grad_norm": 0.1607469618320465, "learning_rate": 1.789521446343648e-05, "loss": 1.1797, "step": 6437 }, { "epoch": 2.3969005596211796, "grad_norm": 0.1656348556280136, "learning_rate": 1.7894468537597146e-05, "loss": 1.1781, "step": 6438 }, { "epoch": 2.3972728647717885, "grad_norm": 0.16378162801265717, "learning_rate": 1.7893722495157004e-05, "loss": 1.1878, "step": 6439 }, { "epoch": 2.3976451699223977, "grad_norm": 0.1643425077199936, "learning_rate": 1.789297633612708e-05, "loss": 1.1851, "step": 6440 }, { "epoch": 2.3980174750730066, "grad_norm": 0.16220180690288544, "learning_rate": 1.789223006051839e-05, "loss": 1.1757, "step": 6441 }, { "epoch": 2.398389780223616, "grad_norm": 0.16219714283943176, "learning_rate": 1.7891483668341955e-05, "loss": 1.1934, "step": 6442 }, { "epoch": 2.3987620853742246, "grad_norm": 0.16055887937545776, "learning_rate": 1.7890737159608803e-05, "loss": 1.1712, "step": 6443 }, { "epoch": 2.399134390524834, "grad_norm": 0.16199053823947906, "learning_rate": 1.788999053432996e-05, "loss": 1.1757, "step": 6444 }, { "epoch": 2.399506695675443, "grad_norm": 0.1576344072818756, "learning_rate": 1.7889243792516452e-05, "loss": 1.1713, "step": 6445 }, { "epoch": 2.399879000826052, "grad_norm": 0.17367145419120789, "learning_rate": 1.7888496934179308e-05, "loss": 1.1757, "step": 6446 }, { "epoch": 2.4002513059766613, "grad_norm": 0.1603989452123642, "learning_rate": 1.7887749959329555e-05, "loss": 1.1654, "step": 6447 }, { "epoch": 2.40062361112727, "grad_norm": 0.1615387499332428, "learning_rate": 1.7887002867978234e-05, "loss": 1.1664, "step": 6448 }, { "epoch": 2.4009959162778793, "grad_norm": 0.1623525768518448, "learning_rate": 1.7886255660136376e-05, "loss": 1.1547, "step": 6449 }, { "epoch": 2.401368221428488, "grad_norm": 0.16615155339241028, "learning_rate": 1.7885508335815013e-05, "loss": 1.1797, "step": 6450 }, { "epoch": 2.4017405265790974, "grad_norm": 0.16319416463375092, "learning_rate": 1.788476089502519e-05, "loss": 1.1662, "step": 6451 }, { "epoch": 2.4021128317297062, "grad_norm": 0.16049784421920776, "learning_rate": 1.7884013337777944e-05, "loss": 1.176, "step": 6452 }, { "epoch": 2.4024851368803155, "grad_norm": 0.1557374745607376, "learning_rate": 1.7883265664084317e-05, "loss": 1.1815, "step": 6453 }, { "epoch": 2.4028574420309248, "grad_norm": 0.17005103826522827, "learning_rate": 1.7882517873955344e-05, "loss": 1.1816, "step": 6454 }, { "epoch": 2.4032297471815336, "grad_norm": 0.16482587158679962, "learning_rate": 1.7881769967402083e-05, "loss": 1.1879, "step": 6455 }, { "epoch": 2.403602052332143, "grad_norm": 0.16888852417469025, "learning_rate": 1.788102194443557e-05, "loss": 1.1853, "step": 6456 }, { "epoch": 2.4039743574827517, "grad_norm": 0.16261079907417297, "learning_rate": 1.7880273805066864e-05, "loss": 1.1675, "step": 6457 }, { "epoch": 2.404346662633361, "grad_norm": 0.16388806700706482, "learning_rate": 1.7879525549307004e-05, "loss": 1.178, "step": 6458 }, { "epoch": 2.4047189677839698, "grad_norm": 0.16773760318756104, "learning_rate": 1.7878777177167047e-05, "loss": 1.1691, "step": 6459 }, { "epoch": 2.405091272934579, "grad_norm": 0.1617787927389145, "learning_rate": 1.7878028688658044e-05, "loss": 1.1715, "step": 6460 }, { "epoch": 2.405463578085188, "grad_norm": 0.16519810259342194, "learning_rate": 1.787728008379105e-05, "loss": 1.189, "step": 6461 }, { "epoch": 2.405835883235797, "grad_norm": 0.17165139317512512, "learning_rate": 1.787653136257713e-05, "loss": 1.1729, "step": 6462 }, { "epoch": 2.4062081883864064, "grad_norm": 0.16290132701396942, "learning_rate": 1.787578252502733e-05, "loss": 1.1823, "step": 6463 }, { "epoch": 2.406580493537015, "grad_norm": 0.16138114035129547, "learning_rate": 1.7875033571152718e-05, "loss": 1.1697, "step": 6464 }, { "epoch": 2.4069527986876245, "grad_norm": 0.16568872332572937, "learning_rate": 1.7874284500964357e-05, "loss": 1.1734, "step": 6465 }, { "epoch": 2.4073251038382333, "grad_norm": 0.1621939092874527, "learning_rate": 1.7873535314473308e-05, "loss": 1.1837, "step": 6466 }, { "epoch": 2.4076974089888425, "grad_norm": 0.15826240181922913, "learning_rate": 1.787278601169063e-05, "loss": 1.1699, "step": 6467 }, { "epoch": 2.4080697141394514, "grad_norm": 0.17099608480930328, "learning_rate": 1.7872036592627404e-05, "loss": 1.189, "step": 6468 }, { "epoch": 2.4084420192900606, "grad_norm": 0.16510766744613647, "learning_rate": 1.7871287057294688e-05, "loss": 1.1759, "step": 6469 }, { "epoch": 2.4088143244406695, "grad_norm": 0.1691225916147232, "learning_rate": 1.7870537405703556e-05, "loss": 1.1761, "step": 6470 }, { "epoch": 2.4091866295912787, "grad_norm": 0.1651260256767273, "learning_rate": 1.7869787637865084e-05, "loss": 1.1737, "step": 6471 }, { "epoch": 2.409558934741888, "grad_norm": 0.16575190424919128, "learning_rate": 1.7869037753790343e-05, "loss": 1.1768, "step": 6472 }, { "epoch": 2.409931239892497, "grad_norm": 0.17237798869609833, "learning_rate": 1.78682877534904e-05, "loss": 1.1944, "step": 6473 }, { "epoch": 2.410303545043106, "grad_norm": 0.15723305940628052, "learning_rate": 1.7867537636976348e-05, "loss": 1.1725, "step": 6474 }, { "epoch": 2.410675850193715, "grad_norm": 0.16039879620075226, "learning_rate": 1.7866787404259255e-05, "loss": 1.1674, "step": 6475 }, { "epoch": 2.411048155344324, "grad_norm": 0.1647602766752243, "learning_rate": 1.786603705535021e-05, "loss": 1.1616, "step": 6476 }, { "epoch": 2.411420460494933, "grad_norm": 0.16092441976070404, "learning_rate": 1.7865286590260288e-05, "loss": 1.156, "step": 6477 }, { "epoch": 2.4117927656455422, "grad_norm": 0.1571139693260193, "learning_rate": 1.7864536009000575e-05, "loss": 1.1713, "step": 6478 }, { "epoch": 2.412165070796151, "grad_norm": 0.16552495956420898, "learning_rate": 1.7863785311582163e-05, "loss": 1.1648, "step": 6479 }, { "epoch": 2.4125373759467603, "grad_norm": 0.16000515222549438, "learning_rate": 1.7863034498016133e-05, "loss": 1.1861, "step": 6480 }, { "epoch": 2.4129096810973696, "grad_norm": 0.16232497990131378, "learning_rate": 1.7862283568313578e-05, "loss": 1.1737, "step": 6481 }, { "epoch": 2.4132819862479784, "grad_norm": 0.1606409251689911, "learning_rate": 1.7861532522485588e-05, "loss": 1.1784, "step": 6482 }, { "epoch": 2.4136542913985877, "grad_norm": 0.16612519323825836, "learning_rate": 1.7860781360543255e-05, "loss": 1.1825, "step": 6483 }, { "epoch": 2.4140265965491965, "grad_norm": 0.16565221548080444, "learning_rate": 1.7860030082497676e-05, "loss": 1.1889, "step": 6484 }, { "epoch": 2.4143989016998058, "grad_norm": 0.16412298381328583, "learning_rate": 1.7859278688359946e-05, "loss": 1.171, "step": 6485 }, { "epoch": 2.414771206850415, "grad_norm": 0.1599871665239334, "learning_rate": 1.7858527178141162e-05, "loss": 1.1857, "step": 6486 }, { "epoch": 2.415143512001024, "grad_norm": 0.16030682623386383, "learning_rate": 1.7857775551852426e-05, "loss": 1.1743, "step": 6487 }, { "epoch": 2.4155158171516327, "grad_norm": 0.16047127544879913, "learning_rate": 1.785702380950484e-05, "loss": 1.1954, "step": 6488 }, { "epoch": 2.415888122302242, "grad_norm": 0.16832852363586426, "learning_rate": 1.78562719511095e-05, "loss": 1.1862, "step": 6489 }, { "epoch": 2.416260427452851, "grad_norm": 0.163087859749794, "learning_rate": 1.785551997667752e-05, "loss": 1.1849, "step": 6490 }, { "epoch": 2.41663273260346, "grad_norm": 0.16009113192558289, "learning_rate": 1.7854767886220002e-05, "loss": 1.1717, "step": 6491 }, { "epoch": 2.4170050377540693, "grad_norm": 0.1627979576587677, "learning_rate": 1.7854015679748053e-05, "loss": 1.1865, "step": 6492 }, { "epoch": 2.417377342904678, "grad_norm": 0.16498367488384247, "learning_rate": 1.785326335727279e-05, "loss": 1.1613, "step": 6493 }, { "epoch": 2.4177496480552874, "grad_norm": 0.16802792251110077, "learning_rate": 1.7852510918805318e-05, "loss": 1.1754, "step": 6494 }, { "epoch": 2.4181219532058966, "grad_norm": 0.16168522834777832, "learning_rate": 1.785175836435675e-05, "loss": 1.1908, "step": 6495 }, { "epoch": 2.4184942583565054, "grad_norm": 0.16734105348587036, "learning_rate": 1.7851005693938206e-05, "loss": 1.1671, "step": 6496 }, { "epoch": 2.4188665635071147, "grad_norm": 0.1836169809103012, "learning_rate": 1.78502529075608e-05, "loss": 1.1681, "step": 6497 }, { "epoch": 2.4192388686577235, "grad_norm": 0.20119720697402954, "learning_rate": 1.7849500005235652e-05, "loss": 1.1652, "step": 6498 }, { "epoch": 2.419611173808333, "grad_norm": 0.19571252167224884, "learning_rate": 1.7848746986973883e-05, "loss": 1.163, "step": 6499 }, { "epoch": 2.4199834789589416, "grad_norm": 0.19061268866062164, "learning_rate": 1.7847993852786612e-05, "loss": 1.1809, "step": 6500 }, { "epoch": 2.4199834789589416, "eval_loss": 1.3013861179351807, "eval_runtime": 17.1871, "eval_samples_per_second": 100.889, "eval_steps_per_second": 5.062, "step": 6500 }, { "epoch": 2.420355784109551, "grad_norm": 0.17919202148914337, "learning_rate": 1.7847240602684962e-05, "loss": 1.1778, "step": 6501 }, { "epoch": 2.4207280892601597, "grad_norm": 0.1730869710445404, "learning_rate": 1.7846487236680064e-05, "loss": 1.1851, "step": 6502 }, { "epoch": 2.421100394410769, "grad_norm": 0.1636027991771698, "learning_rate": 1.784573375478304e-05, "loss": 1.1806, "step": 6503 }, { "epoch": 2.4214726995613782, "grad_norm": 0.16488824784755707, "learning_rate": 1.7844980157005022e-05, "loss": 1.1554, "step": 6504 }, { "epoch": 2.421845004711987, "grad_norm": 0.16549324989318848, "learning_rate": 1.784422644335714e-05, "loss": 1.1765, "step": 6505 }, { "epoch": 2.4222173098625963, "grad_norm": 0.1688234508037567, "learning_rate": 1.7843472613850523e-05, "loss": 1.1575, "step": 6506 }, { "epoch": 2.422589615013205, "grad_norm": 0.17091526091098785, "learning_rate": 1.7842718668496312e-05, "loss": 1.1692, "step": 6507 }, { "epoch": 2.4229619201638144, "grad_norm": 0.1648736596107483, "learning_rate": 1.7841964607305636e-05, "loss": 1.1882, "step": 6508 }, { "epoch": 2.423334225314423, "grad_norm": 0.18578572571277618, "learning_rate": 1.7841210430289636e-05, "loss": 1.1695, "step": 6509 }, { "epoch": 2.4237065304650325, "grad_norm": 0.1957530379295349, "learning_rate": 1.7840456137459447e-05, "loss": 1.1688, "step": 6510 }, { "epoch": 2.4240788356156413, "grad_norm": 0.16478918492794037, "learning_rate": 1.7839701728826214e-05, "loss": 1.1787, "step": 6511 }, { "epoch": 2.4244511407662506, "grad_norm": 0.2944774329662323, "learning_rate": 1.7838947204401083e-05, "loss": 1.184, "step": 6512 }, { "epoch": 2.42482344591686, "grad_norm": 0.18149353563785553, "learning_rate": 1.7838192564195188e-05, "loss": 1.1683, "step": 6513 }, { "epoch": 2.4251957510674687, "grad_norm": 0.18618106842041016, "learning_rate": 1.7837437808219683e-05, "loss": 1.1592, "step": 6514 }, { "epoch": 2.425568056218078, "grad_norm": 0.1710144430398941, "learning_rate": 1.7836682936485714e-05, "loss": 1.1993, "step": 6515 }, { "epoch": 2.4259403613686867, "grad_norm": 0.17375528812408447, "learning_rate": 1.783592794900443e-05, "loss": 1.1793, "step": 6516 }, { "epoch": 2.426312666519296, "grad_norm": 0.1787467747926712, "learning_rate": 1.783517284578698e-05, "loss": 1.1852, "step": 6517 }, { "epoch": 2.426684971669905, "grad_norm": 0.17819075286388397, "learning_rate": 1.783441762684452e-05, "loss": 1.1715, "step": 6518 }, { "epoch": 2.427057276820514, "grad_norm": 0.16626736521720886, "learning_rate": 1.78336622921882e-05, "loss": 1.1769, "step": 6519 }, { "epoch": 2.427429581971123, "grad_norm": 0.1616727113723755, "learning_rate": 1.7832906841829187e-05, "loss": 1.1717, "step": 6520 }, { "epoch": 2.427801887121732, "grad_norm": 0.1669609099626541, "learning_rate": 1.7832151275778625e-05, "loss": 1.1717, "step": 6521 }, { "epoch": 2.4281741922723414, "grad_norm": 0.16761192679405212, "learning_rate": 1.7831395594047682e-05, "loss": 1.1763, "step": 6522 }, { "epoch": 2.4285464974229503, "grad_norm": 0.1701916605234146, "learning_rate": 1.783063979664752e-05, "loss": 1.1905, "step": 6523 }, { "epoch": 2.4289188025735595, "grad_norm": 0.16727663576602936, "learning_rate": 1.7829883883589297e-05, "loss": 1.1913, "step": 6524 }, { "epoch": 2.4292911077241683, "grad_norm": 0.16369487345218658, "learning_rate": 1.782912785488418e-05, "loss": 1.1616, "step": 6525 }, { "epoch": 2.4296634128747776, "grad_norm": 0.1752278357744217, "learning_rate": 1.7828371710543336e-05, "loss": 1.1921, "step": 6526 }, { "epoch": 2.4300357180253864, "grad_norm": 0.16097818315029144, "learning_rate": 1.7827615450577936e-05, "loss": 1.1998, "step": 6527 }, { "epoch": 2.4304080231759957, "grad_norm": 0.16219815611839294, "learning_rate": 1.7826859074999145e-05, "loss": 1.1806, "step": 6528 }, { "epoch": 2.4307803283266045, "grad_norm": 0.16351665556430817, "learning_rate": 1.7826102583818134e-05, "loss": 1.1762, "step": 6529 }, { "epoch": 2.4311526334772138, "grad_norm": 0.16075001657009125, "learning_rate": 1.782534597704608e-05, "loss": 1.1809, "step": 6530 }, { "epoch": 2.431524938627823, "grad_norm": 0.19722290337085724, "learning_rate": 1.7824589254694163e-05, "loss": 1.1829, "step": 6531 }, { "epoch": 2.431897243778432, "grad_norm": 0.1670161485671997, "learning_rate": 1.782383241677355e-05, "loss": 1.1921, "step": 6532 }, { "epoch": 2.432269548929041, "grad_norm": 0.1666540503501892, "learning_rate": 1.782307546329542e-05, "loss": 1.1845, "step": 6533 }, { "epoch": 2.43264185407965, "grad_norm": 0.16255459189414978, "learning_rate": 1.782231839427096e-05, "loss": 1.1682, "step": 6534 }, { "epoch": 2.433014159230259, "grad_norm": 0.16188620030879974, "learning_rate": 1.782156120971135e-05, "loss": 1.1696, "step": 6535 }, { "epoch": 2.433386464380868, "grad_norm": 0.17455708980560303, "learning_rate": 1.7820803909627766e-05, "loss": 1.1804, "step": 6536 }, { "epoch": 2.4337587695314773, "grad_norm": 0.16326938569545746, "learning_rate": 1.7820046494031405e-05, "loss": 1.1794, "step": 6537 }, { "epoch": 2.434131074682086, "grad_norm": 0.15706990659236908, "learning_rate": 1.7819288962933442e-05, "loss": 1.1619, "step": 6538 }, { "epoch": 2.4345033798326954, "grad_norm": 0.16634133458137512, "learning_rate": 1.7818531316345078e-05, "loss": 1.1824, "step": 6539 }, { "epoch": 2.4348756849833046, "grad_norm": 0.16253423690795898, "learning_rate": 1.7817773554277493e-05, "loss": 1.1833, "step": 6540 }, { "epoch": 2.4352479901339135, "grad_norm": 0.16380086541175842, "learning_rate": 1.7817015676741883e-05, "loss": 1.1861, "step": 6541 }, { "epoch": 2.4356202952845227, "grad_norm": 0.16374869644641876, "learning_rate": 1.7816257683749444e-05, "loss": 1.1758, "step": 6542 }, { "epoch": 2.4359926004351315, "grad_norm": 0.1633332371711731, "learning_rate": 1.7815499575311367e-05, "loss": 1.1984, "step": 6543 }, { "epoch": 2.436364905585741, "grad_norm": 0.16350342333316803, "learning_rate": 1.7814741351438855e-05, "loss": 1.1672, "step": 6544 }, { "epoch": 2.4367372107363496, "grad_norm": 0.16379904747009277, "learning_rate": 1.7813983012143104e-05, "loss": 1.1895, "step": 6545 }, { "epoch": 2.437109515886959, "grad_norm": 0.16727176308631897, "learning_rate": 1.7813224557435313e-05, "loss": 1.1741, "step": 6546 }, { "epoch": 2.4374818210375677, "grad_norm": 0.1602688729763031, "learning_rate": 1.7812465987326682e-05, "loss": 1.1816, "step": 6547 }, { "epoch": 2.437854126188177, "grad_norm": 0.16404597461223602, "learning_rate": 1.7811707301828424e-05, "loss": 1.1875, "step": 6548 }, { "epoch": 2.4382264313387862, "grad_norm": 0.16207285225391388, "learning_rate": 1.7810948500951738e-05, "loss": 1.1627, "step": 6549 }, { "epoch": 2.438598736489395, "grad_norm": 0.168754443526268, "learning_rate": 1.7810189584707834e-05, "loss": 1.1778, "step": 6550 }, { "epoch": 2.4389710416400043, "grad_norm": 0.1652376502752304, "learning_rate": 1.7809430553107915e-05, "loss": 1.1854, "step": 6551 }, { "epoch": 2.439343346790613, "grad_norm": 0.16541428864002228, "learning_rate": 1.78086714061632e-05, "loss": 1.1917, "step": 6552 }, { "epoch": 2.4397156519412224, "grad_norm": 0.1626761257648468, "learning_rate": 1.78079121438849e-05, "loss": 1.1666, "step": 6553 }, { "epoch": 2.4400879570918312, "grad_norm": 0.164178267121315, "learning_rate": 1.7807152766284222e-05, "loss": 1.1834, "step": 6554 }, { "epoch": 2.4404602622424405, "grad_norm": 0.15936389565467834, "learning_rate": 1.7806393273372396e-05, "loss": 1.1668, "step": 6555 }, { "epoch": 2.4408325673930493, "grad_norm": 0.16379861533641815, "learning_rate": 1.7805633665160623e-05, "loss": 1.1796, "step": 6556 }, { "epoch": 2.4412048725436586, "grad_norm": 0.165420264005661, "learning_rate": 1.7804873941660137e-05, "loss": 1.1771, "step": 6557 }, { "epoch": 2.441577177694268, "grad_norm": 0.16824427247047424, "learning_rate": 1.780411410288215e-05, "loss": 1.1844, "step": 6558 }, { "epoch": 2.4419494828448767, "grad_norm": 0.16532348096370697, "learning_rate": 1.780335414883789e-05, "loss": 1.1887, "step": 6559 }, { "epoch": 2.442321787995486, "grad_norm": 0.16297706961631775, "learning_rate": 1.7802594079538574e-05, "loss": 1.1702, "step": 6560 }, { "epoch": 2.4426940931460948, "grad_norm": 0.16439802944660187, "learning_rate": 1.7801833894995436e-05, "loss": 1.1802, "step": 6561 }, { "epoch": 2.443066398296704, "grad_norm": 0.16698960959911346, "learning_rate": 1.7801073595219702e-05, "loss": 1.183, "step": 6562 }, { "epoch": 2.443438703447313, "grad_norm": 0.17252756655216217, "learning_rate": 1.78003131802226e-05, "loss": 1.187, "step": 6563 }, { "epoch": 2.443811008597922, "grad_norm": 0.16879017651081085, "learning_rate": 1.779955265001536e-05, "loss": 1.1708, "step": 6564 }, { "epoch": 2.444183313748531, "grad_norm": 0.16200561821460724, "learning_rate": 1.779879200460922e-05, "loss": 1.173, "step": 6565 }, { "epoch": 2.44455561889914, "grad_norm": 0.16437043249607086, "learning_rate": 1.7798031244015406e-05, "loss": 1.1693, "step": 6566 }, { "epoch": 2.4449279240497495, "grad_norm": 0.17090344429016113, "learning_rate": 1.7797270368245166e-05, "loss": 1.1667, "step": 6567 }, { "epoch": 2.4453002292003583, "grad_norm": 0.1742093712091446, "learning_rate": 1.7796509377309728e-05, "loss": 1.1932, "step": 6568 }, { "epoch": 2.4456725343509675, "grad_norm": 0.16742129623889923, "learning_rate": 1.7795748271220337e-05, "loss": 1.1628, "step": 6569 }, { "epoch": 2.4460448395015764, "grad_norm": 0.16507847607135773, "learning_rate": 1.779498704998823e-05, "loss": 1.1942, "step": 6570 }, { "epoch": 2.4464171446521856, "grad_norm": 0.16421516239643097, "learning_rate": 1.7794225713624663e-05, "loss": 1.1746, "step": 6571 }, { "epoch": 2.4467894498027944, "grad_norm": 0.17189864814281464, "learning_rate": 1.7793464262140864e-05, "loss": 1.1843, "step": 6572 }, { "epoch": 2.4471617549534037, "grad_norm": 0.16595551371574402, "learning_rate": 1.7792702695548086e-05, "loss": 1.187, "step": 6573 }, { "epoch": 2.4475340601040125, "grad_norm": 0.16488155722618103, "learning_rate": 1.779194101385758e-05, "loss": 1.1726, "step": 6574 }, { "epoch": 2.447906365254622, "grad_norm": 0.164886474609375, "learning_rate": 1.7791179217080598e-05, "loss": 1.1745, "step": 6575 }, { "epoch": 2.448278670405231, "grad_norm": 0.17774739861488342, "learning_rate": 1.779041730522838e-05, "loss": 1.1738, "step": 6576 }, { "epoch": 2.44865097555584, "grad_norm": 0.1659976691007614, "learning_rate": 1.7789655278312198e-05, "loss": 1.1803, "step": 6577 }, { "epoch": 2.449023280706449, "grad_norm": 0.16639791429042816, "learning_rate": 1.7788893136343288e-05, "loss": 1.1628, "step": 6578 }, { "epoch": 2.449395585857058, "grad_norm": 0.16745202243328094, "learning_rate": 1.7788130879332918e-05, "loss": 1.1697, "step": 6579 }, { "epoch": 2.4497678910076672, "grad_norm": 0.16819866001605988, "learning_rate": 1.7787368507292343e-05, "loss": 1.1783, "step": 6580 }, { "epoch": 2.450140196158276, "grad_norm": 0.16538161039352417, "learning_rate": 1.7786606020232825e-05, "loss": 1.1736, "step": 6581 }, { "epoch": 2.4505125013088853, "grad_norm": 0.1632433533668518, "learning_rate": 1.7785843418165624e-05, "loss": 1.1779, "step": 6582 }, { "epoch": 2.450884806459494, "grad_norm": 0.1659589260816574, "learning_rate": 1.7785080701102003e-05, "loss": 1.195, "step": 6583 }, { "epoch": 2.4512571116101034, "grad_norm": 0.16399775445461273, "learning_rate": 1.778431786905323e-05, "loss": 1.1767, "step": 6584 }, { "epoch": 2.4516294167607127, "grad_norm": 0.1630357950925827, "learning_rate": 1.778355492203057e-05, "loss": 1.1817, "step": 6585 }, { "epoch": 2.4520017219113215, "grad_norm": 0.15688146650791168, "learning_rate": 1.778279186004529e-05, "loss": 1.1784, "step": 6586 }, { "epoch": 2.4523740270619308, "grad_norm": 0.15879155695438385, "learning_rate": 1.7782028683108667e-05, "loss": 1.1713, "step": 6587 }, { "epoch": 2.4527463322125396, "grad_norm": 0.16145865619182587, "learning_rate": 1.7781265391231968e-05, "loss": 1.186, "step": 6588 }, { "epoch": 2.453118637363149, "grad_norm": 0.1632131040096283, "learning_rate": 1.7780501984426465e-05, "loss": 1.1699, "step": 6589 }, { "epoch": 2.4534909425137577, "grad_norm": 0.15937484800815582, "learning_rate": 1.777973846270344e-05, "loss": 1.1633, "step": 6590 }, { "epoch": 2.453863247664367, "grad_norm": 0.16633634269237518, "learning_rate": 1.7778974826074163e-05, "loss": 1.1933, "step": 6591 }, { "epoch": 2.4542355528149757, "grad_norm": 0.1646929830312729, "learning_rate": 1.7778211074549916e-05, "loss": 1.1823, "step": 6592 }, { "epoch": 2.454607857965585, "grad_norm": 0.1606590747833252, "learning_rate": 1.777744720814198e-05, "loss": 1.1797, "step": 6593 }, { "epoch": 2.4549801631161943, "grad_norm": 0.16645345091819763, "learning_rate": 1.7776683226861636e-05, "loss": 1.1663, "step": 6594 }, { "epoch": 2.455352468266803, "grad_norm": 0.16911546885967255, "learning_rate": 1.777591913072017e-05, "loss": 1.1717, "step": 6595 }, { "epoch": 2.4557247734174124, "grad_norm": 0.16527560353279114, "learning_rate": 1.777515491972887e-05, "loss": 1.1854, "step": 6596 }, { "epoch": 2.456097078568021, "grad_norm": 0.16510476171970367, "learning_rate": 1.7774390593899014e-05, "loss": 1.1735, "step": 6597 }, { "epoch": 2.4564693837186304, "grad_norm": 0.16441304981708527, "learning_rate": 1.7773626153241897e-05, "loss": 1.1721, "step": 6598 }, { "epoch": 2.4568416888692393, "grad_norm": 0.163100928068161, "learning_rate": 1.7772861597768814e-05, "loss": 1.1774, "step": 6599 }, { "epoch": 2.4572139940198485, "grad_norm": 0.16607718169689178, "learning_rate": 1.777209692749105e-05, "loss": 1.171, "step": 6600 }, { "epoch": 2.4575862991704573, "grad_norm": 0.1616518199443817, "learning_rate": 1.77713321424199e-05, "loss": 1.1722, "step": 6601 }, { "epoch": 2.4579586043210666, "grad_norm": 0.1628957986831665, "learning_rate": 1.7770567242566667e-05, "loss": 1.1708, "step": 6602 }, { "epoch": 2.458330909471676, "grad_norm": 0.1651378571987152, "learning_rate": 1.776980222794264e-05, "loss": 1.1775, "step": 6603 }, { "epoch": 2.4587032146222847, "grad_norm": 0.16311919689178467, "learning_rate": 1.7769037098559124e-05, "loss": 1.1746, "step": 6604 }, { "epoch": 2.459075519772894, "grad_norm": 0.16620442271232605, "learning_rate": 1.7768271854427417e-05, "loss": 1.177, "step": 6605 }, { "epoch": 2.459447824923503, "grad_norm": 0.16138948500156403, "learning_rate": 1.776750649555882e-05, "loss": 1.1594, "step": 6606 }, { "epoch": 2.459820130074112, "grad_norm": 0.16413603723049164, "learning_rate": 1.776674102196464e-05, "loss": 1.1901, "step": 6607 }, { "epoch": 2.4601924352247213, "grad_norm": 0.1599273979663849, "learning_rate": 1.7765975433656187e-05, "loss": 1.1678, "step": 6608 }, { "epoch": 2.46056474037533, "grad_norm": 0.15869589149951935, "learning_rate": 1.776520973064476e-05, "loss": 1.1647, "step": 6609 }, { "epoch": 2.460937045525939, "grad_norm": 0.1629563570022583, "learning_rate": 1.7764443912941675e-05, "loss": 1.1711, "step": 6610 }, { "epoch": 2.461309350676548, "grad_norm": 0.1599353700876236, "learning_rate": 1.776367798055824e-05, "loss": 1.1772, "step": 6611 }, { "epoch": 2.4616816558271575, "grad_norm": 0.16407759487628937, "learning_rate": 1.7762911933505767e-05, "loss": 1.1826, "step": 6612 }, { "epoch": 2.4620539609777663, "grad_norm": 0.1610613763332367, "learning_rate": 1.776214577179557e-05, "loss": 1.1704, "step": 6613 }, { "epoch": 2.4624262661283756, "grad_norm": 0.16347752511501312, "learning_rate": 1.776137949543897e-05, "loss": 1.1712, "step": 6614 }, { "epoch": 2.4627985712789844, "grad_norm": 0.15655462443828583, "learning_rate": 1.7760613104447283e-05, "loss": 1.1515, "step": 6615 }, { "epoch": 2.4631708764295936, "grad_norm": 0.16429446637630463, "learning_rate": 1.7759846598831827e-05, "loss": 1.1793, "step": 6616 }, { "epoch": 2.463543181580203, "grad_norm": 0.16493436694145203, "learning_rate": 1.775907997860392e-05, "loss": 1.1681, "step": 6617 }, { "epoch": 2.4639154867308117, "grad_norm": 0.1644970327615738, "learning_rate": 1.775831324377489e-05, "loss": 1.1819, "step": 6618 }, { "epoch": 2.464287791881421, "grad_norm": 0.16684550046920776, "learning_rate": 1.7757546394356063e-05, "loss": 1.1864, "step": 6619 }, { "epoch": 2.46466009703203, "grad_norm": 0.16069693863391876, "learning_rate": 1.775677943035876e-05, "loss": 1.1721, "step": 6620 }, { "epoch": 2.465032402182639, "grad_norm": 0.1637074053287506, "learning_rate": 1.7756012351794315e-05, "loss": 1.1697, "step": 6621 }, { "epoch": 2.465404707333248, "grad_norm": 0.1612214297056198, "learning_rate": 1.7755245158674054e-05, "loss": 1.1658, "step": 6622 }, { "epoch": 2.465777012483857, "grad_norm": 0.16873739659786224, "learning_rate": 1.7754477851009307e-05, "loss": 1.1943, "step": 6623 }, { "epoch": 2.466149317634466, "grad_norm": 0.16475237905979156, "learning_rate": 1.775371042881141e-05, "loss": 1.1857, "step": 6624 }, { "epoch": 2.4665216227850753, "grad_norm": 0.16475136578083038, "learning_rate": 1.7752942892091694e-05, "loss": 1.1782, "step": 6625 }, { "epoch": 2.4668939279356845, "grad_norm": 0.16575996577739716, "learning_rate": 1.7752175240861497e-05, "loss": 1.1794, "step": 6626 }, { "epoch": 2.4672662330862933, "grad_norm": 0.16450487077236176, "learning_rate": 1.7751407475132164e-05, "loss": 1.1768, "step": 6627 }, { "epoch": 2.4676385382369026, "grad_norm": 0.1675347089767456, "learning_rate": 1.7750639594915026e-05, "loss": 1.1753, "step": 6628 }, { "epoch": 2.4680108433875114, "grad_norm": 0.16457484662532806, "learning_rate": 1.7749871600221426e-05, "loss": 1.1841, "step": 6629 }, { "epoch": 2.4683831485381207, "grad_norm": 0.17486611008644104, "learning_rate": 1.774910349106271e-05, "loss": 1.1963, "step": 6630 }, { "epoch": 2.4687554536887295, "grad_norm": 0.1780180037021637, "learning_rate": 1.774833526745022e-05, "loss": 1.1998, "step": 6631 }, { "epoch": 2.4691277588393388, "grad_norm": 0.168752983212471, "learning_rate": 1.7747566929395307e-05, "loss": 1.1728, "step": 6632 }, { "epoch": 2.4695000639899476, "grad_norm": 0.1614418923854828, "learning_rate": 1.7746798476909316e-05, "loss": 1.1632, "step": 6633 }, { "epoch": 2.469872369140557, "grad_norm": 0.16157226264476776, "learning_rate": 1.7746029910003598e-05, "loss": 1.1502, "step": 6634 }, { "epoch": 2.470244674291166, "grad_norm": 0.16318723559379578, "learning_rate": 1.7745261228689505e-05, "loss": 1.1645, "step": 6635 }, { "epoch": 2.470616979441775, "grad_norm": 0.17089669406414032, "learning_rate": 1.7744492432978385e-05, "loss": 1.1809, "step": 6636 }, { "epoch": 2.470989284592384, "grad_norm": 0.1579371988773346, "learning_rate": 1.7743723522881604e-05, "loss": 1.1742, "step": 6637 }, { "epoch": 2.471361589742993, "grad_norm": 0.16182205080986023, "learning_rate": 1.774295449841051e-05, "loss": 1.1868, "step": 6638 }, { "epoch": 2.4717338948936023, "grad_norm": 0.16928228735923767, "learning_rate": 1.7742185359576464e-05, "loss": 1.1915, "step": 6639 }, { "epoch": 2.472106200044211, "grad_norm": 0.15761636197566986, "learning_rate": 1.7741416106390828e-05, "loss": 1.1569, "step": 6640 }, { "epoch": 2.4724785051948204, "grad_norm": 0.1611185520887375, "learning_rate": 1.7740646738864956e-05, "loss": 1.1806, "step": 6641 }, { "epoch": 2.472850810345429, "grad_norm": 0.16303184628486633, "learning_rate": 1.7739877257010226e-05, "loss": 1.1706, "step": 6642 }, { "epoch": 2.4732231154960385, "grad_norm": 0.1614006906747818, "learning_rate": 1.7739107660837985e-05, "loss": 1.1721, "step": 6643 }, { "epoch": 2.4735954206466477, "grad_norm": 0.1621858775615692, "learning_rate": 1.7738337950359617e-05, "loss": 1.1672, "step": 6644 }, { "epoch": 2.4739677257972565, "grad_norm": 0.15944872796535492, "learning_rate": 1.7737568125586482e-05, "loss": 1.1675, "step": 6645 }, { "epoch": 2.474340030947866, "grad_norm": 0.16335934400558472, "learning_rate": 1.7736798186529947e-05, "loss": 1.1747, "step": 6646 }, { "epoch": 2.4747123360984746, "grad_norm": 0.16249805688858032, "learning_rate": 1.7736028133201394e-05, "loss": 1.1767, "step": 6647 }, { "epoch": 2.475084641249084, "grad_norm": 0.16741207242012024, "learning_rate": 1.7735257965612188e-05, "loss": 1.1809, "step": 6648 }, { "epoch": 2.4754569463996927, "grad_norm": 0.167801633477211, "learning_rate": 1.773448768377371e-05, "loss": 1.174, "step": 6649 }, { "epoch": 2.475829251550302, "grad_norm": 0.1614260971546173, "learning_rate": 1.7733717287697328e-05, "loss": 1.1734, "step": 6650 }, { "epoch": 2.476201556700911, "grad_norm": 0.15728265047073364, "learning_rate": 1.7732946777394432e-05, "loss": 1.172, "step": 6651 }, { "epoch": 2.47657386185152, "grad_norm": 0.16399987041950226, "learning_rate": 1.77321761528764e-05, "loss": 1.1902, "step": 6652 }, { "epoch": 2.4769461670021293, "grad_norm": 0.1572953164577484, "learning_rate": 1.7731405414154606e-05, "loss": 1.1712, "step": 6653 }, { "epoch": 2.477318472152738, "grad_norm": 0.16343964636325836, "learning_rate": 1.7730634561240442e-05, "loss": 1.178, "step": 6654 }, { "epoch": 2.4776907773033474, "grad_norm": 0.1638653576374054, "learning_rate": 1.7729863594145287e-05, "loss": 1.1719, "step": 6655 }, { "epoch": 2.4780630824539562, "grad_norm": 0.16249097883701324, "learning_rate": 1.7729092512880534e-05, "loss": 1.1715, "step": 6656 }, { "epoch": 2.4784353876045655, "grad_norm": 0.15715673565864563, "learning_rate": 1.7728321317457573e-05, "loss": 1.1589, "step": 6657 }, { "epoch": 2.4788076927551743, "grad_norm": 0.16444164514541626, "learning_rate": 1.7727550007887787e-05, "loss": 1.1729, "step": 6658 }, { "epoch": 2.4791799979057836, "grad_norm": 0.16417501866817474, "learning_rate": 1.7726778584182575e-05, "loss": 1.1709, "step": 6659 }, { "epoch": 2.4795523030563924, "grad_norm": 0.16987749934196472, "learning_rate": 1.7726007046353328e-05, "loss": 1.1784, "step": 6660 }, { "epoch": 2.4799246082070017, "grad_norm": 0.16455167531967163, "learning_rate": 1.772523539441144e-05, "loss": 1.1882, "step": 6661 }, { "epoch": 2.480296913357611, "grad_norm": 0.16781625151634216, "learning_rate": 1.772446362836831e-05, "loss": 1.1803, "step": 6662 }, { "epoch": 2.4806692185082198, "grad_norm": 0.1608152836561203, "learning_rate": 1.772369174823534e-05, "loss": 1.1655, "step": 6663 }, { "epoch": 2.481041523658829, "grad_norm": 0.16263465583324432, "learning_rate": 1.7722919754023923e-05, "loss": 1.1561, "step": 6664 }, { "epoch": 2.481413828809438, "grad_norm": 0.16114389896392822, "learning_rate": 1.772214764574547e-05, "loss": 1.1724, "step": 6665 }, { "epoch": 2.481786133960047, "grad_norm": 0.1621330976486206, "learning_rate": 1.7721375423411378e-05, "loss": 1.1724, "step": 6666 }, { "epoch": 2.482158439110656, "grad_norm": 0.16913004219532013, "learning_rate": 1.7720603087033058e-05, "loss": 1.1843, "step": 6667 }, { "epoch": 2.482530744261265, "grad_norm": 0.16739852726459503, "learning_rate": 1.7719830636621914e-05, "loss": 1.1717, "step": 6668 }, { "epoch": 2.482903049411874, "grad_norm": 0.16293086111545563, "learning_rate": 1.7719058072189355e-05, "loss": 1.1743, "step": 6669 }, { "epoch": 2.4832753545624833, "grad_norm": 0.16651244461536407, "learning_rate": 1.771828539374679e-05, "loss": 1.1892, "step": 6670 }, { "epoch": 2.4836476597130925, "grad_norm": 0.16542814671993256, "learning_rate": 1.771751260130564e-05, "loss": 1.1692, "step": 6671 }, { "epoch": 2.4840199648637014, "grad_norm": 0.16294194757938385, "learning_rate": 1.771673969487731e-05, "loss": 1.1784, "step": 6672 }, { "epoch": 2.4843922700143106, "grad_norm": 0.16167296469211578, "learning_rate": 1.771596667447322e-05, "loss": 1.1642, "step": 6673 }, { "epoch": 2.4847645751649194, "grad_norm": 0.1647670865058899, "learning_rate": 1.771519354010479e-05, "loss": 1.1843, "step": 6674 }, { "epoch": 2.4851368803155287, "grad_norm": 0.16380488872528076, "learning_rate": 1.771442029178343e-05, "loss": 1.1939, "step": 6675 }, { "epoch": 2.4855091854661375, "grad_norm": 0.16478745639324188, "learning_rate": 1.7713646929520568e-05, "loss": 1.174, "step": 6676 }, { "epoch": 2.485881490616747, "grad_norm": 0.16080977022647858, "learning_rate": 1.7712873453327626e-05, "loss": 1.1729, "step": 6677 }, { "epoch": 2.4862537957673556, "grad_norm": 0.16270096600055695, "learning_rate": 1.7712099863216027e-05, "loss": 1.1848, "step": 6678 }, { "epoch": 2.486626100917965, "grad_norm": 0.16529610753059387, "learning_rate": 1.7711326159197195e-05, "loss": 1.1513, "step": 6679 }, { "epoch": 2.486998406068574, "grad_norm": 0.16876338422298431, "learning_rate": 1.771055234128256e-05, "loss": 1.1805, "step": 6680 }, { "epoch": 2.487370711219183, "grad_norm": 0.18239647150039673, "learning_rate": 1.7709778409483554e-05, "loss": 1.1735, "step": 6681 }, { "epoch": 2.4877430163697922, "grad_norm": 0.16950790584087372, "learning_rate": 1.7709004363811598e-05, "loss": 1.1815, "step": 6682 }, { "epoch": 2.488115321520401, "grad_norm": 0.16490155458450317, "learning_rate": 1.770823020427814e-05, "loss": 1.1648, "step": 6683 }, { "epoch": 2.4884876266710103, "grad_norm": 0.17165668308734894, "learning_rate": 1.7707455930894603e-05, "loss": 1.178, "step": 6684 }, { "epoch": 2.488859931821619, "grad_norm": 0.16048189997673035, "learning_rate": 1.770668154367242e-05, "loss": 1.1776, "step": 6685 }, { "epoch": 2.4892322369722284, "grad_norm": 0.16602389514446259, "learning_rate": 1.770590704262304e-05, "loss": 1.1667, "step": 6686 }, { "epoch": 2.489604542122837, "grad_norm": 0.1645582914352417, "learning_rate": 1.7705132427757895e-05, "loss": 1.1881, "step": 6687 }, { "epoch": 2.4899768472734465, "grad_norm": 0.16345307230949402, "learning_rate": 1.7704357699088426e-05, "loss": 1.1757, "step": 6688 }, { "epoch": 2.4903491524240557, "grad_norm": 0.1574520319700241, "learning_rate": 1.770358285662608e-05, "loss": 1.1658, "step": 6689 }, { "epoch": 2.4907214575746646, "grad_norm": 0.16502508521080017, "learning_rate": 1.7702807900382296e-05, "loss": 1.1815, "step": 6690 }, { "epoch": 2.491093762725274, "grad_norm": 0.16367071866989136, "learning_rate": 1.7702032830368525e-05, "loss": 1.1812, "step": 6691 }, { "epoch": 2.4914660678758827, "grad_norm": 0.16245703399181366, "learning_rate": 1.7701257646596212e-05, "loss": 1.1684, "step": 6692 }, { "epoch": 2.491838373026492, "grad_norm": 0.16035568714141846, "learning_rate": 1.7700482349076808e-05, "loss": 1.1712, "step": 6693 }, { "epoch": 2.4922106781771007, "grad_norm": 0.16250720620155334, "learning_rate": 1.769970693782176e-05, "loss": 1.1736, "step": 6694 }, { "epoch": 2.49258298332771, "grad_norm": 0.1739581972360611, "learning_rate": 1.7698931412842526e-05, "loss": 1.1765, "step": 6695 }, { "epoch": 2.492955288478319, "grad_norm": 0.16992726922035217, "learning_rate": 1.7698155774150553e-05, "loss": 1.1814, "step": 6696 }, { "epoch": 2.493327593628928, "grad_norm": 0.16158819198608398, "learning_rate": 1.769738002175731e-05, "loss": 1.1727, "step": 6697 }, { "epoch": 2.4936998987795373, "grad_norm": 0.17658640444278717, "learning_rate": 1.769660415567424e-05, "loss": 1.1675, "step": 6698 }, { "epoch": 2.494072203930146, "grad_norm": 0.21519418060779572, "learning_rate": 1.7695828175912816e-05, "loss": 1.181, "step": 6699 }, { "epoch": 2.4944445090807554, "grad_norm": 0.1897740513086319, "learning_rate": 1.7695052082484493e-05, "loss": 1.168, "step": 6700 }, { "epoch": 2.4948168142313643, "grad_norm": 0.1640034019947052, "learning_rate": 1.769427587540073e-05, "loss": 1.1741, "step": 6701 }, { "epoch": 2.4951891193819735, "grad_norm": 0.17497296631336212, "learning_rate": 1.7693499554672996e-05, "loss": 1.1988, "step": 6702 }, { "epoch": 2.4955614245325823, "grad_norm": 0.16799713671207428, "learning_rate": 1.7692723120312757e-05, "loss": 1.1981, "step": 6703 }, { "epoch": 2.4959337296831916, "grad_norm": 0.16971002519130707, "learning_rate": 1.7691946572331477e-05, "loss": 1.198, "step": 6704 }, { "epoch": 2.4963060348338004, "grad_norm": 0.17075687646865845, "learning_rate": 1.7691169910740633e-05, "loss": 1.1848, "step": 6705 }, { "epoch": 2.4966783399844097, "grad_norm": 0.16614024341106415, "learning_rate": 1.7690393135551692e-05, "loss": 1.1759, "step": 6706 }, { "epoch": 2.497050645135019, "grad_norm": 0.17377164959907532, "learning_rate": 1.7689616246776125e-05, "loss": 1.1758, "step": 6707 }, { "epoch": 2.4974229502856278, "grad_norm": 0.1613984853029251, "learning_rate": 1.768883924442541e-05, "loss": 1.1545, "step": 6708 }, { "epoch": 2.497795255436237, "grad_norm": 0.16565319895744324, "learning_rate": 1.7688062128511023e-05, "loss": 1.1795, "step": 6709 }, { "epoch": 2.498167560586846, "grad_norm": 0.17493651807308197, "learning_rate": 1.768728489904444e-05, "loss": 1.1901, "step": 6710 }, { "epoch": 2.498539865737455, "grad_norm": 0.16326481103897095, "learning_rate": 1.7686507556037136e-05, "loss": 1.1731, "step": 6711 }, { "epoch": 2.498912170888064, "grad_norm": 0.1633228361606598, "learning_rate": 1.7685730099500606e-05, "loss": 1.1756, "step": 6712 }, { "epoch": 2.499284476038673, "grad_norm": 0.16988112032413483, "learning_rate": 1.768495252944632e-05, "loss": 1.1778, "step": 6713 }, { "epoch": 2.499656781189282, "grad_norm": 0.165721595287323, "learning_rate": 1.7684174845885768e-05, "loss": 1.178, "step": 6714 }, { "epoch": 2.5000290863398913, "grad_norm": 0.1885932981967926, "learning_rate": 1.768339704883044e-05, "loss": 1.1743, "step": 6715 }, { "epoch": 2.5004013914905006, "grad_norm": 0.16593773663043976, "learning_rate": 1.768261913829181e-05, "loss": 1.1713, "step": 6716 }, { "epoch": 2.5007736966411094, "grad_norm": 0.16606302559375763, "learning_rate": 1.7681841114281387e-05, "loss": 1.1694, "step": 6717 }, { "epoch": 2.5011460017917186, "grad_norm": 0.1713939607143402, "learning_rate": 1.768106297681065e-05, "loss": 1.167, "step": 6718 }, { "epoch": 2.5015183069423275, "grad_norm": 0.172207772731781, "learning_rate": 1.7680284725891095e-05, "loss": 1.1824, "step": 6719 }, { "epoch": 2.5018906120929367, "grad_norm": 0.1743270754814148, "learning_rate": 1.7679506361534216e-05, "loss": 1.1705, "step": 6720 }, { "epoch": 2.502262917243546, "grad_norm": 0.16744069755077362, "learning_rate": 1.7678727883751508e-05, "loss": 1.1853, "step": 6721 }, { "epoch": 2.502635222394155, "grad_norm": 0.18169431388378143, "learning_rate": 1.7677949292554473e-05, "loss": 1.1796, "step": 6722 }, { "epoch": 2.5030075275447636, "grad_norm": 0.1659853309392929, "learning_rate": 1.7677170587954607e-05, "loss": 1.1716, "step": 6723 }, { "epoch": 2.503379832695373, "grad_norm": 0.18055576086044312, "learning_rate": 1.7676391769963416e-05, "loss": 1.1826, "step": 6724 }, { "epoch": 2.503752137845982, "grad_norm": 0.16379252076148987, "learning_rate": 1.76756128385924e-05, "loss": 1.1916, "step": 6725 }, { "epoch": 2.504124442996591, "grad_norm": 0.16528604924678802, "learning_rate": 1.7674833793853064e-05, "loss": 1.1856, "step": 6726 }, { "epoch": 2.5044967481472002, "grad_norm": 0.1737004965543747, "learning_rate": 1.7674054635756914e-05, "loss": 1.1808, "step": 6727 }, { "epoch": 2.504869053297809, "grad_norm": 0.16210679709911346, "learning_rate": 1.7673275364315458e-05, "loss": 1.171, "step": 6728 }, { "epoch": 2.5052413584484183, "grad_norm": 0.16479384899139404, "learning_rate": 1.7672495979540207e-05, "loss": 1.1741, "step": 6729 }, { "epoch": 2.5056136635990276, "grad_norm": 0.16804920136928558, "learning_rate": 1.7671716481442674e-05, "loss": 1.1771, "step": 6730 }, { "epoch": 2.5059859687496364, "grad_norm": 0.16524706780910492, "learning_rate": 1.7670936870034366e-05, "loss": 1.1762, "step": 6731 }, { "epoch": 2.5063582739002452, "grad_norm": 0.163568913936615, "learning_rate": 1.7670157145326806e-05, "loss": 1.1838, "step": 6732 }, { "epoch": 2.5067305790508545, "grad_norm": 0.15912410616874695, "learning_rate": 1.7669377307331503e-05, "loss": 1.1833, "step": 6733 }, { "epoch": 2.5071028842014638, "grad_norm": 0.16776417195796967, "learning_rate": 1.7668597356059977e-05, "loss": 1.184, "step": 6734 }, { "epoch": 2.5074751893520726, "grad_norm": 0.17335230112075806, "learning_rate": 1.7667817291523753e-05, "loss": 1.2042, "step": 6735 }, { "epoch": 2.507847494502682, "grad_norm": 0.15524321794509888, "learning_rate": 1.766703711373435e-05, "loss": 1.1639, "step": 6736 }, { "epoch": 2.5082197996532907, "grad_norm": 0.16183823347091675, "learning_rate": 1.7666256822703288e-05, "loss": 1.1863, "step": 6737 }, { "epoch": 2.5085921048039, "grad_norm": 0.1681044101715088, "learning_rate": 1.7665476418442092e-05, "loss": 1.1777, "step": 6738 }, { "epoch": 2.508964409954509, "grad_norm": 0.165101557970047, "learning_rate": 1.766469590096229e-05, "loss": 1.1829, "step": 6739 }, { "epoch": 2.509336715105118, "grad_norm": 0.16544567048549652, "learning_rate": 1.7663915270275413e-05, "loss": 1.1665, "step": 6740 }, { "epoch": 2.509709020255727, "grad_norm": 0.17958173155784607, "learning_rate": 1.766313452639299e-05, "loss": 1.1861, "step": 6741 }, { "epoch": 2.510081325406336, "grad_norm": 0.19180700182914734, "learning_rate": 1.766235366932655e-05, "loss": 1.1774, "step": 6742 }, { "epoch": 2.5104536305569454, "grad_norm": 0.1678982377052307, "learning_rate": 1.7661572699087622e-05, "loss": 1.1739, "step": 6743 }, { "epoch": 2.510825935707554, "grad_norm": 0.16974776983261108, "learning_rate": 1.7660791615687752e-05, "loss": 1.1781, "step": 6744 }, { "epoch": 2.5111982408581635, "grad_norm": 0.18585939705371857, "learning_rate": 1.766001041913847e-05, "loss": 1.1733, "step": 6745 }, { "epoch": 2.5115705460087723, "grad_norm": 0.17436900734901428, "learning_rate": 1.7659229109451312e-05, "loss": 1.1856, "step": 6746 }, { "epoch": 2.5119428511593815, "grad_norm": 0.1617736667394638, "learning_rate": 1.765844768663782e-05, "loss": 1.1664, "step": 6747 }, { "epoch": 2.512315156309991, "grad_norm": 0.161626398563385, "learning_rate": 1.765766615070954e-05, "loss": 1.1708, "step": 6748 }, { "epoch": 2.5126874614605996, "grad_norm": 0.16217051446437836, "learning_rate": 1.7656884501678014e-05, "loss": 1.1661, "step": 6749 }, { "epoch": 2.5130597666112084, "grad_norm": 0.16591928899288177, "learning_rate": 1.765610273955478e-05, "loss": 1.1706, "step": 6750 }, { "epoch": 2.5134320717618177, "grad_norm": 0.17161224782466888, "learning_rate": 1.7655320864351388e-05, "loss": 1.1799, "step": 6751 }, { "epoch": 2.513804376912427, "grad_norm": 0.16334664821624756, "learning_rate": 1.7654538876079387e-05, "loss": 1.158, "step": 6752 }, { "epoch": 2.514176682063036, "grad_norm": 0.15914195775985718, "learning_rate": 1.7653756774750334e-05, "loss": 1.1648, "step": 6753 }, { "epoch": 2.514548987213645, "grad_norm": 0.18378858268260956, "learning_rate": 1.7652974560375765e-05, "loss": 1.1775, "step": 6754 }, { "epoch": 2.514921292364254, "grad_norm": 0.21220822632312775, "learning_rate": 1.7652192232967245e-05, "loss": 1.1733, "step": 6755 }, { "epoch": 2.515293597514863, "grad_norm": 0.1674444079399109, "learning_rate": 1.7651409792536328e-05, "loss": 1.1723, "step": 6756 }, { "epoch": 2.5156659026654724, "grad_norm": 0.1930401474237442, "learning_rate": 1.765062723909457e-05, "loss": 1.1771, "step": 6757 }, { "epoch": 2.5160382078160812, "grad_norm": 0.16456985473632812, "learning_rate": 1.7649844572653523e-05, "loss": 1.1803, "step": 6758 }, { "epoch": 2.51641051296669, "grad_norm": 0.18256615102291107, "learning_rate": 1.7649061793224752e-05, "loss": 1.1701, "step": 6759 }, { "epoch": 2.5167828181172993, "grad_norm": 0.16572584211826324, "learning_rate": 1.7648278900819822e-05, "loss": 1.1784, "step": 6760 }, { "epoch": 2.5171551232679086, "grad_norm": 0.1717369556427002, "learning_rate": 1.7647495895450292e-05, "loss": 1.1871, "step": 6761 }, { "epoch": 2.5175274284185174, "grad_norm": 0.16302919387817383, "learning_rate": 1.7646712777127722e-05, "loss": 1.1599, "step": 6762 }, { "epoch": 2.5178997335691267, "grad_norm": 0.1645524501800537, "learning_rate": 1.764592954586369e-05, "loss": 1.1726, "step": 6763 }, { "epoch": 2.5182720387197355, "grad_norm": 0.1693658083677292, "learning_rate": 1.764514620166976e-05, "loss": 1.1715, "step": 6764 }, { "epoch": 2.5186443438703447, "grad_norm": 0.17292669415473938, "learning_rate": 1.7644362744557498e-05, "loss": 1.1747, "step": 6765 }, { "epoch": 2.519016649020954, "grad_norm": 0.16915366053581238, "learning_rate": 1.7643579174538475e-05, "loss": 1.1728, "step": 6766 }, { "epoch": 2.519388954171563, "grad_norm": 0.16110309958457947, "learning_rate": 1.7642795491624268e-05, "loss": 1.1649, "step": 6767 }, { "epoch": 2.519761259322172, "grad_norm": 0.16794544458389282, "learning_rate": 1.7642011695826455e-05, "loss": 1.1718, "step": 6768 }, { "epoch": 2.520133564472781, "grad_norm": 0.15866640210151672, "learning_rate": 1.764122778715661e-05, "loss": 1.1585, "step": 6769 }, { "epoch": 2.52050586962339, "grad_norm": 0.16686029732227325, "learning_rate": 1.7640443765626304e-05, "loss": 1.1596, "step": 6770 }, { "epoch": 2.520878174773999, "grad_norm": 0.16550125181674957, "learning_rate": 1.7639659631247127e-05, "loss": 1.1785, "step": 6771 }, { "epoch": 2.5212504799246083, "grad_norm": 0.18188951909542084, "learning_rate": 1.7638875384030654e-05, "loss": 1.1607, "step": 6772 }, { "epoch": 2.521622785075217, "grad_norm": 0.1796000748872757, "learning_rate": 1.7638091023988473e-05, "loss": 1.1778, "step": 6773 }, { "epoch": 2.5219950902258264, "grad_norm": 0.17091524600982666, "learning_rate": 1.7637306551132166e-05, "loss": 1.1887, "step": 6774 }, { "epoch": 2.5223673953764356, "grad_norm": 0.16658535599708557, "learning_rate": 1.7636521965473324e-05, "loss": 1.1695, "step": 6775 }, { "epoch": 2.5227397005270444, "grad_norm": 0.16318367421627045, "learning_rate": 1.7635737267023527e-05, "loss": 1.1821, "step": 6776 }, { "epoch": 2.5231120056776537, "grad_norm": 0.16290733218193054, "learning_rate": 1.7634952455794373e-05, "loss": 1.1683, "step": 6777 }, { "epoch": 2.5234843108282625, "grad_norm": 0.16749358177185059, "learning_rate": 1.7634167531797447e-05, "loss": 1.1599, "step": 6778 }, { "epoch": 2.523856615978872, "grad_norm": 0.16928349435329437, "learning_rate": 1.7633382495044347e-05, "loss": 1.1721, "step": 6779 }, { "epoch": 2.5242289211294806, "grad_norm": 0.17672182619571686, "learning_rate": 1.7632597345546667e-05, "loss": 1.179, "step": 6780 }, { "epoch": 2.52460122628009, "grad_norm": 0.21029046177864075, "learning_rate": 1.7631812083316003e-05, "loss": 1.1655, "step": 6781 }, { "epoch": 2.5249735314306987, "grad_norm": 0.26718470454216003, "learning_rate": 1.7631026708363956e-05, "loss": 1.1803, "step": 6782 }, { "epoch": 2.525345836581308, "grad_norm": 0.22429171204566956, "learning_rate": 1.763024122070212e-05, "loss": 1.1808, "step": 6783 }, { "epoch": 2.525718141731917, "grad_norm": 0.1687416434288025, "learning_rate": 1.76294556203421e-05, "loss": 1.1771, "step": 6784 }, { "epoch": 2.526090446882526, "grad_norm": 0.18023616075515747, "learning_rate": 1.7628669907295504e-05, "loss": 1.1736, "step": 6785 }, { "epoch": 2.5264627520331353, "grad_norm": 0.17839936912059784, "learning_rate": 1.762788408157393e-05, "loss": 1.1743, "step": 6786 }, { "epoch": 2.526835057183744, "grad_norm": 0.16387495398521423, "learning_rate": 1.7627098143188982e-05, "loss": 1.1711, "step": 6787 }, { "epoch": 2.5272073623343534, "grad_norm": 0.17329815030097961, "learning_rate": 1.762631209215228e-05, "loss": 1.1782, "step": 6788 }, { "epoch": 2.527579667484962, "grad_norm": 0.1764519065618515, "learning_rate": 1.7625525928475424e-05, "loss": 1.1734, "step": 6789 }, { "epoch": 2.5279519726355715, "grad_norm": 0.1638839989900589, "learning_rate": 1.762473965217003e-05, "loss": 1.1666, "step": 6790 }, { "epoch": 2.5283242777861803, "grad_norm": 0.16194722056388855, "learning_rate": 1.7623953263247707e-05, "loss": 1.1526, "step": 6791 }, { "epoch": 2.5286965829367896, "grad_norm": 0.17311038076877594, "learning_rate": 1.7623166761720075e-05, "loss": 1.1802, "step": 6792 }, { "epoch": 2.529068888087399, "grad_norm": 0.1649443656206131, "learning_rate": 1.7622380147598745e-05, "loss": 1.1698, "step": 6793 }, { "epoch": 2.5294411932380076, "grad_norm": 0.16059640049934387, "learning_rate": 1.7621593420895342e-05, "loss": 1.1687, "step": 6794 }, { "epoch": 2.529813498388617, "grad_norm": 0.17570586502552032, "learning_rate": 1.762080658162148e-05, "loss": 1.1768, "step": 6795 }, { "epoch": 2.5301858035392257, "grad_norm": 0.1651303470134735, "learning_rate": 1.7620019629788786e-05, "loss": 1.1755, "step": 6796 }, { "epoch": 2.530558108689835, "grad_norm": 0.16856788098812103, "learning_rate": 1.7619232565408878e-05, "loss": 1.1711, "step": 6797 }, { "epoch": 2.530930413840444, "grad_norm": 0.16949835419654846, "learning_rate": 1.7618445388493386e-05, "loss": 1.1839, "step": 6798 }, { "epoch": 2.531302718991053, "grad_norm": 0.17608442902565002, "learning_rate": 1.761765809905393e-05, "loss": 1.1857, "step": 6799 }, { "epoch": 2.531675024141662, "grad_norm": 0.16439513862133026, "learning_rate": 1.7616870697102144e-05, "loss": 1.1789, "step": 6800 }, { "epoch": 2.532047329292271, "grad_norm": 0.1684199720621109, "learning_rate": 1.7616083182649654e-05, "loss": 1.1857, "step": 6801 }, { "epoch": 2.5324196344428804, "grad_norm": 0.16960062086582184, "learning_rate": 1.7615295555708098e-05, "loss": 1.1714, "step": 6802 }, { "epoch": 2.5327919395934892, "grad_norm": 0.162268728017807, "learning_rate": 1.7614507816289102e-05, "loss": 1.1742, "step": 6803 }, { "epoch": 2.5331642447440985, "grad_norm": 0.17141248285770416, "learning_rate": 1.7613719964404303e-05, "loss": 1.1886, "step": 6804 }, { "epoch": 2.5335365498947073, "grad_norm": 0.17517106235027313, "learning_rate": 1.7612932000065336e-05, "loss": 1.1835, "step": 6805 }, { "epoch": 2.5339088550453166, "grad_norm": 0.1663627028465271, "learning_rate": 1.7612143923283844e-05, "loss": 1.1616, "step": 6806 }, { "epoch": 2.5342811601959254, "grad_norm": 0.1708042472600937, "learning_rate": 1.7611355734071464e-05, "loss": 1.1604, "step": 6807 }, { "epoch": 2.5346534653465347, "grad_norm": 0.1797637939453125, "learning_rate": 1.7610567432439834e-05, "loss": 1.1802, "step": 6808 }, { "epoch": 2.5350257704971435, "grad_norm": 0.16849200427532196, "learning_rate": 1.7609779018400606e-05, "loss": 1.1651, "step": 6809 }, { "epoch": 2.5353980756477528, "grad_norm": 0.17425650358200073, "learning_rate": 1.7608990491965416e-05, "loss": 1.1741, "step": 6810 }, { "epoch": 2.535770380798362, "grad_norm": 0.18015819787979126, "learning_rate": 1.760820185314591e-05, "loss": 1.1862, "step": 6811 }, { "epoch": 2.536142685948971, "grad_norm": 0.16850873827934265, "learning_rate": 1.7607413101953747e-05, "loss": 1.1758, "step": 6812 }, { "epoch": 2.53651499109958, "grad_norm": 0.1831638067960739, "learning_rate": 1.7606624238400568e-05, "loss": 1.1718, "step": 6813 }, { "epoch": 2.536887296250189, "grad_norm": 0.16078107059001923, "learning_rate": 1.7605835262498027e-05, "loss": 1.1651, "step": 6814 }, { "epoch": 2.537259601400798, "grad_norm": 0.17661617696285248, "learning_rate": 1.7605046174257775e-05, "loss": 1.1663, "step": 6815 }, { "epoch": 2.5376319065514075, "grad_norm": 0.17243711650371552, "learning_rate": 1.7604256973691468e-05, "loss": 1.173, "step": 6816 }, { "epoch": 2.5380042117020163, "grad_norm": 0.16753429174423218, "learning_rate": 1.7603467660810763e-05, "loss": 1.1635, "step": 6817 }, { "epoch": 2.538376516852625, "grad_norm": 0.17135019600391388, "learning_rate": 1.7602678235627317e-05, "loss": 1.1634, "step": 6818 }, { "epoch": 2.5387488220032344, "grad_norm": 0.16117660701274872, "learning_rate": 1.7601888698152794e-05, "loss": 1.1647, "step": 6819 }, { "epoch": 2.5391211271538436, "grad_norm": 0.17489348351955414, "learning_rate": 1.760109904839885e-05, "loss": 1.1938, "step": 6820 }, { "epoch": 2.5394934323044525, "grad_norm": 0.16629846394062042, "learning_rate": 1.760030928637715e-05, "loss": 1.1785, "step": 6821 }, { "epoch": 2.5398657374550617, "grad_norm": 0.1682901829481125, "learning_rate": 1.759951941209936e-05, "loss": 1.1865, "step": 6822 }, { "epoch": 2.5402380426056705, "grad_norm": 0.1646634191274643, "learning_rate": 1.7598729425577143e-05, "loss": 1.1787, "step": 6823 }, { "epoch": 2.54061034775628, "grad_norm": 0.17023120820522308, "learning_rate": 1.759793932682217e-05, "loss": 1.1697, "step": 6824 }, { "epoch": 2.540982652906889, "grad_norm": 0.17722590267658234, "learning_rate": 1.759714911584611e-05, "loss": 1.1666, "step": 6825 }, { "epoch": 2.541354958057498, "grad_norm": 0.1645139455795288, "learning_rate": 1.7596358792660633e-05, "loss": 1.1907, "step": 6826 }, { "epoch": 2.5417272632081067, "grad_norm": 0.1630629450082779, "learning_rate": 1.7595568357277413e-05, "loss": 1.1829, "step": 6827 }, { "epoch": 2.542099568358716, "grad_norm": 0.16655808687210083, "learning_rate": 1.759477780970813e-05, "loss": 1.1751, "step": 6828 }, { "epoch": 2.5424718735093252, "grad_norm": 0.19367186725139618, "learning_rate": 1.759398714996445e-05, "loss": 1.1613, "step": 6829 }, { "epoch": 2.542844178659934, "grad_norm": 0.2233743518590927, "learning_rate": 1.759319637805806e-05, "loss": 1.1836, "step": 6830 }, { "epoch": 2.5432164838105433, "grad_norm": 0.19930030405521393, "learning_rate": 1.7592405494000635e-05, "loss": 1.1862, "step": 6831 }, { "epoch": 2.543588788961152, "grad_norm": 0.1649360954761505, "learning_rate": 1.7591614497803856e-05, "loss": 1.1586, "step": 6832 }, { "epoch": 2.5439610941117614, "grad_norm": 0.1704995036125183, "learning_rate": 1.7590823389479407e-05, "loss": 1.1979, "step": 6833 }, { "epoch": 2.5443333992623707, "grad_norm": 0.20266738533973694, "learning_rate": 1.7590032169038974e-05, "loss": 1.1811, "step": 6834 }, { "epoch": 2.5447057044129795, "grad_norm": 0.1894606202840805, "learning_rate": 1.7589240836494245e-05, "loss": 1.1745, "step": 6835 }, { "epoch": 2.5450780095635883, "grad_norm": 0.17341409623622894, "learning_rate": 1.7588449391856903e-05, "loss": 1.1793, "step": 6836 }, { "epoch": 2.5454503147141976, "grad_norm": 0.20491334795951843, "learning_rate": 1.758765783513864e-05, "loss": 1.1762, "step": 6837 }, { "epoch": 2.545822619864807, "grad_norm": 0.1708584725856781, "learning_rate": 1.758686616635114e-05, "loss": 1.1693, "step": 6838 }, { "epoch": 2.5461949250154157, "grad_norm": 0.17428459227085114, "learning_rate": 1.7586074385506114e-05, "loss": 1.1827, "step": 6839 }, { "epoch": 2.546567230166025, "grad_norm": 0.19591780006885529, "learning_rate": 1.758528249261524e-05, "loss": 1.1732, "step": 6840 }, { "epoch": 2.5469395353166338, "grad_norm": 0.16241784393787384, "learning_rate": 1.758449048769022e-05, "loss": 1.1675, "step": 6841 }, { "epoch": 2.547311840467243, "grad_norm": 0.17725640535354614, "learning_rate": 1.758369837074275e-05, "loss": 1.166, "step": 6842 }, { "epoch": 2.5476841456178523, "grad_norm": 0.17130807042121887, "learning_rate": 1.7582906141784534e-05, "loss": 1.1961, "step": 6843 }, { "epoch": 2.548056450768461, "grad_norm": 0.18036004900932312, "learning_rate": 1.758211380082727e-05, "loss": 1.1713, "step": 6844 }, { "epoch": 2.54842875591907, "grad_norm": 0.16558697819709778, "learning_rate": 1.7581321347882657e-05, "loss": 1.1733, "step": 6845 }, { "epoch": 2.548801061069679, "grad_norm": 0.17441225051879883, "learning_rate": 1.7580528782962408e-05, "loss": 1.1662, "step": 6846 }, { "epoch": 2.5491733662202885, "grad_norm": 0.16816645860671997, "learning_rate": 1.757973610607822e-05, "loss": 1.1792, "step": 6847 }, { "epoch": 2.5495456713708973, "grad_norm": 0.19029933214187622, "learning_rate": 1.757894331724181e-05, "loss": 1.1739, "step": 6848 }, { "epoch": 2.5499179765215065, "grad_norm": 0.15939323604106903, "learning_rate": 1.757815041646488e-05, "loss": 1.1672, "step": 6849 }, { "epoch": 2.5502902816721154, "grad_norm": 0.1666376292705536, "learning_rate": 1.7577357403759147e-05, "loss": 1.1632, "step": 6850 }, { "epoch": 2.5506625868227246, "grad_norm": 0.16542813181877136, "learning_rate": 1.7576564279136318e-05, "loss": 1.1823, "step": 6851 }, { "epoch": 2.551034891973334, "grad_norm": 0.19840499758720398, "learning_rate": 1.757577104260811e-05, "loss": 1.1901, "step": 6852 }, { "epoch": 2.5514071971239427, "grad_norm": 0.17866215109825134, "learning_rate": 1.757497769418624e-05, "loss": 1.1641, "step": 6853 }, { "epoch": 2.5517795022745515, "grad_norm": 0.16423308849334717, "learning_rate": 1.7574184233882424e-05, "loss": 1.176, "step": 6854 }, { "epoch": 2.552151807425161, "grad_norm": 0.1964891403913498, "learning_rate": 1.7573390661708386e-05, "loss": 1.1694, "step": 6855 }, { "epoch": 2.55252411257577, "grad_norm": 0.16654036939144135, "learning_rate": 1.7572596977675837e-05, "loss": 1.1754, "step": 6856 }, { "epoch": 2.552896417726379, "grad_norm": 0.18395081162452698, "learning_rate": 1.757180318179651e-05, "loss": 1.1691, "step": 6857 }, { "epoch": 2.553268722876988, "grad_norm": 0.18189731240272522, "learning_rate": 1.7571009274082124e-05, "loss": 1.181, "step": 6858 }, { "epoch": 2.553641028027597, "grad_norm": 0.1674988567829132, "learning_rate": 1.7570215254544406e-05, "loss": 1.173, "step": 6859 }, { "epoch": 2.5540133331782062, "grad_norm": 0.1818539947271347, "learning_rate": 1.7569421123195086e-05, "loss": 1.1811, "step": 6860 }, { "epoch": 2.5543856383288155, "grad_norm": 0.1642785221338272, "learning_rate": 1.7568626880045888e-05, "loss": 1.1886, "step": 6861 }, { "epoch": 2.5547579434794243, "grad_norm": 0.21301640570163727, "learning_rate": 1.7567832525108547e-05, "loss": 1.1762, "step": 6862 }, { "epoch": 2.555130248630033, "grad_norm": 0.1925705522298813, "learning_rate": 1.7567038058394797e-05, "loss": 1.1685, "step": 6863 }, { "epoch": 2.5555025537806424, "grad_norm": 0.1762634962797165, "learning_rate": 1.7566243479916365e-05, "loss": 1.1641, "step": 6864 }, { "epoch": 2.5558748589312517, "grad_norm": 0.19311204552650452, "learning_rate": 1.7565448789684996e-05, "loss": 1.1789, "step": 6865 }, { "epoch": 2.5562471640818605, "grad_norm": 0.17269863188266754, "learning_rate": 1.756465398771242e-05, "loss": 1.1702, "step": 6866 }, { "epoch": 2.5566194692324697, "grad_norm": 0.1711881160736084, "learning_rate": 1.7563859074010382e-05, "loss": 1.1752, "step": 6867 }, { "epoch": 2.5569917743830786, "grad_norm": 0.1725349724292755, "learning_rate": 1.756306404859062e-05, "loss": 1.1673, "step": 6868 }, { "epoch": 2.557364079533688, "grad_norm": 0.1748928427696228, "learning_rate": 1.7562268911464872e-05, "loss": 1.1651, "step": 6869 }, { "epoch": 2.557736384684297, "grad_norm": 0.17474377155303955, "learning_rate": 1.7561473662644893e-05, "loss": 1.1771, "step": 6870 }, { "epoch": 2.558108689834906, "grad_norm": 0.17705856263637543, "learning_rate": 1.7560678302142418e-05, "loss": 1.1725, "step": 6871 }, { "epoch": 2.5584809949855147, "grad_norm": 0.16561757028102875, "learning_rate": 1.7559882829969203e-05, "loss": 1.1827, "step": 6872 }, { "epoch": 2.558853300136124, "grad_norm": 0.17302724719047546, "learning_rate": 1.7559087246136987e-05, "loss": 1.1834, "step": 6873 }, { "epoch": 2.5592256052867333, "grad_norm": 0.16074450314044952, "learning_rate": 1.755829155065753e-05, "loss": 1.184, "step": 6874 }, { "epoch": 2.559597910437342, "grad_norm": 0.16113080084323883, "learning_rate": 1.7557495743542586e-05, "loss": 1.1707, "step": 6875 }, { "epoch": 2.5599702155879513, "grad_norm": 0.1629118025302887, "learning_rate": 1.7556699824803897e-05, "loss": 1.1578, "step": 6876 }, { "epoch": 2.56034252073856, "grad_norm": 0.16696172952651978, "learning_rate": 1.7555903794453232e-05, "loss": 1.1842, "step": 6877 }, { "epoch": 2.5607148258891694, "grad_norm": 0.16263677179813385, "learning_rate": 1.7555107652502337e-05, "loss": 1.1854, "step": 6878 }, { "epoch": 2.5610871310397787, "grad_norm": 0.163103848695755, "learning_rate": 1.7554311398962976e-05, "loss": 1.173, "step": 6879 }, { "epoch": 2.5614594361903875, "grad_norm": 0.18173319101333618, "learning_rate": 1.7553515033846913e-05, "loss": 1.1643, "step": 6880 }, { "epoch": 2.5618317413409963, "grad_norm": 0.1870352178812027, "learning_rate": 1.7552718557165907e-05, "loss": 1.1847, "step": 6881 }, { "epoch": 2.5622040464916056, "grad_norm": 0.166317880153656, "learning_rate": 1.755192196893172e-05, "loss": 1.1683, "step": 6882 }, { "epoch": 2.562576351642215, "grad_norm": 0.2503323554992676, "learning_rate": 1.755112526915612e-05, "loss": 1.1824, "step": 6883 }, { "epoch": 2.5629486567928237, "grad_norm": 0.2117423117160797, "learning_rate": 1.7550328457850873e-05, "loss": 1.1667, "step": 6884 }, { "epoch": 2.563320961943433, "grad_norm": 0.19109854102134705, "learning_rate": 1.754953153502775e-05, "loss": 1.1603, "step": 6885 }, { "epoch": 2.5636932670940418, "grad_norm": 0.1728496551513672, "learning_rate": 1.754873450069852e-05, "loss": 1.1538, "step": 6886 }, { "epoch": 2.564065572244651, "grad_norm": 0.20083752274513245, "learning_rate": 1.7547937354874953e-05, "loss": 1.1703, "step": 6887 }, { "epoch": 2.5644378773952603, "grad_norm": 0.1984606832265854, "learning_rate": 1.7547140097568827e-05, "loss": 1.1897, "step": 6888 }, { "epoch": 2.564810182545869, "grad_norm": 0.1722983717918396, "learning_rate": 1.7546342728791915e-05, "loss": 1.1647, "step": 6889 }, { "epoch": 2.5651824876964784, "grad_norm": 0.17819280922412872, "learning_rate": 1.7545545248555994e-05, "loss": 1.1718, "step": 6890 }, { "epoch": 2.565554792847087, "grad_norm": 0.17654377222061157, "learning_rate": 1.754474765687284e-05, "loss": 1.1609, "step": 6891 }, { "epoch": 2.5659270979976965, "grad_norm": 0.16665366291999817, "learning_rate": 1.7543949953754244e-05, "loss": 1.1773, "step": 6892 }, { "epoch": 2.5662994031483053, "grad_norm": 0.16269910335540771, "learning_rate": 1.7543152139211973e-05, "loss": 1.1675, "step": 6893 }, { "epoch": 2.5666717082989146, "grad_norm": 0.16960933804512024, "learning_rate": 1.7542354213257825e-05, "loss": 1.1802, "step": 6894 }, { "epoch": 2.5670440134495234, "grad_norm": 0.18422453105449677, "learning_rate": 1.7541556175903577e-05, "loss": 1.1859, "step": 6895 }, { "epoch": 2.5674163186001326, "grad_norm": 0.15753304958343506, "learning_rate": 1.7540758027161014e-05, "loss": 1.1712, "step": 6896 }, { "epoch": 2.567788623750742, "grad_norm": 0.1667400747537613, "learning_rate": 1.753995976704193e-05, "loss": 1.1902, "step": 6897 }, { "epoch": 2.5681609289013507, "grad_norm": 0.1718086153268814, "learning_rate": 1.7539161395558115e-05, "loss": 1.167, "step": 6898 }, { "epoch": 2.56853323405196, "grad_norm": 0.1692952811717987, "learning_rate": 1.7538362912721356e-05, "loss": 1.1707, "step": 6899 }, { "epoch": 2.568905539202569, "grad_norm": 0.16457752883434296, "learning_rate": 1.7537564318543455e-05, "loss": 1.1923, "step": 6900 }, { "epoch": 2.569277844353178, "grad_norm": 0.16177764534950256, "learning_rate": 1.7536765613036198e-05, "loss": 1.1722, "step": 6901 }, { "epoch": 2.569650149503787, "grad_norm": 0.17193439602851868, "learning_rate": 1.7535966796211387e-05, "loss": 1.1789, "step": 6902 }, { "epoch": 2.570022454654396, "grad_norm": 0.1670045256614685, "learning_rate": 1.753516786808082e-05, "loss": 1.1715, "step": 6903 }, { "epoch": 2.570394759805005, "grad_norm": 0.1643945276737213, "learning_rate": 1.7534368828656295e-05, "loss": 1.1788, "step": 6904 }, { "epoch": 2.5707670649556142, "grad_norm": 0.1577821522951126, "learning_rate": 1.7533569677949616e-05, "loss": 1.1758, "step": 6905 }, { "epoch": 2.5711393701062235, "grad_norm": 0.16875483095645905, "learning_rate": 1.7532770415972585e-05, "loss": 1.1799, "step": 6906 }, { "epoch": 2.5715116752568323, "grad_norm": 0.1640816628932953, "learning_rate": 1.7531971042737008e-05, "loss": 1.1744, "step": 6907 }, { "epoch": 2.5718839804074416, "grad_norm": 0.16457021236419678, "learning_rate": 1.7531171558254692e-05, "loss": 1.1772, "step": 6908 }, { "epoch": 2.5722562855580504, "grad_norm": 0.16494521498680115, "learning_rate": 1.7530371962537445e-05, "loss": 1.1781, "step": 6909 }, { "epoch": 2.5726285907086597, "grad_norm": 0.16351187229156494, "learning_rate": 1.7529572255597077e-05, "loss": 1.1792, "step": 6910 }, { "epoch": 2.5730008958592685, "grad_norm": 0.1582973450422287, "learning_rate": 1.7528772437445396e-05, "loss": 1.1826, "step": 6911 }, { "epoch": 2.5733732010098778, "grad_norm": 0.16494831442832947, "learning_rate": 1.7527972508094223e-05, "loss": 1.1892, "step": 6912 }, { "epoch": 2.5737455061604866, "grad_norm": 0.16706949472427368, "learning_rate": 1.7527172467555367e-05, "loss": 1.1759, "step": 6913 }, { "epoch": 2.574117811311096, "grad_norm": 0.16176964342594147, "learning_rate": 1.752637231584064e-05, "loss": 1.181, "step": 6914 }, { "epoch": 2.574490116461705, "grad_norm": 0.1603250950574875, "learning_rate": 1.7525572052961877e-05, "loss": 1.1735, "step": 6915 }, { "epoch": 2.574862421612314, "grad_norm": 0.1632986068725586, "learning_rate": 1.752477167893088e-05, "loss": 1.1669, "step": 6916 }, { "epoch": 2.575234726762923, "grad_norm": 0.16188709437847137, "learning_rate": 1.7523971193759482e-05, "loss": 1.176, "step": 6917 }, { "epoch": 2.575607031913532, "grad_norm": 0.16379517316818237, "learning_rate": 1.7523170597459497e-05, "loss": 1.1666, "step": 6918 }, { "epoch": 2.5759793370641413, "grad_norm": 0.16542209684848785, "learning_rate": 1.7522369890042755e-05, "loss": 1.1477, "step": 6919 }, { "epoch": 2.57635164221475, "grad_norm": 0.16362489759922028, "learning_rate": 1.7521569071521084e-05, "loss": 1.1709, "step": 6920 }, { "epoch": 2.5767239473653594, "grad_norm": 0.18013128638267517, "learning_rate": 1.752076814190631e-05, "loss": 1.1737, "step": 6921 }, { "epoch": 2.577096252515968, "grad_norm": 0.1668436974287033, "learning_rate": 1.7519967101210264e-05, "loss": 1.1797, "step": 6922 }, { "epoch": 2.5774685576665775, "grad_norm": 0.17277368903160095, "learning_rate": 1.751916594944477e-05, "loss": 1.1708, "step": 6923 }, { "epoch": 2.5778408628171867, "grad_norm": 0.1680234968662262, "learning_rate": 1.751836468662167e-05, "loss": 1.1707, "step": 6924 }, { "epoch": 2.5782131679677955, "grad_norm": 0.17191176116466522, "learning_rate": 1.7517563312752796e-05, "loss": 1.1807, "step": 6925 }, { "epoch": 2.578585473118405, "grad_norm": 0.16285885870456696, "learning_rate": 1.7516761827849987e-05, "loss": 1.1707, "step": 6926 }, { "epoch": 2.5789577782690136, "grad_norm": 0.16699667274951935, "learning_rate": 1.7515960231925072e-05, "loss": 1.1765, "step": 6927 }, { "epoch": 2.579330083419623, "grad_norm": 0.1778481900691986, "learning_rate": 1.7515158524989896e-05, "loss": 1.1785, "step": 6928 }, { "epoch": 2.579702388570232, "grad_norm": 0.17215748131275177, "learning_rate": 1.7514356707056303e-05, "loss": 1.1653, "step": 6929 }, { "epoch": 2.580074693720841, "grad_norm": 0.20714597404003143, "learning_rate": 1.7513554778136133e-05, "loss": 1.1778, "step": 6930 }, { "epoch": 2.58044699887145, "grad_norm": 0.1802298128604889, "learning_rate": 1.751275273824123e-05, "loss": 1.1798, "step": 6931 }, { "epoch": 2.580819304022059, "grad_norm": 0.18000389635562897, "learning_rate": 1.7511950587383438e-05, "loss": 1.1729, "step": 6932 }, { "epoch": 2.5811916091726683, "grad_norm": 0.1696796715259552, "learning_rate": 1.7511148325574613e-05, "loss": 1.176, "step": 6933 }, { "epoch": 2.581563914323277, "grad_norm": 0.1616394817829132, "learning_rate": 1.7510345952826594e-05, "loss": 1.1583, "step": 6934 }, { "epoch": 2.5819362194738864, "grad_norm": 0.18179087340831757, "learning_rate": 1.7509543469151234e-05, "loss": 1.1809, "step": 6935 }, { "epoch": 2.5823085246244952, "grad_norm": 0.17522992193698883, "learning_rate": 1.7508740874560393e-05, "loss": 1.1693, "step": 6936 }, { "epoch": 2.5826808297751045, "grad_norm": 0.15960480272769928, "learning_rate": 1.7507938169065922e-05, "loss": 1.17, "step": 6937 }, { "epoch": 2.5830531349257138, "grad_norm": 0.16696304082870483, "learning_rate": 1.750713535267967e-05, "loss": 1.1632, "step": 6938 }, { "epoch": 2.5834254400763226, "grad_norm": 0.16897381842136383, "learning_rate": 1.7506332425413505e-05, "loss": 1.1773, "step": 6939 }, { "epoch": 2.5837977452269314, "grad_norm": 0.16685079038143158, "learning_rate": 1.750552938727928e-05, "loss": 1.1664, "step": 6940 }, { "epoch": 2.5841700503775407, "grad_norm": 0.1602102816104889, "learning_rate": 1.7504726238288857e-05, "loss": 1.174, "step": 6941 }, { "epoch": 2.58454235552815, "grad_norm": 0.21326759457588196, "learning_rate": 1.7503922978454094e-05, "loss": 1.179, "step": 6942 }, { "epoch": 2.5849146606787587, "grad_norm": 0.16109180450439453, "learning_rate": 1.7503119607786865e-05, "loss": 1.1676, "step": 6943 }, { "epoch": 2.585286965829368, "grad_norm": 0.1658046394586563, "learning_rate": 1.7502316126299027e-05, "loss": 1.1825, "step": 6944 }, { "epoch": 2.585659270979977, "grad_norm": 0.16446712613105774, "learning_rate": 1.7501512534002453e-05, "loss": 1.1782, "step": 6945 }, { "epoch": 2.586031576130586, "grad_norm": 0.1652338057756424, "learning_rate": 1.7500708830909006e-05, "loss": 1.1615, "step": 6946 }, { "epoch": 2.5864038812811954, "grad_norm": 0.16343404352664948, "learning_rate": 1.7499905017030565e-05, "loss": 1.1683, "step": 6947 }, { "epoch": 2.586776186431804, "grad_norm": 0.16519500315189362, "learning_rate": 1.7499101092378995e-05, "loss": 1.175, "step": 6948 }, { "epoch": 2.587148491582413, "grad_norm": 0.16351045668125153, "learning_rate": 1.7498297056966174e-05, "loss": 1.1709, "step": 6949 }, { "epoch": 2.5875207967330223, "grad_norm": 0.1664671003818512, "learning_rate": 1.7497492910803972e-05, "loss": 1.169, "step": 6950 }, { "epoch": 2.5878931018836315, "grad_norm": 0.16572503745555878, "learning_rate": 1.7496688653904277e-05, "loss": 1.1797, "step": 6951 }, { "epoch": 2.5882654070342404, "grad_norm": 0.16367551684379578, "learning_rate": 1.7495884286278955e-05, "loss": 1.1695, "step": 6952 }, { "epoch": 2.5886377121848496, "grad_norm": 0.16530726850032806, "learning_rate": 1.7495079807939897e-05, "loss": 1.1604, "step": 6953 }, { "epoch": 2.5890100173354584, "grad_norm": 0.16979095339775085, "learning_rate": 1.7494275218898976e-05, "loss": 1.1693, "step": 6954 }, { "epoch": 2.5893823224860677, "grad_norm": 0.1681201159954071, "learning_rate": 1.749347051916808e-05, "loss": 1.1655, "step": 6955 }, { "epoch": 2.589754627636677, "grad_norm": 0.17620940506458282, "learning_rate": 1.74926657087591e-05, "loss": 1.1716, "step": 6956 }, { "epoch": 2.590126932787286, "grad_norm": 0.16808199882507324, "learning_rate": 1.7491860787683915e-05, "loss": 1.1593, "step": 6957 }, { "epoch": 2.5904992379378946, "grad_norm": 0.16996102035045624, "learning_rate": 1.7491055755954418e-05, "loss": 1.179, "step": 6958 }, { "epoch": 2.590871543088504, "grad_norm": 0.16559867560863495, "learning_rate": 1.7490250613582492e-05, "loss": 1.1794, "step": 6959 }, { "epoch": 2.591243848239113, "grad_norm": 0.16791416704654694, "learning_rate": 1.748944536058004e-05, "loss": 1.1791, "step": 6960 }, { "epoch": 2.591616153389722, "grad_norm": 0.17107127606868744, "learning_rate": 1.7488639996958952e-05, "loss": 1.1862, "step": 6961 }, { "epoch": 2.591988458540331, "grad_norm": 0.16655850410461426, "learning_rate": 1.7487834522731115e-05, "loss": 1.1807, "step": 6962 }, { "epoch": 2.59236076369094, "grad_norm": 0.171476349234581, "learning_rate": 1.7487028937908436e-05, "loss": 1.1816, "step": 6963 }, { "epoch": 2.5927330688415493, "grad_norm": 0.1685655415058136, "learning_rate": 1.748622324250281e-05, "loss": 1.1735, "step": 6964 }, { "epoch": 2.5931053739921586, "grad_norm": 0.17578360438346863, "learning_rate": 1.7485417436526134e-05, "loss": 1.1802, "step": 6965 }, { "epoch": 2.5934776791427674, "grad_norm": 0.17003393173217773, "learning_rate": 1.748461151999031e-05, "loss": 1.1937, "step": 6966 }, { "epoch": 2.593849984293376, "grad_norm": 0.18038241565227509, "learning_rate": 1.7483805492907246e-05, "loss": 1.1826, "step": 6967 }, { "epoch": 2.5942222894439855, "grad_norm": 0.23492062091827393, "learning_rate": 1.7482999355288846e-05, "loss": 1.1888, "step": 6968 }, { "epoch": 2.5945945945945947, "grad_norm": 0.18728743493556976, "learning_rate": 1.7482193107147012e-05, "loss": 1.1753, "step": 6969 }, { "epoch": 2.5949668997452036, "grad_norm": 0.1779962182044983, "learning_rate": 1.7481386748493664e-05, "loss": 1.1748, "step": 6970 }, { "epoch": 2.595339204895813, "grad_norm": 0.1643301546573639, "learning_rate": 1.7480580279340694e-05, "loss": 1.17, "step": 6971 }, { "epoch": 2.5957115100464216, "grad_norm": 0.25796541571617126, "learning_rate": 1.7479773699700024e-05, "loss": 1.1708, "step": 6972 }, { "epoch": 2.596083815197031, "grad_norm": 0.1716795414686203, "learning_rate": 1.747896700958357e-05, "loss": 1.1742, "step": 6973 }, { "epoch": 2.59645612034764, "grad_norm": 0.17768600583076477, "learning_rate": 1.747816020900324e-05, "loss": 1.1841, "step": 6974 }, { "epoch": 2.596828425498249, "grad_norm": 0.164155513048172, "learning_rate": 1.7477353297970952e-05, "loss": 1.1678, "step": 6975 }, { "epoch": 2.597200730648858, "grad_norm": 0.16183872520923615, "learning_rate": 1.7476546276498625e-05, "loss": 1.1626, "step": 6976 }, { "epoch": 2.597573035799467, "grad_norm": 0.17142947018146515, "learning_rate": 1.7475739144598183e-05, "loss": 1.1773, "step": 6977 }, { "epoch": 2.5979453409500763, "grad_norm": 0.16305896639823914, "learning_rate": 1.7474931902281538e-05, "loss": 1.182, "step": 6978 }, { "epoch": 2.598317646100685, "grad_norm": 0.17089135944843292, "learning_rate": 1.747412454956062e-05, "loss": 1.1743, "step": 6979 }, { "epoch": 2.5986899512512944, "grad_norm": 0.1611926555633545, "learning_rate": 1.747331708644735e-05, "loss": 1.1753, "step": 6980 }, { "epoch": 2.5990622564019032, "grad_norm": 0.16074836254119873, "learning_rate": 1.747250951295366e-05, "loss": 1.1583, "step": 6981 }, { "epoch": 2.5994345615525125, "grad_norm": 0.16934338212013245, "learning_rate": 1.7471701829091468e-05, "loss": 1.1796, "step": 6982 }, { "epoch": 2.5998068667031218, "grad_norm": 0.16507279872894287, "learning_rate": 1.747089403487271e-05, "loss": 1.1742, "step": 6983 }, { "epoch": 2.6001791718537306, "grad_norm": 0.16199171543121338, "learning_rate": 1.747008613030932e-05, "loss": 1.1715, "step": 6984 }, { "epoch": 2.6005514770043394, "grad_norm": 0.15591688454151154, "learning_rate": 1.7469278115413222e-05, "loss": 1.171, "step": 6985 }, { "epoch": 2.6009237821549487, "grad_norm": 0.16649380326271057, "learning_rate": 1.7468469990196358e-05, "loss": 1.1597, "step": 6986 }, { "epoch": 2.601296087305558, "grad_norm": 0.1675834059715271, "learning_rate": 1.746766175467066e-05, "loss": 1.183, "step": 6987 }, { "epoch": 2.6016683924561668, "grad_norm": 0.15882286429405212, "learning_rate": 1.7466853408848067e-05, "loss": 1.1885, "step": 6988 }, { "epoch": 2.602040697606776, "grad_norm": 0.16068002581596375, "learning_rate": 1.7466044952740517e-05, "loss": 1.1896, "step": 6989 }, { "epoch": 2.602413002757385, "grad_norm": 0.16341431438922882, "learning_rate": 1.7465236386359952e-05, "loss": 1.1716, "step": 6990 }, { "epoch": 2.602785307907994, "grad_norm": 0.1623801589012146, "learning_rate": 1.746442770971831e-05, "loss": 1.1738, "step": 6991 }, { "epoch": 2.6031576130586034, "grad_norm": 0.16583126783370972, "learning_rate": 1.7463618922827545e-05, "loss": 1.1788, "step": 6992 }, { "epoch": 2.603529918209212, "grad_norm": 0.16922509670257568, "learning_rate": 1.74628100256996e-05, "loss": 1.1855, "step": 6993 }, { "epoch": 2.603902223359821, "grad_norm": 0.16294334828853607, "learning_rate": 1.7462001018346408e-05, "loss": 1.178, "step": 6994 }, { "epoch": 2.6042745285104303, "grad_norm": 0.16397880017757416, "learning_rate": 1.7461191900779936e-05, "loss": 1.1779, "step": 6995 }, { "epoch": 2.6046468336610396, "grad_norm": 0.16646462678909302, "learning_rate": 1.746038267301213e-05, "loss": 1.1666, "step": 6996 }, { "epoch": 2.6050191388116484, "grad_norm": 0.16266998648643494, "learning_rate": 1.7459573335054935e-05, "loss": 1.1792, "step": 6997 }, { "epoch": 2.6053914439622576, "grad_norm": 0.16222761571407318, "learning_rate": 1.745876388692031e-05, "loss": 1.169, "step": 6998 }, { "epoch": 2.6057637491128665, "grad_norm": 0.16887149214744568, "learning_rate": 1.7457954328620217e-05, "loss": 1.1724, "step": 6999 }, { "epoch": 2.6061360542634757, "grad_norm": 0.16719955205917358, "learning_rate": 1.74571446601666e-05, "loss": 1.1934, "step": 7000 }, { "epoch": 2.6061360542634757, "eval_loss": 1.2935380935668945, "eval_runtime": 16.7437, "eval_samples_per_second": 103.562, "eval_steps_per_second": 5.196, "step": 7000 }, { "epoch": 2.606508359414085, "grad_norm": 0.16188542544841766, "learning_rate": 1.7456334881571428e-05, "loss": 1.1968, "step": 7001 }, { "epoch": 2.606880664564694, "grad_norm": 0.16819295287132263, "learning_rate": 1.745552499284666e-05, "loss": 1.1796, "step": 7002 }, { "epoch": 2.607252969715303, "grad_norm": 0.1619422435760498, "learning_rate": 1.745471499400425e-05, "loss": 1.1739, "step": 7003 }, { "epoch": 2.607625274865912, "grad_norm": 0.16731281578540802, "learning_rate": 1.745390488505617e-05, "loss": 1.1763, "step": 7004 }, { "epoch": 2.607997580016521, "grad_norm": 0.16323542594909668, "learning_rate": 1.745309466601438e-05, "loss": 1.1742, "step": 7005 }, { "epoch": 2.60836988516713, "grad_norm": 0.16577135026454926, "learning_rate": 1.7452284336890853e-05, "loss": 1.1684, "step": 7006 }, { "epoch": 2.6087421903177392, "grad_norm": 0.16486932337284088, "learning_rate": 1.7451473897697552e-05, "loss": 1.168, "step": 7007 }, { "epoch": 2.609114495468348, "grad_norm": 0.17325793206691742, "learning_rate": 1.745066334844645e-05, "loss": 1.1709, "step": 7008 }, { "epoch": 2.6094868006189573, "grad_norm": 0.1616426408290863, "learning_rate": 1.744985268914952e-05, "loss": 1.1741, "step": 7009 }, { "epoch": 2.6098591057695666, "grad_norm": 0.16339850425720215, "learning_rate": 1.744904191981873e-05, "loss": 1.1719, "step": 7010 }, { "epoch": 2.6102314109201754, "grad_norm": 0.16253487765789032, "learning_rate": 1.744823104046606e-05, "loss": 1.1705, "step": 7011 }, { "epoch": 2.6106037160707847, "grad_norm": 0.17588497698307037, "learning_rate": 1.7447420051103483e-05, "loss": 1.178, "step": 7012 }, { "epoch": 2.6109760212213935, "grad_norm": 0.18001097440719604, "learning_rate": 1.744660895174298e-05, "loss": 1.1642, "step": 7013 }, { "epoch": 2.6113483263720028, "grad_norm": 0.1719658225774765, "learning_rate": 1.7445797742396535e-05, "loss": 1.1675, "step": 7014 }, { "epoch": 2.6117206315226116, "grad_norm": 0.20831653475761414, "learning_rate": 1.7444986423076116e-05, "loss": 1.168, "step": 7015 }, { "epoch": 2.612092936673221, "grad_norm": 0.17336048185825348, "learning_rate": 1.744417499379372e-05, "loss": 1.1814, "step": 7016 }, { "epoch": 2.6124652418238297, "grad_norm": 0.17226654291152954, "learning_rate": 1.7443363454561327e-05, "loss": 1.1761, "step": 7017 }, { "epoch": 2.612837546974439, "grad_norm": 0.20160022377967834, "learning_rate": 1.744255180539092e-05, "loss": 1.1843, "step": 7018 }, { "epoch": 2.613209852125048, "grad_norm": 0.1713937222957611, "learning_rate": 1.7441740046294496e-05, "loss": 1.1625, "step": 7019 }, { "epoch": 2.613582157275657, "grad_norm": 0.1691260188817978, "learning_rate": 1.744092817728403e-05, "loss": 1.1839, "step": 7020 }, { "epoch": 2.6139544624262663, "grad_norm": 0.16929064691066742, "learning_rate": 1.7440116198371528e-05, "loss": 1.1644, "step": 7021 }, { "epoch": 2.614326767576875, "grad_norm": 0.16705340147018433, "learning_rate": 1.7439304109568972e-05, "loss": 1.1847, "step": 7022 }, { "epoch": 2.6146990727274844, "grad_norm": 0.16961196064949036, "learning_rate": 1.7438491910888367e-05, "loss": 1.1534, "step": 7023 }, { "epoch": 2.615071377878093, "grad_norm": 0.16219016909599304, "learning_rate": 1.74376796023417e-05, "loss": 1.1695, "step": 7024 }, { "epoch": 2.6154436830287024, "grad_norm": 0.19066371023654938, "learning_rate": 1.7436867183940972e-05, "loss": 1.1739, "step": 7025 }, { "epoch": 2.6158159881793113, "grad_norm": 0.16923125088214874, "learning_rate": 1.7436054655698184e-05, "loss": 1.1544, "step": 7026 }, { "epoch": 2.6161882933299205, "grad_norm": 0.1667940318584442, "learning_rate": 1.7435242017625333e-05, "loss": 1.167, "step": 7027 }, { "epoch": 2.61656059848053, "grad_norm": 0.18602821230888367, "learning_rate": 1.7434429269734426e-05, "loss": 1.1667, "step": 7028 }, { "epoch": 2.6169329036311386, "grad_norm": 0.17603205144405365, "learning_rate": 1.7433616412037462e-05, "loss": 1.1759, "step": 7029 }, { "epoch": 2.617305208781748, "grad_norm": 0.15942661464214325, "learning_rate": 1.7432803444546454e-05, "loss": 1.1731, "step": 7030 }, { "epoch": 2.6176775139323567, "grad_norm": 0.19452649354934692, "learning_rate": 1.7431990367273402e-05, "loss": 1.1756, "step": 7031 }, { "epoch": 2.618049819082966, "grad_norm": 0.1983923316001892, "learning_rate": 1.7431177180230323e-05, "loss": 1.1745, "step": 7032 }, { "epoch": 2.618422124233575, "grad_norm": 0.16836190223693848, "learning_rate": 1.7430363883429218e-05, "loss": 1.1774, "step": 7033 }, { "epoch": 2.618794429384184, "grad_norm": 0.29323258996009827, "learning_rate": 1.742955047688211e-05, "loss": 1.178, "step": 7034 }, { "epoch": 2.619166734534793, "grad_norm": 0.19639234244823456, "learning_rate": 1.7428736960601004e-05, "loss": 1.1731, "step": 7035 }, { "epoch": 2.619539039685402, "grad_norm": 0.1906968504190445, "learning_rate": 1.7427923334597922e-05, "loss": 1.1848, "step": 7036 }, { "epoch": 2.6199113448360114, "grad_norm": 0.16878125071525574, "learning_rate": 1.7427109598884877e-05, "loss": 1.1764, "step": 7037 }, { "epoch": 2.6202836499866202, "grad_norm": 0.16486749053001404, "learning_rate": 1.742629575347389e-05, "loss": 1.1691, "step": 7038 }, { "epoch": 2.6206559551372295, "grad_norm": 0.17094659805297852, "learning_rate": 1.742548179837698e-05, "loss": 1.174, "step": 7039 }, { "epoch": 2.6210282602878383, "grad_norm": 0.18062889575958252, "learning_rate": 1.742466773360617e-05, "loss": 1.1766, "step": 7040 }, { "epoch": 2.6214005654384476, "grad_norm": 0.16564013063907623, "learning_rate": 1.742385355917348e-05, "loss": 1.1826, "step": 7041 }, { "epoch": 2.6217728705890564, "grad_norm": 0.15717479586601257, "learning_rate": 1.7423039275090947e-05, "loss": 1.1718, "step": 7042 }, { "epoch": 2.6221451757396657, "grad_norm": 0.16660241782665253, "learning_rate": 1.7422224881370585e-05, "loss": 1.1845, "step": 7043 }, { "epoch": 2.6225174808902745, "grad_norm": 0.16383567452430725, "learning_rate": 1.7421410378024428e-05, "loss": 1.1843, "step": 7044 }, { "epoch": 2.6228897860408837, "grad_norm": 0.1657799780368805, "learning_rate": 1.7420595765064505e-05, "loss": 1.1704, "step": 7045 }, { "epoch": 2.623262091191493, "grad_norm": 0.16230309009552002, "learning_rate": 1.7419781042502846e-05, "loss": 1.1725, "step": 7046 }, { "epoch": 2.623634396342102, "grad_norm": 0.16071511805057526, "learning_rate": 1.7418966210351492e-05, "loss": 1.1682, "step": 7047 }, { "epoch": 2.624006701492711, "grad_norm": 0.18019315600395203, "learning_rate": 1.741815126862247e-05, "loss": 1.1601, "step": 7048 }, { "epoch": 2.62437900664332, "grad_norm": 0.16016197204589844, "learning_rate": 1.741733621732782e-05, "loss": 1.1783, "step": 7049 }, { "epoch": 2.624751311793929, "grad_norm": 0.15661844611167908, "learning_rate": 1.7416521056479577e-05, "loss": 1.1509, "step": 7050 }, { "epoch": 2.6251236169445384, "grad_norm": 0.16214154660701752, "learning_rate": 1.7415705786089784e-05, "loss": 1.162, "step": 7051 }, { "epoch": 2.6254959220951473, "grad_norm": 0.16358621418476105, "learning_rate": 1.7414890406170487e-05, "loss": 1.1727, "step": 7052 }, { "epoch": 2.625868227245756, "grad_norm": 0.1643485575914383, "learning_rate": 1.7414074916733715e-05, "loss": 1.168, "step": 7053 }, { "epoch": 2.6262405323963653, "grad_norm": 0.16986949741840363, "learning_rate": 1.7413259317791528e-05, "loss": 1.1664, "step": 7054 }, { "epoch": 2.6266128375469746, "grad_norm": 0.17110756039619446, "learning_rate": 1.7412443609355967e-05, "loss": 1.163, "step": 7055 }, { "epoch": 2.6269851426975834, "grad_norm": 0.16550353169441223, "learning_rate": 1.7411627791439073e-05, "loss": 1.1659, "step": 7056 }, { "epoch": 2.6273574478481927, "grad_norm": 0.16365563869476318, "learning_rate": 1.7410811864052908e-05, "loss": 1.1871, "step": 7057 }, { "epoch": 2.6277297529988015, "grad_norm": 0.17241568863391876, "learning_rate": 1.7409995827209517e-05, "loss": 1.1757, "step": 7058 }, { "epoch": 2.628102058149411, "grad_norm": 0.16813285648822784, "learning_rate": 1.7409179680920945e-05, "loss": 1.1615, "step": 7059 }, { "epoch": 2.62847436330002, "grad_norm": 0.15777957439422607, "learning_rate": 1.740836342519926e-05, "loss": 1.167, "step": 7060 }, { "epoch": 2.628846668450629, "grad_norm": 0.1858019232749939, "learning_rate": 1.7407547060056514e-05, "loss": 1.1673, "step": 7061 }, { "epoch": 2.6292189736012377, "grad_norm": 0.15979260206222534, "learning_rate": 1.740673058550476e-05, "loss": 1.178, "step": 7062 }, { "epoch": 2.629591278751847, "grad_norm": 0.1698780208826065, "learning_rate": 1.7405914001556058e-05, "loss": 1.1721, "step": 7063 }, { "epoch": 2.629963583902456, "grad_norm": 0.16836461424827576, "learning_rate": 1.7405097308222474e-05, "loss": 1.1717, "step": 7064 }, { "epoch": 2.630335889053065, "grad_norm": 0.16282488405704498, "learning_rate": 1.740428050551607e-05, "loss": 1.1735, "step": 7065 }, { "epoch": 2.6307081942036743, "grad_norm": 0.16781288385391235, "learning_rate": 1.74034635934489e-05, "loss": 1.2007, "step": 7066 }, { "epoch": 2.631080499354283, "grad_norm": 0.16193710267543793, "learning_rate": 1.7402646572033043e-05, "loss": 1.1854, "step": 7067 }, { "epoch": 2.6314528045048924, "grad_norm": 0.16660821437835693, "learning_rate": 1.7401829441280563e-05, "loss": 1.1706, "step": 7068 }, { "epoch": 2.6318251096555016, "grad_norm": 0.16784422099590302, "learning_rate": 1.740101220120352e-05, "loss": 1.1672, "step": 7069 }, { "epoch": 2.6321974148061105, "grad_norm": 0.17018967866897583, "learning_rate": 1.7400194851813994e-05, "loss": 1.1758, "step": 7070 }, { "epoch": 2.6325697199567193, "grad_norm": 0.16101203858852386, "learning_rate": 1.7399377393124056e-05, "loss": 1.185, "step": 7071 }, { "epoch": 2.6329420251073286, "grad_norm": 0.16601742804050446, "learning_rate": 1.7398559825145776e-05, "loss": 1.17, "step": 7072 }, { "epoch": 2.633314330257938, "grad_norm": 0.16237667202949524, "learning_rate": 1.7397742147891234e-05, "loss": 1.1807, "step": 7073 }, { "epoch": 2.6336866354085466, "grad_norm": 0.19661523401737213, "learning_rate": 1.7396924361372504e-05, "loss": 1.1696, "step": 7074 }, { "epoch": 2.634058940559156, "grad_norm": 0.18306462466716766, "learning_rate": 1.7396106465601662e-05, "loss": 1.1693, "step": 7075 }, { "epoch": 2.6344312457097647, "grad_norm": 0.16825948655605316, "learning_rate": 1.7395288460590797e-05, "loss": 1.1752, "step": 7076 }, { "epoch": 2.634803550860374, "grad_norm": 0.18328233063220978, "learning_rate": 1.739447034635198e-05, "loss": 1.1778, "step": 7077 }, { "epoch": 2.6351758560109833, "grad_norm": 0.1660008579492569, "learning_rate": 1.7393652122897306e-05, "loss": 1.1784, "step": 7078 }, { "epoch": 2.635548161161592, "grad_norm": 0.1794942319393158, "learning_rate": 1.7392833790238854e-05, "loss": 1.1743, "step": 7079 }, { "epoch": 2.635920466312201, "grad_norm": 0.16286969184875488, "learning_rate": 1.7392015348388707e-05, "loss": 1.1833, "step": 7080 }, { "epoch": 2.63629277146281, "grad_norm": 0.18671861290931702, "learning_rate": 1.7391196797358957e-05, "loss": 1.1811, "step": 7081 }, { "epoch": 2.6366650766134194, "grad_norm": 0.16873905062675476, "learning_rate": 1.7390378137161694e-05, "loss": 1.1764, "step": 7082 }, { "epoch": 2.6370373817640282, "grad_norm": 0.16419054567813873, "learning_rate": 1.7389559367809012e-05, "loss": 1.1631, "step": 7083 }, { "epoch": 2.6374096869146375, "grad_norm": 0.2097456008195877, "learning_rate": 1.7388740489313e-05, "loss": 1.1651, "step": 7084 }, { "epoch": 2.6377819920652463, "grad_norm": 0.19423960149288177, "learning_rate": 1.7387921501685757e-05, "loss": 1.1651, "step": 7085 }, { "epoch": 2.6381542972158556, "grad_norm": 0.17196422815322876, "learning_rate": 1.7387102404939375e-05, "loss": 1.1788, "step": 7086 }, { "epoch": 2.638526602366465, "grad_norm": 0.2757280170917511, "learning_rate": 1.7386283199085957e-05, "loss": 1.182, "step": 7087 }, { "epoch": 2.6388989075170737, "grad_norm": 0.16954492032527924, "learning_rate": 1.73854638841376e-05, "loss": 1.1732, "step": 7088 }, { "epoch": 2.6392712126676825, "grad_norm": 0.1665467768907547, "learning_rate": 1.7384644460106403e-05, "loss": 1.1731, "step": 7089 }, { "epoch": 2.6396435178182918, "grad_norm": 0.16451166570186615, "learning_rate": 1.738382492700447e-05, "loss": 1.1616, "step": 7090 }, { "epoch": 2.640015822968901, "grad_norm": 0.16248999536037445, "learning_rate": 1.7383005284843902e-05, "loss": 1.1636, "step": 7091 }, { "epoch": 2.64038812811951, "grad_norm": 0.1698039472103119, "learning_rate": 1.7382185533636815e-05, "loss": 1.1749, "step": 7092 }, { "epoch": 2.640760433270119, "grad_norm": 0.17128440737724304, "learning_rate": 1.738136567339531e-05, "loss": 1.1702, "step": 7093 }, { "epoch": 2.641132738420728, "grad_norm": 0.16965392231941223, "learning_rate": 1.7380545704131496e-05, "loss": 1.1698, "step": 7094 }, { "epoch": 2.641505043571337, "grad_norm": 0.17016279697418213, "learning_rate": 1.737972562585749e-05, "loss": 1.1715, "step": 7095 }, { "epoch": 2.6418773487219465, "grad_norm": 0.16740964353084564, "learning_rate": 1.7378905438585394e-05, "loss": 1.1743, "step": 7096 }, { "epoch": 2.6422496538725553, "grad_norm": 0.16494162380695343, "learning_rate": 1.737808514232733e-05, "loss": 1.1626, "step": 7097 }, { "epoch": 2.642621959023164, "grad_norm": 0.17924334108829498, "learning_rate": 1.7377264737095408e-05, "loss": 1.1731, "step": 7098 }, { "epoch": 2.6429942641737734, "grad_norm": 0.16398359835147858, "learning_rate": 1.7376444222901754e-05, "loss": 1.1804, "step": 7099 }, { "epoch": 2.6433665693243826, "grad_norm": 0.16579513251781464, "learning_rate": 1.737562359975848e-05, "loss": 1.1583, "step": 7100 }, { "epoch": 2.6437388744749915, "grad_norm": 0.16405485570430756, "learning_rate": 1.7374802867677706e-05, "loss": 1.1651, "step": 7101 }, { "epoch": 2.6441111796256007, "grad_norm": 0.16502594947814941, "learning_rate": 1.7373982026671557e-05, "loss": 1.1766, "step": 7102 }, { "epoch": 2.6444834847762095, "grad_norm": 0.16717404127120972, "learning_rate": 1.737316107675216e-05, "loss": 1.1841, "step": 7103 }, { "epoch": 2.644855789926819, "grad_norm": 0.16163359582424164, "learning_rate": 1.7372340017931636e-05, "loss": 1.1678, "step": 7104 }, { "epoch": 2.645228095077428, "grad_norm": 0.17032460868358612, "learning_rate": 1.737151885022211e-05, "loss": 1.1817, "step": 7105 }, { "epoch": 2.645600400228037, "grad_norm": 0.16142508387565613, "learning_rate": 1.7370697573635714e-05, "loss": 1.1869, "step": 7106 }, { "epoch": 2.6459727053786457, "grad_norm": 0.15926913917064667, "learning_rate": 1.7369876188184577e-05, "loss": 1.1717, "step": 7107 }, { "epoch": 2.646345010529255, "grad_norm": 0.1710928976535797, "learning_rate": 1.7369054693880832e-05, "loss": 1.1809, "step": 7108 }, { "epoch": 2.6467173156798642, "grad_norm": 0.1716298758983612, "learning_rate": 1.7368233090736613e-05, "loss": 1.1709, "step": 7109 }, { "epoch": 2.647089620830473, "grad_norm": 0.16648998856544495, "learning_rate": 1.736741137876405e-05, "loss": 1.1778, "step": 7110 }, { "epoch": 2.6474619259810823, "grad_norm": 0.16086730360984802, "learning_rate": 1.7366589557975287e-05, "loss": 1.1909, "step": 7111 }, { "epoch": 2.647834231131691, "grad_norm": 0.16966967284679413, "learning_rate": 1.7365767628382456e-05, "loss": 1.1799, "step": 7112 }, { "epoch": 2.6482065362823004, "grad_norm": 0.16273948550224304, "learning_rate": 1.7364945589997703e-05, "loss": 1.1777, "step": 7113 }, { "epoch": 2.6485788414329097, "grad_norm": 0.16940386593341827, "learning_rate": 1.736412344283316e-05, "loss": 1.1837, "step": 7114 }, { "epoch": 2.6489511465835185, "grad_norm": 0.17343072593212128, "learning_rate": 1.736330118690098e-05, "loss": 1.173, "step": 7115 }, { "epoch": 2.6493234517341273, "grad_norm": 0.16787442564964294, "learning_rate": 1.73624788222133e-05, "loss": 1.1676, "step": 7116 }, { "epoch": 2.6496957568847366, "grad_norm": 0.1711404025554657, "learning_rate": 1.7361656348782275e-05, "loss": 1.1643, "step": 7117 }, { "epoch": 2.650068062035346, "grad_norm": 0.17250101268291473, "learning_rate": 1.7360833766620046e-05, "loss": 1.1709, "step": 7118 }, { "epoch": 2.6504403671859547, "grad_norm": 0.16760481894016266, "learning_rate": 1.7360011075738762e-05, "loss": 1.1867, "step": 7119 }, { "epoch": 2.650812672336564, "grad_norm": 0.16904081404209137, "learning_rate": 1.7359188276150578e-05, "loss": 1.1658, "step": 7120 }, { "epoch": 2.6511849774871727, "grad_norm": 0.16162091493606567, "learning_rate": 1.7358365367867643e-05, "loss": 1.1653, "step": 7121 }, { "epoch": 2.651557282637782, "grad_norm": 0.166825532913208, "learning_rate": 1.7357542350902114e-05, "loss": 1.1697, "step": 7122 }, { "epoch": 2.6519295877883913, "grad_norm": 0.16394229233264923, "learning_rate": 1.7356719225266147e-05, "loss": 1.1705, "step": 7123 }, { "epoch": 2.652301892939, "grad_norm": 0.16806428134441376, "learning_rate": 1.73558959909719e-05, "loss": 1.1602, "step": 7124 }, { "epoch": 2.6526741980896094, "grad_norm": 0.16216734051704407, "learning_rate": 1.735507264803153e-05, "loss": 1.1691, "step": 7125 }, { "epoch": 2.653046503240218, "grad_norm": 0.17059317231178284, "learning_rate": 1.7354249196457198e-05, "loss": 1.1784, "step": 7126 }, { "epoch": 2.6534188083908274, "grad_norm": 0.1767067313194275, "learning_rate": 1.7353425636261067e-05, "loss": 1.1686, "step": 7127 }, { "epoch": 2.6537911135414363, "grad_norm": 0.16557256877422333, "learning_rate": 1.7352601967455303e-05, "loss": 1.1818, "step": 7128 }, { "epoch": 2.6541634186920455, "grad_norm": 0.17525194585323334, "learning_rate": 1.7351778190052067e-05, "loss": 1.1759, "step": 7129 }, { "epoch": 2.6545357238426543, "grad_norm": 0.16801874339580536, "learning_rate": 1.7350954304063528e-05, "loss": 1.1652, "step": 7130 }, { "epoch": 2.6549080289932636, "grad_norm": 0.16785259544849396, "learning_rate": 1.7350130309501855e-05, "loss": 1.1728, "step": 7131 }, { "epoch": 2.655280334143873, "grad_norm": 0.18316105008125305, "learning_rate": 1.734930620637922e-05, "loss": 1.1842, "step": 7132 }, { "epoch": 2.6556526392944817, "grad_norm": 0.18117022514343262, "learning_rate": 1.7348481994707795e-05, "loss": 1.1717, "step": 7133 }, { "epoch": 2.656024944445091, "grad_norm": 0.17637571692466736, "learning_rate": 1.734765767449975e-05, "loss": 1.1801, "step": 7134 }, { "epoch": 2.6563972495957, "grad_norm": 0.1829034388065338, "learning_rate": 1.7346833245767265e-05, "loss": 1.1731, "step": 7135 }, { "epoch": 2.656769554746309, "grad_norm": 0.17745697498321533, "learning_rate": 1.734600870852251e-05, "loss": 1.1639, "step": 7136 }, { "epoch": 2.657141859896918, "grad_norm": 0.176448792219162, "learning_rate": 1.7345184062777668e-05, "loss": 1.1718, "step": 7137 }, { "epoch": 2.657514165047527, "grad_norm": 0.1889655888080597, "learning_rate": 1.734435930854492e-05, "loss": 1.1717, "step": 7138 }, { "epoch": 2.657886470198136, "grad_norm": 0.21692200005054474, "learning_rate": 1.7343534445836446e-05, "loss": 1.1726, "step": 7139 }, { "epoch": 2.658258775348745, "grad_norm": 0.22923436760902405, "learning_rate": 1.7342709474664426e-05, "loss": 1.177, "step": 7140 }, { "epoch": 2.6586310804993545, "grad_norm": 0.2060500830411911, "learning_rate": 1.7341884395041052e-05, "loss": 1.1685, "step": 7141 }, { "epoch": 2.6590033856499633, "grad_norm": 0.16787934303283691, "learning_rate": 1.7341059206978505e-05, "loss": 1.1647, "step": 7142 }, { "epoch": 2.6593756908005726, "grad_norm": 0.2028505802154541, "learning_rate": 1.7340233910488973e-05, "loss": 1.1583, "step": 7143 }, { "epoch": 2.6597479959511814, "grad_norm": 0.18768154084682465, "learning_rate": 1.7339408505584653e-05, "loss": 1.1632, "step": 7144 }, { "epoch": 2.6601203011017907, "grad_norm": 0.17588740587234497, "learning_rate": 1.7338582992277723e-05, "loss": 1.1676, "step": 7145 }, { "epoch": 2.6604926062523995, "grad_norm": 0.19588559865951538, "learning_rate": 1.7337757370580385e-05, "loss": 1.1545, "step": 7146 }, { "epoch": 2.6608649114030087, "grad_norm": 0.16614019870758057, "learning_rate": 1.733693164050483e-05, "loss": 1.1864, "step": 7147 }, { "epoch": 2.6612372165536176, "grad_norm": 0.17815782129764557, "learning_rate": 1.7336105802063255e-05, "loss": 1.1624, "step": 7148 }, { "epoch": 2.661609521704227, "grad_norm": 0.172567680478096, "learning_rate": 1.7335279855267858e-05, "loss": 1.1725, "step": 7149 }, { "epoch": 2.661981826854836, "grad_norm": 0.16731174290180206, "learning_rate": 1.733445380013084e-05, "loss": 1.1608, "step": 7150 }, { "epoch": 2.662354132005445, "grad_norm": 0.17305542528629303, "learning_rate": 1.7333627636664397e-05, "loss": 1.1803, "step": 7151 }, { "epoch": 2.662726437156054, "grad_norm": 0.16189545392990112, "learning_rate": 1.7332801364880734e-05, "loss": 1.1735, "step": 7152 }, { "epoch": 2.663098742306663, "grad_norm": 0.25279855728149414, "learning_rate": 1.7331974984792056e-05, "loss": 1.1793, "step": 7153 }, { "epoch": 2.6634710474572723, "grad_norm": 0.1899125874042511, "learning_rate": 1.733114849641057e-05, "loss": 1.1931, "step": 7154 }, { "epoch": 2.663843352607881, "grad_norm": 0.17817838490009308, "learning_rate": 1.7330321899748476e-05, "loss": 1.1718, "step": 7155 }, { "epoch": 2.6642156577584903, "grad_norm": 0.16445818543434143, "learning_rate": 1.732949519481799e-05, "loss": 1.1743, "step": 7156 }, { "epoch": 2.664587962909099, "grad_norm": 0.1586284637451172, "learning_rate": 1.732866838163132e-05, "loss": 1.173, "step": 7157 }, { "epoch": 2.6649602680597084, "grad_norm": 0.1685306876897812, "learning_rate": 1.7327841460200677e-05, "loss": 1.1722, "step": 7158 }, { "epoch": 2.6653325732103177, "grad_norm": 0.16867049038410187, "learning_rate": 1.732701443053828e-05, "loss": 1.1736, "step": 7159 }, { "epoch": 2.6657048783609265, "grad_norm": 0.16906088590621948, "learning_rate": 1.7326187292656332e-05, "loss": 1.1659, "step": 7160 }, { "epoch": 2.6660771835115358, "grad_norm": 0.16725754737854004, "learning_rate": 1.7325360046567065e-05, "loss": 1.1736, "step": 7161 }, { "epoch": 2.6664494886621446, "grad_norm": 0.1647002398967743, "learning_rate": 1.732453269228268e-05, "loss": 1.1687, "step": 7162 }, { "epoch": 2.666821793812754, "grad_norm": 0.16950398683547974, "learning_rate": 1.7323705229815416e-05, "loss": 1.1733, "step": 7163 }, { "epoch": 2.6671940989633627, "grad_norm": 0.16586348414421082, "learning_rate": 1.7322877659177482e-05, "loss": 1.1649, "step": 7164 }, { "epoch": 2.667566404113972, "grad_norm": 0.1677524447441101, "learning_rate": 1.732204998038111e-05, "loss": 1.1786, "step": 7165 }, { "epoch": 2.6679387092645808, "grad_norm": 0.16064026951789856, "learning_rate": 1.7321222193438513e-05, "loss": 1.1817, "step": 7166 }, { "epoch": 2.66831101441519, "grad_norm": 0.17599624395370483, "learning_rate": 1.7320394298361926e-05, "loss": 1.1767, "step": 7167 }, { "epoch": 2.6686833195657993, "grad_norm": 0.16839216649532318, "learning_rate": 1.7319566295163572e-05, "loss": 1.165, "step": 7168 }, { "epoch": 2.669055624716408, "grad_norm": 0.16207967698574066, "learning_rate": 1.7318738183855685e-05, "loss": 1.1772, "step": 7169 }, { "epoch": 2.6694279298670174, "grad_norm": 0.1666906476020813, "learning_rate": 1.7317909964450494e-05, "loss": 1.1723, "step": 7170 }, { "epoch": 2.669800235017626, "grad_norm": 0.1715075820684433, "learning_rate": 1.731708163696023e-05, "loss": 1.1788, "step": 7171 }, { "epoch": 2.6701725401682355, "grad_norm": 0.1627563238143921, "learning_rate": 1.7316253201397134e-05, "loss": 1.1688, "step": 7172 }, { "epoch": 2.6705448453188447, "grad_norm": 0.1687730997800827, "learning_rate": 1.7315424657773433e-05, "loss": 1.1657, "step": 7173 }, { "epoch": 2.6709171504694535, "grad_norm": 0.16402725875377655, "learning_rate": 1.7314596006101372e-05, "loss": 1.1852, "step": 7174 }, { "epoch": 2.6712894556200624, "grad_norm": 0.16662922501564026, "learning_rate": 1.7313767246393184e-05, "loss": 1.1782, "step": 7175 }, { "epoch": 2.6716617607706716, "grad_norm": 0.1664755940437317, "learning_rate": 1.7312938378661118e-05, "loss": 1.1724, "step": 7176 }, { "epoch": 2.672034065921281, "grad_norm": 0.15922842919826508, "learning_rate": 1.7312109402917406e-05, "loss": 1.1719, "step": 7177 }, { "epoch": 2.6724063710718897, "grad_norm": 0.16002245247364044, "learning_rate": 1.73112803191743e-05, "loss": 1.176, "step": 7178 }, { "epoch": 2.672778676222499, "grad_norm": 0.16558903455734253, "learning_rate": 1.731045112744404e-05, "loss": 1.1703, "step": 7179 }, { "epoch": 2.673150981373108, "grad_norm": 0.16779029369354248, "learning_rate": 1.7309621827738877e-05, "loss": 1.1909, "step": 7180 }, { "epoch": 2.673523286523717, "grad_norm": 0.16729268431663513, "learning_rate": 1.730879242007106e-05, "loss": 1.1688, "step": 7181 }, { "epoch": 2.6738955916743263, "grad_norm": 0.1586335152387619, "learning_rate": 1.7307962904452837e-05, "loss": 1.1748, "step": 7182 }, { "epoch": 2.674267896824935, "grad_norm": 0.16203922033309937, "learning_rate": 1.730713328089646e-05, "loss": 1.1755, "step": 7183 }, { "epoch": 2.674640201975544, "grad_norm": 0.17098890244960785, "learning_rate": 1.730630354941418e-05, "loss": 1.1814, "step": 7184 }, { "epoch": 2.6750125071261532, "grad_norm": 0.16195635497570038, "learning_rate": 1.7305473710018258e-05, "loss": 1.1654, "step": 7185 }, { "epoch": 2.6753848122767625, "grad_norm": 0.16069965064525604, "learning_rate": 1.730464376272095e-05, "loss": 1.1661, "step": 7186 }, { "epoch": 2.6757571174273713, "grad_norm": 0.1605072170495987, "learning_rate": 1.7303813707534506e-05, "loss": 1.1829, "step": 7187 }, { "epoch": 2.6761294225779806, "grad_norm": 0.16202007234096527, "learning_rate": 1.7302983544471197e-05, "loss": 1.1668, "step": 7188 }, { "epoch": 2.6765017277285894, "grad_norm": 0.16660577058792114, "learning_rate": 1.7302153273543276e-05, "loss": 1.1741, "step": 7189 }, { "epoch": 2.6768740328791987, "grad_norm": 0.16862109303474426, "learning_rate": 1.7301322894763013e-05, "loss": 1.1775, "step": 7190 }, { "epoch": 2.677246338029808, "grad_norm": 0.1640176773071289, "learning_rate": 1.7300492408142666e-05, "loss": 1.1671, "step": 7191 }, { "epoch": 2.6776186431804168, "grad_norm": 0.1989293098449707, "learning_rate": 1.729966181369451e-05, "loss": 1.1844, "step": 7192 }, { "epoch": 2.6779909483310256, "grad_norm": 0.21205829083919525, "learning_rate": 1.72988311114308e-05, "loss": 1.1789, "step": 7193 }, { "epoch": 2.678363253481635, "grad_norm": 0.16825103759765625, "learning_rate": 1.7298000301363815e-05, "loss": 1.1659, "step": 7194 }, { "epoch": 2.678735558632244, "grad_norm": 0.5028261542320251, "learning_rate": 1.729716938350582e-05, "loss": 1.1628, "step": 7195 }, { "epoch": 2.679107863782853, "grad_norm": 0.1976155936717987, "learning_rate": 1.72963383578691e-05, "loss": 1.172, "step": 7196 }, { "epoch": 2.679480168933462, "grad_norm": 0.18413805961608887, "learning_rate": 1.7295507224465913e-05, "loss": 1.1695, "step": 7197 }, { "epoch": 2.679852474084071, "grad_norm": 0.17518746852874756, "learning_rate": 1.7294675983308545e-05, "loss": 1.1602, "step": 7198 }, { "epoch": 2.6802247792346803, "grad_norm": 0.1705147624015808, "learning_rate": 1.729384463440927e-05, "loss": 1.1783, "step": 7199 }, { "epoch": 2.6805970843852895, "grad_norm": 0.18338525295257568, "learning_rate": 1.7293013177780368e-05, "loss": 1.1786, "step": 7200 }, { "epoch": 2.6809693895358984, "grad_norm": 0.1804172545671463, "learning_rate": 1.7292181613434117e-05, "loss": 1.1682, "step": 7201 }, { "epoch": 2.681341694686507, "grad_norm": 0.17289164662361145, "learning_rate": 1.7291349941382804e-05, "loss": 1.1769, "step": 7202 }, { "epoch": 2.6817139998371164, "grad_norm": 0.15843546390533447, "learning_rate": 1.7290518161638707e-05, "loss": 1.1736, "step": 7203 }, { "epoch": 2.6820863049877257, "grad_norm": 0.16561047732830048, "learning_rate": 1.7289686274214116e-05, "loss": 1.1621, "step": 7204 }, { "epoch": 2.6824586101383345, "grad_norm": 0.16593630611896515, "learning_rate": 1.7288854279121318e-05, "loss": 1.1724, "step": 7205 }, { "epoch": 2.682830915288944, "grad_norm": 0.16903571784496307, "learning_rate": 1.7288022176372597e-05, "loss": 1.1679, "step": 7206 }, { "epoch": 2.6832032204395526, "grad_norm": 0.16603899002075195, "learning_rate": 1.7287189965980245e-05, "loss": 1.1607, "step": 7207 }, { "epoch": 2.683575525590162, "grad_norm": 0.16013038158416748, "learning_rate": 1.728635764795656e-05, "loss": 1.1641, "step": 7208 }, { "epoch": 2.683947830740771, "grad_norm": 0.16311000287532806, "learning_rate": 1.7285525222313823e-05, "loss": 1.1721, "step": 7209 }, { "epoch": 2.68432013589138, "grad_norm": 0.16615121066570282, "learning_rate": 1.728469268906434e-05, "loss": 1.181, "step": 7210 }, { "epoch": 2.684692441041989, "grad_norm": 0.16762305796146393, "learning_rate": 1.7283860048220403e-05, "loss": 1.1645, "step": 7211 }, { "epoch": 2.685064746192598, "grad_norm": 0.16307219862937927, "learning_rate": 1.7283027299794306e-05, "loss": 1.1769, "step": 7212 }, { "epoch": 2.6854370513432073, "grad_norm": 0.16042660176753998, "learning_rate": 1.7282194443798358e-05, "loss": 1.1636, "step": 7213 }, { "epoch": 2.685809356493816, "grad_norm": 0.16484393179416656, "learning_rate": 1.7281361480244852e-05, "loss": 1.1716, "step": 7214 }, { "epoch": 2.6861816616444254, "grad_norm": 0.16563540697097778, "learning_rate": 1.7280528409146097e-05, "loss": 1.1618, "step": 7215 }, { "epoch": 2.686553966795034, "grad_norm": 0.16255468130111694, "learning_rate": 1.7279695230514392e-05, "loss": 1.1771, "step": 7216 }, { "epoch": 2.6869262719456435, "grad_norm": 0.16036321222782135, "learning_rate": 1.7278861944362045e-05, "loss": 1.1671, "step": 7217 }, { "epoch": 2.6872985770962527, "grad_norm": 0.16216713190078735, "learning_rate": 1.7278028550701364e-05, "loss": 1.1772, "step": 7218 }, { "epoch": 2.6876708822468616, "grad_norm": 0.16452451050281525, "learning_rate": 1.727719504954466e-05, "loss": 1.1639, "step": 7219 }, { "epoch": 2.6880431873974704, "grad_norm": 0.16761431097984314, "learning_rate": 1.727636144090424e-05, "loss": 1.1635, "step": 7220 }, { "epoch": 2.6884154925480797, "grad_norm": 0.16240081191062927, "learning_rate": 1.7275527724792416e-05, "loss": 1.1699, "step": 7221 }, { "epoch": 2.688787797698689, "grad_norm": 0.16535256803035736, "learning_rate": 1.7274693901221507e-05, "loss": 1.1815, "step": 7222 }, { "epoch": 2.6891601028492977, "grad_norm": 0.16197264194488525, "learning_rate": 1.7273859970203825e-05, "loss": 1.1708, "step": 7223 }, { "epoch": 2.689532407999907, "grad_norm": 0.1600698083639145, "learning_rate": 1.727302593175169e-05, "loss": 1.1802, "step": 7224 }, { "epoch": 2.689904713150516, "grad_norm": 0.1599569022655487, "learning_rate": 1.7272191785877415e-05, "loss": 1.1891, "step": 7225 }, { "epoch": 2.690277018301125, "grad_norm": 0.16361677646636963, "learning_rate": 1.7271357532593325e-05, "loss": 1.1736, "step": 7226 }, { "epoch": 2.6906493234517344, "grad_norm": 0.16064906120300293, "learning_rate": 1.727052317191174e-05, "loss": 1.168, "step": 7227 }, { "epoch": 2.691021628602343, "grad_norm": 0.16545885801315308, "learning_rate": 1.7269688703844984e-05, "loss": 1.1683, "step": 7228 }, { "epoch": 2.691393933752952, "grad_norm": 0.16295583546161652, "learning_rate": 1.7268854128405384e-05, "loss": 1.1605, "step": 7229 }, { "epoch": 2.6917662389035613, "grad_norm": 0.16552822291851044, "learning_rate": 1.7268019445605263e-05, "loss": 1.161, "step": 7230 }, { "epoch": 2.6921385440541705, "grad_norm": 0.16663306951522827, "learning_rate": 1.726718465545695e-05, "loss": 1.17, "step": 7231 }, { "epoch": 2.6925108492047793, "grad_norm": 0.15905286371707916, "learning_rate": 1.726634975797278e-05, "loss": 1.1616, "step": 7232 }, { "epoch": 2.6928831543553886, "grad_norm": 0.1620490998029709, "learning_rate": 1.7265514753165075e-05, "loss": 1.176, "step": 7233 }, { "epoch": 2.6932554595059974, "grad_norm": 0.16328297555446625, "learning_rate": 1.7264679641046176e-05, "loss": 1.1792, "step": 7234 }, { "epoch": 2.6936277646566067, "grad_norm": 0.1633196771144867, "learning_rate": 1.7263844421628416e-05, "loss": 1.1659, "step": 7235 }, { "epoch": 2.694000069807216, "grad_norm": 0.1595584601163864, "learning_rate": 1.7263009094924125e-05, "loss": 1.1728, "step": 7236 }, { "epoch": 2.694372374957825, "grad_norm": 0.16301463544368744, "learning_rate": 1.7262173660945648e-05, "loss": 1.1735, "step": 7237 }, { "epoch": 2.6947446801084336, "grad_norm": 0.16339920461177826, "learning_rate": 1.7261338119705323e-05, "loss": 1.1772, "step": 7238 }, { "epoch": 2.695116985259043, "grad_norm": 0.16579227149486542, "learning_rate": 1.7260502471215488e-05, "loss": 1.171, "step": 7239 }, { "epoch": 2.695489290409652, "grad_norm": 0.164364293217659, "learning_rate": 1.7259666715488487e-05, "loss": 1.1683, "step": 7240 }, { "epoch": 2.695861595560261, "grad_norm": 0.16661317646503448, "learning_rate": 1.7258830852536666e-05, "loss": 1.1794, "step": 7241 }, { "epoch": 2.69623390071087, "grad_norm": 0.16931530833244324, "learning_rate": 1.7257994882372368e-05, "loss": 1.1703, "step": 7242 }, { "epoch": 2.696606205861479, "grad_norm": 0.1614353507757187, "learning_rate": 1.725715880500794e-05, "loss": 1.1639, "step": 7243 }, { "epoch": 2.6969785110120883, "grad_norm": 0.16335289180278778, "learning_rate": 1.7256322620455733e-05, "loss": 1.1744, "step": 7244 }, { "epoch": 2.6973508161626976, "grad_norm": 0.16357359290122986, "learning_rate": 1.7255486328728096e-05, "loss": 1.1627, "step": 7245 }, { "epoch": 2.6977231213133064, "grad_norm": 0.16286170482635498, "learning_rate": 1.725464992983738e-05, "loss": 1.1709, "step": 7246 }, { "epoch": 2.6980954264639156, "grad_norm": 0.16695433855056763, "learning_rate": 1.7253813423795943e-05, "loss": 1.1667, "step": 7247 }, { "epoch": 2.6984677316145245, "grad_norm": 0.15719039738178253, "learning_rate": 1.7252976810616134e-05, "loss": 1.1666, "step": 7248 }, { "epoch": 2.6988400367651337, "grad_norm": 0.15986768901348114, "learning_rate": 1.7252140090310314e-05, "loss": 1.1727, "step": 7249 }, { "epoch": 2.6992123419157426, "grad_norm": 0.15737248957157135, "learning_rate": 1.7251303262890838e-05, "loss": 1.181, "step": 7250 }, { "epoch": 2.699584647066352, "grad_norm": 0.1623496413230896, "learning_rate": 1.725046632837007e-05, "loss": 1.1727, "step": 7251 }, { "epoch": 2.6999569522169606, "grad_norm": 0.16519080102443695, "learning_rate": 1.724962928676037e-05, "loss": 1.1729, "step": 7252 }, { "epoch": 2.70032925736757, "grad_norm": 0.16225138306617737, "learning_rate": 1.72487921380741e-05, "loss": 1.1672, "step": 7253 }, { "epoch": 2.700701562518179, "grad_norm": 0.16735146939754486, "learning_rate": 1.7247954882323622e-05, "loss": 1.1627, "step": 7254 }, { "epoch": 2.701073867668788, "grad_norm": 0.1635318249464035, "learning_rate": 1.724711751952131e-05, "loss": 1.166, "step": 7255 }, { "epoch": 2.7014461728193973, "grad_norm": 0.1578553318977356, "learning_rate": 1.7246280049679526e-05, "loss": 1.155, "step": 7256 }, { "epoch": 2.701818477970006, "grad_norm": 0.16123878955841064, "learning_rate": 1.7245442472810638e-05, "loss": 1.1906, "step": 7257 }, { "epoch": 2.7021907831206153, "grad_norm": 0.16336768865585327, "learning_rate": 1.724460478892702e-05, "loss": 1.179, "step": 7258 }, { "epoch": 2.702563088271224, "grad_norm": 0.16107387840747833, "learning_rate": 1.7243766998041045e-05, "loss": 1.1603, "step": 7259 }, { "epoch": 2.7029353934218334, "grad_norm": 0.15997155010700226, "learning_rate": 1.7242929100165085e-05, "loss": 1.1739, "step": 7260 }, { "epoch": 2.7033076985724422, "grad_norm": 0.16311846673488617, "learning_rate": 1.7242091095311516e-05, "loss": 1.1698, "step": 7261 }, { "epoch": 2.7036800037230515, "grad_norm": 0.16336789727210999, "learning_rate": 1.724125298349272e-05, "loss": 1.1584, "step": 7262 }, { "epoch": 2.7040523088736608, "grad_norm": 0.1627056896686554, "learning_rate": 1.7240414764721067e-05, "loss": 1.1787, "step": 7263 }, { "epoch": 2.7044246140242696, "grad_norm": 0.1628088355064392, "learning_rate": 1.7239576439008945e-05, "loss": 1.185, "step": 7264 }, { "epoch": 2.704796919174879, "grad_norm": 0.16446030139923096, "learning_rate": 1.723873800636873e-05, "loss": 1.1599, "step": 7265 }, { "epoch": 2.7051692243254877, "grad_norm": 0.16059374809265137, "learning_rate": 1.7237899466812814e-05, "loss": 1.1611, "step": 7266 }, { "epoch": 2.705541529476097, "grad_norm": 0.16304123401641846, "learning_rate": 1.7237060820353573e-05, "loss": 1.1764, "step": 7267 }, { "epoch": 2.7059138346267058, "grad_norm": 0.16345947980880737, "learning_rate": 1.7236222067003402e-05, "loss": 1.1757, "step": 7268 }, { "epoch": 2.706286139777315, "grad_norm": 0.15779638290405273, "learning_rate": 1.7235383206774682e-05, "loss": 1.1699, "step": 7269 }, { "epoch": 2.706658444927924, "grad_norm": 0.1547388732433319, "learning_rate": 1.7234544239679807e-05, "loss": 1.1618, "step": 7270 }, { "epoch": 2.707030750078533, "grad_norm": 0.16903281211853027, "learning_rate": 1.723370516573117e-05, "loss": 1.1696, "step": 7271 }, { "epoch": 2.7074030552291424, "grad_norm": 0.16696353256702423, "learning_rate": 1.7232865984941156e-05, "loss": 1.1588, "step": 7272 }, { "epoch": 2.707775360379751, "grad_norm": 0.1645137071609497, "learning_rate": 1.723202669732217e-05, "loss": 1.1645, "step": 7273 }, { "epoch": 2.7081476655303605, "grad_norm": 0.16324058175086975, "learning_rate": 1.72311873028866e-05, "loss": 1.1723, "step": 7274 }, { "epoch": 2.7085199706809693, "grad_norm": 0.16554492712020874, "learning_rate": 1.723034780164685e-05, "loss": 1.1936, "step": 7275 }, { "epoch": 2.7088922758315785, "grad_norm": 0.16024906933307648, "learning_rate": 1.7229508193615316e-05, "loss": 1.1689, "step": 7276 }, { "epoch": 2.7092645809821874, "grad_norm": 0.16716809570789337, "learning_rate": 1.72286684788044e-05, "loss": 1.1698, "step": 7277 }, { "epoch": 2.7096368861327966, "grad_norm": 0.16631504893302917, "learning_rate": 1.7227828657226506e-05, "loss": 1.1789, "step": 7278 }, { "epoch": 2.7100091912834054, "grad_norm": 0.1682237982749939, "learning_rate": 1.7226988728894033e-05, "loss": 1.1719, "step": 7279 }, { "epoch": 2.7103814964340147, "grad_norm": 0.15563975274562836, "learning_rate": 1.722614869381939e-05, "loss": 1.166, "step": 7280 }, { "epoch": 2.710753801584624, "grad_norm": 0.16396182775497437, "learning_rate": 1.7225308552014988e-05, "loss": 1.1777, "step": 7281 }, { "epoch": 2.711126106735233, "grad_norm": 0.16292405128479004, "learning_rate": 1.722446830349323e-05, "loss": 1.1783, "step": 7282 }, { "epoch": 2.711498411885842, "grad_norm": 0.15950658917427063, "learning_rate": 1.7223627948266526e-05, "loss": 1.1775, "step": 7283 }, { "epoch": 2.711870717036451, "grad_norm": 0.16486436128616333, "learning_rate": 1.7222787486347296e-05, "loss": 1.1664, "step": 7284 }, { "epoch": 2.71224302218706, "grad_norm": 0.16335855424404144, "learning_rate": 1.7221946917747945e-05, "loss": 1.1599, "step": 7285 }, { "epoch": 2.7126153273376694, "grad_norm": 0.15834614634513855, "learning_rate": 1.722110624248089e-05, "loss": 1.1644, "step": 7286 }, { "epoch": 2.7129876324882782, "grad_norm": 0.16369593143463135, "learning_rate": 1.722026546055855e-05, "loss": 1.1681, "step": 7287 }, { "epoch": 2.713359937638887, "grad_norm": 0.16682885587215424, "learning_rate": 1.7219424571993345e-05, "loss": 1.1763, "step": 7288 }, { "epoch": 2.7137322427894963, "grad_norm": 0.17135527729988098, "learning_rate": 1.721858357679769e-05, "loss": 1.1878, "step": 7289 }, { "epoch": 2.7141045479401056, "grad_norm": 0.16507208347320557, "learning_rate": 1.7217742474984006e-05, "loss": 1.1619, "step": 7290 }, { "epoch": 2.7144768530907144, "grad_norm": 0.16445507109165192, "learning_rate": 1.721690126656472e-05, "loss": 1.1794, "step": 7291 }, { "epoch": 2.7148491582413237, "grad_norm": 0.1608470231294632, "learning_rate": 1.7216059951552256e-05, "loss": 1.1688, "step": 7292 }, { "epoch": 2.7152214633919325, "grad_norm": 0.16081736981868744, "learning_rate": 1.7215218529959042e-05, "loss": 1.183, "step": 7293 }, { "epoch": 2.7155937685425418, "grad_norm": 0.16180992126464844, "learning_rate": 1.7214377001797498e-05, "loss": 1.1553, "step": 7294 }, { "epoch": 2.715966073693151, "grad_norm": 0.16335974633693695, "learning_rate": 1.7213535367080064e-05, "loss": 1.1667, "step": 7295 }, { "epoch": 2.71633837884376, "grad_norm": 0.1620166003704071, "learning_rate": 1.7212693625819163e-05, "loss": 1.1695, "step": 7296 }, { "epoch": 2.7167106839943687, "grad_norm": 0.15948939323425293, "learning_rate": 1.7211851778027226e-05, "loss": 1.1667, "step": 7297 }, { "epoch": 2.717082989144978, "grad_norm": 0.15840508043766022, "learning_rate": 1.7211009823716695e-05, "loss": 1.1611, "step": 7298 }, { "epoch": 2.717455294295587, "grad_norm": 0.1646668016910553, "learning_rate": 1.72101677629e-05, "loss": 1.1748, "step": 7299 }, { "epoch": 2.717827599446196, "grad_norm": 0.16672910749912262, "learning_rate": 1.720932559558958e-05, "loss": 1.1637, "step": 7300 }, { "epoch": 2.7181999045968053, "grad_norm": 0.16393324732780457, "learning_rate": 1.7208483321797876e-05, "loss": 1.1536, "step": 7301 }, { "epoch": 2.718572209747414, "grad_norm": 0.16275498270988464, "learning_rate": 1.720764094153732e-05, "loss": 1.1675, "step": 7302 }, { "epoch": 2.7189445148980234, "grad_norm": 0.167283296585083, "learning_rate": 1.7206798454820366e-05, "loss": 1.1612, "step": 7303 }, { "epoch": 2.7193168200486326, "grad_norm": 0.16104766726493835, "learning_rate": 1.7205955861659446e-05, "loss": 1.1844, "step": 7304 }, { "epoch": 2.7196891251992414, "grad_norm": 0.16111914813518524, "learning_rate": 1.7205113162067013e-05, "loss": 1.1702, "step": 7305 }, { "epoch": 2.7200614303498503, "grad_norm": 0.16553427278995514, "learning_rate": 1.720427035605551e-05, "loss": 1.1664, "step": 7306 }, { "epoch": 2.7204337355004595, "grad_norm": 0.16949054598808289, "learning_rate": 1.7203427443637385e-05, "loss": 1.1755, "step": 7307 }, { "epoch": 2.720806040651069, "grad_norm": 0.16320055723190308, "learning_rate": 1.7202584424825092e-05, "loss": 1.1693, "step": 7308 }, { "epoch": 2.7211783458016776, "grad_norm": 0.16071534156799316, "learning_rate": 1.7201741299631075e-05, "loss": 1.1695, "step": 7309 }, { "epoch": 2.721550650952287, "grad_norm": 0.16326375305652618, "learning_rate": 1.720089806806779e-05, "loss": 1.1712, "step": 7310 }, { "epoch": 2.7219229561028957, "grad_norm": 0.16622449457645416, "learning_rate": 1.72000547301477e-05, "loss": 1.1679, "step": 7311 }, { "epoch": 2.722295261253505, "grad_norm": 0.1643485128879547, "learning_rate": 1.7199211285883245e-05, "loss": 1.1791, "step": 7312 }, { "epoch": 2.7226675664041142, "grad_norm": 0.15881820023059845, "learning_rate": 1.7198367735286897e-05, "loss": 1.1625, "step": 7313 }, { "epoch": 2.723039871554723, "grad_norm": 0.16432297229766846, "learning_rate": 1.7197524078371105e-05, "loss": 1.1736, "step": 7314 }, { "epoch": 2.723412176705332, "grad_norm": 0.15992549061775208, "learning_rate": 1.7196680315148335e-05, "loss": 1.1772, "step": 7315 }, { "epoch": 2.723784481855941, "grad_norm": 0.16613058745861053, "learning_rate": 1.719583644563105e-05, "loss": 1.1808, "step": 7316 }, { "epoch": 2.7241567870065504, "grad_norm": 0.16339601576328278, "learning_rate": 1.7194992469831712e-05, "loss": 1.1593, "step": 7317 }, { "epoch": 2.724529092157159, "grad_norm": 0.15978363156318665, "learning_rate": 1.719414838776279e-05, "loss": 1.1716, "step": 7318 }, { "epoch": 2.7249013973077685, "grad_norm": 0.16619928181171417, "learning_rate": 1.719330419943674e-05, "loss": 1.1694, "step": 7319 }, { "epoch": 2.7252737024583773, "grad_norm": 0.16087765991687775, "learning_rate": 1.7192459904866042e-05, "loss": 1.1529, "step": 7320 }, { "epoch": 2.7256460076089866, "grad_norm": 0.16294820606708527, "learning_rate": 1.7191615504063165e-05, "loss": 1.1925, "step": 7321 }, { "epoch": 2.726018312759596, "grad_norm": 0.1619005650281906, "learning_rate": 1.7190770997040574e-05, "loss": 1.1692, "step": 7322 }, { "epoch": 2.7263906179102047, "grad_norm": 0.17229419946670532, "learning_rate": 1.7189926383810755e-05, "loss": 1.1677, "step": 7323 }, { "epoch": 2.7267629230608135, "grad_norm": 0.16301006078720093, "learning_rate": 1.7189081664386168e-05, "loss": 1.1832, "step": 7324 }, { "epoch": 2.7271352282114227, "grad_norm": 0.16268163919448853, "learning_rate": 1.7188236838779297e-05, "loss": 1.1718, "step": 7325 }, { "epoch": 2.727507533362032, "grad_norm": 0.16183678805828094, "learning_rate": 1.718739190700262e-05, "loss": 1.1765, "step": 7326 }, { "epoch": 2.727879838512641, "grad_norm": 0.16009078919887543, "learning_rate": 1.7186546869068612e-05, "loss": 1.1812, "step": 7327 }, { "epoch": 2.72825214366325, "grad_norm": 0.16223828494548798, "learning_rate": 1.718570172498976e-05, "loss": 1.1905, "step": 7328 }, { "epoch": 2.728624448813859, "grad_norm": 0.1691174954175949, "learning_rate": 1.7184856474778543e-05, "loss": 1.1672, "step": 7329 }, { "epoch": 2.728996753964468, "grad_norm": 0.1607164889574051, "learning_rate": 1.7184011118447448e-05, "loss": 1.1904, "step": 7330 }, { "epoch": 2.7293690591150774, "grad_norm": 0.16635142266750336, "learning_rate": 1.718316565600896e-05, "loss": 1.1633, "step": 7331 }, { "epoch": 2.7297413642656863, "grad_norm": 0.16551473736763, "learning_rate": 1.7182320087475567e-05, "loss": 1.1649, "step": 7332 }, { "epoch": 2.730113669416295, "grad_norm": 0.16264252364635468, "learning_rate": 1.7181474412859756e-05, "loss": 1.1627, "step": 7333 }, { "epoch": 2.7304859745669043, "grad_norm": 0.16819992661476135, "learning_rate": 1.718062863217402e-05, "loss": 1.1749, "step": 7334 }, { "epoch": 2.7308582797175136, "grad_norm": 0.16753548383712769, "learning_rate": 1.717978274543085e-05, "loss": 1.1715, "step": 7335 }, { "epoch": 2.7312305848681224, "grad_norm": 0.16801391541957855, "learning_rate": 1.7178936752642737e-05, "loss": 1.175, "step": 7336 }, { "epoch": 2.7316028900187317, "grad_norm": 0.16924819350242615, "learning_rate": 1.717809065382218e-05, "loss": 1.1764, "step": 7337 }, { "epoch": 2.7319751951693405, "grad_norm": 0.1656593382358551, "learning_rate": 1.7177244448981675e-05, "loss": 1.1834, "step": 7338 }, { "epoch": 2.7323475003199498, "grad_norm": 0.15879951417446136, "learning_rate": 1.7176398138133718e-05, "loss": 1.1816, "step": 7339 }, { "epoch": 2.732719805470559, "grad_norm": 0.16423895955085754, "learning_rate": 1.717555172129081e-05, "loss": 1.1789, "step": 7340 }, { "epoch": 2.733092110621168, "grad_norm": 0.16044852137565613, "learning_rate": 1.7174705198465454e-05, "loss": 1.158, "step": 7341 }, { "epoch": 2.7334644157717767, "grad_norm": 0.16656622290611267, "learning_rate": 1.7173858569670155e-05, "loss": 1.1623, "step": 7342 }, { "epoch": 2.733836720922386, "grad_norm": 0.16473737359046936, "learning_rate": 1.7173011834917415e-05, "loss": 1.1799, "step": 7343 }, { "epoch": 2.734209026072995, "grad_norm": 0.16188117861747742, "learning_rate": 1.7172164994219738e-05, "loss": 1.1787, "step": 7344 }, { "epoch": 2.734581331223604, "grad_norm": 0.17341145873069763, "learning_rate": 1.7171318047589637e-05, "loss": 1.178, "step": 7345 }, { "epoch": 2.7349536363742133, "grad_norm": 0.1620875597000122, "learning_rate": 1.7170470995039618e-05, "loss": 1.1842, "step": 7346 }, { "epoch": 2.735325941524822, "grad_norm": 0.16189490258693695, "learning_rate": 1.716962383658219e-05, "loss": 1.1682, "step": 7347 }, { "epoch": 2.7356982466754314, "grad_norm": 0.16094739735126495, "learning_rate": 1.716877657222987e-05, "loss": 1.173, "step": 7348 }, { "epoch": 2.7360705518260406, "grad_norm": 0.16400355100631714, "learning_rate": 1.7167929201995167e-05, "loss": 1.1718, "step": 7349 }, { "epoch": 2.7364428569766495, "grad_norm": 0.1662655919790268, "learning_rate": 1.7167081725890602e-05, "loss": 1.1556, "step": 7350 }, { "epoch": 2.7368151621272583, "grad_norm": 0.15632013976573944, "learning_rate": 1.716623414392869e-05, "loss": 1.1616, "step": 7351 }, { "epoch": 2.7371874672778675, "grad_norm": 0.15825137495994568, "learning_rate": 1.7165386456121948e-05, "loss": 1.1631, "step": 7352 }, { "epoch": 2.737559772428477, "grad_norm": 0.16320408880710602, "learning_rate": 1.71645386624829e-05, "loss": 1.1664, "step": 7353 }, { "epoch": 2.7379320775790856, "grad_norm": 0.17327001690864563, "learning_rate": 1.7163690763024063e-05, "loss": 1.1686, "step": 7354 }, { "epoch": 2.738304382729695, "grad_norm": 0.1668155938386917, "learning_rate": 1.7162842757757964e-05, "loss": 1.1665, "step": 7355 }, { "epoch": 2.7386766878803037, "grad_norm": 0.15803512930870056, "learning_rate": 1.716199464669713e-05, "loss": 1.1703, "step": 7356 }, { "epoch": 2.739048993030913, "grad_norm": 0.15973332524299622, "learning_rate": 1.716114642985408e-05, "loss": 1.1632, "step": 7357 }, { "epoch": 2.7394212981815222, "grad_norm": 0.17077676951885223, "learning_rate": 1.7160298107241347e-05, "loss": 1.1578, "step": 7358 }, { "epoch": 2.739793603332131, "grad_norm": 0.17319710552692413, "learning_rate": 1.7159449678871463e-05, "loss": 1.1749, "step": 7359 }, { "epoch": 2.7401659084827403, "grad_norm": 0.16504226624965668, "learning_rate": 1.7158601144756953e-05, "loss": 1.1835, "step": 7360 }, { "epoch": 2.740538213633349, "grad_norm": 0.1656225025653839, "learning_rate": 1.715775250491036e-05, "loss": 1.1673, "step": 7361 }, { "epoch": 2.7409105187839584, "grad_norm": 0.16402550041675568, "learning_rate": 1.7156903759344207e-05, "loss": 1.1798, "step": 7362 }, { "epoch": 2.7412828239345672, "grad_norm": 0.1609089970588684, "learning_rate": 1.715605490807103e-05, "loss": 1.1661, "step": 7363 }, { "epoch": 2.7416551290851765, "grad_norm": 0.1700262427330017, "learning_rate": 1.7155205951103378e-05, "loss": 1.1807, "step": 7364 }, { "epoch": 2.7420274342357853, "grad_norm": 0.16744990646839142, "learning_rate": 1.715435688845378e-05, "loss": 1.1824, "step": 7365 }, { "epoch": 2.7423997393863946, "grad_norm": 0.1665429025888443, "learning_rate": 1.715350772013478e-05, "loss": 1.1701, "step": 7366 }, { "epoch": 2.742772044537004, "grad_norm": 0.16256341338157654, "learning_rate": 1.7152658446158926e-05, "loss": 1.175, "step": 7367 }, { "epoch": 2.7431443496876127, "grad_norm": 0.16586780548095703, "learning_rate": 1.7151809066538746e-05, "loss": 1.1882, "step": 7368 }, { "epoch": 2.743516654838222, "grad_norm": 0.1687692552804947, "learning_rate": 1.7150959581286802e-05, "loss": 1.1726, "step": 7369 }, { "epoch": 2.7438889599888308, "grad_norm": 0.1655583381652832, "learning_rate": 1.715010999041563e-05, "loss": 1.1742, "step": 7370 }, { "epoch": 2.74426126513944, "grad_norm": 0.1582690328359604, "learning_rate": 1.7149260293937782e-05, "loss": 1.1841, "step": 7371 }, { "epoch": 2.744633570290049, "grad_norm": 0.1673700511455536, "learning_rate": 1.714841049186581e-05, "loss": 1.1825, "step": 7372 }, { "epoch": 2.745005875440658, "grad_norm": 0.1654442995786667, "learning_rate": 1.7147560584212263e-05, "loss": 1.1678, "step": 7373 }, { "epoch": 2.745378180591267, "grad_norm": 0.16653363406658173, "learning_rate": 1.7146710570989698e-05, "loss": 1.1749, "step": 7374 }, { "epoch": 2.745750485741876, "grad_norm": 0.15975458920001984, "learning_rate": 1.7145860452210662e-05, "loss": 1.18, "step": 7375 }, { "epoch": 2.7461227908924855, "grad_norm": 0.16379772126674652, "learning_rate": 1.7145010227887716e-05, "loss": 1.1782, "step": 7376 }, { "epoch": 2.7464950960430943, "grad_norm": 0.16204066574573517, "learning_rate": 1.714415989803342e-05, "loss": 1.1681, "step": 7377 }, { "epoch": 2.7468674011937035, "grad_norm": 0.16692563891410828, "learning_rate": 1.7143309462660326e-05, "loss": 1.172, "step": 7378 }, { "epoch": 2.7472397063443124, "grad_norm": 0.16398832201957703, "learning_rate": 1.7142458921781002e-05, "loss": 1.1752, "step": 7379 }, { "epoch": 2.7476120114949216, "grad_norm": 0.16204705834388733, "learning_rate": 1.714160827540801e-05, "loss": 1.1742, "step": 7380 }, { "epoch": 2.7479843166455304, "grad_norm": 0.16847161948680878, "learning_rate": 1.7140757523553907e-05, "loss": 1.1765, "step": 7381 }, { "epoch": 2.7483566217961397, "grad_norm": 0.16786617040634155, "learning_rate": 1.713990666623127e-05, "loss": 1.1632, "step": 7382 }, { "epoch": 2.7487289269467485, "grad_norm": 0.15807227790355682, "learning_rate": 1.7139055703452653e-05, "loss": 1.1683, "step": 7383 }, { "epoch": 2.749101232097358, "grad_norm": 0.1643909215927124, "learning_rate": 1.7138204635230637e-05, "loss": 1.1715, "step": 7384 }, { "epoch": 2.749473537247967, "grad_norm": 0.15912140905857086, "learning_rate": 1.7137353461577785e-05, "loss": 1.179, "step": 7385 }, { "epoch": 2.749845842398576, "grad_norm": 0.16151286661624908, "learning_rate": 1.713650218250667e-05, "loss": 1.1741, "step": 7386 }, { "epoch": 2.750218147549185, "grad_norm": 0.1597239375114441, "learning_rate": 1.7135650798029864e-05, "loss": 1.16, "step": 7387 }, { "epoch": 2.750590452699794, "grad_norm": 0.16162365674972534, "learning_rate": 1.7134799308159946e-05, "loss": 1.1671, "step": 7388 }, { "epoch": 2.7509627578504032, "grad_norm": 0.16579143702983856, "learning_rate": 1.7133947712909485e-05, "loss": 1.1645, "step": 7389 }, { "epoch": 2.751335063001012, "grad_norm": 0.1629938781261444, "learning_rate": 1.7133096012291067e-05, "loss": 1.1764, "step": 7390 }, { "epoch": 2.7517073681516213, "grad_norm": 0.15928851068019867, "learning_rate": 1.7132244206317272e-05, "loss": 1.1714, "step": 7391 }, { "epoch": 2.75207967330223, "grad_norm": 0.16716843843460083, "learning_rate": 1.7131392295000676e-05, "loss": 1.1764, "step": 7392 }, { "epoch": 2.7524519784528394, "grad_norm": 0.16544127464294434, "learning_rate": 1.713054027835386e-05, "loss": 1.1731, "step": 7393 }, { "epoch": 2.7528242836034487, "grad_norm": 0.1620912104845047, "learning_rate": 1.7129688156389414e-05, "loss": 1.1723, "step": 7394 }, { "epoch": 2.7531965887540575, "grad_norm": 0.1651720106601715, "learning_rate": 1.7128835929119923e-05, "loss": 1.1602, "step": 7395 }, { "epoch": 2.7535688939046667, "grad_norm": 0.16467827558517456, "learning_rate": 1.712798359655797e-05, "loss": 1.1764, "step": 7396 }, { "epoch": 2.7539411990552756, "grad_norm": 0.15525712072849274, "learning_rate": 1.7127131158716145e-05, "loss": 1.1681, "step": 7397 }, { "epoch": 2.754313504205885, "grad_norm": 0.1612330675125122, "learning_rate": 1.7126278615607045e-05, "loss": 1.167, "step": 7398 }, { "epoch": 2.7546858093564937, "grad_norm": 0.16370365023612976, "learning_rate": 1.7125425967243252e-05, "loss": 1.1789, "step": 7399 }, { "epoch": 2.755058114507103, "grad_norm": 0.16268078982830048, "learning_rate": 1.7124573213637367e-05, "loss": 1.173, "step": 7400 }, { "epoch": 2.7554304196577117, "grad_norm": 0.16411568224430084, "learning_rate": 1.7123720354801984e-05, "loss": 1.1595, "step": 7401 }, { "epoch": 2.755802724808321, "grad_norm": 0.16271185874938965, "learning_rate": 1.7122867390749697e-05, "loss": 1.1676, "step": 7402 }, { "epoch": 2.7561750299589303, "grad_norm": 0.16013269126415253, "learning_rate": 1.7122014321493105e-05, "loss": 1.1689, "step": 7403 }, { "epoch": 2.756547335109539, "grad_norm": 0.168240487575531, "learning_rate": 1.7121161147044813e-05, "loss": 1.1712, "step": 7404 }, { "epoch": 2.7569196402601484, "grad_norm": 0.16075722873210907, "learning_rate": 1.7120307867417414e-05, "loss": 1.1624, "step": 7405 }, { "epoch": 2.757291945410757, "grad_norm": 0.16558417677879333, "learning_rate": 1.7119454482623515e-05, "loss": 1.1772, "step": 7406 }, { "epoch": 2.7576642505613664, "grad_norm": 0.1621825397014618, "learning_rate": 1.7118600992675718e-05, "loss": 1.1814, "step": 7407 }, { "epoch": 2.7580365557119757, "grad_norm": 0.16692297160625458, "learning_rate": 1.7117747397586636e-05, "loss": 1.1804, "step": 7408 }, { "epoch": 2.7584088608625845, "grad_norm": 0.15905342996120453, "learning_rate": 1.7116893697368866e-05, "loss": 1.171, "step": 7409 }, { "epoch": 2.7587811660131933, "grad_norm": 0.16342061758041382, "learning_rate": 1.7116039892035025e-05, "loss": 1.1518, "step": 7410 }, { "epoch": 2.7591534711638026, "grad_norm": 0.161211296916008, "learning_rate": 1.7115185981597725e-05, "loss": 1.1736, "step": 7411 }, { "epoch": 2.759525776314412, "grad_norm": 0.16845087707042694, "learning_rate": 1.7114331966069572e-05, "loss": 1.1743, "step": 7412 }, { "epoch": 2.7598980814650207, "grad_norm": 0.16681751608848572, "learning_rate": 1.7113477845463177e-05, "loss": 1.1663, "step": 7413 }, { "epoch": 2.76027038661563, "grad_norm": 0.16370663046836853, "learning_rate": 1.7112623619791167e-05, "loss": 1.1643, "step": 7414 }, { "epoch": 2.7606426917662388, "grad_norm": 0.16601529717445374, "learning_rate": 1.711176928906615e-05, "loss": 1.1716, "step": 7415 }, { "epoch": 2.761014996916848, "grad_norm": 0.1686096489429474, "learning_rate": 1.7110914853300748e-05, "loss": 1.1626, "step": 7416 }, { "epoch": 2.7613873020674573, "grad_norm": 0.1649993360042572, "learning_rate": 1.711006031250758e-05, "loss": 1.1706, "step": 7417 }, { "epoch": 2.761759607218066, "grad_norm": 0.16518589854240417, "learning_rate": 1.710920566669927e-05, "loss": 1.1622, "step": 7418 }, { "epoch": 2.762131912368675, "grad_norm": 0.1586756408214569, "learning_rate": 1.7108350915888432e-05, "loss": 1.1615, "step": 7419 }, { "epoch": 2.762504217519284, "grad_norm": 0.16265276074409485, "learning_rate": 1.71074960600877e-05, "loss": 1.1762, "step": 7420 }, { "epoch": 2.7628765226698935, "grad_norm": 0.16679120063781738, "learning_rate": 1.71066410993097e-05, "loss": 1.1838, "step": 7421 }, { "epoch": 2.7632488278205023, "grad_norm": 0.16883881390094757, "learning_rate": 1.7105786033567055e-05, "loss": 1.1721, "step": 7422 }, { "epoch": 2.7636211329711116, "grad_norm": 0.15802286565303802, "learning_rate": 1.7104930862872394e-05, "loss": 1.1737, "step": 7423 }, { "epoch": 2.7639934381217204, "grad_norm": 0.1627608686685562, "learning_rate": 1.7104075587238353e-05, "loss": 1.1678, "step": 7424 }, { "epoch": 2.7643657432723296, "grad_norm": 0.1643580198287964, "learning_rate": 1.710322020667756e-05, "loss": 1.1746, "step": 7425 }, { "epoch": 2.764738048422939, "grad_norm": 0.16312402486801147, "learning_rate": 1.7102364721202655e-05, "loss": 1.1812, "step": 7426 }, { "epoch": 2.7651103535735477, "grad_norm": 0.15832361578941345, "learning_rate": 1.710150913082626e-05, "loss": 1.1886, "step": 7427 }, { "epoch": 2.7654826587241566, "grad_norm": 0.16072019934654236, "learning_rate": 1.7100653435561027e-05, "loss": 1.1688, "step": 7428 }, { "epoch": 2.765854963874766, "grad_norm": 0.16109511256217957, "learning_rate": 1.7099797635419587e-05, "loss": 1.184, "step": 7429 }, { "epoch": 2.766227269025375, "grad_norm": 0.16431210935115814, "learning_rate": 1.709894173041458e-05, "loss": 1.1763, "step": 7430 }, { "epoch": 2.766599574175984, "grad_norm": 0.16304095089435577, "learning_rate": 1.7098085720558653e-05, "loss": 1.1671, "step": 7431 }, { "epoch": 2.766971879326593, "grad_norm": 0.16046518087387085, "learning_rate": 1.709722960586444e-05, "loss": 1.1659, "step": 7432 }, { "epoch": 2.767344184477202, "grad_norm": 0.1624547243118286, "learning_rate": 1.7096373386344596e-05, "loss": 1.1645, "step": 7433 }, { "epoch": 2.7677164896278112, "grad_norm": 0.1661730259656906, "learning_rate": 1.709551706201176e-05, "loss": 1.171, "step": 7434 }, { "epoch": 2.7680887947784205, "grad_norm": 0.15459467470645905, "learning_rate": 1.709466063287858e-05, "loss": 1.1533, "step": 7435 }, { "epoch": 2.7684610999290293, "grad_norm": 0.16066570580005646, "learning_rate": 1.709380409895771e-05, "loss": 1.1714, "step": 7436 }, { "epoch": 2.768833405079638, "grad_norm": 0.1591673642396927, "learning_rate": 1.7092947460261802e-05, "loss": 1.1589, "step": 7437 }, { "epoch": 2.7692057102302474, "grad_norm": 0.1647397130727768, "learning_rate": 1.7092090716803503e-05, "loss": 1.1783, "step": 7438 }, { "epoch": 2.7695780153808567, "grad_norm": 0.16545167565345764, "learning_rate": 1.7091233868595465e-05, "loss": 1.1808, "step": 7439 }, { "epoch": 2.7699503205314655, "grad_norm": 0.16354137659072876, "learning_rate": 1.7090376915650354e-05, "loss": 1.1713, "step": 7440 }, { "epoch": 2.7703226256820748, "grad_norm": 0.1619860678911209, "learning_rate": 1.708951985798082e-05, "loss": 1.1803, "step": 7441 }, { "epoch": 2.7706949308326836, "grad_norm": 0.1611848771572113, "learning_rate": 1.7088662695599517e-05, "loss": 1.1658, "step": 7442 }, { "epoch": 2.771067235983293, "grad_norm": 0.1581423580646515, "learning_rate": 1.7087805428519114e-05, "loss": 1.1639, "step": 7443 }, { "epoch": 2.771439541133902, "grad_norm": 0.1612246185541153, "learning_rate": 1.708694805675227e-05, "loss": 1.1588, "step": 7444 }, { "epoch": 2.771811846284511, "grad_norm": 0.16077731549739838, "learning_rate": 1.708609058031165e-05, "loss": 1.1788, "step": 7445 }, { "epoch": 2.7721841514351198, "grad_norm": 0.16849185526371002, "learning_rate": 1.7085232999209915e-05, "loss": 1.1797, "step": 7446 }, { "epoch": 2.772556456585729, "grad_norm": 0.16407588124275208, "learning_rate": 1.7084375313459735e-05, "loss": 1.1638, "step": 7447 }, { "epoch": 2.7729287617363383, "grad_norm": 0.1661745011806488, "learning_rate": 1.7083517523073775e-05, "loss": 1.1695, "step": 7448 }, { "epoch": 2.773301066886947, "grad_norm": 0.1609290987253189, "learning_rate": 1.7082659628064704e-05, "loss": 1.1596, "step": 7449 }, { "epoch": 2.7736733720375564, "grad_norm": 0.15832340717315674, "learning_rate": 1.7081801628445195e-05, "loss": 1.1536, "step": 7450 }, { "epoch": 2.774045677188165, "grad_norm": 0.16378222405910492, "learning_rate": 1.708094352422792e-05, "loss": 1.1704, "step": 7451 }, { "epoch": 2.7744179823387745, "grad_norm": 0.16052454710006714, "learning_rate": 1.7080085315425557e-05, "loss": 1.1643, "step": 7452 }, { "epoch": 2.7747902874893837, "grad_norm": 0.16648857295513153, "learning_rate": 1.7079227002050776e-05, "loss": 1.1766, "step": 7453 }, { "epoch": 2.7751625926399925, "grad_norm": 0.16049505770206451, "learning_rate": 1.7078368584116256e-05, "loss": 1.172, "step": 7454 }, { "epoch": 2.7755348977906014, "grad_norm": 0.1630232185125351, "learning_rate": 1.7077510061634675e-05, "loss": 1.1696, "step": 7455 }, { "epoch": 2.7759072029412106, "grad_norm": 0.15958517789840698, "learning_rate": 1.707665143461872e-05, "loss": 1.1703, "step": 7456 }, { "epoch": 2.77627950809182, "grad_norm": 0.16538631916046143, "learning_rate": 1.707579270308106e-05, "loss": 1.1695, "step": 7457 }, { "epoch": 2.7766518132424287, "grad_norm": 0.16825279593467712, "learning_rate": 1.7074933867034392e-05, "loss": 1.1668, "step": 7458 }, { "epoch": 2.777024118393038, "grad_norm": 0.15979918837547302, "learning_rate": 1.7074074926491392e-05, "loss": 1.1692, "step": 7459 }, { "epoch": 2.777396423543647, "grad_norm": 0.1648228019475937, "learning_rate": 1.707321588146475e-05, "loss": 1.1818, "step": 7460 }, { "epoch": 2.777768728694256, "grad_norm": 0.16858553886413574, "learning_rate": 1.7072356731967152e-05, "loss": 1.1717, "step": 7461 }, { "epoch": 2.7781410338448653, "grad_norm": 0.16324035823345184, "learning_rate": 1.707149747801129e-05, "loss": 1.1706, "step": 7462 }, { "epoch": 2.778513338995474, "grad_norm": 0.16237075626850128, "learning_rate": 1.7070638119609854e-05, "loss": 1.1689, "step": 7463 }, { "epoch": 2.778885644146083, "grad_norm": 0.1649467796087265, "learning_rate": 1.706977865677554e-05, "loss": 1.1741, "step": 7464 }, { "epoch": 2.7792579492966922, "grad_norm": 0.16637516021728516, "learning_rate": 1.7068919089521032e-05, "loss": 1.1881, "step": 7465 }, { "epoch": 2.7796302544473015, "grad_norm": 0.1721596121788025, "learning_rate": 1.7068059417859037e-05, "loss": 1.1669, "step": 7466 }, { "epoch": 2.7800025595979103, "grad_norm": 0.16336818039417267, "learning_rate": 1.7067199641802247e-05, "loss": 1.1616, "step": 7467 }, { "epoch": 2.7803748647485196, "grad_norm": 0.1582047939300537, "learning_rate": 1.7066339761363364e-05, "loss": 1.1887, "step": 7468 }, { "epoch": 2.7807471698991284, "grad_norm": 0.16283096373081207, "learning_rate": 1.7065479776555083e-05, "loss": 1.1633, "step": 7469 }, { "epoch": 2.7811194750497377, "grad_norm": 0.16766688227653503, "learning_rate": 1.7064619687390108e-05, "loss": 1.1501, "step": 7470 }, { "epoch": 2.781491780200347, "grad_norm": 0.16073793172836304, "learning_rate": 1.706375949388115e-05, "loss": 1.166, "step": 7471 }, { "epoch": 2.7818640853509558, "grad_norm": 0.1599031388759613, "learning_rate": 1.7062899196040903e-05, "loss": 1.1751, "step": 7472 }, { "epoch": 2.7822363905015646, "grad_norm": 0.16344262659549713, "learning_rate": 1.7062038793882078e-05, "loss": 1.1671, "step": 7473 }, { "epoch": 2.782608695652174, "grad_norm": 0.16053412854671478, "learning_rate": 1.7061178287417383e-05, "loss": 1.1696, "step": 7474 }, { "epoch": 2.782981000802783, "grad_norm": 0.16248364746570587, "learning_rate": 1.706031767665953e-05, "loss": 1.1705, "step": 7475 }, { "epoch": 2.783353305953392, "grad_norm": 0.16297762095928192, "learning_rate": 1.7059456961621226e-05, "loss": 1.1607, "step": 7476 }, { "epoch": 2.783725611104001, "grad_norm": 0.16271525621414185, "learning_rate": 1.7058596142315185e-05, "loss": 1.1568, "step": 7477 }, { "epoch": 2.78409791625461, "grad_norm": 0.1527474820613861, "learning_rate": 1.7057735218754126e-05, "loss": 1.1806, "step": 7478 }, { "epoch": 2.7844702214052193, "grad_norm": 0.15945225954055786, "learning_rate": 1.705687419095076e-05, "loss": 1.1747, "step": 7479 }, { "epoch": 2.7848425265558285, "grad_norm": 0.16087423264980316, "learning_rate": 1.7056013058917802e-05, "loss": 1.1723, "step": 7480 }, { "epoch": 2.7852148317064374, "grad_norm": 0.16305401921272278, "learning_rate": 1.7055151822667975e-05, "loss": 1.1841, "step": 7481 }, { "epoch": 2.7855871368570466, "grad_norm": 0.16334961354732513, "learning_rate": 1.7054290482213996e-05, "loss": 1.1602, "step": 7482 }, { "epoch": 2.7859594420076554, "grad_norm": 0.16361406445503235, "learning_rate": 1.7053429037568596e-05, "loss": 1.1759, "step": 7483 }, { "epoch": 2.7863317471582647, "grad_norm": 0.1611858606338501, "learning_rate": 1.7052567488744485e-05, "loss": 1.1622, "step": 7484 }, { "epoch": 2.7867040523088735, "grad_norm": 0.1641787737607956, "learning_rate": 1.7051705835754394e-05, "loss": 1.1794, "step": 7485 }, { "epoch": 2.787076357459483, "grad_norm": 0.15844056010246277, "learning_rate": 1.7050844078611058e-05, "loss": 1.1711, "step": 7486 }, { "epoch": 2.7874486626100916, "grad_norm": 0.15609781444072723, "learning_rate": 1.7049982217327192e-05, "loss": 1.1585, "step": 7487 }, { "epoch": 2.787820967760701, "grad_norm": 0.16171182692050934, "learning_rate": 1.704912025191553e-05, "loss": 1.1798, "step": 7488 }, { "epoch": 2.78819327291131, "grad_norm": 0.16126160323619843, "learning_rate": 1.704825818238881e-05, "loss": 1.158, "step": 7489 }, { "epoch": 2.788565578061919, "grad_norm": 0.16148677468299866, "learning_rate": 1.7047396008759755e-05, "loss": 1.168, "step": 7490 }, { "epoch": 2.7889378832125282, "grad_norm": 0.1633322536945343, "learning_rate": 1.7046533731041103e-05, "loss": 1.1784, "step": 7491 }, { "epoch": 2.789310188363137, "grad_norm": 0.16195450723171234, "learning_rate": 1.7045671349245588e-05, "loss": 1.1889, "step": 7492 }, { "epoch": 2.7896824935137463, "grad_norm": 0.16481108963489532, "learning_rate": 1.7044808863385953e-05, "loss": 1.1693, "step": 7493 }, { "epoch": 2.790054798664355, "grad_norm": 0.16438551247119904, "learning_rate": 1.7043946273474935e-05, "loss": 1.1582, "step": 7494 }, { "epoch": 2.7904271038149644, "grad_norm": 0.16081902384757996, "learning_rate": 1.704308357952527e-05, "loss": 1.1666, "step": 7495 }, { "epoch": 2.790799408965573, "grad_norm": 0.16997969150543213, "learning_rate": 1.7042220781549703e-05, "loss": 1.1741, "step": 7496 }, { "epoch": 2.7911717141161825, "grad_norm": 0.16140542924404144, "learning_rate": 1.7041357879560972e-05, "loss": 1.171, "step": 7497 }, { "epoch": 2.7915440192667917, "grad_norm": 0.1633981466293335, "learning_rate": 1.7040494873571832e-05, "loss": 1.1596, "step": 7498 }, { "epoch": 2.7919163244174006, "grad_norm": 0.17155370116233826, "learning_rate": 1.7039631763595025e-05, "loss": 1.1825, "step": 7499 }, { "epoch": 2.79228862956801, "grad_norm": 0.17078445851802826, "learning_rate": 1.7038768549643297e-05, "loss": 1.175, "step": 7500 }, { "epoch": 2.79228862956801, "eval_loss": 1.2953369617462158, "eval_runtime": 16.7431, "eval_samples_per_second": 103.565, "eval_steps_per_second": 5.196, "step": 7500 }, { "epoch": 2.7926609347186186, "grad_norm": 0.16718538105487823, "learning_rate": 1.7037905231729402e-05, "loss": 1.1802, "step": 7501 }, { "epoch": 2.793033239869228, "grad_norm": 0.16334514319896698, "learning_rate": 1.7037041809866085e-05, "loss": 1.1619, "step": 7502 }, { "epoch": 2.7934055450198367, "grad_norm": 0.1652374416589737, "learning_rate": 1.7036178284066103e-05, "loss": 1.1584, "step": 7503 }, { "epoch": 2.793777850170446, "grad_norm": 0.17066913843154907, "learning_rate": 1.703531465434221e-05, "loss": 1.1792, "step": 7504 }, { "epoch": 2.794150155321055, "grad_norm": 0.1763291358947754, "learning_rate": 1.7034450920707162e-05, "loss": 1.1846, "step": 7505 }, { "epoch": 2.794522460471664, "grad_norm": 0.16274365782737732, "learning_rate": 1.7033587083173713e-05, "loss": 1.1639, "step": 7506 }, { "epoch": 2.7948947656222733, "grad_norm": 0.16275723278522491, "learning_rate": 1.7032723141754626e-05, "loss": 1.1731, "step": 7507 }, { "epoch": 2.795267070772882, "grad_norm": 0.16381405293941498, "learning_rate": 1.703185909646266e-05, "loss": 1.1696, "step": 7508 }, { "epoch": 2.7956393759234914, "grad_norm": 0.1678345650434494, "learning_rate": 1.7030994947310576e-05, "loss": 1.1755, "step": 7509 }, { "epoch": 2.7960116810741003, "grad_norm": 0.16982942819595337, "learning_rate": 1.703013069431114e-05, "loss": 1.1687, "step": 7510 }, { "epoch": 2.7963839862247095, "grad_norm": 0.16046252846717834, "learning_rate": 1.7029266337477106e-05, "loss": 1.1686, "step": 7511 }, { "epoch": 2.7967562913753183, "grad_norm": 0.16178347170352936, "learning_rate": 1.7028401876821257e-05, "loss": 1.1689, "step": 7512 }, { "epoch": 2.7971285965259276, "grad_norm": 0.16835175454616547, "learning_rate": 1.7027537312356353e-05, "loss": 1.1482, "step": 7513 }, { "epoch": 2.7975009016765364, "grad_norm": 0.16404716670513153, "learning_rate": 1.702667264409516e-05, "loss": 1.157, "step": 7514 }, { "epoch": 2.7978732068271457, "grad_norm": 0.15957726538181305, "learning_rate": 1.7025807872050456e-05, "loss": 1.1719, "step": 7515 }, { "epoch": 2.798245511977755, "grad_norm": 0.16722844541072845, "learning_rate": 1.702494299623501e-05, "loss": 1.174, "step": 7516 }, { "epoch": 2.7986178171283638, "grad_norm": 0.16469328105449677, "learning_rate": 1.7024078016661597e-05, "loss": 1.1758, "step": 7517 }, { "epoch": 2.798990122278973, "grad_norm": 0.16519738733768463, "learning_rate": 1.7023212933342995e-05, "loss": 1.1714, "step": 7518 }, { "epoch": 2.799362427429582, "grad_norm": 0.16509708762168884, "learning_rate": 1.7022347746291975e-05, "loss": 1.1692, "step": 7519 }, { "epoch": 2.799734732580191, "grad_norm": 0.15529975295066833, "learning_rate": 1.7021482455521323e-05, "loss": 1.1697, "step": 7520 }, { "epoch": 2.8001070377308, "grad_norm": 0.16663393378257751, "learning_rate": 1.7020617061043815e-05, "loss": 1.1656, "step": 7521 }, { "epoch": 2.800479342881409, "grad_norm": 0.16711021959781647, "learning_rate": 1.701975156287223e-05, "loss": 1.1635, "step": 7522 }, { "epoch": 2.800851648032018, "grad_norm": 0.1636432558298111, "learning_rate": 1.7018885961019356e-05, "loss": 1.1626, "step": 7523 }, { "epoch": 2.8012239531826273, "grad_norm": 0.16135211288928986, "learning_rate": 1.701802025549798e-05, "loss": 1.1764, "step": 7524 }, { "epoch": 2.8015962583332366, "grad_norm": 0.161915123462677, "learning_rate": 1.7017154446320882e-05, "loss": 1.1893, "step": 7525 }, { "epoch": 2.8019685634838454, "grad_norm": 0.1613880842924118, "learning_rate": 1.7016288533500855e-05, "loss": 1.1716, "step": 7526 }, { "epoch": 2.8023408686344546, "grad_norm": 0.1624586284160614, "learning_rate": 1.7015422517050686e-05, "loss": 1.1683, "step": 7527 }, { "epoch": 2.8027131737850635, "grad_norm": 0.1543359011411667, "learning_rate": 1.7014556396983168e-05, "loss": 1.1611, "step": 7528 }, { "epoch": 2.8030854789356727, "grad_norm": 0.16166086494922638, "learning_rate": 1.701369017331109e-05, "loss": 1.1609, "step": 7529 }, { "epoch": 2.803457784086282, "grad_norm": 0.1620410680770874, "learning_rate": 1.7012823846047252e-05, "loss": 1.1608, "step": 7530 }, { "epoch": 2.803830089236891, "grad_norm": 0.168722003698349, "learning_rate": 1.7011957415204443e-05, "loss": 1.1798, "step": 7531 }, { "epoch": 2.8042023943874996, "grad_norm": 0.16052375733852386, "learning_rate": 1.7011090880795463e-05, "loss": 1.1571, "step": 7532 }, { "epoch": 2.804574699538109, "grad_norm": 0.1640620082616806, "learning_rate": 1.701022424283311e-05, "loss": 1.1875, "step": 7533 }, { "epoch": 2.804947004688718, "grad_norm": 0.16312499344348907, "learning_rate": 1.7009357501330188e-05, "loss": 1.1748, "step": 7534 }, { "epoch": 2.805319309839327, "grad_norm": 0.1676671802997589, "learning_rate": 1.7008490656299492e-05, "loss": 1.1347, "step": 7535 }, { "epoch": 2.8056916149899362, "grad_norm": 0.15957991778850555, "learning_rate": 1.700762370775383e-05, "loss": 1.1705, "step": 7536 }, { "epoch": 2.806063920140545, "grad_norm": 0.1587969958782196, "learning_rate": 1.700675665570601e-05, "loss": 1.1683, "step": 7537 }, { "epoch": 2.8064362252911543, "grad_norm": 0.15972483158111572, "learning_rate": 1.7005889500168828e-05, "loss": 1.172, "step": 7538 }, { "epoch": 2.8068085304417636, "grad_norm": 0.16055303812026978, "learning_rate": 1.70050222411551e-05, "loss": 1.1702, "step": 7539 }, { "epoch": 2.8071808355923724, "grad_norm": 0.16454675793647766, "learning_rate": 1.7004154878677634e-05, "loss": 1.173, "step": 7540 }, { "epoch": 2.8075531407429812, "grad_norm": 0.16008242964744568, "learning_rate": 1.7003287412749236e-05, "loss": 1.1688, "step": 7541 }, { "epoch": 2.8079254458935905, "grad_norm": 0.16654878854751587, "learning_rate": 1.7002419843382724e-05, "loss": 1.1581, "step": 7542 }, { "epoch": 2.8082977510441998, "grad_norm": 0.15987439453601837, "learning_rate": 1.7001552170590913e-05, "loss": 1.1766, "step": 7543 }, { "epoch": 2.8086700561948086, "grad_norm": 0.15910540521144867, "learning_rate": 1.7000684394386615e-05, "loss": 1.1669, "step": 7544 }, { "epoch": 2.809042361345418, "grad_norm": 0.16279643774032593, "learning_rate": 1.6999816514782647e-05, "loss": 1.1597, "step": 7545 }, { "epoch": 2.8094146664960267, "grad_norm": 0.16432224214076996, "learning_rate": 1.699894853179183e-05, "loss": 1.1671, "step": 7546 }, { "epoch": 2.809786971646636, "grad_norm": 0.16405612230300903, "learning_rate": 1.699808044542698e-05, "loss": 1.1825, "step": 7547 }, { "epoch": 2.810159276797245, "grad_norm": 0.16156992316246033, "learning_rate": 1.6997212255700924e-05, "loss": 1.1532, "step": 7548 }, { "epoch": 2.810531581947854, "grad_norm": 0.1572588086128235, "learning_rate": 1.699634396262648e-05, "loss": 1.1612, "step": 7549 }, { "epoch": 2.810903887098463, "grad_norm": 0.16491782665252686, "learning_rate": 1.6995475566216475e-05, "loss": 1.1627, "step": 7550 }, { "epoch": 2.811276192249072, "grad_norm": 0.1590576469898224, "learning_rate": 1.6994607066483735e-05, "loss": 1.159, "step": 7551 }, { "epoch": 2.8116484973996814, "grad_norm": 0.16059501469135284, "learning_rate": 1.6993738463441087e-05, "loss": 1.1792, "step": 7552 }, { "epoch": 2.81202080255029, "grad_norm": 0.16398665308952332, "learning_rate": 1.6992869757101362e-05, "loss": 1.1696, "step": 7553 }, { "epoch": 2.8123931077008995, "grad_norm": 0.16270703077316284, "learning_rate": 1.6992000947477386e-05, "loss": 1.1745, "step": 7554 }, { "epoch": 2.8127654128515083, "grad_norm": 0.1659878045320511, "learning_rate": 1.6991132034582e-05, "loss": 1.1604, "step": 7555 }, { "epoch": 2.8131377180021175, "grad_norm": 0.16921178996562958, "learning_rate": 1.699026301842803e-05, "loss": 1.1652, "step": 7556 }, { "epoch": 2.813510023152727, "grad_norm": 0.16774874925613403, "learning_rate": 1.6989393899028313e-05, "loss": 1.1847, "step": 7557 }, { "epoch": 2.8138823283033356, "grad_norm": 0.15916696190834045, "learning_rate": 1.698852467639569e-05, "loss": 1.1682, "step": 7558 }, { "epoch": 2.8142546334539444, "grad_norm": 0.1583160012960434, "learning_rate": 1.6987655350542993e-05, "loss": 1.1811, "step": 7559 }, { "epoch": 2.8146269386045537, "grad_norm": 0.16362488269805908, "learning_rate": 1.6986785921483068e-05, "loss": 1.1675, "step": 7560 }, { "epoch": 2.814999243755163, "grad_norm": 0.16386942565441132, "learning_rate": 1.6985916389228746e-05, "loss": 1.173, "step": 7561 }, { "epoch": 2.815371548905772, "grad_norm": 0.16522067785263062, "learning_rate": 1.6985046753792885e-05, "loss": 1.1626, "step": 7562 }, { "epoch": 2.815743854056381, "grad_norm": 0.15882590413093567, "learning_rate": 1.698417701518832e-05, "loss": 1.1694, "step": 7563 }, { "epoch": 2.81611615920699, "grad_norm": 0.1628119945526123, "learning_rate": 1.69833071734279e-05, "loss": 1.1819, "step": 7564 }, { "epoch": 2.816488464357599, "grad_norm": 0.1661859154701233, "learning_rate": 1.6982437228524468e-05, "loss": 1.1754, "step": 7565 }, { "epoch": 2.8168607695082084, "grad_norm": 0.16389884054660797, "learning_rate": 1.698156718049088e-05, "loss": 1.1741, "step": 7566 }, { "epoch": 2.8172330746588172, "grad_norm": 0.16429540514945984, "learning_rate": 1.6980697029339978e-05, "loss": 1.1648, "step": 7567 }, { "epoch": 2.817605379809426, "grad_norm": 0.16381768882274628, "learning_rate": 1.6979826775084624e-05, "loss": 1.1656, "step": 7568 }, { "epoch": 2.8179776849600353, "grad_norm": 0.1680193990468979, "learning_rate": 1.6978956417737663e-05, "loss": 1.167, "step": 7569 }, { "epoch": 2.8183499901106446, "grad_norm": 0.16285154223442078, "learning_rate": 1.6978085957311956e-05, "loss": 1.1723, "step": 7570 }, { "epoch": 2.8187222952612534, "grad_norm": 0.1649884134531021, "learning_rate": 1.6977215393820357e-05, "loss": 1.1937, "step": 7571 }, { "epoch": 2.8190946004118627, "grad_norm": 0.1563154011964798, "learning_rate": 1.6976344727275725e-05, "loss": 1.177, "step": 7572 }, { "epoch": 2.8194669055624715, "grad_norm": 0.1616252064704895, "learning_rate": 1.6975473957690917e-05, "loss": 1.1679, "step": 7573 }, { "epoch": 2.8198392107130807, "grad_norm": 0.16457171738147736, "learning_rate": 1.6974603085078798e-05, "loss": 1.1668, "step": 7574 }, { "epoch": 2.82021151586369, "grad_norm": 0.1589018553495407, "learning_rate": 1.697373210945223e-05, "loss": 1.1653, "step": 7575 }, { "epoch": 2.820583821014299, "grad_norm": 0.16056711971759796, "learning_rate": 1.6972861030824072e-05, "loss": 1.169, "step": 7576 }, { "epoch": 2.8209561261649077, "grad_norm": 0.1676499992609024, "learning_rate": 1.69719898492072e-05, "loss": 1.1786, "step": 7577 }, { "epoch": 2.821328431315517, "grad_norm": 0.16967107355594635, "learning_rate": 1.6971118564614473e-05, "loss": 1.158, "step": 7578 }, { "epoch": 2.821700736466126, "grad_norm": 0.17274503409862518, "learning_rate": 1.697024717705876e-05, "loss": 1.1906, "step": 7579 }, { "epoch": 2.822073041616735, "grad_norm": 0.16663110256195068, "learning_rate": 1.696937568655294e-05, "loss": 1.17, "step": 7580 }, { "epoch": 2.8224453467673443, "grad_norm": 0.16743509471416473, "learning_rate": 1.696850409310987e-05, "loss": 1.1783, "step": 7581 }, { "epoch": 2.822817651917953, "grad_norm": 0.16072916984558105, "learning_rate": 1.6967632396742434e-05, "loss": 1.1684, "step": 7582 }, { "epoch": 2.8231899570685624, "grad_norm": 0.16104640066623688, "learning_rate": 1.696676059746351e-05, "loss": 1.1607, "step": 7583 }, { "epoch": 2.8235622622191716, "grad_norm": 0.1597306877374649, "learning_rate": 1.6965888695285965e-05, "loss": 1.1765, "step": 7584 }, { "epoch": 2.8239345673697804, "grad_norm": 0.16316664218902588, "learning_rate": 1.6965016690222685e-05, "loss": 1.1683, "step": 7585 }, { "epoch": 2.8243068725203893, "grad_norm": 0.1655420958995819, "learning_rate": 1.696414458228654e-05, "loss": 1.1851, "step": 7586 }, { "epoch": 2.8246791776709985, "grad_norm": 0.16307014226913452, "learning_rate": 1.696327237149042e-05, "loss": 1.174, "step": 7587 }, { "epoch": 2.825051482821608, "grad_norm": 0.16215850412845612, "learning_rate": 1.6962400057847202e-05, "loss": 1.1768, "step": 7588 }, { "epoch": 2.8254237879722166, "grad_norm": 0.16659583151340485, "learning_rate": 1.6961527641369774e-05, "loss": 1.1698, "step": 7589 }, { "epoch": 2.825796093122826, "grad_norm": 0.16407622396945953, "learning_rate": 1.6960655122071023e-05, "loss": 1.1619, "step": 7590 }, { "epoch": 2.8261683982734347, "grad_norm": 0.15823231637477875, "learning_rate": 1.6959782499963827e-05, "loss": 1.1612, "step": 7591 }, { "epoch": 2.826540703424044, "grad_norm": 0.16647450625896454, "learning_rate": 1.6958909775061082e-05, "loss": 1.1726, "step": 7592 }, { "epoch": 2.826913008574653, "grad_norm": 0.16879020631313324, "learning_rate": 1.6958036947375676e-05, "loss": 1.1824, "step": 7593 }, { "epoch": 2.827285313725262, "grad_norm": 0.16589350998401642, "learning_rate": 1.69571640169205e-05, "loss": 1.1716, "step": 7594 }, { "epoch": 2.827657618875871, "grad_norm": 0.1656552106142044, "learning_rate": 1.695629098370845e-05, "loss": 1.1689, "step": 7595 }, { "epoch": 2.82802992402648, "grad_norm": 0.16472189128398895, "learning_rate": 1.6955417847752417e-05, "loss": 1.1785, "step": 7596 }, { "epoch": 2.8284022291770894, "grad_norm": 0.16501384973526, "learning_rate": 1.69545446090653e-05, "loss": 1.1715, "step": 7597 }, { "epoch": 2.828774534327698, "grad_norm": 0.16673643887043, "learning_rate": 1.6953671267659996e-05, "loss": 1.184, "step": 7598 }, { "epoch": 2.8291468394783075, "grad_norm": 0.16138608753681183, "learning_rate": 1.6952797823549406e-05, "loss": 1.164, "step": 7599 }, { "epoch": 2.8295191446289163, "grad_norm": 0.15768542885780334, "learning_rate": 1.6951924276746425e-05, "loss": 1.1794, "step": 7600 }, { "epoch": 2.8298914497795256, "grad_norm": 0.1626758873462677, "learning_rate": 1.6951050627263958e-05, "loss": 1.1608, "step": 7601 }, { "epoch": 2.830263754930135, "grad_norm": 0.16393102705478668, "learning_rate": 1.695017687511491e-05, "loss": 1.1826, "step": 7602 }, { "epoch": 2.8306360600807436, "grad_norm": 0.16286106407642365, "learning_rate": 1.6949303020312188e-05, "loss": 1.1822, "step": 7603 }, { "epoch": 2.831008365231353, "grad_norm": 0.16217343509197235, "learning_rate": 1.6948429062868697e-05, "loss": 1.1658, "step": 7604 }, { "epoch": 2.8313806703819617, "grad_norm": 0.17708547413349152, "learning_rate": 1.6947555002797344e-05, "loss": 1.1522, "step": 7605 }, { "epoch": 2.831752975532571, "grad_norm": 0.18956957757472992, "learning_rate": 1.6946680840111035e-05, "loss": 1.17, "step": 7606 }, { "epoch": 2.83212528068318, "grad_norm": 0.1878315806388855, "learning_rate": 1.6945806574822693e-05, "loss": 1.1833, "step": 7607 }, { "epoch": 2.832497585833789, "grad_norm": 0.1767759919166565, "learning_rate": 1.6944932206945218e-05, "loss": 1.1772, "step": 7608 }, { "epoch": 2.832869890984398, "grad_norm": 0.1732354611158371, "learning_rate": 1.6944057736491534e-05, "loss": 1.1649, "step": 7609 }, { "epoch": 2.833242196135007, "grad_norm": 0.16452713310718536, "learning_rate": 1.694318316347455e-05, "loss": 1.1707, "step": 7610 }, { "epoch": 2.8336145012856164, "grad_norm": 0.167094424366951, "learning_rate": 1.6942308487907187e-05, "loss": 1.1749, "step": 7611 }, { "epoch": 2.8339868064362252, "grad_norm": 0.17273786664009094, "learning_rate": 1.694143370980237e-05, "loss": 1.1577, "step": 7612 }, { "epoch": 2.8343591115868345, "grad_norm": 0.16662479937076569, "learning_rate": 1.6940558829173004e-05, "loss": 1.1733, "step": 7613 }, { "epoch": 2.8347314167374433, "grad_norm": 0.16592276096343994, "learning_rate": 1.6939683846032022e-05, "loss": 1.1724, "step": 7614 }, { "epoch": 2.8351037218880526, "grad_norm": 0.16113296151161194, "learning_rate": 1.6938808760392346e-05, "loss": 1.1643, "step": 7615 }, { "epoch": 2.8354760270386614, "grad_norm": 0.16023372113704681, "learning_rate": 1.69379335722669e-05, "loss": 1.1684, "step": 7616 }, { "epoch": 2.8358483321892707, "grad_norm": 0.16699042916297913, "learning_rate": 1.693705828166861e-05, "loss": 1.1701, "step": 7617 }, { "epoch": 2.8362206373398795, "grad_norm": 0.1670234203338623, "learning_rate": 1.693618288861041e-05, "loss": 1.1809, "step": 7618 }, { "epoch": 2.8365929424904888, "grad_norm": 0.19244331121444702, "learning_rate": 1.6935307393105215e-05, "loss": 1.153, "step": 7619 }, { "epoch": 2.836965247641098, "grad_norm": 0.16699756681919098, "learning_rate": 1.6934431795165972e-05, "loss": 1.1687, "step": 7620 }, { "epoch": 2.837337552791707, "grad_norm": 0.1908612698316574, "learning_rate": 1.6933556094805602e-05, "loss": 1.1881, "step": 7621 }, { "epoch": 2.837709857942316, "grad_norm": 0.17475947737693787, "learning_rate": 1.6932680292037045e-05, "loss": 1.1511, "step": 7622 }, { "epoch": 2.838082163092925, "grad_norm": 0.16836613416671753, "learning_rate": 1.6931804386873232e-05, "loss": 1.1599, "step": 7623 }, { "epoch": 2.838454468243534, "grad_norm": 0.17344890534877777, "learning_rate": 1.693092837932711e-05, "loss": 1.1767, "step": 7624 }, { "epoch": 2.838826773394143, "grad_norm": 0.17366820573806763, "learning_rate": 1.693005226941161e-05, "loss": 1.1815, "step": 7625 }, { "epoch": 2.8391990785447523, "grad_norm": 0.16873829066753387, "learning_rate": 1.692917605713967e-05, "loss": 1.1751, "step": 7626 }, { "epoch": 2.839571383695361, "grad_norm": 0.1660252809524536, "learning_rate": 1.6928299742524236e-05, "loss": 1.1723, "step": 7627 }, { "epoch": 2.8399436888459704, "grad_norm": 0.17038235068321228, "learning_rate": 1.6927423325578248e-05, "loss": 1.1829, "step": 7628 }, { "epoch": 2.8403159939965796, "grad_norm": 0.1674618273973465, "learning_rate": 1.692654680631465e-05, "loss": 1.1787, "step": 7629 }, { "epoch": 2.8406882991471885, "grad_norm": 0.16933341324329376, "learning_rate": 1.692567018474639e-05, "loss": 1.169, "step": 7630 }, { "epoch": 2.8410606042977977, "grad_norm": 0.16508066654205322, "learning_rate": 1.6924793460886424e-05, "loss": 1.1711, "step": 7631 }, { "epoch": 2.8414329094484065, "grad_norm": 0.1625910848379135, "learning_rate": 1.692391663474769e-05, "loss": 1.1557, "step": 7632 }, { "epoch": 2.841805214599016, "grad_norm": 0.16758020222187042, "learning_rate": 1.692303970634314e-05, "loss": 1.1555, "step": 7633 }, { "epoch": 2.8421775197496246, "grad_norm": 0.1603182852268219, "learning_rate": 1.6922162675685725e-05, "loss": 1.1575, "step": 7634 }, { "epoch": 2.842549824900234, "grad_norm": 0.16572555899620056, "learning_rate": 1.6921285542788405e-05, "loss": 1.1687, "step": 7635 }, { "epoch": 2.8429221300508427, "grad_norm": 0.16787642240524292, "learning_rate": 1.6920408307664132e-05, "loss": 1.1629, "step": 7636 }, { "epoch": 2.843294435201452, "grad_norm": 0.16438321769237518, "learning_rate": 1.6919530970325865e-05, "loss": 1.1628, "step": 7637 }, { "epoch": 2.8436667403520612, "grad_norm": 0.16906222701072693, "learning_rate": 1.6918653530786555e-05, "loss": 1.1589, "step": 7638 }, { "epoch": 2.84403904550267, "grad_norm": 0.1649426817893982, "learning_rate": 1.6917775989059167e-05, "loss": 1.1705, "step": 7639 }, { "epoch": 2.8444113506532793, "grad_norm": 0.17663908004760742, "learning_rate": 1.6916898345156668e-05, "loss": 1.1638, "step": 7640 }, { "epoch": 2.844783655803888, "grad_norm": 0.16318029165267944, "learning_rate": 1.6916020599092007e-05, "loss": 1.1592, "step": 7641 }, { "epoch": 2.8451559609544974, "grad_norm": 0.17562070488929749, "learning_rate": 1.691514275087816e-05, "loss": 1.1606, "step": 7642 }, { "epoch": 2.8455282661051067, "grad_norm": 0.1714848279953003, "learning_rate": 1.6914264800528087e-05, "loss": 1.1647, "step": 7643 }, { "epoch": 2.8459005712557155, "grad_norm": 0.1683172881603241, "learning_rate": 1.6913386748054757e-05, "loss": 1.175, "step": 7644 }, { "epoch": 2.8462728764063243, "grad_norm": 0.18591049313545227, "learning_rate": 1.6912508593471137e-05, "loss": 1.1501, "step": 7645 }, { "epoch": 2.8466451815569336, "grad_norm": 0.16155651211738586, "learning_rate": 1.69116303367902e-05, "loss": 1.1764, "step": 7646 }, { "epoch": 2.847017486707543, "grad_norm": 0.1677398383617401, "learning_rate": 1.691075197802492e-05, "loss": 1.1649, "step": 7647 }, { "epoch": 2.8473897918581517, "grad_norm": 0.16577932238578796, "learning_rate": 1.6909873517188263e-05, "loss": 1.1761, "step": 7648 }, { "epoch": 2.847762097008761, "grad_norm": 0.161788210272789, "learning_rate": 1.690899495429321e-05, "loss": 1.1662, "step": 7649 }, { "epoch": 2.8481344021593697, "grad_norm": 0.20268653333187103, "learning_rate": 1.6908116289352735e-05, "loss": 1.1871, "step": 7650 }, { "epoch": 2.848506707309979, "grad_norm": 0.18162259459495544, "learning_rate": 1.6907237522379816e-05, "loss": 1.1549, "step": 7651 }, { "epoch": 2.8488790124605883, "grad_norm": 0.16151395440101624, "learning_rate": 1.690635865338743e-05, "loss": 1.1815, "step": 7652 }, { "epoch": 2.849251317611197, "grad_norm": 0.1718040108680725, "learning_rate": 1.6905479682388565e-05, "loss": 1.1562, "step": 7653 }, { "epoch": 2.849623622761806, "grad_norm": 0.165380597114563, "learning_rate": 1.690460060939619e-05, "loss": 1.1713, "step": 7654 }, { "epoch": 2.849995927912415, "grad_norm": 0.18731936812400818, "learning_rate": 1.6903721434423306e-05, "loss": 1.1732, "step": 7655 }, { "epoch": 2.8503682330630244, "grad_norm": 0.16210754215717316, "learning_rate": 1.6902842157482885e-05, "loss": 1.1712, "step": 7656 }, { "epoch": 2.8507405382136333, "grad_norm": 0.1721230000257492, "learning_rate": 1.690196277858792e-05, "loss": 1.1572, "step": 7657 }, { "epoch": 2.8511128433642425, "grad_norm": 0.16690939664840698, "learning_rate": 1.6901083297751397e-05, "loss": 1.1777, "step": 7658 }, { "epoch": 2.8514851485148514, "grad_norm": 0.16586491465568542, "learning_rate": 1.6900203714986307e-05, "loss": 1.1747, "step": 7659 }, { "epoch": 2.8518574536654606, "grad_norm": 0.17058177292346954, "learning_rate": 1.689932403030564e-05, "loss": 1.1755, "step": 7660 }, { "epoch": 2.85222975881607, "grad_norm": 0.16409049928188324, "learning_rate": 1.6898444243722395e-05, "loss": 1.1629, "step": 7661 }, { "epoch": 2.8526020639666787, "grad_norm": 0.16023750603199005, "learning_rate": 1.6897564355249556e-05, "loss": 1.1642, "step": 7662 }, { "epoch": 2.8529743691172875, "grad_norm": 0.17429199814796448, "learning_rate": 1.6896684364900127e-05, "loss": 1.1746, "step": 7663 }, { "epoch": 2.853346674267897, "grad_norm": 0.1672469824552536, "learning_rate": 1.68958042726871e-05, "loss": 1.1697, "step": 7664 }, { "epoch": 2.853718979418506, "grad_norm": 0.16756999492645264, "learning_rate": 1.689492407862348e-05, "loss": 1.1722, "step": 7665 }, { "epoch": 2.854091284569115, "grad_norm": 0.16315622627735138, "learning_rate": 1.689404378272226e-05, "loss": 1.1669, "step": 7666 }, { "epoch": 2.854463589719724, "grad_norm": 0.16442465782165527, "learning_rate": 1.6893163384996453e-05, "loss": 1.1892, "step": 7667 }, { "epoch": 2.854835894870333, "grad_norm": 0.16709788143634796, "learning_rate": 1.689228288545905e-05, "loss": 1.1729, "step": 7668 }, { "epoch": 2.855208200020942, "grad_norm": 0.17159411311149597, "learning_rate": 1.689140228412306e-05, "loss": 1.1619, "step": 7669 }, { "epoch": 2.8555805051715515, "grad_norm": 0.16300630569458008, "learning_rate": 1.68905215810015e-05, "loss": 1.1589, "step": 7670 }, { "epoch": 2.8559528103221603, "grad_norm": 0.17560981214046478, "learning_rate": 1.6889640776107356e-05, "loss": 1.1749, "step": 7671 }, { "epoch": 2.856325115472769, "grad_norm": 0.16075876355171204, "learning_rate": 1.688875986945366e-05, "loss": 1.1659, "step": 7672 }, { "epoch": 2.8566974206233784, "grad_norm": 0.16523872315883636, "learning_rate": 1.6887878861053407e-05, "loss": 1.175, "step": 7673 }, { "epoch": 2.8570697257739877, "grad_norm": 0.1663205772638321, "learning_rate": 1.688699775091962e-05, "loss": 1.1689, "step": 7674 }, { "epoch": 2.8574420309245965, "grad_norm": 0.17149294912815094, "learning_rate": 1.68861165390653e-05, "loss": 1.1637, "step": 7675 }, { "epoch": 2.8578143360752057, "grad_norm": 0.15809527039527893, "learning_rate": 1.688523522550348e-05, "loss": 1.1621, "step": 7676 }, { "epoch": 2.8581866412258146, "grad_norm": 0.16826729476451874, "learning_rate": 1.6884353810247166e-05, "loss": 1.1671, "step": 7677 }, { "epoch": 2.858558946376424, "grad_norm": 0.1609870195388794, "learning_rate": 1.6883472293309375e-05, "loss": 1.1648, "step": 7678 }, { "epoch": 2.858931251527033, "grad_norm": 0.16899265348911285, "learning_rate": 1.6882590674703135e-05, "loss": 1.1532, "step": 7679 }, { "epoch": 2.859303556677642, "grad_norm": 0.1640852838754654, "learning_rate": 1.6881708954441458e-05, "loss": 1.1584, "step": 7680 }, { "epoch": 2.8596758618282507, "grad_norm": 0.16974326968193054, "learning_rate": 1.6880827132537373e-05, "loss": 1.1733, "step": 7681 }, { "epoch": 2.86004816697886, "grad_norm": 0.16964828968048096, "learning_rate": 1.6879945209003903e-05, "loss": 1.1698, "step": 7682 }, { "epoch": 2.8604204721294693, "grad_norm": 0.17090067267417908, "learning_rate": 1.6879063183854076e-05, "loss": 1.1671, "step": 7683 }, { "epoch": 2.860792777280078, "grad_norm": 0.1564168483018875, "learning_rate": 1.6878181057100915e-05, "loss": 1.1604, "step": 7684 }, { "epoch": 2.8611650824306873, "grad_norm": 0.1685391068458557, "learning_rate": 1.6877298828757452e-05, "loss": 1.1765, "step": 7685 }, { "epoch": 2.861537387581296, "grad_norm": 0.16214828193187714, "learning_rate": 1.6876416498836717e-05, "loss": 1.166, "step": 7686 }, { "epoch": 2.8619096927319054, "grad_norm": 0.16030354797840118, "learning_rate": 1.6875534067351744e-05, "loss": 1.1634, "step": 7687 }, { "epoch": 2.8622819978825147, "grad_norm": 0.17029841244220734, "learning_rate": 1.687465153431556e-05, "loss": 1.1719, "step": 7688 }, { "epoch": 2.8626543030331235, "grad_norm": 0.1636890470981598, "learning_rate": 1.6873768899741212e-05, "loss": 1.1582, "step": 7689 }, { "epoch": 2.8630266081837323, "grad_norm": 0.16757601499557495, "learning_rate": 1.687288616364172e-05, "loss": 1.1557, "step": 7690 }, { "epoch": 2.8633989133343416, "grad_norm": 0.18567247688770294, "learning_rate": 1.6872003326030136e-05, "loss": 1.1699, "step": 7691 }, { "epoch": 2.863771218484951, "grad_norm": 0.1703827977180481, "learning_rate": 1.6871120386919493e-05, "loss": 1.1579, "step": 7692 }, { "epoch": 2.8641435236355597, "grad_norm": 0.16367879509925842, "learning_rate": 1.6870237346322832e-05, "loss": 1.156, "step": 7693 }, { "epoch": 2.864515828786169, "grad_norm": 0.1624721884727478, "learning_rate": 1.6869354204253195e-05, "loss": 1.1659, "step": 7694 }, { "epoch": 2.8648881339367778, "grad_norm": 0.16088762879371643, "learning_rate": 1.686847096072363e-05, "loss": 1.1753, "step": 7695 }, { "epoch": 2.865260439087387, "grad_norm": 0.17531664669513702, "learning_rate": 1.6867587615747184e-05, "loss": 1.183, "step": 7696 }, { "epoch": 2.8656327442379963, "grad_norm": 0.18076026439666748, "learning_rate": 1.6866704169336895e-05, "loss": 1.1681, "step": 7697 }, { "epoch": 2.866005049388605, "grad_norm": 0.20938239991664886, "learning_rate": 1.6865820621505812e-05, "loss": 1.1816, "step": 7698 }, { "epoch": 2.866377354539214, "grad_norm": 0.23691917955875397, "learning_rate": 1.6864936972266996e-05, "loss": 1.1581, "step": 7699 }, { "epoch": 2.866749659689823, "grad_norm": 0.19211941957473755, "learning_rate": 1.686405322163349e-05, "loss": 1.1664, "step": 7700 }, { "epoch": 2.8671219648404325, "grad_norm": 0.17298676073551178, "learning_rate": 1.686316936961835e-05, "loss": 1.1663, "step": 7701 }, { "epoch": 2.8674942699910413, "grad_norm": 0.17014887928962708, "learning_rate": 1.6862285416234628e-05, "loss": 1.162, "step": 7702 }, { "epoch": 2.8678665751416506, "grad_norm": 0.1860615760087967, "learning_rate": 1.686140136149538e-05, "loss": 1.1707, "step": 7703 }, { "epoch": 2.8682388802922594, "grad_norm": 0.1729809045791626, "learning_rate": 1.6860517205413667e-05, "loss": 1.1786, "step": 7704 }, { "epoch": 2.8686111854428686, "grad_norm": 0.16746124625205994, "learning_rate": 1.6859632948002542e-05, "loss": 1.164, "step": 7705 }, { "epoch": 2.868983490593478, "grad_norm": 0.176571786403656, "learning_rate": 1.685874858927507e-05, "loss": 1.152, "step": 7706 }, { "epoch": 2.8693557957440867, "grad_norm": 0.17990010976791382, "learning_rate": 1.6857864129244314e-05, "loss": 1.1736, "step": 7707 }, { "epoch": 2.8697281008946955, "grad_norm": 0.16373847424983978, "learning_rate": 1.6856979567923333e-05, "loss": 1.1595, "step": 7708 }, { "epoch": 2.870100406045305, "grad_norm": 0.16468669474124908, "learning_rate": 1.6856094905325195e-05, "loss": 1.1652, "step": 7709 }, { "epoch": 2.870472711195914, "grad_norm": 0.1700790673494339, "learning_rate": 1.6855210141462964e-05, "loss": 1.1577, "step": 7710 }, { "epoch": 2.870845016346523, "grad_norm": 0.17594699561595917, "learning_rate": 1.685432527634971e-05, "loss": 1.1672, "step": 7711 }, { "epoch": 2.871217321497132, "grad_norm": 0.16046394407749176, "learning_rate": 1.68534403099985e-05, "loss": 1.1777, "step": 7712 }, { "epoch": 2.871589626647741, "grad_norm": 0.1609521210193634, "learning_rate": 1.685255524242241e-05, "loss": 1.173, "step": 7713 }, { "epoch": 2.8719619317983502, "grad_norm": 0.17106173932552338, "learning_rate": 1.6851670073634513e-05, "loss": 1.1632, "step": 7714 }, { "epoch": 2.8723342369489595, "grad_norm": 0.16623558104038239, "learning_rate": 1.685078480364787e-05, "loss": 1.1556, "step": 7715 }, { "epoch": 2.8727065420995683, "grad_norm": 0.1585904061794281, "learning_rate": 1.684989943247557e-05, "loss": 1.1718, "step": 7716 }, { "epoch": 2.8730788472501776, "grad_norm": 0.1625899374485016, "learning_rate": 1.6849013960130687e-05, "loss": 1.1638, "step": 7717 }, { "epoch": 2.8734511524007864, "grad_norm": 0.1680610626935959, "learning_rate": 1.6848128386626297e-05, "loss": 1.175, "step": 7718 }, { "epoch": 2.8738234575513957, "grad_norm": 0.16707110404968262, "learning_rate": 1.6847242711975477e-05, "loss": 1.1724, "step": 7719 }, { "epoch": 2.8741957627020045, "grad_norm": 0.16035796701908112, "learning_rate": 1.684635693619131e-05, "loss": 1.1504, "step": 7720 }, { "epoch": 2.8745680678526138, "grad_norm": 0.1676149070262909, "learning_rate": 1.684547105928689e-05, "loss": 1.1657, "step": 7721 }, { "epoch": 2.8749403730032226, "grad_norm": 0.17349836230278015, "learning_rate": 1.6844585081275285e-05, "loss": 1.1534, "step": 7722 }, { "epoch": 2.875312678153832, "grad_norm": 0.16714976727962494, "learning_rate": 1.684369900216959e-05, "loss": 1.1686, "step": 7723 }, { "epoch": 2.875684983304441, "grad_norm": 0.168970987200737, "learning_rate": 1.684281282198289e-05, "loss": 1.1644, "step": 7724 }, { "epoch": 2.87605728845505, "grad_norm": 0.16321352124214172, "learning_rate": 1.6841926540728276e-05, "loss": 1.1585, "step": 7725 }, { "epoch": 2.876429593605659, "grad_norm": 0.16680431365966797, "learning_rate": 1.684104015841883e-05, "loss": 1.1496, "step": 7726 }, { "epoch": 2.876801898756268, "grad_norm": 0.1670006364583969, "learning_rate": 1.6840153675067658e-05, "loss": 1.1815, "step": 7727 }, { "epoch": 2.8771742039068773, "grad_norm": 0.1647917628288269, "learning_rate": 1.683926709068784e-05, "loss": 1.1795, "step": 7728 }, { "epoch": 2.877546509057486, "grad_norm": 0.16818176209926605, "learning_rate": 1.683838040529248e-05, "loss": 1.181, "step": 7729 }, { "epoch": 2.8779188142080954, "grad_norm": 0.16495051980018616, "learning_rate": 1.6837493618894666e-05, "loss": 1.151, "step": 7730 }, { "epoch": 2.878291119358704, "grad_norm": 0.15997083485126495, "learning_rate": 1.6836606731507506e-05, "loss": 1.1581, "step": 7731 }, { "epoch": 2.8786634245093135, "grad_norm": 0.16573216021060944, "learning_rate": 1.683571974314409e-05, "loss": 1.1754, "step": 7732 }, { "epoch": 2.8790357296599227, "grad_norm": 0.16260533034801483, "learning_rate": 1.683483265381752e-05, "loss": 1.1807, "step": 7733 }, { "epoch": 2.8794080348105315, "grad_norm": 0.16376620531082153, "learning_rate": 1.6833945463540906e-05, "loss": 1.1666, "step": 7734 }, { "epoch": 2.879780339961141, "grad_norm": 0.1692168414592743, "learning_rate": 1.6833058172327344e-05, "loss": 1.1674, "step": 7735 }, { "epoch": 2.8801526451117496, "grad_norm": 0.16318558156490326, "learning_rate": 1.683217078018994e-05, "loss": 1.1756, "step": 7736 }, { "epoch": 2.880524950262359, "grad_norm": 0.1628124862909317, "learning_rate": 1.6831283287141807e-05, "loss": 1.1631, "step": 7737 }, { "epoch": 2.8808972554129677, "grad_norm": 0.1683143526315689, "learning_rate": 1.6830395693196043e-05, "loss": 1.1762, "step": 7738 }, { "epoch": 2.881269560563577, "grad_norm": 0.16499009728431702, "learning_rate": 1.682950799836577e-05, "loss": 1.1689, "step": 7739 }, { "epoch": 2.881641865714186, "grad_norm": 0.16890759766101837, "learning_rate": 1.6828620202664086e-05, "loss": 1.1914, "step": 7740 }, { "epoch": 2.882014170864795, "grad_norm": 0.16511119902133942, "learning_rate": 1.6827732306104113e-05, "loss": 1.1627, "step": 7741 }, { "epoch": 2.8823864760154043, "grad_norm": 0.16963976621627808, "learning_rate": 1.6826844308698962e-05, "loss": 1.1626, "step": 7742 }, { "epoch": 2.882758781166013, "grad_norm": 0.17037193477153778, "learning_rate": 1.682595621046175e-05, "loss": 1.1666, "step": 7743 }, { "epoch": 2.8831310863166224, "grad_norm": 0.16597189009189606, "learning_rate": 1.682506801140559e-05, "loss": 1.1835, "step": 7744 }, { "epoch": 2.8835033914672312, "grad_norm": 0.2021397054195404, "learning_rate": 1.6824179711543607e-05, "loss": 1.1815, "step": 7745 }, { "epoch": 2.8838756966178405, "grad_norm": 0.1731014996767044, "learning_rate": 1.6823291310888916e-05, "loss": 1.1642, "step": 7746 }, { "epoch": 2.8842480017684493, "grad_norm": 0.17293405532836914, "learning_rate": 1.682240280945464e-05, "loss": 1.1538, "step": 7747 }, { "epoch": 2.8846203069190586, "grad_norm": 0.1841578483581543, "learning_rate": 1.6821514207253905e-05, "loss": 1.1458, "step": 7748 }, { "epoch": 2.8849926120696674, "grad_norm": 0.16726046800613403, "learning_rate": 1.6820625504299833e-05, "loss": 1.1729, "step": 7749 }, { "epoch": 2.8853649172202767, "grad_norm": 0.1666422039270401, "learning_rate": 1.6819736700605548e-05, "loss": 1.16, "step": 7750 }, { "epoch": 2.885737222370886, "grad_norm": 0.1761844903230667, "learning_rate": 1.6818847796184185e-05, "loss": 1.1787, "step": 7751 }, { "epoch": 2.8861095275214947, "grad_norm": 0.16153545677661896, "learning_rate": 1.6817958791048866e-05, "loss": 1.1775, "step": 7752 }, { "epoch": 2.886481832672104, "grad_norm": 0.16363756358623505, "learning_rate": 1.6817069685212717e-05, "loss": 1.1761, "step": 7753 }, { "epoch": 2.886854137822713, "grad_norm": 0.176415354013443, "learning_rate": 1.6816180478688885e-05, "loss": 1.1782, "step": 7754 }, { "epoch": 2.887226442973322, "grad_norm": 0.17358264327049255, "learning_rate": 1.681529117149049e-05, "loss": 1.1663, "step": 7755 }, { "epoch": 2.887598748123931, "grad_norm": 0.1688726395368576, "learning_rate": 1.6814401763630674e-05, "loss": 1.1724, "step": 7756 }, { "epoch": 2.88797105327454, "grad_norm": 0.20944684743881226, "learning_rate": 1.6813512255122573e-05, "loss": 1.1587, "step": 7757 }, { "epoch": 2.888343358425149, "grad_norm": 0.20509681105613708, "learning_rate": 1.681262264597932e-05, "loss": 1.1472, "step": 7758 }, { "epoch": 2.8887156635757583, "grad_norm": 0.17619788646697998, "learning_rate": 1.6811732936214063e-05, "loss": 1.1787, "step": 7759 }, { "epoch": 2.8890879687263675, "grad_norm": 0.16343960165977478, "learning_rate": 1.6810843125839934e-05, "loss": 1.1768, "step": 7760 }, { "epoch": 2.8894602738769763, "grad_norm": 0.1722916066646576, "learning_rate": 1.680995321487008e-05, "loss": 1.1698, "step": 7761 }, { "epoch": 2.8898325790275856, "grad_norm": 0.1792430579662323, "learning_rate": 1.6809063203317645e-05, "loss": 1.1665, "step": 7762 }, { "epoch": 2.8902048841781944, "grad_norm": 0.17468638718128204, "learning_rate": 1.6808173091195774e-05, "loss": 1.1726, "step": 7763 }, { "epoch": 2.8905771893288037, "grad_norm": 0.1607234925031662, "learning_rate": 1.6807282878517614e-05, "loss": 1.1714, "step": 7764 }, { "epoch": 2.890949494479413, "grad_norm": 0.15706735849380493, "learning_rate": 1.680639256529631e-05, "loss": 1.1607, "step": 7765 }, { "epoch": 2.891321799630022, "grad_norm": 0.15637610852718353, "learning_rate": 1.6805502151545022e-05, "loss": 1.1658, "step": 7766 }, { "epoch": 2.8916941047806306, "grad_norm": 0.16825786232948303, "learning_rate": 1.6804611637276888e-05, "loss": 1.1706, "step": 7767 }, { "epoch": 2.89206640993124, "grad_norm": 0.18059858679771423, "learning_rate": 1.680372102250507e-05, "loss": 1.1694, "step": 7768 }, { "epoch": 2.892438715081849, "grad_norm": 0.18463246524333954, "learning_rate": 1.6802830307242716e-05, "loss": 1.1592, "step": 7769 }, { "epoch": 2.892811020232458, "grad_norm": 0.16244830191135406, "learning_rate": 1.680193949150299e-05, "loss": 1.1693, "step": 7770 }, { "epoch": 2.893183325383067, "grad_norm": 0.1961405873298645, "learning_rate": 1.680104857529904e-05, "loss": 1.184, "step": 7771 }, { "epoch": 2.893555630533676, "grad_norm": 0.19858300685882568, "learning_rate": 1.6800157558644034e-05, "loss": 1.1683, "step": 7772 }, { "epoch": 2.8939279356842853, "grad_norm": 0.18224839866161346, "learning_rate": 1.6799266441551124e-05, "loss": 1.1785, "step": 7773 }, { "epoch": 2.8943002408348946, "grad_norm": 0.2373683899641037, "learning_rate": 1.679837522403348e-05, "loss": 1.1623, "step": 7774 }, { "epoch": 2.8946725459855034, "grad_norm": 0.16270266473293304, "learning_rate": 1.6797483906104256e-05, "loss": 1.1622, "step": 7775 }, { "epoch": 2.895044851136112, "grad_norm": 0.16937381029129028, "learning_rate": 1.679659248777662e-05, "loss": 1.1632, "step": 7776 }, { "epoch": 2.8954171562867215, "grad_norm": 0.16399821639060974, "learning_rate": 1.6795700969063743e-05, "loss": 1.1614, "step": 7777 }, { "epoch": 2.8957894614373307, "grad_norm": 0.16400204598903656, "learning_rate": 1.679480934997879e-05, "loss": 1.1718, "step": 7778 }, { "epoch": 2.8961617665879396, "grad_norm": 0.16825588047504425, "learning_rate": 1.679391763053493e-05, "loss": 1.1615, "step": 7779 }, { "epoch": 2.896534071738549, "grad_norm": 0.16895346343517303, "learning_rate": 1.679302581074533e-05, "loss": 1.1646, "step": 7780 }, { "epoch": 2.8969063768891576, "grad_norm": 0.16163942217826843, "learning_rate": 1.6792133890623162e-05, "loss": 1.1582, "step": 7781 }, { "epoch": 2.897278682039767, "grad_norm": 0.1691490262746811, "learning_rate": 1.6791241870181607e-05, "loss": 1.1644, "step": 7782 }, { "epoch": 2.897650987190376, "grad_norm": 0.16696293652057648, "learning_rate": 1.6790349749433835e-05, "loss": 1.1835, "step": 7783 }, { "epoch": 2.898023292340985, "grad_norm": 0.16553637385368347, "learning_rate": 1.678945752839302e-05, "loss": 1.175, "step": 7784 }, { "epoch": 2.898395597491594, "grad_norm": 0.1722661852836609, "learning_rate": 1.678856520707235e-05, "loss": 1.1799, "step": 7785 }, { "epoch": 2.898767902642203, "grad_norm": 0.16318213939666748, "learning_rate": 1.6787672785484998e-05, "loss": 1.1511, "step": 7786 }, { "epoch": 2.8991402077928123, "grad_norm": 0.16159263253211975, "learning_rate": 1.678678026364414e-05, "loss": 1.1715, "step": 7787 }, { "epoch": 2.899512512943421, "grad_norm": 0.16238828003406525, "learning_rate": 1.6785887641562967e-05, "loss": 1.1655, "step": 7788 }, { "epoch": 2.8998848180940304, "grad_norm": 0.16337160766124725, "learning_rate": 1.6784994919254652e-05, "loss": 1.1697, "step": 7789 }, { "epoch": 2.9002571232446392, "grad_norm": 0.16441784799098969, "learning_rate": 1.67841020967324e-05, "loss": 1.1774, "step": 7790 }, { "epoch": 2.9006294283952485, "grad_norm": 0.16245967149734497, "learning_rate": 1.6783209174009377e-05, "loss": 1.1751, "step": 7791 }, { "epoch": 2.9010017335458578, "grad_norm": 0.17327164113521576, "learning_rate": 1.678231615109878e-05, "loss": 1.1608, "step": 7792 }, { "epoch": 2.9013740386964666, "grad_norm": 0.16266658902168274, "learning_rate": 1.6781423028013803e-05, "loss": 1.1599, "step": 7793 }, { "epoch": 2.9017463438470754, "grad_norm": 0.17774546146392822, "learning_rate": 1.6780529804767632e-05, "loss": 1.1655, "step": 7794 }, { "epoch": 2.9021186489976847, "grad_norm": 0.18671812117099762, "learning_rate": 1.6779636481373462e-05, "loss": 1.1759, "step": 7795 }, { "epoch": 2.902490954148294, "grad_norm": 0.1647380143404007, "learning_rate": 1.6778743057844487e-05, "loss": 1.1736, "step": 7796 }, { "epoch": 2.9028632592989028, "grad_norm": 0.23281651735305786, "learning_rate": 1.67778495341939e-05, "loss": 1.1769, "step": 7797 }, { "epoch": 2.903235564449512, "grad_norm": 0.1884678155183792, "learning_rate": 1.67769559104349e-05, "loss": 1.1746, "step": 7798 }, { "epoch": 2.903607869600121, "grad_norm": 0.18494202196598053, "learning_rate": 1.677606218658069e-05, "loss": 1.1601, "step": 7799 }, { "epoch": 2.90398017475073, "grad_norm": 0.1650589555501938, "learning_rate": 1.6775168362644465e-05, "loss": 1.1903, "step": 7800 }, { "epoch": 2.9043524799013394, "grad_norm": 0.2014658898115158, "learning_rate": 1.677427443863943e-05, "loss": 1.1632, "step": 7801 }, { "epoch": 2.904724785051948, "grad_norm": 0.16059400141239166, "learning_rate": 1.6773380414578785e-05, "loss": 1.1651, "step": 7802 }, { "epoch": 2.905097090202557, "grad_norm": 0.1685784012079239, "learning_rate": 1.6772486290475737e-05, "loss": 1.1869, "step": 7803 }, { "epoch": 2.9054693953531663, "grad_norm": 0.16995343565940857, "learning_rate": 1.677159206634349e-05, "loss": 1.1793, "step": 7804 }, { "epoch": 2.9058417005037755, "grad_norm": 0.16553527116775513, "learning_rate": 1.6770697742195256e-05, "loss": 1.151, "step": 7805 }, { "epoch": 2.9062140056543844, "grad_norm": 0.16535979509353638, "learning_rate": 1.676980331804424e-05, "loss": 1.1719, "step": 7806 }, { "epoch": 2.9065863108049936, "grad_norm": 0.172554612159729, "learning_rate": 1.6768908793903653e-05, "loss": 1.1577, "step": 7807 }, { "epoch": 2.9069586159556025, "grad_norm": 0.1674085408449173, "learning_rate": 1.6768014169786712e-05, "loss": 1.1716, "step": 7808 }, { "epoch": 2.9073309211062117, "grad_norm": 0.17498281598091125, "learning_rate": 1.676711944570662e-05, "loss": 1.1595, "step": 7809 }, { "epoch": 2.907703226256821, "grad_norm": 0.1705126017332077, "learning_rate": 1.67662246216766e-05, "loss": 1.1731, "step": 7810 }, { "epoch": 2.90807553140743, "grad_norm": 0.1687796264886856, "learning_rate": 1.676532969770987e-05, "loss": 1.18, "step": 7811 }, { "epoch": 2.9084478365580386, "grad_norm": 0.17635595798492432, "learning_rate": 1.6764434673819644e-05, "loss": 1.1541, "step": 7812 }, { "epoch": 2.908820141708648, "grad_norm": 0.17208701372146606, "learning_rate": 1.6763539550019143e-05, "loss": 1.1639, "step": 7813 }, { "epoch": 2.909192446859257, "grad_norm": 0.172771617770195, "learning_rate": 1.6762644326321586e-05, "loss": 1.1729, "step": 7814 }, { "epoch": 2.909564752009866, "grad_norm": 0.16505715250968933, "learning_rate": 1.6761749002740195e-05, "loss": 1.1502, "step": 7815 }, { "epoch": 2.9099370571604752, "grad_norm": 0.17262002825737, "learning_rate": 1.6760853579288196e-05, "loss": 1.1625, "step": 7816 }, { "epoch": 2.910309362311084, "grad_norm": 0.17353248596191406, "learning_rate": 1.675995805597882e-05, "loss": 1.1678, "step": 7817 }, { "epoch": 2.9106816674616933, "grad_norm": 0.18139515817165375, "learning_rate": 1.675906243282528e-05, "loss": 1.169, "step": 7818 }, { "epoch": 2.9110539726123026, "grad_norm": 0.1768987476825714, "learning_rate": 1.6758166709840815e-05, "loss": 1.1685, "step": 7819 }, { "epoch": 2.9114262777629114, "grad_norm": 0.15893658995628357, "learning_rate": 1.6757270887038653e-05, "loss": 1.1696, "step": 7820 }, { "epoch": 2.9117985829135202, "grad_norm": 0.19786134362220764, "learning_rate": 1.6756374964432022e-05, "loss": 1.1686, "step": 7821 }, { "epoch": 2.9121708880641295, "grad_norm": 0.16756854951381683, "learning_rate": 1.675547894203416e-05, "loss": 1.1556, "step": 7822 }, { "epoch": 2.9125431932147388, "grad_norm": 0.17230384051799774, "learning_rate": 1.6754582819858295e-05, "loss": 1.19, "step": 7823 }, { "epoch": 2.9129154983653476, "grad_norm": 0.17054490745067596, "learning_rate": 1.6753686597917668e-05, "loss": 1.161, "step": 7824 }, { "epoch": 2.913287803515957, "grad_norm": 0.16075944900512695, "learning_rate": 1.675279027622551e-05, "loss": 1.1755, "step": 7825 }, { "epoch": 2.9136601086665657, "grad_norm": 0.17305156588554382, "learning_rate": 1.6751893854795068e-05, "loss": 1.1624, "step": 7826 }, { "epoch": 2.914032413817175, "grad_norm": 0.17909590899944305, "learning_rate": 1.6750997333639574e-05, "loss": 1.1666, "step": 7827 }, { "epoch": 2.914404718967784, "grad_norm": 0.16497522592544556, "learning_rate": 1.6750100712772276e-05, "loss": 1.1747, "step": 7828 }, { "epoch": 2.914777024118393, "grad_norm": 0.17517150938510895, "learning_rate": 1.6749203992206412e-05, "loss": 1.1543, "step": 7829 }, { "epoch": 2.915149329269002, "grad_norm": 0.17387013137340546, "learning_rate": 1.6748307171955226e-05, "loss": 1.1769, "step": 7830 }, { "epoch": 2.915521634419611, "grad_norm": 0.1653532087802887, "learning_rate": 1.674741025203197e-05, "loss": 1.1586, "step": 7831 }, { "epoch": 2.9158939395702204, "grad_norm": 0.17375896871089935, "learning_rate": 1.6746513232449888e-05, "loss": 1.1712, "step": 7832 }, { "epoch": 2.916266244720829, "grad_norm": 0.16700343787670135, "learning_rate": 1.6745616113222228e-05, "loss": 1.1565, "step": 7833 }, { "epoch": 2.9166385498714384, "grad_norm": 0.15995582938194275, "learning_rate": 1.6744718894362243e-05, "loss": 1.1726, "step": 7834 }, { "epoch": 2.9170108550220473, "grad_norm": 0.16936053335666656, "learning_rate": 1.674382157588318e-05, "loss": 1.1607, "step": 7835 }, { "epoch": 2.9173831601726565, "grad_norm": 0.16181506216526031, "learning_rate": 1.6742924157798302e-05, "loss": 1.1635, "step": 7836 }, { "epoch": 2.917755465323266, "grad_norm": 0.1655145138502121, "learning_rate": 1.674202664012085e-05, "loss": 1.1594, "step": 7837 }, { "epoch": 2.9181277704738746, "grad_norm": 0.1637190282344818, "learning_rate": 1.674112902286409e-05, "loss": 1.1595, "step": 7838 }, { "epoch": 2.918500075624484, "grad_norm": 0.16163207590579987, "learning_rate": 1.674023130604128e-05, "loss": 1.1678, "step": 7839 }, { "epoch": 2.9188723807750927, "grad_norm": 0.1664661020040512, "learning_rate": 1.6739333489665672e-05, "loss": 1.1553, "step": 7840 }, { "epoch": 2.919244685925702, "grad_norm": 0.1619381159543991, "learning_rate": 1.6738435573750535e-05, "loss": 1.1539, "step": 7841 }, { "epoch": 2.919616991076311, "grad_norm": 0.16180214285850525, "learning_rate": 1.6737537558309128e-05, "loss": 1.1754, "step": 7842 }, { "epoch": 2.91998929622692, "grad_norm": 0.17024828493595123, "learning_rate": 1.6736639443354712e-05, "loss": 1.1597, "step": 7843 }, { "epoch": 2.920361601377529, "grad_norm": 0.16076843440532684, "learning_rate": 1.6735741228900556e-05, "loss": 1.169, "step": 7844 }, { "epoch": 2.920733906528138, "grad_norm": 0.16919641196727753, "learning_rate": 1.673484291495992e-05, "loss": 1.179, "step": 7845 }, { "epoch": 2.9211062116787474, "grad_norm": 0.1667402684688568, "learning_rate": 1.673394450154608e-05, "loss": 1.1683, "step": 7846 }, { "epoch": 2.921478516829356, "grad_norm": 0.16661867499351501, "learning_rate": 1.6733045988672306e-05, "loss": 1.1734, "step": 7847 }, { "epoch": 2.9218508219799655, "grad_norm": 0.16996459662914276, "learning_rate": 1.673214737635186e-05, "loss": 1.169, "step": 7848 }, { "epoch": 2.9222231271305743, "grad_norm": 0.16732630133628845, "learning_rate": 1.6731248664598023e-05, "loss": 1.157, "step": 7849 }, { "epoch": 2.9225954322811836, "grad_norm": 0.1656404733657837, "learning_rate": 1.6730349853424064e-05, "loss": 1.1684, "step": 7850 }, { "epoch": 2.9229677374317924, "grad_norm": 0.16698972880840302, "learning_rate": 1.6729450942843256e-05, "loss": 1.1607, "step": 7851 }, { "epoch": 2.9233400425824017, "grad_norm": 0.18779774010181427, "learning_rate": 1.6728551932868885e-05, "loss": 1.1719, "step": 7852 }, { "epoch": 2.9237123477330105, "grad_norm": 0.1793741136789322, "learning_rate": 1.6727652823514225e-05, "loss": 1.1573, "step": 7853 }, { "epoch": 2.9240846528836197, "grad_norm": 0.17160984873771667, "learning_rate": 1.6726753614792555e-05, "loss": 1.1786, "step": 7854 }, { "epoch": 2.924456958034229, "grad_norm": 0.16810734570026398, "learning_rate": 1.6725854306717155e-05, "loss": 1.1771, "step": 7855 }, { "epoch": 2.924829263184838, "grad_norm": 0.1664050966501236, "learning_rate": 1.6724954899301308e-05, "loss": 1.1654, "step": 7856 }, { "epoch": 2.925201568335447, "grad_norm": 0.19625960290431976, "learning_rate": 1.6724055392558302e-05, "loss": 1.1529, "step": 7857 }, { "epoch": 2.925573873486056, "grad_norm": 0.20255877077579498, "learning_rate": 1.6723155786501414e-05, "loss": 1.1741, "step": 7858 }, { "epoch": 2.925946178636665, "grad_norm": 0.1825208216905594, "learning_rate": 1.672225608114394e-05, "loss": 1.1686, "step": 7859 }, { "epoch": 2.926318483787274, "grad_norm": 0.3680337071418762, "learning_rate": 1.672135627649917e-05, "loss": 1.1604, "step": 7860 }, { "epoch": 2.9266907889378833, "grad_norm": 0.18671971559524536, "learning_rate": 1.672045637258038e-05, "loss": 1.166, "step": 7861 }, { "epoch": 2.927063094088492, "grad_norm": 0.1835385113954544, "learning_rate": 1.671955636940088e-05, "loss": 1.1751, "step": 7862 }, { "epoch": 2.9274353992391013, "grad_norm": 0.15858596563339233, "learning_rate": 1.6718656266973952e-05, "loss": 1.1597, "step": 7863 }, { "epoch": 2.9278077043897106, "grad_norm": 0.16256222128868103, "learning_rate": 1.6717756065312892e-05, "loss": 1.1723, "step": 7864 }, { "epoch": 2.9281800095403194, "grad_norm": 0.17425408959388733, "learning_rate": 1.6716855764430995e-05, "loss": 1.1597, "step": 7865 }, { "epoch": 2.9285523146909287, "grad_norm": 0.16741712391376495, "learning_rate": 1.6715955364341563e-05, "loss": 1.168, "step": 7866 }, { "epoch": 2.9289246198415375, "grad_norm": 0.1664438098669052, "learning_rate": 1.671505486505789e-05, "loss": 1.1784, "step": 7867 }, { "epoch": 2.9292969249921468, "grad_norm": 0.15664342045783997, "learning_rate": 1.6714154266593277e-05, "loss": 1.1586, "step": 7868 }, { "epoch": 2.9296692301427556, "grad_norm": 0.1611543446779251, "learning_rate": 1.671325356896103e-05, "loss": 1.1643, "step": 7869 }, { "epoch": 2.930041535293365, "grad_norm": 0.167099729180336, "learning_rate": 1.6712352772174444e-05, "loss": 1.1622, "step": 7870 }, { "epoch": 2.9304138404439737, "grad_norm": 0.16537703573703766, "learning_rate": 1.6711451876246833e-05, "loss": 1.1692, "step": 7871 }, { "epoch": 2.930786145594583, "grad_norm": 0.16481894254684448, "learning_rate": 1.6710550881191498e-05, "loss": 1.1521, "step": 7872 }, { "epoch": 2.931158450745192, "grad_norm": 0.1629692018032074, "learning_rate": 1.6709649787021748e-05, "loss": 1.1602, "step": 7873 }, { "epoch": 2.931530755895801, "grad_norm": 0.16847184300422668, "learning_rate": 1.6708748593750888e-05, "loss": 1.1644, "step": 7874 }, { "epoch": 2.9319030610464103, "grad_norm": 0.1654757410287857, "learning_rate": 1.6707847301392237e-05, "loss": 1.1596, "step": 7875 }, { "epoch": 2.932275366197019, "grad_norm": 0.16103285551071167, "learning_rate": 1.67069459099591e-05, "loss": 1.1816, "step": 7876 }, { "epoch": 2.9326476713476284, "grad_norm": 0.15957196056842804, "learning_rate": 1.6706044419464792e-05, "loss": 1.1675, "step": 7877 }, { "epoch": 2.933019976498237, "grad_norm": 0.16220717132091522, "learning_rate": 1.670514282992263e-05, "loss": 1.1758, "step": 7878 }, { "epoch": 2.9333922816488465, "grad_norm": 0.16329532861709595, "learning_rate": 1.670424114134593e-05, "loss": 1.1742, "step": 7879 }, { "epoch": 2.9337645867994553, "grad_norm": 0.16387712955474854, "learning_rate": 1.6703339353748006e-05, "loss": 1.1657, "step": 7880 }, { "epoch": 2.9341368919500646, "grad_norm": 0.15758801996707916, "learning_rate": 1.6702437467142186e-05, "loss": 1.1537, "step": 7881 }, { "epoch": 2.934509197100674, "grad_norm": 0.16278088092803955, "learning_rate": 1.6701535481541783e-05, "loss": 1.1738, "step": 7882 }, { "epoch": 2.9348815022512826, "grad_norm": 0.16199669241905212, "learning_rate": 1.670063339696012e-05, "loss": 1.1741, "step": 7883 }, { "epoch": 2.935253807401892, "grad_norm": 0.164675772190094, "learning_rate": 1.6699731213410524e-05, "loss": 1.1584, "step": 7884 }, { "epoch": 2.9356261125525007, "grad_norm": 0.16404102742671967, "learning_rate": 1.6698828930906316e-05, "loss": 1.1697, "step": 7885 }, { "epoch": 2.93599841770311, "grad_norm": 0.16526629030704498, "learning_rate": 1.6697926549460826e-05, "loss": 1.1623, "step": 7886 }, { "epoch": 2.9363707228537193, "grad_norm": 0.15449416637420654, "learning_rate": 1.669702406908738e-05, "loss": 1.1646, "step": 7887 }, { "epoch": 2.936743028004328, "grad_norm": 0.1667950302362442, "learning_rate": 1.6696121489799314e-05, "loss": 1.1745, "step": 7888 }, { "epoch": 2.937115333154937, "grad_norm": 0.16033466160297394, "learning_rate": 1.669521881160995e-05, "loss": 1.1658, "step": 7889 }, { "epoch": 2.937487638305546, "grad_norm": 0.1629643738269806, "learning_rate": 1.6694316034532626e-05, "loss": 1.177, "step": 7890 }, { "epoch": 2.9378599434561554, "grad_norm": 0.15993858873844147, "learning_rate": 1.6693413158580672e-05, "loss": 1.1587, "step": 7891 }, { "epoch": 2.9382322486067642, "grad_norm": 0.16693513095378876, "learning_rate": 1.6692510183767424e-05, "loss": 1.1738, "step": 7892 }, { "epoch": 2.9386045537573735, "grad_norm": 0.1641031950712204, "learning_rate": 1.6691607110106223e-05, "loss": 1.1536, "step": 7893 }, { "epoch": 2.9389768589079823, "grad_norm": 0.15620005130767822, "learning_rate": 1.6690703937610406e-05, "loss": 1.1768, "step": 7894 }, { "epoch": 2.9393491640585916, "grad_norm": 0.16095943748950958, "learning_rate": 1.668980066629331e-05, "loss": 1.1693, "step": 7895 }, { "epoch": 2.939721469209201, "grad_norm": 0.1613335907459259, "learning_rate": 1.668889729616828e-05, "loss": 1.1716, "step": 7896 }, { "epoch": 2.9400937743598097, "grad_norm": 0.1632033735513687, "learning_rate": 1.6687993827248657e-05, "loss": 1.1618, "step": 7897 }, { "epoch": 2.9404660795104185, "grad_norm": 0.16224397718906403, "learning_rate": 1.6687090259547782e-05, "loss": 1.1716, "step": 7898 }, { "epoch": 2.9408383846610278, "grad_norm": 0.16363045573234558, "learning_rate": 1.6686186593079003e-05, "loss": 1.1671, "step": 7899 }, { "epoch": 2.941210689811637, "grad_norm": 0.1638343632221222, "learning_rate": 1.6685282827855672e-05, "loss": 1.1756, "step": 7900 }, { "epoch": 2.941582994962246, "grad_norm": 0.16511334478855133, "learning_rate": 1.668437896389113e-05, "loss": 1.1802, "step": 7901 }, { "epoch": 2.941955300112855, "grad_norm": 0.16726627945899963, "learning_rate": 1.6683475001198733e-05, "loss": 1.1729, "step": 7902 }, { "epoch": 2.942327605263464, "grad_norm": 0.15701860189437866, "learning_rate": 1.6682570939791827e-05, "loss": 1.1546, "step": 7903 }, { "epoch": 2.942699910414073, "grad_norm": 0.16227418184280396, "learning_rate": 1.668166677968377e-05, "loss": 1.1613, "step": 7904 }, { "epoch": 2.9430722155646825, "grad_norm": 0.16237196326255798, "learning_rate": 1.668076252088791e-05, "loss": 1.1529, "step": 7905 }, { "epoch": 2.9434445207152913, "grad_norm": 0.15888628363609314, "learning_rate": 1.6679858163417607e-05, "loss": 1.1825, "step": 7906 }, { "epoch": 2.9438168258659, "grad_norm": 0.17086133360862732, "learning_rate": 1.667895370728622e-05, "loss": 1.1891, "step": 7907 }, { "epoch": 2.9441891310165094, "grad_norm": 0.16408465802669525, "learning_rate": 1.667804915250711e-05, "loss": 1.1658, "step": 7908 }, { "epoch": 2.9445614361671186, "grad_norm": 0.15986572206020355, "learning_rate": 1.6677144499093626e-05, "loss": 1.1581, "step": 7909 }, { "epoch": 2.9449337413177274, "grad_norm": 0.164417564868927, "learning_rate": 1.667623974705914e-05, "loss": 1.1553, "step": 7910 }, { "epoch": 2.9453060464683367, "grad_norm": 0.16091735661029816, "learning_rate": 1.6675334896417014e-05, "loss": 1.1755, "step": 7911 }, { "epoch": 2.9456783516189455, "grad_norm": 0.16419173777103424, "learning_rate": 1.6674429947180607e-05, "loss": 1.169, "step": 7912 }, { "epoch": 2.946050656769555, "grad_norm": 0.17175257205963135, "learning_rate": 1.667352489936329e-05, "loss": 1.1745, "step": 7913 }, { "epoch": 2.946422961920164, "grad_norm": 0.16759838163852692, "learning_rate": 1.6672619752978428e-05, "loss": 1.1677, "step": 7914 }, { "epoch": 2.946795267070773, "grad_norm": 0.16285474598407745, "learning_rate": 1.6671714508039394e-05, "loss": 1.1712, "step": 7915 }, { "epoch": 2.9471675722213817, "grad_norm": 0.16049782931804657, "learning_rate": 1.6670809164559553e-05, "loss": 1.1513, "step": 7916 }, { "epoch": 2.947539877371991, "grad_norm": 0.16227254271507263, "learning_rate": 1.666990372255228e-05, "loss": 1.1794, "step": 7917 }, { "epoch": 2.9479121825226002, "grad_norm": 0.16385634243488312, "learning_rate": 1.6668998182030945e-05, "loss": 1.179, "step": 7918 }, { "epoch": 2.948284487673209, "grad_norm": 0.15982592105865479, "learning_rate": 1.666809254300893e-05, "loss": 1.1648, "step": 7919 }, { "epoch": 2.9486567928238183, "grad_norm": 0.16695238649845123, "learning_rate": 1.6667186805499605e-05, "loss": 1.1793, "step": 7920 }, { "epoch": 2.949029097974427, "grad_norm": 0.1569126844406128, "learning_rate": 1.666628096951635e-05, "loss": 1.165, "step": 7921 }, { "epoch": 2.9494014031250364, "grad_norm": 0.1580849289894104, "learning_rate": 1.666537503507254e-05, "loss": 1.1664, "step": 7922 }, { "epoch": 2.9497737082756457, "grad_norm": 0.16597579419612885, "learning_rate": 1.6664469002181562e-05, "loss": 1.1624, "step": 7923 }, { "epoch": 2.9501460134262545, "grad_norm": 0.16480037569999695, "learning_rate": 1.6663562870856793e-05, "loss": 1.1796, "step": 7924 }, { "epoch": 2.9505183185768633, "grad_norm": 0.16459214687347412, "learning_rate": 1.6662656641111623e-05, "loss": 1.1583, "step": 7925 }, { "epoch": 2.9508906237274726, "grad_norm": 0.1645277440547943, "learning_rate": 1.666175031295943e-05, "loss": 1.173, "step": 7926 }, { "epoch": 2.951262928878082, "grad_norm": 0.16501055657863617, "learning_rate": 1.66608438864136e-05, "loss": 1.1613, "step": 7927 }, { "epoch": 2.9516352340286907, "grad_norm": 0.16335533559322357, "learning_rate": 1.6659937361487527e-05, "loss": 1.1732, "step": 7928 }, { "epoch": 2.9520075391793, "grad_norm": 0.16024471819400787, "learning_rate": 1.6659030738194594e-05, "loss": 1.1738, "step": 7929 }, { "epoch": 2.9523798443299087, "grad_norm": 0.15945962071418762, "learning_rate": 1.66581240165482e-05, "loss": 1.1485, "step": 7930 }, { "epoch": 2.952752149480518, "grad_norm": 0.15847419202327728, "learning_rate": 1.6657217196561727e-05, "loss": 1.1606, "step": 7931 }, { "epoch": 2.9531244546311273, "grad_norm": 0.1631278246641159, "learning_rate": 1.6656310278248577e-05, "loss": 1.1659, "step": 7932 }, { "epoch": 2.953496759781736, "grad_norm": 0.1575775444507599, "learning_rate": 1.6655403261622143e-05, "loss": 1.1682, "step": 7933 }, { "epoch": 2.953869064932345, "grad_norm": 0.16149340569972992, "learning_rate": 1.6654496146695817e-05, "loss": 1.1578, "step": 7934 }, { "epoch": 2.954241370082954, "grad_norm": 0.16209939122200012, "learning_rate": 1.6653588933483003e-05, "loss": 1.1566, "step": 7935 }, { "epoch": 2.9546136752335634, "grad_norm": 0.15919779241085052, "learning_rate": 1.6652681621997095e-05, "loss": 1.1714, "step": 7936 }, { "epoch": 2.9549859803841723, "grad_norm": 0.15968568623065948, "learning_rate": 1.66517742122515e-05, "loss": 1.1729, "step": 7937 }, { "epoch": 2.9553582855347815, "grad_norm": 0.16550806164741516, "learning_rate": 1.6650866704259615e-05, "loss": 1.1591, "step": 7938 }, { "epoch": 2.9557305906853903, "grad_norm": 0.16333597898483276, "learning_rate": 1.664995909803485e-05, "loss": 1.1571, "step": 7939 }, { "epoch": 2.9561028958359996, "grad_norm": 0.1608911156654358, "learning_rate": 1.6649051393590605e-05, "loss": 1.1608, "step": 7940 }, { "epoch": 2.956475200986609, "grad_norm": 0.1644187867641449, "learning_rate": 1.6648143590940286e-05, "loss": 1.1719, "step": 7941 }, { "epoch": 2.9568475061372177, "grad_norm": 0.1601906716823578, "learning_rate": 1.6647235690097303e-05, "loss": 1.1627, "step": 7942 }, { "epoch": 2.9572198112878265, "grad_norm": 0.15772992372512817, "learning_rate": 1.6646327691075067e-05, "loss": 1.1456, "step": 7943 }, { "epoch": 2.957592116438436, "grad_norm": 0.15940575301647186, "learning_rate": 1.664541959388699e-05, "loss": 1.1709, "step": 7944 }, { "epoch": 2.957964421589045, "grad_norm": 0.16511191427707672, "learning_rate": 1.664451139854648e-05, "loss": 1.1746, "step": 7945 }, { "epoch": 2.958336726739654, "grad_norm": 0.15787945687770844, "learning_rate": 1.6643603105066955e-05, "loss": 1.1639, "step": 7946 }, { "epoch": 2.958709031890263, "grad_norm": 0.15846529603004456, "learning_rate": 1.664269471346183e-05, "loss": 1.1778, "step": 7947 }, { "epoch": 2.959081337040872, "grad_norm": 0.16118839383125305, "learning_rate": 1.6641786223744518e-05, "loss": 1.1771, "step": 7948 }, { "epoch": 2.959453642191481, "grad_norm": 0.16540107131004333, "learning_rate": 1.6640877635928446e-05, "loss": 1.1726, "step": 7949 }, { "epoch": 2.9598259473420905, "grad_norm": 0.15974678099155426, "learning_rate": 1.6639968950027023e-05, "loss": 1.1715, "step": 7950 }, { "epoch": 2.9601982524926993, "grad_norm": 0.15811410546302795, "learning_rate": 1.663906016605368e-05, "loss": 1.1567, "step": 7951 }, { "epoch": 2.960570557643308, "grad_norm": 0.16194650530815125, "learning_rate": 1.6638151284021828e-05, "loss": 1.1735, "step": 7952 }, { "epoch": 2.9609428627939174, "grad_norm": 0.1610521525144577, "learning_rate": 1.66372423039449e-05, "loss": 1.1847, "step": 7953 }, { "epoch": 2.9613151679445266, "grad_norm": 0.16063009202480316, "learning_rate": 1.6636333225836323e-05, "loss": 1.1642, "step": 7954 }, { "epoch": 2.9616874730951355, "grad_norm": 0.16281422972679138, "learning_rate": 1.663542404970952e-05, "loss": 1.1756, "step": 7955 }, { "epoch": 2.9620597782457447, "grad_norm": 0.15930494666099548, "learning_rate": 1.663451477557792e-05, "loss": 1.1409, "step": 7956 }, { "epoch": 2.9624320833963536, "grad_norm": 0.16165930032730103, "learning_rate": 1.6633605403454952e-05, "loss": 1.1662, "step": 7957 }, { "epoch": 2.962804388546963, "grad_norm": 0.16098619997501373, "learning_rate": 1.6632695933354052e-05, "loss": 1.1678, "step": 7958 }, { "epoch": 2.963176693697572, "grad_norm": 0.16247297823429108, "learning_rate": 1.663178636528864e-05, "loss": 1.1676, "step": 7959 }, { "epoch": 2.963548998848181, "grad_norm": 0.16938212513923645, "learning_rate": 1.663087669927217e-05, "loss": 1.1518, "step": 7960 }, { "epoch": 2.96392130399879, "grad_norm": 0.16098153591156006, "learning_rate": 1.6629966935318062e-05, "loss": 1.1678, "step": 7961 }, { "epoch": 2.964293609149399, "grad_norm": 0.16554594039916992, "learning_rate": 1.662905707343976e-05, "loss": 1.1628, "step": 7962 }, { "epoch": 2.9646659143000083, "grad_norm": 0.16836988925933838, "learning_rate": 1.6628147113650703e-05, "loss": 1.1883, "step": 7963 }, { "epoch": 2.965038219450617, "grad_norm": 0.16309542953968048, "learning_rate": 1.6627237055964324e-05, "loss": 1.1766, "step": 7964 }, { "epoch": 2.9654105246012263, "grad_norm": 0.16153202950954437, "learning_rate": 1.6626326900394073e-05, "loss": 1.1642, "step": 7965 }, { "epoch": 2.965782829751835, "grad_norm": 0.16180694103240967, "learning_rate": 1.662541664695339e-05, "loss": 1.1684, "step": 7966 }, { "epoch": 2.9661551349024444, "grad_norm": 0.16211725771427155, "learning_rate": 1.662450629565572e-05, "loss": 1.1634, "step": 7967 }, { "epoch": 2.9665274400530537, "grad_norm": 0.16405652463436127, "learning_rate": 1.6623595846514503e-05, "loss": 1.1675, "step": 7968 }, { "epoch": 2.9668997452036625, "grad_norm": 0.1562814861536026, "learning_rate": 1.662268529954319e-05, "loss": 1.1668, "step": 7969 }, { "epoch": 2.9672720503542718, "grad_norm": 0.161887064576149, "learning_rate": 1.6621774654755238e-05, "loss": 1.1572, "step": 7970 }, { "epoch": 2.9676443555048806, "grad_norm": 0.16800396144390106, "learning_rate": 1.6620863912164086e-05, "loss": 1.1691, "step": 7971 }, { "epoch": 2.96801666065549, "grad_norm": 0.16673104465007782, "learning_rate": 1.661995307178319e-05, "loss": 1.1746, "step": 7972 }, { "epoch": 2.9683889658060987, "grad_norm": 0.17055685818195343, "learning_rate": 1.6619042133626003e-05, "loss": 1.1713, "step": 7973 }, { "epoch": 2.968761270956708, "grad_norm": 0.16628359258174896, "learning_rate": 1.661813109770598e-05, "loss": 1.1812, "step": 7974 }, { "epoch": 2.9691335761073168, "grad_norm": 0.16214534640312195, "learning_rate": 1.6617219964036572e-05, "loss": 1.1605, "step": 7975 }, { "epoch": 2.969505881257926, "grad_norm": 0.16540850698947906, "learning_rate": 1.6616308732631245e-05, "loss": 1.1694, "step": 7976 }, { "epoch": 2.9698781864085353, "grad_norm": 0.16304931044578552, "learning_rate": 1.6615397403503452e-05, "loss": 1.1558, "step": 7977 }, { "epoch": 2.970250491559144, "grad_norm": 0.16456028819084167, "learning_rate": 1.661448597666665e-05, "loss": 1.1587, "step": 7978 }, { "epoch": 2.9706227967097534, "grad_norm": 0.1654767096042633, "learning_rate": 1.6613574452134314e-05, "loss": 1.1568, "step": 7979 }, { "epoch": 2.970995101860362, "grad_norm": 0.16024528443813324, "learning_rate": 1.6612662829919894e-05, "loss": 1.1774, "step": 7980 }, { "epoch": 2.9713674070109715, "grad_norm": 0.1553923636674881, "learning_rate": 1.6611751110036856e-05, "loss": 1.1664, "step": 7981 }, { "epoch": 2.9717397121615803, "grad_norm": 0.16233742237091064, "learning_rate": 1.661083929249867e-05, "loss": 1.166, "step": 7982 }, { "epoch": 2.9721120173121895, "grad_norm": 0.16320884227752686, "learning_rate": 1.6609927377318804e-05, "loss": 1.1617, "step": 7983 }, { "epoch": 2.9724843224627984, "grad_norm": 0.16230836510658264, "learning_rate": 1.6609015364510726e-05, "loss": 1.157, "step": 7984 }, { "epoch": 2.9728566276134076, "grad_norm": 0.16287636756896973, "learning_rate": 1.6608103254087905e-05, "loss": 1.1753, "step": 7985 }, { "epoch": 2.973228932764017, "grad_norm": 0.16399414837360382, "learning_rate": 1.6607191046063815e-05, "loss": 1.1643, "step": 7986 }, { "epoch": 2.9736012379146257, "grad_norm": 0.158194899559021, "learning_rate": 1.6606278740451927e-05, "loss": 1.1791, "step": 7987 }, { "epoch": 2.973973543065235, "grad_norm": 0.16402703523635864, "learning_rate": 1.660536633726572e-05, "loss": 1.1695, "step": 7988 }, { "epoch": 2.974345848215844, "grad_norm": 0.1630365401506424, "learning_rate": 1.6604453836518658e-05, "loss": 1.1654, "step": 7989 }, { "epoch": 2.974718153366453, "grad_norm": 0.16490447521209717, "learning_rate": 1.6603541238224235e-05, "loss": 1.1718, "step": 7990 }, { "epoch": 2.975090458517062, "grad_norm": 0.15666000545024872, "learning_rate": 1.660262854239592e-05, "loss": 1.1588, "step": 7991 }, { "epoch": 2.975462763667671, "grad_norm": 0.16078460216522217, "learning_rate": 1.6601715749047195e-05, "loss": 1.1669, "step": 7992 }, { "epoch": 2.97583506881828, "grad_norm": 0.16264961659908295, "learning_rate": 1.6600802858191543e-05, "loss": 1.1733, "step": 7993 }, { "epoch": 2.9762073739688892, "grad_norm": 0.16248923540115356, "learning_rate": 1.6599889869842447e-05, "loss": 1.1615, "step": 7994 }, { "epoch": 2.9765796791194985, "grad_norm": 0.16207122802734375, "learning_rate": 1.6598976784013394e-05, "loss": 1.1652, "step": 7995 }, { "epoch": 2.9769519842701073, "grad_norm": 0.1700456589460373, "learning_rate": 1.6598063600717865e-05, "loss": 1.1734, "step": 7996 }, { "epoch": 2.9773242894207166, "grad_norm": 0.1640198975801468, "learning_rate": 1.659715031996935e-05, "loss": 1.1636, "step": 7997 }, { "epoch": 2.9776965945713254, "grad_norm": 0.16763059794902802, "learning_rate": 1.6596236941781342e-05, "loss": 1.1581, "step": 7998 }, { "epoch": 2.9780688997219347, "grad_norm": 0.16505372524261475, "learning_rate": 1.6595323466167327e-05, "loss": 1.1632, "step": 7999 }, { "epoch": 2.978441204872544, "grad_norm": 0.16153225302696228, "learning_rate": 1.6594409893140796e-05, "loss": 1.1629, "step": 8000 }, { "epoch": 2.978441204872544, "eval_loss": 1.2953892946243286, "eval_runtime": 16.6947, "eval_samples_per_second": 103.865, "eval_steps_per_second": 5.211, "step": 8000 }, { "epoch": 2.9788135100231528, "grad_norm": 0.163248211145401, "learning_rate": 1.659349622271525e-05, "loss": 1.1506, "step": 8001 }, { "epoch": 2.9791858151737616, "grad_norm": 0.16736340522766113, "learning_rate": 1.6592582454904176e-05, "loss": 1.164, "step": 8002 }, { "epoch": 2.979558120324371, "grad_norm": 0.164361834526062, "learning_rate": 1.659166858972107e-05, "loss": 1.1693, "step": 8003 }, { "epoch": 2.97993042547498, "grad_norm": 0.16543278098106384, "learning_rate": 1.6590754627179438e-05, "loss": 1.1647, "step": 8004 }, { "epoch": 2.980302730625589, "grad_norm": 0.1678323745727539, "learning_rate": 1.658984056729277e-05, "loss": 1.1628, "step": 8005 }, { "epoch": 2.980675035776198, "grad_norm": 0.16372528672218323, "learning_rate": 1.6588926410074573e-05, "loss": 1.1664, "step": 8006 }, { "epoch": 2.981047340926807, "grad_norm": 0.1635918915271759, "learning_rate": 1.6588012155538343e-05, "loss": 1.1571, "step": 8007 }, { "epoch": 2.9814196460774163, "grad_norm": 0.1657022386789322, "learning_rate": 1.658709780369759e-05, "loss": 1.174, "step": 8008 }, { "epoch": 2.9817919512280255, "grad_norm": 0.1680508553981781, "learning_rate": 1.6586183354565814e-05, "loss": 1.1591, "step": 8009 }, { "epoch": 2.9821642563786344, "grad_norm": 0.16020500659942627, "learning_rate": 1.658526880815652e-05, "loss": 1.1726, "step": 8010 }, { "epoch": 2.982536561529243, "grad_norm": 0.16022956371307373, "learning_rate": 1.6584354164483225e-05, "loss": 1.1781, "step": 8011 }, { "epoch": 2.9829088666798524, "grad_norm": 0.16404716670513153, "learning_rate": 1.658343942355943e-05, "loss": 1.1735, "step": 8012 }, { "epoch": 2.9832811718304617, "grad_norm": 0.1646728813648224, "learning_rate": 1.6582524585398647e-05, "loss": 1.1829, "step": 8013 }, { "epoch": 2.9836534769810705, "grad_norm": 0.16547395288944244, "learning_rate": 1.658160965001439e-05, "loss": 1.1801, "step": 8014 }, { "epoch": 2.98402578213168, "grad_norm": 0.16899387538433075, "learning_rate": 1.6580694617420173e-05, "loss": 1.1598, "step": 8015 }, { "epoch": 2.9843980872822886, "grad_norm": 0.1721050888299942, "learning_rate": 1.6579779487629508e-05, "loss": 1.165, "step": 8016 }, { "epoch": 2.984770392432898, "grad_norm": 0.166593998670578, "learning_rate": 1.657886426065591e-05, "loss": 1.1621, "step": 8017 }, { "epoch": 2.985142697583507, "grad_norm": 0.16592232882976532, "learning_rate": 1.6577948936512905e-05, "loss": 1.1688, "step": 8018 }, { "epoch": 2.985515002734116, "grad_norm": 0.16835105419158936, "learning_rate": 1.6577033515214e-05, "loss": 1.1917, "step": 8019 }, { "epoch": 2.985887307884725, "grad_norm": 0.16211645305156708, "learning_rate": 1.6576117996772728e-05, "loss": 1.1697, "step": 8020 }, { "epoch": 2.986259613035334, "grad_norm": 0.16470582783222198, "learning_rate": 1.657520238120261e-05, "loss": 1.1691, "step": 8021 }, { "epoch": 2.9866319181859433, "grad_norm": 0.16480398178100586, "learning_rate": 1.6574286668517155e-05, "loss": 1.1564, "step": 8022 }, { "epoch": 2.987004223336552, "grad_norm": 0.15897324681282043, "learning_rate": 1.6573370858729907e-05, "loss": 1.1565, "step": 8023 }, { "epoch": 2.9873765284871614, "grad_norm": 0.1609337478876114, "learning_rate": 1.657245495185438e-05, "loss": 1.1762, "step": 8024 }, { "epoch": 2.98774883363777, "grad_norm": 0.16375748813152313, "learning_rate": 1.6571538947904105e-05, "loss": 1.172, "step": 8025 }, { "epoch": 2.9881211387883795, "grad_norm": 0.16461491584777832, "learning_rate": 1.6570622846892615e-05, "loss": 1.1717, "step": 8026 }, { "epoch": 2.9884934439389887, "grad_norm": 0.1679835021495819, "learning_rate": 1.6569706648833436e-05, "loss": 1.1607, "step": 8027 }, { "epoch": 2.9888657490895976, "grad_norm": 0.1627042144536972, "learning_rate": 1.6568790353740104e-05, "loss": 1.1534, "step": 8028 }, { "epoch": 2.9892380542402064, "grad_norm": 0.1635371893644333, "learning_rate": 1.656787396162615e-05, "loss": 1.1718, "step": 8029 }, { "epoch": 2.9896103593908157, "grad_norm": 0.16765522956848145, "learning_rate": 1.656695747250511e-05, "loss": 1.1629, "step": 8030 }, { "epoch": 2.989982664541425, "grad_norm": 0.16241970658302307, "learning_rate": 1.656604088639052e-05, "loss": 1.1618, "step": 8031 }, { "epoch": 2.9903549696920337, "grad_norm": 0.16690513491630554, "learning_rate": 1.6565124203295918e-05, "loss": 1.1722, "step": 8032 }, { "epoch": 2.990727274842643, "grad_norm": 0.15712475776672363, "learning_rate": 1.656420742323484e-05, "loss": 1.1419, "step": 8033 }, { "epoch": 2.991099579993252, "grad_norm": 0.16157348453998566, "learning_rate": 1.6563290546220835e-05, "loss": 1.1554, "step": 8034 }, { "epoch": 2.991471885143861, "grad_norm": 0.16115577518939972, "learning_rate": 1.6562373572267438e-05, "loss": 1.167, "step": 8035 }, { "epoch": 2.9918441902944704, "grad_norm": 0.16360346972942352, "learning_rate": 1.6561456501388197e-05, "loss": 1.1597, "step": 8036 }, { "epoch": 2.992216495445079, "grad_norm": 0.15973906219005585, "learning_rate": 1.6560539333596657e-05, "loss": 1.1556, "step": 8037 }, { "epoch": 2.992588800595688, "grad_norm": 0.16520273685455322, "learning_rate": 1.6559622068906357e-05, "loss": 1.1799, "step": 8038 }, { "epoch": 2.9929611057462973, "grad_norm": 0.1652621626853943, "learning_rate": 1.6558704707330857e-05, "loss": 1.1635, "step": 8039 }, { "epoch": 2.9933334108969065, "grad_norm": 0.16183042526245117, "learning_rate": 1.6557787248883698e-05, "loss": 1.1567, "step": 8040 }, { "epoch": 2.9937057160475153, "grad_norm": 0.1610204577445984, "learning_rate": 1.655686969357843e-05, "loss": 1.1709, "step": 8041 }, { "epoch": 2.9940780211981246, "grad_norm": 0.162316232919693, "learning_rate": 1.655595204142861e-05, "loss": 1.165, "step": 8042 }, { "epoch": 2.9944503263487334, "grad_norm": 0.1625785082578659, "learning_rate": 1.655503429244779e-05, "loss": 1.1799, "step": 8043 }, { "epoch": 2.9948226314993427, "grad_norm": 0.15980488061904907, "learning_rate": 1.6554116446649528e-05, "loss": 1.1653, "step": 8044 }, { "epoch": 2.995194936649952, "grad_norm": 0.16220425069332123, "learning_rate": 1.655319850404737e-05, "loss": 1.1635, "step": 8045 }, { "epoch": 2.9955672418005608, "grad_norm": 0.1602453589439392, "learning_rate": 1.6552280464654888e-05, "loss": 1.1805, "step": 8046 }, { "epoch": 2.9959395469511696, "grad_norm": 0.16323374211788177, "learning_rate": 1.6551362328485633e-05, "loss": 1.1555, "step": 8047 }, { "epoch": 2.996311852101779, "grad_norm": 0.16577033698558807, "learning_rate": 1.6550444095553167e-05, "loss": 1.1722, "step": 8048 }, { "epoch": 2.996684157252388, "grad_norm": 0.15781056880950928, "learning_rate": 1.654952576587105e-05, "loss": 1.1596, "step": 8049 }, { "epoch": 2.997056462402997, "grad_norm": 0.16249702870845795, "learning_rate": 1.6548607339452853e-05, "loss": 1.1499, "step": 8050 }, { "epoch": 2.997428767553606, "grad_norm": 0.16227367520332336, "learning_rate": 1.6547688816312134e-05, "loss": 1.1898, "step": 8051 }, { "epoch": 2.997801072704215, "grad_norm": 0.16416963934898376, "learning_rate": 1.6546770196462462e-05, "loss": 1.1614, "step": 8052 }, { "epoch": 2.9981733778548243, "grad_norm": 0.1586388200521469, "learning_rate": 1.654585147991741e-05, "loss": 1.175, "step": 8053 }, { "epoch": 2.9985456830054336, "grad_norm": 0.16161870956420898, "learning_rate": 1.6544932666690538e-05, "loss": 1.1595, "step": 8054 }, { "epoch": 2.9989179881560424, "grad_norm": 0.16111552715301514, "learning_rate": 1.654401375679542e-05, "loss": 1.1749, "step": 8055 }, { "epoch": 2.999290293306651, "grad_norm": 0.16045445203781128, "learning_rate": 1.654309475024563e-05, "loss": 1.1532, "step": 8056 }, { "epoch": 2.9996625984572605, "grad_norm": 0.16429480910301208, "learning_rate": 1.654217564705474e-05, "loss": 1.1617, "step": 8057 }, { "epoch": 3.0000349036078697, "grad_norm": 0.16927365958690643, "learning_rate": 1.654125644723633e-05, "loss": 1.1758, "step": 8058 }, { "epoch": 3.0004072087584785, "grad_norm": 0.15990622341632843, "learning_rate": 1.654033715080397e-05, "loss": 1.1513, "step": 8059 }, { "epoch": 3.000779513909088, "grad_norm": 0.16184860467910767, "learning_rate": 1.6539417757771246e-05, "loss": 1.1631, "step": 8060 }, { "epoch": 3.0011518190596966, "grad_norm": 0.16141143441200256, "learning_rate": 1.6538498268151728e-05, "loss": 1.153, "step": 8061 }, { "epoch": 3.001524124210306, "grad_norm": 0.16540926694869995, "learning_rate": 1.6537578681958998e-05, "loss": 1.1469, "step": 8062 }, { "epoch": 3.0018964293609147, "grad_norm": 0.18034324049949646, "learning_rate": 1.6536658999206643e-05, "loss": 1.1609, "step": 8063 }, { "epoch": 3.002268734511524, "grad_norm": 0.15943139791488647, "learning_rate": 1.653573921990825e-05, "loss": 1.1609, "step": 8064 }, { "epoch": 3.0026410396621332, "grad_norm": 0.16149209439754486, "learning_rate": 1.6534819344077392e-05, "loss": 1.1507, "step": 8065 }, { "epoch": 3.003013344812742, "grad_norm": 0.1729980707168579, "learning_rate": 1.6533899371727668e-05, "loss": 1.1611, "step": 8066 }, { "epoch": 3.0033856499633513, "grad_norm": 0.16380807757377625, "learning_rate": 1.6532979302872654e-05, "loss": 1.1694, "step": 8067 }, { "epoch": 3.00375795511396, "grad_norm": 0.16485844552516937, "learning_rate": 1.653205913752595e-05, "loss": 1.1572, "step": 8068 }, { "epoch": 3.0041302602645694, "grad_norm": 0.1658172309398651, "learning_rate": 1.6531138875701142e-05, "loss": 1.1508, "step": 8069 }, { "epoch": 3.0045025654151782, "grad_norm": 0.15986262261867523, "learning_rate": 1.6530218517411823e-05, "loss": 1.1413, "step": 8070 }, { "epoch": 3.0048748705657875, "grad_norm": 0.16682755947113037, "learning_rate": 1.6529298062671587e-05, "loss": 1.1601, "step": 8071 }, { "epoch": 3.0052471757163968, "grad_norm": 0.16259485483169556, "learning_rate": 1.6528377511494028e-05, "loss": 1.1513, "step": 8072 }, { "epoch": 3.0056194808670056, "grad_norm": 0.1639915257692337, "learning_rate": 1.652745686389274e-05, "loss": 1.1732, "step": 8073 }, { "epoch": 3.005991786017615, "grad_norm": 0.16997599601745605, "learning_rate": 1.6526536119881325e-05, "loss": 1.1559, "step": 8074 }, { "epoch": 3.0063640911682237, "grad_norm": 0.17218388617038727, "learning_rate": 1.6525615279473385e-05, "loss": 1.1643, "step": 8075 }, { "epoch": 3.006736396318833, "grad_norm": 0.16318227350711823, "learning_rate": 1.652469434268251e-05, "loss": 1.1595, "step": 8076 }, { "epoch": 3.0071087014694418, "grad_norm": 0.1637173295021057, "learning_rate": 1.6523773309522314e-05, "loss": 1.1586, "step": 8077 }, { "epoch": 3.007481006620051, "grad_norm": 0.1644509881734848, "learning_rate": 1.6522852180006396e-05, "loss": 1.168, "step": 8078 }, { "epoch": 3.00785331177066, "grad_norm": 0.16260574758052826, "learning_rate": 1.6521930954148358e-05, "loss": 1.1534, "step": 8079 }, { "epoch": 3.008225616921269, "grad_norm": 0.16555775701999664, "learning_rate": 1.652100963196181e-05, "loss": 1.1544, "step": 8080 }, { "epoch": 3.0085979220718784, "grad_norm": 0.1621171087026596, "learning_rate": 1.652008821346036e-05, "loss": 1.1601, "step": 8081 }, { "epoch": 3.008970227222487, "grad_norm": 0.15849411487579346, "learning_rate": 1.6519166698657616e-05, "loss": 1.1589, "step": 8082 }, { "epoch": 3.0093425323730965, "grad_norm": 0.16716186702251434, "learning_rate": 1.6518245087567188e-05, "loss": 1.154, "step": 8083 }, { "epoch": 3.0097148375237053, "grad_norm": 0.1690070927143097, "learning_rate": 1.6517323380202693e-05, "loss": 1.1483, "step": 8084 }, { "epoch": 3.0100871426743145, "grad_norm": 0.16490869224071503, "learning_rate": 1.6516401576577736e-05, "loss": 1.1596, "step": 8085 }, { "epoch": 3.0104594478249234, "grad_norm": 0.16512571275234222, "learning_rate": 1.6515479676705935e-05, "loss": 1.1574, "step": 8086 }, { "epoch": 3.0108317529755326, "grad_norm": 0.17184092104434967, "learning_rate": 1.6514557680600912e-05, "loss": 1.1579, "step": 8087 }, { "epoch": 3.0112040581261414, "grad_norm": 0.17025062441825867, "learning_rate": 1.651363558827628e-05, "loss": 1.1661, "step": 8088 }, { "epoch": 3.0115763632767507, "grad_norm": 0.17025823891162872, "learning_rate": 1.651271339974566e-05, "loss": 1.1601, "step": 8089 }, { "epoch": 3.01194866842736, "grad_norm": 0.16634193062782288, "learning_rate": 1.6511791115022672e-05, "loss": 1.1519, "step": 8090 }, { "epoch": 3.012320973577969, "grad_norm": 0.17068715393543243, "learning_rate": 1.6510868734120935e-05, "loss": 1.171, "step": 8091 }, { "epoch": 3.012693278728578, "grad_norm": 0.1623985916376114, "learning_rate": 1.6509946257054078e-05, "loss": 1.1518, "step": 8092 }, { "epoch": 3.013065583879187, "grad_norm": 0.16179263591766357, "learning_rate": 1.650902368383572e-05, "loss": 1.1498, "step": 8093 }, { "epoch": 3.013437889029796, "grad_norm": 0.1658586859703064, "learning_rate": 1.6508101014479494e-05, "loss": 1.1601, "step": 8094 }, { "epoch": 3.013810194180405, "grad_norm": 0.16824504733085632, "learning_rate": 1.6507178248999026e-05, "loss": 1.1453, "step": 8095 }, { "epoch": 3.0141824993310142, "grad_norm": 0.16324131190776825, "learning_rate": 1.6506255387407942e-05, "loss": 1.16, "step": 8096 }, { "epoch": 3.014554804481623, "grad_norm": 0.16698867082595825, "learning_rate": 1.6505332429719872e-05, "loss": 1.1527, "step": 8097 }, { "epoch": 3.0149271096322323, "grad_norm": 0.16737176477909088, "learning_rate": 1.650440937594845e-05, "loss": 1.162, "step": 8098 }, { "epoch": 3.0152994147828416, "grad_norm": 0.16450172662734985, "learning_rate": 1.650348622610731e-05, "loss": 1.1524, "step": 8099 }, { "epoch": 3.0156717199334504, "grad_norm": 0.16533337533473969, "learning_rate": 1.650256298021009e-05, "loss": 1.1577, "step": 8100 }, { "epoch": 3.0160440250840597, "grad_norm": 0.16812501847743988, "learning_rate": 1.650163963827042e-05, "loss": 1.1594, "step": 8101 }, { "epoch": 3.0164163302346685, "grad_norm": 0.16462981700897217, "learning_rate": 1.6500716200301943e-05, "loss": 1.1554, "step": 8102 }, { "epoch": 3.0167886353852778, "grad_norm": 0.1637783944606781, "learning_rate": 1.6499792666318294e-05, "loss": 1.1647, "step": 8103 }, { "epoch": 3.0171609405358866, "grad_norm": 0.16835294663906097, "learning_rate": 1.6498869036333116e-05, "loss": 1.1474, "step": 8104 }, { "epoch": 3.017533245686496, "grad_norm": 0.16287805140018463, "learning_rate": 1.649794531036005e-05, "loss": 1.1588, "step": 8105 }, { "epoch": 3.0179055508371047, "grad_norm": 0.1584957391023636, "learning_rate": 1.649702148841274e-05, "loss": 1.1543, "step": 8106 }, { "epoch": 3.018277855987714, "grad_norm": 0.15747535228729248, "learning_rate": 1.6496097570504826e-05, "loss": 1.1669, "step": 8107 }, { "epoch": 3.018650161138323, "grad_norm": 0.16498741507530212, "learning_rate": 1.6495173556649965e-05, "loss": 1.1697, "step": 8108 }, { "epoch": 3.019022466288932, "grad_norm": 0.16772598028182983, "learning_rate": 1.6494249446861795e-05, "loss": 1.1731, "step": 8109 }, { "epoch": 3.0193947714395413, "grad_norm": 0.16565538942813873, "learning_rate": 1.6493325241153968e-05, "loss": 1.1572, "step": 8110 }, { "epoch": 3.01976707659015, "grad_norm": 0.16811548173427582, "learning_rate": 1.6492400939540134e-05, "loss": 1.1409, "step": 8111 }, { "epoch": 3.0201393817407594, "grad_norm": 0.17630524933338165, "learning_rate": 1.6491476542033948e-05, "loss": 1.1761, "step": 8112 }, { "epoch": 3.020511686891368, "grad_norm": 0.17578953504562378, "learning_rate": 1.649055204864906e-05, "loss": 1.1435, "step": 8113 }, { "epoch": 3.0208839920419774, "grad_norm": 0.17409630119800568, "learning_rate": 1.6489627459399123e-05, "loss": 1.1627, "step": 8114 }, { "epoch": 3.0212562971925863, "grad_norm": 0.16359230875968933, "learning_rate": 1.64887027742978e-05, "loss": 1.1507, "step": 8115 }, { "epoch": 3.0216286023431955, "grad_norm": 0.16680704057216644, "learning_rate": 1.648777799335874e-05, "loss": 1.1732, "step": 8116 }, { "epoch": 3.022000907493805, "grad_norm": 0.1649051010608673, "learning_rate": 1.6486853116595608e-05, "loss": 1.1658, "step": 8117 }, { "epoch": 3.0223732126444136, "grad_norm": 0.1658775508403778, "learning_rate": 1.6485928144022066e-05, "loss": 1.1569, "step": 8118 }, { "epoch": 3.022745517795023, "grad_norm": 0.17610371112823486, "learning_rate": 1.648500307565177e-05, "loss": 1.1667, "step": 8119 }, { "epoch": 3.0231178229456317, "grad_norm": 0.17497166991233826, "learning_rate": 1.6484077911498383e-05, "loss": 1.146, "step": 8120 }, { "epoch": 3.023490128096241, "grad_norm": 0.17234422266483307, "learning_rate": 1.6483152651575575e-05, "loss": 1.1553, "step": 8121 }, { "epoch": 3.02386243324685, "grad_norm": 0.166042760014534, "learning_rate": 1.6482227295897008e-05, "loss": 1.161, "step": 8122 }, { "epoch": 3.024234738397459, "grad_norm": 0.16150951385498047, "learning_rate": 1.648130184447635e-05, "loss": 1.1666, "step": 8123 }, { "epoch": 3.024607043548068, "grad_norm": 0.16276511549949646, "learning_rate": 1.648037629732727e-05, "loss": 1.1454, "step": 8124 }, { "epoch": 3.024979348698677, "grad_norm": 0.16008317470550537, "learning_rate": 1.6479450654463443e-05, "loss": 1.152, "step": 8125 }, { "epoch": 3.0253516538492864, "grad_norm": 0.1709553301334381, "learning_rate": 1.6478524915898532e-05, "loss": 1.1614, "step": 8126 }, { "epoch": 3.025723958999895, "grad_norm": 0.16601483523845673, "learning_rate": 1.6477599081646217e-05, "loss": 1.1536, "step": 8127 }, { "epoch": 3.0260962641505045, "grad_norm": 0.16305498778820038, "learning_rate": 1.647667315172017e-05, "loss": 1.1581, "step": 8128 }, { "epoch": 3.0264685693011133, "grad_norm": 0.17328190803527832, "learning_rate": 1.6475747126134066e-05, "loss": 1.1502, "step": 8129 }, { "epoch": 3.0268408744517226, "grad_norm": 0.20106814801692963, "learning_rate": 1.647482100490158e-05, "loss": 1.1654, "step": 8130 }, { "epoch": 3.0272131796023314, "grad_norm": 0.18390576541423798, "learning_rate": 1.64738947880364e-05, "loss": 1.1635, "step": 8131 }, { "epoch": 3.0275854847529406, "grad_norm": 0.16346316039562225, "learning_rate": 1.6472968475552197e-05, "loss": 1.177, "step": 8132 }, { "epoch": 3.02795778990355, "grad_norm": 0.18944934010505676, "learning_rate": 1.647204206746266e-05, "loss": 1.1598, "step": 8133 }, { "epoch": 3.0283300950541587, "grad_norm": 0.1777556836605072, "learning_rate": 1.6471115563781467e-05, "loss": 1.1578, "step": 8134 }, { "epoch": 3.028702400204768, "grad_norm": 0.17949633300304413, "learning_rate": 1.6470188964522296e-05, "loss": 1.1698, "step": 8135 }, { "epoch": 3.029074705355377, "grad_norm": 0.18838725984096527, "learning_rate": 1.6469262269698846e-05, "loss": 1.1585, "step": 8136 }, { "epoch": 3.029447010505986, "grad_norm": 0.1662994623184204, "learning_rate": 1.6468335479324796e-05, "loss": 1.1597, "step": 8137 }, { "epoch": 3.029819315656595, "grad_norm": 0.172524094581604, "learning_rate": 1.646740859341384e-05, "loss": 1.1683, "step": 8138 }, { "epoch": 3.030191620807204, "grad_norm": 0.17046119272708893, "learning_rate": 1.6466481611979665e-05, "loss": 1.1582, "step": 8139 }, { "epoch": 3.030563925957813, "grad_norm": 0.18434128165245056, "learning_rate": 1.646555453503596e-05, "loss": 1.1576, "step": 8140 }, { "epoch": 3.0309362311084223, "grad_norm": 0.16541865468025208, "learning_rate": 1.646462736259642e-05, "loss": 1.1453, "step": 8141 }, { "epoch": 3.0313085362590315, "grad_norm": 0.1774558275938034, "learning_rate": 1.646370009467474e-05, "loss": 1.1619, "step": 8142 }, { "epoch": 3.0316808414096403, "grad_norm": 0.16616548597812653, "learning_rate": 1.6462772731284615e-05, "loss": 1.1659, "step": 8143 }, { "epoch": 3.0320531465602496, "grad_norm": 0.17070108652114868, "learning_rate": 1.6461845272439743e-05, "loss": 1.1528, "step": 8144 }, { "epoch": 3.0324254517108584, "grad_norm": 0.17470189929008484, "learning_rate": 1.646091771815382e-05, "loss": 1.1824, "step": 8145 }, { "epoch": 3.0327977568614677, "grad_norm": 0.17306390404701233, "learning_rate": 1.645999006844055e-05, "loss": 1.1623, "step": 8146 }, { "epoch": 3.0331700620120765, "grad_norm": 0.16506531834602356, "learning_rate": 1.6459062323313634e-05, "loss": 1.1725, "step": 8147 }, { "epoch": 3.0335423671626858, "grad_norm": 0.17168426513671875, "learning_rate": 1.645813448278677e-05, "loss": 1.1475, "step": 8148 }, { "epoch": 3.0339146723132946, "grad_norm": 0.16225890815258026, "learning_rate": 1.6457206546873665e-05, "loss": 1.1614, "step": 8149 }, { "epoch": 3.034286977463904, "grad_norm": 0.16611915826797485, "learning_rate": 1.6456278515588023e-05, "loss": 1.1497, "step": 8150 }, { "epoch": 3.034659282614513, "grad_norm": 0.174469992518425, "learning_rate": 1.6455350388943555e-05, "loss": 1.17, "step": 8151 }, { "epoch": 3.035031587765122, "grad_norm": 0.16496939957141876, "learning_rate": 1.6454422166953968e-05, "loss": 1.1392, "step": 8152 }, { "epoch": 3.035403892915731, "grad_norm": 0.17337270081043243, "learning_rate": 1.6453493849632968e-05, "loss": 1.1587, "step": 8153 }, { "epoch": 3.03577619806634, "grad_norm": 0.1641514003276825, "learning_rate": 1.6452565436994272e-05, "loss": 1.1646, "step": 8154 }, { "epoch": 3.0361485032169493, "grad_norm": 0.17081360518932343, "learning_rate": 1.6451636929051587e-05, "loss": 1.145, "step": 8155 }, { "epoch": 3.036520808367558, "grad_norm": 0.16759049892425537, "learning_rate": 1.645070832581863e-05, "loss": 1.171, "step": 8156 }, { "epoch": 3.0368931135181674, "grad_norm": 0.170254647731781, "learning_rate": 1.6449779627309113e-05, "loss": 1.142, "step": 8157 }, { "epoch": 3.037265418668776, "grad_norm": 0.16546176373958588, "learning_rate": 1.644885083353676e-05, "loss": 1.1524, "step": 8158 }, { "epoch": 3.0376377238193855, "grad_norm": 0.16441422700881958, "learning_rate": 1.6447921944515285e-05, "loss": 1.1519, "step": 8159 }, { "epoch": 3.0380100289699947, "grad_norm": 0.17892083525657654, "learning_rate": 1.6446992960258404e-05, "loss": 1.169, "step": 8160 }, { "epoch": 3.0383823341206035, "grad_norm": 0.1632370948791504, "learning_rate": 1.6446063880779845e-05, "loss": 1.1567, "step": 8161 }, { "epoch": 3.038754639271213, "grad_norm": 0.17492267489433289, "learning_rate": 1.6445134706093325e-05, "loss": 1.1518, "step": 8162 }, { "epoch": 3.0391269444218216, "grad_norm": 0.17203693091869354, "learning_rate": 1.6444205436212567e-05, "loss": 1.1601, "step": 8163 }, { "epoch": 3.039499249572431, "grad_norm": 0.17056483030319214, "learning_rate": 1.6443276071151303e-05, "loss": 1.1711, "step": 8164 }, { "epoch": 3.0398715547230397, "grad_norm": 0.17935492098331451, "learning_rate": 1.6442346610923258e-05, "loss": 1.1439, "step": 8165 }, { "epoch": 3.040243859873649, "grad_norm": 0.16036418080329895, "learning_rate": 1.6441417055542154e-05, "loss": 1.1444, "step": 8166 }, { "epoch": 3.040616165024258, "grad_norm": 0.1764068752527237, "learning_rate": 1.6440487405021727e-05, "loss": 1.1411, "step": 8167 }, { "epoch": 3.040988470174867, "grad_norm": 0.16799296438694, "learning_rate": 1.6439557659375705e-05, "loss": 1.1551, "step": 8168 }, { "epoch": 3.0413607753254763, "grad_norm": 0.17432555556297302, "learning_rate": 1.6438627818617817e-05, "loss": 1.1627, "step": 8169 }, { "epoch": 3.041733080476085, "grad_norm": 0.18049269914627075, "learning_rate": 1.6437697882761802e-05, "loss": 1.1532, "step": 8170 }, { "epoch": 3.0421053856266944, "grad_norm": 0.16729703545570374, "learning_rate": 1.6436767851821395e-05, "loss": 1.1795, "step": 8171 }, { "epoch": 3.0424776907773032, "grad_norm": 0.1749987155199051, "learning_rate": 1.6435837725810326e-05, "loss": 1.1612, "step": 8172 }, { "epoch": 3.0428499959279125, "grad_norm": 0.16448847949504852, "learning_rate": 1.6434907504742342e-05, "loss": 1.1616, "step": 8173 }, { "epoch": 3.0432223010785213, "grad_norm": 0.1609332412481308, "learning_rate": 1.6433977188631177e-05, "loss": 1.1641, "step": 8174 }, { "epoch": 3.0435946062291306, "grad_norm": 0.1693718433380127, "learning_rate": 1.6433046777490576e-05, "loss": 1.1719, "step": 8175 }, { "epoch": 3.0439669113797394, "grad_norm": 0.16598600149154663, "learning_rate": 1.643211627133427e-05, "loss": 1.1562, "step": 8176 }, { "epoch": 3.0443392165303487, "grad_norm": 0.16353359818458557, "learning_rate": 1.6431185670176017e-05, "loss": 1.1626, "step": 8177 }, { "epoch": 3.044711521680958, "grad_norm": 0.1780644953250885, "learning_rate": 1.6430254974029554e-05, "loss": 1.1502, "step": 8178 }, { "epoch": 3.0450838268315668, "grad_norm": 0.1959240734577179, "learning_rate": 1.6429324182908628e-05, "loss": 1.1409, "step": 8179 }, { "epoch": 3.045456131982176, "grad_norm": 0.18989701569080353, "learning_rate": 1.6428393296826987e-05, "loss": 1.1457, "step": 8180 }, { "epoch": 3.045828437132785, "grad_norm": 0.16791072487831116, "learning_rate": 1.642746231579838e-05, "loss": 1.1521, "step": 8181 }, { "epoch": 3.046200742283394, "grad_norm": 0.17417898774147034, "learning_rate": 1.642653123983656e-05, "loss": 1.1476, "step": 8182 }, { "epoch": 3.046573047434003, "grad_norm": 0.20077449083328247, "learning_rate": 1.6425600068955272e-05, "loss": 1.1731, "step": 8183 }, { "epoch": 3.046945352584612, "grad_norm": 0.1830679327249527, "learning_rate": 1.642466880316828e-05, "loss": 1.166, "step": 8184 }, { "epoch": 3.0473176577352215, "grad_norm": 0.16485272347927094, "learning_rate": 1.642373744248933e-05, "loss": 1.1681, "step": 8185 }, { "epoch": 3.0476899628858303, "grad_norm": 0.1660563200712204, "learning_rate": 1.6422805986932184e-05, "loss": 1.1524, "step": 8186 }, { "epoch": 3.0480622680364395, "grad_norm": 0.16424550116062164, "learning_rate": 1.642187443651059e-05, "loss": 1.1539, "step": 8187 }, { "epoch": 3.0484345731870484, "grad_norm": 0.1737472116947174, "learning_rate": 1.642094279123832e-05, "loss": 1.1597, "step": 8188 }, { "epoch": 3.0488068783376576, "grad_norm": 0.1679793894290924, "learning_rate": 1.6420011051129127e-05, "loss": 1.1495, "step": 8189 }, { "epoch": 3.0491791834882664, "grad_norm": 0.16718783974647522, "learning_rate": 1.641907921619677e-05, "loss": 1.1653, "step": 8190 }, { "epoch": 3.0495514886388757, "grad_norm": 0.17275018990039825, "learning_rate": 1.641814728645502e-05, "loss": 1.1521, "step": 8191 }, { "epoch": 3.0499237937894845, "grad_norm": 0.17378568649291992, "learning_rate": 1.6417215261917638e-05, "loss": 1.1681, "step": 8192 }, { "epoch": 3.050296098940094, "grad_norm": 0.19507519900798798, "learning_rate": 1.6416283142598387e-05, "loss": 1.1686, "step": 8193 }, { "epoch": 3.050668404090703, "grad_norm": 0.1943422257900238, "learning_rate": 1.6415350928511037e-05, "loss": 1.1595, "step": 8194 }, { "epoch": 3.051040709241312, "grad_norm": 0.16781660914421082, "learning_rate": 1.6414418619669354e-05, "loss": 1.1471, "step": 8195 }, { "epoch": 3.051413014391921, "grad_norm": 0.1694183200597763, "learning_rate": 1.6413486216087114e-05, "loss": 1.1439, "step": 8196 }, { "epoch": 3.05178531954253, "grad_norm": 0.17168377339839935, "learning_rate": 1.6412553717778085e-05, "loss": 1.159, "step": 8197 }, { "epoch": 3.0521576246931392, "grad_norm": 0.16123619675636292, "learning_rate": 1.6411621124756035e-05, "loss": 1.1616, "step": 8198 }, { "epoch": 3.052529929843748, "grad_norm": 0.18184486031532288, "learning_rate": 1.641068843703475e-05, "loss": 1.1613, "step": 8199 }, { "epoch": 3.0529022349943573, "grad_norm": 0.2111905813217163, "learning_rate": 1.6409755654627994e-05, "loss": 1.165, "step": 8200 }, { "epoch": 3.053274540144966, "grad_norm": 0.1873953491449356, "learning_rate": 1.6408822777549552e-05, "loss": 1.1603, "step": 8201 }, { "epoch": 3.0536468452955754, "grad_norm": 0.1657257080078125, "learning_rate": 1.64078898058132e-05, "loss": 1.1615, "step": 8202 }, { "epoch": 3.0540191504461847, "grad_norm": 0.18995431065559387, "learning_rate": 1.6406956739432716e-05, "loss": 1.1451, "step": 8203 }, { "epoch": 3.0543914555967935, "grad_norm": 0.1653064638376236, "learning_rate": 1.6406023578421884e-05, "loss": 1.1514, "step": 8204 }, { "epoch": 3.0547637607474027, "grad_norm": 0.19428938627243042, "learning_rate": 1.6405090322794484e-05, "loss": 1.1483, "step": 8205 }, { "epoch": 3.0551360658980116, "grad_norm": 0.18280021846294403, "learning_rate": 1.6404156972564305e-05, "loss": 1.1456, "step": 8206 }, { "epoch": 3.055508371048621, "grad_norm": 0.18493548035621643, "learning_rate": 1.6403223527745127e-05, "loss": 1.152, "step": 8207 }, { "epoch": 3.0558806761992297, "grad_norm": 0.16537559032440186, "learning_rate": 1.6402289988350742e-05, "loss": 1.1563, "step": 8208 }, { "epoch": 3.056252981349839, "grad_norm": 0.19156776368618011, "learning_rate": 1.6401356354394934e-05, "loss": 1.1445, "step": 8209 }, { "epoch": 3.0566252865004477, "grad_norm": 0.17017942667007446, "learning_rate": 1.6400422625891493e-05, "loss": 1.1643, "step": 8210 }, { "epoch": 3.056997591651057, "grad_norm": 0.1759144514799118, "learning_rate": 1.6399488802854214e-05, "loss": 1.1651, "step": 8211 }, { "epoch": 3.0573698968016663, "grad_norm": 0.17706505954265594, "learning_rate": 1.6398554885296888e-05, "loss": 1.1491, "step": 8212 }, { "epoch": 3.057742201952275, "grad_norm": 0.16222470998764038, "learning_rate": 1.6397620873233304e-05, "loss": 1.1636, "step": 8213 }, { "epoch": 3.0581145071028843, "grad_norm": 0.21946337819099426, "learning_rate": 1.6396686766677263e-05, "loss": 1.1623, "step": 8214 }, { "epoch": 3.058486812253493, "grad_norm": 0.16888217628002167, "learning_rate": 1.639575256564256e-05, "loss": 1.1594, "step": 8215 }, { "epoch": 3.0588591174041024, "grad_norm": 0.16997283697128296, "learning_rate": 1.6394818270142995e-05, "loss": 1.1507, "step": 8216 }, { "epoch": 3.0592314225547113, "grad_norm": 0.16864870488643646, "learning_rate": 1.6393883880192362e-05, "loss": 1.1613, "step": 8217 }, { "epoch": 3.0596037277053205, "grad_norm": 0.16163980960845947, "learning_rate": 1.6392949395804464e-05, "loss": 1.1675, "step": 8218 }, { "epoch": 3.0599760328559293, "grad_norm": 0.16427217423915863, "learning_rate": 1.639201481699311e-05, "loss": 1.1476, "step": 8219 }, { "epoch": 3.0603483380065386, "grad_norm": 0.16542796790599823, "learning_rate": 1.6391080143772094e-05, "loss": 1.1446, "step": 8220 }, { "epoch": 3.060720643157148, "grad_norm": 0.1682531237602234, "learning_rate": 1.639014537615523e-05, "loss": 1.1564, "step": 8221 }, { "epoch": 3.0610929483077567, "grad_norm": 0.16446895897388458, "learning_rate": 1.6389210514156317e-05, "loss": 1.1661, "step": 8222 }, { "epoch": 3.061465253458366, "grad_norm": 0.16606532037258148, "learning_rate": 1.6388275557789165e-05, "loss": 1.1658, "step": 8223 }, { "epoch": 3.0618375586089748, "grad_norm": 0.16892418265342712, "learning_rate": 1.6387340507067584e-05, "loss": 1.1503, "step": 8224 }, { "epoch": 3.062209863759584, "grad_norm": 0.16336201131343842, "learning_rate": 1.6386405362005385e-05, "loss": 1.1647, "step": 8225 }, { "epoch": 3.062582168910193, "grad_norm": 0.15720415115356445, "learning_rate": 1.638547012261638e-05, "loss": 1.151, "step": 8226 }, { "epoch": 3.062954474060802, "grad_norm": 0.1673574596643448, "learning_rate": 1.6384534788914383e-05, "loss": 1.1508, "step": 8227 }, { "epoch": 3.063326779211411, "grad_norm": 0.1641821265220642, "learning_rate": 1.6383599360913204e-05, "loss": 1.1522, "step": 8228 }, { "epoch": 3.06369908436202, "grad_norm": 0.17019106447696686, "learning_rate": 1.6382663838626667e-05, "loss": 1.1652, "step": 8229 }, { "epoch": 3.0640713895126295, "grad_norm": 0.17099004983901978, "learning_rate": 1.6381728222068585e-05, "loss": 1.1716, "step": 8230 }, { "epoch": 3.0644436946632383, "grad_norm": 0.16094860434532166, "learning_rate": 1.6380792511252775e-05, "loss": 1.1498, "step": 8231 }, { "epoch": 3.0648159998138476, "grad_norm": 0.16481037437915802, "learning_rate": 1.6379856706193064e-05, "loss": 1.1479, "step": 8232 }, { "epoch": 3.0651883049644564, "grad_norm": 0.16703465580940247, "learning_rate": 1.6378920806903265e-05, "loss": 1.1656, "step": 8233 }, { "epoch": 3.0655606101150656, "grad_norm": 0.17205815017223358, "learning_rate": 1.6377984813397212e-05, "loss": 1.1599, "step": 8234 }, { "epoch": 3.0659329152656745, "grad_norm": 0.1638091653585434, "learning_rate": 1.637704872568872e-05, "loss": 1.1548, "step": 8235 }, { "epoch": 3.0663052204162837, "grad_norm": 0.16245143115520477, "learning_rate": 1.6376112543791622e-05, "loss": 1.1547, "step": 8236 }, { "epoch": 3.0666775255668925, "grad_norm": 0.17442138493061066, "learning_rate": 1.6375176267719735e-05, "loss": 1.1526, "step": 8237 }, { "epoch": 3.067049830717502, "grad_norm": 0.1698945164680481, "learning_rate": 1.63742398974869e-05, "loss": 1.1514, "step": 8238 }, { "epoch": 3.067422135868111, "grad_norm": 0.17009849846363068, "learning_rate": 1.6373303433106936e-05, "loss": 1.1399, "step": 8239 }, { "epoch": 3.06779444101872, "grad_norm": 0.1673266440629959, "learning_rate": 1.6372366874593688e-05, "loss": 1.1445, "step": 8240 }, { "epoch": 3.068166746169329, "grad_norm": 0.16925260424613953, "learning_rate": 1.6371430221960975e-05, "loss": 1.1528, "step": 8241 }, { "epoch": 3.068539051319938, "grad_norm": 0.17465613782405853, "learning_rate": 1.637049347522264e-05, "loss": 1.159, "step": 8242 }, { "epoch": 3.0689113564705472, "grad_norm": 0.17698585987091064, "learning_rate": 1.636955663439252e-05, "loss": 1.1602, "step": 8243 }, { "epoch": 3.069283661621156, "grad_norm": 0.17667868733406067, "learning_rate": 1.6368619699484446e-05, "loss": 1.158, "step": 8244 }, { "epoch": 3.0696559667717653, "grad_norm": 0.17002210021018982, "learning_rate": 1.6367682670512253e-05, "loss": 1.1616, "step": 8245 }, { "epoch": 3.070028271922374, "grad_norm": 0.18032929301261902, "learning_rate": 1.636674554748979e-05, "loss": 1.1825, "step": 8246 }, { "epoch": 3.0704005770729834, "grad_norm": 0.16729597747325897, "learning_rate": 1.6365808330430897e-05, "loss": 1.161, "step": 8247 }, { "epoch": 3.0707728822235927, "grad_norm": 0.188649520277977, "learning_rate": 1.6364871019349414e-05, "loss": 1.1605, "step": 8248 }, { "epoch": 3.0711451873742015, "grad_norm": 0.1881103366613388, "learning_rate": 1.6363933614259184e-05, "loss": 1.1416, "step": 8249 }, { "epoch": 3.0715174925248108, "grad_norm": 0.1650039553642273, "learning_rate": 1.6362996115174056e-05, "loss": 1.1556, "step": 8250 }, { "epoch": 3.0718897976754196, "grad_norm": 0.19066570699214935, "learning_rate": 1.6362058522107872e-05, "loss": 1.1541, "step": 8251 }, { "epoch": 3.072262102826029, "grad_norm": 0.19495618343353271, "learning_rate": 1.6361120835074485e-05, "loss": 1.1702, "step": 8252 }, { "epoch": 3.0726344079766377, "grad_norm": 0.1677856743335724, "learning_rate": 1.636018305408774e-05, "loss": 1.1482, "step": 8253 }, { "epoch": 3.073006713127247, "grad_norm": 0.22052864730358124, "learning_rate": 1.6359245179161492e-05, "loss": 1.1543, "step": 8254 }, { "epoch": 3.073379018277856, "grad_norm": 0.17813050746917725, "learning_rate": 1.6358307210309595e-05, "loss": 1.1654, "step": 8255 }, { "epoch": 3.073751323428465, "grad_norm": 0.18808187544345856, "learning_rate": 1.6357369147545894e-05, "loss": 1.165, "step": 8256 }, { "epoch": 3.0741236285790743, "grad_norm": 0.16268914937973022, "learning_rate": 1.635643099088425e-05, "loss": 1.1691, "step": 8257 }, { "epoch": 3.074495933729683, "grad_norm": 0.19553907215595245, "learning_rate": 1.6355492740338523e-05, "loss": 1.1465, "step": 8258 }, { "epoch": 3.0748682388802924, "grad_norm": 0.1695629507303238, "learning_rate": 1.6354554395922564e-05, "loss": 1.1672, "step": 8259 }, { "epoch": 3.075240544030901, "grad_norm": 0.1807146817445755, "learning_rate": 1.635361595765024e-05, "loss": 1.1486, "step": 8260 }, { "epoch": 3.0756128491815105, "grad_norm": 0.1665555089712143, "learning_rate": 1.63526774255354e-05, "loss": 1.1478, "step": 8261 }, { "epoch": 3.0759851543321193, "grad_norm": 0.17835910618305206, "learning_rate": 1.6351738799591918e-05, "loss": 1.1594, "step": 8262 }, { "epoch": 3.0763574594827285, "grad_norm": 0.17160485684871674, "learning_rate": 1.635080007983365e-05, "loss": 1.1574, "step": 8263 }, { "epoch": 3.076729764633338, "grad_norm": 0.16415171325206757, "learning_rate": 1.6349861266274467e-05, "loss": 1.1511, "step": 8264 }, { "epoch": 3.0771020697839466, "grad_norm": 0.16749624907970428, "learning_rate": 1.6348922358928228e-05, "loss": 1.148, "step": 8265 }, { "epoch": 3.077474374934556, "grad_norm": 0.1690567582845688, "learning_rate": 1.6347983357808804e-05, "loss": 1.1539, "step": 8266 }, { "epoch": 3.0778466800851647, "grad_norm": 0.17391081154346466, "learning_rate": 1.6347044262930067e-05, "loss": 1.1471, "step": 8267 }, { "epoch": 3.078218985235774, "grad_norm": 0.17175255715847015, "learning_rate": 1.6346105074305884e-05, "loss": 1.1618, "step": 8268 }, { "epoch": 3.078591290386383, "grad_norm": 0.1740272045135498, "learning_rate": 1.6345165791950125e-05, "loss": 1.1542, "step": 8269 }, { "epoch": 3.078963595536992, "grad_norm": 0.19044634699821472, "learning_rate": 1.634422641587667e-05, "loss": 1.1599, "step": 8270 }, { "epoch": 3.079335900687601, "grad_norm": 0.1639656275510788, "learning_rate": 1.6343286946099385e-05, "loss": 1.1623, "step": 8271 }, { "epoch": 3.07970820583821, "grad_norm": 0.17656481266021729, "learning_rate": 1.6342347382632155e-05, "loss": 1.1523, "step": 8272 }, { "epoch": 3.0800805109888194, "grad_norm": 0.15982165932655334, "learning_rate": 1.6341407725488844e-05, "loss": 1.1475, "step": 8273 }, { "epoch": 3.0804528161394282, "grad_norm": 0.17611384391784668, "learning_rate": 1.6340467974683344e-05, "loss": 1.1676, "step": 8274 }, { "epoch": 3.0808251212900375, "grad_norm": 0.17255373299121857, "learning_rate": 1.633952813022953e-05, "loss": 1.1819, "step": 8275 }, { "epoch": 3.0811974264406463, "grad_norm": 0.17750908434391022, "learning_rate": 1.633858819214128e-05, "loss": 1.1745, "step": 8276 }, { "epoch": 3.0815697315912556, "grad_norm": 0.21422599256038666, "learning_rate": 1.6337648160432484e-05, "loss": 1.1535, "step": 8277 }, { "epoch": 3.0819420367418644, "grad_norm": 0.17324034869670868, "learning_rate": 1.633670803511702e-05, "loss": 1.1591, "step": 8278 }, { "epoch": 3.0823143418924737, "grad_norm": 0.18860980868339539, "learning_rate": 1.6335767816208775e-05, "loss": 1.1505, "step": 8279 }, { "epoch": 3.0826866470430825, "grad_norm": 0.20710186660289764, "learning_rate": 1.633482750372164e-05, "loss": 1.1695, "step": 8280 }, { "epoch": 3.0830589521936917, "grad_norm": 0.16789516806602478, "learning_rate": 1.63338870976695e-05, "loss": 1.1677, "step": 8281 }, { "epoch": 3.083431257344301, "grad_norm": 0.1649201661348343, "learning_rate": 1.6332946598066244e-05, "loss": 1.1482, "step": 8282 }, { "epoch": 3.08380356249491, "grad_norm": 0.16981996595859528, "learning_rate": 1.6332006004925763e-05, "loss": 1.1704, "step": 8283 }, { "epoch": 3.084175867645519, "grad_norm": 0.16521765291690826, "learning_rate": 1.6331065318261955e-05, "loss": 1.1556, "step": 8284 }, { "epoch": 3.084548172796128, "grad_norm": 0.16958005726337433, "learning_rate": 1.6330124538088705e-05, "loss": 1.1542, "step": 8285 }, { "epoch": 3.084920477946737, "grad_norm": 0.1606731116771698, "learning_rate": 1.6329183664419918e-05, "loss": 1.1508, "step": 8286 }, { "epoch": 3.085292783097346, "grad_norm": 0.166421040892601, "learning_rate": 1.6328242697269478e-05, "loss": 1.1642, "step": 8287 }, { "epoch": 3.0856650882479553, "grad_norm": 0.1679898500442505, "learning_rate": 1.6327301636651296e-05, "loss": 1.1504, "step": 8288 }, { "epoch": 3.086037393398564, "grad_norm": 0.1639619916677475, "learning_rate": 1.6326360482579265e-05, "loss": 1.158, "step": 8289 }, { "epoch": 3.0864096985491734, "grad_norm": 0.16645826399326324, "learning_rate": 1.6325419235067286e-05, "loss": 1.151, "step": 8290 }, { "epoch": 3.0867820036997826, "grad_norm": 0.1687537580728531, "learning_rate": 1.6324477894129263e-05, "loss": 1.1579, "step": 8291 }, { "epoch": 3.0871543088503914, "grad_norm": 0.16648247838020325, "learning_rate": 1.6323536459779098e-05, "loss": 1.1513, "step": 8292 }, { "epoch": 3.0875266140010007, "grad_norm": 0.16747455298900604, "learning_rate": 1.6322594932030697e-05, "loss": 1.1528, "step": 8293 }, { "epoch": 3.0878989191516095, "grad_norm": 0.20737069845199585, "learning_rate": 1.632165331089796e-05, "loss": 1.1452, "step": 8294 }, { "epoch": 3.088271224302219, "grad_norm": 0.3523947298526764, "learning_rate": 1.6320711596394805e-05, "loss": 1.1623, "step": 8295 }, { "epoch": 3.0886435294528276, "grad_norm": 0.21294717490673065, "learning_rate": 1.6319769788535135e-05, "loss": 1.1556, "step": 8296 }, { "epoch": 3.089015834603437, "grad_norm": 0.18315008282661438, "learning_rate": 1.6318827887332865e-05, "loss": 1.1677, "step": 8297 }, { "epoch": 3.089388139754046, "grad_norm": 0.1660303920507431, "learning_rate": 1.6317885892801902e-05, "loss": 1.164, "step": 8298 }, { "epoch": 3.089760444904655, "grad_norm": 0.1686762571334839, "learning_rate": 1.631694380495616e-05, "loss": 1.1713, "step": 8299 }, { "epoch": 3.090132750055264, "grad_norm": 0.1759401112794876, "learning_rate": 1.6316001623809557e-05, "loss": 1.1509, "step": 8300 }, { "epoch": 3.090505055205873, "grad_norm": 0.17869412899017334, "learning_rate": 1.6315059349376002e-05, "loss": 1.1608, "step": 8301 }, { "epoch": 3.0908773603564823, "grad_norm": 0.17332401871681213, "learning_rate": 1.6314116981669418e-05, "loss": 1.1478, "step": 8302 }, { "epoch": 3.091249665507091, "grad_norm": 0.1675678789615631, "learning_rate": 1.6313174520703727e-05, "loss": 1.166, "step": 8303 }, { "epoch": 3.0916219706577004, "grad_norm": 0.1689613312482834, "learning_rate": 1.631223196649284e-05, "loss": 1.1671, "step": 8304 }, { "epoch": 3.091994275808309, "grad_norm": 0.16656659543514252, "learning_rate": 1.631128931905068e-05, "loss": 1.1671, "step": 8305 }, { "epoch": 3.0923665809589185, "grad_norm": 0.16922014951705933, "learning_rate": 1.631034657839118e-05, "loss": 1.1645, "step": 8306 }, { "epoch": 3.0927388861095277, "grad_norm": 0.169945627450943, "learning_rate": 1.6309403744528254e-05, "loss": 1.1565, "step": 8307 }, { "epoch": 3.0931111912601366, "grad_norm": 0.1612582504749298, "learning_rate": 1.630846081747583e-05, "loss": 1.1604, "step": 8308 }, { "epoch": 3.093483496410746, "grad_norm": 0.16429011523723602, "learning_rate": 1.6307517797247836e-05, "loss": 1.1472, "step": 8309 }, { "epoch": 3.0938558015613546, "grad_norm": 0.1718926727771759, "learning_rate": 1.63065746838582e-05, "loss": 1.1568, "step": 8310 }, { "epoch": 3.094228106711964, "grad_norm": 0.16393272578716278, "learning_rate": 1.6305631477320853e-05, "loss": 1.1553, "step": 8311 }, { "epoch": 3.0946004118625727, "grad_norm": 0.166838601231575, "learning_rate": 1.6304688177649725e-05, "loss": 1.1584, "step": 8312 }, { "epoch": 3.094972717013182, "grad_norm": 0.16786356270313263, "learning_rate": 1.6303744784858745e-05, "loss": 1.1563, "step": 8313 }, { "epoch": 3.095345022163791, "grad_norm": 0.16941292583942413, "learning_rate": 1.6302801298961853e-05, "loss": 1.158, "step": 8314 }, { "epoch": 3.0957173273144, "grad_norm": 0.16948345303535461, "learning_rate": 1.6301857719972977e-05, "loss": 1.1635, "step": 8315 }, { "epoch": 3.0960896324650093, "grad_norm": 0.16545285284519196, "learning_rate": 1.6300914047906063e-05, "loss": 1.1388, "step": 8316 }, { "epoch": 3.096461937615618, "grad_norm": 0.1651238054037094, "learning_rate": 1.6299970282775046e-05, "loss": 1.1709, "step": 8317 }, { "epoch": 3.0968342427662274, "grad_norm": 0.16604574024677277, "learning_rate": 1.6299026424593858e-05, "loss": 1.1495, "step": 8318 }, { "epoch": 3.0972065479168363, "grad_norm": 0.16288162767887115, "learning_rate": 1.6298082473376444e-05, "loss": 1.149, "step": 8319 }, { "epoch": 3.0975788530674455, "grad_norm": 0.16391494870185852, "learning_rate": 1.629713842913675e-05, "loss": 1.1536, "step": 8320 }, { "epoch": 3.0979511582180543, "grad_norm": 0.16908983886241913, "learning_rate": 1.6296194291888718e-05, "loss": 1.1626, "step": 8321 }, { "epoch": 3.0983234633686636, "grad_norm": 0.16984610259532928, "learning_rate": 1.6295250061646292e-05, "loss": 1.1611, "step": 8322 }, { "epoch": 3.0986957685192724, "grad_norm": 0.16279767453670502, "learning_rate": 1.6294305738423413e-05, "loss": 1.1429, "step": 8323 }, { "epoch": 3.0990680736698817, "grad_norm": 0.16995452344417572, "learning_rate": 1.6293361322234036e-05, "loss": 1.154, "step": 8324 }, { "epoch": 3.099440378820491, "grad_norm": 0.1628962606191635, "learning_rate": 1.6292416813092107e-05, "loss": 1.1576, "step": 8325 }, { "epoch": 3.0998126839710998, "grad_norm": 0.16195490956306458, "learning_rate": 1.6291472211011575e-05, "loss": 1.1549, "step": 8326 }, { "epoch": 3.100184989121709, "grad_norm": 0.16598109900951385, "learning_rate": 1.6290527516006396e-05, "loss": 1.1605, "step": 8327 }, { "epoch": 3.100557294272318, "grad_norm": 0.16446641087532043, "learning_rate": 1.628958272809052e-05, "loss": 1.163, "step": 8328 }, { "epoch": 3.100929599422927, "grad_norm": 0.1585415005683899, "learning_rate": 1.62886378472779e-05, "loss": 1.1575, "step": 8329 }, { "epoch": 3.101301904573536, "grad_norm": 0.16187165677547455, "learning_rate": 1.6287692873582495e-05, "loss": 1.1461, "step": 8330 }, { "epoch": 3.101674209724145, "grad_norm": 0.1652093529701233, "learning_rate": 1.628674780701826e-05, "loss": 1.1509, "step": 8331 }, { "epoch": 3.102046514874754, "grad_norm": 0.16078272461891174, "learning_rate": 1.6285802647599156e-05, "loss": 1.1402, "step": 8332 }, { "epoch": 3.1024188200253633, "grad_norm": 0.1659494936466217, "learning_rate": 1.6284857395339143e-05, "loss": 1.1555, "step": 8333 }, { "epoch": 3.1027911251759726, "grad_norm": 0.16813765466213226, "learning_rate": 1.6283912050252176e-05, "loss": 1.1623, "step": 8334 }, { "epoch": 3.1031634303265814, "grad_norm": 0.16813194751739502, "learning_rate": 1.6282966612352224e-05, "loss": 1.1508, "step": 8335 }, { "epoch": 3.1035357354771906, "grad_norm": 0.16411879658699036, "learning_rate": 1.628202108165325e-05, "loss": 1.1743, "step": 8336 }, { "epoch": 3.1039080406277995, "grad_norm": 0.17190620303153992, "learning_rate": 1.628107545816922e-05, "loss": 1.1595, "step": 8337 }, { "epoch": 3.1042803457784087, "grad_norm": 0.16098536550998688, "learning_rate": 1.6280129741914098e-05, "loss": 1.1521, "step": 8338 }, { "epoch": 3.1046526509290175, "grad_norm": 0.16388416290283203, "learning_rate": 1.6279183932901853e-05, "loss": 1.164, "step": 8339 }, { "epoch": 3.105024956079627, "grad_norm": 0.16218237578868866, "learning_rate": 1.627823803114646e-05, "loss": 1.1579, "step": 8340 }, { "epoch": 3.1053972612302356, "grad_norm": 0.16982564330101013, "learning_rate": 1.627729203666188e-05, "loss": 1.1598, "step": 8341 }, { "epoch": 3.105769566380845, "grad_norm": 0.16258504986763, "learning_rate": 1.627634594946209e-05, "loss": 1.1694, "step": 8342 }, { "epoch": 3.106141871531454, "grad_norm": 0.16323330998420715, "learning_rate": 1.6275399769561068e-05, "loss": 1.1582, "step": 8343 }, { "epoch": 3.106514176682063, "grad_norm": 0.16780774295330048, "learning_rate": 1.6274453496972783e-05, "loss": 1.1605, "step": 8344 }, { "epoch": 3.1068864818326722, "grad_norm": 0.1661432832479477, "learning_rate": 1.6273507131711216e-05, "loss": 1.1553, "step": 8345 }, { "epoch": 3.107258786983281, "grad_norm": 0.16441033780574799, "learning_rate": 1.627256067379034e-05, "loss": 1.1619, "step": 8346 }, { "epoch": 3.1076310921338903, "grad_norm": 0.16335833072662354, "learning_rate": 1.6271614123224137e-05, "loss": 1.151, "step": 8347 }, { "epoch": 3.108003397284499, "grad_norm": 0.16474972665309906, "learning_rate": 1.6270667480026588e-05, "loss": 1.1616, "step": 8348 }, { "epoch": 3.1083757024351084, "grad_norm": 0.17309875786304474, "learning_rate": 1.6269720744211675e-05, "loss": 1.1736, "step": 8349 }, { "epoch": 3.1087480075857172, "grad_norm": 0.16838182508945465, "learning_rate": 1.6268773915793376e-05, "loss": 1.1496, "step": 8350 }, { "epoch": 3.1091203127363265, "grad_norm": 0.1622922271490097, "learning_rate": 1.6267826994785683e-05, "loss": 1.1562, "step": 8351 }, { "epoch": 3.1094926178869358, "grad_norm": 0.16335055232048035, "learning_rate": 1.6266879981202577e-05, "loss": 1.1565, "step": 8352 }, { "epoch": 3.1098649230375446, "grad_norm": 0.16523607075214386, "learning_rate": 1.626593287505805e-05, "loss": 1.1497, "step": 8353 }, { "epoch": 3.110237228188154, "grad_norm": 0.16475920379161835, "learning_rate": 1.6264985676366085e-05, "loss": 1.1481, "step": 8354 }, { "epoch": 3.1106095333387627, "grad_norm": 0.16444498300552368, "learning_rate": 1.6264038385140676e-05, "loss": 1.1676, "step": 8355 }, { "epoch": 3.110981838489372, "grad_norm": 0.16481232643127441, "learning_rate": 1.6263091001395808e-05, "loss": 1.1595, "step": 8356 }, { "epoch": 3.1113541436399808, "grad_norm": 0.16143718361854553, "learning_rate": 1.6262143525145485e-05, "loss": 1.143, "step": 8357 }, { "epoch": 3.11172644879059, "grad_norm": 0.16786842048168182, "learning_rate": 1.6261195956403694e-05, "loss": 1.159, "step": 8358 }, { "epoch": 3.112098753941199, "grad_norm": 0.163263738155365, "learning_rate": 1.626024829518443e-05, "loss": 1.1516, "step": 8359 }, { "epoch": 3.112471059091808, "grad_norm": 0.16883710026741028, "learning_rate": 1.6259300541501694e-05, "loss": 1.1651, "step": 8360 }, { "epoch": 3.1128433642424174, "grad_norm": 0.16147491335868835, "learning_rate": 1.6258352695369478e-05, "loss": 1.1586, "step": 8361 }, { "epoch": 3.113215669393026, "grad_norm": 0.15874330699443817, "learning_rate": 1.625740475680179e-05, "loss": 1.1513, "step": 8362 }, { "epoch": 3.1135879745436355, "grad_norm": 0.16957154870033264, "learning_rate": 1.6256456725812625e-05, "loss": 1.1636, "step": 8363 }, { "epoch": 3.1139602796942443, "grad_norm": 0.1629595011472702, "learning_rate": 1.6255508602415987e-05, "loss": 1.1588, "step": 8364 }, { "epoch": 3.1143325848448535, "grad_norm": 0.1647290140390396, "learning_rate": 1.6254560386625874e-05, "loss": 1.1537, "step": 8365 }, { "epoch": 3.1147048899954624, "grad_norm": 0.1659194529056549, "learning_rate": 1.6253612078456304e-05, "loss": 1.1602, "step": 8366 }, { "epoch": 3.1150771951460716, "grad_norm": 0.16474999487400055, "learning_rate": 1.625266367792127e-05, "loss": 1.152, "step": 8367 }, { "epoch": 3.1154495002966804, "grad_norm": 0.1607973873615265, "learning_rate": 1.6251715185034795e-05, "loss": 1.1555, "step": 8368 }, { "epoch": 3.1158218054472897, "grad_norm": 0.16143812239170074, "learning_rate": 1.625076659981087e-05, "loss": 1.149, "step": 8369 }, { "epoch": 3.116194110597899, "grad_norm": 0.16376273334026337, "learning_rate": 1.6249817922263518e-05, "loss": 1.1621, "step": 8370 }, { "epoch": 3.116566415748508, "grad_norm": 0.1660854071378708, "learning_rate": 1.6248869152406745e-05, "loss": 1.1475, "step": 8371 }, { "epoch": 3.116938720899117, "grad_norm": 0.1652403473854065, "learning_rate": 1.624792029025457e-05, "loss": 1.1496, "step": 8372 }, { "epoch": 3.117311026049726, "grad_norm": 0.1686633676290512, "learning_rate": 1.6246971335821004e-05, "loss": 1.1589, "step": 8373 }, { "epoch": 3.117683331200335, "grad_norm": 0.1678876131772995, "learning_rate": 1.6246022289120063e-05, "loss": 1.1533, "step": 8374 }, { "epoch": 3.118055636350944, "grad_norm": 0.1635279506444931, "learning_rate": 1.6245073150165766e-05, "loss": 1.156, "step": 8375 }, { "epoch": 3.1184279415015532, "grad_norm": 0.16207703948020935, "learning_rate": 1.624412391897213e-05, "loss": 1.1726, "step": 8376 }, { "epoch": 3.1188002466521625, "grad_norm": 0.1617921143770218, "learning_rate": 1.6243174595553174e-05, "loss": 1.1607, "step": 8377 }, { "epoch": 3.1191725518027713, "grad_norm": 0.16754458844661713, "learning_rate": 1.624222517992292e-05, "loss": 1.163, "step": 8378 }, { "epoch": 3.1195448569533806, "grad_norm": 0.15687395632266998, "learning_rate": 1.6241275672095397e-05, "loss": 1.1651, "step": 8379 }, { "epoch": 3.1199171621039894, "grad_norm": 0.1605893075466156, "learning_rate": 1.6240326072084617e-05, "loss": 1.1457, "step": 8380 }, { "epoch": 3.1202894672545987, "grad_norm": 0.16542315483093262, "learning_rate": 1.6239376379904618e-05, "loss": 1.165, "step": 8381 }, { "epoch": 3.1206617724052075, "grad_norm": 0.16830827295780182, "learning_rate": 1.623842659556942e-05, "loss": 1.1516, "step": 8382 }, { "epoch": 3.1210340775558167, "grad_norm": 0.1646735519170761, "learning_rate": 1.623747671909305e-05, "loss": 1.1631, "step": 8383 }, { "epoch": 3.1214063827064256, "grad_norm": 0.16516734659671783, "learning_rate": 1.6236526750489542e-05, "loss": 1.1675, "step": 8384 }, { "epoch": 3.121778687857035, "grad_norm": 0.16348835825920105, "learning_rate": 1.6235576689772927e-05, "loss": 1.1596, "step": 8385 }, { "epoch": 3.122150993007644, "grad_norm": 0.17139267921447754, "learning_rate": 1.6234626536957235e-05, "loss": 1.1434, "step": 8386 }, { "epoch": 3.122523298158253, "grad_norm": 0.16735519468784332, "learning_rate": 1.62336762920565e-05, "loss": 1.1433, "step": 8387 }, { "epoch": 3.122895603308862, "grad_norm": 0.16513751447200775, "learning_rate": 1.6232725955084756e-05, "loss": 1.1627, "step": 8388 }, { "epoch": 3.123267908459471, "grad_norm": 0.16176214814186096, "learning_rate": 1.6231775526056044e-05, "loss": 1.1773, "step": 8389 }, { "epoch": 3.1236402136100803, "grad_norm": 0.16931404173374176, "learning_rate": 1.6230825004984395e-05, "loss": 1.1647, "step": 8390 }, { "epoch": 3.124012518760689, "grad_norm": 0.16403864324092865, "learning_rate": 1.6229874391883856e-05, "loss": 1.1753, "step": 8391 }, { "epoch": 3.1243848239112983, "grad_norm": 0.16192291676998138, "learning_rate": 1.6228923686768458e-05, "loss": 1.1377, "step": 8392 }, { "epoch": 3.124757129061907, "grad_norm": 0.16515354812145233, "learning_rate": 1.622797288965225e-05, "loss": 1.1599, "step": 8393 }, { "epoch": 3.1251294342125164, "grad_norm": 0.1676434427499771, "learning_rate": 1.6227022000549276e-05, "loss": 1.163, "step": 8394 }, { "epoch": 3.1255017393631257, "grad_norm": 0.16849081218242645, "learning_rate": 1.6226071019473577e-05, "loss": 1.1566, "step": 8395 }, { "epoch": 3.1258740445137345, "grad_norm": 0.16453030705451965, "learning_rate": 1.6225119946439196e-05, "loss": 1.1531, "step": 8396 }, { "epoch": 3.126246349664344, "grad_norm": 0.166966512799263, "learning_rate": 1.622416878146019e-05, "loss": 1.1641, "step": 8397 }, { "epoch": 3.1266186548149526, "grad_norm": 0.1629679799079895, "learning_rate": 1.6223217524550595e-05, "loss": 1.16, "step": 8398 }, { "epoch": 3.126990959965562, "grad_norm": 0.16159994900226593, "learning_rate": 1.6222266175724472e-05, "loss": 1.1775, "step": 8399 }, { "epoch": 3.1273632651161707, "grad_norm": 0.16298353672027588, "learning_rate": 1.6221314734995867e-05, "loss": 1.1664, "step": 8400 }, { "epoch": 3.12773557026678, "grad_norm": 0.16081856191158295, "learning_rate": 1.622036320237883e-05, "loss": 1.1433, "step": 8401 }, { "epoch": 3.1281078754173888, "grad_norm": 0.17165522277355194, "learning_rate": 1.6219411577887428e-05, "loss": 1.1592, "step": 8402 }, { "epoch": 3.128480180567998, "grad_norm": 0.16671797633171082, "learning_rate": 1.62184598615357e-05, "loss": 1.1508, "step": 8403 }, { "epoch": 3.1288524857186073, "grad_norm": 0.16210219264030457, "learning_rate": 1.6217508053337713e-05, "loss": 1.1595, "step": 8404 }, { "epoch": 3.129224790869216, "grad_norm": 0.17077292501926422, "learning_rate": 1.6216556153307518e-05, "loss": 1.1484, "step": 8405 }, { "epoch": 3.1295970960198254, "grad_norm": 0.16143833100795746, "learning_rate": 1.6215604161459183e-05, "loss": 1.1785, "step": 8406 }, { "epoch": 3.129969401170434, "grad_norm": 0.18213441967964172, "learning_rate": 1.6214652077806764e-05, "loss": 1.1635, "step": 8407 }, { "epoch": 3.1303417063210435, "grad_norm": 0.16605854034423828, "learning_rate": 1.6213699902364325e-05, "loss": 1.1576, "step": 8408 }, { "epoch": 3.1307140114716523, "grad_norm": 0.17813484370708466, "learning_rate": 1.6212747635145928e-05, "loss": 1.1615, "step": 8409 }, { "epoch": 3.1310863166222616, "grad_norm": 0.16783683001995087, "learning_rate": 1.6211795276165635e-05, "loss": 1.1513, "step": 8410 }, { "epoch": 3.131458621772871, "grad_norm": 0.1840137392282486, "learning_rate": 1.621084282543752e-05, "loss": 1.1596, "step": 8411 }, { "epoch": 3.1318309269234796, "grad_norm": 0.1757200062274933, "learning_rate": 1.6209890282975644e-05, "loss": 1.1677, "step": 8412 }, { "epoch": 3.132203232074089, "grad_norm": 0.17295728623867035, "learning_rate": 1.6208937648794076e-05, "loss": 1.1536, "step": 8413 }, { "epoch": 3.1325755372246977, "grad_norm": 0.1669415831565857, "learning_rate": 1.6207984922906893e-05, "loss": 1.1625, "step": 8414 }, { "epoch": 3.132947842375307, "grad_norm": 0.18558120727539062, "learning_rate": 1.620703210532816e-05, "loss": 1.1613, "step": 8415 }, { "epoch": 3.133320147525916, "grad_norm": 0.16660122573375702, "learning_rate": 1.6206079196071952e-05, "loss": 1.1513, "step": 8416 }, { "epoch": 3.133692452676525, "grad_norm": 0.17779971659183502, "learning_rate": 1.6205126195152345e-05, "loss": 1.1553, "step": 8417 }, { "epoch": 3.134064757827134, "grad_norm": 0.17245428264141083, "learning_rate": 1.620417310258341e-05, "loss": 1.1591, "step": 8418 }, { "epoch": 3.134437062977743, "grad_norm": 0.179290309548378, "learning_rate": 1.620321991837923e-05, "loss": 1.1569, "step": 8419 }, { "epoch": 3.1348093681283524, "grad_norm": 0.16428595781326294, "learning_rate": 1.6202266642553884e-05, "loss": 1.1522, "step": 8420 }, { "epoch": 3.1351816732789612, "grad_norm": 0.1769377738237381, "learning_rate": 1.6201313275121447e-05, "loss": 1.1554, "step": 8421 }, { "epoch": 3.1355539784295705, "grad_norm": 0.16289122402668, "learning_rate": 1.6200359816096e-05, "loss": 1.1597, "step": 8422 }, { "epoch": 3.1359262835801793, "grad_norm": 0.1719742715358734, "learning_rate": 1.619940626549163e-05, "loss": 1.1637, "step": 8423 }, { "epoch": 3.1362985887307886, "grad_norm": 0.16820377111434937, "learning_rate": 1.619845262332242e-05, "loss": 1.1467, "step": 8424 }, { "epoch": 3.1366708938813974, "grad_norm": 0.16198331117630005, "learning_rate": 1.619749888960245e-05, "loss": 1.1575, "step": 8425 }, { "epoch": 3.1370431990320067, "grad_norm": 0.17046979069709778, "learning_rate": 1.6196545064345813e-05, "loss": 1.1583, "step": 8426 }, { "epoch": 3.1374155041826155, "grad_norm": 0.15980608761310577, "learning_rate": 1.619559114756659e-05, "loss": 1.1496, "step": 8427 }, { "epoch": 3.1377878093332248, "grad_norm": 0.1674344688653946, "learning_rate": 1.619463713927888e-05, "loss": 1.1611, "step": 8428 }, { "epoch": 3.138160114483834, "grad_norm": 0.17162348330020905, "learning_rate": 1.6193683039496768e-05, "loss": 1.1675, "step": 8429 }, { "epoch": 3.138532419634443, "grad_norm": 0.16896787285804749, "learning_rate": 1.6192728848234343e-05, "loss": 1.1662, "step": 8430 }, { "epoch": 3.138904724785052, "grad_norm": 0.17117677628993988, "learning_rate": 1.6191774565505703e-05, "loss": 1.1653, "step": 8431 }, { "epoch": 3.139277029935661, "grad_norm": 0.16469299793243408, "learning_rate": 1.619082019132494e-05, "loss": 1.1709, "step": 8432 }, { "epoch": 3.13964933508627, "grad_norm": 0.18532411754131317, "learning_rate": 1.618986572570615e-05, "loss": 1.1446, "step": 8433 }, { "epoch": 3.140021640236879, "grad_norm": 0.16900207102298737, "learning_rate": 1.6188911168663433e-05, "loss": 1.1662, "step": 8434 }, { "epoch": 3.1403939453874883, "grad_norm": 0.17637377977371216, "learning_rate": 1.6187956520210893e-05, "loss": 1.1558, "step": 8435 }, { "epoch": 3.140766250538097, "grad_norm": 0.16088640689849854, "learning_rate": 1.6187001780362613e-05, "loss": 1.1539, "step": 8436 }, { "epoch": 3.1411385556887064, "grad_norm": 0.18611860275268555, "learning_rate": 1.6186046949132713e-05, "loss": 1.1575, "step": 8437 }, { "epoch": 3.1415108608393156, "grad_norm": 0.17658965289592743, "learning_rate": 1.6185092026535286e-05, "loss": 1.1652, "step": 8438 }, { "epoch": 3.1418831659899245, "grad_norm": 0.19135300815105438, "learning_rate": 1.6184137012584434e-05, "loss": 1.1432, "step": 8439 }, { "epoch": 3.1422554711405337, "grad_norm": 0.16602368652820587, "learning_rate": 1.618318190729427e-05, "loss": 1.1599, "step": 8440 }, { "epoch": 3.1426277762911425, "grad_norm": 0.20455126464366913, "learning_rate": 1.6182226710678898e-05, "loss": 1.1602, "step": 8441 }, { "epoch": 3.143000081441752, "grad_norm": 0.18047231435775757, "learning_rate": 1.6181271422752424e-05, "loss": 1.1531, "step": 8442 }, { "epoch": 3.1433723865923606, "grad_norm": 0.17739862203598022, "learning_rate": 1.6180316043528957e-05, "loss": 1.1533, "step": 8443 }, { "epoch": 3.14374469174297, "grad_norm": 0.16755236685276031, "learning_rate": 1.617936057302261e-05, "loss": 1.1491, "step": 8444 }, { "epoch": 3.1441169968935787, "grad_norm": 0.18111351132392883, "learning_rate": 1.61784050112475e-05, "loss": 1.156, "step": 8445 }, { "epoch": 3.144489302044188, "grad_norm": 0.1638672947883606, "learning_rate": 1.617744935821773e-05, "loss": 1.1649, "step": 8446 }, { "epoch": 3.1448616071947972, "grad_norm": 0.17401021718978882, "learning_rate": 1.6176493613947425e-05, "loss": 1.1624, "step": 8447 }, { "epoch": 3.145233912345406, "grad_norm": 0.16585645079612732, "learning_rate": 1.6175537778450694e-05, "loss": 1.1537, "step": 8448 }, { "epoch": 3.1456062174960153, "grad_norm": 0.18722714483737946, "learning_rate": 1.6174581851741658e-05, "loss": 1.1663, "step": 8449 }, { "epoch": 3.145978522646624, "grad_norm": 0.16669632494449615, "learning_rate": 1.6173625833834438e-05, "loss": 1.143, "step": 8450 }, { "epoch": 3.1463508277972334, "grad_norm": 0.1624676138162613, "learning_rate": 1.617266972474315e-05, "loss": 1.1485, "step": 8451 }, { "epoch": 3.1467231329478422, "grad_norm": 0.16883505880832672, "learning_rate": 1.617171352448192e-05, "loss": 1.1538, "step": 8452 }, { "epoch": 3.1470954380984515, "grad_norm": 0.16919681429862976, "learning_rate": 1.6170757233064863e-05, "loss": 1.1483, "step": 8453 }, { "epoch": 3.1474677432490603, "grad_norm": 0.18415765464305878, "learning_rate": 1.6169800850506113e-05, "loss": 1.1692, "step": 8454 }, { "epoch": 3.1478400483996696, "grad_norm": 0.16374878585338593, "learning_rate": 1.616884437681979e-05, "loss": 1.146, "step": 8455 }, { "epoch": 3.148212353550279, "grad_norm": 0.16538135707378387, "learning_rate": 1.6167887812020023e-05, "loss": 1.1619, "step": 8456 }, { "epoch": 3.1485846587008877, "grad_norm": 0.1735272854566574, "learning_rate": 1.6166931156120943e-05, "loss": 1.1598, "step": 8457 }, { "epoch": 3.148956963851497, "grad_norm": 0.16497787833213806, "learning_rate": 1.6165974409136673e-05, "loss": 1.1705, "step": 8458 }, { "epoch": 3.1493292690021057, "grad_norm": 0.1695416122674942, "learning_rate": 1.6165017571081348e-05, "loss": 1.1711, "step": 8459 }, { "epoch": 3.149701574152715, "grad_norm": 0.1687697172164917, "learning_rate": 1.6164060641969104e-05, "loss": 1.1484, "step": 8460 }, { "epoch": 3.150073879303324, "grad_norm": 0.16305190324783325, "learning_rate": 1.6163103621814065e-05, "loss": 1.1711, "step": 8461 }, { "epoch": 3.150446184453933, "grad_norm": 0.16032063961029053, "learning_rate": 1.616214651063038e-05, "loss": 1.1599, "step": 8462 }, { "epoch": 3.150818489604542, "grad_norm": 0.17928291857242584, "learning_rate": 1.6161189308432174e-05, "loss": 1.1642, "step": 8463 }, { "epoch": 3.151190794755151, "grad_norm": 0.170723557472229, "learning_rate": 1.616023201523359e-05, "loss": 1.1557, "step": 8464 }, { "epoch": 3.1515630999057604, "grad_norm": 0.17493624985218048, "learning_rate": 1.6159274631048763e-05, "loss": 1.137, "step": 8465 }, { "epoch": 3.1519354050563693, "grad_norm": 0.19886091351509094, "learning_rate": 1.6158317155891837e-05, "loss": 1.1471, "step": 8466 }, { "epoch": 3.1523077102069785, "grad_norm": 0.1778782457113266, "learning_rate": 1.6157359589776952e-05, "loss": 1.1636, "step": 8467 }, { "epoch": 3.1526800153575874, "grad_norm": 0.16123712062835693, "learning_rate": 1.6156401932718258e-05, "loss": 1.1593, "step": 8468 }, { "epoch": 3.1530523205081966, "grad_norm": 0.17915739119052887, "learning_rate": 1.6155444184729888e-05, "loss": 1.1455, "step": 8469 }, { "epoch": 3.1534246256588054, "grad_norm": 0.19462330639362335, "learning_rate": 1.6154486345825996e-05, "loss": 1.1657, "step": 8470 }, { "epoch": 3.1537969308094147, "grad_norm": 0.1797885149717331, "learning_rate": 1.6153528416020724e-05, "loss": 1.1494, "step": 8471 }, { "epoch": 3.1541692359600235, "grad_norm": 0.16456390917301178, "learning_rate": 1.6152570395328227e-05, "loss": 1.1628, "step": 8472 }, { "epoch": 3.154541541110633, "grad_norm": 0.16568942368030548, "learning_rate": 1.6151612283762653e-05, "loss": 1.162, "step": 8473 }, { "epoch": 3.154913846261242, "grad_norm": 0.16583888232707977, "learning_rate": 1.6150654081338143e-05, "loss": 1.1659, "step": 8474 }, { "epoch": 3.155286151411851, "grad_norm": 0.15852290391921997, "learning_rate": 1.6149695788068868e-05, "loss": 1.1591, "step": 8475 }, { "epoch": 3.15565845656246, "grad_norm": 0.16142380237579346, "learning_rate": 1.6148737403968967e-05, "loss": 1.1588, "step": 8476 }, { "epoch": 3.156030761713069, "grad_norm": 0.17836064100265503, "learning_rate": 1.6147778929052602e-05, "loss": 1.1708, "step": 8477 }, { "epoch": 3.156403066863678, "grad_norm": 0.1994626820087433, "learning_rate": 1.614682036333393e-05, "loss": 1.153, "step": 8478 }, { "epoch": 3.156775372014287, "grad_norm": 0.21441636979579926, "learning_rate": 1.6145861706827104e-05, "loss": 1.1498, "step": 8479 }, { "epoch": 3.1571476771648963, "grad_norm": 0.19782859086990356, "learning_rate": 1.6144902959546286e-05, "loss": 1.1527, "step": 8480 }, { "epoch": 3.157519982315505, "grad_norm": 0.16160625219345093, "learning_rate": 1.614394412150564e-05, "loss": 1.1526, "step": 8481 }, { "epoch": 3.1578922874661144, "grad_norm": 0.20167799293994904, "learning_rate": 1.614298519271932e-05, "loss": 1.155, "step": 8482 }, { "epoch": 3.1582645926167237, "grad_norm": 0.21661166846752167, "learning_rate": 1.61420261732015e-05, "loss": 1.1462, "step": 8483 }, { "epoch": 3.1586368977673325, "grad_norm": 0.16825565695762634, "learning_rate": 1.6141067062966332e-05, "loss": 1.1623, "step": 8484 }, { "epoch": 3.1590092029179417, "grad_norm": 0.2019083946943283, "learning_rate": 1.6140107862027993e-05, "loss": 1.1671, "step": 8485 }, { "epoch": 3.1593815080685506, "grad_norm": 0.17671798169612885, "learning_rate": 1.6139148570400647e-05, "loss": 1.1589, "step": 8486 }, { "epoch": 3.15975381321916, "grad_norm": 0.17492499947547913, "learning_rate": 1.6138189188098463e-05, "loss": 1.1591, "step": 8487 }, { "epoch": 3.1601261183697686, "grad_norm": 0.16377635300159454, "learning_rate": 1.6137229715135604e-05, "loss": 1.1596, "step": 8488 }, { "epoch": 3.160498423520378, "grad_norm": 0.17293529212474823, "learning_rate": 1.6136270151526254e-05, "loss": 1.1644, "step": 8489 }, { "epoch": 3.1608707286709867, "grad_norm": 0.15997327864170074, "learning_rate": 1.6135310497284575e-05, "loss": 1.141, "step": 8490 }, { "epoch": 3.161243033821596, "grad_norm": 0.16692161560058594, "learning_rate": 1.6134350752424746e-05, "loss": 1.1528, "step": 8491 }, { "epoch": 3.1616153389722053, "grad_norm": 0.17296627163887024, "learning_rate": 1.613339091696094e-05, "loss": 1.1472, "step": 8492 }, { "epoch": 3.161987644122814, "grad_norm": 0.1687869429588318, "learning_rate": 1.6132430990907338e-05, "loss": 1.1503, "step": 8493 }, { "epoch": 3.1623599492734233, "grad_norm": 0.18529605865478516, "learning_rate": 1.6131470974278114e-05, "loss": 1.1723, "step": 8494 }, { "epoch": 3.162732254424032, "grad_norm": 0.1638706773519516, "learning_rate": 1.6130510867087447e-05, "loss": 1.1436, "step": 8495 }, { "epoch": 3.1631045595746414, "grad_norm": 0.18358993530273438, "learning_rate": 1.612955066934952e-05, "loss": 1.1605, "step": 8496 }, { "epoch": 3.1634768647252502, "grad_norm": 0.16975806653499603, "learning_rate": 1.6128590381078516e-05, "loss": 1.1495, "step": 8497 }, { "epoch": 3.1638491698758595, "grad_norm": 0.1780412644147873, "learning_rate": 1.6127630002288615e-05, "loss": 1.1628, "step": 8498 }, { "epoch": 3.1642214750264688, "grad_norm": 0.17131397128105164, "learning_rate": 1.6126669532994003e-05, "loss": 1.1578, "step": 8499 }, { "epoch": 3.1645937801770776, "grad_norm": 0.17910616099834442, "learning_rate": 1.6125708973208868e-05, "loss": 1.1559, "step": 8500 }, { "epoch": 3.1645937801770776, "eval_loss": 1.297155499458313, "eval_runtime": 16.9178, "eval_samples_per_second": 102.496, "eval_steps_per_second": 5.143, "step": 8500 }, { "epoch": 3.1645937801770776, "step": 8500, "total_flos": 1.2178098370248075e+20, "train_loss": 1.2383465781352099, "train_runtime": 245543.4426, "train_samples_per_second": 70.008, "train_steps_per_second": 0.109 } ], "logging_steps": 1, "max_steps": 26850, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2178098370248075e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }