diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,59687 @@ +{ + "best_metric": 1.2935380935668945, + "best_model_checkpoint": "/export/data/salmasia/tradutor/checkpoints/hf_llama3_lora/checkpoint-7000", + "epoch": 3.1645937801770776, + "eval_steps": 500, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00037230515060906794, + "grad_norm": 1.4287878274917603, + "learning_rate": 2e-08, + "loss": 1.8524, + "step": 1 + }, + { + "epoch": 0.0007446103012181359, + "grad_norm": 1.438577651977539, + "learning_rate": 4e-08, + "loss": 1.8624, + "step": 2 + }, + { + "epoch": 0.001116915451827204, + "grad_norm": 1.4175422191619873, + "learning_rate": 6.000000000000001e-08, + "loss": 1.8535, + "step": 3 + }, + { + "epoch": 0.0014892206024362717, + "grad_norm": 1.3822880983352661, + "learning_rate": 8e-08, + "loss": 1.8581, + "step": 4 + }, + { + "epoch": 0.0018615257530453398, + "grad_norm": 1.4041341543197632, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.8706, + "step": 5 + }, + { + "epoch": 0.002233830903654408, + "grad_norm": 1.352337121963501, + "learning_rate": 1.2000000000000002e-07, + "loss": 1.8438, + "step": 6 + }, + { + "epoch": 0.002606136054263476, + "grad_norm": 1.3975361585617065, + "learning_rate": 1.4e-07, + "loss": 1.8411, + "step": 7 + }, + { + "epoch": 0.0029784412048725435, + "grad_norm": 1.3485698699951172, + "learning_rate": 1.6e-07, + "loss": 1.8473, + "step": 8 + }, + { + "epoch": 0.0033507463554816115, + "grad_norm": 1.4282227754592896, + "learning_rate": 1.8e-07, + "loss": 1.8677, + "step": 9 + }, + { + "epoch": 0.0037230515060906796, + "grad_norm": 1.3968946933746338, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.876, + "step": 10 + }, + { + "epoch": 0.004095356656699747, + "grad_norm": 1.4412802457809448, + "learning_rate": 2.2e-07, + "loss": 1.8736, + "step": 11 + }, + { + "epoch": 0.004467661807308816, + "grad_norm": 1.3732651472091675, + "learning_rate": 2.4000000000000003e-07, + "loss": 1.8566, + "step": 12 + }, + { + "epoch": 0.004839966957917883, + "grad_norm": 1.444575309753418, + "learning_rate": 2.6e-07, + "loss": 1.8654, + "step": 13 + }, + { + "epoch": 0.005212272108526952, + "grad_norm": 1.4188005924224854, + "learning_rate": 2.8e-07, + "loss": 1.8672, + "step": 14 + }, + { + "epoch": 0.005584577259136019, + "grad_norm": 1.3773771524429321, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.8475, + "step": 15 + }, + { + "epoch": 0.005956882409745087, + "grad_norm": 1.4325660467147827, + "learning_rate": 3.2e-07, + "loss": 1.8845, + "step": 16 + }, + { + "epoch": 0.0063291875603541554, + "grad_norm": 1.366593599319458, + "learning_rate": 3.4000000000000003e-07, + "loss": 1.8627, + "step": 17 + }, + { + "epoch": 0.006701492710963223, + "grad_norm": 1.3906618356704712, + "learning_rate": 3.6e-07, + "loss": 1.8424, + "step": 18 + }, + { + "epoch": 0.0070737978615722915, + "grad_norm": 1.3560819625854492, + "learning_rate": 3.8e-07, + "loss": 1.8623, + "step": 19 + }, + { + "epoch": 0.007446103012181359, + "grad_norm": 1.3274669647216797, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.8461, + "step": 20 + }, + { + "epoch": 0.007818408162790428, + "grad_norm": 1.3316004276275635, + "learning_rate": 4.2000000000000006e-07, + "loss": 1.8471, + "step": 21 + }, + { + "epoch": 0.008190713313399494, + "grad_norm": 1.3909695148468018, + "learning_rate": 4.4e-07, + "loss": 1.8484, + "step": 22 + }, + { + "epoch": 0.008563018464008563, + "grad_norm": 1.3994210958480835, + "learning_rate": 4.6000000000000004e-07, + "loss": 1.8488, + "step": 23 + }, + { + "epoch": 0.008935323614617631, + "grad_norm": 1.4674007892608643, + "learning_rate": 4.800000000000001e-07, + "loss": 1.8548, + "step": 24 + }, + { + "epoch": 0.0093076287652267, + "grad_norm": 1.4008136987686157, + "learning_rate": 5.000000000000001e-07, + "loss": 1.8322, + "step": 25 + }, + { + "epoch": 0.009679933915835767, + "grad_norm": 1.386723279953003, + "learning_rate": 5.2e-07, + "loss": 1.8451, + "step": 26 + }, + { + "epoch": 0.010052239066444835, + "grad_norm": 1.398271918296814, + "learning_rate": 5.4e-07, + "loss": 1.8389, + "step": 27 + }, + { + "epoch": 0.010424544217053903, + "grad_norm": 1.4241478443145752, + "learning_rate": 5.6e-07, + "loss": 1.8636, + "step": 28 + }, + { + "epoch": 0.01079684936766297, + "grad_norm": 1.4040849208831787, + "learning_rate": 5.800000000000001e-07, + "loss": 1.8427, + "step": 29 + }, + { + "epoch": 0.011169154518272039, + "grad_norm": 1.3591426610946655, + "learning_rate": 6.000000000000001e-07, + "loss": 1.8354, + "step": 30 + }, + { + "epoch": 0.011541459668881107, + "grad_norm": 1.4355502128601074, + "learning_rate": 6.200000000000001e-07, + "loss": 1.8536, + "step": 31 + }, + { + "epoch": 0.011913764819490174, + "grad_norm": 1.4141122102737427, + "learning_rate": 6.4e-07, + "loss": 1.8295, + "step": 32 + }, + { + "epoch": 0.012286069970099242, + "grad_norm": 1.4399964809417725, + "learning_rate": 6.6e-07, + "loss": 1.8555, + "step": 33 + }, + { + "epoch": 0.012658375120708311, + "grad_norm": 1.3795485496520996, + "learning_rate": 6.800000000000001e-07, + "loss": 1.8304, + "step": 34 + }, + { + "epoch": 0.01303068027131738, + "grad_norm": 1.4039381742477417, + "learning_rate": 7.000000000000001e-07, + "loss": 1.8467, + "step": 35 + }, + { + "epoch": 0.013402985421926446, + "grad_norm": 1.4005100727081299, + "learning_rate": 7.2e-07, + "loss": 1.8253, + "step": 36 + }, + { + "epoch": 0.013775290572535515, + "grad_norm": 1.415635347366333, + "learning_rate": 7.4e-07, + "loss": 1.8458, + "step": 37 + }, + { + "epoch": 0.014147595723144583, + "grad_norm": 1.39545738697052, + "learning_rate": 7.6e-07, + "loss": 1.8264, + "step": 38 + }, + { + "epoch": 0.01451990087375365, + "grad_norm": 1.3957033157348633, + "learning_rate": 7.8e-07, + "loss": 1.8404, + "step": 39 + }, + { + "epoch": 0.014892206024362718, + "grad_norm": 1.3806068897247314, + "learning_rate": 8.000000000000001e-07, + "loss": 1.828, + "step": 40 + }, + { + "epoch": 0.015264511174971787, + "grad_norm": 1.387890338897705, + "learning_rate": 8.200000000000001e-07, + "loss": 1.8302, + "step": 41 + }, + { + "epoch": 0.015636816325580855, + "grad_norm": 1.3517177104949951, + "learning_rate": 8.400000000000001e-07, + "loss": 1.8021, + "step": 42 + }, + { + "epoch": 0.016009121476189922, + "grad_norm": 1.3489928245544434, + "learning_rate": 8.6e-07, + "loss": 1.8084, + "step": 43 + }, + { + "epoch": 0.01638142662679899, + "grad_norm": 1.352052092552185, + "learning_rate": 8.8e-07, + "loss": 1.8143, + "step": 44 + }, + { + "epoch": 0.01675373177740806, + "grad_norm": 1.324896216392517, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7936, + "step": 45 + }, + { + "epoch": 0.017126036928017126, + "grad_norm": 1.2951364517211914, + "learning_rate": 9.200000000000001e-07, + "loss": 1.8092, + "step": 46 + }, + { + "epoch": 0.017498342078626192, + "grad_norm": 1.3121585845947266, + "learning_rate": 9.400000000000001e-07, + "loss": 1.8096, + "step": 47 + }, + { + "epoch": 0.017870647229235263, + "grad_norm": 1.359284520149231, + "learning_rate": 9.600000000000001e-07, + "loss": 1.822, + "step": 48 + }, + { + "epoch": 0.01824295237984433, + "grad_norm": 1.3160185813903809, + "learning_rate": 9.800000000000001e-07, + "loss": 1.8113, + "step": 49 + }, + { + "epoch": 0.0186152575304534, + "grad_norm": 1.2798627614974976, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7942, + "step": 50 + }, + { + "epoch": 0.018987562681062466, + "grad_norm": 1.3182300329208374, + "learning_rate": 1.02e-06, + "loss": 1.8119, + "step": 51 + }, + { + "epoch": 0.019359867831671533, + "grad_norm": 1.3083176612854004, + "learning_rate": 1.04e-06, + "loss": 1.8189, + "step": 52 + }, + { + "epoch": 0.019732172982280603, + "grad_norm": 1.3400063514709473, + "learning_rate": 1.06e-06, + "loss": 1.837, + "step": 53 + }, + { + "epoch": 0.02010447813288967, + "grad_norm": 1.2751755714416504, + "learning_rate": 1.08e-06, + "loss": 1.7936, + "step": 54 + }, + { + "epoch": 0.020476783283498737, + "grad_norm": 1.223643183708191, + "learning_rate": 1.1e-06, + "loss": 1.7851, + "step": 55 + }, + { + "epoch": 0.020849088434107807, + "grad_norm": 1.2413800954818726, + "learning_rate": 1.12e-06, + "loss": 1.7893, + "step": 56 + }, + { + "epoch": 0.021221393584716874, + "grad_norm": 1.196629524230957, + "learning_rate": 1.14e-06, + "loss": 1.8031, + "step": 57 + }, + { + "epoch": 0.02159369873532594, + "grad_norm": 1.158254861831665, + "learning_rate": 1.1600000000000001e-06, + "loss": 1.79, + "step": 58 + }, + { + "epoch": 0.02196600388593501, + "grad_norm": 1.0098892450332642, + "learning_rate": 1.1800000000000001e-06, + "loss": 1.7772, + "step": 59 + }, + { + "epoch": 0.022338309036544077, + "grad_norm": 0.949759304523468, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.7582, + "step": 60 + }, + { + "epoch": 0.022710614187153144, + "grad_norm": 0.9904881119728088, + "learning_rate": 1.2200000000000002e-06, + "loss": 1.775, + "step": 61 + }, + { + "epoch": 0.023082919337762214, + "grad_norm": 0.9897575378417969, + "learning_rate": 1.2400000000000002e-06, + "loss": 1.7599, + "step": 62 + }, + { + "epoch": 0.02345522448837128, + "grad_norm": 0.9522385597229004, + "learning_rate": 1.26e-06, + "loss": 1.7643, + "step": 63 + }, + { + "epoch": 0.023827529638980348, + "grad_norm": 0.9590041637420654, + "learning_rate": 1.28e-06, + "loss": 1.7782, + "step": 64 + }, + { + "epoch": 0.024199834789589418, + "grad_norm": 0.910637378692627, + "learning_rate": 1.3e-06, + "loss": 1.7478, + "step": 65 + }, + { + "epoch": 0.024572139940198485, + "grad_norm": 0.9400556087493896, + "learning_rate": 1.32e-06, + "loss": 1.745, + "step": 66 + }, + { + "epoch": 0.02494444509080755, + "grad_norm": 0.9209262728691101, + "learning_rate": 1.34e-06, + "loss": 1.7594, + "step": 67 + }, + { + "epoch": 0.025316750241416622, + "grad_norm": 0.896427571773529, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.7446, + "step": 68 + }, + { + "epoch": 0.02568905539202569, + "grad_norm": 0.895569920539856, + "learning_rate": 1.3800000000000001e-06, + "loss": 1.769, + "step": 69 + }, + { + "epoch": 0.02606136054263476, + "grad_norm": 0.7845183610916138, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.7196, + "step": 70 + }, + { + "epoch": 0.026433665693243825, + "grad_norm": 0.800564706325531, + "learning_rate": 1.42e-06, + "loss": 1.732, + "step": 71 + }, + { + "epoch": 0.026805970843852892, + "grad_norm": 0.6883242130279541, + "learning_rate": 1.44e-06, + "loss": 1.7251, + "step": 72 + }, + { + "epoch": 0.027178275994461962, + "grad_norm": 0.6776332259178162, + "learning_rate": 1.46e-06, + "loss": 1.7311, + "step": 73 + }, + { + "epoch": 0.02755058114507103, + "grad_norm": 0.6384090781211853, + "learning_rate": 1.48e-06, + "loss": 1.7215, + "step": 74 + }, + { + "epoch": 0.027922886295680096, + "grad_norm": 0.647678017616272, + "learning_rate": 1.5e-06, + "loss": 1.7038, + "step": 75 + }, + { + "epoch": 0.028295191446289166, + "grad_norm": 0.6253066658973694, + "learning_rate": 1.52e-06, + "loss": 1.7308, + "step": 76 + }, + { + "epoch": 0.028667496596898233, + "grad_norm": 0.599458634853363, + "learning_rate": 1.54e-06, + "loss": 1.6998, + "step": 77 + }, + { + "epoch": 0.0290398017475073, + "grad_norm": 0.5987157225608826, + "learning_rate": 1.56e-06, + "loss": 1.7092, + "step": 78 + }, + { + "epoch": 0.02941210689811637, + "grad_norm": 0.5856855511665344, + "learning_rate": 1.5800000000000001e-06, + "loss": 1.7001, + "step": 79 + }, + { + "epoch": 0.029784412048725437, + "grad_norm": 0.5748167037963867, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.6861, + "step": 80 + }, + { + "epoch": 0.030156717199334503, + "grad_norm": 0.5907240509986877, + "learning_rate": 1.6200000000000002e-06, + "loss": 1.7043, + "step": 81 + }, + { + "epoch": 0.030529022349943574, + "grad_norm": 0.5696431994438171, + "learning_rate": 1.6400000000000002e-06, + "loss": 1.6871, + "step": 82 + }, + { + "epoch": 0.03090132750055264, + "grad_norm": 0.5690540075302124, + "learning_rate": 1.6600000000000002e-06, + "loss": 1.6584, + "step": 83 + }, + { + "epoch": 0.03127363265116171, + "grad_norm": 0.5912604331970215, + "learning_rate": 1.6800000000000002e-06, + "loss": 1.6724, + "step": 84 + }, + { + "epoch": 0.031645937801770774, + "grad_norm": 0.595125138759613, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.6629, + "step": 85 + }, + { + "epoch": 0.032018242952379844, + "grad_norm": 0.5880417227745056, + "learning_rate": 1.72e-06, + "loss": 1.6722, + "step": 86 + }, + { + "epoch": 0.032390548102988914, + "grad_norm": 0.621819019317627, + "learning_rate": 1.74e-06, + "loss": 1.6583, + "step": 87 + }, + { + "epoch": 0.03276285325359798, + "grad_norm": 0.6170995831489563, + "learning_rate": 1.76e-06, + "loss": 1.6434, + "step": 88 + }, + { + "epoch": 0.03313515840420705, + "grad_norm": 0.6138344407081604, + "learning_rate": 1.7800000000000001e-06, + "loss": 1.6368, + "step": 89 + }, + { + "epoch": 0.03350746355481612, + "grad_norm": 0.6477394700050354, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.6606, + "step": 90 + }, + { + "epoch": 0.03387976870542518, + "grad_norm": 0.6119678616523743, + "learning_rate": 1.8200000000000002e-06, + "loss": 1.6266, + "step": 91 + }, + { + "epoch": 0.03425207385603425, + "grad_norm": 0.612330436706543, + "learning_rate": 1.8400000000000002e-06, + "loss": 1.618, + "step": 92 + }, + { + "epoch": 0.03462437900664332, + "grad_norm": 0.6225689053535461, + "learning_rate": 1.8600000000000002e-06, + "loss": 1.6234, + "step": 93 + }, + { + "epoch": 0.034996684157252385, + "grad_norm": 0.5868818759918213, + "learning_rate": 1.8800000000000002e-06, + "loss": 1.6091, + "step": 94 + }, + { + "epoch": 0.035368989307861455, + "grad_norm": 0.5659094452857971, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.6211, + "step": 95 + }, + { + "epoch": 0.035741294458470525, + "grad_norm": 0.550031840801239, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.6023, + "step": 96 + }, + { + "epoch": 0.03611359960907959, + "grad_norm": 0.5339946746826172, + "learning_rate": 1.94e-06, + "loss": 1.607, + "step": 97 + }, + { + "epoch": 0.03648590475968866, + "grad_norm": 0.5484851598739624, + "learning_rate": 1.9600000000000003e-06, + "loss": 1.6088, + "step": 98 + }, + { + "epoch": 0.03685820991029773, + "grad_norm": 0.5426100492477417, + "learning_rate": 1.98e-06, + "loss": 1.5946, + "step": 99 + }, + { + "epoch": 0.0372305150609068, + "grad_norm": 0.53022700548172, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5923, + "step": 100 + }, + { + "epoch": 0.03760282021151586, + "grad_norm": 0.5056052207946777, + "learning_rate": 2.02e-06, + "loss": 1.5901, + "step": 101 + }, + { + "epoch": 0.03797512536212493, + "grad_norm": 0.5120390057563782, + "learning_rate": 2.04e-06, + "loss": 1.5924, + "step": 102 + }, + { + "epoch": 0.038347430512734, + "grad_norm": 0.5347453355789185, + "learning_rate": 2.06e-06, + "loss": 1.5924, + "step": 103 + }, + { + "epoch": 0.038719735663343066, + "grad_norm": 0.535159170627594, + "learning_rate": 2.08e-06, + "loss": 1.5821, + "step": 104 + }, + { + "epoch": 0.039092040813952136, + "grad_norm": 0.5138244032859802, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.5981, + "step": 105 + }, + { + "epoch": 0.03946434596456121, + "grad_norm": 0.5143431425094604, + "learning_rate": 2.12e-06, + "loss": 1.5742, + "step": 106 + }, + { + "epoch": 0.03983665111517027, + "grad_norm": 0.5397112965583801, + "learning_rate": 2.1400000000000003e-06, + "loss": 1.5919, + "step": 107 + }, + { + "epoch": 0.04020895626577934, + "grad_norm": 0.503512978553772, + "learning_rate": 2.16e-06, + "loss": 1.5856, + "step": 108 + }, + { + "epoch": 0.04058126141638841, + "grad_norm": 0.5465199947357178, + "learning_rate": 2.1800000000000003e-06, + "loss": 1.5853, + "step": 109 + }, + { + "epoch": 0.040953566566997474, + "grad_norm": 0.561886727809906, + "learning_rate": 2.2e-06, + "loss": 1.5841, + "step": 110 + }, + { + "epoch": 0.041325871717606544, + "grad_norm": 0.5566477179527283, + "learning_rate": 2.2200000000000003e-06, + "loss": 1.5819, + "step": 111 + }, + { + "epoch": 0.041698176868215614, + "grad_norm": 0.5444602370262146, + "learning_rate": 2.24e-06, + "loss": 1.5702, + "step": 112 + }, + { + "epoch": 0.04207048201882468, + "grad_norm": 0.5601593852043152, + "learning_rate": 2.2600000000000004e-06, + "loss": 1.5667, + "step": 113 + }, + { + "epoch": 0.04244278716943375, + "grad_norm": 0.5660892128944397, + "learning_rate": 2.28e-06, + "loss": 1.5746, + "step": 114 + }, + { + "epoch": 0.04281509232004282, + "grad_norm": 0.5766957402229309, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.5627, + "step": 115 + }, + { + "epoch": 0.04318739747065188, + "grad_norm": 0.6171875596046448, + "learning_rate": 2.3200000000000002e-06, + "loss": 1.5807, + "step": 116 + }, + { + "epoch": 0.04355970262126095, + "grad_norm": 0.5867440700531006, + "learning_rate": 2.3400000000000005e-06, + "loss": 1.5642, + "step": 117 + }, + { + "epoch": 0.04393200777187002, + "grad_norm": 0.5847681164741516, + "learning_rate": 2.3600000000000003e-06, + "loss": 1.5593, + "step": 118 + }, + { + "epoch": 0.044304312922479085, + "grad_norm": 0.5901392102241516, + "learning_rate": 2.38e-06, + "loss": 1.5697, + "step": 119 + }, + { + "epoch": 0.044676618073088155, + "grad_norm": 0.5898075699806213, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.5522, + "step": 120 + }, + { + "epoch": 0.045048923223697225, + "grad_norm": 0.5409351587295532, + "learning_rate": 2.42e-06, + "loss": 1.5498, + "step": 121 + }, + { + "epoch": 0.04542122837430629, + "grad_norm": 0.5224587321281433, + "learning_rate": 2.4400000000000004e-06, + "loss": 1.5567, + "step": 122 + }, + { + "epoch": 0.04579353352491536, + "grad_norm": 0.48012423515319824, + "learning_rate": 2.46e-06, + "loss": 1.5345, + "step": 123 + }, + { + "epoch": 0.04616583867552443, + "grad_norm": 0.44709786772727966, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.5434, + "step": 124 + }, + { + "epoch": 0.04653814382613349, + "grad_norm": 0.43979910016059875, + "learning_rate": 2.5e-06, + "loss": 1.5461, + "step": 125 + }, + { + "epoch": 0.04691044897674256, + "grad_norm": 0.3758682608604431, + "learning_rate": 2.52e-06, + "loss": 1.5484, + "step": 126 + }, + { + "epoch": 0.04728275412735163, + "grad_norm": 0.34224575757980347, + "learning_rate": 2.5400000000000002e-06, + "loss": 1.5238, + "step": 127 + }, + { + "epoch": 0.047655059277960696, + "grad_norm": 0.3217172622680664, + "learning_rate": 2.56e-06, + "loss": 1.5567, + "step": 128 + }, + { + "epoch": 0.048027364428569766, + "grad_norm": 0.2725611925125122, + "learning_rate": 2.5800000000000003e-06, + "loss": 1.5349, + "step": 129 + }, + { + "epoch": 0.048399669579178836, + "grad_norm": 0.22641003131866455, + "learning_rate": 2.6e-06, + "loss": 1.533, + "step": 130 + }, + { + "epoch": 0.0487719747297879, + "grad_norm": 0.19425782561302185, + "learning_rate": 2.6200000000000003e-06, + "loss": 1.5486, + "step": 131 + }, + { + "epoch": 0.04914427988039697, + "grad_norm": 0.18164916336536407, + "learning_rate": 2.64e-06, + "loss": 1.5494, + "step": 132 + }, + { + "epoch": 0.04951658503100604, + "grad_norm": 0.16984394192695618, + "learning_rate": 2.6600000000000004e-06, + "loss": 1.5342, + "step": 133 + }, + { + "epoch": 0.0498888901816151, + "grad_norm": 0.15408554673194885, + "learning_rate": 2.68e-06, + "loss": 1.5349, + "step": 134 + }, + { + "epoch": 0.05026119533222417, + "grad_norm": 0.1477842777967453, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.5492, + "step": 135 + }, + { + "epoch": 0.050633500482833244, + "grad_norm": 0.14385773241519928, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.5415, + "step": 136 + }, + { + "epoch": 0.05100580563344231, + "grad_norm": 0.1423702985048294, + "learning_rate": 2.7400000000000004e-06, + "loss": 1.5231, + "step": 137 + }, + { + "epoch": 0.05137811078405138, + "grad_norm": 0.4125309884548187, + "learning_rate": 2.7600000000000003e-06, + "loss": 1.5381, + "step": 138 + }, + { + "epoch": 0.05175041593466045, + "grad_norm": 0.14331988990306854, + "learning_rate": 2.7800000000000005e-06, + "loss": 1.5438, + "step": 139 + }, + { + "epoch": 0.05212272108526952, + "grad_norm": 0.12616126239299774, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.5481, + "step": 140 + }, + { + "epoch": 0.05249502623587858, + "grad_norm": 0.1296859085559845, + "learning_rate": 2.82e-06, + "loss": 1.5316, + "step": 141 + }, + { + "epoch": 0.05286733138648765, + "grad_norm": 0.14372870326042175, + "learning_rate": 2.84e-06, + "loss": 1.5314, + "step": 142 + }, + { + "epoch": 0.05323963653709672, + "grad_norm": 0.12012794613838196, + "learning_rate": 2.86e-06, + "loss": 1.5256, + "step": 143 + }, + { + "epoch": 0.053611941687705784, + "grad_norm": 0.1238911896944046, + "learning_rate": 2.88e-06, + "loss": 1.5376, + "step": 144 + }, + { + "epoch": 0.053984246838314855, + "grad_norm": 0.11403171718120575, + "learning_rate": 2.9e-06, + "loss": 1.5313, + "step": 145 + }, + { + "epoch": 0.054356551988923925, + "grad_norm": 0.11582870781421661, + "learning_rate": 2.92e-06, + "loss": 1.5293, + "step": 146 + }, + { + "epoch": 0.05472885713953299, + "grad_norm": 0.11598813533782959, + "learning_rate": 2.9400000000000002e-06, + "loss": 1.5387, + "step": 147 + }, + { + "epoch": 0.05510116229014206, + "grad_norm": 0.11165577918291092, + "learning_rate": 2.96e-06, + "loss": 1.5195, + "step": 148 + }, + { + "epoch": 0.05547346744075113, + "grad_norm": 0.1125321313738823, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.521, + "step": 149 + }, + { + "epoch": 0.05584577259136019, + "grad_norm": 0.11534541845321655, + "learning_rate": 3e-06, + "loss": 1.5368, + "step": 150 + }, + { + "epoch": 0.05621807774196926, + "grad_norm": 0.11158929765224457, + "learning_rate": 3.0200000000000003e-06, + "loss": 1.53, + "step": 151 + }, + { + "epoch": 0.05659038289257833, + "grad_norm": 0.11701493710279465, + "learning_rate": 3.04e-06, + "loss": 1.517, + "step": 152 + }, + { + "epoch": 0.056962688043187396, + "grad_norm": 0.10573873668909073, + "learning_rate": 3.0600000000000003e-06, + "loss": 1.5479, + "step": 153 + }, + { + "epoch": 0.057334993193796466, + "grad_norm": 0.10436706990003586, + "learning_rate": 3.08e-06, + "loss": 1.5376, + "step": 154 + }, + { + "epoch": 0.057707298344405536, + "grad_norm": 0.11538543552160263, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.5405, + "step": 155 + }, + { + "epoch": 0.0580796034950146, + "grad_norm": 0.11722234636545181, + "learning_rate": 3.12e-06, + "loss": 1.5249, + "step": 156 + }, + { + "epoch": 0.05845190864562367, + "grad_norm": 0.11727150529623032, + "learning_rate": 3.1400000000000004e-06, + "loss": 1.5279, + "step": 157 + }, + { + "epoch": 0.05882421379623274, + "grad_norm": 0.10549698024988174, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.5101, + "step": 158 + }, + { + "epoch": 0.0591965189468418, + "grad_norm": 0.10655630379915237, + "learning_rate": 3.1800000000000005e-06, + "loss": 1.523, + "step": 159 + }, + { + "epoch": 0.05956882409745087, + "grad_norm": 0.10541582852602005, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.5114, + "step": 160 + }, + { + "epoch": 0.05994112924805994, + "grad_norm": 0.1057916209101677, + "learning_rate": 3.2200000000000005e-06, + "loss": 1.5146, + "step": 161 + }, + { + "epoch": 0.06031343439866901, + "grad_norm": 0.10496512800455093, + "learning_rate": 3.2400000000000003e-06, + "loss": 1.5168, + "step": 162 + }, + { + "epoch": 0.06068573954927808, + "grad_norm": 0.09892502427101135, + "learning_rate": 3.2600000000000006e-06, + "loss": 1.5053, + "step": 163 + }, + { + "epoch": 0.06105804469988715, + "grad_norm": 0.10325302928686142, + "learning_rate": 3.2800000000000004e-06, + "loss": 1.5157, + "step": 164 + }, + { + "epoch": 0.06143034985049621, + "grad_norm": 0.11229792982339859, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.4997, + "step": 165 + }, + { + "epoch": 0.06180265500110528, + "grad_norm": 0.10319443047046661, + "learning_rate": 3.3200000000000004e-06, + "loss": 1.5175, + "step": 166 + }, + { + "epoch": 0.06217496015171435, + "grad_norm": 0.10021348297595978, + "learning_rate": 3.3400000000000006e-06, + "loss": 1.5383, + "step": 167 + }, + { + "epoch": 0.06254726530232342, + "grad_norm": 0.09915069490671158, + "learning_rate": 3.3600000000000004e-06, + "loss": 1.5226, + "step": 168 + }, + { + "epoch": 0.06291957045293249, + "grad_norm": 0.10507778823375702, + "learning_rate": 3.3800000000000007e-06, + "loss": 1.5251, + "step": 169 + }, + { + "epoch": 0.06329187560354155, + "grad_norm": 0.10222301632165909, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.5287, + "step": 170 + }, + { + "epoch": 0.06366418075415062, + "grad_norm": 0.10224229842424393, + "learning_rate": 3.4200000000000007e-06, + "loss": 1.5047, + "step": 171 + }, + { + "epoch": 0.06403648590475969, + "grad_norm": 0.09706564992666245, + "learning_rate": 3.44e-06, + "loss": 1.5136, + "step": 172 + }, + { + "epoch": 0.06440879105536876, + "grad_norm": 0.09412699937820435, + "learning_rate": 3.46e-06, + "loss": 1.5141, + "step": 173 + }, + { + "epoch": 0.06478109620597783, + "grad_norm": 0.09892858564853668, + "learning_rate": 3.48e-06, + "loss": 1.5198, + "step": 174 + }, + { + "epoch": 0.0651534013565869, + "grad_norm": 0.10099875926971436, + "learning_rate": 3.5e-06, + "loss": 1.5338, + "step": 175 + }, + { + "epoch": 0.06552570650719595, + "grad_norm": 0.0975039079785347, + "learning_rate": 3.52e-06, + "loss": 1.5202, + "step": 176 + }, + { + "epoch": 0.06589801165780503, + "grad_norm": 0.09733244776725769, + "learning_rate": 3.54e-06, + "loss": 1.5256, + "step": 177 + }, + { + "epoch": 0.0662703168084141, + "grad_norm": 0.10019107162952423, + "learning_rate": 3.5600000000000002e-06, + "loss": 1.5048, + "step": 178 + }, + { + "epoch": 0.06664262195902317, + "grad_norm": 0.10042434930801392, + "learning_rate": 3.58e-06, + "loss": 1.499, + "step": 179 + }, + { + "epoch": 0.06701492710963224, + "grad_norm": 0.10034509003162384, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.5059, + "step": 180 + }, + { + "epoch": 0.0673872322602413, + "grad_norm": 0.09680823236703873, + "learning_rate": 3.62e-06, + "loss": 1.5074, + "step": 181 + }, + { + "epoch": 0.06775953741085036, + "grad_norm": 0.10564741492271423, + "learning_rate": 3.6400000000000003e-06, + "loss": 1.5014, + "step": 182 + }, + { + "epoch": 0.06813184256145943, + "grad_norm": 0.09912260621786118, + "learning_rate": 3.66e-06, + "loss": 1.5329, + "step": 183 + }, + { + "epoch": 0.0685041477120685, + "grad_norm": 0.09995097666978836, + "learning_rate": 3.6800000000000003e-06, + "loss": 1.4999, + "step": 184 + }, + { + "epoch": 0.06887645286267757, + "grad_norm": 0.10238537192344666, + "learning_rate": 3.7e-06, + "loss": 1.5197, + "step": 185 + }, + { + "epoch": 0.06924875801328664, + "grad_norm": 0.09943666309118271, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.5234, + "step": 186 + }, + { + "epoch": 0.06962106316389571, + "grad_norm": 0.09201087057590485, + "learning_rate": 3.74e-06, + "loss": 1.5103, + "step": 187 + }, + { + "epoch": 0.06999336831450477, + "grad_norm": 0.09809333086013794, + "learning_rate": 3.7600000000000004e-06, + "loss": 1.5107, + "step": 188 + }, + { + "epoch": 0.07036567346511384, + "grad_norm": 0.09340554475784302, + "learning_rate": 3.7800000000000002e-06, + "loss": 1.517, + "step": 189 + }, + { + "epoch": 0.07073797861572291, + "grad_norm": 0.10037955641746521, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.51, + "step": 190 + }, + { + "epoch": 0.07111028376633198, + "grad_norm": 0.0913090705871582, + "learning_rate": 3.820000000000001e-06, + "loss": 1.5033, + "step": 191 + }, + { + "epoch": 0.07148258891694105, + "grad_norm": 0.09793171286582947, + "learning_rate": 3.8400000000000005e-06, + "loss": 1.5268, + "step": 192 + }, + { + "epoch": 0.07185489406755012, + "grad_norm": 0.09348586201667786, + "learning_rate": 3.86e-06, + "loss": 1.5046, + "step": 193 + }, + { + "epoch": 0.07222719921815918, + "grad_norm": 0.09434423595666885, + "learning_rate": 3.88e-06, + "loss": 1.498, + "step": 194 + }, + { + "epoch": 0.07259950436876825, + "grad_norm": 0.08928900957107544, + "learning_rate": 3.900000000000001e-06, + "loss": 1.4918, + "step": 195 + }, + { + "epoch": 0.07297180951937732, + "grad_norm": 0.0911901518702507, + "learning_rate": 3.920000000000001e-06, + "loss": 1.4903, + "step": 196 + }, + { + "epoch": 0.07334411466998639, + "grad_norm": 0.08914012461900711, + "learning_rate": 3.94e-06, + "loss": 1.512, + "step": 197 + }, + { + "epoch": 0.07371641982059546, + "grad_norm": 0.0897861197590828, + "learning_rate": 3.96e-06, + "loss": 1.5127, + "step": 198 + }, + { + "epoch": 0.07408872497120453, + "grad_norm": 0.0919618159532547, + "learning_rate": 3.980000000000001e-06, + "loss": 1.5032, + "step": 199 + }, + { + "epoch": 0.0744610301218136, + "grad_norm": 0.0871221199631691, + "learning_rate": 4.000000000000001e-06, + "loss": 1.5092, + "step": 200 + }, + { + "epoch": 0.07483333527242265, + "grad_norm": 0.08722933381795883, + "learning_rate": 4.0200000000000005e-06, + "loss": 1.4905, + "step": 201 + }, + { + "epoch": 0.07520564042303172, + "grad_norm": 0.08796436339616776, + "learning_rate": 4.04e-06, + "loss": 1.4992, + "step": 202 + }, + { + "epoch": 0.0755779455736408, + "grad_norm": 0.09674139320850372, + "learning_rate": 4.060000000000001e-06, + "loss": 1.5032, + "step": 203 + }, + { + "epoch": 0.07595025072424987, + "grad_norm": 0.08799508959054947, + "learning_rate": 4.08e-06, + "loss": 1.5124, + "step": 204 + }, + { + "epoch": 0.07632255587485894, + "grad_norm": 0.08769244700670242, + "learning_rate": 4.1e-06, + "loss": 1.5087, + "step": 205 + }, + { + "epoch": 0.076694861025468, + "grad_norm": 0.08863134682178497, + "learning_rate": 4.12e-06, + "loss": 1.5129, + "step": 206 + }, + { + "epoch": 0.07706716617607706, + "grad_norm": 0.1002495214343071, + "learning_rate": 4.14e-06, + "loss": 1.5024, + "step": 207 + }, + { + "epoch": 0.07743947132668613, + "grad_norm": 0.09144977480173111, + "learning_rate": 4.16e-06, + "loss": 1.5006, + "step": 208 + }, + { + "epoch": 0.0778117764772952, + "grad_norm": 0.10466210544109344, + "learning_rate": 4.18e-06, + "loss": 1.5022, + "step": 209 + }, + { + "epoch": 0.07818408162790427, + "grad_norm": 0.08975500613451004, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.4965, + "step": 210 + }, + { + "epoch": 0.07855638677851334, + "grad_norm": 0.08787556737661362, + "learning_rate": 4.22e-06, + "loss": 1.5042, + "step": 211 + }, + { + "epoch": 0.07892869192912241, + "grad_norm": 0.08773490786552429, + "learning_rate": 4.24e-06, + "loss": 1.5138, + "step": 212 + }, + { + "epoch": 0.07930099707973147, + "grad_norm": 0.08304975181818008, + "learning_rate": 4.26e-06, + "loss": 1.507, + "step": 213 + }, + { + "epoch": 0.07967330223034054, + "grad_norm": 0.08493588864803314, + "learning_rate": 4.2800000000000005e-06, + "loss": 1.5017, + "step": 214 + }, + { + "epoch": 0.08004560738094961, + "grad_norm": 0.08817660808563232, + "learning_rate": 4.3e-06, + "loss": 1.5068, + "step": 215 + }, + { + "epoch": 0.08041791253155868, + "grad_norm": 0.08963938057422638, + "learning_rate": 4.32e-06, + "loss": 1.5263, + "step": 216 + }, + { + "epoch": 0.08079021768216775, + "grad_norm": 0.08390969783067703, + "learning_rate": 4.34e-06, + "loss": 1.5, + "step": 217 + }, + { + "epoch": 0.08116252283277682, + "grad_norm": 0.08665873855352402, + "learning_rate": 4.360000000000001e-06, + "loss": 1.4882, + "step": 218 + }, + { + "epoch": 0.08153482798338588, + "grad_norm": 0.08230563998222351, + "learning_rate": 4.38e-06, + "loss": 1.4888, + "step": 219 + }, + { + "epoch": 0.08190713313399495, + "grad_norm": 0.08207309246063232, + "learning_rate": 4.4e-06, + "loss": 1.4954, + "step": 220 + }, + { + "epoch": 0.08227943828460402, + "grad_norm": 0.08314842730760574, + "learning_rate": 4.42e-06, + "loss": 1.5103, + "step": 221 + }, + { + "epoch": 0.08265174343521309, + "grad_norm": 0.08730518817901611, + "learning_rate": 4.440000000000001e-06, + "loss": 1.4845, + "step": 222 + }, + { + "epoch": 0.08302404858582216, + "grad_norm": 0.09334740787744522, + "learning_rate": 4.4600000000000005e-06, + "loss": 1.4726, + "step": 223 + }, + { + "epoch": 0.08339635373643123, + "grad_norm": 0.0923716351389885, + "learning_rate": 4.48e-06, + "loss": 1.5005, + "step": 224 + }, + { + "epoch": 0.08376865888704028, + "grad_norm": 0.08330400288105011, + "learning_rate": 4.5e-06, + "loss": 1.4939, + "step": 225 + }, + { + "epoch": 0.08414096403764935, + "grad_norm": 0.08828233182430267, + "learning_rate": 4.520000000000001e-06, + "loss": 1.4926, + "step": 226 + }, + { + "epoch": 0.08451326918825842, + "grad_norm": 0.08477775007486343, + "learning_rate": 4.540000000000001e-06, + "loss": 1.4872, + "step": 227 + }, + { + "epoch": 0.0848855743388675, + "grad_norm": 0.08313533663749695, + "learning_rate": 4.56e-06, + "loss": 1.488, + "step": 228 + }, + { + "epoch": 0.08525787948947657, + "grad_norm": 0.08070395141839981, + "learning_rate": 4.58e-06, + "loss": 1.498, + "step": 229 + }, + { + "epoch": 0.08563018464008564, + "grad_norm": 0.08496372401714325, + "learning_rate": 4.600000000000001e-06, + "loss": 1.4995, + "step": 230 + }, + { + "epoch": 0.08600248979069469, + "grad_norm": 0.08610887080430984, + "learning_rate": 4.620000000000001e-06, + "loss": 1.4861, + "step": 231 + }, + { + "epoch": 0.08637479494130376, + "grad_norm": 0.1109740138053894, + "learning_rate": 4.6400000000000005e-06, + "loss": 1.4997, + "step": 232 + }, + { + "epoch": 0.08674710009191283, + "grad_norm": 0.0828074961900711, + "learning_rate": 4.66e-06, + "loss": 1.4768, + "step": 233 + }, + { + "epoch": 0.0871194052425219, + "grad_norm": 0.10130389034748077, + "learning_rate": 4.680000000000001e-06, + "loss": 1.5143, + "step": 234 + }, + { + "epoch": 0.08749171039313097, + "grad_norm": 0.07845676690340042, + "learning_rate": 4.7e-06, + "loss": 1.4835, + "step": 235 + }, + { + "epoch": 0.08786401554374004, + "grad_norm": 0.0835128203034401, + "learning_rate": 4.7200000000000005e-06, + "loss": 1.487, + "step": 236 + }, + { + "epoch": 0.0882363206943491, + "grad_norm": 0.07999230176210403, + "learning_rate": 4.74e-06, + "loss": 1.4768, + "step": 237 + }, + { + "epoch": 0.08860862584495817, + "grad_norm": 0.08866851031780243, + "learning_rate": 4.76e-06, + "loss": 1.4877, + "step": 238 + }, + { + "epoch": 0.08898093099556724, + "grad_norm": 0.09091200679540634, + "learning_rate": 4.78e-06, + "loss": 1.4663, + "step": 239 + }, + { + "epoch": 0.08935323614617631, + "grad_norm": 0.09118737280368805, + "learning_rate": 4.800000000000001e-06, + "loss": 1.4818, + "step": 240 + }, + { + "epoch": 0.08972554129678538, + "grad_norm": 0.09198911488056183, + "learning_rate": 4.8200000000000004e-06, + "loss": 1.486, + "step": 241 + }, + { + "epoch": 0.09009784644739445, + "grad_norm": 0.08087292313575745, + "learning_rate": 4.84e-06, + "loss": 1.4839, + "step": 242 + }, + { + "epoch": 0.09047015159800352, + "grad_norm": 0.0856778621673584, + "learning_rate": 4.86e-06, + "loss": 1.4882, + "step": 243 + }, + { + "epoch": 0.09084245674861258, + "grad_norm": 0.09004831314086914, + "learning_rate": 4.880000000000001e-06, + "loss": 1.4977, + "step": 244 + }, + { + "epoch": 0.09121476189922165, + "grad_norm": 0.08808062225580215, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.4961, + "step": 245 + }, + { + "epoch": 0.09158706704983072, + "grad_norm": 0.10897014290094376, + "learning_rate": 4.92e-06, + "loss": 1.4837, + "step": 246 + }, + { + "epoch": 0.09195937220043979, + "grad_norm": 0.12116125226020813, + "learning_rate": 4.94e-06, + "loss": 1.4899, + "step": 247 + }, + { + "epoch": 0.09233167735104886, + "grad_norm": 0.09493674337863922, + "learning_rate": 4.960000000000001e-06, + "loss": 1.4834, + "step": 248 + }, + { + "epoch": 0.09270398250165793, + "grad_norm": 0.13177277147769928, + "learning_rate": 4.980000000000001e-06, + "loss": 1.4721, + "step": 249 + }, + { + "epoch": 0.09307628765226698, + "grad_norm": 0.11521487683057785, + "learning_rate": 5e-06, + "loss": 1.5012, + "step": 250 + }, + { + "epoch": 0.09344859280287605, + "grad_norm": 0.106010802090168, + "learning_rate": 5.02e-06, + "loss": 1.5041, + "step": 251 + }, + { + "epoch": 0.09382089795348512, + "grad_norm": 0.08309640735387802, + "learning_rate": 5.04e-06, + "loss": 1.4651, + "step": 252 + }, + { + "epoch": 0.0941932031040942, + "grad_norm": 0.23099473118782043, + "learning_rate": 5.060000000000001e-06, + "loss": 1.4706, + "step": 253 + }, + { + "epoch": 0.09456550825470326, + "grad_norm": 0.08957033604383469, + "learning_rate": 5.0800000000000005e-06, + "loss": 1.4851, + "step": 254 + }, + { + "epoch": 0.09493781340531234, + "grad_norm": 0.08968747407197952, + "learning_rate": 5.1e-06, + "loss": 1.4615, + "step": 255 + }, + { + "epoch": 0.09531011855592139, + "grad_norm": 0.09505753219127655, + "learning_rate": 5.12e-06, + "loss": 1.4852, + "step": 256 + }, + { + "epoch": 0.09568242370653046, + "grad_norm": 0.08881967514753342, + "learning_rate": 5.140000000000001e-06, + "loss": 1.5098, + "step": 257 + }, + { + "epoch": 0.09605472885713953, + "grad_norm": 0.08623534440994263, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.466, + "step": 258 + }, + { + "epoch": 0.0964270340077486, + "grad_norm": 0.08393881469964981, + "learning_rate": 5.18e-06, + "loss": 1.4932, + "step": 259 + }, + { + "epoch": 0.09679933915835767, + "grad_norm": 0.0790511816740036, + "learning_rate": 5.2e-06, + "loss": 1.489, + "step": 260 + }, + { + "epoch": 0.09717164430896674, + "grad_norm": 0.08720332384109497, + "learning_rate": 5.220000000000001e-06, + "loss": 1.4878, + "step": 261 + }, + { + "epoch": 0.0975439494595758, + "grad_norm": 0.09274443984031677, + "learning_rate": 5.240000000000001e-06, + "loss": 1.4629, + "step": 262 + }, + { + "epoch": 0.09791625461018487, + "grad_norm": 0.0823291540145874, + "learning_rate": 5.2600000000000005e-06, + "loss": 1.4798, + "step": 263 + }, + { + "epoch": 0.09828855976079394, + "grad_norm": 0.10442589223384857, + "learning_rate": 5.28e-06, + "loss": 1.4885, + "step": 264 + }, + { + "epoch": 0.09866086491140301, + "grad_norm": 0.08945530652999878, + "learning_rate": 5.300000000000001e-06, + "loss": 1.4802, + "step": 265 + }, + { + "epoch": 0.09903317006201208, + "grad_norm": 0.11119077354669571, + "learning_rate": 5.320000000000001e-06, + "loss": 1.4888, + "step": 266 + }, + { + "epoch": 0.09940547521262115, + "grad_norm": 0.09534472972154617, + "learning_rate": 5.3400000000000005e-06, + "loss": 1.4838, + "step": 267 + }, + { + "epoch": 0.0997777803632302, + "grad_norm": 0.08682861924171448, + "learning_rate": 5.36e-06, + "loss": 1.4617, + "step": 268 + }, + { + "epoch": 0.10015008551383928, + "grad_norm": 0.10255074501037598, + "learning_rate": 5.380000000000001e-06, + "loss": 1.4767, + "step": 269 + }, + { + "epoch": 0.10052239066444835, + "grad_norm": 0.09030777961015701, + "learning_rate": 5.400000000000001e-06, + "loss": 1.463, + "step": 270 + }, + { + "epoch": 0.10089469581505742, + "grad_norm": 0.09011770784854889, + "learning_rate": 5.420000000000001e-06, + "loss": 1.484, + "step": 271 + }, + { + "epoch": 0.10126700096566649, + "grad_norm": 0.08929470181465149, + "learning_rate": 5.4400000000000004e-06, + "loss": 1.4789, + "step": 272 + }, + { + "epoch": 0.10163930611627556, + "grad_norm": 0.09251522272825241, + "learning_rate": 5.460000000000001e-06, + "loss": 1.4682, + "step": 273 + }, + { + "epoch": 0.10201161126688461, + "grad_norm": 0.11165442317724228, + "learning_rate": 5.480000000000001e-06, + "loss": 1.4829, + "step": 274 + }, + { + "epoch": 0.10238391641749368, + "grad_norm": 0.09440817683935165, + "learning_rate": 5.500000000000001e-06, + "loss": 1.4769, + "step": 275 + }, + { + "epoch": 0.10275622156810275, + "grad_norm": 0.11706740409135818, + "learning_rate": 5.5200000000000005e-06, + "loss": 1.477, + "step": 276 + }, + { + "epoch": 0.10312852671871182, + "grad_norm": 0.08593502640724182, + "learning_rate": 5.540000000000001e-06, + "loss": 1.4727, + "step": 277 + }, + { + "epoch": 0.1035008318693209, + "grad_norm": 0.0881851464509964, + "learning_rate": 5.560000000000001e-06, + "loss": 1.4852, + "step": 278 + }, + { + "epoch": 0.10387313701992996, + "grad_norm": 0.11567908525466919, + "learning_rate": 5.580000000000001e-06, + "loss": 1.4608, + "step": 279 + }, + { + "epoch": 0.10424544217053903, + "grad_norm": 0.09296432882547379, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4811, + "step": 280 + }, + { + "epoch": 0.10461774732114809, + "grad_norm": 0.10216089338064194, + "learning_rate": 5.620000000000001e-06, + "loss": 1.4682, + "step": 281 + }, + { + "epoch": 0.10499005247175716, + "grad_norm": 0.08564305305480957, + "learning_rate": 5.64e-06, + "loss": 1.4832, + "step": 282 + }, + { + "epoch": 0.10536235762236623, + "grad_norm": 0.11430171877145767, + "learning_rate": 5.66e-06, + "loss": 1.4754, + "step": 283 + }, + { + "epoch": 0.1057346627729753, + "grad_norm": 0.09230099618434906, + "learning_rate": 5.68e-06, + "loss": 1.4765, + "step": 284 + }, + { + "epoch": 0.10610696792358437, + "grad_norm": 0.14818595349788666, + "learning_rate": 5.7e-06, + "loss": 1.4642, + "step": 285 + }, + { + "epoch": 0.10647927307419344, + "grad_norm": 0.1206260696053505, + "learning_rate": 5.72e-06, + "loss": 1.462, + "step": 286 + }, + { + "epoch": 0.1068515782248025, + "grad_norm": 0.11496801674365997, + "learning_rate": 5.74e-06, + "loss": 1.4741, + "step": 287 + }, + { + "epoch": 0.10722388337541157, + "grad_norm": 0.09007902443408966, + "learning_rate": 5.76e-06, + "loss": 1.472, + "step": 288 + }, + { + "epoch": 0.10759618852602064, + "grad_norm": 0.0860317051410675, + "learning_rate": 5.78e-06, + "loss": 1.4814, + "step": 289 + }, + { + "epoch": 0.10796849367662971, + "grad_norm": 0.08847720175981522, + "learning_rate": 5.8e-06, + "loss": 1.473, + "step": 290 + }, + { + "epoch": 0.10834079882723878, + "grad_norm": 0.09280810505151749, + "learning_rate": 5.82e-06, + "loss": 1.4628, + "step": 291 + }, + { + "epoch": 0.10871310397784785, + "grad_norm": 0.0906905010342598, + "learning_rate": 5.84e-06, + "loss": 1.4637, + "step": 292 + }, + { + "epoch": 0.1090854091284569, + "grad_norm": 0.08728771656751633, + "learning_rate": 5.86e-06, + "loss": 1.4667, + "step": 293 + }, + { + "epoch": 0.10945771427906598, + "grad_norm": 0.09215451776981354, + "learning_rate": 5.8800000000000005e-06, + "loss": 1.4588, + "step": 294 + }, + { + "epoch": 0.10983001942967505, + "grad_norm": 0.09837444871664047, + "learning_rate": 5.9e-06, + "loss": 1.464, + "step": 295 + }, + { + "epoch": 0.11020232458028412, + "grad_norm": 0.08866247534751892, + "learning_rate": 5.92e-06, + "loss": 1.4585, + "step": 296 + }, + { + "epoch": 0.11057462973089319, + "grad_norm": 0.11951064318418503, + "learning_rate": 5.94e-06, + "loss": 1.4612, + "step": 297 + }, + { + "epoch": 0.11094693488150226, + "grad_norm": 0.08314000070095062, + "learning_rate": 5.9600000000000005e-06, + "loss": 1.4635, + "step": 298 + }, + { + "epoch": 0.11131924003211131, + "grad_norm": 0.12508852779865265, + "learning_rate": 5.98e-06, + "loss": 1.4722, + "step": 299 + }, + { + "epoch": 0.11169154518272038, + "grad_norm": 0.11367225646972656, + "learning_rate": 6e-06, + "loss": 1.4722, + "step": 300 + }, + { + "epoch": 0.11206385033332945, + "grad_norm": 0.1337917000055313, + "learning_rate": 6.02e-06, + "loss": 1.4531, + "step": 301 + }, + { + "epoch": 0.11243615548393852, + "grad_norm": 0.10296988487243652, + "learning_rate": 6.040000000000001e-06, + "loss": 1.4355, + "step": 302 + }, + { + "epoch": 0.1128084606345476, + "grad_norm": 0.09838810563087463, + "learning_rate": 6.0600000000000004e-06, + "loss": 1.4705, + "step": 303 + }, + { + "epoch": 0.11318076578515666, + "grad_norm": 0.14201873540878296, + "learning_rate": 6.08e-06, + "loss": 1.465, + "step": 304 + }, + { + "epoch": 0.11355307093576572, + "grad_norm": 0.10391878336668015, + "learning_rate": 6.1e-06, + "loss": 1.4691, + "step": 305 + }, + { + "epoch": 0.11392537608637479, + "grad_norm": 0.0908937081694603, + "learning_rate": 6.120000000000001e-06, + "loss": 1.4738, + "step": 306 + }, + { + "epoch": 0.11429768123698386, + "grad_norm": 0.1304592341184616, + "learning_rate": 6.1400000000000005e-06, + "loss": 1.4564, + "step": 307 + }, + { + "epoch": 0.11466998638759293, + "grad_norm": 0.11030007898807526, + "learning_rate": 6.16e-06, + "loss": 1.4681, + "step": 308 + }, + { + "epoch": 0.115042291538202, + "grad_norm": 0.1404293328523636, + "learning_rate": 6.18e-06, + "loss": 1.4489, + "step": 309 + }, + { + "epoch": 0.11541459668881107, + "grad_norm": 0.15717652440071106, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4619, + "step": 310 + }, + { + "epoch": 0.11578690183942013, + "grad_norm": 0.08294162154197693, + "learning_rate": 6.220000000000001e-06, + "loss": 1.4564, + "step": 311 + }, + { + "epoch": 0.1161592069900292, + "grad_norm": 0.12932319939136505, + "learning_rate": 6.24e-06, + "loss": 1.4615, + "step": 312 + }, + { + "epoch": 0.11653151214063827, + "grad_norm": 0.1291813999414444, + "learning_rate": 6.26e-06, + "loss": 1.4736, + "step": 313 + }, + { + "epoch": 0.11690381729124734, + "grad_norm": 0.08296732604503632, + "learning_rate": 6.280000000000001e-06, + "loss": 1.4592, + "step": 314 + }, + { + "epoch": 0.11727612244185641, + "grad_norm": 0.1045786589384079, + "learning_rate": 6.300000000000001e-06, + "loss": 1.4473, + "step": 315 + }, + { + "epoch": 0.11764842759246548, + "grad_norm": 0.14720311760902405, + "learning_rate": 6.3200000000000005e-06, + "loss": 1.45, + "step": 316 + }, + { + "epoch": 0.11802073274307454, + "grad_norm": 0.08579614013433456, + "learning_rate": 6.34e-06, + "loss": 1.4448, + "step": 317 + }, + { + "epoch": 0.1183930378936836, + "grad_norm": 0.12133117765188217, + "learning_rate": 6.360000000000001e-06, + "loss": 1.4536, + "step": 318 + }, + { + "epoch": 0.11876534304429268, + "grad_norm": 0.09560643136501312, + "learning_rate": 6.380000000000001e-06, + "loss": 1.4501, + "step": 319 + }, + { + "epoch": 0.11913764819490175, + "grad_norm": 0.08440724015235901, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4516, + "step": 320 + }, + { + "epoch": 0.11950995334551082, + "grad_norm": 0.0895630419254303, + "learning_rate": 6.42e-06, + "loss": 1.4537, + "step": 321 + }, + { + "epoch": 0.11988225849611989, + "grad_norm": 0.10389460623264313, + "learning_rate": 6.440000000000001e-06, + "loss": 1.4552, + "step": 322 + }, + { + "epoch": 0.12025456364672896, + "grad_norm": 0.09369999170303345, + "learning_rate": 6.460000000000001e-06, + "loss": 1.4544, + "step": 323 + }, + { + "epoch": 0.12062686879733801, + "grad_norm": 0.099007248878479, + "learning_rate": 6.480000000000001e-06, + "loss": 1.4891, + "step": 324 + }, + { + "epoch": 0.12099917394794708, + "grad_norm": 0.09093964099884033, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.4671, + "step": 325 + }, + { + "epoch": 0.12137147909855615, + "grad_norm": 0.09165001660585403, + "learning_rate": 6.520000000000001e-06, + "loss": 1.4577, + "step": 326 + }, + { + "epoch": 0.12174378424916522, + "grad_norm": 0.09465596079826355, + "learning_rate": 6.540000000000001e-06, + "loss": 1.4737, + "step": 327 + }, + { + "epoch": 0.1221160893997743, + "grad_norm": 0.09906231611967087, + "learning_rate": 6.560000000000001e-06, + "loss": 1.4587, + "step": 328 + }, + { + "epoch": 0.12248839455038336, + "grad_norm": 0.09170754998922348, + "learning_rate": 6.5800000000000005e-06, + "loss": 1.4604, + "step": 329 + }, + { + "epoch": 0.12286069970099242, + "grad_norm": 0.08741523325443268, + "learning_rate": 6.600000000000001e-06, + "loss": 1.4563, + "step": 330 + }, + { + "epoch": 0.12323300485160149, + "grad_norm": 0.1020127385854721, + "learning_rate": 6.620000000000001e-06, + "loss": 1.4656, + "step": 331 + }, + { + "epoch": 0.12360531000221056, + "grad_norm": 0.09506560862064362, + "learning_rate": 6.640000000000001e-06, + "loss": 1.4752, + "step": 332 + }, + { + "epoch": 0.12397761515281963, + "grad_norm": 0.1042810007929802, + "learning_rate": 6.660000000000001e-06, + "loss": 1.4593, + "step": 333 + }, + { + "epoch": 0.1243499203034287, + "grad_norm": 0.09513174742460251, + "learning_rate": 6.680000000000001e-06, + "loss": 1.4636, + "step": 334 + }, + { + "epoch": 0.12472222545403777, + "grad_norm": 0.09068495035171509, + "learning_rate": 6.700000000000001e-06, + "loss": 1.4729, + "step": 335 + }, + { + "epoch": 0.12509453060464684, + "grad_norm": 0.09143901616334915, + "learning_rate": 6.720000000000001e-06, + "loss": 1.4528, + "step": 336 + }, + { + "epoch": 0.1254668357552559, + "grad_norm": 0.08922934532165527, + "learning_rate": 6.740000000000001e-06, + "loss": 1.4668, + "step": 337 + }, + { + "epoch": 0.12583914090586498, + "grad_norm": 0.10561850666999817, + "learning_rate": 6.760000000000001e-06, + "loss": 1.4536, + "step": 338 + }, + { + "epoch": 0.12621144605647402, + "grad_norm": 0.09490080922842026, + "learning_rate": 6.780000000000001e-06, + "loss": 1.4604, + "step": 339 + }, + { + "epoch": 0.1265837512070831, + "grad_norm": 0.09219174832105637, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4577, + "step": 340 + }, + { + "epoch": 0.12695605635769217, + "grad_norm": 0.11259440332651138, + "learning_rate": 6.820000000000001e-06, + "loss": 1.461, + "step": 341 + }, + { + "epoch": 0.12732836150830124, + "grad_norm": 0.10154417902231216, + "learning_rate": 6.8400000000000014e-06, + "loss": 1.4539, + "step": 342 + }, + { + "epoch": 0.1277006666589103, + "grad_norm": 0.08527655154466629, + "learning_rate": 6.860000000000001e-06, + "loss": 1.4516, + "step": 343 + }, + { + "epoch": 0.12807297180951938, + "grad_norm": 0.10697508603334427, + "learning_rate": 6.88e-06, + "loss": 1.4519, + "step": 344 + }, + { + "epoch": 0.12844527696012845, + "grad_norm": 0.1440954953432083, + "learning_rate": 6.9e-06, + "loss": 1.4394, + "step": 345 + }, + { + "epoch": 0.12881758211073752, + "grad_norm": 0.10360608994960785, + "learning_rate": 6.92e-06, + "loss": 1.4472, + "step": 346 + }, + { + "epoch": 0.1291898872613466, + "grad_norm": 0.09860967099666595, + "learning_rate": 6.9400000000000005e-06, + "loss": 1.4362, + "step": 347 + }, + { + "epoch": 0.12956219241195566, + "grad_norm": 0.10211774706840515, + "learning_rate": 6.96e-06, + "loss": 1.4497, + "step": 348 + }, + { + "epoch": 0.12993449756256473, + "grad_norm": 0.10965809226036072, + "learning_rate": 6.98e-06, + "loss": 1.457, + "step": 349 + }, + { + "epoch": 0.1303068027131738, + "grad_norm": 0.09159507602453232, + "learning_rate": 7e-06, + "loss": 1.4512, + "step": 350 + }, + { + "epoch": 0.13067910786378284, + "grad_norm": 0.11210039258003235, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.4418, + "step": 351 + }, + { + "epoch": 0.1310514130143919, + "grad_norm": 0.0950743705034256, + "learning_rate": 7.04e-06, + "loss": 1.4443, + "step": 352 + }, + { + "epoch": 0.13142371816500098, + "grad_norm": 0.10520799458026886, + "learning_rate": 7.06e-06, + "loss": 1.4587, + "step": 353 + }, + { + "epoch": 0.13179602331561005, + "grad_norm": 0.09574756771326065, + "learning_rate": 7.08e-06, + "loss": 1.4353, + "step": 354 + }, + { + "epoch": 0.13216832846621912, + "grad_norm": 0.09506519138813019, + "learning_rate": 7.100000000000001e-06, + "loss": 1.4425, + "step": 355 + }, + { + "epoch": 0.1325406336168282, + "grad_norm": 0.09055452048778534, + "learning_rate": 7.1200000000000004e-06, + "loss": 1.4514, + "step": 356 + }, + { + "epoch": 0.13291293876743726, + "grad_norm": 0.08780631422996521, + "learning_rate": 7.14e-06, + "loss": 1.4378, + "step": 357 + }, + { + "epoch": 0.13328524391804633, + "grad_norm": 0.09684795886278152, + "learning_rate": 7.16e-06, + "loss": 1.4358, + "step": 358 + }, + { + "epoch": 0.1336575490686554, + "grad_norm": 0.09261483699083328, + "learning_rate": 7.180000000000001e-06, + "loss": 1.4521, + "step": 359 + }, + { + "epoch": 0.13402985421926447, + "grad_norm": 0.09582682698965073, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.4393, + "step": 360 + }, + { + "epoch": 0.13440215936987354, + "grad_norm": 0.0970984697341919, + "learning_rate": 7.22e-06, + "loss": 1.4289, + "step": 361 + }, + { + "epoch": 0.1347744645204826, + "grad_norm": 0.10083375871181488, + "learning_rate": 7.24e-06, + "loss": 1.4493, + "step": 362 + }, + { + "epoch": 0.13514676967109168, + "grad_norm": 0.11022008955478668, + "learning_rate": 7.260000000000001e-06, + "loss": 1.4504, + "step": 363 + }, + { + "epoch": 0.13551907482170072, + "grad_norm": 0.09099752455949783, + "learning_rate": 7.280000000000001e-06, + "loss": 1.4439, + "step": 364 + }, + { + "epoch": 0.1358913799723098, + "grad_norm": 0.13994981348514557, + "learning_rate": 7.3e-06, + "loss": 1.4537, + "step": 365 + }, + { + "epoch": 0.13626368512291887, + "grad_norm": 0.0927768424153328, + "learning_rate": 7.32e-06, + "loss": 1.4577, + "step": 366 + }, + { + "epoch": 0.13663599027352794, + "grad_norm": 0.12550784647464752, + "learning_rate": 7.340000000000001e-06, + "loss": 1.4595, + "step": 367 + }, + { + "epoch": 0.137008295424137, + "grad_norm": 0.0998741090297699, + "learning_rate": 7.360000000000001e-06, + "loss": 1.4606, + "step": 368 + }, + { + "epoch": 0.13738060057474608, + "grad_norm": 0.09384217113256454, + "learning_rate": 7.3800000000000005e-06, + "loss": 1.4388, + "step": 369 + }, + { + "epoch": 0.13775290572535515, + "grad_norm": 0.10957197099924088, + "learning_rate": 7.4e-06, + "loss": 1.4463, + "step": 370 + }, + { + "epoch": 0.13812521087596422, + "grad_norm": 0.10382582992315292, + "learning_rate": 7.420000000000001e-06, + "loss": 1.4393, + "step": 371 + }, + { + "epoch": 0.1384975160265733, + "grad_norm": 0.09171084314584732, + "learning_rate": 7.440000000000001e-06, + "loss": 1.4493, + "step": 372 + }, + { + "epoch": 0.13886982117718236, + "grad_norm": 0.10419381409883499, + "learning_rate": 7.4600000000000006e-06, + "loss": 1.4589, + "step": 373 + }, + { + "epoch": 0.13924212632779143, + "grad_norm": 0.0996531993150711, + "learning_rate": 7.48e-06, + "loss": 1.4258, + "step": 374 + }, + { + "epoch": 0.1396144314784005, + "grad_norm": 0.09250695258378983, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4458, + "step": 375 + }, + { + "epoch": 0.13998673662900954, + "grad_norm": 0.09015652537345886, + "learning_rate": 7.520000000000001e-06, + "loss": 1.452, + "step": 376 + }, + { + "epoch": 0.1403590417796186, + "grad_norm": 0.11032413691282272, + "learning_rate": 7.540000000000001e-06, + "loss": 1.4307, + "step": 377 + }, + { + "epoch": 0.14073134693022768, + "grad_norm": 0.09179755300283432, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.4462, + "step": 378 + }, + { + "epoch": 0.14110365208083675, + "grad_norm": 0.09686073660850525, + "learning_rate": 7.58e-06, + "loss": 1.4362, + "step": 379 + }, + { + "epoch": 0.14147595723144582, + "grad_norm": 0.09349130094051361, + "learning_rate": 7.600000000000001e-06, + "loss": 1.4358, + "step": 380 + }, + { + "epoch": 0.1418482623820549, + "grad_norm": 0.10256840288639069, + "learning_rate": 7.620000000000001e-06, + "loss": 1.4469, + "step": 381 + }, + { + "epoch": 0.14222056753266396, + "grad_norm": 0.09944120049476624, + "learning_rate": 7.640000000000001e-06, + "loss": 1.4485, + "step": 382 + }, + { + "epoch": 0.14259287268327303, + "grad_norm": 0.09924148768186569, + "learning_rate": 7.660000000000001e-06, + "loss": 1.4382, + "step": 383 + }, + { + "epoch": 0.1429651778338821, + "grad_norm": 0.09792005270719528, + "learning_rate": 7.680000000000001e-06, + "loss": 1.4256, + "step": 384 + }, + { + "epoch": 0.14333748298449117, + "grad_norm": 0.08989827334880829, + "learning_rate": 7.7e-06, + "loss": 1.441, + "step": 385 + }, + { + "epoch": 0.14370978813510024, + "grad_norm": 0.13259711861610413, + "learning_rate": 7.72e-06, + "loss": 1.4604, + "step": 386 + }, + { + "epoch": 0.1440820932857093, + "grad_norm": 0.10013754665851593, + "learning_rate": 7.74e-06, + "loss": 1.4319, + "step": 387 + }, + { + "epoch": 0.14445439843631835, + "grad_norm": 0.10801331698894501, + "learning_rate": 7.76e-06, + "loss": 1.4358, + "step": 388 + }, + { + "epoch": 0.14482670358692742, + "grad_norm": 0.0908760130405426, + "learning_rate": 7.78e-06, + "loss": 1.4496, + "step": 389 + }, + { + "epoch": 0.1451990087375365, + "grad_norm": 0.09459855407476425, + "learning_rate": 7.800000000000002e-06, + "loss": 1.4482, + "step": 390 + }, + { + "epoch": 0.14557131388814556, + "grad_norm": 0.09493505209684372, + "learning_rate": 7.820000000000001e-06, + "loss": 1.443, + "step": 391 + }, + { + "epoch": 0.14594361903875464, + "grad_norm": 0.09393203258514404, + "learning_rate": 7.840000000000001e-06, + "loss": 1.4461, + "step": 392 + }, + { + "epoch": 0.1463159241893637, + "grad_norm": 0.0932602807879448, + "learning_rate": 7.860000000000001e-06, + "loss": 1.4193, + "step": 393 + }, + { + "epoch": 0.14668822933997278, + "grad_norm": 0.10115011781454086, + "learning_rate": 7.88e-06, + "loss": 1.4143, + "step": 394 + }, + { + "epoch": 0.14706053449058185, + "grad_norm": 0.10510238260030746, + "learning_rate": 7.9e-06, + "loss": 1.4377, + "step": 395 + }, + { + "epoch": 0.14743283964119092, + "grad_norm": 0.09857072681188583, + "learning_rate": 7.92e-06, + "loss": 1.4537, + "step": 396 + }, + { + "epoch": 0.14780514479179999, + "grad_norm": 0.11328350752592087, + "learning_rate": 7.94e-06, + "loss": 1.4274, + "step": 397 + }, + { + "epoch": 0.14817744994240906, + "grad_norm": 0.10213533043861389, + "learning_rate": 7.960000000000002e-06, + "loss": 1.4282, + "step": 398 + }, + { + "epoch": 0.14854975509301813, + "grad_norm": 0.1178865134716034, + "learning_rate": 7.980000000000002e-06, + "loss": 1.4333, + "step": 399 + }, + { + "epoch": 0.1489220602436272, + "grad_norm": 0.11789101362228394, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4413, + "step": 400 + }, + { + "epoch": 0.14929436539423624, + "grad_norm": 0.10633790493011475, + "learning_rate": 8.020000000000001e-06, + "loss": 1.4281, + "step": 401 + }, + { + "epoch": 0.1496666705448453, + "grad_norm": 0.11100617051124573, + "learning_rate": 8.040000000000001e-06, + "loss": 1.4299, + "step": 402 + }, + { + "epoch": 0.15003897569545438, + "grad_norm": 0.09987955540418625, + "learning_rate": 8.06e-06, + "loss": 1.4396, + "step": 403 + }, + { + "epoch": 0.15041128084606345, + "grad_norm": 0.13840006291866302, + "learning_rate": 8.08e-06, + "loss": 1.4378, + "step": 404 + }, + { + "epoch": 0.15078358599667252, + "grad_norm": 0.10468387603759766, + "learning_rate": 8.1e-06, + "loss": 1.4361, + "step": 405 + }, + { + "epoch": 0.1511558911472816, + "grad_norm": 0.09684094786643982, + "learning_rate": 8.120000000000002e-06, + "loss": 1.4408, + "step": 406 + }, + { + "epoch": 0.15152819629789066, + "grad_norm": 0.1058192178606987, + "learning_rate": 8.14e-06, + "loss": 1.4353, + "step": 407 + }, + { + "epoch": 0.15190050144849973, + "grad_norm": 0.11632636189460754, + "learning_rate": 8.16e-06, + "loss": 1.4331, + "step": 408 + }, + { + "epoch": 0.1522728065991088, + "grad_norm": 0.10219406336545944, + "learning_rate": 8.18e-06, + "loss": 1.4527, + "step": 409 + }, + { + "epoch": 0.15264511174971787, + "grad_norm": 0.110807865858078, + "learning_rate": 8.2e-06, + "loss": 1.4389, + "step": 410 + }, + { + "epoch": 0.15301741690032694, + "grad_norm": 0.09530390799045563, + "learning_rate": 8.220000000000001e-06, + "loss": 1.4252, + "step": 411 + }, + { + "epoch": 0.153389722050936, + "grad_norm": 0.10499103367328644, + "learning_rate": 8.24e-06, + "loss": 1.4429, + "step": 412 + }, + { + "epoch": 0.15376202720154505, + "grad_norm": 0.1094844788312912, + "learning_rate": 8.26e-06, + "loss": 1.4172, + "step": 413 + }, + { + "epoch": 0.15413433235215412, + "grad_norm": 0.10519949346780777, + "learning_rate": 8.28e-06, + "loss": 1.4371, + "step": 414 + }, + { + "epoch": 0.1545066375027632, + "grad_norm": 0.10553234815597534, + "learning_rate": 8.3e-06, + "loss": 1.4409, + "step": 415 + }, + { + "epoch": 0.15487894265337226, + "grad_norm": 0.10840829461812973, + "learning_rate": 8.32e-06, + "loss": 1.4345, + "step": 416 + }, + { + "epoch": 0.15525124780398133, + "grad_norm": 0.09976733475923538, + "learning_rate": 8.34e-06, + "loss": 1.4508, + "step": 417 + }, + { + "epoch": 0.1556235529545904, + "grad_norm": 0.10259655117988586, + "learning_rate": 8.36e-06, + "loss": 1.4196, + "step": 418 + }, + { + "epoch": 0.15599585810519948, + "grad_norm": 0.10481464862823486, + "learning_rate": 8.380000000000001e-06, + "loss": 1.4367, + "step": 419 + }, + { + "epoch": 0.15636816325580855, + "grad_norm": 0.1130518987774849, + "learning_rate": 8.400000000000001e-06, + "loss": 1.4479, + "step": 420 + }, + { + "epoch": 0.15674046840641762, + "grad_norm": 0.11022792756557465, + "learning_rate": 8.42e-06, + "loss": 1.421, + "step": 421 + }, + { + "epoch": 0.15711277355702669, + "grad_norm": 0.0973302498459816, + "learning_rate": 8.44e-06, + "loss": 1.4445, + "step": 422 + }, + { + "epoch": 0.15748507870763576, + "grad_norm": 0.09701745212078094, + "learning_rate": 8.46e-06, + "loss": 1.42, + "step": 423 + }, + { + "epoch": 0.15785738385824483, + "grad_norm": 0.09979844093322754, + "learning_rate": 8.48e-06, + "loss": 1.4221, + "step": 424 + }, + { + "epoch": 0.15822968900885387, + "grad_norm": 0.099349245429039, + "learning_rate": 8.5e-06, + "loss": 1.4131, + "step": 425 + }, + { + "epoch": 0.15860199415946294, + "grad_norm": 0.10651430487632751, + "learning_rate": 8.52e-06, + "loss": 1.4249, + "step": 426 + }, + { + "epoch": 0.158974299310072, + "grad_norm": 0.11765170842409134, + "learning_rate": 8.540000000000001e-06, + "loss": 1.4503, + "step": 427 + }, + { + "epoch": 0.15934660446068108, + "grad_norm": 0.10456649959087372, + "learning_rate": 8.560000000000001e-06, + "loss": 1.4261, + "step": 428 + }, + { + "epoch": 0.15971890961129015, + "grad_norm": 0.13042724132537842, + "learning_rate": 8.580000000000001e-06, + "loss": 1.4238, + "step": 429 + }, + { + "epoch": 0.16009121476189922, + "grad_norm": 0.10613591969013214, + "learning_rate": 8.6e-06, + "loss": 1.4097, + "step": 430 + }, + { + "epoch": 0.1604635199125083, + "grad_norm": 0.10777163505554199, + "learning_rate": 8.62e-06, + "loss": 1.4327, + "step": 431 + }, + { + "epoch": 0.16083582506311736, + "grad_norm": 0.1047920435667038, + "learning_rate": 8.64e-06, + "loss": 1.4247, + "step": 432 + }, + { + "epoch": 0.16120813021372643, + "grad_norm": 0.09752184897661209, + "learning_rate": 8.66e-06, + "loss": 1.4327, + "step": 433 + }, + { + "epoch": 0.1615804353643355, + "grad_norm": 0.1117076650261879, + "learning_rate": 8.68e-06, + "loss": 1.4327, + "step": 434 + }, + { + "epoch": 0.16195274051494457, + "grad_norm": 0.11003684252500534, + "learning_rate": 8.700000000000001e-06, + "loss": 1.4211, + "step": 435 + }, + { + "epoch": 0.16232504566555364, + "grad_norm": 0.09675519168376923, + "learning_rate": 8.720000000000001e-06, + "loss": 1.435, + "step": 436 + }, + { + "epoch": 0.16269735081616268, + "grad_norm": 0.09906768053770065, + "learning_rate": 8.740000000000001e-06, + "loss": 1.4381, + "step": 437 + }, + { + "epoch": 0.16306965596677175, + "grad_norm": 0.10028046369552612, + "learning_rate": 8.76e-06, + "loss": 1.4251, + "step": 438 + }, + { + "epoch": 0.16344196111738082, + "grad_norm": 0.11393031477928162, + "learning_rate": 8.78e-06, + "loss": 1.4234, + "step": 439 + }, + { + "epoch": 0.1638142662679899, + "grad_norm": 0.10733836144208908, + "learning_rate": 8.8e-06, + "loss": 1.4168, + "step": 440 + }, + { + "epoch": 0.16418657141859896, + "grad_norm": 0.10900059342384338, + "learning_rate": 8.82e-06, + "loss": 1.4356, + "step": 441 + }, + { + "epoch": 0.16455887656920803, + "grad_norm": 0.10403525829315186, + "learning_rate": 8.84e-06, + "loss": 1.4058, + "step": 442 + }, + { + "epoch": 0.1649311817198171, + "grad_norm": 0.10996660590171814, + "learning_rate": 8.860000000000002e-06, + "loss": 1.4317, + "step": 443 + }, + { + "epoch": 0.16530348687042618, + "grad_norm": 0.1056893914937973, + "learning_rate": 8.880000000000001e-06, + "loss": 1.4258, + "step": 444 + }, + { + "epoch": 0.16567579202103525, + "grad_norm": 0.10371371358633041, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4261, + "step": 445 + }, + { + "epoch": 0.16604809717164432, + "grad_norm": 0.10245388001203537, + "learning_rate": 8.920000000000001e-06, + "loss": 1.4237, + "step": 446 + }, + { + "epoch": 0.16642040232225339, + "grad_norm": 0.11346007138490677, + "learning_rate": 8.94e-06, + "loss": 1.4171, + "step": 447 + }, + { + "epoch": 0.16679270747286246, + "grad_norm": 0.1109023243188858, + "learning_rate": 8.96e-06, + "loss": 1.4311, + "step": 448 + }, + { + "epoch": 0.16716501262347153, + "grad_norm": 0.10533205419778824, + "learning_rate": 8.98e-06, + "loss": 1.4159, + "step": 449 + }, + { + "epoch": 0.16753731777408057, + "grad_norm": 0.1102309301495552, + "learning_rate": 9e-06, + "loss": 1.425, + "step": 450 + }, + { + "epoch": 0.16790962292468964, + "grad_norm": 0.10830759257078171, + "learning_rate": 9.020000000000002e-06, + "loss": 1.4188, + "step": 451 + }, + { + "epoch": 0.1682819280752987, + "grad_norm": 0.11227082461118698, + "learning_rate": 9.040000000000002e-06, + "loss": 1.4189, + "step": 452 + }, + { + "epoch": 0.16865423322590778, + "grad_norm": 0.11561018973588943, + "learning_rate": 9.060000000000001e-06, + "loss": 1.4225, + "step": 453 + }, + { + "epoch": 0.16902653837651685, + "grad_norm": 0.10718375444412231, + "learning_rate": 9.080000000000001e-06, + "loss": 1.437, + "step": 454 + }, + { + "epoch": 0.16939884352712592, + "grad_norm": 0.11734096705913544, + "learning_rate": 9.100000000000001e-06, + "loss": 1.4243, + "step": 455 + }, + { + "epoch": 0.169771148677735, + "grad_norm": 0.11969781666994095, + "learning_rate": 9.12e-06, + "loss": 1.4048, + "step": 456 + }, + { + "epoch": 0.17014345382834406, + "grad_norm": 0.1117396354675293, + "learning_rate": 9.14e-06, + "loss": 1.4281, + "step": 457 + }, + { + "epoch": 0.17051575897895313, + "grad_norm": 0.10435774177312851, + "learning_rate": 9.16e-06, + "loss": 1.429, + "step": 458 + }, + { + "epoch": 0.1708880641295622, + "grad_norm": 0.10865868628025055, + "learning_rate": 9.180000000000002e-06, + "loss": 1.4092, + "step": 459 + }, + { + "epoch": 0.17126036928017127, + "grad_norm": 0.1147746816277504, + "learning_rate": 9.200000000000002e-06, + "loss": 1.4197, + "step": 460 + }, + { + "epoch": 0.17163267443078034, + "grad_norm": 0.1095675453543663, + "learning_rate": 9.220000000000002e-06, + "loss": 1.425, + "step": 461 + }, + { + "epoch": 0.17200497958138938, + "grad_norm": 0.10761556029319763, + "learning_rate": 9.240000000000001e-06, + "loss": 1.4152, + "step": 462 + }, + { + "epoch": 0.17237728473199845, + "grad_norm": 0.11335011571645737, + "learning_rate": 9.260000000000001e-06, + "loss": 1.4224, + "step": 463 + }, + { + "epoch": 0.17274958988260752, + "grad_norm": 0.10845465958118439, + "learning_rate": 9.280000000000001e-06, + "loss": 1.4005, + "step": 464 + }, + { + "epoch": 0.1731218950332166, + "grad_norm": 0.1037868782877922, + "learning_rate": 9.3e-06, + "loss": 1.4277, + "step": 465 + }, + { + "epoch": 0.17349420018382566, + "grad_norm": 0.11700176447629929, + "learning_rate": 9.32e-06, + "loss": 1.4232, + "step": 466 + }, + { + "epoch": 0.17386650533443473, + "grad_norm": 0.10587536543607712, + "learning_rate": 9.340000000000002e-06, + "loss": 1.4399, + "step": 467 + }, + { + "epoch": 0.1742388104850438, + "grad_norm": 0.11157836019992828, + "learning_rate": 9.360000000000002e-06, + "loss": 1.4159, + "step": 468 + }, + { + "epoch": 0.17461111563565287, + "grad_norm": 0.10884707421064377, + "learning_rate": 9.38e-06, + "loss": 1.4088, + "step": 469 + }, + { + "epoch": 0.17498342078626195, + "grad_norm": 0.11198698729276657, + "learning_rate": 9.4e-06, + "loss": 1.3952, + "step": 470 + }, + { + "epoch": 0.17535572593687102, + "grad_norm": 0.1030043363571167, + "learning_rate": 9.42e-06, + "loss": 1.4294, + "step": 471 + }, + { + "epoch": 0.17572803108748009, + "grad_norm": 0.11066511273384094, + "learning_rate": 9.440000000000001e-06, + "loss": 1.4114, + "step": 472 + }, + { + "epoch": 0.17610033623808916, + "grad_norm": 0.1187884584069252, + "learning_rate": 9.460000000000001e-06, + "loss": 1.415, + "step": 473 + }, + { + "epoch": 0.1764726413886982, + "grad_norm": 0.09824799001216888, + "learning_rate": 9.48e-06, + "loss": 1.4166, + "step": 474 + }, + { + "epoch": 0.17684494653930727, + "grad_norm": 0.09938450902700424, + "learning_rate": 9.5e-06, + "loss": 1.4139, + "step": 475 + }, + { + "epoch": 0.17721725168991634, + "grad_norm": 0.11766441911458969, + "learning_rate": 9.52e-06, + "loss": 1.4234, + "step": 476 + }, + { + "epoch": 0.1775895568405254, + "grad_norm": 0.10825181007385254, + "learning_rate": 9.54e-06, + "loss": 1.4207, + "step": 477 + }, + { + "epoch": 0.17796186199113448, + "grad_norm": 0.10996980965137482, + "learning_rate": 9.56e-06, + "loss": 1.4228, + "step": 478 + }, + { + "epoch": 0.17833416714174355, + "grad_norm": 0.11518129706382751, + "learning_rate": 9.58e-06, + "loss": 1.4155, + "step": 479 + }, + { + "epoch": 0.17870647229235262, + "grad_norm": 0.1234855204820633, + "learning_rate": 9.600000000000001e-06, + "loss": 1.4309, + "step": 480 + }, + { + "epoch": 0.1790787774429617, + "grad_norm": 0.10811053216457367, + "learning_rate": 9.620000000000001e-06, + "loss": 1.4211, + "step": 481 + }, + { + "epoch": 0.17945108259357076, + "grad_norm": 0.10534091293811798, + "learning_rate": 9.640000000000001e-06, + "loss": 1.4175, + "step": 482 + }, + { + "epoch": 0.17982338774417983, + "grad_norm": 0.12702758610248566, + "learning_rate": 9.66e-06, + "loss": 1.4104, + "step": 483 + }, + { + "epoch": 0.1801956928947889, + "grad_norm": 0.10806774348020554, + "learning_rate": 9.68e-06, + "loss": 1.4136, + "step": 484 + }, + { + "epoch": 0.18056799804539797, + "grad_norm": 0.10667596012353897, + "learning_rate": 9.7e-06, + "loss": 1.4111, + "step": 485 + }, + { + "epoch": 0.18094030319600704, + "grad_norm": 0.1084335595369339, + "learning_rate": 9.72e-06, + "loss": 1.424, + "step": 486 + }, + { + "epoch": 0.18131260834661608, + "grad_norm": 0.10393060743808746, + "learning_rate": 9.74e-06, + "loss": 1.413, + "step": 487 + }, + { + "epoch": 0.18168491349722515, + "grad_norm": 0.11034102737903595, + "learning_rate": 9.760000000000001e-06, + "loss": 1.4215, + "step": 488 + }, + { + "epoch": 0.18205721864783422, + "grad_norm": 0.1163318008184433, + "learning_rate": 9.780000000000001e-06, + "loss": 1.4123, + "step": 489 + }, + { + "epoch": 0.1824295237984433, + "grad_norm": 0.11191318929195404, + "learning_rate": 9.800000000000001e-06, + "loss": 1.4292, + "step": 490 + }, + { + "epoch": 0.18280182894905236, + "grad_norm": 0.10725145041942596, + "learning_rate": 9.820000000000001e-06, + "loss": 1.4084, + "step": 491 + }, + { + "epoch": 0.18317413409966143, + "grad_norm": 0.11110781133174896, + "learning_rate": 9.84e-06, + "loss": 1.394, + "step": 492 + }, + { + "epoch": 0.1835464392502705, + "grad_norm": 0.10888979583978653, + "learning_rate": 9.86e-06, + "loss": 1.4129, + "step": 493 + }, + { + "epoch": 0.18391874440087957, + "grad_norm": 0.11001749336719513, + "learning_rate": 9.88e-06, + "loss": 1.4248, + "step": 494 + }, + { + "epoch": 0.18429104955148864, + "grad_norm": 0.10707477480173111, + "learning_rate": 9.9e-06, + "loss": 1.4088, + "step": 495 + }, + { + "epoch": 0.18466335470209772, + "grad_norm": 0.12006059288978577, + "learning_rate": 9.920000000000002e-06, + "loss": 1.4126, + "step": 496 + }, + { + "epoch": 0.18503565985270679, + "grad_norm": 0.10763873159885406, + "learning_rate": 9.940000000000001e-06, + "loss": 1.4219, + "step": 497 + }, + { + "epoch": 0.18540796500331586, + "grad_norm": 0.10750039666891098, + "learning_rate": 9.960000000000001e-06, + "loss": 1.3938, + "step": 498 + }, + { + "epoch": 0.1857802701539249, + "grad_norm": 0.10844654589891434, + "learning_rate": 9.980000000000001e-06, + "loss": 1.4257, + "step": 499 + }, + { + "epoch": 0.18615257530453397, + "grad_norm": 0.10777752101421356, + "learning_rate": 1e-05, + "loss": 1.4042, + "step": 500 + }, + { + "epoch": 0.18615257530453397, + "eval_loss": 1.3990447521209717, + "eval_runtime": 16.7137, + "eval_samples_per_second": 103.747, + "eval_steps_per_second": 5.205, + "step": 500 + }, + { + "epoch": 0.18652488045514304, + "grad_norm": 0.12805776298046112, + "learning_rate": 1.002e-05, + "loss": 1.3923, + "step": 501 + }, + { + "epoch": 0.1868971856057521, + "grad_norm": 0.11597350984811783, + "learning_rate": 1.004e-05, + "loss": 1.4246, + "step": 502 + }, + { + "epoch": 0.18726949075636118, + "grad_norm": 0.12692782282829285, + "learning_rate": 1.006e-05, + "loss": 1.4091, + "step": 503 + }, + { + "epoch": 0.18764179590697025, + "grad_norm": 0.11857876926660538, + "learning_rate": 1.008e-05, + "loss": 1.4025, + "step": 504 + }, + { + "epoch": 0.18801410105757932, + "grad_norm": 0.11153510212898254, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.4069, + "step": 505 + }, + { + "epoch": 0.1883864062081884, + "grad_norm": 0.11478788405656815, + "learning_rate": 1.0120000000000001e-05, + "loss": 1.4043, + "step": 506 + }, + { + "epoch": 0.18875871135879746, + "grad_norm": 0.11333264410495758, + "learning_rate": 1.0140000000000001e-05, + "loss": 1.4, + "step": 507 + }, + { + "epoch": 0.18913101650940653, + "grad_norm": 0.12227758020162582, + "learning_rate": 1.0160000000000001e-05, + "loss": 1.4023, + "step": 508 + }, + { + "epoch": 0.1895033216600156, + "grad_norm": 0.12102984637022018, + "learning_rate": 1.018e-05, + "loss": 1.4056, + "step": 509 + }, + { + "epoch": 0.18987562681062467, + "grad_norm": 0.11923938989639282, + "learning_rate": 1.02e-05, + "loss": 1.4019, + "step": 510 + }, + { + "epoch": 0.1902479319612337, + "grad_norm": 0.12334270030260086, + "learning_rate": 1.022e-05, + "loss": 1.3959, + "step": 511 + }, + { + "epoch": 0.19062023711184278, + "grad_norm": 0.12032870948314667, + "learning_rate": 1.024e-05, + "loss": 1.3994, + "step": 512 + }, + { + "epoch": 0.19099254226245185, + "grad_norm": 0.1170472502708435, + "learning_rate": 1.0260000000000002e-05, + "loss": 1.3959, + "step": 513 + }, + { + "epoch": 0.19136484741306092, + "grad_norm": 0.10825354605913162, + "learning_rate": 1.0280000000000002e-05, + "loss": 1.4031, + "step": 514 + }, + { + "epoch": 0.19173715256367, + "grad_norm": 0.11196181178092957, + "learning_rate": 1.0300000000000001e-05, + "loss": 1.4048, + "step": 515 + }, + { + "epoch": 0.19210945771427906, + "grad_norm": 0.11114434897899628, + "learning_rate": 1.0320000000000001e-05, + "loss": 1.4064, + "step": 516 + }, + { + "epoch": 0.19248176286488813, + "grad_norm": 0.11616680771112442, + "learning_rate": 1.0340000000000001e-05, + "loss": 1.3967, + "step": 517 + }, + { + "epoch": 0.1928540680154972, + "grad_norm": 0.11657610535621643, + "learning_rate": 1.036e-05, + "loss": 1.4152, + "step": 518 + }, + { + "epoch": 0.19322637316610627, + "grad_norm": 0.10928953438997269, + "learning_rate": 1.038e-05, + "loss": 1.4026, + "step": 519 + }, + { + "epoch": 0.19359867831671534, + "grad_norm": 0.11354228109121323, + "learning_rate": 1.04e-05, + "loss": 1.4026, + "step": 520 + }, + { + "epoch": 0.19397098346732441, + "grad_norm": 0.12005306780338287, + "learning_rate": 1.0420000000000002e-05, + "loss": 1.4094, + "step": 521 + }, + { + "epoch": 0.19434328861793349, + "grad_norm": 0.12376662343740463, + "learning_rate": 1.0440000000000002e-05, + "loss": 1.4076, + "step": 522 + }, + { + "epoch": 0.19471559376854256, + "grad_norm": 0.1191818043589592, + "learning_rate": 1.0460000000000001e-05, + "loss": 1.4055, + "step": 523 + }, + { + "epoch": 0.1950878989191516, + "grad_norm": 0.11545392870903015, + "learning_rate": 1.0480000000000001e-05, + "loss": 1.3881, + "step": 524 + }, + { + "epoch": 0.19546020406976067, + "grad_norm": 0.1315152794122696, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.4119, + "step": 525 + }, + { + "epoch": 0.19583250922036974, + "grad_norm": 0.11896664649248123, + "learning_rate": 1.0520000000000001e-05, + "loss": 1.4165, + "step": 526 + }, + { + "epoch": 0.1962048143709788, + "grad_norm": 0.11113154143095016, + "learning_rate": 1.054e-05, + "loss": 1.3992, + "step": 527 + }, + { + "epoch": 0.19657711952158788, + "grad_norm": 0.13099145889282227, + "learning_rate": 1.056e-05, + "loss": 1.3997, + "step": 528 + }, + { + "epoch": 0.19694942467219695, + "grad_norm": 0.11786054819822311, + "learning_rate": 1.0580000000000002e-05, + "loss": 1.3967, + "step": 529 + }, + { + "epoch": 0.19732172982280602, + "grad_norm": 0.1341516375541687, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.3993, + "step": 530 + }, + { + "epoch": 0.1976940349734151, + "grad_norm": 0.12682707607746124, + "learning_rate": 1.0620000000000002e-05, + "loss": 1.4148, + "step": 531 + }, + { + "epoch": 0.19806634012402416, + "grad_norm": 0.1188652366399765, + "learning_rate": 1.0640000000000001e-05, + "loss": 1.3933, + "step": 532 + }, + { + "epoch": 0.19843864527463323, + "grad_norm": 0.11299144476652145, + "learning_rate": 1.0660000000000001e-05, + "loss": 1.4004, + "step": 533 + }, + { + "epoch": 0.1988109504252423, + "grad_norm": 0.12654365599155426, + "learning_rate": 1.0680000000000001e-05, + "loss": 1.3929, + "step": 534 + }, + { + "epoch": 0.19918325557585137, + "grad_norm": 0.12072401493787766, + "learning_rate": 1.0700000000000001e-05, + "loss": 1.3967, + "step": 535 + }, + { + "epoch": 0.1995555607264604, + "grad_norm": 0.12298454344272614, + "learning_rate": 1.072e-05, + "loss": 1.4072, + "step": 536 + }, + { + "epoch": 0.19992786587706948, + "grad_norm": 0.127255380153656, + "learning_rate": 1.0740000000000002e-05, + "loss": 1.408, + "step": 537 + }, + { + "epoch": 0.20030017102767855, + "grad_norm": 0.12813320755958557, + "learning_rate": 1.0760000000000002e-05, + "loss": 1.4025, + "step": 538 + }, + { + "epoch": 0.20067247617828762, + "grad_norm": 0.12207869440317154, + "learning_rate": 1.0780000000000002e-05, + "loss": 1.3844, + "step": 539 + }, + { + "epoch": 0.2010447813288967, + "grad_norm": 0.11978594213724136, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.3953, + "step": 540 + }, + { + "epoch": 0.20141708647950576, + "grad_norm": 0.12205009162425995, + "learning_rate": 1.0820000000000001e-05, + "loss": 1.3904, + "step": 541 + }, + { + "epoch": 0.20178939163011483, + "grad_norm": 0.11987481266260147, + "learning_rate": 1.0840000000000001e-05, + "loss": 1.3985, + "step": 542 + }, + { + "epoch": 0.2021616967807239, + "grad_norm": 0.12077482044696808, + "learning_rate": 1.0860000000000001e-05, + "loss": 1.4081, + "step": 543 + }, + { + "epoch": 0.20253400193133297, + "grad_norm": 0.11721371859312057, + "learning_rate": 1.0880000000000001e-05, + "loss": 1.4001, + "step": 544 + }, + { + "epoch": 0.20290630708194204, + "grad_norm": 0.11542621999979019, + "learning_rate": 1.0900000000000002e-05, + "loss": 1.4109, + "step": 545 + }, + { + "epoch": 0.20327861223255111, + "grad_norm": 0.12170755863189697, + "learning_rate": 1.0920000000000002e-05, + "loss": 1.3943, + "step": 546 + }, + { + "epoch": 0.20365091738316018, + "grad_norm": 0.11331729590892792, + "learning_rate": 1.0940000000000002e-05, + "loss": 1.404, + "step": 547 + }, + { + "epoch": 0.20402322253376923, + "grad_norm": 0.12473474442958832, + "learning_rate": 1.0960000000000002e-05, + "loss": 1.399, + "step": 548 + }, + { + "epoch": 0.2043955276843783, + "grad_norm": 0.12112352252006531, + "learning_rate": 1.0980000000000002e-05, + "loss": 1.394, + "step": 549 + }, + { + "epoch": 0.20476783283498737, + "grad_norm": 0.11570420861244202, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3961, + "step": 550 + }, + { + "epoch": 0.20514013798559644, + "grad_norm": 0.12518157064914703, + "learning_rate": 1.1020000000000001e-05, + "loss": 1.3929, + "step": 551 + }, + { + "epoch": 0.2055124431362055, + "grad_norm": 0.12412535399198532, + "learning_rate": 1.1040000000000001e-05, + "loss": 1.3953, + "step": 552 + }, + { + "epoch": 0.20588474828681458, + "grad_norm": 0.12580837309360504, + "learning_rate": 1.1060000000000003e-05, + "loss": 1.3887, + "step": 553 + }, + { + "epoch": 0.20625705343742365, + "grad_norm": 0.12166325002908707, + "learning_rate": 1.1080000000000002e-05, + "loss": 1.4035, + "step": 554 + }, + { + "epoch": 0.20662935858803272, + "grad_norm": 0.11380494385957718, + "learning_rate": 1.1100000000000002e-05, + "loss": 1.3881, + "step": 555 + }, + { + "epoch": 0.2070016637386418, + "grad_norm": 0.12607711553573608, + "learning_rate": 1.1120000000000002e-05, + "loss": 1.3911, + "step": 556 + }, + { + "epoch": 0.20737396888925086, + "grad_norm": 0.1209443137049675, + "learning_rate": 1.1140000000000002e-05, + "loss": 1.4025, + "step": 557 + }, + { + "epoch": 0.20774627403985993, + "grad_norm": 0.11875788867473602, + "learning_rate": 1.1160000000000002e-05, + "loss": 1.382, + "step": 558 + }, + { + "epoch": 0.208118579190469, + "grad_norm": 0.12261742353439331, + "learning_rate": 1.1180000000000001e-05, + "loss": 1.39, + "step": 559 + }, + { + "epoch": 0.20849088434107807, + "grad_norm": 0.13006705045700073, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.3926, + "step": 560 + }, + { + "epoch": 0.2088631894916871, + "grad_norm": 0.1237824410200119, + "learning_rate": 1.1220000000000003e-05, + "loss": 1.3977, + "step": 561 + }, + { + "epoch": 0.20923549464229618, + "grad_norm": 0.12436344474554062, + "learning_rate": 1.1240000000000002e-05, + "loss": 1.3924, + "step": 562 + }, + { + "epoch": 0.20960779979290525, + "grad_norm": 0.12768109142780304, + "learning_rate": 1.126e-05, + "loss": 1.4062, + "step": 563 + }, + { + "epoch": 0.20998010494351432, + "grad_norm": 0.12379388511180878, + "learning_rate": 1.128e-05, + "loss": 1.3951, + "step": 564 + }, + { + "epoch": 0.2103524100941234, + "grad_norm": 0.12182633578777313, + "learning_rate": 1.13e-05, + "loss": 1.3969, + "step": 565 + }, + { + "epoch": 0.21072471524473246, + "grad_norm": 0.12477164715528488, + "learning_rate": 1.132e-05, + "loss": 1.3954, + "step": 566 + }, + { + "epoch": 0.21109702039534153, + "grad_norm": 0.12322055548429489, + "learning_rate": 1.134e-05, + "loss": 1.4005, + "step": 567 + }, + { + "epoch": 0.2114693255459506, + "grad_norm": 0.12597453594207764, + "learning_rate": 1.136e-05, + "loss": 1.3974, + "step": 568 + }, + { + "epoch": 0.21184163069655967, + "grad_norm": 0.13351480662822723, + "learning_rate": 1.138e-05, + "loss": 1.3914, + "step": 569 + }, + { + "epoch": 0.21221393584716874, + "grad_norm": 0.13195490837097168, + "learning_rate": 1.14e-05, + "loss": 1.4162, + "step": 570 + }, + { + "epoch": 0.21258624099777781, + "grad_norm": 0.12694774568080902, + "learning_rate": 1.142e-05, + "loss": 1.3844, + "step": 571 + }, + { + "epoch": 0.21295854614838688, + "grad_norm": 0.12894363701343536, + "learning_rate": 1.144e-05, + "loss": 1.387, + "step": 572 + }, + { + "epoch": 0.21333085129899593, + "grad_norm": 0.11793356388807297, + "learning_rate": 1.146e-05, + "loss": 1.3881, + "step": 573 + }, + { + "epoch": 0.213703156449605, + "grad_norm": 0.13151799142360687, + "learning_rate": 1.148e-05, + "loss": 1.3872, + "step": 574 + }, + { + "epoch": 0.21407546160021407, + "grad_norm": 0.1286250352859497, + "learning_rate": 1.15e-05, + "loss": 1.3668, + "step": 575 + }, + { + "epoch": 0.21444776675082314, + "grad_norm": 0.13224467635154724, + "learning_rate": 1.152e-05, + "loss": 1.4045, + "step": 576 + }, + { + "epoch": 0.2148200719014322, + "grad_norm": 0.12700480222702026, + "learning_rate": 1.154e-05, + "loss": 1.3814, + "step": 577 + }, + { + "epoch": 0.21519237705204128, + "grad_norm": 0.13367070257663727, + "learning_rate": 1.156e-05, + "loss": 1.3868, + "step": 578 + }, + { + "epoch": 0.21556468220265035, + "grad_norm": 0.12190359085798264, + "learning_rate": 1.1580000000000001e-05, + "loss": 1.3981, + "step": 579 + }, + { + "epoch": 0.21593698735325942, + "grad_norm": 0.1240309402346611, + "learning_rate": 1.16e-05, + "loss": 1.3945, + "step": 580 + }, + { + "epoch": 0.2163092925038685, + "grad_norm": 0.15947964787483215, + "learning_rate": 1.162e-05, + "loss": 1.3889, + "step": 581 + }, + { + "epoch": 0.21668159765447756, + "grad_norm": 0.130709707736969, + "learning_rate": 1.164e-05, + "loss": 1.3987, + "step": 582 + }, + { + "epoch": 0.21705390280508663, + "grad_norm": 0.12745317816734314, + "learning_rate": 1.166e-05, + "loss": 1.3802, + "step": 583 + }, + { + "epoch": 0.2174262079556957, + "grad_norm": 0.12209773063659668, + "learning_rate": 1.168e-05, + "loss": 1.3798, + "step": 584 + }, + { + "epoch": 0.21779851310630474, + "grad_norm": 0.14180858433246613, + "learning_rate": 1.17e-05, + "loss": 1.3814, + "step": 585 + }, + { + "epoch": 0.2181708182569138, + "grad_norm": 0.11788824945688248, + "learning_rate": 1.172e-05, + "loss": 1.3758, + "step": 586 + }, + { + "epoch": 0.21854312340752288, + "grad_norm": 0.12975317239761353, + "learning_rate": 1.1740000000000001e-05, + "loss": 1.395, + "step": 587 + }, + { + "epoch": 0.21891542855813195, + "grad_norm": 0.12418822199106216, + "learning_rate": 1.1760000000000001e-05, + "loss": 1.4044, + "step": 588 + }, + { + "epoch": 0.21928773370874102, + "grad_norm": 0.12132619321346283, + "learning_rate": 1.178e-05, + "loss": 1.3866, + "step": 589 + }, + { + "epoch": 0.2196600388593501, + "grad_norm": 0.1325748711824417, + "learning_rate": 1.18e-05, + "loss": 1.3903, + "step": 590 + }, + { + "epoch": 0.22003234400995916, + "grad_norm": 0.1266520470380783, + "learning_rate": 1.182e-05, + "loss": 1.3894, + "step": 591 + }, + { + "epoch": 0.22040464916056823, + "grad_norm": 0.12683559954166412, + "learning_rate": 1.184e-05, + "loss": 1.3948, + "step": 592 + }, + { + "epoch": 0.2207769543111773, + "grad_norm": 0.12880323827266693, + "learning_rate": 1.186e-05, + "loss": 1.3756, + "step": 593 + }, + { + "epoch": 0.22114925946178637, + "grad_norm": 0.12198843061923981, + "learning_rate": 1.188e-05, + "loss": 1.3821, + "step": 594 + }, + { + "epoch": 0.22152156461239544, + "grad_norm": 0.13132554292678833, + "learning_rate": 1.1900000000000001e-05, + "loss": 1.3877, + "step": 595 + }, + { + "epoch": 0.22189386976300451, + "grad_norm": 0.1291092336177826, + "learning_rate": 1.1920000000000001e-05, + "loss": 1.4057, + "step": 596 + }, + { + "epoch": 0.22226617491361356, + "grad_norm": 0.13351795077323914, + "learning_rate": 1.1940000000000001e-05, + "loss": 1.3652, + "step": 597 + }, + { + "epoch": 0.22263848006422263, + "grad_norm": 0.12667742371559143, + "learning_rate": 1.196e-05, + "loss": 1.386, + "step": 598 + }, + { + "epoch": 0.2230107852148317, + "grad_norm": 0.13008779287338257, + "learning_rate": 1.198e-05, + "loss": 1.3887, + "step": 599 + }, + { + "epoch": 0.22338309036544077, + "grad_norm": 0.12079621851444244, + "learning_rate": 1.2e-05, + "loss": 1.3855, + "step": 600 + }, + { + "epoch": 0.22375539551604984, + "grad_norm": 0.13083826005458832, + "learning_rate": 1.202e-05, + "loss": 1.3819, + "step": 601 + }, + { + "epoch": 0.2241277006666589, + "grad_norm": 0.13189451396465302, + "learning_rate": 1.204e-05, + "loss": 1.3667, + "step": 602 + }, + { + "epoch": 0.22450000581726798, + "grad_norm": 0.1339392513036728, + "learning_rate": 1.2060000000000001e-05, + "loss": 1.3802, + "step": 603 + }, + { + "epoch": 0.22487231096787705, + "grad_norm": 0.12915685772895813, + "learning_rate": 1.2080000000000001e-05, + "loss": 1.3871, + "step": 604 + }, + { + "epoch": 0.22524461611848612, + "grad_norm": 0.13761506974697113, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.392, + "step": 605 + }, + { + "epoch": 0.2256169212690952, + "grad_norm": 0.1275719255208969, + "learning_rate": 1.2120000000000001e-05, + "loss": 1.3715, + "step": 606 + }, + { + "epoch": 0.22598922641970426, + "grad_norm": 0.16112855076789856, + "learning_rate": 1.214e-05, + "loss": 1.3759, + "step": 607 + }, + { + "epoch": 0.22636153157031333, + "grad_norm": 0.13108204305171967, + "learning_rate": 1.216e-05, + "loss": 1.375, + "step": 608 + }, + { + "epoch": 0.2267338367209224, + "grad_norm": 0.13481353223323822, + "learning_rate": 1.218e-05, + "loss": 1.3704, + "step": 609 + }, + { + "epoch": 0.22710614187153144, + "grad_norm": 0.144792377948761, + "learning_rate": 1.22e-05, + "loss": 1.3865, + "step": 610 + }, + { + "epoch": 0.2274784470221405, + "grad_norm": 0.13363252580165863, + "learning_rate": 1.2220000000000002e-05, + "loss": 1.3826, + "step": 611 + }, + { + "epoch": 0.22785075217274958, + "grad_norm": 0.14733262360095978, + "learning_rate": 1.2240000000000001e-05, + "loss": 1.3875, + "step": 612 + }, + { + "epoch": 0.22822305732335865, + "grad_norm": 0.13364124298095703, + "learning_rate": 1.2260000000000001e-05, + "loss": 1.3885, + "step": 613 + }, + { + "epoch": 0.22859536247396772, + "grad_norm": 0.13215725123882294, + "learning_rate": 1.2280000000000001e-05, + "loss": 1.3644, + "step": 614 + }, + { + "epoch": 0.2289676676245768, + "grad_norm": 0.14032886922359467, + "learning_rate": 1.23e-05, + "loss": 1.3931, + "step": 615 + }, + { + "epoch": 0.22933997277518586, + "grad_norm": 0.15233881771564484, + "learning_rate": 1.232e-05, + "loss": 1.3771, + "step": 616 + }, + { + "epoch": 0.22971227792579493, + "grad_norm": 0.14122669398784637, + "learning_rate": 1.234e-05, + "loss": 1.3774, + "step": 617 + }, + { + "epoch": 0.230084583076404, + "grad_norm": 0.13818544149398804, + "learning_rate": 1.236e-05, + "loss": 1.374, + "step": 618 + }, + { + "epoch": 0.23045688822701307, + "grad_norm": 0.14192582666873932, + "learning_rate": 1.2380000000000002e-05, + "loss": 1.3773, + "step": 619 + }, + { + "epoch": 0.23082919337762214, + "grad_norm": 0.1523938775062561, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.3832, + "step": 620 + }, + { + "epoch": 0.23120149852823121, + "grad_norm": 0.1431209146976471, + "learning_rate": 1.2420000000000001e-05, + "loss": 1.3765, + "step": 621 + }, + { + "epoch": 0.23157380367884026, + "grad_norm": 0.13690458238124847, + "learning_rate": 1.2440000000000001e-05, + "loss": 1.3791, + "step": 622 + }, + { + "epoch": 0.23194610882944933, + "grad_norm": 0.14077654480934143, + "learning_rate": 1.2460000000000001e-05, + "loss": 1.3785, + "step": 623 + }, + { + "epoch": 0.2323184139800584, + "grad_norm": 0.15121321380138397, + "learning_rate": 1.248e-05, + "loss": 1.3945, + "step": 624 + }, + { + "epoch": 0.23269071913066747, + "grad_norm": 0.1371421217918396, + "learning_rate": 1.25e-05, + "loss": 1.376, + "step": 625 + }, + { + "epoch": 0.23306302428127654, + "grad_norm": 0.14253154397010803, + "learning_rate": 1.252e-05, + "loss": 1.3823, + "step": 626 + }, + { + "epoch": 0.2334353294318856, + "grad_norm": 0.1420959234237671, + "learning_rate": 1.254e-05, + "loss": 1.3702, + "step": 627 + }, + { + "epoch": 0.23380763458249468, + "grad_norm": 0.15104436874389648, + "learning_rate": 1.2560000000000002e-05, + "loss": 1.385, + "step": 628 + }, + { + "epoch": 0.23417993973310375, + "grad_norm": 0.1486346274614334, + "learning_rate": 1.2580000000000002e-05, + "loss": 1.3792, + "step": 629 + }, + { + "epoch": 0.23455224488371282, + "grad_norm": 0.16748331487178802, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.3916, + "step": 630 + }, + { + "epoch": 0.2349245500343219, + "grad_norm": 0.14780151844024658, + "learning_rate": 1.2620000000000001e-05, + "loss": 1.3827, + "step": 631 + }, + { + "epoch": 0.23529685518493096, + "grad_norm": 0.152265265583992, + "learning_rate": 1.2640000000000001e-05, + "loss": 1.3812, + "step": 632 + }, + { + "epoch": 0.23566916033554003, + "grad_norm": 0.1454855501651764, + "learning_rate": 1.266e-05, + "loss": 1.3907, + "step": 633 + }, + { + "epoch": 0.23604146548614907, + "grad_norm": 0.14732711017131805, + "learning_rate": 1.268e-05, + "loss": 1.3651, + "step": 634 + }, + { + "epoch": 0.23641377063675814, + "grad_norm": 0.1476028710603714, + "learning_rate": 1.27e-05, + "loss": 1.3723, + "step": 635 + }, + { + "epoch": 0.2367860757873672, + "grad_norm": 0.14366750419139862, + "learning_rate": 1.2720000000000002e-05, + "loss": 1.3904, + "step": 636 + }, + { + "epoch": 0.23715838093797628, + "grad_norm": 0.15820929408073425, + "learning_rate": 1.2740000000000002e-05, + "loss": 1.3852, + "step": 637 + }, + { + "epoch": 0.23753068608858535, + "grad_norm": 0.1551392823457718, + "learning_rate": 1.2760000000000001e-05, + "loss": 1.389, + "step": 638 + }, + { + "epoch": 0.23790299123919442, + "grad_norm": 0.13778699934482574, + "learning_rate": 1.2780000000000001e-05, + "loss": 1.3627, + "step": 639 + }, + { + "epoch": 0.2382752963898035, + "grad_norm": 0.13657543063163757, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.3941, + "step": 640 + }, + { + "epoch": 0.23864760154041256, + "grad_norm": 0.13666628301143646, + "learning_rate": 1.2820000000000001e-05, + "loss": 1.3729, + "step": 641 + }, + { + "epoch": 0.23901990669102163, + "grad_norm": 0.14898766577243805, + "learning_rate": 1.284e-05, + "loss": 1.3823, + "step": 642 + }, + { + "epoch": 0.2393922118416307, + "grad_norm": 0.1644555926322937, + "learning_rate": 1.286e-05, + "loss": 1.3794, + "step": 643 + }, + { + "epoch": 0.23976451699223977, + "grad_norm": 0.1226273775100708, + "learning_rate": 1.2880000000000002e-05, + "loss": 1.3631, + "step": 644 + }, + { + "epoch": 0.24013682214284884, + "grad_norm": 0.16430190205574036, + "learning_rate": 1.2900000000000002e-05, + "loss": 1.3936, + "step": 645 + }, + { + "epoch": 0.2405091272934579, + "grad_norm": 0.15783290565013885, + "learning_rate": 1.2920000000000002e-05, + "loss": 1.3951, + "step": 646 + }, + { + "epoch": 0.24088143244406696, + "grad_norm": 0.14612354338169098, + "learning_rate": 1.2940000000000001e-05, + "loss": 1.3701, + "step": 647 + }, + { + "epoch": 0.24125373759467603, + "grad_norm": 0.15059353411197662, + "learning_rate": 1.2960000000000001e-05, + "loss": 1.3649, + "step": 648 + }, + { + "epoch": 0.2416260427452851, + "grad_norm": 0.15658129751682281, + "learning_rate": 1.2980000000000001e-05, + "loss": 1.3793, + "step": 649 + }, + { + "epoch": 0.24199834789589417, + "grad_norm": 0.14705069363117218, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3816, + "step": 650 + }, + { + "epoch": 0.24237065304650324, + "grad_norm": 0.13878506422042847, + "learning_rate": 1.302e-05, + "loss": 1.3881, + "step": 651 + }, + { + "epoch": 0.2427429581971123, + "grad_norm": 0.1551000475883484, + "learning_rate": 1.3040000000000002e-05, + "loss": 1.3759, + "step": 652 + }, + { + "epoch": 0.24311526334772138, + "grad_norm": 0.15696851909160614, + "learning_rate": 1.3060000000000002e-05, + "loss": 1.3744, + "step": 653 + }, + { + "epoch": 0.24348756849833045, + "grad_norm": 0.13662178814411163, + "learning_rate": 1.3080000000000002e-05, + "loss": 1.3699, + "step": 654 + }, + { + "epoch": 0.24385987364893952, + "grad_norm": 0.14424225687980652, + "learning_rate": 1.3100000000000002e-05, + "loss": 1.371, + "step": 655 + }, + { + "epoch": 0.2442321787995486, + "grad_norm": 0.17988137900829315, + "learning_rate": 1.3120000000000001e-05, + "loss": 1.3955, + "step": 656 + }, + { + "epoch": 0.24460448395015766, + "grad_norm": 0.1564609706401825, + "learning_rate": 1.3140000000000001e-05, + "loss": 1.3792, + "step": 657 + }, + { + "epoch": 0.24497678910076673, + "grad_norm": 0.1378934532403946, + "learning_rate": 1.3160000000000001e-05, + "loss": 1.3501, + "step": 658 + }, + { + "epoch": 0.24534909425137577, + "grad_norm": 0.14894217252731323, + "learning_rate": 1.3180000000000001e-05, + "loss": 1.3716, + "step": 659 + }, + { + "epoch": 0.24572139940198484, + "grad_norm": 0.1473313868045807, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.3666, + "step": 660 + }, + { + "epoch": 0.2460937045525939, + "grad_norm": 0.14587174355983734, + "learning_rate": 1.3220000000000002e-05, + "loss": 1.3887, + "step": 661 + }, + { + "epoch": 0.24646600970320298, + "grad_norm": 0.14664749801158905, + "learning_rate": 1.3240000000000002e-05, + "loss": 1.3727, + "step": 662 + }, + { + "epoch": 0.24683831485381205, + "grad_norm": 0.1472824066877365, + "learning_rate": 1.3260000000000002e-05, + "loss": 1.3676, + "step": 663 + }, + { + "epoch": 0.24721062000442112, + "grad_norm": 0.1401338130235672, + "learning_rate": 1.3280000000000002e-05, + "loss": 1.3873, + "step": 664 + }, + { + "epoch": 0.2475829251550302, + "grad_norm": 0.13379812240600586, + "learning_rate": 1.3300000000000001e-05, + "loss": 1.3625, + "step": 665 + }, + { + "epoch": 0.24795523030563926, + "grad_norm": 0.14346960186958313, + "learning_rate": 1.3320000000000001e-05, + "loss": 1.359, + "step": 666 + }, + { + "epoch": 0.24832753545624833, + "grad_norm": 0.14162355661392212, + "learning_rate": 1.3340000000000001e-05, + "loss": 1.3702, + "step": 667 + }, + { + "epoch": 0.2486998406068574, + "grad_norm": 0.15658289194107056, + "learning_rate": 1.3360000000000003e-05, + "loss": 1.3698, + "step": 668 + }, + { + "epoch": 0.24907214575746647, + "grad_norm": 0.13803306221961975, + "learning_rate": 1.3380000000000002e-05, + "loss": 1.3588, + "step": 669 + }, + { + "epoch": 0.24944445090807554, + "grad_norm": 0.14932705461978912, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.3748, + "step": 670 + }, + { + "epoch": 0.24981675605868459, + "grad_norm": 0.3856603503227234, + "learning_rate": 1.3420000000000002e-05, + "loss": 1.3876, + "step": 671 + }, + { + "epoch": 0.2501890612092937, + "grad_norm": 0.14300397038459778, + "learning_rate": 1.3440000000000002e-05, + "loss": 1.36, + "step": 672 + }, + { + "epoch": 0.2505613663599027, + "grad_norm": 0.15117305517196655, + "learning_rate": 1.3460000000000002e-05, + "loss": 1.3943, + "step": 673 + }, + { + "epoch": 0.2509336715105118, + "grad_norm": 0.15544290840625763, + "learning_rate": 1.3480000000000001e-05, + "loss": 1.3826, + "step": 674 + }, + { + "epoch": 0.25130597666112087, + "grad_norm": 0.14352373778820038, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.3577, + "step": 675 + }, + { + "epoch": 0.25167828181172996, + "grad_norm": 0.14757125079631805, + "learning_rate": 1.3520000000000003e-05, + "loss": 1.3706, + "step": 676 + }, + { + "epoch": 0.252050586962339, + "grad_norm": 0.14953893423080444, + "learning_rate": 1.3540000000000003e-05, + "loss": 1.3656, + "step": 677 + }, + { + "epoch": 0.25242289211294805, + "grad_norm": 0.15957669913768768, + "learning_rate": 1.3560000000000002e-05, + "loss": 1.3569, + "step": 678 + }, + { + "epoch": 0.25279519726355715, + "grad_norm": 0.15265138447284698, + "learning_rate": 1.3580000000000002e-05, + "loss": 1.3636, + "step": 679 + }, + { + "epoch": 0.2531675024141662, + "grad_norm": 0.1513211727142334, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.3717, + "step": 680 + }, + { + "epoch": 0.2535398075647753, + "grad_norm": 0.15139970183372498, + "learning_rate": 1.3620000000000002e-05, + "loss": 1.3776, + "step": 681 + }, + { + "epoch": 0.25391211271538433, + "grad_norm": 0.15319296717643738, + "learning_rate": 1.3640000000000002e-05, + "loss": 1.3674, + "step": 682 + }, + { + "epoch": 0.25428441786599343, + "grad_norm": 0.14396759867668152, + "learning_rate": 1.3660000000000001e-05, + "loss": 1.3739, + "step": 683 + }, + { + "epoch": 0.25465672301660247, + "grad_norm": 0.14723047614097595, + "learning_rate": 1.3680000000000003e-05, + "loss": 1.3777, + "step": 684 + }, + { + "epoch": 0.25502902816721157, + "grad_norm": 0.13962876796722412, + "learning_rate": 1.3700000000000003e-05, + "loss": 1.3676, + "step": 685 + }, + { + "epoch": 0.2554013333178206, + "grad_norm": 0.14297540485858917, + "learning_rate": 1.3720000000000002e-05, + "loss": 1.3725, + "step": 686 + }, + { + "epoch": 0.2557736384684297, + "grad_norm": 0.14365188777446747, + "learning_rate": 1.3740000000000002e-05, + "loss": 1.3547, + "step": 687 + }, + { + "epoch": 0.25614594361903875, + "grad_norm": 0.14428015053272247, + "learning_rate": 1.376e-05, + "loss": 1.3883, + "step": 688 + }, + { + "epoch": 0.25651824876964785, + "grad_norm": 0.14234168827533722, + "learning_rate": 1.378e-05, + "loss": 1.345, + "step": 689 + }, + { + "epoch": 0.2568905539202569, + "grad_norm": 0.14778351783752441, + "learning_rate": 1.38e-05, + "loss": 1.3773, + "step": 690 + }, + { + "epoch": 0.25726285907086593, + "grad_norm": 0.14804288744926453, + "learning_rate": 1.382e-05, + "loss": 1.3877, + "step": 691 + }, + { + "epoch": 0.25763516422147503, + "grad_norm": 0.1505001187324524, + "learning_rate": 1.384e-05, + "loss": 1.3638, + "step": 692 + }, + { + "epoch": 0.2580074693720841, + "grad_norm": 0.1481354534626007, + "learning_rate": 1.386e-05, + "loss": 1.3624, + "step": 693 + }, + { + "epoch": 0.2583797745226932, + "grad_norm": 0.1433618813753128, + "learning_rate": 1.3880000000000001e-05, + "loss": 1.3531, + "step": 694 + }, + { + "epoch": 0.2587520796733022, + "grad_norm": 0.14664116501808167, + "learning_rate": 1.39e-05, + "loss": 1.3705, + "step": 695 + }, + { + "epoch": 0.2591243848239113, + "grad_norm": 0.13810987770557404, + "learning_rate": 1.392e-05, + "loss": 1.3528, + "step": 696 + }, + { + "epoch": 0.25949668997452036, + "grad_norm": 0.14705303311347961, + "learning_rate": 1.394e-05, + "loss": 1.3598, + "step": 697 + }, + { + "epoch": 0.25986899512512945, + "grad_norm": 0.14440058171749115, + "learning_rate": 1.396e-05, + "loss": 1.3584, + "step": 698 + }, + { + "epoch": 0.2602413002757385, + "grad_norm": 0.14364704489707947, + "learning_rate": 1.398e-05, + "loss": 1.3734, + "step": 699 + }, + { + "epoch": 0.2606136054263476, + "grad_norm": 0.1593163013458252, + "learning_rate": 1.4e-05, + "loss": 1.3782, + "step": 700 + }, + { + "epoch": 0.26098591057695664, + "grad_norm": 0.15553325414657593, + "learning_rate": 1.402e-05, + "loss": 1.3768, + "step": 701 + }, + { + "epoch": 0.2613582157275657, + "grad_norm": 0.15635879337787628, + "learning_rate": 1.4040000000000001e-05, + "loss": 1.3541, + "step": 702 + }, + { + "epoch": 0.2617305208781748, + "grad_norm": 0.15832242369651794, + "learning_rate": 1.4060000000000001e-05, + "loss": 1.3801, + "step": 703 + }, + { + "epoch": 0.2621028260287838, + "grad_norm": 0.145589679479599, + "learning_rate": 1.408e-05, + "loss": 1.3701, + "step": 704 + }, + { + "epoch": 0.2624751311793929, + "grad_norm": 0.1448379009962082, + "learning_rate": 1.41e-05, + "loss": 1.3817, + "step": 705 + }, + { + "epoch": 0.26284743633000196, + "grad_norm": 0.15079927444458008, + "learning_rate": 1.412e-05, + "loss": 1.3642, + "step": 706 + }, + { + "epoch": 0.26321974148061106, + "grad_norm": 0.15496708452701569, + "learning_rate": 1.414e-05, + "loss": 1.383, + "step": 707 + }, + { + "epoch": 0.2635920466312201, + "grad_norm": 0.14300493896007538, + "learning_rate": 1.416e-05, + "loss": 1.3664, + "step": 708 + }, + { + "epoch": 0.2639643517818292, + "grad_norm": 0.15340474247932434, + "learning_rate": 1.418e-05, + "loss": 1.371, + "step": 709 + }, + { + "epoch": 0.26433665693243824, + "grad_norm": 0.14752480387687683, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.3754, + "step": 710 + }, + { + "epoch": 0.26470896208304734, + "grad_norm": 0.15268085896968842, + "learning_rate": 1.4220000000000001e-05, + "loss": 1.3604, + "step": 711 + }, + { + "epoch": 0.2650812672336564, + "grad_norm": 0.15046533942222595, + "learning_rate": 1.4240000000000001e-05, + "loss": 1.3566, + "step": 712 + }, + { + "epoch": 0.2654535723842655, + "grad_norm": 0.14135941863059998, + "learning_rate": 1.426e-05, + "loss": 1.3729, + "step": 713 + }, + { + "epoch": 0.2658258775348745, + "grad_norm": 0.14512401819229126, + "learning_rate": 1.428e-05, + "loss": 1.3521, + "step": 714 + }, + { + "epoch": 0.26619818268548356, + "grad_norm": 0.15231768786907196, + "learning_rate": 1.43e-05, + "loss": 1.3648, + "step": 715 + }, + { + "epoch": 0.26657048783609266, + "grad_norm": 0.14620406925678253, + "learning_rate": 1.432e-05, + "loss": 1.3534, + "step": 716 + }, + { + "epoch": 0.2669427929867017, + "grad_norm": 0.15307819843292236, + "learning_rate": 1.434e-05, + "loss": 1.3661, + "step": 717 + }, + { + "epoch": 0.2673150981373108, + "grad_norm": 0.15813127160072327, + "learning_rate": 1.4360000000000001e-05, + "loss": 1.3647, + "step": 718 + }, + { + "epoch": 0.26768740328791985, + "grad_norm": 0.15037232637405396, + "learning_rate": 1.4380000000000001e-05, + "loss": 1.3595, + "step": 719 + }, + { + "epoch": 0.26805970843852894, + "grad_norm": 0.1518103927373886, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.356, + "step": 720 + }, + { + "epoch": 0.268432013589138, + "grad_norm": 0.15428012609481812, + "learning_rate": 1.4420000000000001e-05, + "loss": 1.3636, + "step": 721 + }, + { + "epoch": 0.2688043187397471, + "grad_norm": 0.15426598489284515, + "learning_rate": 1.444e-05, + "loss": 1.355, + "step": 722 + }, + { + "epoch": 0.2691766238903561, + "grad_norm": 0.15186642110347748, + "learning_rate": 1.446e-05, + "loss": 1.3409, + "step": 723 + }, + { + "epoch": 0.2695489290409652, + "grad_norm": 0.15585435926914215, + "learning_rate": 1.448e-05, + "loss": 1.3503, + "step": 724 + }, + { + "epoch": 0.26992123419157427, + "grad_norm": 0.1601250320672989, + "learning_rate": 1.45e-05, + "loss": 1.3641, + "step": 725 + }, + { + "epoch": 0.27029353934218336, + "grad_norm": 0.15690714120864868, + "learning_rate": 1.4520000000000002e-05, + "loss": 1.3681, + "step": 726 + }, + { + "epoch": 0.2706658444927924, + "grad_norm": 0.1501239687204361, + "learning_rate": 1.4540000000000001e-05, + "loss": 1.361, + "step": 727 + }, + { + "epoch": 0.27103814964340145, + "grad_norm": 0.15437743067741394, + "learning_rate": 1.4560000000000001e-05, + "loss": 1.3699, + "step": 728 + }, + { + "epoch": 0.27141045479401055, + "grad_norm": 0.14239777624607086, + "learning_rate": 1.4580000000000001e-05, + "loss": 1.352, + "step": 729 + }, + { + "epoch": 0.2717827599446196, + "grad_norm": 0.15204055607318878, + "learning_rate": 1.46e-05, + "loss": 1.3609, + "step": 730 + }, + { + "epoch": 0.2721550650952287, + "grad_norm": 0.14352013170719147, + "learning_rate": 1.462e-05, + "loss": 1.3316, + "step": 731 + }, + { + "epoch": 0.27252737024583773, + "grad_norm": 0.1465366929769516, + "learning_rate": 1.464e-05, + "loss": 1.3621, + "step": 732 + }, + { + "epoch": 0.27289967539644683, + "grad_norm": 0.1439734250307083, + "learning_rate": 1.466e-05, + "loss": 1.3471, + "step": 733 + }, + { + "epoch": 0.27327198054705587, + "grad_norm": 0.16148164868354797, + "learning_rate": 1.4680000000000002e-05, + "loss": 1.3531, + "step": 734 + }, + { + "epoch": 0.27364428569766497, + "grad_norm": 0.15046028792858124, + "learning_rate": 1.4700000000000002e-05, + "loss": 1.357, + "step": 735 + }, + { + "epoch": 0.274016590848274, + "grad_norm": 0.1497603803873062, + "learning_rate": 1.4720000000000001e-05, + "loss": 1.3609, + "step": 736 + }, + { + "epoch": 0.2743888959988831, + "grad_norm": 0.1596868485212326, + "learning_rate": 1.4740000000000001e-05, + "loss": 1.3569, + "step": 737 + }, + { + "epoch": 0.27476120114949215, + "grad_norm": 0.1531444936990738, + "learning_rate": 1.4760000000000001e-05, + "loss": 1.3436, + "step": 738 + }, + { + "epoch": 0.2751335063001012, + "grad_norm": 0.14884676039218903, + "learning_rate": 1.478e-05, + "loss": 1.3583, + "step": 739 + }, + { + "epoch": 0.2755058114507103, + "grad_norm": 0.1622639298439026, + "learning_rate": 1.48e-05, + "loss": 1.3598, + "step": 740 + }, + { + "epoch": 0.27587811660131933, + "grad_norm": 0.16207291185855865, + "learning_rate": 1.482e-05, + "loss": 1.3771, + "step": 741 + }, + { + "epoch": 0.27625042175192843, + "grad_norm": 0.16403760015964508, + "learning_rate": 1.4840000000000002e-05, + "loss": 1.3618, + "step": 742 + }, + { + "epoch": 0.2766227269025375, + "grad_norm": 0.1647598147392273, + "learning_rate": 1.4860000000000002e-05, + "loss": 1.3542, + "step": 743 + }, + { + "epoch": 0.2769950320531466, + "grad_norm": 0.1558753401041031, + "learning_rate": 1.4880000000000002e-05, + "loss": 1.3527, + "step": 744 + }, + { + "epoch": 0.2773673372037556, + "grad_norm": 0.1733637899160385, + "learning_rate": 1.4900000000000001e-05, + "loss": 1.3538, + "step": 745 + }, + { + "epoch": 0.2777396423543647, + "grad_norm": 0.15772801637649536, + "learning_rate": 1.4920000000000001e-05, + "loss": 1.3492, + "step": 746 + }, + { + "epoch": 0.27811194750497376, + "grad_norm": 0.15663520991802216, + "learning_rate": 1.4940000000000001e-05, + "loss": 1.3565, + "step": 747 + }, + { + "epoch": 0.27848425265558285, + "grad_norm": 0.15726646780967712, + "learning_rate": 1.496e-05, + "loss": 1.3591, + "step": 748 + }, + { + "epoch": 0.2788565578061919, + "grad_norm": 0.17913725972175598, + "learning_rate": 1.498e-05, + "loss": 1.3504, + "step": 749 + }, + { + "epoch": 0.279228862956801, + "grad_norm": 0.15114524960517883, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3499, + "step": 750 + }, + { + "epoch": 0.27960116810741004, + "grad_norm": 0.15915299952030182, + "learning_rate": 1.5020000000000002e-05, + "loss": 1.3364, + "step": 751 + }, + { + "epoch": 0.2799734732580191, + "grad_norm": 0.15501591563224792, + "learning_rate": 1.5040000000000002e-05, + "loss": 1.3473, + "step": 752 + }, + { + "epoch": 0.2803457784086282, + "grad_norm": 0.1513669490814209, + "learning_rate": 1.5060000000000001e-05, + "loss": 1.3547, + "step": 753 + }, + { + "epoch": 0.2807180835592372, + "grad_norm": 0.16978231072425842, + "learning_rate": 1.5080000000000001e-05, + "loss": 1.3447, + "step": 754 + }, + { + "epoch": 0.2810903887098463, + "grad_norm": 0.16618570685386658, + "learning_rate": 1.5100000000000001e-05, + "loss": 1.3493, + "step": 755 + }, + { + "epoch": 0.28146269386045536, + "grad_norm": 0.15305636823177338, + "learning_rate": 1.5120000000000001e-05, + "loss": 1.3675, + "step": 756 + }, + { + "epoch": 0.28183499901106446, + "grad_norm": 0.16489024460315704, + "learning_rate": 1.514e-05, + "loss": 1.3465, + "step": 757 + }, + { + "epoch": 0.2822073041616735, + "grad_norm": 0.1630878895521164, + "learning_rate": 1.516e-05, + "loss": 1.3551, + "step": 758 + }, + { + "epoch": 0.2825796093122826, + "grad_norm": 0.15557344257831573, + "learning_rate": 1.5180000000000002e-05, + "loss": 1.3452, + "step": 759 + }, + { + "epoch": 0.28295191446289164, + "grad_norm": 0.16549739241600037, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.3515, + "step": 760 + }, + { + "epoch": 0.28332421961350074, + "grad_norm": 0.1618327498435974, + "learning_rate": 1.5220000000000002e-05, + "loss": 1.3422, + "step": 761 + }, + { + "epoch": 0.2836965247641098, + "grad_norm": 0.16406500339508057, + "learning_rate": 1.5240000000000001e-05, + "loss": 1.354, + "step": 762 + }, + { + "epoch": 0.2840688299147189, + "grad_norm": 0.1552378088235855, + "learning_rate": 1.5260000000000003e-05, + "loss": 1.3644, + "step": 763 + }, + { + "epoch": 0.2844411350653279, + "grad_norm": 0.16994218528270721, + "learning_rate": 1.5280000000000003e-05, + "loss": 1.3639, + "step": 764 + }, + { + "epoch": 0.28481344021593696, + "grad_norm": 0.15901988744735718, + "learning_rate": 1.5300000000000003e-05, + "loss": 1.3657, + "step": 765 + }, + { + "epoch": 0.28518574536654606, + "grad_norm": 0.164791077375412, + "learning_rate": 1.5320000000000002e-05, + "loss": 1.3518, + "step": 766 + }, + { + "epoch": 0.2855580505171551, + "grad_norm": 0.17213784158229828, + "learning_rate": 1.5340000000000002e-05, + "loss": 1.3479, + "step": 767 + }, + { + "epoch": 0.2859303556677642, + "grad_norm": 0.15734751522541046, + "learning_rate": 1.5360000000000002e-05, + "loss": 1.3589, + "step": 768 + }, + { + "epoch": 0.28630266081837324, + "grad_norm": 0.17351983487606049, + "learning_rate": 1.5380000000000002e-05, + "loss": 1.3469, + "step": 769 + }, + { + "epoch": 0.28667496596898234, + "grad_norm": 0.1618126779794693, + "learning_rate": 1.54e-05, + "loss": 1.3508, + "step": 770 + }, + { + "epoch": 0.2870472711195914, + "grad_norm": 0.15765798091888428, + "learning_rate": 1.542e-05, + "loss": 1.3704, + "step": 771 + }, + { + "epoch": 0.2874195762702005, + "grad_norm": 0.17241904139518738, + "learning_rate": 1.544e-05, + "loss": 1.3537, + "step": 772 + }, + { + "epoch": 0.2877918814208095, + "grad_norm": 0.15991517901420593, + "learning_rate": 1.546e-05, + "loss": 1.3434, + "step": 773 + }, + { + "epoch": 0.2881641865714186, + "grad_norm": 0.18322332203388214, + "learning_rate": 1.548e-05, + "loss": 1.3534, + "step": 774 + }, + { + "epoch": 0.28853649172202767, + "grad_norm": 0.1581200808286667, + "learning_rate": 1.55e-05, + "loss": 1.357, + "step": 775 + }, + { + "epoch": 0.2889087968726367, + "grad_norm": 0.16509777307510376, + "learning_rate": 1.552e-05, + "loss": 1.3659, + "step": 776 + }, + { + "epoch": 0.2892811020232458, + "grad_norm": 0.1753901094198227, + "learning_rate": 1.554e-05, + "loss": 1.3456, + "step": 777 + }, + { + "epoch": 0.28965340717385485, + "grad_norm": 0.16197755932807922, + "learning_rate": 1.556e-05, + "loss": 1.3508, + "step": 778 + }, + { + "epoch": 0.29002571232446395, + "grad_norm": 0.16059085726737976, + "learning_rate": 1.5580000000000003e-05, + "loss": 1.343, + "step": 779 + }, + { + "epoch": 0.290398017475073, + "grad_norm": 0.170423224568367, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.3545, + "step": 780 + }, + { + "epoch": 0.2907703226256821, + "grad_norm": 0.1572631150484085, + "learning_rate": 1.5620000000000003e-05, + "loss": 1.3468, + "step": 781 + }, + { + "epoch": 0.29114262777629113, + "grad_norm": 0.16300170123577118, + "learning_rate": 1.5640000000000003e-05, + "loss": 1.3604, + "step": 782 + }, + { + "epoch": 0.2915149329269002, + "grad_norm": 0.1643751561641693, + "learning_rate": 1.5660000000000003e-05, + "loss": 1.3446, + "step": 783 + }, + { + "epoch": 0.29188723807750927, + "grad_norm": 0.16648028790950775, + "learning_rate": 1.5680000000000002e-05, + "loss": 1.3365, + "step": 784 + }, + { + "epoch": 0.29225954322811837, + "grad_norm": 0.1662505716085434, + "learning_rate": 1.5700000000000002e-05, + "loss": 1.3469, + "step": 785 + }, + { + "epoch": 0.2926318483787274, + "grad_norm": 0.16111180186271667, + "learning_rate": 1.5720000000000002e-05, + "loss": 1.3461, + "step": 786 + }, + { + "epoch": 0.2930041535293365, + "grad_norm": 0.16680561006069183, + "learning_rate": 1.5740000000000002e-05, + "loss": 1.3457, + "step": 787 + }, + { + "epoch": 0.29337645867994555, + "grad_norm": 0.18159611523151398, + "learning_rate": 1.576e-05, + "loss": 1.3601, + "step": 788 + }, + { + "epoch": 0.2937487638305546, + "grad_norm": 0.1730850487947464, + "learning_rate": 1.578e-05, + "loss": 1.3629, + "step": 789 + }, + { + "epoch": 0.2941210689811637, + "grad_norm": 0.17999385297298431, + "learning_rate": 1.58e-05, + "loss": 1.3558, + "step": 790 + }, + { + "epoch": 0.29449337413177273, + "grad_norm": 0.16653069853782654, + "learning_rate": 1.582e-05, + "loss": 1.3508, + "step": 791 + }, + { + "epoch": 0.29486567928238183, + "grad_norm": 0.16509103775024414, + "learning_rate": 1.584e-05, + "loss": 1.3411, + "step": 792 + }, + { + "epoch": 0.2952379844329909, + "grad_norm": 0.17233438789844513, + "learning_rate": 1.586e-05, + "loss": 1.3429, + "step": 793 + }, + { + "epoch": 0.29561028958359997, + "grad_norm": 0.16929791867733002, + "learning_rate": 1.588e-05, + "loss": 1.3628, + "step": 794 + }, + { + "epoch": 0.295982594734209, + "grad_norm": 0.16331572830677032, + "learning_rate": 1.5900000000000004e-05, + "loss": 1.346, + "step": 795 + }, + { + "epoch": 0.2963548998848181, + "grad_norm": 0.1649729311466217, + "learning_rate": 1.5920000000000003e-05, + "loss": 1.35, + "step": 796 + }, + { + "epoch": 0.29672720503542716, + "grad_norm": 0.16012988984584808, + "learning_rate": 1.5940000000000003e-05, + "loss": 1.3463, + "step": 797 + }, + { + "epoch": 0.29709951018603625, + "grad_norm": 0.16761143505573273, + "learning_rate": 1.5960000000000003e-05, + "loss": 1.3485, + "step": 798 + }, + { + "epoch": 0.2974718153366453, + "grad_norm": 0.17172127962112427, + "learning_rate": 1.5980000000000003e-05, + "loss": 1.351, + "step": 799 + }, + { + "epoch": 0.2978441204872544, + "grad_norm": 0.16175536811351776, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3546, + "step": 800 + }, + { + "epoch": 0.29821642563786344, + "grad_norm": 0.17179609835147858, + "learning_rate": 1.6020000000000002e-05, + "loss": 1.3506, + "step": 801 + }, + { + "epoch": 0.2985887307884725, + "grad_norm": 0.1593533307313919, + "learning_rate": 1.6040000000000002e-05, + "loss": 1.3578, + "step": 802 + }, + { + "epoch": 0.2989610359390816, + "grad_norm": 0.17455071210861206, + "learning_rate": 1.6060000000000002e-05, + "loss": 1.3424, + "step": 803 + }, + { + "epoch": 0.2993333410896906, + "grad_norm": 0.1619621366262436, + "learning_rate": 1.6080000000000002e-05, + "loss": 1.3344, + "step": 804 + }, + { + "epoch": 0.2997056462402997, + "grad_norm": 0.1549704223871231, + "learning_rate": 1.6100000000000002e-05, + "loss": 1.3478, + "step": 805 + }, + { + "epoch": 0.30007795139090876, + "grad_norm": 0.16837717592716217, + "learning_rate": 1.612e-05, + "loss": 1.3585, + "step": 806 + }, + { + "epoch": 0.30045025654151786, + "grad_norm": 0.15680178999900818, + "learning_rate": 1.614e-05, + "loss": 1.3474, + "step": 807 + }, + { + "epoch": 0.3008225616921269, + "grad_norm": 0.16553205251693726, + "learning_rate": 1.616e-05, + "loss": 1.3576, + "step": 808 + }, + { + "epoch": 0.301194866842736, + "grad_norm": 0.16092002391815186, + "learning_rate": 1.618e-05, + "loss": 1.3503, + "step": 809 + }, + { + "epoch": 0.30156717199334504, + "grad_norm": 0.16862614452838898, + "learning_rate": 1.62e-05, + "loss": 1.3556, + "step": 810 + }, + { + "epoch": 0.30193947714395414, + "grad_norm": 0.16667672991752625, + "learning_rate": 1.6220000000000004e-05, + "loss": 1.3602, + "step": 811 + }, + { + "epoch": 0.3023117822945632, + "grad_norm": 0.16125406324863434, + "learning_rate": 1.6240000000000004e-05, + "loss": 1.3445, + "step": 812 + }, + { + "epoch": 0.3026840874451722, + "grad_norm": 0.16460643708705902, + "learning_rate": 1.626e-05, + "loss": 1.3546, + "step": 813 + }, + { + "epoch": 0.3030563925957813, + "grad_norm": 0.15730057656764984, + "learning_rate": 1.628e-05, + "loss": 1.3461, + "step": 814 + }, + { + "epoch": 0.30342869774639036, + "grad_norm": 0.16555914282798767, + "learning_rate": 1.63e-05, + "loss": 1.3506, + "step": 815 + }, + { + "epoch": 0.30380100289699946, + "grad_norm": 0.15877728164196014, + "learning_rate": 1.632e-05, + "loss": 1.3536, + "step": 816 + }, + { + "epoch": 0.3041733080476085, + "grad_norm": 0.1610872894525528, + "learning_rate": 1.634e-05, + "loss": 1.3461, + "step": 817 + }, + { + "epoch": 0.3045456131982176, + "grad_norm": 0.17202696204185486, + "learning_rate": 1.636e-05, + "loss": 1.3405, + "step": 818 + }, + { + "epoch": 0.30491791834882664, + "grad_norm": 0.16373762488365173, + "learning_rate": 1.638e-05, + "loss": 1.3514, + "step": 819 + }, + { + "epoch": 0.30529022349943574, + "grad_norm": 0.17125077545642853, + "learning_rate": 1.64e-05, + "loss": 1.3451, + "step": 820 + }, + { + "epoch": 0.3056625286500448, + "grad_norm": 0.18695010244846344, + "learning_rate": 1.6420000000000002e-05, + "loss": 1.3443, + "step": 821 + }, + { + "epoch": 0.3060348338006539, + "grad_norm": 0.1650739312171936, + "learning_rate": 1.6440000000000002e-05, + "loss": 1.3512, + "step": 822 + }, + { + "epoch": 0.3064071389512629, + "grad_norm": 0.18323633074760437, + "learning_rate": 1.646e-05, + "loss": 1.3426, + "step": 823 + }, + { + "epoch": 0.306779444101872, + "grad_norm": 0.1654973030090332, + "learning_rate": 1.648e-05, + "loss": 1.3452, + "step": 824 + }, + { + "epoch": 0.30715174925248107, + "grad_norm": 0.1673707365989685, + "learning_rate": 1.65e-05, + "loss": 1.3493, + "step": 825 + }, + { + "epoch": 0.3075240544030901, + "grad_norm": 0.16768096387386322, + "learning_rate": 1.652e-05, + "loss": 1.3484, + "step": 826 + }, + { + "epoch": 0.3078963595536992, + "grad_norm": 0.15851683914661407, + "learning_rate": 1.654e-05, + "loss": 1.3481, + "step": 827 + }, + { + "epoch": 0.30826866470430825, + "grad_norm": 0.1723686009645462, + "learning_rate": 1.656e-05, + "loss": 1.3566, + "step": 828 + }, + { + "epoch": 0.30864096985491735, + "grad_norm": 0.16892585158348083, + "learning_rate": 1.658e-05, + "loss": 1.3563, + "step": 829 + }, + { + "epoch": 0.3090132750055264, + "grad_norm": 0.16549967229366302, + "learning_rate": 1.66e-05, + "loss": 1.3514, + "step": 830 + }, + { + "epoch": 0.3093855801561355, + "grad_norm": 0.16105307638645172, + "learning_rate": 1.662e-05, + "loss": 1.3528, + "step": 831 + }, + { + "epoch": 0.30975788530674453, + "grad_norm": 0.1725122034549713, + "learning_rate": 1.664e-05, + "loss": 1.3408, + "step": 832 + }, + { + "epoch": 0.3101301904573536, + "grad_norm": 0.16419732570648193, + "learning_rate": 1.666e-05, + "loss": 1.3422, + "step": 833 + }, + { + "epoch": 0.31050249560796267, + "grad_norm": 0.16626691818237305, + "learning_rate": 1.668e-05, + "loss": 1.332, + "step": 834 + }, + { + "epoch": 0.31087480075857177, + "grad_norm": 0.16400444507598877, + "learning_rate": 1.67e-05, + "loss": 1.3378, + "step": 835 + }, + { + "epoch": 0.3112471059091808, + "grad_norm": 0.1672375500202179, + "learning_rate": 1.672e-05, + "loss": 1.334, + "step": 836 + }, + { + "epoch": 0.31161941105978985, + "grad_norm": 0.16108368337154388, + "learning_rate": 1.6740000000000002e-05, + "loss": 1.3525, + "step": 837 + }, + { + "epoch": 0.31199171621039895, + "grad_norm": 0.16484303772449493, + "learning_rate": 1.6760000000000002e-05, + "loss": 1.3541, + "step": 838 + }, + { + "epoch": 0.312364021361008, + "grad_norm": 0.16695603728294373, + "learning_rate": 1.6780000000000002e-05, + "loss": 1.3298, + "step": 839 + }, + { + "epoch": 0.3127363265116171, + "grad_norm": 0.1630071997642517, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.3388, + "step": 840 + }, + { + "epoch": 0.31310863166222613, + "grad_norm": 0.17112383246421814, + "learning_rate": 1.682e-05, + "loss": 1.3464, + "step": 841 + }, + { + "epoch": 0.31348093681283523, + "grad_norm": 0.1732318103313446, + "learning_rate": 1.684e-05, + "loss": 1.3344, + "step": 842 + }, + { + "epoch": 0.3138532419634443, + "grad_norm": 0.17030829191207886, + "learning_rate": 1.686e-05, + "loss": 1.3411, + "step": 843 + }, + { + "epoch": 0.31422554711405337, + "grad_norm": 0.16124574840068817, + "learning_rate": 1.688e-05, + "loss": 1.3473, + "step": 844 + }, + { + "epoch": 0.3145978522646624, + "grad_norm": 0.17811204493045807, + "learning_rate": 1.69e-05, + "loss": 1.3522, + "step": 845 + }, + { + "epoch": 0.3149701574152715, + "grad_norm": 0.17002224922180176, + "learning_rate": 1.692e-05, + "loss": 1.3511, + "step": 846 + }, + { + "epoch": 0.31534246256588055, + "grad_norm": 0.19199255108833313, + "learning_rate": 1.694e-05, + "loss": 1.3263, + "step": 847 + }, + { + "epoch": 0.31571476771648965, + "grad_norm": 0.1713089942932129, + "learning_rate": 1.696e-05, + "loss": 1.3383, + "step": 848 + }, + { + "epoch": 0.3160870728670987, + "grad_norm": 0.17415201663970947, + "learning_rate": 1.698e-05, + "loss": 1.3336, + "step": 849 + }, + { + "epoch": 0.31645937801770774, + "grad_norm": 0.1600249856710434, + "learning_rate": 1.7e-05, + "loss": 1.3639, + "step": 850 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.17309750616550446, + "learning_rate": 1.702e-05, + "loss": 1.3371, + "step": 851 + }, + { + "epoch": 0.3172039883189259, + "grad_norm": 0.16656599938869476, + "learning_rate": 1.704e-05, + "loss": 1.3367, + "step": 852 + }, + { + "epoch": 0.317576293469535, + "grad_norm": 0.17623035609722137, + "learning_rate": 1.7060000000000003e-05, + "loss": 1.3414, + "step": 853 + }, + { + "epoch": 0.317948598620144, + "grad_norm": 0.16454781591892242, + "learning_rate": 1.7080000000000002e-05, + "loss": 1.3341, + "step": 854 + }, + { + "epoch": 0.3183209037707531, + "grad_norm": 0.16562296450138092, + "learning_rate": 1.7100000000000002e-05, + "loss": 1.3321, + "step": 855 + }, + { + "epoch": 0.31869320892136216, + "grad_norm": 0.16431719064712524, + "learning_rate": 1.7120000000000002e-05, + "loss": 1.3332, + "step": 856 + }, + { + "epoch": 0.31906551407197126, + "grad_norm": 0.16330453753471375, + "learning_rate": 1.7140000000000002e-05, + "loss": 1.3376, + "step": 857 + }, + { + "epoch": 0.3194378192225803, + "grad_norm": 0.1680067926645279, + "learning_rate": 1.7160000000000002e-05, + "loss": 1.3343, + "step": 858 + }, + { + "epoch": 0.3198101243731894, + "grad_norm": 0.1762932538986206, + "learning_rate": 1.718e-05, + "loss": 1.3387, + "step": 859 + }, + { + "epoch": 0.32018242952379844, + "grad_norm": 0.1647929847240448, + "learning_rate": 1.72e-05, + "loss": 1.3451, + "step": 860 + }, + { + "epoch": 0.32055473467440754, + "grad_norm": 0.16490478813648224, + "learning_rate": 1.722e-05, + "loss": 1.3486, + "step": 861 + }, + { + "epoch": 0.3209270398250166, + "grad_norm": 0.17381954193115234, + "learning_rate": 1.724e-05, + "loss": 1.3598, + "step": 862 + }, + { + "epoch": 0.3212993449756256, + "grad_norm": 0.17484751343727112, + "learning_rate": 1.726e-05, + "loss": 1.3432, + "step": 863 + }, + { + "epoch": 0.3216716501262347, + "grad_norm": 0.17640666663646698, + "learning_rate": 1.728e-05, + "loss": 1.3512, + "step": 864 + }, + { + "epoch": 0.32204395527684376, + "grad_norm": 0.17663167417049408, + "learning_rate": 1.73e-05, + "loss": 1.3646, + "step": 865 + }, + { + "epoch": 0.32241626042745286, + "grad_norm": 0.1713891476392746, + "learning_rate": 1.732e-05, + "loss": 1.3256, + "step": 866 + }, + { + "epoch": 0.3227885655780619, + "grad_norm": 0.17021964490413666, + "learning_rate": 1.734e-05, + "loss": 1.332, + "step": 867 + }, + { + "epoch": 0.323160870728671, + "grad_norm": 0.16920128464698792, + "learning_rate": 1.736e-05, + "loss": 1.3463, + "step": 868 + }, + { + "epoch": 0.32353317587928004, + "grad_norm": 0.18202915787696838, + "learning_rate": 1.7380000000000003e-05, + "loss": 1.3461, + "step": 869 + }, + { + "epoch": 0.32390548102988914, + "grad_norm": 0.16767475008964539, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.33, + "step": 870 + }, + { + "epoch": 0.3242777861804982, + "grad_norm": 0.18158170580863953, + "learning_rate": 1.7420000000000003e-05, + "loss": 1.3504, + "step": 871 + }, + { + "epoch": 0.3246500913311073, + "grad_norm": 0.17010337114334106, + "learning_rate": 1.7440000000000002e-05, + "loss": 1.3229, + "step": 872 + }, + { + "epoch": 0.3250223964817163, + "grad_norm": 0.1630079299211502, + "learning_rate": 1.7460000000000002e-05, + "loss": 1.3142, + "step": 873 + }, + { + "epoch": 0.32539470163232537, + "grad_norm": 0.176998108625412, + "learning_rate": 1.7480000000000002e-05, + "loss": 1.3371, + "step": 874 + }, + { + "epoch": 0.32576700678293447, + "grad_norm": 0.18017543852329254, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.3562, + "step": 875 + }, + { + "epoch": 0.3261393119335435, + "grad_norm": 0.17135824263095856, + "learning_rate": 1.752e-05, + "loss": 1.3387, + "step": 876 + }, + { + "epoch": 0.3265116170841526, + "grad_norm": 0.17937518656253815, + "learning_rate": 1.754e-05, + "loss": 1.3389, + "step": 877 + }, + { + "epoch": 0.32688392223476165, + "grad_norm": 0.1736384481191635, + "learning_rate": 1.756e-05, + "loss": 1.3556, + "step": 878 + }, + { + "epoch": 0.32725622738537075, + "grad_norm": 0.162176251411438, + "learning_rate": 1.758e-05, + "loss": 1.3499, + "step": 879 + }, + { + "epoch": 0.3276285325359798, + "grad_norm": 0.17646604776382446, + "learning_rate": 1.76e-05, + "loss": 1.3404, + "step": 880 + }, + { + "epoch": 0.3280008376865889, + "grad_norm": 0.18382136523723602, + "learning_rate": 1.762e-05, + "loss": 1.3429, + "step": 881 + }, + { + "epoch": 0.32837314283719793, + "grad_norm": 0.19566792249679565, + "learning_rate": 1.764e-05, + "loss": 1.3399, + "step": 882 + }, + { + "epoch": 0.328745447987807, + "grad_norm": 0.17790797352790833, + "learning_rate": 1.766e-05, + "loss": 1.3413, + "step": 883 + }, + { + "epoch": 0.32911775313841607, + "grad_norm": 0.16908185184001923, + "learning_rate": 1.768e-05, + "loss": 1.3363, + "step": 884 + }, + { + "epoch": 0.32949005828902517, + "grad_norm": 0.17043371498584747, + "learning_rate": 1.77e-05, + "loss": 1.3415, + "step": 885 + }, + { + "epoch": 0.3298623634396342, + "grad_norm": 0.18717624247074127, + "learning_rate": 1.7720000000000003e-05, + "loss": 1.3382, + "step": 886 + }, + { + "epoch": 0.33023466859024325, + "grad_norm": 0.17335492372512817, + "learning_rate": 1.7740000000000003e-05, + "loss": 1.3391, + "step": 887 + }, + { + "epoch": 0.33060697374085235, + "grad_norm": 0.1830221712589264, + "learning_rate": 1.7760000000000003e-05, + "loss": 1.3435, + "step": 888 + }, + { + "epoch": 0.3309792788914614, + "grad_norm": 0.17462217807769775, + "learning_rate": 1.7780000000000003e-05, + "loss": 1.3322, + "step": 889 + }, + { + "epoch": 0.3313515840420705, + "grad_norm": 0.17677493393421173, + "learning_rate": 1.7800000000000002e-05, + "loss": 1.3234, + "step": 890 + }, + { + "epoch": 0.33172388919267953, + "grad_norm": 0.18080869317054749, + "learning_rate": 1.7820000000000002e-05, + "loss": 1.3442, + "step": 891 + }, + { + "epoch": 0.33209619434328863, + "grad_norm": 0.1581638604402542, + "learning_rate": 1.7840000000000002e-05, + "loss": 1.3198, + "step": 892 + }, + { + "epoch": 0.3324684994938977, + "grad_norm": 0.18022367358207703, + "learning_rate": 1.7860000000000002e-05, + "loss": 1.3182, + "step": 893 + }, + { + "epoch": 0.33284080464450677, + "grad_norm": 0.19160951673984528, + "learning_rate": 1.788e-05, + "loss": 1.3365, + "step": 894 + }, + { + "epoch": 0.3332131097951158, + "grad_norm": 0.18681097030639648, + "learning_rate": 1.79e-05, + "loss": 1.337, + "step": 895 + }, + { + "epoch": 0.3335854149457249, + "grad_norm": 0.17079736292362213, + "learning_rate": 1.792e-05, + "loss": 1.3371, + "step": 896 + }, + { + "epoch": 0.33395772009633395, + "grad_norm": 0.17504505813121796, + "learning_rate": 1.794e-05, + "loss": 1.3492, + "step": 897 + }, + { + "epoch": 0.33433002524694305, + "grad_norm": 0.18182472884655, + "learning_rate": 1.796e-05, + "loss": 1.3276, + "step": 898 + }, + { + "epoch": 0.3347023303975521, + "grad_norm": 0.1663830280303955, + "learning_rate": 1.798e-05, + "loss": 1.3366, + "step": 899 + }, + { + "epoch": 0.33507463554816114, + "grad_norm": 0.18775950372219086, + "learning_rate": 1.8e-05, + "loss": 1.3478, + "step": 900 + }, + { + "epoch": 0.33544694069877024, + "grad_norm": 0.16797006130218506, + "learning_rate": 1.802e-05, + "loss": 1.3456, + "step": 901 + }, + { + "epoch": 0.3358192458493793, + "grad_norm": 0.16422320902347565, + "learning_rate": 1.8040000000000003e-05, + "loss": 1.327, + "step": 902 + }, + { + "epoch": 0.3361915509999884, + "grad_norm": 0.1987656205892563, + "learning_rate": 1.8060000000000003e-05, + "loss": 1.3367, + "step": 903 + }, + { + "epoch": 0.3365638561505974, + "grad_norm": 0.16781911253929138, + "learning_rate": 1.8080000000000003e-05, + "loss": 1.3366, + "step": 904 + }, + { + "epoch": 0.3369361613012065, + "grad_norm": 0.17309685051441193, + "learning_rate": 1.8100000000000003e-05, + "loss": 1.3398, + "step": 905 + }, + { + "epoch": 0.33730846645181556, + "grad_norm": 0.17799708247184753, + "learning_rate": 1.8120000000000003e-05, + "loss": 1.3504, + "step": 906 + }, + { + "epoch": 0.33768077160242466, + "grad_norm": 0.16965223848819733, + "learning_rate": 1.8140000000000003e-05, + "loss": 1.3352, + "step": 907 + }, + { + "epoch": 0.3380530767530337, + "grad_norm": 0.1705309897661209, + "learning_rate": 1.8160000000000002e-05, + "loss": 1.3119, + "step": 908 + }, + { + "epoch": 0.3384253819036428, + "grad_norm": 0.18711362779140472, + "learning_rate": 1.8180000000000002e-05, + "loss": 1.3077, + "step": 909 + }, + { + "epoch": 0.33879768705425184, + "grad_norm": 0.1738344430923462, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.3292, + "step": 910 + }, + { + "epoch": 0.3391699922048609, + "grad_norm": 0.17799516022205353, + "learning_rate": 1.8220000000000002e-05, + "loss": 1.3294, + "step": 911 + }, + { + "epoch": 0.33954229735547, + "grad_norm": 0.18020783364772797, + "learning_rate": 1.824e-05, + "loss": 1.3383, + "step": 912 + }, + { + "epoch": 0.339914602506079, + "grad_norm": 0.16939756274223328, + "learning_rate": 1.826e-05, + "loss": 1.3291, + "step": 913 + }, + { + "epoch": 0.3402869076566881, + "grad_norm": 0.17973625659942627, + "learning_rate": 1.828e-05, + "loss": 1.3315, + "step": 914 + }, + { + "epoch": 0.34065921280729716, + "grad_norm": 0.1785053163766861, + "learning_rate": 1.83e-05, + "loss": 1.3386, + "step": 915 + }, + { + "epoch": 0.34103151795790626, + "grad_norm": 0.17944440245628357, + "learning_rate": 1.832e-05, + "loss": 1.3356, + "step": 916 + }, + { + "epoch": 0.3414038231085153, + "grad_norm": 0.1690181940793991, + "learning_rate": 1.834e-05, + "loss": 1.3428, + "step": 917 + }, + { + "epoch": 0.3417761282591244, + "grad_norm": 0.16925571858882904, + "learning_rate": 1.8360000000000004e-05, + "loss": 1.335, + "step": 918 + }, + { + "epoch": 0.34214843340973344, + "grad_norm": 0.17593559622764587, + "learning_rate": 1.8380000000000004e-05, + "loss": 1.3326, + "step": 919 + }, + { + "epoch": 0.34252073856034254, + "grad_norm": 0.1841055005788803, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.3245, + "step": 920 + }, + { + "epoch": 0.3428930437109516, + "grad_norm": 0.18622370064258575, + "learning_rate": 1.8420000000000003e-05, + "loss": 1.3273, + "step": 921 + }, + { + "epoch": 0.3432653488615607, + "grad_norm": 0.17079919576644897, + "learning_rate": 1.8440000000000003e-05, + "loss": 1.3233, + "step": 922 + }, + { + "epoch": 0.3436376540121697, + "grad_norm": 0.17922057211399078, + "learning_rate": 1.8460000000000003e-05, + "loss": 1.3385, + "step": 923 + }, + { + "epoch": 0.34400995916277877, + "grad_norm": 0.1806870400905609, + "learning_rate": 1.8480000000000003e-05, + "loss": 1.3414, + "step": 924 + }, + { + "epoch": 0.34438226431338786, + "grad_norm": 0.1918867528438568, + "learning_rate": 1.8500000000000002e-05, + "loss": 1.3441, + "step": 925 + }, + { + "epoch": 0.3447545694639969, + "grad_norm": 0.18761901557445526, + "learning_rate": 1.8520000000000002e-05, + "loss": 1.3352, + "step": 926 + }, + { + "epoch": 0.345126874614606, + "grad_norm": 0.17143699526786804, + "learning_rate": 1.8540000000000002e-05, + "loss": 1.3225, + "step": 927 + }, + { + "epoch": 0.34549917976521505, + "grad_norm": 0.1826808899641037, + "learning_rate": 1.8560000000000002e-05, + "loss": 1.338, + "step": 928 + }, + { + "epoch": 0.34587148491582415, + "grad_norm": 0.1739514321088791, + "learning_rate": 1.858e-05, + "loss": 1.3572, + "step": 929 + }, + { + "epoch": 0.3462437900664332, + "grad_norm": 0.18177390098571777, + "learning_rate": 1.86e-05, + "loss": 1.3125, + "step": 930 + }, + { + "epoch": 0.3466160952170423, + "grad_norm": 0.17709952592849731, + "learning_rate": 1.862e-05, + "loss": 1.326, + "step": 931 + }, + { + "epoch": 0.34698840036765133, + "grad_norm": 0.1684853583574295, + "learning_rate": 1.864e-05, + "loss": 1.3176, + "step": 932 + }, + { + "epoch": 0.3473607055182604, + "grad_norm": 0.18772630393505096, + "learning_rate": 1.866e-05, + "loss": 1.3246, + "step": 933 + }, + { + "epoch": 0.34773301066886947, + "grad_norm": 0.1772524118423462, + "learning_rate": 1.8680000000000004e-05, + "loss": 1.3463, + "step": 934 + }, + { + "epoch": 0.34810531581947857, + "grad_norm": 0.1651849001646042, + "learning_rate": 1.8700000000000004e-05, + "loss": 1.3414, + "step": 935 + }, + { + "epoch": 0.3484776209700876, + "grad_norm": 0.19098713994026184, + "learning_rate": 1.8720000000000004e-05, + "loss": 1.3423, + "step": 936 + }, + { + "epoch": 0.34884992612069665, + "grad_norm": 0.18180294334888458, + "learning_rate": 1.8740000000000004e-05, + "loss": 1.324, + "step": 937 + }, + { + "epoch": 0.34922223127130575, + "grad_norm": 0.1721077859401703, + "learning_rate": 1.876e-05, + "loss": 1.3337, + "step": 938 + }, + { + "epoch": 0.3495945364219148, + "grad_norm": 0.1962614804506302, + "learning_rate": 1.878e-05, + "loss": 1.3268, + "step": 939 + }, + { + "epoch": 0.3499668415725239, + "grad_norm": 0.17538578808307648, + "learning_rate": 1.88e-05, + "loss": 1.3238, + "step": 940 + }, + { + "epoch": 0.35033914672313293, + "grad_norm": 0.17809061706066132, + "learning_rate": 1.882e-05, + "loss": 1.3135, + "step": 941 + }, + { + "epoch": 0.35071145187374203, + "grad_norm": 0.182732954621315, + "learning_rate": 1.884e-05, + "loss": 1.328, + "step": 942 + }, + { + "epoch": 0.3510837570243511, + "grad_norm": 0.1797078251838684, + "learning_rate": 1.886e-05, + "loss": 1.3315, + "step": 943 + }, + { + "epoch": 0.35145606217496017, + "grad_norm": 0.187151238322258, + "learning_rate": 1.8880000000000002e-05, + "loss": 1.3277, + "step": 944 + }, + { + "epoch": 0.3518283673255692, + "grad_norm": 0.16728267073631287, + "learning_rate": 1.8900000000000002e-05, + "loss": 1.3369, + "step": 945 + }, + { + "epoch": 0.3522006724761783, + "grad_norm": 0.1842220276594162, + "learning_rate": 1.8920000000000002e-05, + "loss": 1.3236, + "step": 946 + }, + { + "epoch": 0.35257297762678735, + "grad_norm": 0.18164734542369843, + "learning_rate": 1.894e-05, + "loss": 1.3348, + "step": 947 + }, + { + "epoch": 0.3529452827773964, + "grad_norm": 0.17997892200946808, + "learning_rate": 1.896e-05, + "loss": 1.3231, + "step": 948 + }, + { + "epoch": 0.3533175879280055, + "grad_norm": 0.19494079053401947, + "learning_rate": 1.898e-05, + "loss": 1.3258, + "step": 949 + }, + { + "epoch": 0.35368989307861454, + "grad_norm": 0.17961707711219788, + "learning_rate": 1.9e-05, + "loss": 1.3301, + "step": 950 + }, + { + "epoch": 0.35406219822922363, + "grad_norm": 0.17583470046520233, + "learning_rate": 1.902e-05, + "loss": 1.3331, + "step": 951 + }, + { + "epoch": 0.3544345033798327, + "grad_norm": 0.18307524919509888, + "learning_rate": 1.904e-05, + "loss": 1.3396, + "step": 952 + }, + { + "epoch": 0.3548068085304418, + "grad_norm": 0.1822507530450821, + "learning_rate": 1.906e-05, + "loss": 1.3372, + "step": 953 + }, + { + "epoch": 0.3551791136810508, + "grad_norm": 0.18464961647987366, + "learning_rate": 1.908e-05, + "loss": 1.3201, + "step": 954 + }, + { + "epoch": 0.3555514188316599, + "grad_norm": 0.1767146736383438, + "learning_rate": 1.91e-05, + "loss": 1.3194, + "step": 955 + }, + { + "epoch": 0.35592372398226896, + "grad_norm": 0.19084005057811737, + "learning_rate": 1.912e-05, + "loss": 1.3535, + "step": 956 + }, + { + "epoch": 0.35629602913287806, + "grad_norm": 0.19618546962738037, + "learning_rate": 1.914e-05, + "loss": 1.3393, + "step": 957 + }, + { + "epoch": 0.3566683342834871, + "grad_norm": 0.18410375714302063, + "learning_rate": 1.916e-05, + "loss": 1.3131, + "step": 958 + }, + { + "epoch": 0.3570406394340962, + "grad_norm": 0.20000839233398438, + "learning_rate": 1.918e-05, + "loss": 1.3226, + "step": 959 + }, + { + "epoch": 0.35741294458470524, + "grad_norm": 0.17728938162326813, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.3195, + "step": 960 + }, + { + "epoch": 0.3577852497353143, + "grad_norm": 0.19262300431728363, + "learning_rate": 1.9220000000000002e-05, + "loss": 1.3395, + "step": 961 + }, + { + "epoch": 0.3581575548859234, + "grad_norm": 0.2089599370956421, + "learning_rate": 1.9240000000000002e-05, + "loss": 1.3296, + "step": 962 + }, + { + "epoch": 0.3585298600365324, + "grad_norm": 0.19653920829296112, + "learning_rate": 1.9260000000000002e-05, + "loss": 1.3266, + "step": 963 + }, + { + "epoch": 0.3589021651871415, + "grad_norm": 0.18377585709095, + "learning_rate": 1.9280000000000002e-05, + "loss": 1.3331, + "step": 964 + }, + { + "epoch": 0.35927447033775056, + "grad_norm": 0.18499638140201569, + "learning_rate": 1.93e-05, + "loss": 1.3195, + "step": 965 + }, + { + "epoch": 0.35964677548835966, + "grad_norm": 0.19310572743415833, + "learning_rate": 1.932e-05, + "loss": 1.3545, + "step": 966 + }, + { + "epoch": 0.3600190806389687, + "grad_norm": 0.19002506136894226, + "learning_rate": 1.934e-05, + "loss": 1.341, + "step": 967 + }, + { + "epoch": 0.3603913857895778, + "grad_norm": 0.1937621384859085, + "learning_rate": 1.936e-05, + "loss": 1.32, + "step": 968 + }, + { + "epoch": 0.36076369094018684, + "grad_norm": 0.1868428736925125, + "learning_rate": 1.938e-05, + "loss": 1.3172, + "step": 969 + }, + { + "epoch": 0.36113599609079594, + "grad_norm": 0.1846293956041336, + "learning_rate": 1.94e-05, + "loss": 1.3261, + "step": 970 + }, + { + "epoch": 0.361508301241405, + "grad_norm": 0.18833589553833008, + "learning_rate": 1.942e-05, + "loss": 1.3307, + "step": 971 + }, + { + "epoch": 0.3618806063920141, + "grad_norm": 0.1832055151462555, + "learning_rate": 1.944e-05, + "loss": 1.3167, + "step": 972 + }, + { + "epoch": 0.3622529115426231, + "grad_norm": 0.1855573207139969, + "learning_rate": 1.946e-05, + "loss": 1.3279, + "step": 973 + }, + { + "epoch": 0.36262521669323217, + "grad_norm": 0.1866430938243866, + "learning_rate": 1.948e-05, + "loss": 1.3361, + "step": 974 + }, + { + "epoch": 0.36299752184384126, + "grad_norm": 0.1905186027288437, + "learning_rate": 1.95e-05, + "loss": 1.3255, + "step": 975 + }, + { + "epoch": 0.3633698269944503, + "grad_norm": 0.1822662651538849, + "learning_rate": 1.9520000000000003e-05, + "loss": 1.314, + "step": 976 + }, + { + "epoch": 0.3637421321450594, + "grad_norm": 0.18091034889221191, + "learning_rate": 1.9540000000000003e-05, + "loss": 1.3267, + "step": 977 + }, + { + "epoch": 0.36411443729566845, + "grad_norm": 0.18915344774723053, + "learning_rate": 1.9560000000000002e-05, + "loss": 1.3161, + "step": 978 + }, + { + "epoch": 0.36448674244627755, + "grad_norm": 0.17233604192733765, + "learning_rate": 1.9580000000000002e-05, + "loss": 1.3453, + "step": 979 + }, + { + "epoch": 0.3648590475968866, + "grad_norm": 0.18806397914886475, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.3442, + "step": 980 + }, + { + "epoch": 0.3652313527474957, + "grad_norm": 0.19142785668373108, + "learning_rate": 1.9620000000000002e-05, + "loss": 1.3218, + "step": 981 + }, + { + "epoch": 0.36560365789810473, + "grad_norm": 0.17856691777706146, + "learning_rate": 1.9640000000000002e-05, + "loss": 1.3334, + "step": 982 + }, + { + "epoch": 0.3659759630487138, + "grad_norm": 0.189533069729805, + "learning_rate": 1.966e-05, + "loss": 1.3201, + "step": 983 + }, + { + "epoch": 0.36634826819932287, + "grad_norm": 0.19316861033439636, + "learning_rate": 1.968e-05, + "loss": 1.3177, + "step": 984 + }, + { + "epoch": 0.3667205733499319, + "grad_norm": 0.18213561177253723, + "learning_rate": 1.97e-05, + "loss": 1.3317, + "step": 985 + }, + { + "epoch": 0.367092878500541, + "grad_norm": 0.21344400942325592, + "learning_rate": 1.972e-05, + "loss": 1.3263, + "step": 986 + }, + { + "epoch": 0.36746518365115005, + "grad_norm": 0.1816537231206894, + "learning_rate": 1.974e-05, + "loss": 1.3155, + "step": 987 + }, + { + "epoch": 0.36783748880175915, + "grad_norm": 0.18131639063358307, + "learning_rate": 1.976e-05, + "loss": 1.3265, + "step": 988 + }, + { + "epoch": 0.3682097939523682, + "grad_norm": 0.21126648783683777, + "learning_rate": 1.978e-05, + "loss": 1.3293, + "step": 989 + }, + { + "epoch": 0.3685820991029773, + "grad_norm": 0.17451252043247223, + "learning_rate": 1.98e-05, + "loss": 1.3293, + "step": 990 + }, + { + "epoch": 0.36895440425358633, + "grad_norm": 0.17249158024787903, + "learning_rate": 1.982e-05, + "loss": 1.3242, + "step": 991 + }, + { + "epoch": 0.36932670940419543, + "grad_norm": 0.18453699350357056, + "learning_rate": 1.9840000000000003e-05, + "loss": 1.335, + "step": 992 + }, + { + "epoch": 0.3696990145548045, + "grad_norm": 0.16820859909057617, + "learning_rate": 1.9860000000000003e-05, + "loss": 1.3249, + "step": 993 + }, + { + "epoch": 0.37007131970541357, + "grad_norm": 0.19031395018100739, + "learning_rate": 1.9880000000000003e-05, + "loss": 1.3255, + "step": 994 + }, + { + "epoch": 0.3704436248560226, + "grad_norm": 0.19492731988430023, + "learning_rate": 1.9900000000000003e-05, + "loss": 1.3323, + "step": 995 + }, + { + "epoch": 0.3708159300066317, + "grad_norm": 0.17573820054531097, + "learning_rate": 1.9920000000000002e-05, + "loss": 1.3371, + "step": 996 + }, + { + "epoch": 0.37118823515724075, + "grad_norm": 0.18719618022441864, + "learning_rate": 1.9940000000000002e-05, + "loss": 1.3166, + "step": 997 + }, + { + "epoch": 0.3715605403078498, + "grad_norm": 0.17977416515350342, + "learning_rate": 1.9960000000000002e-05, + "loss": 1.323, + "step": 998 + }, + { + "epoch": 0.3719328454584589, + "grad_norm": 0.1703476756811142, + "learning_rate": 1.9980000000000002e-05, + "loss": 1.3185, + "step": 999 + }, + { + "epoch": 0.37230515060906794, + "grad_norm": 0.1878376603126526, + "learning_rate": 2e-05, + "loss": 1.3445, + "step": 1000 + }, + { + "epoch": 0.37230515060906794, + "eval_loss": 1.3607902526855469, + "eval_runtime": 16.6559, + "eval_samples_per_second": 104.108, + "eval_steps_per_second": 5.223, + "step": 1000 + }, + { + "epoch": 0.37267745575967703, + "grad_norm": 0.18507210910320282, + "learning_rate": 1.9999999926150316e-05, + "loss": 1.3343, + "step": 1001 + }, + { + "epoch": 0.3730497609102861, + "grad_norm": 0.17154212296009064, + "learning_rate": 1.999999970460126e-05, + "loss": 1.3407, + "step": 1002 + }, + { + "epoch": 0.3734220660608952, + "grad_norm": 0.1845930516719818, + "learning_rate": 1.9999999335352835e-05, + "loss": 1.3236, + "step": 1003 + }, + { + "epoch": 0.3737943712115042, + "grad_norm": 0.1764208972454071, + "learning_rate": 1.9999998818405046e-05, + "loss": 1.3242, + "step": 1004 + }, + { + "epoch": 0.3741666763621133, + "grad_norm": 0.1841861754655838, + "learning_rate": 1.99999981537579e-05, + "loss": 1.3287, + "step": 1005 + }, + { + "epoch": 0.37453898151272236, + "grad_norm": 0.17870505154132843, + "learning_rate": 1.9999997341411412e-05, + "loss": 1.3134, + "step": 1006 + }, + { + "epoch": 0.37491128666333146, + "grad_norm": 0.18854598701000214, + "learning_rate": 1.999999638136559e-05, + "loss": 1.3089, + "step": 1007 + }, + { + "epoch": 0.3752835918139405, + "grad_norm": 0.19055992364883423, + "learning_rate": 1.9999995273620453e-05, + "loss": 1.3269, + "step": 1008 + }, + { + "epoch": 0.3756558969645496, + "grad_norm": 0.1775302290916443, + "learning_rate": 1.9999994018176008e-05, + "loss": 1.3235, + "step": 1009 + }, + { + "epoch": 0.37602820211515864, + "grad_norm": 0.17888285219669342, + "learning_rate": 1.999999261503228e-05, + "loss": 1.3249, + "step": 1010 + }, + { + "epoch": 0.3764005072657677, + "grad_norm": 0.17626729607582092, + "learning_rate": 1.999999106418929e-05, + "loss": 1.3312, + "step": 1011 + }, + { + "epoch": 0.3767728124163768, + "grad_norm": 0.1689792424440384, + "learning_rate": 1.999998936564706e-05, + "loss": 1.3235, + "step": 1012 + }, + { + "epoch": 0.3771451175669858, + "grad_norm": 0.18726208806037903, + "learning_rate": 1.9999987519405614e-05, + "loss": 1.3249, + "step": 1013 + }, + { + "epoch": 0.3775174227175949, + "grad_norm": 0.1736321747303009, + "learning_rate": 1.9999985525464977e-05, + "loss": 1.3251, + "step": 1014 + }, + { + "epoch": 0.37788972786820396, + "grad_norm": 0.1852688491344452, + "learning_rate": 1.9999983383825186e-05, + "loss": 1.3272, + "step": 1015 + }, + { + "epoch": 0.37826203301881306, + "grad_norm": 0.17666661739349365, + "learning_rate": 1.9999981094486264e-05, + "loss": 1.3341, + "step": 1016 + }, + { + "epoch": 0.3786343381694221, + "grad_norm": 0.17533712089061737, + "learning_rate": 1.9999978657448252e-05, + "loss": 1.3045, + "step": 1017 + }, + { + "epoch": 0.3790066433200312, + "grad_norm": 0.1879895031452179, + "learning_rate": 1.999997607271118e-05, + "loss": 1.3311, + "step": 1018 + }, + { + "epoch": 0.37937894847064024, + "grad_norm": 0.17512273788452148, + "learning_rate": 1.999997334027509e-05, + "loss": 1.3284, + "step": 1019 + }, + { + "epoch": 0.37975125362124934, + "grad_norm": 0.17723974585533142, + "learning_rate": 1.9999970460140023e-05, + "loss": 1.3109, + "step": 1020 + }, + { + "epoch": 0.3801235587718584, + "grad_norm": 0.1877744346857071, + "learning_rate": 1.999996743230602e-05, + "loss": 1.3204, + "step": 1021 + }, + { + "epoch": 0.3804958639224674, + "grad_norm": 0.17423878610134125, + "learning_rate": 1.9999964256773125e-05, + "loss": 1.3204, + "step": 1022 + }, + { + "epoch": 0.3808681690730765, + "grad_norm": 0.19012653827667236, + "learning_rate": 1.9999960933541383e-05, + "loss": 1.3345, + "step": 1023 + }, + { + "epoch": 0.38124047422368557, + "grad_norm": 0.19567571580410004, + "learning_rate": 1.999995746261085e-05, + "loss": 1.3195, + "step": 1024 + }, + { + "epoch": 0.38161277937429466, + "grad_norm": 0.17858469486236572, + "learning_rate": 1.999995384398157e-05, + "loss": 1.3161, + "step": 1025 + }, + { + "epoch": 0.3819850845249037, + "grad_norm": 0.20314441621303558, + "learning_rate": 1.9999950077653597e-05, + "loss": 1.3214, + "step": 1026 + }, + { + "epoch": 0.3823573896755128, + "grad_norm": 0.19132466614246368, + "learning_rate": 1.9999946163626993e-05, + "loss": 1.3227, + "step": 1027 + }, + { + "epoch": 0.38272969482612185, + "grad_norm": 0.18081142008304596, + "learning_rate": 1.999994210190181e-05, + "loss": 1.3226, + "step": 1028 + }, + { + "epoch": 0.38310199997673094, + "grad_norm": 0.1872538924217224, + "learning_rate": 1.999993789247811e-05, + "loss": 1.321, + "step": 1029 + }, + { + "epoch": 0.38347430512734, + "grad_norm": 0.16693538427352905, + "learning_rate": 1.9999933535355955e-05, + "loss": 1.32, + "step": 1030 + }, + { + "epoch": 0.3838466102779491, + "grad_norm": 0.18578219413757324, + "learning_rate": 1.9999929030535406e-05, + "loss": 1.3046, + "step": 1031 + }, + { + "epoch": 0.3842189154285581, + "grad_norm": 0.19395247101783752, + "learning_rate": 1.999992437801654e-05, + "loss": 1.3357, + "step": 1032 + }, + { + "epoch": 0.3845912205791672, + "grad_norm": 0.18756742775440216, + "learning_rate": 1.9999919577799415e-05, + "loss": 1.3373, + "step": 1033 + }, + { + "epoch": 0.38496352572977627, + "grad_norm": 0.18206077814102173, + "learning_rate": 1.9999914629884104e-05, + "loss": 1.3087, + "step": 1034 + }, + { + "epoch": 0.3853358308803853, + "grad_norm": 0.18908272683620453, + "learning_rate": 1.999990953427068e-05, + "loss": 1.3191, + "step": 1035 + }, + { + "epoch": 0.3857081360309944, + "grad_norm": 0.19302143156528473, + "learning_rate": 1.9999904290959225e-05, + "loss": 1.3283, + "step": 1036 + }, + { + "epoch": 0.38608044118160345, + "grad_norm": 0.1931566596031189, + "learning_rate": 1.999989889994981e-05, + "loss": 1.3368, + "step": 1037 + }, + { + "epoch": 0.38645274633221255, + "grad_norm": 0.18609251081943512, + "learning_rate": 1.9999893361242512e-05, + "loss": 1.3135, + "step": 1038 + }, + { + "epoch": 0.3868250514828216, + "grad_norm": 0.19373655319213867, + "learning_rate": 1.9999887674837416e-05, + "loss": 1.3423, + "step": 1039 + }, + { + "epoch": 0.3871973566334307, + "grad_norm": 0.2070687711238861, + "learning_rate": 1.9999881840734613e-05, + "loss": 1.3103, + "step": 1040 + }, + { + "epoch": 0.38756966178403973, + "grad_norm": 0.18242163956165314, + "learning_rate": 1.999987585893418e-05, + "loss": 1.3367, + "step": 1041 + }, + { + "epoch": 0.38794196693464883, + "grad_norm": 0.17910490930080414, + "learning_rate": 1.9999869729436205e-05, + "loss": 1.3158, + "step": 1042 + }, + { + "epoch": 0.38831427208525787, + "grad_norm": 0.20917744934558868, + "learning_rate": 1.9999863452240784e-05, + "loss": 1.3203, + "step": 1043 + }, + { + "epoch": 0.38868657723586697, + "grad_norm": 0.16768380999565125, + "learning_rate": 1.9999857027348008e-05, + "loss": 1.324, + "step": 1044 + }, + { + "epoch": 0.389058882386476, + "grad_norm": 0.1901809275150299, + "learning_rate": 1.999985045475797e-05, + "loss": 1.333, + "step": 1045 + }, + { + "epoch": 0.3894311875370851, + "grad_norm": 0.18755842745304108, + "learning_rate": 1.9999843734470768e-05, + "loss": 1.3174, + "step": 1046 + }, + { + "epoch": 0.38980349268769415, + "grad_norm": 0.18284977972507477, + "learning_rate": 1.9999836866486505e-05, + "loss": 1.3204, + "step": 1047 + }, + { + "epoch": 0.3901757978383032, + "grad_norm": 0.19717617332935333, + "learning_rate": 1.9999829850805273e-05, + "loss": 1.3158, + "step": 1048 + }, + { + "epoch": 0.3905481029889123, + "grad_norm": 0.19506654143333435, + "learning_rate": 1.9999822687427188e-05, + "loss": 1.3254, + "step": 1049 + }, + { + "epoch": 0.39092040813952134, + "grad_norm": 0.18811020255088806, + "learning_rate": 1.9999815376352346e-05, + "loss": 1.3195, + "step": 1050 + }, + { + "epoch": 0.39129271329013043, + "grad_norm": 0.19153869152069092, + "learning_rate": 1.9999807917580858e-05, + "loss": 1.3346, + "step": 1051 + }, + { + "epoch": 0.3916650184407395, + "grad_norm": 0.17620836198329926, + "learning_rate": 1.9999800311112838e-05, + "loss": 1.3085, + "step": 1052 + }, + { + "epoch": 0.3920373235913486, + "grad_norm": 0.18910714983940125, + "learning_rate": 1.999979255694839e-05, + "loss": 1.3138, + "step": 1053 + }, + { + "epoch": 0.3924096287419576, + "grad_norm": 0.1759803593158722, + "learning_rate": 1.999978465508764e-05, + "loss": 1.3205, + "step": 1054 + }, + { + "epoch": 0.3927819338925667, + "grad_norm": 0.1760639101266861, + "learning_rate": 1.9999776605530693e-05, + "loss": 1.3111, + "step": 1055 + }, + { + "epoch": 0.39315423904317576, + "grad_norm": 0.1799839735031128, + "learning_rate": 1.9999768408277674e-05, + "loss": 1.3128, + "step": 1056 + }, + { + "epoch": 0.39352654419378486, + "grad_norm": 0.18230627477169037, + "learning_rate": 1.9999760063328705e-05, + "loss": 1.3294, + "step": 1057 + }, + { + "epoch": 0.3938988493443939, + "grad_norm": 0.17548716068267822, + "learning_rate": 1.9999751570683905e-05, + "loss": 1.3224, + "step": 1058 + }, + { + "epoch": 0.39427115449500294, + "grad_norm": 0.17289309203624725, + "learning_rate": 1.9999742930343404e-05, + "loss": 1.3113, + "step": 1059 + }, + { + "epoch": 0.39464345964561204, + "grad_norm": 0.18272966146469116, + "learning_rate": 1.9999734142307326e-05, + "loss": 1.3166, + "step": 1060 + }, + { + "epoch": 0.3950157647962211, + "grad_norm": 0.1845044642686844, + "learning_rate": 1.99997252065758e-05, + "loss": 1.3138, + "step": 1061 + }, + { + "epoch": 0.3953880699468302, + "grad_norm": 0.1838754415512085, + "learning_rate": 1.9999716123148966e-05, + "loss": 1.312, + "step": 1062 + }, + { + "epoch": 0.3957603750974392, + "grad_norm": 0.18985752761363983, + "learning_rate": 1.999970689202695e-05, + "loss": 1.3226, + "step": 1063 + }, + { + "epoch": 0.3961326802480483, + "grad_norm": 0.1912066787481308, + "learning_rate": 1.999969751320989e-05, + "loss": 1.3408, + "step": 1064 + }, + { + "epoch": 0.39650498539865736, + "grad_norm": 0.1811860352754593, + "learning_rate": 1.9999687986697925e-05, + "loss": 1.3085, + "step": 1065 + }, + { + "epoch": 0.39687729054926646, + "grad_norm": 0.18986211717128754, + "learning_rate": 1.9999678312491194e-05, + "loss": 1.3329, + "step": 1066 + }, + { + "epoch": 0.3972495956998755, + "grad_norm": 0.19396282732486725, + "learning_rate": 1.9999668490589848e-05, + "loss": 1.3115, + "step": 1067 + }, + { + "epoch": 0.3976219008504846, + "grad_norm": 0.19043684005737305, + "learning_rate": 1.999965852099402e-05, + "loss": 1.3174, + "step": 1068 + }, + { + "epoch": 0.39799420600109364, + "grad_norm": 0.18925096094608307, + "learning_rate": 1.9999648403703867e-05, + "loss": 1.3015, + "step": 1069 + }, + { + "epoch": 0.39836651115170274, + "grad_norm": 0.19922588765621185, + "learning_rate": 1.9999638138719532e-05, + "loss": 1.3154, + "step": 1070 + }, + { + "epoch": 0.3987388163023118, + "grad_norm": 0.1778346300125122, + "learning_rate": 1.999962772604117e-05, + "loss": 1.314, + "step": 1071 + }, + { + "epoch": 0.3991111214529208, + "grad_norm": 0.18273292481899261, + "learning_rate": 1.9999617165668935e-05, + "loss": 1.2956, + "step": 1072 + }, + { + "epoch": 0.3994834266035299, + "grad_norm": 0.18473650515079498, + "learning_rate": 1.999960645760298e-05, + "loss": 1.3015, + "step": 1073 + }, + { + "epoch": 0.39985573175413897, + "grad_norm": 0.20028461515903473, + "learning_rate": 1.9999595601843466e-05, + "loss": 1.3271, + "step": 1074 + }, + { + "epoch": 0.40022803690474806, + "grad_norm": 0.1781882643699646, + "learning_rate": 1.9999584598390556e-05, + "loss": 1.3189, + "step": 1075 + }, + { + "epoch": 0.4006003420553571, + "grad_norm": 0.18099310994148254, + "learning_rate": 1.9999573447244404e-05, + "loss": 1.3319, + "step": 1076 + }, + { + "epoch": 0.4009726472059662, + "grad_norm": 0.18824267387390137, + "learning_rate": 1.9999562148405184e-05, + "loss": 1.315, + "step": 1077 + }, + { + "epoch": 0.40134495235657525, + "grad_norm": 0.18727093935012817, + "learning_rate": 1.9999550701873056e-05, + "loss": 1.3246, + "step": 1078 + }, + { + "epoch": 0.40171725750718434, + "grad_norm": 0.19821962714195251, + "learning_rate": 1.9999539107648195e-05, + "loss": 1.2926, + "step": 1079 + }, + { + "epoch": 0.4020895626577934, + "grad_norm": 0.18535029888153076, + "learning_rate": 1.9999527365730766e-05, + "loss": 1.3103, + "step": 1080 + }, + { + "epoch": 0.4024618678084025, + "grad_norm": 0.18775507807731628, + "learning_rate": 1.9999515476120945e-05, + "loss": 1.3188, + "step": 1081 + }, + { + "epoch": 0.4028341729590115, + "grad_norm": 0.18621505796909332, + "learning_rate": 1.999950343881891e-05, + "loss": 1.3179, + "step": 1082 + }, + { + "epoch": 0.4032064781096206, + "grad_norm": 0.18887226283550262, + "learning_rate": 1.9999491253824833e-05, + "loss": 1.3146, + "step": 1083 + }, + { + "epoch": 0.40357878326022967, + "grad_norm": 0.18072552978992462, + "learning_rate": 1.9999478921138898e-05, + "loss": 1.3137, + "step": 1084 + }, + { + "epoch": 0.4039510884108387, + "grad_norm": 0.18956084549427032, + "learning_rate": 1.999946644076129e-05, + "loss": 1.3064, + "step": 1085 + }, + { + "epoch": 0.4043233935614478, + "grad_norm": 0.19679097831249237, + "learning_rate": 1.999945381269219e-05, + "loss": 1.2912, + "step": 1086 + }, + { + "epoch": 0.40469569871205685, + "grad_norm": 0.18827444314956665, + "learning_rate": 1.9999441036931784e-05, + "loss": 1.328, + "step": 1087 + }, + { + "epoch": 0.40506800386266595, + "grad_norm": 0.1727646291255951, + "learning_rate": 1.999942811348026e-05, + "loss": 1.3148, + "step": 1088 + }, + { + "epoch": 0.405440309013275, + "grad_norm": 0.18676549196243286, + "learning_rate": 1.999941504233781e-05, + "loss": 1.3315, + "step": 1089 + }, + { + "epoch": 0.4058126141638841, + "grad_norm": 0.18589061498641968, + "learning_rate": 1.9999401823504628e-05, + "loss": 1.3326, + "step": 1090 + }, + { + "epoch": 0.40618491931449313, + "grad_norm": 0.19661076366901398, + "learning_rate": 1.999938845698091e-05, + "loss": 1.3221, + "step": 1091 + }, + { + "epoch": 0.40655722446510223, + "grad_norm": 0.1824602484703064, + "learning_rate": 1.9999374942766853e-05, + "loss": 1.3216, + "step": 1092 + }, + { + "epoch": 0.40692952961571127, + "grad_norm": 0.1898437738418579, + "learning_rate": 1.999936128086265e-05, + "loss": 1.3212, + "step": 1093 + }, + { + "epoch": 0.40730183476632037, + "grad_norm": 0.18662068247795105, + "learning_rate": 1.9999347471268517e-05, + "loss": 1.316, + "step": 1094 + }, + { + "epoch": 0.4076741399169294, + "grad_norm": 0.17926158010959625, + "learning_rate": 1.9999333513984644e-05, + "loss": 1.3243, + "step": 1095 + }, + { + "epoch": 0.40804644506753845, + "grad_norm": 0.18369993567466736, + "learning_rate": 1.9999319409011243e-05, + "loss": 1.3001, + "step": 1096 + }, + { + "epoch": 0.40841875021814755, + "grad_norm": 0.18747882544994354, + "learning_rate": 1.9999305156348523e-05, + "loss": 1.3137, + "step": 1097 + }, + { + "epoch": 0.4087910553687566, + "grad_norm": 0.17961914837360382, + "learning_rate": 1.999929075599669e-05, + "loss": 1.3138, + "step": 1098 + }, + { + "epoch": 0.4091633605193657, + "grad_norm": 0.1906268298625946, + "learning_rate": 1.9999276207955965e-05, + "loss": 1.3301, + "step": 1099 + }, + { + "epoch": 0.40953566566997474, + "grad_norm": 0.1943434178829193, + "learning_rate": 1.9999261512226556e-05, + "loss": 1.3254, + "step": 1100 + }, + { + "epoch": 0.40990797082058383, + "grad_norm": 0.17660486698150635, + "learning_rate": 1.999924666880868e-05, + "loss": 1.3178, + "step": 1101 + }, + { + "epoch": 0.4102802759711929, + "grad_norm": 0.1914491206407547, + "learning_rate": 1.999923167770256e-05, + "loss": 1.303, + "step": 1102 + }, + { + "epoch": 0.410652581121802, + "grad_norm": 0.18666186928749084, + "learning_rate": 1.9999216538908416e-05, + "loss": 1.3144, + "step": 1103 + }, + { + "epoch": 0.411024886272411, + "grad_norm": 0.18117761611938477, + "learning_rate": 1.9999201252426473e-05, + "loss": 1.3063, + "step": 1104 + }, + { + "epoch": 0.4113971914230201, + "grad_norm": 0.1789981722831726, + "learning_rate": 1.9999185818256953e-05, + "loss": 1.3212, + "step": 1105 + }, + { + "epoch": 0.41176949657362916, + "grad_norm": 0.18627247214317322, + "learning_rate": 1.9999170236400087e-05, + "loss": 1.3088, + "step": 1106 + }, + { + "epoch": 0.41214180172423825, + "grad_norm": 0.18642041087150574, + "learning_rate": 1.99991545068561e-05, + "loss": 1.3296, + "step": 1107 + }, + { + "epoch": 0.4125141068748473, + "grad_norm": 0.1862434297800064, + "learning_rate": 1.999913862962523e-05, + "loss": 1.3261, + "step": 1108 + }, + { + "epoch": 0.41288641202545634, + "grad_norm": 0.1830601990222931, + "learning_rate": 1.9999122604707714e-05, + "loss": 1.3262, + "step": 1109 + }, + { + "epoch": 0.41325871717606544, + "grad_norm": 0.1928444504737854, + "learning_rate": 1.9999106432103785e-05, + "loss": 1.3062, + "step": 1110 + }, + { + "epoch": 0.4136310223266745, + "grad_norm": 0.1889939159154892, + "learning_rate": 1.9999090111813674e-05, + "loss": 1.3028, + "step": 1111 + }, + { + "epoch": 0.4140033274772836, + "grad_norm": 0.18017908930778503, + "learning_rate": 1.9999073643837637e-05, + "loss": 1.3231, + "step": 1112 + }, + { + "epoch": 0.4143756326278926, + "grad_norm": 0.18042001128196716, + "learning_rate": 1.9999057028175906e-05, + "loss": 1.301, + "step": 1113 + }, + { + "epoch": 0.4147479377785017, + "grad_norm": 0.1881111115217209, + "learning_rate": 1.999904026482873e-05, + "loss": 1.3057, + "step": 1114 + }, + { + "epoch": 0.41512024292911076, + "grad_norm": 0.17957095801830292, + "learning_rate": 1.9999023353796357e-05, + "loss": 1.316, + "step": 1115 + }, + { + "epoch": 0.41549254807971986, + "grad_norm": 0.19074207544326782, + "learning_rate": 1.999900629507904e-05, + "loss": 1.3326, + "step": 1116 + }, + { + "epoch": 0.4158648532303289, + "grad_norm": 0.19031281769275665, + "learning_rate": 1.9998989088677027e-05, + "loss": 1.3277, + "step": 1117 + }, + { + "epoch": 0.416237158380938, + "grad_norm": 0.17994950711727142, + "learning_rate": 1.9998971734590567e-05, + "loss": 1.3236, + "step": 1118 + }, + { + "epoch": 0.41660946353154704, + "grad_norm": 0.17953146994113922, + "learning_rate": 1.9998954232819928e-05, + "loss": 1.3225, + "step": 1119 + }, + { + "epoch": 0.41698176868215614, + "grad_norm": 0.18217253684997559, + "learning_rate": 1.9998936583365358e-05, + "loss": 1.3234, + "step": 1120 + }, + { + "epoch": 0.4173540738327652, + "grad_norm": 0.1890113651752472, + "learning_rate": 1.9998918786227124e-05, + "loss": 1.3206, + "step": 1121 + }, + { + "epoch": 0.4177263789833742, + "grad_norm": 0.18318399786949158, + "learning_rate": 1.999890084140549e-05, + "loss": 1.3222, + "step": 1122 + }, + { + "epoch": 0.4180986841339833, + "grad_norm": 0.19415515661239624, + "learning_rate": 1.9998882748900714e-05, + "loss": 1.3069, + "step": 1123 + }, + { + "epoch": 0.41847098928459237, + "grad_norm": 0.1776365041732788, + "learning_rate": 1.9998864508713068e-05, + "loss": 1.3146, + "step": 1124 + }, + { + "epoch": 0.41884329443520146, + "grad_norm": 0.18183977901935577, + "learning_rate": 1.9998846120842824e-05, + "loss": 1.3096, + "step": 1125 + }, + { + "epoch": 0.4192155995858105, + "grad_norm": 0.19202455878257751, + "learning_rate": 1.9998827585290245e-05, + "loss": 1.2982, + "step": 1126 + }, + { + "epoch": 0.4195879047364196, + "grad_norm": 0.18983449041843414, + "learning_rate": 1.9998808902055616e-05, + "loss": 1.32, + "step": 1127 + }, + { + "epoch": 0.41996020988702865, + "grad_norm": 0.17936941981315613, + "learning_rate": 1.9998790071139202e-05, + "loss": 1.3271, + "step": 1128 + }, + { + "epoch": 0.42033251503763774, + "grad_norm": 0.18909084796905518, + "learning_rate": 1.9998771092541287e-05, + "loss": 1.3014, + "step": 1129 + }, + { + "epoch": 0.4207048201882468, + "grad_norm": 0.19434739649295807, + "learning_rate": 1.9998751966262154e-05, + "loss": 1.3097, + "step": 1130 + }, + { + "epoch": 0.4210771253388559, + "grad_norm": 0.17867596447467804, + "learning_rate": 1.9998732692302077e-05, + "loss": 1.3138, + "step": 1131 + }, + { + "epoch": 0.4214494304894649, + "grad_norm": 0.17590618133544922, + "learning_rate": 1.999871327066135e-05, + "loss": 1.3162, + "step": 1132 + }, + { + "epoch": 0.42182173564007397, + "grad_norm": 0.19735798239707947, + "learning_rate": 1.999869370134025e-05, + "loss": 1.3109, + "step": 1133 + }, + { + "epoch": 0.42219404079068307, + "grad_norm": 0.18292424082756042, + "learning_rate": 1.999867398433908e-05, + "loss": 1.3073, + "step": 1134 + }, + { + "epoch": 0.4225663459412921, + "grad_norm": 0.1984787881374359, + "learning_rate": 1.9998654119658115e-05, + "loss": 1.3127, + "step": 1135 + }, + { + "epoch": 0.4229386510919012, + "grad_norm": 0.19433364272117615, + "learning_rate": 1.999863410729766e-05, + "loss": 1.3199, + "step": 1136 + }, + { + "epoch": 0.42331095624251025, + "grad_norm": 0.1794789433479309, + "learning_rate": 1.9998613947258006e-05, + "loss": 1.3065, + "step": 1137 + }, + { + "epoch": 0.42368326139311935, + "grad_norm": 0.18570461869239807, + "learning_rate": 1.9998593639539453e-05, + "loss": 1.3106, + "step": 1138 + }, + { + "epoch": 0.4240555665437284, + "grad_norm": 0.18316258490085602, + "learning_rate": 1.9998573184142294e-05, + "loss": 1.3232, + "step": 1139 + }, + { + "epoch": 0.4244278716943375, + "grad_norm": 0.19057685136795044, + "learning_rate": 1.9998552581066842e-05, + "loss": 1.3216, + "step": 1140 + }, + { + "epoch": 0.42480017684494653, + "grad_norm": 0.17034713923931122, + "learning_rate": 1.9998531830313394e-05, + "loss": 1.3071, + "step": 1141 + }, + { + "epoch": 0.42517248199555563, + "grad_norm": 0.19503635168075562, + "learning_rate": 1.999851093188226e-05, + "loss": 1.3007, + "step": 1142 + }, + { + "epoch": 0.42554478714616467, + "grad_norm": 0.18633443117141724, + "learning_rate": 1.9998489885773746e-05, + "loss": 1.3197, + "step": 1143 + }, + { + "epoch": 0.42591709229677377, + "grad_norm": 0.16780251264572144, + "learning_rate": 1.999846869198816e-05, + "loss": 1.3104, + "step": 1144 + }, + { + "epoch": 0.4262893974473828, + "grad_norm": 0.1772105097770691, + "learning_rate": 1.9998447350525822e-05, + "loss": 1.3154, + "step": 1145 + }, + { + "epoch": 0.42666170259799185, + "grad_norm": 0.1707058846950531, + "learning_rate": 1.9998425861387045e-05, + "loss": 1.3108, + "step": 1146 + }, + { + "epoch": 0.42703400774860095, + "grad_norm": 0.1806851178407669, + "learning_rate": 1.9998404224572147e-05, + "loss": 1.3152, + "step": 1147 + }, + { + "epoch": 0.42740631289921, + "grad_norm": 0.19098347425460815, + "learning_rate": 1.9998382440081442e-05, + "loss": 1.3187, + "step": 1148 + }, + { + "epoch": 0.4277786180498191, + "grad_norm": 0.18459391593933105, + "learning_rate": 1.999836050791526e-05, + "loss": 1.3139, + "step": 1149 + }, + { + "epoch": 0.42815092320042814, + "grad_norm": 0.1884915679693222, + "learning_rate": 1.9998338428073916e-05, + "loss": 1.318, + "step": 1150 + }, + { + "epoch": 0.42852322835103723, + "grad_norm": 0.181873619556427, + "learning_rate": 1.9998316200557742e-05, + "loss": 1.3283, + "step": 1151 + }, + { + "epoch": 0.4288955335016463, + "grad_norm": 0.17732377350330353, + "learning_rate": 1.9998293825367066e-05, + "loss": 1.2947, + "step": 1152 + }, + { + "epoch": 0.4292678386522554, + "grad_norm": 0.17769268155097961, + "learning_rate": 1.999827130250222e-05, + "loss": 1.3118, + "step": 1153 + }, + { + "epoch": 0.4296401438028644, + "grad_norm": 0.1880146563053131, + "learning_rate": 1.9998248631963532e-05, + "loss": 1.3123, + "step": 1154 + }, + { + "epoch": 0.4300124489534735, + "grad_norm": 0.18913207948207855, + "learning_rate": 1.9998225813751338e-05, + "loss": 1.3172, + "step": 1155 + }, + { + "epoch": 0.43038475410408256, + "grad_norm": 0.19146108627319336, + "learning_rate": 1.999820284786598e-05, + "loss": 1.3207, + "step": 1156 + }, + { + "epoch": 0.43075705925469165, + "grad_norm": 0.16881105303764343, + "learning_rate": 1.9998179734307794e-05, + "loss": 1.3047, + "step": 1157 + }, + { + "epoch": 0.4311293644053007, + "grad_norm": 0.18325145542621613, + "learning_rate": 1.9998156473077114e-05, + "loss": 1.3047, + "step": 1158 + }, + { + "epoch": 0.43150166955590974, + "grad_norm": 0.19172656536102295, + "learning_rate": 1.9998133064174297e-05, + "loss": 1.2973, + "step": 1159 + }, + { + "epoch": 0.43187397470651884, + "grad_norm": 0.18118645250797272, + "learning_rate": 1.9998109507599678e-05, + "loss": 1.3079, + "step": 1160 + }, + { + "epoch": 0.4322462798571279, + "grad_norm": 0.1834815889596939, + "learning_rate": 1.999808580335361e-05, + "loss": 1.3397, + "step": 1161 + }, + { + "epoch": 0.432618585007737, + "grad_norm": 0.19053733348846436, + "learning_rate": 1.9998061951436444e-05, + "loss": 1.3065, + "step": 1162 + }, + { + "epoch": 0.432990890158346, + "grad_norm": 0.18071451783180237, + "learning_rate": 1.999803795184853e-05, + "loss": 1.3144, + "step": 1163 + }, + { + "epoch": 0.4333631953089551, + "grad_norm": 0.17997939884662628, + "learning_rate": 1.9998013804590223e-05, + "loss": 1.311, + "step": 1164 + }, + { + "epoch": 0.43373550045956416, + "grad_norm": 0.19499291479587555, + "learning_rate": 1.999798950966188e-05, + "loss": 1.3367, + "step": 1165 + }, + { + "epoch": 0.43410780561017326, + "grad_norm": 0.20583321154117584, + "learning_rate": 1.9997965067063856e-05, + "loss": 1.312, + "step": 1166 + }, + { + "epoch": 0.4344801107607823, + "grad_norm": 0.18382562696933746, + "learning_rate": 1.9997940476796516e-05, + "loss": 1.3215, + "step": 1167 + }, + { + "epoch": 0.4348524159113914, + "grad_norm": 0.19162672758102417, + "learning_rate": 1.9997915738860224e-05, + "loss": 1.3025, + "step": 1168 + }, + { + "epoch": 0.43522472106200044, + "grad_norm": 0.1860532909631729, + "learning_rate": 1.9997890853255346e-05, + "loss": 1.2959, + "step": 1169 + }, + { + "epoch": 0.4355970262126095, + "grad_norm": 0.1830555945634842, + "learning_rate": 1.9997865819982247e-05, + "loss": 1.3056, + "step": 1170 + }, + { + "epoch": 0.4359693313632186, + "grad_norm": 0.18392489850521088, + "learning_rate": 1.9997840639041293e-05, + "loss": 1.3165, + "step": 1171 + }, + { + "epoch": 0.4363416365138276, + "grad_norm": 0.18183907866477966, + "learning_rate": 1.9997815310432864e-05, + "loss": 1.3102, + "step": 1172 + }, + { + "epoch": 0.4367139416644367, + "grad_norm": 0.19381216168403625, + "learning_rate": 1.999778983415733e-05, + "loss": 1.2985, + "step": 1173 + }, + { + "epoch": 0.43708624681504576, + "grad_norm": 0.18290908634662628, + "learning_rate": 1.9997764210215067e-05, + "loss": 1.3021, + "step": 1174 + }, + { + "epoch": 0.43745855196565486, + "grad_norm": 0.19478839635849, + "learning_rate": 1.9997738438606454e-05, + "loss": 1.3154, + "step": 1175 + }, + { + "epoch": 0.4378308571162639, + "grad_norm": 0.1829633265733719, + "learning_rate": 1.999771251933187e-05, + "loss": 1.3105, + "step": 1176 + }, + { + "epoch": 0.438203162266873, + "grad_norm": 0.1946062445640564, + "learning_rate": 1.9997686452391703e-05, + "loss": 1.315, + "step": 1177 + }, + { + "epoch": 0.43857546741748205, + "grad_norm": 0.185636505484581, + "learning_rate": 1.999766023778633e-05, + "loss": 1.3065, + "step": 1178 + }, + { + "epoch": 0.43894777256809114, + "grad_norm": 0.1923389434814453, + "learning_rate": 1.9997633875516148e-05, + "loss": 1.306, + "step": 1179 + }, + { + "epoch": 0.4393200777187002, + "grad_norm": 0.18182942271232605, + "learning_rate": 1.999760736558154e-05, + "loss": 1.295, + "step": 1180 + }, + { + "epoch": 0.4396923828693093, + "grad_norm": 0.1975245475769043, + "learning_rate": 1.9997580707982896e-05, + "loss": 1.3185, + "step": 1181 + }, + { + "epoch": 0.4400646880199183, + "grad_norm": 0.1868218183517456, + "learning_rate": 1.999755390272061e-05, + "loss": 1.3127, + "step": 1182 + }, + { + "epoch": 0.44043699317052737, + "grad_norm": 0.18964411318302155, + "learning_rate": 1.9997526949795087e-05, + "loss": 1.297, + "step": 1183 + }, + { + "epoch": 0.44080929832113647, + "grad_norm": 0.19626370072364807, + "learning_rate": 1.9997499849206715e-05, + "loss": 1.2892, + "step": 1184 + }, + { + "epoch": 0.4411816034717455, + "grad_norm": 0.1894233077764511, + "learning_rate": 1.99974726009559e-05, + "loss": 1.3238, + "step": 1185 + }, + { + "epoch": 0.4415539086223546, + "grad_norm": 0.20008660852909088, + "learning_rate": 1.9997445205043037e-05, + "loss": 1.3053, + "step": 1186 + }, + { + "epoch": 0.44192621377296365, + "grad_norm": 0.18232440948486328, + "learning_rate": 1.999741766146854e-05, + "loss": 1.3007, + "step": 1187 + }, + { + "epoch": 0.44229851892357275, + "grad_norm": 0.19060754776000977, + "learning_rate": 1.999738997023281e-05, + "loss": 1.2931, + "step": 1188 + }, + { + "epoch": 0.4426708240741818, + "grad_norm": 0.2020169347524643, + "learning_rate": 1.999736213133626e-05, + "loss": 1.3155, + "step": 1189 + }, + { + "epoch": 0.4430431292247909, + "grad_norm": 0.1948639452457428, + "learning_rate": 1.9997334144779295e-05, + "loss": 1.3057, + "step": 1190 + }, + { + "epoch": 0.44341543437539993, + "grad_norm": 0.1938532143831253, + "learning_rate": 1.9997306010562334e-05, + "loss": 1.2947, + "step": 1191 + }, + { + "epoch": 0.44378773952600903, + "grad_norm": 0.21005672216415405, + "learning_rate": 1.9997277728685788e-05, + "loss": 1.3209, + "step": 1192 + }, + { + "epoch": 0.44416004467661807, + "grad_norm": 0.20058314502239227, + "learning_rate": 1.9997249299150078e-05, + "loss": 1.2951, + "step": 1193 + }, + { + "epoch": 0.4445323498272271, + "grad_norm": 0.1923152059316635, + "learning_rate": 1.9997220721955627e-05, + "loss": 1.3118, + "step": 1194 + }, + { + "epoch": 0.4449046549778362, + "grad_norm": 0.19684681296348572, + "learning_rate": 1.9997191997102853e-05, + "loss": 1.3241, + "step": 1195 + }, + { + "epoch": 0.44527696012844525, + "grad_norm": 0.19519715011119843, + "learning_rate": 1.9997163124592175e-05, + "loss": 1.3147, + "step": 1196 + }, + { + "epoch": 0.44564926527905435, + "grad_norm": 0.19331727921962738, + "learning_rate": 1.9997134104424033e-05, + "loss": 1.3278, + "step": 1197 + }, + { + "epoch": 0.4460215704296634, + "grad_norm": 0.19848302006721497, + "learning_rate": 1.999710493659884e-05, + "loss": 1.307, + "step": 1198 + }, + { + "epoch": 0.4463938755802725, + "grad_norm": 0.19539180397987366, + "learning_rate": 1.999707562111704e-05, + "loss": 1.2933, + "step": 1199 + }, + { + "epoch": 0.44676618073088153, + "grad_norm": 0.18977102637290955, + "learning_rate": 1.999704615797906e-05, + "loss": 1.3087, + "step": 1200 + }, + { + "epoch": 0.44713848588149063, + "grad_norm": 0.2267744392156601, + "learning_rate": 1.9997016547185333e-05, + "loss": 1.3055, + "step": 1201 + }, + { + "epoch": 0.4475107910320997, + "grad_norm": 0.19115746021270752, + "learning_rate": 1.9996986788736298e-05, + "loss": 1.2991, + "step": 1202 + }, + { + "epoch": 0.4478830961827088, + "grad_norm": 0.1895446479320526, + "learning_rate": 1.99969568826324e-05, + "loss": 1.3006, + "step": 1203 + }, + { + "epoch": 0.4482554013333178, + "grad_norm": 0.18717582523822784, + "learning_rate": 1.999692682887407e-05, + "loss": 1.3014, + "step": 1204 + }, + { + "epoch": 0.4486277064839269, + "grad_norm": 0.19228947162628174, + "learning_rate": 1.9996896627461764e-05, + "loss": 1.3189, + "step": 1205 + }, + { + "epoch": 0.44900001163453596, + "grad_norm": 0.19240860641002655, + "learning_rate": 1.999686627839592e-05, + "loss": 1.2877, + "step": 1206 + }, + { + "epoch": 0.449372316785145, + "grad_norm": 0.18697458505630493, + "learning_rate": 1.999683578167699e-05, + "loss": 1.3058, + "step": 1207 + }, + { + "epoch": 0.4497446219357541, + "grad_norm": 0.21261201798915863, + "learning_rate": 1.999680513730542e-05, + "loss": 1.3055, + "step": 1208 + }, + { + "epoch": 0.45011692708636314, + "grad_norm": 0.19081415235996246, + "learning_rate": 1.9996774345281668e-05, + "loss": 1.2985, + "step": 1209 + }, + { + "epoch": 0.45048923223697224, + "grad_norm": 0.18838602304458618, + "learning_rate": 1.9996743405606188e-05, + "loss": 1.3032, + "step": 1210 + }, + { + "epoch": 0.4508615373875813, + "grad_norm": 0.19826094806194305, + "learning_rate": 1.999671231827943e-05, + "loss": 1.3116, + "step": 1211 + }, + { + "epoch": 0.4512338425381904, + "grad_norm": 0.1974056512117386, + "learning_rate": 1.999668108330186e-05, + "loss": 1.3211, + "step": 1212 + }, + { + "epoch": 0.4516061476887994, + "grad_norm": 0.1901020109653473, + "learning_rate": 1.999664970067394e-05, + "loss": 1.303, + "step": 1213 + }, + { + "epoch": 0.4519784528394085, + "grad_norm": 0.18617480993270874, + "learning_rate": 1.999661817039613e-05, + "loss": 1.3013, + "step": 1214 + }, + { + "epoch": 0.45235075799001756, + "grad_norm": 0.20741531252861023, + "learning_rate": 1.9996586492468895e-05, + "loss": 1.3031, + "step": 1215 + }, + { + "epoch": 0.45272306314062666, + "grad_norm": 0.18789881467819214, + "learning_rate": 1.999655466689271e-05, + "loss": 1.3058, + "step": 1216 + }, + { + "epoch": 0.4530953682912357, + "grad_norm": 0.1936379075050354, + "learning_rate": 1.9996522693668034e-05, + "loss": 1.311, + "step": 1217 + }, + { + "epoch": 0.4534676734418448, + "grad_norm": 0.20415879786014557, + "learning_rate": 1.9996490572795348e-05, + "loss": 1.32, + "step": 1218 + }, + { + "epoch": 0.45383997859245384, + "grad_norm": 0.19625741243362427, + "learning_rate": 1.9996458304275125e-05, + "loss": 1.3058, + "step": 1219 + }, + { + "epoch": 0.4542122837430629, + "grad_norm": 0.20820340514183044, + "learning_rate": 1.999642588810784e-05, + "loss": 1.3154, + "step": 1220 + }, + { + "epoch": 0.454584588893672, + "grad_norm": 0.2056087851524353, + "learning_rate": 1.9996393324293972e-05, + "loss": 1.3051, + "step": 1221 + }, + { + "epoch": 0.454956894044281, + "grad_norm": 0.19374904036521912, + "learning_rate": 1.9996360612833997e-05, + "loss": 1.3236, + "step": 1222 + }, + { + "epoch": 0.4553291991948901, + "grad_norm": 0.18062308430671692, + "learning_rate": 1.999632775372841e-05, + "loss": 1.292, + "step": 1223 + }, + { + "epoch": 0.45570150434549916, + "grad_norm": 0.20089417695999146, + "learning_rate": 1.9996294746977686e-05, + "loss": 1.3074, + "step": 1224 + }, + { + "epoch": 0.45607380949610826, + "grad_norm": 0.19090351462364197, + "learning_rate": 1.9996261592582312e-05, + "loss": 1.3025, + "step": 1225 + }, + { + "epoch": 0.4564461146467173, + "grad_norm": 0.19648805260658264, + "learning_rate": 1.9996228290542787e-05, + "loss": 1.3098, + "step": 1226 + }, + { + "epoch": 0.4568184197973264, + "grad_norm": 0.1995651125907898, + "learning_rate": 1.99961948408596e-05, + "loss": 1.3083, + "step": 1227 + }, + { + "epoch": 0.45719072494793545, + "grad_norm": 0.1871064007282257, + "learning_rate": 1.9996161243533238e-05, + "loss": 1.2835, + "step": 1228 + }, + { + "epoch": 0.45756303009854454, + "grad_norm": 0.19911763072013855, + "learning_rate": 1.9996127498564203e-05, + "loss": 1.3044, + "step": 1229 + }, + { + "epoch": 0.4579353352491536, + "grad_norm": 0.17743955552577972, + "learning_rate": 1.9996093605952992e-05, + "loss": 1.3067, + "step": 1230 + }, + { + "epoch": 0.45830764039976263, + "grad_norm": 0.19110862910747528, + "learning_rate": 1.9996059565700103e-05, + "loss": 1.2972, + "step": 1231 + }, + { + "epoch": 0.4586799455503717, + "grad_norm": 0.18435370922088623, + "learning_rate": 1.9996025377806044e-05, + "loss": 1.3045, + "step": 1232 + }, + { + "epoch": 0.45905225070098077, + "grad_norm": 0.18806539475917816, + "learning_rate": 1.999599104227132e-05, + "loss": 1.3115, + "step": 1233 + }, + { + "epoch": 0.45942455585158987, + "grad_norm": 0.18353897333145142, + "learning_rate": 1.9995956559096432e-05, + "loss": 1.3079, + "step": 1234 + }, + { + "epoch": 0.4597968610021989, + "grad_norm": 0.18023177981376648, + "learning_rate": 1.9995921928281893e-05, + "loss": 1.3037, + "step": 1235 + }, + { + "epoch": 0.460169166152808, + "grad_norm": 0.18274232745170593, + "learning_rate": 1.9995887149828216e-05, + "loss": 1.3211, + "step": 1236 + }, + { + "epoch": 0.46054147130341705, + "grad_norm": 0.18951238691806793, + "learning_rate": 1.9995852223735914e-05, + "loss": 1.299, + "step": 1237 + }, + { + "epoch": 0.46091377645402615, + "grad_norm": 0.1818682998418808, + "learning_rate": 1.9995817150005502e-05, + "loss": 1.2999, + "step": 1238 + }, + { + "epoch": 0.4612860816046352, + "grad_norm": 0.20125912129878998, + "learning_rate": 1.9995781928637494e-05, + "loss": 1.3045, + "step": 1239 + }, + { + "epoch": 0.4616583867552443, + "grad_norm": 0.16718147695064545, + "learning_rate": 1.9995746559632417e-05, + "loss": 1.2862, + "step": 1240 + }, + { + "epoch": 0.46203069190585333, + "grad_norm": 0.19009731709957123, + "learning_rate": 1.999571104299079e-05, + "loss": 1.3052, + "step": 1241 + }, + { + "epoch": 0.46240299705646243, + "grad_norm": 0.18542075157165527, + "learning_rate": 1.999567537871314e-05, + "loss": 1.2998, + "step": 1242 + }, + { + "epoch": 0.46277530220707147, + "grad_norm": 0.18148809671401978, + "learning_rate": 1.999563956679999e-05, + "loss": 1.3011, + "step": 1243 + }, + { + "epoch": 0.4631476073576805, + "grad_norm": 0.1700984239578247, + "learning_rate": 1.9995603607251873e-05, + "loss": 1.3053, + "step": 1244 + }, + { + "epoch": 0.4635199125082896, + "grad_norm": 0.19107471406459808, + "learning_rate": 1.9995567500069314e-05, + "loss": 1.3124, + "step": 1245 + }, + { + "epoch": 0.46389221765889865, + "grad_norm": 0.18561914563179016, + "learning_rate": 1.9995531245252854e-05, + "loss": 1.3055, + "step": 1246 + }, + { + "epoch": 0.46426452280950775, + "grad_norm": 0.18733622133731842, + "learning_rate": 1.9995494842803026e-05, + "loss": 1.3053, + "step": 1247 + }, + { + "epoch": 0.4646368279601168, + "grad_norm": 0.18674108386039734, + "learning_rate": 1.9995458292720364e-05, + "loss": 1.3043, + "step": 1248 + }, + { + "epoch": 0.4650091331107259, + "grad_norm": 0.17907080054283142, + "learning_rate": 1.9995421595005408e-05, + "loss": 1.3161, + "step": 1249 + }, + { + "epoch": 0.46538143826133493, + "grad_norm": 0.17312780022621155, + "learning_rate": 1.9995384749658705e-05, + "loss": 1.321, + "step": 1250 + }, + { + "epoch": 0.46575374341194403, + "grad_norm": 0.18137013912200928, + "learning_rate": 1.99953477566808e-05, + "loss": 1.3175, + "step": 1251 + }, + { + "epoch": 0.4661260485625531, + "grad_norm": 0.1776915341615677, + "learning_rate": 1.999531061607223e-05, + "loss": 1.3135, + "step": 1252 + }, + { + "epoch": 0.4664983537131622, + "grad_norm": 0.18204154074192047, + "learning_rate": 1.9995273327833553e-05, + "loss": 1.2955, + "step": 1253 + }, + { + "epoch": 0.4668706588637712, + "grad_norm": 0.18510939180850983, + "learning_rate": 1.999523589196531e-05, + "loss": 1.2945, + "step": 1254 + }, + { + "epoch": 0.4672429640143803, + "grad_norm": 0.17913533747196198, + "learning_rate": 1.999519830846807e-05, + "loss": 1.3177, + "step": 1255 + }, + { + "epoch": 0.46761526916498936, + "grad_norm": 0.18509091436862946, + "learning_rate": 1.9995160577342375e-05, + "loss": 1.3117, + "step": 1256 + }, + { + "epoch": 0.4679875743155984, + "grad_norm": 0.19263845682144165, + "learning_rate": 1.999512269858878e-05, + "loss": 1.2951, + "step": 1257 + }, + { + "epoch": 0.4683598794662075, + "grad_norm": 0.1835833191871643, + "learning_rate": 1.9995084672207855e-05, + "loss": 1.3027, + "step": 1258 + }, + { + "epoch": 0.46873218461681654, + "grad_norm": 0.18313245475292206, + "learning_rate": 1.9995046498200158e-05, + "loss": 1.2837, + "step": 1259 + }, + { + "epoch": 0.46910448976742564, + "grad_norm": 0.1853981614112854, + "learning_rate": 1.9995008176566247e-05, + "loss": 1.2973, + "step": 1260 + }, + { + "epoch": 0.4694767949180347, + "grad_norm": 0.20372353494167328, + "learning_rate": 1.9994969707306697e-05, + "loss": 1.3229, + "step": 1261 + }, + { + "epoch": 0.4698491000686438, + "grad_norm": 0.183163583278656, + "learning_rate": 1.9994931090422067e-05, + "loss": 1.3065, + "step": 1262 + }, + { + "epoch": 0.4702214052192528, + "grad_norm": 0.18896111845970154, + "learning_rate": 1.9994892325912937e-05, + "loss": 1.3135, + "step": 1263 + }, + { + "epoch": 0.4705937103698619, + "grad_norm": 0.18443109095096588, + "learning_rate": 1.999485341377987e-05, + "loss": 1.3002, + "step": 1264 + }, + { + "epoch": 0.47096601552047096, + "grad_norm": 0.19512967765331268, + "learning_rate": 1.9994814354023446e-05, + "loss": 1.2936, + "step": 1265 + }, + { + "epoch": 0.47133832067108006, + "grad_norm": 0.1836230754852295, + "learning_rate": 1.9994775146644245e-05, + "loss": 1.2921, + "step": 1266 + }, + { + "epoch": 0.4717106258216891, + "grad_norm": 0.20863986015319824, + "learning_rate": 1.999473579164284e-05, + "loss": 1.3075, + "step": 1267 + }, + { + "epoch": 0.47208293097229814, + "grad_norm": 0.19831794500350952, + "learning_rate": 1.999469628901981e-05, + "loss": 1.3106, + "step": 1268 + }, + { + "epoch": 0.47245523612290724, + "grad_norm": 0.18101930618286133, + "learning_rate": 1.999465663877575e-05, + "loss": 1.2915, + "step": 1269 + }, + { + "epoch": 0.4728275412735163, + "grad_norm": 0.20090216398239136, + "learning_rate": 1.9994616840911237e-05, + "loss": 1.2941, + "step": 1270 + }, + { + "epoch": 0.4731998464241254, + "grad_norm": 0.20153005421161652, + "learning_rate": 1.9994576895426858e-05, + "loss": 1.3137, + "step": 1271 + }, + { + "epoch": 0.4735721515747344, + "grad_norm": 0.18877728283405304, + "learning_rate": 1.999453680232321e-05, + "loss": 1.2897, + "step": 1272 + }, + { + "epoch": 0.4739444567253435, + "grad_norm": 0.19803088903427124, + "learning_rate": 1.9994496561600874e-05, + "loss": 1.2905, + "step": 1273 + }, + { + "epoch": 0.47431676187595256, + "grad_norm": 0.19023656845092773, + "learning_rate": 1.9994456173260457e-05, + "loss": 1.3019, + "step": 1274 + }, + { + "epoch": 0.47468906702656166, + "grad_norm": 0.17500866949558258, + "learning_rate": 1.9994415637302545e-05, + "loss": 1.3089, + "step": 1275 + }, + { + "epoch": 0.4750613721771707, + "grad_norm": 0.18865026533603668, + "learning_rate": 1.9994374953727747e-05, + "loss": 1.2977, + "step": 1276 + }, + { + "epoch": 0.4754336773277798, + "grad_norm": 0.176120787858963, + "learning_rate": 1.9994334122536654e-05, + "loss": 1.2825, + "step": 1277 + }, + { + "epoch": 0.47580598247838884, + "grad_norm": 0.18565769493579865, + "learning_rate": 1.9994293143729873e-05, + "loss": 1.3182, + "step": 1278 + }, + { + "epoch": 0.47617828762899794, + "grad_norm": 0.18790839612483978, + "learning_rate": 1.9994252017308012e-05, + "loss": 1.3215, + "step": 1279 + }, + { + "epoch": 0.476550592779607, + "grad_norm": 0.19500873982906342, + "learning_rate": 1.9994210743271675e-05, + "loss": 1.3114, + "step": 1280 + }, + { + "epoch": 0.476922897930216, + "grad_norm": 0.1855781525373459, + "learning_rate": 1.9994169321621474e-05, + "loss": 1.3051, + "step": 1281 + }, + { + "epoch": 0.4772952030808251, + "grad_norm": 0.1813666671514511, + "learning_rate": 1.9994127752358014e-05, + "loss": 1.2957, + "step": 1282 + }, + { + "epoch": 0.47766750823143417, + "grad_norm": 0.20357996225357056, + "learning_rate": 1.999408603548192e-05, + "loss": 1.2988, + "step": 1283 + }, + { + "epoch": 0.47803981338204327, + "grad_norm": 0.17496971786022186, + "learning_rate": 1.99940441709938e-05, + "loss": 1.3004, + "step": 1284 + }, + { + "epoch": 0.4784121185326523, + "grad_norm": 0.19494318962097168, + "learning_rate": 1.9994002158894274e-05, + "loss": 1.3138, + "step": 1285 + }, + { + "epoch": 0.4787844236832614, + "grad_norm": 0.19843032956123352, + "learning_rate": 1.9993959999183964e-05, + "loss": 1.3031, + "step": 1286 + }, + { + "epoch": 0.47915672883387045, + "grad_norm": 0.19136017560958862, + "learning_rate": 1.9993917691863493e-05, + "loss": 1.3004, + "step": 1287 + }, + { + "epoch": 0.47952903398447955, + "grad_norm": 0.18518052995204926, + "learning_rate": 1.9993875236933486e-05, + "loss": 1.2824, + "step": 1288 + }, + { + "epoch": 0.4799013391350886, + "grad_norm": 0.18933340907096863, + "learning_rate": 1.9993832634394564e-05, + "loss": 1.2983, + "step": 1289 + }, + { + "epoch": 0.4802736442856977, + "grad_norm": 0.1836833357810974, + "learning_rate": 1.9993789884247365e-05, + "loss": 1.2916, + "step": 1290 + }, + { + "epoch": 0.48064594943630673, + "grad_norm": 0.17370636761188507, + "learning_rate": 1.9993746986492515e-05, + "loss": 1.2841, + "step": 1291 + }, + { + "epoch": 0.4810182545869158, + "grad_norm": 0.19281500577926636, + "learning_rate": 1.999370394113065e-05, + "loss": 1.3149, + "step": 1292 + }, + { + "epoch": 0.48139055973752487, + "grad_norm": 0.1878812462091446, + "learning_rate": 1.999366074816241e-05, + "loss": 1.3104, + "step": 1293 + }, + { + "epoch": 0.4817628648881339, + "grad_norm": 0.18660883605480194, + "learning_rate": 1.999361740758842e-05, + "loss": 1.3022, + "step": 1294 + }, + { + "epoch": 0.482135170038743, + "grad_norm": 0.1894964575767517, + "learning_rate": 1.999357391940933e-05, + "loss": 1.3056, + "step": 1295 + }, + { + "epoch": 0.48250747518935205, + "grad_norm": 0.18290723860263824, + "learning_rate": 1.999353028362578e-05, + "loss": 1.3045, + "step": 1296 + }, + { + "epoch": 0.48287978033996115, + "grad_norm": 0.1781376600265503, + "learning_rate": 1.9993486500238417e-05, + "loss": 1.3114, + "step": 1297 + }, + { + "epoch": 0.4832520854905702, + "grad_norm": 0.1864173412322998, + "learning_rate": 1.9993442569247885e-05, + "loss": 1.3034, + "step": 1298 + }, + { + "epoch": 0.4836243906411793, + "grad_norm": 0.19244234263896942, + "learning_rate": 1.9993398490654835e-05, + "loss": 1.3071, + "step": 1299 + }, + { + "epoch": 0.48399669579178833, + "grad_norm": 0.18302254378795624, + "learning_rate": 1.9993354264459913e-05, + "loss": 1.3059, + "step": 1300 + }, + { + "epoch": 0.48436900094239743, + "grad_norm": 0.17523464560508728, + "learning_rate": 1.9993309890663775e-05, + "loss": 1.2919, + "step": 1301 + }, + { + "epoch": 0.4847413060930065, + "grad_norm": 0.18764130771160126, + "learning_rate": 1.999326536926708e-05, + "loss": 1.2931, + "step": 1302 + }, + { + "epoch": 0.4851136112436156, + "grad_norm": 0.1812962144613266, + "learning_rate": 1.9993220700270484e-05, + "loss": 1.289, + "step": 1303 + }, + { + "epoch": 0.4854859163942246, + "grad_norm": 0.1877674013376236, + "learning_rate": 1.9993175883674642e-05, + "loss": 1.2957, + "step": 1304 + }, + { + "epoch": 0.48585822154483366, + "grad_norm": 0.24576683342456818, + "learning_rate": 1.9993130919480223e-05, + "loss": 1.3056, + "step": 1305 + }, + { + "epoch": 0.48623052669544276, + "grad_norm": 0.185043066740036, + "learning_rate": 1.9993085807687883e-05, + "loss": 1.297, + "step": 1306 + }, + { + "epoch": 0.4866028318460518, + "grad_norm": 0.19536662101745605, + "learning_rate": 1.9993040548298297e-05, + "loss": 1.3053, + "step": 1307 + }, + { + "epoch": 0.4869751369966609, + "grad_norm": 0.17894597351551056, + "learning_rate": 1.9992995141312126e-05, + "loss": 1.3136, + "step": 1308 + }, + { + "epoch": 0.48734744214726994, + "grad_norm": 0.1845971643924713, + "learning_rate": 1.9992949586730046e-05, + "loss": 1.3001, + "step": 1309 + }, + { + "epoch": 0.48771974729787904, + "grad_norm": 0.18812014162540436, + "learning_rate": 1.9992903884552727e-05, + "loss": 1.2948, + "step": 1310 + }, + { + "epoch": 0.4880920524484881, + "grad_norm": 0.18313339352607727, + "learning_rate": 1.9992858034780848e-05, + "loss": 1.3007, + "step": 1311 + }, + { + "epoch": 0.4884643575990972, + "grad_norm": 0.18373283743858337, + "learning_rate": 1.9992812037415077e-05, + "loss": 1.2924, + "step": 1312 + }, + { + "epoch": 0.4888366627497062, + "grad_norm": 0.1760503053665161, + "learning_rate": 1.9992765892456102e-05, + "loss": 1.2953, + "step": 1313 + }, + { + "epoch": 0.4892089679003153, + "grad_norm": 0.18641230463981628, + "learning_rate": 1.99927195999046e-05, + "loss": 1.288, + "step": 1314 + }, + { + "epoch": 0.48958127305092436, + "grad_norm": 0.1766699105501175, + "learning_rate": 1.999267315976126e-05, + "loss": 1.2889, + "step": 1315 + }, + { + "epoch": 0.48995357820153346, + "grad_norm": 0.17779386043548584, + "learning_rate": 1.9992626572026764e-05, + "loss": 1.2969, + "step": 1316 + }, + { + "epoch": 0.4903258833521425, + "grad_norm": 0.1781335324048996, + "learning_rate": 1.9992579836701796e-05, + "loss": 1.2939, + "step": 1317 + }, + { + "epoch": 0.49069818850275154, + "grad_norm": 0.18048399686813354, + "learning_rate": 1.9992532953787057e-05, + "loss": 1.29, + "step": 1318 + }, + { + "epoch": 0.49107049365336064, + "grad_norm": 0.18843825161457062, + "learning_rate": 1.999248592328323e-05, + "loss": 1.3121, + "step": 1319 + }, + { + "epoch": 0.4914427988039697, + "grad_norm": 0.1733110100030899, + "learning_rate": 1.9992438745191017e-05, + "loss": 1.3121, + "step": 1320 + }, + { + "epoch": 0.4918151039545788, + "grad_norm": 0.18750248849391937, + "learning_rate": 1.999239141951111e-05, + "loss": 1.298, + "step": 1321 + }, + { + "epoch": 0.4921874091051878, + "grad_norm": 0.18104903399944305, + "learning_rate": 1.9992343946244205e-05, + "loss": 1.3, + "step": 1322 + }, + { + "epoch": 0.4925597142557969, + "grad_norm": 0.1810617595911026, + "learning_rate": 1.9992296325391004e-05, + "loss": 1.3075, + "step": 1323 + }, + { + "epoch": 0.49293201940640596, + "grad_norm": 0.18038879334926605, + "learning_rate": 1.999224855695222e-05, + "loss": 1.304, + "step": 1324 + }, + { + "epoch": 0.49330432455701506, + "grad_norm": 0.17841710150241852, + "learning_rate": 1.999220064092855e-05, + "loss": 1.3075, + "step": 1325 + }, + { + "epoch": 0.4936766297076241, + "grad_norm": 0.18616177141666412, + "learning_rate": 1.9992152577320706e-05, + "loss": 1.2902, + "step": 1326 + }, + { + "epoch": 0.4940489348582332, + "grad_norm": 0.18415139615535736, + "learning_rate": 1.999210436612939e-05, + "loss": 1.3003, + "step": 1327 + }, + { + "epoch": 0.49442124000884224, + "grad_norm": 0.2058713585138321, + "learning_rate": 1.9992056007355323e-05, + "loss": 1.3074, + "step": 1328 + }, + { + "epoch": 0.49479354515945134, + "grad_norm": 0.17545197904109955, + "learning_rate": 1.9992007500999216e-05, + "loss": 1.3167, + "step": 1329 + }, + { + "epoch": 0.4951658503100604, + "grad_norm": 0.1859792321920395, + "learning_rate": 1.9991958847061786e-05, + "loss": 1.3226, + "step": 1330 + }, + { + "epoch": 0.4955381554606694, + "grad_norm": 0.2005636990070343, + "learning_rate": 1.999191004554375e-05, + "loss": 1.3016, + "step": 1331 + }, + { + "epoch": 0.4959104606112785, + "grad_norm": 0.17621280252933502, + "learning_rate": 1.999186109644583e-05, + "loss": 1.2963, + "step": 1332 + }, + { + "epoch": 0.49628276576188757, + "grad_norm": 0.18345515429973602, + "learning_rate": 1.9991811999768747e-05, + "loss": 1.2984, + "step": 1333 + }, + { + "epoch": 0.49665507091249667, + "grad_norm": 0.191512331366539, + "learning_rate": 1.999176275551323e-05, + "loss": 1.2894, + "step": 1334 + }, + { + "epoch": 0.4970273760631057, + "grad_norm": 0.18425214290618896, + "learning_rate": 1.9991713363680002e-05, + "loss": 1.2952, + "step": 1335 + }, + { + "epoch": 0.4973996812137148, + "grad_norm": 0.4703981280326843, + "learning_rate": 1.9991663824269797e-05, + "loss": 1.3056, + "step": 1336 + }, + { + "epoch": 0.49777198636432385, + "grad_norm": 0.19318270683288574, + "learning_rate": 1.999161413728334e-05, + "loss": 1.2895, + "step": 1337 + }, + { + "epoch": 0.49814429151493295, + "grad_norm": 0.19595178961753845, + "learning_rate": 1.9991564302721374e-05, + "loss": 1.2952, + "step": 1338 + }, + { + "epoch": 0.498516596665542, + "grad_norm": 0.20309020578861237, + "learning_rate": 1.9991514320584628e-05, + "loss": 1.3124, + "step": 1339 + }, + { + "epoch": 0.4988889018161511, + "grad_norm": 0.19227808713912964, + "learning_rate": 1.9991464190873845e-05, + "loss": 1.2956, + "step": 1340 + }, + { + "epoch": 0.49926120696676013, + "grad_norm": 0.19931499660015106, + "learning_rate": 1.999141391358976e-05, + "loss": 1.2961, + "step": 1341 + }, + { + "epoch": 0.49963351211736917, + "grad_norm": 0.19872671365737915, + "learning_rate": 1.999136348873312e-05, + "loss": 1.2977, + "step": 1342 + }, + { + "epoch": 0.5000058172679782, + "grad_norm": 0.1944931149482727, + "learning_rate": 1.999131291630467e-05, + "loss": 1.2856, + "step": 1343 + }, + { + "epoch": 0.5003781224185874, + "grad_norm": 0.19236011803150177, + "learning_rate": 1.9991262196305153e-05, + "loss": 1.2963, + "step": 1344 + }, + { + "epoch": 0.5007504275691964, + "grad_norm": 0.19329576194286346, + "learning_rate": 1.999121132873532e-05, + "loss": 1.3028, + "step": 1345 + }, + { + "epoch": 0.5011227327198055, + "grad_norm": 0.17316564917564392, + "learning_rate": 1.9991160313595924e-05, + "loss": 1.2826, + "step": 1346 + }, + { + "epoch": 0.5014950378704145, + "grad_norm": 0.19512668251991272, + "learning_rate": 1.9991109150887715e-05, + "loss": 1.3035, + "step": 1347 + }, + { + "epoch": 0.5018673430210236, + "grad_norm": 0.19115027785301208, + "learning_rate": 1.9991057840611453e-05, + "loss": 1.3046, + "step": 1348 + }, + { + "epoch": 0.5022396481716327, + "grad_norm": 0.19715119898319244, + "learning_rate": 1.9991006382767892e-05, + "loss": 1.3129, + "step": 1349 + }, + { + "epoch": 0.5026119533222417, + "grad_norm": 0.18349628150463104, + "learning_rate": 1.9990954777357795e-05, + "loss": 1.3005, + "step": 1350 + }, + { + "epoch": 0.5029842584728508, + "grad_norm": 0.19362014532089233, + "learning_rate": 1.999090302438192e-05, + "loss": 1.3151, + "step": 1351 + }, + { + "epoch": 0.5033565636234599, + "grad_norm": 0.18228283524513245, + "learning_rate": 1.999085112384104e-05, + "loss": 1.3068, + "step": 1352 + }, + { + "epoch": 0.503728868774069, + "grad_norm": 0.193775936961174, + "learning_rate": 1.9990799075735912e-05, + "loss": 1.3139, + "step": 1353 + }, + { + "epoch": 0.504101173924678, + "grad_norm": 0.18167784810066223, + "learning_rate": 1.999074688006731e-05, + "loss": 1.2844, + "step": 1354 + }, + { + "epoch": 0.5044734790752871, + "grad_norm": 0.19863297045230865, + "learning_rate": 1.9990694536836002e-05, + "loss": 1.2932, + "step": 1355 + }, + { + "epoch": 0.5048457842258961, + "grad_norm": 0.189836323261261, + "learning_rate": 1.9990642046042766e-05, + "loss": 1.284, + "step": 1356 + }, + { + "epoch": 0.5052180893765053, + "grad_norm": 0.19672173261642456, + "learning_rate": 1.9990589407688373e-05, + "loss": 1.3062, + "step": 1357 + }, + { + "epoch": 0.5055903945271143, + "grad_norm": 0.18417510390281677, + "learning_rate": 1.99905366217736e-05, + "loss": 1.303, + "step": 1358 + }, + { + "epoch": 0.5059626996777233, + "grad_norm": 0.18591108918190002, + "learning_rate": 1.999048368829923e-05, + "loss": 1.291, + "step": 1359 + }, + { + "epoch": 0.5063350048283324, + "grad_norm": 0.1832229644060135, + "learning_rate": 1.9990430607266038e-05, + "loss": 1.2888, + "step": 1360 + }, + { + "epoch": 0.5067073099789415, + "grad_norm": 0.21367454528808594, + "learning_rate": 1.999037737867482e-05, + "loss": 1.307, + "step": 1361 + }, + { + "epoch": 0.5070796151295506, + "grad_norm": 0.18742305040359497, + "learning_rate": 1.999032400252635e-05, + "loss": 1.2955, + "step": 1362 + }, + { + "epoch": 0.5074519202801596, + "grad_norm": 0.18082669377326965, + "learning_rate": 1.9990270478821422e-05, + "loss": 1.2794, + "step": 1363 + }, + { + "epoch": 0.5078242254307687, + "grad_norm": 0.19011190533638, + "learning_rate": 1.9990216807560827e-05, + "loss": 1.2864, + "step": 1364 + }, + { + "epoch": 0.5081965305813778, + "grad_norm": 0.18790535628795624, + "learning_rate": 1.9990162988745357e-05, + "loss": 1.2974, + "step": 1365 + }, + { + "epoch": 0.5085688357319869, + "grad_norm": 0.19116294384002686, + "learning_rate": 1.9990109022375807e-05, + "loss": 1.2972, + "step": 1366 + }, + { + "epoch": 0.5089411408825959, + "grad_norm": 0.19198502600193024, + "learning_rate": 1.999005490845297e-05, + "loss": 1.3173, + "step": 1367 + }, + { + "epoch": 0.5093134460332049, + "grad_norm": 0.19381704926490784, + "learning_rate": 1.9990000646977653e-05, + "loss": 1.3012, + "step": 1368 + }, + { + "epoch": 0.509685751183814, + "grad_norm": 0.1835579127073288, + "learning_rate": 1.9989946237950653e-05, + "loss": 1.3028, + "step": 1369 + }, + { + "epoch": 0.5100580563344231, + "grad_norm": 0.18221747875213623, + "learning_rate": 1.998989168137277e-05, + "loss": 1.2992, + "step": 1370 + }, + { + "epoch": 0.5104303614850322, + "grad_norm": 0.18950197100639343, + "learning_rate": 1.998983697724482e-05, + "loss": 1.2997, + "step": 1371 + }, + { + "epoch": 0.5108026666356412, + "grad_norm": 0.18854863941669464, + "learning_rate": 1.99897821255676e-05, + "loss": 1.3083, + "step": 1372 + }, + { + "epoch": 0.5111749717862503, + "grad_norm": 0.1852334886789322, + "learning_rate": 1.9989727126341927e-05, + "loss": 1.2923, + "step": 1373 + }, + { + "epoch": 0.5115472769368594, + "grad_norm": 0.18260884284973145, + "learning_rate": 1.998967197956861e-05, + "loss": 1.2941, + "step": 1374 + }, + { + "epoch": 0.5119195820874685, + "grad_norm": 0.1905146837234497, + "learning_rate": 1.9989616685248468e-05, + "loss": 1.3061, + "step": 1375 + }, + { + "epoch": 0.5122918872380775, + "grad_norm": 0.17799590528011322, + "learning_rate": 1.9989561243382313e-05, + "loss": 1.305, + "step": 1376 + }, + { + "epoch": 0.5126641923886865, + "grad_norm": 0.18852990865707397, + "learning_rate": 1.9989505653970963e-05, + "loss": 1.3001, + "step": 1377 + }, + { + "epoch": 0.5130364975392957, + "grad_norm": 0.19264602661132812, + "learning_rate": 1.9989449917015242e-05, + "loss": 1.2991, + "step": 1378 + }, + { + "epoch": 0.5134088026899047, + "grad_norm": 0.1937609314918518, + "learning_rate": 1.9989394032515974e-05, + "loss": 1.2826, + "step": 1379 + }, + { + "epoch": 0.5137811078405138, + "grad_norm": 0.19785352051258087, + "learning_rate": 1.9989338000473982e-05, + "loss": 1.2972, + "step": 1380 + }, + { + "epoch": 0.5141534129911228, + "grad_norm": 0.18624554574489594, + "learning_rate": 1.9989281820890095e-05, + "loss": 1.2628, + "step": 1381 + }, + { + "epoch": 0.5145257181417319, + "grad_norm": 0.18756809830665588, + "learning_rate": 1.9989225493765144e-05, + "loss": 1.2842, + "step": 1382 + }, + { + "epoch": 0.514898023292341, + "grad_norm": 0.18710888922214508, + "learning_rate": 1.9989169019099956e-05, + "loss": 1.2887, + "step": 1383 + }, + { + "epoch": 0.5152703284429501, + "grad_norm": 0.17629897594451904, + "learning_rate": 1.9989112396895374e-05, + "loss": 1.2993, + "step": 1384 + }, + { + "epoch": 0.5156426335935591, + "grad_norm": 0.18918879330158234, + "learning_rate": 1.9989055627152222e-05, + "loss": 1.304, + "step": 1385 + }, + { + "epoch": 0.5160149387441682, + "grad_norm": 0.1894582360982895, + "learning_rate": 1.998899870987135e-05, + "loss": 1.2917, + "step": 1386 + }, + { + "epoch": 0.5163872438947773, + "grad_norm": 0.18650588393211365, + "learning_rate": 1.9988941645053594e-05, + "loss": 1.2985, + "step": 1387 + }, + { + "epoch": 0.5167595490453863, + "grad_norm": 0.17856687307357788, + "learning_rate": 1.9988884432699795e-05, + "loss": 1.2898, + "step": 1388 + }, + { + "epoch": 0.5171318541959954, + "grad_norm": 0.19840949773788452, + "learning_rate": 1.9988827072810798e-05, + "loss": 1.2982, + "step": 1389 + }, + { + "epoch": 0.5175041593466044, + "grad_norm": 0.18756093084812164, + "learning_rate": 1.9988769565387454e-05, + "loss": 1.294, + "step": 1390 + }, + { + "epoch": 0.5178764644972135, + "grad_norm": 0.17636418342590332, + "learning_rate": 1.9988711910430613e-05, + "loss": 1.291, + "step": 1391 + }, + { + "epoch": 0.5182487696478226, + "grad_norm": 0.18831880390644073, + "learning_rate": 1.998865410794112e-05, + "loss": 1.2831, + "step": 1392 + }, + { + "epoch": 0.5186210747984317, + "grad_norm": 0.17440542578697205, + "learning_rate": 1.9988596157919836e-05, + "loss": 1.305, + "step": 1393 + }, + { + "epoch": 0.5189933799490407, + "grad_norm": 0.18249642848968506, + "learning_rate": 1.9988538060367612e-05, + "loss": 1.2852, + "step": 1394 + }, + { + "epoch": 0.5193656850996498, + "grad_norm": 0.18289236724376678, + "learning_rate": 1.9988479815285308e-05, + "loss": 1.2902, + "step": 1395 + }, + { + "epoch": 0.5197379902502589, + "grad_norm": 0.18215063214302063, + "learning_rate": 1.998842142267378e-05, + "loss": 1.2896, + "step": 1396 + }, + { + "epoch": 0.520110295400868, + "grad_norm": 0.18357405066490173, + "learning_rate": 1.99883628825339e-05, + "loss": 1.2959, + "step": 1397 + }, + { + "epoch": 0.520482600551477, + "grad_norm": 0.18299397826194763, + "learning_rate": 1.9988304194866527e-05, + "loss": 1.3121, + "step": 1398 + }, + { + "epoch": 0.520854905702086, + "grad_norm": 0.18076153099536896, + "learning_rate": 1.9988245359672523e-05, + "loss": 1.3005, + "step": 1399 + }, + { + "epoch": 0.5212272108526952, + "grad_norm": 0.17696769535541534, + "learning_rate": 1.9988186376952766e-05, + "loss": 1.2815, + "step": 1400 + }, + { + "epoch": 0.5215995160033042, + "grad_norm": 0.18042761087417603, + "learning_rate": 1.998812724670812e-05, + "loss": 1.2893, + "step": 1401 + }, + { + "epoch": 0.5219718211539133, + "grad_norm": 0.1830262839794159, + "learning_rate": 1.9988067968939463e-05, + "loss": 1.2929, + "step": 1402 + }, + { + "epoch": 0.5223441263045223, + "grad_norm": 0.18442308902740479, + "learning_rate": 1.998800854364767e-05, + "loss": 1.2998, + "step": 1403 + }, + { + "epoch": 0.5227164314551314, + "grad_norm": 0.1778024435043335, + "learning_rate": 1.9987948970833616e-05, + "loss": 1.2871, + "step": 1404 + }, + { + "epoch": 0.5230887366057405, + "grad_norm": 0.17833255231380463, + "learning_rate": 1.9987889250498185e-05, + "loss": 1.2935, + "step": 1405 + }, + { + "epoch": 0.5234610417563496, + "grad_norm": 0.1755494475364685, + "learning_rate": 1.9987829382642253e-05, + "loss": 1.2908, + "step": 1406 + }, + { + "epoch": 0.5238333469069586, + "grad_norm": 0.18358327448368073, + "learning_rate": 1.998776936726671e-05, + "loss": 1.2984, + "step": 1407 + }, + { + "epoch": 0.5242056520575676, + "grad_norm": 0.1703406274318695, + "learning_rate": 1.998770920437244e-05, + "loss": 1.2906, + "step": 1408 + }, + { + "epoch": 0.5245779572081768, + "grad_norm": 0.1967104971408844, + "learning_rate": 1.9987648893960334e-05, + "loss": 1.293, + "step": 1409 + }, + { + "epoch": 0.5249502623587858, + "grad_norm": 0.19022947549819946, + "learning_rate": 1.998758843603128e-05, + "loss": 1.2919, + "step": 1410 + }, + { + "epoch": 0.5253225675093949, + "grad_norm": 0.17558811604976654, + "learning_rate": 1.9987527830586168e-05, + "loss": 1.2928, + "step": 1411 + }, + { + "epoch": 0.5256948726600039, + "grad_norm": 0.1828990876674652, + "learning_rate": 1.99874670776259e-05, + "loss": 1.2967, + "step": 1412 + }, + { + "epoch": 0.5260671778106131, + "grad_norm": 0.18160808086395264, + "learning_rate": 1.9987406177151368e-05, + "loss": 1.2936, + "step": 1413 + }, + { + "epoch": 0.5264394829612221, + "grad_norm": 0.1777198314666748, + "learning_rate": 1.9987345129163472e-05, + "loss": 1.2832, + "step": 1414 + }, + { + "epoch": 0.5268117881118312, + "grad_norm": 0.1877478063106537, + "learning_rate": 1.998728393366312e-05, + "loss": 1.3125, + "step": 1415 + }, + { + "epoch": 0.5271840932624402, + "grad_norm": 0.17702911794185638, + "learning_rate": 1.9987222590651206e-05, + "loss": 1.2747, + "step": 1416 + }, + { + "epoch": 0.5275563984130492, + "grad_norm": 0.19214600324630737, + "learning_rate": 1.9987161100128646e-05, + "loss": 1.2897, + "step": 1417 + }, + { + "epoch": 0.5279287035636584, + "grad_norm": 0.18091824650764465, + "learning_rate": 1.9987099462096342e-05, + "loss": 1.29, + "step": 1418 + }, + { + "epoch": 0.5283010087142674, + "grad_norm": 0.19012725353240967, + "learning_rate": 1.9987037676555205e-05, + "loss": 1.2834, + "step": 1419 + }, + { + "epoch": 0.5286733138648765, + "grad_norm": 0.18362641334533691, + "learning_rate": 1.9986975743506146e-05, + "loss": 1.2939, + "step": 1420 + }, + { + "epoch": 0.5290456190154855, + "grad_norm": 0.17870792746543884, + "learning_rate": 1.9986913662950084e-05, + "loss": 1.2931, + "step": 1421 + }, + { + "epoch": 0.5294179241660947, + "grad_norm": 0.16860565543174744, + "learning_rate": 1.9986851434887934e-05, + "loss": 1.3015, + "step": 1422 + }, + { + "epoch": 0.5297902293167037, + "grad_norm": 0.18669748306274414, + "learning_rate": 1.9986789059320614e-05, + "loss": 1.2966, + "step": 1423 + }, + { + "epoch": 0.5301625344673128, + "grad_norm": 0.1844332367181778, + "learning_rate": 1.998672653624905e-05, + "loss": 1.2936, + "step": 1424 + }, + { + "epoch": 0.5305348396179218, + "grad_norm": 0.18035277724266052, + "learning_rate": 1.998666386567416e-05, + "loss": 1.2964, + "step": 1425 + }, + { + "epoch": 0.530907144768531, + "grad_norm": 0.19097861647605896, + "learning_rate": 1.998660104759687e-05, + "loss": 1.305, + "step": 1426 + }, + { + "epoch": 0.53127944991914, + "grad_norm": 0.17276540398597717, + "learning_rate": 1.998653808201811e-05, + "loss": 1.3028, + "step": 1427 + }, + { + "epoch": 0.531651755069749, + "grad_norm": 0.19778695702552795, + "learning_rate": 1.9986474968938808e-05, + "loss": 1.2907, + "step": 1428 + }, + { + "epoch": 0.5320240602203581, + "grad_norm": 0.18655353784561157, + "learning_rate": 1.99864117083599e-05, + "loss": 1.2774, + "step": 1429 + }, + { + "epoch": 0.5323963653709671, + "grad_norm": 0.18784590065479279, + "learning_rate": 1.9986348300282318e-05, + "loss": 1.2953, + "step": 1430 + }, + { + "epoch": 0.5327686705215763, + "grad_norm": 0.17535433173179626, + "learning_rate": 1.9986284744706995e-05, + "loss": 1.2927, + "step": 1431 + }, + { + "epoch": 0.5331409756721853, + "grad_norm": 0.1903795450925827, + "learning_rate": 1.9986221041634874e-05, + "loss": 1.2778, + "step": 1432 + }, + { + "epoch": 0.5335132808227944, + "grad_norm": 0.1824638694524765, + "learning_rate": 1.9986157191066897e-05, + "loss": 1.2905, + "step": 1433 + }, + { + "epoch": 0.5338855859734034, + "grad_norm": 0.17660009860992432, + "learning_rate": 1.9986093193004005e-05, + "loss": 1.2794, + "step": 1434 + }, + { + "epoch": 0.5342578911240126, + "grad_norm": 0.18862681090831757, + "learning_rate": 1.998602904744714e-05, + "loss": 1.2997, + "step": 1435 + }, + { + "epoch": 0.5346301962746216, + "grad_norm": 0.18798676133155823, + "learning_rate": 1.9985964754397256e-05, + "loss": 1.2973, + "step": 1436 + }, + { + "epoch": 0.5350025014252306, + "grad_norm": 0.18016012012958527, + "learning_rate": 1.9985900313855297e-05, + "loss": 1.2902, + "step": 1437 + }, + { + "epoch": 0.5353748065758397, + "grad_norm": 0.18391664326190948, + "learning_rate": 1.9985835725822217e-05, + "loss": 1.2734, + "step": 1438 + }, + { + "epoch": 0.5357471117264488, + "grad_norm": 0.2144862860441208, + "learning_rate": 1.998577099029897e-05, + "loss": 1.2937, + "step": 1439 + }, + { + "epoch": 0.5361194168770579, + "grad_norm": 0.18601621687412262, + "learning_rate": 1.9985706107286515e-05, + "loss": 1.2836, + "step": 1440 + }, + { + "epoch": 0.5364917220276669, + "grad_norm": 0.18405480682849884, + "learning_rate": 1.9985641076785806e-05, + "loss": 1.2814, + "step": 1441 + }, + { + "epoch": 0.536864027178276, + "grad_norm": 0.19651199877262115, + "learning_rate": 1.9985575898797803e-05, + "loss": 1.2892, + "step": 1442 + }, + { + "epoch": 0.537236332328885, + "grad_norm": 0.18740807473659515, + "learning_rate": 1.9985510573323474e-05, + "loss": 1.3043, + "step": 1443 + }, + { + "epoch": 0.5376086374794942, + "grad_norm": 0.17624454200267792, + "learning_rate": 1.998544510036378e-05, + "loss": 1.2874, + "step": 1444 + }, + { + "epoch": 0.5379809426301032, + "grad_norm": 0.19030825793743134, + "learning_rate": 1.9985379479919685e-05, + "loss": 1.2915, + "step": 1445 + }, + { + "epoch": 0.5383532477807123, + "grad_norm": 0.18553490936756134, + "learning_rate": 1.9985313711992164e-05, + "loss": 1.3027, + "step": 1446 + }, + { + "epoch": 0.5387255529313213, + "grad_norm": 0.17176979780197144, + "learning_rate": 1.998524779658219e-05, + "loss": 1.2953, + "step": 1447 + }, + { + "epoch": 0.5390978580819304, + "grad_norm": 0.1822216957807541, + "learning_rate": 1.9985181733690728e-05, + "loss": 1.2795, + "step": 1448 + }, + { + "epoch": 0.5394701632325395, + "grad_norm": 0.18704013526439667, + "learning_rate": 1.9985115523318758e-05, + "loss": 1.2843, + "step": 1449 + }, + { + "epoch": 0.5398424683831485, + "grad_norm": 0.18013562262058258, + "learning_rate": 1.998504916546726e-05, + "loss": 1.2947, + "step": 1450 + }, + { + "epoch": 0.5402147735337576, + "grad_norm": 0.1960912048816681, + "learning_rate": 1.998498266013721e-05, + "loss": 1.3077, + "step": 1451 + }, + { + "epoch": 0.5405870786843667, + "grad_norm": 0.18962568044662476, + "learning_rate": 1.9984916007329596e-05, + "loss": 1.2998, + "step": 1452 + }, + { + "epoch": 0.5409593838349758, + "grad_norm": 0.17837461829185486, + "learning_rate": 1.99848492070454e-05, + "loss": 1.2742, + "step": 1453 + }, + { + "epoch": 0.5413316889855848, + "grad_norm": 0.17769278585910797, + "learning_rate": 1.9984782259285604e-05, + "loss": 1.29, + "step": 1454 + }, + { + "epoch": 0.5417039941361939, + "grad_norm": 0.19605344533920288, + "learning_rate": 1.9984715164051203e-05, + "loss": 1.2954, + "step": 1455 + }, + { + "epoch": 0.5420762992868029, + "grad_norm": 0.18324050307273865, + "learning_rate": 1.9984647921343185e-05, + "loss": 1.3072, + "step": 1456 + }, + { + "epoch": 0.542448604437412, + "grad_norm": 0.18658418953418732, + "learning_rate": 1.9984580531162544e-05, + "loss": 1.3097, + "step": 1457 + }, + { + "epoch": 0.5428209095880211, + "grad_norm": 0.1968865692615509, + "learning_rate": 1.9984512993510275e-05, + "loss": 1.2943, + "step": 1458 + }, + { + "epoch": 0.5431932147386301, + "grad_norm": 0.19187164306640625, + "learning_rate": 1.9984445308387377e-05, + "loss": 1.29, + "step": 1459 + }, + { + "epoch": 0.5435655198892392, + "grad_norm": 0.18562346696853638, + "learning_rate": 1.9984377475794847e-05, + "loss": 1.2895, + "step": 1460 + }, + { + "epoch": 0.5439378250398483, + "grad_norm": 0.18785440921783447, + "learning_rate": 1.998430949573369e-05, + "loss": 1.3003, + "step": 1461 + }, + { + "epoch": 0.5443101301904574, + "grad_norm": 0.17857135832309723, + "learning_rate": 1.9984241368204907e-05, + "loss": 1.2806, + "step": 1462 + }, + { + "epoch": 0.5446824353410664, + "grad_norm": 0.18769045174121857, + "learning_rate": 1.99841730932095e-05, + "loss": 1.2866, + "step": 1463 + }, + { + "epoch": 0.5450547404916755, + "grad_norm": 0.1790071427822113, + "learning_rate": 1.9984104670748493e-05, + "loss": 1.2814, + "step": 1464 + }, + { + "epoch": 0.5454270456422845, + "grad_norm": 0.19664452970027924, + "learning_rate": 1.998403610082288e-05, + "loss": 1.2965, + "step": 1465 + }, + { + "epoch": 0.5457993507928937, + "grad_norm": 0.19964231550693512, + "learning_rate": 1.9983967383433685e-05, + "loss": 1.2875, + "step": 1466 + }, + { + "epoch": 0.5461716559435027, + "grad_norm": 0.1774752289056778, + "learning_rate": 1.9983898518581913e-05, + "loss": 1.2741, + "step": 1467 + }, + { + "epoch": 0.5465439610941117, + "grad_norm": 0.18312522768974304, + "learning_rate": 1.998382950626859e-05, + "loss": 1.2932, + "step": 1468 + }, + { + "epoch": 0.5469162662447208, + "grad_norm": 0.19389499723911285, + "learning_rate": 1.998376034649473e-05, + "loss": 1.2921, + "step": 1469 + }, + { + "epoch": 0.5472885713953299, + "grad_norm": 0.22545579075813293, + "learning_rate": 1.9983691039261358e-05, + "loss": 1.3011, + "step": 1470 + }, + { + "epoch": 0.547660876545939, + "grad_norm": 0.17456892132759094, + "learning_rate": 1.9983621584569496e-05, + "loss": 1.2762, + "step": 1471 + }, + { + "epoch": 0.548033181696548, + "grad_norm": 0.20221838355064392, + "learning_rate": 1.9983551982420168e-05, + "loss": 1.2822, + "step": 1472 + }, + { + "epoch": 0.5484054868471571, + "grad_norm": 0.17790015041828156, + "learning_rate": 1.9983482232814405e-05, + "loss": 1.2854, + "step": 1473 + }, + { + "epoch": 0.5487777919977662, + "grad_norm": 0.19563356041908264, + "learning_rate": 1.9983412335753237e-05, + "loss": 1.2901, + "step": 1474 + }, + { + "epoch": 0.5491500971483753, + "grad_norm": 0.17833010852336884, + "learning_rate": 1.9983342291237693e-05, + "loss": 1.2993, + "step": 1475 + }, + { + "epoch": 0.5495224022989843, + "grad_norm": 0.18136514723300934, + "learning_rate": 1.998327209926881e-05, + "loss": 1.2829, + "step": 1476 + }, + { + "epoch": 0.5498947074495933, + "grad_norm": 0.19292525947093964, + "learning_rate": 1.9983201759847627e-05, + "loss": 1.2967, + "step": 1477 + }, + { + "epoch": 0.5502670126002024, + "grad_norm": 0.1802646517753601, + "learning_rate": 1.9983131272975178e-05, + "loss": 1.2871, + "step": 1478 + }, + { + "epoch": 0.5506393177508115, + "grad_norm": 0.18988975882530212, + "learning_rate": 1.9983060638652507e-05, + "loss": 1.2881, + "step": 1479 + }, + { + "epoch": 0.5510116229014206, + "grad_norm": 0.18343466520309448, + "learning_rate": 1.9982989856880655e-05, + "loss": 1.2826, + "step": 1480 + }, + { + "epoch": 0.5513839280520296, + "grad_norm": 0.18973736464977264, + "learning_rate": 1.9982918927660676e-05, + "loss": 1.277, + "step": 1481 + }, + { + "epoch": 0.5517562332026387, + "grad_norm": 0.1893860101699829, + "learning_rate": 1.9982847850993605e-05, + "loss": 1.2789, + "step": 1482 + }, + { + "epoch": 0.5521285383532478, + "grad_norm": 0.17867526412010193, + "learning_rate": 1.9982776626880498e-05, + "loss": 1.2916, + "step": 1483 + }, + { + "epoch": 0.5525008435038569, + "grad_norm": 0.1821223944425583, + "learning_rate": 1.998270525532241e-05, + "loss": 1.2678, + "step": 1484 + }, + { + "epoch": 0.5528731486544659, + "grad_norm": 0.18600653111934662, + "learning_rate": 1.998263373632039e-05, + "loss": 1.2878, + "step": 1485 + }, + { + "epoch": 0.553245453805075, + "grad_norm": 0.17727398872375488, + "learning_rate": 1.9982562069875495e-05, + "loss": 1.293, + "step": 1486 + }, + { + "epoch": 0.5536177589556841, + "grad_norm": 0.1803734004497528, + "learning_rate": 1.9982490255988786e-05, + "loss": 1.2873, + "step": 1487 + }, + { + "epoch": 0.5539900641062931, + "grad_norm": 0.1834772229194641, + "learning_rate": 1.9982418294661322e-05, + "loss": 1.2884, + "step": 1488 + }, + { + "epoch": 0.5543623692569022, + "grad_norm": 0.1872669756412506, + "learning_rate": 1.998234618589417e-05, + "loss": 1.2798, + "step": 1489 + }, + { + "epoch": 0.5547346744075112, + "grad_norm": 0.18299680948257446, + "learning_rate": 1.9982273929688384e-05, + "loss": 1.3008, + "step": 1490 + }, + { + "epoch": 0.5551069795581203, + "grad_norm": 0.1841609627008438, + "learning_rate": 1.9982201526045044e-05, + "loss": 1.2893, + "step": 1491 + }, + { + "epoch": 0.5554792847087294, + "grad_norm": 0.19560469686985016, + "learning_rate": 1.9982128974965215e-05, + "loss": 1.2846, + "step": 1492 + }, + { + "epoch": 0.5558515898593385, + "grad_norm": 0.17704376578330994, + "learning_rate": 1.998205627644996e-05, + "loss": 1.2972, + "step": 1493 + }, + { + "epoch": 0.5562238950099475, + "grad_norm": 0.18441899120807648, + "learning_rate": 1.9981983430500368e-05, + "loss": 1.3004, + "step": 1494 + }, + { + "epoch": 0.5565962001605566, + "grad_norm": 0.19001679122447968, + "learning_rate": 1.9981910437117502e-05, + "loss": 1.2903, + "step": 1495 + }, + { + "epoch": 0.5569685053111657, + "grad_norm": 0.19117416441440582, + "learning_rate": 1.998183729630245e-05, + "loss": 1.287, + "step": 1496 + }, + { + "epoch": 0.5573408104617747, + "grad_norm": 0.1872899830341339, + "learning_rate": 1.9981764008056283e-05, + "loss": 1.291, + "step": 1497 + }, + { + "epoch": 0.5577131156123838, + "grad_norm": 0.19934000074863434, + "learning_rate": 1.998169057238009e-05, + "loss": 1.2897, + "step": 1498 + }, + { + "epoch": 0.5580854207629928, + "grad_norm": 0.1839592605829239, + "learning_rate": 1.9981616989274955e-05, + "loss": 1.2772, + "step": 1499 + }, + { + "epoch": 0.558457725913602, + "grad_norm": 0.18648071587085724, + "learning_rate": 1.998154325874196e-05, + "loss": 1.291, + "step": 1500 + }, + { + "epoch": 0.558457725913602, + "eval_loss": 1.3492937088012695, + "eval_runtime": 16.1571, + "eval_samples_per_second": 107.321, + "eval_steps_per_second": 5.385, + "step": 1500 + }, + { + "epoch": 0.558830031064211, + "grad_norm": 0.18601854145526886, + "learning_rate": 1.9981469380782205e-05, + "loss": 1.2898, + "step": 1501 + }, + { + "epoch": 0.5592023362148201, + "grad_norm": 0.18247583508491516, + "learning_rate": 1.9981395355396764e-05, + "loss": 1.276, + "step": 1502 + }, + { + "epoch": 0.5595746413654291, + "grad_norm": 0.19537685811519623, + "learning_rate": 1.9981321182586746e-05, + "loss": 1.2851, + "step": 1503 + }, + { + "epoch": 0.5599469465160382, + "grad_norm": 0.17951083183288574, + "learning_rate": 1.998124686235324e-05, + "loss": 1.2892, + "step": 1504 + }, + { + "epoch": 0.5603192516666473, + "grad_norm": 0.1824089139699936, + "learning_rate": 1.998117239469734e-05, + "loss": 1.285, + "step": 1505 + }, + { + "epoch": 0.5606915568172564, + "grad_norm": 0.19962851703166962, + "learning_rate": 1.9981097779620156e-05, + "loss": 1.2866, + "step": 1506 + }, + { + "epoch": 0.5610638619678654, + "grad_norm": 0.17792317271232605, + "learning_rate": 1.998102301712278e-05, + "loss": 1.281, + "step": 1507 + }, + { + "epoch": 0.5614361671184744, + "grad_norm": 0.20243586599826813, + "learning_rate": 1.9980948107206323e-05, + "loss": 1.3045, + "step": 1508 + }, + { + "epoch": 0.5618084722690836, + "grad_norm": 0.1958632916212082, + "learning_rate": 1.998087304987189e-05, + "loss": 1.2902, + "step": 1509 + }, + { + "epoch": 0.5621807774196926, + "grad_norm": 0.17324163019657135, + "learning_rate": 1.9980797845120583e-05, + "loss": 1.2868, + "step": 1510 + }, + { + "epoch": 0.5625530825703017, + "grad_norm": 0.18130120635032654, + "learning_rate": 1.998072249295352e-05, + "loss": 1.289, + "step": 1511 + }, + { + "epoch": 0.5629253877209107, + "grad_norm": 0.18327681720256805, + "learning_rate": 1.9980646993371816e-05, + "loss": 1.269, + "step": 1512 + }, + { + "epoch": 0.5632976928715199, + "grad_norm": 0.1806684136390686, + "learning_rate": 1.998057134637658e-05, + "loss": 1.2989, + "step": 1513 + }, + { + "epoch": 0.5636699980221289, + "grad_norm": 0.18309220671653748, + "learning_rate": 1.998049555196893e-05, + "loss": 1.287, + "step": 1514 + }, + { + "epoch": 0.564042303172738, + "grad_norm": 0.19237829744815826, + "learning_rate": 1.998041961014999e-05, + "loss": 1.2894, + "step": 1515 + }, + { + "epoch": 0.564414608323347, + "grad_norm": 0.1768263727426529, + "learning_rate": 1.998034352092088e-05, + "loss": 1.2811, + "step": 1516 + }, + { + "epoch": 0.564786913473956, + "grad_norm": 0.17326796054840088, + "learning_rate": 1.9980267284282718e-05, + "loss": 1.2783, + "step": 1517 + }, + { + "epoch": 0.5651592186245652, + "grad_norm": 0.18072554469108582, + "learning_rate": 1.9980190900236637e-05, + "loss": 1.2742, + "step": 1518 + }, + { + "epoch": 0.5655315237751742, + "grad_norm": 0.17865929007530212, + "learning_rate": 1.998011436878376e-05, + "loss": 1.281, + "step": 1519 + }, + { + "epoch": 0.5659038289257833, + "grad_norm": 0.18478450179100037, + "learning_rate": 1.998003768992522e-05, + "loss": 1.2918, + "step": 1520 + }, + { + "epoch": 0.5662761340763923, + "grad_norm": 0.18366698920726776, + "learning_rate": 1.9979960863662155e-05, + "loss": 1.2721, + "step": 1521 + }, + { + "epoch": 0.5666484392270015, + "grad_norm": 0.17557835578918457, + "learning_rate": 1.997988388999569e-05, + "loss": 1.2887, + "step": 1522 + }, + { + "epoch": 0.5670207443776105, + "grad_norm": 0.19200293719768524, + "learning_rate": 1.997980676892697e-05, + "loss": 1.2778, + "step": 1523 + }, + { + "epoch": 0.5673930495282196, + "grad_norm": 0.18071214854717255, + "learning_rate": 1.9979729500457125e-05, + "loss": 1.2984, + "step": 1524 + }, + { + "epoch": 0.5677653546788286, + "grad_norm": 0.18595604598522186, + "learning_rate": 1.9979652084587305e-05, + "loss": 1.2879, + "step": 1525 + }, + { + "epoch": 0.5681376598294378, + "grad_norm": 0.17925933003425598, + "learning_rate": 1.9979574521318648e-05, + "loss": 1.2851, + "step": 1526 + }, + { + "epoch": 0.5685099649800468, + "grad_norm": 0.1706983745098114, + "learning_rate": 1.9979496810652303e-05, + "loss": 1.2867, + "step": 1527 + }, + { + "epoch": 0.5688822701306558, + "grad_norm": 0.1855979859828949, + "learning_rate": 1.9979418952589417e-05, + "loss": 1.3028, + "step": 1528 + }, + { + "epoch": 0.5692545752812649, + "grad_norm": 0.1966056227684021, + "learning_rate": 1.997934094713114e-05, + "loss": 1.2808, + "step": 1529 + }, + { + "epoch": 0.5696268804318739, + "grad_norm": 0.1873730570077896, + "learning_rate": 1.997926279427862e-05, + "loss": 1.2833, + "step": 1530 + }, + { + "epoch": 0.5699991855824831, + "grad_norm": 0.19508777558803558, + "learning_rate": 1.9979184494033016e-05, + "loss": 1.2821, + "step": 1531 + }, + { + "epoch": 0.5703714907330921, + "grad_norm": 0.17242860794067383, + "learning_rate": 1.9979106046395487e-05, + "loss": 1.2833, + "step": 1532 + }, + { + "epoch": 0.5707437958837012, + "grad_norm": 0.1809634268283844, + "learning_rate": 1.9979027451367185e-05, + "loss": 1.2796, + "step": 1533 + }, + { + "epoch": 0.5711161010343102, + "grad_norm": 0.17092421650886536, + "learning_rate": 1.9978948708949274e-05, + "loss": 1.2873, + "step": 1534 + }, + { + "epoch": 0.5714884061849194, + "grad_norm": 0.19138309359550476, + "learning_rate": 1.9978869819142915e-05, + "loss": 1.2923, + "step": 1535 + }, + { + "epoch": 0.5718607113355284, + "grad_norm": 0.18097564578056335, + "learning_rate": 1.997879078194928e-05, + "loss": 1.2754, + "step": 1536 + }, + { + "epoch": 0.5722330164861374, + "grad_norm": 0.1787206530570984, + "learning_rate": 1.9978711597369528e-05, + "loss": 1.2976, + "step": 1537 + }, + { + "epoch": 0.5726053216367465, + "grad_norm": 0.1908443570137024, + "learning_rate": 1.997863226540483e-05, + "loss": 1.2946, + "step": 1538 + }, + { + "epoch": 0.5729776267873555, + "grad_norm": 0.1926680952310562, + "learning_rate": 1.9978552786056364e-05, + "loss": 1.2935, + "step": 1539 + }, + { + "epoch": 0.5733499319379647, + "grad_norm": 0.18011966347694397, + "learning_rate": 1.9978473159325296e-05, + "loss": 1.2915, + "step": 1540 + }, + { + "epoch": 0.5737222370885737, + "grad_norm": 0.17944326996803284, + "learning_rate": 1.997839338521281e-05, + "loss": 1.2818, + "step": 1541 + }, + { + "epoch": 0.5740945422391828, + "grad_norm": 0.1845857799053192, + "learning_rate": 1.9978313463720073e-05, + "loss": 1.2721, + "step": 1542 + }, + { + "epoch": 0.5744668473897918, + "grad_norm": 0.18156015872955322, + "learning_rate": 1.997823339484828e-05, + "loss": 1.2868, + "step": 1543 + }, + { + "epoch": 0.574839152540401, + "grad_norm": 0.18522031605243683, + "learning_rate": 1.99781531785986e-05, + "loss": 1.2795, + "step": 1544 + }, + { + "epoch": 0.57521145769101, + "grad_norm": 0.20020869374275208, + "learning_rate": 1.9978072814972226e-05, + "loss": 1.3067, + "step": 1545 + }, + { + "epoch": 0.575583762841619, + "grad_norm": 0.17684873938560486, + "learning_rate": 1.9977992303970342e-05, + "loss": 1.2903, + "step": 1546 + }, + { + "epoch": 0.5759560679922281, + "grad_norm": 0.18267862498760223, + "learning_rate": 1.997791164559414e-05, + "loss": 1.2972, + "step": 1547 + }, + { + "epoch": 0.5763283731428372, + "grad_norm": 0.17601077258586884, + "learning_rate": 1.9977830839844808e-05, + "loss": 1.278, + "step": 1548 + }, + { + "epoch": 0.5767006782934463, + "grad_norm": 0.18240796029567719, + "learning_rate": 1.997774988672354e-05, + "loss": 1.2753, + "step": 1549 + }, + { + "epoch": 0.5770729834440553, + "grad_norm": 0.19379922747612, + "learning_rate": 1.9977668786231536e-05, + "loss": 1.2932, + "step": 1550 + }, + { + "epoch": 0.5774452885946644, + "grad_norm": 0.19803746044635773, + "learning_rate": 1.9977587538369985e-05, + "loss": 1.2876, + "step": 1551 + }, + { + "epoch": 0.5778175937452734, + "grad_norm": 0.19004493951797485, + "learning_rate": 1.9977506143140094e-05, + "loss": 1.2687, + "step": 1552 + }, + { + "epoch": 0.5781898988958826, + "grad_norm": 0.18594302237033844, + "learning_rate": 1.9977424600543065e-05, + "loss": 1.2875, + "step": 1553 + }, + { + "epoch": 0.5785622040464916, + "grad_norm": 0.193922758102417, + "learning_rate": 1.9977342910580097e-05, + "loss": 1.2779, + "step": 1554 + }, + { + "epoch": 0.5789345091971007, + "grad_norm": 0.19569186866283417, + "learning_rate": 1.9977261073252405e-05, + "loss": 1.2926, + "step": 1555 + }, + { + "epoch": 0.5793068143477097, + "grad_norm": 0.18837492167949677, + "learning_rate": 1.9977179088561193e-05, + "loss": 1.2963, + "step": 1556 + }, + { + "epoch": 0.5796791194983189, + "grad_norm": 0.17669405043125153, + "learning_rate": 1.9977096956507668e-05, + "loss": 1.2761, + "step": 1557 + }, + { + "epoch": 0.5800514246489279, + "grad_norm": 0.191486656665802, + "learning_rate": 1.997701467709305e-05, + "loss": 1.2913, + "step": 1558 + }, + { + "epoch": 0.5804237297995369, + "grad_norm": 0.18390990793704987, + "learning_rate": 1.997693225031855e-05, + "loss": 1.2948, + "step": 1559 + }, + { + "epoch": 0.580796034950146, + "grad_norm": 0.18185609579086304, + "learning_rate": 1.9976849676185384e-05, + "loss": 1.283, + "step": 1560 + }, + { + "epoch": 0.5811683401007551, + "grad_norm": 0.1948121190071106, + "learning_rate": 1.997676695469478e-05, + "loss": 1.2848, + "step": 1561 + }, + { + "epoch": 0.5815406452513642, + "grad_norm": 0.18750061094760895, + "learning_rate": 1.997668408584795e-05, + "loss": 1.2828, + "step": 1562 + }, + { + "epoch": 0.5819129504019732, + "grad_norm": 0.1849110871553421, + "learning_rate": 1.9976601069646126e-05, + "loss": 1.2883, + "step": 1563 + }, + { + "epoch": 0.5822852555525823, + "grad_norm": 0.18599045276641846, + "learning_rate": 1.9976517906090528e-05, + "loss": 1.2742, + "step": 1564 + }, + { + "epoch": 0.5826575607031913, + "grad_norm": 0.1824532300233841, + "learning_rate": 1.997643459518239e-05, + "loss": 1.2835, + "step": 1565 + }, + { + "epoch": 0.5830298658538005, + "grad_norm": 0.18060551583766937, + "learning_rate": 1.9976351136922934e-05, + "loss": 1.2756, + "step": 1566 + }, + { + "epoch": 0.5834021710044095, + "grad_norm": 0.18784838914871216, + "learning_rate": 1.99762675313134e-05, + "loss": 1.2935, + "step": 1567 + }, + { + "epoch": 0.5837744761550185, + "grad_norm": 0.25187990069389343, + "learning_rate": 1.9976183778355018e-05, + "loss": 1.2882, + "step": 1568 + }, + { + "epoch": 0.5841467813056276, + "grad_norm": 0.17487937211990356, + "learning_rate": 1.997609987804903e-05, + "loss": 1.2734, + "step": 1569 + }, + { + "epoch": 0.5845190864562367, + "grad_norm": 0.18456514179706573, + "learning_rate": 1.9976015830396676e-05, + "loss": 1.2816, + "step": 1570 + }, + { + "epoch": 0.5848913916068458, + "grad_norm": 0.18243633210659027, + "learning_rate": 1.997593163539919e-05, + "loss": 1.2809, + "step": 1571 + }, + { + "epoch": 0.5852636967574548, + "grad_norm": 0.17159947752952576, + "learning_rate": 1.9975847293057822e-05, + "loss": 1.274, + "step": 1572 + }, + { + "epoch": 0.5856360019080639, + "grad_norm": 0.18149283528327942, + "learning_rate": 1.9975762803373815e-05, + "loss": 1.2842, + "step": 1573 + }, + { + "epoch": 0.586008307058673, + "grad_norm": 0.19415758550167084, + "learning_rate": 1.9975678166348417e-05, + "loss": 1.2903, + "step": 1574 + }, + { + "epoch": 0.5863806122092821, + "grad_norm": 0.18272031843662262, + "learning_rate": 1.9975593381982877e-05, + "loss": 1.2931, + "step": 1575 + }, + { + "epoch": 0.5867529173598911, + "grad_norm": 0.19155685603618622, + "learning_rate": 1.997550845027845e-05, + "loss": 1.2876, + "step": 1576 + }, + { + "epoch": 0.5871252225105001, + "grad_norm": 0.17365863919258118, + "learning_rate": 1.9975423371236392e-05, + "loss": 1.2757, + "step": 1577 + }, + { + "epoch": 0.5874975276611092, + "grad_norm": 0.1818249225616455, + "learning_rate": 1.9975338144857954e-05, + "loss": 1.2853, + "step": 1578 + }, + { + "epoch": 0.5878698328117183, + "grad_norm": 0.17870758473873138, + "learning_rate": 1.99752527711444e-05, + "loss": 1.2682, + "step": 1579 + }, + { + "epoch": 0.5882421379623274, + "grad_norm": 0.18116502463817596, + "learning_rate": 1.9975167250096985e-05, + "loss": 1.2774, + "step": 1580 + }, + { + "epoch": 0.5886144431129364, + "grad_norm": 0.17226383090019226, + "learning_rate": 1.997508158171698e-05, + "loss": 1.2666, + "step": 1581 + }, + { + "epoch": 0.5889867482635455, + "grad_norm": 0.18359726667404175, + "learning_rate": 1.9974995766005644e-05, + "loss": 1.2794, + "step": 1582 + }, + { + "epoch": 0.5893590534141546, + "grad_norm": 0.21769051253795624, + "learning_rate": 1.9974909802964244e-05, + "loss": 1.2836, + "step": 1583 + }, + { + "epoch": 0.5897313585647637, + "grad_norm": 0.18334870040416718, + "learning_rate": 1.9974823692594054e-05, + "loss": 1.2834, + "step": 1584 + }, + { + "epoch": 0.5901036637153727, + "grad_norm": 0.18716122210025787, + "learning_rate": 1.9974737434896346e-05, + "loss": 1.2893, + "step": 1585 + }, + { + "epoch": 0.5904759688659817, + "grad_norm": 0.21679291129112244, + "learning_rate": 1.997465102987239e-05, + "loss": 1.264, + "step": 1586 + }, + { + "epoch": 0.5908482740165909, + "grad_norm": 0.1788294017314911, + "learning_rate": 1.9974564477523462e-05, + "loss": 1.2765, + "step": 1587 + }, + { + "epoch": 0.5912205791671999, + "grad_norm": 0.18102407455444336, + "learning_rate": 1.9974477777850847e-05, + "loss": 1.2636, + "step": 1588 + }, + { + "epoch": 0.591592884317809, + "grad_norm": 0.17994816601276398, + "learning_rate": 1.997439093085582e-05, + "loss": 1.2746, + "step": 1589 + }, + { + "epoch": 0.591965189468418, + "grad_norm": 0.18664704263210297, + "learning_rate": 1.9974303936539665e-05, + "loss": 1.2926, + "step": 1590 + }, + { + "epoch": 0.5923374946190271, + "grad_norm": 0.18071360886096954, + "learning_rate": 1.9974216794903666e-05, + "loss": 1.2776, + "step": 1591 + }, + { + "epoch": 0.5927097997696362, + "grad_norm": 0.19198505580425262, + "learning_rate": 1.9974129505949112e-05, + "loss": 1.2895, + "step": 1592 + }, + { + "epoch": 0.5930821049202453, + "grad_norm": 0.1870136260986328, + "learning_rate": 1.997404206967729e-05, + "loss": 1.2733, + "step": 1593 + }, + { + "epoch": 0.5934544100708543, + "grad_norm": 0.18357659876346588, + "learning_rate": 1.9973954486089494e-05, + "loss": 1.2859, + "step": 1594 + }, + { + "epoch": 0.5938267152214634, + "grad_norm": 0.20039206743240356, + "learning_rate": 1.9973866755187012e-05, + "loss": 1.2969, + "step": 1595 + }, + { + "epoch": 0.5941990203720725, + "grad_norm": 0.1847948431968689, + "learning_rate": 1.997377887697115e-05, + "loss": 1.2811, + "step": 1596 + }, + { + "epoch": 0.5945713255226815, + "grad_norm": 0.18077334761619568, + "learning_rate": 1.9973690851443198e-05, + "loss": 1.2874, + "step": 1597 + }, + { + "epoch": 0.5949436306732906, + "grad_norm": 0.17171575129032135, + "learning_rate": 1.9973602678604454e-05, + "loss": 1.2942, + "step": 1598 + }, + { + "epoch": 0.5953159358238996, + "grad_norm": 0.17874233424663544, + "learning_rate": 1.9973514358456228e-05, + "loss": 1.2938, + "step": 1599 + }, + { + "epoch": 0.5956882409745088, + "grad_norm": 0.19295063614845276, + "learning_rate": 1.997342589099982e-05, + "loss": 1.3016, + "step": 1600 + }, + { + "epoch": 0.5960605461251178, + "grad_norm": 0.18213194608688354, + "learning_rate": 1.9973337276236538e-05, + "loss": 1.2787, + "step": 1601 + }, + { + "epoch": 0.5964328512757269, + "grad_norm": 0.18399560451507568, + "learning_rate": 1.997324851416769e-05, + "loss": 1.2824, + "step": 1602 + }, + { + "epoch": 0.5968051564263359, + "grad_norm": 0.184067040681839, + "learning_rate": 1.9973159604794587e-05, + "loss": 1.2785, + "step": 1603 + }, + { + "epoch": 0.597177461576945, + "grad_norm": 0.18284447491168976, + "learning_rate": 1.9973070548118545e-05, + "loss": 1.261, + "step": 1604 + }, + { + "epoch": 0.5975497667275541, + "grad_norm": 0.18606549501419067, + "learning_rate": 1.9972981344140875e-05, + "loss": 1.2841, + "step": 1605 + }, + { + "epoch": 0.5979220718781632, + "grad_norm": 0.18835964798927307, + "learning_rate": 1.9972891992862895e-05, + "loss": 1.28, + "step": 1606 + }, + { + "epoch": 0.5982943770287722, + "grad_norm": 0.18669797480106354, + "learning_rate": 1.997280249428593e-05, + "loss": 1.2893, + "step": 1607 + }, + { + "epoch": 0.5986666821793812, + "grad_norm": 0.19121499359607697, + "learning_rate": 1.9972712848411292e-05, + "loss": 1.2847, + "step": 1608 + }, + { + "epoch": 0.5990389873299904, + "grad_norm": 0.18842531740665436, + "learning_rate": 1.9972623055240316e-05, + "loss": 1.2856, + "step": 1609 + }, + { + "epoch": 0.5994112924805994, + "grad_norm": 0.18126575648784637, + "learning_rate": 1.9972533114774322e-05, + "loss": 1.2767, + "step": 1610 + }, + { + "epoch": 0.5997835976312085, + "grad_norm": 0.17144346237182617, + "learning_rate": 1.997244302701464e-05, + "loss": 1.2738, + "step": 1611 + }, + { + "epoch": 0.6001559027818175, + "grad_norm": 0.17671994864940643, + "learning_rate": 1.99723527919626e-05, + "loss": 1.2693, + "step": 1612 + }, + { + "epoch": 0.6005282079324266, + "grad_norm": 0.17835721373558044, + "learning_rate": 1.9972262409619534e-05, + "loss": 1.2738, + "step": 1613 + }, + { + "epoch": 0.6009005130830357, + "grad_norm": 0.17789305746555328, + "learning_rate": 1.997217187998678e-05, + "loss": 1.2877, + "step": 1614 + }, + { + "epoch": 0.6012728182336448, + "grad_norm": 0.18692655861377716, + "learning_rate": 1.9972081203065672e-05, + "loss": 1.2754, + "step": 1615 + }, + { + "epoch": 0.6016451233842538, + "grad_norm": 0.1710839867591858, + "learning_rate": 1.997199037885755e-05, + "loss": 1.2831, + "step": 1616 + }, + { + "epoch": 0.6020174285348628, + "grad_norm": 0.1843128800392151, + "learning_rate": 1.9971899407363757e-05, + "loss": 1.281, + "step": 1617 + }, + { + "epoch": 0.602389733685472, + "grad_norm": 0.19229008257389069, + "learning_rate": 1.9971808288585636e-05, + "loss": 1.3007, + "step": 1618 + }, + { + "epoch": 0.602762038836081, + "grad_norm": 0.18288210034370422, + "learning_rate": 1.997171702252453e-05, + "loss": 1.2852, + "step": 1619 + }, + { + "epoch": 0.6031343439866901, + "grad_norm": 0.18607817590236664, + "learning_rate": 1.997162560918179e-05, + "loss": 1.2698, + "step": 1620 + }, + { + "epoch": 0.6035066491372991, + "grad_norm": 0.1832936704158783, + "learning_rate": 1.997153404855877e-05, + "loss": 1.2929, + "step": 1621 + }, + { + "epoch": 0.6038789542879083, + "grad_norm": 0.1910560131072998, + "learning_rate": 1.9971442340656812e-05, + "loss": 1.2891, + "step": 1622 + }, + { + "epoch": 0.6042512594385173, + "grad_norm": 0.19132542610168457, + "learning_rate": 1.997135048547728e-05, + "loss": 1.2862, + "step": 1623 + }, + { + "epoch": 0.6046235645891264, + "grad_norm": 0.18313977122306824, + "learning_rate": 1.9971258483021526e-05, + "loss": 1.278, + "step": 1624 + }, + { + "epoch": 0.6049958697397354, + "grad_norm": 0.17760074138641357, + "learning_rate": 1.997116633329091e-05, + "loss": 1.2783, + "step": 1625 + }, + { + "epoch": 0.6053681748903444, + "grad_norm": 0.18431836366653442, + "learning_rate": 1.997107403628679e-05, + "loss": 1.28, + "step": 1626 + }, + { + "epoch": 0.6057404800409536, + "grad_norm": 0.18512500822544098, + "learning_rate": 1.9970981592010538e-05, + "loss": 1.2817, + "step": 1627 + }, + { + "epoch": 0.6061127851915626, + "grad_norm": 0.18769600987434387, + "learning_rate": 1.9970889000463512e-05, + "loss": 1.2779, + "step": 1628 + }, + { + "epoch": 0.6064850903421717, + "grad_norm": 0.17421449720859528, + "learning_rate": 1.997079626164708e-05, + "loss": 1.2822, + "step": 1629 + }, + { + "epoch": 0.6068573954927807, + "grad_norm": 0.1772501915693283, + "learning_rate": 1.997070337556261e-05, + "loss": 1.2774, + "step": 1630 + }, + { + "epoch": 0.6072297006433899, + "grad_norm": 0.1793195903301239, + "learning_rate": 1.9970610342211484e-05, + "loss": 1.2789, + "step": 1631 + }, + { + "epoch": 0.6076020057939989, + "grad_norm": 0.18559563159942627, + "learning_rate": 1.9970517161595063e-05, + "loss": 1.2971, + "step": 1632 + }, + { + "epoch": 0.607974310944608, + "grad_norm": 0.18821711838245392, + "learning_rate": 1.997042383371473e-05, + "loss": 1.2901, + "step": 1633 + }, + { + "epoch": 0.608346616095217, + "grad_norm": 0.18289709091186523, + "learning_rate": 1.9970330358571862e-05, + "loss": 1.2772, + "step": 1634 + }, + { + "epoch": 0.6087189212458262, + "grad_norm": 0.18088442087173462, + "learning_rate": 1.9970236736167846e-05, + "loss": 1.2877, + "step": 1635 + }, + { + "epoch": 0.6090912263964352, + "grad_norm": 0.1836581975221634, + "learning_rate": 1.9970142966504053e-05, + "loss": 1.2779, + "step": 1636 + }, + { + "epoch": 0.6094635315470442, + "grad_norm": 0.18793676793575287, + "learning_rate": 1.9970049049581878e-05, + "loss": 1.2817, + "step": 1637 + }, + { + "epoch": 0.6098358366976533, + "grad_norm": 0.18134282529354095, + "learning_rate": 1.9969954985402702e-05, + "loss": 1.2842, + "step": 1638 + }, + { + "epoch": 0.6102081418482623, + "grad_norm": 0.1860257387161255, + "learning_rate": 1.9969860773967916e-05, + "loss": 1.292, + "step": 1639 + }, + { + "epoch": 0.6105804469988715, + "grad_norm": 0.1779407411813736, + "learning_rate": 1.9969766415278916e-05, + "loss": 1.2869, + "step": 1640 + }, + { + "epoch": 0.6109527521494805, + "grad_norm": 0.17907682061195374, + "learning_rate": 1.9969671909337086e-05, + "loss": 1.2777, + "step": 1641 + }, + { + "epoch": 0.6113250573000896, + "grad_norm": 0.21232067048549652, + "learning_rate": 1.996957725614383e-05, + "loss": 1.2694, + "step": 1642 + }, + { + "epoch": 0.6116973624506986, + "grad_norm": 0.1791219562292099, + "learning_rate": 1.9969482455700544e-05, + "loss": 1.2757, + "step": 1643 + }, + { + "epoch": 0.6120696676013078, + "grad_norm": 0.17560942471027374, + "learning_rate": 1.996938750800863e-05, + "loss": 1.2838, + "step": 1644 + }, + { + "epoch": 0.6124419727519168, + "grad_norm": 0.19401302933692932, + "learning_rate": 1.9969292413069485e-05, + "loss": 1.2896, + "step": 1645 + }, + { + "epoch": 0.6128142779025259, + "grad_norm": 0.1834617555141449, + "learning_rate": 1.9969197170884517e-05, + "loss": 1.2824, + "step": 1646 + }, + { + "epoch": 0.6131865830531349, + "grad_norm": 0.18403379619121552, + "learning_rate": 1.9969101781455132e-05, + "loss": 1.2783, + "step": 1647 + }, + { + "epoch": 0.613558888203744, + "grad_norm": 0.1884704977273941, + "learning_rate": 1.996900624478274e-05, + "loss": 1.2783, + "step": 1648 + }, + { + "epoch": 0.6139311933543531, + "grad_norm": 0.1792110949754715, + "learning_rate": 1.9968910560868757e-05, + "loss": 1.288, + "step": 1649 + }, + { + "epoch": 0.6143034985049621, + "grad_norm": 0.1805010288953781, + "learning_rate": 1.9968814729714584e-05, + "loss": 1.2694, + "step": 1650 + }, + { + "epoch": 0.6146758036555712, + "grad_norm": 0.17859011888504028, + "learning_rate": 1.9968718751321643e-05, + "loss": 1.2631, + "step": 1651 + }, + { + "epoch": 0.6150481088061802, + "grad_norm": 0.1796571910381317, + "learning_rate": 1.9968622625691353e-05, + "loss": 1.2819, + "step": 1652 + }, + { + "epoch": 0.6154204139567894, + "grad_norm": 0.1841122955083847, + "learning_rate": 1.9968526352825135e-05, + "loss": 1.27, + "step": 1653 + }, + { + "epoch": 0.6157927191073984, + "grad_norm": 0.17526361346244812, + "learning_rate": 1.9968429932724404e-05, + "loss": 1.2927, + "step": 1654 + }, + { + "epoch": 0.6161650242580075, + "grad_norm": 0.17394275963306427, + "learning_rate": 1.996833336539059e-05, + "loss": 1.2978, + "step": 1655 + }, + { + "epoch": 0.6165373294086165, + "grad_norm": 0.17003491520881653, + "learning_rate": 1.996823665082512e-05, + "loss": 1.2652, + "step": 1656 + }, + { + "epoch": 0.6169096345592257, + "grad_norm": 0.18178239464759827, + "learning_rate": 1.9968139789029418e-05, + "loss": 1.2768, + "step": 1657 + }, + { + "epoch": 0.6172819397098347, + "grad_norm": 0.17376887798309326, + "learning_rate": 1.9968042780004917e-05, + "loss": 1.2898, + "step": 1658 + }, + { + "epoch": 0.6176542448604437, + "grad_norm": 0.179343119263649, + "learning_rate": 1.9967945623753052e-05, + "loss": 1.2641, + "step": 1659 + }, + { + "epoch": 0.6180265500110528, + "grad_norm": 0.1855798065662384, + "learning_rate": 1.9967848320275253e-05, + "loss": 1.2841, + "step": 1660 + }, + { + "epoch": 0.6183988551616619, + "grad_norm": 0.17375251650810242, + "learning_rate": 1.996775086957296e-05, + "loss": 1.2759, + "step": 1661 + }, + { + "epoch": 0.618771160312271, + "grad_norm": 0.1763077974319458, + "learning_rate": 1.9967653271647613e-05, + "loss": 1.274, + "step": 1662 + }, + { + "epoch": 0.61914346546288, + "grad_norm": 0.17887422442436218, + "learning_rate": 1.9967555526500652e-05, + "loss": 1.2685, + "step": 1663 + }, + { + "epoch": 0.6195157706134891, + "grad_norm": 0.17712260782718658, + "learning_rate": 1.9967457634133524e-05, + "loss": 1.2755, + "step": 1664 + }, + { + "epoch": 0.6198880757640981, + "grad_norm": 0.1702347993850708, + "learning_rate": 1.996735959454767e-05, + "loss": 1.2645, + "step": 1665 + }, + { + "epoch": 0.6202603809147073, + "grad_norm": 0.17374765872955322, + "learning_rate": 1.996726140774454e-05, + "loss": 1.2772, + "step": 1666 + }, + { + "epoch": 0.6206326860653163, + "grad_norm": 0.17728720605373383, + "learning_rate": 1.9967163073725585e-05, + "loss": 1.2669, + "step": 1667 + }, + { + "epoch": 0.6210049912159253, + "grad_norm": 0.18125899136066437, + "learning_rate": 1.9967064592492258e-05, + "loss": 1.2798, + "step": 1668 + }, + { + "epoch": 0.6213772963665344, + "grad_norm": 0.17096486687660217, + "learning_rate": 1.996696596404601e-05, + "loss": 1.2666, + "step": 1669 + }, + { + "epoch": 0.6217496015171435, + "grad_norm": 0.18018490076065063, + "learning_rate": 1.9966867188388303e-05, + "loss": 1.2884, + "step": 1670 + }, + { + "epoch": 0.6221219066677526, + "grad_norm": 0.17836543917655945, + "learning_rate": 1.9966768265520593e-05, + "loss": 1.2874, + "step": 1671 + }, + { + "epoch": 0.6224942118183616, + "grad_norm": 0.17728979885578156, + "learning_rate": 1.996666919544434e-05, + "loss": 1.2756, + "step": 1672 + }, + { + "epoch": 0.6228665169689707, + "grad_norm": 0.18157729506492615, + "learning_rate": 1.9966569978161008e-05, + "loss": 1.2668, + "step": 1673 + }, + { + "epoch": 0.6232388221195797, + "grad_norm": 0.1761295050382614, + "learning_rate": 1.9966470613672064e-05, + "loss": 1.2999, + "step": 1674 + }, + { + "epoch": 0.6236111272701889, + "grad_norm": 0.18089498579502106, + "learning_rate": 1.9966371101978975e-05, + "loss": 1.2796, + "step": 1675 + }, + { + "epoch": 0.6239834324207979, + "grad_norm": 0.17708507180213928, + "learning_rate": 1.996627144308321e-05, + "loss": 1.2575, + "step": 1676 + }, + { + "epoch": 0.6243557375714069, + "grad_norm": 0.17729350924491882, + "learning_rate": 1.9966171636986238e-05, + "loss": 1.2783, + "step": 1677 + }, + { + "epoch": 0.624728042722016, + "grad_norm": 0.18605685234069824, + "learning_rate": 1.996607168368954e-05, + "loss": 1.271, + "step": 1678 + }, + { + "epoch": 0.6251003478726251, + "grad_norm": 0.1942148506641388, + "learning_rate": 1.9965971583194587e-05, + "loss": 1.2831, + "step": 1679 + }, + { + "epoch": 0.6254726530232342, + "grad_norm": 0.18195581436157227, + "learning_rate": 1.996587133550286e-05, + "loss": 1.278, + "step": 1680 + }, + { + "epoch": 0.6258449581738432, + "grad_norm": 0.17865653336048126, + "learning_rate": 1.996577094061584e-05, + "loss": 1.2773, + "step": 1681 + }, + { + "epoch": 0.6262172633244523, + "grad_norm": 0.1941101998090744, + "learning_rate": 1.9965670398535004e-05, + "loss": 1.2721, + "step": 1682 + }, + { + "epoch": 0.6265895684750614, + "grad_norm": 0.18968285620212555, + "learning_rate": 1.9965569709261845e-05, + "loss": 1.2871, + "step": 1683 + }, + { + "epoch": 0.6269618736256705, + "grad_norm": 0.18020257353782654, + "learning_rate": 1.996546887279785e-05, + "loss": 1.2824, + "step": 1684 + }, + { + "epoch": 0.6273341787762795, + "grad_norm": 0.1798500269651413, + "learning_rate": 1.99653678891445e-05, + "loss": 1.2753, + "step": 1685 + }, + { + "epoch": 0.6277064839268885, + "grad_norm": 0.16887503862380981, + "learning_rate": 1.9965266758303295e-05, + "loss": 1.2799, + "step": 1686 + }, + { + "epoch": 0.6280787890774976, + "grad_norm": 0.1764368861913681, + "learning_rate": 1.9965165480275722e-05, + "loss": 1.2649, + "step": 1687 + }, + { + "epoch": 0.6284510942281067, + "grad_norm": 0.19635643064975739, + "learning_rate": 1.9965064055063282e-05, + "loss": 1.2691, + "step": 1688 + }, + { + "epoch": 0.6288233993787158, + "grad_norm": 0.1750660389661789, + "learning_rate": 1.996496248266747e-05, + "loss": 1.2775, + "step": 1689 + }, + { + "epoch": 0.6291957045293248, + "grad_norm": 0.17725105583667755, + "learning_rate": 1.996486076308979e-05, + "loss": 1.2829, + "step": 1690 + }, + { + "epoch": 0.6295680096799339, + "grad_norm": 0.1825384497642517, + "learning_rate": 1.9964758896331743e-05, + "loss": 1.2577, + "step": 1691 + }, + { + "epoch": 0.629940314830543, + "grad_norm": 0.18121573328971863, + "learning_rate": 1.996465688239483e-05, + "loss": 1.2805, + "step": 1692 + }, + { + "epoch": 0.6303126199811521, + "grad_norm": 0.18279202282428741, + "learning_rate": 1.996455472128056e-05, + "loss": 1.2794, + "step": 1693 + }, + { + "epoch": 0.6306849251317611, + "grad_norm": 0.17859618365764618, + "learning_rate": 1.996445241299044e-05, + "loss": 1.2718, + "step": 1694 + }, + { + "epoch": 0.6310572302823702, + "grad_norm": 0.18363523483276367, + "learning_rate": 1.9964349957525988e-05, + "loss": 1.2731, + "step": 1695 + }, + { + "epoch": 0.6314295354329793, + "grad_norm": 0.1790492683649063, + "learning_rate": 1.9964247354888712e-05, + "loss": 1.2734, + "step": 1696 + }, + { + "epoch": 0.6318018405835883, + "grad_norm": 0.17761005461215973, + "learning_rate": 1.9964144605080125e-05, + "loss": 1.2536, + "step": 1697 + }, + { + "epoch": 0.6321741457341974, + "grad_norm": 0.18184557557106018, + "learning_rate": 1.9964041708101745e-05, + "loss": 1.2721, + "step": 1698 + }, + { + "epoch": 0.6325464508848064, + "grad_norm": 0.18488840758800507, + "learning_rate": 1.99639386639551e-05, + "loss": 1.2795, + "step": 1699 + }, + { + "epoch": 0.6329187560354155, + "grad_norm": 0.1846456080675125, + "learning_rate": 1.9963835472641704e-05, + "loss": 1.2744, + "step": 1700 + }, + { + "epoch": 0.6332910611860246, + "grad_norm": 0.1817891150712967, + "learning_rate": 1.9963732134163084e-05, + "loss": 1.2856, + "step": 1701 + }, + { + "epoch": 0.6336633663366337, + "grad_norm": 0.18094323575496674, + "learning_rate": 1.9963628648520767e-05, + "loss": 1.28, + "step": 1702 + }, + { + "epoch": 0.6340356714872427, + "grad_norm": 0.18917135894298553, + "learning_rate": 1.9963525015716277e-05, + "loss": 1.277, + "step": 1703 + }, + { + "epoch": 0.6344079766378518, + "grad_norm": 0.19249355792999268, + "learning_rate": 1.996342123575115e-05, + "loss": 1.2728, + "step": 1704 + }, + { + "epoch": 0.6347802817884609, + "grad_norm": 0.17361187934875488, + "learning_rate": 1.9963317308626916e-05, + "loss": 1.2699, + "step": 1705 + }, + { + "epoch": 0.63515258693907, + "grad_norm": 0.18167155981063843, + "learning_rate": 1.996321323434511e-05, + "loss": 1.2732, + "step": 1706 + }, + { + "epoch": 0.635524892089679, + "grad_norm": 0.18852338194847107, + "learning_rate": 1.9963109012907268e-05, + "loss": 1.2791, + "step": 1707 + }, + { + "epoch": 0.635897197240288, + "grad_norm": 0.17643553018569946, + "learning_rate": 1.9963004644314937e-05, + "loss": 1.2737, + "step": 1708 + }, + { + "epoch": 0.6362695023908972, + "grad_norm": 0.18324004113674164, + "learning_rate": 1.9962900128569645e-05, + "loss": 1.2779, + "step": 1709 + }, + { + "epoch": 0.6366418075415062, + "grad_norm": 0.18091078102588654, + "learning_rate": 1.9962795465672952e-05, + "loss": 1.2825, + "step": 1710 + }, + { + "epoch": 0.6370141126921153, + "grad_norm": 0.17854343354701996, + "learning_rate": 1.996269065562639e-05, + "loss": 1.2876, + "step": 1711 + }, + { + "epoch": 0.6373864178427243, + "grad_norm": 0.16912639141082764, + "learning_rate": 1.9962585698431513e-05, + "loss": 1.2463, + "step": 1712 + }, + { + "epoch": 0.6377587229933334, + "grad_norm": 0.18469521403312683, + "learning_rate": 1.9962480594089867e-05, + "loss": 1.2723, + "step": 1713 + }, + { + "epoch": 0.6381310281439425, + "grad_norm": 0.1734856516122818, + "learning_rate": 1.9962375342603013e-05, + "loss": 1.2732, + "step": 1714 + }, + { + "epoch": 0.6385033332945516, + "grad_norm": 0.17890629172325134, + "learning_rate": 1.99622699439725e-05, + "loss": 1.2711, + "step": 1715 + }, + { + "epoch": 0.6388756384451606, + "grad_norm": 0.18328309059143066, + "learning_rate": 1.996216439819988e-05, + "loss": 1.2765, + "step": 1716 + }, + { + "epoch": 0.6392479435957696, + "grad_norm": 0.18121285736560822, + "learning_rate": 1.9962058705286722e-05, + "loss": 1.2854, + "step": 1717 + }, + { + "epoch": 0.6396202487463788, + "grad_norm": 0.18126457929611206, + "learning_rate": 1.9961952865234582e-05, + "loss": 1.2748, + "step": 1718 + }, + { + "epoch": 0.6399925538969878, + "grad_norm": 0.17969095706939697, + "learning_rate": 1.9961846878045024e-05, + "loss": 1.2606, + "step": 1719 + }, + { + "epoch": 0.6403648590475969, + "grad_norm": 0.18220089375972748, + "learning_rate": 1.9961740743719612e-05, + "loss": 1.2719, + "step": 1720 + }, + { + "epoch": 0.6407371641982059, + "grad_norm": 0.1800280064344406, + "learning_rate": 1.996163446225991e-05, + "loss": 1.2857, + "step": 1721 + }, + { + "epoch": 0.6411094693488151, + "grad_norm": 0.17898349463939667, + "learning_rate": 1.9961528033667498e-05, + "loss": 1.27, + "step": 1722 + }, + { + "epoch": 0.6414817744994241, + "grad_norm": 0.18221056461334229, + "learning_rate": 1.996142145794394e-05, + "loss": 1.2665, + "step": 1723 + }, + { + "epoch": 0.6418540796500332, + "grad_norm": 0.18029160797595978, + "learning_rate": 1.996131473509081e-05, + "loss": 1.2915, + "step": 1724 + }, + { + "epoch": 0.6422263848006422, + "grad_norm": 0.17953678965568542, + "learning_rate": 1.9961207865109688e-05, + "loss": 1.2822, + "step": 1725 + }, + { + "epoch": 0.6425986899512512, + "grad_norm": 0.20553520321846008, + "learning_rate": 1.9961100848002154e-05, + "loss": 1.2923, + "step": 1726 + }, + { + "epoch": 0.6429709951018604, + "grad_norm": 0.1798718124628067, + "learning_rate": 1.996099368376978e-05, + "loss": 1.2898, + "step": 1727 + }, + { + "epoch": 0.6433433002524694, + "grad_norm": 0.18341365456581116, + "learning_rate": 1.996088637241416e-05, + "loss": 1.2693, + "step": 1728 + }, + { + "epoch": 0.6437156054030785, + "grad_norm": 0.18246109783649445, + "learning_rate": 1.996077891393687e-05, + "loss": 1.2783, + "step": 1729 + }, + { + "epoch": 0.6440879105536875, + "grad_norm": 0.1854100227355957, + "learning_rate": 1.9960671308339503e-05, + "loss": 1.2791, + "step": 1730 + }, + { + "epoch": 0.6444602157042967, + "grad_norm": 0.1951596736907959, + "learning_rate": 1.9960563555623644e-05, + "loss": 1.2851, + "step": 1731 + }, + { + "epoch": 0.6448325208549057, + "grad_norm": 0.20141799747943878, + "learning_rate": 1.9960455655790885e-05, + "loss": 1.277, + "step": 1732 + }, + { + "epoch": 0.6452048260055148, + "grad_norm": 0.1731090396642685, + "learning_rate": 1.996034760884282e-05, + "loss": 1.2755, + "step": 1733 + }, + { + "epoch": 0.6455771311561238, + "grad_norm": 0.2005818784236908, + "learning_rate": 1.9960239414781052e-05, + "loss": 1.2724, + "step": 1734 + }, + { + "epoch": 0.645949436306733, + "grad_norm": 0.1791912168264389, + "learning_rate": 1.996013107360717e-05, + "loss": 1.2689, + "step": 1735 + }, + { + "epoch": 0.646321741457342, + "grad_norm": 0.1755741536617279, + "learning_rate": 1.9960022585322774e-05, + "loss": 1.2772, + "step": 1736 + }, + { + "epoch": 0.646694046607951, + "grad_norm": 0.1764545440673828, + "learning_rate": 1.9959913949929474e-05, + "loss": 1.2745, + "step": 1737 + }, + { + "epoch": 0.6470663517585601, + "grad_norm": 0.1788182109594345, + "learning_rate": 1.9959805167428868e-05, + "loss": 1.2635, + "step": 1738 + }, + { + "epoch": 0.6474386569091691, + "grad_norm": 0.1874772310256958, + "learning_rate": 1.9959696237822566e-05, + "loss": 1.2883, + "step": 1739 + }, + { + "epoch": 0.6478109620597783, + "grad_norm": 0.18383602797985077, + "learning_rate": 1.9959587161112174e-05, + "loss": 1.2667, + "step": 1740 + }, + { + "epoch": 0.6481832672103873, + "grad_norm": 0.1755083203315735, + "learning_rate": 1.9959477937299305e-05, + "loss": 1.2857, + "step": 1741 + }, + { + "epoch": 0.6485555723609964, + "grad_norm": 0.1754198670387268, + "learning_rate": 1.995936856638557e-05, + "loss": 1.2582, + "step": 1742 + }, + { + "epoch": 0.6489278775116054, + "grad_norm": 0.18323983252048492, + "learning_rate": 1.9959259048372593e-05, + "loss": 1.2825, + "step": 1743 + }, + { + "epoch": 0.6493001826622146, + "grad_norm": 0.17415131628513336, + "learning_rate": 1.995914938326198e-05, + "loss": 1.2624, + "step": 1744 + }, + { + "epoch": 0.6496724878128236, + "grad_norm": 0.18525010347366333, + "learning_rate": 1.9959039571055356e-05, + "loss": 1.2923, + "step": 1745 + }, + { + "epoch": 0.6500447929634326, + "grad_norm": 0.1781807243824005, + "learning_rate": 1.995892961175434e-05, + "loss": 1.2821, + "step": 1746 + }, + { + "epoch": 0.6504170981140417, + "grad_norm": 0.1848139613866806, + "learning_rate": 1.995881950536056e-05, + "loss": 1.2953, + "step": 1747 + }, + { + "epoch": 0.6507894032646507, + "grad_norm": 0.17933423817157745, + "learning_rate": 1.9958709251875642e-05, + "loss": 1.2666, + "step": 1748 + }, + { + "epoch": 0.6511617084152599, + "grad_norm": 0.17203424870967865, + "learning_rate": 1.9958598851301218e-05, + "loss": 1.2691, + "step": 1749 + }, + { + "epoch": 0.6515340135658689, + "grad_norm": 0.1798200160264969, + "learning_rate": 1.995848830363891e-05, + "loss": 1.2672, + "step": 1750 + }, + { + "epoch": 0.651906318716478, + "grad_norm": 0.19163981080055237, + "learning_rate": 1.9958377608890348e-05, + "loss": 1.2615, + "step": 1751 + }, + { + "epoch": 0.652278623867087, + "grad_norm": 0.177472323179245, + "learning_rate": 1.9958266767057183e-05, + "loss": 1.2775, + "step": 1752 + }, + { + "epoch": 0.6526509290176962, + "grad_norm": 0.17973048985004425, + "learning_rate": 1.995815577814104e-05, + "loss": 1.2709, + "step": 1753 + }, + { + "epoch": 0.6530232341683052, + "grad_norm": 0.20184633135795593, + "learning_rate": 1.995804464214356e-05, + "loss": 1.2796, + "step": 1754 + }, + { + "epoch": 0.6533955393189143, + "grad_norm": 0.1837623566389084, + "learning_rate": 1.9957933359066385e-05, + "loss": 1.2679, + "step": 1755 + }, + { + "epoch": 0.6537678444695233, + "grad_norm": 0.17423506081104279, + "learning_rate": 1.995782192891116e-05, + "loss": 1.2671, + "step": 1756 + }, + { + "epoch": 0.6541401496201324, + "grad_norm": 0.1909656673669815, + "learning_rate": 1.9957710351679533e-05, + "loss": 1.2775, + "step": 1757 + }, + { + "epoch": 0.6545124547707415, + "grad_norm": 0.19637969136238098, + "learning_rate": 1.9957598627373145e-05, + "loss": 1.2737, + "step": 1758 + }, + { + "epoch": 0.6548847599213505, + "grad_norm": 0.18318361043930054, + "learning_rate": 1.995748675599365e-05, + "loss": 1.2772, + "step": 1759 + }, + { + "epoch": 0.6552570650719596, + "grad_norm": 0.1808813214302063, + "learning_rate": 1.9957374737542702e-05, + "loss": 1.2918, + "step": 1760 + }, + { + "epoch": 0.6556293702225686, + "grad_norm": 0.1866048276424408, + "learning_rate": 1.9957262572021955e-05, + "loss": 1.2677, + "step": 1761 + }, + { + "epoch": 0.6560016753731778, + "grad_norm": 0.18360257148742676, + "learning_rate": 1.9957150259433065e-05, + "loss": 1.2832, + "step": 1762 + }, + { + "epoch": 0.6563739805237868, + "grad_norm": 0.17542779445648193, + "learning_rate": 1.995703779977769e-05, + "loss": 1.2695, + "step": 1763 + }, + { + "epoch": 0.6567462856743959, + "grad_norm": 0.1828540414571762, + "learning_rate": 1.995692519305749e-05, + "loss": 1.2893, + "step": 1764 + }, + { + "epoch": 0.6571185908250049, + "grad_norm": 0.18941769003868103, + "learning_rate": 1.995681243927413e-05, + "loss": 1.2742, + "step": 1765 + }, + { + "epoch": 0.657490895975614, + "grad_norm": 0.1731724739074707, + "learning_rate": 1.9956699538429275e-05, + "loss": 1.2966, + "step": 1766 + }, + { + "epoch": 0.6578632011262231, + "grad_norm": 0.18366792798042297, + "learning_rate": 1.9956586490524596e-05, + "loss": 1.2779, + "step": 1767 + }, + { + "epoch": 0.6582355062768321, + "grad_norm": 0.18383246660232544, + "learning_rate": 1.9956473295561756e-05, + "loss": 1.267, + "step": 1768 + }, + { + "epoch": 0.6586078114274412, + "grad_norm": 0.19387759268283844, + "learning_rate": 1.9956359953542433e-05, + "loss": 1.2768, + "step": 1769 + }, + { + "epoch": 0.6589801165780503, + "grad_norm": 0.18272364139556885, + "learning_rate": 1.9956246464468294e-05, + "loss": 1.2654, + "step": 1770 + }, + { + "epoch": 0.6593524217286594, + "grad_norm": 0.17363867163658142, + "learning_rate": 1.9956132828341022e-05, + "loss": 1.2724, + "step": 1771 + }, + { + "epoch": 0.6597247268792684, + "grad_norm": 0.18008361756801605, + "learning_rate": 1.9956019045162294e-05, + "loss": 1.2799, + "step": 1772 + }, + { + "epoch": 0.6600970320298775, + "grad_norm": 0.18820790946483612, + "learning_rate": 1.995590511493379e-05, + "loss": 1.2741, + "step": 1773 + }, + { + "epoch": 0.6604693371804865, + "grad_norm": 0.18072496354579926, + "learning_rate": 1.995579103765719e-05, + "loss": 1.2887, + "step": 1774 + }, + { + "epoch": 0.6608416423310957, + "grad_norm": 0.18533405661582947, + "learning_rate": 1.9955676813334182e-05, + "loss": 1.2798, + "step": 1775 + }, + { + "epoch": 0.6612139474817047, + "grad_norm": 0.19441775977611542, + "learning_rate": 1.9955562441966452e-05, + "loss": 1.2523, + "step": 1776 + }, + { + "epoch": 0.6615862526323137, + "grad_norm": 0.19145584106445312, + "learning_rate": 1.995544792355569e-05, + "loss": 1.2635, + "step": 1777 + }, + { + "epoch": 0.6619585577829228, + "grad_norm": 0.2026701271533966, + "learning_rate": 1.9955333258103586e-05, + "loss": 1.2747, + "step": 1778 + }, + { + "epoch": 0.6623308629335319, + "grad_norm": 0.18436765670776367, + "learning_rate": 1.9955218445611834e-05, + "loss": 1.2653, + "step": 1779 + }, + { + "epoch": 0.662703168084141, + "grad_norm": 0.18256279826164246, + "learning_rate": 1.9955103486082135e-05, + "loss": 1.2807, + "step": 1780 + }, + { + "epoch": 0.66307547323475, + "grad_norm": 0.1890406608581543, + "learning_rate": 1.9954988379516177e-05, + "loss": 1.2832, + "step": 1781 + }, + { + "epoch": 0.6634477783853591, + "grad_norm": 0.1824447363615036, + "learning_rate": 1.995487312591567e-05, + "loss": 1.2533, + "step": 1782 + }, + { + "epoch": 0.6638200835359682, + "grad_norm": 0.1869254857301712, + "learning_rate": 1.9954757725282308e-05, + "loss": 1.2811, + "step": 1783 + }, + { + "epoch": 0.6641923886865773, + "grad_norm": 0.18558120727539062, + "learning_rate": 1.99546421776178e-05, + "loss": 1.2687, + "step": 1784 + }, + { + "epoch": 0.6645646938371863, + "grad_norm": 0.18721050024032593, + "learning_rate": 1.995452648292385e-05, + "loss": 1.2834, + "step": 1785 + }, + { + "epoch": 0.6649369989877953, + "grad_norm": 0.1912430077791214, + "learning_rate": 1.9954410641202173e-05, + "loss": 1.2801, + "step": 1786 + }, + { + "epoch": 0.6653093041384044, + "grad_norm": 0.1814345419406891, + "learning_rate": 1.995429465245447e-05, + "loss": 1.2739, + "step": 1787 + }, + { + "epoch": 0.6656816092890135, + "grad_norm": 0.18435440957546234, + "learning_rate": 1.9954178516682464e-05, + "loss": 1.2652, + "step": 1788 + }, + { + "epoch": 0.6660539144396226, + "grad_norm": 0.18596956133842468, + "learning_rate": 1.9954062233887866e-05, + "loss": 1.2661, + "step": 1789 + }, + { + "epoch": 0.6664262195902316, + "grad_norm": 0.1784246861934662, + "learning_rate": 1.995394580407239e-05, + "loss": 1.2688, + "step": 1790 + }, + { + "epoch": 0.6667985247408407, + "grad_norm": 0.1948971003293991, + "learning_rate": 1.9953829227237762e-05, + "loss": 1.2825, + "step": 1791 + }, + { + "epoch": 0.6671708298914498, + "grad_norm": 0.17607876658439636, + "learning_rate": 1.9953712503385702e-05, + "loss": 1.2722, + "step": 1792 + }, + { + "epoch": 0.6675431350420589, + "grad_norm": 0.19248910248279572, + "learning_rate": 1.995359563251793e-05, + "loss": 1.2908, + "step": 1793 + }, + { + "epoch": 0.6679154401926679, + "grad_norm": 0.1824008673429489, + "learning_rate": 1.9953478614636178e-05, + "loss": 1.2737, + "step": 1794 + }, + { + "epoch": 0.668287745343277, + "grad_norm": 0.17767947912216187, + "learning_rate": 1.9953361449742167e-05, + "loss": 1.2698, + "step": 1795 + }, + { + "epoch": 0.6686600504938861, + "grad_norm": 0.20070941746234894, + "learning_rate": 1.995324413783764e-05, + "loss": 1.2716, + "step": 1796 + }, + { + "epoch": 0.6690323556444951, + "grad_norm": 0.1810212880373001, + "learning_rate": 1.9953126678924315e-05, + "loss": 1.2641, + "step": 1797 + }, + { + "epoch": 0.6694046607951042, + "grad_norm": 0.18314088881015778, + "learning_rate": 1.9953009073003935e-05, + "loss": 1.2713, + "step": 1798 + }, + { + "epoch": 0.6697769659457132, + "grad_norm": 0.1876356452703476, + "learning_rate": 1.9952891320078235e-05, + "loss": 1.279, + "step": 1799 + }, + { + "epoch": 0.6701492710963223, + "grad_norm": 0.18788988888263702, + "learning_rate": 1.9952773420148958e-05, + "loss": 1.2786, + "step": 1800 + }, + { + "epoch": 0.6705215762469314, + "grad_norm": 0.17897135019302368, + "learning_rate": 1.995265537321784e-05, + "loss": 1.2746, + "step": 1801 + }, + { + "epoch": 0.6708938813975405, + "grad_norm": 0.1953326016664505, + "learning_rate": 1.9952537179286623e-05, + "loss": 1.2815, + "step": 1802 + }, + { + "epoch": 0.6712661865481495, + "grad_norm": 0.19297561049461365, + "learning_rate": 1.995241883835706e-05, + "loss": 1.2698, + "step": 1803 + }, + { + "epoch": 0.6716384916987586, + "grad_norm": 0.18052466213703156, + "learning_rate": 1.99523003504309e-05, + "loss": 1.2744, + "step": 1804 + }, + { + "epoch": 0.6720107968493677, + "grad_norm": 0.1788993775844574, + "learning_rate": 1.9952181715509883e-05, + "loss": 1.2621, + "step": 1805 + }, + { + "epoch": 0.6723831019999768, + "grad_norm": 0.18485794961452484, + "learning_rate": 1.9952062933595765e-05, + "loss": 1.2767, + "step": 1806 + }, + { + "epoch": 0.6727554071505858, + "grad_norm": 0.18124793469905853, + "learning_rate": 1.9951944004690308e-05, + "loss": 1.2811, + "step": 1807 + }, + { + "epoch": 0.6731277123011948, + "grad_norm": 0.1836545318365097, + "learning_rate": 1.9951824928795255e-05, + "loss": 1.2692, + "step": 1808 + }, + { + "epoch": 0.673500017451804, + "grad_norm": 0.1730068475008011, + "learning_rate": 1.9951705705912377e-05, + "loss": 1.2753, + "step": 1809 + }, + { + "epoch": 0.673872322602413, + "grad_norm": 0.17720746994018555, + "learning_rate": 1.995158633604343e-05, + "loss": 1.2653, + "step": 1810 + }, + { + "epoch": 0.6742446277530221, + "grad_norm": 0.1764315515756607, + "learning_rate": 1.995146681919018e-05, + "loss": 1.2608, + "step": 1811 + }, + { + "epoch": 0.6746169329036311, + "grad_norm": 0.19442118704319, + "learning_rate": 1.9951347155354386e-05, + "loss": 1.2774, + "step": 1812 + }, + { + "epoch": 0.6749892380542402, + "grad_norm": 0.18689338862895966, + "learning_rate": 1.995122734453782e-05, + "loss": 1.2659, + "step": 1813 + }, + { + "epoch": 0.6753615432048493, + "grad_norm": 0.17291222512722015, + "learning_rate": 1.995110738674225e-05, + "loss": 1.2637, + "step": 1814 + }, + { + "epoch": 0.6757338483554584, + "grad_norm": 0.2008456140756607, + "learning_rate": 1.995098728196945e-05, + "loss": 1.2837, + "step": 1815 + }, + { + "epoch": 0.6761061535060674, + "grad_norm": 0.1726778745651245, + "learning_rate": 1.995086703022119e-05, + "loss": 1.2876, + "step": 1816 + }, + { + "epoch": 0.6764784586566764, + "grad_norm": 0.18805347383022308, + "learning_rate": 1.9950746631499252e-05, + "loss": 1.2486, + "step": 1817 + }, + { + "epoch": 0.6768507638072856, + "grad_norm": 0.1827612966299057, + "learning_rate": 1.9950626085805406e-05, + "loss": 1.2832, + "step": 1818 + }, + { + "epoch": 0.6772230689578946, + "grad_norm": 0.17770785093307495, + "learning_rate": 1.995050539314144e-05, + "loss": 1.2713, + "step": 1819 + }, + { + "epoch": 0.6775953741085037, + "grad_norm": 0.18160304427146912, + "learning_rate": 1.9950384553509134e-05, + "loss": 1.2836, + "step": 1820 + }, + { + "epoch": 0.6779676792591127, + "grad_norm": 0.1883334517478943, + "learning_rate": 1.9950263566910275e-05, + "loss": 1.2813, + "step": 1821 + }, + { + "epoch": 0.6783399844097218, + "grad_norm": 0.18117238581180573, + "learning_rate": 1.9950142433346642e-05, + "loss": 1.2715, + "step": 1822 + }, + { + "epoch": 0.6787122895603309, + "grad_norm": 0.1771121472120285, + "learning_rate": 1.9950021152820032e-05, + "loss": 1.266, + "step": 1823 + }, + { + "epoch": 0.67908459471094, + "grad_norm": 0.18538367748260498, + "learning_rate": 1.9949899725332233e-05, + "loss": 1.2638, + "step": 1824 + }, + { + "epoch": 0.679456899861549, + "grad_norm": 0.17744530737400055, + "learning_rate": 1.994977815088504e-05, + "loss": 1.2669, + "step": 1825 + }, + { + "epoch": 0.679829205012158, + "grad_norm": 0.17428846657276154, + "learning_rate": 1.9949656429480252e-05, + "loss": 1.2753, + "step": 1826 + }, + { + "epoch": 0.6802015101627672, + "grad_norm": 0.2017204761505127, + "learning_rate": 1.9949534561119658e-05, + "loss": 1.2723, + "step": 1827 + }, + { + "epoch": 0.6805738153133762, + "grad_norm": 0.18561388552188873, + "learning_rate": 1.9949412545805065e-05, + "loss": 1.2748, + "step": 1828 + }, + { + "epoch": 0.6809461204639853, + "grad_norm": 0.18007059395313263, + "learning_rate": 1.9949290383538272e-05, + "loss": 1.2579, + "step": 1829 + }, + { + "epoch": 0.6813184256145943, + "grad_norm": 0.17988115549087524, + "learning_rate": 1.9949168074321088e-05, + "loss": 1.2769, + "step": 1830 + }, + { + "epoch": 0.6816907307652035, + "grad_norm": 0.19387783110141754, + "learning_rate": 1.9949045618155312e-05, + "loss": 1.2746, + "step": 1831 + }, + { + "epoch": 0.6820630359158125, + "grad_norm": 0.1811383217573166, + "learning_rate": 1.994892301504276e-05, + "loss": 1.2784, + "step": 1832 + }, + { + "epoch": 0.6824353410664216, + "grad_norm": 0.17574800550937653, + "learning_rate": 1.9948800264985236e-05, + "loss": 1.2688, + "step": 1833 + }, + { + "epoch": 0.6828076462170306, + "grad_norm": 0.17047591507434845, + "learning_rate": 1.9948677367984558e-05, + "loss": 1.2783, + "step": 1834 + }, + { + "epoch": 0.6831799513676396, + "grad_norm": 0.18834684789180756, + "learning_rate": 1.994855432404254e-05, + "loss": 1.2685, + "step": 1835 + }, + { + "epoch": 0.6835522565182488, + "grad_norm": 0.17548666894435883, + "learning_rate": 1.9948431133160998e-05, + "loss": 1.269, + "step": 1836 + }, + { + "epoch": 0.6839245616688578, + "grad_norm": 0.17728163301944733, + "learning_rate": 1.9948307795341755e-05, + "loss": 1.2748, + "step": 1837 + }, + { + "epoch": 0.6842968668194669, + "grad_norm": 0.18654660880565643, + "learning_rate": 1.9948184310586625e-05, + "loss": 1.2839, + "step": 1838 + }, + { + "epoch": 0.6846691719700759, + "grad_norm": 0.18166908621788025, + "learning_rate": 1.9948060678897443e-05, + "loss": 1.2677, + "step": 1839 + }, + { + "epoch": 0.6850414771206851, + "grad_norm": 0.17448103427886963, + "learning_rate": 1.9947936900276023e-05, + "loss": 1.2738, + "step": 1840 + }, + { + "epoch": 0.6854137822712941, + "grad_norm": 0.17304863035678864, + "learning_rate": 1.9947812974724203e-05, + "loss": 1.2705, + "step": 1841 + }, + { + "epoch": 0.6857860874219032, + "grad_norm": 0.17068156599998474, + "learning_rate": 1.994768890224381e-05, + "loss": 1.2782, + "step": 1842 + }, + { + "epoch": 0.6861583925725122, + "grad_norm": 0.17635151743888855, + "learning_rate": 1.9947564682836678e-05, + "loss": 1.2707, + "step": 1843 + }, + { + "epoch": 0.6865306977231214, + "grad_norm": 0.17905697226524353, + "learning_rate": 1.9947440316504636e-05, + "loss": 1.2628, + "step": 1844 + }, + { + "epoch": 0.6869030028737304, + "grad_norm": 0.17490267753601074, + "learning_rate": 1.9947315803249525e-05, + "loss": 1.257, + "step": 1845 + }, + { + "epoch": 0.6872753080243394, + "grad_norm": 0.18009649217128754, + "learning_rate": 1.9947191143073185e-05, + "loss": 1.285, + "step": 1846 + }, + { + "epoch": 0.6876476131749485, + "grad_norm": 0.19091250002384186, + "learning_rate": 1.994706633597746e-05, + "loss": 1.2647, + "step": 1847 + }, + { + "epoch": 0.6880199183255575, + "grad_norm": 0.183569997549057, + "learning_rate": 1.994694138196418e-05, + "loss": 1.2662, + "step": 1848 + }, + { + "epoch": 0.6883922234761667, + "grad_norm": 0.18256776034832, + "learning_rate": 1.994681628103521e-05, + "loss": 1.2643, + "step": 1849 + }, + { + "epoch": 0.6887645286267757, + "grad_norm": 0.17568175494670868, + "learning_rate": 1.9946691033192384e-05, + "loss": 1.2724, + "step": 1850 + }, + { + "epoch": 0.6891368337773848, + "grad_norm": 0.18592911958694458, + "learning_rate": 1.9946565638437552e-05, + "loss": 1.2602, + "step": 1851 + }, + { + "epoch": 0.6895091389279938, + "grad_norm": 0.18410173058509827, + "learning_rate": 1.9946440096772574e-05, + "loss": 1.2539, + "step": 1852 + }, + { + "epoch": 0.689881444078603, + "grad_norm": 0.18612074851989746, + "learning_rate": 1.99463144081993e-05, + "loss": 1.265, + "step": 1853 + }, + { + "epoch": 0.690253749229212, + "grad_norm": 0.19011899828910828, + "learning_rate": 1.9946188572719585e-05, + "loss": 1.2938, + "step": 1854 + }, + { + "epoch": 0.690626054379821, + "grad_norm": 0.18528851866722107, + "learning_rate": 1.9946062590335287e-05, + "loss": 1.2757, + "step": 1855 + }, + { + "epoch": 0.6909983595304301, + "grad_norm": 0.1764160841703415, + "learning_rate": 1.9945936461048273e-05, + "loss": 1.2859, + "step": 1856 + }, + { + "epoch": 0.6913706646810392, + "grad_norm": 0.17735952138900757, + "learning_rate": 1.9945810184860396e-05, + "loss": 1.2706, + "step": 1857 + }, + { + "epoch": 0.6917429698316483, + "grad_norm": 0.18873561918735504, + "learning_rate": 1.9945683761773533e-05, + "loss": 1.2833, + "step": 1858 + }, + { + "epoch": 0.6921152749822573, + "grad_norm": 0.17699329555034637, + "learning_rate": 1.9945557191789543e-05, + "loss": 1.2709, + "step": 1859 + }, + { + "epoch": 0.6924875801328664, + "grad_norm": 0.18327003717422485, + "learning_rate": 1.9945430474910295e-05, + "loss": 1.2801, + "step": 1860 + }, + { + "epoch": 0.6928598852834754, + "grad_norm": 0.18743474781513214, + "learning_rate": 1.9945303611137665e-05, + "loss": 1.2724, + "step": 1861 + }, + { + "epoch": 0.6932321904340846, + "grad_norm": 0.17246921360492706, + "learning_rate": 1.9945176600473526e-05, + "loss": 1.252, + "step": 1862 + }, + { + "epoch": 0.6936044955846936, + "grad_norm": 0.17474091053009033, + "learning_rate": 1.994504944291975e-05, + "loss": 1.2746, + "step": 1863 + }, + { + "epoch": 0.6939768007353027, + "grad_norm": 0.19760233163833618, + "learning_rate": 1.994492213847822e-05, + "loss": 1.2925, + "step": 1864 + }, + { + "epoch": 0.6943491058859117, + "grad_norm": 0.32504546642303467, + "learning_rate": 1.9944794687150812e-05, + "loss": 1.2715, + "step": 1865 + }, + { + "epoch": 0.6947214110365209, + "grad_norm": 0.16915461421012878, + "learning_rate": 1.9944667088939414e-05, + "loss": 1.2777, + "step": 1866 + }, + { + "epoch": 0.6950937161871299, + "grad_norm": 0.18415702879428864, + "learning_rate": 1.9944539343845905e-05, + "loss": 1.2666, + "step": 1867 + }, + { + "epoch": 0.6954660213377389, + "grad_norm": 0.19284015893936157, + "learning_rate": 1.994441145187217e-05, + "loss": 1.2955, + "step": 1868 + }, + { + "epoch": 0.695838326488348, + "grad_norm": 0.1781989485025406, + "learning_rate": 1.994428341302011e-05, + "loss": 1.2754, + "step": 1869 + }, + { + "epoch": 0.6962106316389571, + "grad_norm": 0.18543171882629395, + "learning_rate": 1.9944155227291603e-05, + "loss": 1.2615, + "step": 1870 + }, + { + "epoch": 0.6965829367895662, + "grad_norm": 0.1806837022304535, + "learning_rate": 1.9944026894688547e-05, + "loss": 1.2585, + "step": 1871 + }, + { + "epoch": 0.6969552419401752, + "grad_norm": 0.18426816165447235, + "learning_rate": 1.9943898415212842e-05, + "loss": 1.2783, + "step": 1872 + }, + { + "epoch": 0.6973275470907843, + "grad_norm": 0.17620044946670532, + "learning_rate": 1.9943769788866377e-05, + "loss": 1.2815, + "step": 1873 + }, + { + "epoch": 0.6976998522413933, + "grad_norm": 0.1815589815378189, + "learning_rate": 1.9943641015651057e-05, + "loss": 1.2715, + "step": 1874 + }, + { + "epoch": 0.6980721573920025, + "grad_norm": 0.1777685433626175, + "learning_rate": 1.9943512095568785e-05, + "loss": 1.2717, + "step": 1875 + }, + { + "epoch": 0.6984444625426115, + "grad_norm": 0.19864802062511444, + "learning_rate": 1.9943383028621463e-05, + "loss": 1.2827, + "step": 1876 + }, + { + "epoch": 0.6988167676932205, + "grad_norm": 0.1869768351316452, + "learning_rate": 1.9943253814810998e-05, + "loss": 1.276, + "step": 1877 + }, + { + "epoch": 0.6991890728438296, + "grad_norm": 0.18127432465553284, + "learning_rate": 1.9943124454139298e-05, + "loss": 1.258, + "step": 1878 + }, + { + "epoch": 0.6995613779944387, + "grad_norm": 0.18956811726093292, + "learning_rate": 1.9942994946608273e-05, + "loss": 1.2536, + "step": 1879 + }, + { + "epoch": 0.6999336831450478, + "grad_norm": 0.19713161885738373, + "learning_rate": 1.9942865292219837e-05, + "loss": 1.272, + "step": 1880 + }, + { + "epoch": 0.7003059882956568, + "grad_norm": 0.1791374236345291, + "learning_rate": 1.9942735490975903e-05, + "loss": 1.2739, + "step": 1881 + }, + { + "epoch": 0.7006782934462659, + "grad_norm": 0.17911037802696228, + "learning_rate": 1.9942605542878393e-05, + "loss": 1.2725, + "step": 1882 + }, + { + "epoch": 0.701050598596875, + "grad_norm": 0.1858641803264618, + "learning_rate": 1.9942475447929223e-05, + "loss": 1.2588, + "step": 1883 + }, + { + "epoch": 0.7014229037474841, + "grad_norm": 0.1832643300294876, + "learning_rate": 1.9942345206130313e-05, + "loss": 1.2863, + "step": 1884 + }, + { + "epoch": 0.7017952088980931, + "grad_norm": 0.18985310196876526, + "learning_rate": 1.9942214817483588e-05, + "loss": 1.2711, + "step": 1885 + }, + { + "epoch": 0.7021675140487021, + "grad_norm": 0.8048028945922852, + "learning_rate": 1.9942084281990973e-05, + "loss": 1.2672, + "step": 1886 + }, + { + "epoch": 0.7025398191993112, + "grad_norm": 0.19335870444774628, + "learning_rate": 1.9941953599654398e-05, + "loss": 1.2665, + "step": 1887 + }, + { + "epoch": 0.7029121243499203, + "grad_norm": 0.17363718152046204, + "learning_rate": 1.9941822770475795e-05, + "loss": 1.2456, + "step": 1888 + }, + { + "epoch": 0.7032844295005294, + "grad_norm": 0.18014384806156158, + "learning_rate": 1.9941691794457088e-05, + "loss": 1.2661, + "step": 1889 + }, + { + "epoch": 0.7036567346511384, + "grad_norm": 0.17859698832035065, + "learning_rate": 1.9941560671600223e-05, + "loss": 1.2699, + "step": 1890 + }, + { + "epoch": 0.7040290398017475, + "grad_norm": 0.17710040509700775, + "learning_rate": 1.9941429401907126e-05, + "loss": 1.2629, + "step": 1891 + }, + { + "epoch": 0.7044013449523566, + "grad_norm": 0.186106339097023, + "learning_rate": 1.9941297985379747e-05, + "loss": 1.2573, + "step": 1892 + }, + { + "epoch": 0.7047736501029657, + "grad_norm": 0.1873396337032318, + "learning_rate": 1.9941166422020016e-05, + "loss": 1.2468, + "step": 1893 + }, + { + "epoch": 0.7051459552535747, + "grad_norm": 0.1858346313238144, + "learning_rate": 1.9941034711829878e-05, + "loss": 1.2776, + "step": 1894 + }, + { + "epoch": 0.7055182604041838, + "grad_norm": 0.18264545500278473, + "learning_rate": 1.9940902854811284e-05, + "loss": 1.278, + "step": 1895 + }, + { + "epoch": 0.7058905655547928, + "grad_norm": 0.18094655871391296, + "learning_rate": 1.9940770850966184e-05, + "loss": 1.2737, + "step": 1896 + }, + { + "epoch": 0.706262870705402, + "grad_norm": 0.18092148005962372, + "learning_rate": 1.9940638700296514e-05, + "loss": 1.2721, + "step": 1897 + }, + { + "epoch": 0.706635175856011, + "grad_norm": 0.18232271075248718, + "learning_rate": 1.994050640280424e-05, + "loss": 1.2564, + "step": 1898 + }, + { + "epoch": 0.70700748100662, + "grad_norm": 0.17807061970233917, + "learning_rate": 1.9940373958491308e-05, + "loss": 1.2714, + "step": 1899 + }, + { + "epoch": 0.7073797861572291, + "grad_norm": 0.17300982773303986, + "learning_rate": 1.9940241367359675e-05, + "loss": 1.255, + "step": 1900 + }, + { + "epoch": 0.7077520913078382, + "grad_norm": 0.18998445570468903, + "learning_rate": 1.9940108629411305e-05, + "loss": 1.2794, + "step": 1901 + }, + { + "epoch": 0.7081243964584473, + "grad_norm": 0.1696198284626007, + "learning_rate": 1.9939975744648152e-05, + "loss": 1.2784, + "step": 1902 + }, + { + "epoch": 0.7084967016090563, + "grad_norm": 0.1855718344449997, + "learning_rate": 1.993984271307218e-05, + "loss": 1.2728, + "step": 1903 + }, + { + "epoch": 0.7088690067596654, + "grad_norm": 0.18481917679309845, + "learning_rate": 1.9939709534685353e-05, + "loss": 1.2794, + "step": 1904 + }, + { + "epoch": 0.7092413119102745, + "grad_norm": 0.18903714418411255, + "learning_rate": 1.9939576209489648e-05, + "loss": 1.2656, + "step": 1905 + }, + { + "epoch": 0.7096136170608836, + "grad_norm": 0.17437592148780823, + "learning_rate": 1.993944273748702e-05, + "loss": 1.2737, + "step": 1906 + }, + { + "epoch": 0.7099859222114926, + "grad_norm": 0.17127926647663116, + "learning_rate": 1.9939309118679445e-05, + "loss": 1.2662, + "step": 1907 + }, + { + "epoch": 0.7103582273621016, + "grad_norm": 0.1836404949426651, + "learning_rate": 1.99391753530689e-05, + "loss": 1.2774, + "step": 1908 + }, + { + "epoch": 0.7107305325127107, + "grad_norm": 0.18096838891506195, + "learning_rate": 1.993904144065736e-05, + "loss": 1.2775, + "step": 1909 + }, + { + "epoch": 0.7111028376633198, + "grad_norm": 0.17942848801612854, + "learning_rate": 1.9938907381446802e-05, + "loss": 1.2671, + "step": 1910 + }, + { + "epoch": 0.7114751428139289, + "grad_norm": 0.1706017553806305, + "learning_rate": 1.9938773175439205e-05, + "loss": 1.2683, + "step": 1911 + }, + { + "epoch": 0.7118474479645379, + "grad_norm": 0.1811804175376892, + "learning_rate": 1.9938638822636555e-05, + "loss": 1.2771, + "step": 1912 + }, + { + "epoch": 0.712219753115147, + "grad_norm": 0.1840972751379013, + "learning_rate": 1.9938504323040826e-05, + "loss": 1.268, + "step": 1913 + }, + { + "epoch": 0.7125920582657561, + "grad_norm": 0.17353455722332, + "learning_rate": 1.9938369676654015e-05, + "loss": 1.2617, + "step": 1914 + }, + { + "epoch": 0.7129643634163652, + "grad_norm": 0.16893000900745392, + "learning_rate": 1.993823488347811e-05, + "loss": 1.2651, + "step": 1915 + }, + { + "epoch": 0.7133366685669742, + "grad_norm": 0.17910560965538025, + "learning_rate": 1.9938099943515098e-05, + "loss": 1.2633, + "step": 1916 + }, + { + "epoch": 0.7137089737175832, + "grad_norm": 0.18880413472652435, + "learning_rate": 1.9937964856766975e-05, + "loss": 1.2445, + "step": 1917 + }, + { + "epoch": 0.7140812788681924, + "grad_norm": 0.17777028679847717, + "learning_rate": 1.9937829623235733e-05, + "loss": 1.2701, + "step": 1918 + }, + { + "epoch": 0.7144535840188014, + "grad_norm": 0.17710170149803162, + "learning_rate": 1.993769424292337e-05, + "loss": 1.2697, + "step": 1919 + }, + { + "epoch": 0.7148258891694105, + "grad_norm": 0.17528566718101501, + "learning_rate": 1.993755871583189e-05, + "loss": 1.261, + "step": 1920 + }, + { + "epoch": 0.7151981943200195, + "grad_norm": 0.17900541424751282, + "learning_rate": 1.993742304196329e-05, + "loss": 1.2624, + "step": 1921 + }, + { + "epoch": 0.7155704994706286, + "grad_norm": 0.18231788277626038, + "learning_rate": 1.9937287221319576e-05, + "loss": 1.2659, + "step": 1922 + }, + { + "epoch": 0.7159428046212377, + "grad_norm": 0.1796240657567978, + "learning_rate": 1.993715125390275e-05, + "loss": 1.2658, + "step": 1923 + }, + { + "epoch": 0.7163151097718468, + "grad_norm": 0.16966049373149872, + "learning_rate": 1.9937015139714825e-05, + "loss": 1.2611, + "step": 1924 + }, + { + "epoch": 0.7166874149224558, + "grad_norm": 0.18244795501232147, + "learning_rate": 1.993687887875781e-05, + "loss": 1.2567, + "step": 1925 + }, + { + "epoch": 0.7170597200730648, + "grad_norm": 0.17813333868980408, + "learning_rate": 1.993674247103372e-05, + "loss": 1.2639, + "step": 1926 + }, + { + "epoch": 0.717432025223674, + "grad_norm": 0.17147675156593323, + "learning_rate": 1.9936605916544566e-05, + "loss": 1.2508, + "step": 1927 + }, + { + "epoch": 0.717804330374283, + "grad_norm": 0.1758221685886383, + "learning_rate": 1.9936469215292366e-05, + "loss": 1.2537, + "step": 1928 + }, + { + "epoch": 0.7181766355248921, + "grad_norm": 0.1679316908121109, + "learning_rate": 1.993633236727914e-05, + "loss": 1.2708, + "step": 1929 + }, + { + "epoch": 0.7185489406755011, + "grad_norm": 0.18029126524925232, + "learning_rate": 1.9936195372506906e-05, + "loss": 1.2628, + "step": 1930 + }, + { + "epoch": 0.7189212458261103, + "grad_norm": 0.1729169636964798, + "learning_rate": 1.9936058230977694e-05, + "loss": 1.2676, + "step": 1931 + }, + { + "epoch": 0.7192935509767193, + "grad_norm": 0.1810874193906784, + "learning_rate": 1.993592094269352e-05, + "loss": 1.2575, + "step": 1932 + }, + { + "epoch": 0.7196658561273284, + "grad_norm": 0.17737914621829987, + "learning_rate": 1.993578350765642e-05, + "loss": 1.2801, + "step": 1933 + }, + { + "epoch": 0.7200381612779374, + "grad_norm": 0.18004107475280762, + "learning_rate": 1.9935645925868424e-05, + "loss": 1.2596, + "step": 1934 + }, + { + "epoch": 0.7204104664285464, + "grad_norm": 0.17037642002105713, + "learning_rate": 1.9935508197331556e-05, + "loss": 1.2656, + "step": 1935 + }, + { + "epoch": 0.7207827715791556, + "grad_norm": 0.1718427985906601, + "learning_rate": 1.993537032204786e-05, + "loss": 1.2734, + "step": 1936 + }, + { + "epoch": 0.7211550767297646, + "grad_norm": 0.19909080862998962, + "learning_rate": 1.9935232300019364e-05, + "loss": 1.2584, + "step": 1937 + }, + { + "epoch": 0.7215273818803737, + "grad_norm": 0.17230063676834106, + "learning_rate": 1.9935094131248113e-05, + "loss": 1.2605, + "step": 1938 + }, + { + "epoch": 0.7218996870309827, + "grad_norm": 0.18916752934455872, + "learning_rate": 1.9934955815736145e-05, + "loss": 1.2758, + "step": 1939 + }, + { + "epoch": 0.7222719921815919, + "grad_norm": 0.1908046007156372, + "learning_rate": 1.99348173534855e-05, + "loss": 1.2634, + "step": 1940 + }, + { + "epoch": 0.7226442973322009, + "grad_norm": 0.17501883208751678, + "learning_rate": 1.9934678744498234e-05, + "loss": 1.2604, + "step": 1941 + }, + { + "epoch": 0.72301660248281, + "grad_norm": 0.17639294266700745, + "learning_rate": 1.993453998877638e-05, + "loss": 1.2641, + "step": 1942 + }, + { + "epoch": 0.723388907633419, + "grad_norm": 0.19345468282699585, + "learning_rate": 1.9934401086321995e-05, + "loss": 1.2709, + "step": 1943 + }, + { + "epoch": 0.7237612127840282, + "grad_norm": 0.18009069561958313, + "learning_rate": 1.9934262037137132e-05, + "loss": 1.2635, + "step": 1944 + }, + { + "epoch": 0.7241335179346372, + "grad_norm": 0.19029593467712402, + "learning_rate": 1.993412284122384e-05, + "loss": 1.2532, + "step": 1945 + }, + { + "epoch": 0.7245058230852462, + "grad_norm": 0.19446800649166107, + "learning_rate": 1.9933983498584175e-05, + "loss": 1.2771, + "step": 1946 + }, + { + "epoch": 0.7248781282358553, + "grad_norm": 0.17743223905563354, + "learning_rate": 1.99338440092202e-05, + "loss": 1.276, + "step": 1947 + }, + { + "epoch": 0.7252504333864643, + "grad_norm": 0.18026785552501678, + "learning_rate": 1.9933704373133967e-05, + "loss": 1.2737, + "step": 1948 + }, + { + "epoch": 0.7256227385370735, + "grad_norm": 0.191921204328537, + "learning_rate": 1.993356459032755e-05, + "loss": 1.2722, + "step": 1949 + }, + { + "epoch": 0.7259950436876825, + "grad_norm": 0.1790904849767685, + "learning_rate": 1.9933424660803006e-05, + "loss": 1.2661, + "step": 1950 + }, + { + "epoch": 0.7263673488382916, + "grad_norm": 0.19434896111488342, + "learning_rate": 1.99332845845624e-05, + "loss": 1.2543, + "step": 1951 + }, + { + "epoch": 0.7267396539889006, + "grad_norm": 0.17882947623729706, + "learning_rate": 1.993314436160781e-05, + "loss": 1.2631, + "step": 1952 + }, + { + "epoch": 0.7271119591395098, + "grad_norm": 0.18903744220733643, + "learning_rate": 1.99330039919413e-05, + "loss": 1.2721, + "step": 1953 + }, + { + "epoch": 0.7274842642901188, + "grad_norm": 0.17733576893806458, + "learning_rate": 1.993286347556494e-05, + "loss": 1.2782, + "step": 1954 + }, + { + "epoch": 0.7278565694407279, + "grad_norm": 0.1745980829000473, + "learning_rate": 1.9932722812480813e-05, + "loss": 1.2609, + "step": 1955 + }, + { + "epoch": 0.7282288745913369, + "grad_norm": 0.18172860145568848, + "learning_rate": 1.9932582002690993e-05, + "loss": 1.2645, + "step": 1956 + }, + { + "epoch": 0.728601179741946, + "grad_norm": 0.17440102994441986, + "learning_rate": 1.9932441046197558e-05, + "loss": 1.2739, + "step": 1957 + }, + { + "epoch": 0.7289734848925551, + "grad_norm": 0.17599335312843323, + "learning_rate": 1.9932299943002596e-05, + "loss": 1.2626, + "step": 1958 + }, + { + "epoch": 0.7293457900431641, + "grad_norm": 0.18249286711215973, + "learning_rate": 1.9932158693108183e-05, + "loss": 1.2629, + "step": 1959 + }, + { + "epoch": 0.7297180951937732, + "grad_norm": 0.19463662803173065, + "learning_rate": 1.9932017296516414e-05, + "loss": 1.2589, + "step": 1960 + }, + { + "epoch": 0.7300904003443822, + "grad_norm": 0.17670099437236786, + "learning_rate": 1.9931875753229367e-05, + "loss": 1.2893, + "step": 1961 + }, + { + "epoch": 0.7304627054949914, + "grad_norm": 0.181423619389534, + "learning_rate": 1.9931734063249143e-05, + "loss": 1.2583, + "step": 1962 + }, + { + "epoch": 0.7308350106456004, + "grad_norm": 0.19615116715431213, + "learning_rate": 1.993159222657783e-05, + "loss": 1.273, + "step": 1963 + }, + { + "epoch": 0.7312073157962095, + "grad_norm": 0.1963336318731308, + "learning_rate": 1.9931450243217522e-05, + "loss": 1.2694, + "step": 1964 + }, + { + "epoch": 0.7315796209468185, + "grad_norm": 0.18081746995449066, + "learning_rate": 1.993130811317032e-05, + "loss": 1.2604, + "step": 1965 + }, + { + "epoch": 0.7319519260974277, + "grad_norm": 0.19056904315948486, + "learning_rate": 1.9931165836438314e-05, + "loss": 1.2732, + "step": 1966 + }, + { + "epoch": 0.7323242312480367, + "grad_norm": 0.18734106421470642, + "learning_rate": 1.9931023413023615e-05, + "loss": 1.2747, + "step": 1967 + }, + { + "epoch": 0.7326965363986457, + "grad_norm": 0.17514047026634216, + "learning_rate": 1.9930880842928325e-05, + "loss": 1.2536, + "step": 1968 + }, + { + "epoch": 0.7330688415492548, + "grad_norm": 0.17114852368831635, + "learning_rate": 1.993073812615455e-05, + "loss": 1.2664, + "step": 1969 + }, + { + "epoch": 0.7334411466998638, + "grad_norm": 0.187078058719635, + "learning_rate": 1.993059526270439e-05, + "loss": 1.265, + "step": 1970 + }, + { + "epoch": 0.733813451850473, + "grad_norm": 0.1790546178817749, + "learning_rate": 1.9930452252579967e-05, + "loss": 1.2676, + "step": 1971 + }, + { + "epoch": 0.734185757001082, + "grad_norm": 0.18027950823307037, + "learning_rate": 1.9930309095783386e-05, + "loss": 1.2745, + "step": 1972 + }, + { + "epoch": 0.7345580621516911, + "grad_norm": 0.18280746042728424, + "learning_rate": 1.993016579231676e-05, + "loss": 1.2724, + "step": 1973 + }, + { + "epoch": 0.7349303673023001, + "grad_norm": 0.18688839673995972, + "learning_rate": 1.9930022342182213e-05, + "loss": 1.2695, + "step": 1974 + }, + { + "epoch": 0.7353026724529093, + "grad_norm": 0.17303088307380676, + "learning_rate": 1.9929878745381855e-05, + "loss": 1.2601, + "step": 1975 + }, + { + "epoch": 0.7356749776035183, + "grad_norm": 0.17247672379016876, + "learning_rate": 1.992973500191781e-05, + "loss": 1.2698, + "step": 1976 + }, + { + "epoch": 0.7360472827541273, + "grad_norm": 0.18683381378650665, + "learning_rate": 1.9929591111792206e-05, + "loss": 1.2856, + "step": 1977 + }, + { + "epoch": 0.7364195879047364, + "grad_norm": 0.17574839293956757, + "learning_rate": 1.9929447075007164e-05, + "loss": 1.2798, + "step": 1978 + }, + { + "epoch": 0.7367918930553455, + "grad_norm": 0.177045539021492, + "learning_rate": 1.992930289156481e-05, + "loss": 1.2671, + "step": 1979 + }, + { + "epoch": 0.7371641982059546, + "grad_norm": 0.1845403015613556, + "learning_rate": 1.9929158561467276e-05, + "loss": 1.2516, + "step": 1980 + }, + { + "epoch": 0.7375365033565636, + "grad_norm": 0.18033714592456818, + "learning_rate": 1.9929014084716695e-05, + "loss": 1.2661, + "step": 1981 + }, + { + "epoch": 0.7379088085071727, + "grad_norm": 0.17001290619373322, + "learning_rate": 1.9928869461315197e-05, + "loss": 1.2729, + "step": 1982 + }, + { + "epoch": 0.7382811136577817, + "grad_norm": 0.1728724241256714, + "learning_rate": 1.992872469126492e-05, + "loss": 1.2679, + "step": 1983 + }, + { + "epoch": 0.7386534188083909, + "grad_norm": 0.1804700642824173, + "learning_rate": 1.9928579774568005e-05, + "loss": 1.2785, + "step": 1984 + }, + { + "epoch": 0.7390257239589999, + "grad_norm": 0.174544095993042, + "learning_rate": 1.9928434711226586e-05, + "loss": 1.2484, + "step": 1985 + }, + { + "epoch": 0.739398029109609, + "grad_norm": 0.18159344792366028, + "learning_rate": 1.9928289501242812e-05, + "loss": 1.2658, + "step": 1986 + }, + { + "epoch": 0.739770334260218, + "grad_norm": 0.16609835624694824, + "learning_rate": 1.9928144144618824e-05, + "loss": 1.266, + "step": 1987 + }, + { + "epoch": 0.7401426394108271, + "grad_norm": 0.1732049435377121, + "learning_rate": 1.992799864135677e-05, + "loss": 1.2598, + "step": 1988 + }, + { + "epoch": 0.7405149445614362, + "grad_norm": 0.17838548123836517, + "learning_rate": 1.9927852991458802e-05, + "loss": 1.2866, + "step": 1989 + }, + { + "epoch": 0.7408872497120452, + "grad_norm": 0.17985506355762482, + "learning_rate": 1.9927707194927067e-05, + "loss": 1.2706, + "step": 1990 + }, + { + "epoch": 0.7412595548626543, + "grad_norm": 0.1812310814857483, + "learning_rate": 1.9927561251763717e-05, + "loss": 1.2751, + "step": 1991 + }, + { + "epoch": 0.7416318600132634, + "grad_norm": 0.1740981936454773, + "learning_rate": 1.9927415161970913e-05, + "loss": 1.2769, + "step": 1992 + }, + { + "epoch": 0.7420041651638725, + "grad_norm": 0.1680895835161209, + "learning_rate": 1.9927268925550808e-05, + "loss": 1.2502, + "step": 1993 + }, + { + "epoch": 0.7423764703144815, + "grad_norm": 0.17788243293762207, + "learning_rate": 1.992712254250557e-05, + "loss": 1.2735, + "step": 1994 + }, + { + "epoch": 0.7427487754650905, + "grad_norm": 0.17746172845363617, + "learning_rate": 1.9926976012837345e-05, + "loss": 1.2665, + "step": 1995 + }, + { + "epoch": 0.7431210806156996, + "grad_norm": 0.1807558834552765, + "learning_rate": 1.9926829336548314e-05, + "loss": 1.263, + "step": 1996 + }, + { + "epoch": 0.7434933857663087, + "grad_norm": 0.17683039605617523, + "learning_rate": 1.9926682513640634e-05, + "loss": 1.2665, + "step": 1997 + }, + { + "epoch": 0.7438656909169178, + "grad_norm": 0.1726188063621521, + "learning_rate": 1.992653554411648e-05, + "loss": 1.279, + "step": 1998 + }, + { + "epoch": 0.7442379960675268, + "grad_norm": 0.1840149462223053, + "learning_rate": 1.9926388427978016e-05, + "loss": 1.267, + "step": 1999 + }, + { + "epoch": 0.7446103012181359, + "grad_norm": 0.17595337331295013, + "learning_rate": 1.992624116522742e-05, + "loss": 1.264, + "step": 2000 + }, + { + "epoch": 0.7446103012181359, + "eval_loss": 1.3380753993988037, + "eval_runtime": 15.9929, + "eval_samples_per_second": 108.423, + "eval_steps_per_second": 5.44, + "step": 2000 + }, + { + "epoch": 0.744982606368745, + "grad_norm": 0.17696943879127502, + "learning_rate": 1.9926093755866862e-05, + "loss": 1.2773, + "step": 2001 + }, + { + "epoch": 0.7453549115193541, + "grad_norm": 0.1840478777885437, + "learning_rate": 1.9925946199898526e-05, + "loss": 1.2484, + "step": 2002 + }, + { + "epoch": 0.7457272166699631, + "grad_norm": 0.16878820955753326, + "learning_rate": 1.9925798497324583e-05, + "loss": 1.2478, + "step": 2003 + }, + { + "epoch": 0.7460995218205722, + "grad_norm": 0.17494863271713257, + "learning_rate": 1.992565064814722e-05, + "loss": 1.2547, + "step": 2004 + }, + { + "epoch": 0.7464718269711813, + "grad_norm": 0.17591069638729095, + "learning_rate": 1.992550265236862e-05, + "loss": 1.2644, + "step": 2005 + }, + { + "epoch": 0.7468441321217903, + "grad_norm": 0.17405180633068085, + "learning_rate": 1.992535450999097e-05, + "loss": 1.2685, + "step": 2006 + }, + { + "epoch": 0.7472164372723994, + "grad_norm": 0.17987771332263947, + "learning_rate": 1.9925206221016456e-05, + "loss": 1.2792, + "step": 2007 + }, + { + "epoch": 0.7475887424230084, + "grad_norm": 0.1718064546585083, + "learning_rate": 1.992505778544727e-05, + "loss": 1.2552, + "step": 2008 + }, + { + "epoch": 0.7479610475736175, + "grad_norm": 0.16629862785339355, + "learning_rate": 1.9924909203285604e-05, + "loss": 1.2621, + "step": 2009 + }, + { + "epoch": 0.7483333527242266, + "grad_norm": 0.16779156029224396, + "learning_rate": 1.9924760474533654e-05, + "loss": 1.2569, + "step": 2010 + }, + { + "epoch": 0.7487056578748357, + "grad_norm": 0.17635951936244965, + "learning_rate": 1.992461159919361e-05, + "loss": 1.2617, + "step": 2011 + }, + { + "epoch": 0.7490779630254447, + "grad_norm": 0.17284058034420013, + "learning_rate": 1.9924462577267676e-05, + "loss": 1.2622, + "step": 2012 + }, + { + "epoch": 0.7494502681760538, + "grad_norm": 0.16899628937244415, + "learning_rate": 1.9924313408758053e-05, + "loss": 1.2684, + "step": 2013 + }, + { + "epoch": 0.7498225733266629, + "grad_norm": 0.17767173051834106, + "learning_rate": 1.9924164093666946e-05, + "loss": 1.2624, + "step": 2014 + }, + { + "epoch": 0.750194878477272, + "grad_norm": 0.17249037325382233, + "learning_rate": 1.9924014631996557e-05, + "loss": 1.2647, + "step": 2015 + }, + { + "epoch": 0.750567183627881, + "grad_norm": 0.1744518280029297, + "learning_rate": 1.9923865023749095e-05, + "loss": 1.2541, + "step": 2016 + }, + { + "epoch": 0.75093948877849, + "grad_norm": 0.17807431519031525, + "learning_rate": 1.9923715268926765e-05, + "loss": 1.2731, + "step": 2017 + }, + { + "epoch": 0.7513117939290992, + "grad_norm": 0.17543937265872955, + "learning_rate": 1.992356536753179e-05, + "loss": 1.2463, + "step": 2018 + }, + { + "epoch": 0.7516840990797082, + "grad_norm": 0.17383424937725067, + "learning_rate": 1.9923415319566372e-05, + "loss": 1.2479, + "step": 2019 + }, + { + "epoch": 0.7520564042303173, + "grad_norm": 0.181121364235878, + "learning_rate": 1.9923265125032736e-05, + "loss": 1.2617, + "step": 2020 + }, + { + "epoch": 0.7524287093809263, + "grad_norm": 0.18264862895011902, + "learning_rate": 1.9923114783933096e-05, + "loss": 1.2607, + "step": 2021 + }, + { + "epoch": 0.7528010145315354, + "grad_norm": 0.1814924031496048, + "learning_rate": 1.9922964296269672e-05, + "loss": 1.2828, + "step": 2022 + }, + { + "epoch": 0.7531733196821445, + "grad_norm": 0.17525698244571686, + "learning_rate": 1.992281366204469e-05, + "loss": 1.2538, + "step": 2023 + }, + { + "epoch": 0.7535456248327536, + "grad_norm": 0.17692695558071136, + "learning_rate": 1.9922662881260374e-05, + "loss": 1.2701, + "step": 2024 + }, + { + "epoch": 0.7539179299833626, + "grad_norm": 0.17601527273654938, + "learning_rate": 1.9922511953918945e-05, + "loss": 1.2625, + "step": 2025 + }, + { + "epoch": 0.7542902351339716, + "grad_norm": 0.17493323981761932, + "learning_rate": 1.992236088002264e-05, + "loss": 1.256, + "step": 2026 + }, + { + "epoch": 0.7546625402845808, + "grad_norm": 0.1716911792755127, + "learning_rate": 1.992220965957369e-05, + "loss": 1.2375, + "step": 2027 + }, + { + "epoch": 0.7550348454351898, + "grad_norm": 0.1788312941789627, + "learning_rate": 1.9922058292574323e-05, + "loss": 1.2571, + "step": 2028 + }, + { + "epoch": 0.7554071505857989, + "grad_norm": 0.1795925498008728, + "learning_rate": 1.9921906779026775e-05, + "loss": 1.2737, + "step": 2029 + }, + { + "epoch": 0.7557794557364079, + "grad_norm": 0.17507243156433105, + "learning_rate": 1.9921755118933292e-05, + "loss": 1.2625, + "step": 2030 + }, + { + "epoch": 0.756151760887017, + "grad_norm": 0.17492863535881042, + "learning_rate": 1.992160331229611e-05, + "loss": 1.268, + "step": 2031 + }, + { + "epoch": 0.7565240660376261, + "grad_norm": 0.18614165484905243, + "learning_rate": 1.992145135911746e-05, + "loss": 1.2878, + "step": 2032 + }, + { + "epoch": 0.7568963711882352, + "grad_norm": 0.18001849949359894, + "learning_rate": 1.9921299259399604e-05, + "loss": 1.2622, + "step": 2033 + }, + { + "epoch": 0.7572686763388442, + "grad_norm": 0.18375787138938904, + "learning_rate": 1.9921147013144782e-05, + "loss": 1.2825, + "step": 2034 + }, + { + "epoch": 0.7576409814894532, + "grad_norm": 0.17793749272823334, + "learning_rate": 1.9920994620355236e-05, + "loss": 1.2546, + "step": 2035 + }, + { + "epoch": 0.7580132866400624, + "grad_norm": 0.18328694999217987, + "learning_rate": 1.9920842081033225e-05, + "loss": 1.2607, + "step": 2036 + }, + { + "epoch": 0.7583855917906714, + "grad_norm": 0.17626026272773743, + "learning_rate": 1.9920689395180996e-05, + "loss": 1.27, + "step": 2037 + }, + { + "epoch": 0.7587578969412805, + "grad_norm": 0.1846167892217636, + "learning_rate": 1.9920536562800808e-05, + "loss": 1.2657, + "step": 2038 + }, + { + "epoch": 0.7591302020918895, + "grad_norm": 0.1859884262084961, + "learning_rate": 1.9920383583894922e-05, + "loss": 1.2806, + "step": 2039 + }, + { + "epoch": 0.7595025072424987, + "grad_norm": 0.17855176329612732, + "learning_rate": 1.992023045846559e-05, + "loss": 1.2725, + "step": 2040 + }, + { + "epoch": 0.7598748123931077, + "grad_norm": 0.17177069187164307, + "learning_rate": 1.9920077186515076e-05, + "loss": 1.2623, + "step": 2041 + }, + { + "epoch": 0.7602471175437168, + "grad_norm": 0.18153630197048187, + "learning_rate": 1.9919923768045646e-05, + "loss": 1.2827, + "step": 2042 + }, + { + "epoch": 0.7606194226943258, + "grad_norm": 0.18789653480052948, + "learning_rate": 1.9919770203059564e-05, + "loss": 1.2675, + "step": 2043 + }, + { + "epoch": 0.7609917278449349, + "grad_norm": 0.18295817077159882, + "learning_rate": 1.99196164915591e-05, + "loss": 1.2635, + "step": 2044 + }, + { + "epoch": 0.761364032995544, + "grad_norm": 0.18244396150112152, + "learning_rate": 1.991946263354652e-05, + "loss": 1.2586, + "step": 2045 + }, + { + "epoch": 0.761736338146153, + "grad_norm": 0.19641557335853577, + "learning_rate": 1.99193086290241e-05, + "loss": 1.2838, + "step": 2046 + }, + { + "epoch": 0.7621086432967621, + "grad_norm": 0.1869594007730484, + "learning_rate": 1.9919154477994117e-05, + "loss": 1.262, + "step": 2047 + }, + { + "epoch": 0.7624809484473711, + "grad_norm": 0.1961170732975006, + "learning_rate": 1.991900018045884e-05, + "loss": 1.2574, + "step": 2048 + }, + { + "epoch": 0.7628532535979803, + "grad_norm": 0.18542712926864624, + "learning_rate": 1.9918845736420554e-05, + "loss": 1.2627, + "step": 2049 + }, + { + "epoch": 0.7632255587485893, + "grad_norm": 0.19344452023506165, + "learning_rate": 1.9918691145881542e-05, + "loss": 1.2757, + "step": 2050 + }, + { + "epoch": 0.7635978638991984, + "grad_norm": 0.19451677799224854, + "learning_rate": 1.9918536408844082e-05, + "loss": 1.265, + "step": 2051 + }, + { + "epoch": 0.7639701690498074, + "grad_norm": 0.19341467320919037, + "learning_rate": 1.9918381525310464e-05, + "loss": 1.2655, + "step": 2052 + }, + { + "epoch": 0.7643424742004166, + "grad_norm": 0.1805962473154068, + "learning_rate": 1.991822649528297e-05, + "loss": 1.2639, + "step": 2053 + }, + { + "epoch": 0.7647147793510256, + "grad_norm": 0.19268319010734558, + "learning_rate": 1.9918071318763898e-05, + "loss": 1.2563, + "step": 2054 + }, + { + "epoch": 0.7650870845016347, + "grad_norm": 0.1859290599822998, + "learning_rate": 1.991791599575553e-05, + "loss": 1.2654, + "step": 2055 + }, + { + "epoch": 0.7654593896522437, + "grad_norm": 0.18508413434028625, + "learning_rate": 1.991776052626017e-05, + "loss": 1.2577, + "step": 2056 + }, + { + "epoch": 0.7658316948028527, + "grad_norm": 0.19763630628585815, + "learning_rate": 1.9917604910280106e-05, + "loss": 1.2622, + "step": 2057 + }, + { + "epoch": 0.7662039999534619, + "grad_norm": 0.16519156098365784, + "learning_rate": 1.991744914781764e-05, + "loss": 1.2553, + "step": 2058 + }, + { + "epoch": 0.7665763051040709, + "grad_norm": 0.191755473613739, + "learning_rate": 1.991729323887507e-05, + "loss": 1.2623, + "step": 2059 + }, + { + "epoch": 0.76694861025468, + "grad_norm": 0.19937783479690552, + "learning_rate": 1.9917137183454706e-05, + "loss": 1.2631, + "step": 2060 + }, + { + "epoch": 0.767320915405289, + "grad_norm": 0.17880156636238098, + "learning_rate": 1.9916980981558846e-05, + "loss": 1.2731, + "step": 2061 + }, + { + "epoch": 0.7676932205558982, + "grad_norm": 0.17433685064315796, + "learning_rate": 1.99168246331898e-05, + "loss": 1.2617, + "step": 2062 + }, + { + "epoch": 0.7680655257065072, + "grad_norm": 0.19513197243213654, + "learning_rate": 1.9916668138349873e-05, + "loss": 1.2746, + "step": 2063 + }, + { + "epoch": 0.7684378308571163, + "grad_norm": 0.16553771495819092, + "learning_rate": 1.9916511497041388e-05, + "loss": 1.2607, + "step": 2064 + }, + { + "epoch": 0.7688101360077253, + "grad_norm": 0.1720885932445526, + "learning_rate": 1.9916354709266645e-05, + "loss": 1.2665, + "step": 2065 + }, + { + "epoch": 0.7691824411583345, + "grad_norm": 0.18258140981197357, + "learning_rate": 1.9916197775027967e-05, + "loss": 1.2586, + "step": 2066 + }, + { + "epoch": 0.7695547463089435, + "grad_norm": 0.18003836274147034, + "learning_rate": 1.991604069432767e-05, + "loss": 1.2594, + "step": 2067 + }, + { + "epoch": 0.7699270514595525, + "grad_norm": 0.1830645203590393, + "learning_rate": 1.991588346716807e-05, + "loss": 1.2621, + "step": 2068 + }, + { + "epoch": 0.7702993566101616, + "grad_norm": 0.1741950958967209, + "learning_rate": 1.9915726093551497e-05, + "loss": 1.2628, + "step": 2069 + }, + { + "epoch": 0.7706716617607706, + "grad_norm": 0.17035536468029022, + "learning_rate": 1.991556857348027e-05, + "loss": 1.2504, + "step": 2070 + }, + { + "epoch": 0.7710439669113798, + "grad_norm": 0.1875630021095276, + "learning_rate": 1.9915410906956723e-05, + "loss": 1.2616, + "step": 2071 + }, + { + "epoch": 0.7714162720619888, + "grad_norm": 0.1755642145872116, + "learning_rate": 1.9915253093983175e-05, + "loss": 1.2758, + "step": 2072 + }, + { + "epoch": 0.7717885772125979, + "grad_norm": 0.17393867671489716, + "learning_rate": 1.991509513456196e-05, + "loss": 1.2584, + "step": 2073 + }, + { + "epoch": 0.7721608823632069, + "grad_norm": 0.186118945479393, + "learning_rate": 1.9914937028695412e-05, + "loss": 1.2733, + "step": 2074 + }, + { + "epoch": 0.772533187513816, + "grad_norm": 0.1890277862548828, + "learning_rate": 1.991477877638587e-05, + "loss": 1.2652, + "step": 2075 + }, + { + "epoch": 0.7729054926644251, + "grad_norm": 0.18629412353038788, + "learning_rate": 1.9914620377635666e-05, + "loss": 1.2573, + "step": 2076 + }, + { + "epoch": 0.7732777978150341, + "grad_norm": 0.17759479582309723, + "learning_rate": 1.9914461832447142e-05, + "loss": 1.2551, + "step": 2077 + }, + { + "epoch": 0.7736501029656432, + "grad_norm": 0.1834801733493805, + "learning_rate": 1.9914303140822634e-05, + "loss": 1.252, + "step": 2078 + }, + { + "epoch": 0.7740224081162523, + "grad_norm": 0.17973852157592773, + "learning_rate": 1.9914144302764497e-05, + "loss": 1.2553, + "step": 2079 + }, + { + "epoch": 0.7743947132668614, + "grad_norm": 0.17896102368831635, + "learning_rate": 1.9913985318275068e-05, + "loss": 1.2553, + "step": 2080 + }, + { + "epoch": 0.7747670184174704, + "grad_norm": 0.18261843919754028, + "learning_rate": 1.99138261873567e-05, + "loss": 1.2491, + "step": 2081 + }, + { + "epoch": 0.7751393235680795, + "grad_norm": 0.19672666490077972, + "learning_rate": 1.9913666910011737e-05, + "loss": 1.2655, + "step": 2082 + }, + { + "epoch": 0.7755116287186885, + "grad_norm": 0.18677249550819397, + "learning_rate": 1.9913507486242537e-05, + "loss": 1.2749, + "step": 2083 + }, + { + "epoch": 0.7758839338692977, + "grad_norm": 0.18855483829975128, + "learning_rate": 1.9913347916051458e-05, + "loss": 1.2729, + "step": 2084 + }, + { + "epoch": 0.7762562390199067, + "grad_norm": 0.18166404962539673, + "learning_rate": 1.9913188199440848e-05, + "loss": 1.253, + "step": 2085 + }, + { + "epoch": 0.7766285441705157, + "grad_norm": 0.18028071522712708, + "learning_rate": 1.9913028336413074e-05, + "loss": 1.2587, + "step": 2086 + }, + { + "epoch": 0.7770008493211248, + "grad_norm": 0.18412554264068604, + "learning_rate": 1.991286832697049e-05, + "loss": 1.2503, + "step": 2087 + }, + { + "epoch": 0.7773731544717339, + "grad_norm": 0.18333709239959717, + "learning_rate": 1.9912708171115463e-05, + "loss": 1.2636, + "step": 2088 + }, + { + "epoch": 0.777745459622343, + "grad_norm": 0.18019062280654907, + "learning_rate": 1.991254786885036e-05, + "loss": 1.2577, + "step": 2089 + }, + { + "epoch": 0.778117764772952, + "grad_norm": 0.18314556777477264, + "learning_rate": 1.991238742017755e-05, + "loss": 1.2482, + "step": 2090 + }, + { + "epoch": 0.7784900699235611, + "grad_norm": 0.17646336555480957, + "learning_rate": 1.9912226825099395e-05, + "loss": 1.262, + "step": 2091 + }, + { + "epoch": 0.7788623750741702, + "grad_norm": 0.19445347785949707, + "learning_rate": 1.9912066083618275e-05, + "loss": 1.256, + "step": 2092 + }, + { + "epoch": 0.7792346802247793, + "grad_norm": 0.17373862862586975, + "learning_rate": 1.991190519573656e-05, + "loss": 1.2702, + "step": 2093 + }, + { + "epoch": 0.7796069853753883, + "grad_norm": 0.16923663020133972, + "learning_rate": 1.9911744161456624e-05, + "loss": 1.2482, + "step": 2094 + }, + { + "epoch": 0.7799792905259973, + "grad_norm": 0.1783125400543213, + "learning_rate": 1.9911582980780854e-05, + "loss": 1.2623, + "step": 2095 + }, + { + "epoch": 0.7803515956766064, + "grad_norm": 0.18715058267116547, + "learning_rate": 1.9911421653711624e-05, + "loss": 1.2634, + "step": 2096 + }, + { + "epoch": 0.7807239008272155, + "grad_norm": 0.17201177775859833, + "learning_rate": 1.9911260180251316e-05, + "loss": 1.2659, + "step": 2097 + }, + { + "epoch": 0.7810962059778246, + "grad_norm": 0.1753210425376892, + "learning_rate": 1.991109856040232e-05, + "loss": 1.2558, + "step": 2098 + }, + { + "epoch": 0.7814685111284336, + "grad_norm": 0.16961847245693207, + "learning_rate": 1.991093679416702e-05, + "loss": 1.2614, + "step": 2099 + }, + { + "epoch": 0.7818408162790427, + "grad_norm": 0.18109846115112305, + "learning_rate": 1.9910774881547803e-05, + "loss": 1.2597, + "step": 2100 + }, + { + "epoch": 0.7822131214296518, + "grad_norm": 0.16653625667095184, + "learning_rate": 1.9910612822547063e-05, + "loss": 1.2603, + "step": 2101 + }, + { + "epoch": 0.7825854265802609, + "grad_norm": 0.17125585675239563, + "learning_rate": 1.9910450617167198e-05, + "loss": 1.2597, + "step": 2102 + }, + { + "epoch": 0.7829577317308699, + "grad_norm": 0.18307803571224213, + "learning_rate": 1.9910288265410593e-05, + "loss": 1.2728, + "step": 2103 + }, + { + "epoch": 0.783330036881479, + "grad_norm": 0.17665451765060425, + "learning_rate": 1.9910125767279655e-05, + "loss": 1.2727, + "step": 2104 + }, + { + "epoch": 0.783702342032088, + "grad_norm": 0.188068225979805, + "learning_rate": 1.9909963122776785e-05, + "loss": 1.2526, + "step": 2105 + }, + { + "epoch": 0.7840746471826971, + "grad_norm": 0.1769012063741684, + "learning_rate": 1.9909800331904375e-05, + "loss": 1.2541, + "step": 2106 + }, + { + "epoch": 0.7844469523333062, + "grad_norm": 0.1765563189983368, + "learning_rate": 1.9909637394664842e-05, + "loss": 1.2499, + "step": 2107 + }, + { + "epoch": 0.7848192574839152, + "grad_norm": 0.19480668008327484, + "learning_rate": 1.9909474311060583e-05, + "loss": 1.2617, + "step": 2108 + }, + { + "epoch": 0.7851915626345243, + "grad_norm": 0.1799997091293335, + "learning_rate": 1.9909311081094012e-05, + "loss": 1.2636, + "step": 2109 + }, + { + "epoch": 0.7855638677851334, + "grad_norm": 0.16892112791538239, + "learning_rate": 1.9909147704767537e-05, + "loss": 1.2525, + "step": 2110 + }, + { + "epoch": 0.7859361729357425, + "grad_norm": 0.17401593923568726, + "learning_rate": 1.9908984182083574e-05, + "loss": 1.2497, + "step": 2111 + }, + { + "epoch": 0.7863084780863515, + "grad_norm": 0.18398383259773254, + "learning_rate": 1.9908820513044535e-05, + "loss": 1.2704, + "step": 2112 + }, + { + "epoch": 0.7866807832369606, + "grad_norm": 0.17597925662994385, + "learning_rate": 1.990865669765284e-05, + "loss": 1.2628, + "step": 2113 + }, + { + "epoch": 0.7870530883875697, + "grad_norm": 0.17537419497966766, + "learning_rate": 1.9908492735910907e-05, + "loss": 1.2674, + "step": 2114 + }, + { + "epoch": 0.7874253935381788, + "grad_norm": 0.198894664645195, + "learning_rate": 1.990832862782116e-05, + "loss": 1.273, + "step": 2115 + }, + { + "epoch": 0.7877976986887878, + "grad_norm": 0.18598125874996185, + "learning_rate": 1.9908164373386016e-05, + "loss": 1.2714, + "step": 2116 + }, + { + "epoch": 0.7881700038393968, + "grad_norm": 0.18410056829452515, + "learning_rate": 1.990799997260791e-05, + "loss": 1.2774, + "step": 2117 + }, + { + "epoch": 0.7885423089900059, + "grad_norm": 0.17665806412696838, + "learning_rate": 1.9907835425489263e-05, + "loss": 1.2697, + "step": 2118 + }, + { + "epoch": 0.788914614140615, + "grad_norm": 0.18189984560012817, + "learning_rate": 1.990767073203251e-05, + "loss": 1.2582, + "step": 2119 + }, + { + "epoch": 0.7892869192912241, + "grad_norm": 0.18040038645267487, + "learning_rate": 1.9907505892240084e-05, + "loss": 1.2587, + "step": 2120 + }, + { + "epoch": 0.7896592244418331, + "grad_norm": 0.18493278324604034, + "learning_rate": 1.9907340906114418e-05, + "loss": 1.2563, + "step": 2121 + }, + { + "epoch": 0.7900315295924422, + "grad_norm": 0.19034142792224884, + "learning_rate": 1.9907175773657945e-05, + "loss": 1.2633, + "step": 2122 + }, + { + "epoch": 0.7904038347430513, + "grad_norm": 0.18734246492385864, + "learning_rate": 1.990701049487311e-05, + "loss": 1.2665, + "step": 2123 + }, + { + "epoch": 0.7907761398936604, + "grad_norm": 0.18347962200641632, + "learning_rate": 1.9906845069762352e-05, + "loss": 1.2608, + "step": 2124 + }, + { + "epoch": 0.7911484450442694, + "grad_norm": 0.18983064591884613, + "learning_rate": 1.9906679498328114e-05, + "loss": 1.2524, + "step": 2125 + }, + { + "epoch": 0.7915207501948784, + "grad_norm": 0.17927458882331848, + "learning_rate": 1.990651378057284e-05, + "loss": 1.2826, + "step": 2126 + }, + { + "epoch": 0.7918930553454876, + "grad_norm": 0.1906890869140625, + "learning_rate": 1.990634791649898e-05, + "loss": 1.2494, + "step": 2127 + }, + { + "epoch": 0.7922653604960966, + "grad_norm": 0.19961805641651154, + "learning_rate": 1.9906181906108983e-05, + "loss": 1.2436, + "step": 2128 + }, + { + "epoch": 0.7926376656467057, + "grad_norm": 0.18504062294960022, + "learning_rate": 1.9906015749405302e-05, + "loss": 1.2439, + "step": 2129 + }, + { + "epoch": 0.7930099707973147, + "grad_norm": 0.18666766583919525, + "learning_rate": 1.9905849446390387e-05, + "loss": 1.2538, + "step": 2130 + }, + { + "epoch": 0.7933822759479238, + "grad_norm": 0.19338330626487732, + "learning_rate": 1.99056829970667e-05, + "loss": 1.2609, + "step": 2131 + }, + { + "epoch": 0.7937545810985329, + "grad_norm": 0.19320982694625854, + "learning_rate": 1.9905516401436698e-05, + "loss": 1.2667, + "step": 2132 + }, + { + "epoch": 0.794126886249142, + "grad_norm": 0.18429437279701233, + "learning_rate": 1.9905349659502836e-05, + "loss": 1.2662, + "step": 2133 + }, + { + "epoch": 0.794499191399751, + "grad_norm": 0.1833900511264801, + "learning_rate": 1.9905182771267583e-05, + "loss": 1.2662, + "step": 2134 + }, + { + "epoch": 0.79487149655036, + "grad_norm": 0.19020430743694305, + "learning_rate": 1.9905015736733406e-05, + "loss": 1.2428, + "step": 2135 + }, + { + "epoch": 0.7952438017009692, + "grad_norm": 0.18528495728969574, + "learning_rate": 1.9904848555902764e-05, + "loss": 1.2643, + "step": 2136 + }, + { + "epoch": 0.7956161068515782, + "grad_norm": 0.1845444291830063, + "learning_rate": 1.990468122877813e-05, + "loss": 1.2741, + "step": 2137 + }, + { + "epoch": 0.7959884120021873, + "grad_norm": 0.1760178506374359, + "learning_rate": 1.9904513755361978e-05, + "loss": 1.2555, + "step": 2138 + }, + { + "epoch": 0.7963607171527963, + "grad_norm": 0.1901838481426239, + "learning_rate": 1.990434613565678e-05, + "loss": 1.2593, + "step": 2139 + }, + { + "epoch": 0.7967330223034055, + "grad_norm": 0.19051726162433624, + "learning_rate": 1.990417836966501e-05, + "loss": 1.2585, + "step": 2140 + }, + { + "epoch": 0.7971053274540145, + "grad_norm": 0.17843446135520935, + "learning_rate": 1.9904010457389144e-05, + "loss": 1.2613, + "step": 2141 + }, + { + "epoch": 0.7974776326046236, + "grad_norm": 0.16943950951099396, + "learning_rate": 1.990384239883167e-05, + "loss": 1.2496, + "step": 2142 + }, + { + "epoch": 0.7978499377552326, + "grad_norm": 0.19439001381397247, + "learning_rate": 1.9903674193995064e-05, + "loss": 1.2617, + "step": 2143 + }, + { + "epoch": 0.7982222429058417, + "grad_norm": 0.17899306118488312, + "learning_rate": 1.990350584288181e-05, + "loss": 1.2463, + "step": 2144 + }, + { + "epoch": 0.7985945480564508, + "grad_norm": 0.1824941188097, + "learning_rate": 1.99033373454944e-05, + "loss": 1.243, + "step": 2145 + }, + { + "epoch": 0.7989668532070598, + "grad_norm": 0.1979120671749115, + "learning_rate": 1.9903168701835314e-05, + "loss": 1.2497, + "step": 2146 + }, + { + "epoch": 0.7993391583576689, + "grad_norm": 0.18462349474430084, + "learning_rate": 1.990299991190705e-05, + "loss": 1.2597, + "step": 2147 + }, + { + "epoch": 0.7997114635082779, + "grad_norm": 0.18362213671207428, + "learning_rate": 1.9902830975712096e-05, + "loss": 1.2657, + "step": 2148 + }, + { + "epoch": 0.8000837686588871, + "grad_norm": 0.18074361979961395, + "learning_rate": 1.9902661893252955e-05, + "loss": 1.263, + "step": 2149 + }, + { + "epoch": 0.8004560738094961, + "grad_norm": 0.1755630373954773, + "learning_rate": 1.9902492664532116e-05, + "loss": 1.2558, + "step": 2150 + }, + { + "epoch": 0.8008283789601052, + "grad_norm": 0.1772383749485016, + "learning_rate": 1.9902323289552084e-05, + "loss": 1.2668, + "step": 2151 + }, + { + "epoch": 0.8012006841107142, + "grad_norm": 0.18553152680397034, + "learning_rate": 1.9902153768315355e-05, + "loss": 1.2548, + "step": 2152 + }, + { + "epoch": 0.8015729892613234, + "grad_norm": 0.18078090250492096, + "learning_rate": 1.9901984100824442e-05, + "loss": 1.2535, + "step": 2153 + }, + { + "epoch": 0.8019452944119324, + "grad_norm": 0.19977954030036926, + "learning_rate": 1.990181428708184e-05, + "loss": 1.2606, + "step": 2154 + }, + { + "epoch": 0.8023175995625415, + "grad_norm": 0.18592652678489685, + "learning_rate": 1.9901644327090063e-05, + "loss": 1.2663, + "step": 2155 + }, + { + "epoch": 0.8026899047131505, + "grad_norm": 0.17959389090538025, + "learning_rate": 1.9901474220851625e-05, + "loss": 1.2591, + "step": 2156 + }, + { + "epoch": 0.8030622098637595, + "grad_norm": 0.19428277015686035, + "learning_rate": 1.9901303968369028e-05, + "loss": 1.2703, + "step": 2157 + }, + { + "epoch": 0.8034345150143687, + "grad_norm": 0.17926783859729767, + "learning_rate": 1.9901133569644794e-05, + "loss": 1.2682, + "step": 2158 + }, + { + "epoch": 0.8038068201649777, + "grad_norm": 0.18147458136081696, + "learning_rate": 1.9900963024681442e-05, + "loss": 1.2536, + "step": 2159 + }, + { + "epoch": 0.8041791253155868, + "grad_norm": 0.18848387897014618, + "learning_rate": 1.990079233348149e-05, + "loss": 1.2543, + "step": 2160 + }, + { + "epoch": 0.8045514304661958, + "grad_norm": 0.20593321323394775, + "learning_rate": 1.990062149604745e-05, + "loss": 1.2601, + "step": 2161 + }, + { + "epoch": 0.804923735616805, + "grad_norm": 0.17839789390563965, + "learning_rate": 1.990045051238185e-05, + "loss": 1.253, + "step": 2162 + }, + { + "epoch": 0.805296040767414, + "grad_norm": 0.18125569820404053, + "learning_rate": 1.9900279382487223e-05, + "loss": 1.2479, + "step": 2163 + }, + { + "epoch": 0.805668345918023, + "grad_norm": 0.1832691729068756, + "learning_rate": 1.990010810636609e-05, + "loss": 1.2563, + "step": 2164 + }, + { + "epoch": 0.8060406510686321, + "grad_norm": 0.18866664171218872, + "learning_rate": 1.9899936684020983e-05, + "loss": 1.2379, + "step": 2165 + }, + { + "epoch": 0.8064129562192413, + "grad_norm": 0.17657117545604706, + "learning_rate": 1.989976511545443e-05, + "loss": 1.2568, + "step": 2166 + }, + { + "epoch": 0.8067852613698503, + "grad_norm": 0.17251810431480408, + "learning_rate": 1.989959340066897e-05, + "loss": 1.2435, + "step": 2167 + }, + { + "epoch": 0.8071575665204593, + "grad_norm": 0.1733696609735489, + "learning_rate": 1.9899421539667132e-05, + "loss": 1.2434, + "step": 2168 + }, + { + "epoch": 0.8075298716710684, + "grad_norm": 0.1835404336452484, + "learning_rate": 1.989924953245146e-05, + "loss": 1.2721, + "step": 2169 + }, + { + "epoch": 0.8079021768216774, + "grad_norm": 0.18328076601028442, + "learning_rate": 1.9899077379024497e-05, + "loss": 1.2567, + "step": 2170 + }, + { + "epoch": 0.8082744819722866, + "grad_norm": 0.18229638040065765, + "learning_rate": 1.989890507938878e-05, + "loss": 1.2604, + "step": 2171 + }, + { + "epoch": 0.8086467871228956, + "grad_norm": 0.18881352245807648, + "learning_rate": 1.989873263354686e-05, + "loss": 1.2579, + "step": 2172 + }, + { + "epoch": 0.8090190922735047, + "grad_norm": 0.179013192653656, + "learning_rate": 1.9898560041501277e-05, + "loss": 1.2631, + "step": 2173 + }, + { + "epoch": 0.8093913974241137, + "grad_norm": 0.18358515202999115, + "learning_rate": 1.9898387303254584e-05, + "loss": 1.2431, + "step": 2174 + }, + { + "epoch": 0.8097637025747229, + "grad_norm": 0.179754838347435, + "learning_rate": 1.989821441880933e-05, + "loss": 1.2645, + "step": 2175 + }, + { + "epoch": 0.8101360077253319, + "grad_norm": 0.18313923478126526, + "learning_rate": 1.989804138816807e-05, + "loss": 1.2684, + "step": 2176 + }, + { + "epoch": 0.8105083128759409, + "grad_norm": 0.19836069643497467, + "learning_rate": 1.9897868211333362e-05, + "loss": 1.2612, + "step": 2177 + }, + { + "epoch": 0.81088061802655, + "grad_norm": 0.19085316359996796, + "learning_rate": 1.9897694888307763e-05, + "loss": 1.2471, + "step": 2178 + }, + { + "epoch": 0.811252923177159, + "grad_norm": 0.18687599897384644, + "learning_rate": 1.9897521419093828e-05, + "loss": 1.2446, + "step": 2179 + }, + { + "epoch": 0.8116252283277682, + "grad_norm": 0.1764868050813675, + "learning_rate": 1.9897347803694126e-05, + "loss": 1.2665, + "step": 2180 + }, + { + "epoch": 0.8119975334783772, + "grad_norm": 0.2134658545255661, + "learning_rate": 1.9897174042111214e-05, + "loss": 1.2447, + "step": 2181 + }, + { + "epoch": 0.8123698386289863, + "grad_norm": 0.1877439022064209, + "learning_rate": 1.9897000134347665e-05, + "loss": 1.2686, + "step": 2182 + }, + { + "epoch": 0.8127421437795953, + "grad_norm": 0.1730710119009018, + "learning_rate": 1.9896826080406046e-05, + "loss": 1.2661, + "step": 2183 + }, + { + "epoch": 0.8131144489302045, + "grad_norm": 0.2091267704963684, + "learning_rate": 1.9896651880288926e-05, + "loss": 1.2707, + "step": 2184 + }, + { + "epoch": 0.8134867540808135, + "grad_norm": 0.19189974665641785, + "learning_rate": 1.9896477533998883e-05, + "loss": 1.2617, + "step": 2185 + }, + { + "epoch": 0.8138590592314225, + "grad_norm": 0.18066149950027466, + "learning_rate": 1.989630304153848e-05, + "loss": 1.2609, + "step": 2186 + }, + { + "epoch": 0.8142313643820316, + "grad_norm": 0.2049744427204132, + "learning_rate": 1.9896128402910307e-05, + "loss": 1.2573, + "step": 2187 + }, + { + "epoch": 0.8146036695326407, + "grad_norm": 0.1748519241809845, + "learning_rate": 1.9895953618116935e-05, + "loss": 1.2437, + "step": 2188 + }, + { + "epoch": 0.8149759746832498, + "grad_norm": 0.17954249680042267, + "learning_rate": 1.9895778687160954e-05, + "loss": 1.2665, + "step": 2189 + }, + { + "epoch": 0.8153482798338588, + "grad_norm": 0.18017897009849548, + "learning_rate": 1.989560361004494e-05, + "loss": 1.2545, + "step": 2190 + }, + { + "epoch": 0.8157205849844679, + "grad_norm": 0.20069225132465363, + "learning_rate": 1.9895428386771482e-05, + "loss": 1.2736, + "step": 2191 + }, + { + "epoch": 0.8160928901350769, + "grad_norm": 0.17469745874404907, + "learning_rate": 1.989525301734317e-05, + "loss": 1.2449, + "step": 2192 + }, + { + "epoch": 0.8164651952856861, + "grad_norm": 0.18415339291095734, + "learning_rate": 1.9895077501762588e-05, + "loss": 1.2507, + "step": 2193 + }, + { + "epoch": 0.8168375004362951, + "grad_norm": 0.18301935493946075, + "learning_rate": 1.9894901840032336e-05, + "loss": 1.254, + "step": 2194 + }, + { + "epoch": 0.8172098055869041, + "grad_norm": 0.18746277689933777, + "learning_rate": 1.9894726032155e-05, + "loss": 1.2621, + "step": 2195 + }, + { + "epoch": 0.8175821107375132, + "grad_norm": 0.17902565002441406, + "learning_rate": 1.9894550078133186e-05, + "loss": 1.267, + "step": 2196 + }, + { + "epoch": 0.8179544158881223, + "grad_norm": 0.1712873876094818, + "learning_rate": 1.9894373977969486e-05, + "loss": 1.2545, + "step": 2197 + }, + { + "epoch": 0.8183267210387314, + "grad_norm": 0.1745031774044037, + "learning_rate": 1.9894197731666506e-05, + "loss": 1.244, + "step": 2198 + }, + { + "epoch": 0.8186990261893404, + "grad_norm": 0.18271346390247345, + "learning_rate": 1.9894021339226843e-05, + "loss": 1.263, + "step": 2199 + }, + { + "epoch": 0.8190713313399495, + "grad_norm": 0.17454905807971954, + "learning_rate": 1.989384480065311e-05, + "loss": 1.2636, + "step": 2200 + }, + { + "epoch": 0.8194436364905586, + "grad_norm": 0.17742374539375305, + "learning_rate": 1.9893668115947906e-05, + "loss": 1.2489, + "step": 2201 + }, + { + "epoch": 0.8198159416411677, + "grad_norm": 0.17203061282634735, + "learning_rate": 1.9893491285113845e-05, + "loss": 1.248, + "step": 2202 + }, + { + "epoch": 0.8201882467917767, + "grad_norm": 0.1837659776210785, + "learning_rate": 1.989331430815354e-05, + "loss": 1.2582, + "step": 2203 + }, + { + "epoch": 0.8205605519423858, + "grad_norm": 0.17900030314922333, + "learning_rate": 1.9893137185069603e-05, + "loss": 1.2472, + "step": 2204 + }, + { + "epoch": 0.8209328570929948, + "grad_norm": 0.17265020310878754, + "learning_rate": 1.9892959915864652e-05, + "loss": 1.2608, + "step": 2205 + }, + { + "epoch": 0.821305162243604, + "grad_norm": 0.1822979599237442, + "learning_rate": 1.9892782500541304e-05, + "loss": 1.2477, + "step": 2206 + }, + { + "epoch": 0.821677467394213, + "grad_norm": 0.18672309815883636, + "learning_rate": 1.9892604939102177e-05, + "loss": 1.2489, + "step": 2207 + }, + { + "epoch": 0.822049772544822, + "grad_norm": 0.18067359924316406, + "learning_rate": 1.9892427231549897e-05, + "loss": 1.2511, + "step": 2208 + }, + { + "epoch": 0.8224220776954311, + "grad_norm": 0.18508952856063843, + "learning_rate": 1.9892249377887086e-05, + "loss": 1.2522, + "step": 2209 + }, + { + "epoch": 0.8227943828460402, + "grad_norm": 0.17746296525001526, + "learning_rate": 1.9892071378116378e-05, + "loss": 1.2487, + "step": 2210 + }, + { + "epoch": 0.8231666879966493, + "grad_norm": 0.1730239987373352, + "learning_rate": 1.9891893232240394e-05, + "loss": 1.2631, + "step": 2211 + }, + { + "epoch": 0.8235389931472583, + "grad_norm": 0.1845473349094391, + "learning_rate": 1.9891714940261764e-05, + "loss": 1.2595, + "step": 2212 + }, + { + "epoch": 0.8239112982978674, + "grad_norm": 0.18520519137382507, + "learning_rate": 1.989153650218313e-05, + "loss": 1.2479, + "step": 2213 + }, + { + "epoch": 0.8242836034484765, + "grad_norm": 0.19345776736736298, + "learning_rate": 1.989135791800712e-05, + "loss": 1.2744, + "step": 2214 + }, + { + "epoch": 0.8246559085990856, + "grad_norm": 0.17609384655952454, + "learning_rate": 1.9891179187736375e-05, + "loss": 1.2465, + "step": 2215 + }, + { + "epoch": 0.8250282137496946, + "grad_norm": 0.1815994530916214, + "learning_rate": 1.9891000311373533e-05, + "loss": 1.2523, + "step": 2216 + }, + { + "epoch": 0.8254005189003036, + "grad_norm": 0.19329750537872314, + "learning_rate": 1.9890821288921238e-05, + "loss": 1.2651, + "step": 2217 + }, + { + "epoch": 0.8257728240509127, + "grad_norm": 0.18610511720180511, + "learning_rate": 1.9890642120382132e-05, + "loss": 1.2453, + "step": 2218 + }, + { + "epoch": 0.8261451292015218, + "grad_norm": 0.1695135533809662, + "learning_rate": 1.9890462805758863e-05, + "loss": 1.2745, + "step": 2219 + }, + { + "epoch": 0.8265174343521309, + "grad_norm": 0.18071943521499634, + "learning_rate": 1.9890283345054082e-05, + "loss": 1.2737, + "step": 2220 + }, + { + "epoch": 0.8268897395027399, + "grad_norm": 0.18198001384735107, + "learning_rate": 1.9890103738270433e-05, + "loss": 1.2495, + "step": 2221 + }, + { + "epoch": 0.827262044653349, + "grad_norm": 0.1641894280910492, + "learning_rate": 1.9889923985410576e-05, + "loss": 1.2556, + "step": 2222 + }, + { + "epoch": 0.8276343498039581, + "grad_norm": 0.18489384651184082, + "learning_rate": 1.9889744086477162e-05, + "loss": 1.2591, + "step": 2223 + }, + { + "epoch": 0.8280066549545672, + "grad_norm": 0.1797334849834442, + "learning_rate": 1.9889564041472846e-05, + "loss": 1.2376, + "step": 2224 + }, + { + "epoch": 0.8283789601051762, + "grad_norm": 0.16851265728473663, + "learning_rate": 1.988938385040029e-05, + "loss": 1.2442, + "step": 2225 + }, + { + "epoch": 0.8287512652557852, + "grad_norm": 0.17845316231250763, + "learning_rate": 1.9889203513262153e-05, + "loss": 1.2432, + "step": 2226 + }, + { + "epoch": 0.8291235704063944, + "grad_norm": 0.18235254287719727, + "learning_rate": 1.9889023030061106e-05, + "loss": 1.2593, + "step": 2227 + }, + { + "epoch": 0.8294958755570034, + "grad_norm": 0.1719173640012741, + "learning_rate": 1.9888842400799805e-05, + "loss": 1.2567, + "step": 2228 + }, + { + "epoch": 0.8298681807076125, + "grad_norm": 0.17865481972694397, + "learning_rate": 1.9888661625480927e-05, + "loss": 1.2712, + "step": 2229 + }, + { + "epoch": 0.8302404858582215, + "grad_norm": 0.1867910474538803, + "learning_rate": 1.9888480704107135e-05, + "loss": 1.2432, + "step": 2230 + }, + { + "epoch": 0.8306127910088306, + "grad_norm": 0.17481322586536407, + "learning_rate": 1.9888299636681105e-05, + "loss": 1.2538, + "step": 2231 + }, + { + "epoch": 0.8309850961594397, + "grad_norm": 0.18114839494228363, + "learning_rate": 1.9888118423205504e-05, + "loss": 1.2521, + "step": 2232 + }, + { + "epoch": 0.8313574013100488, + "grad_norm": 0.17833612859249115, + "learning_rate": 1.988793706368302e-05, + "loss": 1.2607, + "step": 2233 + }, + { + "epoch": 0.8317297064606578, + "grad_norm": 0.17668797075748444, + "learning_rate": 1.9887755558116324e-05, + "loss": 1.2573, + "step": 2234 + }, + { + "epoch": 0.8321020116112668, + "grad_norm": 0.1729530692100525, + "learning_rate": 1.9887573906508103e-05, + "loss": 1.2558, + "step": 2235 + }, + { + "epoch": 0.832474316761876, + "grad_norm": 0.19922944903373718, + "learning_rate": 1.988739210886103e-05, + "loss": 1.2743, + "step": 2236 + }, + { + "epoch": 0.832846621912485, + "grad_norm": 0.18021075427532196, + "learning_rate": 1.98872101651778e-05, + "loss": 1.2672, + "step": 2237 + }, + { + "epoch": 0.8332189270630941, + "grad_norm": 0.1811213493347168, + "learning_rate": 1.9887028075461096e-05, + "loss": 1.247, + "step": 2238 + }, + { + "epoch": 0.8335912322137031, + "grad_norm": 0.18018268048763275, + "learning_rate": 1.9886845839713604e-05, + "loss": 1.2451, + "step": 2239 + }, + { + "epoch": 0.8339635373643123, + "grad_norm": 0.1896362155675888, + "learning_rate": 1.9886663457938025e-05, + "loss": 1.2558, + "step": 2240 + }, + { + "epoch": 0.8343358425149213, + "grad_norm": 0.1847160905599594, + "learning_rate": 1.9886480930137046e-05, + "loss": 1.2619, + "step": 2241 + }, + { + "epoch": 0.8347081476655304, + "grad_norm": 0.17278259992599487, + "learning_rate": 1.988629825631336e-05, + "loss": 1.258, + "step": 2242 + }, + { + "epoch": 0.8350804528161394, + "grad_norm": 0.17939510941505432, + "learning_rate": 1.9886115436469674e-05, + "loss": 1.2665, + "step": 2243 + }, + { + "epoch": 0.8354527579667484, + "grad_norm": 0.17530454695224762, + "learning_rate": 1.9885932470608676e-05, + "loss": 1.2531, + "step": 2244 + }, + { + "epoch": 0.8358250631173576, + "grad_norm": 0.20562461018562317, + "learning_rate": 1.9885749358733086e-05, + "loss": 1.2467, + "step": 2245 + }, + { + "epoch": 0.8361973682679666, + "grad_norm": 0.17135387659072876, + "learning_rate": 1.988556610084559e-05, + "loss": 1.2486, + "step": 2246 + }, + { + "epoch": 0.8365696734185757, + "grad_norm": 0.18840156495571136, + "learning_rate": 1.9885382696948906e-05, + "loss": 1.2536, + "step": 2247 + }, + { + "epoch": 0.8369419785691847, + "grad_norm": 0.1828533560037613, + "learning_rate": 1.988519914704574e-05, + "loss": 1.2581, + "step": 2248 + }, + { + "epoch": 0.8373142837197939, + "grad_norm": 0.1792685091495514, + "learning_rate": 1.988501545113881e-05, + "loss": 1.2597, + "step": 2249 + }, + { + "epoch": 0.8376865888704029, + "grad_norm": 0.18444480001926422, + "learning_rate": 1.9884831609230813e-05, + "loss": 1.2414, + "step": 2250 + }, + { + "epoch": 0.838058894021012, + "grad_norm": 0.18098483979701996, + "learning_rate": 1.9884647621324475e-05, + "loss": 1.2448, + "step": 2251 + }, + { + "epoch": 0.838431199171621, + "grad_norm": 0.1816195845603943, + "learning_rate": 1.9884463487422515e-05, + "loss": 1.2499, + "step": 2252 + }, + { + "epoch": 0.83880350432223, + "grad_norm": 0.19393447041511536, + "learning_rate": 1.9884279207527647e-05, + "loss": 1.2584, + "step": 2253 + }, + { + "epoch": 0.8391758094728392, + "grad_norm": 0.18464012444019318, + "learning_rate": 1.9884094781642592e-05, + "loss": 1.2544, + "step": 2254 + }, + { + "epoch": 0.8395481146234482, + "grad_norm": 0.17956916987895966, + "learning_rate": 1.9883910209770083e-05, + "loss": 1.2602, + "step": 2255 + }, + { + "epoch": 0.8399204197740573, + "grad_norm": 0.18106333911418915, + "learning_rate": 1.988372549191284e-05, + "loss": 1.2553, + "step": 2256 + }, + { + "epoch": 0.8402927249246663, + "grad_norm": 0.18379664421081543, + "learning_rate": 1.9883540628073592e-05, + "loss": 1.2517, + "step": 2257 + }, + { + "epoch": 0.8406650300752755, + "grad_norm": 0.17581118643283844, + "learning_rate": 1.9883355618255068e-05, + "loss": 1.2534, + "step": 2258 + }, + { + "epoch": 0.8410373352258845, + "grad_norm": 0.17392081022262573, + "learning_rate": 1.988317046246e-05, + "loss": 1.2616, + "step": 2259 + }, + { + "epoch": 0.8414096403764936, + "grad_norm": 0.1733708381652832, + "learning_rate": 1.9882985160691127e-05, + "loss": 1.2549, + "step": 2260 + }, + { + "epoch": 0.8417819455271026, + "grad_norm": 0.1678503155708313, + "learning_rate": 1.9882799712951182e-05, + "loss": 1.2653, + "step": 2261 + }, + { + "epoch": 0.8421542506777118, + "grad_norm": 0.17663797736167908, + "learning_rate": 1.9882614119242906e-05, + "loss": 1.2556, + "step": 2262 + }, + { + "epoch": 0.8425265558283208, + "grad_norm": 0.17948319017887115, + "learning_rate": 1.988242837956904e-05, + "loss": 1.2459, + "step": 2263 + }, + { + "epoch": 0.8428988609789299, + "grad_norm": 0.17682886123657227, + "learning_rate": 1.9882242493932327e-05, + "loss": 1.2554, + "step": 2264 + }, + { + "epoch": 0.8432711661295389, + "grad_norm": 0.17694690823554993, + "learning_rate": 1.9882056462335513e-05, + "loss": 1.2536, + "step": 2265 + }, + { + "epoch": 0.8436434712801479, + "grad_norm": 0.18028870224952698, + "learning_rate": 1.9881870284781345e-05, + "loss": 1.2572, + "step": 2266 + }, + { + "epoch": 0.8440157764307571, + "grad_norm": 0.17367003858089447, + "learning_rate": 1.9881683961272572e-05, + "loss": 1.2496, + "step": 2267 + }, + { + "epoch": 0.8443880815813661, + "grad_norm": 0.18058733642101288, + "learning_rate": 1.988149749181195e-05, + "loss": 1.2648, + "step": 2268 + }, + { + "epoch": 0.8447603867319752, + "grad_norm": 0.1787886917591095, + "learning_rate": 1.9881310876402225e-05, + "loss": 1.2452, + "step": 2269 + }, + { + "epoch": 0.8451326918825842, + "grad_norm": 0.17933231592178345, + "learning_rate": 1.988112411504616e-05, + "loss": 1.2531, + "step": 2270 + }, + { + "epoch": 0.8455049970331934, + "grad_norm": 0.16929695010185242, + "learning_rate": 1.9880937207746515e-05, + "loss": 1.2432, + "step": 2271 + }, + { + "epoch": 0.8458773021838024, + "grad_norm": 0.17460580170154572, + "learning_rate": 1.9880750154506048e-05, + "loss": 1.2518, + "step": 2272 + }, + { + "epoch": 0.8462496073344115, + "grad_norm": 0.19186393916606903, + "learning_rate": 1.9880562955327516e-05, + "loss": 1.2534, + "step": 2273 + }, + { + "epoch": 0.8466219124850205, + "grad_norm": 0.17282183468341827, + "learning_rate": 1.9880375610213694e-05, + "loss": 1.241, + "step": 2274 + }, + { + "epoch": 0.8469942176356297, + "grad_norm": 0.16319245100021362, + "learning_rate": 1.9880188119167345e-05, + "loss": 1.2538, + "step": 2275 + }, + { + "epoch": 0.8473665227862387, + "grad_norm": 0.1815318912267685, + "learning_rate": 1.988000048219123e-05, + "loss": 1.2561, + "step": 2276 + }, + { + "epoch": 0.8477388279368477, + "grad_norm": 0.17304065823554993, + "learning_rate": 1.9879812699288136e-05, + "loss": 1.2545, + "step": 2277 + }, + { + "epoch": 0.8481111330874568, + "grad_norm": 0.17184028029441833, + "learning_rate": 1.9879624770460827e-05, + "loss": 1.2637, + "step": 2278 + }, + { + "epoch": 0.8484834382380658, + "grad_norm": 0.1777077466249466, + "learning_rate": 1.9879436695712076e-05, + "loss": 1.2599, + "step": 2279 + }, + { + "epoch": 0.848855743388675, + "grad_norm": 0.17770980298519135, + "learning_rate": 1.9879248475044668e-05, + "loss": 1.2289, + "step": 2280 + }, + { + "epoch": 0.849228048539284, + "grad_norm": 0.1785004436969757, + "learning_rate": 1.987906010846138e-05, + "loss": 1.2577, + "step": 2281 + }, + { + "epoch": 0.8496003536898931, + "grad_norm": 0.16683989763259888, + "learning_rate": 1.9878871595964993e-05, + "loss": 1.2454, + "step": 2282 + }, + { + "epoch": 0.8499726588405021, + "grad_norm": 0.18355853855609894, + "learning_rate": 1.9878682937558297e-05, + "loss": 1.284, + "step": 2283 + }, + { + "epoch": 0.8503449639911113, + "grad_norm": 0.1772661954164505, + "learning_rate": 1.987849413324407e-05, + "loss": 1.2512, + "step": 2284 + }, + { + "epoch": 0.8507172691417203, + "grad_norm": 0.17306406795978546, + "learning_rate": 1.9878305183025103e-05, + "loss": 1.2589, + "step": 2285 + }, + { + "epoch": 0.8510895742923293, + "grad_norm": 0.16759039461612701, + "learning_rate": 1.987811608690419e-05, + "loss": 1.2458, + "step": 2286 + }, + { + "epoch": 0.8514618794429384, + "grad_norm": 0.17323878407478333, + "learning_rate": 1.9877926844884126e-05, + "loss": 1.2559, + "step": 2287 + }, + { + "epoch": 0.8518341845935475, + "grad_norm": 0.16814987361431122, + "learning_rate": 1.9877737456967698e-05, + "loss": 1.2589, + "step": 2288 + }, + { + "epoch": 0.8522064897441566, + "grad_norm": 0.16971348226070404, + "learning_rate": 1.987754792315771e-05, + "loss": 1.2541, + "step": 2289 + }, + { + "epoch": 0.8525787948947656, + "grad_norm": 0.16793325543403625, + "learning_rate": 1.9877358243456956e-05, + "loss": 1.2716, + "step": 2290 + }, + { + "epoch": 0.8529511000453747, + "grad_norm": 0.17254911363124847, + "learning_rate": 1.9877168417868244e-05, + "loss": 1.253, + "step": 2291 + }, + { + "epoch": 0.8533234051959837, + "grad_norm": 0.17216616868972778, + "learning_rate": 1.9876978446394372e-05, + "loss": 1.2495, + "step": 2292 + }, + { + "epoch": 0.8536957103465929, + "grad_norm": 0.1849498599767685, + "learning_rate": 1.9876788329038147e-05, + "loss": 1.2736, + "step": 2293 + }, + { + "epoch": 0.8540680154972019, + "grad_norm": 0.17307497560977936, + "learning_rate": 1.987659806580238e-05, + "loss": 1.2549, + "step": 2294 + }, + { + "epoch": 0.854440320647811, + "grad_norm": 0.17321421205997467, + "learning_rate": 1.9876407656689883e-05, + "loss": 1.2634, + "step": 2295 + }, + { + "epoch": 0.85481262579842, + "grad_norm": 0.17435167729854584, + "learning_rate": 1.987621710170346e-05, + "loss": 1.2443, + "step": 2296 + }, + { + "epoch": 0.8551849309490291, + "grad_norm": 0.1697143167257309, + "learning_rate": 1.9876026400845933e-05, + "loss": 1.2621, + "step": 2297 + }, + { + "epoch": 0.8555572360996382, + "grad_norm": 0.18251660466194153, + "learning_rate": 1.9875835554120114e-05, + "loss": 1.2588, + "step": 2298 + }, + { + "epoch": 0.8559295412502472, + "grad_norm": 0.182033970952034, + "learning_rate": 1.9875644561528824e-05, + "loss": 1.2656, + "step": 2299 + }, + { + "epoch": 0.8563018464008563, + "grad_norm": 0.19597557187080383, + "learning_rate": 1.9875453423074883e-05, + "loss": 1.2545, + "step": 2300 + }, + { + "epoch": 0.8566741515514654, + "grad_norm": 0.17287831008434296, + "learning_rate": 1.9875262138761116e-05, + "loss": 1.2662, + "step": 2301 + }, + { + "epoch": 0.8570464567020745, + "grad_norm": 0.18034107983112335, + "learning_rate": 1.987507070859035e-05, + "loss": 1.2561, + "step": 2302 + }, + { + "epoch": 0.8574187618526835, + "grad_norm": 0.1734304428100586, + "learning_rate": 1.98748791325654e-05, + "loss": 1.2659, + "step": 2303 + }, + { + "epoch": 0.8577910670032926, + "grad_norm": 0.18945597112178802, + "learning_rate": 1.9874687410689114e-05, + "loss": 1.265, + "step": 2304 + }, + { + "epoch": 0.8581633721539016, + "grad_norm": 0.1864440143108368, + "learning_rate": 1.9874495542964308e-05, + "loss": 1.2634, + "step": 2305 + }, + { + "epoch": 0.8585356773045107, + "grad_norm": 0.1847706139087677, + "learning_rate": 1.987430352939383e-05, + "loss": 1.2572, + "step": 2306 + }, + { + "epoch": 0.8589079824551198, + "grad_norm": 0.18453647196292877, + "learning_rate": 1.98741113699805e-05, + "loss": 1.2501, + "step": 2307 + }, + { + "epoch": 0.8592802876057288, + "grad_norm": 0.17989476025104523, + "learning_rate": 1.9873919064727173e-05, + "loss": 1.2589, + "step": 2308 + }, + { + "epoch": 0.8596525927563379, + "grad_norm": 0.19050173461437225, + "learning_rate": 1.9873726613636678e-05, + "loss": 1.2489, + "step": 2309 + }, + { + "epoch": 0.860024897906947, + "grad_norm": 0.17572608590126038, + "learning_rate": 1.987353401671186e-05, + "loss": 1.2375, + "step": 2310 + }, + { + "epoch": 0.8603972030575561, + "grad_norm": 0.17381906509399414, + "learning_rate": 1.987334127395556e-05, + "loss": 1.2631, + "step": 2311 + }, + { + "epoch": 0.8607695082081651, + "grad_norm": 0.17476747930049896, + "learning_rate": 1.9873148385370635e-05, + "loss": 1.2418, + "step": 2312 + }, + { + "epoch": 0.8611418133587742, + "grad_norm": 0.1777612864971161, + "learning_rate": 1.9872955350959927e-05, + "loss": 1.2502, + "step": 2313 + }, + { + "epoch": 0.8615141185093833, + "grad_norm": 0.17522254586219788, + "learning_rate": 1.9872762170726284e-05, + "loss": 1.2653, + "step": 2314 + }, + { + "epoch": 0.8618864236599924, + "grad_norm": 0.17853045463562012, + "learning_rate": 1.9872568844672567e-05, + "loss": 1.2536, + "step": 2315 + }, + { + "epoch": 0.8622587288106014, + "grad_norm": 0.1815364956855774, + "learning_rate": 1.9872375372801627e-05, + "loss": 1.2518, + "step": 2316 + }, + { + "epoch": 0.8626310339612104, + "grad_norm": 0.1911124587059021, + "learning_rate": 1.9872181755116324e-05, + "loss": 1.2583, + "step": 2317 + }, + { + "epoch": 0.8630033391118195, + "grad_norm": 0.1666967123746872, + "learning_rate": 1.9871987991619516e-05, + "loss": 1.2509, + "step": 2318 + }, + { + "epoch": 0.8633756442624286, + "grad_norm": 0.19720950722694397, + "learning_rate": 1.9871794082314064e-05, + "loss": 1.2615, + "step": 2319 + }, + { + "epoch": 0.8637479494130377, + "grad_norm": 0.18802502751350403, + "learning_rate": 1.987160002720283e-05, + "loss": 1.2608, + "step": 2320 + }, + { + "epoch": 0.8641202545636467, + "grad_norm": 0.18227264285087585, + "learning_rate": 1.9871405826288685e-05, + "loss": 1.2452, + "step": 2321 + }, + { + "epoch": 0.8644925597142558, + "grad_norm": 0.1774458885192871, + "learning_rate": 1.9871211479574497e-05, + "loss": 1.2562, + "step": 2322 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.1868571639060974, + "learning_rate": 1.9871016987063133e-05, + "loss": 1.2485, + "step": 2323 + }, + { + "epoch": 0.865237170015474, + "grad_norm": 0.18438728153705597, + "learning_rate": 1.987082234875747e-05, + "loss": 1.2437, + "step": 2324 + }, + { + "epoch": 0.865609475166083, + "grad_norm": 0.17169415950775146, + "learning_rate": 1.987062756466038e-05, + "loss": 1.2487, + "step": 2325 + }, + { + "epoch": 0.865981780316692, + "grad_norm": 0.1830715835094452, + "learning_rate": 1.9870432634774737e-05, + "loss": 1.2454, + "step": 2326 + }, + { + "epoch": 0.8663540854673011, + "grad_norm": 0.18534304201602936, + "learning_rate": 1.9870237559103426e-05, + "loss": 1.2691, + "step": 2327 + }, + { + "epoch": 0.8667263906179102, + "grad_norm": 0.17846928536891937, + "learning_rate": 1.9870042337649328e-05, + "loss": 1.2524, + "step": 2328 + }, + { + "epoch": 0.8670986957685193, + "grad_norm": 0.17754697799682617, + "learning_rate": 1.9869846970415317e-05, + "loss": 1.2456, + "step": 2329 + }, + { + "epoch": 0.8674710009191283, + "grad_norm": 0.1744270771741867, + "learning_rate": 1.9869651457404293e-05, + "loss": 1.2393, + "step": 2330 + }, + { + "epoch": 0.8678433060697374, + "grad_norm": 0.18277184665203094, + "learning_rate": 1.9869455798619133e-05, + "loss": 1.2672, + "step": 2331 + }, + { + "epoch": 0.8682156112203465, + "grad_norm": 0.1796514391899109, + "learning_rate": 1.986925999406273e-05, + "loss": 1.247, + "step": 2332 + }, + { + "epoch": 0.8685879163709556, + "grad_norm": 0.17978057265281677, + "learning_rate": 1.9869064043737977e-05, + "loss": 1.258, + "step": 2333 + }, + { + "epoch": 0.8689602215215646, + "grad_norm": 0.1840755194425583, + "learning_rate": 1.9868867947647768e-05, + "loss": 1.2558, + "step": 2334 + }, + { + "epoch": 0.8693325266721736, + "grad_norm": 0.18063770234584808, + "learning_rate": 1.9868671705794997e-05, + "loss": 1.2623, + "step": 2335 + }, + { + "epoch": 0.8697048318227828, + "grad_norm": 0.16667230427265167, + "learning_rate": 1.9868475318182566e-05, + "loss": 1.2476, + "step": 2336 + }, + { + "epoch": 0.8700771369733918, + "grad_norm": 0.17838679254055023, + "learning_rate": 1.9868278784813374e-05, + "loss": 1.2663, + "step": 2337 + }, + { + "epoch": 0.8704494421240009, + "grad_norm": 0.1771472841501236, + "learning_rate": 1.9868082105690323e-05, + "loss": 1.2516, + "step": 2338 + }, + { + "epoch": 0.8708217472746099, + "grad_norm": 0.18569402396678925, + "learning_rate": 1.9867885280816317e-05, + "loss": 1.2496, + "step": 2339 + }, + { + "epoch": 0.871194052425219, + "grad_norm": 0.17394711077213287, + "learning_rate": 1.986768831019427e-05, + "loss": 1.2457, + "step": 2340 + }, + { + "epoch": 0.8715663575758281, + "grad_norm": 0.16605480015277863, + "learning_rate": 1.986749119382708e-05, + "loss": 1.2542, + "step": 2341 + }, + { + "epoch": 0.8719386627264372, + "grad_norm": 0.17628324031829834, + "learning_rate": 1.9867293931717664e-05, + "loss": 1.2544, + "step": 2342 + }, + { + "epoch": 0.8723109678770462, + "grad_norm": 0.18517057597637177, + "learning_rate": 1.986709652386894e-05, + "loss": 1.2576, + "step": 2343 + }, + { + "epoch": 0.8726832730276552, + "grad_norm": 0.17365185916423798, + "learning_rate": 1.9866898970283816e-05, + "loss": 1.2472, + "step": 2344 + }, + { + "epoch": 0.8730555781782644, + "grad_norm": 0.17242515087127686, + "learning_rate": 1.9866701270965217e-05, + "loss": 1.2536, + "step": 2345 + }, + { + "epoch": 0.8734278833288734, + "grad_norm": 0.1831977367401123, + "learning_rate": 1.986650342591606e-05, + "loss": 1.2538, + "step": 2346 + }, + { + "epoch": 0.8738001884794825, + "grad_norm": 0.1806264966726303, + "learning_rate": 1.986630543513926e-05, + "loss": 1.2417, + "step": 2347 + }, + { + "epoch": 0.8741724936300915, + "grad_norm": 0.1779744178056717, + "learning_rate": 1.9866107298637754e-05, + "loss": 1.2577, + "step": 2348 + }, + { + "epoch": 0.8745447987807007, + "grad_norm": 0.18145568668842316, + "learning_rate": 1.9865909016414462e-05, + "loss": 1.2498, + "step": 2349 + }, + { + "epoch": 0.8749171039313097, + "grad_norm": 0.1738026887178421, + "learning_rate": 1.9865710588472307e-05, + "loss": 1.2482, + "step": 2350 + }, + { + "epoch": 0.8752894090819188, + "grad_norm": 0.17503541707992554, + "learning_rate": 1.986551201481423e-05, + "loss": 1.2433, + "step": 2351 + }, + { + "epoch": 0.8756617142325278, + "grad_norm": 0.18008525669574738, + "learning_rate": 1.9865313295443164e-05, + "loss": 1.2692, + "step": 2352 + }, + { + "epoch": 0.8760340193831369, + "grad_norm": 0.17630065977573395, + "learning_rate": 1.9865114430362037e-05, + "loss": 1.256, + "step": 2353 + }, + { + "epoch": 0.876406324533746, + "grad_norm": 0.16775432229042053, + "learning_rate": 1.9864915419573787e-05, + "loss": 1.2493, + "step": 2354 + }, + { + "epoch": 0.876778629684355, + "grad_norm": 0.17783991992473602, + "learning_rate": 1.9864716263081356e-05, + "loss": 1.2344, + "step": 2355 + }, + { + "epoch": 0.8771509348349641, + "grad_norm": 0.1752803474664688, + "learning_rate": 1.9864516960887684e-05, + "loss": 1.2532, + "step": 2356 + }, + { + "epoch": 0.8775232399855731, + "grad_norm": 0.17517109215259552, + "learning_rate": 1.9864317512995718e-05, + "loss": 1.2508, + "step": 2357 + }, + { + "epoch": 0.8778955451361823, + "grad_norm": 0.19087643921375275, + "learning_rate": 1.9864117919408397e-05, + "loss": 1.2517, + "step": 2358 + }, + { + "epoch": 0.8782678502867913, + "grad_norm": 0.17448683083057404, + "learning_rate": 1.986391818012868e-05, + "loss": 1.2431, + "step": 2359 + }, + { + "epoch": 0.8786401554374004, + "grad_norm": 0.1711607575416565, + "learning_rate": 1.9863718295159503e-05, + "loss": 1.257, + "step": 2360 + }, + { + "epoch": 0.8790124605880094, + "grad_norm": 0.1857292354106903, + "learning_rate": 1.9863518264503832e-05, + "loss": 1.2477, + "step": 2361 + }, + { + "epoch": 0.8793847657386186, + "grad_norm": 0.18781475722789764, + "learning_rate": 1.9863318088164613e-05, + "loss": 1.2672, + "step": 2362 + }, + { + "epoch": 0.8797570708892276, + "grad_norm": 0.16976489126682281, + "learning_rate": 1.9863117766144807e-05, + "loss": 1.2528, + "step": 2363 + }, + { + "epoch": 0.8801293760398367, + "grad_norm": 0.17341898381710052, + "learning_rate": 1.9862917298447365e-05, + "loss": 1.2625, + "step": 2364 + }, + { + "epoch": 0.8805016811904457, + "grad_norm": 0.18177741765975952, + "learning_rate": 1.986271668507526e-05, + "loss": 1.2602, + "step": 2365 + }, + { + "epoch": 0.8808739863410547, + "grad_norm": 0.1771697849035263, + "learning_rate": 1.9862515926031444e-05, + "loss": 1.2413, + "step": 2366 + }, + { + "epoch": 0.8812462914916639, + "grad_norm": 0.17161764204502106, + "learning_rate": 1.9862315021318886e-05, + "loss": 1.2514, + "step": 2367 + }, + { + "epoch": 0.8816185966422729, + "grad_norm": 0.18902365863323212, + "learning_rate": 1.986211397094056e-05, + "loss": 1.2348, + "step": 2368 + }, + { + "epoch": 0.881990901792882, + "grad_norm": 0.17638647556304932, + "learning_rate": 1.9861912774899425e-05, + "loss": 1.2421, + "step": 2369 + }, + { + "epoch": 0.882363206943491, + "grad_norm": 0.17760038375854492, + "learning_rate": 1.9861711433198457e-05, + "loss": 1.2574, + "step": 2370 + }, + { + "epoch": 0.8827355120941002, + "grad_norm": 0.17845885455608368, + "learning_rate": 1.9861509945840632e-05, + "loss": 1.252, + "step": 2371 + }, + { + "epoch": 0.8831078172447092, + "grad_norm": 0.17632153630256653, + "learning_rate": 1.9861308312828923e-05, + "loss": 1.2648, + "step": 2372 + }, + { + "epoch": 0.8834801223953183, + "grad_norm": 0.17382317781448364, + "learning_rate": 1.9861106534166307e-05, + "loss": 1.2574, + "step": 2373 + }, + { + "epoch": 0.8838524275459273, + "grad_norm": 0.18132738769054413, + "learning_rate": 1.986090460985577e-05, + "loss": 1.2546, + "step": 2374 + }, + { + "epoch": 0.8842247326965365, + "grad_norm": 0.17880843579769135, + "learning_rate": 1.9860702539900288e-05, + "loss": 1.2542, + "step": 2375 + }, + { + "epoch": 0.8845970378471455, + "grad_norm": 0.17678992450237274, + "learning_rate": 1.9860500324302848e-05, + "loss": 1.258, + "step": 2376 + }, + { + "epoch": 0.8849693429977545, + "grad_norm": 0.17159947752952576, + "learning_rate": 1.986029796306644e-05, + "loss": 1.2545, + "step": 2377 + }, + { + "epoch": 0.8853416481483636, + "grad_norm": 0.17294186353683472, + "learning_rate": 1.9860095456194045e-05, + "loss": 1.2572, + "step": 2378 + }, + { + "epoch": 0.8857139532989726, + "grad_norm": 0.17959366738796234, + "learning_rate": 1.9859892803688666e-05, + "loss": 1.2406, + "step": 2379 + }, + { + "epoch": 0.8860862584495818, + "grad_norm": 0.1762249618768692, + "learning_rate": 1.9859690005553282e-05, + "loss": 1.2681, + "step": 2380 + }, + { + "epoch": 0.8864585636001908, + "grad_norm": 0.17363481223583221, + "learning_rate": 1.98594870617909e-05, + "loss": 1.2502, + "step": 2381 + }, + { + "epoch": 0.8868308687507999, + "grad_norm": 0.18337921798229218, + "learning_rate": 1.985928397240451e-05, + "loss": 1.2509, + "step": 2382 + }, + { + "epoch": 0.8872031739014089, + "grad_norm": 0.17734134197235107, + "learning_rate": 1.985908073739712e-05, + "loss": 1.2543, + "step": 2383 + }, + { + "epoch": 0.8875754790520181, + "grad_norm": 0.1737191379070282, + "learning_rate": 1.9858877356771722e-05, + "loss": 1.2543, + "step": 2384 + }, + { + "epoch": 0.8879477842026271, + "grad_norm": 0.18461737036705017, + "learning_rate": 1.9858673830531322e-05, + "loss": 1.2598, + "step": 2385 + }, + { + "epoch": 0.8883200893532361, + "grad_norm": 0.1724756807088852, + "learning_rate": 1.9858470158678932e-05, + "loss": 1.2436, + "step": 2386 + }, + { + "epoch": 0.8886923945038452, + "grad_norm": 0.16471019387245178, + "learning_rate": 1.9858266341217556e-05, + "loss": 1.2428, + "step": 2387 + }, + { + "epoch": 0.8890646996544542, + "grad_norm": 0.1689532995223999, + "learning_rate": 1.9858062378150204e-05, + "loss": 1.2496, + "step": 2388 + }, + { + "epoch": 0.8894370048050634, + "grad_norm": 0.17168131470680237, + "learning_rate": 1.9857858269479887e-05, + "loss": 1.2495, + "step": 2389 + }, + { + "epoch": 0.8898093099556724, + "grad_norm": 0.18134863674640656, + "learning_rate": 1.9857654015209627e-05, + "loss": 1.24, + "step": 2390 + }, + { + "epoch": 0.8901816151062815, + "grad_norm": 0.16769009828567505, + "learning_rate": 1.985744961534243e-05, + "loss": 1.244, + "step": 2391 + }, + { + "epoch": 0.8905539202568905, + "grad_norm": 0.16388821601867676, + "learning_rate": 1.9857245069881326e-05, + "loss": 1.2531, + "step": 2392 + }, + { + "epoch": 0.8909262254074997, + "grad_norm": 0.17608126997947693, + "learning_rate": 1.985704037882933e-05, + "loss": 1.2432, + "step": 2393 + }, + { + "epoch": 0.8912985305581087, + "grad_norm": 0.17434832453727722, + "learning_rate": 1.9856835542189464e-05, + "loss": 1.2555, + "step": 2394 + }, + { + "epoch": 0.8916708357087177, + "grad_norm": 0.16499166190624237, + "learning_rate": 1.9856630559964758e-05, + "loss": 1.2392, + "step": 2395 + }, + { + "epoch": 0.8920431408593268, + "grad_norm": 0.18076403439044952, + "learning_rate": 1.9856425432158236e-05, + "loss": 1.2435, + "step": 2396 + }, + { + "epoch": 0.8924154460099359, + "grad_norm": 0.17794503271579742, + "learning_rate": 1.9856220158772927e-05, + "loss": 1.2558, + "step": 2397 + }, + { + "epoch": 0.892787751160545, + "grad_norm": 0.17728637158870697, + "learning_rate": 1.9856014739811867e-05, + "loss": 1.2692, + "step": 2398 + }, + { + "epoch": 0.893160056311154, + "grad_norm": 0.17668506503105164, + "learning_rate": 1.9855809175278088e-05, + "loss": 1.2552, + "step": 2399 + }, + { + "epoch": 0.8935323614617631, + "grad_norm": 0.1653260886669159, + "learning_rate": 1.9855603465174623e-05, + "loss": 1.2485, + "step": 2400 + }, + { + "epoch": 0.8939046666123721, + "grad_norm": 0.17724184691905975, + "learning_rate": 1.9855397609504517e-05, + "loss": 1.2513, + "step": 2401 + }, + { + "epoch": 0.8942769717629813, + "grad_norm": 0.1668618619441986, + "learning_rate": 1.9855191608270807e-05, + "loss": 1.2387, + "step": 2402 + }, + { + "epoch": 0.8946492769135903, + "grad_norm": 0.16532456874847412, + "learning_rate": 1.9854985461476534e-05, + "loss": 1.2431, + "step": 2403 + }, + { + "epoch": 0.8950215820641994, + "grad_norm": 0.17286598682403564, + "learning_rate": 1.9854779169124745e-05, + "loss": 1.2322, + "step": 2404 + }, + { + "epoch": 0.8953938872148084, + "grad_norm": 0.17562846839427948, + "learning_rate": 1.9854572731218483e-05, + "loss": 1.2538, + "step": 2405 + }, + { + "epoch": 0.8957661923654175, + "grad_norm": 0.1721685379743576, + "learning_rate": 1.9854366147760803e-05, + "loss": 1.2421, + "step": 2406 + }, + { + "epoch": 0.8961384975160266, + "grad_norm": 0.16794408857822418, + "learning_rate": 1.9854159418754754e-05, + "loss": 1.2622, + "step": 2407 + }, + { + "epoch": 0.8965108026666356, + "grad_norm": 0.17553912103176117, + "learning_rate": 1.9853952544203387e-05, + "loss": 1.2526, + "step": 2408 + }, + { + "epoch": 0.8968831078172447, + "grad_norm": 0.17011193931102753, + "learning_rate": 1.985374552410976e-05, + "loss": 1.2588, + "step": 2409 + }, + { + "epoch": 0.8972554129678538, + "grad_norm": 0.17169992625713348, + "learning_rate": 1.9853538358476933e-05, + "loss": 1.2377, + "step": 2410 + }, + { + "epoch": 0.8976277181184629, + "grad_norm": 0.1703435331583023, + "learning_rate": 1.985333104730796e-05, + "loss": 1.2366, + "step": 2411 + }, + { + "epoch": 0.8980000232690719, + "grad_norm": 0.17589643597602844, + "learning_rate": 1.9853123590605904e-05, + "loss": 1.2371, + "step": 2412 + }, + { + "epoch": 0.898372328419681, + "grad_norm": 0.18404798209667206, + "learning_rate": 1.9852915988373834e-05, + "loss": 1.2483, + "step": 2413 + }, + { + "epoch": 0.89874463357029, + "grad_norm": 0.18587037920951843, + "learning_rate": 1.9852708240614812e-05, + "loss": 1.2575, + "step": 2414 + }, + { + "epoch": 0.8991169387208992, + "grad_norm": 0.16864776611328125, + "learning_rate": 1.9852500347331908e-05, + "loss": 1.2358, + "step": 2415 + }, + { + "epoch": 0.8994892438715082, + "grad_norm": 0.17366687953472137, + "learning_rate": 1.985229230852819e-05, + "loss": 1.2397, + "step": 2416 + }, + { + "epoch": 0.8998615490221172, + "grad_norm": 0.1771089732646942, + "learning_rate": 1.9852084124206735e-05, + "loss": 1.2628, + "step": 2417 + }, + { + "epoch": 0.9002338541727263, + "grad_norm": 0.16770784556865692, + "learning_rate": 1.9851875794370616e-05, + "loss": 1.2384, + "step": 2418 + }, + { + "epoch": 0.9006061593233354, + "grad_norm": 0.17862196266651154, + "learning_rate": 1.985166731902291e-05, + "loss": 1.2611, + "step": 2419 + }, + { + "epoch": 0.9009784644739445, + "grad_norm": 0.17286571860313416, + "learning_rate": 1.9851458698166693e-05, + "loss": 1.2643, + "step": 2420 + }, + { + "epoch": 0.9013507696245535, + "grad_norm": 0.17281019687652588, + "learning_rate": 1.9851249931805053e-05, + "loss": 1.2539, + "step": 2421 + }, + { + "epoch": 0.9017230747751626, + "grad_norm": 0.1699199378490448, + "learning_rate": 1.985104101994107e-05, + "loss": 1.2567, + "step": 2422 + }, + { + "epoch": 0.9020953799257717, + "grad_norm": 0.17170144617557526, + "learning_rate": 1.9850831962577824e-05, + "loss": 1.2658, + "step": 2423 + }, + { + "epoch": 0.9024676850763808, + "grad_norm": 0.17239661514759064, + "learning_rate": 1.9850622759718415e-05, + "loss": 1.2399, + "step": 2424 + }, + { + "epoch": 0.9028399902269898, + "grad_norm": 0.17295676469802856, + "learning_rate": 1.9850413411365923e-05, + "loss": 1.2494, + "step": 2425 + }, + { + "epoch": 0.9032122953775988, + "grad_norm": 0.17475001513957977, + "learning_rate": 1.985020391752344e-05, + "loss": 1.2487, + "step": 2426 + }, + { + "epoch": 0.9035846005282079, + "grad_norm": 0.16339634358882904, + "learning_rate": 1.9849994278194068e-05, + "loss": 1.2435, + "step": 2427 + }, + { + "epoch": 0.903956905678817, + "grad_norm": 0.1701515167951584, + "learning_rate": 1.9849784493380897e-05, + "loss": 1.2499, + "step": 2428 + }, + { + "epoch": 0.9043292108294261, + "grad_norm": 0.16852952539920807, + "learning_rate": 1.9849574563087025e-05, + "loss": 1.242, + "step": 2429 + }, + { + "epoch": 0.9047015159800351, + "grad_norm": 0.1811753362417221, + "learning_rate": 1.984936448731556e-05, + "loss": 1.2504, + "step": 2430 + }, + { + "epoch": 0.9050738211306442, + "grad_norm": 0.16984617710113525, + "learning_rate": 1.9849154266069597e-05, + "loss": 1.2338, + "step": 2431 + }, + { + "epoch": 0.9054461262812533, + "grad_norm": 0.18032081425189972, + "learning_rate": 1.984894389935224e-05, + "loss": 1.2283, + "step": 2432 + }, + { + "epoch": 0.9058184314318624, + "grad_norm": 0.1682289093732834, + "learning_rate": 1.9848733387166606e-05, + "loss": 1.2465, + "step": 2433 + }, + { + "epoch": 0.9061907365824714, + "grad_norm": 0.1777002066373825, + "learning_rate": 1.9848522729515794e-05, + "loss": 1.2515, + "step": 2434 + }, + { + "epoch": 0.9065630417330804, + "grad_norm": 0.17563197016716003, + "learning_rate": 1.984831192640292e-05, + "loss": 1.2523, + "step": 2435 + }, + { + "epoch": 0.9069353468836896, + "grad_norm": 0.17422862350940704, + "learning_rate": 1.9848100977831098e-05, + "loss": 1.2584, + "step": 2436 + }, + { + "epoch": 0.9073076520342986, + "grad_norm": 0.1725662648677826, + "learning_rate": 1.984788988380344e-05, + "loss": 1.2468, + "step": 2437 + }, + { + "epoch": 0.9076799571849077, + "grad_norm": 0.16899681091308594, + "learning_rate": 1.9847678644323068e-05, + "loss": 1.2547, + "step": 2438 + }, + { + "epoch": 0.9080522623355167, + "grad_norm": 0.1767120510339737, + "learning_rate": 1.9847467259393102e-05, + "loss": 1.2531, + "step": 2439 + }, + { + "epoch": 0.9084245674861258, + "grad_norm": 0.1795816719532013, + "learning_rate": 1.984725572901666e-05, + "loss": 1.2519, + "step": 2440 + }, + { + "epoch": 0.9087968726367349, + "grad_norm": 0.1791762113571167, + "learning_rate": 1.984704405319687e-05, + "loss": 1.2787, + "step": 2441 + }, + { + "epoch": 0.909169177787344, + "grad_norm": 0.1724022626876831, + "learning_rate": 1.9846832231936857e-05, + "loss": 1.259, + "step": 2442 + }, + { + "epoch": 0.909541482937953, + "grad_norm": 0.182962566614151, + "learning_rate": 1.984662026523975e-05, + "loss": 1.2568, + "step": 2443 + }, + { + "epoch": 0.909913788088562, + "grad_norm": 0.17582541704177856, + "learning_rate": 1.984640815310868e-05, + "loss": 1.237, + "step": 2444 + }, + { + "epoch": 0.9102860932391712, + "grad_norm": 0.17969557642936707, + "learning_rate": 1.984619589554678e-05, + "loss": 1.2662, + "step": 2445 + }, + { + "epoch": 0.9106583983897802, + "grad_norm": 0.17635543644428253, + "learning_rate": 1.9845983492557183e-05, + "loss": 1.2597, + "step": 2446 + }, + { + "epoch": 0.9110307035403893, + "grad_norm": 0.1814170926809311, + "learning_rate": 1.984577094414303e-05, + "loss": 1.243, + "step": 2447 + }, + { + "epoch": 0.9114030086909983, + "grad_norm": 0.16915488243103027, + "learning_rate": 1.984555825030746e-05, + "loss": 1.2477, + "step": 2448 + }, + { + "epoch": 0.9117753138416075, + "grad_norm": 0.1874271184206009, + "learning_rate": 1.9845345411053608e-05, + "loss": 1.2594, + "step": 2449 + }, + { + "epoch": 0.9121476189922165, + "grad_norm": 0.17666561901569366, + "learning_rate": 1.984513242638462e-05, + "loss": 1.2704, + "step": 2450 + }, + { + "epoch": 0.9125199241428256, + "grad_norm": 0.16944633424282074, + "learning_rate": 1.9844919296303647e-05, + "loss": 1.2457, + "step": 2451 + }, + { + "epoch": 0.9128922292934346, + "grad_norm": 0.18913982808589935, + "learning_rate": 1.9844706020813835e-05, + "loss": 1.242, + "step": 2452 + }, + { + "epoch": 0.9132645344440437, + "grad_norm": 0.17225903272628784, + "learning_rate": 1.9844492599918333e-05, + "loss": 1.2557, + "step": 2453 + }, + { + "epoch": 0.9136368395946528, + "grad_norm": 0.17968986928462982, + "learning_rate": 1.984427903362029e-05, + "loss": 1.2457, + "step": 2454 + }, + { + "epoch": 0.9140091447452618, + "grad_norm": 0.17983782291412354, + "learning_rate": 1.9844065321922867e-05, + "loss": 1.2449, + "step": 2455 + }, + { + "epoch": 0.9143814498958709, + "grad_norm": 0.17764170467853546, + "learning_rate": 1.9843851464829216e-05, + "loss": 1.241, + "step": 2456 + }, + { + "epoch": 0.9147537550464799, + "grad_norm": 0.17187097668647766, + "learning_rate": 1.9843637462342498e-05, + "loss": 1.2388, + "step": 2457 + }, + { + "epoch": 0.9151260601970891, + "grad_norm": 0.18232551217079163, + "learning_rate": 1.984342331446587e-05, + "loss": 1.2553, + "step": 2458 + }, + { + "epoch": 0.9154983653476981, + "grad_norm": 0.19238729774951935, + "learning_rate": 1.9843209021202496e-05, + "loss": 1.2357, + "step": 2459 + }, + { + "epoch": 0.9158706704983072, + "grad_norm": 0.1838630735874176, + "learning_rate": 1.9842994582555546e-05, + "loss": 1.2509, + "step": 2460 + }, + { + "epoch": 0.9162429756489162, + "grad_norm": 0.17731057107448578, + "learning_rate": 1.984277999852818e-05, + "loss": 1.2439, + "step": 2461 + }, + { + "epoch": 0.9166152807995253, + "grad_norm": 0.1820240616798401, + "learning_rate": 1.9842565269123577e-05, + "loss": 1.2467, + "step": 2462 + }, + { + "epoch": 0.9169875859501344, + "grad_norm": 0.1758500039577484, + "learning_rate": 1.9842350394344898e-05, + "loss": 1.2482, + "step": 2463 + }, + { + "epoch": 0.9173598911007435, + "grad_norm": 0.18491673469543457, + "learning_rate": 1.984213537419532e-05, + "loss": 1.2456, + "step": 2464 + }, + { + "epoch": 0.9177321962513525, + "grad_norm": 0.17699192464351654, + "learning_rate": 1.9841920208678024e-05, + "loss": 1.2429, + "step": 2465 + }, + { + "epoch": 0.9181045014019615, + "grad_norm": 0.17381340265274048, + "learning_rate": 1.9841704897796185e-05, + "loss": 1.2492, + "step": 2466 + }, + { + "epoch": 0.9184768065525707, + "grad_norm": 0.1726997047662735, + "learning_rate": 1.984148944155298e-05, + "loss": 1.2569, + "step": 2467 + }, + { + "epoch": 0.9188491117031797, + "grad_norm": 0.18082614243030548, + "learning_rate": 1.9841273839951595e-05, + "loss": 1.2426, + "step": 2468 + }, + { + "epoch": 0.9192214168537888, + "grad_norm": 0.17732475697994232, + "learning_rate": 1.984105809299521e-05, + "loss": 1.2449, + "step": 2469 + }, + { + "epoch": 0.9195937220043978, + "grad_norm": 0.17159555852413177, + "learning_rate": 1.9840842200687014e-05, + "loss": 1.2588, + "step": 2470 + }, + { + "epoch": 0.919966027155007, + "grad_norm": 0.17913275957107544, + "learning_rate": 1.9840626163030202e-05, + "loss": 1.2564, + "step": 2471 + }, + { + "epoch": 0.920338332305616, + "grad_norm": 0.17932996153831482, + "learning_rate": 1.9840409980027954e-05, + "loss": 1.2485, + "step": 2472 + }, + { + "epoch": 0.9207106374562251, + "grad_norm": 0.17403729259967804, + "learning_rate": 1.984019365168347e-05, + "loss": 1.2463, + "step": 2473 + }, + { + "epoch": 0.9210829426068341, + "grad_norm": 0.18577389419078827, + "learning_rate": 1.9839977177999942e-05, + "loss": 1.2486, + "step": 2474 + }, + { + "epoch": 0.9214552477574431, + "grad_norm": 0.1704840064048767, + "learning_rate": 1.9839760558980572e-05, + "loss": 1.2459, + "step": 2475 + }, + { + "epoch": 0.9218275529080523, + "grad_norm": 0.18757209181785583, + "learning_rate": 1.9839543794628553e-05, + "loss": 1.2486, + "step": 2476 + }, + { + "epoch": 0.9221998580586613, + "grad_norm": 0.18660812079906464, + "learning_rate": 1.9839326884947093e-05, + "loss": 1.2705, + "step": 2477 + }, + { + "epoch": 0.9225721632092704, + "grad_norm": 0.16449618339538574, + "learning_rate": 1.9839109829939388e-05, + "loss": 1.24, + "step": 2478 + }, + { + "epoch": 0.9229444683598794, + "grad_norm": 0.16529710590839386, + "learning_rate": 1.9838892629608652e-05, + "loss": 1.2578, + "step": 2479 + }, + { + "epoch": 0.9233167735104886, + "grad_norm": 0.17828698456287384, + "learning_rate": 1.9838675283958087e-05, + "loss": 1.256, + "step": 2480 + }, + { + "epoch": 0.9236890786610976, + "grad_norm": 0.17406156659126282, + "learning_rate": 1.9838457792990902e-05, + "loss": 1.2568, + "step": 2481 + }, + { + "epoch": 0.9240613838117067, + "grad_norm": 0.1763119399547577, + "learning_rate": 1.983824015671032e-05, + "loss": 1.2539, + "step": 2482 + }, + { + "epoch": 0.9244336889623157, + "grad_norm": 0.1707441657781601, + "learning_rate": 1.9838022375119544e-05, + "loss": 1.2391, + "step": 2483 + }, + { + "epoch": 0.9248059941129249, + "grad_norm": 0.1721871942281723, + "learning_rate": 1.9837804448221798e-05, + "loss": 1.2559, + "step": 2484 + }, + { + "epoch": 0.9251782992635339, + "grad_norm": 0.17778955399990082, + "learning_rate": 1.9837586376020293e-05, + "loss": 1.2563, + "step": 2485 + }, + { + "epoch": 0.9255506044141429, + "grad_norm": 0.17989608645439148, + "learning_rate": 1.983736815851826e-05, + "loss": 1.2358, + "step": 2486 + }, + { + "epoch": 0.925922909564752, + "grad_norm": 0.17223800718784332, + "learning_rate": 1.9837149795718913e-05, + "loss": 1.2619, + "step": 2487 + }, + { + "epoch": 0.926295214715361, + "grad_norm": 0.17492277920246124, + "learning_rate": 1.983693128762548e-05, + "loss": 1.24, + "step": 2488 + }, + { + "epoch": 0.9266675198659702, + "grad_norm": 0.17337973415851593, + "learning_rate": 1.9836712634241194e-05, + "loss": 1.2582, + "step": 2489 + }, + { + "epoch": 0.9270398250165792, + "grad_norm": 0.16389819979667664, + "learning_rate": 1.9836493835569278e-05, + "loss": 1.2379, + "step": 2490 + }, + { + "epoch": 0.9274121301671883, + "grad_norm": 0.17160442471504211, + "learning_rate": 1.9836274891612963e-05, + "loss": 1.2405, + "step": 2491 + }, + { + "epoch": 0.9277844353177973, + "grad_norm": 0.1783936619758606, + "learning_rate": 1.9836055802375488e-05, + "loss": 1.2555, + "step": 2492 + }, + { + "epoch": 0.9281567404684065, + "grad_norm": 0.17412415146827698, + "learning_rate": 1.9835836567860082e-05, + "loss": 1.2471, + "step": 2493 + }, + { + "epoch": 0.9285290456190155, + "grad_norm": 0.175662100315094, + "learning_rate": 1.983561718806999e-05, + "loss": 1.2335, + "step": 2494 + }, + { + "epoch": 0.9289013507696245, + "grad_norm": 0.1671876162290573, + "learning_rate": 1.983539766300845e-05, + "loss": 1.2543, + "step": 2495 + }, + { + "epoch": 0.9292736559202336, + "grad_norm": 0.16467541456222534, + "learning_rate": 1.9835177992678704e-05, + "loss": 1.2454, + "step": 2496 + }, + { + "epoch": 0.9296459610708427, + "grad_norm": 0.17586293816566467, + "learning_rate": 1.9834958177083995e-05, + "loss": 1.2568, + "step": 2497 + }, + { + "epoch": 0.9300182662214518, + "grad_norm": 0.170370951294899, + "learning_rate": 1.983473821622757e-05, + "loss": 1.2479, + "step": 2498 + }, + { + "epoch": 0.9303905713720608, + "grad_norm": 0.17328549921512604, + "learning_rate": 1.983451811011268e-05, + "loss": 1.2398, + "step": 2499 + }, + { + "epoch": 0.9307628765226699, + "grad_norm": 0.18288347125053406, + "learning_rate": 1.9834297858742574e-05, + "loss": 1.2438, + "step": 2500 + }, + { + "epoch": 0.9307628765226699, + "eval_loss": 1.3256843090057373, + "eval_runtime": 16.4567, + "eval_samples_per_second": 105.368, + "eval_steps_per_second": 5.287, + "step": 2500 + }, + { + "epoch": 0.9311351816732789, + "grad_norm": 0.17619635164737701, + "learning_rate": 1.9834077462120506e-05, + "loss": 1.2541, + "step": 2501 + }, + { + "epoch": 0.9315074868238881, + "grad_norm": 0.1839938461780548, + "learning_rate": 1.9833856920249733e-05, + "loss": 1.2747, + "step": 2502 + }, + { + "epoch": 0.9318797919744971, + "grad_norm": 0.18233221769332886, + "learning_rate": 1.9833636233133507e-05, + "loss": 1.2484, + "step": 2503 + }, + { + "epoch": 0.9322520971251061, + "grad_norm": 0.1671859472990036, + "learning_rate": 1.9833415400775092e-05, + "loss": 1.2481, + "step": 2504 + }, + { + "epoch": 0.9326244022757152, + "grad_norm": 0.17161886394023895, + "learning_rate": 1.9833194423177754e-05, + "loss": 1.2584, + "step": 2505 + }, + { + "epoch": 0.9329967074263243, + "grad_norm": 0.18463842570781708, + "learning_rate": 1.9832973300344745e-05, + "loss": 1.257, + "step": 2506 + }, + { + "epoch": 0.9333690125769334, + "grad_norm": 0.17637981474399567, + "learning_rate": 1.983275203227934e-05, + "loss": 1.2577, + "step": 2507 + }, + { + "epoch": 0.9337413177275424, + "grad_norm": 0.17521126568317413, + "learning_rate": 1.9832530618984802e-05, + "loss": 1.2564, + "step": 2508 + }, + { + "epoch": 0.9341136228781515, + "grad_norm": 0.17385391891002655, + "learning_rate": 1.9832309060464408e-05, + "loss": 1.245, + "step": 2509 + }, + { + "epoch": 0.9344859280287606, + "grad_norm": 0.16838274896144867, + "learning_rate": 1.9832087356721424e-05, + "loss": 1.2586, + "step": 2510 + }, + { + "epoch": 0.9348582331793697, + "grad_norm": 0.17618712782859802, + "learning_rate": 1.9831865507759125e-05, + "loss": 1.2478, + "step": 2511 + }, + { + "epoch": 0.9352305383299787, + "grad_norm": 0.17693477869033813, + "learning_rate": 1.983164351358079e-05, + "loss": 1.2573, + "step": 2512 + }, + { + "epoch": 0.9356028434805878, + "grad_norm": 0.17031066119670868, + "learning_rate": 1.9831421374189702e-05, + "loss": 1.2538, + "step": 2513 + }, + { + "epoch": 0.9359751486311968, + "grad_norm": 0.1740039438009262, + "learning_rate": 1.983119908958913e-05, + "loss": 1.2496, + "step": 2514 + }, + { + "epoch": 0.936347453781806, + "grad_norm": 0.18025629222393036, + "learning_rate": 1.983097665978237e-05, + "loss": 1.2503, + "step": 2515 + }, + { + "epoch": 0.936719758932415, + "grad_norm": 0.179831400513649, + "learning_rate": 1.98307540847727e-05, + "loss": 1.2372, + "step": 2516 + }, + { + "epoch": 0.937092064083024, + "grad_norm": 0.17471134662628174, + "learning_rate": 1.983053136456341e-05, + "loss": 1.2489, + "step": 2517 + }, + { + "epoch": 0.9374643692336331, + "grad_norm": 0.17015781998634338, + "learning_rate": 1.9830308499157787e-05, + "loss": 1.2523, + "step": 2518 + }, + { + "epoch": 0.9378366743842422, + "grad_norm": 0.17395399510860443, + "learning_rate": 1.9830085488559128e-05, + "loss": 1.2493, + "step": 2519 + }, + { + "epoch": 0.9382089795348513, + "grad_norm": 0.1732756495475769, + "learning_rate": 1.982986233277072e-05, + "loss": 1.2487, + "step": 2520 + }, + { + "epoch": 0.9385812846854603, + "grad_norm": 0.1738785058259964, + "learning_rate": 1.9829639031795862e-05, + "loss": 1.254, + "step": 2521 + }, + { + "epoch": 0.9389535898360694, + "grad_norm": 0.17151622474193573, + "learning_rate": 1.9829415585637853e-05, + "loss": 1.2453, + "step": 2522 + }, + { + "epoch": 0.9393258949866785, + "grad_norm": 0.16740648448467255, + "learning_rate": 1.982919199429999e-05, + "loss": 1.2497, + "step": 2523 + }, + { + "epoch": 0.9396982001372876, + "grad_norm": 0.1763676106929779, + "learning_rate": 1.9828968257785582e-05, + "loss": 1.2484, + "step": 2524 + }, + { + "epoch": 0.9400705052878966, + "grad_norm": 0.17714397609233856, + "learning_rate": 1.982874437609793e-05, + "loss": 1.2431, + "step": 2525 + }, + { + "epoch": 0.9404428104385056, + "grad_norm": 0.16998568177223206, + "learning_rate": 1.982852034924034e-05, + "loss": 1.2378, + "step": 2526 + }, + { + "epoch": 0.9408151155891147, + "grad_norm": 0.1771582067012787, + "learning_rate": 1.9828296177216118e-05, + "loss": 1.2533, + "step": 2527 + }, + { + "epoch": 0.9411874207397238, + "grad_norm": 0.18139605224132538, + "learning_rate": 1.9828071860028582e-05, + "loss": 1.2489, + "step": 2528 + }, + { + "epoch": 0.9415597258903329, + "grad_norm": 0.17635215818881989, + "learning_rate": 1.982784739768104e-05, + "loss": 1.2437, + "step": 2529 + }, + { + "epoch": 0.9419320310409419, + "grad_norm": 0.17219685018062592, + "learning_rate": 1.982762279017681e-05, + "loss": 1.2348, + "step": 2530 + }, + { + "epoch": 0.942304336191551, + "grad_norm": 0.18018695712089539, + "learning_rate": 1.9827398037519203e-05, + "loss": 1.2448, + "step": 2531 + }, + { + "epoch": 0.9426766413421601, + "grad_norm": 0.17004595696926117, + "learning_rate": 1.9827173139711547e-05, + "loss": 1.2489, + "step": 2532 + }, + { + "epoch": 0.9430489464927692, + "grad_norm": 0.17758417129516602, + "learning_rate": 1.9826948096757163e-05, + "loss": 1.2598, + "step": 2533 + }, + { + "epoch": 0.9434212516433782, + "grad_norm": 0.18373461067676544, + "learning_rate": 1.982672290865937e-05, + "loss": 1.2601, + "step": 2534 + }, + { + "epoch": 0.9437935567939872, + "grad_norm": 0.17354753613471985, + "learning_rate": 1.9826497575421498e-05, + "loss": 1.2353, + "step": 2535 + }, + { + "epoch": 0.9441658619445963, + "grad_norm": 0.179754376411438, + "learning_rate": 1.982627209704687e-05, + "loss": 1.2466, + "step": 2536 + }, + { + "epoch": 0.9445381670952054, + "grad_norm": 0.16391302645206451, + "learning_rate": 1.9826046473538823e-05, + "loss": 1.2386, + "step": 2537 + }, + { + "epoch": 0.9449104722458145, + "grad_norm": 0.1725778728723526, + "learning_rate": 1.9825820704900684e-05, + "loss": 1.2573, + "step": 2538 + }, + { + "epoch": 0.9452827773964235, + "grad_norm": 0.1723441183567047, + "learning_rate": 1.9825594791135792e-05, + "loss": 1.2355, + "step": 2539 + }, + { + "epoch": 0.9456550825470326, + "grad_norm": 0.16522756218910217, + "learning_rate": 1.982536873224748e-05, + "loss": 1.2386, + "step": 2540 + }, + { + "epoch": 0.9460273876976417, + "grad_norm": 0.1779457926750183, + "learning_rate": 1.982514252823909e-05, + "loss": 1.2522, + "step": 2541 + }, + { + "epoch": 0.9463996928482508, + "grad_norm": 0.17980381846427917, + "learning_rate": 1.982491617911396e-05, + "loss": 1.2639, + "step": 2542 + }, + { + "epoch": 0.9467719979988598, + "grad_norm": 0.17201921343803406, + "learning_rate": 1.9824689684875435e-05, + "loss": 1.2428, + "step": 2543 + }, + { + "epoch": 0.9471443031494688, + "grad_norm": 0.16897398233413696, + "learning_rate": 1.9824463045526857e-05, + "loss": 1.2488, + "step": 2544 + }, + { + "epoch": 0.947516608300078, + "grad_norm": 0.1801498979330063, + "learning_rate": 1.982423626107158e-05, + "loss": 1.2434, + "step": 2545 + }, + { + "epoch": 0.947888913450687, + "grad_norm": 0.1762707233428955, + "learning_rate": 1.982400933151295e-05, + "loss": 1.2467, + "step": 2546 + }, + { + "epoch": 0.9482612186012961, + "grad_norm": 0.1682814210653305, + "learning_rate": 1.982378225685432e-05, + "loss": 1.2472, + "step": 2547 + }, + { + "epoch": 0.9486335237519051, + "grad_norm": 0.17276160418987274, + "learning_rate": 1.982355503709904e-05, + "loss": 1.2439, + "step": 2548 + }, + { + "epoch": 0.9490058289025142, + "grad_norm": 0.16808001697063446, + "learning_rate": 1.982332767225047e-05, + "loss": 1.245, + "step": 2549 + }, + { + "epoch": 0.9493781340531233, + "grad_norm": 0.17079442739486694, + "learning_rate": 1.9823100162311967e-05, + "loss": 1.2439, + "step": 2550 + }, + { + "epoch": 0.9497504392037324, + "grad_norm": 0.17456983029842377, + "learning_rate": 1.982287250728689e-05, + "loss": 1.2551, + "step": 2551 + }, + { + "epoch": 0.9501227443543414, + "grad_norm": 0.1688481867313385, + "learning_rate": 1.98226447071786e-05, + "loss": 1.2487, + "step": 2552 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.1739954799413681, + "learning_rate": 1.982241676199047e-05, + "loss": 1.2556, + "step": 2553 + }, + { + "epoch": 0.9508673546555596, + "grad_norm": 0.1756342649459839, + "learning_rate": 1.9822188671725854e-05, + "loss": 1.2437, + "step": 2554 + }, + { + "epoch": 0.9512396598061686, + "grad_norm": 0.17892800271511078, + "learning_rate": 1.9821960436388134e-05, + "loss": 1.2401, + "step": 2555 + }, + { + "epoch": 0.9516119649567777, + "grad_norm": 0.17654842138290405, + "learning_rate": 1.9821732055980673e-05, + "loss": 1.2459, + "step": 2556 + }, + { + "epoch": 0.9519842701073867, + "grad_norm": 0.18467935919761658, + "learning_rate": 1.9821503530506843e-05, + "loss": 1.2376, + "step": 2557 + }, + { + "epoch": 0.9523565752579959, + "grad_norm": 0.1839521825313568, + "learning_rate": 1.9821274859970025e-05, + "loss": 1.2529, + "step": 2558 + }, + { + "epoch": 0.9527288804086049, + "grad_norm": 0.17149806022644043, + "learning_rate": 1.982104604437359e-05, + "loss": 1.2576, + "step": 2559 + }, + { + "epoch": 0.953101185559214, + "grad_norm": 0.17172785103321075, + "learning_rate": 1.9820817083720928e-05, + "loss": 1.2464, + "step": 2560 + }, + { + "epoch": 0.953473490709823, + "grad_norm": 0.17375504970550537, + "learning_rate": 1.9820587978015407e-05, + "loss": 1.2578, + "step": 2561 + }, + { + "epoch": 0.953845795860432, + "grad_norm": 0.17615382373332977, + "learning_rate": 1.982035872726042e-05, + "loss": 1.2555, + "step": 2562 + }, + { + "epoch": 0.9542181010110412, + "grad_norm": 0.18444810807704926, + "learning_rate": 1.9820129331459354e-05, + "loss": 1.2337, + "step": 2563 + }, + { + "epoch": 0.9545904061616503, + "grad_norm": 0.1793602854013443, + "learning_rate": 1.981989979061559e-05, + "loss": 1.2504, + "step": 2564 + }, + { + "epoch": 0.9549627113122593, + "grad_norm": 0.16998952627182007, + "learning_rate": 1.9819670104732528e-05, + "loss": 1.2424, + "step": 2565 + }, + { + "epoch": 0.9553350164628683, + "grad_norm": 0.16896101832389832, + "learning_rate": 1.981944027381355e-05, + "loss": 1.239, + "step": 2566 + }, + { + "epoch": 0.9557073216134775, + "grad_norm": 0.16461212933063507, + "learning_rate": 1.9819210297862055e-05, + "loss": 1.245, + "step": 2567 + }, + { + "epoch": 0.9560796267640865, + "grad_norm": 0.1697378307580948, + "learning_rate": 1.981898017688144e-05, + "loss": 1.2506, + "step": 2568 + }, + { + "epoch": 0.9564519319146956, + "grad_norm": 0.17256854474544525, + "learning_rate": 1.981874991087511e-05, + "loss": 1.2113, + "step": 2569 + }, + { + "epoch": 0.9568242370653046, + "grad_norm": 0.1716059297323227, + "learning_rate": 1.9818519499846457e-05, + "loss": 1.2552, + "step": 2570 + }, + { + "epoch": 0.9571965422159138, + "grad_norm": 0.16681870818138123, + "learning_rate": 1.9818288943798882e-05, + "loss": 1.2412, + "step": 2571 + }, + { + "epoch": 0.9575688473665228, + "grad_norm": 0.1805577576160431, + "learning_rate": 1.98180582427358e-05, + "loss": 1.2393, + "step": 2572 + }, + { + "epoch": 0.9579411525171319, + "grad_norm": 0.16774867475032806, + "learning_rate": 1.9817827396660615e-05, + "loss": 1.2327, + "step": 2573 + }, + { + "epoch": 0.9583134576677409, + "grad_norm": 0.1805572658777237, + "learning_rate": 1.9817596405576733e-05, + "loss": 1.2445, + "step": 2574 + }, + { + "epoch": 0.9586857628183499, + "grad_norm": 0.16827546060085297, + "learning_rate": 1.981736526948757e-05, + "loss": 1.2472, + "step": 2575 + }, + { + "epoch": 0.9590580679689591, + "grad_norm": 0.184850811958313, + "learning_rate": 1.9817133988396536e-05, + "loss": 1.2344, + "step": 2576 + }, + { + "epoch": 0.9594303731195681, + "grad_norm": 0.17866064608097076, + "learning_rate": 1.981690256230705e-05, + "loss": 1.247, + "step": 2577 + }, + { + "epoch": 0.9598026782701772, + "grad_norm": 0.17331889271736145, + "learning_rate": 1.981667099122253e-05, + "loss": 1.2627, + "step": 2578 + }, + { + "epoch": 0.9601749834207862, + "grad_norm": 0.17190296947956085, + "learning_rate": 1.9816439275146394e-05, + "loss": 1.2344, + "step": 2579 + }, + { + "epoch": 0.9605472885713954, + "grad_norm": 0.1726665496826172, + "learning_rate": 1.981620741408207e-05, + "loss": 1.247, + "step": 2580 + }, + { + "epoch": 0.9609195937220044, + "grad_norm": 0.1653529554605484, + "learning_rate": 1.9815975408032972e-05, + "loss": 1.2472, + "step": 2581 + }, + { + "epoch": 0.9612918988726135, + "grad_norm": 0.18165260553359985, + "learning_rate": 1.9815743257002537e-05, + "loss": 1.2457, + "step": 2582 + }, + { + "epoch": 0.9616642040232225, + "grad_norm": 0.18499329686164856, + "learning_rate": 1.981551096099419e-05, + "loss": 1.251, + "step": 2583 + }, + { + "epoch": 0.9620365091738317, + "grad_norm": 0.17018747329711914, + "learning_rate": 1.9815278520011364e-05, + "loss": 1.2342, + "step": 2584 + }, + { + "epoch": 0.9624088143244407, + "grad_norm": 0.17789021134376526, + "learning_rate": 1.9815045934057487e-05, + "loss": 1.2406, + "step": 2585 + }, + { + "epoch": 0.9627811194750497, + "grad_norm": 0.1729610115289688, + "learning_rate": 1.9814813203135998e-05, + "loss": 1.2466, + "step": 2586 + }, + { + "epoch": 0.9631534246256588, + "grad_norm": 0.17735299468040466, + "learning_rate": 1.9814580327250336e-05, + "loss": 1.2406, + "step": 2587 + }, + { + "epoch": 0.9635257297762678, + "grad_norm": 0.1712779998779297, + "learning_rate": 1.9814347306403936e-05, + "loss": 1.2408, + "step": 2588 + }, + { + "epoch": 0.963898034926877, + "grad_norm": 0.18011993169784546, + "learning_rate": 1.981411414060024e-05, + "loss": 1.2578, + "step": 2589 + }, + { + "epoch": 0.964270340077486, + "grad_norm": 0.17510679364204407, + "learning_rate": 1.9813880829842704e-05, + "loss": 1.2324, + "step": 2590 + }, + { + "epoch": 0.9646426452280951, + "grad_norm": 0.17391736805438995, + "learning_rate": 1.9813647374134756e-05, + "loss": 1.2433, + "step": 2591 + }, + { + "epoch": 0.9650149503787041, + "grad_norm": 0.17409949004650116, + "learning_rate": 1.9813413773479853e-05, + "loss": 1.2542, + "step": 2592 + }, + { + "epoch": 0.9653872555293133, + "grad_norm": 0.17249634861946106, + "learning_rate": 1.9813180027881445e-05, + "loss": 1.231, + "step": 2593 + }, + { + "epoch": 0.9657595606799223, + "grad_norm": 0.18046285212039948, + "learning_rate": 1.9812946137342984e-05, + "loss": 1.2408, + "step": 2594 + }, + { + "epoch": 0.9661318658305313, + "grad_norm": 0.16898088157176971, + "learning_rate": 1.9812712101867923e-05, + "loss": 1.2426, + "step": 2595 + }, + { + "epoch": 0.9665041709811404, + "grad_norm": 0.17144586145877838, + "learning_rate": 1.9812477921459724e-05, + "loss": 1.2358, + "step": 2596 + }, + { + "epoch": 0.9668764761317495, + "grad_norm": 0.17590296268463135, + "learning_rate": 1.981224359612184e-05, + "loss": 1.231, + "step": 2597 + }, + { + "epoch": 0.9672487812823586, + "grad_norm": 0.1714339703321457, + "learning_rate": 1.981200912585773e-05, + "loss": 1.2307, + "step": 2598 + }, + { + "epoch": 0.9676210864329676, + "grad_norm": 0.17280226945877075, + "learning_rate": 1.9811774510670866e-05, + "loss": 1.2432, + "step": 2599 + }, + { + "epoch": 0.9679933915835767, + "grad_norm": 0.19417522847652435, + "learning_rate": 1.9811539750564702e-05, + "loss": 1.2329, + "step": 2600 + }, + { + "epoch": 0.9683656967341857, + "grad_norm": 0.18655581772327423, + "learning_rate": 1.9811304845542717e-05, + "loss": 1.2364, + "step": 2601 + }, + { + "epoch": 0.9687380018847949, + "grad_norm": 0.1713135540485382, + "learning_rate": 1.9811069795608377e-05, + "loss": 1.266, + "step": 2602 + }, + { + "epoch": 0.9691103070354039, + "grad_norm": 0.18093515932559967, + "learning_rate": 1.9810834600765148e-05, + "loss": 1.2466, + "step": 2603 + }, + { + "epoch": 0.969482612186013, + "grad_norm": 0.17053551971912384, + "learning_rate": 1.9810599261016506e-05, + "loss": 1.2532, + "step": 2604 + }, + { + "epoch": 0.969854917336622, + "grad_norm": 0.1664905697107315, + "learning_rate": 1.9810363776365932e-05, + "loss": 1.232, + "step": 2605 + }, + { + "epoch": 0.9702272224872311, + "grad_norm": 0.1823788583278656, + "learning_rate": 1.98101281468169e-05, + "loss": 1.255, + "step": 2606 + }, + { + "epoch": 0.9705995276378402, + "grad_norm": 0.17830835282802582, + "learning_rate": 1.980989237237289e-05, + "loss": 1.2522, + "step": 2607 + }, + { + "epoch": 0.9709718327884492, + "grad_norm": 0.19234777987003326, + "learning_rate": 1.980965645303739e-05, + "loss": 1.2574, + "step": 2608 + }, + { + "epoch": 0.9713441379390583, + "grad_norm": 0.18148033320903778, + "learning_rate": 1.9809420388813874e-05, + "loss": 1.2511, + "step": 2609 + }, + { + "epoch": 0.9717164430896673, + "grad_norm": 0.1736968457698822, + "learning_rate": 1.9809184179705835e-05, + "loss": 1.2458, + "step": 2610 + }, + { + "epoch": 0.9720887482402765, + "grad_norm": 0.17815859615802765, + "learning_rate": 1.9808947825716768e-05, + "loss": 1.2548, + "step": 2611 + }, + { + "epoch": 0.9724610533908855, + "grad_norm": 0.1661943793296814, + "learning_rate": 1.980871132685015e-05, + "loss": 1.2295, + "step": 2612 + }, + { + "epoch": 0.9728333585414946, + "grad_norm": 0.16622133553028107, + "learning_rate": 1.980847468310948e-05, + "loss": 1.2403, + "step": 2613 + }, + { + "epoch": 0.9732056636921036, + "grad_norm": 0.17739859223365784, + "learning_rate": 1.980823789449826e-05, + "loss": 1.2407, + "step": 2614 + }, + { + "epoch": 0.9735779688427127, + "grad_norm": 0.17749415338039398, + "learning_rate": 1.980800096101998e-05, + "loss": 1.236, + "step": 2615 + }, + { + "epoch": 0.9739502739933218, + "grad_norm": 0.18621912598609924, + "learning_rate": 1.9807763882678143e-05, + "loss": 1.2393, + "step": 2616 + }, + { + "epoch": 0.9743225791439308, + "grad_norm": 0.1727140098810196, + "learning_rate": 1.9807526659476245e-05, + "loss": 1.2331, + "step": 2617 + }, + { + "epoch": 0.9746948842945399, + "grad_norm": 0.17797011137008667, + "learning_rate": 1.9807289291417795e-05, + "loss": 1.2603, + "step": 2618 + }, + { + "epoch": 0.975067189445149, + "grad_norm": 0.18820150196552277, + "learning_rate": 1.98070517785063e-05, + "loss": 1.2385, + "step": 2619 + }, + { + "epoch": 0.9754394945957581, + "grad_norm": 0.17448899149894714, + "learning_rate": 1.9806814120745265e-05, + "loss": 1.2361, + "step": 2620 + }, + { + "epoch": 0.9758117997463671, + "grad_norm": 0.1788555085659027, + "learning_rate": 1.9806576318138194e-05, + "loss": 1.2421, + "step": 2621 + }, + { + "epoch": 0.9761841048969762, + "grad_norm": 0.1790677309036255, + "learning_rate": 1.9806338370688615e-05, + "loss": 1.2379, + "step": 2622 + }, + { + "epoch": 0.9765564100475852, + "grad_norm": 0.17613591253757477, + "learning_rate": 1.980610027840003e-05, + "loss": 1.2557, + "step": 2623 + }, + { + "epoch": 0.9769287151981944, + "grad_norm": 0.19133977591991425, + "learning_rate": 1.9805862041275962e-05, + "loss": 1.2616, + "step": 2624 + }, + { + "epoch": 0.9773010203488034, + "grad_norm": 0.18040066957473755, + "learning_rate": 1.9805623659319924e-05, + "loss": 1.2437, + "step": 2625 + }, + { + "epoch": 0.9776733254994124, + "grad_norm": 0.17554713785648346, + "learning_rate": 1.980538513253544e-05, + "loss": 1.2483, + "step": 2626 + }, + { + "epoch": 0.9780456306500215, + "grad_norm": 0.17956022918224335, + "learning_rate": 1.9805146460926033e-05, + "loss": 1.2492, + "step": 2627 + }, + { + "epoch": 0.9784179358006306, + "grad_norm": 0.18192705512046814, + "learning_rate": 1.980490764449523e-05, + "loss": 1.2439, + "step": 2628 + }, + { + "epoch": 0.9787902409512397, + "grad_norm": 0.1905398815870285, + "learning_rate": 1.9804668683246556e-05, + "loss": 1.2384, + "step": 2629 + }, + { + "epoch": 0.9791625461018487, + "grad_norm": 0.17728392779827118, + "learning_rate": 1.980442957718354e-05, + "loss": 1.2492, + "step": 2630 + }, + { + "epoch": 0.9795348512524578, + "grad_norm": 0.17330513894557953, + "learning_rate": 1.9804190326309714e-05, + "loss": 1.2312, + "step": 2631 + }, + { + "epoch": 0.9799071564030669, + "grad_norm": 0.17659252882003784, + "learning_rate": 1.9803950930628616e-05, + "loss": 1.2429, + "step": 2632 + }, + { + "epoch": 0.980279461553676, + "grad_norm": 0.18615534901618958, + "learning_rate": 1.9803711390143774e-05, + "loss": 1.2442, + "step": 2633 + }, + { + "epoch": 0.980651766704285, + "grad_norm": 0.18511880934238434, + "learning_rate": 1.9803471704858733e-05, + "loss": 1.2489, + "step": 2634 + }, + { + "epoch": 0.981024071854894, + "grad_norm": 0.1814030408859253, + "learning_rate": 1.9803231874777025e-05, + "loss": 1.2361, + "step": 2635 + }, + { + "epoch": 0.9813963770055031, + "grad_norm": 0.1738506257534027, + "learning_rate": 1.98029918999022e-05, + "loss": 1.2462, + "step": 2636 + }, + { + "epoch": 0.9817686821561122, + "grad_norm": 0.1846051663160324, + "learning_rate": 1.98027517802378e-05, + "loss": 1.2436, + "step": 2637 + }, + { + "epoch": 0.9821409873067213, + "grad_norm": 0.19149626791477203, + "learning_rate": 1.9802511515787373e-05, + "loss": 1.2403, + "step": 2638 + }, + { + "epoch": 0.9825132924573303, + "grad_norm": 0.17990732192993164, + "learning_rate": 1.9802271106554464e-05, + "loss": 1.2479, + "step": 2639 + }, + { + "epoch": 0.9828855976079394, + "grad_norm": 0.18959732353687286, + "learning_rate": 1.9802030552542627e-05, + "loss": 1.238, + "step": 2640 + }, + { + "epoch": 0.9832579027585485, + "grad_norm": 0.1863657683134079, + "learning_rate": 1.9801789853755415e-05, + "loss": 1.2615, + "step": 2641 + }, + { + "epoch": 0.9836302079091576, + "grad_norm": 0.16739057004451752, + "learning_rate": 1.980154901019638e-05, + "loss": 1.2371, + "step": 2642 + }, + { + "epoch": 0.9840025130597666, + "grad_norm": 0.1758878231048584, + "learning_rate": 1.9801308021869084e-05, + "loss": 1.2301, + "step": 2643 + }, + { + "epoch": 0.9843748182103756, + "grad_norm": 0.18291743099689484, + "learning_rate": 1.9801066888777082e-05, + "loss": 1.2375, + "step": 2644 + }, + { + "epoch": 0.9847471233609848, + "grad_norm": 0.18052415549755096, + "learning_rate": 1.9800825610923937e-05, + "loss": 1.2571, + "step": 2645 + }, + { + "epoch": 0.9851194285115938, + "grad_norm": 0.17342287302017212, + "learning_rate": 1.980058418831321e-05, + "loss": 1.2584, + "step": 2646 + }, + { + "epoch": 0.9854917336622029, + "grad_norm": 0.17909403145313263, + "learning_rate": 1.9800342620948475e-05, + "loss": 1.2307, + "step": 2647 + }, + { + "epoch": 0.9858640388128119, + "grad_norm": 0.17488320171833038, + "learning_rate": 1.980010090883329e-05, + "loss": 1.2422, + "step": 2648 + }, + { + "epoch": 0.986236343963421, + "grad_norm": 0.17006643116474152, + "learning_rate": 1.9799859051971232e-05, + "loss": 1.2405, + "step": 2649 + }, + { + "epoch": 0.9866086491140301, + "grad_norm": 0.18193192780017853, + "learning_rate": 1.979961705036587e-05, + "loss": 1.2491, + "step": 2650 + }, + { + "epoch": 0.9869809542646392, + "grad_norm": 0.18248356878757477, + "learning_rate": 1.979937490402078e-05, + "loss": 1.2499, + "step": 2651 + }, + { + "epoch": 0.9873532594152482, + "grad_norm": 0.17710177600383759, + "learning_rate": 1.9799132612939535e-05, + "loss": 1.2346, + "step": 2652 + }, + { + "epoch": 0.9877255645658573, + "grad_norm": 0.17549732327461243, + "learning_rate": 1.979889017712572e-05, + "loss": 1.2341, + "step": 2653 + }, + { + "epoch": 0.9880978697164664, + "grad_norm": 0.17912618815898895, + "learning_rate": 1.979864759658291e-05, + "loss": 1.2562, + "step": 2654 + }, + { + "epoch": 0.9884701748670754, + "grad_norm": 0.17686498165130615, + "learning_rate": 1.979840487131469e-05, + "loss": 1.2422, + "step": 2655 + }, + { + "epoch": 0.9888424800176845, + "grad_norm": 0.1823192685842514, + "learning_rate": 1.9798162001324647e-05, + "loss": 1.2304, + "step": 2656 + }, + { + "epoch": 0.9892147851682935, + "grad_norm": 0.17622555792331696, + "learning_rate": 1.9797918986616362e-05, + "loss": 1.2241, + "step": 2657 + }, + { + "epoch": 0.9895870903189027, + "grad_norm": 0.1805817037820816, + "learning_rate": 1.979767582719343e-05, + "loss": 1.241, + "step": 2658 + }, + { + "epoch": 0.9899593954695117, + "grad_norm": 0.1678328961133957, + "learning_rate": 1.9797432523059442e-05, + "loss": 1.2248, + "step": 2659 + }, + { + "epoch": 0.9903317006201208, + "grad_norm": 0.17134131491184235, + "learning_rate": 1.9797189074217993e-05, + "loss": 1.2415, + "step": 2660 + }, + { + "epoch": 0.9907040057707298, + "grad_norm": 0.1789311319589615, + "learning_rate": 1.979694548067267e-05, + "loss": 1.2358, + "step": 2661 + }, + { + "epoch": 0.9910763109213389, + "grad_norm": 0.18833468854427338, + "learning_rate": 1.979670174242708e-05, + "loss": 1.2339, + "step": 2662 + }, + { + "epoch": 0.991448616071948, + "grad_norm": 0.18006418645381927, + "learning_rate": 1.9796457859484825e-05, + "loss": 1.2322, + "step": 2663 + }, + { + "epoch": 0.991820921222557, + "grad_norm": 0.1705029308795929, + "learning_rate": 1.9796213831849496e-05, + "loss": 1.2473, + "step": 2664 + }, + { + "epoch": 0.9921932263731661, + "grad_norm": 0.16797584295272827, + "learning_rate": 1.9795969659524705e-05, + "loss": 1.2422, + "step": 2665 + }, + { + "epoch": 0.9925655315237751, + "grad_norm": 0.19237132370471954, + "learning_rate": 1.9795725342514055e-05, + "loss": 1.2567, + "step": 2666 + }, + { + "epoch": 0.9929378366743843, + "grad_norm": 0.1747465878725052, + "learning_rate": 1.9795480880821162e-05, + "loss": 1.2496, + "step": 2667 + }, + { + "epoch": 0.9933101418249933, + "grad_norm": 0.17807847261428833, + "learning_rate": 1.9795236274449627e-05, + "loss": 1.2463, + "step": 2668 + }, + { + "epoch": 0.9936824469756024, + "grad_norm": 0.17453856766223907, + "learning_rate": 1.979499152340307e-05, + "loss": 1.2433, + "step": 2669 + }, + { + "epoch": 0.9940547521262114, + "grad_norm": 0.1748945713043213, + "learning_rate": 1.9794746627685097e-05, + "loss": 1.2363, + "step": 2670 + }, + { + "epoch": 0.9944270572768205, + "grad_norm": 0.1771155595779419, + "learning_rate": 1.9794501587299338e-05, + "loss": 1.236, + "step": 2671 + }, + { + "epoch": 0.9947993624274296, + "grad_norm": 0.17215952277183533, + "learning_rate": 1.9794256402249398e-05, + "loss": 1.233, + "step": 2672 + }, + { + "epoch": 0.9951716675780387, + "grad_norm": 0.1868884265422821, + "learning_rate": 1.979401107253891e-05, + "loss": 1.2446, + "step": 2673 + }, + { + "epoch": 0.9955439727286477, + "grad_norm": 0.18049085140228271, + "learning_rate": 1.9793765598171494e-05, + "loss": 1.2319, + "step": 2674 + }, + { + "epoch": 0.9959162778792567, + "grad_norm": 0.16808654367923737, + "learning_rate": 1.9793519979150773e-05, + "loss": 1.2382, + "step": 2675 + }, + { + "epoch": 0.9962885830298659, + "grad_norm": 0.18061494827270508, + "learning_rate": 1.9793274215480375e-05, + "loss": 1.2547, + "step": 2676 + }, + { + "epoch": 0.9966608881804749, + "grad_norm": 0.16883356869220734, + "learning_rate": 1.9793028307163937e-05, + "loss": 1.2517, + "step": 2677 + }, + { + "epoch": 0.997033193331084, + "grad_norm": 0.1701226681470871, + "learning_rate": 1.979278225420508e-05, + "loss": 1.2401, + "step": 2678 + }, + { + "epoch": 0.997405498481693, + "grad_norm": 0.17168228328227997, + "learning_rate": 1.9792536056607448e-05, + "loss": 1.2341, + "step": 2679 + }, + { + "epoch": 0.9977778036323022, + "grad_norm": 0.1792803853750229, + "learning_rate": 1.979228971437467e-05, + "loss": 1.2353, + "step": 2680 + }, + { + "epoch": 0.9981501087829112, + "grad_norm": 0.16425569355487823, + "learning_rate": 1.9792043227510387e-05, + "loss": 1.239, + "step": 2681 + }, + { + "epoch": 0.9985224139335203, + "grad_norm": 0.17416253685951233, + "learning_rate": 1.979179659601824e-05, + "loss": 1.2426, + "step": 2682 + }, + { + "epoch": 0.9988947190841293, + "grad_norm": 0.16776736080646515, + "learning_rate": 1.9791549819901875e-05, + "loss": 1.2377, + "step": 2683 + }, + { + "epoch": 0.9992670242347383, + "grad_norm": 0.16794565320014954, + "learning_rate": 1.9791302899164932e-05, + "loss": 1.2406, + "step": 2684 + }, + { + "epoch": 0.9996393293853475, + "grad_norm": 0.1647026389837265, + "learning_rate": 1.9791055833811056e-05, + "loss": 1.2427, + "step": 2685 + }, + { + "epoch": 1.0000116345359564, + "grad_norm": 0.19015128910541534, + "learning_rate": 1.9790808623843905e-05, + "loss": 1.2504, + "step": 2686 + }, + { + "epoch": 1.0003839396865657, + "grad_norm": 0.1643396019935608, + "learning_rate": 1.9790561269267122e-05, + "loss": 1.2387, + "step": 2687 + }, + { + "epoch": 1.0007562448371747, + "grad_norm": 0.178327739238739, + "learning_rate": 1.9790313770084363e-05, + "loss": 1.2417, + "step": 2688 + }, + { + "epoch": 1.0011285499877838, + "grad_norm": 0.18090787529945374, + "learning_rate": 1.9790066126299286e-05, + "loss": 1.2458, + "step": 2689 + }, + { + "epoch": 1.0015008551383928, + "grad_norm": 0.16783635318279266, + "learning_rate": 1.978981833791555e-05, + "loss": 1.2481, + "step": 2690 + }, + { + "epoch": 1.0018731602890019, + "grad_norm": 0.1700548678636551, + "learning_rate": 1.9789570404936805e-05, + "loss": 1.2429, + "step": 2691 + }, + { + "epoch": 1.002245465439611, + "grad_norm": 0.1691717803478241, + "learning_rate": 1.9789322327366722e-05, + "loss": 1.245, + "step": 2692 + }, + { + "epoch": 1.00261777059022, + "grad_norm": 0.16553466022014618, + "learning_rate": 1.9789074105208962e-05, + "loss": 1.2288, + "step": 2693 + }, + { + "epoch": 1.002990075740829, + "grad_norm": 0.17240577936172485, + "learning_rate": 1.9788825738467194e-05, + "loss": 1.2467, + "step": 2694 + }, + { + "epoch": 1.0033623808914383, + "grad_norm": 0.17340059578418732, + "learning_rate": 1.9788577227145084e-05, + "loss": 1.2392, + "step": 2695 + }, + { + "epoch": 1.0037346860420473, + "grad_norm": 0.1701386272907257, + "learning_rate": 1.97883285712463e-05, + "loss": 1.215, + "step": 2696 + }, + { + "epoch": 1.0041069911926563, + "grad_norm": 0.17179681360721588, + "learning_rate": 1.9788079770774517e-05, + "loss": 1.2226, + "step": 2697 + }, + { + "epoch": 1.0044792963432654, + "grad_norm": 0.1747562438249588, + "learning_rate": 1.9787830825733415e-05, + "loss": 1.2348, + "step": 2698 + }, + { + "epoch": 1.0048516014938744, + "grad_norm": 0.1735781580209732, + "learning_rate": 1.9787581736126663e-05, + "loss": 1.2438, + "step": 2699 + }, + { + "epoch": 1.0052239066444835, + "grad_norm": 0.16927596926689148, + "learning_rate": 1.9787332501957942e-05, + "loss": 1.2388, + "step": 2700 + }, + { + "epoch": 1.0055962117950925, + "grad_norm": 0.17900878190994263, + "learning_rate": 1.9787083123230933e-05, + "loss": 1.2479, + "step": 2701 + }, + { + "epoch": 1.0059685169457016, + "grad_norm": 0.1668340116739273, + "learning_rate": 1.9786833599949325e-05, + "loss": 1.2278, + "step": 2702 + }, + { + "epoch": 1.0063408220963106, + "grad_norm": 0.17158925533294678, + "learning_rate": 1.9786583932116795e-05, + "loss": 1.2364, + "step": 2703 + }, + { + "epoch": 1.0067131272469199, + "grad_norm": 0.17500467598438263, + "learning_rate": 1.9786334119737035e-05, + "loss": 1.2199, + "step": 2704 + }, + { + "epoch": 1.007085432397529, + "grad_norm": 0.1702401041984558, + "learning_rate": 1.9786084162813735e-05, + "loss": 1.252, + "step": 2705 + }, + { + "epoch": 1.007457737548138, + "grad_norm": 0.1826189160346985, + "learning_rate": 1.9785834061350585e-05, + "loss": 1.2441, + "step": 2706 + }, + { + "epoch": 1.007830042698747, + "grad_norm": 0.16807404160499573, + "learning_rate": 1.9785583815351285e-05, + "loss": 1.2245, + "step": 2707 + }, + { + "epoch": 1.008202347849356, + "grad_norm": 0.18120057880878448, + "learning_rate": 1.978533342481952e-05, + "loss": 1.2484, + "step": 2708 + }, + { + "epoch": 1.008574652999965, + "grad_norm": 0.16986186802387238, + "learning_rate": 1.9785082889759e-05, + "loss": 1.2615, + "step": 2709 + }, + { + "epoch": 1.0089469581505741, + "grad_norm": 0.18247941136360168, + "learning_rate": 1.9784832210173413e-05, + "loss": 1.2235, + "step": 2710 + }, + { + "epoch": 1.0093192633011832, + "grad_norm": 0.177805557847023, + "learning_rate": 1.9784581386066472e-05, + "loss": 1.2425, + "step": 2711 + }, + { + "epoch": 1.0096915684517922, + "grad_norm": 0.1815112829208374, + "learning_rate": 1.978433041744188e-05, + "loss": 1.2465, + "step": 2712 + }, + { + "epoch": 1.0100638736024015, + "grad_norm": 0.189849853515625, + "learning_rate": 1.9784079304303337e-05, + "loss": 1.2374, + "step": 2713 + }, + { + "epoch": 1.0104361787530105, + "grad_norm": 0.16993628442287445, + "learning_rate": 1.978382804665456e-05, + "loss": 1.2462, + "step": 2714 + }, + { + "epoch": 1.0108084839036195, + "grad_norm": 0.1785373091697693, + "learning_rate": 1.9783576644499257e-05, + "loss": 1.2423, + "step": 2715 + }, + { + "epoch": 1.0111807890542286, + "grad_norm": 0.19358791410923004, + "learning_rate": 1.978332509784114e-05, + "loss": 1.2281, + "step": 2716 + }, + { + "epoch": 1.0115530942048376, + "grad_norm": 0.16849210858345032, + "learning_rate": 1.9783073406683926e-05, + "loss": 1.2245, + "step": 2717 + }, + { + "epoch": 1.0119253993554467, + "grad_norm": 0.1804894357919693, + "learning_rate": 1.978282157103133e-05, + "loss": 1.2395, + "step": 2718 + }, + { + "epoch": 1.0122977045060557, + "grad_norm": 0.16764596104621887, + "learning_rate": 1.9782569590887075e-05, + "loss": 1.2343, + "step": 2719 + }, + { + "epoch": 1.0126700096566648, + "grad_norm": 0.17664223909378052, + "learning_rate": 1.978231746625488e-05, + "loss": 1.2432, + "step": 2720 + }, + { + "epoch": 1.0130423148072738, + "grad_norm": 0.17679232358932495, + "learning_rate": 1.978206519713847e-05, + "loss": 1.2355, + "step": 2721 + }, + { + "epoch": 1.013414619957883, + "grad_norm": 0.17845486104488373, + "learning_rate": 1.9781812783541574e-05, + "loss": 1.2529, + "step": 2722 + }, + { + "epoch": 1.013786925108492, + "grad_norm": 0.17717497050762177, + "learning_rate": 1.9781560225467913e-05, + "loss": 1.2383, + "step": 2723 + }, + { + "epoch": 1.0141592302591012, + "grad_norm": 0.17647191882133484, + "learning_rate": 1.9781307522921224e-05, + "loss": 1.2315, + "step": 2724 + }, + { + "epoch": 1.0145315354097102, + "grad_norm": 0.177278071641922, + "learning_rate": 1.9781054675905235e-05, + "loss": 1.2385, + "step": 2725 + }, + { + "epoch": 1.0149038405603192, + "grad_norm": 0.18225128948688507, + "learning_rate": 1.9780801684423684e-05, + "loss": 1.2356, + "step": 2726 + }, + { + "epoch": 1.0152761457109283, + "grad_norm": 0.1841052770614624, + "learning_rate": 1.9780548548480304e-05, + "loss": 1.2404, + "step": 2727 + }, + { + "epoch": 1.0156484508615373, + "grad_norm": 0.1847550868988037, + "learning_rate": 1.9780295268078834e-05, + "loss": 1.2343, + "step": 2728 + }, + { + "epoch": 1.0160207560121464, + "grad_norm": 0.18073725700378418, + "learning_rate": 1.9780041843223023e-05, + "loss": 1.2507, + "step": 2729 + }, + { + "epoch": 1.0163930611627556, + "grad_norm": 0.17755703628063202, + "learning_rate": 1.97797882739166e-05, + "loss": 1.2337, + "step": 2730 + }, + { + "epoch": 1.0167653663133647, + "grad_norm": 0.1812552660703659, + "learning_rate": 1.9779534560163324e-05, + "loss": 1.2212, + "step": 2731 + }, + { + "epoch": 1.0171376714639737, + "grad_norm": 0.18241752684116364, + "learning_rate": 1.9779280701966935e-05, + "loss": 1.2501, + "step": 2732 + }, + { + "epoch": 1.0175099766145828, + "grad_norm": 0.1742631196975708, + "learning_rate": 1.9779026699331183e-05, + "loss": 1.2476, + "step": 2733 + }, + { + "epoch": 1.0178822817651918, + "grad_norm": 0.17878419160842896, + "learning_rate": 1.9778772552259818e-05, + "loss": 1.2375, + "step": 2734 + }, + { + "epoch": 1.0182545869158008, + "grad_norm": 0.18265032768249512, + "learning_rate": 1.9778518260756602e-05, + "loss": 1.2443, + "step": 2735 + }, + { + "epoch": 1.0186268920664099, + "grad_norm": 0.17354628443717957, + "learning_rate": 1.977826382482528e-05, + "loss": 1.2458, + "step": 2736 + }, + { + "epoch": 1.018999197217019, + "grad_norm": 0.17573866248130798, + "learning_rate": 1.9778009244469617e-05, + "loss": 1.2469, + "step": 2737 + }, + { + "epoch": 1.019371502367628, + "grad_norm": 0.18138283491134644, + "learning_rate": 1.977775451969337e-05, + "loss": 1.2296, + "step": 2738 + }, + { + "epoch": 1.0197438075182372, + "grad_norm": 0.18564942479133606, + "learning_rate": 1.9777499650500303e-05, + "loss": 1.2492, + "step": 2739 + }, + { + "epoch": 1.0201161126688463, + "grad_norm": 0.18133315443992615, + "learning_rate": 1.977724463689418e-05, + "loss": 1.2432, + "step": 2740 + }, + { + "epoch": 1.0204884178194553, + "grad_norm": 0.18659467995166779, + "learning_rate": 1.9776989478878764e-05, + "loss": 1.258, + "step": 2741 + }, + { + "epoch": 1.0208607229700644, + "grad_norm": 0.2271583080291748, + "learning_rate": 1.9776734176457833e-05, + "loss": 1.236, + "step": 2742 + }, + { + "epoch": 1.0212330281206734, + "grad_norm": 0.1641337126493454, + "learning_rate": 1.9776478729635146e-05, + "loss": 1.2299, + "step": 2743 + }, + { + "epoch": 1.0216053332712824, + "grad_norm": 0.18119634687900543, + "learning_rate": 1.9776223138414486e-05, + "loss": 1.2252, + "step": 2744 + }, + { + "epoch": 1.0219776384218915, + "grad_norm": 0.17801545560359955, + "learning_rate": 1.977596740279962e-05, + "loss": 1.2272, + "step": 2745 + }, + { + "epoch": 1.0223499435725005, + "grad_norm": 0.18631921708583832, + "learning_rate": 1.9775711522794333e-05, + "loss": 1.252, + "step": 2746 + }, + { + "epoch": 1.0227222487231096, + "grad_norm": 0.18302686512470245, + "learning_rate": 1.97754554984024e-05, + "loss": 1.2308, + "step": 2747 + }, + { + "epoch": 1.0230945538737188, + "grad_norm": 0.178566575050354, + "learning_rate": 1.97751993296276e-05, + "loss": 1.2492, + "step": 2748 + }, + { + "epoch": 1.0234668590243279, + "grad_norm": 0.18399080634117126, + "learning_rate": 1.977494301647372e-05, + "loss": 1.2451, + "step": 2749 + }, + { + "epoch": 1.023839164174937, + "grad_norm": 0.18315501511096954, + "learning_rate": 1.9774686558944544e-05, + "loss": 1.2505, + "step": 2750 + }, + { + "epoch": 1.024211469325546, + "grad_norm": 0.184551402926445, + "learning_rate": 1.9774429957043866e-05, + "loss": 1.2294, + "step": 2751 + }, + { + "epoch": 1.024583774476155, + "grad_norm": 0.45721930265426636, + "learning_rate": 1.9774173210775466e-05, + "loss": 1.2194, + "step": 2752 + }, + { + "epoch": 1.024956079626764, + "grad_norm": 0.1807864010334015, + "learning_rate": 1.9773916320143144e-05, + "loss": 1.249, + "step": 2753 + }, + { + "epoch": 1.025328384777373, + "grad_norm": 0.18623942136764526, + "learning_rate": 1.977365928515069e-05, + "loss": 1.2281, + "step": 2754 + }, + { + "epoch": 1.0257006899279821, + "grad_norm": 0.18158674240112305, + "learning_rate": 1.97734021058019e-05, + "loss": 1.2333, + "step": 2755 + }, + { + "epoch": 1.0260729950785914, + "grad_norm": 0.1756281554698944, + "learning_rate": 1.9773144782100576e-05, + "loss": 1.2503, + "step": 2756 + }, + { + "epoch": 1.0264453002292004, + "grad_norm": 0.1774439960718155, + "learning_rate": 1.9772887314050516e-05, + "loss": 1.2371, + "step": 2757 + }, + { + "epoch": 1.0268176053798095, + "grad_norm": 0.17435887455940247, + "learning_rate": 1.9772629701655524e-05, + "loss": 1.2379, + "step": 2758 + }, + { + "epoch": 1.0271899105304185, + "grad_norm": 0.18114317953586578, + "learning_rate": 1.9772371944919406e-05, + "loss": 1.2389, + "step": 2759 + }, + { + "epoch": 1.0275622156810276, + "grad_norm": 0.1751965880393982, + "learning_rate": 1.9772114043845968e-05, + "loss": 1.2537, + "step": 2760 + }, + { + "epoch": 1.0279345208316366, + "grad_norm": 0.18120123445987701, + "learning_rate": 1.977185599843902e-05, + "loss": 1.2308, + "step": 2761 + }, + { + "epoch": 1.0283068259822457, + "grad_norm": 0.18496285378932953, + "learning_rate": 1.9771597808702366e-05, + "loss": 1.2399, + "step": 2762 + }, + { + "epoch": 1.0286791311328547, + "grad_norm": 0.17509673535823822, + "learning_rate": 1.9771339474639833e-05, + "loss": 1.2205, + "step": 2763 + }, + { + "epoch": 1.0290514362834637, + "grad_norm": 0.17925499379634857, + "learning_rate": 1.9771080996255226e-05, + "loss": 1.2414, + "step": 2764 + }, + { + "epoch": 1.029423741434073, + "grad_norm": 0.17115138471126556, + "learning_rate": 1.9770822373552362e-05, + "loss": 1.2282, + "step": 2765 + }, + { + "epoch": 1.029796046584682, + "grad_norm": 0.1737300306558609, + "learning_rate": 1.9770563606535068e-05, + "loss": 1.2417, + "step": 2766 + }, + { + "epoch": 1.030168351735291, + "grad_norm": 0.17681419849395752, + "learning_rate": 1.9770304695207164e-05, + "loss": 1.2274, + "step": 2767 + }, + { + "epoch": 1.0305406568859001, + "grad_norm": 0.1788710057735443, + "learning_rate": 1.9770045639572473e-05, + "loss": 1.2319, + "step": 2768 + }, + { + "epoch": 1.0309129620365092, + "grad_norm": 0.17474333941936493, + "learning_rate": 1.976978643963482e-05, + "loss": 1.2386, + "step": 2769 + }, + { + "epoch": 1.0312852671871182, + "grad_norm": 0.18028199672698975, + "learning_rate": 1.9769527095398033e-05, + "loss": 1.2426, + "step": 2770 + }, + { + "epoch": 1.0316575723377273, + "grad_norm": 0.17821404337882996, + "learning_rate": 1.9769267606865944e-05, + "loss": 1.256, + "step": 2771 + }, + { + "epoch": 1.0320298774883363, + "grad_norm": 0.17664535343647003, + "learning_rate": 1.976900797404239e-05, + "loss": 1.2397, + "step": 2772 + }, + { + "epoch": 1.0324021826389453, + "grad_norm": 0.17644058167934418, + "learning_rate": 1.9768748196931197e-05, + "loss": 1.259, + "step": 2773 + }, + { + "epoch": 1.0327744877895546, + "grad_norm": 0.18605953454971313, + "learning_rate": 1.976848827553621e-05, + "loss": 1.2336, + "step": 2774 + }, + { + "epoch": 1.0331467929401636, + "grad_norm": 0.17775902152061462, + "learning_rate": 1.9768228209861257e-05, + "loss": 1.2309, + "step": 2775 + }, + { + "epoch": 1.0335190980907727, + "grad_norm": 0.18057388067245483, + "learning_rate": 1.976796799991019e-05, + "loss": 1.231, + "step": 2776 + }, + { + "epoch": 1.0338914032413817, + "grad_norm": 0.17607726156711578, + "learning_rate": 1.9767707645686852e-05, + "loss": 1.2189, + "step": 2777 + }, + { + "epoch": 1.0342637083919908, + "grad_norm": 0.19016438722610474, + "learning_rate": 1.9767447147195083e-05, + "loss": 1.2416, + "step": 2778 + }, + { + "epoch": 1.0346360135425998, + "grad_norm": 0.1789761781692505, + "learning_rate": 1.976718650443873e-05, + "loss": 1.2498, + "step": 2779 + }, + { + "epoch": 1.0350083186932089, + "grad_norm": 0.1939614713191986, + "learning_rate": 1.9766925717421647e-05, + "loss": 1.2518, + "step": 2780 + }, + { + "epoch": 1.035380623843818, + "grad_norm": 0.1895962953567505, + "learning_rate": 1.9766664786147687e-05, + "loss": 1.2613, + "step": 2781 + }, + { + "epoch": 1.0357529289944272, + "grad_norm": 0.21478445827960968, + "learning_rate": 1.97664037106207e-05, + "loss": 1.235, + "step": 2782 + }, + { + "epoch": 1.0361252341450362, + "grad_norm": 0.17898200452327728, + "learning_rate": 1.976614249084454e-05, + "loss": 1.2397, + "step": 2783 + }, + { + "epoch": 1.0364975392956453, + "grad_norm": 0.18122392892837524, + "learning_rate": 1.976588112682307e-05, + "loss": 1.2253, + "step": 2784 + }, + { + "epoch": 1.0368698444462543, + "grad_norm": 0.17120665311813354, + "learning_rate": 1.9765619618560146e-05, + "loss": 1.2428, + "step": 2785 + }, + { + "epoch": 1.0372421495968633, + "grad_norm": 0.1834287941455841, + "learning_rate": 1.9765357966059638e-05, + "loss": 1.2253, + "step": 2786 + }, + { + "epoch": 1.0376144547474724, + "grad_norm": 0.1814526915550232, + "learning_rate": 1.9765096169325404e-05, + "loss": 1.242, + "step": 2787 + }, + { + "epoch": 1.0379867598980814, + "grad_norm": 0.18494722247123718, + "learning_rate": 1.9764834228361313e-05, + "loss": 1.2501, + "step": 2788 + }, + { + "epoch": 1.0383590650486905, + "grad_norm": 0.17375454306602478, + "learning_rate": 1.9764572143171232e-05, + "loss": 1.2292, + "step": 2789 + }, + { + "epoch": 1.0387313701992995, + "grad_norm": 0.18002989888191223, + "learning_rate": 1.9764309913759033e-05, + "loss": 1.2331, + "step": 2790 + }, + { + "epoch": 1.0391036753499088, + "grad_norm": 0.18305903673171997, + "learning_rate": 1.976404754012859e-05, + "loss": 1.2479, + "step": 2791 + }, + { + "epoch": 1.0394759805005178, + "grad_norm": 0.18125832080841064, + "learning_rate": 1.9763785022283778e-05, + "loss": 1.2238, + "step": 2792 + }, + { + "epoch": 1.0398482856511269, + "grad_norm": 0.1753983050584793, + "learning_rate": 1.976352236022847e-05, + "loss": 1.2198, + "step": 2793 + }, + { + "epoch": 1.040220590801736, + "grad_norm": 0.17953644692897797, + "learning_rate": 1.9763259553966553e-05, + "loss": 1.2477, + "step": 2794 + }, + { + "epoch": 1.040592895952345, + "grad_norm": 0.17996378242969513, + "learning_rate": 1.9762996603501908e-05, + "loss": 1.2696, + "step": 2795 + }, + { + "epoch": 1.040965201102954, + "grad_norm": 0.17698989808559418, + "learning_rate": 1.976273350883841e-05, + "loss": 1.248, + "step": 2796 + }, + { + "epoch": 1.041337506253563, + "grad_norm": 0.17306332290172577, + "learning_rate": 1.9762470269979955e-05, + "loss": 1.2555, + "step": 2797 + }, + { + "epoch": 1.041709811404172, + "grad_norm": 0.17857332527637482, + "learning_rate": 1.9762206886930423e-05, + "loss": 1.2386, + "step": 2798 + }, + { + "epoch": 1.0420821165547811, + "grad_norm": 0.18211491405963898, + "learning_rate": 1.9761943359693712e-05, + "loss": 1.2458, + "step": 2799 + }, + { + "epoch": 1.0424544217053904, + "grad_norm": 0.1814602017402649, + "learning_rate": 1.9761679688273708e-05, + "loss": 1.2442, + "step": 2800 + }, + { + "epoch": 1.0428267268559994, + "grad_norm": 0.1816301792860031, + "learning_rate": 1.976141587267431e-05, + "loss": 1.2377, + "step": 2801 + }, + { + "epoch": 1.0431990320066085, + "grad_norm": 0.1897687017917633, + "learning_rate": 1.976115191289941e-05, + "loss": 1.2532, + "step": 2802 + }, + { + "epoch": 1.0435713371572175, + "grad_norm": 0.174024760723114, + "learning_rate": 1.9760887808952908e-05, + "loss": 1.223, + "step": 2803 + }, + { + "epoch": 1.0439436423078265, + "grad_norm": 0.19265376031398773, + "learning_rate": 1.9760623560838707e-05, + "loss": 1.2442, + "step": 2804 + }, + { + "epoch": 1.0443159474584356, + "grad_norm": 0.17987599968910217, + "learning_rate": 1.9760359168560708e-05, + "loss": 1.2395, + "step": 2805 + }, + { + "epoch": 1.0446882526090446, + "grad_norm": 0.1701204478740692, + "learning_rate": 1.976009463212282e-05, + "loss": 1.2306, + "step": 2806 + }, + { + "epoch": 1.0450605577596537, + "grad_norm": 0.1814124584197998, + "learning_rate": 1.975982995152894e-05, + "loss": 1.2451, + "step": 2807 + }, + { + "epoch": 1.0454328629102627, + "grad_norm": 0.181508406996727, + "learning_rate": 1.9759565126782988e-05, + "loss": 1.2415, + "step": 2808 + }, + { + "epoch": 1.045805168060872, + "grad_norm": 0.167762890458107, + "learning_rate": 1.975930015788887e-05, + "loss": 1.2343, + "step": 2809 + }, + { + "epoch": 1.046177473211481, + "grad_norm": 0.17831118404865265, + "learning_rate": 1.9759035044850504e-05, + "loss": 1.2163, + "step": 2810 + }, + { + "epoch": 1.04654977836209, + "grad_norm": 0.18151675164699554, + "learning_rate": 1.9758769787671804e-05, + "loss": 1.2255, + "step": 2811 + }, + { + "epoch": 1.046922083512699, + "grad_norm": 0.18939217925071716, + "learning_rate": 1.9758504386356682e-05, + "loss": 1.2381, + "step": 2812 + }, + { + "epoch": 1.0472943886633082, + "grad_norm": 0.1697840541601181, + "learning_rate": 1.9758238840909066e-05, + "loss": 1.2406, + "step": 2813 + }, + { + "epoch": 1.0476666938139172, + "grad_norm": 0.17666485905647278, + "learning_rate": 1.975797315133287e-05, + "loss": 1.2312, + "step": 2814 + }, + { + "epoch": 1.0480389989645262, + "grad_norm": 0.1959092617034912, + "learning_rate": 1.975770731763203e-05, + "loss": 1.2444, + "step": 2815 + }, + { + "epoch": 1.0484113041151353, + "grad_norm": 0.17238794267177582, + "learning_rate": 1.9757441339810462e-05, + "loss": 1.227, + "step": 2816 + }, + { + "epoch": 1.0487836092657445, + "grad_norm": 0.17401093244552612, + "learning_rate": 1.9757175217872096e-05, + "loss": 1.2475, + "step": 2817 + }, + { + "epoch": 1.0491559144163536, + "grad_norm": 0.1765962839126587, + "learning_rate": 1.9756908951820866e-05, + "loss": 1.2348, + "step": 2818 + }, + { + "epoch": 1.0495282195669626, + "grad_norm": 0.17260397970676422, + "learning_rate": 1.9756642541660702e-05, + "loss": 1.2509, + "step": 2819 + }, + { + "epoch": 1.0499005247175717, + "grad_norm": 0.16606584191322327, + "learning_rate": 1.975637598739554e-05, + "loss": 1.2384, + "step": 2820 + }, + { + "epoch": 1.0502728298681807, + "grad_norm": 0.17037144303321838, + "learning_rate": 1.975610928902932e-05, + "loss": 1.2382, + "step": 2821 + }, + { + "epoch": 1.0506451350187898, + "grad_norm": 0.1816329061985016, + "learning_rate": 1.9755842446565975e-05, + "loss": 1.2329, + "step": 2822 + }, + { + "epoch": 1.0510174401693988, + "grad_norm": 0.18435317277908325, + "learning_rate": 1.975557546000945e-05, + "loss": 1.233, + "step": 2823 + }, + { + "epoch": 1.0513897453200078, + "grad_norm": 0.16666759550571442, + "learning_rate": 1.975530832936369e-05, + "loss": 1.2345, + "step": 2824 + }, + { + "epoch": 1.0517620504706169, + "grad_norm": 0.18911749124526978, + "learning_rate": 1.9755041054632634e-05, + "loss": 1.2416, + "step": 2825 + }, + { + "epoch": 1.0521343556212261, + "grad_norm": 0.17855370044708252, + "learning_rate": 1.9754773635820236e-05, + "loss": 1.246, + "step": 2826 + }, + { + "epoch": 1.0525066607718352, + "grad_norm": 0.17206569015979767, + "learning_rate": 1.9754506072930443e-05, + "loss": 1.2326, + "step": 2827 + }, + { + "epoch": 1.0528789659224442, + "grad_norm": 0.180022731423378, + "learning_rate": 1.9754238365967207e-05, + "loss": 1.2389, + "step": 2828 + }, + { + "epoch": 1.0532512710730533, + "grad_norm": 0.17861510813236237, + "learning_rate": 1.9753970514934485e-05, + "loss": 1.2355, + "step": 2829 + }, + { + "epoch": 1.0536235762236623, + "grad_norm": 0.17671099305152893, + "learning_rate": 1.9753702519836228e-05, + "loss": 1.2487, + "step": 2830 + }, + { + "epoch": 1.0539958813742714, + "grad_norm": 0.17241548001766205, + "learning_rate": 1.97534343806764e-05, + "loss": 1.2386, + "step": 2831 + }, + { + "epoch": 1.0543681865248804, + "grad_norm": 0.1827198565006256, + "learning_rate": 1.9753166097458957e-05, + "loss": 1.2326, + "step": 2832 + }, + { + "epoch": 1.0547404916754894, + "grad_norm": 0.17715848982334137, + "learning_rate": 1.975289767018786e-05, + "loss": 1.2346, + "step": 2833 + }, + { + "epoch": 1.0551127968260985, + "grad_norm": 0.17723973095417023, + "learning_rate": 1.975262909886708e-05, + "loss": 1.2426, + "step": 2834 + }, + { + "epoch": 1.0554851019767078, + "grad_norm": 0.1761004477739334, + "learning_rate": 1.975236038350058e-05, + "loss": 1.2411, + "step": 2835 + }, + { + "epoch": 1.0558574071273168, + "grad_norm": 0.18172328174114227, + "learning_rate": 1.9752091524092324e-05, + "loss": 1.2237, + "step": 2836 + }, + { + "epoch": 1.0562297122779258, + "grad_norm": 0.16737525165081024, + "learning_rate": 1.9751822520646297e-05, + "loss": 1.2503, + "step": 2837 + }, + { + "epoch": 1.0566020174285349, + "grad_norm": 0.1774168312549591, + "learning_rate": 1.9751553373166454e-05, + "loss": 1.2195, + "step": 2838 + }, + { + "epoch": 1.056974322579144, + "grad_norm": 0.17864681780338287, + "learning_rate": 1.9751284081656786e-05, + "loss": 1.2407, + "step": 2839 + }, + { + "epoch": 1.057346627729753, + "grad_norm": 0.1730041354894638, + "learning_rate": 1.975101464612126e-05, + "loss": 1.2441, + "step": 2840 + }, + { + "epoch": 1.057718932880362, + "grad_norm": 0.17690402269363403, + "learning_rate": 1.9750745066563864e-05, + "loss": 1.2445, + "step": 2841 + }, + { + "epoch": 1.058091238030971, + "grad_norm": 0.1674228459596634, + "learning_rate": 1.9750475342988572e-05, + "loss": 1.2313, + "step": 2842 + }, + { + "epoch": 1.0584635431815803, + "grad_norm": 0.1728244572877884, + "learning_rate": 1.9750205475399373e-05, + "loss": 1.2261, + "step": 2843 + }, + { + "epoch": 1.0588358483321894, + "grad_norm": 0.1766444444656372, + "learning_rate": 1.974993546380025e-05, + "loss": 1.2329, + "step": 2844 + }, + { + "epoch": 1.0592081534827984, + "grad_norm": 0.17906500399112701, + "learning_rate": 1.9749665308195194e-05, + "loss": 1.237, + "step": 2845 + }, + { + "epoch": 1.0595804586334074, + "grad_norm": 0.17800427973270416, + "learning_rate": 1.974939500858819e-05, + "loss": 1.2405, + "step": 2846 + }, + { + "epoch": 1.0599527637840165, + "grad_norm": 0.17567503452301025, + "learning_rate": 1.9749124564983237e-05, + "loss": 1.241, + "step": 2847 + }, + { + "epoch": 1.0603250689346255, + "grad_norm": 0.172315314412117, + "learning_rate": 1.9748853977384326e-05, + "loss": 1.2331, + "step": 2848 + }, + { + "epoch": 1.0606973740852346, + "grad_norm": 0.17983998358249664, + "learning_rate": 1.974858324579545e-05, + "loss": 1.2449, + "step": 2849 + }, + { + "epoch": 1.0610696792358436, + "grad_norm": 0.17935794591903687, + "learning_rate": 1.9748312370220613e-05, + "loss": 1.2357, + "step": 2850 + }, + { + "epoch": 1.0614419843864527, + "grad_norm": 0.1763681173324585, + "learning_rate": 1.9748041350663817e-05, + "loss": 1.2498, + "step": 2851 + }, + { + "epoch": 1.061814289537062, + "grad_norm": 0.17987823486328125, + "learning_rate": 1.9747770187129055e-05, + "loss": 1.2495, + "step": 2852 + }, + { + "epoch": 1.062186594687671, + "grad_norm": 0.17571550607681274, + "learning_rate": 1.9747498879620342e-05, + "loss": 1.2263, + "step": 2853 + }, + { + "epoch": 1.06255889983828, + "grad_norm": 0.17935210466384888, + "learning_rate": 1.9747227428141687e-05, + "loss": 1.2317, + "step": 2854 + }, + { + "epoch": 1.062931204988889, + "grad_norm": 0.16637402772903442, + "learning_rate": 1.9746955832697094e-05, + "loss": 1.2329, + "step": 2855 + }, + { + "epoch": 1.063303510139498, + "grad_norm": 0.17255236208438873, + "learning_rate": 1.974668409329057e-05, + "loss": 1.2353, + "step": 2856 + }, + { + "epoch": 1.0636758152901071, + "grad_norm": 0.17514565587043762, + "learning_rate": 1.9746412209926132e-05, + "loss": 1.2303, + "step": 2857 + }, + { + "epoch": 1.0640481204407162, + "grad_norm": 0.17487487196922302, + "learning_rate": 1.9746140182607805e-05, + "loss": 1.2199, + "step": 2858 + }, + { + "epoch": 1.0644204255913252, + "grad_norm": 0.16520456969738007, + "learning_rate": 1.9745868011339592e-05, + "loss": 1.2332, + "step": 2859 + }, + { + "epoch": 1.0647927307419343, + "grad_norm": 0.1774507462978363, + "learning_rate": 1.9745595696125526e-05, + "loss": 1.2407, + "step": 2860 + }, + { + "epoch": 1.0651650358925435, + "grad_norm": 0.16631199419498444, + "learning_rate": 1.974532323696962e-05, + "loss": 1.2345, + "step": 2861 + }, + { + "epoch": 1.0655373410431526, + "grad_norm": 0.17234042286872864, + "learning_rate": 1.97450506338759e-05, + "loss": 1.2222, + "step": 2862 + }, + { + "epoch": 1.0659096461937616, + "grad_norm": 0.18009594082832336, + "learning_rate": 1.97447778868484e-05, + "loss": 1.2411, + "step": 2863 + }, + { + "epoch": 1.0662819513443706, + "grad_norm": 0.1718275547027588, + "learning_rate": 1.9744504995891135e-05, + "loss": 1.2352, + "step": 2864 + }, + { + "epoch": 1.0666542564949797, + "grad_norm": 0.16826453804969788, + "learning_rate": 1.9744231961008147e-05, + "loss": 1.2314, + "step": 2865 + }, + { + "epoch": 1.0670265616455887, + "grad_norm": 0.17893622815608978, + "learning_rate": 1.9743958782203462e-05, + "loss": 1.2282, + "step": 2866 + }, + { + "epoch": 1.0673988667961978, + "grad_norm": 0.17840996384620667, + "learning_rate": 1.9743685459481118e-05, + "loss": 1.2473, + "step": 2867 + }, + { + "epoch": 1.0677711719468068, + "grad_norm": 0.1669299453496933, + "learning_rate": 1.974341199284515e-05, + "loss": 1.2345, + "step": 2868 + }, + { + "epoch": 1.0681434770974159, + "grad_norm": 0.1682414710521698, + "learning_rate": 1.97431383822996e-05, + "loss": 1.2212, + "step": 2869 + }, + { + "epoch": 1.0685157822480251, + "grad_norm": 0.17519889771938324, + "learning_rate": 1.974286462784851e-05, + "loss": 1.2377, + "step": 2870 + }, + { + "epoch": 1.0688880873986342, + "grad_norm": 0.16797272861003876, + "learning_rate": 1.9742590729495917e-05, + "loss": 1.2078, + "step": 2871 + }, + { + "epoch": 1.0692603925492432, + "grad_norm": 0.17429950833320618, + "learning_rate": 1.9742316687245873e-05, + "loss": 1.2334, + "step": 2872 + }, + { + "epoch": 1.0696326976998523, + "grad_norm": 0.1742522120475769, + "learning_rate": 1.974204250110242e-05, + "loss": 1.2474, + "step": 2873 + }, + { + "epoch": 1.0700050028504613, + "grad_norm": 0.17582035064697266, + "learning_rate": 1.9741768171069614e-05, + "loss": 1.241, + "step": 2874 + }, + { + "epoch": 1.0703773080010703, + "grad_norm": 0.17079758644104004, + "learning_rate": 1.9741493697151502e-05, + "loss": 1.2281, + "step": 2875 + }, + { + "epoch": 1.0707496131516794, + "grad_norm": 0.17140960693359375, + "learning_rate": 1.9741219079352142e-05, + "loss": 1.2393, + "step": 2876 + }, + { + "epoch": 1.0711219183022884, + "grad_norm": 0.1833873987197876, + "learning_rate": 1.9740944317675583e-05, + "loss": 1.2488, + "step": 2877 + }, + { + "epoch": 1.0714942234528975, + "grad_norm": 0.22053757309913635, + "learning_rate": 1.974066941212589e-05, + "loss": 1.2451, + "step": 2878 + }, + { + "epoch": 1.0718665286035067, + "grad_norm": 0.1896497905254364, + "learning_rate": 1.974039436270712e-05, + "loss": 1.2203, + "step": 2879 + }, + { + "epoch": 1.0722388337541158, + "grad_norm": 0.17092444002628326, + "learning_rate": 1.9740119169423337e-05, + "loss": 1.234, + "step": 2880 + }, + { + "epoch": 1.0726111389047248, + "grad_norm": 0.1865914762020111, + "learning_rate": 1.9739843832278604e-05, + "loss": 1.2252, + "step": 2881 + }, + { + "epoch": 1.0729834440553339, + "grad_norm": 0.1847282201051712, + "learning_rate": 1.973956835127699e-05, + "loss": 1.2307, + "step": 2882 + }, + { + "epoch": 1.073355749205943, + "grad_norm": 0.16904793679714203, + "learning_rate": 1.9739292726422565e-05, + "loss": 1.2399, + "step": 2883 + }, + { + "epoch": 1.073728054356552, + "grad_norm": 0.17725877463817596, + "learning_rate": 1.9739016957719393e-05, + "loss": 1.2425, + "step": 2884 + }, + { + "epoch": 1.074100359507161, + "grad_norm": 0.1759733408689499, + "learning_rate": 1.9738741045171556e-05, + "loss": 1.2151, + "step": 2885 + }, + { + "epoch": 1.07447266465777, + "grad_norm": 0.17010413110256195, + "learning_rate": 1.973846498878312e-05, + "loss": 1.2503, + "step": 2886 + }, + { + "epoch": 1.0748449698083793, + "grad_norm": 0.17372602224349976, + "learning_rate": 1.9738188788558174e-05, + "loss": 1.2469, + "step": 2887 + }, + { + "epoch": 1.0752172749589883, + "grad_norm": 0.17600518465042114, + "learning_rate": 1.9737912444500786e-05, + "loss": 1.2301, + "step": 2888 + }, + { + "epoch": 1.0755895801095974, + "grad_norm": 0.17310109734535217, + "learning_rate": 1.9737635956615044e-05, + "loss": 1.2356, + "step": 2889 + }, + { + "epoch": 1.0759618852602064, + "grad_norm": 0.1836235374212265, + "learning_rate": 1.973735932490503e-05, + "loss": 1.2501, + "step": 2890 + }, + { + "epoch": 1.0763341904108155, + "grad_norm": 0.16939985752105713, + "learning_rate": 1.9737082549374828e-05, + "loss": 1.2273, + "step": 2891 + }, + { + "epoch": 1.0767064955614245, + "grad_norm": 0.1679062396287918, + "learning_rate": 1.973680563002853e-05, + "loss": 1.2343, + "step": 2892 + }, + { + "epoch": 1.0770788007120335, + "grad_norm": 0.1816723644733429, + "learning_rate": 1.9736528566870223e-05, + "loss": 1.2332, + "step": 2893 + }, + { + "epoch": 1.0774511058626426, + "grad_norm": 0.17142869532108307, + "learning_rate": 1.9736251359904003e-05, + "loss": 1.2289, + "step": 2894 + }, + { + "epoch": 1.0778234110132519, + "grad_norm": 0.17428475618362427, + "learning_rate": 1.9735974009133957e-05, + "loss": 1.2244, + "step": 2895 + }, + { + "epoch": 1.078195716163861, + "grad_norm": 0.16701877117156982, + "learning_rate": 1.9735696514564188e-05, + "loss": 1.211, + "step": 2896 + }, + { + "epoch": 1.07856802131447, + "grad_norm": 0.17782089114189148, + "learning_rate": 1.9735418876198792e-05, + "loss": 1.2257, + "step": 2897 + }, + { + "epoch": 1.078940326465079, + "grad_norm": 0.1718931496143341, + "learning_rate": 1.973514109404187e-05, + "loss": 1.2393, + "step": 2898 + }, + { + "epoch": 1.079312631615688, + "grad_norm": 0.16768048703670502, + "learning_rate": 1.9734863168097526e-05, + "loss": 1.2331, + "step": 2899 + }, + { + "epoch": 1.079684936766297, + "grad_norm": 0.16998796164989471, + "learning_rate": 1.973458509836986e-05, + "loss": 1.226, + "step": 2900 + }, + { + "epoch": 1.080057241916906, + "grad_norm": 0.1749204695224762, + "learning_rate": 1.973430688486299e-05, + "loss": 1.2437, + "step": 2901 + }, + { + "epoch": 1.0804295470675152, + "grad_norm": 0.17514778673648834, + "learning_rate": 1.973402852758101e-05, + "loss": 1.2397, + "step": 2902 + }, + { + "epoch": 1.0808018522181242, + "grad_norm": 0.1695886254310608, + "learning_rate": 1.9733750026528046e-05, + "loss": 1.2261, + "step": 2903 + }, + { + "epoch": 1.0811741573687335, + "grad_norm": 0.16834037005901337, + "learning_rate": 1.9733471381708202e-05, + "loss": 1.2264, + "step": 2904 + }, + { + "epoch": 1.0815464625193425, + "grad_norm": 0.17187410593032837, + "learning_rate": 1.97331925931256e-05, + "loss": 1.229, + "step": 2905 + }, + { + "epoch": 1.0819187676699515, + "grad_norm": 0.17803004384040833, + "learning_rate": 1.973291366078435e-05, + "loss": 1.237, + "step": 2906 + }, + { + "epoch": 1.0822910728205606, + "grad_norm": 0.1769217699766159, + "learning_rate": 1.973263458468858e-05, + "loss": 1.2232, + "step": 2907 + }, + { + "epoch": 1.0826633779711696, + "grad_norm": 0.1921815276145935, + "learning_rate": 1.9732355364842404e-05, + "loss": 1.2533, + "step": 2908 + }, + { + "epoch": 1.0830356831217787, + "grad_norm": 0.17243653535842896, + "learning_rate": 1.973207600124995e-05, + "loss": 1.2333, + "step": 2909 + }, + { + "epoch": 1.0834079882723877, + "grad_norm": 0.1771405041217804, + "learning_rate": 1.9731796493915346e-05, + "loss": 1.2261, + "step": 2910 + }, + { + "epoch": 1.0837802934229968, + "grad_norm": 0.17933285236358643, + "learning_rate": 1.973151684284272e-05, + "loss": 1.243, + "step": 2911 + }, + { + "epoch": 1.0841525985736058, + "grad_norm": 0.16889271140098572, + "learning_rate": 1.9731237048036197e-05, + "loss": 1.227, + "step": 2912 + }, + { + "epoch": 1.084524903724215, + "grad_norm": 0.17501141130924225, + "learning_rate": 1.9730957109499917e-05, + "loss": 1.2439, + "step": 2913 + }, + { + "epoch": 1.084897208874824, + "grad_norm": 0.18403035402297974, + "learning_rate": 1.973067702723801e-05, + "loss": 1.2383, + "step": 2914 + }, + { + "epoch": 1.0852695140254331, + "grad_norm": 0.169046550989151, + "learning_rate": 1.9730396801254614e-05, + "loss": 1.2363, + "step": 2915 + }, + { + "epoch": 1.0856418191760422, + "grad_norm": 0.1726294606924057, + "learning_rate": 1.973011643155387e-05, + "loss": 1.2531, + "step": 2916 + }, + { + "epoch": 1.0860141243266512, + "grad_norm": 0.1682724505662918, + "learning_rate": 1.9729835918139914e-05, + "loss": 1.2225, + "step": 2917 + }, + { + "epoch": 1.0863864294772603, + "grad_norm": 0.2326967865228653, + "learning_rate": 1.9729555261016894e-05, + "loss": 1.2361, + "step": 2918 + }, + { + "epoch": 1.0867587346278693, + "grad_norm": 0.1621241271495819, + "learning_rate": 1.972927446018895e-05, + "loss": 1.2233, + "step": 2919 + }, + { + "epoch": 1.0871310397784784, + "grad_norm": 0.1713317185640335, + "learning_rate": 1.9728993515660236e-05, + "loss": 1.2284, + "step": 2920 + }, + { + "epoch": 1.0875033449290874, + "grad_norm": 0.1748856008052826, + "learning_rate": 1.9728712427434898e-05, + "loss": 1.2337, + "step": 2921 + }, + { + "epoch": 1.0878756500796967, + "grad_norm": 0.17016327381134033, + "learning_rate": 1.972843119551709e-05, + "loss": 1.2292, + "step": 2922 + }, + { + "epoch": 1.0882479552303057, + "grad_norm": 0.17471212148666382, + "learning_rate": 1.9728149819910958e-05, + "loss": 1.231, + "step": 2923 + }, + { + "epoch": 1.0886202603809148, + "grad_norm": 0.16408810019493103, + "learning_rate": 1.972786830062067e-05, + "loss": 1.2305, + "step": 2924 + }, + { + "epoch": 1.0889925655315238, + "grad_norm": 0.17399826645851135, + "learning_rate": 1.9727586637650373e-05, + "loss": 1.2377, + "step": 2925 + }, + { + "epoch": 1.0893648706821328, + "grad_norm": 0.18122164905071259, + "learning_rate": 1.9727304831004232e-05, + "loss": 1.2378, + "step": 2926 + }, + { + "epoch": 1.0897371758327419, + "grad_norm": 0.17349712550640106, + "learning_rate": 1.9727022880686413e-05, + "loss": 1.2237, + "step": 2927 + }, + { + "epoch": 1.090109480983351, + "grad_norm": 0.18894222378730774, + "learning_rate": 1.9726740786701075e-05, + "loss": 1.2254, + "step": 2928 + }, + { + "epoch": 1.09048178613396, + "grad_norm": 0.184165820479393, + "learning_rate": 1.9726458549052384e-05, + "loss": 1.2344, + "step": 2929 + }, + { + "epoch": 1.090854091284569, + "grad_norm": 0.18154264986515045, + "learning_rate": 1.9726176167744513e-05, + "loss": 1.225, + "step": 2930 + }, + { + "epoch": 1.0912263964351783, + "grad_norm": 0.17851291596889496, + "learning_rate": 1.972589364278163e-05, + "loss": 1.237, + "step": 2931 + }, + { + "epoch": 1.0915987015857873, + "grad_norm": 0.17904578149318695, + "learning_rate": 1.9725610974167907e-05, + "loss": 1.2378, + "step": 2932 + }, + { + "epoch": 1.0919710067363964, + "grad_norm": 0.16910214722156525, + "learning_rate": 1.972532816190752e-05, + "loss": 1.2342, + "step": 2933 + }, + { + "epoch": 1.0923433118870054, + "grad_norm": 0.17645606398582458, + "learning_rate": 1.972504520600465e-05, + "loss": 1.23, + "step": 2934 + }, + { + "epoch": 1.0927156170376144, + "grad_norm": 0.16807736456394196, + "learning_rate": 1.9724762106463467e-05, + "loss": 1.2305, + "step": 2935 + }, + { + "epoch": 1.0930879221882235, + "grad_norm": 0.20768019556999207, + "learning_rate": 1.972447886328816e-05, + "loss": 1.2291, + "step": 2936 + }, + { + "epoch": 1.0934602273388325, + "grad_norm": 0.1863759607076645, + "learning_rate": 1.9724195476482914e-05, + "loss": 1.2371, + "step": 2937 + }, + { + "epoch": 1.0938325324894416, + "grad_norm": 0.17857873439788818, + "learning_rate": 1.9723911946051905e-05, + "loss": 1.2179, + "step": 2938 + }, + { + "epoch": 1.0942048376400506, + "grad_norm": 0.17775695025920868, + "learning_rate": 1.972362827199933e-05, + "loss": 1.2498, + "step": 2939 + }, + { + "epoch": 1.0945771427906599, + "grad_norm": 0.1699213981628418, + "learning_rate": 1.9723344454329376e-05, + "loss": 1.2439, + "step": 2940 + }, + { + "epoch": 1.094949447941269, + "grad_norm": 0.1817527562379837, + "learning_rate": 1.9723060493046235e-05, + "loss": 1.2276, + "step": 2941 + }, + { + "epoch": 1.095321753091878, + "grad_norm": 0.17052580416202545, + "learning_rate": 1.97227763881541e-05, + "loss": 1.2161, + "step": 2942 + }, + { + "epoch": 1.095694058242487, + "grad_norm": 0.17107853293418884, + "learning_rate": 1.972249213965717e-05, + "loss": 1.2265, + "step": 2943 + }, + { + "epoch": 1.096066363393096, + "grad_norm": 0.17966510355472565, + "learning_rate": 1.9722207747559636e-05, + "loss": 1.2304, + "step": 2944 + }, + { + "epoch": 1.096438668543705, + "grad_norm": 0.1866060048341751, + "learning_rate": 1.972192321186571e-05, + "loss": 1.2266, + "step": 2945 + }, + { + "epoch": 1.0968109736943141, + "grad_norm": 0.17971013486385345, + "learning_rate": 1.9721638532579584e-05, + "loss": 1.2333, + "step": 2946 + }, + { + "epoch": 1.0971832788449232, + "grad_norm": 0.17302943766117096, + "learning_rate": 1.972135370970547e-05, + "loss": 1.2367, + "step": 2947 + }, + { + "epoch": 1.0975555839955324, + "grad_norm": 0.17640618979930878, + "learning_rate": 1.9721068743247568e-05, + "loss": 1.2385, + "step": 2948 + }, + { + "epoch": 1.0979278891461415, + "grad_norm": 0.17200317978858948, + "learning_rate": 1.972078363321009e-05, + "loss": 1.2277, + "step": 2949 + }, + { + "epoch": 1.0983001942967505, + "grad_norm": 0.1816646158695221, + "learning_rate": 1.9720498379597256e-05, + "loss": 1.2319, + "step": 2950 + }, + { + "epoch": 1.0986724994473596, + "grad_norm": 0.17960673570632935, + "learning_rate": 1.9720212982413262e-05, + "loss": 1.2235, + "step": 2951 + }, + { + "epoch": 1.0990448045979686, + "grad_norm": 0.17489486932754517, + "learning_rate": 1.971992744166234e-05, + "loss": 1.2377, + "step": 2952 + }, + { + "epoch": 1.0994171097485776, + "grad_norm": 0.17041970789432526, + "learning_rate": 1.9719641757348694e-05, + "loss": 1.226, + "step": 2953 + }, + { + "epoch": 1.0997894148991867, + "grad_norm": 0.18417948484420776, + "learning_rate": 1.9719355929476548e-05, + "loss": 1.237, + "step": 2954 + }, + { + "epoch": 1.1001617200497957, + "grad_norm": 0.17019295692443848, + "learning_rate": 1.971906995805013e-05, + "loss": 1.225, + "step": 2955 + }, + { + "epoch": 1.100534025200405, + "grad_norm": 0.17955681681632996, + "learning_rate": 1.9718783843073653e-05, + "loss": 1.2361, + "step": 2956 + }, + { + "epoch": 1.100906330351014, + "grad_norm": 0.3305249810218811, + "learning_rate": 1.9718497584551355e-05, + "loss": 1.2538, + "step": 2957 + }, + { + "epoch": 1.101278635501623, + "grad_norm": 0.18145957589149475, + "learning_rate": 1.9718211182487455e-05, + "loss": 1.2218, + "step": 2958 + }, + { + "epoch": 1.1016509406522321, + "grad_norm": 0.17651425302028656, + "learning_rate": 1.9717924636886186e-05, + "loss": 1.2455, + "step": 2959 + }, + { + "epoch": 1.1020232458028412, + "grad_norm": 0.17769207060337067, + "learning_rate": 1.971763794775178e-05, + "loss": 1.2381, + "step": 2960 + }, + { + "epoch": 1.1023955509534502, + "grad_norm": 0.17026957869529724, + "learning_rate": 1.971735111508847e-05, + "loss": 1.2176, + "step": 2961 + }, + { + "epoch": 1.1027678561040593, + "grad_norm": 0.17879103124141693, + "learning_rate": 1.9717064138900494e-05, + "loss": 1.2371, + "step": 2962 + }, + { + "epoch": 1.1031401612546683, + "grad_norm": 0.16946665942668915, + "learning_rate": 1.9716777019192087e-05, + "loss": 1.2431, + "step": 2963 + }, + { + "epoch": 1.1035124664052773, + "grad_norm": 0.16764768958091736, + "learning_rate": 1.97164897559675e-05, + "loss": 1.2293, + "step": 2964 + }, + { + "epoch": 1.1038847715558866, + "grad_norm": 0.1787741482257843, + "learning_rate": 1.9716202349230967e-05, + "loss": 1.2273, + "step": 2965 + }, + { + "epoch": 1.1042570767064956, + "grad_norm": 0.18097257614135742, + "learning_rate": 1.971591479898673e-05, + "loss": 1.2405, + "step": 2966 + }, + { + "epoch": 1.1046293818571047, + "grad_norm": 0.16577765345573425, + "learning_rate": 1.9715627105239048e-05, + "loss": 1.2213, + "step": 2967 + }, + { + "epoch": 1.1050016870077137, + "grad_norm": 0.17236430943012238, + "learning_rate": 1.9715339267992162e-05, + "loss": 1.2358, + "step": 2968 + }, + { + "epoch": 1.1053739921583228, + "grad_norm": 0.1793234795331955, + "learning_rate": 1.9715051287250322e-05, + "loss": 1.2465, + "step": 2969 + }, + { + "epoch": 1.1057462973089318, + "grad_norm": 0.17169257998466492, + "learning_rate": 1.9714763163017788e-05, + "loss": 1.2308, + "step": 2970 + }, + { + "epoch": 1.1061186024595409, + "grad_norm": 0.16924230754375458, + "learning_rate": 1.9714474895298807e-05, + "loss": 1.2452, + "step": 2971 + }, + { + "epoch": 1.10649090761015, + "grad_norm": 0.17970192432403564, + "learning_rate": 1.9714186484097646e-05, + "loss": 1.2216, + "step": 2972 + }, + { + "epoch": 1.106863212760759, + "grad_norm": 0.17152759432792664, + "learning_rate": 1.9713897929418556e-05, + "loss": 1.2237, + "step": 2973 + }, + { + "epoch": 1.1072355179113682, + "grad_norm": 0.18560026586055756, + "learning_rate": 1.9713609231265807e-05, + "loss": 1.2257, + "step": 2974 + }, + { + "epoch": 1.1076078230619772, + "grad_norm": 0.17931891977787018, + "learning_rate": 1.9713320389643658e-05, + "loss": 1.2455, + "step": 2975 + }, + { + "epoch": 1.1079801282125863, + "grad_norm": 0.17655859887599945, + "learning_rate": 1.9713031404556377e-05, + "loss": 1.2415, + "step": 2976 + }, + { + "epoch": 1.1083524333631953, + "grad_norm": 0.17913676798343658, + "learning_rate": 1.971274227600823e-05, + "loss": 1.2298, + "step": 2977 + }, + { + "epoch": 1.1087247385138044, + "grad_norm": 0.17090395092964172, + "learning_rate": 1.971245300400349e-05, + "loss": 1.2242, + "step": 2978 + }, + { + "epoch": 1.1090970436644134, + "grad_norm": 0.17184513807296753, + "learning_rate": 1.9712163588546426e-05, + "loss": 1.2221, + "step": 2979 + }, + { + "epoch": 1.1094693488150225, + "grad_norm": 0.17693102359771729, + "learning_rate": 1.971187402964132e-05, + "loss": 1.2345, + "step": 2980 + }, + { + "epoch": 1.1098416539656315, + "grad_norm": 0.17723120748996735, + "learning_rate": 1.9711584327292442e-05, + "loss": 1.2168, + "step": 2981 + }, + { + "epoch": 1.1102139591162405, + "grad_norm": 0.1746647208929062, + "learning_rate": 1.9711294481504074e-05, + "loss": 1.2297, + "step": 2982 + }, + { + "epoch": 1.1105862642668498, + "grad_norm": 0.17656634747982025, + "learning_rate": 1.9711004492280495e-05, + "loss": 1.2339, + "step": 2983 + }, + { + "epoch": 1.1109585694174589, + "grad_norm": 0.1806725114583969, + "learning_rate": 1.9710714359625987e-05, + "loss": 1.2305, + "step": 2984 + }, + { + "epoch": 1.111330874568068, + "grad_norm": 0.17355605959892273, + "learning_rate": 1.9710424083544837e-05, + "loss": 1.2275, + "step": 2985 + }, + { + "epoch": 1.111703179718677, + "grad_norm": 0.17555807530879974, + "learning_rate": 1.9710133664041338e-05, + "loss": 1.2219, + "step": 2986 + }, + { + "epoch": 1.112075484869286, + "grad_norm": 0.17484989762306213, + "learning_rate": 1.9709843101119772e-05, + "loss": 1.236, + "step": 2987 + }, + { + "epoch": 1.112447790019895, + "grad_norm": 0.1751967966556549, + "learning_rate": 1.970955239478443e-05, + "loss": 1.2551, + "step": 2988 + }, + { + "epoch": 1.112820095170504, + "grad_norm": 0.1683066189289093, + "learning_rate": 1.9709261545039614e-05, + "loss": 1.2307, + "step": 2989 + }, + { + "epoch": 1.113192400321113, + "grad_norm": 0.17595693469047546, + "learning_rate": 1.970897055188961e-05, + "loss": 1.247, + "step": 2990 + }, + { + "epoch": 1.1135647054717221, + "grad_norm": 0.17412763833999634, + "learning_rate": 1.9708679415338722e-05, + "loss": 1.2353, + "step": 2991 + }, + { + "epoch": 1.1139370106223314, + "grad_norm": 0.17140118777751923, + "learning_rate": 1.9708388135391247e-05, + "loss": 1.2259, + "step": 2992 + }, + { + "epoch": 1.1143093157729405, + "grad_norm": 0.17357899248600006, + "learning_rate": 1.970809671205149e-05, + "loss": 1.2314, + "step": 2993 + }, + { + "epoch": 1.1146816209235495, + "grad_norm": 0.16457659006118774, + "learning_rate": 1.970780514532375e-05, + "loss": 1.2371, + "step": 2994 + }, + { + "epoch": 1.1150539260741585, + "grad_norm": 0.17428340017795563, + "learning_rate": 1.970751343521234e-05, + "loss": 1.227, + "step": 2995 + }, + { + "epoch": 1.1154262312247676, + "grad_norm": 0.18031379580497742, + "learning_rate": 1.9707221581721568e-05, + "loss": 1.2366, + "step": 2996 + }, + { + "epoch": 1.1157985363753766, + "grad_norm": 0.16890005767345428, + "learning_rate": 1.9706929584855737e-05, + "loss": 1.2296, + "step": 2997 + }, + { + "epoch": 1.1161708415259857, + "grad_norm": 0.16828297078609467, + "learning_rate": 1.970663744461917e-05, + "loss": 1.2273, + "step": 2998 + }, + { + "epoch": 1.1165431466765947, + "grad_norm": 0.17010074853897095, + "learning_rate": 1.9706345161016177e-05, + "loss": 1.2263, + "step": 2999 + }, + { + "epoch": 1.116915451827204, + "grad_norm": 0.1739543378353119, + "learning_rate": 1.970605273405107e-05, + "loss": 1.2333, + "step": 3000 + }, + { + "epoch": 1.116915451827204, + "eval_loss": 1.324150562286377, + "eval_runtime": 16.2397, + "eval_samples_per_second": 106.776, + "eval_steps_per_second": 5.357, + "step": 3000 + }, + { + "epoch": 1.117287756977813, + "grad_norm": 0.1825607717037201, + "learning_rate": 1.970576016372818e-05, + "loss": 1.2399, + "step": 3001 + }, + { + "epoch": 1.117660062128422, + "grad_norm": 0.17324824631214142, + "learning_rate": 1.970546745005182e-05, + "loss": 1.2294, + "step": 3002 + }, + { + "epoch": 1.118032367279031, + "grad_norm": 0.17063994705677032, + "learning_rate": 1.970517459302631e-05, + "loss": 1.2403, + "step": 3003 + }, + { + "epoch": 1.1184046724296401, + "grad_norm": 0.16882003843784332, + "learning_rate": 1.9704881592655983e-05, + "loss": 1.2178, + "step": 3004 + }, + { + "epoch": 1.1187769775802492, + "grad_norm": 0.17386655509471893, + "learning_rate": 1.9704588448945167e-05, + "loss": 1.2296, + "step": 3005 + }, + { + "epoch": 1.1191492827308582, + "grad_norm": 0.16440868377685547, + "learning_rate": 1.9704295161898185e-05, + "loss": 1.2355, + "step": 3006 + }, + { + "epoch": 1.1195215878814673, + "grad_norm": 0.17162500321865082, + "learning_rate": 1.9704001731519374e-05, + "loss": 1.2373, + "step": 3007 + }, + { + "epoch": 1.1198938930320763, + "grad_norm": 0.1756156086921692, + "learning_rate": 1.9703708157813068e-05, + "loss": 1.2501, + "step": 3008 + }, + { + "epoch": 1.1202661981826856, + "grad_norm": 0.17049741744995117, + "learning_rate": 1.9703414440783596e-05, + "loss": 1.2334, + "step": 3009 + }, + { + "epoch": 1.1206385033332946, + "grad_norm": 0.17266800999641418, + "learning_rate": 1.9703120580435303e-05, + "loss": 1.2251, + "step": 3010 + }, + { + "epoch": 1.1210108084839037, + "grad_norm": 0.17048951983451843, + "learning_rate": 1.9702826576772535e-05, + "loss": 1.2344, + "step": 3011 + }, + { + "epoch": 1.1213831136345127, + "grad_norm": 0.17931020259857178, + "learning_rate": 1.9702532429799622e-05, + "loss": 1.2263, + "step": 3012 + }, + { + "epoch": 1.1217554187851217, + "grad_norm": 0.16861797869205475, + "learning_rate": 1.9702238139520912e-05, + "loss": 1.2332, + "step": 3013 + }, + { + "epoch": 1.1221277239357308, + "grad_norm": 0.1716170459985733, + "learning_rate": 1.9701943705940758e-05, + "loss": 1.2288, + "step": 3014 + }, + { + "epoch": 1.1225000290863398, + "grad_norm": 0.1761639565229416, + "learning_rate": 1.9701649129063503e-05, + "loss": 1.229, + "step": 3015 + }, + { + "epoch": 1.1228723342369489, + "grad_norm": 0.17630340158939362, + "learning_rate": 1.9701354408893495e-05, + "loss": 1.2453, + "step": 3016 + }, + { + "epoch": 1.1232446393875581, + "grad_norm": 0.17522114515304565, + "learning_rate": 1.9701059545435094e-05, + "loss": 1.2242, + "step": 3017 + }, + { + "epoch": 1.1236169445381672, + "grad_norm": 0.17642271518707275, + "learning_rate": 1.9700764538692657e-05, + "loss": 1.2354, + "step": 3018 + }, + { + "epoch": 1.1239892496887762, + "grad_norm": 0.17362841963768005, + "learning_rate": 1.970046938867053e-05, + "loss": 1.2199, + "step": 3019 + }, + { + "epoch": 1.1243615548393853, + "grad_norm": 0.17298756539821625, + "learning_rate": 1.970017409537308e-05, + "loss": 1.2324, + "step": 3020 + }, + { + "epoch": 1.1247338599899943, + "grad_norm": 0.17711667716503143, + "learning_rate": 1.9699878658804673e-05, + "loss": 1.2111, + "step": 3021 + }, + { + "epoch": 1.1251061651406034, + "grad_norm": 0.1722431480884552, + "learning_rate": 1.969958307896966e-05, + "loss": 1.2308, + "step": 3022 + }, + { + "epoch": 1.1254784702912124, + "grad_norm": 0.17593185603618622, + "learning_rate": 1.969928735587242e-05, + "loss": 1.2335, + "step": 3023 + }, + { + "epoch": 1.1258507754418214, + "grad_norm": 0.17119091749191284, + "learning_rate": 1.969899148951731e-05, + "loss": 1.2294, + "step": 3024 + }, + { + "epoch": 1.1262230805924305, + "grad_norm": 0.1774984896183014, + "learning_rate": 1.9698695479908706e-05, + "loss": 1.2279, + "step": 3025 + }, + { + "epoch": 1.1265953857430397, + "grad_norm": 0.17215633392333984, + "learning_rate": 1.9698399327050976e-05, + "loss": 1.227, + "step": 3026 + }, + { + "epoch": 1.1269676908936488, + "grad_norm": 0.1741963028907776, + "learning_rate": 1.96981030309485e-05, + "loss": 1.2408, + "step": 3027 + }, + { + "epoch": 1.1273399960442578, + "grad_norm": 0.1749897301197052, + "learning_rate": 1.969780659160565e-05, + "loss": 1.2416, + "step": 3028 + }, + { + "epoch": 1.1277123011948669, + "grad_norm": 0.16878561675548553, + "learning_rate": 1.9697510009026803e-05, + "loss": 1.2395, + "step": 3029 + }, + { + "epoch": 1.128084606345476, + "grad_norm": 0.1697298288345337, + "learning_rate": 1.9697213283216342e-05, + "loss": 1.2274, + "step": 3030 + }, + { + "epoch": 1.128456911496085, + "grad_norm": 0.166582852602005, + "learning_rate": 1.9696916414178652e-05, + "loss": 1.2283, + "step": 3031 + }, + { + "epoch": 1.128829216646694, + "grad_norm": 0.1713571697473526, + "learning_rate": 1.9696619401918112e-05, + "loss": 1.218, + "step": 3032 + }, + { + "epoch": 1.129201521797303, + "grad_norm": 0.1708502322435379, + "learning_rate": 1.9696322246439113e-05, + "loss": 1.2272, + "step": 3033 + }, + { + "epoch": 1.129573826947912, + "grad_norm": 0.16899080574512482, + "learning_rate": 1.9696024947746047e-05, + "loss": 1.2354, + "step": 3034 + }, + { + "epoch": 1.1299461320985213, + "grad_norm": 0.16772188246250153, + "learning_rate": 1.9695727505843298e-05, + "loss": 1.2415, + "step": 3035 + }, + { + "epoch": 1.1303184372491304, + "grad_norm": 0.17296922206878662, + "learning_rate": 1.969542992073526e-05, + "loss": 1.2145, + "step": 3036 + }, + { + "epoch": 1.1306907423997394, + "grad_norm": 0.16814911365509033, + "learning_rate": 1.9695132192426334e-05, + "loss": 1.2418, + "step": 3037 + }, + { + "epoch": 1.1310630475503485, + "grad_norm": 0.16937977075576782, + "learning_rate": 1.9694834320920912e-05, + "loss": 1.2245, + "step": 3038 + }, + { + "epoch": 1.1314353527009575, + "grad_norm": 0.17156411707401276, + "learning_rate": 1.9694536306223394e-05, + "loss": 1.2336, + "step": 3039 + }, + { + "epoch": 1.1318076578515666, + "grad_norm": 0.16524691879749298, + "learning_rate": 1.9694238148338186e-05, + "loss": 1.2354, + "step": 3040 + }, + { + "epoch": 1.1321799630021756, + "grad_norm": 0.1789710819721222, + "learning_rate": 1.9693939847269688e-05, + "loss": 1.2243, + "step": 3041 + }, + { + "epoch": 1.1325522681527846, + "grad_norm": 0.16184090077877045, + "learning_rate": 1.9693641403022308e-05, + "loss": 1.2194, + "step": 3042 + }, + { + "epoch": 1.1329245733033937, + "grad_norm": 0.17413978278636932, + "learning_rate": 1.969334281560045e-05, + "loss": 1.238, + "step": 3043 + }, + { + "epoch": 1.133296878454003, + "grad_norm": 0.17365868389606476, + "learning_rate": 1.969304408500853e-05, + "loss": 1.2253, + "step": 3044 + }, + { + "epoch": 1.133669183604612, + "grad_norm": 0.16915839910507202, + "learning_rate": 1.9692745211250957e-05, + "loss": 1.2234, + "step": 3045 + }, + { + "epoch": 1.134041488755221, + "grad_norm": 0.17093431949615479, + "learning_rate": 1.9692446194332144e-05, + "loss": 1.2275, + "step": 3046 + }, + { + "epoch": 1.13441379390583, + "grad_norm": 0.1693648397922516, + "learning_rate": 1.969214703425651e-05, + "loss": 1.2381, + "step": 3047 + }, + { + "epoch": 1.1347860990564391, + "grad_norm": 0.16665521264076233, + "learning_rate": 1.969184773102847e-05, + "loss": 1.2276, + "step": 3048 + }, + { + "epoch": 1.1351584042070482, + "grad_norm": 0.17075350880622864, + "learning_rate": 1.969154828465245e-05, + "loss": 1.2314, + "step": 3049 + }, + { + "epoch": 1.1355307093576572, + "grad_norm": 0.18020687997341156, + "learning_rate": 1.969124869513287e-05, + "loss": 1.2448, + "step": 3050 + }, + { + "epoch": 1.1359030145082663, + "grad_norm": 0.17107802629470825, + "learning_rate": 1.9690948962474152e-05, + "loss": 1.2441, + "step": 3051 + }, + { + "epoch": 1.1362753196588753, + "grad_norm": 0.1705516129732132, + "learning_rate": 1.969064908668073e-05, + "loss": 1.2429, + "step": 3052 + }, + { + "epoch": 1.1366476248094846, + "grad_norm": 0.16781964898109436, + "learning_rate": 1.9690349067757026e-05, + "loss": 1.2203, + "step": 3053 + }, + { + "epoch": 1.1370199299600936, + "grad_norm": 0.17965157330036163, + "learning_rate": 1.9690048905707476e-05, + "loss": 1.2352, + "step": 3054 + }, + { + "epoch": 1.1373922351107026, + "grad_norm": 0.16421709954738617, + "learning_rate": 1.9689748600536514e-05, + "loss": 1.2138, + "step": 3055 + }, + { + "epoch": 1.1377645402613117, + "grad_norm": 0.17832623422145844, + "learning_rate": 1.9689448152248568e-05, + "loss": 1.2288, + "step": 3056 + }, + { + "epoch": 1.1381368454119207, + "grad_norm": 0.16908565163612366, + "learning_rate": 1.9689147560848086e-05, + "loss": 1.2158, + "step": 3057 + }, + { + "epoch": 1.1385091505625298, + "grad_norm": 0.18035708367824554, + "learning_rate": 1.9688846826339498e-05, + "loss": 1.2292, + "step": 3058 + }, + { + "epoch": 1.1388814557131388, + "grad_norm": 0.18107391893863678, + "learning_rate": 1.9688545948727255e-05, + "loss": 1.2215, + "step": 3059 + }, + { + "epoch": 1.1392537608637479, + "grad_norm": 0.17275120317935944, + "learning_rate": 1.9688244928015795e-05, + "loss": 1.2262, + "step": 3060 + }, + { + "epoch": 1.139626066014357, + "grad_norm": 0.17838861048221588, + "learning_rate": 1.9687943764209564e-05, + "loss": 1.2246, + "step": 3061 + }, + { + "epoch": 1.1399983711649662, + "grad_norm": 0.17901575565338135, + "learning_rate": 1.968764245731301e-05, + "loss": 1.2299, + "step": 3062 + }, + { + "epoch": 1.1403706763155752, + "grad_norm": 0.17386050522327423, + "learning_rate": 1.9687341007330588e-05, + "loss": 1.2214, + "step": 3063 + }, + { + "epoch": 1.1407429814661842, + "grad_norm": 0.17698942124843597, + "learning_rate": 1.9687039414266745e-05, + "loss": 1.2363, + "step": 3064 + }, + { + "epoch": 1.1411152866167933, + "grad_norm": 0.16935193538665771, + "learning_rate": 1.968673767812594e-05, + "loss": 1.2189, + "step": 3065 + }, + { + "epoch": 1.1414875917674023, + "grad_norm": 0.18296149373054504, + "learning_rate": 1.9686435798912624e-05, + "loss": 1.2252, + "step": 3066 + }, + { + "epoch": 1.1418598969180114, + "grad_norm": 0.17983867228031158, + "learning_rate": 1.9686133776631263e-05, + "loss": 1.2338, + "step": 3067 + }, + { + "epoch": 1.1422322020686204, + "grad_norm": 0.17501530051231384, + "learning_rate": 1.9685831611286312e-05, + "loss": 1.2168, + "step": 3068 + }, + { + "epoch": 1.1426045072192297, + "grad_norm": 0.17005561292171478, + "learning_rate": 1.9685529302882237e-05, + "loss": 1.2367, + "step": 3069 + }, + { + "epoch": 1.1429768123698387, + "grad_norm": 0.1763869673013687, + "learning_rate": 1.9685226851423502e-05, + "loss": 1.2142, + "step": 3070 + }, + { + "epoch": 1.1433491175204478, + "grad_norm": 0.17087772488594055, + "learning_rate": 1.9684924256914574e-05, + "loss": 1.2228, + "step": 3071 + }, + { + "epoch": 1.1437214226710568, + "grad_norm": 0.167043536901474, + "learning_rate": 1.968462151935992e-05, + "loss": 1.2148, + "step": 3072 + }, + { + "epoch": 1.1440937278216659, + "grad_norm": 0.17006975412368774, + "learning_rate": 1.968431863876402e-05, + "loss": 1.2376, + "step": 3073 + }, + { + "epoch": 1.144466032972275, + "grad_norm": 0.175192192196846, + "learning_rate": 1.9684015615131336e-05, + "loss": 1.227, + "step": 3074 + }, + { + "epoch": 1.144838338122884, + "grad_norm": 0.1716625988483429, + "learning_rate": 1.968371244846635e-05, + "loss": 1.2399, + "step": 3075 + }, + { + "epoch": 1.145210643273493, + "grad_norm": 0.17771419882774353, + "learning_rate": 1.9683409138773538e-05, + "loss": 1.2311, + "step": 3076 + }, + { + "epoch": 1.145582948424102, + "grad_norm": 0.17725834250450134, + "learning_rate": 1.9683105686057383e-05, + "loss": 1.2195, + "step": 3077 + }, + { + "epoch": 1.1459552535747113, + "grad_norm": 0.17201553285121918, + "learning_rate": 1.9682802090322365e-05, + "loss": 1.2308, + "step": 3078 + }, + { + "epoch": 1.1463275587253203, + "grad_norm": 0.17508849501609802, + "learning_rate": 1.968249835157297e-05, + "loss": 1.2278, + "step": 3079 + }, + { + "epoch": 1.1466998638759294, + "grad_norm": 0.16712623834609985, + "learning_rate": 1.9682194469813675e-05, + "loss": 1.233, + "step": 3080 + }, + { + "epoch": 1.1470721690265384, + "grad_norm": 0.1693454533815384, + "learning_rate": 1.968189044504898e-05, + "loss": 1.2292, + "step": 3081 + }, + { + "epoch": 1.1474444741771475, + "grad_norm": 0.17314909398555756, + "learning_rate": 1.968158627728337e-05, + "loss": 1.2371, + "step": 3082 + }, + { + "epoch": 1.1478167793277565, + "grad_norm": 0.1748846024274826, + "learning_rate": 1.968128196652134e-05, + "loss": 1.2337, + "step": 3083 + }, + { + "epoch": 1.1481890844783655, + "grad_norm": 0.16931162774562836, + "learning_rate": 1.968097751276738e-05, + "loss": 1.2321, + "step": 3084 + }, + { + "epoch": 1.1485613896289746, + "grad_norm": 0.17732059955596924, + "learning_rate": 1.9680672916025993e-05, + "loss": 1.2152, + "step": 3085 + }, + { + "epoch": 1.1489336947795836, + "grad_norm": 0.17309656739234924, + "learning_rate": 1.9680368176301673e-05, + "loss": 1.235, + "step": 3086 + }, + { + "epoch": 1.149305999930193, + "grad_norm": 0.17718668282032013, + "learning_rate": 1.968006329359892e-05, + "loss": 1.2288, + "step": 3087 + }, + { + "epoch": 1.149678305080802, + "grad_norm": 0.17663928866386414, + "learning_rate": 1.9679758267922243e-05, + "loss": 1.2314, + "step": 3088 + }, + { + "epoch": 1.150050610231411, + "grad_norm": 0.16871769726276398, + "learning_rate": 1.9679453099276142e-05, + "loss": 1.2292, + "step": 3089 + }, + { + "epoch": 1.15042291538202, + "grad_norm": 0.177023783326149, + "learning_rate": 1.9679147787665128e-05, + "loss": 1.211, + "step": 3090 + }, + { + "epoch": 1.150795220532629, + "grad_norm": 0.17566607892513275, + "learning_rate": 1.9678842333093708e-05, + "loss": 1.2255, + "step": 3091 + }, + { + "epoch": 1.151167525683238, + "grad_norm": 0.17927557229995728, + "learning_rate": 1.9678536735566393e-05, + "loss": 1.2353, + "step": 3092 + }, + { + "epoch": 1.1515398308338471, + "grad_norm": 0.15841099619865417, + "learning_rate": 1.9678230995087696e-05, + "loss": 1.2296, + "step": 3093 + }, + { + "epoch": 1.1519121359844562, + "grad_norm": 0.17372049391269684, + "learning_rate": 1.9677925111662136e-05, + "loss": 1.2199, + "step": 3094 + }, + { + "epoch": 1.1522844411350652, + "grad_norm": 0.17242653667926788, + "learning_rate": 1.9677619085294234e-05, + "loss": 1.223, + "step": 3095 + }, + { + "epoch": 1.1526567462856745, + "grad_norm": 0.18381370604038239, + "learning_rate": 1.96773129159885e-05, + "loss": 1.2372, + "step": 3096 + }, + { + "epoch": 1.1530290514362835, + "grad_norm": 0.1743118166923523, + "learning_rate": 1.9677006603749463e-05, + "loss": 1.2133, + "step": 3097 + }, + { + "epoch": 1.1534013565868926, + "grad_norm": 0.18232518434524536, + "learning_rate": 1.967670014858165e-05, + "loss": 1.2328, + "step": 3098 + }, + { + "epoch": 1.1537736617375016, + "grad_norm": 0.17845921218395233, + "learning_rate": 1.9676393550489576e-05, + "loss": 1.2349, + "step": 3099 + }, + { + "epoch": 1.1541459668881107, + "grad_norm": 0.18814215064048767, + "learning_rate": 1.9676086809477778e-05, + "loss": 1.2375, + "step": 3100 + }, + { + "epoch": 1.1545182720387197, + "grad_norm": 0.17665289342403412, + "learning_rate": 1.9675779925550785e-05, + "loss": 1.216, + "step": 3101 + }, + { + "epoch": 1.1548905771893287, + "grad_norm": 0.1747797727584839, + "learning_rate": 1.9675472898713136e-05, + "loss": 1.2291, + "step": 3102 + }, + { + "epoch": 1.1552628823399378, + "grad_norm": 0.17490416765213013, + "learning_rate": 1.9675165728969353e-05, + "loss": 1.2187, + "step": 3103 + }, + { + "epoch": 1.1556351874905468, + "grad_norm": 0.17718598246574402, + "learning_rate": 1.967485841632398e-05, + "loss": 1.2374, + "step": 3104 + }, + { + "epoch": 1.156007492641156, + "grad_norm": 0.17890246212482452, + "learning_rate": 1.967455096078156e-05, + "loss": 1.2184, + "step": 3105 + }, + { + "epoch": 1.1563797977917651, + "grad_norm": 0.17774467170238495, + "learning_rate": 1.9674243362346624e-05, + "loss": 1.2251, + "step": 3106 + }, + { + "epoch": 1.1567521029423742, + "grad_norm": 0.1743806153535843, + "learning_rate": 1.9673935621023724e-05, + "loss": 1.2357, + "step": 3107 + }, + { + "epoch": 1.1571244080929832, + "grad_norm": 0.1773483157157898, + "learning_rate": 1.96736277368174e-05, + "loss": 1.2338, + "step": 3108 + }, + { + "epoch": 1.1574967132435923, + "grad_norm": 0.171565443277359, + "learning_rate": 1.9673319709732205e-05, + "loss": 1.2412, + "step": 3109 + }, + { + "epoch": 1.1578690183942013, + "grad_norm": 0.16667889058589935, + "learning_rate": 1.967301153977268e-05, + "loss": 1.227, + "step": 3110 + }, + { + "epoch": 1.1582413235448104, + "grad_norm": 0.18110859394073486, + "learning_rate": 1.9672703226943383e-05, + "loss": 1.247, + "step": 3111 + }, + { + "epoch": 1.1586136286954194, + "grad_norm": 0.18011483550071716, + "learning_rate": 1.9672394771248867e-05, + "loss": 1.23, + "step": 3112 + }, + { + "epoch": 1.1589859338460284, + "grad_norm": 0.17470338940620422, + "learning_rate": 1.967208617269369e-05, + "loss": 1.2272, + "step": 3113 + }, + { + "epoch": 1.1593582389966377, + "grad_norm": 0.16720262169837952, + "learning_rate": 1.9671777431282404e-05, + "loss": 1.2286, + "step": 3114 + }, + { + "epoch": 1.1597305441472467, + "grad_norm": 0.16974225640296936, + "learning_rate": 1.9671468547019575e-05, + "loss": 1.2358, + "step": 3115 + }, + { + "epoch": 1.1601028492978558, + "grad_norm": 0.17322564125061035, + "learning_rate": 1.9671159519909758e-05, + "loss": 1.2196, + "step": 3116 + }, + { + "epoch": 1.1604751544484648, + "grad_norm": 0.17480473220348358, + "learning_rate": 1.9670850349957525e-05, + "loss": 1.2199, + "step": 3117 + }, + { + "epoch": 1.1608474595990739, + "grad_norm": 0.17341922223567963, + "learning_rate": 1.967054103716744e-05, + "loss": 1.2197, + "step": 3118 + }, + { + "epoch": 1.161219764749683, + "grad_norm": 0.16872482001781464, + "learning_rate": 1.9670231581544068e-05, + "loss": 1.23, + "step": 3119 + }, + { + "epoch": 1.161592069900292, + "grad_norm": 0.16933618485927582, + "learning_rate": 1.9669921983091985e-05, + "loss": 1.2202, + "step": 3120 + }, + { + "epoch": 1.161964375050901, + "grad_norm": 0.164505273103714, + "learning_rate": 1.966961224181576e-05, + "loss": 1.2398, + "step": 3121 + }, + { + "epoch": 1.16233668020151, + "grad_norm": 0.1745142638683319, + "learning_rate": 1.9669302357719968e-05, + "loss": 1.2223, + "step": 3122 + }, + { + "epoch": 1.1627089853521193, + "grad_norm": 0.17189304530620575, + "learning_rate": 1.9668992330809187e-05, + "loss": 1.234, + "step": 3123 + }, + { + "epoch": 1.1630812905027283, + "grad_norm": 0.16852304339408875, + "learning_rate": 1.9668682161088e-05, + "loss": 1.219, + "step": 3124 + }, + { + "epoch": 1.1634535956533374, + "grad_norm": 0.17335280776023865, + "learning_rate": 1.966837184856098e-05, + "loss": 1.2107, + "step": 3125 + }, + { + "epoch": 1.1638259008039464, + "grad_norm": 0.17474551498889923, + "learning_rate": 1.9668061393232717e-05, + "loss": 1.2285, + "step": 3126 + }, + { + "epoch": 1.1641982059545555, + "grad_norm": 0.16944244503974915, + "learning_rate": 1.9667750795107793e-05, + "loss": 1.2281, + "step": 3127 + }, + { + "epoch": 1.1645705111051645, + "grad_norm": 0.1715431809425354, + "learning_rate": 1.9667440054190793e-05, + "loss": 1.2402, + "step": 3128 + }, + { + "epoch": 1.1649428162557736, + "grad_norm": 0.1723116636276245, + "learning_rate": 1.9667129170486315e-05, + "loss": 1.2295, + "step": 3129 + }, + { + "epoch": 1.1653151214063828, + "grad_norm": 0.172069251537323, + "learning_rate": 1.9666818143998944e-05, + "loss": 1.2467, + "step": 3130 + }, + { + "epoch": 1.1656874265569919, + "grad_norm": 0.16813892126083374, + "learning_rate": 1.9666506974733276e-05, + "loss": 1.2321, + "step": 3131 + }, + { + "epoch": 1.166059731707601, + "grad_norm": 0.17631420493125916, + "learning_rate": 1.9666195662693907e-05, + "loss": 1.2281, + "step": 3132 + }, + { + "epoch": 1.16643203685821, + "grad_norm": 0.1704123616218567, + "learning_rate": 1.9665884207885437e-05, + "loss": 1.2453, + "step": 3133 + }, + { + "epoch": 1.166804342008819, + "grad_norm": 0.17325717210769653, + "learning_rate": 1.966557261031246e-05, + "loss": 1.2172, + "step": 3134 + }, + { + "epoch": 1.167176647159428, + "grad_norm": 0.16483458876609802, + "learning_rate": 1.9665260869979585e-05, + "loss": 1.2209, + "step": 3135 + }, + { + "epoch": 1.167548952310037, + "grad_norm": 0.17396704852581024, + "learning_rate": 1.966494898689141e-05, + "loss": 1.2164, + "step": 3136 + }, + { + "epoch": 1.1679212574606461, + "grad_norm": 0.18191742897033691, + "learning_rate": 1.9664636961052547e-05, + "loss": 1.2402, + "step": 3137 + }, + { + "epoch": 1.1682935626112552, + "grad_norm": 0.18091581761837006, + "learning_rate": 1.9664324792467603e-05, + "loss": 1.2276, + "step": 3138 + }, + { + "epoch": 1.1686658677618644, + "grad_norm": 0.17447642982006073, + "learning_rate": 1.9664012481141185e-05, + "loss": 1.2262, + "step": 3139 + }, + { + "epoch": 1.1690381729124735, + "grad_norm": 0.17776253819465637, + "learning_rate": 1.9663700027077915e-05, + "loss": 1.2252, + "step": 3140 + }, + { + "epoch": 1.1694104780630825, + "grad_norm": 0.16708004474639893, + "learning_rate": 1.96633874302824e-05, + "loss": 1.2365, + "step": 3141 + }, + { + "epoch": 1.1697827832136916, + "grad_norm": 0.17193494737148285, + "learning_rate": 1.9663074690759255e-05, + "loss": 1.2224, + "step": 3142 + }, + { + "epoch": 1.1701550883643006, + "grad_norm": 0.16979169845581055, + "learning_rate": 1.9662761808513105e-05, + "loss": 1.2222, + "step": 3143 + }, + { + "epoch": 1.1705273935149096, + "grad_norm": 0.17003171145915985, + "learning_rate": 1.9662448783548575e-05, + "loss": 1.222, + "step": 3144 + }, + { + "epoch": 1.1708996986655187, + "grad_norm": 0.17789965867996216, + "learning_rate": 1.966213561587028e-05, + "loss": 1.2306, + "step": 3145 + }, + { + "epoch": 1.1712720038161277, + "grad_norm": 0.17377351224422455, + "learning_rate": 1.9661822305482845e-05, + "loss": 1.2268, + "step": 3146 + }, + { + "epoch": 1.1716443089667368, + "grad_norm": 0.17414116859436035, + "learning_rate": 1.96615088523909e-05, + "loss": 1.2161, + "step": 3147 + }, + { + "epoch": 1.172016614117346, + "grad_norm": 0.17168028652668, + "learning_rate": 1.966119525659908e-05, + "loss": 1.2287, + "step": 3148 + }, + { + "epoch": 1.172388919267955, + "grad_norm": 0.17371198534965515, + "learning_rate": 1.966088151811201e-05, + "loss": 1.2552, + "step": 3149 + }, + { + "epoch": 1.1727612244185641, + "grad_norm": 0.17159205675125122, + "learning_rate": 1.966056763693433e-05, + "loss": 1.2243, + "step": 3150 + }, + { + "epoch": 1.1731335295691732, + "grad_norm": 0.1716485470533371, + "learning_rate": 1.9660253613070667e-05, + "loss": 1.2413, + "step": 3151 + }, + { + "epoch": 1.1735058347197822, + "grad_norm": 0.17491815984249115, + "learning_rate": 1.9659939446525668e-05, + "loss": 1.2377, + "step": 3152 + }, + { + "epoch": 1.1738781398703912, + "grad_norm": 0.166838139295578, + "learning_rate": 1.965962513730397e-05, + "loss": 1.2133, + "step": 3153 + }, + { + "epoch": 1.1742504450210003, + "grad_norm": 0.16914509236812592, + "learning_rate": 1.9659310685410212e-05, + "loss": 1.2264, + "step": 3154 + }, + { + "epoch": 1.1746227501716093, + "grad_norm": 0.17503736913204193, + "learning_rate": 1.9658996090849042e-05, + "loss": 1.2341, + "step": 3155 + }, + { + "epoch": 1.1749950553222184, + "grad_norm": 0.17667143046855927, + "learning_rate": 1.9658681353625105e-05, + "loss": 1.231, + "step": 3156 + }, + { + "epoch": 1.1753673604728276, + "grad_norm": 0.17044112086296082, + "learning_rate": 1.9658366473743052e-05, + "loss": 1.2339, + "step": 3157 + }, + { + "epoch": 1.1757396656234367, + "grad_norm": 0.1663392037153244, + "learning_rate": 1.9658051451207536e-05, + "loss": 1.2352, + "step": 3158 + }, + { + "epoch": 1.1761119707740457, + "grad_norm": 0.18989932537078857, + "learning_rate": 1.96577362860232e-05, + "loss": 1.2266, + "step": 3159 + }, + { + "epoch": 1.1764842759246548, + "grad_norm": 0.18411101400852203, + "learning_rate": 1.965742097819471e-05, + "loss": 1.2343, + "step": 3160 + }, + { + "epoch": 1.1768565810752638, + "grad_norm": 0.17134809494018555, + "learning_rate": 1.9657105527726716e-05, + "loss": 1.2191, + "step": 3161 + }, + { + "epoch": 1.1772288862258729, + "grad_norm": 0.20970846712589264, + "learning_rate": 1.965678993462388e-05, + "loss": 1.2202, + "step": 3162 + }, + { + "epoch": 1.177601191376482, + "grad_norm": 0.20733484625816345, + "learning_rate": 1.9656474198890864e-05, + "loss": 1.2263, + "step": 3163 + }, + { + "epoch": 1.177973496527091, + "grad_norm": 0.18368762731552124, + "learning_rate": 1.965615832053233e-05, + "loss": 1.2363, + "step": 3164 + }, + { + "epoch": 1.1783458016777, + "grad_norm": 0.17400404810905457, + "learning_rate": 1.9655842299552938e-05, + "loss": 1.2301, + "step": 3165 + }, + { + "epoch": 1.1787181068283092, + "grad_norm": 0.19013762474060059, + "learning_rate": 1.9655526135957366e-05, + "loss": 1.2316, + "step": 3166 + }, + { + "epoch": 1.1790904119789183, + "grad_norm": 0.17968544363975525, + "learning_rate": 1.965520982975028e-05, + "loss": 1.2308, + "step": 3167 + }, + { + "epoch": 1.1794627171295273, + "grad_norm": 0.1678808182477951, + "learning_rate": 1.965489338093635e-05, + "loss": 1.2178, + "step": 3168 + }, + { + "epoch": 1.1798350222801364, + "grad_norm": 0.18025662004947662, + "learning_rate": 1.9654576789520248e-05, + "loss": 1.2261, + "step": 3169 + }, + { + "epoch": 1.1802073274307454, + "grad_norm": 0.1803680807352066, + "learning_rate": 1.965426005550665e-05, + "loss": 1.2291, + "step": 3170 + }, + { + "epoch": 1.1805796325813545, + "grad_norm": 0.1786152571439743, + "learning_rate": 1.9653943178900246e-05, + "loss": 1.2508, + "step": 3171 + }, + { + "epoch": 1.1809519377319635, + "grad_norm": 0.17993713915348053, + "learning_rate": 1.96536261597057e-05, + "loss": 1.224, + "step": 3172 + }, + { + "epoch": 1.1813242428825725, + "grad_norm": 0.18419890105724335, + "learning_rate": 1.9653308997927705e-05, + "loss": 1.236, + "step": 3173 + }, + { + "epoch": 1.1816965480331816, + "grad_norm": 0.17795118689537048, + "learning_rate": 1.9652991693570938e-05, + "loss": 1.2217, + "step": 3174 + }, + { + "epoch": 1.1820688531837908, + "grad_norm": 0.17739805579185486, + "learning_rate": 1.965267424664009e-05, + "loss": 1.2336, + "step": 3175 + }, + { + "epoch": 1.1824411583343999, + "grad_norm": 0.17250627279281616, + "learning_rate": 1.9652356657139854e-05, + "loss": 1.2117, + "step": 3176 + }, + { + "epoch": 1.182813463485009, + "grad_norm": 0.1707741916179657, + "learning_rate": 1.9652038925074916e-05, + "loss": 1.2457, + "step": 3177 + }, + { + "epoch": 1.183185768635618, + "grad_norm": 0.17577117681503296, + "learning_rate": 1.9651721050449964e-05, + "loss": 1.2213, + "step": 3178 + }, + { + "epoch": 1.183558073786227, + "grad_norm": 0.1711239516735077, + "learning_rate": 1.96514030332697e-05, + "loss": 1.2081, + "step": 3179 + }, + { + "epoch": 1.183930378936836, + "grad_norm": 0.16911734640598297, + "learning_rate": 1.9651084873538816e-05, + "loss": 1.2082, + "step": 3180 + }, + { + "epoch": 1.184302684087445, + "grad_norm": 0.1710880696773529, + "learning_rate": 1.965076657126202e-05, + "loss": 1.2193, + "step": 3181 + }, + { + "epoch": 1.1846749892380541, + "grad_norm": 0.1759801208972931, + "learning_rate": 1.9650448126444003e-05, + "loss": 1.2261, + "step": 3182 + }, + { + "epoch": 1.1850472943886632, + "grad_norm": 0.17261157929897308, + "learning_rate": 1.9650129539089477e-05, + "loss": 1.2322, + "step": 3183 + }, + { + "epoch": 1.1854195995392725, + "grad_norm": 0.17363622784614563, + "learning_rate": 1.9649810809203138e-05, + "loss": 1.2263, + "step": 3184 + }, + { + "epoch": 1.1857919046898815, + "grad_norm": 0.17991025745868683, + "learning_rate": 1.9649491936789702e-05, + "loss": 1.224, + "step": 3185 + }, + { + "epoch": 1.1861642098404905, + "grad_norm": 0.17815767228603363, + "learning_rate": 1.9649172921853873e-05, + "loss": 1.2289, + "step": 3186 + }, + { + "epoch": 1.1865365149910996, + "grad_norm": 0.1795058697462082, + "learning_rate": 1.964885376440037e-05, + "loss": 1.2335, + "step": 3187 + }, + { + "epoch": 1.1869088201417086, + "grad_norm": 0.16902460157871246, + "learning_rate": 1.9648534464433897e-05, + "loss": 1.2369, + "step": 3188 + }, + { + "epoch": 1.1872811252923177, + "grad_norm": 0.19541476666927338, + "learning_rate": 1.964821502195918e-05, + "loss": 1.2325, + "step": 3189 + }, + { + "epoch": 1.1876534304429267, + "grad_norm": 0.1734236776828766, + "learning_rate": 1.964789543698093e-05, + "loss": 1.2162, + "step": 3190 + }, + { + "epoch": 1.188025735593536, + "grad_norm": 0.26567402482032776, + "learning_rate": 1.9647575709503873e-05, + "loss": 1.2229, + "step": 3191 + }, + { + "epoch": 1.188398040744145, + "grad_norm": 0.1733405888080597, + "learning_rate": 1.9647255839532726e-05, + "loss": 1.2366, + "step": 3192 + }, + { + "epoch": 1.188770345894754, + "grad_norm": 0.17245125770568848, + "learning_rate": 1.9646935827072215e-05, + "loss": 1.2173, + "step": 3193 + }, + { + "epoch": 1.189142651045363, + "grad_norm": 0.16756393015384674, + "learning_rate": 1.9646615672127068e-05, + "loss": 1.2291, + "step": 3194 + }, + { + "epoch": 1.1895149561959721, + "grad_norm": 0.16170787811279297, + "learning_rate": 1.9646295374702014e-05, + "loss": 1.2338, + "step": 3195 + }, + { + "epoch": 1.1898872613465812, + "grad_norm": 0.1687183678150177, + "learning_rate": 1.964597493480178e-05, + "loss": 1.218, + "step": 3196 + }, + { + "epoch": 1.1902595664971902, + "grad_norm": 0.16018569469451904, + "learning_rate": 1.9645654352431105e-05, + "loss": 1.2158, + "step": 3197 + }, + { + "epoch": 1.1906318716477993, + "grad_norm": 0.1713458150625229, + "learning_rate": 1.9645333627594717e-05, + "loss": 1.2331, + "step": 3198 + }, + { + "epoch": 1.1910041767984083, + "grad_norm": 0.1839626431465149, + "learning_rate": 1.9645012760297358e-05, + "loss": 1.2246, + "step": 3199 + }, + { + "epoch": 1.1913764819490176, + "grad_norm": 0.16510577499866486, + "learning_rate": 1.964469175054377e-05, + "loss": 1.2169, + "step": 3200 + }, + { + "epoch": 1.1917487870996266, + "grad_norm": 0.16351880133152008, + "learning_rate": 1.9644370598338686e-05, + "loss": 1.237, + "step": 3201 + }, + { + "epoch": 1.1921210922502357, + "grad_norm": 0.16644950211048126, + "learning_rate": 1.9644049303686852e-05, + "loss": 1.2395, + "step": 3202 + }, + { + "epoch": 1.1924933974008447, + "grad_norm": 0.17122626304626465, + "learning_rate": 1.9643727866593015e-05, + "loss": 1.2404, + "step": 3203 + }, + { + "epoch": 1.1928657025514537, + "grad_norm": 0.1687421053647995, + "learning_rate": 1.9643406287061924e-05, + "loss": 1.227, + "step": 3204 + }, + { + "epoch": 1.1932380077020628, + "grad_norm": 0.17786358296871185, + "learning_rate": 1.964308456509833e-05, + "loss": 1.2271, + "step": 3205 + }, + { + "epoch": 1.1936103128526718, + "grad_norm": 0.16544969379901886, + "learning_rate": 1.964276270070698e-05, + "loss": 1.2288, + "step": 3206 + }, + { + "epoch": 1.1939826180032809, + "grad_norm": 0.1842457503080368, + "learning_rate": 1.964244069389263e-05, + "loss": 1.2244, + "step": 3207 + }, + { + "epoch": 1.19435492315389, + "grad_norm": 0.16619916260242462, + "learning_rate": 1.9642118544660036e-05, + "loss": 1.2357, + "step": 3208 + }, + { + "epoch": 1.1947272283044992, + "grad_norm": 0.1687583178281784, + "learning_rate": 1.9641796253013957e-05, + "loss": 1.2182, + "step": 3209 + }, + { + "epoch": 1.1950995334551082, + "grad_norm": 0.17150536179542542, + "learning_rate": 1.9641473818959153e-05, + "loss": 1.237, + "step": 3210 + }, + { + "epoch": 1.1954718386057173, + "grad_norm": 0.16135509312152863, + "learning_rate": 1.9641151242500383e-05, + "loss": 1.2405, + "step": 3211 + }, + { + "epoch": 1.1958441437563263, + "grad_norm": 0.17465688288211823, + "learning_rate": 1.9640828523642415e-05, + "loss": 1.2145, + "step": 3212 + }, + { + "epoch": 1.1962164489069353, + "grad_norm": 0.17879217863082886, + "learning_rate": 1.9640505662390017e-05, + "loss": 1.223, + "step": 3213 + }, + { + "epoch": 1.1965887540575444, + "grad_norm": 0.1738523691892624, + "learning_rate": 1.9640182658747955e-05, + "loss": 1.2235, + "step": 3214 + }, + { + "epoch": 1.1969610592081534, + "grad_norm": 0.17595960199832916, + "learning_rate": 1.9639859512721e-05, + "loss": 1.2419, + "step": 3215 + }, + { + "epoch": 1.1973333643587625, + "grad_norm": 0.17298002541065216, + "learning_rate": 1.963953622431392e-05, + "loss": 1.226, + "step": 3216 + }, + { + "epoch": 1.1977056695093715, + "grad_norm": 0.169357568025589, + "learning_rate": 1.96392127935315e-05, + "loss": 1.2276, + "step": 3217 + }, + { + "epoch": 1.1980779746599808, + "grad_norm": 0.17965613305568695, + "learning_rate": 1.963888922037851e-05, + "loss": 1.2232, + "step": 3218 + }, + { + "epoch": 1.1984502798105898, + "grad_norm": 0.16693131625652313, + "learning_rate": 1.9638565504859734e-05, + "loss": 1.2187, + "step": 3219 + }, + { + "epoch": 1.1988225849611989, + "grad_norm": 0.16887417435646057, + "learning_rate": 1.9638241646979947e-05, + "loss": 1.2175, + "step": 3220 + }, + { + "epoch": 1.199194890111808, + "grad_norm": 0.17603495717048645, + "learning_rate": 1.9637917646743937e-05, + "loss": 1.2293, + "step": 3221 + }, + { + "epoch": 1.199567195262417, + "grad_norm": 0.1772565096616745, + "learning_rate": 1.963759350415649e-05, + "loss": 1.2246, + "step": 3222 + }, + { + "epoch": 1.199939500413026, + "grad_norm": 0.16490373015403748, + "learning_rate": 1.963726921922239e-05, + "loss": 1.2226, + "step": 3223 + }, + { + "epoch": 1.200311805563635, + "grad_norm": 0.17405438423156738, + "learning_rate": 1.963694479194643e-05, + "loss": 1.2418, + "step": 3224 + }, + { + "epoch": 1.200684110714244, + "grad_norm": 0.1792418658733368, + "learning_rate": 1.9636620222333398e-05, + "loss": 1.2288, + "step": 3225 + }, + { + "epoch": 1.2010564158648531, + "grad_norm": 0.17188262939453125, + "learning_rate": 1.963629551038809e-05, + "loss": 1.2226, + "step": 3226 + }, + { + "epoch": 1.2014287210154624, + "grad_norm": 0.1634385883808136, + "learning_rate": 1.9635970656115303e-05, + "loss": 1.2126, + "step": 3227 + }, + { + "epoch": 1.2018010261660714, + "grad_norm": 0.17879730463027954, + "learning_rate": 1.9635645659519835e-05, + "loss": 1.226, + "step": 3228 + }, + { + "epoch": 1.2021733313166805, + "grad_norm": 0.19251345098018646, + "learning_rate": 1.9635320520606483e-05, + "loss": 1.228, + "step": 3229 + }, + { + "epoch": 1.2025456364672895, + "grad_norm": 0.16735433042049408, + "learning_rate": 1.9634995239380056e-05, + "loss": 1.2525, + "step": 3230 + }, + { + "epoch": 1.2029179416178986, + "grad_norm": 0.17983882129192352, + "learning_rate": 1.9634669815845352e-05, + "loss": 1.2331, + "step": 3231 + }, + { + "epoch": 1.2032902467685076, + "grad_norm": 0.17625576257705688, + "learning_rate": 1.9634344250007175e-05, + "loss": 1.2252, + "step": 3232 + }, + { + "epoch": 1.2036625519191166, + "grad_norm": 0.1711379885673523, + "learning_rate": 1.9634018541870342e-05, + "loss": 1.2268, + "step": 3233 + }, + { + "epoch": 1.2040348570697257, + "grad_norm": 0.17250944674015045, + "learning_rate": 1.9633692691439662e-05, + "loss": 1.214, + "step": 3234 + }, + { + "epoch": 1.2044071622203347, + "grad_norm": 0.17136184871196747, + "learning_rate": 1.963336669871994e-05, + "loss": 1.2315, + "step": 3235 + }, + { + "epoch": 1.204779467370944, + "grad_norm": 0.17495617270469666, + "learning_rate": 1.9633040563716e-05, + "loss": 1.2198, + "step": 3236 + }, + { + "epoch": 1.205151772521553, + "grad_norm": 0.16628341376781464, + "learning_rate": 1.9632714286432656e-05, + "loss": 1.2302, + "step": 3237 + }, + { + "epoch": 1.205524077672162, + "grad_norm": 0.17815084755420685, + "learning_rate": 1.963238786687472e-05, + "loss": 1.2172, + "step": 3238 + }, + { + "epoch": 1.2058963828227711, + "grad_norm": 0.18150483071804047, + "learning_rate": 1.9632061305047028e-05, + "loss": 1.2399, + "step": 3239 + }, + { + "epoch": 1.2062686879733802, + "grad_norm": 0.16711880266666412, + "learning_rate": 1.9631734600954392e-05, + "loss": 1.2162, + "step": 3240 + }, + { + "epoch": 1.2066409931239892, + "grad_norm": 0.16697241365909576, + "learning_rate": 1.963140775460164e-05, + "loss": 1.2246, + "step": 3241 + }, + { + "epoch": 1.2070132982745982, + "grad_norm": 0.17977376282215118, + "learning_rate": 1.96310807659936e-05, + "loss": 1.2305, + "step": 3242 + }, + { + "epoch": 1.2073856034252075, + "grad_norm": 0.1734996736049652, + "learning_rate": 1.9630753635135102e-05, + "loss": 1.229, + "step": 3243 + }, + { + "epoch": 1.2077579085758163, + "grad_norm": 0.17894618213176727, + "learning_rate": 1.9630426362030978e-05, + "loss": 1.2304, + "step": 3244 + }, + { + "epoch": 1.2081302137264256, + "grad_norm": 0.17757487297058105, + "learning_rate": 1.963009894668606e-05, + "loss": 1.2192, + "step": 3245 + }, + { + "epoch": 1.2085025188770346, + "grad_norm": 0.17967943847179413, + "learning_rate": 1.9629771389105185e-05, + "loss": 1.2279, + "step": 3246 + }, + { + "epoch": 1.2088748240276437, + "grad_norm": 0.18884268403053284, + "learning_rate": 1.962944368929319e-05, + "loss": 1.2305, + "step": 3247 + }, + { + "epoch": 1.2092471291782527, + "grad_norm": 0.17970938980579376, + "learning_rate": 1.9629115847254916e-05, + "loss": 1.2085, + "step": 3248 + }, + { + "epoch": 1.2096194343288618, + "grad_norm": 0.16718293726444244, + "learning_rate": 1.9628787862995207e-05, + "loss": 1.2305, + "step": 3249 + }, + { + "epoch": 1.2099917394794708, + "grad_norm": 0.17763985693454742, + "learning_rate": 1.9628459736518907e-05, + "loss": 1.2271, + "step": 3250 + }, + { + "epoch": 1.2103640446300798, + "grad_norm": 0.1723608523607254, + "learning_rate": 1.9628131467830856e-05, + "loss": 1.2224, + "step": 3251 + }, + { + "epoch": 1.2107363497806891, + "grad_norm": 0.16534040868282318, + "learning_rate": 1.9627803056935912e-05, + "loss": 1.2223, + "step": 3252 + }, + { + "epoch": 1.2111086549312982, + "grad_norm": 0.1700848489999771, + "learning_rate": 1.9627474503838918e-05, + "loss": 1.2291, + "step": 3253 + }, + { + "epoch": 1.2114809600819072, + "grad_norm": 0.18612729012966156, + "learning_rate": 1.9627145808544733e-05, + "loss": 1.2119, + "step": 3254 + }, + { + "epoch": 1.2118532652325162, + "grad_norm": 0.16971179842948914, + "learning_rate": 1.9626816971058205e-05, + "loss": 1.2249, + "step": 3255 + }, + { + "epoch": 1.2122255703831253, + "grad_norm": 0.1748899221420288, + "learning_rate": 1.9626487991384194e-05, + "loss": 1.2136, + "step": 3256 + }, + { + "epoch": 1.2125978755337343, + "grad_norm": 0.17875608801841736, + "learning_rate": 1.9626158869527564e-05, + "loss": 1.2189, + "step": 3257 + }, + { + "epoch": 1.2129701806843434, + "grad_norm": 0.17736589908599854, + "learning_rate": 1.962582960549317e-05, + "loss": 1.2349, + "step": 3258 + }, + { + "epoch": 1.2133424858349524, + "grad_norm": 0.17571696639060974, + "learning_rate": 1.9625500199285874e-05, + "loss": 1.2216, + "step": 3259 + }, + { + "epoch": 1.2137147909855615, + "grad_norm": 0.1696479171514511, + "learning_rate": 1.9625170650910547e-05, + "loss": 1.2141, + "step": 3260 + }, + { + "epoch": 1.2140870961361707, + "grad_norm": 0.17148007452487946, + "learning_rate": 1.9624840960372053e-05, + "loss": 1.2426, + "step": 3261 + }, + { + "epoch": 1.2144594012867798, + "grad_norm": 0.17675496637821198, + "learning_rate": 1.962451112767526e-05, + "loss": 1.2278, + "step": 3262 + }, + { + "epoch": 1.2148317064373888, + "grad_norm": 0.17009492218494415, + "learning_rate": 1.9624181152825044e-05, + "loss": 1.2285, + "step": 3263 + }, + { + "epoch": 1.2152040115879978, + "grad_norm": 0.1749381422996521, + "learning_rate": 1.9623851035826274e-05, + "loss": 1.2182, + "step": 3264 + }, + { + "epoch": 1.2155763167386069, + "grad_norm": 0.17395372688770294, + "learning_rate": 1.962352077668383e-05, + "loss": 1.2366, + "step": 3265 + }, + { + "epoch": 1.215948621889216, + "grad_norm": 0.17901504039764404, + "learning_rate": 1.962319037540259e-05, + "loss": 1.2295, + "step": 3266 + }, + { + "epoch": 1.216320927039825, + "grad_norm": 0.17020705342292786, + "learning_rate": 1.962285983198743e-05, + "loss": 1.239, + "step": 3267 + }, + { + "epoch": 1.216693232190434, + "grad_norm": 0.17261864244937897, + "learning_rate": 1.9622529146443235e-05, + "loss": 1.2242, + "step": 3268 + }, + { + "epoch": 1.217065537341043, + "grad_norm": 0.17174561321735382, + "learning_rate": 1.9622198318774884e-05, + "loss": 1.2337, + "step": 3269 + }, + { + "epoch": 1.2174378424916523, + "grad_norm": 0.1685078740119934, + "learning_rate": 1.9621867348987273e-05, + "loss": 1.2328, + "step": 3270 + }, + { + "epoch": 1.2178101476422614, + "grad_norm": 0.17111830413341522, + "learning_rate": 1.9621536237085285e-05, + "loss": 1.2164, + "step": 3271 + }, + { + "epoch": 1.2181824527928704, + "grad_norm": 0.1732606589794159, + "learning_rate": 1.9621204983073806e-05, + "loss": 1.2314, + "step": 3272 + }, + { + "epoch": 1.2185547579434794, + "grad_norm": 0.16949288547039032, + "learning_rate": 1.9620873586957735e-05, + "loss": 1.2231, + "step": 3273 + }, + { + "epoch": 1.2189270630940885, + "grad_norm": 0.17130224406719208, + "learning_rate": 1.962054204874197e-05, + "loss": 1.2315, + "step": 3274 + }, + { + "epoch": 1.2192993682446975, + "grad_norm": 0.16252587735652924, + "learning_rate": 1.96202103684314e-05, + "loss": 1.2077, + "step": 3275 + }, + { + "epoch": 1.2196716733953066, + "grad_norm": 0.1738114356994629, + "learning_rate": 1.961987854603092e-05, + "loss": 1.2344, + "step": 3276 + }, + { + "epoch": 1.2200439785459156, + "grad_norm": 0.16991567611694336, + "learning_rate": 1.9619546581545445e-05, + "loss": 1.2061, + "step": 3277 + }, + { + "epoch": 1.2204162836965247, + "grad_norm": 0.17030175030231476, + "learning_rate": 1.961921447497987e-05, + "loss": 1.2193, + "step": 3278 + }, + { + "epoch": 1.220788588847134, + "grad_norm": 0.16493943333625793, + "learning_rate": 1.9618882226339096e-05, + "loss": 1.2283, + "step": 3279 + }, + { + "epoch": 1.221160893997743, + "grad_norm": 0.17286229133605957, + "learning_rate": 1.961854983562804e-05, + "loss": 1.2392, + "step": 3280 + }, + { + "epoch": 1.221533199148352, + "grad_norm": 0.1747228503227234, + "learning_rate": 1.9618217302851607e-05, + "loss": 1.2216, + "step": 3281 + }, + { + "epoch": 1.221905504298961, + "grad_norm": 0.17101289331912994, + "learning_rate": 1.9617884628014707e-05, + "loss": 1.2203, + "step": 3282 + }, + { + "epoch": 1.22227780944957, + "grad_norm": 0.178693488240242, + "learning_rate": 1.9617551811122253e-05, + "loss": 1.2378, + "step": 3283 + }, + { + "epoch": 1.2226501146001791, + "grad_norm": 0.17429909110069275, + "learning_rate": 1.9617218852179162e-05, + "loss": 1.2383, + "step": 3284 + }, + { + "epoch": 1.2230224197507882, + "grad_norm": 0.17724157869815826, + "learning_rate": 1.9616885751190354e-05, + "loss": 1.2176, + "step": 3285 + }, + { + "epoch": 1.2233947249013972, + "grad_norm": 0.1739131063222885, + "learning_rate": 1.9616552508160747e-05, + "loss": 1.2302, + "step": 3286 + }, + { + "epoch": 1.2237670300520063, + "grad_norm": 0.1718931496143341, + "learning_rate": 1.9616219123095265e-05, + "loss": 1.2336, + "step": 3287 + }, + { + "epoch": 1.2241393352026155, + "grad_norm": 0.17403414845466614, + "learning_rate": 1.9615885595998825e-05, + "loss": 1.2193, + "step": 3288 + }, + { + "epoch": 1.2245116403532246, + "grad_norm": 0.16218788921833038, + "learning_rate": 1.9615551926876358e-05, + "loss": 1.2239, + "step": 3289 + }, + { + "epoch": 1.2248839455038336, + "grad_norm": 0.17476886510849, + "learning_rate": 1.9615218115732796e-05, + "loss": 1.2332, + "step": 3290 + }, + { + "epoch": 1.2252562506544427, + "grad_norm": 0.18384511768817902, + "learning_rate": 1.9614884162573067e-05, + "loss": 1.2243, + "step": 3291 + }, + { + "epoch": 1.2256285558050517, + "grad_norm": 0.174949049949646, + "learning_rate": 1.96145500674021e-05, + "loss": 1.2188, + "step": 3292 + }, + { + "epoch": 1.2260008609556607, + "grad_norm": 0.18955911695957184, + "learning_rate": 1.9614215830224832e-05, + "loss": 1.2269, + "step": 3293 + }, + { + "epoch": 1.2263731661062698, + "grad_norm": 0.17960596084594727, + "learning_rate": 1.96138814510462e-05, + "loss": 1.2313, + "step": 3294 + }, + { + "epoch": 1.2267454712568788, + "grad_norm": 0.18217206001281738, + "learning_rate": 1.961354692987114e-05, + "loss": 1.2345, + "step": 3295 + }, + { + "epoch": 1.2271177764074879, + "grad_norm": 0.1756083369255066, + "learning_rate": 1.9613212266704597e-05, + "loss": 1.2195, + "step": 3296 + }, + { + "epoch": 1.2274900815580971, + "grad_norm": 0.16914978623390198, + "learning_rate": 1.9612877461551516e-05, + "loss": 1.2131, + "step": 3297 + }, + { + "epoch": 1.2278623867087062, + "grad_norm": 0.1753888875246048, + "learning_rate": 1.9612542514416835e-05, + "loss": 1.24, + "step": 3298 + }, + { + "epoch": 1.2282346918593152, + "grad_norm": 0.17672811448574066, + "learning_rate": 1.96122074253055e-05, + "loss": 1.222, + "step": 3299 + }, + { + "epoch": 1.2286069970099243, + "grad_norm": 0.17020326852798462, + "learning_rate": 1.961187219422247e-05, + "loss": 1.2209, + "step": 3300 + }, + { + "epoch": 1.2289793021605333, + "grad_norm": 0.17728908360004425, + "learning_rate": 1.961153682117269e-05, + "loss": 1.2284, + "step": 3301 + }, + { + "epoch": 1.2293516073111423, + "grad_norm": 0.18490946292877197, + "learning_rate": 1.9611201306161115e-05, + "loss": 1.221, + "step": 3302 + }, + { + "epoch": 1.2297239124617514, + "grad_norm": 0.17374534904956818, + "learning_rate": 1.9610865649192695e-05, + "loss": 1.2393, + "step": 3303 + }, + { + "epoch": 1.2300962176123607, + "grad_norm": 0.17610304057598114, + "learning_rate": 1.96105298502724e-05, + "loss": 1.2163, + "step": 3304 + }, + { + "epoch": 1.2304685227629695, + "grad_norm": 0.17196281254291534, + "learning_rate": 1.961019390940518e-05, + "loss": 1.2256, + "step": 3305 + }, + { + "epoch": 1.2308408279135787, + "grad_norm": 0.16787520051002502, + "learning_rate": 1.9609857826595996e-05, + "loss": 1.2373, + "step": 3306 + }, + { + "epoch": 1.2312131330641878, + "grad_norm": 0.17314410209655762, + "learning_rate": 1.9609521601849815e-05, + "loss": 1.2213, + "step": 3307 + }, + { + "epoch": 1.2315854382147968, + "grad_norm": 0.1651289165019989, + "learning_rate": 1.9609185235171604e-05, + "loss": 1.2163, + "step": 3308 + }, + { + "epoch": 1.2319577433654059, + "grad_norm": 0.15858665108680725, + "learning_rate": 1.9608848726566328e-05, + "loss": 1.21, + "step": 3309 + }, + { + "epoch": 1.232330048516015, + "grad_norm": 0.1707463413476944, + "learning_rate": 1.9608512076038964e-05, + "loss": 1.216, + "step": 3310 + }, + { + "epoch": 1.232702353666624, + "grad_norm": 0.16724033653736115, + "learning_rate": 1.9608175283594476e-05, + "loss": 1.2289, + "step": 3311 + }, + { + "epoch": 1.233074658817233, + "grad_norm": 0.17127692699432373, + "learning_rate": 1.9607838349237847e-05, + "loss": 1.2233, + "step": 3312 + }, + { + "epoch": 1.2334469639678423, + "grad_norm": 0.17231450974941254, + "learning_rate": 1.9607501272974044e-05, + "loss": 1.2362, + "step": 3313 + }, + { + "epoch": 1.2338192691184513, + "grad_norm": 0.1658644676208496, + "learning_rate": 1.960716405480805e-05, + "loss": 1.2242, + "step": 3314 + }, + { + "epoch": 1.2341915742690603, + "grad_norm": 0.1748226135969162, + "learning_rate": 1.9606826694744847e-05, + "loss": 1.2307, + "step": 3315 + }, + { + "epoch": 1.2345638794196694, + "grad_norm": 0.1692144274711609, + "learning_rate": 1.9606489192789418e-05, + "loss": 1.2219, + "step": 3316 + }, + { + "epoch": 1.2349361845702784, + "grad_norm": 0.1659345030784607, + "learning_rate": 1.9606151548946744e-05, + "loss": 1.225, + "step": 3317 + }, + { + "epoch": 1.2353084897208875, + "grad_norm": 0.17193017899990082, + "learning_rate": 1.9605813763221816e-05, + "loss": 1.2318, + "step": 3318 + }, + { + "epoch": 1.2356807948714965, + "grad_norm": 0.17747105658054352, + "learning_rate": 1.960547583561962e-05, + "loss": 1.2413, + "step": 3319 + }, + { + "epoch": 1.2360531000221056, + "grad_norm": 0.1722479611635208, + "learning_rate": 1.9605137766145154e-05, + "loss": 1.2255, + "step": 3320 + }, + { + "epoch": 1.2364254051727146, + "grad_norm": 0.16848038136959076, + "learning_rate": 1.96047995548034e-05, + "loss": 1.2384, + "step": 3321 + }, + { + "epoch": 1.2367977103233239, + "grad_norm": 0.16798731684684753, + "learning_rate": 1.960446120159936e-05, + "loss": 1.222, + "step": 3322 + }, + { + "epoch": 1.237170015473933, + "grad_norm": 0.1701812744140625, + "learning_rate": 1.9604122706538033e-05, + "loss": 1.2218, + "step": 3323 + }, + { + "epoch": 1.237542320624542, + "grad_norm": 0.1743394136428833, + "learning_rate": 1.9603784069624417e-05, + "loss": 1.2381, + "step": 3324 + }, + { + "epoch": 1.237914625775151, + "grad_norm": 0.16606199741363525, + "learning_rate": 1.960344529086351e-05, + "loss": 1.2241, + "step": 3325 + }, + { + "epoch": 1.23828693092576, + "grad_norm": 0.17570117115974426, + "learning_rate": 1.960310637026032e-05, + "loss": 1.2356, + "step": 3326 + }, + { + "epoch": 1.238659236076369, + "grad_norm": 0.17124217748641968, + "learning_rate": 1.960276730781985e-05, + "loss": 1.2422, + "step": 3327 + }, + { + "epoch": 1.2390315412269781, + "grad_norm": 0.21202170848846436, + "learning_rate": 1.9602428103547112e-05, + "loss": 1.2334, + "step": 3328 + }, + { + "epoch": 1.2394038463775872, + "grad_norm": 0.17141880095005035, + "learning_rate": 1.9602088757447114e-05, + "loss": 1.2254, + "step": 3329 + }, + { + "epoch": 1.2397761515281962, + "grad_norm": 0.18297582864761353, + "learning_rate": 1.9601749269524867e-05, + "loss": 1.2451, + "step": 3330 + }, + { + "epoch": 1.2401484566788055, + "grad_norm": 0.1688864827156067, + "learning_rate": 1.9601409639785384e-05, + "loss": 1.2194, + "step": 3331 + }, + { + "epoch": 1.2405207618294145, + "grad_norm": 0.1724138855934143, + "learning_rate": 1.9601069868233687e-05, + "loss": 1.2278, + "step": 3332 + }, + { + "epoch": 1.2408930669800236, + "grad_norm": 0.174929678440094, + "learning_rate": 1.9600729954874786e-05, + "loss": 1.2341, + "step": 3333 + }, + { + "epoch": 1.2412653721306326, + "grad_norm": 0.17739325761795044, + "learning_rate": 1.9600389899713707e-05, + "loss": 1.2258, + "step": 3334 + }, + { + "epoch": 1.2416376772812416, + "grad_norm": 0.1830214411020279, + "learning_rate": 1.9600049702755473e-05, + "loss": 1.2313, + "step": 3335 + }, + { + "epoch": 1.2420099824318507, + "grad_norm": 0.16760197281837463, + "learning_rate": 1.9599709364005107e-05, + "loss": 1.2235, + "step": 3336 + }, + { + "epoch": 1.2423822875824597, + "grad_norm": 0.16950567066669464, + "learning_rate": 1.959936888346764e-05, + "loss": 1.2124, + "step": 3337 + }, + { + "epoch": 1.2427545927330688, + "grad_norm": 0.1735774725675583, + "learning_rate": 1.959902826114809e-05, + "loss": 1.213, + "step": 3338 + }, + { + "epoch": 1.2431268978836778, + "grad_norm": 0.17555302381515503, + "learning_rate": 1.9598687497051497e-05, + "loss": 1.2144, + "step": 3339 + }, + { + "epoch": 1.243499203034287, + "grad_norm": 0.17770478129386902, + "learning_rate": 1.9598346591182896e-05, + "loss": 1.2089, + "step": 3340 + }, + { + "epoch": 1.2438715081848961, + "grad_norm": 0.16483311355113983, + "learning_rate": 1.9598005543547315e-05, + "loss": 1.2113, + "step": 3341 + }, + { + "epoch": 1.2442438133355052, + "grad_norm": 0.17812606692314148, + "learning_rate": 1.9597664354149793e-05, + "loss": 1.2189, + "step": 3342 + }, + { + "epoch": 1.2446161184861142, + "grad_norm": 0.17400455474853516, + "learning_rate": 1.9597323022995375e-05, + "loss": 1.2183, + "step": 3343 + }, + { + "epoch": 1.2449884236367232, + "grad_norm": 0.17530007660388947, + "learning_rate": 1.95969815500891e-05, + "loss": 1.2222, + "step": 3344 + }, + { + "epoch": 1.2453607287873323, + "grad_norm": 0.16525647044181824, + "learning_rate": 1.9596639935436005e-05, + "loss": 1.2257, + "step": 3345 + }, + { + "epoch": 1.2457330339379413, + "grad_norm": 0.1811496466398239, + "learning_rate": 1.959629817904114e-05, + "loss": 1.2295, + "step": 3346 + }, + { + "epoch": 1.2461053390885504, + "grad_norm": 0.16356612741947174, + "learning_rate": 1.9595956280909552e-05, + "loss": 1.2251, + "step": 3347 + }, + { + "epoch": 1.2464776442391594, + "grad_norm": 0.1709991693496704, + "learning_rate": 1.9595614241046296e-05, + "loss": 1.2053, + "step": 3348 + }, + { + "epoch": 1.2468499493897687, + "grad_norm": 0.16833190619945526, + "learning_rate": 1.959527205945642e-05, + "loss": 1.2216, + "step": 3349 + }, + { + "epoch": 1.2472222545403777, + "grad_norm": 0.1702100783586502, + "learning_rate": 1.9594929736144978e-05, + "loss": 1.2207, + "step": 3350 + }, + { + "epoch": 1.2475945596909868, + "grad_norm": 0.1710902601480484, + "learning_rate": 1.9594587271117023e-05, + "loss": 1.2166, + "step": 3351 + }, + { + "epoch": 1.2479668648415958, + "grad_norm": 0.17266355454921722, + "learning_rate": 1.9594244664377617e-05, + "loss": 1.2489, + "step": 3352 + }, + { + "epoch": 1.2483391699922048, + "grad_norm": 0.16496752202510834, + "learning_rate": 1.959390191593182e-05, + "loss": 1.2185, + "step": 3353 + }, + { + "epoch": 1.2487114751428139, + "grad_norm": 0.16642600297927856, + "learning_rate": 1.9593559025784692e-05, + "loss": 1.2326, + "step": 3354 + }, + { + "epoch": 1.249083780293423, + "grad_norm": 0.17854921519756317, + "learning_rate": 1.95932159939413e-05, + "loss": 1.2324, + "step": 3355 + }, + { + "epoch": 1.249456085444032, + "grad_norm": 0.17069010436534882, + "learning_rate": 1.959287282040671e-05, + "loss": 1.2088, + "step": 3356 + }, + { + "epoch": 1.249828390594641, + "grad_norm": 0.18059507012367249, + "learning_rate": 1.9592529505185993e-05, + "loss": 1.2264, + "step": 3357 + }, + { + "epoch": 1.2502006957452503, + "grad_norm": 0.173674538731575, + "learning_rate": 1.9592186048284216e-05, + "loss": 1.2263, + "step": 3358 + }, + { + "epoch": 1.2505730008958593, + "grad_norm": 0.16623902320861816, + "learning_rate": 1.9591842449706454e-05, + "loss": 1.2264, + "step": 3359 + }, + { + "epoch": 1.2509453060464684, + "grad_norm": 0.1817052662372589, + "learning_rate": 1.9591498709457776e-05, + "loss": 1.2062, + "step": 3360 + }, + { + "epoch": 1.2513176111970774, + "grad_norm": 0.17939196527004242, + "learning_rate": 1.959115482754327e-05, + "loss": 1.2464, + "step": 3361 + }, + { + "epoch": 1.2516899163476864, + "grad_norm": 0.17410658299922943, + "learning_rate": 1.9590810803968003e-05, + "loss": 1.2175, + "step": 3362 + }, + { + "epoch": 1.2520622214982955, + "grad_norm": 0.19902339577674866, + "learning_rate": 1.9590466638737068e-05, + "loss": 1.2195, + "step": 3363 + }, + { + "epoch": 1.2524345266489045, + "grad_norm": 0.1737726330757141, + "learning_rate": 1.9590122331855543e-05, + "loss": 1.2097, + "step": 3364 + }, + { + "epoch": 1.2528068317995138, + "grad_norm": 0.17546404898166656, + "learning_rate": 1.9589777883328506e-05, + "loss": 1.2249, + "step": 3365 + }, + { + "epoch": 1.2531791369501226, + "grad_norm": 0.1736469715833664, + "learning_rate": 1.9589433293161057e-05, + "loss": 1.2135, + "step": 3366 + }, + { + "epoch": 1.2535514421007319, + "grad_norm": 0.17716355621814728, + "learning_rate": 1.9589088561358278e-05, + "loss": 1.2203, + "step": 3367 + }, + { + "epoch": 1.253923747251341, + "grad_norm": 0.18464305996894836, + "learning_rate": 1.9588743687925264e-05, + "loss": 1.2134, + "step": 3368 + }, + { + "epoch": 1.25429605240195, + "grad_norm": 0.17414681613445282, + "learning_rate": 1.9588398672867108e-05, + "loss": 1.2212, + "step": 3369 + }, + { + "epoch": 1.254668357552559, + "grad_norm": 0.17853331565856934, + "learning_rate": 1.9588053516188906e-05, + "loss": 1.2398, + "step": 3370 + }, + { + "epoch": 1.255040662703168, + "grad_norm": 0.17003083229064941, + "learning_rate": 1.958770821789575e-05, + "loss": 1.2234, + "step": 3371 + }, + { + "epoch": 1.255412967853777, + "grad_norm": 0.18236912786960602, + "learning_rate": 1.9587362777992753e-05, + "loss": 1.2246, + "step": 3372 + }, + { + "epoch": 1.2557852730043861, + "grad_norm": 0.18175008893013, + "learning_rate": 1.9587017196485007e-05, + "loss": 1.2313, + "step": 3373 + }, + { + "epoch": 1.2561575781549954, + "grad_norm": 0.18152983486652374, + "learning_rate": 1.9586671473377614e-05, + "loss": 1.222, + "step": 3374 + }, + { + "epoch": 1.2565298833056042, + "grad_norm": 0.1859092116355896, + "learning_rate": 1.9586325608675688e-05, + "loss": 1.231, + "step": 3375 + }, + { + "epoch": 1.2569021884562135, + "grad_norm": 0.19084875285625458, + "learning_rate": 1.9585979602384334e-05, + "loss": 1.2234, + "step": 3376 + }, + { + "epoch": 1.2572744936068225, + "grad_norm": 0.18165820837020874, + "learning_rate": 1.9585633454508665e-05, + "loss": 1.225, + "step": 3377 + }, + { + "epoch": 1.2576467987574316, + "grad_norm": 0.16506415605545044, + "learning_rate": 1.958528716505379e-05, + "loss": 1.2329, + "step": 3378 + }, + { + "epoch": 1.2580191039080406, + "grad_norm": 0.17367565631866455, + "learning_rate": 1.9584940734024826e-05, + "loss": 1.2263, + "step": 3379 + }, + { + "epoch": 1.2583914090586497, + "grad_norm": 0.16884395480155945, + "learning_rate": 1.9584594161426888e-05, + "loss": 1.2087, + "step": 3380 + }, + { + "epoch": 1.2587637142092587, + "grad_norm": 0.18630245327949524, + "learning_rate": 1.9584247447265095e-05, + "loss": 1.2279, + "step": 3381 + }, + { + "epoch": 1.2591360193598677, + "grad_norm": 0.16664794087409973, + "learning_rate": 1.958390059154457e-05, + "loss": 1.2317, + "step": 3382 + }, + { + "epoch": 1.259508324510477, + "grad_norm": 0.17144736647605896, + "learning_rate": 1.9583553594270433e-05, + "loss": 1.2157, + "step": 3383 + }, + { + "epoch": 1.259880629661086, + "grad_norm": 0.17621219158172607, + "learning_rate": 1.9583206455447812e-05, + "loss": 1.2179, + "step": 3384 + }, + { + "epoch": 1.260252934811695, + "grad_norm": 0.17258892953395844, + "learning_rate": 1.9582859175081835e-05, + "loss": 1.2349, + "step": 3385 + }, + { + "epoch": 1.2606252399623041, + "grad_norm": 0.16975384950637817, + "learning_rate": 1.9582511753177625e-05, + "loss": 1.2155, + "step": 3386 + }, + { + "epoch": 1.2609975451129132, + "grad_norm": 0.171031653881073, + "learning_rate": 1.9582164189740322e-05, + "loss": 1.2121, + "step": 3387 + }, + { + "epoch": 1.2613698502635222, + "grad_norm": 0.16449801623821259, + "learning_rate": 1.9581816484775055e-05, + "loss": 1.2154, + "step": 3388 + }, + { + "epoch": 1.2617421554141313, + "grad_norm": 0.17565850913524628, + "learning_rate": 1.9581468638286954e-05, + "loss": 1.2266, + "step": 3389 + }, + { + "epoch": 1.2621144605647403, + "grad_norm": 0.1752316802740097, + "learning_rate": 1.9581120650281166e-05, + "loss": 1.2237, + "step": 3390 + }, + { + "epoch": 1.2624867657153493, + "grad_norm": 0.16294018924236298, + "learning_rate": 1.958077252076283e-05, + "loss": 1.2161, + "step": 3391 + }, + { + "epoch": 1.2628590708659586, + "grad_norm": 0.16601049900054932, + "learning_rate": 1.9580424249737085e-05, + "loss": 1.2202, + "step": 3392 + }, + { + "epoch": 1.2632313760165677, + "grad_norm": 0.17357154190540314, + "learning_rate": 1.958007583720907e-05, + "loss": 1.2107, + "step": 3393 + }, + { + "epoch": 1.2636036811671767, + "grad_norm": 0.17299290001392365, + "learning_rate": 1.957972728318394e-05, + "loss": 1.2297, + "step": 3394 + }, + { + "epoch": 1.2639759863177857, + "grad_norm": 0.1687079221010208, + "learning_rate": 1.9579378587666838e-05, + "loss": 1.2087, + "step": 3395 + }, + { + "epoch": 1.2643482914683948, + "grad_norm": 0.16745884716510773, + "learning_rate": 1.9579029750662918e-05, + "loss": 1.2154, + "step": 3396 + }, + { + "epoch": 1.2647205966190038, + "grad_norm": 0.17740416526794434, + "learning_rate": 1.9578680772177327e-05, + "loss": 1.2304, + "step": 3397 + }, + { + "epoch": 1.2650929017696129, + "grad_norm": 0.16838476061820984, + "learning_rate": 1.9578331652215224e-05, + "loss": 1.2315, + "step": 3398 + }, + { + "epoch": 1.265465206920222, + "grad_norm": 0.1673501580953598, + "learning_rate": 1.9577982390781766e-05, + "loss": 1.2072, + "step": 3399 + }, + { + "epoch": 1.265837512070831, + "grad_norm": 0.1591964215040207, + "learning_rate": 1.9577632987882103e-05, + "loss": 1.2121, + "step": 3400 + }, + { + "epoch": 1.2662098172214402, + "grad_norm": 0.1804029494524002, + "learning_rate": 1.9577283443521403e-05, + "loss": 1.225, + "step": 3401 + }, + { + "epoch": 1.2665821223720493, + "grad_norm": 0.17372415959835052, + "learning_rate": 1.957693375770483e-05, + "loss": 1.212, + "step": 3402 + }, + { + "epoch": 1.2669544275226583, + "grad_norm": 0.17061223089694977, + "learning_rate": 1.9576583930437546e-05, + "loss": 1.2165, + "step": 3403 + }, + { + "epoch": 1.2673267326732673, + "grad_norm": 0.19941537082195282, + "learning_rate": 1.9576233961724716e-05, + "loss": 1.2322, + "step": 3404 + }, + { + "epoch": 1.2676990378238764, + "grad_norm": 0.1921522617340088, + "learning_rate": 1.9575883851571516e-05, + "loss": 1.2069, + "step": 3405 + }, + { + "epoch": 1.2680713429744854, + "grad_norm": 0.17495955526828766, + "learning_rate": 1.957553359998311e-05, + "loss": 1.227, + "step": 3406 + }, + { + "epoch": 1.2684436481250945, + "grad_norm": 0.18259906768798828, + "learning_rate": 1.9575183206964673e-05, + "loss": 1.2292, + "step": 3407 + }, + { + "epoch": 1.2688159532757037, + "grad_norm": 0.16548015177249908, + "learning_rate": 1.9574832672521384e-05, + "loss": 1.209, + "step": 3408 + }, + { + "epoch": 1.2691882584263126, + "grad_norm": 0.17932672798633575, + "learning_rate": 1.9574481996658412e-05, + "loss": 1.2247, + "step": 3409 + }, + { + "epoch": 1.2695605635769218, + "grad_norm": 0.16864344477653503, + "learning_rate": 1.9574131179380945e-05, + "loss": 1.209, + "step": 3410 + }, + { + "epoch": 1.2699328687275309, + "grad_norm": 0.16725954413414001, + "learning_rate": 1.957378022069416e-05, + "loss": 1.2094, + "step": 3411 + }, + { + "epoch": 1.27030517387814, + "grad_norm": 0.17758184671401978, + "learning_rate": 1.9573429120603245e-05, + "loss": 1.2177, + "step": 3412 + }, + { + "epoch": 1.270677479028749, + "grad_norm": 0.17803123593330383, + "learning_rate": 1.957307787911338e-05, + "loss": 1.2244, + "step": 3413 + }, + { + "epoch": 1.271049784179358, + "grad_norm": 0.1861790120601654, + "learning_rate": 1.9572726496229754e-05, + "loss": 1.2335, + "step": 3414 + }, + { + "epoch": 1.271422089329967, + "grad_norm": 0.17091241478919983, + "learning_rate": 1.9572374971957562e-05, + "loss": 1.2198, + "step": 3415 + }, + { + "epoch": 1.271794394480576, + "grad_norm": 0.16536393761634827, + "learning_rate": 1.957202330630199e-05, + "loss": 1.2227, + "step": 3416 + }, + { + "epoch": 1.2721666996311853, + "grad_norm": 0.1737341731786728, + "learning_rate": 1.9571671499268238e-05, + "loss": 1.2294, + "step": 3417 + }, + { + "epoch": 1.2725390047817942, + "grad_norm": 0.16454610228538513, + "learning_rate": 1.9571319550861493e-05, + "loss": 1.2099, + "step": 3418 + }, + { + "epoch": 1.2729113099324034, + "grad_norm": 0.17201176285743713, + "learning_rate": 1.957096746108696e-05, + "loss": 1.23, + "step": 3419 + }, + { + "epoch": 1.2732836150830125, + "grad_norm": 0.16498921811580658, + "learning_rate": 1.9570615229949844e-05, + "loss": 1.2081, + "step": 3420 + }, + { + "epoch": 1.2736559202336215, + "grad_norm": 0.16812345385551453, + "learning_rate": 1.9570262857455336e-05, + "loss": 1.2207, + "step": 3421 + }, + { + "epoch": 1.2740282253842306, + "grad_norm": 0.17763420939445496, + "learning_rate": 1.956991034360865e-05, + "loss": 1.233, + "step": 3422 + }, + { + "epoch": 1.2744005305348396, + "grad_norm": 0.18109717965126038, + "learning_rate": 1.9569557688414985e-05, + "loss": 1.2304, + "step": 3423 + }, + { + "epoch": 1.2747728356854486, + "grad_norm": 0.1638157218694687, + "learning_rate": 1.9569204891879554e-05, + "loss": 1.2216, + "step": 3424 + }, + { + "epoch": 1.2751451408360577, + "grad_norm": 0.17872877418994904, + "learning_rate": 1.956885195400757e-05, + "loss": 1.215, + "step": 3425 + }, + { + "epoch": 1.275517445986667, + "grad_norm": 0.1729113757610321, + "learning_rate": 1.956849887480424e-05, + "loss": 1.2073, + "step": 3426 + }, + { + "epoch": 1.2758897511372758, + "grad_norm": 0.16412028670310974, + "learning_rate": 1.9568145654274787e-05, + "loss": 1.2321, + "step": 3427 + }, + { + "epoch": 1.276262056287885, + "grad_norm": 0.17767532169818878, + "learning_rate": 1.9567792292424417e-05, + "loss": 1.2291, + "step": 3428 + }, + { + "epoch": 1.276634361438494, + "grad_norm": 0.17381222546100616, + "learning_rate": 1.956743878925836e-05, + "loss": 1.2315, + "step": 3429 + }, + { + "epoch": 1.2770066665891031, + "grad_norm": 0.17282582819461823, + "learning_rate": 1.956708514478183e-05, + "loss": 1.2208, + "step": 3430 + }, + { + "epoch": 1.2773789717397122, + "grad_norm": 0.16768918931484222, + "learning_rate": 1.9566731359000056e-05, + "loss": 1.2107, + "step": 3431 + }, + { + "epoch": 1.2777512768903212, + "grad_norm": 0.1659182608127594, + "learning_rate": 1.9566377431918258e-05, + "loss": 1.216, + "step": 3432 + }, + { + "epoch": 1.2781235820409302, + "grad_norm": 0.1740458458662033, + "learning_rate": 1.9566023363541664e-05, + "loss": 1.2357, + "step": 3433 + }, + { + "epoch": 1.2784958871915393, + "grad_norm": 0.167833611369133, + "learning_rate": 1.956566915387551e-05, + "loss": 1.2286, + "step": 3434 + }, + { + "epoch": 1.2788681923421485, + "grad_norm": 0.17230744659900665, + "learning_rate": 1.9565314802925017e-05, + "loss": 1.2251, + "step": 3435 + }, + { + "epoch": 1.2792404974927574, + "grad_norm": 0.16119259595870972, + "learning_rate": 1.9564960310695426e-05, + "loss": 1.2184, + "step": 3436 + }, + { + "epoch": 1.2796128026433666, + "grad_norm": 0.15957672894001007, + "learning_rate": 1.9564605677191975e-05, + "loss": 1.2133, + "step": 3437 + }, + { + "epoch": 1.2799851077939757, + "grad_norm": 0.16447031497955322, + "learning_rate": 1.9564250902419895e-05, + "loss": 1.2355, + "step": 3438 + }, + { + "epoch": 1.2803574129445847, + "grad_norm": 0.17678163945674896, + "learning_rate": 1.956389598638443e-05, + "loss": 1.2152, + "step": 3439 + }, + { + "epoch": 1.2807297180951938, + "grad_norm": 0.18602001667022705, + "learning_rate": 1.9563540929090825e-05, + "loss": 1.2069, + "step": 3440 + }, + { + "epoch": 1.2811020232458028, + "grad_norm": 0.16143512725830078, + "learning_rate": 1.9563185730544316e-05, + "loss": 1.2164, + "step": 3441 + }, + { + "epoch": 1.2814743283964118, + "grad_norm": 0.17867274582386017, + "learning_rate": 1.9562830390750157e-05, + "loss": 1.2228, + "step": 3442 + }, + { + "epoch": 1.2818466335470209, + "grad_norm": 0.16659069061279297, + "learning_rate": 1.9562474909713592e-05, + "loss": 1.2173, + "step": 3443 + }, + { + "epoch": 1.2822189386976302, + "grad_norm": 0.17568208277225494, + "learning_rate": 1.9562119287439874e-05, + "loss": 1.2199, + "step": 3444 + }, + { + "epoch": 1.2825912438482392, + "grad_norm": 0.18032683432102203, + "learning_rate": 1.956176352393425e-05, + "loss": 1.2246, + "step": 3445 + }, + { + "epoch": 1.2829635489988482, + "grad_norm": 0.1706835776567459, + "learning_rate": 1.956140761920198e-05, + "loss": 1.2079, + "step": 3446 + }, + { + "epoch": 1.2833358541494573, + "grad_norm": 0.17287270724773407, + "learning_rate": 1.9561051573248325e-05, + "loss": 1.2145, + "step": 3447 + }, + { + "epoch": 1.2837081593000663, + "grad_norm": 0.1678660660982132, + "learning_rate": 1.9560695386078536e-05, + "loss": 1.219, + "step": 3448 + }, + { + "epoch": 1.2840804644506754, + "grad_norm": 0.17408515512943268, + "learning_rate": 1.956033905769787e-05, + "loss": 1.2178, + "step": 3449 + }, + { + "epoch": 1.2844527696012844, + "grad_norm": 0.1688593178987503, + "learning_rate": 1.9559982588111604e-05, + "loss": 1.2167, + "step": 3450 + }, + { + "epoch": 1.2848250747518934, + "grad_norm": 0.16934001445770264, + "learning_rate": 1.9559625977324992e-05, + "loss": 1.2089, + "step": 3451 + }, + { + "epoch": 1.2851973799025025, + "grad_norm": 0.17669974267482758, + "learning_rate": 1.9559269225343303e-05, + "loss": 1.2276, + "step": 3452 + }, + { + "epoch": 1.2855696850531118, + "grad_norm": 0.18074657022953033, + "learning_rate": 1.955891233217181e-05, + "loss": 1.2208, + "step": 3453 + }, + { + "epoch": 1.2859419902037208, + "grad_norm": 0.16438442468643188, + "learning_rate": 1.955855529781578e-05, + "loss": 1.2096, + "step": 3454 + }, + { + "epoch": 1.2863142953543298, + "grad_norm": 0.20270679891109467, + "learning_rate": 1.9558198122280488e-05, + "loss": 1.2206, + "step": 3455 + }, + { + "epoch": 1.2866866005049389, + "grad_norm": 0.18252705037593842, + "learning_rate": 1.955784080557121e-05, + "loss": 1.2188, + "step": 3456 + }, + { + "epoch": 1.287058905655548, + "grad_norm": 0.17641377449035645, + "learning_rate": 1.9557483347693226e-05, + "loss": 1.2156, + "step": 3457 + }, + { + "epoch": 1.287431210806157, + "grad_norm": 0.16303670406341553, + "learning_rate": 1.955712574865181e-05, + "loss": 1.2378, + "step": 3458 + }, + { + "epoch": 1.287803515956766, + "grad_norm": 0.20156008005142212, + "learning_rate": 1.9556768008452245e-05, + "loss": 1.2284, + "step": 3459 + }, + { + "epoch": 1.288175821107375, + "grad_norm": 0.16495651006698608, + "learning_rate": 1.9556410127099817e-05, + "loss": 1.2097, + "step": 3460 + }, + { + "epoch": 1.288548126257984, + "grad_norm": 0.1750713586807251, + "learning_rate": 1.9556052104599813e-05, + "loss": 1.2175, + "step": 3461 + }, + { + "epoch": 1.2889204314085934, + "grad_norm": 0.17808473110198975, + "learning_rate": 1.9555693940957518e-05, + "loss": 1.2354, + "step": 3462 + }, + { + "epoch": 1.2892927365592024, + "grad_norm": 0.1678575724363327, + "learning_rate": 1.955533563617822e-05, + "loss": 1.2293, + "step": 3463 + }, + { + "epoch": 1.2896650417098114, + "grad_norm": 0.18347376585006714, + "learning_rate": 1.955497719026722e-05, + "loss": 1.2276, + "step": 3464 + }, + { + "epoch": 1.2900373468604205, + "grad_norm": 0.17216694355010986, + "learning_rate": 1.9554618603229804e-05, + "loss": 1.2365, + "step": 3465 + }, + { + "epoch": 1.2904096520110295, + "grad_norm": 0.16860246658325195, + "learning_rate": 1.9554259875071274e-05, + "loss": 1.2087, + "step": 3466 + }, + { + "epoch": 1.2907819571616386, + "grad_norm": 0.1744735687971115, + "learning_rate": 1.955390100579692e-05, + "loss": 1.2064, + "step": 3467 + }, + { + "epoch": 1.2911542623122476, + "grad_norm": 0.16828221082687378, + "learning_rate": 1.955354199541205e-05, + "loss": 1.2235, + "step": 3468 + }, + { + "epoch": 1.2915265674628569, + "grad_norm": 0.19916963577270508, + "learning_rate": 1.9553182843921963e-05, + "loss": 1.2269, + "step": 3469 + }, + { + "epoch": 1.2918988726134657, + "grad_norm": 0.17060431838035583, + "learning_rate": 1.9552823551331966e-05, + "loss": 1.2064, + "step": 3470 + }, + { + "epoch": 1.292271177764075, + "grad_norm": 0.17713354527950287, + "learning_rate": 1.9552464117647365e-05, + "loss": 1.2306, + "step": 3471 + }, + { + "epoch": 1.292643482914684, + "grad_norm": 0.17107143998146057, + "learning_rate": 1.955210454287347e-05, + "loss": 1.2413, + "step": 3472 + }, + { + "epoch": 1.293015788065293, + "grad_norm": 0.17678503692150116, + "learning_rate": 1.955174482701559e-05, + "loss": 1.2145, + "step": 3473 + }, + { + "epoch": 1.293388093215902, + "grad_norm": 0.17001782357692719, + "learning_rate": 1.955138497007904e-05, + "loss": 1.2357, + "step": 3474 + }, + { + "epoch": 1.2937603983665111, + "grad_norm": 0.1619400829076767, + "learning_rate": 1.9551024972069127e-05, + "loss": 1.2237, + "step": 3475 + }, + { + "epoch": 1.2941327035171202, + "grad_norm": 0.1685052067041397, + "learning_rate": 1.9550664832991178e-05, + "loss": 1.2217, + "step": 3476 + }, + { + "epoch": 1.2945050086677292, + "grad_norm": 0.1692323237657547, + "learning_rate": 1.9550304552850506e-05, + "loss": 1.2116, + "step": 3477 + }, + { + "epoch": 1.2948773138183385, + "grad_norm": 0.1704968959093094, + "learning_rate": 1.954994413165244e-05, + "loss": 1.2236, + "step": 3478 + }, + { + "epoch": 1.2952496189689473, + "grad_norm": 0.17128580808639526, + "learning_rate": 1.9549583569402297e-05, + "loss": 1.2362, + "step": 3479 + }, + { + "epoch": 1.2956219241195566, + "grad_norm": 0.16995707154273987, + "learning_rate": 1.95492228661054e-05, + "loss": 1.2178, + "step": 3480 + }, + { + "epoch": 1.2959942292701656, + "grad_norm": 0.16950835287570953, + "learning_rate": 1.9548862021767084e-05, + "loss": 1.2147, + "step": 3481 + }, + { + "epoch": 1.2963665344207747, + "grad_norm": 0.1846836656332016, + "learning_rate": 1.9548501036392676e-05, + "loss": 1.215, + "step": 3482 + }, + { + "epoch": 1.2967388395713837, + "grad_norm": 0.16860432922840118, + "learning_rate": 1.954813990998751e-05, + "loss": 1.2209, + "step": 3483 + }, + { + "epoch": 1.2971111447219927, + "grad_norm": 0.162691131234169, + "learning_rate": 1.9547778642556913e-05, + "loss": 1.1976, + "step": 3484 + }, + { + "epoch": 1.2974834498726018, + "grad_norm": 0.17532385885715485, + "learning_rate": 1.954741723410622e-05, + "loss": 1.226, + "step": 3485 + }, + { + "epoch": 1.2978557550232108, + "grad_norm": 0.18503205478191376, + "learning_rate": 1.954705568464078e-05, + "loss": 1.2223, + "step": 3486 + }, + { + "epoch": 1.29822806017382, + "grad_norm": 0.1707049161195755, + "learning_rate": 1.9546693994165922e-05, + "loss": 1.2179, + "step": 3487 + }, + { + "epoch": 1.298600365324429, + "grad_norm": 0.17060478031635284, + "learning_rate": 1.9546332162687e-05, + "loss": 1.2217, + "step": 3488 + }, + { + "epoch": 1.2989726704750382, + "grad_norm": 0.17310044169425964, + "learning_rate": 1.9545970190209346e-05, + "loss": 1.221, + "step": 3489 + }, + { + "epoch": 1.2993449756256472, + "grad_norm": 0.19877584278583527, + "learning_rate": 1.9545608076738312e-05, + "loss": 1.2391, + "step": 3490 + }, + { + "epoch": 1.2997172807762563, + "grad_norm": 0.16506221890449524, + "learning_rate": 1.9545245822279243e-05, + "loss": 1.2112, + "step": 3491 + }, + { + "epoch": 1.3000895859268653, + "grad_norm": 0.18883390724658966, + "learning_rate": 1.9544883426837497e-05, + "loss": 1.2221, + "step": 3492 + }, + { + "epoch": 1.3004618910774743, + "grad_norm": 0.18333663046360016, + "learning_rate": 1.954452089041842e-05, + "loss": 1.231, + "step": 3493 + }, + { + "epoch": 1.3008341962280834, + "grad_norm": 0.17763221263885498, + "learning_rate": 1.954415821302737e-05, + "loss": 1.2195, + "step": 3494 + }, + { + "epoch": 1.3012065013786924, + "grad_norm": 0.1757153868675232, + "learning_rate": 1.9543795394669696e-05, + "loss": 1.2183, + "step": 3495 + }, + { + "epoch": 1.3015788065293017, + "grad_norm": 0.1739683747291565, + "learning_rate": 1.954343243535077e-05, + "loss": 1.2039, + "step": 3496 + }, + { + "epoch": 1.3019511116799105, + "grad_norm": 0.17304451763629913, + "learning_rate": 1.9543069335075945e-05, + "loss": 1.2128, + "step": 3497 + }, + { + "epoch": 1.3023234168305198, + "grad_norm": 0.17375996708869934, + "learning_rate": 1.9542706093850585e-05, + "loss": 1.2332, + "step": 3498 + }, + { + "epoch": 1.3026957219811288, + "grad_norm": 0.18385480344295502, + "learning_rate": 1.9542342711680053e-05, + "loss": 1.2246, + "step": 3499 + }, + { + "epoch": 1.3030680271317379, + "grad_norm": 0.17756296694278717, + "learning_rate": 1.9541979188569717e-05, + "loss": 1.2084, + "step": 3500 + }, + { + "epoch": 1.3030680271317379, + "eval_loss": 1.3167004585266113, + "eval_runtime": 16.5219, + "eval_samples_per_second": 104.951, + "eval_steps_per_second": 5.266, + "step": 3500 + }, + { + "epoch": 1.303440332282347, + "grad_norm": 0.18115080893039703, + "learning_rate": 1.9541615524524946e-05, + "loss": 1.2279, + "step": 3501 + }, + { + "epoch": 1.303812637432956, + "grad_norm": 0.18096600472927094, + "learning_rate": 1.9541251719551116e-05, + "loss": 1.238, + "step": 3502 + }, + { + "epoch": 1.304184942583565, + "grad_norm": 0.18809261918067932, + "learning_rate": 1.9540887773653594e-05, + "loss": 1.2256, + "step": 3503 + }, + { + "epoch": 1.304557247734174, + "grad_norm": 0.19462642073631287, + "learning_rate": 1.954052368683776e-05, + "loss": 1.2191, + "step": 3504 + }, + { + "epoch": 1.3049295528847833, + "grad_norm": 0.17324475944042206, + "learning_rate": 1.9540159459108992e-05, + "loss": 1.218, + "step": 3505 + }, + { + "epoch": 1.3053018580353923, + "grad_norm": 0.18626105785369873, + "learning_rate": 1.9539795090472665e-05, + "loss": 1.2095, + "step": 3506 + }, + { + "epoch": 1.3056741631860014, + "grad_norm": 0.17831236124038696, + "learning_rate": 1.9539430580934162e-05, + "loss": 1.2298, + "step": 3507 + }, + { + "epoch": 1.3060464683366104, + "grad_norm": 0.17475959658622742, + "learning_rate": 1.953906593049887e-05, + "loss": 1.2262, + "step": 3508 + }, + { + "epoch": 1.3064187734872195, + "grad_norm": 0.1782606840133667, + "learning_rate": 1.9538701139172174e-05, + "loss": 1.2196, + "step": 3509 + }, + { + "epoch": 1.3067910786378285, + "grad_norm": 0.181804358959198, + "learning_rate": 1.9538336206959457e-05, + "loss": 1.2171, + "step": 3510 + }, + { + "epoch": 1.3071633837884375, + "grad_norm": 0.17738310992717743, + "learning_rate": 1.9537971133866116e-05, + "loss": 1.2304, + "step": 3511 + }, + { + "epoch": 1.3075356889390466, + "grad_norm": 0.16684716939926147, + "learning_rate": 1.953760591989754e-05, + "loss": 1.203, + "step": 3512 + }, + { + "epoch": 1.3079079940896556, + "grad_norm": 0.17668993771076202, + "learning_rate": 1.953724056505912e-05, + "loss": 1.2144, + "step": 3513 + }, + { + "epoch": 1.308280299240265, + "grad_norm": 0.17957065999507904, + "learning_rate": 1.953687506935626e-05, + "loss": 1.2201, + "step": 3514 + }, + { + "epoch": 1.308652604390874, + "grad_norm": 0.16784153878688812, + "learning_rate": 1.953650943279435e-05, + "loss": 1.2145, + "step": 3515 + }, + { + "epoch": 1.309024909541483, + "grad_norm": 0.17085857689380646, + "learning_rate": 1.9536143655378795e-05, + "loss": 1.2283, + "step": 3516 + }, + { + "epoch": 1.309397214692092, + "grad_norm": 0.1773957908153534, + "learning_rate": 1.9535777737114997e-05, + "loss": 1.2117, + "step": 3517 + }, + { + "epoch": 1.309769519842701, + "grad_norm": 0.1848251074552536, + "learning_rate": 1.953541167800836e-05, + "loss": 1.2149, + "step": 3518 + }, + { + "epoch": 1.3101418249933101, + "grad_norm": 0.1742524355649948, + "learning_rate": 1.9535045478064293e-05, + "loss": 1.2426, + "step": 3519 + }, + { + "epoch": 1.3105141301439192, + "grad_norm": 0.1685061901807785, + "learning_rate": 1.95346791372882e-05, + "loss": 1.2167, + "step": 3520 + }, + { + "epoch": 1.3108864352945282, + "grad_norm": 0.15838715434074402, + "learning_rate": 1.9534312655685497e-05, + "loss": 1.2213, + "step": 3521 + }, + { + "epoch": 1.3112587404451372, + "grad_norm": 0.1682320386171341, + "learning_rate": 1.9533946033261593e-05, + "loss": 1.2267, + "step": 3522 + }, + { + "epoch": 1.3116310455957465, + "grad_norm": 0.17389652132987976, + "learning_rate": 1.9533579270021904e-05, + "loss": 1.2222, + "step": 3523 + }, + { + "epoch": 1.3120033507463555, + "grad_norm": 0.17018966376781464, + "learning_rate": 1.9533212365971844e-05, + "loss": 1.2272, + "step": 3524 + }, + { + "epoch": 1.3123756558969646, + "grad_norm": 0.16955389082431793, + "learning_rate": 1.953284532111684e-05, + "loss": 1.2204, + "step": 3525 + }, + { + "epoch": 1.3127479610475736, + "grad_norm": 0.16709072887897491, + "learning_rate": 1.9532478135462313e-05, + "loss": 1.222, + "step": 3526 + }, + { + "epoch": 1.3131202661981827, + "grad_norm": 0.1752871423959732, + "learning_rate": 1.9532110809013676e-05, + "loss": 1.22, + "step": 3527 + }, + { + "epoch": 1.3134925713487917, + "grad_norm": 0.1721472591161728, + "learning_rate": 1.953174334177636e-05, + "loss": 1.2195, + "step": 3528 + }, + { + "epoch": 1.3138648764994008, + "grad_norm": 0.17040152847766876, + "learning_rate": 1.9531375733755795e-05, + "loss": 1.224, + "step": 3529 + }, + { + "epoch": 1.31423718165001, + "grad_norm": 0.17984364926815033, + "learning_rate": 1.9531007984957408e-05, + "loss": 1.2178, + "step": 3530 + }, + { + "epoch": 1.3146094868006188, + "grad_norm": 0.16760718822479248, + "learning_rate": 1.953064009538663e-05, + "loss": 1.2096, + "step": 3531 + }, + { + "epoch": 1.314981791951228, + "grad_norm": 0.1705697625875473, + "learning_rate": 1.9530272065048903e-05, + "loss": 1.2189, + "step": 3532 + }, + { + "epoch": 1.3153540971018371, + "grad_norm": 0.17441794276237488, + "learning_rate": 1.9529903893949647e-05, + "loss": 1.2218, + "step": 3533 + }, + { + "epoch": 1.3157264022524462, + "grad_norm": 0.17177151143550873, + "learning_rate": 1.9529535582094315e-05, + "loss": 1.219, + "step": 3534 + }, + { + "epoch": 1.3160987074030552, + "grad_norm": 0.18386155366897583, + "learning_rate": 1.9529167129488335e-05, + "loss": 1.2187, + "step": 3535 + }, + { + "epoch": 1.3164710125536643, + "grad_norm": 0.17924180626869202, + "learning_rate": 1.9528798536137157e-05, + "loss": 1.2303, + "step": 3536 + }, + { + "epoch": 1.3168433177042733, + "grad_norm": 0.16562217473983765, + "learning_rate": 1.9528429802046225e-05, + "loss": 1.2155, + "step": 3537 + }, + { + "epoch": 1.3172156228548824, + "grad_norm": 0.1981254667043686, + "learning_rate": 1.952806092722098e-05, + "loss": 1.203, + "step": 3538 + }, + { + "epoch": 1.3175879280054916, + "grad_norm": 0.17058147490024567, + "learning_rate": 1.9527691911666875e-05, + "loss": 1.2058, + "step": 3539 + }, + { + "epoch": 1.3179602331561004, + "grad_norm": 0.1780320107936859, + "learning_rate": 1.9527322755389355e-05, + "loss": 1.2242, + "step": 3540 + }, + { + "epoch": 1.3183325383067097, + "grad_norm": 0.17311114072799683, + "learning_rate": 1.9526953458393878e-05, + "loss": 1.2142, + "step": 3541 + }, + { + "epoch": 1.3187048434573188, + "grad_norm": 0.1679919809103012, + "learning_rate": 1.9526584020685896e-05, + "loss": 1.213, + "step": 3542 + }, + { + "epoch": 1.3190771486079278, + "grad_norm": 0.17678605020046234, + "learning_rate": 1.9526214442270865e-05, + "loss": 1.2275, + "step": 3543 + }, + { + "epoch": 1.3194494537585368, + "grad_norm": 0.16861461102962494, + "learning_rate": 1.9525844723154246e-05, + "loss": 1.2262, + "step": 3544 + }, + { + "epoch": 1.3198217589091459, + "grad_norm": 0.18046718835830688, + "learning_rate": 1.95254748633415e-05, + "loss": 1.2258, + "step": 3545 + }, + { + "epoch": 1.320194064059755, + "grad_norm": 0.16833364963531494, + "learning_rate": 1.9525104862838085e-05, + "loss": 1.2195, + "step": 3546 + }, + { + "epoch": 1.320566369210364, + "grad_norm": 0.16752612590789795, + "learning_rate": 1.952473472164947e-05, + "loss": 1.2227, + "step": 3547 + }, + { + "epoch": 1.3209386743609732, + "grad_norm": 0.16177985072135925, + "learning_rate": 1.952436443978112e-05, + "loss": 1.2036, + "step": 3548 + }, + { + "epoch": 1.321310979511582, + "grad_norm": 0.1741490513086319, + "learning_rate": 1.9523994017238505e-05, + "loss": 1.2292, + "step": 3549 + }, + { + "epoch": 1.3216832846621913, + "grad_norm": 0.17161637544631958, + "learning_rate": 1.9523623454027095e-05, + "loss": 1.2195, + "step": 3550 + }, + { + "epoch": 1.3220555898128004, + "grad_norm": 0.16780497133731842, + "learning_rate": 1.9523252750152367e-05, + "loss": 1.2235, + "step": 3551 + }, + { + "epoch": 1.3224278949634094, + "grad_norm": 0.17077326774597168, + "learning_rate": 1.9522881905619794e-05, + "loss": 1.2144, + "step": 3552 + }, + { + "epoch": 1.3228002001140184, + "grad_norm": 0.16986922919750214, + "learning_rate": 1.9522510920434853e-05, + "loss": 1.2222, + "step": 3553 + }, + { + "epoch": 1.3231725052646275, + "grad_norm": 0.17960046231746674, + "learning_rate": 1.9522139794603018e-05, + "loss": 1.2397, + "step": 3554 + }, + { + "epoch": 1.3235448104152365, + "grad_norm": 0.17524504661560059, + "learning_rate": 1.9521768528129782e-05, + "loss": 1.2136, + "step": 3555 + }, + { + "epoch": 1.3239171155658456, + "grad_norm": 0.16311313211917877, + "learning_rate": 1.952139712102062e-05, + "loss": 1.2182, + "step": 3556 + }, + { + "epoch": 1.3242894207164548, + "grad_norm": 0.2031218409538269, + "learning_rate": 1.952102557328102e-05, + "loss": 1.211, + "step": 3557 + }, + { + "epoch": 1.3246617258670637, + "grad_norm": 0.18743805587291718, + "learning_rate": 1.952065388491647e-05, + "loss": 1.2244, + "step": 3558 + }, + { + "epoch": 1.325034031017673, + "grad_norm": 0.17010486125946045, + "learning_rate": 1.952028205593246e-05, + "loss": 1.2096, + "step": 3559 + }, + { + "epoch": 1.325406336168282, + "grad_norm": 0.16542263329029083, + "learning_rate": 1.951991008633448e-05, + "loss": 1.233, + "step": 3560 + }, + { + "epoch": 1.325778641318891, + "grad_norm": 0.1798142045736313, + "learning_rate": 1.9519537976128025e-05, + "loss": 1.2222, + "step": 3561 + }, + { + "epoch": 1.3261509464695, + "grad_norm": 0.16815944015979767, + "learning_rate": 1.9519165725318594e-05, + "loss": 1.2215, + "step": 3562 + }, + { + "epoch": 1.326523251620109, + "grad_norm": 0.16989260911941528, + "learning_rate": 1.951879333391168e-05, + "loss": 1.2323, + "step": 3563 + }, + { + "epoch": 1.3268955567707181, + "grad_norm": 0.17635375261306763, + "learning_rate": 1.9518420801912787e-05, + "loss": 1.2118, + "step": 3564 + }, + { + "epoch": 1.3272678619213272, + "grad_norm": 0.1696915477514267, + "learning_rate": 1.951804812932742e-05, + "loss": 1.2136, + "step": 3565 + }, + { + "epoch": 1.3276401670719364, + "grad_norm": 0.17209488153457642, + "learning_rate": 1.9517675316161074e-05, + "loss": 1.2209, + "step": 3566 + }, + { + "epoch": 1.3280124722225455, + "grad_norm": 0.1687251627445221, + "learning_rate": 1.951730236241926e-05, + "loss": 1.2153, + "step": 3567 + }, + { + "epoch": 1.3283847773731545, + "grad_norm": 0.17375510931015015, + "learning_rate": 1.9516929268107492e-05, + "loss": 1.2257, + "step": 3568 + }, + { + "epoch": 1.3287570825237636, + "grad_norm": 0.17510513961315155, + "learning_rate": 1.9516556033231276e-05, + "loss": 1.2204, + "step": 3569 + }, + { + "epoch": 1.3291293876743726, + "grad_norm": 0.16425535082817078, + "learning_rate": 1.9516182657796123e-05, + "loss": 1.2278, + "step": 3570 + }, + { + "epoch": 1.3295016928249817, + "grad_norm": 0.17641210556030273, + "learning_rate": 1.9515809141807547e-05, + "loss": 1.2086, + "step": 3571 + }, + { + "epoch": 1.3298739979755907, + "grad_norm": 0.1740121841430664, + "learning_rate": 1.951543548527107e-05, + "loss": 1.2131, + "step": 3572 + }, + { + "epoch": 1.3302463031261997, + "grad_norm": 0.16927821934223175, + "learning_rate": 1.951506168819221e-05, + "loss": 1.2278, + "step": 3573 + }, + { + "epoch": 1.3306186082768088, + "grad_norm": 0.17070408165454865, + "learning_rate": 1.9514687750576483e-05, + "loss": 1.2097, + "step": 3574 + }, + { + "epoch": 1.330990913427418, + "grad_norm": 0.17762641608715057, + "learning_rate": 1.9514313672429414e-05, + "loss": 1.2277, + "step": 3575 + }, + { + "epoch": 1.331363218578027, + "grad_norm": 0.1731291264295578, + "learning_rate": 1.951393945375653e-05, + "loss": 1.2455, + "step": 3576 + }, + { + "epoch": 1.3317355237286361, + "grad_norm": 0.17669746279716492, + "learning_rate": 1.9513565094563358e-05, + "loss": 1.2225, + "step": 3577 + }, + { + "epoch": 1.3321078288792452, + "grad_norm": 0.16807028651237488, + "learning_rate": 1.9513190594855427e-05, + "loss": 1.2223, + "step": 3578 + }, + { + "epoch": 1.3324801340298542, + "grad_norm": 0.16439288854599, + "learning_rate": 1.9512815954638266e-05, + "loss": 1.2235, + "step": 3579 + }, + { + "epoch": 1.3328524391804633, + "grad_norm": 0.17088323831558228, + "learning_rate": 1.9512441173917415e-05, + "loss": 1.2066, + "step": 3580 + }, + { + "epoch": 1.3332247443310723, + "grad_norm": 0.1691887080669403, + "learning_rate": 1.95120662526984e-05, + "loss": 1.2274, + "step": 3581 + }, + { + "epoch": 1.3335970494816813, + "grad_norm": 0.17315596342086792, + "learning_rate": 1.9511691190986767e-05, + "loss": 1.2285, + "step": 3582 + }, + { + "epoch": 1.3339693546322904, + "grad_norm": 0.1639329493045807, + "learning_rate": 1.9511315988788046e-05, + "loss": 1.2236, + "step": 3583 + }, + { + "epoch": 1.3343416597828996, + "grad_norm": 0.16537517309188843, + "learning_rate": 1.951094064610779e-05, + "loss": 1.2046, + "step": 3584 + }, + { + "epoch": 1.3347139649335087, + "grad_norm": 0.173024520277977, + "learning_rate": 1.9510565162951538e-05, + "loss": 1.2142, + "step": 3585 + }, + { + "epoch": 1.3350862700841177, + "grad_norm": 0.1649821549654007, + "learning_rate": 1.9510189539324832e-05, + "loss": 1.225, + "step": 3586 + }, + { + "epoch": 1.3354585752347268, + "grad_norm": 0.1695072054862976, + "learning_rate": 1.9509813775233227e-05, + "loss": 1.2062, + "step": 3587 + }, + { + "epoch": 1.3358308803853358, + "grad_norm": 0.16297145187854767, + "learning_rate": 1.9509437870682268e-05, + "loss": 1.2252, + "step": 3588 + }, + { + "epoch": 1.3362031855359449, + "grad_norm": 0.1728506088256836, + "learning_rate": 1.9509061825677508e-05, + "loss": 1.2158, + "step": 3589 + }, + { + "epoch": 1.336575490686554, + "grad_norm": 0.16616937518119812, + "learning_rate": 1.95086856402245e-05, + "loss": 1.2066, + "step": 3590 + }, + { + "epoch": 1.3369477958371632, + "grad_norm": 0.16450245678424835, + "learning_rate": 1.9508309314328803e-05, + "loss": 1.2085, + "step": 3591 + }, + { + "epoch": 1.337320100987772, + "grad_norm": 0.17758624255657196, + "learning_rate": 1.9507932847995974e-05, + "loss": 1.2121, + "step": 3592 + }, + { + "epoch": 1.3376924061383813, + "grad_norm": 0.17107117176055908, + "learning_rate": 1.9507556241231574e-05, + "loss": 1.2223, + "step": 3593 + }, + { + "epoch": 1.3380647112889903, + "grad_norm": 0.17417475581169128, + "learning_rate": 1.9507179494041166e-05, + "loss": 1.2306, + "step": 3594 + }, + { + "epoch": 1.3384370164395993, + "grad_norm": 0.16709604859352112, + "learning_rate": 1.9506802606430314e-05, + "loss": 1.2262, + "step": 3595 + }, + { + "epoch": 1.3388093215902084, + "grad_norm": 0.1753034144639969, + "learning_rate": 1.950642557840458e-05, + "loss": 1.2329, + "step": 3596 + }, + { + "epoch": 1.3391816267408174, + "grad_norm": 0.16027675569057465, + "learning_rate": 1.950604840996954e-05, + "loss": 1.2315, + "step": 3597 + }, + { + "epoch": 1.3395539318914265, + "grad_norm": 0.17138755321502686, + "learning_rate": 1.950567110113076e-05, + "loss": 1.2045, + "step": 3598 + }, + { + "epoch": 1.3399262370420355, + "grad_norm": 0.1684652864933014, + "learning_rate": 1.9505293651893817e-05, + "loss": 1.2353, + "step": 3599 + }, + { + "epoch": 1.3402985421926448, + "grad_norm": 0.178995743393898, + "learning_rate": 1.9504916062264285e-05, + "loss": 1.2311, + "step": 3600 + }, + { + "epoch": 1.3406708473432536, + "grad_norm": 0.17695268988609314, + "learning_rate": 1.950453833224773e-05, + "loss": 1.2295, + "step": 3601 + }, + { + "epoch": 1.3410431524938629, + "grad_norm": 0.16872268915176392, + "learning_rate": 1.950416046184975e-05, + "loss": 1.2149, + "step": 3602 + }, + { + "epoch": 1.341415457644472, + "grad_norm": 0.1644962728023529, + "learning_rate": 1.9503782451075912e-05, + "loss": 1.2012, + "step": 3603 + }, + { + "epoch": 1.341787762795081, + "grad_norm": 0.16995151340961456, + "learning_rate": 1.9503404299931806e-05, + "loss": 1.2115, + "step": 3604 + }, + { + "epoch": 1.34216006794569, + "grad_norm": 0.1838231086730957, + "learning_rate": 1.950302600842301e-05, + "loss": 1.2096, + "step": 3605 + }, + { + "epoch": 1.342532373096299, + "grad_norm": 0.1684899926185608, + "learning_rate": 1.950264757655512e-05, + "loss": 1.2167, + "step": 3606 + }, + { + "epoch": 1.342904678246908, + "grad_norm": 0.17310784757137299, + "learning_rate": 1.9502269004333722e-05, + "loss": 1.2252, + "step": 3607 + }, + { + "epoch": 1.343276983397517, + "grad_norm": 0.17696700990200043, + "learning_rate": 1.950189029176441e-05, + "loss": 1.2212, + "step": 3608 + }, + { + "epoch": 1.3436492885481264, + "grad_norm": 0.16353021562099457, + "learning_rate": 1.9501511438852767e-05, + "loss": 1.2122, + "step": 3609 + }, + { + "epoch": 1.3440215936987352, + "grad_norm": 0.1800825595855713, + "learning_rate": 1.95011324456044e-05, + "loss": 1.222, + "step": 3610 + }, + { + "epoch": 1.3443938988493445, + "grad_norm": 0.17191185057163239, + "learning_rate": 1.9500753312024904e-05, + "loss": 1.2084, + "step": 3611 + }, + { + "epoch": 1.3447662039999535, + "grad_norm": 0.16857829689979553, + "learning_rate": 1.9500374038119877e-05, + "loss": 1.219, + "step": 3612 + }, + { + "epoch": 1.3451385091505625, + "grad_norm": 0.16424717009067535, + "learning_rate": 1.949999462389492e-05, + "loss": 1.2135, + "step": 3613 + }, + { + "epoch": 1.3455108143011716, + "grad_norm": 0.1720798909664154, + "learning_rate": 1.9499615069355644e-05, + "loss": 1.2162, + "step": 3614 + }, + { + "epoch": 1.3458831194517806, + "grad_norm": 0.1672721654176712, + "learning_rate": 1.9499235374507646e-05, + "loss": 1.2197, + "step": 3615 + }, + { + "epoch": 1.3462554246023897, + "grad_norm": 0.17355121672153473, + "learning_rate": 1.9498855539356534e-05, + "loss": 1.2289, + "step": 3616 + }, + { + "epoch": 1.3466277297529987, + "grad_norm": 0.17053773999214172, + "learning_rate": 1.9498475563907927e-05, + "loss": 1.2154, + "step": 3617 + }, + { + "epoch": 1.347000034903608, + "grad_norm": 0.17740602791309357, + "learning_rate": 1.9498095448167435e-05, + "loss": 1.2198, + "step": 3618 + }, + { + "epoch": 1.3473723400542168, + "grad_norm": 0.16917094588279724, + "learning_rate": 1.949771519214066e-05, + "loss": 1.2174, + "step": 3619 + }, + { + "epoch": 1.347744645204826, + "grad_norm": 0.20187613368034363, + "learning_rate": 1.9497334795833235e-05, + "loss": 1.213, + "step": 3620 + }, + { + "epoch": 1.348116950355435, + "grad_norm": 0.1762245148420334, + "learning_rate": 1.949695425925077e-05, + "loss": 1.1968, + "step": 3621 + }, + { + "epoch": 1.3484892555060441, + "grad_norm": 0.17733825743198395, + "learning_rate": 1.9496573582398884e-05, + "loss": 1.2316, + "step": 3622 + }, + { + "epoch": 1.3488615606566532, + "grad_norm": 0.16012229025363922, + "learning_rate": 1.9496192765283205e-05, + "loss": 1.2156, + "step": 3623 + }, + { + "epoch": 1.3492338658072622, + "grad_norm": 0.17193962633609772, + "learning_rate": 1.9495811807909353e-05, + "loss": 1.229, + "step": 3624 + }, + { + "epoch": 1.3496061709578713, + "grad_norm": 0.17869403958320618, + "learning_rate": 1.9495430710282956e-05, + "loss": 1.2101, + "step": 3625 + }, + { + "epoch": 1.3499784761084803, + "grad_norm": 0.1690213680267334, + "learning_rate": 1.9495049472409644e-05, + "loss": 1.2194, + "step": 3626 + }, + { + "epoch": 1.3503507812590896, + "grad_norm": 0.16633479297161102, + "learning_rate": 1.9494668094295046e-05, + "loss": 1.2069, + "step": 3627 + }, + { + "epoch": 1.3507230864096986, + "grad_norm": 0.17453880608081818, + "learning_rate": 1.94942865759448e-05, + "loss": 1.2156, + "step": 3628 + }, + { + "epoch": 1.3510953915603077, + "grad_norm": 0.176530122756958, + "learning_rate": 1.9493904917364533e-05, + "loss": 1.201, + "step": 3629 + }, + { + "epoch": 1.3514676967109167, + "grad_norm": 0.1691957712173462, + "learning_rate": 1.9493523118559888e-05, + "loss": 1.2361, + "step": 3630 + }, + { + "epoch": 1.3518400018615258, + "grad_norm": 0.16371867060661316, + "learning_rate": 1.9493141179536504e-05, + "loss": 1.2186, + "step": 3631 + }, + { + "epoch": 1.3522123070121348, + "grad_norm": 0.17232230305671692, + "learning_rate": 1.949275910030002e-05, + "loss": 1.2164, + "step": 3632 + }, + { + "epoch": 1.3525846121627438, + "grad_norm": 0.17034488916397095, + "learning_rate": 1.9492376880856075e-05, + "loss": 1.2146, + "step": 3633 + }, + { + "epoch": 1.3529569173133529, + "grad_norm": 0.16732533276081085, + "learning_rate": 1.9491994521210326e-05, + "loss": 1.2249, + "step": 3634 + }, + { + "epoch": 1.353329222463962, + "grad_norm": 0.16528116166591644, + "learning_rate": 1.949161202136841e-05, + "loss": 1.2163, + "step": 3635 + }, + { + "epoch": 1.3537015276145712, + "grad_norm": 0.1649235486984253, + "learning_rate": 1.9491229381335978e-05, + "loss": 1.2159, + "step": 3636 + }, + { + "epoch": 1.3540738327651802, + "grad_norm": 0.16926534473896027, + "learning_rate": 1.9490846601118685e-05, + "loss": 1.211, + "step": 3637 + }, + { + "epoch": 1.3544461379157893, + "grad_norm": 0.17798763513565063, + "learning_rate": 1.9490463680722183e-05, + "loss": 1.2198, + "step": 3638 + }, + { + "epoch": 1.3548184430663983, + "grad_norm": 0.1720110923051834, + "learning_rate": 1.9490080620152133e-05, + "loss": 1.2144, + "step": 3639 + }, + { + "epoch": 1.3551907482170074, + "grad_norm": 0.16815277934074402, + "learning_rate": 1.948969741941418e-05, + "loss": 1.2294, + "step": 3640 + }, + { + "epoch": 1.3555630533676164, + "grad_norm": 0.1696767359972, + "learning_rate": 1.9489314078514e-05, + "loss": 1.2161, + "step": 3641 + }, + { + "epoch": 1.3559353585182254, + "grad_norm": 0.17275430262088776, + "learning_rate": 1.9488930597457242e-05, + "loss": 1.2219, + "step": 3642 + }, + { + "epoch": 1.3563076636688347, + "grad_norm": 0.17057517170906067, + "learning_rate": 1.9488546976249572e-05, + "loss": 1.2106, + "step": 3643 + }, + { + "epoch": 1.3566799688194435, + "grad_norm": 0.16843107342720032, + "learning_rate": 1.9488163214896666e-05, + "loss": 1.2274, + "step": 3644 + }, + { + "epoch": 1.3570522739700528, + "grad_norm": 0.1690577119588852, + "learning_rate": 1.948777931340418e-05, + "loss": 1.2187, + "step": 3645 + }, + { + "epoch": 1.3574245791206618, + "grad_norm": 0.17254535853862762, + "learning_rate": 1.9487395271777787e-05, + "loss": 1.2228, + "step": 3646 + }, + { + "epoch": 1.3577968842712709, + "grad_norm": 0.17045117914676666, + "learning_rate": 1.9487011090023165e-05, + "loss": 1.195, + "step": 3647 + }, + { + "epoch": 1.35816918942188, + "grad_norm": 0.17185580730438232, + "learning_rate": 1.9486626768145986e-05, + "loss": 1.2162, + "step": 3648 + }, + { + "epoch": 1.358541494572489, + "grad_norm": 0.1752527505159378, + "learning_rate": 1.948624230615192e-05, + "loss": 1.2336, + "step": 3649 + }, + { + "epoch": 1.358913799723098, + "grad_norm": 0.1720474511384964, + "learning_rate": 1.9485857704046652e-05, + "loss": 1.2268, + "step": 3650 + }, + { + "epoch": 1.359286104873707, + "grad_norm": 0.1814902275800705, + "learning_rate": 1.9485472961835865e-05, + "loss": 1.2179, + "step": 3651 + }, + { + "epoch": 1.3596584100243163, + "grad_norm": 0.16393816471099854, + "learning_rate": 1.948508807952523e-05, + "loss": 1.2179, + "step": 3652 + }, + { + "epoch": 1.3600307151749251, + "grad_norm": 0.17267872393131256, + "learning_rate": 1.9484703057120444e-05, + "loss": 1.2152, + "step": 3653 + }, + { + "epoch": 1.3604030203255344, + "grad_norm": 0.1774880588054657, + "learning_rate": 1.948431789462719e-05, + "loss": 1.2323, + "step": 3654 + }, + { + "epoch": 1.3607753254761434, + "grad_norm": 0.17601603269577026, + "learning_rate": 1.9483932592051154e-05, + "loss": 1.2211, + "step": 3655 + }, + { + "epoch": 1.3611476306267525, + "grad_norm": 0.17215831577777863, + "learning_rate": 1.948354714939803e-05, + "loss": 1.2171, + "step": 3656 + }, + { + "epoch": 1.3615199357773615, + "grad_norm": 0.16573548316955566, + "learning_rate": 1.948316156667351e-05, + "loss": 1.2386, + "step": 3657 + }, + { + "epoch": 1.3618922409279706, + "grad_norm": 0.17070265114307404, + "learning_rate": 1.9482775843883287e-05, + "loss": 1.2306, + "step": 3658 + }, + { + "epoch": 1.3622645460785796, + "grad_norm": 0.17669783532619476, + "learning_rate": 1.948238998103306e-05, + "loss": 1.2186, + "step": 3659 + }, + { + "epoch": 1.3626368512291887, + "grad_norm": 0.17972193658351898, + "learning_rate": 1.948200397812853e-05, + "loss": 1.2208, + "step": 3660 + }, + { + "epoch": 1.363009156379798, + "grad_norm": 0.16259367763996124, + "learning_rate": 1.9481617835175394e-05, + "loss": 1.1995, + "step": 3661 + }, + { + "epoch": 1.3633814615304067, + "grad_norm": 0.16560673713684082, + "learning_rate": 1.948123155217936e-05, + "loss": 1.2124, + "step": 3662 + }, + { + "epoch": 1.363753766681016, + "grad_norm": 0.17198215425014496, + "learning_rate": 1.948084512914613e-05, + "loss": 1.223, + "step": 3663 + }, + { + "epoch": 1.364126071831625, + "grad_norm": 0.16872192919254303, + "learning_rate": 1.948045856608141e-05, + "loss": 1.213, + "step": 3664 + }, + { + "epoch": 1.364498376982234, + "grad_norm": 0.17418809235095978, + "learning_rate": 1.9480071862990917e-05, + "loss": 1.2059, + "step": 3665 + }, + { + "epoch": 1.3648706821328431, + "grad_norm": 0.1725587695837021, + "learning_rate": 1.9479685019880356e-05, + "loss": 1.2234, + "step": 3666 + }, + { + "epoch": 1.3652429872834522, + "grad_norm": 0.16984142363071442, + "learning_rate": 1.947929803675544e-05, + "loss": 1.2106, + "step": 3667 + }, + { + "epoch": 1.3656152924340612, + "grad_norm": 0.1837463080883026, + "learning_rate": 1.9478910913621888e-05, + "loss": 1.2143, + "step": 3668 + }, + { + "epoch": 1.3659875975846703, + "grad_norm": 0.1684904843568802, + "learning_rate": 1.9478523650485416e-05, + "loss": 1.2236, + "step": 3669 + }, + { + "epoch": 1.3663599027352795, + "grad_norm": 0.1728762686252594, + "learning_rate": 1.9478136247351745e-05, + "loss": 1.205, + "step": 3670 + }, + { + "epoch": 1.3667322078858883, + "grad_norm": 0.16816040873527527, + "learning_rate": 1.9477748704226597e-05, + "loss": 1.2231, + "step": 3671 + }, + { + "epoch": 1.3671045130364976, + "grad_norm": 0.17051219940185547, + "learning_rate": 1.9477361021115695e-05, + "loss": 1.2277, + "step": 3672 + }, + { + "epoch": 1.3674768181871066, + "grad_norm": 0.16704906523227692, + "learning_rate": 1.9476973198024766e-05, + "loss": 1.2166, + "step": 3673 + }, + { + "epoch": 1.3678491233377157, + "grad_norm": 0.16872693598270416, + "learning_rate": 1.9476585234959538e-05, + "loss": 1.2136, + "step": 3674 + }, + { + "epoch": 1.3682214284883247, + "grad_norm": 0.16978906095027924, + "learning_rate": 1.947619713192574e-05, + "loss": 1.209, + "step": 3675 + }, + { + "epoch": 1.3685937336389338, + "grad_norm": 0.16473500430583954, + "learning_rate": 1.9475808888929104e-05, + "loss": 1.2284, + "step": 3676 + }, + { + "epoch": 1.3689660387895428, + "grad_norm": 0.16459238529205322, + "learning_rate": 1.9475420505975366e-05, + "loss": 1.2103, + "step": 3677 + }, + { + "epoch": 1.3693383439401519, + "grad_norm": 0.1704069972038269, + "learning_rate": 1.9475031983070264e-05, + "loss": 1.2207, + "step": 3678 + }, + { + "epoch": 1.3697106490907611, + "grad_norm": 0.18187867105007172, + "learning_rate": 1.9474643320219534e-05, + "loss": 1.1893, + "step": 3679 + }, + { + "epoch": 1.3700829542413702, + "grad_norm": 0.1664721667766571, + "learning_rate": 1.9474254517428912e-05, + "loss": 1.23, + "step": 3680 + }, + { + "epoch": 1.3704552593919792, + "grad_norm": 0.17463192343711853, + "learning_rate": 1.947386557470415e-05, + "loss": 1.2172, + "step": 3681 + }, + { + "epoch": 1.3708275645425883, + "grad_norm": 0.17818517982959747, + "learning_rate": 1.9473476492050984e-05, + "loss": 1.2206, + "step": 3682 + }, + { + "epoch": 1.3711998696931973, + "grad_norm": 0.1713617891073227, + "learning_rate": 1.947308726947517e-05, + "loss": 1.2358, + "step": 3683 + }, + { + "epoch": 1.3715721748438063, + "grad_norm": 0.16868345439434052, + "learning_rate": 1.947269790698245e-05, + "loss": 1.2138, + "step": 3684 + }, + { + "epoch": 1.3719444799944154, + "grad_norm": 0.17152869701385498, + "learning_rate": 1.9472308404578574e-05, + "loss": 1.2142, + "step": 3685 + }, + { + "epoch": 1.3723167851450244, + "grad_norm": 0.16501954197883606, + "learning_rate": 1.9471918762269298e-05, + "loss": 1.2098, + "step": 3686 + }, + { + "epoch": 1.3726890902956335, + "grad_norm": 0.1698669195175171, + "learning_rate": 1.9471528980060378e-05, + "loss": 1.2238, + "step": 3687 + }, + { + "epoch": 1.3730613954462427, + "grad_norm": 0.17526181042194366, + "learning_rate": 1.9471139057957566e-05, + "loss": 1.226, + "step": 3688 + }, + { + "epoch": 1.3734337005968518, + "grad_norm": 0.1682656854391098, + "learning_rate": 1.947074899596663e-05, + "loss": 1.2115, + "step": 3689 + }, + { + "epoch": 1.3738060057474608, + "grad_norm": 0.16698940098285675, + "learning_rate": 1.947035879409332e-05, + "loss": 1.1977, + "step": 3690 + }, + { + "epoch": 1.3741783108980699, + "grad_norm": 0.17004626989364624, + "learning_rate": 1.9469968452343408e-05, + "loss": 1.2197, + "step": 3691 + }, + { + "epoch": 1.374550616048679, + "grad_norm": 0.17727632820606232, + "learning_rate": 1.9469577970722656e-05, + "loss": 1.2242, + "step": 3692 + }, + { + "epoch": 1.374922921199288, + "grad_norm": 0.16782982647418976, + "learning_rate": 1.9469187349236832e-05, + "loss": 1.2111, + "step": 3693 + }, + { + "epoch": 1.375295226349897, + "grad_norm": 0.17160236835479736, + "learning_rate": 1.9468796587891706e-05, + "loss": 1.2168, + "step": 3694 + }, + { + "epoch": 1.375667531500506, + "grad_norm": 0.16954834759235382, + "learning_rate": 1.9468405686693044e-05, + "loss": 1.2077, + "step": 3695 + }, + { + "epoch": 1.376039836651115, + "grad_norm": 0.168158158659935, + "learning_rate": 1.9468014645646628e-05, + "loss": 1.2249, + "step": 3696 + }, + { + "epoch": 1.3764121418017243, + "grad_norm": 0.16404445469379425, + "learning_rate": 1.946762346475823e-05, + "loss": 1.2158, + "step": 3697 + }, + { + "epoch": 1.3767844469523334, + "grad_norm": 0.1699007749557495, + "learning_rate": 1.946723214403363e-05, + "loss": 1.2043, + "step": 3698 + }, + { + "epoch": 1.3771567521029424, + "grad_norm": 0.17556270956993103, + "learning_rate": 1.9466840683478597e-05, + "loss": 1.2167, + "step": 3699 + }, + { + "epoch": 1.3775290572535515, + "grad_norm": 0.1715071052312851, + "learning_rate": 1.9466449083098927e-05, + "loss": 1.2228, + "step": 3700 + }, + { + "epoch": 1.3779013624041605, + "grad_norm": 0.1757829487323761, + "learning_rate": 1.94660573429004e-05, + "loss": 1.232, + "step": 3701 + }, + { + "epoch": 1.3782736675547695, + "grad_norm": 0.1744741052389145, + "learning_rate": 1.9465665462888798e-05, + "loss": 1.2026, + "step": 3702 + }, + { + "epoch": 1.3786459727053786, + "grad_norm": 0.17323531210422516, + "learning_rate": 1.9465273443069908e-05, + "loss": 1.2306, + "step": 3703 + }, + { + "epoch": 1.3790182778559879, + "grad_norm": 0.16859839856624603, + "learning_rate": 1.9464881283449525e-05, + "loss": 1.2169, + "step": 3704 + }, + { + "epoch": 1.3793905830065967, + "grad_norm": 0.1696728765964508, + "learning_rate": 1.9464488984033442e-05, + "loss": 1.2084, + "step": 3705 + }, + { + "epoch": 1.379762888157206, + "grad_norm": 0.16950103640556335, + "learning_rate": 1.946409654482745e-05, + "loss": 1.2233, + "step": 3706 + }, + { + "epoch": 1.380135193307815, + "grad_norm": 0.17010366916656494, + "learning_rate": 1.9463703965837344e-05, + "loss": 1.2042, + "step": 3707 + }, + { + "epoch": 1.380507498458424, + "grad_norm": 0.16800427436828613, + "learning_rate": 1.9463311247068927e-05, + "loss": 1.2012, + "step": 3708 + }, + { + "epoch": 1.380879803609033, + "grad_norm": 0.16539591550827026, + "learning_rate": 1.9462918388527994e-05, + "loss": 1.2236, + "step": 3709 + }, + { + "epoch": 1.381252108759642, + "grad_norm": 0.17388111352920532, + "learning_rate": 1.946252539022035e-05, + "loss": 1.2115, + "step": 3710 + }, + { + "epoch": 1.3816244139102511, + "grad_norm": 0.17521414160728455, + "learning_rate": 1.94621322521518e-05, + "loss": 1.2319, + "step": 3711 + }, + { + "epoch": 1.3819967190608602, + "grad_norm": 0.16813969612121582, + "learning_rate": 1.9461738974328156e-05, + "loss": 1.2158, + "step": 3712 + }, + { + "epoch": 1.3823690242114695, + "grad_norm": 0.16460061073303223, + "learning_rate": 1.9461345556755216e-05, + "loss": 1.2084, + "step": 3713 + }, + { + "epoch": 1.3827413293620783, + "grad_norm": 0.18938884139060974, + "learning_rate": 1.9460951999438798e-05, + "loss": 1.2169, + "step": 3714 + }, + { + "epoch": 1.3831136345126875, + "grad_norm": 0.19839580357074738, + "learning_rate": 1.946055830238471e-05, + "loss": 1.221, + "step": 3715 + }, + { + "epoch": 1.3834859396632966, + "grad_norm": 0.1847252994775772, + "learning_rate": 1.9460164465598773e-05, + "loss": 1.21, + "step": 3716 + }, + { + "epoch": 1.3838582448139056, + "grad_norm": 0.1856468766927719, + "learning_rate": 1.94597704890868e-05, + "loss": 1.222, + "step": 3717 + }, + { + "epoch": 1.3842305499645147, + "grad_norm": 0.1739731878042221, + "learning_rate": 1.945937637285461e-05, + "loss": 1.1989, + "step": 3718 + }, + { + "epoch": 1.3846028551151237, + "grad_norm": 0.16499097645282745, + "learning_rate": 1.945898211690802e-05, + "loss": 1.2297, + "step": 3719 + }, + { + "epoch": 1.3849751602657328, + "grad_norm": 0.17512036859989166, + "learning_rate": 1.9458587721252862e-05, + "loss": 1.2238, + "step": 3720 + }, + { + "epoch": 1.3853474654163418, + "grad_norm": 0.1747319996356964, + "learning_rate": 1.9458193185894957e-05, + "loss": 1.214, + "step": 3721 + }, + { + "epoch": 1.385719770566951, + "grad_norm": 0.17503361403942108, + "learning_rate": 1.9457798510840132e-05, + "loss": 1.225, + "step": 3722 + }, + { + "epoch": 1.3860920757175599, + "grad_norm": 0.16246047616004944, + "learning_rate": 1.9457403696094216e-05, + "loss": 1.2121, + "step": 3723 + }, + { + "epoch": 1.3864643808681691, + "grad_norm": 0.17145591974258423, + "learning_rate": 1.945700874166304e-05, + "loss": 1.1996, + "step": 3724 + }, + { + "epoch": 1.3868366860187782, + "grad_norm": 0.17188484966754913, + "learning_rate": 1.945661364755244e-05, + "loss": 1.217, + "step": 3725 + }, + { + "epoch": 1.3872089911693872, + "grad_norm": 0.17199209332466125, + "learning_rate": 1.945621841376825e-05, + "loss": 1.2215, + "step": 3726 + }, + { + "epoch": 1.3875812963199963, + "grad_norm": 0.17529122531414032, + "learning_rate": 1.945582304031631e-05, + "loss": 1.2076, + "step": 3727 + }, + { + "epoch": 1.3879536014706053, + "grad_norm": 0.16724155843257904, + "learning_rate": 1.945542752720245e-05, + "loss": 1.2258, + "step": 3728 + }, + { + "epoch": 1.3883259066212144, + "grad_norm": 0.16579319536685944, + "learning_rate": 1.9455031874432526e-05, + "loss": 1.2206, + "step": 3729 + }, + { + "epoch": 1.3886982117718234, + "grad_norm": 0.17617808282375336, + "learning_rate": 1.9454636082012373e-05, + "loss": 1.2258, + "step": 3730 + }, + { + "epoch": 1.3890705169224327, + "grad_norm": 0.17041444778442383, + "learning_rate": 1.9454240149947834e-05, + "loss": 1.2164, + "step": 3731 + }, + { + "epoch": 1.3894428220730415, + "grad_norm": 0.17237654328346252, + "learning_rate": 1.9453844078244767e-05, + "loss": 1.2238, + "step": 3732 + }, + { + "epoch": 1.3898151272236507, + "grad_norm": 0.17883872985839844, + "learning_rate": 1.9453447866909013e-05, + "loss": 1.2267, + "step": 3733 + }, + { + "epoch": 1.3901874323742598, + "grad_norm": 0.1674152910709381, + "learning_rate": 1.9453051515946428e-05, + "loss": 1.2181, + "step": 3734 + }, + { + "epoch": 1.3905597375248688, + "grad_norm": 0.16891363263130188, + "learning_rate": 1.9452655025362865e-05, + "loss": 1.2301, + "step": 3735 + }, + { + "epoch": 1.3909320426754779, + "grad_norm": 0.16902455687522888, + "learning_rate": 1.9452258395164182e-05, + "loss": 1.2097, + "step": 3736 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.1644563525915146, + "learning_rate": 1.9451861625356235e-05, + "loss": 1.2291, + "step": 3737 + }, + { + "epoch": 1.391676652976696, + "grad_norm": 0.1637285202741623, + "learning_rate": 1.9451464715944885e-05, + "loss": 1.2201, + "step": 3738 + }, + { + "epoch": 1.392048958127305, + "grad_norm": 0.1716940551996231, + "learning_rate": 1.945106766693599e-05, + "loss": 1.2151, + "step": 3739 + }, + { + "epoch": 1.3924212632779143, + "grad_norm": 0.17560310661792755, + "learning_rate": 1.9450670478335424e-05, + "loss": 1.2121, + "step": 3740 + }, + { + "epoch": 1.3927935684285233, + "grad_norm": 0.164300337433815, + "learning_rate": 1.9450273150149047e-05, + "loss": 1.2194, + "step": 3741 + }, + { + "epoch": 1.3931658735791324, + "grad_norm": 0.17114891111850739, + "learning_rate": 1.944987568238273e-05, + "loss": 1.201, + "step": 3742 + }, + { + "epoch": 1.3935381787297414, + "grad_norm": 0.1734459102153778, + "learning_rate": 1.944947807504234e-05, + "loss": 1.221, + "step": 3743 + }, + { + "epoch": 1.3939104838803504, + "grad_norm": 0.16712883114814758, + "learning_rate": 1.944908032813375e-05, + "loss": 1.2168, + "step": 3744 + }, + { + "epoch": 1.3942827890309595, + "grad_norm": 0.17574042081832886, + "learning_rate": 1.944868244166284e-05, + "loss": 1.2226, + "step": 3745 + }, + { + "epoch": 1.3946550941815685, + "grad_norm": 0.167069673538208, + "learning_rate": 1.944828441563548e-05, + "loss": 1.1964, + "step": 3746 + }, + { + "epoch": 1.3950273993321776, + "grad_norm": 0.1734757423400879, + "learning_rate": 1.9447886250057556e-05, + "loss": 1.2265, + "step": 3747 + }, + { + "epoch": 1.3953997044827866, + "grad_norm": 0.1713225245475769, + "learning_rate": 1.944748794493494e-05, + "loss": 1.1987, + "step": 3748 + }, + { + "epoch": 1.3957720096333959, + "grad_norm": 0.18104128539562225, + "learning_rate": 1.9447089500273524e-05, + "loss": 1.212, + "step": 3749 + }, + { + "epoch": 1.396144314784005, + "grad_norm": 0.17931242287158966, + "learning_rate": 1.944669091607919e-05, + "loss": 1.218, + "step": 3750 + }, + { + "epoch": 1.396516619934614, + "grad_norm": 0.1705280989408493, + "learning_rate": 1.944629219235782e-05, + "loss": 1.2186, + "step": 3751 + }, + { + "epoch": 1.396888925085223, + "grad_norm": 0.1851729452610016, + "learning_rate": 1.944589332911531e-05, + "loss": 1.2086, + "step": 3752 + }, + { + "epoch": 1.397261230235832, + "grad_norm": 0.17898760735988617, + "learning_rate": 1.9445494326357548e-05, + "loss": 1.2299, + "step": 3753 + }, + { + "epoch": 1.397633535386441, + "grad_norm": 0.17653051018714905, + "learning_rate": 1.9445095184090428e-05, + "loss": 1.2135, + "step": 3754 + }, + { + "epoch": 1.3980058405370501, + "grad_norm": 0.17303214967250824, + "learning_rate": 1.9444695902319845e-05, + "loss": 1.2214, + "step": 3755 + }, + { + "epoch": 1.3983781456876592, + "grad_norm": 0.16854806244373322, + "learning_rate": 1.9444296481051697e-05, + "loss": 1.1992, + "step": 3756 + }, + { + "epoch": 1.3987504508382682, + "grad_norm": 0.17003561556339264, + "learning_rate": 1.944389692029188e-05, + "loss": 1.2143, + "step": 3757 + }, + { + "epoch": 1.3991227559888775, + "grad_norm": 0.17145571112632751, + "learning_rate": 1.9443497220046298e-05, + "loss": 1.2136, + "step": 3758 + }, + { + "epoch": 1.3994950611394865, + "grad_norm": 0.1773233562707901, + "learning_rate": 1.9443097380320855e-05, + "loss": 1.1977, + "step": 3759 + }, + { + "epoch": 1.3998673662900956, + "grad_norm": 0.1766585260629654, + "learning_rate": 1.944269740112146e-05, + "loss": 1.2133, + "step": 3760 + }, + { + "epoch": 1.4002396714407046, + "grad_norm": 0.18044057488441467, + "learning_rate": 1.9442297282454012e-05, + "loss": 1.206, + "step": 3761 + }, + { + "epoch": 1.4006119765913136, + "grad_norm": 0.1800515502691269, + "learning_rate": 1.9441897024324428e-05, + "loss": 1.2115, + "step": 3762 + }, + { + "epoch": 1.4009842817419227, + "grad_norm": 0.17421506345272064, + "learning_rate": 1.944149662673862e-05, + "loss": 1.2186, + "step": 3763 + }, + { + "epoch": 1.4013565868925317, + "grad_norm": 0.17194457352161407, + "learning_rate": 1.9441096089702495e-05, + "loss": 1.2109, + "step": 3764 + }, + { + "epoch": 1.401728892043141, + "grad_norm": 0.1739378422498703, + "learning_rate": 1.9440695413221975e-05, + "loss": 1.2066, + "step": 3765 + }, + { + "epoch": 1.4021011971937498, + "grad_norm": 0.17094124853610992, + "learning_rate": 1.9440294597302977e-05, + "loss": 1.217, + "step": 3766 + }, + { + "epoch": 1.402473502344359, + "grad_norm": 0.16742725670337677, + "learning_rate": 1.9439893641951418e-05, + "loss": 1.2153, + "step": 3767 + }, + { + "epoch": 1.4028458074949681, + "grad_norm": 0.17719261348247528, + "learning_rate": 1.9439492547173225e-05, + "loss": 1.2097, + "step": 3768 + }, + { + "epoch": 1.4032181126455772, + "grad_norm": 0.16668783128261566, + "learning_rate": 1.9439091312974317e-05, + "loss": 1.2193, + "step": 3769 + }, + { + "epoch": 1.4035904177961862, + "grad_norm": 0.17551898956298828, + "learning_rate": 1.9438689939360627e-05, + "loss": 1.2115, + "step": 3770 + }, + { + "epoch": 1.4039627229467952, + "grad_norm": 0.18291716277599335, + "learning_rate": 1.9438288426338073e-05, + "loss": 1.2085, + "step": 3771 + }, + { + "epoch": 1.4043350280974043, + "grad_norm": 0.16662460565567017, + "learning_rate": 1.9437886773912595e-05, + "loss": 1.2078, + "step": 3772 + }, + { + "epoch": 1.4047073332480133, + "grad_norm": 0.17178334295749664, + "learning_rate": 1.9437484982090122e-05, + "loss": 1.2195, + "step": 3773 + }, + { + "epoch": 1.4050796383986226, + "grad_norm": 0.17247992753982544, + "learning_rate": 1.9437083050876588e-05, + "loss": 1.2026, + "step": 3774 + }, + { + "epoch": 1.4054519435492314, + "grad_norm": 0.17090709507465363, + "learning_rate": 1.943668098027793e-05, + "loss": 1.2013, + "step": 3775 + }, + { + "epoch": 1.4058242486998407, + "grad_norm": 0.1711760014295578, + "learning_rate": 1.9436278770300082e-05, + "loss": 1.2116, + "step": 3776 + }, + { + "epoch": 1.4061965538504497, + "grad_norm": 0.17857705056667328, + "learning_rate": 1.943587642094899e-05, + "loss": 1.2237, + "step": 3777 + }, + { + "epoch": 1.4065688590010588, + "grad_norm": 0.18631206452846527, + "learning_rate": 1.9435473932230597e-05, + "loss": 1.2378, + "step": 3778 + }, + { + "epoch": 1.4069411641516678, + "grad_norm": 0.209711492061615, + "learning_rate": 1.9435071304150846e-05, + "loss": 1.2207, + "step": 3779 + }, + { + "epoch": 1.4073134693022769, + "grad_norm": 0.2716389298439026, + "learning_rate": 1.9434668536715686e-05, + "loss": 1.2311, + "step": 3780 + }, + { + "epoch": 1.407685774452886, + "grad_norm": 0.2101835310459137, + "learning_rate": 1.9434265629931063e-05, + "loss": 1.2202, + "step": 3781 + }, + { + "epoch": 1.408058079603495, + "grad_norm": 0.1864628791809082, + "learning_rate": 1.9433862583802927e-05, + "loss": 1.2012, + "step": 3782 + }, + { + "epoch": 1.4084303847541042, + "grad_norm": 0.1653883457183838, + "learning_rate": 1.9433459398337234e-05, + "loss": 1.2191, + "step": 3783 + }, + { + "epoch": 1.408802689904713, + "grad_norm": 0.17915792763233185, + "learning_rate": 1.9433056073539934e-05, + "loss": 1.2218, + "step": 3784 + }, + { + "epoch": 1.4091749950553223, + "grad_norm": 0.20036007463932037, + "learning_rate": 1.9432652609416993e-05, + "loss": 1.2227, + "step": 3785 + }, + { + "epoch": 1.4095473002059313, + "grad_norm": 0.18710802495479584, + "learning_rate": 1.943224900597436e-05, + "loss": 1.2126, + "step": 3786 + }, + { + "epoch": 1.4099196053565404, + "grad_norm": 0.18362727761268616, + "learning_rate": 1.9431845263218005e-05, + "loss": 1.219, + "step": 3787 + }, + { + "epoch": 1.4102919105071494, + "grad_norm": 0.18792852759361267, + "learning_rate": 1.943144138115389e-05, + "loss": 1.2113, + "step": 3788 + }, + { + "epoch": 1.4106642156577585, + "grad_norm": 0.1898011863231659, + "learning_rate": 1.943103735978797e-05, + "loss": 1.2172, + "step": 3789 + }, + { + "epoch": 1.4110365208083675, + "grad_norm": 0.1731170117855072, + "learning_rate": 1.9430633199126225e-05, + "loss": 1.2306, + "step": 3790 + }, + { + "epoch": 1.4114088259589765, + "grad_norm": 0.1677362322807312, + "learning_rate": 1.9430228899174617e-05, + "loss": 1.2239, + "step": 3791 + }, + { + "epoch": 1.4117811311095858, + "grad_norm": 0.16974781453609467, + "learning_rate": 1.9429824459939125e-05, + "loss": 1.2035, + "step": 3792 + }, + { + "epoch": 1.4121534362601946, + "grad_norm": 0.19337321817874908, + "learning_rate": 1.9429419881425713e-05, + "loss": 1.2091, + "step": 3793 + }, + { + "epoch": 1.412525741410804, + "grad_norm": 0.1791197508573532, + "learning_rate": 1.9429015163640363e-05, + "loss": 1.2148, + "step": 3794 + }, + { + "epoch": 1.412898046561413, + "grad_norm": 0.17587502300739288, + "learning_rate": 1.9428610306589047e-05, + "loss": 1.2287, + "step": 3795 + }, + { + "epoch": 1.413270351712022, + "grad_norm": 0.17841365933418274, + "learning_rate": 1.9428205310277752e-05, + "loss": 1.2108, + "step": 3796 + }, + { + "epoch": 1.413642656862631, + "grad_norm": 0.17350056767463684, + "learning_rate": 1.9427800174712455e-05, + "loss": 1.2035, + "step": 3797 + }, + { + "epoch": 1.41401496201324, + "grad_norm": 0.1747625321149826, + "learning_rate": 1.942739489989914e-05, + "loss": 1.2333, + "step": 3798 + }, + { + "epoch": 1.414387267163849, + "grad_norm": 0.17848855257034302, + "learning_rate": 1.9426989485843796e-05, + "loss": 1.2028, + "step": 3799 + }, + { + "epoch": 1.4147595723144581, + "grad_norm": 0.17129814624786377, + "learning_rate": 1.942658393255241e-05, + "loss": 1.228, + "step": 3800 + }, + { + "epoch": 1.4151318774650674, + "grad_norm": 0.1709514707326889, + "learning_rate": 1.942617824003097e-05, + "loss": 1.2262, + "step": 3801 + }, + { + "epoch": 1.4155041826156765, + "grad_norm": 0.16819478571414948, + "learning_rate": 1.942577240828547e-05, + "loss": 1.1978, + "step": 3802 + }, + { + "epoch": 1.4158764877662855, + "grad_norm": 0.17637069523334503, + "learning_rate": 1.94253664373219e-05, + "loss": 1.2166, + "step": 3803 + }, + { + "epoch": 1.4162487929168945, + "grad_norm": 0.1750773936510086, + "learning_rate": 1.942496032714626e-05, + "loss": 1.2239, + "step": 3804 + }, + { + "epoch": 1.4166210980675036, + "grad_norm": 0.16642986238002777, + "learning_rate": 1.9424554077764548e-05, + "loss": 1.2061, + "step": 3805 + }, + { + "epoch": 1.4169934032181126, + "grad_norm": 0.16567951440811157, + "learning_rate": 1.9424147689182765e-05, + "loss": 1.2207, + "step": 3806 + }, + { + "epoch": 1.4173657083687217, + "grad_norm": 0.17164906859397888, + "learning_rate": 1.942374116140691e-05, + "loss": 1.2136, + "step": 3807 + }, + { + "epoch": 1.4177380135193307, + "grad_norm": 0.1757207065820694, + "learning_rate": 1.942333449444299e-05, + "loss": 1.2229, + "step": 3808 + }, + { + "epoch": 1.4181103186699398, + "grad_norm": 0.1720336526632309, + "learning_rate": 1.9422927688297012e-05, + "loss": 1.2173, + "step": 3809 + }, + { + "epoch": 1.418482623820549, + "grad_norm": 0.16701281070709229, + "learning_rate": 1.9422520742974984e-05, + "loss": 1.2145, + "step": 3810 + }, + { + "epoch": 1.418854928971158, + "grad_norm": 0.16450679302215576, + "learning_rate": 1.9422113658482912e-05, + "loss": 1.2042, + "step": 3811 + }, + { + "epoch": 1.419227234121767, + "grad_norm": 0.16200435161590576, + "learning_rate": 1.942170643482682e-05, + "loss": 1.2152, + "step": 3812 + }, + { + "epoch": 1.4195995392723761, + "grad_norm": 0.1700204312801361, + "learning_rate": 1.9421299072012705e-05, + "loss": 1.2209, + "step": 3813 + }, + { + "epoch": 1.4199718444229852, + "grad_norm": 0.1691514402627945, + "learning_rate": 1.94208915700466e-05, + "loss": 1.2218, + "step": 3814 + }, + { + "epoch": 1.4203441495735942, + "grad_norm": 0.17180638015270233, + "learning_rate": 1.942048392893452e-05, + "loss": 1.2112, + "step": 3815 + }, + { + "epoch": 1.4207164547242033, + "grad_norm": 0.17067848145961761, + "learning_rate": 1.942007614868248e-05, + "loss": 1.2182, + "step": 3816 + }, + { + "epoch": 1.4210887598748123, + "grad_norm": 0.17079667747020721, + "learning_rate": 1.9419668229296507e-05, + "loss": 1.2248, + "step": 3817 + }, + { + "epoch": 1.4214610650254214, + "grad_norm": 0.16801148653030396, + "learning_rate": 1.9419260170782624e-05, + "loss": 1.217, + "step": 3818 + }, + { + "epoch": 1.4218333701760306, + "grad_norm": 0.16779641807079315, + "learning_rate": 1.9418851973146864e-05, + "loss": 1.2039, + "step": 3819 + }, + { + "epoch": 1.4222056753266397, + "grad_norm": 0.17368653416633606, + "learning_rate": 1.941844363639525e-05, + "loss": 1.2103, + "step": 3820 + }, + { + "epoch": 1.4225779804772487, + "grad_norm": 0.1701452136039734, + "learning_rate": 1.9418035160533813e-05, + "loss": 1.2069, + "step": 3821 + }, + { + "epoch": 1.4229502856278577, + "grad_norm": 0.17342039942741394, + "learning_rate": 1.9417626545568588e-05, + "loss": 1.222, + "step": 3822 + }, + { + "epoch": 1.4233225907784668, + "grad_norm": 0.173239603638649, + "learning_rate": 1.941721779150561e-05, + "loss": 1.2248, + "step": 3823 + }, + { + "epoch": 1.4236948959290758, + "grad_norm": 0.16423793137073517, + "learning_rate": 1.9416808898350915e-05, + "loss": 1.2084, + "step": 3824 + }, + { + "epoch": 1.4240672010796849, + "grad_norm": 0.16744939982891083, + "learning_rate": 1.9416399866110545e-05, + "loss": 1.2066, + "step": 3825 + }, + { + "epoch": 1.4244395062302941, + "grad_norm": 0.17226476967334747, + "learning_rate": 1.941599069479054e-05, + "loss": 1.2074, + "step": 3826 + }, + { + "epoch": 1.424811811380903, + "grad_norm": 0.16691239178180695, + "learning_rate": 1.9415581384396944e-05, + "loss": 1.2157, + "step": 3827 + }, + { + "epoch": 1.4251841165315122, + "grad_norm": 0.16759438812732697, + "learning_rate": 1.9415171934935798e-05, + "loss": 1.2214, + "step": 3828 + }, + { + "epoch": 1.4255564216821213, + "grad_norm": 0.1606573760509491, + "learning_rate": 1.9414762346413157e-05, + "loss": 1.202, + "step": 3829 + }, + { + "epoch": 1.4259287268327303, + "grad_norm": 0.16710183024406433, + "learning_rate": 1.9414352618835065e-05, + "loss": 1.226, + "step": 3830 + }, + { + "epoch": 1.4263010319833394, + "grad_norm": 0.16753865778446198, + "learning_rate": 1.9413942752207577e-05, + "loss": 1.2131, + "step": 3831 + }, + { + "epoch": 1.4266733371339484, + "grad_norm": 0.17130768299102783, + "learning_rate": 1.9413532746536744e-05, + "loss": 1.2127, + "step": 3832 + }, + { + "epoch": 1.4270456422845574, + "grad_norm": 0.1705525517463684, + "learning_rate": 1.9413122601828624e-05, + "loss": 1.2116, + "step": 3833 + }, + { + "epoch": 1.4274179474351665, + "grad_norm": 0.16685357689857483, + "learning_rate": 1.941271231808928e-05, + "loss": 1.2101, + "step": 3834 + }, + { + "epoch": 1.4277902525857757, + "grad_norm": 0.16948597133159637, + "learning_rate": 1.9412301895324755e-05, + "loss": 1.2146, + "step": 3835 + }, + { + "epoch": 1.4281625577363846, + "grad_norm": 0.16796061396598816, + "learning_rate": 1.941189133354113e-05, + "loss": 1.2168, + "step": 3836 + }, + { + "epoch": 1.4285348628869938, + "grad_norm": 0.1637044996023178, + "learning_rate": 1.9411480632744454e-05, + "loss": 1.2151, + "step": 3837 + }, + { + "epoch": 1.4289071680376029, + "grad_norm": 0.16353152692317963, + "learning_rate": 1.9411069792940803e-05, + "loss": 1.2055, + "step": 3838 + }, + { + "epoch": 1.429279473188212, + "grad_norm": 0.17246000468730927, + "learning_rate": 1.9410658814136243e-05, + "loss": 1.2039, + "step": 3839 + }, + { + "epoch": 1.429651778338821, + "grad_norm": 0.17234675586223602, + "learning_rate": 1.9410247696336842e-05, + "loss": 1.2052, + "step": 3840 + }, + { + "epoch": 1.43002408348943, + "grad_norm": 0.1646290272474289, + "learning_rate": 1.940983643954867e-05, + "loss": 1.203, + "step": 3841 + }, + { + "epoch": 1.430396388640039, + "grad_norm": 0.17096149921417236, + "learning_rate": 1.9409425043777806e-05, + "loss": 1.2062, + "step": 3842 + }, + { + "epoch": 1.430768693790648, + "grad_norm": 0.16843508183956146, + "learning_rate": 1.9409013509030327e-05, + "loss": 1.2088, + "step": 3843 + }, + { + "epoch": 1.4311409989412573, + "grad_norm": 0.17243371903896332, + "learning_rate": 1.940860183531231e-05, + "loss": 1.2183, + "step": 3844 + }, + { + "epoch": 1.4315133040918662, + "grad_norm": 0.16187606751918793, + "learning_rate": 1.940819002262983e-05, + "loss": 1.207, + "step": 3845 + }, + { + "epoch": 1.4318856092424754, + "grad_norm": 0.17250527441501617, + "learning_rate": 1.9407778070988978e-05, + "loss": 1.2233, + "step": 3846 + }, + { + "epoch": 1.4322579143930845, + "grad_norm": 0.17004723846912384, + "learning_rate": 1.9407365980395833e-05, + "loss": 1.2143, + "step": 3847 + }, + { + "epoch": 1.4326302195436935, + "grad_norm": 0.16374097764492035, + "learning_rate": 1.940695375085648e-05, + "loss": 1.2106, + "step": 3848 + }, + { + "epoch": 1.4330025246943026, + "grad_norm": 0.17270928621292114, + "learning_rate": 1.9406541382377012e-05, + "loss": 1.2206, + "step": 3849 + }, + { + "epoch": 1.4333748298449116, + "grad_norm": 0.17662329971790314, + "learning_rate": 1.940612887496352e-05, + "loss": 1.2189, + "step": 3850 + }, + { + "epoch": 1.4337471349955206, + "grad_norm": 0.1634707897901535, + "learning_rate": 1.9405716228622094e-05, + "loss": 1.2006, + "step": 3851 + }, + { + "epoch": 1.4341194401461297, + "grad_norm": 0.16489939391613007, + "learning_rate": 1.9405303443358827e-05, + "loss": 1.2142, + "step": 3852 + }, + { + "epoch": 1.434491745296739, + "grad_norm": 0.17095866799354553, + "learning_rate": 1.940489051917982e-05, + "loss": 1.2122, + "step": 3853 + }, + { + "epoch": 1.4348640504473478, + "grad_norm": 0.16927289962768555, + "learning_rate": 1.940447745609117e-05, + "loss": 1.2124, + "step": 3854 + }, + { + "epoch": 1.435236355597957, + "grad_norm": 0.17228350043296814, + "learning_rate": 1.940406425409898e-05, + "loss": 1.2221, + "step": 3855 + }, + { + "epoch": 1.435608660748566, + "grad_norm": 0.1686760038137436, + "learning_rate": 1.940365091320935e-05, + "loss": 1.2281, + "step": 3856 + }, + { + "epoch": 1.4359809658991751, + "grad_norm": 0.16984900832176208, + "learning_rate": 1.9403237433428384e-05, + "loss": 1.2348, + "step": 3857 + }, + { + "epoch": 1.4363532710497842, + "grad_norm": 0.16201676428318024, + "learning_rate": 1.9402823814762194e-05, + "loss": 1.2072, + "step": 3858 + }, + { + "epoch": 1.4367255762003932, + "grad_norm": 0.17006635665893555, + "learning_rate": 1.9402410057216886e-05, + "loss": 1.2149, + "step": 3859 + }, + { + "epoch": 1.4370978813510022, + "grad_norm": 0.17508037388324738, + "learning_rate": 1.9401996160798574e-05, + "loss": 1.2096, + "step": 3860 + }, + { + "epoch": 1.4374701865016113, + "grad_norm": 0.1723199486732483, + "learning_rate": 1.9401582125513364e-05, + "loss": 1.2175, + "step": 3861 + }, + { + "epoch": 1.4378424916522206, + "grad_norm": 0.16605544090270996, + "learning_rate": 1.9401167951367375e-05, + "loss": 1.2092, + "step": 3862 + }, + { + "epoch": 1.4382147968028296, + "grad_norm": 0.16768351197242737, + "learning_rate": 1.940075363836673e-05, + "loss": 1.229, + "step": 3863 + }, + { + "epoch": 1.4385871019534386, + "grad_norm": 0.17820565402507782, + "learning_rate": 1.9400339186517544e-05, + "loss": 1.2276, + "step": 3864 + }, + { + "epoch": 1.4389594071040477, + "grad_norm": 0.17079061269760132, + "learning_rate": 1.9399924595825936e-05, + "loss": 1.2158, + "step": 3865 + }, + { + "epoch": 1.4393317122546567, + "grad_norm": 0.16866520047187805, + "learning_rate": 1.939950986629803e-05, + "loss": 1.1979, + "step": 3866 + }, + { + "epoch": 1.4397040174052658, + "grad_norm": 0.16329576075077057, + "learning_rate": 1.9399094997939957e-05, + "loss": 1.2184, + "step": 3867 + }, + { + "epoch": 1.4400763225558748, + "grad_norm": 0.17052018642425537, + "learning_rate": 1.9398679990757837e-05, + "loss": 1.2144, + "step": 3868 + }, + { + "epoch": 1.4404486277064839, + "grad_norm": 0.16311021149158478, + "learning_rate": 1.9398264844757805e-05, + "loss": 1.206, + "step": 3869 + }, + { + "epoch": 1.440820932857093, + "grad_norm": 0.17270159721374512, + "learning_rate": 1.9397849559945993e-05, + "loss": 1.2159, + "step": 3870 + }, + { + "epoch": 1.4411932380077022, + "grad_norm": 0.17571993172168732, + "learning_rate": 1.939743413632853e-05, + "loss": 1.202, + "step": 3871 + }, + { + "epoch": 1.4415655431583112, + "grad_norm": 0.16022472083568573, + "learning_rate": 1.9397018573911558e-05, + "loss": 1.2048, + "step": 3872 + }, + { + "epoch": 1.4419378483089202, + "grad_norm": 0.1789269596338272, + "learning_rate": 1.9396602872701205e-05, + "loss": 1.2145, + "step": 3873 + }, + { + "epoch": 1.4423101534595293, + "grad_norm": 0.18328560888767242, + "learning_rate": 1.9396187032703624e-05, + "loss": 1.2088, + "step": 3874 + }, + { + "epoch": 1.4426824586101383, + "grad_norm": 0.16325511038303375, + "learning_rate": 1.939577105392495e-05, + "loss": 1.2033, + "step": 3875 + }, + { + "epoch": 1.4430547637607474, + "grad_norm": 0.16945096850395203, + "learning_rate": 1.9395354936371323e-05, + "loss": 1.2225, + "step": 3876 + }, + { + "epoch": 1.4434270689113564, + "grad_norm": 0.16546650230884552, + "learning_rate": 1.9394938680048893e-05, + "loss": 1.2079, + "step": 3877 + }, + { + "epoch": 1.4437993740619655, + "grad_norm": 0.1665334403514862, + "learning_rate": 1.9394522284963814e-05, + "loss": 1.2367, + "step": 3878 + }, + { + "epoch": 1.4441716792125745, + "grad_norm": 0.1650296449661255, + "learning_rate": 1.9394105751122226e-05, + "loss": 1.2172, + "step": 3879 + }, + { + "epoch": 1.4445439843631838, + "grad_norm": 0.16569402813911438, + "learning_rate": 1.9393689078530285e-05, + "loss": 1.2179, + "step": 3880 + }, + { + "epoch": 1.4449162895137928, + "grad_norm": 0.1602257788181305, + "learning_rate": 1.9393272267194144e-05, + "loss": 1.2111, + "step": 3881 + }, + { + "epoch": 1.4452885946644018, + "grad_norm": 0.16653552651405334, + "learning_rate": 1.9392855317119966e-05, + "loss": 1.2216, + "step": 3882 + }, + { + "epoch": 1.445660899815011, + "grad_norm": 0.16940967738628387, + "learning_rate": 1.93924382283139e-05, + "loss": 1.2129, + "step": 3883 + }, + { + "epoch": 1.44603320496562, + "grad_norm": 0.1640087068080902, + "learning_rate": 1.9392021000782114e-05, + "loss": 1.2136, + "step": 3884 + }, + { + "epoch": 1.446405510116229, + "grad_norm": 0.15895035862922668, + "learning_rate": 1.939160363453077e-05, + "loss": 1.197, + "step": 3885 + }, + { + "epoch": 1.446777815266838, + "grad_norm": 0.16390269994735718, + "learning_rate": 1.9391186129566025e-05, + "loss": 1.2022, + "step": 3886 + }, + { + "epoch": 1.4471501204174473, + "grad_norm": 0.16444234549999237, + "learning_rate": 1.939076848589405e-05, + "loss": 1.2089, + "step": 3887 + }, + { + "epoch": 1.447522425568056, + "grad_norm": 0.16005566716194153, + "learning_rate": 1.9390350703521015e-05, + "loss": 1.1956, + "step": 3888 + }, + { + "epoch": 1.4478947307186654, + "grad_norm": 0.16827602684497833, + "learning_rate": 1.938993278245309e-05, + "loss": 1.2111, + "step": 3889 + }, + { + "epoch": 1.4482670358692744, + "grad_norm": 0.16684898734092712, + "learning_rate": 1.938951472269645e-05, + "loss": 1.2087, + "step": 3890 + }, + { + "epoch": 1.4486393410198835, + "grad_norm": 0.1641646921634674, + "learning_rate": 1.9389096524257263e-05, + "loss": 1.2132, + "step": 3891 + }, + { + "epoch": 1.4490116461704925, + "grad_norm": 0.17110654711723328, + "learning_rate": 1.938867818714171e-05, + "loss": 1.2203, + "step": 3892 + }, + { + "epoch": 1.4493839513211015, + "grad_norm": 0.15883228182792664, + "learning_rate": 1.938825971135597e-05, + "loss": 1.2096, + "step": 3893 + }, + { + "epoch": 1.4497562564717106, + "grad_norm": 0.1638491004705429, + "learning_rate": 1.938784109690622e-05, + "loss": 1.2125, + "step": 3894 + }, + { + "epoch": 1.4501285616223196, + "grad_norm": 0.16501039266586304, + "learning_rate": 1.938742234379865e-05, + "loss": 1.2235, + "step": 3895 + }, + { + "epoch": 1.4505008667729289, + "grad_norm": 0.16907702386379242, + "learning_rate": 1.938700345203944e-05, + "loss": 1.2204, + "step": 3896 + }, + { + "epoch": 1.4508731719235377, + "grad_norm": 0.16508372128009796, + "learning_rate": 1.938658442163478e-05, + "loss": 1.2152, + "step": 3897 + }, + { + "epoch": 1.451245477074147, + "grad_norm": 0.1664949506521225, + "learning_rate": 1.9386165252590854e-05, + "loss": 1.2177, + "step": 3898 + }, + { + "epoch": 1.451617782224756, + "grad_norm": 0.1608046591281891, + "learning_rate": 1.9385745944913858e-05, + "loss": 1.2062, + "step": 3899 + }, + { + "epoch": 1.451990087375365, + "grad_norm": 0.16553208231925964, + "learning_rate": 1.9385326498609983e-05, + "loss": 1.2204, + "step": 3900 + }, + { + "epoch": 1.452362392525974, + "grad_norm": 0.16579049825668335, + "learning_rate": 1.9384906913685426e-05, + "loss": 1.2166, + "step": 3901 + }, + { + "epoch": 1.4527346976765831, + "grad_norm": 0.17063631117343903, + "learning_rate": 1.9384487190146383e-05, + "loss": 1.2075, + "step": 3902 + }, + { + "epoch": 1.4531070028271922, + "grad_norm": 0.16345682740211487, + "learning_rate": 1.938406732799905e-05, + "loss": 1.2176, + "step": 3903 + }, + { + "epoch": 1.4534793079778012, + "grad_norm": 0.16839192807674408, + "learning_rate": 1.9383647327249635e-05, + "loss": 1.223, + "step": 3904 + }, + { + "epoch": 1.4538516131284105, + "grad_norm": 0.1658104509115219, + "learning_rate": 1.9383227187904334e-05, + "loss": 1.2142, + "step": 3905 + }, + { + "epoch": 1.4542239182790193, + "grad_norm": 0.16820955276489258, + "learning_rate": 1.9382806909969362e-05, + "loss": 1.2302, + "step": 3906 + }, + { + "epoch": 1.4545962234296286, + "grad_norm": 0.16950318217277527, + "learning_rate": 1.9382386493450913e-05, + "loss": 1.2091, + "step": 3907 + }, + { + "epoch": 1.4549685285802376, + "grad_norm": 0.17140018939971924, + "learning_rate": 1.938196593835521e-05, + "loss": 1.2267, + "step": 3908 + }, + { + "epoch": 1.4553408337308467, + "grad_norm": 0.17573173344135284, + "learning_rate": 1.938154524468846e-05, + "loss": 1.211, + "step": 3909 + }, + { + "epoch": 1.4557131388814557, + "grad_norm": 0.16952942311763763, + "learning_rate": 1.938112441245687e-05, + "loss": 1.2111, + "step": 3910 + }, + { + "epoch": 1.4560854440320647, + "grad_norm": 0.17475372552871704, + "learning_rate": 1.9380703441666666e-05, + "loss": 1.1979, + "step": 3911 + }, + { + "epoch": 1.4564577491826738, + "grad_norm": 0.16706113517284393, + "learning_rate": 1.9380282332324055e-05, + "loss": 1.2209, + "step": 3912 + }, + { + "epoch": 1.4568300543332828, + "grad_norm": 0.1862281709909439, + "learning_rate": 1.937986108443527e-05, + "loss": 1.2208, + "step": 3913 + }, + { + "epoch": 1.457202359483892, + "grad_norm": 0.17388898134231567, + "learning_rate": 1.9379439698006522e-05, + "loss": 1.2103, + "step": 3914 + }, + { + "epoch": 1.457574664634501, + "grad_norm": 0.1705702692270279, + "learning_rate": 1.9379018173044038e-05, + "loss": 1.2244, + "step": 3915 + }, + { + "epoch": 1.4579469697851102, + "grad_norm": 0.16031381487846375, + "learning_rate": 1.9378596509554045e-05, + "loss": 1.2021, + "step": 3916 + }, + { + "epoch": 1.4583192749357192, + "grad_norm": 0.17074525356292725, + "learning_rate": 1.937817470754277e-05, + "loss": 1.2091, + "step": 3917 + }, + { + "epoch": 1.4586915800863283, + "grad_norm": 0.1819746345281601, + "learning_rate": 1.9377752767016443e-05, + "loss": 1.2182, + "step": 3918 + }, + { + "epoch": 1.4590638852369373, + "grad_norm": 0.16856756806373596, + "learning_rate": 1.9377330687981295e-05, + "loss": 1.2114, + "step": 3919 + }, + { + "epoch": 1.4594361903875464, + "grad_norm": 0.16812150180339813, + "learning_rate": 1.9376908470443562e-05, + "loss": 1.2114, + "step": 3920 + }, + { + "epoch": 1.4598084955381554, + "grad_norm": 0.17135708034038544, + "learning_rate": 1.937648611440948e-05, + "loss": 1.2128, + "step": 3921 + }, + { + "epoch": 1.4601808006887644, + "grad_norm": 0.17878791689872742, + "learning_rate": 1.9376063619885285e-05, + "loss": 1.2219, + "step": 3922 + }, + { + "epoch": 1.4605531058393737, + "grad_norm": 0.162892147898674, + "learning_rate": 1.937564098687722e-05, + "loss": 1.2149, + "step": 3923 + }, + { + "epoch": 1.4609254109899827, + "grad_norm": 0.16608810424804688, + "learning_rate": 1.9375218215391527e-05, + "loss": 1.1958, + "step": 3924 + }, + { + "epoch": 1.4612977161405918, + "grad_norm": 0.17034226655960083, + "learning_rate": 1.9374795305434446e-05, + "loss": 1.193, + "step": 3925 + }, + { + "epoch": 1.4616700212912008, + "grad_norm": 0.15888184309005737, + "learning_rate": 1.937437225701223e-05, + "loss": 1.2113, + "step": 3926 + }, + { + "epoch": 1.4620423264418099, + "grad_norm": 0.17904046177864075, + "learning_rate": 1.937394907013112e-05, + "loss": 1.2128, + "step": 3927 + }, + { + "epoch": 1.462414631592419, + "grad_norm": 0.16831357777118683, + "learning_rate": 1.9373525744797377e-05, + "loss": 1.2023, + "step": 3928 + }, + { + "epoch": 1.462786936743028, + "grad_norm": 0.17215365171432495, + "learning_rate": 1.9373102281017246e-05, + "loss": 1.21, + "step": 3929 + }, + { + "epoch": 1.463159241893637, + "grad_norm": 0.16621056199073792, + "learning_rate": 1.937267867879698e-05, + "loss": 1.2021, + "step": 3930 + }, + { + "epoch": 1.463531547044246, + "grad_norm": 0.17535530030727386, + "learning_rate": 1.9372254938142835e-05, + "loss": 1.2345, + "step": 3931 + }, + { + "epoch": 1.4639038521948553, + "grad_norm": 0.16179418563842773, + "learning_rate": 1.9371831059061078e-05, + "loss": 1.2097, + "step": 3932 + }, + { + "epoch": 1.4642761573454643, + "grad_norm": 0.17177635431289673, + "learning_rate": 1.937140704155796e-05, + "loss": 1.2181, + "step": 3933 + }, + { + "epoch": 1.4646484624960734, + "grad_norm": 0.17407308518886566, + "learning_rate": 1.9370982885639752e-05, + "loss": 1.215, + "step": 3934 + }, + { + "epoch": 1.4650207676466824, + "grad_norm": 0.1671421378850937, + "learning_rate": 1.937055859131271e-05, + "loss": 1.2185, + "step": 3935 + }, + { + "epoch": 1.4653930727972915, + "grad_norm": 0.16444651782512665, + "learning_rate": 1.937013415858311e-05, + "loss": 1.2247, + "step": 3936 + }, + { + "epoch": 1.4657653779479005, + "grad_norm": 0.1708529144525528, + "learning_rate": 1.9369709587457217e-05, + "loss": 1.1963, + "step": 3937 + }, + { + "epoch": 1.4661376830985096, + "grad_norm": 0.17232173681259155, + "learning_rate": 1.93692848779413e-05, + "loss": 1.2015, + "step": 3938 + }, + { + "epoch": 1.4665099882491186, + "grad_norm": 0.16869337856769562, + "learning_rate": 1.936886003004163e-05, + "loss": 1.2328, + "step": 3939 + }, + { + "epoch": 1.4668822933997276, + "grad_norm": 0.17547161877155304, + "learning_rate": 1.9368435043764493e-05, + "loss": 1.2094, + "step": 3940 + }, + { + "epoch": 1.467254598550337, + "grad_norm": 0.1809409260749817, + "learning_rate": 1.9368009919116152e-05, + "loss": 1.2065, + "step": 3941 + }, + { + "epoch": 1.467626903700946, + "grad_norm": 0.1913343220949173, + "learning_rate": 1.9367584656102895e-05, + "loss": 1.2309, + "step": 3942 + }, + { + "epoch": 1.467999208851555, + "grad_norm": 0.16980275511741638, + "learning_rate": 1.9367159254731e-05, + "loss": 1.2062, + "step": 3943 + }, + { + "epoch": 1.468371514002164, + "grad_norm": 0.17768965661525726, + "learning_rate": 1.936673371500675e-05, + "loss": 1.2134, + "step": 3944 + }, + { + "epoch": 1.468743819152773, + "grad_norm": 0.17502190172672272, + "learning_rate": 1.9366308036936433e-05, + "loss": 1.2255, + "step": 3945 + }, + { + "epoch": 1.4691161243033821, + "grad_norm": 0.176423579454422, + "learning_rate": 1.936588222052633e-05, + "loss": 1.2131, + "step": 3946 + }, + { + "epoch": 1.4694884294539912, + "grad_norm": 0.18307115137577057, + "learning_rate": 1.936545626578274e-05, + "loss": 1.2093, + "step": 3947 + }, + { + "epoch": 1.4698607346046004, + "grad_norm": 0.16688889265060425, + "learning_rate": 1.9365030172711946e-05, + "loss": 1.2074, + "step": 3948 + }, + { + "epoch": 1.4702330397552092, + "grad_norm": 0.16726486384868622, + "learning_rate": 1.9364603941320243e-05, + "loss": 1.1952, + "step": 3949 + }, + { + "epoch": 1.4706053449058185, + "grad_norm": 0.1644599884748459, + "learning_rate": 1.9364177571613927e-05, + "loss": 1.2164, + "step": 3950 + }, + { + "epoch": 1.4709776500564276, + "grad_norm": 0.17639389634132385, + "learning_rate": 1.9363751063599298e-05, + "loss": 1.2219, + "step": 3951 + }, + { + "epoch": 1.4713499552070366, + "grad_norm": 0.1668757051229477, + "learning_rate": 1.936332441728265e-05, + "loss": 1.2021, + "step": 3952 + }, + { + "epoch": 1.4717222603576456, + "grad_norm": 0.17846433818340302, + "learning_rate": 1.9362897632670293e-05, + "loss": 1.2116, + "step": 3953 + }, + { + "epoch": 1.4720945655082547, + "grad_norm": 0.18401432037353516, + "learning_rate": 1.936247070976852e-05, + "loss": 1.2167, + "step": 3954 + }, + { + "epoch": 1.4724668706588637, + "grad_norm": 0.16676369309425354, + "learning_rate": 1.9362043648583647e-05, + "loss": 1.2144, + "step": 3955 + }, + { + "epoch": 1.4728391758094728, + "grad_norm": 0.17086462676525116, + "learning_rate": 1.9361616449121973e-05, + "loss": 1.2163, + "step": 3956 + }, + { + "epoch": 1.473211480960082, + "grad_norm": 0.1699550896883011, + "learning_rate": 1.9361189111389817e-05, + "loss": 1.2083, + "step": 3957 + }, + { + "epoch": 1.4735837861106909, + "grad_norm": 0.16862007975578308, + "learning_rate": 1.936076163539348e-05, + "loss": 1.2129, + "step": 3958 + }, + { + "epoch": 1.4739560912613001, + "grad_norm": 0.168454110622406, + "learning_rate": 1.936033402113928e-05, + "loss": 1.2052, + "step": 3959 + }, + { + "epoch": 1.4743283964119092, + "grad_norm": 0.17576009035110474, + "learning_rate": 1.9359906268633542e-05, + "loss": 1.1988, + "step": 3960 + }, + { + "epoch": 1.4747007015625182, + "grad_norm": 0.17533783614635468, + "learning_rate": 1.9359478377882567e-05, + "loss": 1.2165, + "step": 3961 + }, + { + "epoch": 1.4750730067131272, + "grad_norm": 0.18363003432750702, + "learning_rate": 1.935905034889269e-05, + "loss": 1.2095, + "step": 3962 + }, + { + "epoch": 1.4754453118637363, + "grad_norm": 0.1739223450422287, + "learning_rate": 1.9358622181670225e-05, + "loss": 1.2043, + "step": 3963 + }, + { + "epoch": 1.4758176170143453, + "grad_norm": 0.1666637510061264, + "learning_rate": 1.9358193876221497e-05, + "loss": 1.2036, + "step": 3964 + }, + { + "epoch": 1.4761899221649544, + "grad_norm": 0.18564556539058685, + "learning_rate": 1.935776543255283e-05, + "loss": 1.2267, + "step": 3965 + }, + { + "epoch": 1.4765622273155636, + "grad_norm": 0.16822317242622375, + "learning_rate": 1.935733685067056e-05, + "loss": 1.2294, + "step": 3966 + }, + { + "epoch": 1.4769345324661725, + "grad_norm": 0.16911527514457703, + "learning_rate": 1.9356908130581008e-05, + "loss": 1.2128, + "step": 3967 + }, + { + "epoch": 1.4773068376167817, + "grad_norm": 0.19350941479206085, + "learning_rate": 1.9356479272290514e-05, + "loss": 1.2107, + "step": 3968 + }, + { + "epoch": 1.4776791427673908, + "grad_norm": 0.1877276450395584, + "learning_rate": 1.9356050275805406e-05, + "loss": 1.2059, + "step": 3969 + }, + { + "epoch": 1.4780514479179998, + "grad_norm": 0.17434152960777283, + "learning_rate": 1.9355621141132022e-05, + "loss": 1.2185, + "step": 3970 + }, + { + "epoch": 1.4784237530686088, + "grad_norm": 0.19158223271369934, + "learning_rate": 1.9355191868276702e-05, + "loss": 1.2107, + "step": 3971 + }, + { + "epoch": 1.478796058219218, + "grad_norm": 0.17890071868896484, + "learning_rate": 1.9354762457245782e-05, + "loss": 1.205, + "step": 3972 + }, + { + "epoch": 1.479168363369827, + "grad_norm": 0.16865649819374084, + "learning_rate": 1.935433290804561e-05, + "loss": 1.2112, + "step": 3973 + }, + { + "epoch": 1.479540668520436, + "grad_norm": 0.16355697810649872, + "learning_rate": 1.935390322068253e-05, + "loss": 1.2194, + "step": 3974 + }, + { + "epoch": 1.4799129736710452, + "grad_norm": 0.169992133975029, + "learning_rate": 1.9353473395162882e-05, + "loss": 1.2088, + "step": 3975 + }, + { + "epoch": 1.480285278821654, + "grad_norm": 0.16917189955711365, + "learning_rate": 1.9353043431493024e-05, + "loss": 1.2149, + "step": 3976 + }, + { + "epoch": 1.4806575839722633, + "grad_norm": 0.16801759600639343, + "learning_rate": 1.9352613329679298e-05, + "loss": 1.2251, + "step": 3977 + }, + { + "epoch": 1.4810298891228724, + "grad_norm": 0.16183248162269592, + "learning_rate": 1.935218308972806e-05, + "loss": 1.2019, + "step": 3978 + }, + { + "epoch": 1.4814021942734814, + "grad_norm": 0.16628491878509521, + "learning_rate": 1.935175271164567e-05, + "loss": 1.2186, + "step": 3979 + }, + { + "epoch": 1.4817744994240905, + "grad_norm": 0.167390838265419, + "learning_rate": 1.9351322195438472e-05, + "loss": 1.2196, + "step": 3980 + }, + { + "epoch": 1.4821468045746995, + "grad_norm": 0.17368678748607635, + "learning_rate": 1.9350891541112836e-05, + "loss": 1.2114, + "step": 3981 + }, + { + "epoch": 1.4825191097253085, + "grad_norm": 0.17052781581878662, + "learning_rate": 1.9350460748675117e-05, + "loss": 1.2013, + "step": 3982 + }, + { + "epoch": 1.4828914148759176, + "grad_norm": 0.16377271711826324, + "learning_rate": 1.935002981813168e-05, + "loss": 1.2114, + "step": 3983 + }, + { + "epoch": 1.4832637200265268, + "grad_norm": 0.1672280728816986, + "learning_rate": 1.934959874948889e-05, + "loss": 1.2214, + "step": 3984 + }, + { + "epoch": 1.4836360251771359, + "grad_norm": 0.16582052409648895, + "learning_rate": 1.9349167542753116e-05, + "loss": 1.2097, + "step": 3985 + }, + { + "epoch": 1.484008330327745, + "grad_norm": 0.15691331028938293, + "learning_rate": 1.934873619793072e-05, + "loss": 1.2001, + "step": 3986 + }, + { + "epoch": 1.484380635478354, + "grad_norm": 0.16252057254314423, + "learning_rate": 1.934830471502808e-05, + "loss": 1.2105, + "step": 3987 + }, + { + "epoch": 1.484752940628963, + "grad_norm": 0.18076485395431519, + "learning_rate": 1.9347873094051565e-05, + "loss": 1.1981, + "step": 3988 + }, + { + "epoch": 1.485125245779572, + "grad_norm": 0.18029817938804626, + "learning_rate": 1.9347441335007547e-05, + "loss": 1.2136, + "step": 3989 + }, + { + "epoch": 1.485497550930181, + "grad_norm": 0.17143037915229797, + "learning_rate": 1.9347009437902414e-05, + "loss": 1.1981, + "step": 3990 + }, + { + "epoch": 1.4858698560807901, + "grad_norm": 0.1707201451063156, + "learning_rate": 1.9346577402742532e-05, + "loss": 1.2137, + "step": 3991 + }, + { + "epoch": 1.4862421612313992, + "grad_norm": 0.17134663462638855, + "learning_rate": 1.9346145229534295e-05, + "loss": 1.2097, + "step": 3992 + }, + { + "epoch": 1.4866144663820084, + "grad_norm": 0.18079330027103424, + "learning_rate": 1.9345712918284074e-05, + "loss": 1.2135, + "step": 3993 + }, + { + "epoch": 1.4869867715326175, + "grad_norm": 0.16749143600463867, + "learning_rate": 1.934528046899826e-05, + "loss": 1.2145, + "step": 3994 + }, + { + "epoch": 1.4873590766832265, + "grad_norm": 0.17483735084533691, + "learning_rate": 1.9344847881683242e-05, + "loss": 1.2164, + "step": 3995 + }, + { + "epoch": 1.4877313818338356, + "grad_norm": 0.17424844205379486, + "learning_rate": 1.9344415156345407e-05, + "loss": 1.2007, + "step": 3996 + }, + { + "epoch": 1.4881036869844446, + "grad_norm": 0.18237757682800293, + "learning_rate": 1.9343982292991147e-05, + "loss": 1.1958, + "step": 3997 + }, + { + "epoch": 1.4884759921350537, + "grad_norm": 0.2026931643486023, + "learning_rate": 1.9343549291626853e-05, + "loss": 1.2126, + "step": 3998 + }, + { + "epoch": 1.4888482972856627, + "grad_norm": 0.2018553465604782, + "learning_rate": 1.9343116152258924e-05, + "loss": 1.2323, + "step": 3999 + }, + { + "epoch": 1.489220602436272, + "grad_norm": 0.17525236308574677, + "learning_rate": 1.9342682874893756e-05, + "loss": 1.2227, + "step": 4000 + }, + { + "epoch": 1.489220602436272, + "eval_loss": 1.3177570104599, + "eval_runtime": 16.441, + "eval_samples_per_second": 105.468, + "eval_steps_per_second": 5.292, + "step": 4000 + }, + { + "epoch": 1.4895929075868808, + "grad_norm": 0.20407749712467194, + "learning_rate": 1.9342249459537746e-05, + "loss": 1.2125, + "step": 4001 + }, + { + "epoch": 1.48996521273749, + "grad_norm": 0.1680436134338379, + "learning_rate": 1.93418159061973e-05, + "loss": 1.208, + "step": 4002 + }, + { + "epoch": 1.490337517888099, + "grad_norm": 0.16656237840652466, + "learning_rate": 1.934138221487882e-05, + "loss": 1.2182, + "step": 4003 + }, + { + "epoch": 1.4907098230387081, + "grad_norm": 0.17523828148841858, + "learning_rate": 1.9340948385588708e-05, + "loss": 1.2013, + "step": 4004 + }, + { + "epoch": 1.4910821281893172, + "grad_norm": 0.17147956788539886, + "learning_rate": 1.9340514418333375e-05, + "loss": 1.221, + "step": 4005 + }, + { + "epoch": 1.4914544333399262, + "grad_norm": 0.16101637482643127, + "learning_rate": 1.934008031311923e-05, + "loss": 1.2107, + "step": 4006 + }, + { + "epoch": 1.4918267384905353, + "grad_norm": 0.17050312459468842, + "learning_rate": 1.933964606995269e-05, + "loss": 1.216, + "step": 4007 + }, + { + "epoch": 1.4921990436411443, + "grad_norm": 0.17762567102909088, + "learning_rate": 1.933921168884016e-05, + "loss": 1.219, + "step": 4008 + }, + { + "epoch": 1.4925713487917536, + "grad_norm": 0.179819718003273, + "learning_rate": 1.9338777169788058e-05, + "loss": 1.2271, + "step": 4009 + }, + { + "epoch": 1.4929436539423624, + "grad_norm": 0.17511019110679626, + "learning_rate": 1.9338342512802805e-05, + "loss": 1.2128, + "step": 4010 + }, + { + "epoch": 1.4933159590929717, + "grad_norm": 0.17348961532115936, + "learning_rate": 1.9337907717890817e-05, + "loss": 1.2215, + "step": 4011 + }, + { + "epoch": 1.4936882642435807, + "grad_norm": 0.16590604186058044, + "learning_rate": 1.933747278505852e-05, + "loss": 1.2048, + "step": 4012 + }, + { + "epoch": 1.4940605693941897, + "grad_norm": 0.16932128369808197, + "learning_rate": 1.9337037714312337e-05, + "loss": 1.22, + "step": 4013 + }, + { + "epoch": 1.4944328745447988, + "grad_norm": 0.18030394613742828, + "learning_rate": 1.933660250565869e-05, + "loss": 1.2031, + "step": 4014 + }, + { + "epoch": 1.4948051796954078, + "grad_norm": 0.16804668307304382, + "learning_rate": 1.9336167159104012e-05, + "loss": 1.2167, + "step": 4015 + }, + { + "epoch": 1.4951774848460169, + "grad_norm": 0.17750367522239685, + "learning_rate": 1.9335731674654732e-05, + "loss": 1.2102, + "step": 4016 + }, + { + "epoch": 1.495549789996626, + "grad_norm": 0.1765013486146927, + "learning_rate": 1.9335296052317278e-05, + "loss": 1.2192, + "step": 4017 + }, + { + "epoch": 1.4959220951472352, + "grad_norm": 0.17468391358852386, + "learning_rate": 1.933486029209809e-05, + "loss": 1.2093, + "step": 4018 + }, + { + "epoch": 1.496294400297844, + "grad_norm": 0.16882555186748505, + "learning_rate": 1.93344243940036e-05, + "loss": 1.2092, + "step": 4019 + }, + { + "epoch": 1.4966667054484533, + "grad_norm": 0.16576802730560303, + "learning_rate": 1.9333988358040246e-05, + "loss": 1.2021, + "step": 4020 + }, + { + "epoch": 1.4970390105990623, + "grad_norm": 0.17148758471012115, + "learning_rate": 1.9333552184214473e-05, + "loss": 1.2202, + "step": 4021 + }, + { + "epoch": 1.4974113157496713, + "grad_norm": 0.16886599361896515, + "learning_rate": 1.933311587253272e-05, + "loss": 1.2053, + "step": 4022 + }, + { + "epoch": 1.4977836209002804, + "grad_norm": 0.16921773552894592, + "learning_rate": 1.9332679423001428e-05, + "loss": 1.2064, + "step": 4023 + }, + { + "epoch": 1.4981559260508894, + "grad_norm": 0.16829746961593628, + "learning_rate": 1.9332242835627048e-05, + "loss": 1.2097, + "step": 4024 + }, + { + "epoch": 1.4985282312014985, + "grad_norm": 0.16895058751106262, + "learning_rate": 1.9331806110416027e-05, + "loss": 1.2116, + "step": 4025 + }, + { + "epoch": 1.4989005363521075, + "grad_norm": 0.17444762587547302, + "learning_rate": 1.9331369247374815e-05, + "loss": 1.1994, + "step": 4026 + }, + { + "epoch": 1.4992728415027168, + "grad_norm": 0.17568188905715942, + "learning_rate": 1.9330932246509867e-05, + "loss": 1.1921, + "step": 4027 + }, + { + "epoch": 1.4996451466533256, + "grad_norm": 0.164281964302063, + "learning_rate": 1.9330495107827633e-05, + "loss": 1.221, + "step": 4028 + }, + { + "epoch": 1.5000174518039349, + "grad_norm": 0.1712205559015274, + "learning_rate": 1.9330057831334573e-05, + "loss": 1.2055, + "step": 4029 + }, + { + "epoch": 1.500389756954544, + "grad_norm": 0.17232446372509003, + "learning_rate": 1.9329620417037143e-05, + "loss": 1.2094, + "step": 4030 + }, + { + "epoch": 1.500762062105153, + "grad_norm": 0.16828641295433044, + "learning_rate": 1.9329182864941808e-05, + "loss": 1.2161, + "step": 4031 + }, + { + "epoch": 1.501134367255762, + "grad_norm": 0.1642870008945465, + "learning_rate": 1.9328745175055025e-05, + "loss": 1.2076, + "step": 4032 + }, + { + "epoch": 1.501506672406371, + "grad_norm": 0.16758432984352112, + "learning_rate": 1.932830734738326e-05, + "loss": 1.2149, + "step": 4033 + }, + { + "epoch": 1.5018789775569803, + "grad_norm": 0.16760718822479248, + "learning_rate": 1.9327869381932984e-05, + "loss": 1.221, + "step": 4034 + }, + { + "epoch": 1.5022512827075891, + "grad_norm": 0.16775357723236084, + "learning_rate": 1.932743127871066e-05, + "loss": 1.2106, + "step": 4035 + }, + { + "epoch": 1.5026235878581984, + "grad_norm": 0.16413766145706177, + "learning_rate": 1.9326993037722762e-05, + "loss": 1.2133, + "step": 4036 + }, + { + "epoch": 1.5029958930088072, + "grad_norm": 0.17524518072605133, + "learning_rate": 1.9326554658975766e-05, + "loss": 1.2083, + "step": 4037 + }, + { + "epoch": 1.5033681981594165, + "grad_norm": 0.17292781174182892, + "learning_rate": 1.9326116142476137e-05, + "loss": 1.2126, + "step": 4038 + }, + { + "epoch": 1.5037405033100255, + "grad_norm": 0.17386804521083832, + "learning_rate": 1.9325677488230364e-05, + "loss": 1.2151, + "step": 4039 + }, + { + "epoch": 1.5041128084606346, + "grad_norm": 0.1756100207567215, + "learning_rate": 1.9325238696244914e-05, + "loss": 1.2122, + "step": 4040 + }, + { + "epoch": 1.5044851136112436, + "grad_norm": 0.16365545988082886, + "learning_rate": 1.9324799766526276e-05, + "loss": 1.2159, + "step": 4041 + }, + { + "epoch": 1.5048574187618526, + "grad_norm": 0.1651315987110138, + "learning_rate": 1.932436069908093e-05, + "loss": 1.2278, + "step": 4042 + }, + { + "epoch": 1.505229723912462, + "grad_norm": 0.1678977757692337, + "learning_rate": 1.9323921493915364e-05, + "loss": 1.2131, + "step": 4043 + }, + { + "epoch": 1.5056020290630707, + "grad_norm": 0.16497032344341278, + "learning_rate": 1.932348215103606e-05, + "loss": 1.2105, + "step": 4044 + }, + { + "epoch": 1.50597433421368, + "grad_norm": 0.1694755107164383, + "learning_rate": 1.932304267044951e-05, + "loss": 1.2084, + "step": 4045 + }, + { + "epoch": 1.5063466393642888, + "grad_norm": 0.16757144033908844, + "learning_rate": 1.932260305216221e-05, + "loss": 1.2132, + "step": 4046 + }, + { + "epoch": 1.506718944514898, + "grad_norm": 0.16843421757221222, + "learning_rate": 1.932216329618064e-05, + "loss": 1.2041, + "step": 4047 + }, + { + "epoch": 1.5070912496655071, + "grad_norm": 0.17438024282455444, + "learning_rate": 1.9321723402511308e-05, + "loss": 1.2206, + "step": 4048 + }, + { + "epoch": 1.5074635548161162, + "grad_norm": 0.17273013293743134, + "learning_rate": 1.9321283371160704e-05, + "loss": 1.2153, + "step": 4049 + }, + { + "epoch": 1.5078358599667252, + "grad_norm": 0.16855518519878387, + "learning_rate": 1.9320843202135333e-05, + "loss": 1.2327, + "step": 4050 + }, + { + "epoch": 1.5082081651173342, + "grad_norm": 0.17092865705490112, + "learning_rate": 1.932040289544169e-05, + "loss": 1.2183, + "step": 4051 + }, + { + "epoch": 1.5085804702679435, + "grad_norm": 0.16333964467048645, + "learning_rate": 1.9319962451086282e-05, + "loss": 1.2078, + "step": 4052 + }, + { + "epoch": 1.5089527754185523, + "grad_norm": 0.16895681619644165, + "learning_rate": 1.9319521869075612e-05, + "loss": 1.2129, + "step": 4053 + }, + { + "epoch": 1.5093250805691616, + "grad_norm": 0.17089246213436127, + "learning_rate": 1.931908114941619e-05, + "loss": 1.1971, + "step": 4054 + }, + { + "epoch": 1.5096973857197704, + "grad_norm": 0.1670157015323639, + "learning_rate": 1.9318640292114526e-05, + "loss": 1.2041, + "step": 4055 + }, + { + "epoch": 1.5100696908703797, + "grad_norm": 0.16973476111888885, + "learning_rate": 1.9318199297177127e-05, + "loss": 1.2034, + "step": 4056 + }, + { + "epoch": 1.5104419960209887, + "grad_norm": 0.173712819814682, + "learning_rate": 1.931775816461051e-05, + "loss": 1.2121, + "step": 4057 + }, + { + "epoch": 1.5108143011715978, + "grad_norm": 0.1701354682445526, + "learning_rate": 1.931731689442119e-05, + "loss": 1.2193, + "step": 4058 + }, + { + "epoch": 1.5111866063222068, + "grad_norm": 0.16733410954475403, + "learning_rate": 1.9316875486615684e-05, + "loss": 1.2152, + "step": 4059 + }, + { + "epoch": 1.5115589114728158, + "grad_norm": 0.17362013459205627, + "learning_rate": 1.931643394120051e-05, + "loss": 1.2194, + "step": 4060 + }, + { + "epoch": 1.5119312166234251, + "grad_norm": 0.1801014095544815, + "learning_rate": 1.9315992258182196e-05, + "loss": 1.2112, + "step": 4061 + }, + { + "epoch": 1.512303521774034, + "grad_norm": 0.17193296551704407, + "learning_rate": 1.9315550437567258e-05, + "loss": 1.2177, + "step": 4062 + }, + { + "epoch": 1.5126758269246432, + "grad_norm": 0.17398405075073242, + "learning_rate": 1.9315108479362226e-05, + "loss": 1.2154, + "step": 4063 + }, + { + "epoch": 1.513048132075252, + "grad_norm": 0.18025220930576324, + "learning_rate": 1.9314666383573622e-05, + "loss": 1.2044, + "step": 4064 + }, + { + "epoch": 1.5134204372258613, + "grad_norm": 0.15889345109462738, + "learning_rate": 1.9314224150207986e-05, + "loss": 1.2138, + "step": 4065 + }, + { + "epoch": 1.5137927423764703, + "grad_norm": 0.1648905724287033, + "learning_rate": 1.9313781779271842e-05, + "loss": 1.2124, + "step": 4066 + }, + { + "epoch": 1.5141650475270794, + "grad_norm": 0.1780308037996292, + "learning_rate": 1.9313339270771724e-05, + "loss": 1.203, + "step": 4067 + }, + { + "epoch": 1.5145373526776884, + "grad_norm": 0.16824577748775482, + "learning_rate": 1.931289662471417e-05, + "loss": 1.2033, + "step": 4068 + }, + { + "epoch": 1.5149096578282975, + "grad_norm": 0.1748679131269455, + "learning_rate": 1.9312453841105716e-05, + "loss": 1.2034, + "step": 4069 + }, + { + "epoch": 1.5152819629789067, + "grad_norm": 0.16885723173618317, + "learning_rate": 1.9312010919952907e-05, + "loss": 1.2147, + "step": 4070 + }, + { + "epoch": 1.5156542681295155, + "grad_norm": 0.16560660302639008, + "learning_rate": 1.931156786126228e-05, + "loss": 1.219, + "step": 4071 + }, + { + "epoch": 1.5160265732801248, + "grad_norm": 0.16653484106063843, + "learning_rate": 1.9311124665040378e-05, + "loss": 1.2258, + "step": 4072 + }, + { + "epoch": 1.5163988784307338, + "grad_norm": 0.16899293661117554, + "learning_rate": 1.931068133129375e-05, + "loss": 1.208, + "step": 4073 + }, + { + "epoch": 1.5167711835813429, + "grad_norm": 0.17353177070617676, + "learning_rate": 1.931023786002894e-05, + "loss": 1.196, + "step": 4074 + }, + { + "epoch": 1.517143488731952, + "grad_norm": 0.16658303141593933, + "learning_rate": 1.9309794251252506e-05, + "loss": 1.2167, + "step": 4075 + }, + { + "epoch": 1.517515793882561, + "grad_norm": 0.17217494547367096, + "learning_rate": 1.9309350504970996e-05, + "loss": 1.2224, + "step": 4076 + }, + { + "epoch": 1.51788809903317, + "grad_norm": 0.1716787964105606, + "learning_rate": 1.930890662119096e-05, + "loss": 1.214, + "step": 4077 + }, + { + "epoch": 1.518260404183779, + "grad_norm": 0.16615904867649078, + "learning_rate": 1.9308462599918955e-05, + "loss": 1.2204, + "step": 4078 + }, + { + "epoch": 1.5186327093343883, + "grad_norm": 0.17430458962917328, + "learning_rate": 1.9308018441161547e-05, + "loss": 1.2237, + "step": 4079 + }, + { + "epoch": 1.5190050144849971, + "grad_norm": 0.17217344045639038, + "learning_rate": 1.9307574144925288e-05, + "loss": 1.22, + "step": 4080 + }, + { + "epoch": 1.5193773196356064, + "grad_norm": 0.1643909215927124, + "learning_rate": 1.930712971121674e-05, + "loss": 1.2008, + "step": 4081 + }, + { + "epoch": 1.5197496247862154, + "grad_norm": 0.17074188590049744, + "learning_rate": 1.9306685140042476e-05, + "loss": 1.2294, + "step": 4082 + }, + { + "epoch": 1.5201219299368245, + "grad_norm": 0.16520312428474426, + "learning_rate": 1.9306240431409056e-05, + "loss": 1.2173, + "step": 4083 + }, + { + "epoch": 1.5204942350874335, + "grad_norm": 0.16149955987930298, + "learning_rate": 1.9305795585323043e-05, + "loss": 1.2173, + "step": 4084 + }, + { + "epoch": 1.5208665402380426, + "grad_norm": 0.17141614854335785, + "learning_rate": 1.930535060179102e-05, + "loss": 1.2022, + "step": 4085 + }, + { + "epoch": 1.5212388453886518, + "grad_norm": 0.16703253984451294, + "learning_rate": 1.930490548081955e-05, + "loss": 1.2247, + "step": 4086 + }, + { + "epoch": 1.5216111505392607, + "grad_norm": 0.16546790301799774, + "learning_rate": 1.9304460222415207e-05, + "loss": 1.2045, + "step": 4087 + }, + { + "epoch": 1.52198345568987, + "grad_norm": 0.16973921656608582, + "learning_rate": 1.9304014826584578e-05, + "loss": 1.226, + "step": 4088 + }, + { + "epoch": 1.5223557608404787, + "grad_norm": 0.16389979422092438, + "learning_rate": 1.9303569293334226e-05, + "loss": 1.2052, + "step": 4089 + }, + { + "epoch": 1.522728065991088, + "grad_norm": 0.16464626789093018, + "learning_rate": 1.9303123622670743e-05, + "loss": 1.2047, + "step": 4090 + }, + { + "epoch": 1.523100371141697, + "grad_norm": 0.1665724664926529, + "learning_rate": 1.9302677814600707e-05, + "loss": 1.2025, + "step": 4091 + }, + { + "epoch": 1.523472676292306, + "grad_norm": 0.18270178139209747, + "learning_rate": 1.9302231869130703e-05, + "loss": 1.2181, + "step": 4092 + }, + { + "epoch": 1.5238449814429151, + "grad_norm": 0.1672508865594864, + "learning_rate": 1.9301785786267323e-05, + "loss": 1.2081, + "step": 4093 + }, + { + "epoch": 1.5242172865935242, + "grad_norm": 0.16370201110839844, + "learning_rate": 1.9301339566017144e-05, + "loss": 1.2036, + "step": 4094 + }, + { + "epoch": 1.5245895917441334, + "grad_norm": 0.17350997030735016, + "learning_rate": 1.930089320838677e-05, + "loss": 1.2072, + "step": 4095 + }, + { + "epoch": 1.5249618968947423, + "grad_norm": 0.17266206443309784, + "learning_rate": 1.930044671338278e-05, + "loss": 1.2169, + "step": 4096 + }, + { + "epoch": 1.5253342020453515, + "grad_norm": 0.16876213252544403, + "learning_rate": 1.9300000081011778e-05, + "loss": 1.2076, + "step": 4097 + }, + { + "epoch": 1.5257065071959603, + "grad_norm": 0.17879076302051544, + "learning_rate": 1.9299553311280358e-05, + "loss": 1.2003, + "step": 4098 + }, + { + "epoch": 1.5260788123465696, + "grad_norm": 0.16374212503433228, + "learning_rate": 1.9299106404195123e-05, + "loss": 1.2203, + "step": 4099 + }, + { + "epoch": 1.5264511174971787, + "grad_norm": 0.17050255835056305, + "learning_rate": 1.929865935976267e-05, + "loss": 1.2112, + "step": 4100 + }, + { + "epoch": 1.5268234226477877, + "grad_norm": 0.17496921122074127, + "learning_rate": 1.92982121779896e-05, + "loss": 1.2123, + "step": 4101 + }, + { + "epoch": 1.5271957277983967, + "grad_norm": 0.17470447719097137, + "learning_rate": 1.9297764858882516e-05, + "loss": 1.2115, + "step": 4102 + }, + { + "epoch": 1.5275680329490058, + "grad_norm": 0.166524276137352, + "learning_rate": 1.9297317402448032e-05, + "loss": 1.2235, + "step": 4103 + }, + { + "epoch": 1.527940338099615, + "grad_norm": 0.1667315661907196, + "learning_rate": 1.929686980869275e-05, + "loss": 1.2164, + "step": 4104 + }, + { + "epoch": 1.5283126432502239, + "grad_norm": 0.17172963917255402, + "learning_rate": 1.9296422077623293e-05, + "loss": 1.2133, + "step": 4105 + }, + { + "epoch": 1.5286849484008331, + "grad_norm": 0.16795189678668976, + "learning_rate": 1.9295974209246257e-05, + "loss": 1.2076, + "step": 4106 + }, + { + "epoch": 1.529057253551442, + "grad_norm": 0.1728687286376953, + "learning_rate": 1.9295526203568268e-05, + "loss": 1.214, + "step": 4107 + }, + { + "epoch": 1.5294295587020512, + "grad_norm": 0.16910706460475922, + "learning_rate": 1.9295078060595938e-05, + "loss": 1.2089, + "step": 4108 + }, + { + "epoch": 1.5298018638526603, + "grad_norm": 0.16839970648288727, + "learning_rate": 1.9294629780335895e-05, + "loss": 1.2014, + "step": 4109 + }, + { + "epoch": 1.5301741690032693, + "grad_norm": 0.17429544031620026, + "learning_rate": 1.9294181362794745e-05, + "loss": 1.2086, + "step": 4110 + }, + { + "epoch": 1.5305464741538783, + "grad_norm": 0.16927529871463776, + "learning_rate": 1.9293732807979127e-05, + "loss": 1.222, + "step": 4111 + }, + { + "epoch": 1.5309187793044874, + "grad_norm": 0.17815326154232025, + "learning_rate": 1.9293284115895656e-05, + "loss": 1.2107, + "step": 4112 + }, + { + "epoch": 1.5312910844550967, + "grad_norm": 0.17721591889858246, + "learning_rate": 1.929283528655096e-05, + "loss": 1.2062, + "step": 4113 + }, + { + "epoch": 1.5316633896057055, + "grad_norm": 0.16361045837402344, + "learning_rate": 1.929238631995167e-05, + "loss": 1.2002, + "step": 4114 + }, + { + "epoch": 1.5320356947563147, + "grad_norm": 0.17079226672649384, + "learning_rate": 1.929193721610442e-05, + "loss": 1.1955, + "step": 4115 + }, + { + "epoch": 1.5324079999069236, + "grad_norm": 0.17936839163303375, + "learning_rate": 1.9291487975015835e-05, + "loss": 1.2175, + "step": 4116 + }, + { + "epoch": 1.5327803050575328, + "grad_norm": 0.17177017033100128, + "learning_rate": 1.9291038596692562e-05, + "loss": 1.2078, + "step": 4117 + }, + { + "epoch": 1.5331526102081419, + "grad_norm": 0.1675311028957367, + "learning_rate": 1.929058908114123e-05, + "loss": 1.2289, + "step": 4118 + }, + { + "epoch": 1.533524915358751, + "grad_norm": 0.18256328999996185, + "learning_rate": 1.9290139428368482e-05, + "loss": 1.2118, + "step": 4119 + }, + { + "epoch": 1.53389722050936, + "grad_norm": 0.16636022925376892, + "learning_rate": 1.9289689638380956e-05, + "loss": 1.2061, + "step": 4120 + }, + { + "epoch": 1.534269525659969, + "grad_norm": 0.22456076741218567, + "learning_rate": 1.9289239711185293e-05, + "loss": 1.2134, + "step": 4121 + }, + { + "epoch": 1.5346418308105783, + "grad_norm": 0.19078871607780457, + "learning_rate": 1.928878964678815e-05, + "loss": 1.2185, + "step": 4122 + }, + { + "epoch": 1.535014135961187, + "grad_norm": 0.18136771023273468, + "learning_rate": 1.928833944519616e-05, + "loss": 1.2092, + "step": 4123 + }, + { + "epoch": 1.5353864411117963, + "grad_norm": 0.17033730447292328, + "learning_rate": 1.9287889106415983e-05, + "loss": 1.2218, + "step": 4124 + }, + { + "epoch": 1.5357587462624052, + "grad_norm": 0.17553742229938507, + "learning_rate": 1.9287438630454268e-05, + "loss": 1.2242, + "step": 4125 + }, + { + "epoch": 1.5361310514130144, + "grad_norm": 0.18240036070346832, + "learning_rate": 1.9286988017317664e-05, + "loss": 1.2029, + "step": 4126 + }, + { + "epoch": 1.5365033565636235, + "grad_norm": 0.17878971993923187, + "learning_rate": 1.928653726701283e-05, + "loss": 1.2057, + "step": 4127 + }, + { + "epoch": 1.5368756617142325, + "grad_norm": 0.16980788111686707, + "learning_rate": 1.9286086379546427e-05, + "loss": 1.2071, + "step": 4128 + }, + { + "epoch": 1.5372479668648416, + "grad_norm": 0.17709119617938995, + "learning_rate": 1.9285635354925107e-05, + "loss": 1.1857, + "step": 4129 + }, + { + "epoch": 1.5376202720154506, + "grad_norm": 0.1764722466468811, + "learning_rate": 1.9285184193155536e-05, + "loss": 1.2124, + "step": 4130 + }, + { + "epoch": 1.5379925771660599, + "grad_norm": 0.18060049414634705, + "learning_rate": 1.9284732894244378e-05, + "loss": 1.2073, + "step": 4131 + }, + { + "epoch": 1.5383648823166687, + "grad_norm": 0.16817817091941833, + "learning_rate": 1.92842814581983e-05, + "loss": 1.2128, + "step": 4132 + }, + { + "epoch": 1.538737187467278, + "grad_norm": 0.1727471500635147, + "learning_rate": 1.9283829885023967e-05, + "loss": 1.1974, + "step": 4133 + }, + { + "epoch": 1.539109492617887, + "grad_norm": 0.1839704066514969, + "learning_rate": 1.9283378174728046e-05, + "loss": 1.2003, + "step": 4134 + }, + { + "epoch": 1.539481797768496, + "grad_norm": 0.17027804255485535, + "learning_rate": 1.9282926327317213e-05, + "loss": 1.2135, + "step": 4135 + }, + { + "epoch": 1.539854102919105, + "grad_norm": 0.1670842319726944, + "learning_rate": 1.9282474342798143e-05, + "loss": 1.2023, + "step": 4136 + }, + { + "epoch": 1.5402264080697141, + "grad_norm": 0.1733679473400116, + "learning_rate": 1.9282022221177507e-05, + "loss": 1.2102, + "step": 4137 + }, + { + "epoch": 1.5405987132203232, + "grad_norm": 0.17217591404914856, + "learning_rate": 1.9281569962461986e-05, + "loss": 1.2237, + "step": 4138 + }, + { + "epoch": 1.5409710183709322, + "grad_norm": 0.174224391579628, + "learning_rate": 1.928111756665826e-05, + "loss": 1.2072, + "step": 4139 + }, + { + "epoch": 1.5413433235215415, + "grad_norm": 0.17632028460502625, + "learning_rate": 1.928066503377301e-05, + "loss": 1.2182, + "step": 4140 + }, + { + "epoch": 1.5417156286721503, + "grad_norm": 0.17433500289916992, + "learning_rate": 1.9280212363812918e-05, + "loss": 1.2297, + "step": 4141 + }, + { + "epoch": 1.5420879338227595, + "grad_norm": 0.1642346978187561, + "learning_rate": 1.9279759556784673e-05, + "loss": 1.2103, + "step": 4142 + }, + { + "epoch": 1.5424602389733686, + "grad_norm": 0.172978937625885, + "learning_rate": 1.9279306612694963e-05, + "loss": 1.2225, + "step": 4143 + }, + { + "epoch": 1.5428325441239776, + "grad_norm": 0.17284728586673737, + "learning_rate": 1.9278853531550475e-05, + "loss": 1.2209, + "step": 4144 + }, + { + "epoch": 1.5432048492745867, + "grad_norm": 0.16585254669189453, + "learning_rate": 1.9278400313357902e-05, + "loss": 1.2256, + "step": 4145 + }, + { + "epoch": 1.5435771544251957, + "grad_norm": 0.1627526581287384, + "learning_rate": 1.927794695812394e-05, + "loss": 1.2172, + "step": 4146 + }, + { + "epoch": 1.543949459575805, + "grad_norm": 0.17496748268604279, + "learning_rate": 1.9277493465855287e-05, + "loss": 1.197, + "step": 4147 + }, + { + "epoch": 1.5443217647264138, + "grad_norm": 0.1616668850183487, + "learning_rate": 1.9277039836558635e-05, + "loss": 1.2088, + "step": 4148 + }, + { + "epoch": 1.544694069877023, + "grad_norm": 0.16618521511554718, + "learning_rate": 1.9276586070240684e-05, + "loss": 1.215, + "step": 4149 + }, + { + "epoch": 1.5450663750276319, + "grad_norm": 0.16764888167381287, + "learning_rate": 1.927613216690814e-05, + "loss": 1.2012, + "step": 4150 + }, + { + "epoch": 1.5454386801782412, + "grad_norm": 0.166963592171669, + "learning_rate": 1.9275678126567707e-05, + "loss": 1.2234, + "step": 4151 + }, + { + "epoch": 1.5458109853288502, + "grad_norm": 0.166300967335701, + "learning_rate": 1.927522394922609e-05, + "loss": 1.2119, + "step": 4152 + }, + { + "epoch": 1.5461832904794592, + "grad_norm": 0.16552546620368958, + "learning_rate": 1.9274769634889997e-05, + "loss": 1.1968, + "step": 4153 + }, + { + "epoch": 1.5465555956300683, + "grad_norm": 0.1656341403722763, + "learning_rate": 1.927431518356614e-05, + "loss": 1.2084, + "step": 4154 + }, + { + "epoch": 1.5469279007806773, + "grad_norm": 0.16084742546081543, + "learning_rate": 1.9273860595261232e-05, + "loss": 1.2007, + "step": 4155 + }, + { + "epoch": 1.5473002059312866, + "grad_norm": 0.16208963096141815, + "learning_rate": 1.927340586998198e-05, + "loss": 1.2251, + "step": 4156 + }, + { + "epoch": 1.5476725110818954, + "grad_norm": 0.16170455515384674, + "learning_rate": 1.9272951007735108e-05, + "loss": 1.2208, + "step": 4157 + }, + { + "epoch": 1.5480448162325047, + "grad_norm": 0.159234881401062, + "learning_rate": 1.927249600852733e-05, + "loss": 1.2103, + "step": 4158 + }, + { + "epoch": 1.5484171213831135, + "grad_norm": 0.17041054368019104, + "learning_rate": 1.927204087236537e-05, + "loss": 1.2171, + "step": 4159 + }, + { + "epoch": 1.5487894265337228, + "grad_norm": 0.16318999230861664, + "learning_rate": 1.9271585599255945e-05, + "loss": 1.2101, + "step": 4160 + }, + { + "epoch": 1.5491617316843318, + "grad_norm": 0.15880700945854187, + "learning_rate": 1.9271130189205786e-05, + "loss": 1.1997, + "step": 4161 + }, + { + "epoch": 1.5495340368349408, + "grad_norm": 0.16643404960632324, + "learning_rate": 1.9270674642221614e-05, + "loss": 1.193, + "step": 4162 + }, + { + "epoch": 1.5499063419855499, + "grad_norm": 0.17623448371887207, + "learning_rate": 1.927021895831016e-05, + "loss": 1.2188, + "step": 4163 + }, + { + "epoch": 1.550278647136159, + "grad_norm": 0.1635187864303589, + "learning_rate": 1.9269763137478152e-05, + "loss": 1.1997, + "step": 4164 + }, + { + "epoch": 1.5506509522867682, + "grad_norm": 0.16812646389007568, + "learning_rate": 1.9269307179732325e-05, + "loss": 1.2083, + "step": 4165 + }, + { + "epoch": 1.551023257437377, + "grad_norm": 0.164559006690979, + "learning_rate": 1.9268851085079415e-05, + "loss": 1.1901, + "step": 4166 + }, + { + "epoch": 1.5513955625879863, + "grad_norm": 0.1598326563835144, + "learning_rate": 1.9268394853526156e-05, + "loss": 1.198, + "step": 4167 + }, + { + "epoch": 1.551767867738595, + "grad_norm": 0.16687574982643127, + "learning_rate": 1.9267938485079285e-05, + "loss": 1.2133, + "step": 4168 + }, + { + "epoch": 1.5521401728892044, + "grad_norm": 0.16783174872398376, + "learning_rate": 1.9267481979745544e-05, + "loss": 1.2217, + "step": 4169 + }, + { + "epoch": 1.5525124780398134, + "grad_norm": 0.1639779806137085, + "learning_rate": 1.9267025337531678e-05, + "loss": 1.2243, + "step": 4170 + }, + { + "epoch": 1.5528847831904224, + "grad_norm": 0.16327700018882751, + "learning_rate": 1.9266568558444426e-05, + "loss": 1.2213, + "step": 4171 + }, + { + "epoch": 1.5532570883410315, + "grad_norm": 0.16662442684173584, + "learning_rate": 1.926611164249054e-05, + "loss": 1.198, + "step": 4172 + }, + { + "epoch": 1.5536293934916405, + "grad_norm": 0.1651596873998642, + "learning_rate": 1.9265654589676767e-05, + "loss": 1.2113, + "step": 4173 + }, + { + "epoch": 1.5540016986422498, + "grad_norm": 0.16614238917827606, + "learning_rate": 1.9265197400009854e-05, + "loss": 1.2092, + "step": 4174 + }, + { + "epoch": 1.5543740037928586, + "grad_norm": 0.1736796498298645, + "learning_rate": 1.926474007349656e-05, + "loss": 1.2147, + "step": 4175 + }, + { + "epoch": 1.5547463089434679, + "grad_norm": 0.1689198911190033, + "learning_rate": 1.9264282610143638e-05, + "loss": 1.2119, + "step": 4176 + }, + { + "epoch": 1.5551186140940767, + "grad_norm": 0.16373711824417114, + "learning_rate": 1.926382500995784e-05, + "loss": 1.2077, + "step": 4177 + }, + { + "epoch": 1.555490919244686, + "grad_norm": 0.1627718210220337, + "learning_rate": 1.9263367272945927e-05, + "loss": 1.2128, + "step": 4178 + }, + { + "epoch": 1.555863224395295, + "grad_norm": 0.16306252777576447, + "learning_rate": 1.9262909399114663e-05, + "loss": 1.218, + "step": 4179 + }, + { + "epoch": 1.556235529545904, + "grad_norm": 0.16775202751159668, + "learning_rate": 1.9262451388470807e-05, + "loss": 1.2039, + "step": 4180 + }, + { + "epoch": 1.556607834696513, + "grad_norm": 0.1653129607439041, + "learning_rate": 1.926199324102113e-05, + "loss": 1.2081, + "step": 4181 + }, + { + "epoch": 1.5569801398471221, + "grad_norm": 0.1624881476163864, + "learning_rate": 1.9261534956772386e-05, + "loss": 1.2163, + "step": 4182 + }, + { + "epoch": 1.5573524449977314, + "grad_norm": 0.16071651875972748, + "learning_rate": 1.9261076535731356e-05, + "loss": 1.2008, + "step": 4183 + }, + { + "epoch": 1.5577247501483402, + "grad_norm": 0.17238472402095795, + "learning_rate": 1.926061797790481e-05, + "loss": 1.221, + "step": 4184 + }, + { + "epoch": 1.5580970552989495, + "grad_norm": 0.17098172008991241, + "learning_rate": 1.9260159283299514e-05, + "loss": 1.206, + "step": 4185 + }, + { + "epoch": 1.5584693604495583, + "grad_norm": 0.164502814412117, + "learning_rate": 1.9259700451922247e-05, + "loss": 1.2157, + "step": 4186 + }, + { + "epoch": 1.5588416656001676, + "grad_norm": 0.17409628629684448, + "learning_rate": 1.9259241483779787e-05, + "loss": 1.226, + "step": 4187 + }, + { + "epoch": 1.5592139707507766, + "grad_norm": 0.17351455986499786, + "learning_rate": 1.9258782378878908e-05, + "loss": 1.2167, + "step": 4188 + }, + { + "epoch": 1.5595862759013857, + "grad_norm": 0.16940516233444214, + "learning_rate": 1.92583231372264e-05, + "loss": 1.2044, + "step": 4189 + }, + { + "epoch": 1.5599585810519947, + "grad_norm": 0.16605758666992188, + "learning_rate": 1.9257863758829038e-05, + "loss": 1.2009, + "step": 4190 + }, + { + "epoch": 1.5603308862026037, + "grad_norm": 0.17315179109573364, + "learning_rate": 1.9257404243693606e-05, + "loss": 1.2139, + "step": 4191 + }, + { + "epoch": 1.560703191353213, + "grad_norm": 0.1688748598098755, + "learning_rate": 1.9256944591826893e-05, + "loss": 1.2212, + "step": 4192 + }, + { + "epoch": 1.5610754965038218, + "grad_norm": 0.16964520514011383, + "learning_rate": 1.9256484803235696e-05, + "loss": 1.2092, + "step": 4193 + }, + { + "epoch": 1.561447801654431, + "grad_norm": 0.16854505240917206, + "learning_rate": 1.9256024877926796e-05, + "loss": 1.2015, + "step": 4194 + }, + { + "epoch": 1.5618201068050401, + "grad_norm": 0.168868288397789, + "learning_rate": 1.9255564815906988e-05, + "loss": 1.2013, + "step": 4195 + }, + { + "epoch": 1.5621924119556492, + "grad_norm": 0.17103056609630585, + "learning_rate": 1.9255104617183068e-05, + "loss": 1.211, + "step": 4196 + }, + { + "epoch": 1.5625647171062582, + "grad_norm": 0.1685587465763092, + "learning_rate": 1.9254644281761838e-05, + "loss": 1.1984, + "step": 4197 + }, + { + "epoch": 1.5629370222568673, + "grad_norm": 0.17588046193122864, + "learning_rate": 1.925418380965009e-05, + "loss": 1.2144, + "step": 4198 + }, + { + "epoch": 1.5633093274074763, + "grad_norm": 0.16657619178295135, + "learning_rate": 1.925372320085463e-05, + "loss": 1.2011, + "step": 4199 + }, + { + "epoch": 1.5636816325580853, + "grad_norm": 0.16856853663921356, + "learning_rate": 1.9253262455382256e-05, + "loss": 1.2057, + "step": 4200 + }, + { + "epoch": 1.5640539377086946, + "grad_norm": 0.16726350784301758, + "learning_rate": 1.925280157323978e-05, + "loss": 1.2062, + "step": 4201 + }, + { + "epoch": 1.5644262428593034, + "grad_norm": 0.18542660772800446, + "learning_rate": 1.9252340554434003e-05, + "loss": 1.21, + "step": 4202 + }, + { + "epoch": 1.5647985480099127, + "grad_norm": 0.17337799072265625, + "learning_rate": 1.9251879398971733e-05, + "loss": 1.203, + "step": 4203 + }, + { + "epoch": 1.5651708531605217, + "grad_norm": 0.1717308908700943, + "learning_rate": 1.925141810685979e-05, + "loss": 1.22, + "step": 4204 + }, + { + "epoch": 1.5655431583111308, + "grad_norm": 0.1700487732887268, + "learning_rate": 1.925095667810498e-05, + "loss": 1.2018, + "step": 4205 + }, + { + "epoch": 1.5659154634617398, + "grad_norm": 0.1768191158771515, + "learning_rate": 1.9250495112714123e-05, + "loss": 1.1853, + "step": 4206 + }, + { + "epoch": 1.5662877686123489, + "grad_norm": 0.17097093164920807, + "learning_rate": 1.9250033410694032e-05, + "loss": 1.2047, + "step": 4207 + }, + { + "epoch": 1.5666600737629581, + "grad_norm": 0.1712087094783783, + "learning_rate": 1.9249571572051528e-05, + "loss": 1.2028, + "step": 4208 + }, + { + "epoch": 1.567032378913567, + "grad_norm": 0.16526564955711365, + "learning_rate": 1.924910959679343e-05, + "loss": 1.205, + "step": 4209 + }, + { + "epoch": 1.5674046840641762, + "grad_norm": 0.16260464489459991, + "learning_rate": 1.9248647484926568e-05, + "loss": 1.201, + "step": 4210 + }, + { + "epoch": 1.567776989214785, + "grad_norm": 0.1643940955400467, + "learning_rate": 1.924818523645776e-05, + "loss": 1.2147, + "step": 4211 + }, + { + "epoch": 1.5681492943653943, + "grad_norm": 0.17409846186637878, + "learning_rate": 1.9247722851393838e-05, + "loss": 1.2043, + "step": 4212 + }, + { + "epoch": 1.5685215995160033, + "grad_norm": 0.16997475922107697, + "learning_rate": 1.924726032974163e-05, + "loss": 1.2171, + "step": 4213 + }, + { + "epoch": 1.5688939046666124, + "grad_norm": 0.171822652220726, + "learning_rate": 1.9246797671507966e-05, + "loss": 1.2126, + "step": 4214 + }, + { + "epoch": 1.5692662098172214, + "grad_norm": 0.1666400134563446, + "learning_rate": 1.9246334876699682e-05, + "loss": 1.2152, + "step": 4215 + }, + { + "epoch": 1.5696385149678305, + "grad_norm": 0.17441390454769135, + "learning_rate": 1.924587194532361e-05, + "loss": 1.2002, + "step": 4216 + }, + { + "epoch": 1.5700108201184397, + "grad_norm": 0.18336564302444458, + "learning_rate": 1.9245408877386592e-05, + "loss": 1.2119, + "step": 4217 + }, + { + "epoch": 1.5703831252690486, + "grad_norm": 0.16609658300876617, + "learning_rate": 1.9244945672895464e-05, + "loss": 1.2084, + "step": 4218 + }, + { + "epoch": 1.5707554304196578, + "grad_norm": 0.17070931196212769, + "learning_rate": 1.924448233185707e-05, + "loss": 1.2045, + "step": 4219 + }, + { + "epoch": 1.5711277355702666, + "grad_norm": 0.16091009974479675, + "learning_rate": 1.924401885427825e-05, + "loss": 1.1902, + "step": 4220 + }, + { + "epoch": 1.571500040720876, + "grad_norm": 0.1693873405456543, + "learning_rate": 1.9243555240165855e-05, + "loss": 1.2213, + "step": 4221 + }, + { + "epoch": 1.571872345871485, + "grad_norm": 0.17035622894763947, + "learning_rate": 1.9243091489526728e-05, + "loss": 1.2, + "step": 4222 + }, + { + "epoch": 1.572244651022094, + "grad_norm": 0.17594295740127563, + "learning_rate": 1.9242627602367716e-05, + "loss": 1.2161, + "step": 4223 + }, + { + "epoch": 1.572616956172703, + "grad_norm": 0.16813448071479797, + "learning_rate": 1.924216357869568e-05, + "loss": 1.2029, + "step": 4224 + }, + { + "epoch": 1.572989261323312, + "grad_norm": 0.18052363395690918, + "learning_rate": 1.924169941851746e-05, + "loss": 1.2017, + "step": 4225 + }, + { + "epoch": 1.5733615664739213, + "grad_norm": 0.18110094964504242, + "learning_rate": 1.9241235121839927e-05, + "loss": 1.1987, + "step": 4226 + }, + { + "epoch": 1.5737338716245302, + "grad_norm": 0.17271529138088226, + "learning_rate": 1.9240770688669928e-05, + "loss": 1.2058, + "step": 4227 + }, + { + "epoch": 1.5741061767751394, + "grad_norm": 0.1713894158601761, + "learning_rate": 1.924030611901433e-05, + "loss": 1.2011, + "step": 4228 + }, + { + "epoch": 1.5744784819257482, + "grad_norm": 0.17483104765415192, + "learning_rate": 1.9239841412879983e-05, + "loss": 1.2166, + "step": 4229 + }, + { + "epoch": 1.5748507870763575, + "grad_norm": 0.17396202683448792, + "learning_rate": 1.9239376570273763e-05, + "loss": 1.2059, + "step": 4230 + }, + { + "epoch": 1.5752230922269665, + "grad_norm": 0.16693198680877686, + "learning_rate": 1.923891159120253e-05, + "loss": 1.2113, + "step": 4231 + }, + { + "epoch": 1.5755953973775756, + "grad_norm": 0.17103157937526703, + "learning_rate": 1.9238446475673155e-05, + "loss": 1.2166, + "step": 4232 + }, + { + "epoch": 1.5759677025281846, + "grad_norm": 0.1680799126625061, + "learning_rate": 1.92379812236925e-05, + "loss": 1.2189, + "step": 4233 + }, + { + "epoch": 1.5763400076787937, + "grad_norm": 0.18774433434009552, + "learning_rate": 1.9237515835267447e-05, + "loss": 1.2121, + "step": 4234 + }, + { + "epoch": 1.576712312829403, + "grad_norm": 0.16608703136444092, + "learning_rate": 1.9237050310404862e-05, + "loss": 1.1935, + "step": 4235 + }, + { + "epoch": 1.5770846179800118, + "grad_norm": 0.17058461904525757, + "learning_rate": 1.9236584649111628e-05, + "loss": 1.2014, + "step": 4236 + }, + { + "epoch": 1.577456923130621, + "grad_norm": 0.16331979632377625, + "learning_rate": 1.923611885139461e-05, + "loss": 1.2044, + "step": 4237 + }, + { + "epoch": 1.5778292282812298, + "grad_norm": 0.17175064980983734, + "learning_rate": 1.9235652917260705e-05, + "loss": 1.2032, + "step": 4238 + }, + { + "epoch": 1.578201533431839, + "grad_norm": 0.1720954328775406, + "learning_rate": 1.9235186846716784e-05, + "loss": 1.2092, + "step": 4239 + }, + { + "epoch": 1.5785738385824482, + "grad_norm": 0.16295336186885834, + "learning_rate": 1.923472063976973e-05, + "loss": 1.1955, + "step": 4240 + }, + { + "epoch": 1.5789461437330572, + "grad_norm": 0.16375908255577087, + "learning_rate": 1.923425429642643e-05, + "loss": 1.1909, + "step": 4241 + }, + { + "epoch": 1.5793184488836662, + "grad_norm": 0.1621014028787613, + "learning_rate": 1.923378781669378e-05, + "loss": 1.201, + "step": 4242 + }, + { + "epoch": 1.5796907540342753, + "grad_norm": 0.1648714393377304, + "learning_rate": 1.9233321200578657e-05, + "loss": 1.2119, + "step": 4243 + }, + { + "epoch": 1.5800630591848845, + "grad_norm": 0.16513188183307648, + "learning_rate": 1.9232854448087962e-05, + "loss": 1.2129, + "step": 4244 + }, + { + "epoch": 1.5804353643354934, + "grad_norm": 0.1719852089881897, + "learning_rate": 1.9232387559228587e-05, + "loss": 1.2095, + "step": 4245 + }, + { + "epoch": 1.5808076694861026, + "grad_norm": 0.16509710252285004, + "learning_rate": 1.9231920534007428e-05, + "loss": 1.2118, + "step": 4246 + }, + { + "epoch": 1.5811799746367114, + "grad_norm": 0.16327592730522156, + "learning_rate": 1.923145337243138e-05, + "loss": 1.2081, + "step": 4247 + }, + { + "epoch": 1.5815522797873207, + "grad_norm": 0.17513926327228546, + "learning_rate": 1.9230986074507347e-05, + "loss": 1.2215, + "step": 4248 + }, + { + "epoch": 1.5819245849379298, + "grad_norm": 0.17169274389743805, + "learning_rate": 1.9230518640242228e-05, + "loss": 1.2164, + "step": 4249 + }, + { + "epoch": 1.5822968900885388, + "grad_norm": 0.17580832540988922, + "learning_rate": 1.9230051069642927e-05, + "loss": 1.2066, + "step": 4250 + }, + { + "epoch": 1.5826691952391478, + "grad_norm": 0.16311529278755188, + "learning_rate": 1.922958336271635e-05, + "loss": 1.2061, + "step": 4251 + }, + { + "epoch": 1.5830415003897569, + "grad_norm": 0.18009954690933228, + "learning_rate": 1.922911551946941e-05, + "loss": 1.2161, + "step": 4252 + }, + { + "epoch": 1.5834138055403661, + "grad_norm": 0.17402635514736176, + "learning_rate": 1.922864753990901e-05, + "loss": 1.228, + "step": 4253 + }, + { + "epoch": 1.583786110690975, + "grad_norm": 0.17226436734199524, + "learning_rate": 1.9228179424042064e-05, + "loss": 1.209, + "step": 4254 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.1807158887386322, + "learning_rate": 1.9227711171875486e-05, + "loss": 1.202, + "step": 4255 + }, + { + "epoch": 1.5845307209921933, + "grad_norm": 0.16312603652477264, + "learning_rate": 1.92272427834162e-05, + "loss": 1.2014, + "step": 4256 + }, + { + "epoch": 1.5849030261428023, + "grad_norm": 0.16579662263393402, + "learning_rate": 1.9226774258671112e-05, + "loss": 1.208, + "step": 4257 + }, + { + "epoch": 1.5852753312934114, + "grad_norm": 0.1716030240058899, + "learning_rate": 1.9226305597647145e-05, + "loss": 1.2052, + "step": 4258 + }, + { + "epoch": 1.5856476364440204, + "grad_norm": 0.16442036628723145, + "learning_rate": 1.922583680035123e-05, + "loss": 1.2109, + "step": 4259 + }, + { + "epoch": 1.5860199415946294, + "grad_norm": 0.15719719231128693, + "learning_rate": 1.922536786679028e-05, + "loss": 1.2087, + "step": 4260 + }, + { + "epoch": 1.5863922467452385, + "grad_norm": 0.16500256955623627, + "learning_rate": 1.9224898796971224e-05, + "loss": 1.2206, + "step": 4261 + }, + { + "epoch": 1.5867645518958478, + "grad_norm": 0.17196062207221985, + "learning_rate": 1.9224429590900997e-05, + "loss": 1.206, + "step": 4262 + }, + { + "epoch": 1.5871368570464566, + "grad_norm": 0.17409005761146545, + "learning_rate": 1.9223960248586523e-05, + "loss": 1.2111, + "step": 4263 + }, + { + "epoch": 1.5875091621970658, + "grad_norm": 0.17429877817630768, + "learning_rate": 1.922349077003473e-05, + "loss": 1.2316, + "step": 4264 + }, + { + "epoch": 1.5878814673476749, + "grad_norm": 0.1687888205051422, + "learning_rate": 1.922302115525256e-05, + "loss": 1.2126, + "step": 4265 + }, + { + "epoch": 1.588253772498284, + "grad_norm": 0.1785813719034195, + "learning_rate": 1.922255140424695e-05, + "loss": 1.2094, + "step": 4266 + }, + { + "epoch": 1.588626077648893, + "grad_norm": 0.1841798722743988, + "learning_rate": 1.922208151702483e-05, + "loss": 1.2105, + "step": 4267 + }, + { + "epoch": 1.588998382799502, + "grad_norm": 0.16933433711528778, + "learning_rate": 1.9221611493593145e-05, + "loss": 1.2078, + "step": 4268 + }, + { + "epoch": 1.5893706879501113, + "grad_norm": 0.21182604134082794, + "learning_rate": 1.9221141333958837e-05, + "loss": 1.2086, + "step": 4269 + }, + { + "epoch": 1.58974299310072, + "grad_norm": 0.18182890117168427, + "learning_rate": 1.922067103812885e-05, + "loss": 1.2054, + "step": 4270 + }, + { + "epoch": 1.5901152982513294, + "grad_norm": 0.18493826687335968, + "learning_rate": 1.9220200606110132e-05, + "loss": 1.2018, + "step": 4271 + }, + { + "epoch": 1.5904876034019382, + "grad_norm": 0.17452014982700348, + "learning_rate": 1.921973003790963e-05, + "loss": 1.2287, + "step": 4272 + }, + { + "epoch": 1.5908599085525474, + "grad_norm": 0.16467060148715973, + "learning_rate": 1.9219259333534292e-05, + "loss": 1.2003, + "step": 4273 + }, + { + "epoch": 1.5912322137031565, + "grad_norm": 0.18403173983097076, + "learning_rate": 1.9218788492991075e-05, + "loss": 1.228, + "step": 4274 + }, + { + "epoch": 1.5916045188537655, + "grad_norm": 0.18004946410655975, + "learning_rate": 1.921831751628693e-05, + "loss": 1.202, + "step": 4275 + }, + { + "epoch": 1.5919768240043746, + "grad_norm": 0.16663858294487, + "learning_rate": 1.921784640342881e-05, + "loss": 1.2187, + "step": 4276 + }, + { + "epoch": 1.5923491291549836, + "grad_norm": 0.17807120084762573, + "learning_rate": 1.921737515442368e-05, + "loss": 1.2171, + "step": 4277 + }, + { + "epoch": 1.5927214343055929, + "grad_norm": 0.18053670227527618, + "learning_rate": 1.9216903769278498e-05, + "loss": 1.2175, + "step": 4278 + }, + { + "epoch": 1.5930937394562017, + "grad_norm": 0.17122212052345276, + "learning_rate": 1.9216432248000224e-05, + "loss": 1.2032, + "step": 4279 + }, + { + "epoch": 1.593466044606811, + "grad_norm": 0.16505327820777893, + "learning_rate": 1.9215960590595824e-05, + "loss": 1.2039, + "step": 4280 + }, + { + "epoch": 1.5938383497574198, + "grad_norm": 0.18405233323574066, + "learning_rate": 1.9215488797072267e-05, + "loss": 1.212, + "step": 4281 + }, + { + "epoch": 1.594210654908029, + "grad_norm": 0.17196938395500183, + "learning_rate": 1.9215016867436516e-05, + "loss": 1.2118, + "step": 4282 + }, + { + "epoch": 1.594582960058638, + "grad_norm": 0.16480191051959991, + "learning_rate": 1.9214544801695547e-05, + "loss": 1.2089, + "step": 4283 + }, + { + "epoch": 1.5949552652092471, + "grad_norm": 0.16730982065200806, + "learning_rate": 1.9214072599856326e-05, + "loss": 1.2027, + "step": 4284 + }, + { + "epoch": 1.5953275703598562, + "grad_norm": 0.17092877626419067, + "learning_rate": 1.9213600261925832e-05, + "loss": 1.2149, + "step": 4285 + }, + { + "epoch": 1.5956998755104652, + "grad_norm": 0.1654144823551178, + "learning_rate": 1.9213127787911045e-05, + "loss": 1.2017, + "step": 4286 + }, + { + "epoch": 1.5960721806610745, + "grad_norm": 0.17047090828418732, + "learning_rate": 1.9212655177818935e-05, + "loss": 1.1947, + "step": 4287 + }, + { + "epoch": 1.5964444858116833, + "grad_norm": 0.17292653024196625, + "learning_rate": 1.9212182431656487e-05, + "loss": 1.2226, + "step": 4288 + }, + { + "epoch": 1.5968167909622926, + "grad_norm": 0.16899065673351288, + "learning_rate": 1.9211709549430678e-05, + "loss": 1.196, + "step": 4289 + }, + { + "epoch": 1.5971890961129014, + "grad_norm": 0.16645705699920654, + "learning_rate": 1.92112365311485e-05, + "loss": 1.2066, + "step": 4290 + }, + { + "epoch": 1.5975614012635106, + "grad_norm": 0.1680358499288559, + "learning_rate": 1.921076337681694e-05, + "loss": 1.2134, + "step": 4291 + }, + { + "epoch": 1.5979337064141197, + "grad_norm": 0.16596710681915283, + "learning_rate": 1.9210290086442983e-05, + "loss": 1.2085, + "step": 4292 + }, + { + "epoch": 1.5983060115647287, + "grad_norm": 0.1577194184064865, + "learning_rate": 1.9209816660033613e-05, + "loss": 1.1921, + "step": 4293 + }, + { + "epoch": 1.5986783167153378, + "grad_norm": 0.1676270067691803, + "learning_rate": 1.9209343097595834e-05, + "loss": 1.2075, + "step": 4294 + }, + { + "epoch": 1.5990506218659468, + "grad_norm": 0.16375909745693207, + "learning_rate": 1.9208869399136633e-05, + "loss": 1.2086, + "step": 4295 + }, + { + "epoch": 1.599422927016556, + "grad_norm": 0.16282792389392853, + "learning_rate": 1.9208395564663012e-05, + "loss": 1.2115, + "step": 4296 + }, + { + "epoch": 1.599795232167165, + "grad_norm": 0.16538910567760468, + "learning_rate": 1.9207921594181964e-05, + "loss": 1.2063, + "step": 4297 + }, + { + "epoch": 1.6001675373177742, + "grad_norm": 0.1656702756881714, + "learning_rate": 1.9207447487700494e-05, + "loss": 1.2123, + "step": 4298 + }, + { + "epoch": 1.600539842468383, + "grad_norm": 0.16824668645858765, + "learning_rate": 1.92069732452256e-05, + "loss": 1.2085, + "step": 4299 + }, + { + "epoch": 1.6009121476189923, + "grad_norm": 0.16301900148391724, + "learning_rate": 1.920649886676429e-05, + "loss": 1.2227, + "step": 4300 + }, + { + "epoch": 1.6012844527696013, + "grad_norm": 0.173927441239357, + "learning_rate": 1.920602435232357e-05, + "loss": 1.2114, + "step": 4301 + }, + { + "epoch": 1.6016567579202103, + "grad_norm": 0.17178916931152344, + "learning_rate": 1.9205549701910445e-05, + "loss": 1.2076, + "step": 4302 + }, + { + "epoch": 1.6020290630708194, + "grad_norm": 0.16292187571525574, + "learning_rate": 1.920507491553193e-05, + "loss": 1.2231, + "step": 4303 + }, + { + "epoch": 1.6024013682214284, + "grad_norm": 0.1758165806531906, + "learning_rate": 1.9204599993195038e-05, + "loss": 1.2088, + "step": 4304 + }, + { + "epoch": 1.6027736733720377, + "grad_norm": 0.1699971705675125, + "learning_rate": 1.920412493490678e-05, + "loss": 1.2018, + "step": 4305 + }, + { + "epoch": 1.6031459785226465, + "grad_norm": 0.16813789308071136, + "learning_rate": 1.920364974067418e-05, + "loss": 1.2054, + "step": 4306 + }, + { + "epoch": 1.6035182836732558, + "grad_norm": 0.16365012526512146, + "learning_rate": 1.9203174410504243e-05, + "loss": 1.2084, + "step": 4307 + }, + { + "epoch": 1.6038905888238648, + "grad_norm": 0.16683803498744965, + "learning_rate": 1.9202698944404002e-05, + "loss": 1.2064, + "step": 4308 + }, + { + "epoch": 1.6042628939744739, + "grad_norm": 0.16585657000541687, + "learning_rate": 1.9202223342380475e-05, + "loss": 1.2156, + "step": 4309 + }, + { + "epoch": 1.604635199125083, + "grad_norm": 0.16558226943016052, + "learning_rate": 1.9201747604440686e-05, + "loss": 1.2181, + "step": 4310 + }, + { + "epoch": 1.605007504275692, + "grad_norm": 0.1677953600883484, + "learning_rate": 1.920127173059166e-05, + "loss": 1.2002, + "step": 4311 + }, + { + "epoch": 1.605379809426301, + "grad_norm": 0.16227871179580688, + "learning_rate": 1.920079572084043e-05, + "loss": 1.201, + "step": 4312 + }, + { + "epoch": 1.60575211457691, + "grad_norm": 0.16449685394763947, + "learning_rate": 1.9200319575194025e-05, + "loss": 1.2128, + "step": 4313 + }, + { + "epoch": 1.6061244197275193, + "grad_norm": 0.1711970865726471, + "learning_rate": 1.919984329365948e-05, + "loss": 1.2181, + "step": 4314 + }, + { + "epoch": 1.6064967248781281, + "grad_norm": 0.16856355965137482, + "learning_rate": 1.919936687624382e-05, + "loss": 1.2065, + "step": 4315 + }, + { + "epoch": 1.6068690300287374, + "grad_norm": 0.1796281933784485, + "learning_rate": 1.9198890322954092e-05, + "loss": 1.1957, + "step": 4316 + }, + { + "epoch": 1.6072413351793464, + "grad_norm": 0.17579133808612823, + "learning_rate": 1.9198413633797334e-05, + "loss": 1.2143, + "step": 4317 + }, + { + "epoch": 1.6076136403299555, + "grad_norm": 0.16579852998256683, + "learning_rate": 1.919793680878058e-05, + "loss": 1.2078, + "step": 4318 + }, + { + "epoch": 1.6079859454805645, + "grad_norm": 0.16327063739299774, + "learning_rate": 1.9197459847910878e-05, + "loss": 1.2, + "step": 4319 + }, + { + "epoch": 1.6083582506311735, + "grad_norm": 0.16912123560905457, + "learning_rate": 1.9196982751195272e-05, + "loss": 1.2125, + "step": 4320 + }, + { + "epoch": 1.6087305557817826, + "grad_norm": 0.16993340849876404, + "learning_rate": 1.9196505518640807e-05, + "loss": 1.2124, + "step": 4321 + }, + { + "epoch": 1.6091028609323916, + "grad_norm": 0.16973137855529785, + "learning_rate": 1.9196028150254535e-05, + "loss": 1.2082, + "step": 4322 + }, + { + "epoch": 1.609475166083001, + "grad_norm": 0.1582946479320526, + "learning_rate": 1.91955506460435e-05, + "loss": 1.2028, + "step": 4323 + }, + { + "epoch": 1.6098474712336097, + "grad_norm": 0.17046526074409485, + "learning_rate": 1.9195073006014762e-05, + "loss": 1.2113, + "step": 4324 + }, + { + "epoch": 1.610219776384219, + "grad_norm": 0.17357958853244781, + "learning_rate": 1.9194595230175373e-05, + "loss": 1.2014, + "step": 4325 + }, + { + "epoch": 1.610592081534828, + "grad_norm": 0.17264226078987122, + "learning_rate": 1.9194117318532387e-05, + "loss": 1.1983, + "step": 4326 + }, + { + "epoch": 1.610964386685437, + "grad_norm": 0.1646987348794937, + "learning_rate": 1.9193639271092866e-05, + "loss": 1.2043, + "step": 4327 + }, + { + "epoch": 1.611336691836046, + "grad_norm": 0.17559926211833954, + "learning_rate": 1.919316108786387e-05, + "loss": 1.2055, + "step": 4328 + }, + { + "epoch": 1.6117089969866552, + "grad_norm": 0.1712273806333542, + "learning_rate": 1.9192682768852464e-05, + "loss": 1.2072, + "step": 4329 + }, + { + "epoch": 1.6120813021372644, + "grad_norm": 0.16717827320098877, + "learning_rate": 1.919220431406571e-05, + "loss": 1.2085, + "step": 4330 + }, + { + "epoch": 1.6124536072878732, + "grad_norm": 0.1609518676996231, + "learning_rate": 1.919172572351067e-05, + "loss": 1.2075, + "step": 4331 + }, + { + "epoch": 1.6128259124384825, + "grad_norm": 0.1639975607395172, + "learning_rate": 1.9191246997194426e-05, + "loss": 1.2182, + "step": 4332 + }, + { + "epoch": 1.6131982175890913, + "grad_norm": 0.16888481378555298, + "learning_rate": 1.9190768135124034e-05, + "loss": 1.1964, + "step": 4333 + }, + { + "epoch": 1.6135705227397006, + "grad_norm": 0.17243389785289764, + "learning_rate": 1.9190289137306577e-05, + "loss": 1.2095, + "step": 4334 + }, + { + "epoch": 1.6139428278903096, + "grad_norm": 0.16882330179214478, + "learning_rate": 1.9189810003749125e-05, + "loss": 1.2119, + "step": 4335 + }, + { + "epoch": 1.6143151330409187, + "grad_norm": 0.16220669448375702, + "learning_rate": 1.9189330734458757e-05, + "loss": 1.2105, + "step": 4336 + }, + { + "epoch": 1.6146874381915277, + "grad_norm": 0.16079674661159515, + "learning_rate": 1.918885132944255e-05, + "loss": 1.2148, + "step": 4337 + }, + { + "epoch": 1.6150597433421368, + "grad_norm": 0.17351467907428741, + "learning_rate": 1.9188371788707585e-05, + "loss": 1.2062, + "step": 4338 + }, + { + "epoch": 1.615432048492746, + "grad_norm": 0.1602046936750412, + "learning_rate": 1.9187892112260944e-05, + "loss": 1.203, + "step": 4339 + }, + { + "epoch": 1.6158043536433548, + "grad_norm": 0.1660042405128479, + "learning_rate": 1.9187412300109714e-05, + "loss": 1.2168, + "step": 4340 + }, + { + "epoch": 1.616176658793964, + "grad_norm": 0.16898569464683533, + "learning_rate": 1.9186932352260984e-05, + "loss": 1.2069, + "step": 4341 + }, + { + "epoch": 1.616548963944573, + "grad_norm": 0.17615294456481934, + "learning_rate": 1.9186452268721838e-05, + "loss": 1.2044, + "step": 4342 + }, + { + "epoch": 1.6169212690951822, + "grad_norm": 0.17343638837337494, + "learning_rate": 1.9185972049499368e-05, + "loss": 1.2107, + "step": 4343 + }, + { + "epoch": 1.6172935742457912, + "grad_norm": 0.15935227274894714, + "learning_rate": 1.9185491694600668e-05, + "loss": 1.2153, + "step": 4344 + }, + { + "epoch": 1.6176658793964003, + "grad_norm": 0.1717858910560608, + "learning_rate": 1.9185011204032832e-05, + "loss": 1.211, + "step": 4345 + }, + { + "epoch": 1.6180381845470093, + "grad_norm": 0.17080077528953552, + "learning_rate": 1.9184530577802953e-05, + "loss": 1.1987, + "step": 4346 + }, + { + "epoch": 1.6184104896976184, + "grad_norm": 0.1667952537536621, + "learning_rate": 1.918404981591814e-05, + "loss": 1.2066, + "step": 4347 + }, + { + "epoch": 1.6187827948482276, + "grad_norm": 0.17135196924209595, + "learning_rate": 1.9183568918385484e-05, + "loss": 1.1975, + "step": 4348 + }, + { + "epoch": 1.6191550999988364, + "grad_norm": 0.16429933905601501, + "learning_rate": 1.918308788521209e-05, + "loss": 1.1992, + "step": 4349 + }, + { + "epoch": 1.6195274051494457, + "grad_norm": 0.16419640183448792, + "learning_rate": 1.918260671640507e-05, + "loss": 1.1938, + "step": 4350 + }, + { + "epoch": 1.6198997103000545, + "grad_norm": 0.1755063682794571, + "learning_rate": 1.9182125411971522e-05, + "loss": 1.2272, + "step": 4351 + }, + { + "epoch": 1.6202720154506638, + "grad_norm": 0.17690351605415344, + "learning_rate": 1.9181643971918557e-05, + "loss": 1.1957, + "step": 4352 + }, + { + "epoch": 1.6206443206012728, + "grad_norm": 0.16387419402599335, + "learning_rate": 1.9181162396253286e-05, + "loss": 1.1932, + "step": 4353 + }, + { + "epoch": 1.6210166257518819, + "grad_norm": 0.1648489236831665, + "learning_rate": 1.918068068498282e-05, + "loss": 1.2002, + "step": 4354 + }, + { + "epoch": 1.621388930902491, + "grad_norm": 0.171859472990036, + "learning_rate": 1.9180198838114284e-05, + "loss": 1.1869, + "step": 4355 + }, + { + "epoch": 1.6217612360531, + "grad_norm": 0.1747763305902481, + "learning_rate": 1.9179716855654783e-05, + "loss": 1.2189, + "step": 4356 + }, + { + "epoch": 1.6221335412037092, + "grad_norm": 0.1828392744064331, + "learning_rate": 1.917923473761144e-05, + "loss": 1.1953, + "step": 4357 + }, + { + "epoch": 1.622505846354318, + "grad_norm": 0.16498731076717377, + "learning_rate": 1.917875248399138e-05, + "loss": 1.2088, + "step": 4358 + }, + { + "epoch": 1.6228781515049273, + "grad_norm": 0.17804205417633057, + "learning_rate": 1.9178270094801713e-05, + "loss": 1.2074, + "step": 4359 + }, + { + "epoch": 1.6232504566555361, + "grad_norm": 0.1724364459514618, + "learning_rate": 1.917778757004958e-05, + "loss": 1.2236, + "step": 4360 + }, + { + "epoch": 1.6236227618061454, + "grad_norm": 0.17162750661373138, + "learning_rate": 1.91773049097421e-05, + "loss": 1.2016, + "step": 4361 + }, + { + "epoch": 1.6239950669567544, + "grad_norm": 0.17239919304847717, + "learning_rate": 1.91768221138864e-05, + "loss": 1.1871, + "step": 4362 + }, + { + "epoch": 1.6243673721073635, + "grad_norm": 0.1668175607919693, + "learning_rate": 1.9176339182489614e-05, + "loss": 1.1971, + "step": 4363 + }, + { + "epoch": 1.6247396772579725, + "grad_norm": 0.1689068228006363, + "learning_rate": 1.9175856115558876e-05, + "loss": 1.2086, + "step": 4364 + }, + { + "epoch": 1.6251119824085816, + "grad_norm": 0.16748180985450745, + "learning_rate": 1.9175372913101317e-05, + "loss": 1.2124, + "step": 4365 + }, + { + "epoch": 1.6254842875591908, + "grad_norm": 0.16516231000423431, + "learning_rate": 1.9174889575124077e-05, + "loss": 1.1986, + "step": 4366 + }, + { + "epoch": 1.6258565927097997, + "grad_norm": 0.16919684410095215, + "learning_rate": 1.9174406101634294e-05, + "loss": 1.2042, + "step": 4367 + }, + { + "epoch": 1.626228897860409, + "grad_norm": 0.1688244789838791, + "learning_rate": 1.917392249263911e-05, + "loss": 1.1963, + "step": 4368 + }, + { + "epoch": 1.626601203011018, + "grad_norm": 0.17357051372528076, + "learning_rate": 1.917343874814566e-05, + "loss": 1.2133, + "step": 4369 + }, + { + "epoch": 1.626973508161627, + "grad_norm": 0.16557662189006805, + "learning_rate": 1.9172954868161098e-05, + "loss": 1.1971, + "step": 4370 + }, + { + "epoch": 1.627345813312236, + "grad_norm": 0.16236409544944763, + "learning_rate": 1.9172470852692572e-05, + "loss": 1.1992, + "step": 4371 + }, + { + "epoch": 1.627718118462845, + "grad_norm": 0.16975420713424683, + "learning_rate": 1.9171986701747227e-05, + "loss": 1.1992, + "step": 4372 + }, + { + "epoch": 1.6280904236134541, + "grad_norm": 0.1642945110797882, + "learning_rate": 1.9171502415332214e-05, + "loss": 1.2119, + "step": 4373 + }, + { + "epoch": 1.6284627287640632, + "grad_norm": 0.15670865774154663, + "learning_rate": 1.9171017993454684e-05, + "loss": 1.1994, + "step": 4374 + }, + { + "epoch": 1.6288350339146724, + "grad_norm": 0.16495923697948456, + "learning_rate": 1.9170533436121793e-05, + "loss": 1.2089, + "step": 4375 + }, + { + "epoch": 1.6292073390652813, + "grad_norm": 0.16799496114253998, + "learning_rate": 1.9170048743340698e-05, + "loss": 1.1993, + "step": 4376 + }, + { + "epoch": 1.6295796442158905, + "grad_norm": 0.1674148440361023, + "learning_rate": 1.9169563915118562e-05, + "loss": 1.2032, + "step": 4377 + }, + { + "epoch": 1.6299519493664996, + "grad_norm": 0.16675646603107452, + "learning_rate": 1.9169078951462537e-05, + "loss": 1.2214, + "step": 4378 + }, + { + "epoch": 1.6303242545171086, + "grad_norm": 0.16630315780639648, + "learning_rate": 1.9168593852379798e-05, + "loss": 1.1958, + "step": 4379 + }, + { + "epoch": 1.6306965596677176, + "grad_norm": 0.16184255480766296, + "learning_rate": 1.91681086178775e-05, + "loss": 1.2152, + "step": 4380 + }, + { + "epoch": 1.6310688648183267, + "grad_norm": 0.1670718640089035, + "learning_rate": 1.9167623247962816e-05, + "loss": 1.2088, + "step": 4381 + }, + { + "epoch": 1.6314411699689357, + "grad_norm": 0.1772667020559311, + "learning_rate": 1.916713774264291e-05, + "loss": 1.2204, + "step": 4382 + }, + { + "epoch": 1.6318134751195448, + "grad_norm": 0.16231748461723328, + "learning_rate": 1.916665210192495e-05, + "loss": 1.1968, + "step": 4383 + }, + { + "epoch": 1.632185780270154, + "grad_norm": 0.1662776619195938, + "learning_rate": 1.916616632581612e-05, + "loss": 1.1995, + "step": 4384 + }, + { + "epoch": 1.6325580854207629, + "grad_norm": 0.16376206278800964, + "learning_rate": 1.9165680414323585e-05, + "loss": 1.207, + "step": 4385 + }, + { + "epoch": 1.6329303905713721, + "grad_norm": 0.16616666316986084, + "learning_rate": 1.916519436745453e-05, + "loss": 1.2061, + "step": 4386 + }, + { + "epoch": 1.6333026957219812, + "grad_norm": 0.16720803081989288, + "learning_rate": 1.9164708185216122e-05, + "loss": 1.2103, + "step": 4387 + }, + { + "epoch": 1.6336750008725902, + "grad_norm": 0.16905055940151215, + "learning_rate": 1.9164221867615556e-05, + "loss": 1.2003, + "step": 4388 + }, + { + "epoch": 1.6340473060231993, + "grad_norm": 0.17031989991664886, + "learning_rate": 1.9163735414660005e-05, + "loss": 1.1962, + "step": 4389 + }, + { + "epoch": 1.6344196111738083, + "grad_norm": 0.17323459684848785, + "learning_rate": 1.9163248826356657e-05, + "loss": 1.2219, + "step": 4390 + }, + { + "epoch": 1.6347919163244176, + "grad_norm": 0.17302441596984863, + "learning_rate": 1.91627621027127e-05, + "loss": 1.2199, + "step": 4391 + }, + { + "epoch": 1.6351642214750264, + "grad_norm": 0.19569987058639526, + "learning_rate": 1.916227524373532e-05, + "loss": 1.2103, + "step": 4392 + }, + { + "epoch": 1.6355365266256356, + "grad_norm": 0.16403131186962128, + "learning_rate": 1.916178824943171e-05, + "loss": 1.1942, + "step": 4393 + }, + { + "epoch": 1.6359088317762445, + "grad_norm": 0.17407037317752838, + "learning_rate": 1.9161301119809065e-05, + "loss": 1.2064, + "step": 4394 + }, + { + "epoch": 1.6362811369268537, + "grad_norm": 0.19399769604206085, + "learning_rate": 1.916081385487458e-05, + "loss": 1.2097, + "step": 4395 + }, + { + "epoch": 1.6366534420774628, + "grad_norm": 0.1640399843454361, + "learning_rate": 1.9160326454635442e-05, + "loss": 1.208, + "step": 4396 + }, + { + "epoch": 1.6370257472280718, + "grad_norm": 0.1667345017194748, + "learning_rate": 1.9159838919098862e-05, + "loss": 1.1953, + "step": 4397 + }, + { + "epoch": 1.6373980523786809, + "grad_norm": 0.16879738867282867, + "learning_rate": 1.9159351248272032e-05, + "loss": 1.2016, + "step": 4398 + }, + { + "epoch": 1.63777035752929, + "grad_norm": 0.19763416051864624, + "learning_rate": 1.9158863442162162e-05, + "loss": 1.2099, + "step": 4399 + }, + { + "epoch": 1.6381426626798992, + "grad_norm": 0.1749972701072693, + "learning_rate": 1.9158375500776454e-05, + "loss": 1.2072, + "step": 4400 + }, + { + "epoch": 1.638514967830508, + "grad_norm": 0.17299896478652954, + "learning_rate": 1.9157887424122112e-05, + "loss": 1.2046, + "step": 4401 + }, + { + "epoch": 1.6388872729811172, + "grad_norm": 0.16272982954978943, + "learning_rate": 1.915739921220635e-05, + "loss": 1.2075, + "step": 4402 + }, + { + "epoch": 1.639259578131726, + "grad_norm": 0.186982199549675, + "learning_rate": 1.9156910865036375e-05, + "loss": 1.2091, + "step": 4403 + }, + { + "epoch": 1.6396318832823353, + "grad_norm": 0.1684504896402359, + "learning_rate": 1.91564223826194e-05, + "loss": 1.212, + "step": 4404 + }, + { + "epoch": 1.6400041884329444, + "grad_norm": 0.17304614186286926, + "learning_rate": 1.9155933764962645e-05, + "loss": 1.2291, + "step": 4405 + }, + { + "epoch": 1.6403764935835534, + "grad_norm": 0.1625901609659195, + "learning_rate": 1.915544501207332e-05, + "loss": 1.2118, + "step": 4406 + }, + { + "epoch": 1.6407487987341625, + "grad_norm": 0.17311595380306244, + "learning_rate": 1.915495612395865e-05, + "loss": 1.2139, + "step": 4407 + }, + { + "epoch": 1.6411211038847715, + "grad_norm": 0.17164155840873718, + "learning_rate": 1.9154467100625848e-05, + "loss": 1.2048, + "step": 4408 + }, + { + "epoch": 1.6414934090353808, + "grad_norm": 0.169975146651268, + "learning_rate": 1.9153977942082143e-05, + "loss": 1.2059, + "step": 4409 + }, + { + "epoch": 1.6418657141859896, + "grad_norm": 0.17380517721176147, + "learning_rate": 1.915348864833476e-05, + "loss": 1.2203, + "step": 4410 + }, + { + "epoch": 1.6422380193365989, + "grad_norm": 0.16856248676776886, + "learning_rate": 1.9152999219390924e-05, + "loss": 1.2108, + "step": 4411 + }, + { + "epoch": 1.6426103244872077, + "grad_norm": 0.17079463601112366, + "learning_rate": 1.915250965525786e-05, + "loss": 1.207, + "step": 4412 + }, + { + "epoch": 1.642982629637817, + "grad_norm": 0.16879814863204956, + "learning_rate": 1.9152019955942808e-05, + "loss": 1.2071, + "step": 4413 + }, + { + "epoch": 1.643354934788426, + "grad_norm": 0.17056001722812653, + "learning_rate": 1.915153012145299e-05, + "loss": 1.2001, + "step": 4414 + }, + { + "epoch": 1.643727239939035, + "grad_norm": 0.1714567393064499, + "learning_rate": 1.915104015179565e-05, + "loss": 1.2208, + "step": 4415 + }, + { + "epoch": 1.644099545089644, + "grad_norm": 0.16902725398540497, + "learning_rate": 1.9150550046978022e-05, + "loss": 1.2072, + "step": 4416 + }, + { + "epoch": 1.644471850240253, + "grad_norm": 0.1708216667175293, + "learning_rate": 1.9150059807007343e-05, + "loss": 1.204, + "step": 4417 + }, + { + "epoch": 1.6448441553908624, + "grad_norm": 0.17180079221725464, + "learning_rate": 1.9149569431890854e-05, + "loss": 1.2099, + "step": 4418 + }, + { + "epoch": 1.6452164605414712, + "grad_norm": 0.17007555067539215, + "learning_rate": 1.91490789216358e-05, + "loss": 1.217, + "step": 4419 + }, + { + "epoch": 1.6455887656920805, + "grad_norm": 0.16884230077266693, + "learning_rate": 1.9148588276249423e-05, + "loss": 1.2053, + "step": 4420 + }, + { + "epoch": 1.6459610708426893, + "grad_norm": 0.16880488395690918, + "learning_rate": 1.9148097495738974e-05, + "loss": 1.1978, + "step": 4421 + }, + { + "epoch": 1.6463333759932985, + "grad_norm": 0.16282446682453156, + "learning_rate": 1.9147606580111696e-05, + "loss": 1.2012, + "step": 4422 + }, + { + "epoch": 1.6467056811439076, + "grad_norm": 0.17796848714351654, + "learning_rate": 1.9147115529374846e-05, + "loss": 1.2069, + "step": 4423 + }, + { + "epoch": 1.6470779862945166, + "grad_norm": 0.16863879561424255, + "learning_rate": 1.914662434353567e-05, + "loss": 1.2175, + "step": 4424 + }, + { + "epoch": 1.6474502914451257, + "grad_norm": 0.1625177413225174, + "learning_rate": 1.914613302260143e-05, + "loss": 1.1944, + "step": 4425 + }, + { + "epoch": 1.6478225965957347, + "grad_norm": 0.1714029610157013, + "learning_rate": 1.9145641566579377e-05, + "loss": 1.2129, + "step": 4426 + }, + { + "epoch": 1.648194901746344, + "grad_norm": 0.16849827766418457, + "learning_rate": 1.914514997547677e-05, + "loss": 1.2187, + "step": 4427 + }, + { + "epoch": 1.6485672068969528, + "grad_norm": 0.16397565603256226, + "learning_rate": 1.9144658249300877e-05, + "loss": 1.1988, + "step": 4428 + }, + { + "epoch": 1.648939512047562, + "grad_norm": 0.16494201123714447, + "learning_rate": 1.9144166388058952e-05, + "loss": 1.2045, + "step": 4429 + }, + { + "epoch": 1.649311817198171, + "grad_norm": 0.16765427589416504, + "learning_rate": 1.9143674391758264e-05, + "loss": 1.1967, + "step": 4430 + }, + { + "epoch": 1.6496841223487801, + "grad_norm": 0.16366161406040192, + "learning_rate": 1.914318226040608e-05, + "loss": 1.1777, + "step": 4431 + }, + { + "epoch": 1.6500564274993892, + "grad_norm": 0.16279412806034088, + "learning_rate": 1.9142689994009666e-05, + "loss": 1.2158, + "step": 4432 + }, + { + "epoch": 1.6504287326499982, + "grad_norm": 0.16934585571289062, + "learning_rate": 1.9142197592576294e-05, + "loss": 1.2082, + "step": 4433 + }, + { + "epoch": 1.6508010378006073, + "grad_norm": 0.17340950667858124, + "learning_rate": 1.914170505611324e-05, + "loss": 1.2204, + "step": 4434 + }, + { + "epoch": 1.6511733429512163, + "grad_norm": 0.16891925036907196, + "learning_rate": 1.9141212384627777e-05, + "loss": 1.2084, + "step": 4435 + }, + { + "epoch": 1.6515456481018256, + "grad_norm": 0.1604902744293213, + "learning_rate": 1.914071957812718e-05, + "loss": 1.1887, + "step": 4436 + }, + { + "epoch": 1.6519179532524344, + "grad_norm": 0.17385107278823853, + "learning_rate": 1.9140226636618726e-05, + "loss": 1.2132, + "step": 4437 + }, + { + "epoch": 1.6522902584030437, + "grad_norm": 0.17028431594371796, + "learning_rate": 1.91397335601097e-05, + "loss": 1.1935, + "step": 4438 + }, + { + "epoch": 1.6526625635536527, + "grad_norm": 0.17365415394306183, + "learning_rate": 1.913924034860738e-05, + "loss": 1.2097, + "step": 4439 + }, + { + "epoch": 1.6530348687042618, + "grad_norm": 0.1641881912946701, + "learning_rate": 1.913874700211906e-05, + "loss": 1.2043, + "step": 4440 + }, + { + "epoch": 1.6534071738548708, + "grad_norm": 0.17010696232318878, + "learning_rate": 1.9138253520652014e-05, + "loss": 1.1914, + "step": 4441 + }, + { + "epoch": 1.6537794790054798, + "grad_norm": 0.16985437273979187, + "learning_rate": 1.913775990421354e-05, + "loss": 1.2232, + "step": 4442 + }, + { + "epoch": 1.654151784156089, + "grad_norm": 0.16199250519275665, + "learning_rate": 1.9137266152810925e-05, + "loss": 1.1949, + "step": 4443 + }, + { + "epoch": 1.654524089306698, + "grad_norm": 0.16791431605815887, + "learning_rate": 1.9136772266451462e-05, + "loss": 1.2102, + "step": 4444 + }, + { + "epoch": 1.6548963944573072, + "grad_norm": 0.16607221961021423, + "learning_rate": 1.9136278245142446e-05, + "loss": 1.205, + "step": 4445 + }, + { + "epoch": 1.655268699607916, + "grad_norm": 0.16292716562747955, + "learning_rate": 1.9135784088891175e-05, + "loss": 1.2006, + "step": 4446 + }, + { + "epoch": 1.6556410047585253, + "grad_norm": 0.1653973013162613, + "learning_rate": 1.9135289797704946e-05, + "loss": 1.191, + "step": 4447 + }, + { + "epoch": 1.6560133099091343, + "grad_norm": 0.17572778463363647, + "learning_rate": 1.913479537159106e-05, + "loss": 1.2133, + "step": 4448 + }, + { + "epoch": 1.6563856150597434, + "grad_norm": 0.17195762693881989, + "learning_rate": 1.913430081055682e-05, + "loss": 1.1863, + "step": 4449 + }, + { + "epoch": 1.6567579202103524, + "grad_norm": 0.17644944787025452, + "learning_rate": 1.9133806114609527e-05, + "loss": 1.1969, + "step": 4450 + }, + { + "epoch": 1.6571302253609614, + "grad_norm": 0.16783183813095093, + "learning_rate": 1.9133311283756493e-05, + "loss": 1.2021, + "step": 4451 + }, + { + "epoch": 1.6575025305115707, + "grad_norm": 0.17917399108409882, + "learning_rate": 1.9132816318005026e-05, + "loss": 1.2142, + "step": 4452 + }, + { + "epoch": 1.6578748356621795, + "grad_norm": 0.16448262333869934, + "learning_rate": 1.9132321217362434e-05, + "loss": 1.205, + "step": 4453 + }, + { + "epoch": 1.6582471408127888, + "grad_norm": 0.1729598045349121, + "learning_rate": 1.913182598183603e-05, + "loss": 1.202, + "step": 4454 + }, + { + "epoch": 1.6586194459633976, + "grad_norm": 0.1698109656572342, + "learning_rate": 1.913133061143313e-05, + "loss": 1.1936, + "step": 4455 + }, + { + "epoch": 1.6589917511140069, + "grad_norm": 0.1744636744260788, + "learning_rate": 1.913083510616105e-05, + "loss": 1.1998, + "step": 4456 + }, + { + "epoch": 1.659364056264616, + "grad_norm": 0.1762547791004181, + "learning_rate": 1.9130339466027108e-05, + "loss": 1.2128, + "step": 4457 + }, + { + "epoch": 1.659736361415225, + "grad_norm": 0.16493569314479828, + "learning_rate": 1.9129843691038625e-05, + "loss": 1.205, + "step": 4458 + }, + { + "epoch": 1.660108666565834, + "grad_norm": 0.16134680807590485, + "learning_rate": 1.9129347781202924e-05, + "loss": 1.1999, + "step": 4459 + }, + { + "epoch": 1.660480971716443, + "grad_norm": 0.16520054638385773, + "learning_rate": 1.912885173652733e-05, + "loss": 1.2073, + "step": 4460 + }, + { + "epoch": 1.6608532768670523, + "grad_norm": 0.16811229288578033, + "learning_rate": 1.9128355557019168e-05, + "loss": 1.2041, + "step": 4461 + }, + { + "epoch": 1.6612255820176611, + "grad_norm": 0.16674669086933136, + "learning_rate": 1.912785924268577e-05, + "loss": 1.212, + "step": 4462 + }, + { + "epoch": 1.6615978871682704, + "grad_norm": 0.17780810594558716, + "learning_rate": 1.912736279353446e-05, + "loss": 1.2236, + "step": 4463 + }, + { + "epoch": 1.6619701923188792, + "grad_norm": 0.18256306648254395, + "learning_rate": 1.9126866209572575e-05, + "loss": 1.2028, + "step": 4464 + }, + { + "epoch": 1.6623424974694885, + "grad_norm": 0.18052802979946136, + "learning_rate": 1.912636949080745e-05, + "loss": 1.215, + "step": 4465 + }, + { + "epoch": 1.6627148026200975, + "grad_norm": 0.16654305160045624, + "learning_rate": 1.912587263724642e-05, + "loss": 1.2013, + "step": 4466 + }, + { + "epoch": 1.6630871077707066, + "grad_norm": 0.17217010259628296, + "learning_rate": 1.9125375648896823e-05, + "loss": 1.218, + "step": 4467 + }, + { + "epoch": 1.6634594129213156, + "grad_norm": 0.17608584463596344, + "learning_rate": 1.9124878525766002e-05, + "loss": 1.2081, + "step": 4468 + }, + { + "epoch": 1.6638317180719246, + "grad_norm": 0.16333593428134918, + "learning_rate": 1.9124381267861295e-05, + "loss": 1.2036, + "step": 4469 + }, + { + "epoch": 1.664204023222534, + "grad_norm": 0.1907689869403839, + "learning_rate": 1.9123883875190052e-05, + "loss": 1.202, + "step": 4470 + }, + { + "epoch": 1.6645763283731427, + "grad_norm": 0.18991775810718536, + "learning_rate": 1.9123386347759614e-05, + "loss": 1.2024, + "step": 4471 + }, + { + "epoch": 1.664948633523752, + "grad_norm": 0.17199549078941345, + "learning_rate": 1.9122888685577337e-05, + "loss": 1.2002, + "step": 4472 + }, + { + "epoch": 1.6653209386743608, + "grad_norm": 0.20347337424755096, + "learning_rate": 1.9122390888650564e-05, + "loss": 1.2212, + "step": 4473 + }, + { + "epoch": 1.66569324382497, + "grad_norm": 0.16607064008712769, + "learning_rate": 1.912189295698665e-05, + "loss": 1.2063, + "step": 4474 + }, + { + "epoch": 1.6660655489755791, + "grad_norm": 0.16356223821640015, + "learning_rate": 1.9121394890592948e-05, + "loss": 1.2087, + "step": 4475 + }, + { + "epoch": 1.6664378541261882, + "grad_norm": 0.1626313030719757, + "learning_rate": 1.9120896689476817e-05, + "loss": 1.1938, + "step": 4476 + }, + { + "epoch": 1.6668101592767972, + "grad_norm": 0.17197953164577484, + "learning_rate": 1.9120398353645615e-05, + "loss": 1.2206, + "step": 4477 + }, + { + "epoch": 1.6671824644274063, + "grad_norm": 0.16226613521575928, + "learning_rate": 1.9119899883106702e-05, + "loss": 1.2073, + "step": 4478 + }, + { + "epoch": 1.6675547695780155, + "grad_norm": 0.16794995963573456, + "learning_rate": 1.911940127786744e-05, + "loss": 1.2096, + "step": 4479 + }, + { + "epoch": 1.6679270747286243, + "grad_norm": 0.17185860872268677, + "learning_rate": 1.911890253793519e-05, + "loss": 1.1917, + "step": 4480 + }, + { + "epoch": 1.6682993798792336, + "grad_norm": 0.16930022835731506, + "learning_rate": 1.9118403663317323e-05, + "loss": 1.2207, + "step": 4481 + }, + { + "epoch": 1.6686716850298424, + "grad_norm": 0.16357828676700592, + "learning_rate": 1.911790465402121e-05, + "loss": 1.2031, + "step": 4482 + }, + { + "epoch": 1.6690439901804517, + "grad_norm": 0.16401416063308716, + "learning_rate": 1.9117405510054216e-05, + "loss": 1.1962, + "step": 4483 + }, + { + "epoch": 1.6694162953310607, + "grad_norm": 0.1795455366373062, + "learning_rate": 1.9116906231423712e-05, + "loss": 1.2093, + "step": 4484 + }, + { + "epoch": 1.6697886004816698, + "grad_norm": 0.17180180549621582, + "learning_rate": 1.9116406818137077e-05, + "loss": 1.2082, + "step": 4485 + }, + { + "epoch": 1.6701609056322788, + "grad_norm": 0.16501714289188385, + "learning_rate": 1.9115907270201684e-05, + "loss": 1.1998, + "step": 4486 + }, + { + "epoch": 1.6705332107828879, + "grad_norm": 0.16321603953838348, + "learning_rate": 1.9115407587624915e-05, + "loss": 1.2053, + "step": 4487 + }, + { + "epoch": 1.6709055159334971, + "grad_norm": 0.16696272790431976, + "learning_rate": 1.911490777041415e-05, + "loss": 1.2012, + "step": 4488 + }, + { + "epoch": 1.671277821084106, + "grad_norm": 0.16422761976718903, + "learning_rate": 1.9114407818576767e-05, + "loss": 1.1892, + "step": 4489 + }, + { + "epoch": 1.6716501262347152, + "grad_norm": 0.17487448453903198, + "learning_rate": 1.911390773212015e-05, + "loss": 1.2041, + "step": 4490 + }, + { + "epoch": 1.6720224313853242, + "grad_norm": 0.16392962634563446, + "learning_rate": 1.911340751105169e-05, + "loss": 1.2164, + "step": 4491 + }, + { + "epoch": 1.6723947365359333, + "grad_norm": 0.1618296504020691, + "learning_rate": 1.9112907155378772e-05, + "loss": 1.2109, + "step": 4492 + }, + { + "epoch": 1.6727670416865423, + "grad_norm": 0.17908450961112976, + "learning_rate": 1.911240666510879e-05, + "loss": 1.2248, + "step": 4493 + }, + { + "epoch": 1.6731393468371514, + "grad_norm": 0.17229048907756805, + "learning_rate": 1.9111906040249134e-05, + "loss": 1.2005, + "step": 4494 + }, + { + "epoch": 1.6735116519877604, + "grad_norm": 0.16625456511974335, + "learning_rate": 1.9111405280807192e-05, + "loss": 1.1994, + "step": 4495 + }, + { + "epoch": 1.6738839571383695, + "grad_norm": 0.18167874217033386, + "learning_rate": 1.911090438679037e-05, + "loss": 1.1959, + "step": 4496 + }, + { + "epoch": 1.6742562622889787, + "grad_norm": 0.16596947610378265, + "learning_rate": 1.911040335820606e-05, + "loss": 1.1882, + "step": 4497 + }, + { + "epoch": 1.6746285674395875, + "grad_norm": 0.17403769493103027, + "learning_rate": 1.9109902195061666e-05, + "loss": 1.2009, + "step": 4498 + }, + { + "epoch": 1.6750008725901968, + "grad_norm": 0.1789177507162094, + "learning_rate": 1.9109400897364584e-05, + "loss": 1.2039, + "step": 4499 + }, + { + "epoch": 1.6753731777408059, + "grad_norm": 0.1641203761100769, + "learning_rate": 1.9108899465122227e-05, + "loss": 1.2151, + "step": 4500 + }, + { + "epoch": 1.6753731777408059, + "eval_loss": 1.3091740608215332, + "eval_runtime": 17.081, + "eval_samples_per_second": 101.517, + "eval_steps_per_second": 5.093, + "step": 4500 + }, + { + "epoch": 1.675745482891415, + "grad_norm": 0.17204515635967255, + "learning_rate": 1.9108397898342e-05, + "loss": 1.2264, + "step": 4501 + }, + { + "epoch": 1.676117788042024, + "grad_norm": 0.16915835440158844, + "learning_rate": 1.9107896197031298e-05, + "loss": 1.2102, + "step": 4502 + }, + { + "epoch": 1.676490093192633, + "grad_norm": 0.22267740964889526, + "learning_rate": 1.9107394361197545e-05, + "loss": 1.2142, + "step": 4503 + }, + { + "epoch": 1.6768623983432422, + "grad_norm": 0.16702081263065338, + "learning_rate": 1.9106892390848154e-05, + "loss": 1.1892, + "step": 4504 + }, + { + "epoch": 1.677234703493851, + "grad_norm": 0.17088332772254944, + "learning_rate": 1.9106390285990527e-05, + "loss": 1.2173, + "step": 4505 + }, + { + "epoch": 1.6776070086444603, + "grad_norm": 0.16930538415908813, + "learning_rate": 1.9105888046632088e-05, + "loss": 1.1886, + "step": 4506 + }, + { + "epoch": 1.6779793137950691, + "grad_norm": 0.18196465075016022, + "learning_rate": 1.9105385672780256e-05, + "loss": 1.1952, + "step": 4507 + }, + { + "epoch": 1.6783516189456784, + "grad_norm": 0.17563000321388245, + "learning_rate": 1.910488316444245e-05, + "loss": 1.2135, + "step": 4508 + }, + { + "epoch": 1.6787239240962875, + "grad_norm": 0.17271769046783447, + "learning_rate": 1.910438052162609e-05, + "loss": 1.2188, + "step": 4509 + }, + { + "epoch": 1.6790962292468965, + "grad_norm": 0.21126636862754822, + "learning_rate": 1.91038777443386e-05, + "loss": 1.2002, + "step": 4510 + }, + { + "epoch": 1.6794685343975055, + "grad_norm": 0.24127759039402008, + "learning_rate": 1.9103374832587406e-05, + "loss": 1.1998, + "step": 4511 + }, + { + "epoch": 1.6798408395481146, + "grad_norm": 0.1943320333957672, + "learning_rate": 1.910287178637994e-05, + "loss": 1.2146, + "step": 4512 + }, + { + "epoch": 1.6802131446987238, + "grad_norm": 0.18076887726783752, + "learning_rate": 1.9102368605723626e-05, + "loss": 1.1915, + "step": 4513 + }, + { + "epoch": 1.6805854498493327, + "grad_norm": 0.2074926644563675, + "learning_rate": 1.9101865290625903e-05, + "loss": 1.1936, + "step": 4514 + }, + { + "epoch": 1.680957754999942, + "grad_norm": 0.17887428402900696, + "learning_rate": 1.91013618410942e-05, + "loss": 1.2016, + "step": 4515 + }, + { + "epoch": 1.6813300601505508, + "grad_norm": 0.1693534553050995, + "learning_rate": 1.910085825713595e-05, + "loss": 1.2022, + "step": 4516 + }, + { + "epoch": 1.68170236530116, + "grad_norm": 0.1669137179851532, + "learning_rate": 1.9100354538758598e-05, + "loss": 1.2027, + "step": 4517 + }, + { + "epoch": 1.682074670451769, + "grad_norm": 0.1691608726978302, + "learning_rate": 1.9099850685969578e-05, + "loss": 1.2078, + "step": 4518 + }, + { + "epoch": 1.682446975602378, + "grad_norm": 0.16599372029304504, + "learning_rate": 1.9099346698776338e-05, + "loss": 1.185, + "step": 4519 + }, + { + "epoch": 1.6828192807529871, + "grad_norm": 0.164974182844162, + "learning_rate": 1.9098842577186315e-05, + "loss": 1.2028, + "step": 4520 + }, + { + "epoch": 1.6831915859035962, + "grad_norm": 0.1712837517261505, + "learning_rate": 1.909833832120696e-05, + "loss": 1.215, + "step": 4521 + }, + { + "epoch": 1.6835638910542055, + "grad_norm": 0.16040773689746857, + "learning_rate": 1.9097833930845718e-05, + "loss": 1.2097, + "step": 4522 + }, + { + "epoch": 1.6839361962048143, + "grad_norm": 0.163836270570755, + "learning_rate": 1.9097329406110038e-05, + "loss": 1.2142, + "step": 4523 + }, + { + "epoch": 1.6843085013554235, + "grad_norm": 0.16300654411315918, + "learning_rate": 1.9096824747007378e-05, + "loss": 1.203, + "step": 4524 + }, + { + "epoch": 1.6846808065060324, + "grad_norm": 0.17333677411079407, + "learning_rate": 1.9096319953545186e-05, + "loss": 1.2009, + "step": 4525 + }, + { + "epoch": 1.6850531116566416, + "grad_norm": 0.1657482534646988, + "learning_rate": 1.9095815025730918e-05, + "loss": 1.224, + "step": 4526 + }, + { + "epoch": 1.6854254168072507, + "grad_norm": 0.17389807105064392, + "learning_rate": 1.9095309963572034e-05, + "loss": 1.208, + "step": 4527 + }, + { + "epoch": 1.6857977219578597, + "grad_norm": 0.16647979617118835, + "learning_rate": 1.909480476707599e-05, + "loss": 1.1943, + "step": 4528 + }, + { + "epoch": 1.6861700271084687, + "grad_norm": 0.16131357848644257, + "learning_rate": 1.9094299436250254e-05, + "loss": 1.1994, + "step": 4529 + }, + { + "epoch": 1.6865423322590778, + "grad_norm": 0.16403862833976746, + "learning_rate": 1.9093793971102282e-05, + "loss": 1.1896, + "step": 4530 + }, + { + "epoch": 1.686914637409687, + "grad_norm": 0.16016656160354614, + "learning_rate": 1.9093288371639547e-05, + "loss": 1.2013, + "step": 4531 + }, + { + "epoch": 1.6872869425602959, + "grad_norm": 0.1598949134349823, + "learning_rate": 1.9092782637869513e-05, + "loss": 1.1843, + "step": 4532 + }, + { + "epoch": 1.6876592477109051, + "grad_norm": 0.16211171448230743, + "learning_rate": 1.909227676979965e-05, + "loss": 1.1962, + "step": 4533 + }, + { + "epoch": 1.688031552861514, + "grad_norm": 0.1622130423784256, + "learning_rate": 1.9091770767437428e-05, + "loss": 1.1995, + "step": 4534 + }, + { + "epoch": 1.6884038580121232, + "grad_norm": 0.1642407476902008, + "learning_rate": 1.9091264630790324e-05, + "loss": 1.1968, + "step": 4535 + }, + { + "epoch": 1.6887761631627323, + "grad_norm": 0.16541729867458344, + "learning_rate": 1.9090758359865812e-05, + "loss": 1.2072, + "step": 4536 + }, + { + "epoch": 1.6891484683133413, + "grad_norm": 0.16923236846923828, + "learning_rate": 1.9090251954671372e-05, + "loss": 1.2137, + "step": 4537 + }, + { + "epoch": 1.6895207734639504, + "grad_norm": 0.16798345744609833, + "learning_rate": 1.9089745415214474e-05, + "loss": 1.1978, + "step": 4538 + }, + { + "epoch": 1.6898930786145594, + "grad_norm": 0.1650944948196411, + "learning_rate": 1.9089238741502614e-05, + "loss": 1.2105, + "step": 4539 + }, + { + "epoch": 1.6902653837651687, + "grad_norm": 0.17048072814941406, + "learning_rate": 1.9088731933543262e-05, + "loss": 1.208, + "step": 4540 + }, + { + "epoch": 1.6906376889157775, + "grad_norm": 0.15930227935314178, + "learning_rate": 1.9088224991343916e-05, + "loss": 1.217, + "step": 4541 + }, + { + "epoch": 1.6910099940663867, + "grad_norm": 0.163492351770401, + "learning_rate": 1.9087717914912054e-05, + "loss": 1.2102, + "step": 4542 + }, + { + "epoch": 1.6913822992169956, + "grad_norm": 0.16682982444763184, + "learning_rate": 1.908721070425517e-05, + "loss": 1.1899, + "step": 4543 + }, + { + "epoch": 1.6917546043676048, + "grad_norm": 0.16626925766468048, + "learning_rate": 1.908670335938075e-05, + "loss": 1.1931, + "step": 4544 + }, + { + "epoch": 1.6921269095182139, + "grad_norm": 0.1644056737422943, + "learning_rate": 1.9086195880296294e-05, + "loss": 1.2129, + "step": 4545 + }, + { + "epoch": 1.692499214668823, + "grad_norm": 0.16008804738521576, + "learning_rate": 1.9085688267009298e-05, + "loss": 1.2037, + "step": 4546 + }, + { + "epoch": 1.692871519819432, + "grad_norm": 0.16731880605220795, + "learning_rate": 1.9085180519527252e-05, + "loss": 1.1954, + "step": 4547 + }, + { + "epoch": 1.693243824970041, + "grad_norm": 0.1611679047346115, + "learning_rate": 1.9084672637857663e-05, + "loss": 1.2074, + "step": 4548 + }, + { + "epoch": 1.6936161301206503, + "grad_norm": 0.16875462234020233, + "learning_rate": 1.908416462200803e-05, + "loss": 1.2177, + "step": 4549 + }, + { + "epoch": 1.693988435271259, + "grad_norm": 0.15939076244831085, + "learning_rate": 1.9083656471985855e-05, + "loss": 1.2001, + "step": 4550 + }, + { + "epoch": 1.6943607404218683, + "grad_norm": 0.16312842071056366, + "learning_rate": 1.908314818779864e-05, + "loss": 1.1995, + "step": 4551 + }, + { + "epoch": 1.6947330455724774, + "grad_norm": 0.16439701616764069, + "learning_rate": 1.90826397694539e-05, + "loss": 1.2002, + "step": 4552 + }, + { + "epoch": 1.6951053507230864, + "grad_norm": 0.16645435988903046, + "learning_rate": 1.9082131216959137e-05, + "loss": 1.207, + "step": 4553 + }, + { + "epoch": 1.6954776558736955, + "grad_norm": 0.1708313226699829, + "learning_rate": 1.9081622530321874e-05, + "loss": 1.2047, + "step": 4554 + }, + { + "epoch": 1.6958499610243045, + "grad_norm": 0.16160647571086884, + "learning_rate": 1.908111370954961e-05, + "loss": 1.2187, + "step": 4555 + }, + { + "epoch": 1.6962222661749136, + "grad_norm": 0.15889696776866913, + "learning_rate": 1.9080604754649865e-05, + "loss": 1.2136, + "step": 4556 + }, + { + "epoch": 1.6965945713255226, + "grad_norm": 0.1612309068441391, + "learning_rate": 1.908009566563016e-05, + "loss": 1.1969, + "step": 4557 + }, + { + "epoch": 1.6969668764761319, + "grad_norm": 0.16933639347553253, + "learning_rate": 1.9079586442498016e-05, + "loss": 1.2167, + "step": 4558 + }, + { + "epoch": 1.6973391816267407, + "grad_norm": 0.1634131371974945, + "learning_rate": 1.9079077085260943e-05, + "loss": 1.216, + "step": 4559 + }, + { + "epoch": 1.69771148677735, + "grad_norm": 0.16775217652320862, + "learning_rate": 1.9078567593926472e-05, + "loss": 1.2064, + "step": 4560 + }, + { + "epoch": 1.698083791927959, + "grad_norm": 0.164699524641037, + "learning_rate": 1.9078057968502132e-05, + "loss": 1.2052, + "step": 4561 + }, + { + "epoch": 1.698456097078568, + "grad_norm": 0.1709076166152954, + "learning_rate": 1.9077548208995442e-05, + "loss": 1.2005, + "step": 4562 + }, + { + "epoch": 1.698828402229177, + "grad_norm": 0.1730164885520935, + "learning_rate": 1.907703831541394e-05, + "loss": 1.2262, + "step": 4563 + }, + { + "epoch": 1.6992007073797861, + "grad_norm": 0.1677834689617157, + "learning_rate": 1.9076528287765145e-05, + "loss": 1.1978, + "step": 4564 + }, + { + "epoch": 1.6995730125303954, + "grad_norm": 0.16488604247570038, + "learning_rate": 1.90760181260566e-05, + "loss": 1.1987, + "step": 4565 + }, + { + "epoch": 1.6999453176810042, + "grad_norm": 0.18518555164337158, + "learning_rate": 1.9075507830295837e-05, + "loss": 1.1877, + "step": 4566 + }, + { + "epoch": 1.7003176228316135, + "grad_norm": 0.16280607879161835, + "learning_rate": 1.9074997400490392e-05, + "loss": 1.2053, + "step": 4567 + }, + { + "epoch": 1.7006899279822223, + "grad_norm": 0.16857080161571503, + "learning_rate": 1.90744868366478e-05, + "loss": 1.2054, + "step": 4568 + }, + { + "epoch": 1.7010622331328316, + "grad_norm": 0.16795577108860016, + "learning_rate": 1.9073976138775613e-05, + "loss": 1.2102, + "step": 4569 + }, + { + "epoch": 1.7014345382834406, + "grad_norm": 0.17121511697769165, + "learning_rate": 1.907346530688137e-05, + "loss": 1.1977, + "step": 4570 + }, + { + "epoch": 1.7018068434340496, + "grad_norm": 0.1754191517829895, + "learning_rate": 1.907295434097261e-05, + "loss": 1.2035, + "step": 4571 + }, + { + "epoch": 1.7021791485846587, + "grad_norm": 0.1667300909757614, + "learning_rate": 1.9072443241056884e-05, + "loss": 1.2076, + "step": 4572 + }, + { + "epoch": 1.7025514537352677, + "grad_norm": 0.19166409969329834, + "learning_rate": 1.9071932007141742e-05, + "loss": 1.207, + "step": 4573 + }, + { + "epoch": 1.702923758885877, + "grad_norm": 0.16179704666137695, + "learning_rate": 1.907142063923473e-05, + "loss": 1.1928, + "step": 4574 + }, + { + "epoch": 1.7032960640364858, + "grad_norm": 0.17530782520771027, + "learning_rate": 1.907090913734341e-05, + "loss": 1.2057, + "step": 4575 + }, + { + "epoch": 1.703668369187095, + "grad_norm": 0.16238301992416382, + "learning_rate": 1.9070397501475327e-05, + "loss": 1.1965, + "step": 4576 + }, + { + "epoch": 1.704040674337704, + "grad_norm": 0.18561150133609772, + "learning_rate": 1.9069885731638045e-05, + "loss": 1.2201, + "step": 4577 + }, + { + "epoch": 1.7044129794883132, + "grad_norm": 0.17264258861541748, + "learning_rate": 1.9069373827839117e-05, + "loss": 1.1941, + "step": 4578 + }, + { + "epoch": 1.7047852846389222, + "grad_norm": 0.16849012672901154, + "learning_rate": 1.906886179008611e-05, + "loss": 1.2056, + "step": 4579 + }, + { + "epoch": 1.7051575897895312, + "grad_norm": 0.16359888017177582, + "learning_rate": 1.906834961838658e-05, + "loss": 1.2024, + "step": 4580 + }, + { + "epoch": 1.7055298949401403, + "grad_norm": 0.18342570960521698, + "learning_rate": 1.9067837312748097e-05, + "loss": 1.1985, + "step": 4581 + }, + { + "epoch": 1.7059022000907493, + "grad_norm": 0.16952671110630035, + "learning_rate": 1.9067324873178227e-05, + "loss": 1.2096, + "step": 4582 + }, + { + "epoch": 1.7062745052413586, + "grad_norm": 0.16725878417491913, + "learning_rate": 1.9066812299684537e-05, + "loss": 1.2007, + "step": 4583 + }, + { + "epoch": 1.7066468103919674, + "grad_norm": 0.1666475087404251, + "learning_rate": 1.9066299592274596e-05, + "loss": 1.2142, + "step": 4584 + }, + { + "epoch": 1.7070191155425767, + "grad_norm": 0.17125262320041656, + "learning_rate": 1.9065786750955983e-05, + "loss": 1.2146, + "step": 4585 + }, + { + "epoch": 1.7073914206931855, + "grad_norm": 0.16806922852993011, + "learning_rate": 1.9065273775736264e-05, + "loss": 1.1995, + "step": 4586 + }, + { + "epoch": 1.7077637258437948, + "grad_norm": 0.16687040030956268, + "learning_rate": 1.9064760666623025e-05, + "loss": 1.2051, + "step": 4587 + }, + { + "epoch": 1.7081360309944038, + "grad_norm": 0.1716591715812683, + "learning_rate": 1.9064247423623838e-05, + "loss": 1.204, + "step": 4588 + }, + { + "epoch": 1.7085083361450129, + "grad_norm": 0.1665405035018921, + "learning_rate": 1.9063734046746286e-05, + "loss": 1.1918, + "step": 4589 + }, + { + "epoch": 1.708880641295622, + "grad_norm": 0.17192888259887695, + "learning_rate": 1.906322053599795e-05, + "loss": 1.2096, + "step": 4590 + }, + { + "epoch": 1.709252946446231, + "grad_norm": 0.16220897436141968, + "learning_rate": 1.9062706891386414e-05, + "loss": 1.1828, + "step": 4591 + }, + { + "epoch": 1.7096252515968402, + "grad_norm": 0.17330990731716156, + "learning_rate": 1.9062193112919266e-05, + "loss": 1.1909, + "step": 4592 + }, + { + "epoch": 1.709997556747449, + "grad_norm": 0.17249375581741333, + "learning_rate": 1.9061679200604097e-05, + "loss": 1.1908, + "step": 4593 + }, + { + "epoch": 1.7103698618980583, + "grad_norm": 0.1671372950077057, + "learning_rate": 1.9061165154448496e-05, + "loss": 1.2069, + "step": 4594 + }, + { + "epoch": 1.710742167048667, + "grad_norm": 0.17133872210979462, + "learning_rate": 1.906065097446005e-05, + "loss": 1.1977, + "step": 4595 + }, + { + "epoch": 1.7111144721992764, + "grad_norm": 0.18286733329296112, + "learning_rate": 1.9060136660646362e-05, + "loss": 1.2198, + "step": 4596 + }, + { + "epoch": 1.7114867773498854, + "grad_norm": 0.17125152051448822, + "learning_rate": 1.9059622213015023e-05, + "loss": 1.2111, + "step": 4597 + }, + { + "epoch": 1.7118590825004945, + "grad_norm": 0.16331323981285095, + "learning_rate": 1.905910763157363e-05, + "loss": 1.212, + "step": 4598 + }, + { + "epoch": 1.7122313876511035, + "grad_norm": 0.17022307217121124, + "learning_rate": 1.905859291632979e-05, + "loss": 1.2115, + "step": 4599 + }, + { + "epoch": 1.7126036928017125, + "grad_norm": 0.17252519726753235, + "learning_rate": 1.9058078067291095e-05, + "loss": 1.2075, + "step": 4600 + }, + { + "epoch": 1.7129759979523218, + "grad_norm": 0.19809845089912415, + "learning_rate": 1.9057563084465157e-05, + "loss": 1.1885, + "step": 4601 + }, + { + "epoch": 1.7133483031029306, + "grad_norm": 0.1879017949104309, + "learning_rate": 1.9057047967859584e-05, + "loss": 1.2113, + "step": 4602 + }, + { + "epoch": 1.71372060825354, + "grad_norm": 0.17755410075187683, + "learning_rate": 1.905653271748198e-05, + "loss": 1.205, + "step": 4603 + }, + { + "epoch": 1.7140929134041487, + "grad_norm": 0.24146291613578796, + "learning_rate": 1.905601733333996e-05, + "loss": 1.1976, + "step": 4604 + }, + { + "epoch": 1.714465218554758, + "grad_norm": 0.2119484394788742, + "learning_rate": 1.9055501815441126e-05, + "loss": 1.1983, + "step": 4605 + }, + { + "epoch": 1.714837523705367, + "grad_norm": 0.18424339592456818, + "learning_rate": 1.90549861637931e-05, + "loss": 1.2067, + "step": 4606 + }, + { + "epoch": 1.715209828855976, + "grad_norm": 0.16043433547019958, + "learning_rate": 1.9054470378403495e-05, + "loss": 1.2139, + "step": 4607 + }, + { + "epoch": 1.715582134006585, + "grad_norm": 0.19521519541740417, + "learning_rate": 1.9053954459279934e-05, + "loss": 1.2078, + "step": 4608 + }, + { + "epoch": 1.7159544391571941, + "grad_norm": 0.1905895173549652, + "learning_rate": 1.905343840643003e-05, + "loss": 1.1947, + "step": 4609 + }, + { + "epoch": 1.7163267443078034, + "grad_norm": 0.17229844629764557, + "learning_rate": 1.9052922219861413e-05, + "loss": 1.2018, + "step": 4610 + }, + { + "epoch": 1.7166990494584122, + "grad_norm": 0.16754981875419617, + "learning_rate": 1.90524058995817e-05, + "loss": 1.2105, + "step": 4611 + }, + { + "epoch": 1.7170713546090215, + "grad_norm": 0.17675986886024475, + "learning_rate": 1.9051889445598524e-05, + "loss": 1.196, + "step": 4612 + }, + { + "epoch": 1.7174436597596305, + "grad_norm": 0.1735907346010208, + "learning_rate": 1.9051372857919505e-05, + "loss": 1.2019, + "step": 4613 + }, + { + "epoch": 1.7178159649102396, + "grad_norm": 0.16587558388710022, + "learning_rate": 1.905085613655228e-05, + "loss": 1.2019, + "step": 4614 + }, + { + "epoch": 1.7181882700608486, + "grad_norm": 0.17289356887340546, + "learning_rate": 1.9050339281504474e-05, + "loss": 1.1919, + "step": 4615 + }, + { + "epoch": 1.7185605752114577, + "grad_norm": 0.1681814342737198, + "learning_rate": 1.904982229278373e-05, + "loss": 1.1936, + "step": 4616 + }, + { + "epoch": 1.7189328803620667, + "grad_norm": 0.15795257687568665, + "learning_rate": 1.9049305170397673e-05, + "loss": 1.203, + "step": 4617 + }, + { + "epoch": 1.7193051855126757, + "grad_norm": 0.16355076432228088, + "learning_rate": 1.904878791435395e-05, + "loss": 1.2002, + "step": 4618 + }, + { + "epoch": 1.719677490663285, + "grad_norm": 0.16879984736442566, + "learning_rate": 1.9048270524660197e-05, + "loss": 1.2025, + "step": 4619 + }, + { + "epoch": 1.7200497958138938, + "grad_norm": 0.1663307398557663, + "learning_rate": 1.9047753001324057e-05, + "loss": 1.1998, + "step": 4620 + }, + { + "epoch": 1.720422100964503, + "grad_norm": 0.16687534749507904, + "learning_rate": 1.9047235344353173e-05, + "loss": 1.2093, + "step": 4621 + }, + { + "epoch": 1.7207944061151121, + "grad_norm": 0.1707238256931305, + "learning_rate": 1.9046717553755187e-05, + "loss": 1.2089, + "step": 4622 + }, + { + "epoch": 1.7211667112657212, + "grad_norm": 0.16789813339710236, + "learning_rate": 1.9046199629537754e-05, + "loss": 1.2204, + "step": 4623 + }, + { + "epoch": 1.7215390164163302, + "grad_norm": 0.16236470639705658, + "learning_rate": 1.9045681571708517e-05, + "loss": 1.2088, + "step": 4624 + }, + { + "epoch": 1.7219113215669393, + "grad_norm": 0.1650935858488083, + "learning_rate": 1.9045163380275134e-05, + "loss": 1.2009, + "step": 4625 + }, + { + "epoch": 1.7222836267175485, + "grad_norm": 0.17150568962097168, + "learning_rate": 1.9044645055245254e-05, + "loss": 1.2058, + "step": 4626 + }, + { + "epoch": 1.7226559318681574, + "grad_norm": 0.15954706072807312, + "learning_rate": 1.9044126596626536e-05, + "loss": 1.1959, + "step": 4627 + }, + { + "epoch": 1.7230282370187666, + "grad_norm": 0.16907548904418945, + "learning_rate": 1.9043608004426635e-05, + "loss": 1.2186, + "step": 4628 + }, + { + "epoch": 1.7234005421693754, + "grad_norm": 0.16456788778305054, + "learning_rate": 1.904308927865321e-05, + "loss": 1.2052, + "step": 4629 + }, + { + "epoch": 1.7237728473199847, + "grad_norm": 0.16648238897323608, + "learning_rate": 1.9042570419313927e-05, + "loss": 1.1964, + "step": 4630 + }, + { + "epoch": 1.7241451524705937, + "grad_norm": 0.16598086059093475, + "learning_rate": 1.904205142641644e-05, + "loss": 1.1976, + "step": 4631 + }, + { + "epoch": 1.7245174576212028, + "grad_norm": 0.16780155897140503, + "learning_rate": 1.9041532299968426e-05, + "loss": 1.202, + "step": 4632 + }, + { + "epoch": 1.7248897627718118, + "grad_norm": 0.1815885603427887, + "learning_rate": 1.9041013039977548e-05, + "loss": 1.2154, + "step": 4633 + }, + { + "epoch": 1.7252620679224209, + "grad_norm": 0.1682657152414322, + "learning_rate": 1.9040493646451472e-05, + "loss": 1.2037, + "step": 4634 + }, + { + "epoch": 1.7256343730730301, + "grad_norm": 0.16929581761360168, + "learning_rate": 1.9039974119397872e-05, + "loss": 1.2131, + "step": 4635 + }, + { + "epoch": 1.726006678223639, + "grad_norm": 0.15777097642421722, + "learning_rate": 1.9039454458824426e-05, + "loss": 1.1929, + "step": 4636 + }, + { + "epoch": 1.7263789833742482, + "grad_norm": 0.16508424282073975, + "learning_rate": 1.90389346647388e-05, + "loss": 1.2151, + "step": 4637 + }, + { + "epoch": 1.726751288524857, + "grad_norm": 0.16090057790279388, + "learning_rate": 1.903841473714868e-05, + "loss": 1.1994, + "step": 4638 + }, + { + "epoch": 1.7271235936754663, + "grad_norm": 0.16870178282260895, + "learning_rate": 1.903789467606174e-05, + "loss": 1.1991, + "step": 4639 + }, + { + "epoch": 1.7274958988260753, + "grad_norm": 0.17125073075294495, + "learning_rate": 1.903737448148566e-05, + "loss": 1.2113, + "step": 4640 + }, + { + "epoch": 1.7278682039766844, + "grad_norm": 0.17486359179019928, + "learning_rate": 1.903685415342813e-05, + "loss": 1.2241, + "step": 4641 + }, + { + "epoch": 1.7282405091272934, + "grad_norm": 0.16174235939979553, + "learning_rate": 1.903633369189683e-05, + "loss": 1.2012, + "step": 4642 + }, + { + "epoch": 1.7286128142779025, + "grad_norm": 0.1670721024274826, + "learning_rate": 1.9035813096899448e-05, + "loss": 1.2032, + "step": 4643 + }, + { + "epoch": 1.7289851194285117, + "grad_norm": 0.1667913943529129, + "learning_rate": 1.9035292368443674e-05, + "loss": 1.1978, + "step": 4644 + }, + { + "epoch": 1.7293574245791206, + "grad_norm": 0.16571226716041565, + "learning_rate": 1.90347715065372e-05, + "loss": 1.2055, + "step": 4645 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.16005724668502808, + "learning_rate": 1.9034250511187716e-05, + "loss": 1.1902, + "step": 4646 + }, + { + "epoch": 1.7301020348803386, + "grad_norm": 0.1715211570262909, + "learning_rate": 1.903372938240292e-05, + "loss": 1.2029, + "step": 4647 + }, + { + "epoch": 1.730474340030948, + "grad_norm": 0.16407069563865662, + "learning_rate": 1.9033208120190507e-05, + "loss": 1.1987, + "step": 4648 + }, + { + "epoch": 1.730846645181557, + "grad_norm": 0.16125743091106415, + "learning_rate": 1.9032686724558177e-05, + "loss": 1.1941, + "step": 4649 + }, + { + "epoch": 1.731218950332166, + "grad_norm": 0.17684829235076904, + "learning_rate": 1.9032165195513634e-05, + "loss": 1.2181, + "step": 4650 + }, + { + "epoch": 1.731591255482775, + "grad_norm": 0.1644441783428192, + "learning_rate": 1.9031643533064573e-05, + "loss": 1.2018, + "step": 4651 + }, + { + "epoch": 1.731963560633384, + "grad_norm": 0.1682443469762802, + "learning_rate": 1.9031121737218706e-05, + "loss": 1.197, + "step": 4652 + }, + { + "epoch": 1.7323358657839933, + "grad_norm": 0.16315114498138428, + "learning_rate": 1.9030599807983737e-05, + "loss": 1.2067, + "step": 4653 + }, + { + "epoch": 1.7327081709346022, + "grad_norm": 0.16800329089164734, + "learning_rate": 1.9030077745367377e-05, + "loss": 1.2031, + "step": 4654 + }, + { + "epoch": 1.7330804760852114, + "grad_norm": 0.16758573055267334, + "learning_rate": 1.9029555549377335e-05, + "loss": 1.1791, + "step": 4655 + }, + { + "epoch": 1.7334527812358203, + "grad_norm": 0.1693509966135025, + "learning_rate": 1.9029033220021325e-05, + "loss": 1.2177, + "step": 4656 + }, + { + "epoch": 1.7338250863864295, + "grad_norm": 0.15949083864688873, + "learning_rate": 1.902851075730706e-05, + "loss": 1.2019, + "step": 4657 + }, + { + "epoch": 1.7341973915370386, + "grad_norm": 0.16945861279964447, + "learning_rate": 1.9027988161242258e-05, + "loss": 1.1955, + "step": 4658 + }, + { + "epoch": 1.7345696966876476, + "grad_norm": 0.16500024497509003, + "learning_rate": 1.9027465431834637e-05, + "loss": 1.2125, + "step": 4659 + }, + { + "epoch": 1.7349420018382566, + "grad_norm": 0.1751050353050232, + "learning_rate": 1.9026942569091917e-05, + "loss": 1.1917, + "step": 4660 + }, + { + "epoch": 1.7353143069888657, + "grad_norm": 0.16122271120548248, + "learning_rate": 1.902641957302182e-05, + "loss": 1.1966, + "step": 4661 + }, + { + "epoch": 1.735686612139475, + "grad_norm": 0.16455887258052826, + "learning_rate": 1.9025896443632076e-05, + "loss": 1.1993, + "step": 4662 + }, + { + "epoch": 1.7360589172900838, + "grad_norm": 0.16358114778995514, + "learning_rate": 1.902537318093041e-05, + "loss": 1.2037, + "step": 4663 + }, + { + "epoch": 1.736431222440693, + "grad_norm": 0.16991184651851654, + "learning_rate": 1.9024849784924546e-05, + "loss": 1.2098, + "step": 4664 + }, + { + "epoch": 1.7368035275913019, + "grad_norm": 0.1610153317451477, + "learning_rate": 1.9024326255622215e-05, + "loss": 1.2134, + "step": 4665 + }, + { + "epoch": 1.7371758327419111, + "grad_norm": 0.16271455585956573, + "learning_rate": 1.9023802593031156e-05, + "loss": 1.2164, + "step": 4666 + }, + { + "epoch": 1.7375481378925202, + "grad_norm": 0.16444487869739532, + "learning_rate": 1.9023278797159096e-05, + "loss": 1.2124, + "step": 4667 + }, + { + "epoch": 1.7379204430431292, + "grad_norm": 0.16799621284008026, + "learning_rate": 1.9022754868013775e-05, + "loss": 1.2078, + "step": 4668 + }, + { + "epoch": 1.7382927481937382, + "grad_norm": 0.16734634339809418, + "learning_rate": 1.902223080560293e-05, + "loss": 1.2088, + "step": 4669 + }, + { + "epoch": 1.7386650533443473, + "grad_norm": 0.16235774755477905, + "learning_rate": 1.9021706609934305e-05, + "loss": 1.2093, + "step": 4670 + }, + { + "epoch": 1.7390373584949566, + "grad_norm": 0.1678633987903595, + "learning_rate": 1.9021182281015636e-05, + "loss": 1.2058, + "step": 4671 + }, + { + "epoch": 1.7394096636455654, + "grad_norm": 0.17294074594974518, + "learning_rate": 1.9020657818854673e-05, + "loss": 1.1982, + "step": 4672 + }, + { + "epoch": 1.7397819687961746, + "grad_norm": 0.17070721089839935, + "learning_rate": 1.902013322345916e-05, + "loss": 1.1967, + "step": 4673 + }, + { + "epoch": 1.7401542739467837, + "grad_norm": 0.16906553506851196, + "learning_rate": 1.9019608494836843e-05, + "loss": 1.2101, + "step": 4674 + }, + { + "epoch": 1.7405265790973927, + "grad_norm": 0.17362748086452484, + "learning_rate": 1.9019083632995476e-05, + "loss": 1.2191, + "step": 4675 + }, + { + "epoch": 1.7408988842480018, + "grad_norm": 0.17202383279800415, + "learning_rate": 1.9018558637942813e-05, + "loss": 1.2034, + "step": 4676 + }, + { + "epoch": 1.7412711893986108, + "grad_norm": 0.16142207384109497, + "learning_rate": 1.9018033509686603e-05, + "loss": 1.1993, + "step": 4677 + }, + { + "epoch": 1.7416434945492199, + "grad_norm": 0.16045573353767395, + "learning_rate": 1.9017508248234603e-05, + "loss": 1.2045, + "step": 4678 + }, + { + "epoch": 1.742015799699829, + "grad_norm": 0.16805481910705566, + "learning_rate": 1.901698285359457e-05, + "loss": 1.1967, + "step": 4679 + }, + { + "epoch": 1.7423881048504382, + "grad_norm": 0.16612930595874786, + "learning_rate": 1.9016457325774268e-05, + "loss": 1.1955, + "step": 4680 + }, + { + "epoch": 1.742760410001047, + "grad_norm": 0.16055123507976532, + "learning_rate": 1.901593166478146e-05, + "loss": 1.1979, + "step": 4681 + }, + { + "epoch": 1.7431327151516562, + "grad_norm": 0.16279493272304535, + "learning_rate": 1.90154058706239e-05, + "loss": 1.2181, + "step": 4682 + }, + { + "epoch": 1.7435050203022653, + "grad_norm": 0.18220022320747375, + "learning_rate": 1.9014879943309367e-05, + "loss": 1.2013, + "step": 4683 + }, + { + "epoch": 1.7438773254528743, + "grad_norm": 0.17012886703014374, + "learning_rate": 1.9014353882845626e-05, + "loss": 1.1997, + "step": 4684 + }, + { + "epoch": 1.7442496306034834, + "grad_norm": 0.16687701642513275, + "learning_rate": 1.9013827689240434e-05, + "loss": 1.1968, + "step": 4685 + }, + { + "epoch": 1.7446219357540924, + "grad_norm": 0.16534671187400818, + "learning_rate": 1.9013301362501583e-05, + "loss": 1.1906, + "step": 4686 + }, + { + "epoch": 1.7449942409047017, + "grad_norm": 0.16701330244541168, + "learning_rate": 1.901277490263683e-05, + "loss": 1.1929, + "step": 4687 + }, + { + "epoch": 1.7453665460553105, + "grad_norm": 0.17027342319488525, + "learning_rate": 1.901224830965396e-05, + "loss": 1.2095, + "step": 4688 + }, + { + "epoch": 1.7457388512059198, + "grad_norm": 0.17393416166305542, + "learning_rate": 1.9011721583560747e-05, + "loss": 1.2137, + "step": 4689 + }, + { + "epoch": 1.7461111563565286, + "grad_norm": 0.15819013118743896, + "learning_rate": 1.901119472436497e-05, + "loss": 1.2161, + "step": 4690 + }, + { + "epoch": 1.7464834615071378, + "grad_norm": 0.17175039649009705, + "learning_rate": 1.9010667732074415e-05, + "loss": 1.2104, + "step": 4691 + }, + { + "epoch": 1.746855766657747, + "grad_norm": 0.16250735521316528, + "learning_rate": 1.9010140606696865e-05, + "loss": 1.1984, + "step": 4692 + }, + { + "epoch": 1.747228071808356, + "grad_norm": 0.16283249855041504, + "learning_rate": 1.90096133482401e-05, + "loss": 1.1991, + "step": 4693 + }, + { + "epoch": 1.747600376958965, + "grad_norm": 0.16619625687599182, + "learning_rate": 1.9009085956711916e-05, + "loss": 1.2008, + "step": 4694 + }, + { + "epoch": 1.747972682109574, + "grad_norm": 0.17469939589500427, + "learning_rate": 1.9008558432120094e-05, + "loss": 1.2038, + "step": 4695 + }, + { + "epoch": 1.7483449872601833, + "grad_norm": 0.1646946370601654, + "learning_rate": 1.900803077447243e-05, + "loss": 1.1974, + "step": 4696 + }, + { + "epoch": 1.748717292410792, + "grad_norm": 0.17054390907287598, + "learning_rate": 1.9007502983776712e-05, + "loss": 1.2065, + "step": 4697 + }, + { + "epoch": 1.7490895975614014, + "grad_norm": 0.1633417159318924, + "learning_rate": 1.9006975060040746e-05, + "loss": 1.2084, + "step": 4698 + }, + { + "epoch": 1.7494619027120102, + "grad_norm": 0.18991155922412872, + "learning_rate": 1.9006447003272322e-05, + "loss": 1.1942, + "step": 4699 + }, + { + "epoch": 1.7498342078626195, + "grad_norm": 0.16292832791805267, + "learning_rate": 1.900591881347924e-05, + "loss": 1.1923, + "step": 4700 + }, + { + "epoch": 1.7502065130132285, + "grad_norm": 0.17191947996616364, + "learning_rate": 1.9005390490669305e-05, + "loss": 1.2142, + "step": 4701 + }, + { + "epoch": 1.7505788181638375, + "grad_norm": 0.17528562247753143, + "learning_rate": 1.9004862034850314e-05, + "loss": 1.2203, + "step": 4702 + }, + { + "epoch": 1.7509511233144466, + "grad_norm": 0.17579975724220276, + "learning_rate": 1.900433344603008e-05, + "loss": 1.2078, + "step": 4703 + }, + { + "epoch": 1.7513234284650556, + "grad_norm": 0.16660520434379578, + "learning_rate": 1.9003804724216402e-05, + "loss": 1.2123, + "step": 4704 + }, + { + "epoch": 1.7516957336156649, + "grad_norm": 0.16542848944664001, + "learning_rate": 1.90032758694171e-05, + "loss": 1.1933, + "step": 4705 + }, + { + "epoch": 1.7520680387662737, + "grad_norm": 0.16875770688056946, + "learning_rate": 1.9002746881639972e-05, + "loss": 1.1999, + "step": 4706 + }, + { + "epoch": 1.752440343916883, + "grad_norm": 0.1760881096124649, + "learning_rate": 1.900221776089284e-05, + "loss": 1.1932, + "step": 4707 + }, + { + "epoch": 1.7528126490674918, + "grad_norm": 0.17353412508964539, + "learning_rate": 1.9001688507183517e-05, + "loss": 1.2093, + "step": 4708 + }, + { + "epoch": 1.753184954218101, + "grad_norm": 0.16783374547958374, + "learning_rate": 1.9001159120519817e-05, + "loss": 1.2056, + "step": 4709 + }, + { + "epoch": 1.75355725936871, + "grad_norm": 0.1809961050748825, + "learning_rate": 1.9000629600909562e-05, + "loss": 1.21, + "step": 4710 + }, + { + "epoch": 1.7539295645193191, + "grad_norm": 0.1838044971227646, + "learning_rate": 1.9000099948360577e-05, + "loss": 1.1983, + "step": 4711 + }, + { + "epoch": 1.7543018696699282, + "grad_norm": 0.18065232038497925, + "learning_rate": 1.8999570162880676e-05, + "loss": 1.1836, + "step": 4712 + }, + { + "epoch": 1.7546741748205372, + "grad_norm": 0.17441041767597198, + "learning_rate": 1.899904024447769e-05, + "loss": 1.1929, + "step": 4713 + }, + { + "epoch": 1.7550464799711465, + "grad_norm": 0.16926120221614838, + "learning_rate": 1.8998510193159445e-05, + "loss": 1.2176, + "step": 4714 + }, + { + "epoch": 1.7554187851217553, + "grad_norm": 0.168484628200531, + "learning_rate": 1.8997980008933767e-05, + "loss": 1.2068, + "step": 4715 + }, + { + "epoch": 1.7557910902723646, + "grad_norm": 0.1681792289018631, + "learning_rate": 1.899744969180849e-05, + "loss": 1.1982, + "step": 4716 + }, + { + "epoch": 1.7561633954229734, + "grad_norm": 0.18074312806129456, + "learning_rate": 1.8996919241791446e-05, + "loss": 1.2009, + "step": 4717 + }, + { + "epoch": 1.7565357005735827, + "grad_norm": 0.17582470178604126, + "learning_rate": 1.899638865889047e-05, + "loss": 1.2103, + "step": 4718 + }, + { + "epoch": 1.7569080057241917, + "grad_norm": 0.17479169368743896, + "learning_rate": 1.89958579431134e-05, + "loss": 1.1962, + "step": 4719 + }, + { + "epoch": 1.7572803108748007, + "grad_norm": 0.21927672624588013, + "learning_rate": 1.899532709446807e-05, + "loss": 1.1914, + "step": 4720 + }, + { + "epoch": 1.7576526160254098, + "grad_norm": 0.18427489697933197, + "learning_rate": 1.8994796112962325e-05, + "loss": 1.189, + "step": 4721 + }, + { + "epoch": 1.7580249211760188, + "grad_norm": 0.17045900225639343, + "learning_rate": 1.8994264998604005e-05, + "loss": 1.192, + "step": 4722 + }, + { + "epoch": 1.758397226326628, + "grad_norm": 0.16136103868484497, + "learning_rate": 1.8993733751400953e-05, + "loss": 1.2078, + "step": 4723 + }, + { + "epoch": 1.758769531477237, + "grad_norm": 0.17973823845386505, + "learning_rate": 1.899320237136102e-05, + "loss": 1.2029, + "step": 4724 + }, + { + "epoch": 1.7591418366278462, + "grad_norm": 0.1666877269744873, + "learning_rate": 1.8992670858492053e-05, + "loss": 1.1991, + "step": 4725 + }, + { + "epoch": 1.7595141417784552, + "grad_norm": 0.16305620968341827, + "learning_rate": 1.89921392128019e-05, + "loss": 1.1932, + "step": 4726 + }, + { + "epoch": 1.7598864469290643, + "grad_norm": 0.1701706200838089, + "learning_rate": 1.899160743429842e-05, + "loss": 1.2117, + "step": 4727 + }, + { + "epoch": 1.7602587520796733, + "grad_norm": 0.17170703411102295, + "learning_rate": 1.8991075522989458e-05, + "loss": 1.1967, + "step": 4728 + }, + { + "epoch": 1.7606310572302823, + "grad_norm": 0.15608349442481995, + "learning_rate": 1.8990543478882875e-05, + "loss": 1.1778, + "step": 4729 + }, + { + "epoch": 1.7610033623808914, + "grad_norm": 0.17074252665042877, + "learning_rate": 1.899001130198653e-05, + "loss": 1.1975, + "step": 4730 + }, + { + "epoch": 1.7613756675315004, + "grad_norm": 0.168598935008049, + "learning_rate": 1.898947899230828e-05, + "loss": 1.2009, + "step": 4731 + }, + { + "epoch": 1.7617479726821097, + "grad_norm": 0.16988085210323334, + "learning_rate": 1.8988946549855994e-05, + "loss": 1.2064, + "step": 4732 + }, + { + "epoch": 1.7621202778327185, + "grad_norm": 0.16802845895290375, + "learning_rate": 1.8988413974637527e-05, + "loss": 1.1968, + "step": 4733 + }, + { + "epoch": 1.7624925829833278, + "grad_norm": 0.16767756640911102, + "learning_rate": 1.8987881266660754e-05, + "loss": 1.2002, + "step": 4734 + }, + { + "epoch": 1.7628648881339368, + "grad_norm": 0.16321119666099548, + "learning_rate": 1.8987348425933535e-05, + "loss": 1.2016, + "step": 4735 + }, + { + "epoch": 1.7632371932845459, + "grad_norm": 0.15938079357147217, + "learning_rate": 1.8986815452463747e-05, + "loss": 1.2024, + "step": 4736 + }, + { + "epoch": 1.763609498435155, + "grad_norm": 0.164838969707489, + "learning_rate": 1.8986282346259255e-05, + "loss": 1.1904, + "step": 4737 + }, + { + "epoch": 1.763981803585764, + "grad_norm": 0.16808170080184937, + "learning_rate": 1.898574910732794e-05, + "loss": 1.2027, + "step": 4738 + }, + { + "epoch": 1.764354108736373, + "grad_norm": 0.16249272227287292, + "learning_rate": 1.8985215735677673e-05, + "loss": 1.2025, + "step": 4739 + }, + { + "epoch": 1.764726413886982, + "grad_norm": 0.16439345479011536, + "learning_rate": 1.8984682231316335e-05, + "loss": 1.2028, + "step": 4740 + }, + { + "epoch": 1.7650987190375913, + "grad_norm": 0.16751708090305328, + "learning_rate": 1.89841485942518e-05, + "loss": 1.1973, + "step": 4741 + }, + { + "epoch": 1.7654710241882001, + "grad_norm": 0.15885969996452332, + "learning_rate": 1.8983614824491958e-05, + "loss": 1.1985, + "step": 4742 + }, + { + "epoch": 1.7658433293388094, + "grad_norm": 0.15906503796577454, + "learning_rate": 1.8983080922044687e-05, + "loss": 1.2051, + "step": 4743 + }, + { + "epoch": 1.7662156344894184, + "grad_norm": 0.16614754498004913, + "learning_rate": 1.8982546886917878e-05, + "loss": 1.2041, + "step": 4744 + }, + { + "epoch": 1.7665879396400275, + "grad_norm": 0.16927511990070343, + "learning_rate": 1.8982012719119414e-05, + "loss": 1.2092, + "step": 4745 + }, + { + "epoch": 1.7669602447906365, + "grad_norm": 0.16667069494724274, + "learning_rate": 1.8981478418657185e-05, + "loss": 1.2005, + "step": 4746 + }, + { + "epoch": 1.7673325499412456, + "grad_norm": 0.15757103264331818, + "learning_rate": 1.898094398553908e-05, + "loss": 1.2002, + "step": 4747 + }, + { + "epoch": 1.7677048550918548, + "grad_norm": 0.1850326508283615, + "learning_rate": 1.8980409419772998e-05, + "loss": 1.2049, + "step": 4748 + }, + { + "epoch": 1.7680771602424636, + "grad_norm": 0.1695263683795929, + "learning_rate": 1.8979874721366834e-05, + "loss": 1.1879, + "step": 4749 + }, + { + "epoch": 1.768449465393073, + "grad_norm": 0.17270156741142273, + "learning_rate": 1.8979339890328484e-05, + "loss": 1.2006, + "step": 4750 + }, + { + "epoch": 1.7688217705436817, + "grad_norm": 0.18274691700935364, + "learning_rate": 1.8978804926665848e-05, + "loss": 1.2109, + "step": 4751 + }, + { + "epoch": 1.769194075694291, + "grad_norm": 0.16077204048633575, + "learning_rate": 1.8978269830386825e-05, + "loss": 1.1919, + "step": 4752 + }, + { + "epoch": 1.7695663808449, + "grad_norm": 0.1679827719926834, + "learning_rate": 1.8977734601499322e-05, + "loss": 1.2057, + "step": 4753 + }, + { + "epoch": 1.769938685995509, + "grad_norm": 0.1700560301542282, + "learning_rate": 1.8977199240011237e-05, + "loss": 1.2149, + "step": 4754 + }, + { + "epoch": 1.7703109911461181, + "grad_norm": 0.17137514054775238, + "learning_rate": 1.8976663745930488e-05, + "loss": 1.2163, + "step": 4755 + }, + { + "epoch": 1.7706832962967272, + "grad_norm": 0.16699162125587463, + "learning_rate": 1.897612811926498e-05, + "loss": 1.1973, + "step": 4756 + }, + { + "epoch": 1.7710556014473364, + "grad_norm": 0.1748683899641037, + "learning_rate": 1.8975592360022616e-05, + "loss": 1.2049, + "step": 4757 + }, + { + "epoch": 1.7714279065979452, + "grad_norm": 0.17756815254688263, + "learning_rate": 1.8975056468211323e-05, + "loss": 1.1937, + "step": 4758 + }, + { + "epoch": 1.7718002117485545, + "grad_norm": 0.16323328018188477, + "learning_rate": 1.8974520443839007e-05, + "loss": 1.2013, + "step": 4759 + }, + { + "epoch": 1.7721725168991633, + "grad_norm": 0.19412948191165924, + "learning_rate": 1.8973984286913584e-05, + "loss": 1.2092, + "step": 4760 + }, + { + "epoch": 1.7725448220497726, + "grad_norm": 0.1670580357313156, + "learning_rate": 1.897344799744298e-05, + "loss": 1.2202, + "step": 4761 + }, + { + "epoch": 1.7729171272003816, + "grad_norm": 0.17248012125492096, + "learning_rate": 1.8972911575435112e-05, + "loss": 1.1881, + "step": 4762 + }, + { + "epoch": 1.7732894323509907, + "grad_norm": 0.17886854708194733, + "learning_rate": 1.8972375020897905e-05, + "loss": 1.2028, + "step": 4763 + }, + { + "epoch": 1.7736617375015997, + "grad_norm": 0.1719491332769394, + "learning_rate": 1.897183833383928e-05, + "loss": 1.2057, + "step": 4764 + }, + { + "epoch": 1.7740340426522088, + "grad_norm": 0.18928195536136627, + "learning_rate": 1.8971301514267162e-05, + "loss": 1.1843, + "step": 4765 + }, + { + "epoch": 1.774406347802818, + "grad_norm": 0.16735054552555084, + "learning_rate": 1.897076456218949e-05, + "loss": 1.1963, + "step": 4766 + }, + { + "epoch": 1.7747786529534268, + "grad_norm": 0.16620579361915588, + "learning_rate": 1.897022747761418e-05, + "loss": 1.2001, + "step": 4767 + }, + { + "epoch": 1.7751509581040361, + "grad_norm": 0.16270288825035095, + "learning_rate": 1.8969690260549183e-05, + "loss": 1.2023, + "step": 4768 + }, + { + "epoch": 1.775523263254645, + "grad_norm": 0.17014147341251373, + "learning_rate": 1.8969152911002417e-05, + "loss": 1.1932, + "step": 4769 + }, + { + "epoch": 1.7758955684052542, + "grad_norm": 0.1710130274295807, + "learning_rate": 1.896861542898183e-05, + "loss": 1.2129, + "step": 4770 + }, + { + "epoch": 1.7762678735558632, + "grad_norm": 0.16842330992221832, + "learning_rate": 1.8968077814495355e-05, + "loss": 1.195, + "step": 4771 + }, + { + "epoch": 1.7766401787064723, + "grad_norm": 0.16350851953029633, + "learning_rate": 1.896754006755093e-05, + "loss": 1.1914, + "step": 4772 + }, + { + "epoch": 1.7770124838570813, + "grad_norm": 0.18311436474323273, + "learning_rate": 1.8967002188156503e-05, + "loss": 1.1927, + "step": 4773 + }, + { + "epoch": 1.7773847890076904, + "grad_norm": 0.16735710203647614, + "learning_rate": 1.8966464176320015e-05, + "loss": 1.2019, + "step": 4774 + }, + { + "epoch": 1.7777570941582996, + "grad_norm": 0.16617412865161896, + "learning_rate": 1.8965926032049418e-05, + "loss": 1.2009, + "step": 4775 + }, + { + "epoch": 1.7781293993089085, + "grad_norm": 0.1925051510334015, + "learning_rate": 1.896538775535265e-05, + "loss": 1.1979, + "step": 4776 + }, + { + "epoch": 1.7785017044595177, + "grad_norm": 0.17179760336875916, + "learning_rate": 1.8964849346237676e-05, + "loss": 1.1963, + "step": 4777 + }, + { + "epoch": 1.7788740096101265, + "grad_norm": 0.172499880194664, + "learning_rate": 1.8964310804712435e-05, + "loss": 1.1916, + "step": 4778 + }, + { + "epoch": 1.7792463147607358, + "grad_norm": 0.1798262745141983, + "learning_rate": 1.8963772130784882e-05, + "loss": 1.2, + "step": 4779 + }, + { + "epoch": 1.7796186199113448, + "grad_norm": 0.16507485508918762, + "learning_rate": 1.8963233324462982e-05, + "loss": 1.1805, + "step": 4780 + }, + { + "epoch": 1.7799909250619539, + "grad_norm": 0.1641186773777008, + "learning_rate": 1.896269438575469e-05, + "loss": 1.1815, + "step": 4781 + }, + { + "epoch": 1.780363230212563, + "grad_norm": 0.16876539587974548, + "learning_rate": 1.896215531466796e-05, + "loss": 1.2031, + "step": 4782 + }, + { + "epoch": 1.780735535363172, + "grad_norm": 0.17245730757713318, + "learning_rate": 1.896161611121076e-05, + "loss": 1.1912, + "step": 4783 + }, + { + "epoch": 1.7811078405137812, + "grad_norm": 0.1740397810935974, + "learning_rate": 1.896107677539105e-05, + "loss": 1.2029, + "step": 4784 + }, + { + "epoch": 1.78148014566439, + "grad_norm": 0.1714610755443573, + "learning_rate": 1.8960537307216804e-05, + "loss": 1.208, + "step": 4785 + }, + { + "epoch": 1.7818524508149993, + "grad_norm": 0.17852343618869781, + "learning_rate": 1.895999770669598e-05, + "loss": 1.2083, + "step": 4786 + }, + { + "epoch": 1.7822247559656084, + "grad_norm": 0.16778838634490967, + "learning_rate": 1.8959457973836554e-05, + "loss": 1.2066, + "step": 4787 + }, + { + "epoch": 1.7825970611162174, + "grad_norm": 0.1763961762189865, + "learning_rate": 1.8958918108646495e-05, + "loss": 1.1975, + "step": 4788 + }, + { + "epoch": 1.7829693662668264, + "grad_norm": 0.16524644196033478, + "learning_rate": 1.8958378111133777e-05, + "loss": 1.2086, + "step": 4789 + }, + { + "epoch": 1.7833416714174355, + "grad_norm": 0.17384812235832214, + "learning_rate": 1.895783798130638e-05, + "loss": 1.1881, + "step": 4790 + }, + { + "epoch": 1.7837139765680445, + "grad_norm": 0.16518385708332062, + "learning_rate": 1.8957297719172278e-05, + "loss": 1.1994, + "step": 4791 + }, + { + "epoch": 1.7840862817186536, + "grad_norm": 0.16743482649326324, + "learning_rate": 1.8956757324739445e-05, + "loss": 1.199, + "step": 4792 + }, + { + "epoch": 1.7844585868692628, + "grad_norm": 0.1630256474018097, + "learning_rate": 1.8956216798015873e-05, + "loss": 1.2051, + "step": 4793 + }, + { + "epoch": 1.7848308920198717, + "grad_norm": 0.16784405708312988, + "learning_rate": 1.895567613900954e-05, + "loss": 1.2047, + "step": 4794 + }, + { + "epoch": 1.785203197170481, + "grad_norm": 0.16349439322948456, + "learning_rate": 1.8955135347728434e-05, + "loss": 1.1983, + "step": 4795 + }, + { + "epoch": 1.78557550232109, + "grad_norm": 0.16735652089118958, + "learning_rate": 1.895459442418054e-05, + "loss": 1.1949, + "step": 4796 + }, + { + "epoch": 1.785947807471699, + "grad_norm": 0.1649763435125351, + "learning_rate": 1.8954053368373846e-05, + "loss": 1.2026, + "step": 4797 + }, + { + "epoch": 1.786320112622308, + "grad_norm": 0.1692332774400711, + "learning_rate": 1.895351218031635e-05, + "loss": 1.2029, + "step": 4798 + }, + { + "epoch": 1.786692417772917, + "grad_norm": 0.1696692705154419, + "learning_rate": 1.895297086001604e-05, + "loss": 1.1935, + "step": 4799 + }, + { + "epoch": 1.7870647229235264, + "grad_norm": 0.1772766262292862, + "learning_rate": 1.8952429407480908e-05, + "loss": 1.1929, + "step": 4800 + }, + { + "epoch": 1.7874370280741352, + "grad_norm": 0.17916709184646606, + "learning_rate": 1.895188782271896e-05, + "loss": 1.2017, + "step": 4801 + }, + { + "epoch": 1.7878093332247444, + "grad_norm": 0.16637340188026428, + "learning_rate": 1.8951346105738188e-05, + "loss": 1.1952, + "step": 4802 + }, + { + "epoch": 1.7881816383753533, + "grad_norm": 0.20684456825256348, + "learning_rate": 1.8950804256546597e-05, + "loss": 1.2116, + "step": 4803 + }, + { + "epoch": 1.7885539435259625, + "grad_norm": 0.17692846059799194, + "learning_rate": 1.8950262275152188e-05, + "loss": 1.1984, + "step": 4804 + }, + { + "epoch": 1.7889262486765716, + "grad_norm": 0.17448139190673828, + "learning_rate": 1.8949720161562967e-05, + "loss": 1.1976, + "step": 4805 + }, + { + "epoch": 1.7892985538271806, + "grad_norm": 0.17009678483009338, + "learning_rate": 1.8949177915786942e-05, + "loss": 1.1925, + "step": 4806 + }, + { + "epoch": 1.7896708589777897, + "grad_norm": 0.17584814131259918, + "learning_rate": 1.894863553783212e-05, + "loss": 1.1907, + "step": 4807 + }, + { + "epoch": 1.7900431641283987, + "grad_norm": 0.17380961775779724, + "learning_rate": 1.8948093027706512e-05, + "loss": 1.2074, + "step": 4808 + }, + { + "epoch": 1.790415469279008, + "grad_norm": 0.1762208640575409, + "learning_rate": 1.8947550385418136e-05, + "loss": 1.214, + "step": 4809 + }, + { + "epoch": 1.7907877744296168, + "grad_norm": 0.16765017807483673, + "learning_rate": 1.8947007610974998e-05, + "loss": 1.2002, + "step": 4810 + }, + { + "epoch": 1.791160079580226, + "grad_norm": 0.197531059384346, + "learning_rate": 1.894646470438512e-05, + "loss": 1.1988, + "step": 4811 + }, + { + "epoch": 1.7915323847308349, + "grad_norm": 0.17233921587467194, + "learning_rate": 1.894592166565652e-05, + "loss": 1.1891, + "step": 4812 + }, + { + "epoch": 1.7919046898814441, + "grad_norm": 0.1792616844177246, + "learning_rate": 1.8945378494797216e-05, + "loss": 1.1974, + "step": 4813 + }, + { + "epoch": 1.7922769950320532, + "grad_norm": 0.16458769142627716, + "learning_rate": 1.8944835191815233e-05, + "loss": 1.2007, + "step": 4814 + }, + { + "epoch": 1.7926493001826622, + "grad_norm": 0.20892377197742462, + "learning_rate": 1.89442917567186e-05, + "loss": 1.1936, + "step": 4815 + }, + { + "epoch": 1.7930216053332713, + "grad_norm": 0.17339985072612762, + "learning_rate": 1.894374818951534e-05, + "loss": 1.1883, + "step": 4816 + }, + { + "epoch": 1.7933939104838803, + "grad_norm": 0.17067044973373413, + "learning_rate": 1.8943204490213474e-05, + "loss": 1.2031, + "step": 4817 + }, + { + "epoch": 1.7937662156344896, + "grad_norm": 0.15906625986099243, + "learning_rate": 1.8942660658821045e-05, + "loss": 1.2074, + "step": 4818 + }, + { + "epoch": 1.7941385207850984, + "grad_norm": 0.1729053556919098, + "learning_rate": 1.8942116695346073e-05, + "loss": 1.1915, + "step": 4819 + }, + { + "epoch": 1.7945108259357077, + "grad_norm": 0.16248184442520142, + "learning_rate": 1.8941572599796602e-05, + "loss": 1.1905, + "step": 4820 + }, + { + "epoch": 1.7948831310863165, + "grad_norm": 0.17368124425411224, + "learning_rate": 1.8941028372180667e-05, + "loss": 1.1896, + "step": 4821 + }, + { + "epoch": 1.7952554362369257, + "grad_norm": 0.1708829700946808, + "learning_rate": 1.8940484012506298e-05, + "loss": 1.2013, + "step": 4822 + }, + { + "epoch": 1.7956277413875348, + "grad_norm": 0.15987025201320648, + "learning_rate": 1.8939939520781546e-05, + "loss": 1.22, + "step": 4823 + }, + { + "epoch": 1.7960000465381438, + "grad_norm": 0.16917409002780914, + "learning_rate": 1.8939394897014448e-05, + "loss": 1.1861, + "step": 4824 + }, + { + "epoch": 1.7963723516887529, + "grad_norm": 0.1673855185508728, + "learning_rate": 1.893885014121305e-05, + "loss": 1.202, + "step": 4825 + }, + { + "epoch": 1.796744656839362, + "grad_norm": 0.1666531264781952, + "learning_rate": 1.8938305253385395e-05, + "loss": 1.2077, + "step": 4826 + }, + { + "epoch": 1.7971169619899712, + "grad_norm": 0.16617925465106964, + "learning_rate": 1.893776023353953e-05, + "loss": 1.2171, + "step": 4827 + }, + { + "epoch": 1.79748926714058, + "grad_norm": 0.16386044025421143, + "learning_rate": 1.893721508168351e-05, + "loss": 1.1957, + "step": 4828 + }, + { + "epoch": 1.7978615722911893, + "grad_norm": 0.19208215177059174, + "learning_rate": 1.8936669797825384e-05, + "loss": 1.2114, + "step": 4829 + }, + { + "epoch": 1.798233877441798, + "grad_norm": 0.16761602461338043, + "learning_rate": 1.8936124381973203e-05, + "loss": 1.194, + "step": 4830 + }, + { + "epoch": 1.7986061825924073, + "grad_norm": 0.1722363531589508, + "learning_rate": 1.893557883413503e-05, + "loss": 1.1789, + "step": 4831 + }, + { + "epoch": 1.7989784877430164, + "grad_norm": 0.1653546690940857, + "learning_rate": 1.8935033154318914e-05, + "loss": 1.2039, + "step": 4832 + }, + { + "epoch": 1.7993507928936254, + "grad_norm": 0.16508762538433075, + "learning_rate": 1.8934487342532925e-05, + "loss": 1.1938, + "step": 4833 + }, + { + "epoch": 1.7997230980442345, + "grad_norm": 0.17712639272212982, + "learning_rate": 1.893394139878511e-05, + "loss": 1.1873, + "step": 4834 + }, + { + "epoch": 1.8000954031948435, + "grad_norm": 0.17879736423492432, + "learning_rate": 1.8933395323083547e-05, + "loss": 1.2158, + "step": 4835 + }, + { + "epoch": 1.8004677083454528, + "grad_norm": 0.1629367172718048, + "learning_rate": 1.8932849115436296e-05, + "loss": 1.1977, + "step": 4836 + }, + { + "epoch": 1.8008400134960616, + "grad_norm": 0.16029439866542816, + "learning_rate": 1.8932302775851423e-05, + "loss": 1.2091, + "step": 4837 + }, + { + "epoch": 1.8012123186466709, + "grad_norm": 0.17447559535503387, + "learning_rate": 1.8931756304337e-05, + "loss": 1.2146, + "step": 4838 + }, + { + "epoch": 1.8015846237972797, + "grad_norm": 0.16988350450992584, + "learning_rate": 1.8931209700901096e-05, + "loss": 1.198, + "step": 4839 + }, + { + "epoch": 1.801956928947889, + "grad_norm": 0.1667841076850891, + "learning_rate": 1.8930662965551784e-05, + "loss": 1.1971, + "step": 4840 + }, + { + "epoch": 1.802329234098498, + "grad_norm": 0.17226542532444, + "learning_rate": 1.893011609829714e-05, + "loss": 1.1918, + "step": 4841 + }, + { + "epoch": 1.802701539249107, + "grad_norm": 0.16706976294517517, + "learning_rate": 1.892956909914524e-05, + "loss": 1.2056, + "step": 4842 + }, + { + "epoch": 1.803073844399716, + "grad_norm": 0.1757022887468338, + "learning_rate": 1.892902196810417e-05, + "loss": 1.211, + "step": 4843 + }, + { + "epoch": 1.8034461495503251, + "grad_norm": 0.17053164541721344, + "learning_rate": 1.8928474705182002e-05, + "loss": 1.219, + "step": 4844 + }, + { + "epoch": 1.8038184547009344, + "grad_norm": 0.1679467409849167, + "learning_rate": 1.8927927310386827e-05, + "loss": 1.1871, + "step": 4845 + }, + { + "epoch": 1.8041907598515432, + "grad_norm": 0.18097002804279327, + "learning_rate": 1.892737978372672e-05, + "loss": 1.2076, + "step": 4846 + }, + { + "epoch": 1.8045630650021525, + "grad_norm": 0.16347897052764893, + "learning_rate": 1.8926832125209776e-05, + "loss": 1.1959, + "step": 4847 + }, + { + "epoch": 1.8049353701527615, + "grad_norm": 0.16471904516220093, + "learning_rate": 1.8926284334844086e-05, + "loss": 1.1894, + "step": 4848 + }, + { + "epoch": 1.8053076753033706, + "grad_norm": 0.17791272699832916, + "learning_rate": 1.8925736412637734e-05, + "loss": 1.1922, + "step": 4849 + }, + { + "epoch": 1.8056799804539796, + "grad_norm": 0.16574202477931976, + "learning_rate": 1.8925188358598815e-05, + "loss": 1.2123, + "step": 4850 + }, + { + "epoch": 1.8060522856045886, + "grad_norm": 0.16811169683933258, + "learning_rate": 1.8924640172735423e-05, + "loss": 1.2018, + "step": 4851 + }, + { + "epoch": 1.8064245907551977, + "grad_norm": 0.1643866002559662, + "learning_rate": 1.8924091855055656e-05, + "loss": 1.1997, + "step": 4852 + }, + { + "epoch": 1.8067968959058067, + "grad_norm": 0.1829267144203186, + "learning_rate": 1.8923543405567612e-05, + "loss": 1.1984, + "step": 4853 + }, + { + "epoch": 1.807169201056416, + "grad_norm": 0.17171823978424072, + "learning_rate": 1.8922994824279394e-05, + "loss": 1.2109, + "step": 4854 + }, + { + "epoch": 1.8075415062070248, + "grad_norm": 0.1727183312177658, + "learning_rate": 1.89224461111991e-05, + "loss": 1.1947, + "step": 4855 + }, + { + "epoch": 1.807913811357634, + "grad_norm": 0.23297062516212463, + "learning_rate": 1.8921897266334837e-05, + "loss": 1.207, + "step": 4856 + }, + { + "epoch": 1.8082861165082431, + "grad_norm": 0.1721329540014267, + "learning_rate": 1.8921348289694713e-05, + "loss": 1.1893, + "step": 4857 + }, + { + "epoch": 1.8086584216588522, + "grad_norm": 0.17270666360855103, + "learning_rate": 1.8920799181286837e-05, + "loss": 1.2045, + "step": 4858 + }, + { + "epoch": 1.8090307268094612, + "grad_norm": 0.16383390128612518, + "learning_rate": 1.8920249941119313e-05, + "loss": 1.1815, + "step": 4859 + }, + { + "epoch": 1.8094030319600702, + "grad_norm": 0.16056787967681885, + "learning_rate": 1.891970056920026e-05, + "loss": 1.215, + "step": 4860 + }, + { + "epoch": 1.8097753371106795, + "grad_norm": 0.1632789522409439, + "learning_rate": 1.8919151065537788e-05, + "loss": 1.2041, + "step": 4861 + }, + { + "epoch": 1.8101476422612883, + "grad_norm": 0.17091669142246246, + "learning_rate": 1.8918601430140012e-05, + "loss": 1.2226, + "step": 4862 + }, + { + "epoch": 1.8105199474118976, + "grad_norm": 0.16258689761161804, + "learning_rate": 1.8918051663015057e-05, + "loss": 1.1869, + "step": 4863 + }, + { + "epoch": 1.8108922525625064, + "grad_norm": 0.1618296056985855, + "learning_rate": 1.8917501764171034e-05, + "loss": 1.211, + "step": 4864 + }, + { + "epoch": 1.8112645577131157, + "grad_norm": 0.1654951572418213, + "learning_rate": 1.8916951733616074e-05, + "loss": 1.2002, + "step": 4865 + }, + { + "epoch": 1.8116368628637247, + "grad_norm": 0.1661575883626938, + "learning_rate": 1.8916401571358292e-05, + "loss": 1.2041, + "step": 4866 + }, + { + "epoch": 1.8120091680143338, + "grad_norm": 0.16568347811698914, + "learning_rate": 1.8915851277405823e-05, + "loss": 1.2076, + "step": 4867 + }, + { + "epoch": 1.8123814731649428, + "grad_norm": 0.1610947996377945, + "learning_rate": 1.8915300851766786e-05, + "loss": 1.194, + "step": 4868 + }, + { + "epoch": 1.8127537783155518, + "grad_norm": 0.16845589876174927, + "learning_rate": 1.8914750294449317e-05, + "loss": 1.1927, + "step": 4869 + }, + { + "epoch": 1.813126083466161, + "grad_norm": 0.16183564066886902, + "learning_rate": 1.8914199605461546e-05, + "loss": 1.2055, + "step": 4870 + }, + { + "epoch": 1.81349838861677, + "grad_norm": 0.1744004338979721, + "learning_rate": 1.8913648784811607e-05, + "loss": 1.2018, + "step": 4871 + }, + { + "epoch": 1.8138706937673792, + "grad_norm": 0.1690053641796112, + "learning_rate": 1.8913097832507632e-05, + "loss": 1.199, + "step": 4872 + }, + { + "epoch": 1.814242998917988, + "grad_norm": 0.16530083119869232, + "learning_rate": 1.8912546748557762e-05, + "loss": 1.1827, + "step": 4873 + }, + { + "epoch": 1.8146153040685973, + "grad_norm": 0.17327569425106049, + "learning_rate": 1.891199553297014e-05, + "loss": 1.1972, + "step": 4874 + }, + { + "epoch": 1.8149876092192063, + "grad_norm": 0.163415789604187, + "learning_rate": 1.89114441857529e-05, + "loss": 1.1924, + "step": 4875 + }, + { + "epoch": 1.8153599143698154, + "grad_norm": 0.16614532470703125, + "learning_rate": 1.891089270691419e-05, + "loss": 1.1986, + "step": 4876 + }, + { + "epoch": 1.8157322195204244, + "grad_norm": 0.1664610356092453, + "learning_rate": 1.891034109646215e-05, + "loss": 1.2012, + "step": 4877 + }, + { + "epoch": 1.8161045246710334, + "grad_norm": 0.1688005030155182, + "learning_rate": 1.8909789354404934e-05, + "loss": 1.1825, + "step": 4878 + }, + { + "epoch": 1.8164768298216427, + "grad_norm": 0.16336117684841156, + "learning_rate": 1.890923748075069e-05, + "loss": 1.1941, + "step": 4879 + }, + { + "epoch": 1.8168491349722515, + "grad_norm": 0.17047706246376038, + "learning_rate": 1.8908685475507566e-05, + "loss": 1.1988, + "step": 4880 + }, + { + "epoch": 1.8172214401228608, + "grad_norm": 0.16434811055660248, + "learning_rate": 1.8908133338683715e-05, + "loss": 1.204, + "step": 4881 + }, + { + "epoch": 1.8175937452734696, + "grad_norm": 0.21952207386493683, + "learning_rate": 1.8907581070287295e-05, + "loss": 1.1928, + "step": 4882 + }, + { + "epoch": 1.8179660504240789, + "grad_norm": 0.1890069991350174, + "learning_rate": 1.8907028670326462e-05, + "loss": 1.2033, + "step": 4883 + }, + { + "epoch": 1.818338355574688, + "grad_norm": 0.17524456977844238, + "learning_rate": 1.8906476138809374e-05, + "loss": 1.1934, + "step": 4884 + }, + { + "epoch": 1.818710660725297, + "grad_norm": 0.17356589436531067, + "learning_rate": 1.890592347574419e-05, + "loss": 1.1977, + "step": 4885 + }, + { + "epoch": 1.819082965875906, + "grad_norm": 0.18048810958862305, + "learning_rate": 1.8905370681139083e-05, + "loss": 1.2109, + "step": 4886 + }, + { + "epoch": 1.819455271026515, + "grad_norm": 0.16388888657093048, + "learning_rate": 1.89048177550022e-05, + "loss": 1.1987, + "step": 4887 + }, + { + "epoch": 1.8198275761771243, + "grad_norm": 0.1702636033296585, + "learning_rate": 1.8904264697341723e-05, + "loss": 1.2081, + "step": 4888 + }, + { + "epoch": 1.8201998813277331, + "grad_norm": 0.16600562632083893, + "learning_rate": 1.8903711508165816e-05, + "loss": 1.1876, + "step": 4889 + }, + { + "epoch": 1.8205721864783424, + "grad_norm": 0.17300425469875336, + "learning_rate": 1.8903158187482646e-05, + "loss": 1.1927, + "step": 4890 + }, + { + "epoch": 1.8209444916289512, + "grad_norm": 0.18208463490009308, + "learning_rate": 1.890260473530039e-05, + "loss": 1.1933, + "step": 4891 + }, + { + "epoch": 1.8213167967795605, + "grad_norm": 0.16697552800178528, + "learning_rate": 1.890205115162722e-05, + "loss": 1.209, + "step": 4892 + }, + { + "epoch": 1.8216891019301695, + "grad_norm": 0.16606757044792175, + "learning_rate": 1.8901497436471314e-05, + "loss": 1.2104, + "step": 4893 + }, + { + "epoch": 1.8220614070807786, + "grad_norm": 0.16475386917591095, + "learning_rate": 1.890094358984085e-05, + "loss": 1.2086, + "step": 4894 + }, + { + "epoch": 1.8224337122313876, + "grad_norm": 0.16822832822799683, + "learning_rate": 1.890038961174401e-05, + "loss": 1.2009, + "step": 4895 + }, + { + "epoch": 1.8228060173819967, + "grad_norm": 0.1646614372730255, + "learning_rate": 1.889983550218897e-05, + "loss": 1.1945, + "step": 4896 + }, + { + "epoch": 1.823178322532606, + "grad_norm": 0.16594818234443665, + "learning_rate": 1.8899281261183916e-05, + "loss": 1.2005, + "step": 4897 + }, + { + "epoch": 1.8235506276832147, + "grad_norm": 0.15717746317386627, + "learning_rate": 1.889872688873704e-05, + "loss": 1.1948, + "step": 4898 + }, + { + "epoch": 1.823922932833824, + "grad_norm": 0.16720950603485107, + "learning_rate": 1.8898172384856526e-05, + "loss": 1.1799, + "step": 4899 + }, + { + "epoch": 1.8242952379844328, + "grad_norm": 0.17484848201274872, + "learning_rate": 1.8897617749550565e-05, + "loss": 1.1892, + "step": 4900 + }, + { + "epoch": 1.824667543135042, + "grad_norm": 0.16109175980091095, + "learning_rate": 1.8897062982827347e-05, + "loss": 1.1825, + "step": 4901 + }, + { + "epoch": 1.8250398482856511, + "grad_norm": 0.1625639796257019, + "learning_rate": 1.8896508084695068e-05, + "loss": 1.201, + "step": 4902 + }, + { + "epoch": 1.8254121534362602, + "grad_norm": 0.16512739658355713, + "learning_rate": 1.889595305516192e-05, + "loss": 1.2067, + "step": 4903 + }, + { + "epoch": 1.8257844585868692, + "grad_norm": 0.15775848925113678, + "learning_rate": 1.889539789423611e-05, + "loss": 1.1929, + "step": 4904 + }, + { + "epoch": 1.8261567637374783, + "grad_norm": 0.1584419310092926, + "learning_rate": 1.8894842601925823e-05, + "loss": 1.1877, + "step": 4905 + }, + { + "epoch": 1.8265290688880875, + "grad_norm": 0.16281954944133759, + "learning_rate": 1.8894287178239274e-05, + "loss": 1.1972, + "step": 4906 + }, + { + "epoch": 1.8269013740386963, + "grad_norm": 0.16380088031291962, + "learning_rate": 1.889373162318466e-05, + "loss": 1.18, + "step": 4907 + }, + { + "epoch": 1.8272736791893056, + "grad_norm": 0.17226645350456238, + "learning_rate": 1.8893175936770188e-05, + "loss": 1.2165, + "step": 4908 + }, + { + "epoch": 1.8276459843399147, + "grad_norm": 0.16510790586471558, + "learning_rate": 1.8892620119004067e-05, + "loss": 1.2002, + "step": 4909 + }, + { + "epoch": 1.8280182894905237, + "grad_norm": 0.18165461719036102, + "learning_rate": 1.8892064169894504e-05, + "loss": 1.1977, + "step": 4910 + }, + { + "epoch": 1.8283905946411327, + "grad_norm": 0.2052212953567505, + "learning_rate": 1.889150808944971e-05, + "loss": 1.1974, + "step": 4911 + }, + { + "epoch": 1.8287628997917418, + "grad_norm": 0.18859870731830597, + "learning_rate": 1.8890951877677903e-05, + "loss": 1.1985, + "step": 4912 + }, + { + "epoch": 1.8291352049423508, + "grad_norm": 0.16892032325267792, + "learning_rate": 1.889039553458729e-05, + "loss": 1.2059, + "step": 4913 + }, + { + "epoch": 1.8295075100929599, + "grad_norm": 0.16577079892158508, + "learning_rate": 1.8889839060186095e-05, + "loss": 1.1789, + "step": 4914 + }, + { + "epoch": 1.8298798152435691, + "grad_norm": 0.18138210475444794, + "learning_rate": 1.8889282454482538e-05, + "loss": 1.2013, + "step": 4915 + }, + { + "epoch": 1.830252120394178, + "grad_norm": 0.17776082456111908, + "learning_rate": 1.8888725717484834e-05, + "loss": 1.2137, + "step": 4916 + }, + { + "epoch": 1.8306244255447872, + "grad_norm": 0.1690797209739685, + "learning_rate": 1.888816884920121e-05, + "loss": 1.2019, + "step": 4917 + }, + { + "epoch": 1.8309967306953963, + "grad_norm": 0.17596592009067535, + "learning_rate": 1.888761184963989e-05, + "loss": 1.209, + "step": 4918 + }, + { + "epoch": 1.8313690358460053, + "grad_norm": 0.17080815136432648, + "learning_rate": 1.88870547188091e-05, + "loss": 1.1965, + "step": 4919 + }, + { + "epoch": 1.8317413409966143, + "grad_norm": 0.21370607614517212, + "learning_rate": 1.8886497456717073e-05, + "loss": 1.1991, + "step": 4920 + }, + { + "epoch": 1.8321136461472234, + "grad_norm": 0.18233619630336761, + "learning_rate": 1.888594006337203e-05, + "loss": 1.2039, + "step": 4921 + }, + { + "epoch": 1.8324859512978326, + "grad_norm": 0.1737797111272812, + "learning_rate": 1.8885382538782213e-05, + "loss": 1.1976, + "step": 4922 + }, + { + "epoch": 1.8328582564484415, + "grad_norm": 0.16964221000671387, + "learning_rate": 1.8884824882955853e-05, + "loss": 1.1955, + "step": 4923 + }, + { + "epoch": 1.8332305615990507, + "grad_norm": 0.18157334625720978, + "learning_rate": 1.888426709590119e-05, + "loss": 1.1951, + "step": 4924 + }, + { + "epoch": 1.8336028667496596, + "grad_norm": 0.1712358593940735, + "learning_rate": 1.8883709177626456e-05, + "loss": 1.1945, + "step": 4925 + }, + { + "epoch": 1.8339751719002688, + "grad_norm": 0.16328178346157074, + "learning_rate": 1.8883151128139898e-05, + "loss": 1.194, + "step": 4926 + }, + { + "epoch": 1.8343474770508779, + "grad_norm": 0.16288742423057556, + "learning_rate": 1.8882592947449753e-05, + "loss": 1.1876, + "step": 4927 + }, + { + "epoch": 1.834719782201487, + "grad_norm": 0.1691894680261612, + "learning_rate": 1.8882034635564266e-05, + "loss": 1.2148, + "step": 4928 + }, + { + "epoch": 1.835092087352096, + "grad_norm": 0.16904255747795105, + "learning_rate": 1.888147619249169e-05, + "loss": 1.1911, + "step": 4929 + }, + { + "epoch": 1.835464392502705, + "grad_norm": 0.1638314574956894, + "learning_rate": 1.8880917618240265e-05, + "loss": 1.1922, + "step": 4930 + }, + { + "epoch": 1.8358366976533143, + "grad_norm": 0.17079859972000122, + "learning_rate": 1.888035891281824e-05, + "loss": 1.1861, + "step": 4931 + }, + { + "epoch": 1.836209002803923, + "grad_norm": 0.1680002361536026, + "learning_rate": 1.8879800076233875e-05, + "loss": 1.2024, + "step": 4932 + }, + { + "epoch": 1.8365813079545323, + "grad_norm": 0.15936627984046936, + "learning_rate": 1.8879241108495423e-05, + "loss": 1.1981, + "step": 4933 + }, + { + "epoch": 1.8369536131051412, + "grad_norm": 0.16010630130767822, + "learning_rate": 1.8878682009611134e-05, + "loss": 1.1983, + "step": 4934 + }, + { + "epoch": 1.8373259182557504, + "grad_norm": 0.16550739109516144, + "learning_rate": 1.887812277958927e-05, + "loss": 1.2154, + "step": 4935 + }, + { + "epoch": 1.8376982234063595, + "grad_norm": 0.16892309486865997, + "learning_rate": 1.887756341843809e-05, + "loss": 1.2044, + "step": 4936 + }, + { + "epoch": 1.8380705285569685, + "grad_norm": 0.16828759014606476, + "learning_rate": 1.8877003926165852e-05, + "loss": 1.2074, + "step": 4937 + }, + { + "epoch": 1.8384428337075776, + "grad_norm": 0.16481132805347443, + "learning_rate": 1.8876444302780826e-05, + "loss": 1.2012, + "step": 4938 + }, + { + "epoch": 1.8388151388581866, + "grad_norm": 0.1642160415649414, + "learning_rate": 1.8875884548291274e-05, + "loss": 1.195, + "step": 4939 + }, + { + "epoch": 1.8391874440087959, + "grad_norm": 0.15789036452770233, + "learning_rate": 1.8875324662705467e-05, + "loss": 1.2151, + "step": 4940 + }, + { + "epoch": 1.8395597491594047, + "grad_norm": 0.1689959466457367, + "learning_rate": 1.8874764646031665e-05, + "loss": 1.1944, + "step": 4941 + }, + { + "epoch": 1.839932054310014, + "grad_norm": 0.17505736649036407, + "learning_rate": 1.8874204498278153e-05, + "loss": 1.2001, + "step": 4942 + }, + { + "epoch": 1.8403043594606228, + "grad_norm": 0.1680225431919098, + "learning_rate": 1.8873644219453194e-05, + "loss": 1.2081, + "step": 4943 + }, + { + "epoch": 1.840676664611232, + "grad_norm": 0.17328615486621857, + "learning_rate": 1.887308380956507e-05, + "loss": 1.2141, + "step": 4944 + }, + { + "epoch": 1.841048969761841, + "grad_norm": 0.16859564185142517, + "learning_rate": 1.887252326862205e-05, + "loss": 1.1836, + "step": 4945 + }, + { + "epoch": 1.8414212749124501, + "grad_norm": 0.17477625608444214, + "learning_rate": 1.887196259663242e-05, + "loss": 1.2045, + "step": 4946 + }, + { + "epoch": 1.8417935800630592, + "grad_norm": 0.16840267181396484, + "learning_rate": 1.8871401793604463e-05, + "loss": 1.2002, + "step": 4947 + }, + { + "epoch": 1.8421658852136682, + "grad_norm": 0.16595827043056488, + "learning_rate": 1.8870840859546455e-05, + "loss": 1.1985, + "step": 4948 + }, + { + "epoch": 1.8425381903642775, + "grad_norm": 0.17932920157909393, + "learning_rate": 1.8870279794466686e-05, + "loss": 1.1966, + "step": 4949 + }, + { + "epoch": 1.8429104955148863, + "grad_norm": 0.17470277845859528, + "learning_rate": 1.8869718598373438e-05, + "loss": 1.1887, + "step": 4950 + }, + { + "epoch": 1.8432828006654955, + "grad_norm": 0.16657747328281403, + "learning_rate": 1.8869157271275008e-05, + "loss": 1.191, + "step": 4951 + }, + { + "epoch": 1.8436551058161044, + "grad_norm": 0.1673271358013153, + "learning_rate": 1.8868595813179677e-05, + "loss": 1.192, + "step": 4952 + }, + { + "epoch": 1.8440274109667136, + "grad_norm": 0.1705188900232315, + "learning_rate": 1.8868034224095742e-05, + "loss": 1.2007, + "step": 4953 + }, + { + "epoch": 1.8443997161173227, + "grad_norm": 0.1590953916311264, + "learning_rate": 1.88674725040315e-05, + "loss": 1.2006, + "step": 4954 + }, + { + "epoch": 1.8447720212679317, + "grad_norm": 0.16473130881786346, + "learning_rate": 1.8866910652995244e-05, + "loss": 1.2069, + "step": 4955 + }, + { + "epoch": 1.8451443264185408, + "grad_norm": 0.16664549708366394, + "learning_rate": 1.886634867099528e-05, + "loss": 1.1985, + "step": 4956 + }, + { + "epoch": 1.8455166315691498, + "grad_norm": 0.1601409912109375, + "learning_rate": 1.8865786558039895e-05, + "loss": 1.1817, + "step": 4957 + }, + { + "epoch": 1.845888936719759, + "grad_norm": 0.16703009605407715, + "learning_rate": 1.8865224314137404e-05, + "loss": 1.1998, + "step": 4958 + }, + { + "epoch": 1.8462612418703679, + "grad_norm": 0.16715413331985474, + "learning_rate": 1.88646619392961e-05, + "loss": 1.1901, + "step": 4959 + }, + { + "epoch": 1.8466335470209772, + "grad_norm": 0.18171849846839905, + "learning_rate": 1.8864099433524302e-05, + "loss": 1.2008, + "step": 4960 + }, + { + "epoch": 1.847005852171586, + "grad_norm": 0.20484322309494019, + "learning_rate": 1.886353679683031e-05, + "loss": 1.1864, + "step": 4961 + }, + { + "epoch": 1.8473781573221952, + "grad_norm": 0.19610385596752167, + "learning_rate": 1.8862974029222438e-05, + "loss": 1.2133, + "step": 4962 + }, + { + "epoch": 1.8477504624728043, + "grad_norm": 0.18955834209918976, + "learning_rate": 1.8862411130708992e-05, + "loss": 1.2175, + "step": 4963 + }, + { + "epoch": 1.8481227676234133, + "grad_norm": 0.2106134295463562, + "learning_rate": 1.8861848101298287e-05, + "loss": 1.2053, + "step": 4964 + }, + { + "epoch": 1.8484950727740224, + "grad_norm": 0.16345474123954773, + "learning_rate": 1.8861284940998647e-05, + "loss": 1.2037, + "step": 4965 + }, + { + "epoch": 1.8488673779246314, + "grad_norm": 0.16316930949687958, + "learning_rate": 1.8860721649818383e-05, + "loss": 1.197, + "step": 4966 + }, + { + "epoch": 1.8492396830752407, + "grad_norm": 0.16572824120521545, + "learning_rate": 1.8860158227765816e-05, + "loss": 1.1979, + "step": 4967 + }, + { + "epoch": 1.8496119882258495, + "grad_norm": 0.16799062490463257, + "learning_rate": 1.8859594674849267e-05, + "loss": 1.1983, + "step": 4968 + }, + { + "epoch": 1.8499842933764588, + "grad_norm": 0.15991763770580292, + "learning_rate": 1.8859030991077062e-05, + "loss": 1.2133, + "step": 4969 + }, + { + "epoch": 1.8503565985270678, + "grad_norm": 0.1664821207523346, + "learning_rate": 1.8858467176457528e-05, + "loss": 1.2058, + "step": 4970 + }, + { + "epoch": 1.8507289036776768, + "grad_norm": 0.16562390327453613, + "learning_rate": 1.8857903230998986e-05, + "loss": 1.2027, + "step": 4971 + }, + { + "epoch": 1.8511012088282859, + "grad_norm": 0.16051267087459564, + "learning_rate": 1.885733915470977e-05, + "loss": 1.1803, + "step": 4972 + }, + { + "epoch": 1.851473513978895, + "grad_norm": 0.1623574048280716, + "learning_rate": 1.8856774947598212e-05, + "loss": 1.1947, + "step": 4973 + }, + { + "epoch": 1.851845819129504, + "grad_norm": 0.16899898648262024, + "learning_rate": 1.8856210609672643e-05, + "loss": 1.2068, + "step": 4974 + }, + { + "epoch": 1.852218124280113, + "grad_norm": 0.16852141916751862, + "learning_rate": 1.88556461409414e-05, + "loss": 1.1886, + "step": 4975 + }, + { + "epoch": 1.8525904294307223, + "grad_norm": 0.16997982561588287, + "learning_rate": 1.8855081541412814e-05, + "loss": 1.1986, + "step": 4976 + }, + { + "epoch": 1.852962734581331, + "grad_norm": 0.17644338309764862, + "learning_rate": 1.8854516811095234e-05, + "loss": 1.1934, + "step": 4977 + }, + { + "epoch": 1.8533350397319404, + "grad_norm": 0.18668413162231445, + "learning_rate": 1.8853951949996997e-05, + "loss": 1.1824, + "step": 4978 + }, + { + "epoch": 1.8537073448825494, + "grad_norm": 0.16832074522972107, + "learning_rate": 1.8853386958126444e-05, + "loss": 1.1913, + "step": 4979 + }, + { + "epoch": 1.8540796500331584, + "grad_norm": 0.18081939220428467, + "learning_rate": 1.885282183549192e-05, + "loss": 1.1971, + "step": 4980 + }, + { + "epoch": 1.8544519551837675, + "grad_norm": 0.17961086332798004, + "learning_rate": 1.8852256582101772e-05, + "loss": 1.206, + "step": 4981 + }, + { + "epoch": 1.8548242603343765, + "grad_norm": 0.16785657405853271, + "learning_rate": 1.8851691197964356e-05, + "loss": 1.1924, + "step": 4982 + }, + { + "epoch": 1.8551965654849858, + "grad_norm": 0.17671982944011688, + "learning_rate": 1.885112568308801e-05, + "loss": 1.1868, + "step": 4983 + }, + { + "epoch": 1.8555688706355946, + "grad_norm": 0.16151179373264313, + "learning_rate": 1.8850560037481095e-05, + "loss": 1.1999, + "step": 4984 + }, + { + "epoch": 1.8559411757862039, + "grad_norm": 0.18711332976818085, + "learning_rate": 1.8849994261151968e-05, + "loss": 1.1959, + "step": 4985 + }, + { + "epoch": 1.8563134809368127, + "grad_norm": 0.16686466336250305, + "learning_rate": 1.8849428354108977e-05, + "loss": 1.2055, + "step": 4986 + }, + { + "epoch": 1.856685786087422, + "grad_norm": 0.17078348994255066, + "learning_rate": 1.8848862316360485e-05, + "loss": 1.2076, + "step": 4987 + }, + { + "epoch": 1.857058091238031, + "grad_norm": 0.18692351877689362, + "learning_rate": 1.884829614791485e-05, + "loss": 1.2099, + "step": 4988 + }, + { + "epoch": 1.85743039638864, + "grad_norm": 0.16382160782814026, + "learning_rate": 1.884772984878044e-05, + "loss": 1.2002, + "step": 4989 + }, + { + "epoch": 1.857802701539249, + "grad_norm": 0.16507616639137268, + "learning_rate": 1.8847163418965613e-05, + "loss": 1.2038, + "step": 4990 + }, + { + "epoch": 1.8581750066898581, + "grad_norm": 0.1630743145942688, + "learning_rate": 1.884659685847874e-05, + "loss": 1.2044, + "step": 4991 + }, + { + "epoch": 1.8585473118404674, + "grad_norm": 0.18251408636569977, + "learning_rate": 1.884603016732818e-05, + "loss": 1.1941, + "step": 4992 + }, + { + "epoch": 1.8589196169910762, + "grad_norm": 0.1670679897069931, + "learning_rate": 1.8845463345522317e-05, + "loss": 1.1857, + "step": 4993 + }, + { + "epoch": 1.8592919221416855, + "grad_norm": 0.17508168518543243, + "learning_rate": 1.8844896393069514e-05, + "loss": 1.1886, + "step": 4994 + }, + { + "epoch": 1.8596642272922943, + "grad_norm": 0.16918566823005676, + "learning_rate": 1.8844329309978146e-05, + "loss": 1.2011, + "step": 4995 + }, + { + "epoch": 1.8600365324429036, + "grad_norm": 0.18971340358257294, + "learning_rate": 1.8843762096256587e-05, + "loss": 1.2074, + "step": 4996 + }, + { + "epoch": 1.8604088375935126, + "grad_norm": 0.1661243587732315, + "learning_rate": 1.8843194751913217e-05, + "loss": 1.194, + "step": 4997 + }, + { + "epoch": 1.8607811427441217, + "grad_norm": 0.1802155077457428, + "learning_rate": 1.8842627276956418e-05, + "loss": 1.1916, + "step": 4998 + }, + { + "epoch": 1.8611534478947307, + "grad_norm": 0.16783170402050018, + "learning_rate": 1.884205967139457e-05, + "loss": 1.1981, + "step": 4999 + }, + { + "epoch": 1.8615257530453397, + "grad_norm": 0.19096462428569794, + "learning_rate": 1.884149193523605e-05, + "loss": 1.2114, + "step": 5000 + }, + { + "epoch": 1.8615257530453397, + "eval_loss": 1.3059989213943481, + "eval_runtime": 16.6737, + "eval_samples_per_second": 103.996, + "eval_steps_per_second": 5.218, + "step": 5000 + }, + { + "epoch": 1.861898058195949, + "grad_norm": 0.16396373510360718, + "learning_rate": 1.884092406848925e-05, + "loss": 1.2119, + "step": 5001 + }, + { + "epoch": 1.8622703633465578, + "grad_norm": 0.17009267210960388, + "learning_rate": 1.8840356071162565e-05, + "loss": 1.1872, + "step": 5002 + }, + { + "epoch": 1.862642668497167, + "grad_norm": 0.16647803783416748, + "learning_rate": 1.8839787943264367e-05, + "loss": 1.199, + "step": 5003 + }, + { + "epoch": 1.863014973647776, + "grad_norm": 0.17351815104484558, + "learning_rate": 1.8839219684803057e-05, + "loss": 1.2002, + "step": 5004 + }, + { + "epoch": 1.8633872787983852, + "grad_norm": 0.17014597356319427, + "learning_rate": 1.8838651295787028e-05, + "loss": 1.205, + "step": 5005 + }, + { + "epoch": 1.8637595839489942, + "grad_norm": 0.17426498234272003, + "learning_rate": 1.8838082776224675e-05, + "loss": 1.2107, + "step": 5006 + }, + { + "epoch": 1.8641318890996033, + "grad_norm": 0.17104274034500122, + "learning_rate": 1.883751412612439e-05, + "loss": 1.2051, + "step": 5007 + }, + { + "epoch": 1.8645041942502123, + "grad_norm": 0.16714368760585785, + "learning_rate": 1.8836945345494584e-05, + "loss": 1.1975, + "step": 5008 + }, + { + "epoch": 1.8648764994008213, + "grad_norm": 0.1665792465209961, + "learning_rate": 1.8836376434343644e-05, + "loss": 1.2, + "step": 5009 + }, + { + "epoch": 1.8652488045514306, + "grad_norm": 0.16620507836341858, + "learning_rate": 1.8835807392679978e-05, + "loss": 1.1974, + "step": 5010 + }, + { + "epoch": 1.8656211097020394, + "grad_norm": 0.16727782785892487, + "learning_rate": 1.8835238220511997e-05, + "loss": 1.2209, + "step": 5011 + }, + { + "epoch": 1.8659934148526487, + "grad_norm": 0.17242635786533356, + "learning_rate": 1.8834668917848097e-05, + "loss": 1.2057, + "step": 5012 + }, + { + "epoch": 1.8663657200032575, + "grad_norm": 0.16240327060222626, + "learning_rate": 1.883409948469669e-05, + "loss": 1.1868, + "step": 5013 + }, + { + "epoch": 1.8667380251538668, + "grad_norm": 0.16446274518966675, + "learning_rate": 1.8833529921066193e-05, + "loss": 1.1848, + "step": 5014 + }, + { + "epoch": 1.8671103303044758, + "grad_norm": 0.16615699231624603, + "learning_rate": 1.883296022696501e-05, + "loss": 1.1968, + "step": 5015 + }, + { + "epoch": 1.8674826354550849, + "grad_norm": 0.15990613400936127, + "learning_rate": 1.883239040240156e-05, + "loss": 1.1975, + "step": 5016 + }, + { + "epoch": 1.867854940605694, + "grad_norm": 0.1651614010334015, + "learning_rate": 1.8831820447384256e-05, + "loss": 1.1967, + "step": 5017 + }, + { + "epoch": 1.868227245756303, + "grad_norm": 0.16867800056934357, + "learning_rate": 1.8831250361921522e-05, + "loss": 1.2167, + "step": 5018 + }, + { + "epoch": 1.8685995509069122, + "grad_norm": 0.1658807247877121, + "learning_rate": 1.8830680146021773e-05, + "loss": 1.1987, + "step": 5019 + }, + { + "epoch": 1.868971856057521, + "grad_norm": 0.1625361442565918, + "learning_rate": 1.8830109799693434e-05, + "loss": 1.2042, + "step": 5020 + }, + { + "epoch": 1.8693441612081303, + "grad_norm": 0.47724783420562744, + "learning_rate": 1.882953932294492e-05, + "loss": 1.2001, + "step": 5021 + }, + { + "epoch": 1.8697164663587391, + "grad_norm": 0.1839105486869812, + "learning_rate": 1.882896871578467e-05, + "loss": 1.1972, + "step": 5022 + }, + { + "epoch": 1.8700887715093484, + "grad_norm": 0.22842104732990265, + "learning_rate": 1.8828397978221108e-05, + "loss": 1.1797, + "step": 5023 + }, + { + "epoch": 1.8704610766599574, + "grad_norm": 0.18718920648097992, + "learning_rate": 1.882782711026266e-05, + "loss": 1.1971, + "step": 5024 + }, + { + "epoch": 1.8708333818105665, + "grad_norm": 0.17810329794883728, + "learning_rate": 1.8827256111917757e-05, + "loss": 1.1894, + "step": 5025 + }, + { + "epoch": 1.8712056869611755, + "grad_norm": 0.16336943209171295, + "learning_rate": 1.882668498319484e-05, + "loss": 1.2008, + "step": 5026 + }, + { + "epoch": 1.8715779921117845, + "grad_norm": 0.18069854378700256, + "learning_rate": 1.882611372410234e-05, + "loss": 1.198, + "step": 5027 + }, + { + "epoch": 1.8719502972623938, + "grad_norm": 0.17028245329856873, + "learning_rate": 1.8825542334648687e-05, + "loss": 1.1885, + "step": 5028 + }, + { + "epoch": 1.8723226024130026, + "grad_norm": 0.16441023349761963, + "learning_rate": 1.8824970814842332e-05, + "loss": 1.1954, + "step": 5029 + }, + { + "epoch": 1.872694907563612, + "grad_norm": 0.16318555176258087, + "learning_rate": 1.8824399164691712e-05, + "loss": 1.1845, + "step": 5030 + }, + { + "epoch": 1.873067212714221, + "grad_norm": 0.16445982456207275, + "learning_rate": 1.882382738420527e-05, + "loss": 1.1954, + "step": 5031 + }, + { + "epoch": 1.87343951786483, + "grad_norm": 0.16694188117980957, + "learning_rate": 1.8823255473391454e-05, + "loss": 1.2076, + "step": 5032 + }, + { + "epoch": 1.873811823015439, + "grad_norm": 0.17121298611164093, + "learning_rate": 1.8822683432258703e-05, + "loss": 1.2013, + "step": 5033 + }, + { + "epoch": 1.874184128166048, + "grad_norm": 0.16623573005199432, + "learning_rate": 1.8822111260815475e-05, + "loss": 1.1941, + "step": 5034 + }, + { + "epoch": 1.8745564333166571, + "grad_norm": 0.16473905742168427, + "learning_rate": 1.882153895907022e-05, + "loss": 1.1876, + "step": 5035 + }, + { + "epoch": 1.8749287384672662, + "grad_norm": 0.1598900854587555, + "learning_rate": 1.8820966527031383e-05, + "loss": 1.2061, + "step": 5036 + }, + { + "epoch": 1.8753010436178754, + "grad_norm": 0.16518044471740723, + "learning_rate": 1.8820393964707424e-05, + "loss": 1.1909, + "step": 5037 + }, + { + "epoch": 1.8756733487684842, + "grad_norm": 0.1635928601026535, + "learning_rate": 1.8819821272106803e-05, + "loss": 1.1977, + "step": 5038 + }, + { + "epoch": 1.8760456539190935, + "grad_norm": 0.16230987012386322, + "learning_rate": 1.8819248449237973e-05, + "loss": 1.1884, + "step": 5039 + }, + { + "epoch": 1.8764179590697025, + "grad_norm": 0.16847163438796997, + "learning_rate": 1.8818675496109398e-05, + "loss": 1.1951, + "step": 5040 + }, + { + "epoch": 1.8767902642203116, + "grad_norm": 0.16432876884937286, + "learning_rate": 1.8818102412729537e-05, + "loss": 1.1996, + "step": 5041 + }, + { + "epoch": 1.8771625693709206, + "grad_norm": 0.16060929000377655, + "learning_rate": 1.8817529199106858e-05, + "loss": 1.1812, + "step": 5042 + }, + { + "epoch": 1.8775348745215297, + "grad_norm": 0.15831544995307922, + "learning_rate": 1.8816955855249827e-05, + "loss": 1.2125, + "step": 5043 + }, + { + "epoch": 1.877907179672139, + "grad_norm": 0.1665830910205841, + "learning_rate": 1.8816382381166912e-05, + "loss": 1.1976, + "step": 5044 + }, + { + "epoch": 1.8782794848227478, + "grad_norm": 0.16302713751792908, + "learning_rate": 1.8815808776866583e-05, + "loss": 1.2007, + "step": 5045 + }, + { + "epoch": 1.878651789973357, + "grad_norm": 0.16040857136249542, + "learning_rate": 1.881523504235731e-05, + "loss": 1.2007, + "step": 5046 + }, + { + "epoch": 1.8790240951239658, + "grad_norm": 0.16229379177093506, + "learning_rate": 1.8814661177647567e-05, + "loss": 1.2095, + "step": 5047 + }, + { + "epoch": 1.879396400274575, + "grad_norm": 0.15823791921138763, + "learning_rate": 1.8814087182745835e-05, + "loss": 1.2027, + "step": 5048 + }, + { + "epoch": 1.8797687054251841, + "grad_norm": 0.16429072618484497, + "learning_rate": 1.8813513057660586e-05, + "loss": 1.1974, + "step": 5049 + }, + { + "epoch": 1.8801410105757932, + "grad_norm": 0.16185057163238525, + "learning_rate": 1.8812938802400303e-05, + "loss": 1.2046, + "step": 5050 + }, + { + "epoch": 1.8805133157264022, + "grad_norm": 0.16304458677768707, + "learning_rate": 1.8812364416973467e-05, + "loss": 1.1941, + "step": 5051 + }, + { + "epoch": 1.8808856208770113, + "grad_norm": 0.1603989452123642, + "learning_rate": 1.881178990138856e-05, + "loss": 1.2074, + "step": 5052 + }, + { + "epoch": 1.8812579260276205, + "grad_norm": 0.16219423711299896, + "learning_rate": 1.8811215255654074e-05, + "loss": 1.1982, + "step": 5053 + }, + { + "epoch": 1.8816302311782294, + "grad_norm": 0.16032250225543976, + "learning_rate": 1.8810640479778488e-05, + "loss": 1.1879, + "step": 5054 + }, + { + "epoch": 1.8820025363288386, + "grad_norm": 0.16628342866897583, + "learning_rate": 1.88100655737703e-05, + "loss": 1.201, + "step": 5055 + }, + { + "epoch": 1.8823748414794474, + "grad_norm": 0.15982067584991455, + "learning_rate": 1.8809490537637988e-05, + "loss": 1.1874, + "step": 5056 + }, + { + "epoch": 1.8827471466300567, + "grad_norm": 0.16088199615478516, + "learning_rate": 1.880891537139006e-05, + "loss": 1.1981, + "step": 5057 + }, + { + "epoch": 1.8831194517806658, + "grad_norm": 0.16720576584339142, + "learning_rate": 1.8808340075035e-05, + "loss": 1.1988, + "step": 5058 + }, + { + "epoch": 1.8834917569312748, + "grad_norm": 0.1663171947002411, + "learning_rate": 1.880776464858131e-05, + "loss": 1.2205, + "step": 5059 + }, + { + "epoch": 1.8838640620818838, + "grad_norm": 0.15910829603672028, + "learning_rate": 1.8807189092037494e-05, + "loss": 1.198, + "step": 5060 + }, + { + "epoch": 1.8842363672324929, + "grad_norm": 0.17297664284706116, + "learning_rate": 1.8806613405412045e-05, + "loss": 1.1931, + "step": 5061 + }, + { + "epoch": 1.8846086723831021, + "grad_norm": 0.18729183077812195, + "learning_rate": 1.880603758871347e-05, + "loss": 1.2086, + "step": 5062 + }, + { + "epoch": 1.884980977533711, + "grad_norm": 0.17871494591236115, + "learning_rate": 1.880546164195027e-05, + "loss": 1.202, + "step": 5063 + }, + { + "epoch": 1.8853532826843202, + "grad_norm": 0.16181787848472595, + "learning_rate": 1.8804885565130956e-05, + "loss": 1.1903, + "step": 5064 + }, + { + "epoch": 1.885725587834929, + "grad_norm": 0.16055096685886383, + "learning_rate": 1.8804309358264034e-05, + "loss": 1.1956, + "step": 5065 + }, + { + "epoch": 1.8860978929855383, + "grad_norm": 0.16259528696537018, + "learning_rate": 1.8803733021358015e-05, + "loss": 1.1927, + "step": 5066 + }, + { + "epoch": 1.8864701981361474, + "grad_norm": 0.16635549068450928, + "learning_rate": 1.880315655442141e-05, + "loss": 1.1886, + "step": 5067 + }, + { + "epoch": 1.8868425032867564, + "grad_norm": 0.1678260862827301, + "learning_rate": 1.8802579957462738e-05, + "loss": 1.216, + "step": 5068 + }, + { + "epoch": 1.8872148084373654, + "grad_norm": 0.1632586270570755, + "learning_rate": 1.880200323049051e-05, + "loss": 1.1899, + "step": 5069 + }, + { + "epoch": 1.8875871135879745, + "grad_norm": 0.17265328764915466, + "learning_rate": 1.880142637351325e-05, + "loss": 1.1961, + "step": 5070 + }, + { + "epoch": 1.8879594187385838, + "grad_norm": 0.19594860076904297, + "learning_rate": 1.8800849386539476e-05, + "loss": 1.2123, + "step": 5071 + }, + { + "epoch": 1.8883317238891926, + "grad_norm": 0.1857958883047104, + "learning_rate": 1.8800272269577706e-05, + "loss": 1.1959, + "step": 5072 + }, + { + "epoch": 1.8887040290398018, + "grad_norm": 0.17190240323543549, + "learning_rate": 1.8799695022636466e-05, + "loss": 1.1803, + "step": 5073 + }, + { + "epoch": 1.8890763341904107, + "grad_norm": 0.2080855667591095, + "learning_rate": 1.8799117645724282e-05, + "loss": 1.1875, + "step": 5074 + }, + { + "epoch": 1.88944863934102, + "grad_norm": 0.18487951159477234, + "learning_rate": 1.8798540138849685e-05, + "loss": 1.2109, + "step": 5075 + }, + { + "epoch": 1.889820944491629, + "grad_norm": 0.17496329545974731, + "learning_rate": 1.8797962502021203e-05, + "loss": 1.196, + "step": 5076 + }, + { + "epoch": 1.890193249642238, + "grad_norm": 0.16391214728355408, + "learning_rate": 1.8797384735247367e-05, + "loss": 1.2139, + "step": 5077 + }, + { + "epoch": 1.890565554792847, + "grad_norm": 0.1845993846654892, + "learning_rate": 1.8796806838536708e-05, + "loss": 1.208, + "step": 5078 + }, + { + "epoch": 1.890937859943456, + "grad_norm": 0.17227643728256226, + "learning_rate": 1.8796228811897764e-05, + "loss": 1.1862, + "step": 5079 + }, + { + "epoch": 1.8913101650940654, + "grad_norm": 0.16078023612499237, + "learning_rate": 1.8795650655339075e-05, + "loss": 1.1944, + "step": 5080 + }, + { + "epoch": 1.8916824702446742, + "grad_norm": 0.1699201762676239, + "learning_rate": 1.8795072368869176e-05, + "loss": 1.2018, + "step": 5081 + }, + { + "epoch": 1.8920547753952834, + "grad_norm": 0.17085964977741241, + "learning_rate": 1.879449395249661e-05, + "loss": 1.1928, + "step": 5082 + }, + { + "epoch": 1.8924270805458925, + "grad_norm": 0.19768460094928741, + "learning_rate": 1.8793915406229924e-05, + "loss": 1.2065, + "step": 5083 + }, + { + "epoch": 1.8927993856965015, + "grad_norm": 0.1722254604101181, + "learning_rate": 1.8793336730077657e-05, + "loss": 1.1908, + "step": 5084 + }, + { + "epoch": 1.8931716908471106, + "grad_norm": 0.17522525787353516, + "learning_rate": 1.8792757924048354e-05, + "loss": 1.1937, + "step": 5085 + }, + { + "epoch": 1.8935439959977196, + "grad_norm": 0.17701640725135803, + "learning_rate": 1.8792178988150574e-05, + "loss": 1.194, + "step": 5086 + }, + { + "epoch": 1.8939163011483287, + "grad_norm": 0.17206698656082153, + "learning_rate": 1.879159992239286e-05, + "loss": 1.2049, + "step": 5087 + }, + { + "epoch": 1.8942886062989377, + "grad_norm": 0.1732310801744461, + "learning_rate": 1.8791020726783767e-05, + "loss": 1.2098, + "step": 5088 + }, + { + "epoch": 1.894660911449547, + "grad_norm": 0.1696704626083374, + "learning_rate": 1.8790441401331848e-05, + "loss": 1.1892, + "step": 5089 + }, + { + "epoch": 1.8950332166001558, + "grad_norm": 0.17553074657917023, + "learning_rate": 1.8789861946045662e-05, + "loss": 1.1939, + "step": 5090 + }, + { + "epoch": 1.895405521750765, + "grad_norm": 0.16884693503379822, + "learning_rate": 1.8789282360933767e-05, + "loss": 1.1892, + "step": 5091 + }, + { + "epoch": 1.895777826901374, + "grad_norm": 0.17038117349147797, + "learning_rate": 1.8788702646004725e-05, + "loss": 1.2057, + "step": 5092 + }, + { + "epoch": 1.8961501320519831, + "grad_norm": 0.16676990687847137, + "learning_rate": 1.8788122801267094e-05, + "loss": 1.212, + "step": 5093 + }, + { + "epoch": 1.8965224372025922, + "grad_norm": 0.1800990104675293, + "learning_rate": 1.878754282672944e-05, + "loss": 1.2153, + "step": 5094 + }, + { + "epoch": 1.8968947423532012, + "grad_norm": 0.20043106377124786, + "learning_rate": 1.8786962722400334e-05, + "loss": 1.1969, + "step": 5095 + }, + { + "epoch": 1.8972670475038103, + "grad_norm": 0.1726304590702057, + "learning_rate": 1.878638248828834e-05, + "loss": 1.2091, + "step": 5096 + }, + { + "epoch": 1.8976393526544193, + "grad_norm": 0.17626087367534637, + "learning_rate": 1.8785802124402022e-05, + "loss": 1.1895, + "step": 5097 + }, + { + "epoch": 1.8980116578050286, + "grad_norm": 0.19044530391693115, + "learning_rate": 1.8785221630749964e-05, + "loss": 1.2022, + "step": 5098 + }, + { + "epoch": 1.8983839629556374, + "grad_norm": 0.17916062474250793, + "learning_rate": 1.878464100734073e-05, + "loss": 1.2064, + "step": 5099 + }, + { + "epoch": 1.8987562681062466, + "grad_norm": 0.17990046739578247, + "learning_rate": 1.8784060254182904e-05, + "loss": 1.1868, + "step": 5100 + }, + { + "epoch": 1.8991285732568557, + "grad_norm": 0.1693127304315567, + "learning_rate": 1.8783479371285054e-05, + "loss": 1.2112, + "step": 5101 + }, + { + "epoch": 1.8995008784074647, + "grad_norm": 0.1721697747707367, + "learning_rate": 1.8782898358655767e-05, + "loss": 1.1971, + "step": 5102 + }, + { + "epoch": 1.8998731835580738, + "grad_norm": 0.1762087643146515, + "learning_rate": 1.8782317216303624e-05, + "loss": 1.1838, + "step": 5103 + }, + { + "epoch": 1.9002454887086828, + "grad_norm": 0.2079368680715561, + "learning_rate": 1.8781735944237204e-05, + "loss": 1.1848, + "step": 5104 + }, + { + "epoch": 1.900617793859292, + "grad_norm": 0.18409818410873413, + "learning_rate": 1.87811545424651e-05, + "loss": 1.1941, + "step": 5105 + }, + { + "epoch": 1.900990099009901, + "grad_norm": 0.1844477504491806, + "learning_rate": 1.878057301099589e-05, + "loss": 1.1866, + "step": 5106 + }, + { + "epoch": 1.9013624041605102, + "grad_norm": 0.19511154294013977, + "learning_rate": 1.877999134983817e-05, + "loss": 1.2066, + "step": 5107 + }, + { + "epoch": 1.901734709311119, + "grad_norm": 0.16205884516239166, + "learning_rate": 1.877940955900053e-05, + "loss": 1.1894, + "step": 5108 + }, + { + "epoch": 1.9021070144617283, + "grad_norm": 0.1605387181043625, + "learning_rate": 1.8778827638491556e-05, + "loss": 1.2014, + "step": 5109 + }, + { + "epoch": 1.9024793196123373, + "grad_norm": 0.15819790959358215, + "learning_rate": 1.877824558831985e-05, + "loss": 1.2027, + "step": 5110 + }, + { + "epoch": 1.9028516247629463, + "grad_norm": 0.1695229858160019, + "learning_rate": 1.8777663408494013e-05, + "loss": 1.181, + "step": 5111 + }, + { + "epoch": 1.9032239299135554, + "grad_norm": 0.18196141719818115, + "learning_rate": 1.8777081099022636e-05, + "loss": 1.1962, + "step": 5112 + }, + { + "epoch": 1.9035962350641644, + "grad_norm": 0.18630284070968628, + "learning_rate": 1.877649865991432e-05, + "loss": 1.2019, + "step": 5113 + }, + { + "epoch": 1.9039685402147737, + "grad_norm": 0.16647082567214966, + "learning_rate": 1.8775916091177674e-05, + "loss": 1.1968, + "step": 5114 + }, + { + "epoch": 1.9043408453653825, + "grad_norm": 0.20907706022262573, + "learning_rate": 1.8775333392821294e-05, + "loss": 1.1929, + "step": 5115 + }, + { + "epoch": 1.9047131505159918, + "grad_norm": 0.19712580740451813, + "learning_rate": 1.877475056485379e-05, + "loss": 1.2029, + "step": 5116 + }, + { + "epoch": 1.9050854556666006, + "grad_norm": 0.17977239191532135, + "learning_rate": 1.8774167607283772e-05, + "loss": 1.198, + "step": 5117 + }, + { + "epoch": 1.9054577608172099, + "grad_norm": 0.1739930659532547, + "learning_rate": 1.8773584520119848e-05, + "loss": 1.1902, + "step": 5118 + }, + { + "epoch": 1.905830065967819, + "grad_norm": 0.21335825324058533, + "learning_rate": 1.8773001303370634e-05, + "loss": 1.1862, + "step": 5119 + }, + { + "epoch": 1.906202371118428, + "grad_norm": 0.15986791253089905, + "learning_rate": 1.8772417957044743e-05, + "loss": 1.2015, + "step": 5120 + }, + { + "epoch": 1.906574676269037, + "grad_norm": 0.1675948053598404, + "learning_rate": 1.8771834481150782e-05, + "loss": 1.1985, + "step": 5121 + }, + { + "epoch": 1.906946981419646, + "grad_norm": 0.16435672342777252, + "learning_rate": 1.8771250875697383e-05, + "loss": 1.2114, + "step": 5122 + }, + { + "epoch": 1.9073192865702553, + "grad_norm": 0.16337592899799347, + "learning_rate": 1.8770667140693155e-05, + "loss": 1.2001, + "step": 5123 + }, + { + "epoch": 1.907691591720864, + "grad_norm": 0.1674809753894806, + "learning_rate": 1.8770083276146726e-05, + "loss": 1.1977, + "step": 5124 + }, + { + "epoch": 1.9080638968714734, + "grad_norm": 0.17421855032444, + "learning_rate": 1.8769499282066716e-05, + "loss": 1.1829, + "step": 5125 + }, + { + "epoch": 1.9084362020220822, + "grad_norm": 0.16936717927455902, + "learning_rate": 1.8768915158461755e-05, + "loss": 1.2072, + "step": 5126 + }, + { + "epoch": 1.9088085071726915, + "grad_norm": 0.16133807599544525, + "learning_rate": 1.8768330905340462e-05, + "loss": 1.2025, + "step": 5127 + }, + { + "epoch": 1.9091808123233005, + "grad_norm": 0.1636780947446823, + "learning_rate": 1.8767746522711478e-05, + "loss": 1.2099, + "step": 5128 + }, + { + "epoch": 1.9095531174739095, + "grad_norm": 0.17051872611045837, + "learning_rate": 1.876716201058342e-05, + "loss": 1.1913, + "step": 5129 + }, + { + "epoch": 1.9099254226245186, + "grad_norm": 0.16685360670089722, + "learning_rate": 1.8766577368964937e-05, + "loss": 1.1992, + "step": 5130 + }, + { + "epoch": 1.9102977277751276, + "grad_norm": 0.1661144345998764, + "learning_rate": 1.8765992597864654e-05, + "loss": 1.1953, + "step": 5131 + }, + { + "epoch": 1.910670032925737, + "grad_norm": 0.16702596843242645, + "learning_rate": 1.8765407697291208e-05, + "loss": 1.2023, + "step": 5132 + }, + { + "epoch": 1.9110423380763457, + "grad_norm": 0.16298018395900726, + "learning_rate": 1.8764822667253244e-05, + "loss": 1.203, + "step": 5133 + }, + { + "epoch": 1.911414643226955, + "grad_norm": 0.16323602199554443, + "learning_rate": 1.87642375077594e-05, + "loss": 1.1987, + "step": 5134 + }, + { + "epoch": 1.9117869483775638, + "grad_norm": 0.1665439009666443, + "learning_rate": 1.8763652218818314e-05, + "loss": 1.1812, + "step": 5135 + }, + { + "epoch": 1.912159253528173, + "grad_norm": 0.16453301906585693, + "learning_rate": 1.8763066800438638e-05, + "loss": 1.2071, + "step": 5136 + }, + { + "epoch": 1.912531558678782, + "grad_norm": 0.1619269996881485, + "learning_rate": 1.8762481252629013e-05, + "loss": 1.2125, + "step": 5137 + }, + { + "epoch": 1.9129038638293911, + "grad_norm": 0.16248470544815063, + "learning_rate": 1.8761895575398094e-05, + "loss": 1.209, + "step": 5138 + }, + { + "epoch": 1.9132761689800002, + "grad_norm": 0.16813617944717407, + "learning_rate": 1.876130976875452e-05, + "loss": 1.1875, + "step": 5139 + }, + { + "epoch": 1.9136484741306092, + "grad_norm": 0.16750268638134003, + "learning_rate": 1.8760723832706955e-05, + "loss": 1.1921, + "step": 5140 + }, + { + "epoch": 1.9140207792812185, + "grad_norm": 0.16163413226604462, + "learning_rate": 1.8760137767264048e-05, + "loss": 1.18, + "step": 5141 + }, + { + "epoch": 1.9143930844318273, + "grad_norm": 0.16053326427936554, + "learning_rate": 1.875955157243446e-05, + "loss": 1.1977, + "step": 5142 + }, + { + "epoch": 1.9147653895824366, + "grad_norm": 0.16916437447071075, + "learning_rate": 1.8758965248226836e-05, + "loss": 1.2144, + "step": 5143 + }, + { + "epoch": 1.9151376947330456, + "grad_norm": 0.1655954271554947, + "learning_rate": 1.875837879464985e-05, + "loss": 1.2031, + "step": 5144 + }, + { + "epoch": 1.9155099998836547, + "grad_norm": 0.16790400445461273, + "learning_rate": 1.875779221171216e-05, + "loss": 1.1969, + "step": 5145 + }, + { + "epoch": 1.9158823050342637, + "grad_norm": 0.16594092547893524, + "learning_rate": 1.8757205499422428e-05, + "loss": 1.191, + "step": 5146 + }, + { + "epoch": 1.9162546101848728, + "grad_norm": 0.16297580301761627, + "learning_rate": 1.8756618657789322e-05, + "loss": 1.2073, + "step": 5147 + }, + { + "epoch": 1.9166269153354818, + "grad_norm": 0.16412940621376038, + "learning_rate": 1.8756031686821506e-05, + "loss": 1.1907, + "step": 5148 + }, + { + "epoch": 1.9169992204860908, + "grad_norm": 0.16388413310050964, + "learning_rate": 1.875544458652765e-05, + "loss": 1.182, + "step": 5149 + }, + { + "epoch": 1.9173715256367, + "grad_norm": 0.16367630660533905, + "learning_rate": 1.875485735691643e-05, + "loss": 1.2072, + "step": 5150 + }, + { + "epoch": 1.917743830787309, + "grad_norm": 0.16208967566490173, + "learning_rate": 1.8754269997996512e-05, + "loss": 1.1871, + "step": 5151 + }, + { + "epoch": 1.9181161359379182, + "grad_norm": 0.1770969182252884, + "learning_rate": 1.875368250977658e-05, + "loss": 1.2146, + "step": 5152 + }, + { + "epoch": 1.9184884410885272, + "grad_norm": 0.17094440758228302, + "learning_rate": 1.8753094892265308e-05, + "loss": 1.1936, + "step": 5153 + }, + { + "epoch": 1.9188607462391363, + "grad_norm": 0.16425853967666626, + "learning_rate": 1.875250714547137e-05, + "loss": 1.1972, + "step": 5154 + }, + { + "epoch": 1.9192330513897453, + "grad_norm": 0.16301047801971436, + "learning_rate": 1.875191926940345e-05, + "loss": 1.1832, + "step": 5155 + }, + { + "epoch": 1.9196053565403544, + "grad_norm": 0.16729892790317535, + "learning_rate": 1.8751331264070232e-05, + "loss": 1.2042, + "step": 5156 + }, + { + "epoch": 1.9199776616909636, + "grad_norm": 0.16945315897464752, + "learning_rate": 1.87507431294804e-05, + "loss": 1.1815, + "step": 5157 + }, + { + "epoch": 1.9203499668415724, + "grad_norm": 0.15983742475509644, + "learning_rate": 1.8750154865642644e-05, + "loss": 1.1798, + "step": 5158 + }, + { + "epoch": 1.9207222719921817, + "grad_norm": 0.1647757738828659, + "learning_rate": 1.874956647256565e-05, + "loss": 1.2057, + "step": 5159 + }, + { + "epoch": 1.9210945771427905, + "grad_norm": 0.16134053468704224, + "learning_rate": 1.8748977950258105e-05, + "loss": 1.2005, + "step": 5160 + }, + { + "epoch": 1.9214668822933998, + "grad_norm": 0.16527298092842102, + "learning_rate": 1.8748389298728708e-05, + "loss": 1.2001, + "step": 5161 + }, + { + "epoch": 1.9218391874440088, + "grad_norm": 0.15893900394439697, + "learning_rate": 1.8747800517986147e-05, + "loss": 1.2154, + "step": 5162 + }, + { + "epoch": 1.9222114925946179, + "grad_norm": 0.16622401773929596, + "learning_rate": 1.8747211608039124e-05, + "loss": 1.1916, + "step": 5163 + }, + { + "epoch": 1.922583797745227, + "grad_norm": 0.16279946267604828, + "learning_rate": 1.8746622568896334e-05, + "loss": 1.1944, + "step": 5164 + }, + { + "epoch": 1.922956102895836, + "grad_norm": 0.16495847702026367, + "learning_rate": 1.874603340056648e-05, + "loss": 1.2027, + "step": 5165 + }, + { + "epoch": 1.9233284080464452, + "grad_norm": 0.156333789229393, + "learning_rate": 1.8745444103058257e-05, + "loss": 1.1909, + "step": 5166 + }, + { + "epoch": 1.923700713197054, + "grad_norm": 0.16494396328926086, + "learning_rate": 1.8744854676380374e-05, + "loss": 1.193, + "step": 5167 + }, + { + "epoch": 1.9240730183476633, + "grad_norm": 0.1679079830646515, + "learning_rate": 1.8744265120541537e-05, + "loss": 1.2151, + "step": 5168 + }, + { + "epoch": 1.9244453234982721, + "grad_norm": 0.16192030906677246, + "learning_rate": 1.8743675435550453e-05, + "loss": 1.2061, + "step": 5169 + }, + { + "epoch": 1.9248176286488814, + "grad_norm": 0.16086839139461517, + "learning_rate": 1.8743085621415834e-05, + "loss": 1.1853, + "step": 5170 + }, + { + "epoch": 1.9251899337994904, + "grad_norm": 0.16591200232505798, + "learning_rate": 1.8742495678146384e-05, + "loss": 1.1983, + "step": 5171 + }, + { + "epoch": 1.9255622389500995, + "grad_norm": 0.16515636444091797, + "learning_rate": 1.8741905605750826e-05, + "loss": 1.1886, + "step": 5172 + }, + { + "epoch": 1.9259345441007085, + "grad_norm": 0.16431131958961487, + "learning_rate": 1.874131540423787e-05, + "loss": 1.2073, + "step": 5173 + }, + { + "epoch": 1.9263068492513176, + "grad_norm": 0.16539840400218964, + "learning_rate": 1.874072507361623e-05, + "loss": 1.1931, + "step": 5174 + }, + { + "epoch": 1.9266791544019268, + "grad_norm": 0.1654292792081833, + "learning_rate": 1.8740134613894633e-05, + "loss": 1.2022, + "step": 5175 + }, + { + "epoch": 1.9270514595525357, + "grad_norm": 0.16364841163158417, + "learning_rate": 1.8739544025081797e-05, + "loss": 1.1803, + "step": 5176 + }, + { + "epoch": 1.927423764703145, + "grad_norm": 0.16501003503799438, + "learning_rate": 1.8738953307186442e-05, + "loss": 1.1918, + "step": 5177 + }, + { + "epoch": 1.9277960698537537, + "grad_norm": 0.15340885519981384, + "learning_rate": 1.8738362460217296e-05, + "loss": 1.1955, + "step": 5178 + }, + { + "epoch": 1.928168375004363, + "grad_norm": 0.16795547306537628, + "learning_rate": 1.8737771484183084e-05, + "loss": 1.2006, + "step": 5179 + }, + { + "epoch": 1.928540680154972, + "grad_norm": 0.16092796623706818, + "learning_rate": 1.8737180379092536e-05, + "loss": 1.192, + "step": 5180 + }, + { + "epoch": 1.928912985305581, + "grad_norm": 0.1616433709859848, + "learning_rate": 1.8736589144954384e-05, + "loss": 1.1703, + "step": 5181 + }, + { + "epoch": 1.9292852904561901, + "grad_norm": 0.16441045701503754, + "learning_rate": 1.8735997781777356e-05, + "loss": 1.1818, + "step": 5182 + }, + { + "epoch": 1.9296575956067992, + "grad_norm": 0.16628822684288025, + "learning_rate": 1.8735406289570193e-05, + "loss": 1.1943, + "step": 5183 + }, + { + "epoch": 1.9300299007574084, + "grad_norm": 0.15827049314975739, + "learning_rate": 1.873481466834162e-05, + "loss": 1.1896, + "step": 5184 + }, + { + "epoch": 1.9304022059080173, + "grad_norm": 0.16958533227443695, + "learning_rate": 1.873422291810039e-05, + "loss": 1.1987, + "step": 5185 + }, + { + "epoch": 1.9307745110586265, + "grad_norm": 0.1640082448720932, + "learning_rate": 1.8733631038855232e-05, + "loss": 1.2068, + "step": 5186 + }, + { + "epoch": 1.9311468162092353, + "grad_norm": 0.16046150028705597, + "learning_rate": 1.8733039030614893e-05, + "loss": 1.1939, + "step": 5187 + }, + { + "epoch": 1.9315191213598446, + "grad_norm": 0.15839648246765137, + "learning_rate": 1.8732446893388116e-05, + "loss": 1.1919, + "step": 5188 + }, + { + "epoch": 1.9318914265104536, + "grad_norm": 0.15910884737968445, + "learning_rate": 1.8731854627183642e-05, + "loss": 1.2, + "step": 5189 + }, + { + "epoch": 1.9322637316610627, + "grad_norm": 0.17085225880146027, + "learning_rate": 1.8731262232010226e-05, + "loss": 1.1909, + "step": 5190 + }, + { + "epoch": 1.9326360368116717, + "grad_norm": 0.16858842968940735, + "learning_rate": 1.8730669707876617e-05, + "loss": 1.2053, + "step": 5191 + }, + { + "epoch": 1.9330083419622808, + "grad_norm": 0.16198118031024933, + "learning_rate": 1.8730077054791562e-05, + "loss": 1.1851, + "step": 5192 + }, + { + "epoch": 1.93338064711289, + "grad_norm": 0.16233724355697632, + "learning_rate": 1.8729484272763817e-05, + "loss": 1.1811, + "step": 5193 + }, + { + "epoch": 1.9337529522634989, + "grad_norm": 0.1625370979309082, + "learning_rate": 1.8728891361802136e-05, + "loss": 1.1946, + "step": 5194 + }, + { + "epoch": 1.9341252574141081, + "grad_norm": 0.16381089389324188, + "learning_rate": 1.872829832191528e-05, + "loss": 1.1884, + "step": 5195 + }, + { + "epoch": 1.934497562564717, + "grad_norm": 0.15602105855941772, + "learning_rate": 1.8727705153112007e-05, + "loss": 1.192, + "step": 5196 + }, + { + "epoch": 1.9348698677153262, + "grad_norm": 0.1663997620344162, + "learning_rate": 1.8727111855401073e-05, + "loss": 1.2075, + "step": 5197 + }, + { + "epoch": 1.9352421728659353, + "grad_norm": 0.16173984110355377, + "learning_rate": 1.8726518428791246e-05, + "loss": 1.1854, + "step": 5198 + }, + { + "epoch": 1.9356144780165443, + "grad_norm": 0.16155578196048737, + "learning_rate": 1.872592487329129e-05, + "loss": 1.1947, + "step": 5199 + }, + { + "epoch": 1.9359867831671533, + "grad_norm": 0.1606961041688919, + "learning_rate": 1.872533118890997e-05, + "loss": 1.1849, + "step": 5200 + }, + { + "epoch": 1.9363590883177624, + "grad_norm": 0.15817643702030182, + "learning_rate": 1.8724737375656054e-05, + "loss": 1.1949, + "step": 5201 + }, + { + "epoch": 1.9367313934683716, + "grad_norm": 0.16562969982624054, + "learning_rate": 1.8724143433538317e-05, + "loss": 1.1889, + "step": 5202 + }, + { + "epoch": 1.9371036986189805, + "grad_norm": 0.1643010526895523, + "learning_rate": 1.872354936256553e-05, + "loss": 1.1853, + "step": 5203 + }, + { + "epoch": 1.9374760037695897, + "grad_norm": 0.16269844770431519, + "learning_rate": 1.8722955162746465e-05, + "loss": 1.207, + "step": 5204 + }, + { + "epoch": 1.9378483089201988, + "grad_norm": 0.15886783599853516, + "learning_rate": 1.87223608340899e-05, + "loss": 1.1989, + "step": 5205 + }, + { + "epoch": 1.9382206140708078, + "grad_norm": 0.1662435382604599, + "learning_rate": 1.8721766376604612e-05, + "loss": 1.1928, + "step": 5206 + }, + { + "epoch": 1.9385929192214169, + "grad_norm": 0.16843284666538239, + "learning_rate": 1.8721171790299385e-05, + "loss": 1.1914, + "step": 5207 + }, + { + "epoch": 1.938965224372026, + "grad_norm": 0.16134469211101532, + "learning_rate": 1.8720577075182992e-05, + "loss": 1.1829, + "step": 5208 + }, + { + "epoch": 1.939337529522635, + "grad_norm": 0.16114822030067444, + "learning_rate": 1.8719982231264228e-05, + "loss": 1.2182, + "step": 5209 + }, + { + "epoch": 1.939709834673244, + "grad_norm": 0.15821397304534912, + "learning_rate": 1.8719387258551874e-05, + "loss": 1.1835, + "step": 5210 + }, + { + "epoch": 1.9400821398238532, + "grad_norm": 0.16827209293842316, + "learning_rate": 1.8718792157054714e-05, + "loss": 1.1927, + "step": 5211 + }, + { + "epoch": 1.940454444974462, + "grad_norm": 0.16307982802391052, + "learning_rate": 1.871819692678154e-05, + "loss": 1.1895, + "step": 5212 + }, + { + "epoch": 1.9408267501250713, + "grad_norm": 0.16273893415927887, + "learning_rate": 1.8717601567741147e-05, + "loss": 1.2002, + "step": 5213 + }, + { + "epoch": 1.9411990552756804, + "grad_norm": 0.16210372745990753, + "learning_rate": 1.871700607994233e-05, + "loss": 1.2093, + "step": 5214 + }, + { + "epoch": 1.9415713604262894, + "grad_norm": 0.16374051570892334, + "learning_rate": 1.8716410463393873e-05, + "loss": 1.2023, + "step": 5215 + }, + { + "epoch": 1.9419436655768985, + "grad_norm": 0.16489435732364655, + "learning_rate": 1.8715814718104585e-05, + "loss": 1.201, + "step": 5216 + }, + { + "epoch": 1.9423159707275075, + "grad_norm": 0.15813589096069336, + "learning_rate": 1.871521884408326e-05, + "loss": 1.1857, + "step": 5217 + }, + { + "epoch": 1.9426882758781168, + "grad_norm": 0.16261295974254608, + "learning_rate": 1.8714622841338696e-05, + "loss": 1.1985, + "step": 5218 + }, + { + "epoch": 1.9430605810287256, + "grad_norm": 0.15933023393154144, + "learning_rate": 1.8714026709879704e-05, + "loss": 1.2005, + "step": 5219 + }, + { + "epoch": 1.9434328861793349, + "grad_norm": 0.1632496863603592, + "learning_rate": 1.8713430449715086e-05, + "loss": 1.1946, + "step": 5220 + }, + { + "epoch": 1.9438051913299437, + "grad_norm": 0.16615080833435059, + "learning_rate": 1.8712834060853643e-05, + "loss": 1.2015, + "step": 5221 + }, + { + "epoch": 1.944177496480553, + "grad_norm": 0.1673406958580017, + "learning_rate": 1.871223754330419e-05, + "loss": 1.1935, + "step": 5222 + }, + { + "epoch": 1.944549801631162, + "grad_norm": 0.17133355140686035, + "learning_rate": 1.871164089707553e-05, + "loss": 1.198, + "step": 5223 + }, + { + "epoch": 1.944922106781771, + "grad_norm": 0.16196174919605255, + "learning_rate": 1.8711044122176484e-05, + "loss": 1.1964, + "step": 5224 + }, + { + "epoch": 1.94529441193238, + "grad_norm": 0.16071192920207977, + "learning_rate": 1.8710447218615865e-05, + "loss": 1.2009, + "step": 5225 + }, + { + "epoch": 1.945666717082989, + "grad_norm": 0.17128324508666992, + "learning_rate": 1.8709850186402487e-05, + "loss": 1.193, + "step": 5226 + }, + { + "epoch": 1.9460390222335984, + "grad_norm": 0.16837619245052338, + "learning_rate": 1.8709253025545167e-05, + "loss": 1.1982, + "step": 5227 + }, + { + "epoch": 1.9464113273842072, + "grad_norm": 0.16107183694839478, + "learning_rate": 1.8708655736052725e-05, + "loss": 1.1822, + "step": 5228 + }, + { + "epoch": 1.9467836325348165, + "grad_norm": 0.16632641851902008, + "learning_rate": 1.8708058317933986e-05, + "loss": 1.1895, + "step": 5229 + }, + { + "epoch": 1.9471559376854253, + "grad_norm": 0.16798974573612213, + "learning_rate": 1.8707460771197773e-05, + "loss": 1.2121, + "step": 5230 + }, + { + "epoch": 1.9475282428360345, + "grad_norm": 0.16306382417678833, + "learning_rate": 1.870686309585291e-05, + "loss": 1.1866, + "step": 5231 + }, + { + "epoch": 1.9479005479866436, + "grad_norm": 0.1670602411031723, + "learning_rate": 1.8706265291908226e-05, + "loss": 1.1971, + "step": 5232 + }, + { + "epoch": 1.9482728531372526, + "grad_norm": 0.1586066037416458, + "learning_rate": 1.870566735937255e-05, + "loss": 1.1894, + "step": 5233 + }, + { + "epoch": 1.9486451582878617, + "grad_norm": 0.16661204397678375, + "learning_rate": 1.870506929825471e-05, + "loss": 1.2033, + "step": 5234 + }, + { + "epoch": 1.9490174634384707, + "grad_norm": 0.16679902374744415, + "learning_rate": 1.870447110856355e-05, + "loss": 1.2003, + "step": 5235 + }, + { + "epoch": 1.94938976858908, + "grad_norm": 0.1620539128780365, + "learning_rate": 1.8703872790307892e-05, + "loss": 1.1742, + "step": 5236 + }, + { + "epoch": 1.9497620737396888, + "grad_norm": 0.16605715453624725, + "learning_rate": 1.870327434349658e-05, + "loss": 1.1964, + "step": 5237 + }, + { + "epoch": 1.950134378890298, + "grad_norm": 0.1599631905555725, + "learning_rate": 1.8702675768138453e-05, + "loss": 1.1964, + "step": 5238 + }, + { + "epoch": 1.9505066840409069, + "grad_norm": 0.1627977043390274, + "learning_rate": 1.870207706424235e-05, + "loss": 1.1941, + "step": 5239 + }, + { + "epoch": 1.9508789891915161, + "grad_norm": 0.16092990338802338, + "learning_rate": 1.8701478231817116e-05, + "loss": 1.2014, + "step": 5240 + }, + { + "epoch": 1.9512512943421252, + "grad_norm": 0.16339315474033356, + "learning_rate": 1.8700879270871594e-05, + "loss": 1.1798, + "step": 5241 + }, + { + "epoch": 1.9516235994927342, + "grad_norm": 0.16573093831539154, + "learning_rate": 1.8700280181414633e-05, + "loss": 1.1786, + "step": 5242 + }, + { + "epoch": 1.9519959046433433, + "grad_norm": 0.16669802367687225, + "learning_rate": 1.8699680963455076e-05, + "loss": 1.202, + "step": 5243 + }, + { + "epoch": 1.9523682097939523, + "grad_norm": 0.16122578084468842, + "learning_rate": 1.8699081617001784e-05, + "loss": 1.1927, + "step": 5244 + }, + { + "epoch": 1.9527405149445616, + "grad_norm": 0.1680309772491455, + "learning_rate": 1.8698482142063595e-05, + "loss": 1.1934, + "step": 5245 + }, + { + "epoch": 1.9531128200951704, + "grad_norm": 0.16212962567806244, + "learning_rate": 1.8697882538649373e-05, + "loss": 1.1842, + "step": 5246 + }, + { + "epoch": 1.9534851252457797, + "grad_norm": 0.16280242800712585, + "learning_rate": 1.8697282806767974e-05, + "loss": 1.2018, + "step": 5247 + }, + { + "epoch": 1.9538574303963885, + "grad_norm": 0.15737560391426086, + "learning_rate": 1.8696682946428253e-05, + "loss": 1.2001, + "step": 5248 + }, + { + "epoch": 1.9542297355469977, + "grad_norm": 0.1623457968235016, + "learning_rate": 1.8696082957639068e-05, + "loss": 1.1931, + "step": 5249 + }, + { + "epoch": 1.9546020406976068, + "grad_norm": 0.15832823514938354, + "learning_rate": 1.8695482840409287e-05, + "loss": 1.2063, + "step": 5250 + }, + { + "epoch": 1.9549743458482158, + "grad_norm": 0.15807972848415375, + "learning_rate": 1.869488259474777e-05, + "loss": 1.1896, + "step": 5251 + }, + { + "epoch": 1.9553466509988249, + "grad_norm": 0.16206707060337067, + "learning_rate": 1.869428222066338e-05, + "loss": 1.2002, + "step": 5252 + }, + { + "epoch": 1.955718956149434, + "grad_norm": 0.16285400092601776, + "learning_rate": 1.8693681718164987e-05, + "loss": 1.2033, + "step": 5253 + }, + { + "epoch": 1.9560912613000432, + "grad_norm": 0.1658763289451599, + "learning_rate": 1.8693081087261463e-05, + "loss": 1.1955, + "step": 5254 + }, + { + "epoch": 1.956463566450652, + "grad_norm": 0.15834468603134155, + "learning_rate": 1.8692480327961673e-05, + "loss": 1.1961, + "step": 5255 + }, + { + "epoch": 1.9568358716012613, + "grad_norm": 0.15797977149486542, + "learning_rate": 1.8691879440274498e-05, + "loss": 1.1883, + "step": 5256 + }, + { + "epoch": 1.95720817675187, + "grad_norm": 0.16849857568740845, + "learning_rate": 1.869127842420881e-05, + "loss": 1.1909, + "step": 5257 + }, + { + "epoch": 1.9575804819024794, + "grad_norm": 0.16109803318977356, + "learning_rate": 1.8690677279773482e-05, + "loss": 1.1815, + "step": 5258 + }, + { + "epoch": 1.9579527870530884, + "grad_norm": 0.15917837619781494, + "learning_rate": 1.86900760069774e-05, + "loss": 1.2017, + "step": 5259 + }, + { + "epoch": 1.9583250922036974, + "grad_norm": 0.1662922352552414, + "learning_rate": 1.8689474605829436e-05, + "loss": 1.2067, + "step": 5260 + }, + { + "epoch": 1.9586973973543065, + "grad_norm": 0.17103338241577148, + "learning_rate": 1.868887307633848e-05, + "loss": 1.196, + "step": 5261 + }, + { + "epoch": 1.9590697025049155, + "grad_norm": 0.1583465188741684, + "learning_rate": 1.8688271418513415e-05, + "loss": 1.1932, + "step": 5262 + }, + { + "epoch": 1.9594420076555248, + "grad_norm": 0.1627221256494522, + "learning_rate": 1.8687669632363122e-05, + "loss": 1.1863, + "step": 5263 + }, + { + "epoch": 1.9598143128061336, + "grad_norm": 0.1791868507862091, + "learning_rate": 1.86870677178965e-05, + "loss": 1.2089, + "step": 5264 + }, + { + "epoch": 1.9601866179567429, + "grad_norm": 0.17789161205291748, + "learning_rate": 1.8686465675122426e-05, + "loss": 1.1899, + "step": 5265 + }, + { + "epoch": 1.960558923107352, + "grad_norm": 0.16588279604911804, + "learning_rate": 1.8685863504049804e-05, + "loss": 1.1984, + "step": 5266 + }, + { + "epoch": 1.960931228257961, + "grad_norm": 0.16590848565101624, + "learning_rate": 1.8685261204687523e-05, + "loss": 1.2052, + "step": 5267 + }, + { + "epoch": 1.96130353340857, + "grad_norm": 0.22419647872447968, + "learning_rate": 1.8684658777044478e-05, + "loss": 1.1945, + "step": 5268 + }, + { + "epoch": 1.961675838559179, + "grad_norm": 0.20315192639827728, + "learning_rate": 1.8684056221129568e-05, + "loss": 1.2025, + "step": 5269 + }, + { + "epoch": 1.962048143709788, + "grad_norm": 0.17711172997951508, + "learning_rate": 1.868345353695169e-05, + "loss": 1.1777, + "step": 5270 + }, + { + "epoch": 1.9624204488603971, + "grad_norm": 0.16664499044418335, + "learning_rate": 1.8682850724519752e-05, + "loss": 1.1837, + "step": 5271 + }, + { + "epoch": 1.9627927540110064, + "grad_norm": 0.1918790340423584, + "learning_rate": 1.8682247783842654e-05, + "loss": 1.1987, + "step": 5272 + }, + { + "epoch": 1.9631650591616152, + "grad_norm": 0.17362017929553986, + "learning_rate": 1.86816447149293e-05, + "loss": 1.2091, + "step": 5273 + }, + { + "epoch": 1.9635373643122245, + "grad_norm": 0.1695503294467926, + "learning_rate": 1.8681041517788596e-05, + "loss": 1.1945, + "step": 5274 + }, + { + "epoch": 1.9639096694628335, + "grad_norm": 0.17335741221904755, + "learning_rate": 1.8680438192429455e-05, + "loss": 1.185, + "step": 5275 + }, + { + "epoch": 1.9642819746134426, + "grad_norm": 0.17371761798858643, + "learning_rate": 1.867983473886079e-05, + "loss": 1.1801, + "step": 5276 + }, + { + "epoch": 1.9646542797640516, + "grad_norm": 0.1588549017906189, + "learning_rate": 1.8679231157091507e-05, + "loss": 1.1924, + "step": 5277 + }, + { + "epoch": 1.9650265849146606, + "grad_norm": 0.16421490907669067, + "learning_rate": 1.8678627447130524e-05, + "loss": 1.1931, + "step": 5278 + }, + { + "epoch": 1.96539889006527, + "grad_norm": 0.17214372754096985, + "learning_rate": 1.8678023608986758e-05, + "loss": 1.1904, + "step": 5279 + }, + { + "epoch": 1.9657711952158787, + "grad_norm": 0.18433736264705658, + "learning_rate": 1.867741964266913e-05, + "loss": 1.1974, + "step": 5280 + }, + { + "epoch": 1.966143500366488, + "grad_norm": 0.15704011917114258, + "learning_rate": 1.867681554818656e-05, + "loss": 1.1743, + "step": 5281 + }, + { + "epoch": 1.9665158055170968, + "grad_norm": 0.16321733593940735, + "learning_rate": 1.8676211325547965e-05, + "loss": 1.204, + "step": 5282 + }, + { + "epoch": 1.966888110667706, + "grad_norm": 0.17481832206249237, + "learning_rate": 1.8675606974762274e-05, + "loss": 1.1891, + "step": 5283 + }, + { + "epoch": 1.9672604158183151, + "grad_norm": 0.16488084197044373, + "learning_rate": 1.8675002495838415e-05, + "loss": 1.2052, + "step": 5284 + }, + { + "epoch": 1.9676327209689242, + "grad_norm": 0.15766631066799164, + "learning_rate": 1.8674397888785312e-05, + "loss": 1.1902, + "step": 5285 + }, + { + "epoch": 1.9680050261195332, + "grad_norm": 0.17218059301376343, + "learning_rate": 1.8673793153611893e-05, + "loss": 1.1857, + "step": 5286 + }, + { + "epoch": 1.9683773312701422, + "grad_norm": 0.15942028164863586, + "learning_rate": 1.86731882903271e-05, + "loss": 1.1842, + "step": 5287 + }, + { + "epoch": 1.9687496364207515, + "grad_norm": 0.16559404134750366, + "learning_rate": 1.8672583298939857e-05, + "loss": 1.1907, + "step": 5288 + }, + { + "epoch": 1.9691219415713603, + "grad_norm": 0.16306178271770477, + "learning_rate": 1.8671978179459102e-05, + "loss": 1.1881, + "step": 5289 + }, + { + "epoch": 1.9694942467219696, + "grad_norm": 0.16512851417064667, + "learning_rate": 1.8671372931893775e-05, + "loss": 1.2026, + "step": 5290 + }, + { + "epoch": 1.9698665518725784, + "grad_norm": 0.1704392433166504, + "learning_rate": 1.8670767556252812e-05, + "loss": 1.207, + "step": 5291 + }, + { + "epoch": 1.9702388570231877, + "grad_norm": 0.160672128200531, + "learning_rate": 1.867016205254516e-05, + "loss": 1.196, + "step": 5292 + }, + { + "epoch": 1.9706111621737967, + "grad_norm": 0.16421304643154144, + "learning_rate": 1.866955642077976e-05, + "loss": 1.1896, + "step": 5293 + }, + { + "epoch": 1.9709834673244058, + "grad_norm": 0.1669030487537384, + "learning_rate": 1.866895066096555e-05, + "loss": 1.1982, + "step": 5294 + }, + { + "epoch": 1.9713557724750148, + "grad_norm": 0.171576127409935, + "learning_rate": 1.8668344773111483e-05, + "loss": 1.1997, + "step": 5295 + }, + { + "epoch": 1.9717280776256239, + "grad_norm": 0.16728414595127106, + "learning_rate": 1.866773875722651e-05, + "loss": 1.1875, + "step": 5296 + }, + { + "epoch": 1.9721003827762331, + "grad_norm": 0.15906740725040436, + "learning_rate": 1.866713261331958e-05, + "loss": 1.1797, + "step": 5297 + }, + { + "epoch": 1.972472687926842, + "grad_norm": 0.16088546812534332, + "learning_rate": 1.8666526341399644e-05, + "loss": 1.1836, + "step": 5298 + }, + { + "epoch": 1.9728449930774512, + "grad_norm": 0.16941684484481812, + "learning_rate": 1.866591994147566e-05, + "loss": 1.1921, + "step": 5299 + }, + { + "epoch": 1.97321729822806, + "grad_norm": 0.16476041078567505, + "learning_rate": 1.866531341355658e-05, + "loss": 1.1841, + "step": 5300 + }, + { + "epoch": 1.9735896033786693, + "grad_norm": 0.15881778299808502, + "learning_rate": 1.8664706757651365e-05, + "loss": 1.1853, + "step": 5301 + }, + { + "epoch": 1.9739619085292783, + "grad_norm": 0.164139986038208, + "learning_rate": 1.8664099973768975e-05, + "loss": 1.1816, + "step": 5302 + }, + { + "epoch": 1.9743342136798874, + "grad_norm": 0.16207018494606018, + "learning_rate": 1.8663493061918375e-05, + "loss": 1.1838, + "step": 5303 + }, + { + "epoch": 1.9747065188304964, + "grad_norm": 0.17509856820106506, + "learning_rate": 1.8662886022108524e-05, + "loss": 1.2028, + "step": 5304 + }, + { + "epoch": 1.9750788239811055, + "grad_norm": 0.17713604867458344, + "learning_rate": 1.8662278854348387e-05, + "loss": 1.2028, + "step": 5305 + }, + { + "epoch": 1.9754511291317147, + "grad_norm": 0.17039844393730164, + "learning_rate": 1.8661671558646938e-05, + "loss": 1.1883, + "step": 5306 + }, + { + "epoch": 1.9758234342823235, + "grad_norm": 0.17008256912231445, + "learning_rate": 1.8661064135013143e-05, + "loss": 1.1971, + "step": 5307 + }, + { + "epoch": 1.9761957394329328, + "grad_norm": 0.20797152817249298, + "learning_rate": 1.8660456583455974e-05, + "loss": 1.1907, + "step": 5308 + }, + { + "epoch": 1.9765680445835416, + "grad_norm": 0.20216019451618195, + "learning_rate": 1.865984890398441e-05, + "loss": 1.2112, + "step": 5309 + }, + { + "epoch": 1.976940349734151, + "grad_norm": 0.18128228187561035, + "learning_rate": 1.8659241096607416e-05, + "loss": 1.1903, + "step": 5310 + }, + { + "epoch": 1.97731265488476, + "grad_norm": 0.1619497686624527, + "learning_rate": 1.8658633161333974e-05, + "loss": 1.2063, + "step": 5311 + }, + { + "epoch": 1.977684960035369, + "grad_norm": 0.1790294498205185, + "learning_rate": 1.865802509817306e-05, + "loss": 1.197, + "step": 5312 + }, + { + "epoch": 1.978057265185978, + "grad_norm": 0.18129126727581024, + "learning_rate": 1.8657416907133668e-05, + "loss": 1.1738, + "step": 5313 + }, + { + "epoch": 1.978429570336587, + "grad_norm": 0.16873174905776978, + "learning_rate": 1.8656808588224767e-05, + "loss": 1.1904, + "step": 5314 + }, + { + "epoch": 1.9788018754871963, + "grad_norm": 0.1654161512851715, + "learning_rate": 1.8656200141455345e-05, + "loss": 1.1814, + "step": 5315 + }, + { + "epoch": 1.9791741806378051, + "grad_norm": 0.1696605384349823, + "learning_rate": 1.865559156683439e-05, + "loss": 1.1972, + "step": 5316 + }, + { + "epoch": 1.9795464857884144, + "grad_norm": 0.17438465356826782, + "learning_rate": 1.8654982864370893e-05, + "loss": 1.2101, + "step": 5317 + }, + { + "epoch": 1.9799187909390232, + "grad_norm": 0.16162490844726562, + "learning_rate": 1.8654374034073843e-05, + "loss": 1.1928, + "step": 5318 + }, + { + "epoch": 1.9802910960896325, + "grad_norm": 0.16213704645633698, + "learning_rate": 1.865376507595223e-05, + "loss": 1.1729, + "step": 5319 + }, + { + "epoch": 1.9806634012402415, + "grad_norm": 0.19320641458034515, + "learning_rate": 1.865315599001505e-05, + "loss": 1.1971, + "step": 5320 + }, + { + "epoch": 1.9810357063908506, + "grad_norm": 0.16562476754188538, + "learning_rate": 1.8652546776271297e-05, + "loss": 1.1873, + "step": 5321 + }, + { + "epoch": 1.9814080115414596, + "grad_norm": 0.16413205862045288, + "learning_rate": 1.8651937434729975e-05, + "loss": 1.1935, + "step": 5322 + }, + { + "epoch": 1.9817803166920687, + "grad_norm": 0.17290954291820526, + "learning_rate": 1.865132796540008e-05, + "loss": 1.1915, + "step": 5323 + }, + { + "epoch": 1.982152621842678, + "grad_norm": 0.17080329358577728, + "learning_rate": 1.8650718368290613e-05, + "loss": 1.1958, + "step": 5324 + }, + { + "epoch": 1.9825249269932868, + "grad_norm": 0.1644285023212433, + "learning_rate": 1.865010864341058e-05, + "loss": 1.1918, + "step": 5325 + }, + { + "epoch": 1.982897232143896, + "grad_norm": 0.1660122573375702, + "learning_rate": 1.864949879076898e-05, + "loss": 1.1808, + "step": 5326 + }, + { + "epoch": 1.983269537294505, + "grad_norm": 0.16145946085453033, + "learning_rate": 1.864888881037483e-05, + "loss": 1.2083, + "step": 5327 + }, + { + "epoch": 1.983641842445114, + "grad_norm": 0.16719859838485718, + "learning_rate": 1.8648278702237133e-05, + "loss": 1.1856, + "step": 5328 + }, + { + "epoch": 1.9840141475957231, + "grad_norm": 0.16676342487335205, + "learning_rate": 1.8647668466364903e-05, + "loss": 1.2016, + "step": 5329 + }, + { + "epoch": 1.9843864527463322, + "grad_norm": 0.16827064752578735, + "learning_rate": 1.864705810276715e-05, + "loss": 1.1768, + "step": 5330 + }, + { + "epoch": 1.9847587578969412, + "grad_norm": 0.1711677461862564, + "learning_rate": 1.8646447611452895e-05, + "loss": 1.1931, + "step": 5331 + }, + { + "epoch": 1.9851310630475503, + "grad_norm": 0.16668665409088135, + "learning_rate": 1.8645836992431152e-05, + "loss": 1.2098, + "step": 5332 + }, + { + "epoch": 1.9855033681981595, + "grad_norm": 0.1718050241470337, + "learning_rate": 1.8645226245710936e-05, + "loss": 1.2019, + "step": 5333 + }, + { + "epoch": 1.9858756733487684, + "grad_norm": 0.1724565029144287, + "learning_rate": 1.8644615371301275e-05, + "loss": 1.1771, + "step": 5334 + }, + { + "epoch": 1.9862479784993776, + "grad_norm": 0.1688944697380066, + "learning_rate": 1.8644004369211183e-05, + "loss": 1.1902, + "step": 5335 + }, + { + "epoch": 1.9866202836499867, + "grad_norm": 0.17193703353405, + "learning_rate": 1.864339323944969e-05, + "loss": 1.1893, + "step": 5336 + }, + { + "epoch": 1.9869925888005957, + "grad_norm": 0.166243776679039, + "learning_rate": 1.8642781982025827e-05, + "loss": 1.203, + "step": 5337 + }, + { + "epoch": 1.9873648939512047, + "grad_norm": 0.1623961478471756, + "learning_rate": 1.864217059694861e-05, + "loss": 1.1703, + "step": 5338 + }, + { + "epoch": 1.9877371991018138, + "grad_norm": 0.1668027937412262, + "learning_rate": 1.8641559084227078e-05, + "loss": 1.1886, + "step": 5339 + }, + { + "epoch": 1.988109504252423, + "grad_norm": 0.16468320786952972, + "learning_rate": 1.864094744387026e-05, + "loss": 1.2047, + "step": 5340 + }, + { + "epoch": 1.9884818094030319, + "grad_norm": 0.16658997535705566, + "learning_rate": 1.864033567588719e-05, + "loss": 1.1944, + "step": 5341 + }, + { + "epoch": 1.9888541145536411, + "grad_norm": 0.16415011882781982, + "learning_rate": 1.8639723780286903e-05, + "loss": 1.1896, + "step": 5342 + }, + { + "epoch": 1.98922641970425, + "grad_norm": 0.16742385923862457, + "learning_rate": 1.8639111757078444e-05, + "loss": 1.2031, + "step": 5343 + }, + { + "epoch": 1.9895987248548592, + "grad_norm": 0.16341787576675415, + "learning_rate": 1.863849960627084e-05, + "loss": 1.1822, + "step": 5344 + }, + { + "epoch": 1.9899710300054683, + "grad_norm": 0.16611303389072418, + "learning_rate": 1.863788732787314e-05, + "loss": 1.1853, + "step": 5345 + }, + { + "epoch": 1.9903433351560773, + "grad_norm": 0.17013627290725708, + "learning_rate": 1.863727492189439e-05, + "loss": 1.1891, + "step": 5346 + }, + { + "epoch": 1.9907156403066864, + "grad_norm": 0.17224200069904327, + "learning_rate": 1.863666238834363e-05, + "loss": 1.1946, + "step": 5347 + }, + { + "epoch": 1.9910879454572954, + "grad_norm": 0.1723671704530716, + "learning_rate": 1.8636049727229908e-05, + "loss": 1.1886, + "step": 5348 + }, + { + "epoch": 1.9914602506079047, + "grad_norm": 0.1694638729095459, + "learning_rate": 1.8635436938562273e-05, + "loss": 1.185, + "step": 5349 + }, + { + "epoch": 1.9918325557585135, + "grad_norm": 0.1634708046913147, + "learning_rate": 1.8634824022349773e-05, + "loss": 1.201, + "step": 5350 + }, + { + "epoch": 1.9922048609091227, + "grad_norm": 0.16697536408901215, + "learning_rate": 1.863421097860147e-05, + "loss": 1.1841, + "step": 5351 + }, + { + "epoch": 1.9925771660597316, + "grad_norm": 0.16353781521320343, + "learning_rate": 1.863359780732641e-05, + "loss": 1.189, + "step": 5352 + }, + { + "epoch": 1.9929494712103408, + "grad_norm": 0.17699691653251648, + "learning_rate": 1.8632984508533654e-05, + "loss": 1.1828, + "step": 5353 + }, + { + "epoch": 1.9933217763609499, + "grad_norm": 0.19866538047790527, + "learning_rate": 1.8632371082232254e-05, + "loss": 1.2084, + "step": 5354 + }, + { + "epoch": 1.993694081511559, + "grad_norm": 0.16737166047096252, + "learning_rate": 1.863175752843128e-05, + "loss": 1.204, + "step": 5355 + }, + { + "epoch": 1.994066386662168, + "grad_norm": 0.1831938922405243, + "learning_rate": 1.8631143847139785e-05, + "loss": 1.1924, + "step": 5356 + }, + { + "epoch": 1.994438691812777, + "grad_norm": 0.17462186515331268, + "learning_rate": 1.863053003836684e-05, + "loss": 1.2058, + "step": 5357 + }, + { + "epoch": 1.9948109969633863, + "grad_norm": 0.175477534532547, + "learning_rate": 1.8629916102121507e-05, + "loss": 1.1933, + "step": 5358 + }, + { + "epoch": 1.995183302113995, + "grad_norm": 0.22154709696769714, + "learning_rate": 1.8629302038412856e-05, + "loss": 1.1926, + "step": 5359 + }, + { + "epoch": 1.9955556072646043, + "grad_norm": 0.16697679460048676, + "learning_rate": 1.8628687847249955e-05, + "loss": 1.1954, + "step": 5360 + }, + { + "epoch": 1.9959279124152132, + "grad_norm": 0.17017914354801178, + "learning_rate": 1.8628073528641873e-05, + "loss": 1.203, + "step": 5361 + }, + { + "epoch": 1.9963002175658224, + "grad_norm": 0.1623312085866928, + "learning_rate": 1.862745908259769e-05, + "loss": 1.1899, + "step": 5362 + }, + { + "epoch": 1.9966725227164315, + "grad_norm": 0.16061881184577942, + "learning_rate": 1.8626844509126476e-05, + "loss": 1.1995, + "step": 5363 + }, + { + "epoch": 1.9970448278670405, + "grad_norm": 0.18128757178783417, + "learning_rate": 1.862622980823731e-05, + "loss": 1.2044, + "step": 5364 + }, + { + "epoch": 1.9974171330176496, + "grad_norm": 0.15951292216777802, + "learning_rate": 1.8625614979939273e-05, + "loss": 1.1834, + "step": 5365 + }, + { + "epoch": 1.9977894381682586, + "grad_norm": 0.15819518268108368, + "learning_rate": 1.8625000024241445e-05, + "loss": 1.1907, + "step": 5366 + }, + { + "epoch": 1.9981617433188679, + "grad_norm": 0.1662832796573639, + "learning_rate": 1.86243849411529e-05, + "loss": 1.1928, + "step": 5367 + }, + { + "epoch": 1.9985340484694767, + "grad_norm": 0.16362832486629486, + "learning_rate": 1.8623769730682738e-05, + "loss": 1.1923, + "step": 5368 + }, + { + "epoch": 1.998906353620086, + "grad_norm": 0.16274386644363403, + "learning_rate": 1.8623154392840036e-05, + "loss": 1.1892, + "step": 5369 + }, + { + "epoch": 1.9992786587706948, + "grad_norm": 0.2034921646118164, + "learning_rate": 1.8622538927633886e-05, + "loss": 1.1806, + "step": 5370 + }, + { + "epoch": 1.999650963921304, + "grad_norm": 0.1652044951915741, + "learning_rate": 1.8621923335073378e-05, + "loss": 1.2014, + "step": 5371 + }, + { + "epoch": 2.000023269071913, + "grad_norm": 0.16666065156459808, + "learning_rate": 1.86213076151676e-05, + "loss": 1.1979, + "step": 5372 + }, + { + "epoch": 2.000395574222522, + "grad_norm": 0.16674093902111053, + "learning_rate": 1.8620691767925655e-05, + "loss": 1.1733, + "step": 5373 + }, + { + "epoch": 2.0007678793731314, + "grad_norm": 0.16234448552131653, + "learning_rate": 1.862007579335663e-05, + "loss": 1.1845, + "step": 5374 + }, + { + "epoch": 2.00114018452374, + "grad_norm": 0.16867627203464508, + "learning_rate": 1.8619459691469625e-05, + "loss": 1.1914, + "step": 5375 + }, + { + "epoch": 2.0015124896743495, + "grad_norm": 0.17261743545532227, + "learning_rate": 1.8618843462273743e-05, + "loss": 1.1808, + "step": 5376 + }, + { + "epoch": 2.0018847948249583, + "grad_norm": 0.16891492903232574, + "learning_rate": 1.8618227105778086e-05, + "loss": 1.1928, + "step": 5377 + }, + { + "epoch": 2.0022570999755676, + "grad_norm": 0.17338630557060242, + "learning_rate": 1.8617610621991753e-05, + "loss": 1.181, + "step": 5378 + }, + { + "epoch": 2.0026294051261764, + "grad_norm": 0.15824103355407715, + "learning_rate": 1.861699401092385e-05, + "loss": 1.1842, + "step": 5379 + }, + { + "epoch": 2.0030017102767856, + "grad_norm": 0.19989395141601562, + "learning_rate": 1.861637727258349e-05, + "loss": 1.1842, + "step": 5380 + }, + { + "epoch": 2.0033740154273945, + "grad_norm": 0.1691095381975174, + "learning_rate": 1.8615760406979778e-05, + "loss": 1.1976, + "step": 5381 + }, + { + "epoch": 2.0037463205780037, + "grad_norm": 0.17154785990715027, + "learning_rate": 1.8615143414121823e-05, + "loss": 1.1795, + "step": 5382 + }, + { + "epoch": 2.004118625728613, + "grad_norm": 0.16465583443641663, + "learning_rate": 1.8614526294018743e-05, + "loss": 1.1805, + "step": 5383 + }, + { + "epoch": 2.004490930879222, + "grad_norm": 0.16546522080898285, + "learning_rate": 1.8613909046679646e-05, + "loss": 1.1785, + "step": 5384 + }, + { + "epoch": 2.004863236029831, + "grad_norm": 0.1838424950838089, + "learning_rate": 1.861329167211366e-05, + "loss": 1.1865, + "step": 5385 + }, + { + "epoch": 2.00523554118044, + "grad_norm": 0.16737298667430878, + "learning_rate": 1.861267417032989e-05, + "loss": 1.1896, + "step": 5386 + }, + { + "epoch": 2.005607846331049, + "grad_norm": 0.16740891337394714, + "learning_rate": 1.8612056541337466e-05, + "loss": 1.1955, + "step": 5387 + }, + { + "epoch": 2.005980151481658, + "grad_norm": 0.1717260479927063, + "learning_rate": 1.8611438785145508e-05, + "loss": 1.1835, + "step": 5388 + }, + { + "epoch": 2.0063524566322672, + "grad_norm": 0.16396482288837433, + "learning_rate": 1.8610820901763137e-05, + "loss": 1.1755, + "step": 5389 + }, + { + "epoch": 2.0067247617828765, + "grad_norm": 0.16121700406074524, + "learning_rate": 1.8610202891199484e-05, + "loss": 1.1849, + "step": 5390 + }, + { + "epoch": 2.0070970669334853, + "grad_norm": 0.18463963270187378, + "learning_rate": 1.8609584753463676e-05, + "loss": 1.1814, + "step": 5391 + }, + { + "epoch": 2.0074693720840946, + "grad_norm": 0.16385917365550995, + "learning_rate": 1.8608966488564838e-05, + "loss": 1.1919, + "step": 5392 + }, + { + "epoch": 2.0078416772347034, + "grad_norm": 0.1689513474702835, + "learning_rate": 1.8608348096512107e-05, + "loss": 1.2054, + "step": 5393 + }, + { + "epoch": 2.0082139823853127, + "grad_norm": 0.16833508014678955, + "learning_rate": 1.8607729577314615e-05, + "loss": 1.1865, + "step": 5394 + }, + { + "epoch": 2.0085862875359215, + "grad_norm": 0.16249039769172668, + "learning_rate": 1.8607110930981496e-05, + "loss": 1.1882, + "step": 5395 + }, + { + "epoch": 2.0089585926865308, + "grad_norm": 0.15944033861160278, + "learning_rate": 1.860649215752189e-05, + "loss": 1.1781, + "step": 5396 + }, + { + "epoch": 2.0093308978371396, + "grad_norm": 0.16826555132865906, + "learning_rate": 1.8605873256944934e-05, + "loss": 1.1748, + "step": 5397 + }, + { + "epoch": 2.009703202987749, + "grad_norm": 0.1765327900648117, + "learning_rate": 1.860525422925977e-05, + "loss": 1.1894, + "step": 5398 + }, + { + "epoch": 2.010075508138358, + "grad_norm": 0.1757960021495819, + "learning_rate": 1.8604635074475542e-05, + "loss": 1.1952, + "step": 5399 + }, + { + "epoch": 2.010447813288967, + "grad_norm": 0.17114783823490143, + "learning_rate": 1.8604015792601395e-05, + "loss": 1.1784, + "step": 5400 + }, + { + "epoch": 2.010820118439576, + "grad_norm": 0.16381992399692535, + "learning_rate": 1.860339638364647e-05, + "loss": 1.1807, + "step": 5401 + }, + { + "epoch": 2.011192423590185, + "grad_norm": 0.1801488697528839, + "learning_rate": 1.8602776847619926e-05, + "loss": 1.2051, + "step": 5402 + }, + { + "epoch": 2.0115647287407943, + "grad_norm": 0.17057554423809052, + "learning_rate": 1.8602157184530907e-05, + "loss": 1.1815, + "step": 5403 + }, + { + "epoch": 2.011937033891403, + "grad_norm": 0.175872340798378, + "learning_rate": 1.8601537394388565e-05, + "loss": 1.199, + "step": 5404 + }, + { + "epoch": 2.0123093390420124, + "grad_norm": 0.16850432753562927, + "learning_rate": 1.8600917477202055e-05, + "loss": 1.1914, + "step": 5405 + }, + { + "epoch": 2.012681644192621, + "grad_norm": 0.1771266758441925, + "learning_rate": 1.8600297432980533e-05, + "loss": 1.1708, + "step": 5406 + }, + { + "epoch": 2.0130539493432305, + "grad_norm": 0.17538318037986755, + "learning_rate": 1.859967726173316e-05, + "loss": 1.1903, + "step": 5407 + }, + { + "epoch": 2.0134262544938397, + "grad_norm": 0.1645265519618988, + "learning_rate": 1.859905696346909e-05, + "loss": 1.1905, + "step": 5408 + }, + { + "epoch": 2.0137985596444485, + "grad_norm": 0.16865003108978271, + "learning_rate": 1.8598436538197494e-05, + "loss": 1.1704, + "step": 5409 + }, + { + "epoch": 2.014170864795058, + "grad_norm": 0.1677810251712799, + "learning_rate": 1.8597815985927524e-05, + "loss": 1.2029, + "step": 5410 + }, + { + "epoch": 2.0145431699456666, + "grad_norm": 0.1736694723367691, + "learning_rate": 1.8597195306668355e-05, + "loss": 1.1831, + "step": 5411 + }, + { + "epoch": 2.014915475096276, + "grad_norm": 0.17751945555210114, + "learning_rate": 1.859657450042915e-05, + "loss": 1.19, + "step": 5412 + }, + { + "epoch": 2.0152877802468847, + "grad_norm": 0.17150509357452393, + "learning_rate": 1.8595953567219077e-05, + "loss": 1.2011, + "step": 5413 + }, + { + "epoch": 2.015660085397494, + "grad_norm": 0.16916759312152863, + "learning_rate": 1.859533250704731e-05, + "loss": 1.1708, + "step": 5414 + }, + { + "epoch": 2.016032390548103, + "grad_norm": 0.1699335128068924, + "learning_rate": 1.8594711319923026e-05, + "loss": 1.1795, + "step": 5415 + }, + { + "epoch": 2.016404695698712, + "grad_norm": 0.18379724025726318, + "learning_rate": 1.859409000585539e-05, + "loss": 1.1816, + "step": 5416 + }, + { + "epoch": 2.0167770008493213, + "grad_norm": 0.16987095773220062, + "learning_rate": 1.8593468564853587e-05, + "loss": 1.1684, + "step": 5417 + }, + { + "epoch": 2.01714930599993, + "grad_norm": 0.177865669131279, + "learning_rate": 1.8592846996926793e-05, + "loss": 1.1965, + "step": 5418 + }, + { + "epoch": 2.0175216111505394, + "grad_norm": 0.16508503258228302, + "learning_rate": 1.8592225302084187e-05, + "loss": 1.1869, + "step": 5419 + }, + { + "epoch": 2.0178939163011482, + "grad_norm": 0.21210977435112, + "learning_rate": 1.8591603480334947e-05, + "loss": 1.1911, + "step": 5420 + }, + { + "epoch": 2.0182662214517575, + "grad_norm": 0.16533805429935455, + "learning_rate": 1.859098153168827e-05, + "loss": 1.1882, + "step": 5421 + }, + { + "epoch": 2.0186385266023663, + "grad_norm": 0.17425382137298584, + "learning_rate": 1.8590359456153333e-05, + "loss": 1.1772, + "step": 5422 + }, + { + "epoch": 2.0190108317529756, + "grad_norm": 0.16604116559028625, + "learning_rate": 1.8589737253739325e-05, + "loss": 1.1949, + "step": 5423 + }, + { + "epoch": 2.0193831369035844, + "grad_norm": 0.16940464079380035, + "learning_rate": 1.8589114924455438e-05, + "loss": 1.1811, + "step": 5424 + }, + { + "epoch": 2.0197554420541937, + "grad_norm": 0.17113879323005676, + "learning_rate": 1.858849246831086e-05, + "loss": 1.1798, + "step": 5425 + }, + { + "epoch": 2.020127747204803, + "grad_norm": 0.1643594354391098, + "learning_rate": 1.8587869885314788e-05, + "loss": 1.1811, + "step": 5426 + }, + { + "epoch": 2.0205000523554117, + "grad_norm": 0.16450834274291992, + "learning_rate": 1.858724717547642e-05, + "loss": 1.1875, + "step": 5427 + }, + { + "epoch": 2.020872357506021, + "grad_norm": 0.1718573272228241, + "learning_rate": 1.8586624338804947e-05, + "loss": 1.1845, + "step": 5428 + }, + { + "epoch": 2.02124466265663, + "grad_norm": 0.16138458251953125, + "learning_rate": 1.8586001375309576e-05, + "loss": 1.1766, + "step": 5429 + }, + { + "epoch": 2.021616967807239, + "grad_norm": 0.20273058116436005, + "learning_rate": 1.85853782849995e-05, + "loss": 1.1965, + "step": 5430 + }, + { + "epoch": 2.021989272957848, + "grad_norm": 0.17914332449436188, + "learning_rate": 1.8584755067883923e-05, + "loss": 1.1734, + "step": 5431 + }, + { + "epoch": 2.022361578108457, + "grad_norm": 0.17309610545635223, + "learning_rate": 1.8584131723972055e-05, + "loss": 1.1775, + "step": 5432 + }, + { + "epoch": 2.022733883259066, + "grad_norm": 0.18708333373069763, + "learning_rate": 1.8583508253273098e-05, + "loss": 1.195, + "step": 5433 + }, + { + "epoch": 2.0231061884096753, + "grad_norm": 0.16678020358085632, + "learning_rate": 1.8582884655796266e-05, + "loss": 1.1987, + "step": 5434 + }, + { + "epoch": 2.0234784935602845, + "grad_norm": 0.16107121109962463, + "learning_rate": 1.8582260931550766e-05, + "loss": 1.1866, + "step": 5435 + }, + { + "epoch": 2.0238507987108934, + "grad_norm": 0.16838499903678894, + "learning_rate": 1.8581637080545813e-05, + "loss": 1.1874, + "step": 5436 + }, + { + "epoch": 2.0242231038615026, + "grad_norm": 0.16616487503051758, + "learning_rate": 1.8581013102790612e-05, + "loss": 1.1863, + "step": 5437 + }, + { + "epoch": 2.0245954090121114, + "grad_norm": 0.18162907660007477, + "learning_rate": 1.8580388998294393e-05, + "loss": 1.1745, + "step": 5438 + }, + { + "epoch": 2.0249677141627207, + "grad_norm": 0.16428324580192566, + "learning_rate": 1.8579764767066362e-05, + "loss": 1.1795, + "step": 5439 + }, + { + "epoch": 2.0253400193133295, + "grad_norm": 0.17782078683376312, + "learning_rate": 1.8579140409115744e-05, + "loss": 1.1999, + "step": 5440 + }, + { + "epoch": 2.025712324463939, + "grad_norm": 0.16830138862133026, + "learning_rate": 1.8578515924451765e-05, + "loss": 1.173, + "step": 5441 + }, + { + "epoch": 2.0260846296145476, + "grad_norm": 0.19714663922786713, + "learning_rate": 1.8577891313083637e-05, + "loss": 1.2026, + "step": 5442 + }, + { + "epoch": 2.026456934765157, + "grad_norm": 0.18386825919151306, + "learning_rate": 1.8577266575020598e-05, + "loss": 1.1741, + "step": 5443 + }, + { + "epoch": 2.026829239915766, + "grad_norm": 0.18373322486877441, + "learning_rate": 1.857664171027187e-05, + "loss": 1.171, + "step": 5444 + }, + { + "epoch": 2.027201545066375, + "grad_norm": 0.23901142179965973, + "learning_rate": 1.8576016718846678e-05, + "loss": 1.1837, + "step": 5445 + }, + { + "epoch": 2.027573850216984, + "grad_norm": 0.16936446726322174, + "learning_rate": 1.8575391600754266e-05, + "loss": 1.1687, + "step": 5446 + }, + { + "epoch": 2.027946155367593, + "grad_norm": 0.17396526038646698, + "learning_rate": 1.857476635600385e-05, + "loss": 1.1989, + "step": 5447 + }, + { + "epoch": 2.0283184605182023, + "grad_norm": 0.17292600870132446, + "learning_rate": 1.8574140984604672e-05, + "loss": 1.1754, + "step": 5448 + }, + { + "epoch": 2.028690765668811, + "grad_norm": 0.1673480123281479, + "learning_rate": 1.8573515486565976e-05, + "loss": 1.1891, + "step": 5449 + }, + { + "epoch": 2.0290630708194204, + "grad_norm": 0.16834761202335358, + "learning_rate": 1.8572889861896993e-05, + "loss": 1.1937, + "step": 5450 + }, + { + "epoch": 2.0294353759700297, + "grad_norm": 0.16884836554527283, + "learning_rate": 1.857226411060696e-05, + "loss": 1.1812, + "step": 5451 + }, + { + "epoch": 2.0298076811206385, + "grad_norm": 0.16621071100234985, + "learning_rate": 1.857163823270513e-05, + "loss": 1.1886, + "step": 5452 + }, + { + "epoch": 2.0301799862712477, + "grad_norm": 0.16866537928581238, + "learning_rate": 1.8571012228200737e-05, + "loss": 1.1831, + "step": 5453 + }, + { + "epoch": 2.0305522914218566, + "grad_norm": 0.16921500861644745, + "learning_rate": 1.8570386097103033e-05, + "loss": 1.1753, + "step": 5454 + }, + { + "epoch": 2.030924596572466, + "grad_norm": 0.1673291027545929, + "learning_rate": 1.8569759839421263e-05, + "loss": 1.1827, + "step": 5455 + }, + { + "epoch": 2.0312969017230746, + "grad_norm": 0.1631641387939453, + "learning_rate": 1.856913345516468e-05, + "loss": 1.1886, + "step": 5456 + }, + { + "epoch": 2.031669206873684, + "grad_norm": 0.16557154059410095, + "learning_rate": 1.8568506944342535e-05, + "loss": 1.1644, + "step": 5457 + }, + { + "epoch": 2.0320415120242927, + "grad_norm": 0.1667705774307251, + "learning_rate": 1.8567880306964077e-05, + "loss": 1.1846, + "step": 5458 + }, + { + "epoch": 2.032413817174902, + "grad_norm": 0.16573351621627808, + "learning_rate": 1.8567253543038564e-05, + "loss": 1.1714, + "step": 5459 + }, + { + "epoch": 2.0327861223255113, + "grad_norm": 0.17770934104919434, + "learning_rate": 1.8566626652575257e-05, + "loss": 1.1844, + "step": 5460 + }, + { + "epoch": 2.03315842747612, + "grad_norm": 0.1648428589105606, + "learning_rate": 1.856599963558341e-05, + "loss": 1.197, + "step": 5461 + }, + { + "epoch": 2.0335307326267293, + "grad_norm": 0.16865648329257965, + "learning_rate": 1.8565372492072288e-05, + "loss": 1.2002, + "step": 5462 + }, + { + "epoch": 2.033903037777338, + "grad_norm": 0.1726176142692566, + "learning_rate": 1.856474522205115e-05, + "loss": 1.1789, + "step": 5463 + }, + { + "epoch": 2.0342753429279474, + "grad_norm": 0.1659863144159317, + "learning_rate": 1.856411782552926e-05, + "loss": 1.1927, + "step": 5464 + }, + { + "epoch": 2.0346476480785562, + "grad_norm": 0.17737741768360138, + "learning_rate": 1.856349030251589e-05, + "loss": 1.1853, + "step": 5465 + }, + { + "epoch": 2.0350199532291655, + "grad_norm": 0.1588411182165146, + "learning_rate": 1.8562862653020306e-05, + "loss": 1.1897, + "step": 5466 + }, + { + "epoch": 2.0353922583797743, + "grad_norm": 0.200198695063591, + "learning_rate": 1.8562234877051778e-05, + "loss": 1.1845, + "step": 5467 + }, + { + "epoch": 2.0357645635303836, + "grad_norm": 0.18898078799247742, + "learning_rate": 1.8561606974619577e-05, + "loss": 1.1717, + "step": 5468 + }, + { + "epoch": 2.036136868680993, + "grad_norm": 0.17676281929016113, + "learning_rate": 1.856097894573298e-05, + "loss": 1.1838, + "step": 5469 + }, + { + "epoch": 2.0365091738316017, + "grad_norm": 0.23812207579612732, + "learning_rate": 1.856035079040126e-05, + "loss": 1.1728, + "step": 5470 + }, + { + "epoch": 2.036881478982211, + "grad_norm": 0.17304615676403046, + "learning_rate": 1.8559722508633698e-05, + "loss": 1.1913, + "step": 5471 + }, + { + "epoch": 2.0372537841328198, + "grad_norm": 0.17756541073322296, + "learning_rate": 1.8559094100439568e-05, + "loss": 1.1787, + "step": 5472 + }, + { + "epoch": 2.037626089283429, + "grad_norm": 0.16720567643642426, + "learning_rate": 1.8558465565828156e-05, + "loss": 1.1778, + "step": 5473 + }, + { + "epoch": 2.037998394434038, + "grad_norm": 0.19061878323554993, + "learning_rate": 1.855783690480875e-05, + "loss": 1.1798, + "step": 5474 + }, + { + "epoch": 2.038370699584647, + "grad_norm": 0.16539441049098969, + "learning_rate": 1.8557208117390626e-05, + "loss": 1.1939, + "step": 5475 + }, + { + "epoch": 2.038743004735256, + "grad_norm": 0.16110876202583313, + "learning_rate": 1.8556579203583075e-05, + "loss": 1.1937, + "step": 5476 + }, + { + "epoch": 2.039115309885865, + "grad_norm": 0.16478274762630463, + "learning_rate": 1.8555950163395383e-05, + "loss": 1.189, + "step": 5477 + }, + { + "epoch": 2.0394876150364745, + "grad_norm": 0.16724629700183868, + "learning_rate": 1.855532099683685e-05, + "loss": 1.1846, + "step": 5478 + }, + { + "epoch": 2.0398599201870833, + "grad_norm": 0.1901393085718155, + "learning_rate": 1.855469170391676e-05, + "loss": 1.1695, + "step": 5479 + }, + { + "epoch": 2.0402322253376926, + "grad_norm": 0.16991356015205383, + "learning_rate": 1.855406228464441e-05, + "loss": 1.2068, + "step": 5480 + }, + { + "epoch": 2.0406045304883014, + "grad_norm": 0.1755204200744629, + "learning_rate": 1.85534327390291e-05, + "loss": 1.178, + "step": 5481 + }, + { + "epoch": 2.0409768356389106, + "grad_norm": 0.17370069026947021, + "learning_rate": 1.855280306708012e-05, + "loss": 1.1727, + "step": 5482 + }, + { + "epoch": 2.0413491407895195, + "grad_norm": 0.17671510577201843, + "learning_rate": 1.8552173268806778e-05, + "loss": 1.1941, + "step": 5483 + }, + { + "epoch": 2.0417214459401287, + "grad_norm": 0.17848750948905945, + "learning_rate": 1.8551543344218372e-05, + "loss": 1.169, + "step": 5484 + }, + { + "epoch": 2.0420937510907375, + "grad_norm": 0.18597623705863953, + "learning_rate": 1.855091329332421e-05, + "loss": 1.1696, + "step": 5485 + }, + { + "epoch": 2.042466056241347, + "grad_norm": 0.19254527986049652, + "learning_rate": 1.8550283116133595e-05, + "loss": 1.1791, + "step": 5486 + }, + { + "epoch": 2.042838361391956, + "grad_norm": 0.17649273574352264, + "learning_rate": 1.8549652812655836e-05, + "loss": 1.1929, + "step": 5487 + }, + { + "epoch": 2.043210666542565, + "grad_norm": 0.16401410102844238, + "learning_rate": 1.854902238290024e-05, + "loss": 1.1727, + "step": 5488 + }, + { + "epoch": 2.043582971693174, + "grad_norm": 0.16937775909900665, + "learning_rate": 1.854839182687612e-05, + "loss": 1.1872, + "step": 5489 + }, + { + "epoch": 2.043955276843783, + "grad_norm": 0.16704395413398743, + "learning_rate": 1.854776114459279e-05, + "loss": 1.1857, + "step": 5490 + }, + { + "epoch": 2.0443275819943922, + "grad_norm": 0.16636380553245544, + "learning_rate": 1.8547130336059562e-05, + "loss": 1.1877, + "step": 5491 + }, + { + "epoch": 2.044699887145001, + "grad_norm": 0.17131738364696503, + "learning_rate": 1.8546499401285755e-05, + "loss": 1.1872, + "step": 5492 + }, + { + "epoch": 2.0450721922956103, + "grad_norm": 0.1786455512046814, + "learning_rate": 1.854586834028069e-05, + "loss": 1.1894, + "step": 5493 + }, + { + "epoch": 2.045444497446219, + "grad_norm": 0.17036451399326324, + "learning_rate": 1.8545237153053688e-05, + "loss": 1.1865, + "step": 5494 + }, + { + "epoch": 2.0458168025968284, + "grad_norm": 0.17519360780715942, + "learning_rate": 1.8544605839614066e-05, + "loss": 1.1922, + "step": 5495 + }, + { + "epoch": 2.0461891077474377, + "grad_norm": 0.17141428589820862, + "learning_rate": 1.8543974399971153e-05, + "loss": 1.1991, + "step": 5496 + }, + { + "epoch": 2.0465614128980465, + "grad_norm": 0.17702911794185638, + "learning_rate": 1.8543342834134276e-05, + "loss": 1.1764, + "step": 5497 + }, + { + "epoch": 2.0469337180486558, + "grad_norm": 0.16847530007362366, + "learning_rate": 1.854271114211276e-05, + "loss": 1.1831, + "step": 5498 + }, + { + "epoch": 2.0473060231992646, + "grad_norm": 0.17060035467147827, + "learning_rate": 1.8542079323915935e-05, + "loss": 1.2039, + "step": 5499 + }, + { + "epoch": 2.047678328349874, + "grad_norm": 0.1642797440290451, + "learning_rate": 1.8541447379553136e-05, + "loss": 1.1645, + "step": 5500 + }, + { + "epoch": 2.047678328349874, + "eval_loss": 1.3067960739135742, + "eval_runtime": 17.3329, + "eval_samples_per_second": 100.041, + "eval_steps_per_second": 5.019, + "step": 5500 + }, + { + "epoch": 2.0480506335004827, + "grad_norm": 0.2160232812166214, + "learning_rate": 1.8540815309033697e-05, + "loss": 1.2004, + "step": 5501 + }, + { + "epoch": 2.048422938651092, + "grad_norm": 0.1832730919122696, + "learning_rate": 1.854018311236695e-05, + "loss": 1.1862, + "step": 5502 + }, + { + "epoch": 2.0487952438017007, + "grad_norm": 0.18487650156021118, + "learning_rate": 1.8539550789562234e-05, + "loss": 1.2079, + "step": 5503 + }, + { + "epoch": 2.04916754895231, + "grad_norm": 0.16732271015644073, + "learning_rate": 1.853891834062889e-05, + "loss": 1.1865, + "step": 5504 + }, + { + "epoch": 2.0495398541029193, + "grad_norm": 0.21880120038986206, + "learning_rate": 1.853828576557626e-05, + "loss": 1.1793, + "step": 5505 + }, + { + "epoch": 2.049912159253528, + "grad_norm": 0.16671521961688995, + "learning_rate": 1.853765306441368e-05, + "loss": 1.1817, + "step": 5506 + }, + { + "epoch": 2.0502844644041374, + "grad_norm": 0.17051751911640167, + "learning_rate": 1.8537020237150503e-05, + "loss": 1.1646, + "step": 5507 + }, + { + "epoch": 2.050656769554746, + "grad_norm": 0.16859528422355652, + "learning_rate": 1.853638728379607e-05, + "loss": 1.1764, + "step": 5508 + }, + { + "epoch": 2.0510290747053554, + "grad_norm": 0.16496269404888153, + "learning_rate": 1.8535754204359737e-05, + "loss": 1.183, + "step": 5509 + }, + { + "epoch": 2.0514013798559643, + "grad_norm": 0.16721811890602112, + "learning_rate": 1.853512099885085e-05, + "loss": 1.2113, + "step": 5510 + }, + { + "epoch": 2.0517736850065735, + "grad_norm": 0.17551200091838837, + "learning_rate": 1.8534487667278757e-05, + "loss": 1.177, + "step": 5511 + }, + { + "epoch": 2.052145990157183, + "grad_norm": 0.16415713727474213, + "learning_rate": 1.853385420965282e-05, + "loss": 1.1782, + "step": 5512 + }, + { + "epoch": 2.0525182953077916, + "grad_norm": 0.16708432137966156, + "learning_rate": 1.8533220625982392e-05, + "loss": 1.2093, + "step": 5513 + }, + { + "epoch": 2.052890600458401, + "grad_norm": 0.16967132687568665, + "learning_rate": 1.8532586916276828e-05, + "loss": 1.18, + "step": 5514 + }, + { + "epoch": 2.0532629056090097, + "grad_norm": 0.17362266778945923, + "learning_rate": 1.8531953080545494e-05, + "loss": 1.1906, + "step": 5515 + }, + { + "epoch": 2.053635210759619, + "grad_norm": 0.1716122329235077, + "learning_rate": 1.853131911879775e-05, + "loss": 1.1809, + "step": 5516 + }, + { + "epoch": 2.054007515910228, + "grad_norm": 0.1693599373102188, + "learning_rate": 1.8530685031042952e-05, + "loss": 1.1829, + "step": 5517 + }, + { + "epoch": 2.054379821060837, + "grad_norm": 0.1685894876718521, + "learning_rate": 1.8530050817290477e-05, + "loss": 1.1855, + "step": 5518 + }, + { + "epoch": 2.054752126211446, + "grad_norm": 0.18109537661075592, + "learning_rate": 1.852941647754968e-05, + "loss": 1.1782, + "step": 5519 + }, + { + "epoch": 2.055124431362055, + "grad_norm": 0.17635756731033325, + "learning_rate": 1.8528782011829945e-05, + "loss": 1.1857, + "step": 5520 + }, + { + "epoch": 2.0554967365126644, + "grad_norm": 0.1745537966489792, + "learning_rate": 1.852814742014063e-05, + "loss": 1.1963, + "step": 5521 + }, + { + "epoch": 2.0558690416632732, + "grad_norm": 0.17796571552753448, + "learning_rate": 1.8527512702491116e-05, + "loss": 1.1818, + "step": 5522 + }, + { + "epoch": 2.0562413468138825, + "grad_norm": 0.17338848114013672, + "learning_rate": 1.852687785889077e-05, + "loss": 1.1864, + "step": 5523 + }, + { + "epoch": 2.0566136519644913, + "grad_norm": 0.18581651151180267, + "learning_rate": 1.8526242889348976e-05, + "loss": 1.1952, + "step": 5524 + }, + { + "epoch": 2.0569859571151006, + "grad_norm": 0.1682850867509842, + "learning_rate": 1.852560779387511e-05, + "loss": 1.1846, + "step": 5525 + }, + { + "epoch": 2.0573582622657094, + "grad_norm": 0.19426992535591125, + "learning_rate": 1.8524972572478554e-05, + "loss": 1.1823, + "step": 5526 + }, + { + "epoch": 2.0577305674163187, + "grad_norm": 0.17708083987236023, + "learning_rate": 1.8524337225168682e-05, + "loss": 1.1776, + "step": 5527 + }, + { + "epoch": 2.0581028725669275, + "grad_norm": 0.1654316484928131, + "learning_rate": 1.852370175195489e-05, + "loss": 1.1824, + "step": 5528 + }, + { + "epoch": 2.0584751777175367, + "grad_norm": 0.20494744181632996, + "learning_rate": 1.8523066152846552e-05, + "loss": 1.1866, + "step": 5529 + }, + { + "epoch": 2.058847482868146, + "grad_norm": 0.17771178483963013, + "learning_rate": 1.852243042785307e-05, + "loss": 1.1918, + "step": 5530 + }, + { + "epoch": 2.059219788018755, + "grad_norm": 0.17502543330192566, + "learning_rate": 1.852179457698382e-05, + "loss": 1.1821, + "step": 5531 + }, + { + "epoch": 2.059592093169364, + "grad_norm": 0.1928001195192337, + "learning_rate": 1.85211586002482e-05, + "loss": 1.1901, + "step": 5532 + }, + { + "epoch": 2.059964398319973, + "grad_norm": 0.16364991664886475, + "learning_rate": 1.85205224976556e-05, + "loss": 1.1747, + "step": 5533 + }, + { + "epoch": 2.060336703470582, + "grad_norm": 0.1756642460823059, + "learning_rate": 1.851988626921542e-05, + "loss": 1.1723, + "step": 5534 + }, + { + "epoch": 2.060709008621191, + "grad_norm": 0.19008730351924896, + "learning_rate": 1.8519249914937056e-05, + "loss": 1.1996, + "step": 5535 + }, + { + "epoch": 2.0610813137718003, + "grad_norm": 0.17250213027000427, + "learning_rate": 1.85186134348299e-05, + "loss": 1.1882, + "step": 5536 + }, + { + "epoch": 2.061453618922409, + "grad_norm": 0.1646471917629242, + "learning_rate": 1.8517976828903365e-05, + "loss": 1.1918, + "step": 5537 + }, + { + "epoch": 2.0618259240730183, + "grad_norm": 0.1910063773393631, + "learning_rate": 1.851734009716684e-05, + "loss": 1.187, + "step": 5538 + }, + { + "epoch": 2.0621982292236276, + "grad_norm": 0.19279128313064575, + "learning_rate": 1.8516703239629744e-05, + "loss": 1.1942, + "step": 5539 + }, + { + "epoch": 2.0625705343742364, + "grad_norm": 0.16666115820407867, + "learning_rate": 1.8516066256301468e-05, + "loss": 1.1883, + "step": 5540 + }, + { + "epoch": 2.0629428395248457, + "grad_norm": 0.20289574563503265, + "learning_rate": 1.8515429147191434e-05, + "loss": 1.1917, + "step": 5541 + }, + { + "epoch": 2.0633151446754545, + "grad_norm": 0.18597279489040375, + "learning_rate": 1.851479191230904e-05, + "loss": 1.1754, + "step": 5542 + }, + { + "epoch": 2.063687449826064, + "grad_norm": 0.17310966551303864, + "learning_rate": 1.851415455166371e-05, + "loss": 1.1892, + "step": 5543 + }, + { + "epoch": 2.0640597549766726, + "grad_norm": 0.20302632451057434, + "learning_rate": 1.851351706526485e-05, + "loss": 1.1921, + "step": 5544 + }, + { + "epoch": 2.064432060127282, + "grad_norm": 0.17062672972679138, + "learning_rate": 1.8512879453121874e-05, + "loss": 1.1821, + "step": 5545 + }, + { + "epoch": 2.0648043652778907, + "grad_norm": 0.16479294002056122, + "learning_rate": 1.8512241715244203e-05, + "loss": 1.1813, + "step": 5546 + }, + { + "epoch": 2.0651766704285, + "grad_norm": 0.17303155362606049, + "learning_rate": 1.8511603851641256e-05, + "loss": 1.1988, + "step": 5547 + }, + { + "epoch": 2.065548975579109, + "grad_norm": 0.17052410542964935, + "learning_rate": 1.8510965862322455e-05, + "loss": 1.1997, + "step": 5548 + }, + { + "epoch": 2.065921280729718, + "grad_norm": 0.15899476408958435, + "learning_rate": 1.8510327747297225e-05, + "loss": 1.1912, + "step": 5549 + }, + { + "epoch": 2.0662935858803273, + "grad_norm": 0.18540219962596893, + "learning_rate": 1.8509689506574986e-05, + "loss": 1.198, + "step": 5550 + }, + { + "epoch": 2.066665891030936, + "grad_norm": 0.1953384429216385, + "learning_rate": 1.8509051140165167e-05, + "loss": 1.1713, + "step": 5551 + }, + { + "epoch": 2.0670381961815454, + "grad_norm": 0.1603497862815857, + "learning_rate": 1.85084126480772e-05, + "loss": 1.1924, + "step": 5552 + }, + { + "epoch": 2.067410501332154, + "grad_norm": 0.3583022654056549, + "learning_rate": 1.8507774030320508e-05, + "loss": 1.1836, + "step": 5553 + }, + { + "epoch": 2.0677828064827635, + "grad_norm": 0.1765451580286026, + "learning_rate": 1.8507135286904527e-05, + "loss": 1.1841, + "step": 5554 + }, + { + "epoch": 2.0681551116333723, + "grad_norm": 0.17876610159873962, + "learning_rate": 1.8506496417838695e-05, + "loss": 1.1916, + "step": 5555 + }, + { + "epoch": 2.0685274167839816, + "grad_norm": 0.17126069962978363, + "learning_rate": 1.8505857423132447e-05, + "loss": 1.1818, + "step": 5556 + }, + { + "epoch": 2.068899721934591, + "grad_norm": 0.1569199413061142, + "learning_rate": 1.8505218302795213e-05, + "loss": 1.193, + "step": 5557 + }, + { + "epoch": 2.0692720270851996, + "grad_norm": 0.17595240473747253, + "learning_rate": 1.8504579056836437e-05, + "loss": 1.1999, + "step": 5558 + }, + { + "epoch": 2.069644332235809, + "grad_norm": 0.18248499929904938, + "learning_rate": 1.850393968526557e-05, + "loss": 1.1813, + "step": 5559 + }, + { + "epoch": 2.0700166373864177, + "grad_norm": 0.17279589176177979, + "learning_rate": 1.850330018809204e-05, + "loss": 1.1837, + "step": 5560 + }, + { + "epoch": 2.070388942537027, + "grad_norm": 0.16098688542842865, + "learning_rate": 1.8502660565325302e-05, + "loss": 1.1816, + "step": 5561 + }, + { + "epoch": 2.070761247687636, + "grad_norm": 0.16793614625930786, + "learning_rate": 1.8502020816974806e-05, + "loss": 1.1893, + "step": 5562 + }, + { + "epoch": 2.071133552838245, + "grad_norm": 0.17572426795959473, + "learning_rate": 1.850138094304999e-05, + "loss": 1.182, + "step": 5563 + }, + { + "epoch": 2.0715058579888543, + "grad_norm": 0.1719566136598587, + "learning_rate": 1.850074094356031e-05, + "loss": 1.191, + "step": 5564 + }, + { + "epoch": 2.071878163139463, + "grad_norm": 0.16841794550418854, + "learning_rate": 1.8500100818515224e-05, + "loss": 1.1896, + "step": 5565 + }, + { + "epoch": 2.0722504682900724, + "grad_norm": 0.16912922263145447, + "learning_rate": 1.8499460567924182e-05, + "loss": 1.1908, + "step": 5566 + }, + { + "epoch": 2.0726227734406812, + "grad_norm": 0.1644641011953354, + "learning_rate": 1.849882019179664e-05, + "loss": 1.1758, + "step": 5567 + }, + { + "epoch": 2.0729950785912905, + "grad_norm": 0.16581396758556366, + "learning_rate": 1.8498179690142057e-05, + "loss": 1.1841, + "step": 5568 + }, + { + "epoch": 2.0733673837418993, + "grad_norm": 0.165495365858078, + "learning_rate": 1.8497539062969893e-05, + "loss": 1.1847, + "step": 5569 + }, + { + "epoch": 2.0737396888925086, + "grad_norm": 0.16472892463207245, + "learning_rate": 1.849689831028961e-05, + "loss": 1.1978, + "step": 5570 + }, + { + "epoch": 2.0741119940431174, + "grad_norm": 0.1632337123155594, + "learning_rate": 1.8496257432110673e-05, + "loss": 1.1641, + "step": 5571 + }, + { + "epoch": 2.0744842991937267, + "grad_norm": 0.16693541407585144, + "learning_rate": 1.8495616428442546e-05, + "loss": 1.178, + "step": 5572 + }, + { + "epoch": 2.074856604344336, + "grad_norm": 0.17431595921516418, + "learning_rate": 1.84949752992947e-05, + "loss": 1.1958, + "step": 5573 + }, + { + "epoch": 2.0752289094949448, + "grad_norm": 0.1731480062007904, + "learning_rate": 1.84943340446766e-05, + "loss": 1.1769, + "step": 5574 + }, + { + "epoch": 2.075601214645554, + "grad_norm": 0.16712194681167603, + "learning_rate": 1.849369266459772e-05, + "loss": 1.1871, + "step": 5575 + }, + { + "epoch": 2.075973519796163, + "grad_norm": 0.16776369512081146, + "learning_rate": 1.849305115906753e-05, + "loss": 1.1734, + "step": 5576 + }, + { + "epoch": 2.076345824946772, + "grad_norm": 0.17154593765735626, + "learning_rate": 1.849240952809551e-05, + "loss": 1.1959, + "step": 5577 + }, + { + "epoch": 2.076718130097381, + "grad_norm": 0.16923579573631287, + "learning_rate": 1.8491767771691133e-05, + "loss": 1.1937, + "step": 5578 + }, + { + "epoch": 2.07709043524799, + "grad_norm": 0.17263515293598175, + "learning_rate": 1.849112588986388e-05, + "loss": 1.1912, + "step": 5579 + }, + { + "epoch": 2.077462740398599, + "grad_norm": 0.16480253636837006, + "learning_rate": 1.849048388262323e-05, + "loss": 1.1821, + "step": 5580 + }, + { + "epoch": 2.0778350455492083, + "grad_norm": 0.16835813224315643, + "learning_rate": 1.8489841749978668e-05, + "loss": 1.1754, + "step": 5581 + }, + { + "epoch": 2.0782073506998175, + "grad_norm": 0.17042113840579987, + "learning_rate": 1.848919949193967e-05, + "loss": 1.1767, + "step": 5582 + }, + { + "epoch": 2.0785796558504264, + "grad_norm": 0.17123256623744965, + "learning_rate": 1.8488557108515736e-05, + "loss": 1.1778, + "step": 5583 + }, + { + "epoch": 2.0789519610010356, + "grad_norm": 0.16714586317539215, + "learning_rate": 1.848791459971634e-05, + "loss": 1.183, + "step": 5584 + }, + { + "epoch": 2.0793242661516445, + "grad_norm": 0.16388732194900513, + "learning_rate": 1.848727196555098e-05, + "loss": 1.1881, + "step": 5585 + }, + { + "epoch": 2.0796965713022537, + "grad_norm": 0.1711377501487732, + "learning_rate": 1.8486629206029146e-05, + "loss": 1.1975, + "step": 5586 + }, + { + "epoch": 2.0800688764528625, + "grad_norm": 0.16601723432540894, + "learning_rate": 1.8485986321160335e-05, + "loss": 1.1854, + "step": 5587 + }, + { + "epoch": 2.080441181603472, + "grad_norm": 0.16255028545856476, + "learning_rate": 1.8485343310954033e-05, + "loss": 1.1762, + "step": 5588 + }, + { + "epoch": 2.0808134867540806, + "grad_norm": 0.1604170799255371, + "learning_rate": 1.8484700175419747e-05, + "loss": 1.1835, + "step": 5589 + }, + { + "epoch": 2.08118579190469, + "grad_norm": 0.16844023764133453, + "learning_rate": 1.8484056914566967e-05, + "loss": 1.189, + "step": 5590 + }, + { + "epoch": 2.081558097055299, + "grad_norm": 0.1678459644317627, + "learning_rate": 1.84834135284052e-05, + "loss": 1.1828, + "step": 5591 + }, + { + "epoch": 2.081930402205908, + "grad_norm": 0.16233506798744202, + "learning_rate": 1.8482770016943952e-05, + "loss": 1.1749, + "step": 5592 + }, + { + "epoch": 2.0823027073565172, + "grad_norm": 0.16579484939575195, + "learning_rate": 1.848212638019272e-05, + "loss": 1.1794, + "step": 5593 + }, + { + "epoch": 2.082675012507126, + "grad_norm": 0.16897903382778168, + "learning_rate": 1.8481482618161016e-05, + "loss": 1.2122, + "step": 5594 + }, + { + "epoch": 2.0830473176577353, + "grad_norm": 0.16713303327560425, + "learning_rate": 1.848083873085834e-05, + "loss": 1.1682, + "step": 5595 + }, + { + "epoch": 2.083419622808344, + "grad_norm": 0.1653788685798645, + "learning_rate": 1.8480194718294212e-05, + "loss": 1.183, + "step": 5596 + }, + { + "epoch": 2.0837919279589534, + "grad_norm": 0.15681540966033936, + "learning_rate": 1.847955058047814e-05, + "loss": 1.1812, + "step": 5597 + }, + { + "epoch": 2.0841642331095622, + "grad_norm": 0.16332903504371643, + "learning_rate": 1.8478906317419644e-05, + "loss": 1.1776, + "step": 5598 + }, + { + "epoch": 2.0845365382601715, + "grad_norm": 0.16454966366291046, + "learning_rate": 1.8478261929128226e-05, + "loss": 1.1807, + "step": 5599 + }, + { + "epoch": 2.0849088434107808, + "grad_norm": 0.16205929219722748, + "learning_rate": 1.8477617415613413e-05, + "loss": 1.1902, + "step": 5600 + }, + { + "epoch": 2.0852811485613896, + "grad_norm": 0.16746239364147186, + "learning_rate": 1.8476972776884724e-05, + "loss": 1.2076, + "step": 5601 + }, + { + "epoch": 2.085653453711999, + "grad_norm": 0.1641511619091034, + "learning_rate": 1.8476328012951677e-05, + "loss": 1.1847, + "step": 5602 + }, + { + "epoch": 2.0860257588626077, + "grad_norm": 0.16240178048610687, + "learning_rate": 1.84756831238238e-05, + "loss": 1.1733, + "step": 5603 + }, + { + "epoch": 2.086398064013217, + "grad_norm": 0.16634000837802887, + "learning_rate": 1.8475038109510612e-05, + "loss": 1.1749, + "step": 5604 + }, + { + "epoch": 2.0867703691638257, + "grad_norm": 0.17073014378547668, + "learning_rate": 1.847439297002165e-05, + "loss": 1.1751, + "step": 5605 + }, + { + "epoch": 2.087142674314435, + "grad_norm": 0.15842556953430176, + "learning_rate": 1.8473747705366427e-05, + "loss": 1.1722, + "step": 5606 + }, + { + "epoch": 2.087514979465044, + "grad_norm": 0.15961557626724243, + "learning_rate": 1.8473102315554484e-05, + "loss": 1.1957, + "step": 5607 + }, + { + "epoch": 2.087887284615653, + "grad_norm": 0.1633174568414688, + "learning_rate": 1.8472456800595355e-05, + "loss": 1.1652, + "step": 5608 + }, + { + "epoch": 2.0882595897662624, + "grad_norm": 0.17014336585998535, + "learning_rate": 1.847181116049857e-05, + "loss": 1.1878, + "step": 5609 + }, + { + "epoch": 2.088631894916871, + "grad_norm": 0.16674593091011047, + "learning_rate": 1.8471165395273666e-05, + "loss": 1.1844, + "step": 5610 + }, + { + "epoch": 2.0890042000674804, + "grad_norm": 0.16320446133613586, + "learning_rate": 1.8470519504930178e-05, + "loss": 1.1932, + "step": 5611 + }, + { + "epoch": 2.0893765052180893, + "grad_norm": 0.1698993742465973, + "learning_rate": 1.846987348947765e-05, + "loss": 1.1791, + "step": 5612 + }, + { + "epoch": 2.0897488103686985, + "grad_norm": 0.16859601438045502, + "learning_rate": 1.8469227348925624e-05, + "loss": 1.1977, + "step": 5613 + }, + { + "epoch": 2.0901211155193073, + "grad_norm": 0.16423748433589935, + "learning_rate": 1.846858108328364e-05, + "loss": 1.1899, + "step": 5614 + }, + { + "epoch": 2.0904934206699166, + "grad_norm": 0.15783381462097168, + "learning_rate": 1.8467934692561242e-05, + "loss": 1.1945, + "step": 5615 + }, + { + "epoch": 2.0908657258205254, + "grad_norm": 0.16708320379257202, + "learning_rate": 1.8467288176767986e-05, + "loss": 1.1789, + "step": 5616 + }, + { + "epoch": 2.0912380309711347, + "grad_norm": 0.16855353116989136, + "learning_rate": 1.846664153591341e-05, + "loss": 1.1807, + "step": 5617 + }, + { + "epoch": 2.091610336121744, + "grad_norm": 0.17245307564735413, + "learning_rate": 1.846599477000707e-05, + "loss": 1.1927, + "step": 5618 + }, + { + "epoch": 2.091982641272353, + "grad_norm": 0.1593155562877655, + "learning_rate": 1.8465347879058524e-05, + "loss": 1.1677, + "step": 5619 + }, + { + "epoch": 2.092354946422962, + "grad_norm": 0.16127386689186096, + "learning_rate": 1.8464700863077313e-05, + "loss": 1.1698, + "step": 5620 + }, + { + "epoch": 2.092727251573571, + "grad_norm": 0.16885069012641907, + "learning_rate": 1.846405372207301e-05, + "loss": 1.1863, + "step": 5621 + }, + { + "epoch": 2.09309955672418, + "grad_norm": 0.16606591641902924, + "learning_rate": 1.846340645605516e-05, + "loss": 1.1731, + "step": 5622 + }, + { + "epoch": 2.093471861874789, + "grad_norm": 0.16918626427650452, + "learning_rate": 1.8462759065033326e-05, + "loss": 1.1936, + "step": 5623 + }, + { + "epoch": 2.093844167025398, + "grad_norm": 0.16348494589328766, + "learning_rate": 1.8462111549017074e-05, + "loss": 1.1815, + "step": 5624 + }, + { + "epoch": 2.094216472176007, + "grad_norm": 0.16731733083724976, + "learning_rate": 1.8461463908015967e-05, + "loss": 1.2031, + "step": 5625 + }, + { + "epoch": 2.0945887773266163, + "grad_norm": 0.1636827141046524, + "learning_rate": 1.8460816142039566e-05, + "loss": 1.1862, + "step": 5626 + }, + { + "epoch": 2.0949610824772256, + "grad_norm": 0.1637611985206604, + "learning_rate": 1.8460168251097444e-05, + "loss": 1.1915, + "step": 5627 + }, + { + "epoch": 2.0953333876278344, + "grad_norm": 0.16247594356536865, + "learning_rate": 1.8459520235199165e-05, + "loss": 1.1826, + "step": 5628 + }, + { + "epoch": 2.0957056927784437, + "grad_norm": 0.1628650724887848, + "learning_rate": 1.8458872094354307e-05, + "loss": 1.1848, + "step": 5629 + }, + { + "epoch": 2.0960779979290525, + "grad_norm": 0.16687503457069397, + "learning_rate": 1.8458223828572435e-05, + "loss": 1.1787, + "step": 5630 + }, + { + "epoch": 2.0964503030796617, + "grad_norm": 0.16528525948524475, + "learning_rate": 1.8457575437863134e-05, + "loss": 1.1788, + "step": 5631 + }, + { + "epoch": 2.0968226082302706, + "grad_norm": 0.16622121632099152, + "learning_rate": 1.845692692223597e-05, + "loss": 1.1934, + "step": 5632 + }, + { + "epoch": 2.09719491338088, + "grad_norm": 0.1611912101507187, + "learning_rate": 1.8456278281700527e-05, + "loss": 1.1901, + "step": 5633 + }, + { + "epoch": 2.097567218531489, + "grad_norm": 0.1634882390499115, + "learning_rate": 1.845562951626638e-05, + "loss": 1.1897, + "step": 5634 + }, + { + "epoch": 2.097939523682098, + "grad_norm": 0.1702599674463272, + "learning_rate": 1.8454980625943122e-05, + "loss": 1.2001, + "step": 5635 + }, + { + "epoch": 2.098311828832707, + "grad_norm": 0.15951082110404968, + "learning_rate": 1.845433161074033e-05, + "loss": 1.1773, + "step": 5636 + }, + { + "epoch": 2.098684133983316, + "grad_norm": 0.16114339232444763, + "learning_rate": 1.845368247066759e-05, + "loss": 1.1853, + "step": 5637 + }, + { + "epoch": 2.0990564391339253, + "grad_norm": 0.17119161784648895, + "learning_rate": 1.845303320573449e-05, + "loss": 1.1879, + "step": 5638 + }, + { + "epoch": 2.099428744284534, + "grad_norm": 0.1637708991765976, + "learning_rate": 1.8452383815950616e-05, + "loss": 1.1693, + "step": 5639 + }, + { + "epoch": 2.0998010494351433, + "grad_norm": 0.16330254077911377, + "learning_rate": 1.845173430132557e-05, + "loss": 1.1775, + "step": 5640 + }, + { + "epoch": 2.100173354585752, + "grad_norm": 0.16474707424640656, + "learning_rate": 1.8451084661868936e-05, + "loss": 1.166, + "step": 5641 + }, + { + "epoch": 2.1005456597363614, + "grad_norm": 0.16186347603797913, + "learning_rate": 1.845043489759031e-05, + "loss": 1.1778, + "step": 5642 + }, + { + "epoch": 2.1009179648869707, + "grad_norm": 0.1683456301689148, + "learning_rate": 1.844978500849929e-05, + "loss": 1.1905, + "step": 5643 + }, + { + "epoch": 2.1012902700375795, + "grad_norm": 0.15968085825443268, + "learning_rate": 1.8449134994605483e-05, + "loss": 1.1779, + "step": 5644 + }, + { + "epoch": 2.1016625751881888, + "grad_norm": 0.15909917652606964, + "learning_rate": 1.8448484855918476e-05, + "loss": 1.1993, + "step": 5645 + }, + { + "epoch": 2.1020348803387976, + "grad_norm": 0.16778358817100525, + "learning_rate": 1.844783459244788e-05, + "loss": 1.2041, + "step": 5646 + }, + { + "epoch": 2.102407185489407, + "grad_norm": 0.1642913818359375, + "learning_rate": 1.8447184204203297e-05, + "loss": 1.1888, + "step": 5647 + }, + { + "epoch": 2.1027794906400157, + "grad_norm": 0.1615377813577652, + "learning_rate": 1.8446533691194332e-05, + "loss": 1.1834, + "step": 5648 + }, + { + "epoch": 2.103151795790625, + "grad_norm": 0.16255392134189606, + "learning_rate": 1.8445883053430597e-05, + "loss": 1.167, + "step": 5649 + }, + { + "epoch": 2.1035241009412338, + "grad_norm": 0.1646036058664322, + "learning_rate": 1.84452322909217e-05, + "loss": 1.1865, + "step": 5650 + }, + { + "epoch": 2.103896406091843, + "grad_norm": 0.17090143263339996, + "learning_rate": 1.844458140367725e-05, + "loss": 1.1907, + "step": 5651 + }, + { + "epoch": 2.1042687112424523, + "grad_norm": 0.17261987924575806, + "learning_rate": 1.8443930391706863e-05, + "loss": 1.194, + "step": 5652 + }, + { + "epoch": 2.104641016393061, + "grad_norm": 0.166230246424675, + "learning_rate": 1.8443279255020153e-05, + "loss": 1.188, + "step": 5653 + }, + { + "epoch": 2.1050133215436704, + "grad_norm": 0.168045312166214, + "learning_rate": 1.844262799362674e-05, + "loss": 1.1929, + "step": 5654 + }, + { + "epoch": 2.105385626694279, + "grad_norm": 0.1657107025384903, + "learning_rate": 1.844197660753624e-05, + "loss": 1.1787, + "step": 5655 + }, + { + "epoch": 2.1057579318448885, + "grad_norm": 0.1633072793483734, + "learning_rate": 1.8441325096758275e-05, + "loss": 1.1949, + "step": 5656 + }, + { + "epoch": 2.1061302369954973, + "grad_norm": 0.16375941038131714, + "learning_rate": 1.844067346130247e-05, + "loss": 1.1742, + "step": 5657 + }, + { + "epoch": 2.1065025421461065, + "grad_norm": 0.16451074182987213, + "learning_rate": 1.8440021701178445e-05, + "loss": 1.1873, + "step": 5658 + }, + { + "epoch": 2.1068748472967154, + "grad_norm": 0.1651698499917984, + "learning_rate": 1.843936981639583e-05, + "loss": 1.1857, + "step": 5659 + }, + { + "epoch": 2.1072471524473246, + "grad_norm": 0.1603275090456009, + "learning_rate": 1.8438717806964254e-05, + "loss": 1.1923, + "step": 5660 + }, + { + "epoch": 2.107619457597934, + "grad_norm": 0.16892802715301514, + "learning_rate": 1.8438065672893348e-05, + "loss": 1.1954, + "step": 5661 + }, + { + "epoch": 2.1079917627485427, + "grad_norm": 0.1669367104768753, + "learning_rate": 1.8437413414192734e-05, + "loss": 1.19, + "step": 5662 + }, + { + "epoch": 2.108364067899152, + "grad_norm": 0.16005094349384308, + "learning_rate": 1.843676103087206e-05, + "loss": 1.1905, + "step": 5663 + }, + { + "epoch": 2.108736373049761, + "grad_norm": 0.16192243993282318, + "learning_rate": 1.8436108522940953e-05, + "loss": 1.1865, + "step": 5664 + }, + { + "epoch": 2.10910867820037, + "grad_norm": 0.16024897992610931, + "learning_rate": 1.8435455890409054e-05, + "loss": 1.1641, + "step": 5665 + }, + { + "epoch": 2.109480983350979, + "grad_norm": 0.1648632138967514, + "learning_rate": 1.8434803133285998e-05, + "loss": 1.1792, + "step": 5666 + }, + { + "epoch": 2.109853288501588, + "grad_norm": 0.15843743085861206, + "learning_rate": 1.843415025158143e-05, + "loss": 1.1728, + "step": 5667 + }, + { + "epoch": 2.110225593652197, + "grad_norm": 0.1614643931388855, + "learning_rate": 1.843349724530499e-05, + "loss": 1.1726, + "step": 5668 + }, + { + "epoch": 2.1105978988028062, + "grad_norm": 0.16818486154079437, + "learning_rate": 1.843284411446633e-05, + "loss": 1.1832, + "step": 5669 + }, + { + "epoch": 2.1109702039534155, + "grad_norm": 0.16501553356647491, + "learning_rate": 1.8432190859075088e-05, + "loss": 1.1941, + "step": 5670 + }, + { + "epoch": 2.1113425091040243, + "grad_norm": 0.16194112598896027, + "learning_rate": 1.8431537479140916e-05, + "loss": 1.1829, + "step": 5671 + }, + { + "epoch": 2.1117148142546336, + "grad_norm": 0.1625380516052246, + "learning_rate": 1.8430883974673467e-05, + "loss": 1.1866, + "step": 5672 + }, + { + "epoch": 2.1120871194052424, + "grad_norm": 0.16628055274486542, + "learning_rate": 1.8430230345682393e-05, + "loss": 1.167, + "step": 5673 + }, + { + "epoch": 2.1124594245558517, + "grad_norm": 0.16523656249046326, + "learning_rate": 1.842957659217734e-05, + "loss": 1.1693, + "step": 5674 + }, + { + "epoch": 2.1128317297064605, + "grad_norm": 0.16235126554965973, + "learning_rate": 1.842892271416797e-05, + "loss": 1.1814, + "step": 5675 + }, + { + "epoch": 2.1132040348570698, + "grad_norm": 0.16224858164787292, + "learning_rate": 1.8428268711663943e-05, + "loss": 1.1778, + "step": 5676 + }, + { + "epoch": 2.113576340007679, + "grad_norm": 0.1565116047859192, + "learning_rate": 1.842761458467492e-05, + "loss": 1.1834, + "step": 5677 + }, + { + "epoch": 2.113948645158288, + "grad_norm": 0.1609666496515274, + "learning_rate": 1.842696033321055e-05, + "loss": 1.188, + "step": 5678 + }, + { + "epoch": 2.114320950308897, + "grad_norm": 0.17431986331939697, + "learning_rate": 1.842630595728051e-05, + "loss": 1.2025, + "step": 5679 + }, + { + "epoch": 2.114693255459506, + "grad_norm": 0.16619716584682465, + "learning_rate": 1.842565145689446e-05, + "loss": 1.1908, + "step": 5680 + }, + { + "epoch": 2.115065560610115, + "grad_norm": 0.16613732278347015, + "learning_rate": 1.8424996832062066e-05, + "loss": 1.1826, + "step": 5681 + }, + { + "epoch": 2.115437865760724, + "grad_norm": 0.16963297128677368, + "learning_rate": 1.8424342082792996e-05, + "loss": 1.1904, + "step": 5682 + }, + { + "epoch": 2.1158101709113333, + "grad_norm": 0.15998825430870056, + "learning_rate": 1.8423687209096926e-05, + "loss": 1.1873, + "step": 5683 + }, + { + "epoch": 2.116182476061942, + "grad_norm": 0.15529409050941467, + "learning_rate": 1.842303221098352e-05, + "loss": 1.1879, + "step": 5684 + }, + { + "epoch": 2.1165547812125514, + "grad_norm": 0.16346554458141327, + "learning_rate": 1.8422377088462456e-05, + "loss": 1.1873, + "step": 5685 + }, + { + "epoch": 2.1169270863631606, + "grad_norm": 0.16552503407001495, + "learning_rate": 1.8421721841543412e-05, + "loss": 1.1971, + "step": 5686 + }, + { + "epoch": 2.1172993915137694, + "grad_norm": 0.16993537545204163, + "learning_rate": 1.842106647023607e-05, + "loss": 1.2047, + "step": 5687 + }, + { + "epoch": 2.1176716966643787, + "grad_norm": 0.16615287959575653, + "learning_rate": 1.8420410974550103e-05, + "loss": 1.1826, + "step": 5688 + }, + { + "epoch": 2.1180440018149875, + "grad_norm": 0.16618463397026062, + "learning_rate": 1.8419755354495194e-05, + "loss": 1.1757, + "step": 5689 + }, + { + "epoch": 2.118416306965597, + "grad_norm": 0.16526402533054352, + "learning_rate": 1.8419099610081025e-05, + "loss": 1.1833, + "step": 5690 + }, + { + "epoch": 2.1187886121162056, + "grad_norm": 0.1653960645198822, + "learning_rate": 1.841844374131728e-05, + "loss": 1.176, + "step": 5691 + }, + { + "epoch": 2.119160917266815, + "grad_norm": 0.16341429948806763, + "learning_rate": 1.8417787748213655e-05, + "loss": 1.1807, + "step": 5692 + }, + { + "epoch": 2.1195332224174237, + "grad_norm": 0.15940816700458527, + "learning_rate": 1.8417131630779834e-05, + "loss": 1.1814, + "step": 5693 + }, + { + "epoch": 2.119905527568033, + "grad_norm": 0.1656249612569809, + "learning_rate": 1.84164753890255e-05, + "loss": 1.1895, + "step": 5694 + }, + { + "epoch": 2.1202778327186422, + "grad_norm": 0.1686115264892578, + "learning_rate": 1.841581902296036e-05, + "loss": 1.1888, + "step": 5695 + }, + { + "epoch": 2.120650137869251, + "grad_norm": 0.1635192185640335, + "learning_rate": 1.8415162532594096e-05, + "loss": 1.183, + "step": 5696 + }, + { + "epoch": 2.1210224430198603, + "grad_norm": 0.15994298458099365, + "learning_rate": 1.841450591793641e-05, + "loss": 1.1839, + "step": 5697 + }, + { + "epoch": 2.121394748170469, + "grad_norm": 0.16956914961338043, + "learning_rate": 1.8413849178997002e-05, + "loss": 1.1802, + "step": 5698 + }, + { + "epoch": 2.1217670533210784, + "grad_norm": 0.16558878123760223, + "learning_rate": 1.841319231578557e-05, + "loss": 1.2054, + "step": 5699 + }, + { + "epoch": 2.122139358471687, + "grad_norm": 0.16168220341205597, + "learning_rate": 1.8412535328311813e-05, + "loss": 1.1613, + "step": 5700 + }, + { + "epoch": 2.1225116636222965, + "grad_norm": 0.16287264227867126, + "learning_rate": 1.8411878216585436e-05, + "loss": 1.1889, + "step": 5701 + }, + { + "epoch": 2.1228839687729053, + "grad_norm": 0.1619829684495926, + "learning_rate": 1.841122098061615e-05, + "loss": 1.1877, + "step": 5702 + }, + { + "epoch": 2.1232562739235146, + "grad_norm": 0.16612021625041962, + "learning_rate": 1.8410563620413658e-05, + "loss": 1.1818, + "step": 5703 + }, + { + "epoch": 2.123628579074124, + "grad_norm": 0.16354532539844513, + "learning_rate": 1.8409906135987668e-05, + "loss": 1.1882, + "step": 5704 + }, + { + "epoch": 2.1240008842247327, + "grad_norm": 0.1652521938085556, + "learning_rate": 1.8409248527347888e-05, + "loss": 1.1854, + "step": 5705 + }, + { + "epoch": 2.124373189375342, + "grad_norm": 0.1664263755083084, + "learning_rate": 1.840859079450404e-05, + "loss": 1.1875, + "step": 5706 + }, + { + "epoch": 2.1247454945259507, + "grad_norm": 0.16455532610416412, + "learning_rate": 1.8407932937465835e-05, + "loss": 1.1903, + "step": 5707 + }, + { + "epoch": 2.12511779967656, + "grad_norm": 0.1682821661233902, + "learning_rate": 1.8407274956242983e-05, + "loss": 1.1965, + "step": 5708 + }, + { + "epoch": 2.125490104827169, + "grad_norm": 0.16767260432243347, + "learning_rate": 1.840661685084521e-05, + "loss": 1.178, + "step": 5709 + }, + { + "epoch": 2.125862409977778, + "grad_norm": 0.16613049805164337, + "learning_rate": 1.8405958621282232e-05, + "loss": 1.172, + "step": 5710 + }, + { + "epoch": 2.126234715128387, + "grad_norm": 0.16709165275096893, + "learning_rate": 1.8405300267563774e-05, + "loss": 1.1834, + "step": 5711 + }, + { + "epoch": 2.126607020278996, + "grad_norm": 0.16999532282352448, + "learning_rate": 1.840464178969956e-05, + "loss": 1.1879, + "step": 5712 + }, + { + "epoch": 2.1269793254296054, + "grad_norm": 0.1657310575246811, + "learning_rate": 1.840398318769931e-05, + "loss": 1.1959, + "step": 5713 + }, + { + "epoch": 2.1273516305802143, + "grad_norm": 0.16610091924667358, + "learning_rate": 1.8403324461572762e-05, + "loss": 1.1929, + "step": 5714 + }, + { + "epoch": 2.1277239357308235, + "grad_norm": 0.16511550545692444, + "learning_rate": 1.8402665611329635e-05, + "loss": 1.1819, + "step": 5715 + }, + { + "epoch": 2.1280962408814323, + "grad_norm": 0.1685413420200348, + "learning_rate": 1.8402006636979667e-05, + "loss": 1.1871, + "step": 5716 + }, + { + "epoch": 2.1284685460320416, + "grad_norm": 0.1703380048274994, + "learning_rate": 1.8401347538532585e-05, + "loss": 1.1795, + "step": 5717 + }, + { + "epoch": 2.1288408511826504, + "grad_norm": 0.16510550677776337, + "learning_rate": 1.8400688315998128e-05, + "loss": 1.1991, + "step": 5718 + }, + { + "epoch": 2.1292131563332597, + "grad_norm": 0.15952372550964355, + "learning_rate": 1.8400028969386033e-05, + "loss": 1.1844, + "step": 5719 + }, + { + "epoch": 2.1295854614838685, + "grad_norm": 0.1626126617193222, + "learning_rate": 1.8399369498706034e-05, + "loss": 1.189, + "step": 5720 + }, + { + "epoch": 2.129957766634478, + "grad_norm": 0.1677938997745514, + "learning_rate": 1.8398709903967878e-05, + "loss": 1.1905, + "step": 5721 + }, + { + "epoch": 2.130330071785087, + "grad_norm": 0.16606491804122925, + "learning_rate": 1.8398050185181303e-05, + "loss": 1.1932, + "step": 5722 + }, + { + "epoch": 2.130702376935696, + "grad_norm": 0.16483409702777863, + "learning_rate": 1.8397390342356054e-05, + "loss": 1.1819, + "step": 5723 + }, + { + "epoch": 2.131074682086305, + "grad_norm": 0.15964972972869873, + "learning_rate": 1.8396730375501877e-05, + "loss": 1.1887, + "step": 5724 + }, + { + "epoch": 2.131446987236914, + "grad_norm": 0.1669287085533142, + "learning_rate": 1.8396070284628518e-05, + "loss": 1.1811, + "step": 5725 + }, + { + "epoch": 2.131819292387523, + "grad_norm": 0.16600339114665985, + "learning_rate": 1.839541006974573e-05, + "loss": 1.187, + "step": 5726 + }, + { + "epoch": 2.132191597538132, + "grad_norm": 0.16649706661701202, + "learning_rate": 1.839474973086326e-05, + "loss": 1.1863, + "step": 5727 + }, + { + "epoch": 2.1325639026887413, + "grad_norm": 0.16561785340309143, + "learning_rate": 1.839408926799086e-05, + "loss": 1.1911, + "step": 5728 + }, + { + "epoch": 2.13293620783935, + "grad_norm": 0.16608673334121704, + "learning_rate": 1.8393428681138298e-05, + "loss": 1.1734, + "step": 5729 + }, + { + "epoch": 2.1333085129899594, + "grad_norm": 0.16552896797657013, + "learning_rate": 1.8392767970315314e-05, + "loss": 1.1823, + "step": 5730 + }, + { + "epoch": 2.1336808181405686, + "grad_norm": 0.16509748995304108, + "learning_rate": 1.8392107135531674e-05, + "loss": 1.1801, + "step": 5731 + }, + { + "epoch": 2.1340531232911775, + "grad_norm": 0.16142834722995758, + "learning_rate": 1.839144617679714e-05, + "loss": 1.174, + "step": 5732 + }, + { + "epoch": 2.1344254284417867, + "grad_norm": 0.17052648961544037, + "learning_rate": 1.839078509412147e-05, + "loss": 1.1891, + "step": 5733 + }, + { + "epoch": 2.1347977335923956, + "grad_norm": 0.15803445875644684, + "learning_rate": 1.8390123887514436e-05, + "loss": 1.1954, + "step": 5734 + }, + { + "epoch": 2.135170038743005, + "grad_norm": 0.16502295434474945, + "learning_rate": 1.8389462556985793e-05, + "loss": 1.1777, + "step": 5735 + }, + { + "epoch": 2.1355423438936136, + "grad_norm": 0.1639169305562973, + "learning_rate": 1.838880110254532e-05, + "loss": 1.1857, + "step": 5736 + }, + { + "epoch": 2.135914649044223, + "grad_norm": 0.16261015832424164, + "learning_rate": 1.8388139524202776e-05, + "loss": 1.1836, + "step": 5737 + }, + { + "epoch": 2.1362869541948317, + "grad_norm": 0.1698441356420517, + "learning_rate": 1.8387477821967938e-05, + "loss": 1.1822, + "step": 5738 + }, + { + "epoch": 2.136659259345441, + "grad_norm": 0.15911483764648438, + "learning_rate": 1.8386815995850584e-05, + "loss": 1.1733, + "step": 5739 + }, + { + "epoch": 2.1370315644960503, + "grad_norm": 0.16580499708652496, + "learning_rate": 1.838615404586048e-05, + "loss": 1.1809, + "step": 5740 + }, + { + "epoch": 2.137403869646659, + "grad_norm": 0.16274124383926392, + "learning_rate": 1.8385491972007408e-05, + "loss": 1.1737, + "step": 5741 + }, + { + "epoch": 2.1377761747972683, + "grad_norm": 0.1641577035188675, + "learning_rate": 1.8384829774301145e-05, + "loss": 1.1926, + "step": 5742 + }, + { + "epoch": 2.138148479947877, + "grad_norm": 0.159241184592247, + "learning_rate": 1.8384167452751473e-05, + "loss": 1.1794, + "step": 5743 + }, + { + "epoch": 2.1385207850984864, + "grad_norm": 0.17162783443927765, + "learning_rate": 1.8383505007368175e-05, + "loss": 1.1891, + "step": 5744 + }, + { + "epoch": 2.1388930902490952, + "grad_norm": 0.17057040333747864, + "learning_rate": 1.8382842438161034e-05, + "loss": 1.1981, + "step": 5745 + }, + { + "epoch": 2.1392653953997045, + "grad_norm": 0.16203658282756805, + "learning_rate": 1.8382179745139835e-05, + "loss": 1.1763, + "step": 5746 + }, + { + "epoch": 2.1396377005503133, + "grad_norm": 0.16306722164154053, + "learning_rate": 1.838151692831437e-05, + "loss": 1.1903, + "step": 5747 + }, + { + "epoch": 2.1400100057009226, + "grad_norm": 0.16811603307724, + "learning_rate": 1.838085398769442e-05, + "loss": 1.1941, + "step": 5748 + }, + { + "epoch": 2.140382310851532, + "grad_norm": 0.1626274734735489, + "learning_rate": 1.8380190923289785e-05, + "loss": 1.1709, + "step": 5749 + }, + { + "epoch": 2.1407546160021407, + "grad_norm": 0.16186201572418213, + "learning_rate": 1.837952773511026e-05, + "loss": 1.1861, + "step": 5750 + }, + { + "epoch": 2.14112692115275, + "grad_norm": 0.1650025099515915, + "learning_rate": 1.8378864423165632e-05, + "loss": 1.18, + "step": 5751 + }, + { + "epoch": 2.1414992263033588, + "grad_norm": 0.15938428044319153, + "learning_rate": 1.8378200987465704e-05, + "loss": 1.2017, + "step": 5752 + }, + { + "epoch": 2.141871531453968, + "grad_norm": 0.16518081724643707, + "learning_rate": 1.8377537428020273e-05, + "loss": 1.1805, + "step": 5753 + }, + { + "epoch": 2.142243836604577, + "grad_norm": 0.16796717047691345, + "learning_rate": 1.837687374483914e-05, + "loss": 1.1798, + "step": 5754 + }, + { + "epoch": 2.142616141755186, + "grad_norm": 0.1624600887298584, + "learning_rate": 1.837620993793211e-05, + "loss": 1.1888, + "step": 5755 + }, + { + "epoch": 2.142988446905795, + "grad_norm": 0.1630149781703949, + "learning_rate": 1.837554600730898e-05, + "loss": 1.1848, + "step": 5756 + }, + { + "epoch": 2.143360752056404, + "grad_norm": 0.16488610208034515, + "learning_rate": 1.837488195297956e-05, + "loss": 1.1831, + "step": 5757 + }, + { + "epoch": 2.1437330572070135, + "grad_norm": 0.1614954173564911, + "learning_rate": 1.8374217774953663e-05, + "loss": 1.1871, + "step": 5758 + }, + { + "epoch": 2.1441053623576223, + "grad_norm": 0.16131506860256195, + "learning_rate": 1.8373553473241097e-05, + "loss": 1.1734, + "step": 5759 + }, + { + "epoch": 2.1444776675082315, + "grad_norm": 0.16610832512378693, + "learning_rate": 1.837288904785167e-05, + "loss": 1.1945, + "step": 5760 + }, + { + "epoch": 2.1448499726588404, + "grad_norm": 0.16947484016418457, + "learning_rate": 1.8372224498795198e-05, + "loss": 1.1775, + "step": 5761 + }, + { + "epoch": 2.1452222778094496, + "grad_norm": 0.17164815962314606, + "learning_rate": 1.8371559826081492e-05, + "loss": 1.1955, + "step": 5762 + }, + { + "epoch": 2.1455945829600584, + "grad_norm": 0.16294875741004944, + "learning_rate": 1.8370895029720374e-05, + "loss": 1.1761, + "step": 5763 + }, + { + "epoch": 2.1459668881106677, + "grad_norm": 0.16515691578388214, + "learning_rate": 1.8370230109721664e-05, + "loss": 1.1913, + "step": 5764 + }, + { + "epoch": 2.146339193261277, + "grad_norm": 0.16761860251426697, + "learning_rate": 1.836956506609518e-05, + "loss": 1.183, + "step": 5765 + }, + { + "epoch": 2.146711498411886, + "grad_norm": 0.16894914209842682, + "learning_rate": 1.8368899898850744e-05, + "loss": 1.1825, + "step": 5766 + }, + { + "epoch": 2.147083803562495, + "grad_norm": 0.159805029630661, + "learning_rate": 1.836823460799818e-05, + "loss": 1.1854, + "step": 5767 + }, + { + "epoch": 2.147456108713104, + "grad_norm": 0.16420911252498627, + "learning_rate": 1.836756919354732e-05, + "loss": 1.1941, + "step": 5768 + }, + { + "epoch": 2.147828413863713, + "grad_norm": 0.1619480699300766, + "learning_rate": 1.8366903655507987e-05, + "loss": 1.1958, + "step": 5769 + }, + { + "epoch": 2.148200719014322, + "grad_norm": 0.16295601427555084, + "learning_rate": 1.836623799389001e-05, + "loss": 1.204, + "step": 5770 + }, + { + "epoch": 2.1485730241649312, + "grad_norm": 0.1661759316921234, + "learning_rate": 1.8365572208703225e-05, + "loss": 1.2004, + "step": 5771 + }, + { + "epoch": 2.14894532931554, + "grad_norm": 0.1644112765789032, + "learning_rate": 1.8364906299957464e-05, + "loss": 1.1852, + "step": 5772 + }, + { + "epoch": 2.1493176344661493, + "grad_norm": 0.16289396584033966, + "learning_rate": 1.836424026766256e-05, + "loss": 1.1738, + "step": 5773 + }, + { + "epoch": 2.1496899396167586, + "grad_norm": 0.16191107034683228, + "learning_rate": 1.836357411182835e-05, + "loss": 1.1809, + "step": 5774 + }, + { + "epoch": 2.1500622447673674, + "grad_norm": 0.16520683467388153, + "learning_rate": 1.8362907832464678e-05, + "loss": 1.179, + "step": 5775 + }, + { + "epoch": 2.1504345499179767, + "grad_norm": 0.16534850001335144, + "learning_rate": 1.8362241429581386e-05, + "loss": 1.1973, + "step": 5776 + }, + { + "epoch": 2.1508068550685855, + "grad_norm": 0.16141214966773987, + "learning_rate": 1.8361574903188307e-05, + "loss": 1.1926, + "step": 5777 + }, + { + "epoch": 2.1511791602191948, + "grad_norm": 0.15976232290267944, + "learning_rate": 1.8360908253295293e-05, + "loss": 1.1813, + "step": 5778 + }, + { + "epoch": 2.1515514653698036, + "grad_norm": 0.16501028835773468, + "learning_rate": 1.836024147991219e-05, + "loss": 1.1883, + "step": 5779 + }, + { + "epoch": 2.151923770520413, + "grad_norm": 0.16739144921302795, + "learning_rate": 1.8359574583048846e-05, + "loss": 1.1884, + "step": 5780 + }, + { + "epoch": 2.1522960756710217, + "grad_norm": 0.1655547022819519, + "learning_rate": 1.8358907562715104e-05, + "loss": 1.2032, + "step": 5781 + }, + { + "epoch": 2.152668380821631, + "grad_norm": 0.16336333751678467, + "learning_rate": 1.835824041892083e-05, + "loss": 1.1947, + "step": 5782 + }, + { + "epoch": 2.15304068597224, + "grad_norm": 0.16573452949523926, + "learning_rate": 1.8357573151675864e-05, + "loss": 1.1751, + "step": 5783 + }, + { + "epoch": 2.153412991122849, + "grad_norm": 0.1619141399860382, + "learning_rate": 1.835690576099007e-05, + "loss": 1.1685, + "step": 5784 + }, + { + "epoch": 2.1537852962734583, + "grad_norm": 0.16914691030979156, + "learning_rate": 1.8356238246873302e-05, + "loss": 1.1747, + "step": 5785 + }, + { + "epoch": 2.154157601424067, + "grad_norm": 0.16027076542377472, + "learning_rate": 1.8355570609335416e-05, + "loss": 1.1737, + "step": 5786 + }, + { + "epoch": 2.1545299065746764, + "grad_norm": 0.15901793539524078, + "learning_rate": 1.835490284838628e-05, + "loss": 1.1896, + "step": 5787 + }, + { + "epoch": 2.154902211725285, + "grad_norm": 0.17148235440254211, + "learning_rate": 1.8354234964035754e-05, + "loss": 1.192, + "step": 5788 + }, + { + "epoch": 2.1552745168758944, + "grad_norm": 0.16743379831314087, + "learning_rate": 1.83535669562937e-05, + "loss": 1.1747, + "step": 5789 + }, + { + "epoch": 2.1556468220265037, + "grad_norm": 0.1674523502588272, + "learning_rate": 1.8352898825169986e-05, + "loss": 1.1853, + "step": 5790 + }, + { + "epoch": 2.1560191271771125, + "grad_norm": 0.15756765007972717, + "learning_rate": 1.835223057067448e-05, + "loss": 1.1702, + "step": 5791 + }, + { + "epoch": 2.156391432327722, + "grad_norm": 0.16530895233154297, + "learning_rate": 1.8351562192817054e-05, + "loss": 1.1814, + "step": 5792 + }, + { + "epoch": 2.1567637374783306, + "grad_norm": 0.1682775914669037, + "learning_rate": 1.835089369160758e-05, + "loss": 1.1956, + "step": 5793 + }, + { + "epoch": 2.15713604262894, + "grad_norm": 0.1804119050502777, + "learning_rate": 1.8350225067055927e-05, + "loss": 1.175, + "step": 5794 + }, + { + "epoch": 2.1575083477795487, + "grad_norm": 0.1660710573196411, + "learning_rate": 1.8349556319171977e-05, + "loss": 1.1795, + "step": 5795 + }, + { + "epoch": 2.157880652930158, + "grad_norm": 0.16514445841312408, + "learning_rate": 1.83488874479656e-05, + "loss": 1.1923, + "step": 5796 + }, + { + "epoch": 2.158252958080767, + "grad_norm": 0.1635090708732605, + "learning_rate": 1.8348218453446685e-05, + "loss": 1.1853, + "step": 5797 + }, + { + "epoch": 2.158625263231376, + "grad_norm": 0.17032501101493835, + "learning_rate": 1.8347549335625104e-05, + "loss": 1.1996, + "step": 5798 + }, + { + "epoch": 2.1589975683819853, + "grad_norm": 0.17020241916179657, + "learning_rate": 1.834688009451074e-05, + "loss": 1.1841, + "step": 5799 + }, + { + "epoch": 2.159369873532594, + "grad_norm": 0.1642071157693863, + "learning_rate": 1.8346210730113484e-05, + "loss": 1.195, + "step": 5800 + }, + { + "epoch": 2.1597421786832034, + "grad_norm": 0.16940705478191376, + "learning_rate": 1.8345541242443223e-05, + "loss": 1.1909, + "step": 5801 + }, + { + "epoch": 2.160114483833812, + "grad_norm": 0.15778762102127075, + "learning_rate": 1.8344871631509837e-05, + "loss": 1.1872, + "step": 5802 + }, + { + "epoch": 2.1604867889844215, + "grad_norm": 0.1702834963798523, + "learning_rate": 1.834420189732322e-05, + "loss": 1.1847, + "step": 5803 + }, + { + "epoch": 2.1608590941350303, + "grad_norm": 0.16944074630737305, + "learning_rate": 1.834353203989327e-05, + "loss": 1.1858, + "step": 5804 + }, + { + "epoch": 2.1612313992856396, + "grad_norm": 0.17070455849170685, + "learning_rate": 1.834286205922987e-05, + "loss": 1.1943, + "step": 5805 + }, + { + "epoch": 2.1616037044362484, + "grad_norm": 0.16586799919605255, + "learning_rate": 1.8342191955342926e-05, + "loss": 1.1877, + "step": 5806 + }, + { + "epoch": 2.1619760095868576, + "grad_norm": 0.17121802270412445, + "learning_rate": 1.8341521728242324e-05, + "loss": 1.1779, + "step": 5807 + }, + { + "epoch": 2.162348314737467, + "grad_norm": 0.16471904516220093, + "learning_rate": 1.8340851377937975e-05, + "loss": 1.1963, + "step": 5808 + }, + { + "epoch": 2.1627206198880757, + "grad_norm": 0.16698746383190155, + "learning_rate": 1.834018090443977e-05, + "loss": 1.1695, + "step": 5809 + }, + { + "epoch": 2.163092925038685, + "grad_norm": 0.16752393543720245, + "learning_rate": 1.833951030775762e-05, + "loss": 1.1872, + "step": 5810 + }, + { + "epoch": 2.163465230189294, + "grad_norm": 0.16329661011695862, + "learning_rate": 1.8338839587901426e-05, + "loss": 1.1855, + "step": 5811 + }, + { + "epoch": 2.163837535339903, + "grad_norm": 0.16880545020103455, + "learning_rate": 1.833816874488109e-05, + "loss": 1.1856, + "step": 5812 + }, + { + "epoch": 2.164209840490512, + "grad_norm": 0.16855594515800476, + "learning_rate": 1.833749777870653e-05, + "loss": 1.1756, + "step": 5813 + }, + { + "epoch": 2.164582145641121, + "grad_norm": 0.16726450622081757, + "learning_rate": 1.833682668938765e-05, + "loss": 1.1606, + "step": 5814 + }, + { + "epoch": 2.16495445079173, + "grad_norm": 0.16840289533138275, + "learning_rate": 1.8336155476934365e-05, + "loss": 1.1811, + "step": 5815 + }, + { + "epoch": 2.1653267559423393, + "grad_norm": 0.16863827407360077, + "learning_rate": 1.8335484141356582e-05, + "loss": 1.1907, + "step": 5816 + }, + { + "epoch": 2.1656990610929485, + "grad_norm": 0.16578540205955505, + "learning_rate": 1.8334812682664224e-05, + "loss": 1.1792, + "step": 5817 + }, + { + "epoch": 2.1660713662435573, + "grad_norm": 0.16219614446163177, + "learning_rate": 1.8334141100867208e-05, + "loss": 1.1759, + "step": 5818 + }, + { + "epoch": 2.1664436713941666, + "grad_norm": 0.16222938895225525, + "learning_rate": 1.8333469395975446e-05, + "loss": 1.1734, + "step": 5819 + }, + { + "epoch": 2.1668159765447754, + "grad_norm": 0.17056572437286377, + "learning_rate": 1.8332797567998865e-05, + "loss": 1.1854, + "step": 5820 + }, + { + "epoch": 2.1671882816953847, + "grad_norm": 0.1574883610010147, + "learning_rate": 1.833212561694739e-05, + "loss": 1.1951, + "step": 5821 + }, + { + "epoch": 2.1675605868459935, + "grad_norm": 0.1614379733800888, + "learning_rate": 1.833145354283094e-05, + "loss": 1.1806, + "step": 5822 + }, + { + "epoch": 2.1679328919966028, + "grad_norm": 0.17167092859745026, + "learning_rate": 1.8330781345659447e-05, + "loss": 1.1928, + "step": 5823 + }, + { + "epoch": 2.1683051971472116, + "grad_norm": 0.17068855464458466, + "learning_rate": 1.8330109025442834e-05, + "loss": 1.184, + "step": 5824 + }, + { + "epoch": 2.168677502297821, + "grad_norm": 0.1654396653175354, + "learning_rate": 1.832943658219103e-05, + "loss": 1.1839, + "step": 5825 + }, + { + "epoch": 2.16904980744843, + "grad_norm": 0.16501565277576447, + "learning_rate": 1.832876401591398e-05, + "loss": 1.1828, + "step": 5826 + }, + { + "epoch": 2.169422112599039, + "grad_norm": 0.16627371311187744, + "learning_rate": 1.8328091326621597e-05, + "loss": 1.1904, + "step": 5827 + }, + { + "epoch": 2.169794417749648, + "grad_norm": 0.15978330373764038, + "learning_rate": 1.8327418514323833e-05, + "loss": 1.1706, + "step": 5828 + }, + { + "epoch": 2.170166722900257, + "grad_norm": 0.16594457626342773, + "learning_rate": 1.8326745579030623e-05, + "loss": 1.2023, + "step": 5829 + }, + { + "epoch": 2.1705390280508663, + "grad_norm": 0.17066100239753723, + "learning_rate": 1.83260725207519e-05, + "loss": 1.1809, + "step": 5830 + }, + { + "epoch": 2.170911333201475, + "grad_norm": 0.16543124616146088, + "learning_rate": 1.8325399339497608e-05, + "loss": 1.1749, + "step": 5831 + }, + { + "epoch": 2.1712836383520844, + "grad_norm": 0.16617171466350555, + "learning_rate": 1.8324726035277694e-05, + "loss": 1.1901, + "step": 5832 + }, + { + "epoch": 2.171655943502693, + "grad_norm": 0.17173443734645844, + "learning_rate": 1.8324052608102095e-05, + "loss": 1.1847, + "step": 5833 + }, + { + "epoch": 2.1720282486533025, + "grad_norm": 0.1652347892522812, + "learning_rate": 1.832337905798076e-05, + "loss": 1.1766, + "step": 5834 + }, + { + "epoch": 2.1724005538039117, + "grad_norm": 0.16114427149295807, + "learning_rate": 1.8322705384923644e-05, + "loss": 1.1835, + "step": 5835 + }, + { + "epoch": 2.1727728589545205, + "grad_norm": 0.17017604410648346, + "learning_rate": 1.8322031588940687e-05, + "loss": 1.1778, + "step": 5836 + }, + { + "epoch": 2.17314516410513, + "grad_norm": 0.16408562660217285, + "learning_rate": 1.8321357670041848e-05, + "loss": 1.1863, + "step": 5837 + }, + { + "epoch": 2.1735174692557386, + "grad_norm": 0.1657814234495163, + "learning_rate": 1.832068362823708e-05, + "loss": 1.1847, + "step": 5838 + }, + { + "epoch": 2.173889774406348, + "grad_norm": 0.15966399013996124, + "learning_rate": 1.8320009463536333e-05, + "loss": 1.1818, + "step": 5839 + }, + { + "epoch": 2.1742620795569567, + "grad_norm": 0.15668559074401855, + "learning_rate": 1.831933517594957e-05, + "loss": 1.1798, + "step": 5840 + }, + { + "epoch": 2.174634384707566, + "grad_norm": 0.1669110357761383, + "learning_rate": 1.831866076548675e-05, + "loss": 1.2053, + "step": 5841 + }, + { + "epoch": 2.175006689858175, + "grad_norm": 0.15878044068813324, + "learning_rate": 1.831798623215783e-05, + "loss": 1.1697, + "step": 5842 + }, + { + "epoch": 2.175378995008784, + "grad_norm": 0.16765941679477692, + "learning_rate": 1.8317311575972777e-05, + "loss": 1.1806, + "step": 5843 + }, + { + "epoch": 2.1757513001593933, + "grad_norm": 0.1708860546350479, + "learning_rate": 1.8316636796941555e-05, + "loss": 1.2002, + "step": 5844 + }, + { + "epoch": 2.176123605310002, + "grad_norm": 0.16999541223049164, + "learning_rate": 1.8315961895074127e-05, + "loss": 1.1778, + "step": 5845 + }, + { + "epoch": 2.1764959104606114, + "grad_norm": 0.1659235954284668, + "learning_rate": 1.8315286870380468e-05, + "loss": 1.1821, + "step": 5846 + }, + { + "epoch": 2.1768682156112202, + "grad_norm": 0.15911462903022766, + "learning_rate": 1.831461172287054e-05, + "loss": 1.197, + "step": 5847 + }, + { + "epoch": 2.1772405207618295, + "grad_norm": 0.15904010832309723, + "learning_rate": 1.8313936452554318e-05, + "loss": 1.1777, + "step": 5848 + }, + { + "epoch": 2.1776128259124383, + "grad_norm": 0.1634838581085205, + "learning_rate": 1.8313261059441777e-05, + "loss": 1.1752, + "step": 5849 + }, + { + "epoch": 2.1779851310630476, + "grad_norm": 0.16560928523540497, + "learning_rate": 1.8312585543542893e-05, + "loss": 1.1859, + "step": 5850 + }, + { + "epoch": 2.1783574362136564, + "grad_norm": 0.16026043891906738, + "learning_rate": 1.8311909904867643e-05, + "loss": 1.1715, + "step": 5851 + }, + { + "epoch": 2.1787297413642657, + "grad_norm": 0.1598140001296997, + "learning_rate": 1.8311234143426003e-05, + "loss": 1.1755, + "step": 5852 + }, + { + "epoch": 2.179102046514875, + "grad_norm": 0.1634136587381363, + "learning_rate": 1.831055825922796e-05, + "loss": 1.1806, + "step": 5853 + }, + { + "epoch": 2.1794743516654838, + "grad_norm": 0.16830392181873322, + "learning_rate": 1.830988225228349e-05, + "loss": 1.1638, + "step": 5854 + }, + { + "epoch": 2.179846656816093, + "grad_norm": 0.16452953219413757, + "learning_rate": 1.8309206122602582e-05, + "loss": 1.184, + "step": 5855 + }, + { + "epoch": 2.180218961966702, + "grad_norm": 0.16308681666851044, + "learning_rate": 1.830852987019522e-05, + "loss": 1.1898, + "step": 5856 + }, + { + "epoch": 2.180591267117311, + "grad_norm": 0.1646089106798172, + "learning_rate": 1.8307853495071394e-05, + "loss": 1.1717, + "step": 5857 + }, + { + "epoch": 2.18096357226792, + "grad_norm": 0.16182439029216766, + "learning_rate": 1.8307176997241092e-05, + "loss": 1.1833, + "step": 5858 + }, + { + "epoch": 2.181335877418529, + "grad_norm": 0.16360101103782654, + "learning_rate": 1.8306500376714307e-05, + "loss": 1.1876, + "step": 5859 + }, + { + "epoch": 2.181708182569138, + "grad_norm": 0.1654779016971588, + "learning_rate": 1.8305823633501033e-05, + "loss": 1.1686, + "step": 5860 + }, + { + "epoch": 2.1820804877197473, + "grad_norm": 0.16477808356285095, + "learning_rate": 1.8305146767611267e-05, + "loss": 1.1878, + "step": 5861 + }, + { + "epoch": 2.1824527928703565, + "grad_norm": 0.16140249371528625, + "learning_rate": 1.8304469779055e-05, + "loss": 1.1852, + "step": 5862 + }, + { + "epoch": 2.1828250980209654, + "grad_norm": 0.1675121784210205, + "learning_rate": 1.830379266784224e-05, + "loss": 1.1675, + "step": 5863 + }, + { + "epoch": 2.1831974031715746, + "grad_norm": 0.1668505072593689, + "learning_rate": 1.8303115433982984e-05, + "loss": 1.1938, + "step": 5864 + }, + { + "epoch": 2.1835697083221834, + "grad_norm": 0.16749152541160583, + "learning_rate": 1.830243807748723e-05, + "loss": 1.1722, + "step": 5865 + }, + { + "epoch": 2.1839420134727927, + "grad_norm": 0.16352511942386627, + "learning_rate": 1.830176059836499e-05, + "loss": 1.1873, + "step": 5866 + }, + { + "epoch": 2.1843143186234015, + "grad_norm": 0.16892147064208984, + "learning_rate": 1.8301082996626268e-05, + "loss": 1.1968, + "step": 5867 + }, + { + "epoch": 2.184686623774011, + "grad_norm": 0.1627473384141922, + "learning_rate": 1.830040527228107e-05, + "loss": 1.185, + "step": 5868 + }, + { + "epoch": 2.1850589289246196, + "grad_norm": 0.16038550436496735, + "learning_rate": 1.8299727425339405e-05, + "loss": 1.1742, + "step": 5869 + }, + { + "epoch": 2.185431234075229, + "grad_norm": 0.1608399599790573, + "learning_rate": 1.8299049455811285e-05, + "loss": 1.1837, + "step": 5870 + }, + { + "epoch": 2.185803539225838, + "grad_norm": 0.16327451169490814, + "learning_rate": 1.829837136370673e-05, + "loss": 1.1792, + "step": 5871 + }, + { + "epoch": 2.186175844376447, + "grad_norm": 0.16703878343105316, + "learning_rate": 1.829769314903575e-05, + "loss": 1.2048, + "step": 5872 + }, + { + "epoch": 2.1865481495270562, + "grad_norm": 0.16750751435756683, + "learning_rate": 1.8297014811808363e-05, + "loss": 1.1813, + "step": 5873 + }, + { + "epoch": 2.186920454677665, + "grad_norm": 0.16518592834472656, + "learning_rate": 1.8296336352034585e-05, + "loss": 1.1782, + "step": 5874 + }, + { + "epoch": 2.1872927598282743, + "grad_norm": 0.1652197390794754, + "learning_rate": 1.829565776972444e-05, + "loss": 1.1771, + "step": 5875 + }, + { + "epoch": 2.187665064978883, + "grad_norm": 0.16577941179275513, + "learning_rate": 1.829497906488795e-05, + "loss": 1.1896, + "step": 5876 + }, + { + "epoch": 2.1880373701294924, + "grad_norm": 0.16348230838775635, + "learning_rate": 1.8294300237535145e-05, + "loss": 1.185, + "step": 5877 + }, + { + "epoch": 2.188409675280101, + "grad_norm": 0.16359402239322662, + "learning_rate": 1.8293621287676043e-05, + "loss": 1.1909, + "step": 5878 + }, + { + "epoch": 2.1887819804307105, + "grad_norm": 0.16968275606632233, + "learning_rate": 1.8292942215320675e-05, + "loss": 1.1827, + "step": 5879 + }, + { + "epoch": 2.1891542855813197, + "grad_norm": 0.16383975744247437, + "learning_rate": 1.829226302047907e-05, + "loss": 1.1892, + "step": 5880 + }, + { + "epoch": 2.1895265907319286, + "grad_norm": 0.16417528688907623, + "learning_rate": 1.8291583703161263e-05, + "loss": 1.1879, + "step": 5881 + }, + { + "epoch": 2.189898895882538, + "grad_norm": 0.16152073442935944, + "learning_rate": 1.8290904263377284e-05, + "loss": 1.1755, + "step": 5882 + }, + { + "epoch": 2.1902712010331467, + "grad_norm": 0.16532781720161438, + "learning_rate": 1.8290224701137165e-05, + "loss": 1.1987, + "step": 5883 + }, + { + "epoch": 2.190643506183756, + "grad_norm": 0.1655869036912918, + "learning_rate": 1.8289545016450953e-05, + "loss": 1.187, + "step": 5884 + }, + { + "epoch": 2.1910158113343647, + "grad_norm": 0.16325949132442474, + "learning_rate": 1.828886520932868e-05, + "loss": 1.1926, + "step": 5885 + }, + { + "epoch": 2.191388116484974, + "grad_norm": 0.1616744101047516, + "learning_rate": 1.8288185279780388e-05, + "loss": 1.1851, + "step": 5886 + }, + { + "epoch": 2.1917604216355833, + "grad_norm": 0.15696094930171967, + "learning_rate": 1.828750522781612e-05, + "loss": 1.1899, + "step": 5887 + }, + { + "epoch": 2.192132726786192, + "grad_norm": 0.15940669178962708, + "learning_rate": 1.8286825053445916e-05, + "loss": 1.1872, + "step": 5888 + }, + { + "epoch": 2.1925050319368014, + "grad_norm": 0.16422483325004578, + "learning_rate": 1.828614475667983e-05, + "loss": 1.1823, + "step": 5889 + }, + { + "epoch": 2.19287733708741, + "grad_norm": 0.16130252182483673, + "learning_rate": 1.8285464337527906e-05, + "loss": 1.1784, + "step": 5890 + }, + { + "epoch": 2.1932496422380194, + "grad_norm": 0.1615862399339676, + "learning_rate": 1.8284783796000193e-05, + "loss": 1.185, + "step": 5891 + }, + { + "epoch": 2.1936219473886283, + "grad_norm": 0.15964102745056152, + "learning_rate": 1.8284103132106743e-05, + "loss": 1.1796, + "step": 5892 + }, + { + "epoch": 2.1939942525392375, + "grad_norm": 0.15634554624557495, + "learning_rate": 1.828342234585761e-05, + "loss": 1.1821, + "step": 5893 + }, + { + "epoch": 2.1943665576898463, + "grad_norm": 0.1648419350385666, + "learning_rate": 1.828274143726285e-05, + "loss": 1.1926, + "step": 5894 + }, + { + "epoch": 2.1947388628404556, + "grad_norm": 0.1677112877368927, + "learning_rate": 1.8282060406332513e-05, + "loss": 1.1815, + "step": 5895 + }, + { + "epoch": 2.195111167991065, + "grad_norm": 0.16209574043750763, + "learning_rate": 1.828137925307667e-05, + "loss": 1.1888, + "step": 5896 + }, + { + "epoch": 2.1954834731416737, + "grad_norm": 0.16768576204776764, + "learning_rate": 1.828069797750537e-05, + "loss": 1.1844, + "step": 5897 + }, + { + "epoch": 2.195855778292283, + "grad_norm": 0.15827785432338715, + "learning_rate": 1.8280016579628686e-05, + "loss": 1.1645, + "step": 5898 + }, + { + "epoch": 2.1962280834428918, + "grad_norm": 0.17181847989559174, + "learning_rate": 1.8279335059456673e-05, + "loss": 1.1777, + "step": 5899 + }, + { + "epoch": 2.196600388593501, + "grad_norm": 0.16436168551445007, + "learning_rate": 1.8278653416999402e-05, + "loss": 1.1708, + "step": 5900 + }, + { + "epoch": 2.19697269374411, + "grad_norm": 0.16275320947170258, + "learning_rate": 1.827797165226694e-05, + "loss": 1.1883, + "step": 5901 + }, + { + "epoch": 2.197344998894719, + "grad_norm": 0.15981367230415344, + "learning_rate": 1.8277289765269353e-05, + "loss": 1.1709, + "step": 5902 + }, + { + "epoch": 2.197717304045328, + "grad_norm": 0.17126475274562836, + "learning_rate": 1.8276607756016722e-05, + "loss": 1.1784, + "step": 5903 + }, + { + "epoch": 2.198089609195937, + "grad_norm": 0.16240163147449493, + "learning_rate": 1.827592562451911e-05, + "loss": 1.1772, + "step": 5904 + }, + { + "epoch": 2.1984619143465465, + "grad_norm": 0.16826151311397552, + "learning_rate": 1.8275243370786594e-05, + "loss": 1.1728, + "step": 5905 + }, + { + "epoch": 2.1988342194971553, + "grad_norm": 0.1712343990802765, + "learning_rate": 1.8274560994829256e-05, + "loss": 1.1836, + "step": 5906 + }, + { + "epoch": 2.1992065246477646, + "grad_norm": 0.166199192404747, + "learning_rate": 1.827387849665717e-05, + "loss": 1.1996, + "step": 5907 + }, + { + "epoch": 2.1995788297983734, + "grad_norm": 0.16163288056850433, + "learning_rate": 1.827319587628042e-05, + "loss": 1.1775, + "step": 5908 + }, + { + "epoch": 2.1999511349489826, + "grad_norm": 0.171270489692688, + "learning_rate": 1.827251313370908e-05, + "loss": 1.1866, + "step": 5909 + }, + { + "epoch": 2.2003234400995915, + "grad_norm": 0.1645098328590393, + "learning_rate": 1.8271830268953248e-05, + "loss": 1.1814, + "step": 5910 + }, + { + "epoch": 2.2006957452502007, + "grad_norm": 0.1669131964445114, + "learning_rate": 1.8271147282022998e-05, + "loss": 1.1928, + "step": 5911 + }, + { + "epoch": 2.20106805040081, + "grad_norm": 0.17018841207027435, + "learning_rate": 1.8270464172928423e-05, + "loss": 1.1825, + "step": 5912 + }, + { + "epoch": 2.201440355551419, + "grad_norm": 0.167917862534523, + "learning_rate": 1.826978094167961e-05, + "loss": 1.1882, + "step": 5913 + }, + { + "epoch": 2.201812660702028, + "grad_norm": 0.16571766138076782, + "learning_rate": 1.826909758828665e-05, + "loss": 1.1726, + "step": 5914 + }, + { + "epoch": 2.202184965852637, + "grad_norm": 0.16511200368404388, + "learning_rate": 1.826841411275964e-05, + "loss": 1.1786, + "step": 5915 + }, + { + "epoch": 2.202557271003246, + "grad_norm": 0.16730502247810364, + "learning_rate": 1.8267730515108674e-05, + "loss": 1.1868, + "step": 5916 + }, + { + "epoch": 2.202929576153855, + "grad_norm": 0.16795329749584198, + "learning_rate": 1.8267046795343845e-05, + "loss": 1.1757, + "step": 5917 + }, + { + "epoch": 2.2033018813044642, + "grad_norm": 0.17142654955387115, + "learning_rate": 1.8266362953475252e-05, + "loss": 1.1918, + "step": 5918 + }, + { + "epoch": 2.203674186455073, + "grad_norm": 0.1681024432182312, + "learning_rate": 1.8265678989513e-05, + "loss": 1.1676, + "step": 5919 + }, + { + "epoch": 2.2040464916056823, + "grad_norm": 0.1688128411769867, + "learning_rate": 1.8264994903467187e-05, + "loss": 1.1888, + "step": 5920 + }, + { + "epoch": 2.2044187967562916, + "grad_norm": 0.16253378987312317, + "learning_rate": 1.8264310695347918e-05, + "loss": 1.1793, + "step": 5921 + }, + { + "epoch": 2.2047911019069004, + "grad_norm": 0.16472946107387543, + "learning_rate": 1.8263626365165296e-05, + "loss": 1.1855, + "step": 5922 + }, + { + "epoch": 2.2051634070575097, + "grad_norm": 0.1619625687599182, + "learning_rate": 1.8262941912929434e-05, + "loss": 1.1799, + "step": 5923 + }, + { + "epoch": 2.2055357122081185, + "grad_norm": 0.16878865659236908, + "learning_rate": 1.826225733865044e-05, + "loss": 1.1956, + "step": 5924 + }, + { + "epoch": 2.2059080173587278, + "grad_norm": 0.16465233266353607, + "learning_rate": 1.8261572642338418e-05, + "loss": 1.1848, + "step": 5925 + }, + { + "epoch": 2.2062803225093366, + "grad_norm": 0.1674007624387741, + "learning_rate": 1.826088782400349e-05, + "loss": 1.1781, + "step": 5926 + }, + { + "epoch": 2.206652627659946, + "grad_norm": 0.16674551367759705, + "learning_rate": 1.8260202883655773e-05, + "loss": 1.1907, + "step": 5927 + }, + { + "epoch": 2.2070249328105547, + "grad_norm": 0.16409821808338165, + "learning_rate": 1.825951782130537e-05, + "loss": 1.1699, + "step": 5928 + }, + { + "epoch": 2.207397237961164, + "grad_norm": 0.16593073308467865, + "learning_rate": 1.825883263696241e-05, + "loss": 1.1953, + "step": 5929 + }, + { + "epoch": 2.207769543111773, + "grad_norm": 0.16026705503463745, + "learning_rate": 1.825814733063701e-05, + "loss": 1.1621, + "step": 5930 + }, + { + "epoch": 2.208141848262382, + "grad_norm": 0.16227182745933533, + "learning_rate": 1.825746190233929e-05, + "loss": 1.1973, + "step": 5931 + }, + { + "epoch": 2.2085141534129913, + "grad_norm": 0.16535073518753052, + "learning_rate": 1.8256776352079377e-05, + "loss": 1.1731, + "step": 5932 + }, + { + "epoch": 2.2088864585636, + "grad_norm": 0.16271446645259857, + "learning_rate": 1.82560906798674e-05, + "loss": 1.1718, + "step": 5933 + }, + { + "epoch": 2.2092587637142094, + "grad_norm": 0.16261382400989532, + "learning_rate": 1.8255404885713478e-05, + "loss": 1.1795, + "step": 5934 + }, + { + "epoch": 2.209631068864818, + "grad_norm": 0.16760127246379852, + "learning_rate": 1.825471896962774e-05, + "loss": 1.1688, + "step": 5935 + }, + { + "epoch": 2.2100033740154275, + "grad_norm": 0.16549386084079742, + "learning_rate": 1.8254032931620326e-05, + "loss": 1.1906, + "step": 5936 + }, + { + "epoch": 2.2103756791660363, + "grad_norm": 0.16376779973506927, + "learning_rate": 1.8253346771701363e-05, + "loss": 1.1764, + "step": 5937 + }, + { + "epoch": 2.2107479843166455, + "grad_norm": 0.1606384664773941, + "learning_rate": 1.8252660489880986e-05, + "loss": 1.1735, + "step": 5938 + }, + { + "epoch": 2.211120289467255, + "grad_norm": 0.16291670501232147, + "learning_rate": 1.8251974086169332e-05, + "loss": 1.178, + "step": 5939 + }, + { + "epoch": 2.2114925946178636, + "grad_norm": 0.16411124169826508, + "learning_rate": 1.8251287560576535e-05, + "loss": 1.1845, + "step": 5940 + }, + { + "epoch": 2.211864899768473, + "grad_norm": 0.15855379402637482, + "learning_rate": 1.8250600913112743e-05, + "loss": 1.1883, + "step": 5941 + }, + { + "epoch": 2.2122372049190817, + "grad_norm": 0.15984077751636505, + "learning_rate": 1.824991414378809e-05, + "loss": 1.1848, + "step": 5942 + }, + { + "epoch": 2.212609510069691, + "grad_norm": 0.16627465188503265, + "learning_rate": 1.8249227252612725e-05, + "loss": 1.1914, + "step": 5943 + }, + { + "epoch": 2.2129818152203, + "grad_norm": 0.1621500700712204, + "learning_rate": 1.824854023959679e-05, + "loss": 1.1999, + "step": 5944 + }, + { + "epoch": 2.213354120370909, + "grad_norm": 0.1592169553041458, + "learning_rate": 1.8247853104750433e-05, + "loss": 1.1838, + "step": 5945 + }, + { + "epoch": 2.213726425521518, + "grad_norm": 0.16420015692710876, + "learning_rate": 1.8247165848083805e-05, + "loss": 1.1777, + "step": 5946 + }, + { + "epoch": 2.214098730672127, + "grad_norm": 0.16197781264781952, + "learning_rate": 1.8246478469607055e-05, + "loss": 1.1815, + "step": 5947 + }, + { + "epoch": 2.2144710358227364, + "grad_norm": 0.1656760275363922, + "learning_rate": 1.8245790969330336e-05, + "loss": 1.185, + "step": 5948 + }, + { + "epoch": 2.2148433409733452, + "grad_norm": 0.16433504223823547, + "learning_rate": 1.82451033472638e-05, + "loss": 1.1878, + "step": 5949 + }, + { + "epoch": 2.2152156461239545, + "grad_norm": 0.16674500703811646, + "learning_rate": 1.8244415603417603e-05, + "loss": 1.1901, + "step": 5950 + }, + { + "epoch": 2.2155879512745633, + "grad_norm": 0.16727973520755768, + "learning_rate": 1.824372773780191e-05, + "loss": 1.1683, + "step": 5951 + }, + { + "epoch": 2.2159602564251726, + "grad_norm": 0.16087013483047485, + "learning_rate": 1.8243039750426872e-05, + "loss": 1.1982, + "step": 5952 + }, + { + "epoch": 2.2163325615757814, + "grad_norm": 0.1639922559261322, + "learning_rate": 1.8242351641302657e-05, + "loss": 1.1971, + "step": 5953 + }, + { + "epoch": 2.2167048667263907, + "grad_norm": 0.16302375495433807, + "learning_rate": 1.8241663410439424e-05, + "loss": 1.1748, + "step": 5954 + }, + { + "epoch": 2.2170771718769995, + "grad_norm": 0.16024988889694214, + "learning_rate": 1.8240975057847338e-05, + "loss": 1.1837, + "step": 5955 + }, + { + "epoch": 2.2174494770276088, + "grad_norm": 0.16668042540550232, + "learning_rate": 1.824028658353657e-05, + "loss": 1.1922, + "step": 5956 + }, + { + "epoch": 2.217821782178218, + "grad_norm": 0.1660262942314148, + "learning_rate": 1.8239597987517284e-05, + "loss": 1.1947, + "step": 5957 + }, + { + "epoch": 2.218194087328827, + "grad_norm": 0.15916001796722412, + "learning_rate": 1.8238909269799655e-05, + "loss": 1.1774, + "step": 5958 + }, + { + "epoch": 2.218566392479436, + "grad_norm": 0.16157934069633484, + "learning_rate": 1.8238220430393855e-05, + "loss": 1.1705, + "step": 5959 + }, + { + "epoch": 2.218938697630045, + "grad_norm": 0.17225240170955658, + "learning_rate": 1.8237531469310054e-05, + "loss": 1.1772, + "step": 5960 + }, + { + "epoch": 2.219311002780654, + "grad_norm": 0.16769270598888397, + "learning_rate": 1.823684238655843e-05, + "loss": 1.1843, + "step": 5961 + }, + { + "epoch": 2.219683307931263, + "grad_norm": 0.18250252306461334, + "learning_rate": 1.8236153182149158e-05, + "loss": 1.1767, + "step": 5962 + }, + { + "epoch": 2.2200556130818723, + "grad_norm": 0.16508784890174866, + "learning_rate": 1.8235463856092423e-05, + "loss": 1.178, + "step": 5963 + }, + { + "epoch": 2.220427918232481, + "grad_norm": 0.16602812707424164, + "learning_rate": 1.8234774408398405e-05, + "loss": 1.1758, + "step": 5964 + }, + { + "epoch": 2.2208002233830904, + "grad_norm": 0.16944126784801483, + "learning_rate": 1.8234084839077283e-05, + "loss": 1.1842, + "step": 5965 + }, + { + "epoch": 2.2211725285336996, + "grad_norm": 0.1692361980676651, + "learning_rate": 1.8233395148139246e-05, + "loss": 1.2078, + "step": 5966 + }, + { + "epoch": 2.2215448336843084, + "grad_norm": 0.175007626414299, + "learning_rate": 1.823270533559448e-05, + "loss": 1.1778, + "step": 5967 + }, + { + "epoch": 2.2219171388349177, + "grad_norm": 0.16647957265377045, + "learning_rate": 1.823201540145317e-05, + "loss": 1.1834, + "step": 5968 + }, + { + "epoch": 2.2222894439855265, + "grad_norm": 0.16229864954948425, + "learning_rate": 1.8231325345725514e-05, + "loss": 1.1748, + "step": 5969 + }, + { + "epoch": 2.222661749136136, + "grad_norm": 0.16611957550048828, + "learning_rate": 1.8230635168421694e-05, + "loss": 1.1781, + "step": 5970 + }, + { + "epoch": 2.2230340542867446, + "grad_norm": 0.17674987018108368, + "learning_rate": 1.8229944869551915e-05, + "loss": 1.1974, + "step": 5971 + }, + { + "epoch": 2.223406359437354, + "grad_norm": 0.16097386181354523, + "learning_rate": 1.8229254449126365e-05, + "loss": 1.1807, + "step": 5972 + }, + { + "epoch": 2.2237786645879627, + "grad_norm": 0.16559158265590668, + "learning_rate": 1.822856390715524e-05, + "loss": 1.1776, + "step": 5973 + }, + { + "epoch": 2.224150969738572, + "grad_norm": 0.16879995167255402, + "learning_rate": 1.8227873243648748e-05, + "loss": 1.1829, + "step": 5974 + }, + { + "epoch": 2.2245232748891812, + "grad_norm": 0.1685383915901184, + "learning_rate": 1.8227182458617076e-05, + "loss": 1.1909, + "step": 5975 + }, + { + "epoch": 2.22489558003979, + "grad_norm": 0.16316437721252441, + "learning_rate": 1.822649155207044e-05, + "loss": 1.1669, + "step": 5976 + }, + { + "epoch": 2.2252678851903993, + "grad_norm": 0.18334704637527466, + "learning_rate": 1.822580052401904e-05, + "loss": 1.1755, + "step": 5977 + }, + { + "epoch": 2.225640190341008, + "grad_norm": 0.16785620152950287, + "learning_rate": 1.8225109374473087e-05, + "loss": 1.1818, + "step": 5978 + }, + { + "epoch": 2.2260124954916174, + "grad_norm": 0.16493113338947296, + "learning_rate": 1.822441810344278e-05, + "loss": 1.2018, + "step": 5979 + }, + { + "epoch": 2.226384800642226, + "grad_norm": 0.16159646213054657, + "learning_rate": 1.822372671093833e-05, + "loss": 1.1709, + "step": 5980 + }, + { + "epoch": 2.2267571057928355, + "grad_norm": 0.16819043457508087, + "learning_rate": 1.822303519696996e-05, + "loss": 1.1913, + "step": 5981 + }, + { + "epoch": 2.2271294109434443, + "grad_norm": 0.16642168164253235, + "learning_rate": 1.8222343561547876e-05, + "loss": 1.1787, + "step": 5982 + }, + { + "epoch": 2.2275017160940536, + "grad_norm": 0.16357922554016113, + "learning_rate": 1.8221651804682287e-05, + "loss": 1.1764, + "step": 5983 + }, + { + "epoch": 2.227874021244663, + "grad_norm": 0.16216666996479034, + "learning_rate": 1.8220959926383422e-05, + "loss": 1.18, + "step": 5984 + }, + { + "epoch": 2.2282463263952716, + "grad_norm": 0.16932249069213867, + "learning_rate": 1.8220267926661494e-05, + "loss": 1.1882, + "step": 5985 + }, + { + "epoch": 2.228618631545881, + "grad_norm": 0.16690555214881897, + "learning_rate": 1.8219575805526723e-05, + "loss": 1.1948, + "step": 5986 + }, + { + "epoch": 2.2289909366964897, + "grad_norm": 0.16043587028980255, + "learning_rate": 1.8218883562989335e-05, + "loss": 1.1779, + "step": 5987 + }, + { + "epoch": 2.229363241847099, + "grad_norm": 0.16727250814437866, + "learning_rate": 1.8218191199059553e-05, + "loss": 1.1728, + "step": 5988 + }, + { + "epoch": 2.229735546997708, + "grad_norm": 0.17071470618247986, + "learning_rate": 1.82174987137476e-05, + "loss": 1.1852, + "step": 5989 + }, + { + "epoch": 2.230107852148317, + "grad_norm": 0.17446862161159515, + "learning_rate": 1.8216806107063705e-05, + "loss": 1.1967, + "step": 5990 + }, + { + "epoch": 2.230480157298926, + "grad_norm": 0.16440711915493011, + "learning_rate": 1.8216113379018105e-05, + "loss": 1.1864, + "step": 5991 + }, + { + "epoch": 2.230852462449535, + "grad_norm": 0.16457001864910126, + "learning_rate": 1.8215420529621025e-05, + "loss": 1.1746, + "step": 5992 + }, + { + "epoch": 2.2312247676001444, + "grad_norm": 0.16013966500759125, + "learning_rate": 1.82147275588827e-05, + "loss": 1.1715, + "step": 5993 + }, + { + "epoch": 2.2315970727507533, + "grad_norm": 0.1729276180267334, + "learning_rate": 1.821403446681336e-05, + "loss": 1.2026, + "step": 5994 + }, + { + "epoch": 2.2319693779013625, + "grad_norm": 0.16060994565486908, + "learning_rate": 1.8213341253423248e-05, + "loss": 1.1569, + "step": 5995 + }, + { + "epoch": 2.2323416830519713, + "grad_norm": 0.16661281883716583, + "learning_rate": 1.8212647918722605e-05, + "loss": 1.1791, + "step": 5996 + }, + { + "epoch": 2.2327139882025806, + "grad_norm": 0.16243353486061096, + "learning_rate": 1.8211954462721663e-05, + "loss": 1.1822, + "step": 5997 + }, + { + "epoch": 2.2330862933531894, + "grad_norm": 0.16968528926372528, + "learning_rate": 1.8211260885430672e-05, + "loss": 1.1697, + "step": 5998 + }, + { + "epoch": 2.2334585985037987, + "grad_norm": 0.16825567185878754, + "learning_rate": 1.821056718685987e-05, + "loss": 1.1784, + "step": 5999 + }, + { + "epoch": 2.233830903654408, + "grad_norm": 0.1593291014432907, + "learning_rate": 1.820987336701951e-05, + "loss": 1.1793, + "step": 6000 + }, + { + "epoch": 2.233830903654408, + "eval_loss": 1.3026354312896729, + "eval_runtime": 16.489, + "eval_samples_per_second": 105.161, + "eval_steps_per_second": 5.276, + "step": 6000 + }, + { + "epoch": 2.2342032088050168, + "grad_norm": 0.16981658339500427, + "learning_rate": 1.8209179425919832e-05, + "loss": 1.1754, + "step": 6001 + }, + { + "epoch": 2.234575513955626, + "grad_norm": 0.16528399288654327, + "learning_rate": 1.820848536357109e-05, + "loss": 1.1896, + "step": 6002 + }, + { + "epoch": 2.234947819106235, + "grad_norm": 0.16212095320224762, + "learning_rate": 1.8207791179983535e-05, + "loss": 1.1762, + "step": 6003 + }, + { + "epoch": 2.235320124256844, + "grad_norm": 0.16362003982067108, + "learning_rate": 1.8207096875167417e-05, + "loss": 1.1724, + "step": 6004 + }, + { + "epoch": 2.235692429407453, + "grad_norm": 0.16708360612392426, + "learning_rate": 1.8206402449132997e-05, + "loss": 1.1824, + "step": 6005 + }, + { + "epoch": 2.236064734558062, + "grad_norm": 0.16234560310840607, + "learning_rate": 1.8205707901890524e-05, + "loss": 1.178, + "step": 6006 + }, + { + "epoch": 2.236437039708671, + "grad_norm": 0.16618958115577698, + "learning_rate": 1.8205013233450268e-05, + "loss": 1.1672, + "step": 6007 + }, + { + "epoch": 2.2368093448592803, + "grad_norm": 0.17038699984550476, + "learning_rate": 1.8204318443822473e-05, + "loss": 1.1991, + "step": 6008 + }, + { + "epoch": 2.2371816500098896, + "grad_norm": 0.16078020632266998, + "learning_rate": 1.8203623533017413e-05, + "loss": 1.1836, + "step": 6009 + }, + { + "epoch": 2.2375539551604984, + "grad_norm": 0.16053731739521027, + "learning_rate": 1.8202928501045347e-05, + "loss": 1.1889, + "step": 6010 + }, + { + "epoch": 2.2379262603111076, + "grad_norm": 0.16496212780475616, + "learning_rate": 1.820223334791654e-05, + "loss": 1.1908, + "step": 6011 + }, + { + "epoch": 2.2382985654617165, + "grad_norm": 0.16259627044200897, + "learning_rate": 1.8201538073641264e-05, + "loss": 1.1845, + "step": 6012 + }, + { + "epoch": 2.2386708706123257, + "grad_norm": 0.16362962126731873, + "learning_rate": 1.8200842678229786e-05, + "loss": 1.1731, + "step": 6013 + }, + { + "epoch": 2.2390431757629345, + "grad_norm": 0.16907905042171478, + "learning_rate": 1.8200147161692373e-05, + "loss": 1.1932, + "step": 6014 + }, + { + "epoch": 2.239415480913544, + "grad_norm": 0.1673828512430191, + "learning_rate": 1.8199451524039308e-05, + "loss": 1.1848, + "step": 6015 + }, + { + "epoch": 2.2397877860641526, + "grad_norm": 0.16971193253993988, + "learning_rate": 1.819875576528085e-05, + "loss": 1.1777, + "step": 6016 + }, + { + "epoch": 2.240160091214762, + "grad_norm": 0.173600435256958, + "learning_rate": 1.819805988542729e-05, + "loss": 1.1947, + "step": 6017 + }, + { + "epoch": 2.240532396365371, + "grad_norm": 0.1610695868730545, + "learning_rate": 1.81973638844889e-05, + "loss": 1.1796, + "step": 6018 + }, + { + "epoch": 2.24090470151598, + "grad_norm": 0.16606231033802032, + "learning_rate": 1.8196667762475953e-05, + "loss": 1.1824, + "step": 6019 + }, + { + "epoch": 2.2412770066665892, + "grad_norm": 0.16269895434379578, + "learning_rate": 1.8195971519398744e-05, + "loss": 1.1918, + "step": 6020 + }, + { + "epoch": 2.241649311817198, + "grad_norm": 0.1672276109457016, + "learning_rate": 1.8195275155267546e-05, + "loss": 1.197, + "step": 6021 + }, + { + "epoch": 2.2420216169678073, + "grad_norm": 0.16324323415756226, + "learning_rate": 1.8194578670092654e-05, + "loss": 1.1799, + "step": 6022 + }, + { + "epoch": 2.242393922118416, + "grad_norm": 0.1610366255044937, + "learning_rate": 1.8193882063884346e-05, + "loss": 1.1787, + "step": 6023 + }, + { + "epoch": 2.2427662272690254, + "grad_norm": 0.163301020860672, + "learning_rate": 1.8193185336652912e-05, + "loss": 1.1851, + "step": 6024 + }, + { + "epoch": 2.2431385324196347, + "grad_norm": 0.16629861295223236, + "learning_rate": 1.819248848840865e-05, + "loss": 1.1963, + "step": 6025 + }, + { + "epoch": 2.2435108375702435, + "grad_norm": 0.1662205159664154, + "learning_rate": 1.819179151916184e-05, + "loss": 1.189, + "step": 6026 + }, + { + "epoch": 2.2438831427208528, + "grad_norm": 0.15960603952407837, + "learning_rate": 1.819109442892279e-05, + "loss": 1.1654, + "step": 6027 + }, + { + "epoch": 2.2442554478714616, + "grad_norm": 0.16344155371189117, + "learning_rate": 1.8190397217701785e-05, + "loss": 1.1841, + "step": 6028 + }, + { + "epoch": 2.244627753022071, + "grad_norm": 0.16831251978874207, + "learning_rate": 1.8189699885509128e-05, + "loss": 1.1615, + "step": 6029 + }, + { + "epoch": 2.2450000581726797, + "grad_norm": 0.1629910171031952, + "learning_rate": 1.818900243235512e-05, + "loss": 1.1961, + "step": 6030 + }, + { + "epoch": 2.245372363323289, + "grad_norm": 0.16238047182559967, + "learning_rate": 1.818830485825006e-05, + "loss": 1.1766, + "step": 6031 + }, + { + "epoch": 2.2457446684738978, + "grad_norm": 0.1695476919412613, + "learning_rate": 1.8187607163204246e-05, + "loss": 1.173, + "step": 6032 + }, + { + "epoch": 2.246116973624507, + "grad_norm": 0.16819293797016144, + "learning_rate": 1.8186909347227992e-05, + "loss": 1.1899, + "step": 6033 + }, + { + "epoch": 2.2464892787751163, + "grad_norm": 0.16680686175823212, + "learning_rate": 1.81862114103316e-05, + "loss": 1.1771, + "step": 6034 + }, + { + "epoch": 2.246861583925725, + "grad_norm": 0.1659124493598938, + "learning_rate": 1.818551335252538e-05, + "loss": 1.1693, + "step": 6035 + }, + { + "epoch": 2.2472338890763344, + "grad_norm": 0.15798041224479675, + "learning_rate": 1.818481517381964e-05, + "loss": 1.1757, + "step": 6036 + }, + { + "epoch": 2.247606194226943, + "grad_norm": 0.17841339111328125, + "learning_rate": 1.8184116874224695e-05, + "loss": 1.1828, + "step": 6037 + }, + { + "epoch": 2.2479784993775525, + "grad_norm": 0.1704692840576172, + "learning_rate": 1.818341845375086e-05, + "loss": 1.1874, + "step": 6038 + }, + { + "epoch": 2.2483508045281613, + "grad_norm": 0.16449272632598877, + "learning_rate": 1.818271991240844e-05, + "loss": 1.1822, + "step": 6039 + }, + { + "epoch": 2.2487231096787705, + "grad_norm": 0.15764431655406952, + "learning_rate": 1.818202125020777e-05, + "loss": 1.1709, + "step": 6040 + }, + { + "epoch": 2.2490954148293794, + "grad_norm": 0.18003995716571808, + "learning_rate": 1.8181322467159153e-05, + "loss": 1.1801, + "step": 6041 + }, + { + "epoch": 2.2494677199799886, + "grad_norm": 0.1611638069152832, + "learning_rate": 1.818062356327292e-05, + "loss": 1.1702, + "step": 6042 + }, + { + "epoch": 2.249840025130598, + "grad_norm": 0.17060458660125732, + "learning_rate": 1.8179924538559385e-05, + "loss": 1.1737, + "step": 6043 + }, + { + "epoch": 2.2502123302812067, + "grad_norm": 0.1664137840270996, + "learning_rate": 1.8179225393028883e-05, + "loss": 1.1984, + "step": 6044 + }, + { + "epoch": 2.250584635431816, + "grad_norm": 0.16480514407157898, + "learning_rate": 1.8178526126691734e-05, + "loss": 1.1802, + "step": 6045 + }, + { + "epoch": 2.250956940582425, + "grad_norm": 0.1663130819797516, + "learning_rate": 1.8177826739558268e-05, + "loss": 1.1844, + "step": 6046 + }, + { + "epoch": 2.251329245733034, + "grad_norm": 0.15989989042282104, + "learning_rate": 1.8177127231638815e-05, + "loss": 1.1773, + "step": 6047 + }, + { + "epoch": 2.251701550883643, + "grad_norm": 0.1636444330215454, + "learning_rate": 1.8176427602943705e-05, + "loss": 1.1707, + "step": 6048 + }, + { + "epoch": 2.252073856034252, + "grad_norm": 0.16399884223937988, + "learning_rate": 1.817572785348327e-05, + "loss": 1.1714, + "step": 6049 + }, + { + "epoch": 2.252446161184861, + "grad_norm": 0.1703033149242401, + "learning_rate": 1.817502798326785e-05, + "loss": 1.199, + "step": 6050 + }, + { + "epoch": 2.2528184663354702, + "grad_norm": 0.16354358196258545, + "learning_rate": 1.817432799230778e-05, + "loss": 1.1994, + "step": 6051 + }, + { + "epoch": 2.2531907714860795, + "grad_norm": 0.16181591153144836, + "learning_rate": 1.8173627880613394e-05, + "loss": 1.1794, + "step": 6052 + }, + { + "epoch": 2.2535630766366883, + "grad_norm": 0.1724006086587906, + "learning_rate": 1.8172927648195043e-05, + "loss": 1.1879, + "step": 6053 + }, + { + "epoch": 2.2539353817872976, + "grad_norm": 0.16579163074493408, + "learning_rate": 1.8172227295063062e-05, + "loss": 1.1827, + "step": 6054 + }, + { + "epoch": 2.2543076869379064, + "grad_norm": 0.16125434637069702, + "learning_rate": 1.81715268212278e-05, + "loss": 1.1758, + "step": 6055 + }, + { + "epoch": 2.2546799920885157, + "grad_norm": 0.16942323744297028, + "learning_rate": 1.8170826226699593e-05, + "loss": 1.1849, + "step": 6056 + }, + { + "epoch": 2.2550522972391245, + "grad_norm": 0.1689453423023224, + "learning_rate": 1.81701255114888e-05, + "loss": 1.1837, + "step": 6057 + }, + { + "epoch": 2.2554246023897337, + "grad_norm": 0.1727559119462967, + "learning_rate": 1.8169424675605766e-05, + "loss": 1.1916, + "step": 6058 + }, + { + "epoch": 2.2557969075403426, + "grad_norm": 0.17401790618896484, + "learning_rate": 1.816872371906084e-05, + "loss": 1.1693, + "step": 6059 + }, + { + "epoch": 2.256169212690952, + "grad_norm": 0.16698595881462097, + "learning_rate": 1.816802264186438e-05, + "loss": 1.18, + "step": 6060 + }, + { + "epoch": 2.256541517841561, + "grad_norm": 0.1707891970872879, + "learning_rate": 1.816732144402673e-05, + "loss": 1.1742, + "step": 6061 + }, + { + "epoch": 2.25691382299217, + "grad_norm": 0.16598151624202728, + "learning_rate": 1.8166620125558263e-05, + "loss": 1.188, + "step": 6062 + }, + { + "epoch": 2.257286128142779, + "grad_norm": 0.16978704929351807, + "learning_rate": 1.816591868646933e-05, + "loss": 1.1884, + "step": 6063 + }, + { + "epoch": 2.257658433293388, + "grad_norm": 0.16990995407104492, + "learning_rate": 1.8165217126770285e-05, + "loss": 1.1826, + "step": 6064 + }, + { + "epoch": 2.2580307384439973, + "grad_norm": 0.1661897599697113, + "learning_rate": 1.81645154464715e-05, + "loss": 1.1702, + "step": 6065 + }, + { + "epoch": 2.258403043594606, + "grad_norm": 0.16440676152706146, + "learning_rate": 1.816381364558333e-05, + "loss": 1.1769, + "step": 6066 + }, + { + "epoch": 2.2587753487452154, + "grad_norm": 0.18172229826450348, + "learning_rate": 1.816311172411615e-05, + "loss": 1.1747, + "step": 6067 + }, + { + "epoch": 2.259147653895824, + "grad_norm": 0.16828443109989166, + "learning_rate": 1.816240968208032e-05, + "loss": 1.2007, + "step": 6068 + }, + { + "epoch": 2.2595199590464334, + "grad_norm": 0.16745588183403015, + "learning_rate": 1.816170751948621e-05, + "loss": 1.1842, + "step": 6069 + }, + { + "epoch": 2.2598922641970427, + "grad_norm": 0.16287997364997864, + "learning_rate": 1.8161005236344193e-05, + "loss": 1.1852, + "step": 6070 + }, + { + "epoch": 2.2602645693476515, + "grad_norm": 0.1797943264245987, + "learning_rate": 1.816030283266464e-05, + "loss": 1.1838, + "step": 6071 + }, + { + "epoch": 2.260636874498261, + "grad_norm": 0.1775754690170288, + "learning_rate": 1.815960030845793e-05, + "loss": 1.1877, + "step": 6072 + }, + { + "epoch": 2.2610091796488696, + "grad_norm": 0.1645868569612503, + "learning_rate": 1.815889766373443e-05, + "loss": 1.184, + "step": 6073 + }, + { + "epoch": 2.261381484799479, + "grad_norm": 0.17757607996463776, + "learning_rate": 1.8158194898504526e-05, + "loss": 1.1702, + "step": 6074 + }, + { + "epoch": 2.2617537899500877, + "grad_norm": 0.16869355738162994, + "learning_rate": 1.8157492012778598e-05, + "loss": 1.1741, + "step": 6075 + }, + { + "epoch": 2.262126095100697, + "grad_norm": 0.1690586805343628, + "learning_rate": 1.8156789006567018e-05, + "loss": 1.1676, + "step": 6076 + }, + { + "epoch": 2.2624984002513058, + "grad_norm": 0.17510120570659637, + "learning_rate": 1.815608587988018e-05, + "loss": 1.1863, + "step": 6077 + }, + { + "epoch": 2.262870705401915, + "grad_norm": 0.1718439757823944, + "learning_rate": 1.8155382632728468e-05, + "loss": 1.1847, + "step": 6078 + }, + { + "epoch": 2.2632430105525243, + "grad_norm": 0.1764347404241562, + "learning_rate": 1.8154679265122265e-05, + "loss": 1.1627, + "step": 6079 + }, + { + "epoch": 2.263615315703133, + "grad_norm": 0.1741929054260254, + "learning_rate": 1.815397577707196e-05, + "loss": 1.186, + "step": 6080 + }, + { + "epoch": 2.2639876208537424, + "grad_norm": 0.162635937333107, + "learning_rate": 1.8153272168587947e-05, + "loss": 1.1809, + "step": 6081 + }, + { + "epoch": 2.264359926004351, + "grad_norm": 0.16348296403884888, + "learning_rate": 1.8152568439680612e-05, + "loss": 1.179, + "step": 6082 + }, + { + "epoch": 2.2647322311549605, + "grad_norm": 0.16939151287078857, + "learning_rate": 1.8151864590360354e-05, + "loss": 1.1605, + "step": 6083 + }, + { + "epoch": 2.2651045363055693, + "grad_norm": 0.17128928005695343, + "learning_rate": 1.815116062063757e-05, + "loss": 1.1947, + "step": 6084 + }, + { + "epoch": 2.2654768414561786, + "grad_norm": 0.1636468470096588, + "learning_rate": 1.8150456530522652e-05, + "loss": 1.181, + "step": 6085 + }, + { + "epoch": 2.2658491466067874, + "grad_norm": 0.1675485074520111, + "learning_rate": 1.8149752320026004e-05, + "loss": 1.1772, + "step": 6086 + }, + { + "epoch": 2.2662214517573966, + "grad_norm": 0.17299336194992065, + "learning_rate": 1.8149047989158026e-05, + "loss": 1.1853, + "step": 6087 + }, + { + "epoch": 2.266593756908006, + "grad_norm": 0.16324491798877716, + "learning_rate": 1.8148343537929118e-05, + "loss": 1.1806, + "step": 6088 + }, + { + "epoch": 2.2669660620586147, + "grad_norm": 0.16224730014801025, + "learning_rate": 1.8147638966349687e-05, + "loss": 1.1856, + "step": 6089 + }, + { + "epoch": 2.267338367209224, + "grad_norm": 0.16687463223934174, + "learning_rate": 1.8146934274430147e-05, + "loss": 1.2029, + "step": 6090 + }, + { + "epoch": 2.267710672359833, + "grad_norm": 0.16694878041744232, + "learning_rate": 1.814622946218089e-05, + "loss": 1.1644, + "step": 6091 + }, + { + "epoch": 2.268082977510442, + "grad_norm": 0.1973126232624054, + "learning_rate": 1.814552452961234e-05, + "loss": 1.1787, + "step": 6092 + }, + { + "epoch": 2.268455282661051, + "grad_norm": 0.20395469665527344, + "learning_rate": 1.81448194767349e-05, + "loss": 1.1916, + "step": 6093 + }, + { + "epoch": 2.26882758781166, + "grad_norm": 0.1700964868068695, + "learning_rate": 1.8144114303558993e-05, + "loss": 1.1889, + "step": 6094 + }, + { + "epoch": 2.269199892962269, + "grad_norm": 0.17254211008548737, + "learning_rate": 1.8143409010095028e-05, + "loss": 1.1644, + "step": 6095 + }, + { + "epoch": 2.2695721981128782, + "grad_norm": 0.19276972115039825, + "learning_rate": 1.814270359635342e-05, + "loss": 1.1883, + "step": 6096 + }, + { + "epoch": 2.2699445032634875, + "grad_norm": 0.1734069436788559, + "learning_rate": 1.814199806234459e-05, + "loss": 1.1843, + "step": 6097 + }, + { + "epoch": 2.2703168084140963, + "grad_norm": 0.1637657731771469, + "learning_rate": 1.8141292408078963e-05, + "loss": 1.1871, + "step": 6098 + }, + { + "epoch": 2.2706891135647056, + "grad_norm": 0.17510093748569489, + "learning_rate": 1.814058663356696e-05, + "loss": 1.1796, + "step": 6099 + }, + { + "epoch": 2.2710614187153144, + "grad_norm": 0.16966238617897034, + "learning_rate": 1.8139880738819e-05, + "loss": 1.1683, + "step": 6100 + }, + { + "epoch": 2.2714337238659237, + "grad_norm": 0.1668512225151062, + "learning_rate": 1.8139174723845513e-05, + "loss": 1.1993, + "step": 6101 + }, + { + "epoch": 2.2718060290165325, + "grad_norm": 0.17251057922840118, + "learning_rate": 1.8138468588656922e-05, + "loss": 1.19, + "step": 6102 + }, + { + "epoch": 2.2721783341671418, + "grad_norm": 0.1686212718486786, + "learning_rate": 1.8137762333263667e-05, + "loss": 1.1798, + "step": 6103 + }, + { + "epoch": 2.2725506393177506, + "grad_norm": 0.16368193924427032, + "learning_rate": 1.8137055957676172e-05, + "loss": 1.168, + "step": 6104 + }, + { + "epoch": 2.27292294446836, + "grad_norm": 0.1598677635192871, + "learning_rate": 1.8136349461904866e-05, + "loss": 1.1762, + "step": 6105 + }, + { + "epoch": 2.273295249618969, + "grad_norm": 0.16657836735248566, + "learning_rate": 1.8135642845960195e-05, + "loss": 1.179, + "step": 6106 + }, + { + "epoch": 2.273667554769578, + "grad_norm": 0.17238342761993408, + "learning_rate": 1.8134936109852587e-05, + "loss": 1.2088, + "step": 6107 + }, + { + "epoch": 2.274039859920187, + "grad_norm": 0.17234353721141815, + "learning_rate": 1.8134229253592485e-05, + "loss": 1.1965, + "step": 6108 + }, + { + "epoch": 2.274412165070796, + "grad_norm": 0.16358719766139984, + "learning_rate": 1.8133522277190324e-05, + "loss": 1.1843, + "step": 6109 + }, + { + "epoch": 2.2747844702214053, + "grad_norm": 0.17148621380329132, + "learning_rate": 1.8132815180656554e-05, + "loss": 1.1863, + "step": 6110 + }, + { + "epoch": 2.275156775372014, + "grad_norm": 0.17336560785770416, + "learning_rate": 1.813210796400161e-05, + "loss": 1.1859, + "step": 6111 + }, + { + "epoch": 2.2755290805226234, + "grad_norm": 0.17160752415657043, + "learning_rate": 1.813140062723594e-05, + "loss": 1.1925, + "step": 6112 + }, + { + "epoch": 2.275901385673232, + "grad_norm": 0.16290338337421417, + "learning_rate": 1.8130693170369998e-05, + "loss": 1.174, + "step": 6113 + }, + { + "epoch": 2.2762736908238415, + "grad_norm": 0.16730904579162598, + "learning_rate": 1.812998559341423e-05, + "loss": 1.1699, + "step": 6114 + }, + { + "epoch": 2.2766459959744507, + "grad_norm": 0.1715693324804306, + "learning_rate": 1.8129277896379077e-05, + "loss": 1.1877, + "step": 6115 + }, + { + "epoch": 2.2770183011250595, + "grad_norm": 0.16124042868614197, + "learning_rate": 1.812857007927501e-05, + "loss": 1.1604, + "step": 6116 + }, + { + "epoch": 2.277390606275669, + "grad_norm": 0.155757874250412, + "learning_rate": 1.8127862142112463e-05, + "loss": 1.1699, + "step": 6117 + }, + { + "epoch": 2.2777629114262776, + "grad_norm": 0.16820941865444183, + "learning_rate": 1.8127154084901906e-05, + "loss": 1.1702, + "step": 6118 + }, + { + "epoch": 2.278135216576887, + "grad_norm": 0.16459953784942627, + "learning_rate": 1.8126445907653797e-05, + "loss": 1.1873, + "step": 6119 + }, + { + "epoch": 2.2785075217274957, + "grad_norm": 0.16601146757602692, + "learning_rate": 1.8125737610378585e-05, + "loss": 1.1809, + "step": 6120 + }, + { + "epoch": 2.278879826878105, + "grad_norm": 0.16226966679096222, + "learning_rate": 1.8125029193086743e-05, + "loss": 1.1728, + "step": 6121 + }, + { + "epoch": 2.279252132028714, + "grad_norm": 0.16306665539741516, + "learning_rate": 1.812432065578873e-05, + "loss": 1.1887, + "step": 6122 + }, + { + "epoch": 2.279624437179323, + "grad_norm": 0.15871618688106537, + "learning_rate": 1.812361199849501e-05, + "loss": 1.1873, + "step": 6123 + }, + { + "epoch": 2.2799967423299323, + "grad_norm": 0.17421315610408783, + "learning_rate": 1.812290322121605e-05, + "loss": 1.1949, + "step": 6124 + }, + { + "epoch": 2.280369047480541, + "grad_norm": 0.16765080392360687, + "learning_rate": 1.8122194323962317e-05, + "loss": 1.1913, + "step": 6125 + }, + { + "epoch": 2.2807413526311504, + "grad_norm": 0.15896672010421753, + "learning_rate": 1.8121485306744286e-05, + "loss": 1.1887, + "step": 6126 + }, + { + "epoch": 2.2811136577817592, + "grad_norm": 0.16295817494392395, + "learning_rate": 1.8120776169572427e-05, + "loss": 1.1773, + "step": 6127 + }, + { + "epoch": 2.2814859629323685, + "grad_norm": 0.16846707463264465, + "learning_rate": 1.8120066912457216e-05, + "loss": 1.1831, + "step": 6128 + }, + { + "epoch": 2.2818582680829778, + "grad_norm": 0.1673799604177475, + "learning_rate": 1.811935753540912e-05, + "loss": 1.1824, + "step": 6129 + }, + { + "epoch": 2.2822305732335866, + "grad_norm": 0.1708277016878128, + "learning_rate": 1.8118648038438627e-05, + "loss": 1.1776, + "step": 6130 + }, + { + "epoch": 2.2826028783841954, + "grad_norm": 0.170358344912529, + "learning_rate": 1.811793842155621e-05, + "loss": 1.1843, + "step": 6131 + }, + { + "epoch": 2.2829751835348047, + "grad_norm": 0.17161040008068085, + "learning_rate": 1.8117228684772358e-05, + "loss": 1.1901, + "step": 6132 + }, + { + "epoch": 2.283347488685414, + "grad_norm": 0.16947078704833984, + "learning_rate": 1.811651882809754e-05, + "loss": 1.1788, + "step": 6133 + }, + { + "epoch": 2.2837197938360227, + "grad_norm": 0.1724851131439209, + "learning_rate": 1.8115808851542255e-05, + "loss": 1.1903, + "step": 6134 + }, + { + "epoch": 2.284092098986632, + "grad_norm": 0.17149105668067932, + "learning_rate": 1.8115098755116974e-05, + "loss": 1.1849, + "step": 6135 + }, + { + "epoch": 2.284464404137241, + "grad_norm": 0.16904261708259583, + "learning_rate": 1.81143885388322e-05, + "loss": 1.18, + "step": 6136 + }, + { + "epoch": 2.28483670928785, + "grad_norm": 0.16865547001361847, + "learning_rate": 1.811367820269842e-05, + "loss": 1.1773, + "step": 6137 + }, + { + "epoch": 2.2852090144384594, + "grad_norm": 0.16392925381660461, + "learning_rate": 1.811296774672611e-05, + "loss": 1.1895, + "step": 6138 + }, + { + "epoch": 2.285581319589068, + "grad_norm": 0.17182591557502747, + "learning_rate": 1.8112257170925785e-05, + "loss": 1.184, + "step": 6139 + }, + { + "epoch": 2.2859536247396774, + "grad_norm": 0.16225169599056244, + "learning_rate": 1.8111546475307927e-05, + "loss": 1.1836, + "step": 6140 + }, + { + "epoch": 2.2863259298902863, + "grad_norm": 0.16837048530578613, + "learning_rate": 1.811083565988304e-05, + "loss": 1.1813, + "step": 6141 + }, + { + "epoch": 2.2866982350408955, + "grad_norm": 0.16455179452896118, + "learning_rate": 1.8110124724661617e-05, + "loss": 1.1837, + "step": 6142 + }, + { + "epoch": 2.2870705401915044, + "grad_norm": 0.19278612732887268, + "learning_rate": 1.810941366965416e-05, + "loss": 1.177, + "step": 6143 + }, + { + "epoch": 2.2874428453421136, + "grad_norm": 0.18453273177146912, + "learning_rate": 1.8108702494871173e-05, + "loss": 1.1796, + "step": 6144 + }, + { + "epoch": 2.2878151504927224, + "grad_norm": 0.17446112632751465, + "learning_rate": 1.8107991200323162e-05, + "loss": 1.1892, + "step": 6145 + }, + { + "epoch": 2.2881874556433317, + "grad_norm": 0.15920548141002655, + "learning_rate": 1.8107279786020627e-05, + "loss": 1.1722, + "step": 6146 + }, + { + "epoch": 2.288559760793941, + "grad_norm": 0.21259953081607819, + "learning_rate": 1.8106568251974077e-05, + "loss": 1.1791, + "step": 6147 + }, + { + "epoch": 2.28893206594455, + "grad_norm": 0.16296431422233582, + "learning_rate": 1.8105856598194026e-05, + "loss": 1.1778, + "step": 6148 + }, + { + "epoch": 2.289304371095159, + "grad_norm": 0.1709262728691101, + "learning_rate": 1.8105144824690977e-05, + "loss": 1.179, + "step": 6149 + }, + { + "epoch": 2.289676676245768, + "grad_norm": 0.163625106215477, + "learning_rate": 1.8104432931475454e-05, + "loss": 1.1787, + "step": 6150 + }, + { + "epoch": 2.290048981396377, + "grad_norm": 0.16324341297149658, + "learning_rate": 1.810372091855796e-05, + "loss": 1.1714, + "step": 6151 + }, + { + "epoch": 2.290421286546986, + "grad_norm": 0.16584549844264984, + "learning_rate": 1.8103008785949015e-05, + "loss": 1.173, + "step": 6152 + }, + { + "epoch": 2.290793591697595, + "grad_norm": 0.17169234156608582, + "learning_rate": 1.8102296533659146e-05, + "loss": 1.1747, + "step": 6153 + }, + { + "epoch": 2.291165896848204, + "grad_norm": 0.17495130002498627, + "learning_rate": 1.810158416169886e-05, + "loss": 1.1814, + "step": 6154 + }, + { + "epoch": 2.2915382019988133, + "grad_norm": 0.16563518345355988, + "learning_rate": 1.8100871670078687e-05, + "loss": 1.1744, + "step": 6155 + }, + { + "epoch": 2.2919105071494226, + "grad_norm": 0.16511280834674835, + "learning_rate": 1.8100159058809146e-05, + "loss": 1.184, + "step": 6156 + }, + { + "epoch": 2.2922828123000314, + "grad_norm": 0.16747237741947174, + "learning_rate": 1.8099446327900766e-05, + "loss": 1.1883, + "step": 6157 + }, + { + "epoch": 2.2926551174506407, + "grad_norm": 0.16100740432739258, + "learning_rate": 1.809873347736407e-05, + "loss": 1.1764, + "step": 6158 + }, + { + "epoch": 2.2930274226012495, + "grad_norm": 0.16185326874256134, + "learning_rate": 1.809802050720959e-05, + "loss": 1.1846, + "step": 6159 + }, + { + "epoch": 2.2933997277518587, + "grad_norm": 0.1648186296224594, + "learning_rate": 1.8097307417447855e-05, + "loss": 1.1802, + "step": 6160 + }, + { + "epoch": 2.2937720329024676, + "grad_norm": 0.1759217381477356, + "learning_rate": 1.8096594208089397e-05, + "loss": 1.1826, + "step": 6161 + }, + { + "epoch": 2.294144338053077, + "grad_norm": 0.1690814197063446, + "learning_rate": 1.809588087914475e-05, + "loss": 1.1606, + "step": 6162 + }, + { + "epoch": 2.2945166432036856, + "grad_norm": 0.1603320837020874, + "learning_rate": 1.8095167430624454e-05, + "loss": 1.1739, + "step": 6163 + }, + { + "epoch": 2.294888948354295, + "grad_norm": 0.16544067859649658, + "learning_rate": 1.809445386253904e-05, + "loss": 1.1809, + "step": 6164 + }, + { + "epoch": 2.295261253504904, + "grad_norm": 0.16254539787769318, + "learning_rate": 1.809374017489905e-05, + "loss": 1.1904, + "step": 6165 + }, + { + "epoch": 2.295633558655513, + "grad_norm": 0.16620919108390808, + "learning_rate": 1.809302636771503e-05, + "loss": 1.1834, + "step": 6166 + }, + { + "epoch": 2.2960058638061223, + "grad_norm": 0.16282342374324799, + "learning_rate": 1.809231244099751e-05, + "loss": 1.1815, + "step": 6167 + }, + { + "epoch": 2.296378168956731, + "grad_norm": 0.157831609249115, + "learning_rate": 1.809159839475705e-05, + "loss": 1.1681, + "step": 6168 + }, + { + "epoch": 2.2967504741073403, + "grad_norm": 0.16010123491287231, + "learning_rate": 1.809088422900419e-05, + "loss": 1.1629, + "step": 6169 + }, + { + "epoch": 2.297122779257949, + "grad_norm": 0.16694015264511108, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.1824, + "step": 6170 + }, + { + "epoch": 2.2974950844085584, + "grad_norm": 0.15992291271686554, + "learning_rate": 1.8089455539003457e-05, + "loss": 1.1718, + "step": 6171 + }, + { + "epoch": 2.2978673895591673, + "grad_norm": 0.1654921919107437, + "learning_rate": 1.808874101477669e-05, + "loss": 1.1947, + "step": 6172 + }, + { + "epoch": 2.2982396947097765, + "grad_norm": 0.1608392298221588, + "learning_rate": 1.8088026371079728e-05, + "loss": 1.1953, + "step": 6173 + }, + { + "epoch": 2.298611999860386, + "grad_norm": 0.1624198704957962, + "learning_rate": 1.8087311607923118e-05, + "loss": 1.1819, + "step": 6174 + }, + { + "epoch": 2.2989843050109946, + "grad_norm": 0.163405641913414, + "learning_rate": 1.808659672531743e-05, + "loss": 1.1877, + "step": 6175 + }, + { + "epoch": 2.299356610161604, + "grad_norm": 0.16340793669223785, + "learning_rate": 1.8085881723273215e-05, + "loss": 1.1771, + "step": 6176 + }, + { + "epoch": 2.2997289153122127, + "grad_norm": 0.16229262948036194, + "learning_rate": 1.808516660180103e-05, + "loss": 1.1862, + "step": 6177 + }, + { + "epoch": 2.300101220462822, + "grad_norm": 0.15974633395671844, + "learning_rate": 1.808445136091144e-05, + "loss": 1.1828, + "step": 6178 + }, + { + "epoch": 2.3004735256134308, + "grad_norm": 0.15938468277454376, + "learning_rate": 1.8083736000615017e-05, + "loss": 1.1667, + "step": 6179 + }, + { + "epoch": 2.30084583076404, + "grad_norm": 0.1704174429178238, + "learning_rate": 1.808302052092232e-05, + "loss": 1.1785, + "step": 6180 + }, + { + "epoch": 2.301218135914649, + "grad_norm": 0.15925031900405884, + "learning_rate": 1.8082304921843913e-05, + "loss": 1.1729, + "step": 6181 + }, + { + "epoch": 2.301590441065258, + "grad_norm": 0.17216730117797852, + "learning_rate": 1.8081589203390374e-05, + "loss": 1.1874, + "step": 6182 + }, + { + "epoch": 2.3019627462158674, + "grad_norm": 0.16319389641284943, + "learning_rate": 1.8080873365572265e-05, + "loss": 1.1853, + "step": 6183 + }, + { + "epoch": 2.302335051366476, + "grad_norm": 0.16606274247169495, + "learning_rate": 1.8080157408400167e-05, + "loss": 1.1777, + "step": 6184 + }, + { + "epoch": 2.3027073565170855, + "grad_norm": 0.16718682646751404, + "learning_rate": 1.807944133188465e-05, + "loss": 1.1713, + "step": 6185 + }, + { + "epoch": 2.3030796616676943, + "grad_norm": 0.16058477759361267, + "learning_rate": 1.8078725136036292e-05, + "loss": 1.171, + "step": 6186 + }, + { + "epoch": 2.3034519668183036, + "grad_norm": 0.1608082354068756, + "learning_rate": 1.8078008820865667e-05, + "loss": 1.1773, + "step": 6187 + }, + { + "epoch": 2.3038242719689124, + "grad_norm": 0.16686289012432098, + "learning_rate": 1.8077292386383364e-05, + "loss": 1.1741, + "step": 6188 + }, + { + "epoch": 2.3041965771195216, + "grad_norm": 0.16598117351531982, + "learning_rate": 1.8076575832599957e-05, + "loss": 1.1979, + "step": 6189 + }, + { + "epoch": 2.3045688822701305, + "grad_norm": 0.16313214600086212, + "learning_rate": 1.8075859159526033e-05, + "loss": 1.1774, + "step": 6190 + }, + { + "epoch": 2.3049411874207397, + "grad_norm": 0.16657821834087372, + "learning_rate": 1.8075142367172175e-05, + "loss": 1.1888, + "step": 6191 + }, + { + "epoch": 2.305313492571349, + "grad_norm": 0.16192109882831573, + "learning_rate": 1.8074425455548972e-05, + "loss": 1.1688, + "step": 6192 + }, + { + "epoch": 2.305685797721958, + "grad_norm": 0.16952742636203766, + "learning_rate": 1.807370842466701e-05, + "loss": 1.1592, + "step": 6193 + }, + { + "epoch": 2.306058102872567, + "grad_norm": 0.15994684398174286, + "learning_rate": 1.8072991274536883e-05, + "loss": 1.1903, + "step": 6194 + }, + { + "epoch": 2.306430408023176, + "grad_norm": 0.16636525094509125, + "learning_rate": 1.807227400516918e-05, + "loss": 1.1754, + "step": 6195 + }, + { + "epoch": 2.306802713173785, + "grad_norm": 0.16642355918884277, + "learning_rate": 1.8071556616574498e-05, + "loss": 1.182, + "step": 6196 + }, + { + "epoch": 2.307175018324394, + "grad_norm": 0.1568913608789444, + "learning_rate": 1.807083910876343e-05, + "loss": 1.1825, + "step": 6197 + }, + { + "epoch": 2.3075473234750032, + "grad_norm": 0.16373074054718018, + "learning_rate": 1.8070121481746576e-05, + "loss": 1.1873, + "step": 6198 + }, + { + "epoch": 2.307919628625612, + "grad_norm": 0.16669581830501556, + "learning_rate": 1.8069403735534533e-05, + "loss": 1.1804, + "step": 6199 + }, + { + "epoch": 2.3082919337762213, + "grad_norm": 0.1655244082212448, + "learning_rate": 1.8068685870137906e-05, + "loss": 1.194, + "step": 6200 + }, + { + "epoch": 2.3086642389268306, + "grad_norm": 0.16131706535816193, + "learning_rate": 1.8067967885567292e-05, + "loss": 1.172, + "step": 6201 + }, + { + "epoch": 2.3090365440774394, + "grad_norm": 0.1645621359348297, + "learning_rate": 1.80672497818333e-05, + "loss": 1.1778, + "step": 6202 + }, + { + "epoch": 2.3094088492280487, + "grad_norm": 0.1699766367673874, + "learning_rate": 1.8066531558946537e-05, + "loss": 1.1868, + "step": 6203 + }, + { + "epoch": 2.3097811543786575, + "grad_norm": 0.16291096806526184, + "learning_rate": 1.8065813216917604e-05, + "loss": 1.1798, + "step": 6204 + }, + { + "epoch": 2.3101534595292668, + "grad_norm": 0.16420194506645203, + "learning_rate": 1.806509475575712e-05, + "loss": 1.1724, + "step": 6205 + }, + { + "epoch": 2.3105257646798756, + "grad_norm": 0.1601489931344986, + "learning_rate": 1.806437617547569e-05, + "loss": 1.1648, + "step": 6206 + }, + { + "epoch": 2.310898069830485, + "grad_norm": 0.17182345688343048, + "learning_rate": 1.806365747608393e-05, + "loss": 1.1888, + "step": 6207 + }, + { + "epoch": 2.3112703749810937, + "grad_norm": 0.16343766450881958, + "learning_rate": 1.806293865759246e-05, + "loss": 1.17, + "step": 6208 + }, + { + "epoch": 2.311642680131703, + "grad_norm": 0.16800105571746826, + "learning_rate": 1.806221972001189e-05, + "loss": 1.1749, + "step": 6209 + }, + { + "epoch": 2.312014985282312, + "grad_norm": 0.16388358175754547, + "learning_rate": 1.806150066335284e-05, + "loss": 1.1791, + "step": 6210 + }, + { + "epoch": 2.312387290432921, + "grad_norm": 0.1601734161376953, + "learning_rate": 1.8060781487625927e-05, + "loss": 1.181, + "step": 6211 + }, + { + "epoch": 2.3127595955835303, + "grad_norm": 0.16823667287826538, + "learning_rate": 1.806006219284178e-05, + "loss": 1.176, + "step": 6212 + }, + { + "epoch": 2.313131900734139, + "grad_norm": 0.16742992401123047, + "learning_rate": 1.805934277901102e-05, + "loss": 1.175, + "step": 6213 + }, + { + "epoch": 2.3135042058847484, + "grad_norm": 0.16113218665122986, + "learning_rate": 1.8058623246144274e-05, + "loss": 1.1971, + "step": 6214 + }, + { + "epoch": 2.313876511035357, + "grad_norm": 0.1706179678440094, + "learning_rate": 1.805790359425217e-05, + "loss": 1.1806, + "step": 6215 + }, + { + "epoch": 2.3142488161859665, + "grad_norm": 0.16922074556350708, + "learning_rate": 1.8057183823345333e-05, + "loss": 1.184, + "step": 6216 + }, + { + "epoch": 2.3146211213365753, + "grad_norm": 0.16326811909675598, + "learning_rate": 1.80564639334344e-05, + "loss": 1.178, + "step": 6217 + }, + { + "epoch": 2.3149934264871845, + "grad_norm": 0.1662980020046234, + "learning_rate": 1.8055743924529996e-05, + "loss": 1.1884, + "step": 6218 + }, + { + "epoch": 2.315365731637794, + "grad_norm": 0.16735336184501648, + "learning_rate": 1.805502379664276e-05, + "loss": 1.1828, + "step": 6219 + }, + { + "epoch": 2.3157380367884026, + "grad_norm": 0.16329142451286316, + "learning_rate": 1.805430354978333e-05, + "loss": 1.1834, + "step": 6220 + }, + { + "epoch": 2.316110341939012, + "grad_norm": 0.16252319514751434, + "learning_rate": 1.8053583183962342e-05, + "loss": 1.1639, + "step": 6221 + }, + { + "epoch": 2.3164826470896207, + "grad_norm": 0.16563460230827332, + "learning_rate": 1.8052862699190435e-05, + "loss": 1.1942, + "step": 6222 + }, + { + "epoch": 2.31685495224023, + "grad_norm": 0.16497038304805756, + "learning_rate": 1.8052142095478253e-05, + "loss": 1.1846, + "step": 6223 + }, + { + "epoch": 2.317227257390839, + "grad_norm": 0.1694660484790802, + "learning_rate": 1.8051421372836438e-05, + "loss": 1.1886, + "step": 6224 + }, + { + "epoch": 2.317599562541448, + "grad_norm": 0.1612187922000885, + "learning_rate": 1.8050700531275632e-05, + "loss": 1.1802, + "step": 6225 + }, + { + "epoch": 2.317971867692057, + "grad_norm": 0.166747584939003, + "learning_rate": 1.8049979570806485e-05, + "loss": 1.1878, + "step": 6226 + }, + { + "epoch": 2.318344172842666, + "grad_norm": 0.17181634902954102, + "learning_rate": 1.8049258491439644e-05, + "loss": 1.1834, + "step": 6227 + }, + { + "epoch": 2.3187164779932754, + "grad_norm": 0.16277888417243958, + "learning_rate": 1.8048537293185763e-05, + "loss": 1.1808, + "step": 6228 + }, + { + "epoch": 2.3190887831438842, + "grad_norm": 0.16461539268493652, + "learning_rate": 1.804781597605549e-05, + "loss": 1.189, + "step": 6229 + }, + { + "epoch": 2.3194610882944935, + "grad_norm": 0.1649829000234604, + "learning_rate": 1.8047094540059478e-05, + "loss": 1.1877, + "step": 6230 + }, + { + "epoch": 2.3198333934451023, + "grad_norm": 0.16547247767448425, + "learning_rate": 1.804637298520839e-05, + "loss": 1.1789, + "step": 6231 + }, + { + "epoch": 2.3202056985957116, + "grad_norm": 0.15987227857112885, + "learning_rate": 1.804565131151287e-05, + "loss": 1.1692, + "step": 6232 + }, + { + "epoch": 2.3205780037463204, + "grad_norm": 0.16262219846248627, + "learning_rate": 1.8044929518983592e-05, + "loss": 1.1858, + "step": 6233 + }, + { + "epoch": 2.3209503088969297, + "grad_norm": 0.16427870094776154, + "learning_rate": 1.8044207607631206e-05, + "loss": 1.18, + "step": 6234 + }, + { + "epoch": 2.3213226140475385, + "grad_norm": 0.1622522473335266, + "learning_rate": 1.804348557746638e-05, + "loss": 1.1718, + "step": 6235 + }, + { + "epoch": 2.3216949191981477, + "grad_norm": 0.15820512175559998, + "learning_rate": 1.8042763428499777e-05, + "loss": 1.1816, + "step": 6236 + }, + { + "epoch": 2.322067224348757, + "grad_norm": 0.16606608033180237, + "learning_rate": 1.8042041160742062e-05, + "loss": 1.1708, + "step": 6237 + }, + { + "epoch": 2.322439529499366, + "grad_norm": 0.17201752960681915, + "learning_rate": 1.8041318774203908e-05, + "loss": 1.1814, + "step": 6238 + }, + { + "epoch": 2.322811834649975, + "grad_norm": 0.16523173451423645, + "learning_rate": 1.8040596268895973e-05, + "loss": 1.1717, + "step": 6239 + }, + { + "epoch": 2.323184139800584, + "grad_norm": 0.16738300025463104, + "learning_rate": 1.803987364482894e-05, + "loss": 1.1809, + "step": 6240 + }, + { + "epoch": 2.323556444951193, + "grad_norm": 0.1602403223514557, + "learning_rate": 1.8039150902013478e-05, + "loss": 1.1754, + "step": 6241 + }, + { + "epoch": 2.323928750101802, + "grad_norm": 0.16947504878044128, + "learning_rate": 1.803842804046026e-05, + "loss": 1.1869, + "step": 6242 + }, + { + "epoch": 2.3243010552524113, + "grad_norm": 0.16907165944576263, + "learning_rate": 1.8037705060179965e-05, + "loss": 1.1857, + "step": 6243 + }, + { + "epoch": 2.32467336040302, + "grad_norm": 0.17978952825069427, + "learning_rate": 1.8036981961183273e-05, + "loss": 1.1801, + "step": 6244 + }, + { + "epoch": 2.3250456655536293, + "grad_norm": 0.16338391602039337, + "learning_rate": 1.803625874348086e-05, + "loss": 1.1679, + "step": 6245 + }, + { + "epoch": 2.3254179707042386, + "grad_norm": 0.17273187637329102, + "learning_rate": 1.803553540708341e-05, + "loss": 1.1749, + "step": 6246 + }, + { + "epoch": 2.3257902758548474, + "grad_norm": 0.17095768451690674, + "learning_rate": 1.8034811952001602e-05, + "loss": 1.1663, + "step": 6247 + }, + { + "epoch": 2.3261625810054567, + "grad_norm": 0.16900064051151276, + "learning_rate": 1.803408837824613e-05, + "loss": 1.1831, + "step": 6248 + }, + { + "epoch": 2.3265348861560655, + "grad_norm": 0.17305012047290802, + "learning_rate": 1.8033364685827677e-05, + "loss": 1.1898, + "step": 6249 + }, + { + "epoch": 2.326907191306675, + "grad_norm": 0.1679958999156952, + "learning_rate": 1.8032640874756932e-05, + "loss": 1.1713, + "step": 6250 + }, + { + "epoch": 2.327279496457284, + "grad_norm": 0.1740444004535675, + "learning_rate": 1.8031916945044586e-05, + "loss": 1.194, + "step": 6251 + }, + { + "epoch": 2.327651801607893, + "grad_norm": 0.1582871377468109, + "learning_rate": 1.803119289670133e-05, + "loss": 1.1891, + "step": 6252 + }, + { + "epoch": 2.3280241067585017, + "grad_norm": 0.17062443494796753, + "learning_rate": 1.8030468729737856e-05, + "loss": 1.185, + "step": 6253 + }, + { + "epoch": 2.328396411909111, + "grad_norm": 0.16043491661548615, + "learning_rate": 1.802974444416487e-05, + "loss": 1.1798, + "step": 6254 + }, + { + "epoch": 2.32876871705972, + "grad_norm": 0.17315945029258728, + "learning_rate": 1.8029020039993055e-05, + "loss": 1.1855, + "step": 6255 + }, + { + "epoch": 2.329141022210329, + "grad_norm": 0.16167216002941132, + "learning_rate": 1.802829551723312e-05, + "loss": 1.1834, + "step": 6256 + }, + { + "epoch": 2.3295133273609383, + "grad_norm": 0.17070269584655762, + "learning_rate": 1.8027570875895762e-05, + "loss": 1.1805, + "step": 6257 + }, + { + "epoch": 2.329885632511547, + "grad_norm": 0.1713981032371521, + "learning_rate": 1.802684611599169e-05, + "loss": 1.1893, + "step": 6258 + }, + { + "epoch": 2.3302579376621564, + "grad_norm": 0.17081809043884277, + "learning_rate": 1.80261212375316e-05, + "loss": 1.1838, + "step": 6259 + }, + { + "epoch": 2.3306302428127657, + "grad_norm": 0.16853542625904083, + "learning_rate": 1.8025396240526208e-05, + "loss": 1.1858, + "step": 6260 + }, + { + "epoch": 2.3310025479633745, + "grad_norm": 0.1681629717350006, + "learning_rate": 1.8024671124986218e-05, + "loss": 1.177, + "step": 6261 + }, + { + "epoch": 2.3313748531139837, + "grad_norm": 0.1574486941099167, + "learning_rate": 1.8023945890922334e-05, + "loss": 1.1778, + "step": 6262 + }, + { + "epoch": 2.3317471582645926, + "grad_norm": 0.16776683926582336, + "learning_rate": 1.8023220538345276e-05, + "loss": 1.1888, + "step": 6263 + }, + { + "epoch": 2.332119463415202, + "grad_norm": 0.16034848988056183, + "learning_rate": 1.802249506726575e-05, + "loss": 1.1826, + "step": 6264 + }, + { + "epoch": 2.3324917685658106, + "grad_norm": 0.16641570627689362, + "learning_rate": 1.8021769477694482e-05, + "loss": 1.1863, + "step": 6265 + }, + { + "epoch": 2.33286407371642, + "grad_norm": 0.16648726165294647, + "learning_rate": 1.802104376964218e-05, + "loss": 1.1586, + "step": 6266 + }, + { + "epoch": 2.3332363788670287, + "grad_norm": 0.16356948018074036, + "learning_rate": 1.8020317943119563e-05, + "loss": 1.1854, + "step": 6267 + }, + { + "epoch": 2.333608684017638, + "grad_norm": 0.16005860269069672, + "learning_rate": 1.8019591998137355e-05, + "loss": 1.1887, + "step": 6268 + }, + { + "epoch": 2.3339809891682473, + "grad_norm": 0.1607915163040161, + "learning_rate": 1.8018865934706277e-05, + "loss": 1.1765, + "step": 6269 + }, + { + "epoch": 2.334353294318856, + "grad_norm": 0.1606864184141159, + "learning_rate": 1.801813975283705e-05, + "loss": 1.182, + "step": 6270 + }, + { + "epoch": 2.3347255994694653, + "grad_norm": 0.17700619995594025, + "learning_rate": 1.80174134525404e-05, + "loss": 1.1745, + "step": 6271 + }, + { + "epoch": 2.335097904620074, + "grad_norm": 0.1743892878293991, + "learning_rate": 1.801668703382706e-05, + "loss": 1.174, + "step": 6272 + }, + { + "epoch": 2.3354702097706834, + "grad_norm": 0.16445204615592957, + "learning_rate": 1.8015960496707756e-05, + "loss": 1.1716, + "step": 6273 + }, + { + "epoch": 2.3358425149212922, + "grad_norm": 0.1924527883529663, + "learning_rate": 1.8015233841193218e-05, + "loss": 1.1676, + "step": 6274 + }, + { + "epoch": 2.3362148200719015, + "grad_norm": 0.19679906964302063, + "learning_rate": 1.8014507067294177e-05, + "loss": 1.1849, + "step": 6275 + }, + { + "epoch": 2.3365871252225103, + "grad_norm": 0.16511952877044678, + "learning_rate": 1.8013780175021373e-05, + "loss": 1.2015, + "step": 6276 + }, + { + "epoch": 2.3369594303731196, + "grad_norm": 0.23659729957580566, + "learning_rate": 1.8013053164385538e-05, + "loss": 1.1712, + "step": 6277 + }, + { + "epoch": 2.337331735523729, + "grad_norm": 0.19166865944862366, + "learning_rate": 1.8012326035397407e-05, + "loss": 1.1822, + "step": 6278 + }, + { + "epoch": 2.3377040406743377, + "grad_norm": 0.1799168735742569, + "learning_rate": 1.8011598788067728e-05, + "loss": 1.1751, + "step": 6279 + }, + { + "epoch": 2.338076345824947, + "grad_norm": 0.16195322573184967, + "learning_rate": 1.8010871422407238e-05, + "loss": 1.1822, + "step": 6280 + }, + { + "epoch": 2.3384486509755558, + "grad_norm": 0.18496288359165192, + "learning_rate": 1.8010143938426674e-05, + "loss": 1.1882, + "step": 6281 + }, + { + "epoch": 2.338820956126165, + "grad_norm": 0.16737127304077148, + "learning_rate": 1.800941633613679e-05, + "loss": 1.1897, + "step": 6282 + }, + { + "epoch": 2.339193261276774, + "grad_norm": 0.16343113780021667, + "learning_rate": 1.800868861554833e-05, + "loss": 1.1761, + "step": 6283 + }, + { + "epoch": 2.339565566427383, + "grad_norm": 0.16952070593833923, + "learning_rate": 1.8007960776672043e-05, + "loss": 1.1742, + "step": 6284 + }, + { + "epoch": 2.339937871577992, + "grad_norm": 0.16918697953224182, + "learning_rate": 1.8007232819518675e-05, + "loss": 1.1809, + "step": 6285 + }, + { + "epoch": 2.340310176728601, + "grad_norm": 0.16563785076141357, + "learning_rate": 1.800650474409898e-05, + "loss": 1.1709, + "step": 6286 + }, + { + "epoch": 2.3406824818792105, + "grad_norm": 0.1735762506723404, + "learning_rate": 1.8005776550423718e-05, + "loss": 1.1761, + "step": 6287 + }, + { + "epoch": 2.3410547870298193, + "grad_norm": 0.16570459306240082, + "learning_rate": 1.8005048238503633e-05, + "loss": 1.1778, + "step": 6288 + }, + { + "epoch": 2.3414270921804285, + "grad_norm": 0.16612395644187927, + "learning_rate": 1.800431980834949e-05, + "loss": 1.1772, + "step": 6289 + }, + { + "epoch": 2.3417993973310374, + "grad_norm": 0.16212446987628937, + "learning_rate": 1.8003591259972047e-05, + "loss": 1.1854, + "step": 6290 + }, + { + "epoch": 2.3421717024816466, + "grad_norm": 0.1619550585746765, + "learning_rate": 1.8002862593382063e-05, + "loss": 1.1682, + "step": 6291 + }, + { + "epoch": 2.3425440076322555, + "grad_norm": 0.16271120309829712, + "learning_rate": 1.80021338085903e-05, + "loss": 1.1646, + "step": 6292 + }, + { + "epoch": 2.3429163127828647, + "grad_norm": 0.1650598645210266, + "learning_rate": 1.8001404905607523e-05, + "loss": 1.189, + "step": 6293 + }, + { + "epoch": 2.3432886179334735, + "grad_norm": 0.16364504396915436, + "learning_rate": 1.8000675884444495e-05, + "loss": 1.1838, + "step": 6294 + }, + { + "epoch": 2.343660923084083, + "grad_norm": 0.16529352962970734, + "learning_rate": 1.7999946745111993e-05, + "loss": 1.1908, + "step": 6295 + }, + { + "epoch": 2.344033228234692, + "grad_norm": 0.1709616482257843, + "learning_rate": 1.7999217487620773e-05, + "loss": 1.1731, + "step": 6296 + }, + { + "epoch": 2.344405533385301, + "grad_norm": 0.16353869438171387, + "learning_rate": 1.7998488111981616e-05, + "loss": 1.1748, + "step": 6297 + }, + { + "epoch": 2.34477783853591, + "grad_norm": 0.16740430891513824, + "learning_rate": 1.799775861820529e-05, + "loss": 1.1674, + "step": 6298 + }, + { + "epoch": 2.345150143686519, + "grad_norm": 0.1658317595720291, + "learning_rate": 1.7997029006302572e-05, + "loss": 1.1709, + "step": 6299 + }, + { + "epoch": 2.3455224488371282, + "grad_norm": 0.1644759625196457, + "learning_rate": 1.799629927628424e-05, + "loss": 1.184, + "step": 6300 + }, + { + "epoch": 2.345894753987737, + "grad_norm": 0.1665249615907669, + "learning_rate": 1.7995569428161066e-05, + "loss": 1.1826, + "step": 6301 + }, + { + "epoch": 2.3462670591383463, + "grad_norm": 0.1632251888513565, + "learning_rate": 1.7994839461943834e-05, + "loss": 1.1614, + "step": 6302 + }, + { + "epoch": 2.346639364288955, + "grad_norm": 0.16572819650173187, + "learning_rate": 1.7994109377643326e-05, + "loss": 1.1768, + "step": 6303 + }, + { + "epoch": 2.3470116694395644, + "grad_norm": 0.16670545935630798, + "learning_rate": 1.7993379175270323e-05, + "loss": 1.1751, + "step": 6304 + }, + { + "epoch": 2.3473839745901737, + "grad_norm": 0.16433420777320862, + "learning_rate": 1.7992648854835607e-05, + "loss": 1.1997, + "step": 6305 + }, + { + "epoch": 2.3477562797407825, + "grad_norm": 0.16783735156059265, + "learning_rate": 1.7991918416349977e-05, + "loss": 1.1837, + "step": 6306 + }, + { + "epoch": 2.3481285848913918, + "grad_norm": 0.166385218501091, + "learning_rate": 1.799118785982421e-05, + "loss": 1.196, + "step": 6307 + }, + { + "epoch": 2.3485008900420006, + "grad_norm": 0.1664101630449295, + "learning_rate": 1.79904571852691e-05, + "loss": 1.1839, + "step": 6308 + }, + { + "epoch": 2.34887319519261, + "grad_norm": 0.16921697556972504, + "learning_rate": 1.7989726392695438e-05, + "loss": 1.1826, + "step": 6309 + }, + { + "epoch": 2.3492455003432187, + "grad_norm": 0.16790980100631714, + "learning_rate": 1.798899548211402e-05, + "loss": 1.1739, + "step": 6310 + }, + { + "epoch": 2.349617805493828, + "grad_norm": 0.16884508728981018, + "learning_rate": 1.798826445353564e-05, + "loss": 1.1825, + "step": 6311 + }, + { + "epoch": 2.3499901106444367, + "grad_norm": 0.16576768457889557, + "learning_rate": 1.7987533306971093e-05, + "loss": 1.1503, + "step": 6312 + }, + { + "epoch": 2.350362415795046, + "grad_norm": 0.1681016981601715, + "learning_rate": 1.798680204243118e-05, + "loss": 1.1753, + "step": 6313 + }, + { + "epoch": 2.3507347209456553, + "grad_norm": 0.17938603460788727, + "learning_rate": 1.7986070659926705e-05, + "loss": 1.1865, + "step": 6314 + }, + { + "epoch": 2.351107026096264, + "grad_norm": 0.16416122019290924, + "learning_rate": 1.798533915946847e-05, + "loss": 1.1693, + "step": 6315 + }, + { + "epoch": 2.3514793312468734, + "grad_norm": 0.1787365823984146, + "learning_rate": 1.7984607541067272e-05, + "loss": 1.174, + "step": 6316 + }, + { + "epoch": 2.351851636397482, + "grad_norm": 0.16736140847206116, + "learning_rate": 1.7983875804733925e-05, + "loss": 1.1772, + "step": 6317 + }, + { + "epoch": 2.3522239415480914, + "grad_norm": 0.17561602592468262, + "learning_rate": 1.798314395047923e-05, + "loss": 1.1793, + "step": 6318 + }, + { + "epoch": 2.3525962466987003, + "grad_norm": 0.16405464708805084, + "learning_rate": 1.7982411978314e-05, + "loss": 1.1866, + "step": 6319 + }, + { + "epoch": 2.3529685518493095, + "grad_norm": 0.1719312071800232, + "learning_rate": 1.798167988824905e-05, + "loss": 1.1743, + "step": 6320 + }, + { + "epoch": 2.3533408569999184, + "grad_norm": 0.17220060527324677, + "learning_rate": 1.7980947680295187e-05, + "loss": 1.1719, + "step": 6321 + }, + { + "epoch": 2.3537131621505276, + "grad_norm": 0.1724272072315216, + "learning_rate": 1.7980215354463223e-05, + "loss": 1.1777, + "step": 6322 + }, + { + "epoch": 2.354085467301137, + "grad_norm": 0.17141182720661163, + "learning_rate": 1.7979482910763984e-05, + "loss": 1.1822, + "step": 6323 + }, + { + "epoch": 2.3544577724517457, + "grad_norm": 0.18284828960895538, + "learning_rate": 1.7978750349208284e-05, + "loss": 1.1885, + "step": 6324 + }, + { + "epoch": 2.354830077602355, + "grad_norm": 0.16586068272590637, + "learning_rate": 1.797801766980694e-05, + "loss": 1.1677, + "step": 6325 + }, + { + "epoch": 2.355202382752964, + "grad_norm": 0.1680738776922226, + "learning_rate": 1.7977284872570775e-05, + "loss": 1.1888, + "step": 6326 + }, + { + "epoch": 2.355574687903573, + "grad_norm": 0.16332261264324188, + "learning_rate": 1.7976551957510614e-05, + "loss": 1.1771, + "step": 6327 + }, + { + "epoch": 2.355946993054182, + "grad_norm": 0.16689446568489075, + "learning_rate": 1.797581892463728e-05, + "loss": 1.1698, + "step": 6328 + }, + { + "epoch": 2.356319298204791, + "grad_norm": 0.17073221504688263, + "learning_rate": 1.7975085773961606e-05, + "loss": 1.1931, + "step": 6329 + }, + { + "epoch": 2.3566916033554, + "grad_norm": 0.16630737483501434, + "learning_rate": 1.797435250549441e-05, + "loss": 1.1698, + "step": 6330 + }, + { + "epoch": 2.357063908506009, + "grad_norm": 0.19237372279167175, + "learning_rate": 1.797361911924653e-05, + "loss": 1.1976, + "step": 6331 + }, + { + "epoch": 2.3574362136566185, + "grad_norm": 0.1919546127319336, + "learning_rate": 1.7972885615228796e-05, + "loss": 1.1915, + "step": 6332 + }, + { + "epoch": 2.3578085188072273, + "grad_norm": 0.16632108390331268, + "learning_rate": 1.797215199345204e-05, + "loss": 1.187, + "step": 6333 + }, + { + "epoch": 2.3581808239578366, + "grad_norm": 0.26524388790130615, + "learning_rate": 1.7971418253927103e-05, + "loss": 1.1899, + "step": 6334 + }, + { + "epoch": 2.3585531291084454, + "grad_norm": 0.20467346906661987, + "learning_rate": 1.7970684396664814e-05, + "loss": 1.1828, + "step": 6335 + }, + { + "epoch": 2.3589254342590547, + "grad_norm": 0.18540099263191223, + "learning_rate": 1.7969950421676022e-05, + "loss": 1.1699, + "step": 6336 + }, + { + "epoch": 2.3592977394096635, + "grad_norm": 0.17078468203544617, + "learning_rate": 1.796921632897156e-05, + "loss": 1.1846, + "step": 6337 + }, + { + "epoch": 2.3596700445602727, + "grad_norm": 0.19237884879112244, + "learning_rate": 1.796848211856227e-05, + "loss": 1.1847, + "step": 6338 + }, + { + "epoch": 2.3600423497108816, + "grad_norm": 0.1824292242527008, + "learning_rate": 1.7967747790459006e-05, + "loss": 1.1699, + "step": 6339 + }, + { + "epoch": 2.360414654861491, + "grad_norm": 0.17438608407974243, + "learning_rate": 1.7967013344672602e-05, + "loss": 1.1848, + "step": 6340 + }, + { + "epoch": 2.3607869600121, + "grad_norm": 0.16449159383773804, + "learning_rate": 1.7966278781213914e-05, + "loss": 1.1741, + "step": 6341 + }, + { + "epoch": 2.361159265162709, + "grad_norm": 0.17680013179779053, + "learning_rate": 1.7965544100093785e-05, + "loss": 1.1611, + "step": 6342 + }, + { + "epoch": 2.361531570313318, + "grad_norm": 0.18226385116577148, + "learning_rate": 1.796480930132307e-05, + "loss": 1.1873, + "step": 6343 + }, + { + "epoch": 2.361903875463927, + "grad_norm": 0.1675233244895935, + "learning_rate": 1.7964074384912624e-05, + "loss": 1.1864, + "step": 6344 + }, + { + "epoch": 2.3622761806145363, + "grad_norm": 0.17054075002670288, + "learning_rate": 1.7963339350873296e-05, + "loss": 1.1863, + "step": 6345 + }, + { + "epoch": 2.362648485765145, + "grad_norm": 0.1669616848230362, + "learning_rate": 1.7962604199215946e-05, + "loss": 1.1782, + "step": 6346 + }, + { + "epoch": 2.3630207909157543, + "grad_norm": 0.1679152250289917, + "learning_rate": 1.7961868929951432e-05, + "loss": 1.1859, + "step": 6347 + }, + { + "epoch": 2.363393096066363, + "grad_norm": 0.1678704470396042, + "learning_rate": 1.7961133543090617e-05, + "loss": 1.1759, + "step": 6348 + }, + { + "epoch": 2.3637654012169724, + "grad_norm": 0.15599095821380615, + "learning_rate": 1.7960398038644356e-05, + "loss": 1.1686, + "step": 6349 + }, + { + "epoch": 2.3641377063675817, + "grad_norm": 0.16793273389339447, + "learning_rate": 1.7959662416623516e-05, + "loss": 1.1858, + "step": 6350 + }, + { + "epoch": 2.3645100115181905, + "grad_norm": 0.16774815320968628, + "learning_rate": 1.795892667703896e-05, + "loss": 1.1846, + "step": 6351 + }, + { + "epoch": 2.3648823166687998, + "grad_norm": 0.16648389399051666, + "learning_rate": 1.795819081990156e-05, + "loss": 1.1811, + "step": 6352 + }, + { + "epoch": 2.3652546218194086, + "grad_norm": 0.16627708077430725, + "learning_rate": 1.795745484522218e-05, + "loss": 1.1804, + "step": 6353 + }, + { + "epoch": 2.365626926970018, + "grad_norm": 0.16126348078250885, + "learning_rate": 1.795671875301169e-05, + "loss": 1.1786, + "step": 6354 + }, + { + "epoch": 2.3659992321206267, + "grad_norm": 0.16175805032253265, + "learning_rate": 1.7955982543280965e-05, + "loss": 1.1678, + "step": 6355 + }, + { + "epoch": 2.366371537271236, + "grad_norm": 0.16892601549625397, + "learning_rate": 1.7955246216040872e-05, + "loss": 1.1723, + "step": 6356 + }, + { + "epoch": 2.3667438424218448, + "grad_norm": 0.1636088341474533, + "learning_rate": 1.79545097713023e-05, + "loss": 1.1844, + "step": 6357 + }, + { + "epoch": 2.367116147572454, + "grad_norm": 0.16723226010799408, + "learning_rate": 1.795377320907611e-05, + "loss": 1.1886, + "step": 6358 + }, + { + "epoch": 2.3674884527230633, + "grad_norm": 0.17732274532318115, + "learning_rate": 1.7953036529373194e-05, + "loss": 1.1981, + "step": 6359 + }, + { + "epoch": 2.367860757873672, + "grad_norm": 0.15954697132110596, + "learning_rate": 1.795229973220443e-05, + "loss": 1.1924, + "step": 6360 + }, + { + "epoch": 2.3682330630242814, + "grad_norm": 0.1688387095928192, + "learning_rate": 1.795156281758069e-05, + "loss": 1.1782, + "step": 6361 + }, + { + "epoch": 2.36860536817489, + "grad_norm": 0.16727487742900848, + "learning_rate": 1.7950825785512873e-05, + "loss": 1.1868, + "step": 6362 + }, + { + "epoch": 2.3689776733254995, + "grad_norm": 0.1692897230386734, + "learning_rate": 1.7950088636011853e-05, + "loss": 1.1952, + "step": 6363 + }, + { + "epoch": 2.3693499784761083, + "grad_norm": 0.15988357365131378, + "learning_rate": 1.7949351369088526e-05, + "loss": 1.1724, + "step": 6364 + }, + { + "epoch": 2.3697222836267176, + "grad_norm": 0.16070933640003204, + "learning_rate": 1.7948613984753777e-05, + "loss": 1.1731, + "step": 6365 + }, + { + "epoch": 2.3700945887773264, + "grad_norm": 0.16724231839179993, + "learning_rate": 1.7947876483018498e-05, + "loss": 1.1779, + "step": 6366 + }, + { + "epoch": 2.3704668939279356, + "grad_norm": 0.16751320660114288, + "learning_rate": 1.7947138863893582e-05, + "loss": 1.1952, + "step": 6367 + }, + { + "epoch": 2.370839199078545, + "grad_norm": 0.16353389620780945, + "learning_rate": 1.7946401127389928e-05, + "loss": 1.1899, + "step": 6368 + }, + { + "epoch": 2.3712115042291537, + "grad_norm": 0.16663357615470886, + "learning_rate": 1.7945663273518423e-05, + "loss": 1.2004, + "step": 6369 + }, + { + "epoch": 2.371583809379763, + "grad_norm": 0.1695907711982727, + "learning_rate": 1.7944925302289972e-05, + "loss": 1.158, + "step": 6370 + }, + { + "epoch": 2.371956114530372, + "grad_norm": 0.16809728741645813, + "learning_rate": 1.794418721371547e-05, + "loss": 1.1689, + "step": 6371 + }, + { + "epoch": 2.372328419680981, + "grad_norm": 0.1606246680021286, + "learning_rate": 1.7943449007805824e-05, + "loss": 1.1615, + "step": 6372 + }, + { + "epoch": 2.3727007248315903, + "grad_norm": 0.1651766151189804, + "learning_rate": 1.7942710684571934e-05, + "loss": 1.183, + "step": 6373 + }, + { + "epoch": 2.373073029982199, + "grad_norm": 0.16688674688339233, + "learning_rate": 1.794197224402471e-05, + "loss": 1.1735, + "step": 6374 + }, + { + "epoch": 2.3734453351328084, + "grad_norm": 0.16931474208831787, + "learning_rate": 1.7941233686175052e-05, + "loss": 1.169, + "step": 6375 + }, + { + "epoch": 2.3738176402834172, + "grad_norm": 0.16354526579380035, + "learning_rate": 1.7940495011033866e-05, + "loss": 1.1828, + "step": 6376 + }, + { + "epoch": 2.3741899454340265, + "grad_norm": 0.16403120756149292, + "learning_rate": 1.7939756218612072e-05, + "loss": 1.1755, + "step": 6377 + }, + { + "epoch": 2.3745622505846353, + "grad_norm": 0.16923734545707703, + "learning_rate": 1.7939017308920575e-05, + "loss": 1.1704, + "step": 6378 + }, + { + "epoch": 2.3749345557352446, + "grad_norm": 0.16847112774848938, + "learning_rate": 1.793827828197029e-05, + "loss": 1.1739, + "step": 6379 + }, + { + "epoch": 2.3753068608858534, + "grad_norm": 0.17019587755203247, + "learning_rate": 1.7937539137772134e-05, + "loss": 1.1815, + "step": 6380 + }, + { + "epoch": 2.3756791660364627, + "grad_norm": 0.16363418102264404, + "learning_rate": 1.7936799876337022e-05, + "loss": 1.1707, + "step": 6381 + }, + { + "epoch": 2.376051471187072, + "grad_norm": 0.16532693803310394, + "learning_rate": 1.7936060497675875e-05, + "loss": 1.174, + "step": 6382 + }, + { + "epoch": 2.3764237763376808, + "grad_norm": 0.168051615357399, + "learning_rate": 1.793532100179961e-05, + "loss": 1.1609, + "step": 6383 + }, + { + "epoch": 2.37679608148829, + "grad_norm": 0.16365954279899597, + "learning_rate": 1.7934581388719158e-05, + "loss": 1.1779, + "step": 6384 + }, + { + "epoch": 2.377168386638899, + "grad_norm": 0.16921496391296387, + "learning_rate": 1.7933841658445432e-05, + "loss": 1.1967, + "step": 6385 + }, + { + "epoch": 2.377540691789508, + "grad_norm": 0.16775216162204742, + "learning_rate": 1.7933101810989363e-05, + "loss": 1.1698, + "step": 6386 + }, + { + "epoch": 2.377912996940117, + "grad_norm": 0.16097751259803772, + "learning_rate": 1.793236184636188e-05, + "loss": 1.1879, + "step": 6387 + }, + { + "epoch": 2.378285302090726, + "grad_norm": 0.1658683568239212, + "learning_rate": 1.7931621764573907e-05, + "loss": 1.1815, + "step": 6388 + }, + { + "epoch": 2.378657607241335, + "grad_norm": 0.16662012040615082, + "learning_rate": 1.793088156563638e-05, + "loss": 1.1767, + "step": 6389 + }, + { + "epoch": 2.3790299123919443, + "grad_norm": 0.16364309191703796, + "learning_rate": 1.7930141249560235e-05, + "loss": 1.1758, + "step": 6390 + }, + { + "epoch": 2.3794022175425535, + "grad_norm": 0.17829427123069763, + "learning_rate": 1.7929400816356394e-05, + "loss": 1.1832, + "step": 6391 + }, + { + "epoch": 2.3797745226931624, + "grad_norm": 0.16619321703910828, + "learning_rate": 1.7928660266035804e-05, + "loss": 1.1796, + "step": 6392 + }, + { + "epoch": 2.3801468278437716, + "grad_norm": 0.16665421426296234, + "learning_rate": 1.79279195986094e-05, + "loss": 1.1768, + "step": 6393 + }, + { + "epoch": 2.3805191329943804, + "grad_norm": 0.17309972643852234, + "learning_rate": 1.792717881408812e-05, + "loss": 1.1811, + "step": 6394 + }, + { + "epoch": 2.3808914381449897, + "grad_norm": 0.1681647002696991, + "learning_rate": 1.7926437912482905e-05, + "loss": 1.1738, + "step": 6395 + }, + { + "epoch": 2.3812637432955985, + "grad_norm": 0.17703039944171906, + "learning_rate": 1.79256968938047e-05, + "loss": 1.1759, + "step": 6396 + }, + { + "epoch": 2.381636048446208, + "grad_norm": 0.16433195769786835, + "learning_rate": 1.792495575806445e-05, + "loss": 1.1746, + "step": 6397 + }, + { + "epoch": 2.3820083535968166, + "grad_norm": 0.1649874895811081, + "learning_rate": 1.7924214505273102e-05, + "loss": 1.1785, + "step": 6398 + }, + { + "epoch": 2.382380658747426, + "grad_norm": 0.17650985717773438, + "learning_rate": 1.79234731354416e-05, + "loss": 1.1714, + "step": 6399 + }, + { + "epoch": 2.382752963898035, + "grad_norm": 0.19608263671398163, + "learning_rate": 1.7922731648580902e-05, + "loss": 1.1805, + "step": 6400 + }, + { + "epoch": 2.383125269048644, + "grad_norm": 0.18923339247703552, + "learning_rate": 1.7921990044701952e-05, + "loss": 1.1701, + "step": 6401 + }, + { + "epoch": 2.3834975741992532, + "grad_norm": 0.1707700490951538, + "learning_rate": 1.7921248323815703e-05, + "loss": 1.1829, + "step": 6402 + }, + { + "epoch": 2.383869879349862, + "grad_norm": 0.16345664858818054, + "learning_rate": 1.7920506485933117e-05, + "loss": 1.1881, + "step": 6403 + }, + { + "epoch": 2.3842421845004713, + "grad_norm": 0.166402205824852, + "learning_rate": 1.7919764531065147e-05, + "loss": 1.1603, + "step": 6404 + }, + { + "epoch": 2.38461448965108, + "grad_norm": 0.17079654335975647, + "learning_rate": 1.7919022459222754e-05, + "loss": 1.1882, + "step": 6405 + }, + { + "epoch": 2.3849867948016894, + "grad_norm": 0.16907550394535065, + "learning_rate": 1.791828027041689e-05, + "loss": 1.1921, + "step": 6406 + }, + { + "epoch": 2.3853590999522982, + "grad_norm": 0.15836122632026672, + "learning_rate": 1.791753796465853e-05, + "loss": 1.1894, + "step": 6407 + }, + { + "epoch": 2.3857314051029075, + "grad_norm": 0.1576787382364273, + "learning_rate": 1.791679554195863e-05, + "loss": 1.1631, + "step": 6408 + }, + { + "epoch": 2.3861037102535168, + "grad_norm": 0.1628400981426239, + "learning_rate": 1.7916053002328152e-05, + "loss": 1.1717, + "step": 6409 + }, + { + "epoch": 2.3864760154041256, + "grad_norm": 0.16520801186561584, + "learning_rate": 1.7915310345778072e-05, + "loss": 1.1745, + "step": 6410 + }, + { + "epoch": 2.386848320554735, + "grad_norm": 0.167642742395401, + "learning_rate": 1.7914567572319352e-05, + "loss": 1.1679, + "step": 6411 + }, + { + "epoch": 2.3872206257053437, + "grad_norm": 0.16473732888698578, + "learning_rate": 1.791382468196297e-05, + "loss": 1.17, + "step": 6412 + }, + { + "epoch": 2.387592930855953, + "grad_norm": 0.17128442227840424, + "learning_rate": 1.791308167471989e-05, + "loss": 1.1791, + "step": 6413 + }, + { + "epoch": 2.3879652360065617, + "grad_norm": 0.17826206982135773, + "learning_rate": 1.791233855060109e-05, + "loss": 1.1615, + "step": 6414 + }, + { + "epoch": 2.388337541157171, + "grad_norm": 0.24664366245269775, + "learning_rate": 1.7911595309617552e-05, + "loss": 1.1762, + "step": 6415 + }, + { + "epoch": 2.38870984630778, + "grad_norm": 0.2372390776872635, + "learning_rate": 1.7910851951780244e-05, + "loss": 1.1696, + "step": 6416 + }, + { + "epoch": 2.389082151458389, + "grad_norm": 0.18587949872016907, + "learning_rate": 1.791010847710015e-05, + "loss": 1.1784, + "step": 6417 + }, + { + "epoch": 2.3894544566089984, + "grad_norm": 0.16316647827625275, + "learning_rate": 1.7909364885588247e-05, + "loss": 1.1808, + "step": 6418 + }, + { + "epoch": 2.389826761759607, + "grad_norm": 0.19883513450622559, + "learning_rate": 1.7908621177255523e-05, + "loss": 1.1763, + "step": 6419 + }, + { + "epoch": 2.3901990669102164, + "grad_norm": 0.1844921112060547, + "learning_rate": 1.790787735211296e-05, + "loss": 1.1791, + "step": 6420 + }, + { + "epoch": 2.3905713720608253, + "grad_norm": 0.17219749093055725, + "learning_rate": 1.7907133410171548e-05, + "loss": 1.1735, + "step": 6421 + }, + { + "epoch": 2.3909436772114345, + "grad_norm": 0.16659492254257202, + "learning_rate": 1.790638935144227e-05, + "loss": 1.1776, + "step": 6422 + }, + { + "epoch": 2.3913159823620433, + "grad_norm": 0.1710551679134369, + "learning_rate": 1.7905645175936116e-05, + "loss": 1.18, + "step": 6423 + }, + { + "epoch": 2.3916882875126526, + "grad_norm": 0.17188216745853424, + "learning_rate": 1.7904900883664078e-05, + "loss": 1.1724, + "step": 6424 + }, + { + "epoch": 2.3920605926632614, + "grad_norm": 0.1644524782896042, + "learning_rate": 1.790415647463715e-05, + "loss": 1.1819, + "step": 6425 + }, + { + "epoch": 2.3924328978138707, + "grad_norm": 0.16760697960853577, + "learning_rate": 1.790341194886633e-05, + "loss": 1.1836, + "step": 6426 + }, + { + "epoch": 2.39280520296448, + "grad_norm": 0.16618818044662476, + "learning_rate": 1.790266730636261e-05, + "loss": 1.192, + "step": 6427 + }, + { + "epoch": 2.393177508115089, + "grad_norm": 0.17228281497955322, + "learning_rate": 1.790192254713699e-05, + "loss": 1.1891, + "step": 6428 + }, + { + "epoch": 2.393549813265698, + "grad_norm": 0.16444258391857147, + "learning_rate": 1.7901177671200472e-05, + "loss": 1.1802, + "step": 6429 + }, + { + "epoch": 2.393922118416307, + "grad_norm": 0.16610148549079895, + "learning_rate": 1.790043267856405e-05, + "loss": 1.1583, + "step": 6430 + }, + { + "epoch": 2.394294423566916, + "grad_norm": 0.16486294567584991, + "learning_rate": 1.789968756923874e-05, + "loss": 1.1835, + "step": 6431 + }, + { + "epoch": 2.394666728717525, + "grad_norm": 0.1681571900844574, + "learning_rate": 1.7898942343235535e-05, + "loss": 1.1903, + "step": 6432 + }, + { + "epoch": 2.395039033868134, + "grad_norm": 0.16249068081378937, + "learning_rate": 1.7898197000565448e-05, + "loss": 1.1848, + "step": 6433 + }, + { + "epoch": 2.395411339018743, + "grad_norm": 0.16723275184631348, + "learning_rate": 1.789745154123949e-05, + "loss": 1.1678, + "step": 6434 + }, + { + "epoch": 2.3957836441693523, + "grad_norm": 0.16536808013916016, + "learning_rate": 1.7896705965268668e-05, + "loss": 1.1744, + "step": 6435 + }, + { + "epoch": 2.3961559493199616, + "grad_norm": 0.16059263050556183, + "learning_rate": 1.7895960272663992e-05, + "loss": 1.1798, + "step": 6436 + }, + { + "epoch": 2.3965282544705704, + "grad_norm": 0.1607469618320465, + "learning_rate": 1.789521446343648e-05, + "loss": 1.1797, + "step": 6437 + }, + { + "epoch": 2.3969005596211796, + "grad_norm": 0.1656348556280136, + "learning_rate": 1.7894468537597146e-05, + "loss": 1.1781, + "step": 6438 + }, + { + "epoch": 2.3972728647717885, + "grad_norm": 0.16378162801265717, + "learning_rate": 1.7893722495157004e-05, + "loss": 1.1878, + "step": 6439 + }, + { + "epoch": 2.3976451699223977, + "grad_norm": 0.1643425077199936, + "learning_rate": 1.789297633612708e-05, + "loss": 1.1851, + "step": 6440 + }, + { + "epoch": 2.3980174750730066, + "grad_norm": 0.16220180690288544, + "learning_rate": 1.789223006051839e-05, + "loss": 1.1757, + "step": 6441 + }, + { + "epoch": 2.398389780223616, + "grad_norm": 0.16219714283943176, + "learning_rate": 1.7891483668341955e-05, + "loss": 1.1934, + "step": 6442 + }, + { + "epoch": 2.3987620853742246, + "grad_norm": 0.16055887937545776, + "learning_rate": 1.7890737159608803e-05, + "loss": 1.1712, + "step": 6443 + }, + { + "epoch": 2.399134390524834, + "grad_norm": 0.16199053823947906, + "learning_rate": 1.788999053432996e-05, + "loss": 1.1757, + "step": 6444 + }, + { + "epoch": 2.399506695675443, + "grad_norm": 0.1576344072818756, + "learning_rate": 1.7889243792516452e-05, + "loss": 1.1713, + "step": 6445 + }, + { + "epoch": 2.399879000826052, + "grad_norm": 0.17367145419120789, + "learning_rate": 1.7888496934179308e-05, + "loss": 1.1757, + "step": 6446 + }, + { + "epoch": 2.4002513059766613, + "grad_norm": 0.1603989452123642, + "learning_rate": 1.7887749959329555e-05, + "loss": 1.1654, + "step": 6447 + }, + { + "epoch": 2.40062361112727, + "grad_norm": 0.1615387499332428, + "learning_rate": 1.7887002867978234e-05, + "loss": 1.1664, + "step": 6448 + }, + { + "epoch": 2.4009959162778793, + "grad_norm": 0.1623525768518448, + "learning_rate": 1.7886255660136376e-05, + "loss": 1.1547, + "step": 6449 + }, + { + "epoch": 2.401368221428488, + "grad_norm": 0.16615155339241028, + "learning_rate": 1.7885508335815013e-05, + "loss": 1.1797, + "step": 6450 + }, + { + "epoch": 2.4017405265790974, + "grad_norm": 0.16319416463375092, + "learning_rate": 1.788476089502519e-05, + "loss": 1.1662, + "step": 6451 + }, + { + "epoch": 2.4021128317297062, + "grad_norm": 0.16049784421920776, + "learning_rate": 1.7884013337777944e-05, + "loss": 1.176, + "step": 6452 + }, + { + "epoch": 2.4024851368803155, + "grad_norm": 0.1557374745607376, + "learning_rate": 1.7883265664084317e-05, + "loss": 1.1815, + "step": 6453 + }, + { + "epoch": 2.4028574420309248, + "grad_norm": 0.17005103826522827, + "learning_rate": 1.7882517873955344e-05, + "loss": 1.1816, + "step": 6454 + }, + { + "epoch": 2.4032297471815336, + "grad_norm": 0.16482587158679962, + "learning_rate": 1.7881769967402083e-05, + "loss": 1.1879, + "step": 6455 + }, + { + "epoch": 2.403602052332143, + "grad_norm": 0.16888852417469025, + "learning_rate": 1.788102194443557e-05, + "loss": 1.1853, + "step": 6456 + }, + { + "epoch": 2.4039743574827517, + "grad_norm": 0.16261079907417297, + "learning_rate": 1.7880273805066864e-05, + "loss": 1.1675, + "step": 6457 + }, + { + "epoch": 2.404346662633361, + "grad_norm": 0.16388806700706482, + "learning_rate": 1.7879525549307004e-05, + "loss": 1.178, + "step": 6458 + }, + { + "epoch": 2.4047189677839698, + "grad_norm": 0.16773760318756104, + "learning_rate": 1.7878777177167047e-05, + "loss": 1.1691, + "step": 6459 + }, + { + "epoch": 2.405091272934579, + "grad_norm": 0.1617787927389145, + "learning_rate": 1.7878028688658044e-05, + "loss": 1.1715, + "step": 6460 + }, + { + "epoch": 2.405463578085188, + "grad_norm": 0.16519810259342194, + "learning_rate": 1.787728008379105e-05, + "loss": 1.189, + "step": 6461 + }, + { + "epoch": 2.405835883235797, + "grad_norm": 0.17165139317512512, + "learning_rate": 1.787653136257713e-05, + "loss": 1.1729, + "step": 6462 + }, + { + "epoch": 2.4062081883864064, + "grad_norm": 0.16290132701396942, + "learning_rate": 1.787578252502733e-05, + "loss": 1.1823, + "step": 6463 + }, + { + "epoch": 2.406580493537015, + "grad_norm": 0.16138114035129547, + "learning_rate": 1.7875033571152718e-05, + "loss": 1.1697, + "step": 6464 + }, + { + "epoch": 2.4069527986876245, + "grad_norm": 0.16568872332572937, + "learning_rate": 1.7874284500964357e-05, + "loss": 1.1734, + "step": 6465 + }, + { + "epoch": 2.4073251038382333, + "grad_norm": 0.1621939092874527, + "learning_rate": 1.7873535314473308e-05, + "loss": 1.1837, + "step": 6466 + }, + { + "epoch": 2.4076974089888425, + "grad_norm": 0.15826240181922913, + "learning_rate": 1.787278601169063e-05, + "loss": 1.1699, + "step": 6467 + }, + { + "epoch": 2.4080697141394514, + "grad_norm": 0.17099608480930328, + "learning_rate": 1.7872036592627404e-05, + "loss": 1.189, + "step": 6468 + }, + { + "epoch": 2.4084420192900606, + "grad_norm": 0.16510766744613647, + "learning_rate": 1.7871287057294688e-05, + "loss": 1.1759, + "step": 6469 + }, + { + "epoch": 2.4088143244406695, + "grad_norm": 0.1691225916147232, + "learning_rate": 1.7870537405703556e-05, + "loss": 1.1761, + "step": 6470 + }, + { + "epoch": 2.4091866295912787, + "grad_norm": 0.1651260256767273, + "learning_rate": 1.7869787637865084e-05, + "loss": 1.1737, + "step": 6471 + }, + { + "epoch": 2.409558934741888, + "grad_norm": 0.16575190424919128, + "learning_rate": 1.7869037753790343e-05, + "loss": 1.1768, + "step": 6472 + }, + { + "epoch": 2.409931239892497, + "grad_norm": 0.17237798869609833, + "learning_rate": 1.78682877534904e-05, + "loss": 1.1944, + "step": 6473 + }, + { + "epoch": 2.410303545043106, + "grad_norm": 0.15723305940628052, + "learning_rate": 1.7867537636976348e-05, + "loss": 1.1725, + "step": 6474 + }, + { + "epoch": 2.410675850193715, + "grad_norm": 0.16039879620075226, + "learning_rate": 1.7866787404259255e-05, + "loss": 1.1674, + "step": 6475 + }, + { + "epoch": 2.411048155344324, + "grad_norm": 0.1647602766752243, + "learning_rate": 1.786603705535021e-05, + "loss": 1.1616, + "step": 6476 + }, + { + "epoch": 2.411420460494933, + "grad_norm": 0.16092441976070404, + "learning_rate": 1.7865286590260288e-05, + "loss": 1.156, + "step": 6477 + }, + { + "epoch": 2.4117927656455422, + "grad_norm": 0.1571139693260193, + "learning_rate": 1.7864536009000575e-05, + "loss": 1.1713, + "step": 6478 + }, + { + "epoch": 2.412165070796151, + "grad_norm": 0.16552495956420898, + "learning_rate": 1.7863785311582163e-05, + "loss": 1.1648, + "step": 6479 + }, + { + "epoch": 2.4125373759467603, + "grad_norm": 0.16000515222549438, + "learning_rate": 1.7863034498016133e-05, + "loss": 1.1861, + "step": 6480 + }, + { + "epoch": 2.4129096810973696, + "grad_norm": 0.16232497990131378, + "learning_rate": 1.7862283568313578e-05, + "loss": 1.1737, + "step": 6481 + }, + { + "epoch": 2.4132819862479784, + "grad_norm": 0.1606409251689911, + "learning_rate": 1.7861532522485588e-05, + "loss": 1.1784, + "step": 6482 + }, + { + "epoch": 2.4136542913985877, + "grad_norm": 0.16612519323825836, + "learning_rate": 1.7860781360543255e-05, + "loss": 1.1825, + "step": 6483 + }, + { + "epoch": 2.4140265965491965, + "grad_norm": 0.16565221548080444, + "learning_rate": 1.7860030082497676e-05, + "loss": 1.1889, + "step": 6484 + }, + { + "epoch": 2.4143989016998058, + "grad_norm": 0.16412298381328583, + "learning_rate": 1.7859278688359946e-05, + "loss": 1.171, + "step": 6485 + }, + { + "epoch": 2.414771206850415, + "grad_norm": 0.1599871665239334, + "learning_rate": 1.7858527178141162e-05, + "loss": 1.1857, + "step": 6486 + }, + { + "epoch": 2.415143512001024, + "grad_norm": 0.16030682623386383, + "learning_rate": 1.7857775551852426e-05, + "loss": 1.1743, + "step": 6487 + }, + { + "epoch": 2.4155158171516327, + "grad_norm": 0.16047127544879913, + "learning_rate": 1.785702380950484e-05, + "loss": 1.1954, + "step": 6488 + }, + { + "epoch": 2.415888122302242, + "grad_norm": 0.16832852363586426, + "learning_rate": 1.78562719511095e-05, + "loss": 1.1862, + "step": 6489 + }, + { + "epoch": 2.416260427452851, + "grad_norm": 0.163087859749794, + "learning_rate": 1.785551997667752e-05, + "loss": 1.1849, + "step": 6490 + }, + { + "epoch": 2.41663273260346, + "grad_norm": 0.16009113192558289, + "learning_rate": 1.7854767886220002e-05, + "loss": 1.1717, + "step": 6491 + }, + { + "epoch": 2.4170050377540693, + "grad_norm": 0.1627979576587677, + "learning_rate": 1.7854015679748053e-05, + "loss": 1.1865, + "step": 6492 + }, + { + "epoch": 2.417377342904678, + "grad_norm": 0.16498367488384247, + "learning_rate": 1.785326335727279e-05, + "loss": 1.1613, + "step": 6493 + }, + { + "epoch": 2.4177496480552874, + "grad_norm": 0.16802792251110077, + "learning_rate": 1.7852510918805318e-05, + "loss": 1.1754, + "step": 6494 + }, + { + "epoch": 2.4181219532058966, + "grad_norm": 0.16168522834777832, + "learning_rate": 1.785175836435675e-05, + "loss": 1.1908, + "step": 6495 + }, + { + "epoch": 2.4184942583565054, + "grad_norm": 0.16734105348587036, + "learning_rate": 1.7851005693938206e-05, + "loss": 1.1671, + "step": 6496 + }, + { + "epoch": 2.4188665635071147, + "grad_norm": 0.1836169809103012, + "learning_rate": 1.78502529075608e-05, + "loss": 1.1681, + "step": 6497 + }, + { + "epoch": 2.4192388686577235, + "grad_norm": 0.20119720697402954, + "learning_rate": 1.7849500005235652e-05, + "loss": 1.1652, + "step": 6498 + }, + { + "epoch": 2.419611173808333, + "grad_norm": 0.19571252167224884, + "learning_rate": 1.7848746986973883e-05, + "loss": 1.163, + "step": 6499 + }, + { + "epoch": 2.4199834789589416, + "grad_norm": 0.19061268866062164, + "learning_rate": 1.7847993852786612e-05, + "loss": 1.1809, + "step": 6500 + }, + { + "epoch": 2.4199834789589416, + "eval_loss": 1.3013861179351807, + "eval_runtime": 17.1871, + "eval_samples_per_second": 100.889, + "eval_steps_per_second": 5.062, + "step": 6500 + }, + { + "epoch": 2.420355784109551, + "grad_norm": 0.17919202148914337, + "learning_rate": 1.7847240602684962e-05, + "loss": 1.1778, + "step": 6501 + }, + { + "epoch": 2.4207280892601597, + "grad_norm": 0.1730869710445404, + "learning_rate": 1.7846487236680064e-05, + "loss": 1.1851, + "step": 6502 + }, + { + "epoch": 2.421100394410769, + "grad_norm": 0.1636027991771698, + "learning_rate": 1.784573375478304e-05, + "loss": 1.1806, + "step": 6503 + }, + { + "epoch": 2.4214726995613782, + "grad_norm": 0.16488824784755707, + "learning_rate": 1.7844980157005022e-05, + "loss": 1.1554, + "step": 6504 + }, + { + "epoch": 2.421845004711987, + "grad_norm": 0.16549324989318848, + "learning_rate": 1.784422644335714e-05, + "loss": 1.1765, + "step": 6505 + }, + { + "epoch": 2.4222173098625963, + "grad_norm": 0.1688234508037567, + "learning_rate": 1.7843472613850523e-05, + "loss": 1.1575, + "step": 6506 + }, + { + "epoch": 2.422589615013205, + "grad_norm": 0.17091526091098785, + "learning_rate": 1.7842718668496312e-05, + "loss": 1.1692, + "step": 6507 + }, + { + "epoch": 2.4229619201638144, + "grad_norm": 0.1648736596107483, + "learning_rate": 1.7841964607305636e-05, + "loss": 1.1882, + "step": 6508 + }, + { + "epoch": 2.423334225314423, + "grad_norm": 0.18578572571277618, + "learning_rate": 1.7841210430289636e-05, + "loss": 1.1695, + "step": 6509 + }, + { + "epoch": 2.4237065304650325, + "grad_norm": 0.1957530379295349, + "learning_rate": 1.7840456137459447e-05, + "loss": 1.1688, + "step": 6510 + }, + { + "epoch": 2.4240788356156413, + "grad_norm": 0.16478918492794037, + "learning_rate": 1.7839701728826214e-05, + "loss": 1.1787, + "step": 6511 + }, + { + "epoch": 2.4244511407662506, + "grad_norm": 0.2944774329662323, + "learning_rate": 1.7838947204401083e-05, + "loss": 1.184, + "step": 6512 + }, + { + "epoch": 2.42482344591686, + "grad_norm": 0.18149353563785553, + "learning_rate": 1.7838192564195188e-05, + "loss": 1.1683, + "step": 6513 + }, + { + "epoch": 2.4251957510674687, + "grad_norm": 0.18618106842041016, + "learning_rate": 1.7837437808219683e-05, + "loss": 1.1592, + "step": 6514 + }, + { + "epoch": 2.425568056218078, + "grad_norm": 0.1710144430398941, + "learning_rate": 1.7836682936485714e-05, + "loss": 1.1993, + "step": 6515 + }, + { + "epoch": 2.4259403613686867, + "grad_norm": 0.17375528812408447, + "learning_rate": 1.783592794900443e-05, + "loss": 1.1793, + "step": 6516 + }, + { + "epoch": 2.426312666519296, + "grad_norm": 0.1787467747926712, + "learning_rate": 1.783517284578698e-05, + "loss": 1.1852, + "step": 6517 + }, + { + "epoch": 2.426684971669905, + "grad_norm": 0.17819075286388397, + "learning_rate": 1.783441762684452e-05, + "loss": 1.1715, + "step": 6518 + }, + { + "epoch": 2.427057276820514, + "grad_norm": 0.16626736521720886, + "learning_rate": 1.78336622921882e-05, + "loss": 1.1769, + "step": 6519 + }, + { + "epoch": 2.427429581971123, + "grad_norm": 0.1616727113723755, + "learning_rate": 1.7832906841829187e-05, + "loss": 1.1717, + "step": 6520 + }, + { + "epoch": 2.427801887121732, + "grad_norm": 0.1669609099626541, + "learning_rate": 1.7832151275778625e-05, + "loss": 1.1717, + "step": 6521 + }, + { + "epoch": 2.4281741922723414, + "grad_norm": 0.16761192679405212, + "learning_rate": 1.7831395594047682e-05, + "loss": 1.1763, + "step": 6522 + }, + { + "epoch": 2.4285464974229503, + "grad_norm": 0.1701916605234146, + "learning_rate": 1.783063979664752e-05, + "loss": 1.1905, + "step": 6523 + }, + { + "epoch": 2.4289188025735595, + "grad_norm": 0.16727663576602936, + "learning_rate": 1.7829883883589297e-05, + "loss": 1.1913, + "step": 6524 + }, + { + "epoch": 2.4292911077241683, + "grad_norm": 0.16369487345218658, + "learning_rate": 1.782912785488418e-05, + "loss": 1.1616, + "step": 6525 + }, + { + "epoch": 2.4296634128747776, + "grad_norm": 0.1752278357744217, + "learning_rate": 1.7828371710543336e-05, + "loss": 1.1921, + "step": 6526 + }, + { + "epoch": 2.4300357180253864, + "grad_norm": 0.16097818315029144, + "learning_rate": 1.7827615450577936e-05, + "loss": 1.1998, + "step": 6527 + }, + { + "epoch": 2.4304080231759957, + "grad_norm": 0.16219815611839294, + "learning_rate": 1.7826859074999145e-05, + "loss": 1.1806, + "step": 6528 + }, + { + "epoch": 2.4307803283266045, + "grad_norm": 0.16351665556430817, + "learning_rate": 1.7826102583818134e-05, + "loss": 1.1762, + "step": 6529 + }, + { + "epoch": 2.4311526334772138, + "grad_norm": 0.16075001657009125, + "learning_rate": 1.782534597704608e-05, + "loss": 1.1809, + "step": 6530 + }, + { + "epoch": 2.431524938627823, + "grad_norm": 0.19722290337085724, + "learning_rate": 1.7824589254694163e-05, + "loss": 1.1829, + "step": 6531 + }, + { + "epoch": 2.431897243778432, + "grad_norm": 0.1670161485671997, + "learning_rate": 1.782383241677355e-05, + "loss": 1.1921, + "step": 6532 + }, + { + "epoch": 2.432269548929041, + "grad_norm": 0.1666540503501892, + "learning_rate": 1.782307546329542e-05, + "loss": 1.1845, + "step": 6533 + }, + { + "epoch": 2.43264185407965, + "grad_norm": 0.16255459189414978, + "learning_rate": 1.782231839427096e-05, + "loss": 1.1682, + "step": 6534 + }, + { + "epoch": 2.433014159230259, + "grad_norm": 0.16188620030879974, + "learning_rate": 1.782156120971135e-05, + "loss": 1.1696, + "step": 6535 + }, + { + "epoch": 2.433386464380868, + "grad_norm": 0.17455708980560303, + "learning_rate": 1.7820803909627766e-05, + "loss": 1.1804, + "step": 6536 + }, + { + "epoch": 2.4337587695314773, + "grad_norm": 0.16326938569545746, + "learning_rate": 1.7820046494031405e-05, + "loss": 1.1794, + "step": 6537 + }, + { + "epoch": 2.434131074682086, + "grad_norm": 0.15706990659236908, + "learning_rate": 1.7819288962933442e-05, + "loss": 1.1619, + "step": 6538 + }, + { + "epoch": 2.4345033798326954, + "grad_norm": 0.16634133458137512, + "learning_rate": 1.7818531316345078e-05, + "loss": 1.1824, + "step": 6539 + }, + { + "epoch": 2.4348756849833046, + "grad_norm": 0.16253423690795898, + "learning_rate": 1.7817773554277493e-05, + "loss": 1.1833, + "step": 6540 + }, + { + "epoch": 2.4352479901339135, + "grad_norm": 0.16380086541175842, + "learning_rate": 1.7817015676741883e-05, + "loss": 1.1861, + "step": 6541 + }, + { + "epoch": 2.4356202952845227, + "grad_norm": 0.16374869644641876, + "learning_rate": 1.7816257683749444e-05, + "loss": 1.1758, + "step": 6542 + }, + { + "epoch": 2.4359926004351315, + "grad_norm": 0.1633332371711731, + "learning_rate": 1.7815499575311367e-05, + "loss": 1.1984, + "step": 6543 + }, + { + "epoch": 2.436364905585741, + "grad_norm": 0.16350342333316803, + "learning_rate": 1.7814741351438855e-05, + "loss": 1.1672, + "step": 6544 + }, + { + "epoch": 2.4367372107363496, + "grad_norm": 0.16379904747009277, + "learning_rate": 1.7813983012143104e-05, + "loss": 1.1895, + "step": 6545 + }, + { + "epoch": 2.437109515886959, + "grad_norm": 0.16727176308631897, + "learning_rate": 1.7813224557435313e-05, + "loss": 1.1741, + "step": 6546 + }, + { + "epoch": 2.4374818210375677, + "grad_norm": 0.1602688729763031, + "learning_rate": 1.7812465987326682e-05, + "loss": 1.1816, + "step": 6547 + }, + { + "epoch": 2.437854126188177, + "grad_norm": 0.16404597461223602, + "learning_rate": 1.7811707301828424e-05, + "loss": 1.1875, + "step": 6548 + }, + { + "epoch": 2.4382264313387862, + "grad_norm": 0.16207285225391388, + "learning_rate": 1.7810948500951738e-05, + "loss": 1.1627, + "step": 6549 + }, + { + "epoch": 2.438598736489395, + "grad_norm": 0.168754443526268, + "learning_rate": 1.7810189584707834e-05, + "loss": 1.1778, + "step": 6550 + }, + { + "epoch": 2.4389710416400043, + "grad_norm": 0.1652376502752304, + "learning_rate": 1.7809430553107915e-05, + "loss": 1.1854, + "step": 6551 + }, + { + "epoch": 2.439343346790613, + "grad_norm": 0.16541428864002228, + "learning_rate": 1.78086714061632e-05, + "loss": 1.1917, + "step": 6552 + }, + { + "epoch": 2.4397156519412224, + "grad_norm": 0.1626761257648468, + "learning_rate": 1.78079121438849e-05, + "loss": 1.1666, + "step": 6553 + }, + { + "epoch": 2.4400879570918312, + "grad_norm": 0.164178267121315, + "learning_rate": 1.7807152766284222e-05, + "loss": 1.1834, + "step": 6554 + }, + { + "epoch": 2.4404602622424405, + "grad_norm": 0.15936389565467834, + "learning_rate": 1.7806393273372396e-05, + "loss": 1.1668, + "step": 6555 + }, + { + "epoch": 2.4408325673930493, + "grad_norm": 0.16379861533641815, + "learning_rate": 1.7805633665160623e-05, + "loss": 1.1796, + "step": 6556 + }, + { + "epoch": 2.4412048725436586, + "grad_norm": 0.165420264005661, + "learning_rate": 1.7804873941660137e-05, + "loss": 1.1771, + "step": 6557 + }, + { + "epoch": 2.441577177694268, + "grad_norm": 0.16824427247047424, + "learning_rate": 1.780411410288215e-05, + "loss": 1.1844, + "step": 6558 + }, + { + "epoch": 2.4419494828448767, + "grad_norm": 0.16532348096370697, + "learning_rate": 1.780335414883789e-05, + "loss": 1.1887, + "step": 6559 + }, + { + "epoch": 2.442321787995486, + "grad_norm": 0.16297706961631775, + "learning_rate": 1.7802594079538574e-05, + "loss": 1.1702, + "step": 6560 + }, + { + "epoch": 2.4426940931460948, + "grad_norm": 0.16439802944660187, + "learning_rate": 1.7801833894995436e-05, + "loss": 1.1802, + "step": 6561 + }, + { + "epoch": 2.443066398296704, + "grad_norm": 0.16698960959911346, + "learning_rate": 1.7801073595219702e-05, + "loss": 1.183, + "step": 6562 + }, + { + "epoch": 2.443438703447313, + "grad_norm": 0.17252756655216217, + "learning_rate": 1.78003131802226e-05, + "loss": 1.187, + "step": 6563 + }, + { + "epoch": 2.443811008597922, + "grad_norm": 0.16879017651081085, + "learning_rate": 1.779955265001536e-05, + "loss": 1.1708, + "step": 6564 + }, + { + "epoch": 2.444183313748531, + "grad_norm": 0.16200561821460724, + "learning_rate": 1.779879200460922e-05, + "loss": 1.173, + "step": 6565 + }, + { + "epoch": 2.44455561889914, + "grad_norm": 0.16437043249607086, + "learning_rate": 1.7798031244015406e-05, + "loss": 1.1693, + "step": 6566 + }, + { + "epoch": 2.4449279240497495, + "grad_norm": 0.17090344429016113, + "learning_rate": 1.7797270368245166e-05, + "loss": 1.1667, + "step": 6567 + }, + { + "epoch": 2.4453002292003583, + "grad_norm": 0.1742093712091446, + "learning_rate": 1.7796509377309728e-05, + "loss": 1.1932, + "step": 6568 + }, + { + "epoch": 2.4456725343509675, + "grad_norm": 0.16742129623889923, + "learning_rate": 1.7795748271220337e-05, + "loss": 1.1628, + "step": 6569 + }, + { + "epoch": 2.4460448395015764, + "grad_norm": 0.16507847607135773, + "learning_rate": 1.779498704998823e-05, + "loss": 1.1942, + "step": 6570 + }, + { + "epoch": 2.4464171446521856, + "grad_norm": 0.16421516239643097, + "learning_rate": 1.7794225713624663e-05, + "loss": 1.1746, + "step": 6571 + }, + { + "epoch": 2.4467894498027944, + "grad_norm": 0.17189864814281464, + "learning_rate": 1.7793464262140864e-05, + "loss": 1.1843, + "step": 6572 + }, + { + "epoch": 2.4471617549534037, + "grad_norm": 0.16595551371574402, + "learning_rate": 1.7792702695548086e-05, + "loss": 1.187, + "step": 6573 + }, + { + "epoch": 2.4475340601040125, + "grad_norm": 0.16488155722618103, + "learning_rate": 1.779194101385758e-05, + "loss": 1.1726, + "step": 6574 + }, + { + "epoch": 2.447906365254622, + "grad_norm": 0.164886474609375, + "learning_rate": 1.7791179217080598e-05, + "loss": 1.1745, + "step": 6575 + }, + { + "epoch": 2.448278670405231, + "grad_norm": 0.17774739861488342, + "learning_rate": 1.779041730522838e-05, + "loss": 1.1738, + "step": 6576 + }, + { + "epoch": 2.44865097555584, + "grad_norm": 0.1659976691007614, + "learning_rate": 1.7789655278312198e-05, + "loss": 1.1803, + "step": 6577 + }, + { + "epoch": 2.449023280706449, + "grad_norm": 0.16639791429042816, + "learning_rate": 1.7788893136343288e-05, + "loss": 1.1628, + "step": 6578 + }, + { + "epoch": 2.449395585857058, + "grad_norm": 0.16745202243328094, + "learning_rate": 1.7788130879332918e-05, + "loss": 1.1697, + "step": 6579 + }, + { + "epoch": 2.4497678910076672, + "grad_norm": 0.16819866001605988, + "learning_rate": 1.7787368507292343e-05, + "loss": 1.1783, + "step": 6580 + }, + { + "epoch": 2.450140196158276, + "grad_norm": 0.16538161039352417, + "learning_rate": 1.7786606020232825e-05, + "loss": 1.1736, + "step": 6581 + }, + { + "epoch": 2.4505125013088853, + "grad_norm": 0.1632433533668518, + "learning_rate": 1.7785843418165624e-05, + "loss": 1.1779, + "step": 6582 + }, + { + "epoch": 2.450884806459494, + "grad_norm": 0.1659589260816574, + "learning_rate": 1.7785080701102003e-05, + "loss": 1.195, + "step": 6583 + }, + { + "epoch": 2.4512571116101034, + "grad_norm": 0.16399775445461273, + "learning_rate": 1.778431786905323e-05, + "loss": 1.1767, + "step": 6584 + }, + { + "epoch": 2.4516294167607127, + "grad_norm": 0.1630357950925827, + "learning_rate": 1.778355492203057e-05, + "loss": 1.1817, + "step": 6585 + }, + { + "epoch": 2.4520017219113215, + "grad_norm": 0.15688146650791168, + "learning_rate": 1.778279186004529e-05, + "loss": 1.1784, + "step": 6586 + }, + { + "epoch": 2.4523740270619308, + "grad_norm": 0.15879155695438385, + "learning_rate": 1.7782028683108667e-05, + "loss": 1.1713, + "step": 6587 + }, + { + "epoch": 2.4527463322125396, + "grad_norm": 0.16145865619182587, + "learning_rate": 1.7781265391231968e-05, + "loss": 1.186, + "step": 6588 + }, + { + "epoch": 2.453118637363149, + "grad_norm": 0.1632131040096283, + "learning_rate": 1.7780501984426465e-05, + "loss": 1.1699, + "step": 6589 + }, + { + "epoch": 2.4534909425137577, + "grad_norm": 0.15937484800815582, + "learning_rate": 1.777973846270344e-05, + "loss": 1.1633, + "step": 6590 + }, + { + "epoch": 2.453863247664367, + "grad_norm": 0.16633634269237518, + "learning_rate": 1.7778974826074163e-05, + "loss": 1.1933, + "step": 6591 + }, + { + "epoch": 2.4542355528149757, + "grad_norm": 0.1646929830312729, + "learning_rate": 1.7778211074549916e-05, + "loss": 1.1823, + "step": 6592 + }, + { + "epoch": 2.454607857965585, + "grad_norm": 0.1606590747833252, + "learning_rate": 1.777744720814198e-05, + "loss": 1.1797, + "step": 6593 + }, + { + "epoch": 2.4549801631161943, + "grad_norm": 0.16645345091819763, + "learning_rate": 1.7776683226861636e-05, + "loss": 1.1663, + "step": 6594 + }, + { + "epoch": 2.455352468266803, + "grad_norm": 0.16911546885967255, + "learning_rate": 1.777591913072017e-05, + "loss": 1.1717, + "step": 6595 + }, + { + "epoch": 2.4557247734174124, + "grad_norm": 0.16527560353279114, + "learning_rate": 1.777515491972887e-05, + "loss": 1.1854, + "step": 6596 + }, + { + "epoch": 2.456097078568021, + "grad_norm": 0.16510476171970367, + "learning_rate": 1.7774390593899014e-05, + "loss": 1.1735, + "step": 6597 + }, + { + "epoch": 2.4564693837186304, + "grad_norm": 0.16441304981708527, + "learning_rate": 1.7773626153241897e-05, + "loss": 1.1721, + "step": 6598 + }, + { + "epoch": 2.4568416888692393, + "grad_norm": 0.163100928068161, + "learning_rate": 1.7772861597768814e-05, + "loss": 1.1774, + "step": 6599 + }, + { + "epoch": 2.4572139940198485, + "grad_norm": 0.16607718169689178, + "learning_rate": 1.777209692749105e-05, + "loss": 1.171, + "step": 6600 + }, + { + "epoch": 2.4575862991704573, + "grad_norm": 0.1616518199443817, + "learning_rate": 1.77713321424199e-05, + "loss": 1.1722, + "step": 6601 + }, + { + "epoch": 2.4579586043210666, + "grad_norm": 0.1628957986831665, + "learning_rate": 1.7770567242566667e-05, + "loss": 1.1708, + "step": 6602 + }, + { + "epoch": 2.458330909471676, + "grad_norm": 0.1651378571987152, + "learning_rate": 1.776980222794264e-05, + "loss": 1.1775, + "step": 6603 + }, + { + "epoch": 2.4587032146222847, + "grad_norm": 0.16311919689178467, + "learning_rate": 1.7769037098559124e-05, + "loss": 1.1746, + "step": 6604 + }, + { + "epoch": 2.459075519772894, + "grad_norm": 0.16620442271232605, + "learning_rate": 1.7768271854427417e-05, + "loss": 1.177, + "step": 6605 + }, + { + "epoch": 2.459447824923503, + "grad_norm": 0.16138948500156403, + "learning_rate": 1.776750649555882e-05, + "loss": 1.1594, + "step": 6606 + }, + { + "epoch": 2.459820130074112, + "grad_norm": 0.16413603723049164, + "learning_rate": 1.776674102196464e-05, + "loss": 1.1901, + "step": 6607 + }, + { + "epoch": 2.4601924352247213, + "grad_norm": 0.1599273979663849, + "learning_rate": 1.7765975433656187e-05, + "loss": 1.1678, + "step": 6608 + }, + { + "epoch": 2.46056474037533, + "grad_norm": 0.15869589149951935, + "learning_rate": 1.776520973064476e-05, + "loss": 1.1647, + "step": 6609 + }, + { + "epoch": 2.460937045525939, + "grad_norm": 0.1629563570022583, + "learning_rate": 1.7764443912941675e-05, + "loss": 1.1711, + "step": 6610 + }, + { + "epoch": 2.461309350676548, + "grad_norm": 0.1599353700876236, + "learning_rate": 1.776367798055824e-05, + "loss": 1.1772, + "step": 6611 + }, + { + "epoch": 2.4616816558271575, + "grad_norm": 0.16407759487628937, + "learning_rate": 1.7762911933505767e-05, + "loss": 1.1826, + "step": 6612 + }, + { + "epoch": 2.4620539609777663, + "grad_norm": 0.1610613763332367, + "learning_rate": 1.776214577179557e-05, + "loss": 1.1704, + "step": 6613 + }, + { + "epoch": 2.4624262661283756, + "grad_norm": 0.16347752511501312, + "learning_rate": 1.776137949543897e-05, + "loss": 1.1712, + "step": 6614 + }, + { + "epoch": 2.4627985712789844, + "grad_norm": 0.15655462443828583, + "learning_rate": 1.7760613104447283e-05, + "loss": 1.1515, + "step": 6615 + }, + { + "epoch": 2.4631708764295936, + "grad_norm": 0.16429446637630463, + "learning_rate": 1.7759846598831827e-05, + "loss": 1.1793, + "step": 6616 + }, + { + "epoch": 2.463543181580203, + "grad_norm": 0.16493436694145203, + "learning_rate": 1.775907997860392e-05, + "loss": 1.1681, + "step": 6617 + }, + { + "epoch": 2.4639154867308117, + "grad_norm": 0.1644970327615738, + "learning_rate": 1.775831324377489e-05, + "loss": 1.1819, + "step": 6618 + }, + { + "epoch": 2.464287791881421, + "grad_norm": 0.16684550046920776, + "learning_rate": 1.7757546394356063e-05, + "loss": 1.1864, + "step": 6619 + }, + { + "epoch": 2.46466009703203, + "grad_norm": 0.16069693863391876, + "learning_rate": 1.775677943035876e-05, + "loss": 1.1721, + "step": 6620 + }, + { + "epoch": 2.465032402182639, + "grad_norm": 0.1637074053287506, + "learning_rate": 1.7756012351794315e-05, + "loss": 1.1697, + "step": 6621 + }, + { + "epoch": 2.465404707333248, + "grad_norm": 0.1612214297056198, + "learning_rate": 1.7755245158674054e-05, + "loss": 1.1658, + "step": 6622 + }, + { + "epoch": 2.465777012483857, + "grad_norm": 0.16873739659786224, + "learning_rate": 1.7754477851009307e-05, + "loss": 1.1943, + "step": 6623 + }, + { + "epoch": 2.466149317634466, + "grad_norm": 0.16475237905979156, + "learning_rate": 1.775371042881141e-05, + "loss": 1.1857, + "step": 6624 + }, + { + "epoch": 2.4665216227850753, + "grad_norm": 0.16475136578083038, + "learning_rate": 1.7752942892091694e-05, + "loss": 1.1782, + "step": 6625 + }, + { + "epoch": 2.4668939279356845, + "grad_norm": 0.16575996577739716, + "learning_rate": 1.7752175240861497e-05, + "loss": 1.1794, + "step": 6626 + }, + { + "epoch": 2.4672662330862933, + "grad_norm": 0.16450487077236176, + "learning_rate": 1.7751407475132164e-05, + "loss": 1.1768, + "step": 6627 + }, + { + "epoch": 2.4676385382369026, + "grad_norm": 0.1675347089767456, + "learning_rate": 1.7750639594915026e-05, + "loss": 1.1753, + "step": 6628 + }, + { + "epoch": 2.4680108433875114, + "grad_norm": 0.16457484662532806, + "learning_rate": 1.7749871600221426e-05, + "loss": 1.1841, + "step": 6629 + }, + { + "epoch": 2.4683831485381207, + "grad_norm": 0.17486611008644104, + "learning_rate": 1.774910349106271e-05, + "loss": 1.1963, + "step": 6630 + }, + { + "epoch": 2.4687554536887295, + "grad_norm": 0.1780180037021637, + "learning_rate": 1.774833526745022e-05, + "loss": 1.1998, + "step": 6631 + }, + { + "epoch": 2.4691277588393388, + "grad_norm": 0.168752983212471, + "learning_rate": 1.7747566929395307e-05, + "loss": 1.1728, + "step": 6632 + }, + { + "epoch": 2.4695000639899476, + "grad_norm": 0.1614418923854828, + "learning_rate": 1.7746798476909316e-05, + "loss": 1.1632, + "step": 6633 + }, + { + "epoch": 2.469872369140557, + "grad_norm": 0.16157226264476776, + "learning_rate": 1.7746029910003598e-05, + "loss": 1.1502, + "step": 6634 + }, + { + "epoch": 2.470244674291166, + "grad_norm": 0.16318723559379578, + "learning_rate": 1.7745261228689505e-05, + "loss": 1.1645, + "step": 6635 + }, + { + "epoch": 2.470616979441775, + "grad_norm": 0.17089669406414032, + "learning_rate": 1.7744492432978385e-05, + "loss": 1.1809, + "step": 6636 + }, + { + "epoch": 2.470989284592384, + "grad_norm": 0.1579371988773346, + "learning_rate": 1.7743723522881604e-05, + "loss": 1.1742, + "step": 6637 + }, + { + "epoch": 2.471361589742993, + "grad_norm": 0.16182205080986023, + "learning_rate": 1.774295449841051e-05, + "loss": 1.1868, + "step": 6638 + }, + { + "epoch": 2.4717338948936023, + "grad_norm": 0.16928228735923767, + "learning_rate": 1.7742185359576464e-05, + "loss": 1.1915, + "step": 6639 + }, + { + "epoch": 2.472106200044211, + "grad_norm": 0.15761636197566986, + "learning_rate": 1.7741416106390828e-05, + "loss": 1.1569, + "step": 6640 + }, + { + "epoch": 2.4724785051948204, + "grad_norm": 0.1611185520887375, + "learning_rate": 1.7740646738864956e-05, + "loss": 1.1806, + "step": 6641 + }, + { + "epoch": 2.472850810345429, + "grad_norm": 0.16303184628486633, + "learning_rate": 1.7739877257010226e-05, + "loss": 1.1706, + "step": 6642 + }, + { + "epoch": 2.4732231154960385, + "grad_norm": 0.1614006906747818, + "learning_rate": 1.7739107660837985e-05, + "loss": 1.1721, + "step": 6643 + }, + { + "epoch": 2.4735954206466477, + "grad_norm": 0.1621858775615692, + "learning_rate": 1.7738337950359617e-05, + "loss": 1.1672, + "step": 6644 + }, + { + "epoch": 2.4739677257972565, + "grad_norm": 0.15944872796535492, + "learning_rate": 1.7737568125586482e-05, + "loss": 1.1675, + "step": 6645 + }, + { + "epoch": 2.474340030947866, + "grad_norm": 0.16335934400558472, + "learning_rate": 1.7736798186529947e-05, + "loss": 1.1747, + "step": 6646 + }, + { + "epoch": 2.4747123360984746, + "grad_norm": 0.16249805688858032, + "learning_rate": 1.7736028133201394e-05, + "loss": 1.1767, + "step": 6647 + }, + { + "epoch": 2.475084641249084, + "grad_norm": 0.16741207242012024, + "learning_rate": 1.7735257965612188e-05, + "loss": 1.1809, + "step": 6648 + }, + { + "epoch": 2.4754569463996927, + "grad_norm": 0.167801633477211, + "learning_rate": 1.773448768377371e-05, + "loss": 1.174, + "step": 6649 + }, + { + "epoch": 2.475829251550302, + "grad_norm": 0.1614260971546173, + "learning_rate": 1.7733717287697328e-05, + "loss": 1.1734, + "step": 6650 + }, + { + "epoch": 2.476201556700911, + "grad_norm": 0.15728265047073364, + "learning_rate": 1.7732946777394432e-05, + "loss": 1.172, + "step": 6651 + }, + { + "epoch": 2.47657386185152, + "grad_norm": 0.16399987041950226, + "learning_rate": 1.77321761528764e-05, + "loss": 1.1902, + "step": 6652 + }, + { + "epoch": 2.4769461670021293, + "grad_norm": 0.1572953164577484, + "learning_rate": 1.7731405414154606e-05, + "loss": 1.1712, + "step": 6653 + }, + { + "epoch": 2.477318472152738, + "grad_norm": 0.16343964636325836, + "learning_rate": 1.7730634561240442e-05, + "loss": 1.178, + "step": 6654 + }, + { + "epoch": 2.4776907773033474, + "grad_norm": 0.1638653576374054, + "learning_rate": 1.7729863594145287e-05, + "loss": 1.1719, + "step": 6655 + }, + { + "epoch": 2.4780630824539562, + "grad_norm": 0.16249097883701324, + "learning_rate": 1.7729092512880534e-05, + "loss": 1.1715, + "step": 6656 + }, + { + "epoch": 2.4784353876045655, + "grad_norm": 0.15715673565864563, + "learning_rate": 1.7728321317457573e-05, + "loss": 1.1589, + "step": 6657 + }, + { + "epoch": 2.4788076927551743, + "grad_norm": 0.16444164514541626, + "learning_rate": 1.7727550007887787e-05, + "loss": 1.1729, + "step": 6658 + }, + { + "epoch": 2.4791799979057836, + "grad_norm": 0.16417501866817474, + "learning_rate": 1.7726778584182575e-05, + "loss": 1.1709, + "step": 6659 + }, + { + "epoch": 2.4795523030563924, + "grad_norm": 0.16987749934196472, + "learning_rate": 1.7726007046353328e-05, + "loss": 1.1784, + "step": 6660 + }, + { + "epoch": 2.4799246082070017, + "grad_norm": 0.16455167531967163, + "learning_rate": 1.772523539441144e-05, + "loss": 1.1882, + "step": 6661 + }, + { + "epoch": 2.480296913357611, + "grad_norm": 0.16781625151634216, + "learning_rate": 1.772446362836831e-05, + "loss": 1.1803, + "step": 6662 + }, + { + "epoch": 2.4806692185082198, + "grad_norm": 0.1608152836561203, + "learning_rate": 1.772369174823534e-05, + "loss": 1.1655, + "step": 6663 + }, + { + "epoch": 2.481041523658829, + "grad_norm": 0.16263465583324432, + "learning_rate": 1.7722919754023923e-05, + "loss": 1.1561, + "step": 6664 + }, + { + "epoch": 2.481413828809438, + "grad_norm": 0.16114389896392822, + "learning_rate": 1.772214764574547e-05, + "loss": 1.1724, + "step": 6665 + }, + { + "epoch": 2.481786133960047, + "grad_norm": 0.1621330976486206, + "learning_rate": 1.7721375423411378e-05, + "loss": 1.1724, + "step": 6666 + }, + { + "epoch": 2.482158439110656, + "grad_norm": 0.16913004219532013, + "learning_rate": 1.7720603087033058e-05, + "loss": 1.1843, + "step": 6667 + }, + { + "epoch": 2.482530744261265, + "grad_norm": 0.16739852726459503, + "learning_rate": 1.7719830636621914e-05, + "loss": 1.1717, + "step": 6668 + }, + { + "epoch": 2.482903049411874, + "grad_norm": 0.16293086111545563, + "learning_rate": 1.7719058072189355e-05, + "loss": 1.1743, + "step": 6669 + }, + { + "epoch": 2.4832753545624833, + "grad_norm": 0.16651244461536407, + "learning_rate": 1.771828539374679e-05, + "loss": 1.1892, + "step": 6670 + }, + { + "epoch": 2.4836476597130925, + "grad_norm": 0.16542814671993256, + "learning_rate": 1.771751260130564e-05, + "loss": 1.1692, + "step": 6671 + }, + { + "epoch": 2.4840199648637014, + "grad_norm": 0.16294194757938385, + "learning_rate": 1.771673969487731e-05, + "loss": 1.1784, + "step": 6672 + }, + { + "epoch": 2.4843922700143106, + "grad_norm": 0.16167296469211578, + "learning_rate": 1.771596667447322e-05, + "loss": 1.1642, + "step": 6673 + }, + { + "epoch": 2.4847645751649194, + "grad_norm": 0.1647670865058899, + "learning_rate": 1.771519354010479e-05, + "loss": 1.1843, + "step": 6674 + }, + { + "epoch": 2.4851368803155287, + "grad_norm": 0.16380488872528076, + "learning_rate": 1.771442029178343e-05, + "loss": 1.1939, + "step": 6675 + }, + { + "epoch": 2.4855091854661375, + "grad_norm": 0.16478745639324188, + "learning_rate": 1.7713646929520568e-05, + "loss": 1.174, + "step": 6676 + }, + { + "epoch": 2.485881490616747, + "grad_norm": 0.16080977022647858, + "learning_rate": 1.7712873453327626e-05, + "loss": 1.1729, + "step": 6677 + }, + { + "epoch": 2.4862537957673556, + "grad_norm": 0.16270096600055695, + "learning_rate": 1.7712099863216027e-05, + "loss": 1.1848, + "step": 6678 + }, + { + "epoch": 2.486626100917965, + "grad_norm": 0.16529610753059387, + "learning_rate": 1.7711326159197195e-05, + "loss": 1.1513, + "step": 6679 + }, + { + "epoch": 2.486998406068574, + "grad_norm": 0.16876338422298431, + "learning_rate": 1.771055234128256e-05, + "loss": 1.1805, + "step": 6680 + }, + { + "epoch": 2.487370711219183, + "grad_norm": 0.18239647150039673, + "learning_rate": 1.7709778409483554e-05, + "loss": 1.1735, + "step": 6681 + }, + { + "epoch": 2.4877430163697922, + "grad_norm": 0.16950790584087372, + "learning_rate": 1.7709004363811598e-05, + "loss": 1.1815, + "step": 6682 + }, + { + "epoch": 2.488115321520401, + "grad_norm": 0.16490155458450317, + "learning_rate": 1.770823020427814e-05, + "loss": 1.1648, + "step": 6683 + }, + { + "epoch": 2.4884876266710103, + "grad_norm": 0.17165668308734894, + "learning_rate": 1.7707455930894603e-05, + "loss": 1.178, + "step": 6684 + }, + { + "epoch": 2.488859931821619, + "grad_norm": 0.16048189997673035, + "learning_rate": 1.770668154367242e-05, + "loss": 1.1776, + "step": 6685 + }, + { + "epoch": 2.4892322369722284, + "grad_norm": 0.16602389514446259, + "learning_rate": 1.770590704262304e-05, + "loss": 1.1667, + "step": 6686 + }, + { + "epoch": 2.489604542122837, + "grad_norm": 0.1645582914352417, + "learning_rate": 1.7705132427757895e-05, + "loss": 1.1881, + "step": 6687 + }, + { + "epoch": 2.4899768472734465, + "grad_norm": 0.16345307230949402, + "learning_rate": 1.7704357699088426e-05, + "loss": 1.1757, + "step": 6688 + }, + { + "epoch": 2.4903491524240557, + "grad_norm": 0.1574520319700241, + "learning_rate": 1.770358285662608e-05, + "loss": 1.1658, + "step": 6689 + }, + { + "epoch": 2.4907214575746646, + "grad_norm": 0.16502508521080017, + "learning_rate": 1.7702807900382296e-05, + "loss": 1.1815, + "step": 6690 + }, + { + "epoch": 2.491093762725274, + "grad_norm": 0.16367071866989136, + "learning_rate": 1.7702032830368525e-05, + "loss": 1.1812, + "step": 6691 + }, + { + "epoch": 2.4914660678758827, + "grad_norm": 0.16245703399181366, + "learning_rate": 1.7701257646596212e-05, + "loss": 1.1684, + "step": 6692 + }, + { + "epoch": 2.491838373026492, + "grad_norm": 0.16035568714141846, + "learning_rate": 1.7700482349076808e-05, + "loss": 1.1712, + "step": 6693 + }, + { + "epoch": 2.4922106781771007, + "grad_norm": 0.16250720620155334, + "learning_rate": 1.769970693782176e-05, + "loss": 1.1736, + "step": 6694 + }, + { + "epoch": 2.49258298332771, + "grad_norm": 0.1739581972360611, + "learning_rate": 1.7698931412842526e-05, + "loss": 1.1765, + "step": 6695 + }, + { + "epoch": 2.492955288478319, + "grad_norm": 0.16992726922035217, + "learning_rate": 1.7698155774150553e-05, + "loss": 1.1814, + "step": 6696 + }, + { + "epoch": 2.493327593628928, + "grad_norm": 0.16158819198608398, + "learning_rate": 1.769738002175731e-05, + "loss": 1.1727, + "step": 6697 + }, + { + "epoch": 2.4936998987795373, + "grad_norm": 0.17658640444278717, + "learning_rate": 1.769660415567424e-05, + "loss": 1.1675, + "step": 6698 + }, + { + "epoch": 2.494072203930146, + "grad_norm": 0.21519418060779572, + "learning_rate": 1.7695828175912816e-05, + "loss": 1.181, + "step": 6699 + }, + { + "epoch": 2.4944445090807554, + "grad_norm": 0.1897740513086319, + "learning_rate": 1.7695052082484493e-05, + "loss": 1.168, + "step": 6700 + }, + { + "epoch": 2.4948168142313643, + "grad_norm": 0.1640034019947052, + "learning_rate": 1.769427587540073e-05, + "loss": 1.1741, + "step": 6701 + }, + { + "epoch": 2.4951891193819735, + "grad_norm": 0.17497296631336212, + "learning_rate": 1.7693499554672996e-05, + "loss": 1.1988, + "step": 6702 + }, + { + "epoch": 2.4955614245325823, + "grad_norm": 0.16799713671207428, + "learning_rate": 1.7692723120312757e-05, + "loss": 1.1981, + "step": 6703 + }, + { + "epoch": 2.4959337296831916, + "grad_norm": 0.16971002519130707, + "learning_rate": 1.7691946572331477e-05, + "loss": 1.198, + "step": 6704 + }, + { + "epoch": 2.4963060348338004, + "grad_norm": 0.17075687646865845, + "learning_rate": 1.7691169910740633e-05, + "loss": 1.1848, + "step": 6705 + }, + { + "epoch": 2.4966783399844097, + "grad_norm": 0.16614024341106415, + "learning_rate": 1.7690393135551692e-05, + "loss": 1.1759, + "step": 6706 + }, + { + "epoch": 2.497050645135019, + "grad_norm": 0.17377164959907532, + "learning_rate": 1.7689616246776125e-05, + "loss": 1.1758, + "step": 6707 + }, + { + "epoch": 2.4974229502856278, + "grad_norm": 0.1613984853029251, + "learning_rate": 1.768883924442541e-05, + "loss": 1.1545, + "step": 6708 + }, + { + "epoch": 2.497795255436237, + "grad_norm": 0.16565319895744324, + "learning_rate": 1.7688062128511023e-05, + "loss": 1.1795, + "step": 6709 + }, + { + "epoch": 2.498167560586846, + "grad_norm": 0.17493651807308197, + "learning_rate": 1.768728489904444e-05, + "loss": 1.1901, + "step": 6710 + }, + { + "epoch": 2.498539865737455, + "grad_norm": 0.16326481103897095, + "learning_rate": 1.7686507556037136e-05, + "loss": 1.1731, + "step": 6711 + }, + { + "epoch": 2.498912170888064, + "grad_norm": 0.1633228361606598, + "learning_rate": 1.7685730099500606e-05, + "loss": 1.1756, + "step": 6712 + }, + { + "epoch": 2.499284476038673, + "grad_norm": 0.16988112032413483, + "learning_rate": 1.768495252944632e-05, + "loss": 1.1778, + "step": 6713 + }, + { + "epoch": 2.499656781189282, + "grad_norm": 0.165721595287323, + "learning_rate": 1.7684174845885768e-05, + "loss": 1.178, + "step": 6714 + }, + { + "epoch": 2.5000290863398913, + "grad_norm": 0.1885932981967926, + "learning_rate": 1.768339704883044e-05, + "loss": 1.1743, + "step": 6715 + }, + { + "epoch": 2.5004013914905006, + "grad_norm": 0.16593773663043976, + "learning_rate": 1.768261913829181e-05, + "loss": 1.1713, + "step": 6716 + }, + { + "epoch": 2.5007736966411094, + "grad_norm": 0.16606302559375763, + "learning_rate": 1.7681841114281387e-05, + "loss": 1.1694, + "step": 6717 + }, + { + "epoch": 2.5011460017917186, + "grad_norm": 0.1713939607143402, + "learning_rate": 1.768106297681065e-05, + "loss": 1.167, + "step": 6718 + }, + { + "epoch": 2.5015183069423275, + "grad_norm": 0.172207772731781, + "learning_rate": 1.7680284725891095e-05, + "loss": 1.1824, + "step": 6719 + }, + { + "epoch": 2.5018906120929367, + "grad_norm": 0.1743270754814148, + "learning_rate": 1.7679506361534216e-05, + "loss": 1.1705, + "step": 6720 + }, + { + "epoch": 2.502262917243546, + "grad_norm": 0.16744069755077362, + "learning_rate": 1.7678727883751508e-05, + "loss": 1.1853, + "step": 6721 + }, + { + "epoch": 2.502635222394155, + "grad_norm": 0.18169431388378143, + "learning_rate": 1.7677949292554473e-05, + "loss": 1.1796, + "step": 6722 + }, + { + "epoch": 2.5030075275447636, + "grad_norm": 0.1659853309392929, + "learning_rate": 1.7677170587954607e-05, + "loss": 1.1716, + "step": 6723 + }, + { + "epoch": 2.503379832695373, + "grad_norm": 0.18055576086044312, + "learning_rate": 1.7676391769963416e-05, + "loss": 1.1826, + "step": 6724 + }, + { + "epoch": 2.503752137845982, + "grad_norm": 0.16379252076148987, + "learning_rate": 1.76756128385924e-05, + "loss": 1.1916, + "step": 6725 + }, + { + "epoch": 2.504124442996591, + "grad_norm": 0.16528604924678802, + "learning_rate": 1.7674833793853064e-05, + "loss": 1.1856, + "step": 6726 + }, + { + "epoch": 2.5044967481472002, + "grad_norm": 0.1737004965543747, + "learning_rate": 1.7674054635756914e-05, + "loss": 1.1808, + "step": 6727 + }, + { + "epoch": 2.504869053297809, + "grad_norm": 0.16210679709911346, + "learning_rate": 1.7673275364315458e-05, + "loss": 1.171, + "step": 6728 + }, + { + "epoch": 2.5052413584484183, + "grad_norm": 0.16479384899139404, + "learning_rate": 1.7672495979540207e-05, + "loss": 1.1741, + "step": 6729 + }, + { + "epoch": 2.5056136635990276, + "grad_norm": 0.16804920136928558, + "learning_rate": 1.7671716481442674e-05, + "loss": 1.1771, + "step": 6730 + }, + { + "epoch": 2.5059859687496364, + "grad_norm": 0.16524706780910492, + "learning_rate": 1.7670936870034366e-05, + "loss": 1.1762, + "step": 6731 + }, + { + "epoch": 2.5063582739002452, + "grad_norm": 0.163568913936615, + "learning_rate": 1.7670157145326806e-05, + "loss": 1.1838, + "step": 6732 + }, + { + "epoch": 2.5067305790508545, + "grad_norm": 0.15912410616874695, + "learning_rate": 1.7669377307331503e-05, + "loss": 1.1833, + "step": 6733 + }, + { + "epoch": 2.5071028842014638, + "grad_norm": 0.16776417195796967, + "learning_rate": 1.7668597356059977e-05, + "loss": 1.184, + "step": 6734 + }, + { + "epoch": 2.5074751893520726, + "grad_norm": 0.17335230112075806, + "learning_rate": 1.7667817291523753e-05, + "loss": 1.2042, + "step": 6735 + }, + { + "epoch": 2.507847494502682, + "grad_norm": 0.15524321794509888, + "learning_rate": 1.766703711373435e-05, + "loss": 1.1639, + "step": 6736 + }, + { + "epoch": 2.5082197996532907, + "grad_norm": 0.16183823347091675, + "learning_rate": 1.7666256822703288e-05, + "loss": 1.1863, + "step": 6737 + }, + { + "epoch": 2.5085921048039, + "grad_norm": 0.1681044101715088, + "learning_rate": 1.7665476418442092e-05, + "loss": 1.1777, + "step": 6738 + }, + { + "epoch": 2.508964409954509, + "grad_norm": 0.165101557970047, + "learning_rate": 1.766469590096229e-05, + "loss": 1.1829, + "step": 6739 + }, + { + "epoch": 2.509336715105118, + "grad_norm": 0.16544567048549652, + "learning_rate": 1.7663915270275413e-05, + "loss": 1.1665, + "step": 6740 + }, + { + "epoch": 2.509709020255727, + "grad_norm": 0.17958173155784607, + "learning_rate": 1.766313452639299e-05, + "loss": 1.1861, + "step": 6741 + }, + { + "epoch": 2.510081325406336, + "grad_norm": 0.19180700182914734, + "learning_rate": 1.766235366932655e-05, + "loss": 1.1774, + "step": 6742 + }, + { + "epoch": 2.5104536305569454, + "grad_norm": 0.1678982377052307, + "learning_rate": 1.7661572699087622e-05, + "loss": 1.1739, + "step": 6743 + }, + { + "epoch": 2.510825935707554, + "grad_norm": 0.16974776983261108, + "learning_rate": 1.7660791615687752e-05, + "loss": 1.1781, + "step": 6744 + }, + { + "epoch": 2.5111982408581635, + "grad_norm": 0.18585939705371857, + "learning_rate": 1.766001041913847e-05, + "loss": 1.1733, + "step": 6745 + }, + { + "epoch": 2.5115705460087723, + "grad_norm": 0.17436900734901428, + "learning_rate": 1.7659229109451312e-05, + "loss": 1.1856, + "step": 6746 + }, + { + "epoch": 2.5119428511593815, + "grad_norm": 0.1617736667394638, + "learning_rate": 1.765844768663782e-05, + "loss": 1.1664, + "step": 6747 + }, + { + "epoch": 2.512315156309991, + "grad_norm": 0.161626398563385, + "learning_rate": 1.765766615070954e-05, + "loss": 1.1708, + "step": 6748 + }, + { + "epoch": 2.5126874614605996, + "grad_norm": 0.16217051446437836, + "learning_rate": 1.7656884501678014e-05, + "loss": 1.1661, + "step": 6749 + }, + { + "epoch": 2.5130597666112084, + "grad_norm": 0.16591928899288177, + "learning_rate": 1.765610273955478e-05, + "loss": 1.1706, + "step": 6750 + }, + { + "epoch": 2.5134320717618177, + "grad_norm": 0.17161224782466888, + "learning_rate": 1.7655320864351388e-05, + "loss": 1.1799, + "step": 6751 + }, + { + "epoch": 2.513804376912427, + "grad_norm": 0.16334664821624756, + "learning_rate": 1.7654538876079387e-05, + "loss": 1.158, + "step": 6752 + }, + { + "epoch": 2.514176682063036, + "grad_norm": 0.15914195775985718, + "learning_rate": 1.7653756774750334e-05, + "loss": 1.1648, + "step": 6753 + }, + { + "epoch": 2.514548987213645, + "grad_norm": 0.18378858268260956, + "learning_rate": 1.7652974560375765e-05, + "loss": 1.1775, + "step": 6754 + }, + { + "epoch": 2.514921292364254, + "grad_norm": 0.21220822632312775, + "learning_rate": 1.7652192232967245e-05, + "loss": 1.1733, + "step": 6755 + }, + { + "epoch": 2.515293597514863, + "grad_norm": 0.1674444079399109, + "learning_rate": 1.7651409792536328e-05, + "loss": 1.1723, + "step": 6756 + }, + { + "epoch": 2.5156659026654724, + "grad_norm": 0.1930401474237442, + "learning_rate": 1.765062723909457e-05, + "loss": 1.1771, + "step": 6757 + }, + { + "epoch": 2.5160382078160812, + "grad_norm": 0.16456985473632812, + "learning_rate": 1.7649844572653523e-05, + "loss": 1.1803, + "step": 6758 + }, + { + "epoch": 2.51641051296669, + "grad_norm": 0.18256615102291107, + "learning_rate": 1.7649061793224752e-05, + "loss": 1.1701, + "step": 6759 + }, + { + "epoch": 2.5167828181172993, + "grad_norm": 0.16572584211826324, + "learning_rate": 1.7648278900819822e-05, + "loss": 1.1784, + "step": 6760 + }, + { + "epoch": 2.5171551232679086, + "grad_norm": 0.1717369556427002, + "learning_rate": 1.7647495895450292e-05, + "loss": 1.1871, + "step": 6761 + }, + { + "epoch": 2.5175274284185174, + "grad_norm": 0.16302919387817383, + "learning_rate": 1.7646712777127722e-05, + "loss": 1.1599, + "step": 6762 + }, + { + "epoch": 2.5178997335691267, + "grad_norm": 0.1645524501800537, + "learning_rate": 1.764592954586369e-05, + "loss": 1.1726, + "step": 6763 + }, + { + "epoch": 2.5182720387197355, + "grad_norm": 0.1693658083677292, + "learning_rate": 1.764514620166976e-05, + "loss": 1.1715, + "step": 6764 + }, + { + "epoch": 2.5186443438703447, + "grad_norm": 0.17292669415473938, + "learning_rate": 1.7644362744557498e-05, + "loss": 1.1747, + "step": 6765 + }, + { + "epoch": 2.519016649020954, + "grad_norm": 0.16915366053581238, + "learning_rate": 1.7643579174538475e-05, + "loss": 1.1728, + "step": 6766 + }, + { + "epoch": 2.519388954171563, + "grad_norm": 0.16110309958457947, + "learning_rate": 1.7642795491624268e-05, + "loss": 1.1649, + "step": 6767 + }, + { + "epoch": 2.519761259322172, + "grad_norm": 0.16794544458389282, + "learning_rate": 1.7642011695826455e-05, + "loss": 1.1718, + "step": 6768 + }, + { + "epoch": 2.520133564472781, + "grad_norm": 0.15866640210151672, + "learning_rate": 1.764122778715661e-05, + "loss": 1.1585, + "step": 6769 + }, + { + "epoch": 2.52050586962339, + "grad_norm": 0.16686029732227325, + "learning_rate": 1.7640443765626304e-05, + "loss": 1.1596, + "step": 6770 + }, + { + "epoch": 2.520878174773999, + "grad_norm": 0.16550125181674957, + "learning_rate": 1.7639659631247127e-05, + "loss": 1.1785, + "step": 6771 + }, + { + "epoch": 2.5212504799246083, + "grad_norm": 0.18188951909542084, + "learning_rate": 1.7638875384030654e-05, + "loss": 1.1607, + "step": 6772 + }, + { + "epoch": 2.521622785075217, + "grad_norm": 0.1796000748872757, + "learning_rate": 1.7638091023988473e-05, + "loss": 1.1778, + "step": 6773 + }, + { + "epoch": 2.5219950902258264, + "grad_norm": 0.17091524600982666, + "learning_rate": 1.7637306551132166e-05, + "loss": 1.1887, + "step": 6774 + }, + { + "epoch": 2.5223673953764356, + "grad_norm": 0.16658535599708557, + "learning_rate": 1.7636521965473324e-05, + "loss": 1.1695, + "step": 6775 + }, + { + "epoch": 2.5227397005270444, + "grad_norm": 0.16318367421627045, + "learning_rate": 1.7635737267023527e-05, + "loss": 1.1821, + "step": 6776 + }, + { + "epoch": 2.5231120056776537, + "grad_norm": 0.16290733218193054, + "learning_rate": 1.7634952455794373e-05, + "loss": 1.1683, + "step": 6777 + }, + { + "epoch": 2.5234843108282625, + "grad_norm": 0.16749358177185059, + "learning_rate": 1.7634167531797447e-05, + "loss": 1.1599, + "step": 6778 + }, + { + "epoch": 2.523856615978872, + "grad_norm": 0.16928349435329437, + "learning_rate": 1.7633382495044347e-05, + "loss": 1.1721, + "step": 6779 + }, + { + "epoch": 2.5242289211294806, + "grad_norm": 0.17672182619571686, + "learning_rate": 1.7632597345546667e-05, + "loss": 1.179, + "step": 6780 + }, + { + "epoch": 2.52460122628009, + "grad_norm": 0.21029046177864075, + "learning_rate": 1.7631812083316003e-05, + "loss": 1.1655, + "step": 6781 + }, + { + "epoch": 2.5249735314306987, + "grad_norm": 0.26718470454216003, + "learning_rate": 1.7631026708363956e-05, + "loss": 1.1803, + "step": 6782 + }, + { + "epoch": 2.525345836581308, + "grad_norm": 0.22429171204566956, + "learning_rate": 1.763024122070212e-05, + "loss": 1.1808, + "step": 6783 + }, + { + "epoch": 2.525718141731917, + "grad_norm": 0.1687416434288025, + "learning_rate": 1.76294556203421e-05, + "loss": 1.1771, + "step": 6784 + }, + { + "epoch": 2.526090446882526, + "grad_norm": 0.18023616075515747, + "learning_rate": 1.7628669907295504e-05, + "loss": 1.1736, + "step": 6785 + }, + { + "epoch": 2.5264627520331353, + "grad_norm": 0.17839936912059784, + "learning_rate": 1.762788408157393e-05, + "loss": 1.1743, + "step": 6786 + }, + { + "epoch": 2.526835057183744, + "grad_norm": 0.16387495398521423, + "learning_rate": 1.7627098143188982e-05, + "loss": 1.1711, + "step": 6787 + }, + { + "epoch": 2.5272073623343534, + "grad_norm": 0.17329815030097961, + "learning_rate": 1.762631209215228e-05, + "loss": 1.1782, + "step": 6788 + }, + { + "epoch": 2.527579667484962, + "grad_norm": 0.1764519065618515, + "learning_rate": 1.7625525928475424e-05, + "loss": 1.1734, + "step": 6789 + }, + { + "epoch": 2.5279519726355715, + "grad_norm": 0.1638839989900589, + "learning_rate": 1.762473965217003e-05, + "loss": 1.1666, + "step": 6790 + }, + { + "epoch": 2.5283242777861803, + "grad_norm": 0.16194722056388855, + "learning_rate": 1.7623953263247707e-05, + "loss": 1.1526, + "step": 6791 + }, + { + "epoch": 2.5286965829367896, + "grad_norm": 0.17311038076877594, + "learning_rate": 1.7623166761720075e-05, + "loss": 1.1802, + "step": 6792 + }, + { + "epoch": 2.529068888087399, + "grad_norm": 0.1649443656206131, + "learning_rate": 1.7622380147598745e-05, + "loss": 1.1698, + "step": 6793 + }, + { + "epoch": 2.5294411932380076, + "grad_norm": 0.16059640049934387, + "learning_rate": 1.7621593420895342e-05, + "loss": 1.1687, + "step": 6794 + }, + { + "epoch": 2.529813498388617, + "grad_norm": 0.17570586502552032, + "learning_rate": 1.762080658162148e-05, + "loss": 1.1768, + "step": 6795 + }, + { + "epoch": 2.5301858035392257, + "grad_norm": 0.1651303470134735, + "learning_rate": 1.7620019629788786e-05, + "loss": 1.1755, + "step": 6796 + }, + { + "epoch": 2.530558108689835, + "grad_norm": 0.16856788098812103, + "learning_rate": 1.7619232565408878e-05, + "loss": 1.1711, + "step": 6797 + }, + { + "epoch": 2.530930413840444, + "grad_norm": 0.16949835419654846, + "learning_rate": 1.7618445388493386e-05, + "loss": 1.1839, + "step": 6798 + }, + { + "epoch": 2.531302718991053, + "grad_norm": 0.17608442902565002, + "learning_rate": 1.761765809905393e-05, + "loss": 1.1857, + "step": 6799 + }, + { + "epoch": 2.531675024141662, + "grad_norm": 0.16439513862133026, + "learning_rate": 1.7616870697102144e-05, + "loss": 1.1789, + "step": 6800 + }, + { + "epoch": 2.532047329292271, + "grad_norm": 0.1684199720621109, + "learning_rate": 1.7616083182649654e-05, + "loss": 1.1857, + "step": 6801 + }, + { + "epoch": 2.5324196344428804, + "grad_norm": 0.16960062086582184, + "learning_rate": 1.7615295555708098e-05, + "loss": 1.1714, + "step": 6802 + }, + { + "epoch": 2.5327919395934892, + "grad_norm": 0.162268728017807, + "learning_rate": 1.7614507816289102e-05, + "loss": 1.1742, + "step": 6803 + }, + { + "epoch": 2.5331642447440985, + "grad_norm": 0.17141248285770416, + "learning_rate": 1.7613719964404303e-05, + "loss": 1.1886, + "step": 6804 + }, + { + "epoch": 2.5335365498947073, + "grad_norm": 0.17517106235027313, + "learning_rate": 1.7612932000065336e-05, + "loss": 1.1835, + "step": 6805 + }, + { + "epoch": 2.5339088550453166, + "grad_norm": 0.1663627028465271, + "learning_rate": 1.7612143923283844e-05, + "loss": 1.1616, + "step": 6806 + }, + { + "epoch": 2.5342811601959254, + "grad_norm": 0.1708042472600937, + "learning_rate": 1.7611355734071464e-05, + "loss": 1.1604, + "step": 6807 + }, + { + "epoch": 2.5346534653465347, + "grad_norm": 0.1797637939453125, + "learning_rate": 1.7610567432439834e-05, + "loss": 1.1802, + "step": 6808 + }, + { + "epoch": 2.5350257704971435, + "grad_norm": 0.16849200427532196, + "learning_rate": 1.7609779018400606e-05, + "loss": 1.1651, + "step": 6809 + }, + { + "epoch": 2.5353980756477528, + "grad_norm": 0.17425650358200073, + "learning_rate": 1.7608990491965416e-05, + "loss": 1.1741, + "step": 6810 + }, + { + "epoch": 2.535770380798362, + "grad_norm": 0.18015819787979126, + "learning_rate": 1.760820185314591e-05, + "loss": 1.1862, + "step": 6811 + }, + { + "epoch": 2.536142685948971, + "grad_norm": 0.16850873827934265, + "learning_rate": 1.7607413101953747e-05, + "loss": 1.1758, + "step": 6812 + }, + { + "epoch": 2.53651499109958, + "grad_norm": 0.1831638067960739, + "learning_rate": 1.7606624238400568e-05, + "loss": 1.1718, + "step": 6813 + }, + { + "epoch": 2.536887296250189, + "grad_norm": 0.16078107059001923, + "learning_rate": 1.7605835262498027e-05, + "loss": 1.1651, + "step": 6814 + }, + { + "epoch": 2.537259601400798, + "grad_norm": 0.17661617696285248, + "learning_rate": 1.7605046174257775e-05, + "loss": 1.1663, + "step": 6815 + }, + { + "epoch": 2.5376319065514075, + "grad_norm": 0.17243711650371552, + "learning_rate": 1.7604256973691468e-05, + "loss": 1.173, + "step": 6816 + }, + { + "epoch": 2.5380042117020163, + "grad_norm": 0.16753429174423218, + "learning_rate": 1.7603467660810763e-05, + "loss": 1.1635, + "step": 6817 + }, + { + "epoch": 2.538376516852625, + "grad_norm": 0.17135019600391388, + "learning_rate": 1.7602678235627317e-05, + "loss": 1.1634, + "step": 6818 + }, + { + "epoch": 2.5387488220032344, + "grad_norm": 0.16117660701274872, + "learning_rate": 1.7601888698152794e-05, + "loss": 1.1647, + "step": 6819 + }, + { + "epoch": 2.5391211271538436, + "grad_norm": 0.17489348351955414, + "learning_rate": 1.760109904839885e-05, + "loss": 1.1938, + "step": 6820 + }, + { + "epoch": 2.5394934323044525, + "grad_norm": 0.16629846394062042, + "learning_rate": 1.760030928637715e-05, + "loss": 1.1785, + "step": 6821 + }, + { + "epoch": 2.5398657374550617, + "grad_norm": 0.1682901829481125, + "learning_rate": 1.759951941209936e-05, + "loss": 1.1865, + "step": 6822 + }, + { + "epoch": 2.5402380426056705, + "grad_norm": 0.1646634191274643, + "learning_rate": 1.7598729425577143e-05, + "loss": 1.1787, + "step": 6823 + }, + { + "epoch": 2.54061034775628, + "grad_norm": 0.17023120820522308, + "learning_rate": 1.759793932682217e-05, + "loss": 1.1697, + "step": 6824 + }, + { + "epoch": 2.540982652906889, + "grad_norm": 0.17722590267658234, + "learning_rate": 1.759714911584611e-05, + "loss": 1.1666, + "step": 6825 + }, + { + "epoch": 2.541354958057498, + "grad_norm": 0.1645139455795288, + "learning_rate": 1.7596358792660633e-05, + "loss": 1.1907, + "step": 6826 + }, + { + "epoch": 2.5417272632081067, + "grad_norm": 0.1630629450082779, + "learning_rate": 1.7595568357277413e-05, + "loss": 1.1829, + "step": 6827 + }, + { + "epoch": 2.542099568358716, + "grad_norm": 0.16655808687210083, + "learning_rate": 1.759477780970813e-05, + "loss": 1.1751, + "step": 6828 + }, + { + "epoch": 2.5424718735093252, + "grad_norm": 0.19367186725139618, + "learning_rate": 1.759398714996445e-05, + "loss": 1.1613, + "step": 6829 + }, + { + "epoch": 2.542844178659934, + "grad_norm": 0.2233743518590927, + "learning_rate": 1.759319637805806e-05, + "loss": 1.1836, + "step": 6830 + }, + { + "epoch": 2.5432164838105433, + "grad_norm": 0.19930030405521393, + "learning_rate": 1.7592405494000635e-05, + "loss": 1.1862, + "step": 6831 + }, + { + "epoch": 2.543588788961152, + "grad_norm": 0.1649360954761505, + "learning_rate": 1.7591614497803856e-05, + "loss": 1.1586, + "step": 6832 + }, + { + "epoch": 2.5439610941117614, + "grad_norm": 0.1704995036125183, + "learning_rate": 1.7590823389479407e-05, + "loss": 1.1979, + "step": 6833 + }, + { + "epoch": 2.5443333992623707, + "grad_norm": 0.20266738533973694, + "learning_rate": 1.7590032169038974e-05, + "loss": 1.1811, + "step": 6834 + }, + { + "epoch": 2.5447057044129795, + "grad_norm": 0.1894606202840805, + "learning_rate": 1.7589240836494245e-05, + "loss": 1.1745, + "step": 6835 + }, + { + "epoch": 2.5450780095635883, + "grad_norm": 0.17341409623622894, + "learning_rate": 1.7588449391856903e-05, + "loss": 1.1793, + "step": 6836 + }, + { + "epoch": 2.5454503147141976, + "grad_norm": 0.20491334795951843, + "learning_rate": 1.758765783513864e-05, + "loss": 1.1762, + "step": 6837 + }, + { + "epoch": 2.545822619864807, + "grad_norm": 0.1708584725856781, + "learning_rate": 1.758686616635114e-05, + "loss": 1.1693, + "step": 6838 + }, + { + "epoch": 2.5461949250154157, + "grad_norm": 0.17428459227085114, + "learning_rate": 1.7586074385506114e-05, + "loss": 1.1827, + "step": 6839 + }, + { + "epoch": 2.546567230166025, + "grad_norm": 0.19591780006885529, + "learning_rate": 1.758528249261524e-05, + "loss": 1.1732, + "step": 6840 + }, + { + "epoch": 2.5469395353166338, + "grad_norm": 0.16241784393787384, + "learning_rate": 1.758449048769022e-05, + "loss": 1.1675, + "step": 6841 + }, + { + "epoch": 2.547311840467243, + "grad_norm": 0.17725640535354614, + "learning_rate": 1.758369837074275e-05, + "loss": 1.166, + "step": 6842 + }, + { + "epoch": 2.5476841456178523, + "grad_norm": 0.17130807042121887, + "learning_rate": 1.7582906141784534e-05, + "loss": 1.1961, + "step": 6843 + }, + { + "epoch": 2.548056450768461, + "grad_norm": 0.18036004900932312, + "learning_rate": 1.758211380082727e-05, + "loss": 1.1713, + "step": 6844 + }, + { + "epoch": 2.54842875591907, + "grad_norm": 0.16558697819709778, + "learning_rate": 1.7581321347882657e-05, + "loss": 1.1733, + "step": 6845 + }, + { + "epoch": 2.548801061069679, + "grad_norm": 0.17441225051879883, + "learning_rate": 1.7580528782962408e-05, + "loss": 1.1662, + "step": 6846 + }, + { + "epoch": 2.5491733662202885, + "grad_norm": 0.16816645860671997, + "learning_rate": 1.757973610607822e-05, + "loss": 1.1792, + "step": 6847 + }, + { + "epoch": 2.5495456713708973, + "grad_norm": 0.19029933214187622, + "learning_rate": 1.757894331724181e-05, + "loss": 1.1739, + "step": 6848 + }, + { + "epoch": 2.5499179765215065, + "grad_norm": 0.15939323604106903, + "learning_rate": 1.757815041646488e-05, + "loss": 1.1672, + "step": 6849 + }, + { + "epoch": 2.5502902816721154, + "grad_norm": 0.1666376292705536, + "learning_rate": 1.7577357403759147e-05, + "loss": 1.1632, + "step": 6850 + }, + { + "epoch": 2.5506625868227246, + "grad_norm": 0.16542813181877136, + "learning_rate": 1.7576564279136318e-05, + "loss": 1.1823, + "step": 6851 + }, + { + "epoch": 2.551034891973334, + "grad_norm": 0.19840499758720398, + "learning_rate": 1.757577104260811e-05, + "loss": 1.1901, + "step": 6852 + }, + { + "epoch": 2.5514071971239427, + "grad_norm": 0.17866215109825134, + "learning_rate": 1.757497769418624e-05, + "loss": 1.1641, + "step": 6853 + }, + { + "epoch": 2.5517795022745515, + "grad_norm": 0.16423308849334717, + "learning_rate": 1.7574184233882424e-05, + "loss": 1.176, + "step": 6854 + }, + { + "epoch": 2.552151807425161, + "grad_norm": 0.1964891403913498, + "learning_rate": 1.7573390661708386e-05, + "loss": 1.1694, + "step": 6855 + }, + { + "epoch": 2.55252411257577, + "grad_norm": 0.16654036939144135, + "learning_rate": 1.7572596977675837e-05, + "loss": 1.1754, + "step": 6856 + }, + { + "epoch": 2.552896417726379, + "grad_norm": 0.18395081162452698, + "learning_rate": 1.757180318179651e-05, + "loss": 1.1691, + "step": 6857 + }, + { + "epoch": 2.553268722876988, + "grad_norm": 0.18189731240272522, + "learning_rate": 1.7571009274082124e-05, + "loss": 1.181, + "step": 6858 + }, + { + "epoch": 2.553641028027597, + "grad_norm": 0.1674988567829132, + "learning_rate": 1.7570215254544406e-05, + "loss": 1.173, + "step": 6859 + }, + { + "epoch": 2.5540133331782062, + "grad_norm": 0.1818539947271347, + "learning_rate": 1.7569421123195086e-05, + "loss": 1.1811, + "step": 6860 + }, + { + "epoch": 2.5543856383288155, + "grad_norm": 0.1642785221338272, + "learning_rate": 1.7568626880045888e-05, + "loss": 1.1886, + "step": 6861 + }, + { + "epoch": 2.5547579434794243, + "grad_norm": 0.21301640570163727, + "learning_rate": 1.7567832525108547e-05, + "loss": 1.1762, + "step": 6862 + }, + { + "epoch": 2.555130248630033, + "grad_norm": 0.1925705522298813, + "learning_rate": 1.7567038058394797e-05, + "loss": 1.1685, + "step": 6863 + }, + { + "epoch": 2.5555025537806424, + "grad_norm": 0.1762634962797165, + "learning_rate": 1.7566243479916365e-05, + "loss": 1.1641, + "step": 6864 + }, + { + "epoch": 2.5558748589312517, + "grad_norm": 0.19311204552650452, + "learning_rate": 1.7565448789684996e-05, + "loss": 1.1789, + "step": 6865 + }, + { + "epoch": 2.5562471640818605, + "grad_norm": 0.17269863188266754, + "learning_rate": 1.756465398771242e-05, + "loss": 1.1702, + "step": 6866 + }, + { + "epoch": 2.5566194692324697, + "grad_norm": 0.1711881160736084, + "learning_rate": 1.7563859074010382e-05, + "loss": 1.1752, + "step": 6867 + }, + { + "epoch": 2.5569917743830786, + "grad_norm": 0.1725349724292755, + "learning_rate": 1.756306404859062e-05, + "loss": 1.1673, + "step": 6868 + }, + { + "epoch": 2.557364079533688, + "grad_norm": 0.1748928427696228, + "learning_rate": 1.7562268911464872e-05, + "loss": 1.1651, + "step": 6869 + }, + { + "epoch": 2.557736384684297, + "grad_norm": 0.17474377155303955, + "learning_rate": 1.7561473662644893e-05, + "loss": 1.1771, + "step": 6870 + }, + { + "epoch": 2.558108689834906, + "grad_norm": 0.17705856263637543, + "learning_rate": 1.7560678302142418e-05, + "loss": 1.1725, + "step": 6871 + }, + { + "epoch": 2.5584809949855147, + "grad_norm": 0.16561757028102875, + "learning_rate": 1.7559882829969203e-05, + "loss": 1.1827, + "step": 6872 + }, + { + "epoch": 2.558853300136124, + "grad_norm": 0.17302724719047546, + "learning_rate": 1.7559087246136987e-05, + "loss": 1.1834, + "step": 6873 + }, + { + "epoch": 2.5592256052867333, + "grad_norm": 0.16074450314044952, + "learning_rate": 1.755829155065753e-05, + "loss": 1.184, + "step": 6874 + }, + { + "epoch": 2.559597910437342, + "grad_norm": 0.16113080084323883, + "learning_rate": 1.7557495743542586e-05, + "loss": 1.1707, + "step": 6875 + }, + { + "epoch": 2.5599702155879513, + "grad_norm": 0.1629118025302887, + "learning_rate": 1.7556699824803897e-05, + "loss": 1.1578, + "step": 6876 + }, + { + "epoch": 2.56034252073856, + "grad_norm": 0.16696172952651978, + "learning_rate": 1.7555903794453232e-05, + "loss": 1.1842, + "step": 6877 + }, + { + "epoch": 2.5607148258891694, + "grad_norm": 0.16263677179813385, + "learning_rate": 1.7555107652502337e-05, + "loss": 1.1854, + "step": 6878 + }, + { + "epoch": 2.5610871310397787, + "grad_norm": 0.163103848695755, + "learning_rate": 1.7554311398962976e-05, + "loss": 1.173, + "step": 6879 + }, + { + "epoch": 2.5614594361903875, + "grad_norm": 0.18173319101333618, + "learning_rate": 1.7553515033846913e-05, + "loss": 1.1643, + "step": 6880 + }, + { + "epoch": 2.5618317413409963, + "grad_norm": 0.1870352178812027, + "learning_rate": 1.7552718557165907e-05, + "loss": 1.1847, + "step": 6881 + }, + { + "epoch": 2.5622040464916056, + "grad_norm": 0.166317880153656, + "learning_rate": 1.755192196893172e-05, + "loss": 1.1683, + "step": 6882 + }, + { + "epoch": 2.562576351642215, + "grad_norm": 0.2503323554992676, + "learning_rate": 1.755112526915612e-05, + "loss": 1.1824, + "step": 6883 + }, + { + "epoch": 2.5629486567928237, + "grad_norm": 0.2117423117160797, + "learning_rate": 1.7550328457850873e-05, + "loss": 1.1667, + "step": 6884 + }, + { + "epoch": 2.563320961943433, + "grad_norm": 0.19109854102134705, + "learning_rate": 1.754953153502775e-05, + "loss": 1.1603, + "step": 6885 + }, + { + "epoch": 2.5636932670940418, + "grad_norm": 0.1728496551513672, + "learning_rate": 1.754873450069852e-05, + "loss": 1.1538, + "step": 6886 + }, + { + "epoch": 2.564065572244651, + "grad_norm": 0.20083752274513245, + "learning_rate": 1.7547937354874953e-05, + "loss": 1.1703, + "step": 6887 + }, + { + "epoch": 2.5644378773952603, + "grad_norm": 0.1984606832265854, + "learning_rate": 1.7547140097568827e-05, + "loss": 1.1897, + "step": 6888 + }, + { + "epoch": 2.564810182545869, + "grad_norm": 0.1722983717918396, + "learning_rate": 1.7546342728791915e-05, + "loss": 1.1647, + "step": 6889 + }, + { + "epoch": 2.5651824876964784, + "grad_norm": 0.17819280922412872, + "learning_rate": 1.7545545248555994e-05, + "loss": 1.1718, + "step": 6890 + }, + { + "epoch": 2.565554792847087, + "grad_norm": 0.17654377222061157, + "learning_rate": 1.754474765687284e-05, + "loss": 1.1609, + "step": 6891 + }, + { + "epoch": 2.5659270979976965, + "grad_norm": 0.16665366291999817, + "learning_rate": 1.7543949953754244e-05, + "loss": 1.1773, + "step": 6892 + }, + { + "epoch": 2.5662994031483053, + "grad_norm": 0.16269910335540771, + "learning_rate": 1.7543152139211973e-05, + "loss": 1.1675, + "step": 6893 + }, + { + "epoch": 2.5666717082989146, + "grad_norm": 0.16960933804512024, + "learning_rate": 1.7542354213257825e-05, + "loss": 1.1802, + "step": 6894 + }, + { + "epoch": 2.5670440134495234, + "grad_norm": 0.18422453105449677, + "learning_rate": 1.7541556175903577e-05, + "loss": 1.1859, + "step": 6895 + }, + { + "epoch": 2.5674163186001326, + "grad_norm": 0.15753304958343506, + "learning_rate": 1.7540758027161014e-05, + "loss": 1.1712, + "step": 6896 + }, + { + "epoch": 2.567788623750742, + "grad_norm": 0.1667400747537613, + "learning_rate": 1.753995976704193e-05, + "loss": 1.1902, + "step": 6897 + }, + { + "epoch": 2.5681609289013507, + "grad_norm": 0.1718086153268814, + "learning_rate": 1.7539161395558115e-05, + "loss": 1.167, + "step": 6898 + }, + { + "epoch": 2.56853323405196, + "grad_norm": 0.1692952811717987, + "learning_rate": 1.7538362912721356e-05, + "loss": 1.1707, + "step": 6899 + }, + { + "epoch": 2.568905539202569, + "grad_norm": 0.16457752883434296, + "learning_rate": 1.7537564318543455e-05, + "loss": 1.1923, + "step": 6900 + }, + { + "epoch": 2.569277844353178, + "grad_norm": 0.16177764534950256, + "learning_rate": 1.7536765613036198e-05, + "loss": 1.1722, + "step": 6901 + }, + { + "epoch": 2.569650149503787, + "grad_norm": 0.17193439602851868, + "learning_rate": 1.7535966796211387e-05, + "loss": 1.1789, + "step": 6902 + }, + { + "epoch": 2.570022454654396, + "grad_norm": 0.1670045256614685, + "learning_rate": 1.753516786808082e-05, + "loss": 1.1715, + "step": 6903 + }, + { + "epoch": 2.570394759805005, + "grad_norm": 0.1643945276737213, + "learning_rate": 1.7534368828656295e-05, + "loss": 1.1788, + "step": 6904 + }, + { + "epoch": 2.5707670649556142, + "grad_norm": 0.1577821522951126, + "learning_rate": 1.7533569677949616e-05, + "loss": 1.1758, + "step": 6905 + }, + { + "epoch": 2.5711393701062235, + "grad_norm": 0.16875483095645905, + "learning_rate": 1.7532770415972585e-05, + "loss": 1.1799, + "step": 6906 + }, + { + "epoch": 2.5715116752568323, + "grad_norm": 0.1640816628932953, + "learning_rate": 1.7531971042737008e-05, + "loss": 1.1744, + "step": 6907 + }, + { + "epoch": 2.5718839804074416, + "grad_norm": 0.16457021236419678, + "learning_rate": 1.7531171558254692e-05, + "loss": 1.1772, + "step": 6908 + }, + { + "epoch": 2.5722562855580504, + "grad_norm": 0.16494521498680115, + "learning_rate": 1.7530371962537445e-05, + "loss": 1.1781, + "step": 6909 + }, + { + "epoch": 2.5726285907086597, + "grad_norm": 0.16351187229156494, + "learning_rate": 1.7529572255597077e-05, + "loss": 1.1792, + "step": 6910 + }, + { + "epoch": 2.5730008958592685, + "grad_norm": 0.1582973450422287, + "learning_rate": 1.7528772437445396e-05, + "loss": 1.1826, + "step": 6911 + }, + { + "epoch": 2.5733732010098778, + "grad_norm": 0.16494831442832947, + "learning_rate": 1.7527972508094223e-05, + "loss": 1.1892, + "step": 6912 + }, + { + "epoch": 2.5737455061604866, + "grad_norm": 0.16706949472427368, + "learning_rate": 1.7527172467555367e-05, + "loss": 1.1759, + "step": 6913 + }, + { + "epoch": 2.574117811311096, + "grad_norm": 0.16176964342594147, + "learning_rate": 1.752637231584064e-05, + "loss": 1.181, + "step": 6914 + }, + { + "epoch": 2.574490116461705, + "grad_norm": 0.1603250950574875, + "learning_rate": 1.7525572052961877e-05, + "loss": 1.1735, + "step": 6915 + }, + { + "epoch": 2.574862421612314, + "grad_norm": 0.1632986068725586, + "learning_rate": 1.752477167893088e-05, + "loss": 1.1669, + "step": 6916 + }, + { + "epoch": 2.575234726762923, + "grad_norm": 0.16188709437847137, + "learning_rate": 1.7523971193759482e-05, + "loss": 1.176, + "step": 6917 + }, + { + "epoch": 2.575607031913532, + "grad_norm": 0.16379517316818237, + "learning_rate": 1.7523170597459497e-05, + "loss": 1.1666, + "step": 6918 + }, + { + "epoch": 2.5759793370641413, + "grad_norm": 0.16542209684848785, + "learning_rate": 1.7522369890042755e-05, + "loss": 1.1477, + "step": 6919 + }, + { + "epoch": 2.57635164221475, + "grad_norm": 0.16362489759922028, + "learning_rate": 1.7521569071521084e-05, + "loss": 1.1709, + "step": 6920 + }, + { + "epoch": 2.5767239473653594, + "grad_norm": 0.18013128638267517, + "learning_rate": 1.752076814190631e-05, + "loss": 1.1737, + "step": 6921 + }, + { + "epoch": 2.577096252515968, + "grad_norm": 0.1668436974287033, + "learning_rate": 1.7519967101210264e-05, + "loss": 1.1797, + "step": 6922 + }, + { + "epoch": 2.5774685576665775, + "grad_norm": 0.17277368903160095, + "learning_rate": 1.751916594944477e-05, + "loss": 1.1708, + "step": 6923 + }, + { + "epoch": 2.5778408628171867, + "grad_norm": 0.1680234968662262, + "learning_rate": 1.751836468662167e-05, + "loss": 1.1707, + "step": 6924 + }, + { + "epoch": 2.5782131679677955, + "grad_norm": 0.17191176116466522, + "learning_rate": 1.7517563312752796e-05, + "loss": 1.1807, + "step": 6925 + }, + { + "epoch": 2.578585473118405, + "grad_norm": 0.16285885870456696, + "learning_rate": 1.7516761827849987e-05, + "loss": 1.1707, + "step": 6926 + }, + { + "epoch": 2.5789577782690136, + "grad_norm": 0.16699667274951935, + "learning_rate": 1.7515960231925072e-05, + "loss": 1.1765, + "step": 6927 + }, + { + "epoch": 2.579330083419623, + "grad_norm": 0.1778481900691986, + "learning_rate": 1.7515158524989896e-05, + "loss": 1.1785, + "step": 6928 + }, + { + "epoch": 2.579702388570232, + "grad_norm": 0.17215748131275177, + "learning_rate": 1.7514356707056303e-05, + "loss": 1.1653, + "step": 6929 + }, + { + "epoch": 2.580074693720841, + "grad_norm": 0.20714597404003143, + "learning_rate": 1.7513554778136133e-05, + "loss": 1.1778, + "step": 6930 + }, + { + "epoch": 2.58044699887145, + "grad_norm": 0.1802298128604889, + "learning_rate": 1.751275273824123e-05, + "loss": 1.1798, + "step": 6931 + }, + { + "epoch": 2.580819304022059, + "grad_norm": 0.18000389635562897, + "learning_rate": 1.7511950587383438e-05, + "loss": 1.1729, + "step": 6932 + }, + { + "epoch": 2.5811916091726683, + "grad_norm": 0.1696796715259552, + "learning_rate": 1.7511148325574613e-05, + "loss": 1.176, + "step": 6933 + }, + { + "epoch": 2.581563914323277, + "grad_norm": 0.1616394817829132, + "learning_rate": 1.7510345952826594e-05, + "loss": 1.1583, + "step": 6934 + }, + { + "epoch": 2.5819362194738864, + "grad_norm": 0.18179087340831757, + "learning_rate": 1.7509543469151234e-05, + "loss": 1.1809, + "step": 6935 + }, + { + "epoch": 2.5823085246244952, + "grad_norm": 0.17522992193698883, + "learning_rate": 1.7508740874560393e-05, + "loss": 1.1693, + "step": 6936 + }, + { + "epoch": 2.5826808297751045, + "grad_norm": 0.15960480272769928, + "learning_rate": 1.7507938169065922e-05, + "loss": 1.17, + "step": 6937 + }, + { + "epoch": 2.5830531349257138, + "grad_norm": 0.16696304082870483, + "learning_rate": 1.750713535267967e-05, + "loss": 1.1632, + "step": 6938 + }, + { + "epoch": 2.5834254400763226, + "grad_norm": 0.16897381842136383, + "learning_rate": 1.7506332425413505e-05, + "loss": 1.1773, + "step": 6939 + }, + { + "epoch": 2.5837977452269314, + "grad_norm": 0.16685079038143158, + "learning_rate": 1.750552938727928e-05, + "loss": 1.1664, + "step": 6940 + }, + { + "epoch": 2.5841700503775407, + "grad_norm": 0.1602102816104889, + "learning_rate": 1.7504726238288857e-05, + "loss": 1.174, + "step": 6941 + }, + { + "epoch": 2.58454235552815, + "grad_norm": 0.21326759457588196, + "learning_rate": 1.7503922978454094e-05, + "loss": 1.179, + "step": 6942 + }, + { + "epoch": 2.5849146606787587, + "grad_norm": 0.16109180450439453, + "learning_rate": 1.7503119607786865e-05, + "loss": 1.1676, + "step": 6943 + }, + { + "epoch": 2.585286965829368, + "grad_norm": 0.1658046394586563, + "learning_rate": 1.7502316126299027e-05, + "loss": 1.1825, + "step": 6944 + }, + { + "epoch": 2.585659270979977, + "grad_norm": 0.16446712613105774, + "learning_rate": 1.7501512534002453e-05, + "loss": 1.1782, + "step": 6945 + }, + { + "epoch": 2.586031576130586, + "grad_norm": 0.1652338057756424, + "learning_rate": 1.7500708830909006e-05, + "loss": 1.1615, + "step": 6946 + }, + { + "epoch": 2.5864038812811954, + "grad_norm": 0.16343404352664948, + "learning_rate": 1.7499905017030565e-05, + "loss": 1.1683, + "step": 6947 + }, + { + "epoch": 2.586776186431804, + "grad_norm": 0.16519500315189362, + "learning_rate": 1.7499101092378995e-05, + "loss": 1.175, + "step": 6948 + }, + { + "epoch": 2.587148491582413, + "grad_norm": 0.16351045668125153, + "learning_rate": 1.7498297056966174e-05, + "loss": 1.1709, + "step": 6949 + }, + { + "epoch": 2.5875207967330223, + "grad_norm": 0.1664671003818512, + "learning_rate": 1.7497492910803972e-05, + "loss": 1.169, + "step": 6950 + }, + { + "epoch": 2.5878931018836315, + "grad_norm": 0.16572503745555878, + "learning_rate": 1.7496688653904277e-05, + "loss": 1.1797, + "step": 6951 + }, + { + "epoch": 2.5882654070342404, + "grad_norm": 0.16367551684379578, + "learning_rate": 1.7495884286278955e-05, + "loss": 1.1695, + "step": 6952 + }, + { + "epoch": 2.5886377121848496, + "grad_norm": 0.16530726850032806, + "learning_rate": 1.7495079807939897e-05, + "loss": 1.1604, + "step": 6953 + }, + { + "epoch": 2.5890100173354584, + "grad_norm": 0.16979095339775085, + "learning_rate": 1.7494275218898976e-05, + "loss": 1.1693, + "step": 6954 + }, + { + "epoch": 2.5893823224860677, + "grad_norm": 0.1681201159954071, + "learning_rate": 1.749347051916808e-05, + "loss": 1.1655, + "step": 6955 + }, + { + "epoch": 2.589754627636677, + "grad_norm": 0.17620940506458282, + "learning_rate": 1.74926657087591e-05, + "loss": 1.1716, + "step": 6956 + }, + { + "epoch": 2.590126932787286, + "grad_norm": 0.16808199882507324, + "learning_rate": 1.7491860787683915e-05, + "loss": 1.1593, + "step": 6957 + }, + { + "epoch": 2.5904992379378946, + "grad_norm": 0.16996102035045624, + "learning_rate": 1.7491055755954418e-05, + "loss": 1.179, + "step": 6958 + }, + { + "epoch": 2.590871543088504, + "grad_norm": 0.16559867560863495, + "learning_rate": 1.7490250613582492e-05, + "loss": 1.1794, + "step": 6959 + }, + { + "epoch": 2.591243848239113, + "grad_norm": 0.16791416704654694, + "learning_rate": 1.748944536058004e-05, + "loss": 1.1791, + "step": 6960 + }, + { + "epoch": 2.591616153389722, + "grad_norm": 0.17107127606868744, + "learning_rate": 1.7488639996958952e-05, + "loss": 1.1862, + "step": 6961 + }, + { + "epoch": 2.591988458540331, + "grad_norm": 0.16655850410461426, + "learning_rate": 1.7487834522731115e-05, + "loss": 1.1807, + "step": 6962 + }, + { + "epoch": 2.59236076369094, + "grad_norm": 0.171476349234581, + "learning_rate": 1.7487028937908436e-05, + "loss": 1.1816, + "step": 6963 + }, + { + "epoch": 2.5927330688415493, + "grad_norm": 0.1685655415058136, + "learning_rate": 1.748622324250281e-05, + "loss": 1.1735, + "step": 6964 + }, + { + "epoch": 2.5931053739921586, + "grad_norm": 0.17578360438346863, + "learning_rate": 1.7485417436526134e-05, + "loss": 1.1802, + "step": 6965 + }, + { + "epoch": 2.5934776791427674, + "grad_norm": 0.17003393173217773, + "learning_rate": 1.748461151999031e-05, + "loss": 1.1937, + "step": 6966 + }, + { + "epoch": 2.593849984293376, + "grad_norm": 0.18038241565227509, + "learning_rate": 1.7483805492907246e-05, + "loss": 1.1826, + "step": 6967 + }, + { + "epoch": 2.5942222894439855, + "grad_norm": 0.23492062091827393, + "learning_rate": 1.7482999355288846e-05, + "loss": 1.1888, + "step": 6968 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 0.18728743493556976, + "learning_rate": 1.7482193107147012e-05, + "loss": 1.1753, + "step": 6969 + }, + { + "epoch": 2.5949668997452036, + "grad_norm": 0.1779962182044983, + "learning_rate": 1.7481386748493664e-05, + "loss": 1.1748, + "step": 6970 + }, + { + "epoch": 2.595339204895813, + "grad_norm": 0.1643301546573639, + "learning_rate": 1.7480580279340694e-05, + "loss": 1.17, + "step": 6971 + }, + { + "epoch": 2.5957115100464216, + "grad_norm": 0.25796541571617126, + "learning_rate": 1.7479773699700024e-05, + "loss": 1.1708, + "step": 6972 + }, + { + "epoch": 2.596083815197031, + "grad_norm": 0.1716795414686203, + "learning_rate": 1.747896700958357e-05, + "loss": 1.1742, + "step": 6973 + }, + { + "epoch": 2.59645612034764, + "grad_norm": 0.17768600583076477, + "learning_rate": 1.747816020900324e-05, + "loss": 1.1841, + "step": 6974 + }, + { + "epoch": 2.596828425498249, + "grad_norm": 0.164155513048172, + "learning_rate": 1.7477353297970952e-05, + "loss": 1.1678, + "step": 6975 + }, + { + "epoch": 2.597200730648858, + "grad_norm": 0.16183872520923615, + "learning_rate": 1.7476546276498625e-05, + "loss": 1.1626, + "step": 6976 + }, + { + "epoch": 2.597573035799467, + "grad_norm": 0.17142947018146515, + "learning_rate": 1.7475739144598183e-05, + "loss": 1.1773, + "step": 6977 + }, + { + "epoch": 2.5979453409500763, + "grad_norm": 0.16305896639823914, + "learning_rate": 1.7474931902281538e-05, + "loss": 1.182, + "step": 6978 + }, + { + "epoch": 2.598317646100685, + "grad_norm": 0.17089135944843292, + "learning_rate": 1.747412454956062e-05, + "loss": 1.1743, + "step": 6979 + }, + { + "epoch": 2.5986899512512944, + "grad_norm": 0.1611926555633545, + "learning_rate": 1.747331708644735e-05, + "loss": 1.1753, + "step": 6980 + }, + { + "epoch": 2.5990622564019032, + "grad_norm": 0.16074836254119873, + "learning_rate": 1.747250951295366e-05, + "loss": 1.1583, + "step": 6981 + }, + { + "epoch": 2.5994345615525125, + "grad_norm": 0.16934338212013245, + "learning_rate": 1.7471701829091468e-05, + "loss": 1.1796, + "step": 6982 + }, + { + "epoch": 2.5998068667031218, + "grad_norm": 0.16507279872894287, + "learning_rate": 1.747089403487271e-05, + "loss": 1.1742, + "step": 6983 + }, + { + "epoch": 2.6001791718537306, + "grad_norm": 0.16199171543121338, + "learning_rate": 1.747008613030932e-05, + "loss": 1.1715, + "step": 6984 + }, + { + "epoch": 2.6005514770043394, + "grad_norm": 0.15591688454151154, + "learning_rate": 1.7469278115413222e-05, + "loss": 1.171, + "step": 6985 + }, + { + "epoch": 2.6009237821549487, + "grad_norm": 0.16649380326271057, + "learning_rate": 1.7468469990196358e-05, + "loss": 1.1597, + "step": 6986 + }, + { + "epoch": 2.601296087305558, + "grad_norm": 0.1675834059715271, + "learning_rate": 1.746766175467066e-05, + "loss": 1.183, + "step": 6987 + }, + { + "epoch": 2.6016683924561668, + "grad_norm": 0.15882286429405212, + "learning_rate": 1.7466853408848067e-05, + "loss": 1.1885, + "step": 6988 + }, + { + "epoch": 2.602040697606776, + "grad_norm": 0.16068002581596375, + "learning_rate": 1.7466044952740517e-05, + "loss": 1.1896, + "step": 6989 + }, + { + "epoch": 2.602413002757385, + "grad_norm": 0.16341431438922882, + "learning_rate": 1.7465236386359952e-05, + "loss": 1.1716, + "step": 6990 + }, + { + "epoch": 2.602785307907994, + "grad_norm": 0.1623801589012146, + "learning_rate": 1.746442770971831e-05, + "loss": 1.1738, + "step": 6991 + }, + { + "epoch": 2.6031576130586034, + "grad_norm": 0.16583126783370972, + "learning_rate": 1.7463618922827545e-05, + "loss": 1.1788, + "step": 6992 + }, + { + "epoch": 2.603529918209212, + "grad_norm": 0.16922509670257568, + "learning_rate": 1.74628100256996e-05, + "loss": 1.1855, + "step": 6993 + }, + { + "epoch": 2.603902223359821, + "grad_norm": 0.16294334828853607, + "learning_rate": 1.7462001018346408e-05, + "loss": 1.178, + "step": 6994 + }, + { + "epoch": 2.6042745285104303, + "grad_norm": 0.16397880017757416, + "learning_rate": 1.7461191900779936e-05, + "loss": 1.1779, + "step": 6995 + }, + { + "epoch": 2.6046468336610396, + "grad_norm": 0.16646462678909302, + "learning_rate": 1.746038267301213e-05, + "loss": 1.1666, + "step": 6996 + }, + { + "epoch": 2.6050191388116484, + "grad_norm": 0.16266998648643494, + "learning_rate": 1.7459573335054935e-05, + "loss": 1.1792, + "step": 6997 + }, + { + "epoch": 2.6053914439622576, + "grad_norm": 0.16222761571407318, + "learning_rate": 1.745876388692031e-05, + "loss": 1.169, + "step": 6998 + }, + { + "epoch": 2.6057637491128665, + "grad_norm": 0.16887149214744568, + "learning_rate": 1.7457954328620217e-05, + "loss": 1.1724, + "step": 6999 + }, + { + "epoch": 2.6061360542634757, + "grad_norm": 0.16719955205917358, + "learning_rate": 1.74571446601666e-05, + "loss": 1.1934, + "step": 7000 + }, + { + "epoch": 2.6061360542634757, + "eval_loss": 1.2935380935668945, + "eval_runtime": 16.7437, + "eval_samples_per_second": 103.562, + "eval_steps_per_second": 5.196, + "step": 7000 + }, + { + "epoch": 2.606508359414085, + "grad_norm": 0.16188542544841766, + "learning_rate": 1.7456334881571428e-05, + "loss": 1.1968, + "step": 7001 + }, + { + "epoch": 2.606880664564694, + "grad_norm": 0.16819295287132263, + "learning_rate": 1.745552499284666e-05, + "loss": 1.1796, + "step": 7002 + }, + { + "epoch": 2.607252969715303, + "grad_norm": 0.1619422435760498, + "learning_rate": 1.745471499400425e-05, + "loss": 1.1739, + "step": 7003 + }, + { + "epoch": 2.607625274865912, + "grad_norm": 0.16731281578540802, + "learning_rate": 1.745390488505617e-05, + "loss": 1.1763, + "step": 7004 + }, + { + "epoch": 2.607997580016521, + "grad_norm": 0.16323542594909668, + "learning_rate": 1.745309466601438e-05, + "loss": 1.1742, + "step": 7005 + }, + { + "epoch": 2.60836988516713, + "grad_norm": 0.16577135026454926, + "learning_rate": 1.7452284336890853e-05, + "loss": 1.1684, + "step": 7006 + }, + { + "epoch": 2.6087421903177392, + "grad_norm": 0.16486932337284088, + "learning_rate": 1.7451473897697552e-05, + "loss": 1.168, + "step": 7007 + }, + { + "epoch": 2.609114495468348, + "grad_norm": 0.17325793206691742, + "learning_rate": 1.745066334844645e-05, + "loss": 1.1709, + "step": 7008 + }, + { + "epoch": 2.6094868006189573, + "grad_norm": 0.1616426408290863, + "learning_rate": 1.744985268914952e-05, + "loss": 1.1741, + "step": 7009 + }, + { + "epoch": 2.6098591057695666, + "grad_norm": 0.16339850425720215, + "learning_rate": 1.744904191981873e-05, + "loss": 1.1719, + "step": 7010 + }, + { + "epoch": 2.6102314109201754, + "grad_norm": 0.16253487765789032, + "learning_rate": 1.744823104046606e-05, + "loss": 1.1705, + "step": 7011 + }, + { + "epoch": 2.6106037160707847, + "grad_norm": 0.17588497698307037, + "learning_rate": 1.7447420051103483e-05, + "loss": 1.178, + "step": 7012 + }, + { + "epoch": 2.6109760212213935, + "grad_norm": 0.18001097440719604, + "learning_rate": 1.744660895174298e-05, + "loss": 1.1642, + "step": 7013 + }, + { + "epoch": 2.6113483263720028, + "grad_norm": 0.1719658225774765, + "learning_rate": 1.7445797742396535e-05, + "loss": 1.1675, + "step": 7014 + }, + { + "epoch": 2.6117206315226116, + "grad_norm": 0.20831653475761414, + "learning_rate": 1.7444986423076116e-05, + "loss": 1.168, + "step": 7015 + }, + { + "epoch": 2.612092936673221, + "grad_norm": 0.17336048185825348, + "learning_rate": 1.744417499379372e-05, + "loss": 1.1814, + "step": 7016 + }, + { + "epoch": 2.6124652418238297, + "grad_norm": 0.17226654291152954, + "learning_rate": 1.7443363454561327e-05, + "loss": 1.1761, + "step": 7017 + }, + { + "epoch": 2.612837546974439, + "grad_norm": 0.20160022377967834, + "learning_rate": 1.744255180539092e-05, + "loss": 1.1843, + "step": 7018 + }, + { + "epoch": 2.613209852125048, + "grad_norm": 0.1713937222957611, + "learning_rate": 1.7441740046294496e-05, + "loss": 1.1625, + "step": 7019 + }, + { + "epoch": 2.613582157275657, + "grad_norm": 0.1691260188817978, + "learning_rate": 1.744092817728403e-05, + "loss": 1.1839, + "step": 7020 + }, + { + "epoch": 2.6139544624262663, + "grad_norm": 0.16929064691066742, + "learning_rate": 1.7440116198371528e-05, + "loss": 1.1644, + "step": 7021 + }, + { + "epoch": 2.614326767576875, + "grad_norm": 0.16705340147018433, + "learning_rate": 1.7439304109568972e-05, + "loss": 1.1847, + "step": 7022 + }, + { + "epoch": 2.6146990727274844, + "grad_norm": 0.16961196064949036, + "learning_rate": 1.7438491910888367e-05, + "loss": 1.1534, + "step": 7023 + }, + { + "epoch": 2.615071377878093, + "grad_norm": 0.16219016909599304, + "learning_rate": 1.74376796023417e-05, + "loss": 1.1695, + "step": 7024 + }, + { + "epoch": 2.6154436830287024, + "grad_norm": 0.19066371023654938, + "learning_rate": 1.7436867183940972e-05, + "loss": 1.1739, + "step": 7025 + }, + { + "epoch": 2.6158159881793113, + "grad_norm": 0.16923125088214874, + "learning_rate": 1.7436054655698184e-05, + "loss": 1.1544, + "step": 7026 + }, + { + "epoch": 2.6161882933299205, + "grad_norm": 0.1667940318584442, + "learning_rate": 1.7435242017625333e-05, + "loss": 1.167, + "step": 7027 + }, + { + "epoch": 2.61656059848053, + "grad_norm": 0.18602821230888367, + "learning_rate": 1.7434429269734426e-05, + "loss": 1.1667, + "step": 7028 + }, + { + "epoch": 2.6169329036311386, + "grad_norm": 0.17603205144405365, + "learning_rate": 1.7433616412037462e-05, + "loss": 1.1759, + "step": 7029 + }, + { + "epoch": 2.617305208781748, + "grad_norm": 0.15942661464214325, + "learning_rate": 1.7432803444546454e-05, + "loss": 1.1731, + "step": 7030 + }, + { + "epoch": 2.6176775139323567, + "grad_norm": 0.19452649354934692, + "learning_rate": 1.7431990367273402e-05, + "loss": 1.1756, + "step": 7031 + }, + { + "epoch": 2.618049819082966, + "grad_norm": 0.1983923316001892, + "learning_rate": 1.7431177180230323e-05, + "loss": 1.1745, + "step": 7032 + }, + { + "epoch": 2.618422124233575, + "grad_norm": 0.16836190223693848, + "learning_rate": 1.7430363883429218e-05, + "loss": 1.1774, + "step": 7033 + }, + { + "epoch": 2.618794429384184, + "grad_norm": 0.29323258996009827, + "learning_rate": 1.742955047688211e-05, + "loss": 1.178, + "step": 7034 + }, + { + "epoch": 2.619166734534793, + "grad_norm": 0.19639234244823456, + "learning_rate": 1.7428736960601004e-05, + "loss": 1.1731, + "step": 7035 + }, + { + "epoch": 2.619539039685402, + "grad_norm": 0.1906968504190445, + "learning_rate": 1.7427923334597922e-05, + "loss": 1.1848, + "step": 7036 + }, + { + "epoch": 2.6199113448360114, + "grad_norm": 0.16878125071525574, + "learning_rate": 1.7427109598884877e-05, + "loss": 1.1764, + "step": 7037 + }, + { + "epoch": 2.6202836499866202, + "grad_norm": 0.16486749053001404, + "learning_rate": 1.742629575347389e-05, + "loss": 1.1691, + "step": 7038 + }, + { + "epoch": 2.6206559551372295, + "grad_norm": 0.17094659805297852, + "learning_rate": 1.742548179837698e-05, + "loss": 1.174, + "step": 7039 + }, + { + "epoch": 2.6210282602878383, + "grad_norm": 0.18062889575958252, + "learning_rate": 1.742466773360617e-05, + "loss": 1.1766, + "step": 7040 + }, + { + "epoch": 2.6214005654384476, + "grad_norm": 0.16564013063907623, + "learning_rate": 1.742385355917348e-05, + "loss": 1.1826, + "step": 7041 + }, + { + "epoch": 2.6217728705890564, + "grad_norm": 0.15717479586601257, + "learning_rate": 1.7423039275090947e-05, + "loss": 1.1718, + "step": 7042 + }, + { + "epoch": 2.6221451757396657, + "grad_norm": 0.16660241782665253, + "learning_rate": 1.7422224881370585e-05, + "loss": 1.1845, + "step": 7043 + }, + { + "epoch": 2.6225174808902745, + "grad_norm": 0.16383567452430725, + "learning_rate": 1.7421410378024428e-05, + "loss": 1.1843, + "step": 7044 + }, + { + "epoch": 2.6228897860408837, + "grad_norm": 0.1657799780368805, + "learning_rate": 1.7420595765064505e-05, + "loss": 1.1704, + "step": 7045 + }, + { + "epoch": 2.623262091191493, + "grad_norm": 0.16230309009552002, + "learning_rate": 1.7419781042502846e-05, + "loss": 1.1725, + "step": 7046 + }, + { + "epoch": 2.623634396342102, + "grad_norm": 0.16071511805057526, + "learning_rate": 1.7418966210351492e-05, + "loss": 1.1682, + "step": 7047 + }, + { + "epoch": 2.624006701492711, + "grad_norm": 0.18019315600395203, + "learning_rate": 1.741815126862247e-05, + "loss": 1.1601, + "step": 7048 + }, + { + "epoch": 2.62437900664332, + "grad_norm": 0.16016197204589844, + "learning_rate": 1.741733621732782e-05, + "loss": 1.1783, + "step": 7049 + }, + { + "epoch": 2.624751311793929, + "grad_norm": 0.15661844611167908, + "learning_rate": 1.7416521056479577e-05, + "loss": 1.1509, + "step": 7050 + }, + { + "epoch": 2.6251236169445384, + "grad_norm": 0.16214154660701752, + "learning_rate": 1.7415705786089784e-05, + "loss": 1.162, + "step": 7051 + }, + { + "epoch": 2.6254959220951473, + "grad_norm": 0.16358621418476105, + "learning_rate": 1.7414890406170487e-05, + "loss": 1.1727, + "step": 7052 + }, + { + "epoch": 2.625868227245756, + "grad_norm": 0.1643485575914383, + "learning_rate": 1.7414074916733715e-05, + "loss": 1.168, + "step": 7053 + }, + { + "epoch": 2.6262405323963653, + "grad_norm": 0.16986949741840363, + "learning_rate": 1.7413259317791528e-05, + "loss": 1.1664, + "step": 7054 + }, + { + "epoch": 2.6266128375469746, + "grad_norm": 0.17110756039619446, + "learning_rate": 1.7412443609355967e-05, + "loss": 1.163, + "step": 7055 + }, + { + "epoch": 2.6269851426975834, + "grad_norm": 0.16550353169441223, + "learning_rate": 1.7411627791439073e-05, + "loss": 1.1659, + "step": 7056 + }, + { + "epoch": 2.6273574478481927, + "grad_norm": 0.16365563869476318, + "learning_rate": 1.7410811864052908e-05, + "loss": 1.1871, + "step": 7057 + }, + { + "epoch": 2.6277297529988015, + "grad_norm": 0.17241568863391876, + "learning_rate": 1.7409995827209517e-05, + "loss": 1.1757, + "step": 7058 + }, + { + "epoch": 2.628102058149411, + "grad_norm": 0.16813285648822784, + "learning_rate": 1.7409179680920945e-05, + "loss": 1.1615, + "step": 7059 + }, + { + "epoch": 2.62847436330002, + "grad_norm": 0.15777957439422607, + "learning_rate": 1.740836342519926e-05, + "loss": 1.167, + "step": 7060 + }, + { + "epoch": 2.628846668450629, + "grad_norm": 0.1858019232749939, + "learning_rate": 1.7407547060056514e-05, + "loss": 1.1673, + "step": 7061 + }, + { + "epoch": 2.6292189736012377, + "grad_norm": 0.15979260206222534, + "learning_rate": 1.740673058550476e-05, + "loss": 1.178, + "step": 7062 + }, + { + "epoch": 2.629591278751847, + "grad_norm": 0.1698780208826065, + "learning_rate": 1.7405914001556058e-05, + "loss": 1.1721, + "step": 7063 + }, + { + "epoch": 2.629963583902456, + "grad_norm": 0.16836461424827576, + "learning_rate": 1.7405097308222474e-05, + "loss": 1.1717, + "step": 7064 + }, + { + "epoch": 2.630335889053065, + "grad_norm": 0.16282488405704498, + "learning_rate": 1.740428050551607e-05, + "loss": 1.1735, + "step": 7065 + }, + { + "epoch": 2.6307081942036743, + "grad_norm": 0.16781288385391235, + "learning_rate": 1.74034635934489e-05, + "loss": 1.2007, + "step": 7066 + }, + { + "epoch": 2.631080499354283, + "grad_norm": 0.16193710267543793, + "learning_rate": 1.7402646572033043e-05, + "loss": 1.1854, + "step": 7067 + }, + { + "epoch": 2.6314528045048924, + "grad_norm": 0.16660821437835693, + "learning_rate": 1.7401829441280563e-05, + "loss": 1.1706, + "step": 7068 + }, + { + "epoch": 2.6318251096555016, + "grad_norm": 0.16784422099590302, + "learning_rate": 1.740101220120352e-05, + "loss": 1.1672, + "step": 7069 + }, + { + "epoch": 2.6321974148061105, + "grad_norm": 0.17018967866897583, + "learning_rate": 1.7400194851813994e-05, + "loss": 1.1758, + "step": 7070 + }, + { + "epoch": 2.6325697199567193, + "grad_norm": 0.16101203858852386, + "learning_rate": 1.7399377393124056e-05, + "loss": 1.185, + "step": 7071 + }, + { + "epoch": 2.6329420251073286, + "grad_norm": 0.16601742804050446, + "learning_rate": 1.7398559825145776e-05, + "loss": 1.17, + "step": 7072 + }, + { + "epoch": 2.633314330257938, + "grad_norm": 0.16237667202949524, + "learning_rate": 1.7397742147891234e-05, + "loss": 1.1807, + "step": 7073 + }, + { + "epoch": 2.6336866354085466, + "grad_norm": 0.19661523401737213, + "learning_rate": 1.7396924361372504e-05, + "loss": 1.1696, + "step": 7074 + }, + { + "epoch": 2.634058940559156, + "grad_norm": 0.18306462466716766, + "learning_rate": 1.7396106465601662e-05, + "loss": 1.1693, + "step": 7075 + }, + { + "epoch": 2.6344312457097647, + "grad_norm": 0.16825948655605316, + "learning_rate": 1.7395288460590797e-05, + "loss": 1.1752, + "step": 7076 + }, + { + "epoch": 2.634803550860374, + "grad_norm": 0.18328233063220978, + "learning_rate": 1.739447034635198e-05, + "loss": 1.1778, + "step": 7077 + }, + { + "epoch": 2.6351758560109833, + "grad_norm": 0.1660008579492569, + "learning_rate": 1.7393652122897306e-05, + "loss": 1.1784, + "step": 7078 + }, + { + "epoch": 2.635548161161592, + "grad_norm": 0.1794942319393158, + "learning_rate": 1.7392833790238854e-05, + "loss": 1.1743, + "step": 7079 + }, + { + "epoch": 2.635920466312201, + "grad_norm": 0.16286969184875488, + "learning_rate": 1.7392015348388707e-05, + "loss": 1.1833, + "step": 7080 + }, + { + "epoch": 2.63629277146281, + "grad_norm": 0.18671861290931702, + "learning_rate": 1.7391196797358957e-05, + "loss": 1.1811, + "step": 7081 + }, + { + "epoch": 2.6366650766134194, + "grad_norm": 0.16873905062675476, + "learning_rate": 1.7390378137161694e-05, + "loss": 1.1764, + "step": 7082 + }, + { + "epoch": 2.6370373817640282, + "grad_norm": 0.16419054567813873, + "learning_rate": 1.7389559367809012e-05, + "loss": 1.1631, + "step": 7083 + }, + { + "epoch": 2.6374096869146375, + "grad_norm": 0.2097456008195877, + "learning_rate": 1.7388740489313e-05, + "loss": 1.1651, + "step": 7084 + }, + { + "epoch": 2.6377819920652463, + "grad_norm": 0.19423960149288177, + "learning_rate": 1.7387921501685757e-05, + "loss": 1.1651, + "step": 7085 + }, + { + "epoch": 2.6381542972158556, + "grad_norm": 0.17196422815322876, + "learning_rate": 1.7387102404939375e-05, + "loss": 1.1788, + "step": 7086 + }, + { + "epoch": 2.638526602366465, + "grad_norm": 0.2757280170917511, + "learning_rate": 1.7386283199085957e-05, + "loss": 1.182, + "step": 7087 + }, + { + "epoch": 2.6388989075170737, + "grad_norm": 0.16954492032527924, + "learning_rate": 1.73854638841376e-05, + "loss": 1.1732, + "step": 7088 + }, + { + "epoch": 2.6392712126676825, + "grad_norm": 0.1665467768907547, + "learning_rate": 1.7384644460106403e-05, + "loss": 1.1731, + "step": 7089 + }, + { + "epoch": 2.6396435178182918, + "grad_norm": 0.16451166570186615, + "learning_rate": 1.738382492700447e-05, + "loss": 1.1616, + "step": 7090 + }, + { + "epoch": 2.640015822968901, + "grad_norm": 0.16248999536037445, + "learning_rate": 1.7383005284843902e-05, + "loss": 1.1636, + "step": 7091 + }, + { + "epoch": 2.64038812811951, + "grad_norm": 0.1698039472103119, + "learning_rate": 1.7382185533636815e-05, + "loss": 1.1749, + "step": 7092 + }, + { + "epoch": 2.640760433270119, + "grad_norm": 0.17128440737724304, + "learning_rate": 1.738136567339531e-05, + "loss": 1.1702, + "step": 7093 + }, + { + "epoch": 2.641132738420728, + "grad_norm": 0.16965392231941223, + "learning_rate": 1.7380545704131496e-05, + "loss": 1.1698, + "step": 7094 + }, + { + "epoch": 2.641505043571337, + "grad_norm": 0.17016279697418213, + "learning_rate": 1.737972562585749e-05, + "loss": 1.1715, + "step": 7095 + }, + { + "epoch": 2.6418773487219465, + "grad_norm": 0.16740964353084564, + "learning_rate": 1.7378905438585394e-05, + "loss": 1.1743, + "step": 7096 + }, + { + "epoch": 2.6422496538725553, + "grad_norm": 0.16494162380695343, + "learning_rate": 1.737808514232733e-05, + "loss": 1.1626, + "step": 7097 + }, + { + "epoch": 2.642621959023164, + "grad_norm": 0.17924334108829498, + "learning_rate": 1.7377264737095408e-05, + "loss": 1.1731, + "step": 7098 + }, + { + "epoch": 2.6429942641737734, + "grad_norm": 0.16398359835147858, + "learning_rate": 1.7376444222901754e-05, + "loss": 1.1804, + "step": 7099 + }, + { + "epoch": 2.6433665693243826, + "grad_norm": 0.16579513251781464, + "learning_rate": 1.737562359975848e-05, + "loss": 1.1583, + "step": 7100 + }, + { + "epoch": 2.6437388744749915, + "grad_norm": 0.16405485570430756, + "learning_rate": 1.7374802867677706e-05, + "loss": 1.1651, + "step": 7101 + }, + { + "epoch": 2.6441111796256007, + "grad_norm": 0.16502594947814941, + "learning_rate": 1.7373982026671557e-05, + "loss": 1.1766, + "step": 7102 + }, + { + "epoch": 2.6444834847762095, + "grad_norm": 0.16717404127120972, + "learning_rate": 1.737316107675216e-05, + "loss": 1.1841, + "step": 7103 + }, + { + "epoch": 2.644855789926819, + "grad_norm": 0.16163359582424164, + "learning_rate": 1.7372340017931636e-05, + "loss": 1.1678, + "step": 7104 + }, + { + "epoch": 2.645228095077428, + "grad_norm": 0.17032460868358612, + "learning_rate": 1.737151885022211e-05, + "loss": 1.1817, + "step": 7105 + }, + { + "epoch": 2.645600400228037, + "grad_norm": 0.16142508387565613, + "learning_rate": 1.7370697573635714e-05, + "loss": 1.1869, + "step": 7106 + }, + { + "epoch": 2.6459727053786457, + "grad_norm": 0.15926913917064667, + "learning_rate": 1.7369876188184577e-05, + "loss": 1.1717, + "step": 7107 + }, + { + "epoch": 2.646345010529255, + "grad_norm": 0.1710928976535797, + "learning_rate": 1.7369054693880832e-05, + "loss": 1.1809, + "step": 7108 + }, + { + "epoch": 2.6467173156798642, + "grad_norm": 0.1716298758983612, + "learning_rate": 1.7368233090736613e-05, + "loss": 1.1709, + "step": 7109 + }, + { + "epoch": 2.647089620830473, + "grad_norm": 0.16648998856544495, + "learning_rate": 1.736741137876405e-05, + "loss": 1.1778, + "step": 7110 + }, + { + "epoch": 2.6474619259810823, + "grad_norm": 0.16086730360984802, + "learning_rate": 1.7366589557975287e-05, + "loss": 1.1909, + "step": 7111 + }, + { + "epoch": 2.647834231131691, + "grad_norm": 0.16966967284679413, + "learning_rate": 1.7365767628382456e-05, + "loss": 1.1799, + "step": 7112 + }, + { + "epoch": 2.6482065362823004, + "grad_norm": 0.16273948550224304, + "learning_rate": 1.7364945589997703e-05, + "loss": 1.1777, + "step": 7113 + }, + { + "epoch": 2.6485788414329097, + "grad_norm": 0.16940386593341827, + "learning_rate": 1.736412344283316e-05, + "loss": 1.1837, + "step": 7114 + }, + { + "epoch": 2.6489511465835185, + "grad_norm": 0.17343072593212128, + "learning_rate": 1.736330118690098e-05, + "loss": 1.173, + "step": 7115 + }, + { + "epoch": 2.6493234517341273, + "grad_norm": 0.16787442564964294, + "learning_rate": 1.73624788222133e-05, + "loss": 1.1676, + "step": 7116 + }, + { + "epoch": 2.6496957568847366, + "grad_norm": 0.1711404025554657, + "learning_rate": 1.7361656348782275e-05, + "loss": 1.1643, + "step": 7117 + }, + { + "epoch": 2.650068062035346, + "grad_norm": 0.17250101268291473, + "learning_rate": 1.7360833766620046e-05, + "loss": 1.1709, + "step": 7118 + }, + { + "epoch": 2.6504403671859547, + "grad_norm": 0.16760481894016266, + "learning_rate": 1.7360011075738762e-05, + "loss": 1.1867, + "step": 7119 + }, + { + "epoch": 2.650812672336564, + "grad_norm": 0.16904081404209137, + "learning_rate": 1.7359188276150578e-05, + "loss": 1.1658, + "step": 7120 + }, + { + "epoch": 2.6511849774871727, + "grad_norm": 0.16162091493606567, + "learning_rate": 1.7358365367867643e-05, + "loss": 1.1653, + "step": 7121 + }, + { + "epoch": 2.651557282637782, + "grad_norm": 0.166825532913208, + "learning_rate": 1.7357542350902114e-05, + "loss": 1.1697, + "step": 7122 + }, + { + "epoch": 2.6519295877883913, + "grad_norm": 0.16394229233264923, + "learning_rate": 1.7356719225266147e-05, + "loss": 1.1705, + "step": 7123 + }, + { + "epoch": 2.652301892939, + "grad_norm": 0.16806428134441376, + "learning_rate": 1.73558959909719e-05, + "loss": 1.1602, + "step": 7124 + }, + { + "epoch": 2.6526741980896094, + "grad_norm": 0.16216734051704407, + "learning_rate": 1.735507264803153e-05, + "loss": 1.1691, + "step": 7125 + }, + { + "epoch": 2.653046503240218, + "grad_norm": 0.17059317231178284, + "learning_rate": 1.7354249196457198e-05, + "loss": 1.1784, + "step": 7126 + }, + { + "epoch": 2.6534188083908274, + "grad_norm": 0.1767067313194275, + "learning_rate": 1.7353425636261067e-05, + "loss": 1.1686, + "step": 7127 + }, + { + "epoch": 2.6537911135414363, + "grad_norm": 0.16557256877422333, + "learning_rate": 1.7352601967455303e-05, + "loss": 1.1818, + "step": 7128 + }, + { + "epoch": 2.6541634186920455, + "grad_norm": 0.17525194585323334, + "learning_rate": 1.7351778190052067e-05, + "loss": 1.1759, + "step": 7129 + }, + { + "epoch": 2.6545357238426543, + "grad_norm": 0.16801874339580536, + "learning_rate": 1.7350954304063528e-05, + "loss": 1.1652, + "step": 7130 + }, + { + "epoch": 2.6549080289932636, + "grad_norm": 0.16785259544849396, + "learning_rate": 1.7350130309501855e-05, + "loss": 1.1728, + "step": 7131 + }, + { + "epoch": 2.655280334143873, + "grad_norm": 0.18316105008125305, + "learning_rate": 1.734930620637922e-05, + "loss": 1.1842, + "step": 7132 + }, + { + "epoch": 2.6556526392944817, + "grad_norm": 0.18117022514343262, + "learning_rate": 1.7348481994707795e-05, + "loss": 1.1717, + "step": 7133 + }, + { + "epoch": 2.656024944445091, + "grad_norm": 0.17637571692466736, + "learning_rate": 1.734765767449975e-05, + "loss": 1.1801, + "step": 7134 + }, + { + "epoch": 2.6563972495957, + "grad_norm": 0.1829034388065338, + "learning_rate": 1.7346833245767265e-05, + "loss": 1.1731, + "step": 7135 + }, + { + "epoch": 2.656769554746309, + "grad_norm": 0.17745697498321533, + "learning_rate": 1.734600870852251e-05, + "loss": 1.1639, + "step": 7136 + }, + { + "epoch": 2.657141859896918, + "grad_norm": 0.176448792219162, + "learning_rate": 1.7345184062777668e-05, + "loss": 1.1718, + "step": 7137 + }, + { + "epoch": 2.657514165047527, + "grad_norm": 0.1889655888080597, + "learning_rate": 1.734435930854492e-05, + "loss": 1.1717, + "step": 7138 + }, + { + "epoch": 2.657886470198136, + "grad_norm": 0.21692200005054474, + "learning_rate": 1.7343534445836446e-05, + "loss": 1.1726, + "step": 7139 + }, + { + "epoch": 2.658258775348745, + "grad_norm": 0.22923436760902405, + "learning_rate": 1.7342709474664426e-05, + "loss": 1.177, + "step": 7140 + }, + { + "epoch": 2.6586310804993545, + "grad_norm": 0.2060500830411911, + "learning_rate": 1.7341884395041052e-05, + "loss": 1.1685, + "step": 7141 + }, + { + "epoch": 2.6590033856499633, + "grad_norm": 0.16787934303283691, + "learning_rate": 1.7341059206978505e-05, + "loss": 1.1647, + "step": 7142 + }, + { + "epoch": 2.6593756908005726, + "grad_norm": 0.2028505802154541, + "learning_rate": 1.7340233910488973e-05, + "loss": 1.1583, + "step": 7143 + }, + { + "epoch": 2.6597479959511814, + "grad_norm": 0.18768154084682465, + "learning_rate": 1.7339408505584653e-05, + "loss": 1.1632, + "step": 7144 + }, + { + "epoch": 2.6601203011017907, + "grad_norm": 0.17588740587234497, + "learning_rate": 1.7338582992277723e-05, + "loss": 1.1676, + "step": 7145 + }, + { + "epoch": 2.6604926062523995, + "grad_norm": 0.19588559865951538, + "learning_rate": 1.7337757370580385e-05, + "loss": 1.1545, + "step": 7146 + }, + { + "epoch": 2.6608649114030087, + "grad_norm": 0.16614019870758057, + "learning_rate": 1.733693164050483e-05, + "loss": 1.1864, + "step": 7147 + }, + { + "epoch": 2.6612372165536176, + "grad_norm": 0.17815782129764557, + "learning_rate": 1.7336105802063255e-05, + "loss": 1.1624, + "step": 7148 + }, + { + "epoch": 2.661609521704227, + "grad_norm": 0.172567680478096, + "learning_rate": 1.7335279855267858e-05, + "loss": 1.1725, + "step": 7149 + }, + { + "epoch": 2.661981826854836, + "grad_norm": 0.16731174290180206, + "learning_rate": 1.733445380013084e-05, + "loss": 1.1608, + "step": 7150 + }, + { + "epoch": 2.662354132005445, + "grad_norm": 0.17305542528629303, + "learning_rate": 1.7333627636664397e-05, + "loss": 1.1803, + "step": 7151 + }, + { + "epoch": 2.662726437156054, + "grad_norm": 0.16189545392990112, + "learning_rate": 1.7332801364880734e-05, + "loss": 1.1735, + "step": 7152 + }, + { + "epoch": 2.663098742306663, + "grad_norm": 0.25279855728149414, + "learning_rate": 1.7331974984792056e-05, + "loss": 1.1793, + "step": 7153 + }, + { + "epoch": 2.6634710474572723, + "grad_norm": 0.1899125874042511, + "learning_rate": 1.733114849641057e-05, + "loss": 1.1931, + "step": 7154 + }, + { + "epoch": 2.663843352607881, + "grad_norm": 0.17817838490009308, + "learning_rate": 1.7330321899748476e-05, + "loss": 1.1718, + "step": 7155 + }, + { + "epoch": 2.6642156577584903, + "grad_norm": 0.16445818543434143, + "learning_rate": 1.732949519481799e-05, + "loss": 1.1743, + "step": 7156 + }, + { + "epoch": 2.664587962909099, + "grad_norm": 0.1586284637451172, + "learning_rate": 1.732866838163132e-05, + "loss": 1.173, + "step": 7157 + }, + { + "epoch": 2.6649602680597084, + "grad_norm": 0.1685306876897812, + "learning_rate": 1.7327841460200677e-05, + "loss": 1.1722, + "step": 7158 + }, + { + "epoch": 2.6653325732103177, + "grad_norm": 0.16867049038410187, + "learning_rate": 1.732701443053828e-05, + "loss": 1.1736, + "step": 7159 + }, + { + "epoch": 2.6657048783609265, + "grad_norm": 0.16906088590621948, + "learning_rate": 1.7326187292656332e-05, + "loss": 1.1659, + "step": 7160 + }, + { + "epoch": 2.6660771835115358, + "grad_norm": 0.16725754737854004, + "learning_rate": 1.7325360046567065e-05, + "loss": 1.1736, + "step": 7161 + }, + { + "epoch": 2.6664494886621446, + "grad_norm": 0.1647002398967743, + "learning_rate": 1.732453269228268e-05, + "loss": 1.1687, + "step": 7162 + }, + { + "epoch": 2.666821793812754, + "grad_norm": 0.16950398683547974, + "learning_rate": 1.7323705229815416e-05, + "loss": 1.1733, + "step": 7163 + }, + { + "epoch": 2.6671940989633627, + "grad_norm": 0.16586348414421082, + "learning_rate": 1.7322877659177482e-05, + "loss": 1.1649, + "step": 7164 + }, + { + "epoch": 2.667566404113972, + "grad_norm": 0.1677524447441101, + "learning_rate": 1.732204998038111e-05, + "loss": 1.1786, + "step": 7165 + }, + { + "epoch": 2.6679387092645808, + "grad_norm": 0.16064026951789856, + "learning_rate": 1.7321222193438513e-05, + "loss": 1.1817, + "step": 7166 + }, + { + "epoch": 2.66831101441519, + "grad_norm": 0.17599624395370483, + "learning_rate": 1.7320394298361926e-05, + "loss": 1.1767, + "step": 7167 + }, + { + "epoch": 2.6686833195657993, + "grad_norm": 0.16839216649532318, + "learning_rate": 1.7319566295163572e-05, + "loss": 1.165, + "step": 7168 + }, + { + "epoch": 2.669055624716408, + "grad_norm": 0.16207967698574066, + "learning_rate": 1.7318738183855685e-05, + "loss": 1.1772, + "step": 7169 + }, + { + "epoch": 2.6694279298670174, + "grad_norm": 0.1666906476020813, + "learning_rate": 1.7317909964450494e-05, + "loss": 1.1723, + "step": 7170 + }, + { + "epoch": 2.669800235017626, + "grad_norm": 0.1715075820684433, + "learning_rate": 1.731708163696023e-05, + "loss": 1.1788, + "step": 7171 + }, + { + "epoch": 2.6701725401682355, + "grad_norm": 0.1627563238143921, + "learning_rate": 1.7316253201397134e-05, + "loss": 1.1688, + "step": 7172 + }, + { + "epoch": 2.6705448453188447, + "grad_norm": 0.1687730997800827, + "learning_rate": 1.7315424657773433e-05, + "loss": 1.1657, + "step": 7173 + }, + { + "epoch": 2.6709171504694535, + "grad_norm": 0.16402725875377655, + "learning_rate": 1.7314596006101372e-05, + "loss": 1.1852, + "step": 7174 + }, + { + "epoch": 2.6712894556200624, + "grad_norm": 0.16662922501564026, + "learning_rate": 1.7313767246393184e-05, + "loss": 1.1782, + "step": 7175 + }, + { + "epoch": 2.6716617607706716, + "grad_norm": 0.1664755940437317, + "learning_rate": 1.7312938378661118e-05, + "loss": 1.1724, + "step": 7176 + }, + { + "epoch": 2.672034065921281, + "grad_norm": 0.15922842919826508, + "learning_rate": 1.7312109402917406e-05, + "loss": 1.1719, + "step": 7177 + }, + { + "epoch": 2.6724063710718897, + "grad_norm": 0.16002245247364044, + "learning_rate": 1.73112803191743e-05, + "loss": 1.176, + "step": 7178 + }, + { + "epoch": 2.672778676222499, + "grad_norm": 0.16558903455734253, + "learning_rate": 1.731045112744404e-05, + "loss": 1.1703, + "step": 7179 + }, + { + "epoch": 2.673150981373108, + "grad_norm": 0.16779029369354248, + "learning_rate": 1.7309621827738877e-05, + "loss": 1.1909, + "step": 7180 + }, + { + "epoch": 2.673523286523717, + "grad_norm": 0.16729268431663513, + "learning_rate": 1.730879242007106e-05, + "loss": 1.1688, + "step": 7181 + }, + { + "epoch": 2.6738955916743263, + "grad_norm": 0.1586335152387619, + "learning_rate": 1.7307962904452837e-05, + "loss": 1.1748, + "step": 7182 + }, + { + "epoch": 2.674267896824935, + "grad_norm": 0.16203922033309937, + "learning_rate": 1.730713328089646e-05, + "loss": 1.1755, + "step": 7183 + }, + { + "epoch": 2.674640201975544, + "grad_norm": 0.17098890244960785, + "learning_rate": 1.730630354941418e-05, + "loss": 1.1814, + "step": 7184 + }, + { + "epoch": 2.6750125071261532, + "grad_norm": 0.16195635497570038, + "learning_rate": 1.7305473710018258e-05, + "loss": 1.1654, + "step": 7185 + }, + { + "epoch": 2.6753848122767625, + "grad_norm": 0.16069965064525604, + "learning_rate": 1.730464376272095e-05, + "loss": 1.1661, + "step": 7186 + }, + { + "epoch": 2.6757571174273713, + "grad_norm": 0.1605072170495987, + "learning_rate": 1.7303813707534506e-05, + "loss": 1.1829, + "step": 7187 + }, + { + "epoch": 2.6761294225779806, + "grad_norm": 0.16202007234096527, + "learning_rate": 1.7302983544471197e-05, + "loss": 1.1668, + "step": 7188 + }, + { + "epoch": 2.6765017277285894, + "grad_norm": 0.16660577058792114, + "learning_rate": 1.7302153273543276e-05, + "loss": 1.1741, + "step": 7189 + }, + { + "epoch": 2.6768740328791987, + "grad_norm": 0.16862109303474426, + "learning_rate": 1.7301322894763013e-05, + "loss": 1.1775, + "step": 7190 + }, + { + "epoch": 2.677246338029808, + "grad_norm": 0.1640176773071289, + "learning_rate": 1.7300492408142666e-05, + "loss": 1.1671, + "step": 7191 + }, + { + "epoch": 2.6776186431804168, + "grad_norm": 0.1989293098449707, + "learning_rate": 1.729966181369451e-05, + "loss": 1.1844, + "step": 7192 + }, + { + "epoch": 2.6779909483310256, + "grad_norm": 0.21205829083919525, + "learning_rate": 1.72988311114308e-05, + "loss": 1.1789, + "step": 7193 + }, + { + "epoch": 2.678363253481635, + "grad_norm": 0.16825103759765625, + "learning_rate": 1.7298000301363815e-05, + "loss": 1.1659, + "step": 7194 + }, + { + "epoch": 2.678735558632244, + "grad_norm": 0.5028261542320251, + "learning_rate": 1.729716938350582e-05, + "loss": 1.1628, + "step": 7195 + }, + { + "epoch": 2.679107863782853, + "grad_norm": 0.1976155936717987, + "learning_rate": 1.72963383578691e-05, + "loss": 1.172, + "step": 7196 + }, + { + "epoch": 2.679480168933462, + "grad_norm": 0.18413805961608887, + "learning_rate": 1.7295507224465913e-05, + "loss": 1.1695, + "step": 7197 + }, + { + "epoch": 2.679852474084071, + "grad_norm": 0.17518746852874756, + "learning_rate": 1.7294675983308545e-05, + "loss": 1.1602, + "step": 7198 + }, + { + "epoch": 2.6802247792346803, + "grad_norm": 0.1705147624015808, + "learning_rate": 1.729384463440927e-05, + "loss": 1.1783, + "step": 7199 + }, + { + "epoch": 2.6805970843852895, + "grad_norm": 0.18338525295257568, + "learning_rate": 1.7293013177780368e-05, + "loss": 1.1786, + "step": 7200 + }, + { + "epoch": 2.6809693895358984, + "grad_norm": 0.1804172545671463, + "learning_rate": 1.7292181613434117e-05, + "loss": 1.1682, + "step": 7201 + }, + { + "epoch": 2.681341694686507, + "grad_norm": 0.17289164662361145, + "learning_rate": 1.7291349941382804e-05, + "loss": 1.1769, + "step": 7202 + }, + { + "epoch": 2.6817139998371164, + "grad_norm": 0.15843546390533447, + "learning_rate": 1.7290518161638707e-05, + "loss": 1.1736, + "step": 7203 + }, + { + "epoch": 2.6820863049877257, + "grad_norm": 0.16561047732830048, + "learning_rate": 1.7289686274214116e-05, + "loss": 1.1621, + "step": 7204 + }, + { + "epoch": 2.6824586101383345, + "grad_norm": 0.16593630611896515, + "learning_rate": 1.7288854279121318e-05, + "loss": 1.1724, + "step": 7205 + }, + { + "epoch": 2.682830915288944, + "grad_norm": 0.16903571784496307, + "learning_rate": 1.7288022176372597e-05, + "loss": 1.1679, + "step": 7206 + }, + { + "epoch": 2.6832032204395526, + "grad_norm": 0.16603899002075195, + "learning_rate": 1.7287189965980245e-05, + "loss": 1.1607, + "step": 7207 + }, + { + "epoch": 2.683575525590162, + "grad_norm": 0.16013038158416748, + "learning_rate": 1.728635764795656e-05, + "loss": 1.1641, + "step": 7208 + }, + { + "epoch": 2.683947830740771, + "grad_norm": 0.16311000287532806, + "learning_rate": 1.7285525222313823e-05, + "loss": 1.1721, + "step": 7209 + }, + { + "epoch": 2.68432013589138, + "grad_norm": 0.16615121066570282, + "learning_rate": 1.728469268906434e-05, + "loss": 1.181, + "step": 7210 + }, + { + "epoch": 2.684692441041989, + "grad_norm": 0.16762305796146393, + "learning_rate": 1.7283860048220403e-05, + "loss": 1.1645, + "step": 7211 + }, + { + "epoch": 2.685064746192598, + "grad_norm": 0.16307219862937927, + "learning_rate": 1.7283027299794306e-05, + "loss": 1.1769, + "step": 7212 + }, + { + "epoch": 2.6854370513432073, + "grad_norm": 0.16042660176753998, + "learning_rate": 1.7282194443798358e-05, + "loss": 1.1636, + "step": 7213 + }, + { + "epoch": 2.685809356493816, + "grad_norm": 0.16484393179416656, + "learning_rate": 1.7281361480244852e-05, + "loss": 1.1716, + "step": 7214 + }, + { + "epoch": 2.6861816616444254, + "grad_norm": 0.16563540697097778, + "learning_rate": 1.7280528409146097e-05, + "loss": 1.1618, + "step": 7215 + }, + { + "epoch": 2.686553966795034, + "grad_norm": 0.16255468130111694, + "learning_rate": 1.7279695230514392e-05, + "loss": 1.1771, + "step": 7216 + }, + { + "epoch": 2.6869262719456435, + "grad_norm": 0.16036321222782135, + "learning_rate": 1.7278861944362045e-05, + "loss": 1.1671, + "step": 7217 + }, + { + "epoch": 2.6872985770962527, + "grad_norm": 0.16216713190078735, + "learning_rate": 1.7278028550701364e-05, + "loss": 1.1772, + "step": 7218 + }, + { + "epoch": 2.6876708822468616, + "grad_norm": 0.16452451050281525, + "learning_rate": 1.727719504954466e-05, + "loss": 1.1639, + "step": 7219 + }, + { + "epoch": 2.6880431873974704, + "grad_norm": 0.16761431097984314, + "learning_rate": 1.727636144090424e-05, + "loss": 1.1635, + "step": 7220 + }, + { + "epoch": 2.6884154925480797, + "grad_norm": 0.16240081191062927, + "learning_rate": 1.7275527724792416e-05, + "loss": 1.1699, + "step": 7221 + }, + { + "epoch": 2.688787797698689, + "grad_norm": 0.16535256803035736, + "learning_rate": 1.7274693901221507e-05, + "loss": 1.1815, + "step": 7222 + }, + { + "epoch": 2.6891601028492977, + "grad_norm": 0.16197264194488525, + "learning_rate": 1.7273859970203825e-05, + "loss": 1.1708, + "step": 7223 + }, + { + "epoch": 2.689532407999907, + "grad_norm": 0.1600698083639145, + "learning_rate": 1.727302593175169e-05, + "loss": 1.1802, + "step": 7224 + }, + { + "epoch": 2.689904713150516, + "grad_norm": 0.1599569022655487, + "learning_rate": 1.7272191785877415e-05, + "loss": 1.1891, + "step": 7225 + }, + { + "epoch": 2.690277018301125, + "grad_norm": 0.16361677646636963, + "learning_rate": 1.7271357532593325e-05, + "loss": 1.1736, + "step": 7226 + }, + { + "epoch": 2.6906493234517344, + "grad_norm": 0.16064906120300293, + "learning_rate": 1.727052317191174e-05, + "loss": 1.168, + "step": 7227 + }, + { + "epoch": 2.691021628602343, + "grad_norm": 0.16545885801315308, + "learning_rate": 1.7269688703844984e-05, + "loss": 1.1683, + "step": 7228 + }, + { + "epoch": 2.691393933752952, + "grad_norm": 0.16295583546161652, + "learning_rate": 1.7268854128405384e-05, + "loss": 1.1605, + "step": 7229 + }, + { + "epoch": 2.6917662389035613, + "grad_norm": 0.16552822291851044, + "learning_rate": 1.7268019445605263e-05, + "loss": 1.161, + "step": 7230 + }, + { + "epoch": 2.6921385440541705, + "grad_norm": 0.16663306951522827, + "learning_rate": 1.726718465545695e-05, + "loss": 1.17, + "step": 7231 + }, + { + "epoch": 2.6925108492047793, + "grad_norm": 0.15905286371707916, + "learning_rate": 1.726634975797278e-05, + "loss": 1.1616, + "step": 7232 + }, + { + "epoch": 2.6928831543553886, + "grad_norm": 0.1620490998029709, + "learning_rate": 1.7265514753165075e-05, + "loss": 1.176, + "step": 7233 + }, + { + "epoch": 2.6932554595059974, + "grad_norm": 0.16328297555446625, + "learning_rate": 1.7264679641046176e-05, + "loss": 1.1792, + "step": 7234 + }, + { + "epoch": 2.6936277646566067, + "grad_norm": 0.1633196771144867, + "learning_rate": 1.7263844421628416e-05, + "loss": 1.1659, + "step": 7235 + }, + { + "epoch": 2.694000069807216, + "grad_norm": 0.1595584601163864, + "learning_rate": 1.7263009094924125e-05, + "loss": 1.1728, + "step": 7236 + }, + { + "epoch": 2.694372374957825, + "grad_norm": 0.16301463544368744, + "learning_rate": 1.7262173660945648e-05, + "loss": 1.1735, + "step": 7237 + }, + { + "epoch": 2.6947446801084336, + "grad_norm": 0.16339920461177826, + "learning_rate": 1.7261338119705323e-05, + "loss": 1.1772, + "step": 7238 + }, + { + "epoch": 2.695116985259043, + "grad_norm": 0.16579227149486542, + "learning_rate": 1.7260502471215488e-05, + "loss": 1.171, + "step": 7239 + }, + { + "epoch": 2.695489290409652, + "grad_norm": 0.164364293217659, + "learning_rate": 1.7259666715488487e-05, + "loss": 1.1683, + "step": 7240 + }, + { + "epoch": 2.695861595560261, + "grad_norm": 0.16661317646503448, + "learning_rate": 1.7258830852536666e-05, + "loss": 1.1794, + "step": 7241 + }, + { + "epoch": 2.69623390071087, + "grad_norm": 0.16931530833244324, + "learning_rate": 1.7257994882372368e-05, + "loss": 1.1703, + "step": 7242 + }, + { + "epoch": 2.696606205861479, + "grad_norm": 0.1614353507757187, + "learning_rate": 1.725715880500794e-05, + "loss": 1.1639, + "step": 7243 + }, + { + "epoch": 2.6969785110120883, + "grad_norm": 0.16335289180278778, + "learning_rate": 1.7256322620455733e-05, + "loss": 1.1744, + "step": 7244 + }, + { + "epoch": 2.6973508161626976, + "grad_norm": 0.16357359290122986, + "learning_rate": 1.7255486328728096e-05, + "loss": 1.1627, + "step": 7245 + }, + { + "epoch": 2.6977231213133064, + "grad_norm": 0.16286170482635498, + "learning_rate": 1.725464992983738e-05, + "loss": 1.1709, + "step": 7246 + }, + { + "epoch": 2.6980954264639156, + "grad_norm": 0.16695433855056763, + "learning_rate": 1.7253813423795943e-05, + "loss": 1.1667, + "step": 7247 + }, + { + "epoch": 2.6984677316145245, + "grad_norm": 0.15719039738178253, + "learning_rate": 1.7252976810616134e-05, + "loss": 1.1666, + "step": 7248 + }, + { + "epoch": 2.6988400367651337, + "grad_norm": 0.15986768901348114, + "learning_rate": 1.7252140090310314e-05, + "loss": 1.1727, + "step": 7249 + }, + { + "epoch": 2.6992123419157426, + "grad_norm": 0.15737248957157135, + "learning_rate": 1.7251303262890838e-05, + "loss": 1.181, + "step": 7250 + }, + { + "epoch": 2.699584647066352, + "grad_norm": 0.1623496413230896, + "learning_rate": 1.725046632837007e-05, + "loss": 1.1727, + "step": 7251 + }, + { + "epoch": 2.6999569522169606, + "grad_norm": 0.16519080102443695, + "learning_rate": 1.724962928676037e-05, + "loss": 1.1729, + "step": 7252 + }, + { + "epoch": 2.70032925736757, + "grad_norm": 0.16225138306617737, + "learning_rate": 1.72487921380741e-05, + "loss": 1.1672, + "step": 7253 + }, + { + "epoch": 2.700701562518179, + "grad_norm": 0.16735146939754486, + "learning_rate": 1.7247954882323622e-05, + "loss": 1.1627, + "step": 7254 + }, + { + "epoch": 2.701073867668788, + "grad_norm": 0.1635318249464035, + "learning_rate": 1.724711751952131e-05, + "loss": 1.166, + "step": 7255 + }, + { + "epoch": 2.7014461728193973, + "grad_norm": 0.1578553318977356, + "learning_rate": 1.7246280049679526e-05, + "loss": 1.155, + "step": 7256 + }, + { + "epoch": 2.701818477970006, + "grad_norm": 0.16123878955841064, + "learning_rate": 1.7245442472810638e-05, + "loss": 1.1906, + "step": 7257 + }, + { + "epoch": 2.7021907831206153, + "grad_norm": 0.16336768865585327, + "learning_rate": 1.724460478892702e-05, + "loss": 1.179, + "step": 7258 + }, + { + "epoch": 2.702563088271224, + "grad_norm": 0.16107387840747833, + "learning_rate": 1.7243766998041045e-05, + "loss": 1.1603, + "step": 7259 + }, + { + "epoch": 2.7029353934218334, + "grad_norm": 0.15997155010700226, + "learning_rate": 1.7242929100165085e-05, + "loss": 1.1739, + "step": 7260 + }, + { + "epoch": 2.7033076985724422, + "grad_norm": 0.16311846673488617, + "learning_rate": 1.7242091095311516e-05, + "loss": 1.1698, + "step": 7261 + }, + { + "epoch": 2.7036800037230515, + "grad_norm": 0.16336789727210999, + "learning_rate": 1.724125298349272e-05, + "loss": 1.1584, + "step": 7262 + }, + { + "epoch": 2.7040523088736608, + "grad_norm": 0.1627056896686554, + "learning_rate": 1.7240414764721067e-05, + "loss": 1.1787, + "step": 7263 + }, + { + "epoch": 2.7044246140242696, + "grad_norm": 0.1628088355064392, + "learning_rate": 1.7239576439008945e-05, + "loss": 1.185, + "step": 7264 + }, + { + "epoch": 2.704796919174879, + "grad_norm": 0.16446030139923096, + "learning_rate": 1.723873800636873e-05, + "loss": 1.1599, + "step": 7265 + }, + { + "epoch": 2.7051692243254877, + "grad_norm": 0.16059374809265137, + "learning_rate": 1.7237899466812814e-05, + "loss": 1.1611, + "step": 7266 + }, + { + "epoch": 2.705541529476097, + "grad_norm": 0.16304123401641846, + "learning_rate": 1.7237060820353573e-05, + "loss": 1.1764, + "step": 7267 + }, + { + "epoch": 2.7059138346267058, + "grad_norm": 0.16345947980880737, + "learning_rate": 1.7236222067003402e-05, + "loss": 1.1757, + "step": 7268 + }, + { + "epoch": 2.706286139777315, + "grad_norm": 0.15779638290405273, + "learning_rate": 1.7235383206774682e-05, + "loss": 1.1699, + "step": 7269 + }, + { + "epoch": 2.706658444927924, + "grad_norm": 0.1547388732433319, + "learning_rate": 1.7234544239679807e-05, + "loss": 1.1618, + "step": 7270 + }, + { + "epoch": 2.707030750078533, + "grad_norm": 0.16903281211853027, + "learning_rate": 1.723370516573117e-05, + "loss": 1.1696, + "step": 7271 + }, + { + "epoch": 2.7074030552291424, + "grad_norm": 0.16696353256702423, + "learning_rate": 1.7232865984941156e-05, + "loss": 1.1588, + "step": 7272 + }, + { + "epoch": 2.707775360379751, + "grad_norm": 0.1645137071609497, + "learning_rate": 1.723202669732217e-05, + "loss": 1.1645, + "step": 7273 + }, + { + "epoch": 2.7081476655303605, + "grad_norm": 0.16324058175086975, + "learning_rate": 1.72311873028866e-05, + "loss": 1.1723, + "step": 7274 + }, + { + "epoch": 2.7085199706809693, + "grad_norm": 0.16554492712020874, + "learning_rate": 1.723034780164685e-05, + "loss": 1.1936, + "step": 7275 + }, + { + "epoch": 2.7088922758315785, + "grad_norm": 0.16024906933307648, + "learning_rate": 1.7229508193615316e-05, + "loss": 1.1689, + "step": 7276 + }, + { + "epoch": 2.7092645809821874, + "grad_norm": 0.16716809570789337, + "learning_rate": 1.72286684788044e-05, + "loss": 1.1698, + "step": 7277 + }, + { + "epoch": 2.7096368861327966, + "grad_norm": 0.16631504893302917, + "learning_rate": 1.7227828657226506e-05, + "loss": 1.1789, + "step": 7278 + }, + { + "epoch": 2.7100091912834054, + "grad_norm": 0.1682237982749939, + "learning_rate": 1.7226988728894033e-05, + "loss": 1.1719, + "step": 7279 + }, + { + "epoch": 2.7103814964340147, + "grad_norm": 0.15563975274562836, + "learning_rate": 1.722614869381939e-05, + "loss": 1.166, + "step": 7280 + }, + { + "epoch": 2.710753801584624, + "grad_norm": 0.16396182775497437, + "learning_rate": 1.7225308552014988e-05, + "loss": 1.1777, + "step": 7281 + }, + { + "epoch": 2.711126106735233, + "grad_norm": 0.16292405128479004, + "learning_rate": 1.722446830349323e-05, + "loss": 1.1783, + "step": 7282 + }, + { + "epoch": 2.711498411885842, + "grad_norm": 0.15950658917427063, + "learning_rate": 1.7223627948266526e-05, + "loss": 1.1775, + "step": 7283 + }, + { + "epoch": 2.711870717036451, + "grad_norm": 0.16486436128616333, + "learning_rate": 1.7222787486347296e-05, + "loss": 1.1664, + "step": 7284 + }, + { + "epoch": 2.71224302218706, + "grad_norm": 0.16335855424404144, + "learning_rate": 1.7221946917747945e-05, + "loss": 1.1599, + "step": 7285 + }, + { + "epoch": 2.7126153273376694, + "grad_norm": 0.15834614634513855, + "learning_rate": 1.722110624248089e-05, + "loss": 1.1644, + "step": 7286 + }, + { + "epoch": 2.7129876324882782, + "grad_norm": 0.16369593143463135, + "learning_rate": 1.722026546055855e-05, + "loss": 1.1681, + "step": 7287 + }, + { + "epoch": 2.713359937638887, + "grad_norm": 0.16682885587215424, + "learning_rate": 1.7219424571993345e-05, + "loss": 1.1763, + "step": 7288 + }, + { + "epoch": 2.7137322427894963, + "grad_norm": 0.17135527729988098, + "learning_rate": 1.721858357679769e-05, + "loss": 1.1878, + "step": 7289 + }, + { + "epoch": 2.7141045479401056, + "grad_norm": 0.16507208347320557, + "learning_rate": 1.7217742474984006e-05, + "loss": 1.1619, + "step": 7290 + }, + { + "epoch": 2.7144768530907144, + "grad_norm": 0.16445507109165192, + "learning_rate": 1.721690126656472e-05, + "loss": 1.1794, + "step": 7291 + }, + { + "epoch": 2.7148491582413237, + "grad_norm": 0.1608470231294632, + "learning_rate": 1.7216059951552256e-05, + "loss": 1.1688, + "step": 7292 + }, + { + "epoch": 2.7152214633919325, + "grad_norm": 0.16081736981868744, + "learning_rate": 1.7215218529959042e-05, + "loss": 1.183, + "step": 7293 + }, + { + "epoch": 2.7155937685425418, + "grad_norm": 0.16180992126464844, + "learning_rate": 1.7214377001797498e-05, + "loss": 1.1553, + "step": 7294 + }, + { + "epoch": 2.715966073693151, + "grad_norm": 0.16335974633693695, + "learning_rate": 1.7213535367080064e-05, + "loss": 1.1667, + "step": 7295 + }, + { + "epoch": 2.71633837884376, + "grad_norm": 0.1620166003704071, + "learning_rate": 1.7212693625819163e-05, + "loss": 1.1695, + "step": 7296 + }, + { + "epoch": 2.7167106839943687, + "grad_norm": 0.15948939323425293, + "learning_rate": 1.7211851778027226e-05, + "loss": 1.1667, + "step": 7297 + }, + { + "epoch": 2.717082989144978, + "grad_norm": 0.15840508043766022, + "learning_rate": 1.7211009823716695e-05, + "loss": 1.1611, + "step": 7298 + }, + { + "epoch": 2.717455294295587, + "grad_norm": 0.1646668016910553, + "learning_rate": 1.72101677629e-05, + "loss": 1.1748, + "step": 7299 + }, + { + "epoch": 2.717827599446196, + "grad_norm": 0.16672910749912262, + "learning_rate": 1.720932559558958e-05, + "loss": 1.1637, + "step": 7300 + }, + { + "epoch": 2.7181999045968053, + "grad_norm": 0.16393324732780457, + "learning_rate": 1.7208483321797876e-05, + "loss": 1.1536, + "step": 7301 + }, + { + "epoch": 2.718572209747414, + "grad_norm": 0.16275498270988464, + "learning_rate": 1.720764094153732e-05, + "loss": 1.1675, + "step": 7302 + }, + { + "epoch": 2.7189445148980234, + "grad_norm": 0.167283296585083, + "learning_rate": 1.7206798454820366e-05, + "loss": 1.1612, + "step": 7303 + }, + { + "epoch": 2.7193168200486326, + "grad_norm": 0.16104766726493835, + "learning_rate": 1.7205955861659446e-05, + "loss": 1.1844, + "step": 7304 + }, + { + "epoch": 2.7196891251992414, + "grad_norm": 0.16111914813518524, + "learning_rate": 1.7205113162067013e-05, + "loss": 1.1702, + "step": 7305 + }, + { + "epoch": 2.7200614303498503, + "grad_norm": 0.16553427278995514, + "learning_rate": 1.720427035605551e-05, + "loss": 1.1664, + "step": 7306 + }, + { + "epoch": 2.7204337355004595, + "grad_norm": 0.16949054598808289, + "learning_rate": 1.7203427443637385e-05, + "loss": 1.1755, + "step": 7307 + }, + { + "epoch": 2.720806040651069, + "grad_norm": 0.16320055723190308, + "learning_rate": 1.7202584424825092e-05, + "loss": 1.1693, + "step": 7308 + }, + { + "epoch": 2.7211783458016776, + "grad_norm": 0.16071534156799316, + "learning_rate": 1.7201741299631075e-05, + "loss": 1.1695, + "step": 7309 + }, + { + "epoch": 2.721550650952287, + "grad_norm": 0.16326375305652618, + "learning_rate": 1.720089806806779e-05, + "loss": 1.1712, + "step": 7310 + }, + { + "epoch": 2.7219229561028957, + "grad_norm": 0.16622449457645416, + "learning_rate": 1.72000547301477e-05, + "loss": 1.1679, + "step": 7311 + }, + { + "epoch": 2.722295261253505, + "grad_norm": 0.1643485128879547, + "learning_rate": 1.7199211285883245e-05, + "loss": 1.1791, + "step": 7312 + }, + { + "epoch": 2.7226675664041142, + "grad_norm": 0.15881820023059845, + "learning_rate": 1.7198367735286897e-05, + "loss": 1.1625, + "step": 7313 + }, + { + "epoch": 2.723039871554723, + "grad_norm": 0.16432297229766846, + "learning_rate": 1.7197524078371105e-05, + "loss": 1.1736, + "step": 7314 + }, + { + "epoch": 2.723412176705332, + "grad_norm": 0.15992549061775208, + "learning_rate": 1.7196680315148335e-05, + "loss": 1.1772, + "step": 7315 + }, + { + "epoch": 2.723784481855941, + "grad_norm": 0.16613058745861053, + "learning_rate": 1.719583644563105e-05, + "loss": 1.1808, + "step": 7316 + }, + { + "epoch": 2.7241567870065504, + "grad_norm": 0.16339601576328278, + "learning_rate": 1.7194992469831712e-05, + "loss": 1.1593, + "step": 7317 + }, + { + "epoch": 2.724529092157159, + "grad_norm": 0.15978363156318665, + "learning_rate": 1.719414838776279e-05, + "loss": 1.1716, + "step": 7318 + }, + { + "epoch": 2.7249013973077685, + "grad_norm": 0.16619928181171417, + "learning_rate": 1.719330419943674e-05, + "loss": 1.1694, + "step": 7319 + }, + { + "epoch": 2.7252737024583773, + "grad_norm": 0.16087765991687775, + "learning_rate": 1.7192459904866042e-05, + "loss": 1.1529, + "step": 7320 + }, + { + "epoch": 2.7256460076089866, + "grad_norm": 0.16294820606708527, + "learning_rate": 1.7191615504063165e-05, + "loss": 1.1925, + "step": 7321 + }, + { + "epoch": 2.726018312759596, + "grad_norm": 0.1619005650281906, + "learning_rate": 1.7190770997040574e-05, + "loss": 1.1692, + "step": 7322 + }, + { + "epoch": 2.7263906179102047, + "grad_norm": 0.17229419946670532, + "learning_rate": 1.7189926383810755e-05, + "loss": 1.1677, + "step": 7323 + }, + { + "epoch": 2.7267629230608135, + "grad_norm": 0.16301006078720093, + "learning_rate": 1.7189081664386168e-05, + "loss": 1.1832, + "step": 7324 + }, + { + "epoch": 2.7271352282114227, + "grad_norm": 0.16268163919448853, + "learning_rate": 1.7188236838779297e-05, + "loss": 1.1718, + "step": 7325 + }, + { + "epoch": 2.727507533362032, + "grad_norm": 0.16183678805828094, + "learning_rate": 1.718739190700262e-05, + "loss": 1.1765, + "step": 7326 + }, + { + "epoch": 2.727879838512641, + "grad_norm": 0.16009078919887543, + "learning_rate": 1.7186546869068612e-05, + "loss": 1.1812, + "step": 7327 + }, + { + "epoch": 2.72825214366325, + "grad_norm": 0.16223828494548798, + "learning_rate": 1.718570172498976e-05, + "loss": 1.1905, + "step": 7328 + }, + { + "epoch": 2.728624448813859, + "grad_norm": 0.1691174954175949, + "learning_rate": 1.7184856474778543e-05, + "loss": 1.1672, + "step": 7329 + }, + { + "epoch": 2.728996753964468, + "grad_norm": 0.1607164889574051, + "learning_rate": 1.7184011118447448e-05, + "loss": 1.1904, + "step": 7330 + }, + { + "epoch": 2.7293690591150774, + "grad_norm": 0.16635142266750336, + "learning_rate": 1.718316565600896e-05, + "loss": 1.1633, + "step": 7331 + }, + { + "epoch": 2.7297413642656863, + "grad_norm": 0.16551473736763, + "learning_rate": 1.7182320087475567e-05, + "loss": 1.1649, + "step": 7332 + }, + { + "epoch": 2.730113669416295, + "grad_norm": 0.16264252364635468, + "learning_rate": 1.7181474412859756e-05, + "loss": 1.1627, + "step": 7333 + }, + { + "epoch": 2.7304859745669043, + "grad_norm": 0.16819992661476135, + "learning_rate": 1.718062863217402e-05, + "loss": 1.1749, + "step": 7334 + }, + { + "epoch": 2.7308582797175136, + "grad_norm": 0.16753548383712769, + "learning_rate": 1.717978274543085e-05, + "loss": 1.1715, + "step": 7335 + }, + { + "epoch": 2.7312305848681224, + "grad_norm": 0.16801391541957855, + "learning_rate": 1.7178936752642737e-05, + "loss": 1.175, + "step": 7336 + }, + { + "epoch": 2.7316028900187317, + "grad_norm": 0.16924819350242615, + "learning_rate": 1.717809065382218e-05, + "loss": 1.1764, + "step": 7337 + }, + { + "epoch": 2.7319751951693405, + "grad_norm": 0.1656593382358551, + "learning_rate": 1.7177244448981675e-05, + "loss": 1.1834, + "step": 7338 + }, + { + "epoch": 2.7323475003199498, + "grad_norm": 0.15879951417446136, + "learning_rate": 1.7176398138133718e-05, + "loss": 1.1816, + "step": 7339 + }, + { + "epoch": 2.732719805470559, + "grad_norm": 0.16423895955085754, + "learning_rate": 1.717555172129081e-05, + "loss": 1.1789, + "step": 7340 + }, + { + "epoch": 2.733092110621168, + "grad_norm": 0.16044852137565613, + "learning_rate": 1.7174705198465454e-05, + "loss": 1.158, + "step": 7341 + }, + { + "epoch": 2.7334644157717767, + "grad_norm": 0.16656622290611267, + "learning_rate": 1.7173858569670155e-05, + "loss": 1.1623, + "step": 7342 + }, + { + "epoch": 2.733836720922386, + "grad_norm": 0.16473737359046936, + "learning_rate": 1.7173011834917415e-05, + "loss": 1.1799, + "step": 7343 + }, + { + "epoch": 2.734209026072995, + "grad_norm": 0.16188117861747742, + "learning_rate": 1.7172164994219738e-05, + "loss": 1.1787, + "step": 7344 + }, + { + "epoch": 2.734581331223604, + "grad_norm": 0.17341145873069763, + "learning_rate": 1.7171318047589637e-05, + "loss": 1.178, + "step": 7345 + }, + { + "epoch": 2.7349536363742133, + "grad_norm": 0.1620875597000122, + "learning_rate": 1.7170470995039618e-05, + "loss": 1.1842, + "step": 7346 + }, + { + "epoch": 2.735325941524822, + "grad_norm": 0.16189490258693695, + "learning_rate": 1.716962383658219e-05, + "loss": 1.1682, + "step": 7347 + }, + { + "epoch": 2.7356982466754314, + "grad_norm": 0.16094739735126495, + "learning_rate": 1.716877657222987e-05, + "loss": 1.173, + "step": 7348 + }, + { + "epoch": 2.7360705518260406, + "grad_norm": 0.16400355100631714, + "learning_rate": 1.7167929201995167e-05, + "loss": 1.1718, + "step": 7349 + }, + { + "epoch": 2.7364428569766495, + "grad_norm": 0.1662655919790268, + "learning_rate": 1.7167081725890602e-05, + "loss": 1.1556, + "step": 7350 + }, + { + "epoch": 2.7368151621272583, + "grad_norm": 0.15632013976573944, + "learning_rate": 1.716623414392869e-05, + "loss": 1.1616, + "step": 7351 + }, + { + "epoch": 2.7371874672778675, + "grad_norm": 0.15825137495994568, + "learning_rate": 1.7165386456121948e-05, + "loss": 1.1631, + "step": 7352 + }, + { + "epoch": 2.737559772428477, + "grad_norm": 0.16320408880710602, + "learning_rate": 1.71645386624829e-05, + "loss": 1.1664, + "step": 7353 + }, + { + "epoch": 2.7379320775790856, + "grad_norm": 0.17327001690864563, + "learning_rate": 1.7163690763024063e-05, + "loss": 1.1686, + "step": 7354 + }, + { + "epoch": 2.738304382729695, + "grad_norm": 0.1668155938386917, + "learning_rate": 1.7162842757757964e-05, + "loss": 1.1665, + "step": 7355 + }, + { + "epoch": 2.7386766878803037, + "grad_norm": 0.15803512930870056, + "learning_rate": 1.716199464669713e-05, + "loss": 1.1703, + "step": 7356 + }, + { + "epoch": 2.739048993030913, + "grad_norm": 0.15973332524299622, + "learning_rate": 1.716114642985408e-05, + "loss": 1.1632, + "step": 7357 + }, + { + "epoch": 2.7394212981815222, + "grad_norm": 0.17077676951885223, + "learning_rate": 1.7160298107241347e-05, + "loss": 1.1578, + "step": 7358 + }, + { + "epoch": 2.739793603332131, + "grad_norm": 0.17319710552692413, + "learning_rate": 1.7159449678871463e-05, + "loss": 1.1749, + "step": 7359 + }, + { + "epoch": 2.7401659084827403, + "grad_norm": 0.16504226624965668, + "learning_rate": 1.7158601144756953e-05, + "loss": 1.1835, + "step": 7360 + }, + { + "epoch": 2.740538213633349, + "grad_norm": 0.1656225025653839, + "learning_rate": 1.715775250491036e-05, + "loss": 1.1673, + "step": 7361 + }, + { + "epoch": 2.7409105187839584, + "grad_norm": 0.16402550041675568, + "learning_rate": 1.7156903759344207e-05, + "loss": 1.1798, + "step": 7362 + }, + { + "epoch": 2.7412828239345672, + "grad_norm": 0.1609089970588684, + "learning_rate": 1.715605490807103e-05, + "loss": 1.1661, + "step": 7363 + }, + { + "epoch": 2.7416551290851765, + "grad_norm": 0.1700262427330017, + "learning_rate": 1.7155205951103378e-05, + "loss": 1.1807, + "step": 7364 + }, + { + "epoch": 2.7420274342357853, + "grad_norm": 0.16744990646839142, + "learning_rate": 1.715435688845378e-05, + "loss": 1.1824, + "step": 7365 + }, + { + "epoch": 2.7423997393863946, + "grad_norm": 0.1665429025888443, + "learning_rate": 1.715350772013478e-05, + "loss": 1.1701, + "step": 7366 + }, + { + "epoch": 2.742772044537004, + "grad_norm": 0.16256341338157654, + "learning_rate": 1.7152658446158926e-05, + "loss": 1.175, + "step": 7367 + }, + { + "epoch": 2.7431443496876127, + "grad_norm": 0.16586780548095703, + "learning_rate": 1.7151809066538746e-05, + "loss": 1.1882, + "step": 7368 + }, + { + "epoch": 2.743516654838222, + "grad_norm": 0.1687692552804947, + "learning_rate": 1.7150959581286802e-05, + "loss": 1.1726, + "step": 7369 + }, + { + "epoch": 2.7438889599888308, + "grad_norm": 0.1655583381652832, + "learning_rate": 1.715010999041563e-05, + "loss": 1.1742, + "step": 7370 + }, + { + "epoch": 2.74426126513944, + "grad_norm": 0.1582690328359604, + "learning_rate": 1.7149260293937782e-05, + "loss": 1.1841, + "step": 7371 + }, + { + "epoch": 2.744633570290049, + "grad_norm": 0.1673700511455536, + "learning_rate": 1.714841049186581e-05, + "loss": 1.1825, + "step": 7372 + }, + { + "epoch": 2.745005875440658, + "grad_norm": 0.1654442995786667, + "learning_rate": 1.7147560584212263e-05, + "loss": 1.1678, + "step": 7373 + }, + { + "epoch": 2.745378180591267, + "grad_norm": 0.16653363406658173, + "learning_rate": 1.7146710570989698e-05, + "loss": 1.1749, + "step": 7374 + }, + { + "epoch": 2.745750485741876, + "grad_norm": 0.15975458920001984, + "learning_rate": 1.7145860452210662e-05, + "loss": 1.18, + "step": 7375 + }, + { + "epoch": 2.7461227908924855, + "grad_norm": 0.16379772126674652, + "learning_rate": 1.7145010227887716e-05, + "loss": 1.1782, + "step": 7376 + }, + { + "epoch": 2.7464950960430943, + "grad_norm": 0.16204066574573517, + "learning_rate": 1.714415989803342e-05, + "loss": 1.1681, + "step": 7377 + }, + { + "epoch": 2.7468674011937035, + "grad_norm": 0.16692563891410828, + "learning_rate": 1.7143309462660326e-05, + "loss": 1.172, + "step": 7378 + }, + { + "epoch": 2.7472397063443124, + "grad_norm": 0.16398832201957703, + "learning_rate": 1.7142458921781002e-05, + "loss": 1.1752, + "step": 7379 + }, + { + "epoch": 2.7476120114949216, + "grad_norm": 0.16204705834388733, + "learning_rate": 1.714160827540801e-05, + "loss": 1.1742, + "step": 7380 + }, + { + "epoch": 2.7479843166455304, + "grad_norm": 0.16847161948680878, + "learning_rate": 1.7140757523553907e-05, + "loss": 1.1765, + "step": 7381 + }, + { + "epoch": 2.7483566217961397, + "grad_norm": 0.16786617040634155, + "learning_rate": 1.713990666623127e-05, + "loss": 1.1632, + "step": 7382 + }, + { + "epoch": 2.7487289269467485, + "grad_norm": 0.15807227790355682, + "learning_rate": 1.7139055703452653e-05, + "loss": 1.1683, + "step": 7383 + }, + { + "epoch": 2.749101232097358, + "grad_norm": 0.1643909215927124, + "learning_rate": 1.7138204635230637e-05, + "loss": 1.1715, + "step": 7384 + }, + { + "epoch": 2.749473537247967, + "grad_norm": 0.15912140905857086, + "learning_rate": 1.7137353461577785e-05, + "loss": 1.179, + "step": 7385 + }, + { + "epoch": 2.749845842398576, + "grad_norm": 0.16151286661624908, + "learning_rate": 1.713650218250667e-05, + "loss": 1.1741, + "step": 7386 + }, + { + "epoch": 2.750218147549185, + "grad_norm": 0.1597239375114441, + "learning_rate": 1.7135650798029864e-05, + "loss": 1.16, + "step": 7387 + }, + { + "epoch": 2.750590452699794, + "grad_norm": 0.16162365674972534, + "learning_rate": 1.7134799308159946e-05, + "loss": 1.1671, + "step": 7388 + }, + { + "epoch": 2.7509627578504032, + "grad_norm": 0.16579143702983856, + "learning_rate": 1.7133947712909485e-05, + "loss": 1.1645, + "step": 7389 + }, + { + "epoch": 2.751335063001012, + "grad_norm": 0.1629938781261444, + "learning_rate": 1.7133096012291067e-05, + "loss": 1.1764, + "step": 7390 + }, + { + "epoch": 2.7517073681516213, + "grad_norm": 0.15928851068019867, + "learning_rate": 1.7132244206317272e-05, + "loss": 1.1714, + "step": 7391 + }, + { + "epoch": 2.75207967330223, + "grad_norm": 0.16716843843460083, + "learning_rate": 1.7131392295000676e-05, + "loss": 1.1764, + "step": 7392 + }, + { + "epoch": 2.7524519784528394, + "grad_norm": 0.16544127464294434, + "learning_rate": 1.713054027835386e-05, + "loss": 1.1731, + "step": 7393 + }, + { + "epoch": 2.7528242836034487, + "grad_norm": 0.1620912104845047, + "learning_rate": 1.7129688156389414e-05, + "loss": 1.1723, + "step": 7394 + }, + { + "epoch": 2.7531965887540575, + "grad_norm": 0.1651720106601715, + "learning_rate": 1.7128835929119923e-05, + "loss": 1.1602, + "step": 7395 + }, + { + "epoch": 2.7535688939046667, + "grad_norm": 0.16467827558517456, + "learning_rate": 1.712798359655797e-05, + "loss": 1.1764, + "step": 7396 + }, + { + "epoch": 2.7539411990552756, + "grad_norm": 0.15525712072849274, + "learning_rate": 1.7127131158716145e-05, + "loss": 1.1681, + "step": 7397 + }, + { + "epoch": 2.754313504205885, + "grad_norm": 0.1612330675125122, + "learning_rate": 1.7126278615607045e-05, + "loss": 1.167, + "step": 7398 + }, + { + "epoch": 2.7546858093564937, + "grad_norm": 0.16370365023612976, + "learning_rate": 1.7125425967243252e-05, + "loss": 1.1789, + "step": 7399 + }, + { + "epoch": 2.755058114507103, + "grad_norm": 0.16268078982830048, + "learning_rate": 1.7124573213637367e-05, + "loss": 1.173, + "step": 7400 + }, + { + "epoch": 2.7554304196577117, + "grad_norm": 0.16411568224430084, + "learning_rate": 1.7123720354801984e-05, + "loss": 1.1595, + "step": 7401 + }, + { + "epoch": 2.755802724808321, + "grad_norm": 0.16271185874938965, + "learning_rate": 1.7122867390749697e-05, + "loss": 1.1676, + "step": 7402 + }, + { + "epoch": 2.7561750299589303, + "grad_norm": 0.16013269126415253, + "learning_rate": 1.7122014321493105e-05, + "loss": 1.1689, + "step": 7403 + }, + { + "epoch": 2.756547335109539, + "grad_norm": 0.168240487575531, + "learning_rate": 1.7121161147044813e-05, + "loss": 1.1712, + "step": 7404 + }, + { + "epoch": 2.7569196402601484, + "grad_norm": 0.16075722873210907, + "learning_rate": 1.7120307867417414e-05, + "loss": 1.1624, + "step": 7405 + }, + { + "epoch": 2.757291945410757, + "grad_norm": 0.16558417677879333, + "learning_rate": 1.7119454482623515e-05, + "loss": 1.1772, + "step": 7406 + }, + { + "epoch": 2.7576642505613664, + "grad_norm": 0.1621825397014618, + "learning_rate": 1.7118600992675718e-05, + "loss": 1.1814, + "step": 7407 + }, + { + "epoch": 2.7580365557119757, + "grad_norm": 0.16692297160625458, + "learning_rate": 1.7117747397586636e-05, + "loss": 1.1804, + "step": 7408 + }, + { + "epoch": 2.7584088608625845, + "grad_norm": 0.15905342996120453, + "learning_rate": 1.7116893697368866e-05, + "loss": 1.171, + "step": 7409 + }, + { + "epoch": 2.7587811660131933, + "grad_norm": 0.16342061758041382, + "learning_rate": 1.7116039892035025e-05, + "loss": 1.1518, + "step": 7410 + }, + { + "epoch": 2.7591534711638026, + "grad_norm": 0.161211296916008, + "learning_rate": 1.7115185981597725e-05, + "loss": 1.1736, + "step": 7411 + }, + { + "epoch": 2.759525776314412, + "grad_norm": 0.16845087707042694, + "learning_rate": 1.7114331966069572e-05, + "loss": 1.1743, + "step": 7412 + }, + { + "epoch": 2.7598980814650207, + "grad_norm": 0.16681751608848572, + "learning_rate": 1.7113477845463177e-05, + "loss": 1.1663, + "step": 7413 + }, + { + "epoch": 2.76027038661563, + "grad_norm": 0.16370663046836853, + "learning_rate": 1.7112623619791167e-05, + "loss": 1.1643, + "step": 7414 + }, + { + "epoch": 2.7606426917662388, + "grad_norm": 0.16601529717445374, + "learning_rate": 1.711176928906615e-05, + "loss": 1.1716, + "step": 7415 + }, + { + "epoch": 2.761014996916848, + "grad_norm": 0.1686096489429474, + "learning_rate": 1.7110914853300748e-05, + "loss": 1.1626, + "step": 7416 + }, + { + "epoch": 2.7613873020674573, + "grad_norm": 0.1649993360042572, + "learning_rate": 1.711006031250758e-05, + "loss": 1.1706, + "step": 7417 + }, + { + "epoch": 2.761759607218066, + "grad_norm": 0.16518589854240417, + "learning_rate": 1.710920566669927e-05, + "loss": 1.1622, + "step": 7418 + }, + { + "epoch": 2.762131912368675, + "grad_norm": 0.1586756408214569, + "learning_rate": 1.7108350915888432e-05, + "loss": 1.1615, + "step": 7419 + }, + { + "epoch": 2.762504217519284, + "grad_norm": 0.16265276074409485, + "learning_rate": 1.71074960600877e-05, + "loss": 1.1762, + "step": 7420 + }, + { + "epoch": 2.7628765226698935, + "grad_norm": 0.16679120063781738, + "learning_rate": 1.71066410993097e-05, + "loss": 1.1838, + "step": 7421 + }, + { + "epoch": 2.7632488278205023, + "grad_norm": 0.16883881390094757, + "learning_rate": 1.7105786033567055e-05, + "loss": 1.1721, + "step": 7422 + }, + { + "epoch": 2.7636211329711116, + "grad_norm": 0.15802286565303802, + "learning_rate": 1.7104930862872394e-05, + "loss": 1.1737, + "step": 7423 + }, + { + "epoch": 2.7639934381217204, + "grad_norm": 0.1627608686685562, + "learning_rate": 1.7104075587238353e-05, + "loss": 1.1678, + "step": 7424 + }, + { + "epoch": 2.7643657432723296, + "grad_norm": 0.1643580198287964, + "learning_rate": 1.710322020667756e-05, + "loss": 1.1746, + "step": 7425 + }, + { + "epoch": 2.764738048422939, + "grad_norm": 0.16312402486801147, + "learning_rate": 1.7102364721202655e-05, + "loss": 1.1812, + "step": 7426 + }, + { + "epoch": 2.7651103535735477, + "grad_norm": 0.15832361578941345, + "learning_rate": 1.710150913082626e-05, + "loss": 1.1886, + "step": 7427 + }, + { + "epoch": 2.7654826587241566, + "grad_norm": 0.16072019934654236, + "learning_rate": 1.7100653435561027e-05, + "loss": 1.1688, + "step": 7428 + }, + { + "epoch": 2.765854963874766, + "grad_norm": 0.16109511256217957, + "learning_rate": 1.7099797635419587e-05, + "loss": 1.184, + "step": 7429 + }, + { + "epoch": 2.766227269025375, + "grad_norm": 0.16431210935115814, + "learning_rate": 1.709894173041458e-05, + "loss": 1.1763, + "step": 7430 + }, + { + "epoch": 2.766599574175984, + "grad_norm": 0.16304095089435577, + "learning_rate": 1.7098085720558653e-05, + "loss": 1.1671, + "step": 7431 + }, + { + "epoch": 2.766971879326593, + "grad_norm": 0.16046518087387085, + "learning_rate": 1.709722960586444e-05, + "loss": 1.1659, + "step": 7432 + }, + { + "epoch": 2.767344184477202, + "grad_norm": 0.1624547243118286, + "learning_rate": 1.7096373386344596e-05, + "loss": 1.1645, + "step": 7433 + }, + { + "epoch": 2.7677164896278112, + "grad_norm": 0.1661730259656906, + "learning_rate": 1.709551706201176e-05, + "loss": 1.171, + "step": 7434 + }, + { + "epoch": 2.7680887947784205, + "grad_norm": 0.15459467470645905, + "learning_rate": 1.709466063287858e-05, + "loss": 1.1533, + "step": 7435 + }, + { + "epoch": 2.7684610999290293, + "grad_norm": 0.16066570580005646, + "learning_rate": 1.709380409895771e-05, + "loss": 1.1714, + "step": 7436 + }, + { + "epoch": 2.768833405079638, + "grad_norm": 0.1591673642396927, + "learning_rate": 1.7092947460261802e-05, + "loss": 1.1589, + "step": 7437 + }, + { + "epoch": 2.7692057102302474, + "grad_norm": 0.1647397130727768, + "learning_rate": 1.7092090716803503e-05, + "loss": 1.1783, + "step": 7438 + }, + { + "epoch": 2.7695780153808567, + "grad_norm": 0.16545167565345764, + "learning_rate": 1.7091233868595465e-05, + "loss": 1.1808, + "step": 7439 + }, + { + "epoch": 2.7699503205314655, + "grad_norm": 0.16354137659072876, + "learning_rate": 1.7090376915650354e-05, + "loss": 1.1713, + "step": 7440 + }, + { + "epoch": 2.7703226256820748, + "grad_norm": 0.1619860678911209, + "learning_rate": 1.708951985798082e-05, + "loss": 1.1803, + "step": 7441 + }, + { + "epoch": 2.7706949308326836, + "grad_norm": 0.1611848771572113, + "learning_rate": 1.7088662695599517e-05, + "loss": 1.1658, + "step": 7442 + }, + { + "epoch": 2.771067235983293, + "grad_norm": 0.1581423580646515, + "learning_rate": 1.7087805428519114e-05, + "loss": 1.1639, + "step": 7443 + }, + { + "epoch": 2.771439541133902, + "grad_norm": 0.1612246185541153, + "learning_rate": 1.708694805675227e-05, + "loss": 1.1588, + "step": 7444 + }, + { + "epoch": 2.771811846284511, + "grad_norm": 0.16077731549739838, + "learning_rate": 1.708609058031165e-05, + "loss": 1.1788, + "step": 7445 + }, + { + "epoch": 2.7721841514351198, + "grad_norm": 0.16849185526371002, + "learning_rate": 1.7085232999209915e-05, + "loss": 1.1797, + "step": 7446 + }, + { + "epoch": 2.772556456585729, + "grad_norm": 0.16407588124275208, + "learning_rate": 1.7084375313459735e-05, + "loss": 1.1638, + "step": 7447 + }, + { + "epoch": 2.7729287617363383, + "grad_norm": 0.1661745011806488, + "learning_rate": 1.7083517523073775e-05, + "loss": 1.1695, + "step": 7448 + }, + { + "epoch": 2.773301066886947, + "grad_norm": 0.1609290987253189, + "learning_rate": 1.7082659628064704e-05, + "loss": 1.1596, + "step": 7449 + }, + { + "epoch": 2.7736733720375564, + "grad_norm": 0.15832340717315674, + "learning_rate": 1.7081801628445195e-05, + "loss": 1.1536, + "step": 7450 + }, + { + "epoch": 2.774045677188165, + "grad_norm": 0.16378222405910492, + "learning_rate": 1.708094352422792e-05, + "loss": 1.1704, + "step": 7451 + }, + { + "epoch": 2.7744179823387745, + "grad_norm": 0.16052454710006714, + "learning_rate": 1.7080085315425557e-05, + "loss": 1.1643, + "step": 7452 + }, + { + "epoch": 2.7747902874893837, + "grad_norm": 0.16648857295513153, + "learning_rate": 1.7079227002050776e-05, + "loss": 1.1766, + "step": 7453 + }, + { + "epoch": 2.7751625926399925, + "grad_norm": 0.16049505770206451, + "learning_rate": 1.7078368584116256e-05, + "loss": 1.172, + "step": 7454 + }, + { + "epoch": 2.7755348977906014, + "grad_norm": 0.1630232185125351, + "learning_rate": 1.7077510061634675e-05, + "loss": 1.1696, + "step": 7455 + }, + { + "epoch": 2.7759072029412106, + "grad_norm": 0.15958517789840698, + "learning_rate": 1.707665143461872e-05, + "loss": 1.1703, + "step": 7456 + }, + { + "epoch": 2.77627950809182, + "grad_norm": 0.16538631916046143, + "learning_rate": 1.707579270308106e-05, + "loss": 1.1695, + "step": 7457 + }, + { + "epoch": 2.7766518132424287, + "grad_norm": 0.16825279593467712, + "learning_rate": 1.7074933867034392e-05, + "loss": 1.1668, + "step": 7458 + }, + { + "epoch": 2.777024118393038, + "grad_norm": 0.15979918837547302, + "learning_rate": 1.7074074926491392e-05, + "loss": 1.1692, + "step": 7459 + }, + { + "epoch": 2.777396423543647, + "grad_norm": 0.1648228019475937, + "learning_rate": 1.707321588146475e-05, + "loss": 1.1818, + "step": 7460 + }, + { + "epoch": 2.777768728694256, + "grad_norm": 0.16858553886413574, + "learning_rate": 1.7072356731967152e-05, + "loss": 1.1717, + "step": 7461 + }, + { + "epoch": 2.7781410338448653, + "grad_norm": 0.16324035823345184, + "learning_rate": 1.707149747801129e-05, + "loss": 1.1706, + "step": 7462 + }, + { + "epoch": 2.778513338995474, + "grad_norm": 0.16237075626850128, + "learning_rate": 1.7070638119609854e-05, + "loss": 1.1689, + "step": 7463 + }, + { + "epoch": 2.778885644146083, + "grad_norm": 0.1649467796087265, + "learning_rate": 1.706977865677554e-05, + "loss": 1.1741, + "step": 7464 + }, + { + "epoch": 2.7792579492966922, + "grad_norm": 0.16637516021728516, + "learning_rate": 1.7068919089521032e-05, + "loss": 1.1881, + "step": 7465 + }, + { + "epoch": 2.7796302544473015, + "grad_norm": 0.1721596121788025, + "learning_rate": 1.7068059417859037e-05, + "loss": 1.1669, + "step": 7466 + }, + { + "epoch": 2.7800025595979103, + "grad_norm": 0.16336818039417267, + "learning_rate": 1.7067199641802247e-05, + "loss": 1.1616, + "step": 7467 + }, + { + "epoch": 2.7803748647485196, + "grad_norm": 0.1582047939300537, + "learning_rate": 1.7066339761363364e-05, + "loss": 1.1887, + "step": 7468 + }, + { + "epoch": 2.7807471698991284, + "grad_norm": 0.16283096373081207, + "learning_rate": 1.7065479776555083e-05, + "loss": 1.1633, + "step": 7469 + }, + { + "epoch": 2.7811194750497377, + "grad_norm": 0.16766688227653503, + "learning_rate": 1.7064619687390108e-05, + "loss": 1.1501, + "step": 7470 + }, + { + "epoch": 2.781491780200347, + "grad_norm": 0.16073793172836304, + "learning_rate": 1.706375949388115e-05, + "loss": 1.166, + "step": 7471 + }, + { + "epoch": 2.7818640853509558, + "grad_norm": 0.1599031388759613, + "learning_rate": 1.7062899196040903e-05, + "loss": 1.1751, + "step": 7472 + }, + { + "epoch": 2.7822363905015646, + "grad_norm": 0.16344262659549713, + "learning_rate": 1.7062038793882078e-05, + "loss": 1.1671, + "step": 7473 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.16053412854671478, + "learning_rate": 1.7061178287417383e-05, + "loss": 1.1696, + "step": 7474 + }, + { + "epoch": 2.782981000802783, + "grad_norm": 0.16248364746570587, + "learning_rate": 1.706031767665953e-05, + "loss": 1.1705, + "step": 7475 + }, + { + "epoch": 2.783353305953392, + "grad_norm": 0.16297762095928192, + "learning_rate": 1.7059456961621226e-05, + "loss": 1.1607, + "step": 7476 + }, + { + "epoch": 2.783725611104001, + "grad_norm": 0.16271525621414185, + "learning_rate": 1.7058596142315185e-05, + "loss": 1.1568, + "step": 7477 + }, + { + "epoch": 2.78409791625461, + "grad_norm": 0.1527474820613861, + "learning_rate": 1.7057735218754126e-05, + "loss": 1.1806, + "step": 7478 + }, + { + "epoch": 2.7844702214052193, + "grad_norm": 0.15945225954055786, + "learning_rate": 1.705687419095076e-05, + "loss": 1.1747, + "step": 7479 + }, + { + "epoch": 2.7848425265558285, + "grad_norm": 0.16087423264980316, + "learning_rate": 1.7056013058917802e-05, + "loss": 1.1723, + "step": 7480 + }, + { + "epoch": 2.7852148317064374, + "grad_norm": 0.16305401921272278, + "learning_rate": 1.7055151822667975e-05, + "loss": 1.1841, + "step": 7481 + }, + { + "epoch": 2.7855871368570466, + "grad_norm": 0.16334961354732513, + "learning_rate": 1.7054290482213996e-05, + "loss": 1.1602, + "step": 7482 + }, + { + "epoch": 2.7859594420076554, + "grad_norm": 0.16361406445503235, + "learning_rate": 1.7053429037568596e-05, + "loss": 1.1759, + "step": 7483 + }, + { + "epoch": 2.7863317471582647, + "grad_norm": 0.1611858606338501, + "learning_rate": 1.7052567488744485e-05, + "loss": 1.1622, + "step": 7484 + }, + { + "epoch": 2.7867040523088735, + "grad_norm": 0.1641787737607956, + "learning_rate": 1.7051705835754394e-05, + "loss": 1.1794, + "step": 7485 + }, + { + "epoch": 2.787076357459483, + "grad_norm": 0.15844056010246277, + "learning_rate": 1.7050844078611058e-05, + "loss": 1.1711, + "step": 7486 + }, + { + "epoch": 2.7874486626100916, + "grad_norm": 0.15609781444072723, + "learning_rate": 1.7049982217327192e-05, + "loss": 1.1585, + "step": 7487 + }, + { + "epoch": 2.787820967760701, + "grad_norm": 0.16171182692050934, + "learning_rate": 1.704912025191553e-05, + "loss": 1.1798, + "step": 7488 + }, + { + "epoch": 2.78819327291131, + "grad_norm": 0.16126160323619843, + "learning_rate": 1.704825818238881e-05, + "loss": 1.158, + "step": 7489 + }, + { + "epoch": 2.788565578061919, + "grad_norm": 0.16148677468299866, + "learning_rate": 1.7047396008759755e-05, + "loss": 1.168, + "step": 7490 + }, + { + "epoch": 2.7889378832125282, + "grad_norm": 0.1633322536945343, + "learning_rate": 1.7046533731041103e-05, + "loss": 1.1784, + "step": 7491 + }, + { + "epoch": 2.789310188363137, + "grad_norm": 0.16195450723171234, + "learning_rate": 1.7045671349245588e-05, + "loss": 1.1889, + "step": 7492 + }, + { + "epoch": 2.7896824935137463, + "grad_norm": 0.16481108963489532, + "learning_rate": 1.7044808863385953e-05, + "loss": 1.1693, + "step": 7493 + }, + { + "epoch": 2.790054798664355, + "grad_norm": 0.16438551247119904, + "learning_rate": 1.7043946273474935e-05, + "loss": 1.1582, + "step": 7494 + }, + { + "epoch": 2.7904271038149644, + "grad_norm": 0.16081902384757996, + "learning_rate": 1.704308357952527e-05, + "loss": 1.1666, + "step": 7495 + }, + { + "epoch": 2.790799408965573, + "grad_norm": 0.16997969150543213, + "learning_rate": 1.7042220781549703e-05, + "loss": 1.1741, + "step": 7496 + }, + { + "epoch": 2.7911717141161825, + "grad_norm": 0.16140542924404144, + "learning_rate": 1.7041357879560972e-05, + "loss": 1.171, + "step": 7497 + }, + { + "epoch": 2.7915440192667917, + "grad_norm": 0.1633981466293335, + "learning_rate": 1.7040494873571832e-05, + "loss": 1.1596, + "step": 7498 + }, + { + "epoch": 2.7919163244174006, + "grad_norm": 0.17155370116233826, + "learning_rate": 1.7039631763595025e-05, + "loss": 1.1825, + "step": 7499 + }, + { + "epoch": 2.79228862956801, + "grad_norm": 0.17078445851802826, + "learning_rate": 1.7038768549643297e-05, + "loss": 1.175, + "step": 7500 + }, + { + "epoch": 2.79228862956801, + "eval_loss": 1.2953369617462158, + "eval_runtime": 16.7431, + "eval_samples_per_second": 103.565, + "eval_steps_per_second": 5.196, + "step": 7500 + }, + { + "epoch": 2.7926609347186186, + "grad_norm": 0.16718538105487823, + "learning_rate": 1.7037905231729402e-05, + "loss": 1.1802, + "step": 7501 + }, + { + "epoch": 2.793033239869228, + "grad_norm": 0.16334514319896698, + "learning_rate": 1.7037041809866085e-05, + "loss": 1.1619, + "step": 7502 + }, + { + "epoch": 2.7934055450198367, + "grad_norm": 0.1652374416589737, + "learning_rate": 1.7036178284066103e-05, + "loss": 1.1584, + "step": 7503 + }, + { + "epoch": 2.793777850170446, + "grad_norm": 0.17066913843154907, + "learning_rate": 1.703531465434221e-05, + "loss": 1.1792, + "step": 7504 + }, + { + "epoch": 2.794150155321055, + "grad_norm": 0.1763291358947754, + "learning_rate": 1.7034450920707162e-05, + "loss": 1.1846, + "step": 7505 + }, + { + "epoch": 2.794522460471664, + "grad_norm": 0.16274365782737732, + "learning_rate": 1.7033587083173713e-05, + "loss": 1.1639, + "step": 7506 + }, + { + "epoch": 2.7948947656222733, + "grad_norm": 0.16275723278522491, + "learning_rate": 1.7032723141754626e-05, + "loss": 1.1731, + "step": 7507 + }, + { + "epoch": 2.795267070772882, + "grad_norm": 0.16381405293941498, + "learning_rate": 1.703185909646266e-05, + "loss": 1.1696, + "step": 7508 + }, + { + "epoch": 2.7956393759234914, + "grad_norm": 0.1678345650434494, + "learning_rate": 1.7030994947310576e-05, + "loss": 1.1755, + "step": 7509 + }, + { + "epoch": 2.7960116810741003, + "grad_norm": 0.16982942819595337, + "learning_rate": 1.703013069431114e-05, + "loss": 1.1687, + "step": 7510 + }, + { + "epoch": 2.7963839862247095, + "grad_norm": 0.16046252846717834, + "learning_rate": 1.7029266337477106e-05, + "loss": 1.1686, + "step": 7511 + }, + { + "epoch": 2.7967562913753183, + "grad_norm": 0.16178347170352936, + "learning_rate": 1.7028401876821257e-05, + "loss": 1.1689, + "step": 7512 + }, + { + "epoch": 2.7971285965259276, + "grad_norm": 0.16835175454616547, + "learning_rate": 1.7027537312356353e-05, + "loss": 1.1482, + "step": 7513 + }, + { + "epoch": 2.7975009016765364, + "grad_norm": 0.16404716670513153, + "learning_rate": 1.702667264409516e-05, + "loss": 1.157, + "step": 7514 + }, + { + "epoch": 2.7978732068271457, + "grad_norm": 0.15957726538181305, + "learning_rate": 1.7025807872050456e-05, + "loss": 1.1719, + "step": 7515 + }, + { + "epoch": 2.798245511977755, + "grad_norm": 0.16722844541072845, + "learning_rate": 1.702494299623501e-05, + "loss": 1.174, + "step": 7516 + }, + { + "epoch": 2.7986178171283638, + "grad_norm": 0.16469328105449677, + "learning_rate": 1.7024078016661597e-05, + "loss": 1.1758, + "step": 7517 + }, + { + "epoch": 2.798990122278973, + "grad_norm": 0.16519738733768463, + "learning_rate": 1.7023212933342995e-05, + "loss": 1.1714, + "step": 7518 + }, + { + "epoch": 2.799362427429582, + "grad_norm": 0.16509708762168884, + "learning_rate": 1.7022347746291975e-05, + "loss": 1.1692, + "step": 7519 + }, + { + "epoch": 2.799734732580191, + "grad_norm": 0.15529975295066833, + "learning_rate": 1.7021482455521323e-05, + "loss": 1.1697, + "step": 7520 + }, + { + "epoch": 2.8001070377308, + "grad_norm": 0.16663393378257751, + "learning_rate": 1.7020617061043815e-05, + "loss": 1.1656, + "step": 7521 + }, + { + "epoch": 2.800479342881409, + "grad_norm": 0.16711021959781647, + "learning_rate": 1.701975156287223e-05, + "loss": 1.1635, + "step": 7522 + }, + { + "epoch": 2.800851648032018, + "grad_norm": 0.1636432558298111, + "learning_rate": 1.7018885961019356e-05, + "loss": 1.1626, + "step": 7523 + }, + { + "epoch": 2.8012239531826273, + "grad_norm": 0.16135211288928986, + "learning_rate": 1.701802025549798e-05, + "loss": 1.1764, + "step": 7524 + }, + { + "epoch": 2.8015962583332366, + "grad_norm": 0.161915123462677, + "learning_rate": 1.7017154446320882e-05, + "loss": 1.1893, + "step": 7525 + }, + { + "epoch": 2.8019685634838454, + "grad_norm": 0.1613880842924118, + "learning_rate": 1.7016288533500855e-05, + "loss": 1.1716, + "step": 7526 + }, + { + "epoch": 2.8023408686344546, + "grad_norm": 0.1624586284160614, + "learning_rate": 1.7015422517050686e-05, + "loss": 1.1683, + "step": 7527 + }, + { + "epoch": 2.8027131737850635, + "grad_norm": 0.1543359011411667, + "learning_rate": 1.7014556396983168e-05, + "loss": 1.1611, + "step": 7528 + }, + { + "epoch": 2.8030854789356727, + "grad_norm": 0.16166086494922638, + "learning_rate": 1.701369017331109e-05, + "loss": 1.1609, + "step": 7529 + }, + { + "epoch": 2.803457784086282, + "grad_norm": 0.1620410680770874, + "learning_rate": 1.7012823846047252e-05, + "loss": 1.1608, + "step": 7530 + }, + { + "epoch": 2.803830089236891, + "grad_norm": 0.168722003698349, + "learning_rate": 1.7011957415204443e-05, + "loss": 1.1798, + "step": 7531 + }, + { + "epoch": 2.8042023943874996, + "grad_norm": 0.16052375733852386, + "learning_rate": 1.7011090880795463e-05, + "loss": 1.1571, + "step": 7532 + }, + { + "epoch": 2.804574699538109, + "grad_norm": 0.1640620082616806, + "learning_rate": 1.701022424283311e-05, + "loss": 1.1875, + "step": 7533 + }, + { + "epoch": 2.804947004688718, + "grad_norm": 0.16312499344348907, + "learning_rate": 1.7009357501330188e-05, + "loss": 1.1748, + "step": 7534 + }, + { + "epoch": 2.805319309839327, + "grad_norm": 0.1676671802997589, + "learning_rate": 1.7008490656299492e-05, + "loss": 1.1347, + "step": 7535 + }, + { + "epoch": 2.8056916149899362, + "grad_norm": 0.15957991778850555, + "learning_rate": 1.700762370775383e-05, + "loss": 1.1705, + "step": 7536 + }, + { + "epoch": 2.806063920140545, + "grad_norm": 0.1587969958782196, + "learning_rate": 1.700675665570601e-05, + "loss": 1.1683, + "step": 7537 + }, + { + "epoch": 2.8064362252911543, + "grad_norm": 0.15972483158111572, + "learning_rate": 1.7005889500168828e-05, + "loss": 1.172, + "step": 7538 + }, + { + "epoch": 2.8068085304417636, + "grad_norm": 0.16055303812026978, + "learning_rate": 1.70050222411551e-05, + "loss": 1.1702, + "step": 7539 + }, + { + "epoch": 2.8071808355923724, + "grad_norm": 0.16454675793647766, + "learning_rate": 1.7004154878677634e-05, + "loss": 1.173, + "step": 7540 + }, + { + "epoch": 2.8075531407429812, + "grad_norm": 0.16008242964744568, + "learning_rate": 1.7003287412749236e-05, + "loss": 1.1688, + "step": 7541 + }, + { + "epoch": 2.8079254458935905, + "grad_norm": 0.16654878854751587, + "learning_rate": 1.7002419843382724e-05, + "loss": 1.1581, + "step": 7542 + }, + { + "epoch": 2.8082977510441998, + "grad_norm": 0.15987439453601837, + "learning_rate": 1.7001552170590913e-05, + "loss": 1.1766, + "step": 7543 + }, + { + "epoch": 2.8086700561948086, + "grad_norm": 0.15910540521144867, + "learning_rate": 1.7000684394386615e-05, + "loss": 1.1669, + "step": 7544 + }, + { + "epoch": 2.809042361345418, + "grad_norm": 0.16279643774032593, + "learning_rate": 1.6999816514782647e-05, + "loss": 1.1597, + "step": 7545 + }, + { + "epoch": 2.8094146664960267, + "grad_norm": 0.16432224214076996, + "learning_rate": 1.699894853179183e-05, + "loss": 1.1671, + "step": 7546 + }, + { + "epoch": 2.809786971646636, + "grad_norm": 0.16405612230300903, + "learning_rate": 1.699808044542698e-05, + "loss": 1.1825, + "step": 7547 + }, + { + "epoch": 2.810159276797245, + "grad_norm": 0.16156992316246033, + "learning_rate": 1.6997212255700924e-05, + "loss": 1.1532, + "step": 7548 + }, + { + "epoch": 2.810531581947854, + "grad_norm": 0.1572588086128235, + "learning_rate": 1.699634396262648e-05, + "loss": 1.1612, + "step": 7549 + }, + { + "epoch": 2.810903887098463, + "grad_norm": 0.16491782665252686, + "learning_rate": 1.6995475566216475e-05, + "loss": 1.1627, + "step": 7550 + }, + { + "epoch": 2.811276192249072, + "grad_norm": 0.1590576469898224, + "learning_rate": 1.6994607066483735e-05, + "loss": 1.159, + "step": 7551 + }, + { + "epoch": 2.8116484973996814, + "grad_norm": 0.16059501469135284, + "learning_rate": 1.6993738463441087e-05, + "loss": 1.1792, + "step": 7552 + }, + { + "epoch": 2.81202080255029, + "grad_norm": 0.16398665308952332, + "learning_rate": 1.6992869757101362e-05, + "loss": 1.1696, + "step": 7553 + }, + { + "epoch": 2.8123931077008995, + "grad_norm": 0.16270703077316284, + "learning_rate": 1.6992000947477386e-05, + "loss": 1.1745, + "step": 7554 + }, + { + "epoch": 2.8127654128515083, + "grad_norm": 0.1659878045320511, + "learning_rate": 1.6991132034582e-05, + "loss": 1.1604, + "step": 7555 + }, + { + "epoch": 2.8131377180021175, + "grad_norm": 0.16921178996562958, + "learning_rate": 1.699026301842803e-05, + "loss": 1.1652, + "step": 7556 + }, + { + "epoch": 2.813510023152727, + "grad_norm": 0.16774874925613403, + "learning_rate": 1.6989393899028313e-05, + "loss": 1.1847, + "step": 7557 + }, + { + "epoch": 2.8138823283033356, + "grad_norm": 0.15916696190834045, + "learning_rate": 1.698852467639569e-05, + "loss": 1.1682, + "step": 7558 + }, + { + "epoch": 2.8142546334539444, + "grad_norm": 0.1583160012960434, + "learning_rate": 1.6987655350542993e-05, + "loss": 1.1811, + "step": 7559 + }, + { + "epoch": 2.8146269386045537, + "grad_norm": 0.16362488269805908, + "learning_rate": 1.6986785921483068e-05, + "loss": 1.1675, + "step": 7560 + }, + { + "epoch": 2.814999243755163, + "grad_norm": 0.16386942565441132, + "learning_rate": 1.6985916389228746e-05, + "loss": 1.173, + "step": 7561 + }, + { + "epoch": 2.815371548905772, + "grad_norm": 0.16522067785263062, + "learning_rate": 1.6985046753792885e-05, + "loss": 1.1626, + "step": 7562 + }, + { + "epoch": 2.815743854056381, + "grad_norm": 0.15882590413093567, + "learning_rate": 1.698417701518832e-05, + "loss": 1.1694, + "step": 7563 + }, + { + "epoch": 2.81611615920699, + "grad_norm": 0.1628119945526123, + "learning_rate": 1.69833071734279e-05, + "loss": 1.1819, + "step": 7564 + }, + { + "epoch": 2.816488464357599, + "grad_norm": 0.1661859154701233, + "learning_rate": 1.6982437228524468e-05, + "loss": 1.1754, + "step": 7565 + }, + { + "epoch": 2.8168607695082084, + "grad_norm": 0.16389884054660797, + "learning_rate": 1.698156718049088e-05, + "loss": 1.1741, + "step": 7566 + }, + { + "epoch": 2.8172330746588172, + "grad_norm": 0.16429540514945984, + "learning_rate": 1.6980697029339978e-05, + "loss": 1.1648, + "step": 7567 + }, + { + "epoch": 2.817605379809426, + "grad_norm": 0.16381768882274628, + "learning_rate": 1.6979826775084624e-05, + "loss": 1.1656, + "step": 7568 + }, + { + "epoch": 2.8179776849600353, + "grad_norm": 0.1680193990468979, + "learning_rate": 1.6978956417737663e-05, + "loss": 1.167, + "step": 7569 + }, + { + "epoch": 2.8183499901106446, + "grad_norm": 0.16285154223442078, + "learning_rate": 1.6978085957311956e-05, + "loss": 1.1723, + "step": 7570 + }, + { + "epoch": 2.8187222952612534, + "grad_norm": 0.1649884134531021, + "learning_rate": 1.6977215393820357e-05, + "loss": 1.1937, + "step": 7571 + }, + { + "epoch": 2.8190946004118627, + "grad_norm": 0.1563154011964798, + "learning_rate": 1.6976344727275725e-05, + "loss": 1.177, + "step": 7572 + }, + { + "epoch": 2.8194669055624715, + "grad_norm": 0.1616252064704895, + "learning_rate": 1.6975473957690917e-05, + "loss": 1.1679, + "step": 7573 + }, + { + "epoch": 2.8198392107130807, + "grad_norm": 0.16457171738147736, + "learning_rate": 1.6974603085078798e-05, + "loss": 1.1668, + "step": 7574 + }, + { + "epoch": 2.82021151586369, + "grad_norm": 0.1589018553495407, + "learning_rate": 1.697373210945223e-05, + "loss": 1.1653, + "step": 7575 + }, + { + "epoch": 2.820583821014299, + "grad_norm": 0.16056711971759796, + "learning_rate": 1.6972861030824072e-05, + "loss": 1.169, + "step": 7576 + }, + { + "epoch": 2.8209561261649077, + "grad_norm": 0.1676499992609024, + "learning_rate": 1.69719898492072e-05, + "loss": 1.1786, + "step": 7577 + }, + { + "epoch": 2.821328431315517, + "grad_norm": 0.16967107355594635, + "learning_rate": 1.6971118564614473e-05, + "loss": 1.158, + "step": 7578 + }, + { + "epoch": 2.821700736466126, + "grad_norm": 0.17274503409862518, + "learning_rate": 1.697024717705876e-05, + "loss": 1.1906, + "step": 7579 + }, + { + "epoch": 2.822073041616735, + "grad_norm": 0.16663110256195068, + "learning_rate": 1.696937568655294e-05, + "loss": 1.17, + "step": 7580 + }, + { + "epoch": 2.8224453467673443, + "grad_norm": 0.16743509471416473, + "learning_rate": 1.696850409310987e-05, + "loss": 1.1783, + "step": 7581 + }, + { + "epoch": 2.822817651917953, + "grad_norm": 0.16072916984558105, + "learning_rate": 1.6967632396742434e-05, + "loss": 1.1684, + "step": 7582 + }, + { + "epoch": 2.8231899570685624, + "grad_norm": 0.16104640066623688, + "learning_rate": 1.696676059746351e-05, + "loss": 1.1607, + "step": 7583 + }, + { + "epoch": 2.8235622622191716, + "grad_norm": 0.1597306877374649, + "learning_rate": 1.6965888695285965e-05, + "loss": 1.1765, + "step": 7584 + }, + { + "epoch": 2.8239345673697804, + "grad_norm": 0.16316664218902588, + "learning_rate": 1.6965016690222685e-05, + "loss": 1.1683, + "step": 7585 + }, + { + "epoch": 2.8243068725203893, + "grad_norm": 0.1655420958995819, + "learning_rate": 1.696414458228654e-05, + "loss": 1.1851, + "step": 7586 + }, + { + "epoch": 2.8246791776709985, + "grad_norm": 0.16307014226913452, + "learning_rate": 1.696327237149042e-05, + "loss": 1.174, + "step": 7587 + }, + { + "epoch": 2.825051482821608, + "grad_norm": 0.16215850412845612, + "learning_rate": 1.6962400057847202e-05, + "loss": 1.1768, + "step": 7588 + }, + { + "epoch": 2.8254237879722166, + "grad_norm": 0.16659583151340485, + "learning_rate": 1.6961527641369774e-05, + "loss": 1.1698, + "step": 7589 + }, + { + "epoch": 2.825796093122826, + "grad_norm": 0.16407622396945953, + "learning_rate": 1.6960655122071023e-05, + "loss": 1.1619, + "step": 7590 + }, + { + "epoch": 2.8261683982734347, + "grad_norm": 0.15823231637477875, + "learning_rate": 1.6959782499963827e-05, + "loss": 1.1612, + "step": 7591 + }, + { + "epoch": 2.826540703424044, + "grad_norm": 0.16647450625896454, + "learning_rate": 1.6958909775061082e-05, + "loss": 1.1726, + "step": 7592 + }, + { + "epoch": 2.826913008574653, + "grad_norm": 0.16879020631313324, + "learning_rate": 1.6958036947375676e-05, + "loss": 1.1824, + "step": 7593 + }, + { + "epoch": 2.827285313725262, + "grad_norm": 0.16589350998401642, + "learning_rate": 1.69571640169205e-05, + "loss": 1.1716, + "step": 7594 + }, + { + "epoch": 2.827657618875871, + "grad_norm": 0.1656552106142044, + "learning_rate": 1.695629098370845e-05, + "loss": 1.1689, + "step": 7595 + }, + { + "epoch": 2.82802992402648, + "grad_norm": 0.16472189128398895, + "learning_rate": 1.6955417847752417e-05, + "loss": 1.1785, + "step": 7596 + }, + { + "epoch": 2.8284022291770894, + "grad_norm": 0.16501384973526, + "learning_rate": 1.69545446090653e-05, + "loss": 1.1715, + "step": 7597 + }, + { + "epoch": 2.828774534327698, + "grad_norm": 0.16673643887043, + "learning_rate": 1.6953671267659996e-05, + "loss": 1.184, + "step": 7598 + }, + { + "epoch": 2.8291468394783075, + "grad_norm": 0.16138608753681183, + "learning_rate": 1.6952797823549406e-05, + "loss": 1.164, + "step": 7599 + }, + { + "epoch": 2.8295191446289163, + "grad_norm": 0.15768542885780334, + "learning_rate": 1.6951924276746425e-05, + "loss": 1.1794, + "step": 7600 + }, + { + "epoch": 2.8298914497795256, + "grad_norm": 0.1626758873462677, + "learning_rate": 1.6951050627263958e-05, + "loss": 1.1608, + "step": 7601 + }, + { + "epoch": 2.830263754930135, + "grad_norm": 0.16393102705478668, + "learning_rate": 1.695017687511491e-05, + "loss": 1.1826, + "step": 7602 + }, + { + "epoch": 2.8306360600807436, + "grad_norm": 0.16286106407642365, + "learning_rate": 1.6949303020312188e-05, + "loss": 1.1822, + "step": 7603 + }, + { + "epoch": 2.831008365231353, + "grad_norm": 0.16217343509197235, + "learning_rate": 1.6948429062868697e-05, + "loss": 1.1658, + "step": 7604 + }, + { + "epoch": 2.8313806703819617, + "grad_norm": 0.17708547413349152, + "learning_rate": 1.6947555002797344e-05, + "loss": 1.1522, + "step": 7605 + }, + { + "epoch": 2.831752975532571, + "grad_norm": 0.18956957757472992, + "learning_rate": 1.6946680840111035e-05, + "loss": 1.17, + "step": 7606 + }, + { + "epoch": 2.83212528068318, + "grad_norm": 0.1878315806388855, + "learning_rate": 1.6945806574822693e-05, + "loss": 1.1833, + "step": 7607 + }, + { + "epoch": 2.832497585833789, + "grad_norm": 0.1767759919166565, + "learning_rate": 1.6944932206945218e-05, + "loss": 1.1772, + "step": 7608 + }, + { + "epoch": 2.832869890984398, + "grad_norm": 0.1732354611158371, + "learning_rate": 1.6944057736491534e-05, + "loss": 1.1649, + "step": 7609 + }, + { + "epoch": 2.833242196135007, + "grad_norm": 0.16452713310718536, + "learning_rate": 1.694318316347455e-05, + "loss": 1.1707, + "step": 7610 + }, + { + "epoch": 2.8336145012856164, + "grad_norm": 0.167094424366951, + "learning_rate": 1.6942308487907187e-05, + "loss": 1.1749, + "step": 7611 + }, + { + "epoch": 2.8339868064362252, + "grad_norm": 0.17273786664009094, + "learning_rate": 1.694143370980237e-05, + "loss": 1.1577, + "step": 7612 + }, + { + "epoch": 2.8343591115868345, + "grad_norm": 0.16662479937076569, + "learning_rate": 1.6940558829173004e-05, + "loss": 1.1733, + "step": 7613 + }, + { + "epoch": 2.8347314167374433, + "grad_norm": 0.16592276096343994, + "learning_rate": 1.6939683846032022e-05, + "loss": 1.1724, + "step": 7614 + }, + { + "epoch": 2.8351037218880526, + "grad_norm": 0.16113296151161194, + "learning_rate": 1.6938808760392346e-05, + "loss": 1.1643, + "step": 7615 + }, + { + "epoch": 2.8354760270386614, + "grad_norm": 0.16023372113704681, + "learning_rate": 1.69379335722669e-05, + "loss": 1.1684, + "step": 7616 + }, + { + "epoch": 2.8358483321892707, + "grad_norm": 0.16699042916297913, + "learning_rate": 1.693705828166861e-05, + "loss": 1.1701, + "step": 7617 + }, + { + "epoch": 2.8362206373398795, + "grad_norm": 0.1670234203338623, + "learning_rate": 1.693618288861041e-05, + "loss": 1.1809, + "step": 7618 + }, + { + "epoch": 2.8365929424904888, + "grad_norm": 0.19244331121444702, + "learning_rate": 1.6935307393105215e-05, + "loss": 1.153, + "step": 7619 + }, + { + "epoch": 2.836965247641098, + "grad_norm": 0.16699756681919098, + "learning_rate": 1.6934431795165972e-05, + "loss": 1.1687, + "step": 7620 + }, + { + "epoch": 2.837337552791707, + "grad_norm": 0.1908612698316574, + "learning_rate": 1.6933556094805602e-05, + "loss": 1.1881, + "step": 7621 + }, + { + "epoch": 2.837709857942316, + "grad_norm": 0.17475947737693787, + "learning_rate": 1.6932680292037045e-05, + "loss": 1.1511, + "step": 7622 + }, + { + "epoch": 2.838082163092925, + "grad_norm": 0.16836613416671753, + "learning_rate": 1.6931804386873232e-05, + "loss": 1.1599, + "step": 7623 + }, + { + "epoch": 2.838454468243534, + "grad_norm": 0.17344890534877777, + "learning_rate": 1.693092837932711e-05, + "loss": 1.1767, + "step": 7624 + }, + { + "epoch": 2.838826773394143, + "grad_norm": 0.17366820573806763, + "learning_rate": 1.693005226941161e-05, + "loss": 1.1815, + "step": 7625 + }, + { + "epoch": 2.8391990785447523, + "grad_norm": 0.16873829066753387, + "learning_rate": 1.692917605713967e-05, + "loss": 1.1751, + "step": 7626 + }, + { + "epoch": 2.839571383695361, + "grad_norm": 0.1660252809524536, + "learning_rate": 1.6928299742524236e-05, + "loss": 1.1723, + "step": 7627 + }, + { + "epoch": 2.8399436888459704, + "grad_norm": 0.17038235068321228, + "learning_rate": 1.6927423325578248e-05, + "loss": 1.1829, + "step": 7628 + }, + { + "epoch": 2.8403159939965796, + "grad_norm": 0.1674618273973465, + "learning_rate": 1.692654680631465e-05, + "loss": 1.1787, + "step": 7629 + }, + { + "epoch": 2.8406882991471885, + "grad_norm": 0.16933341324329376, + "learning_rate": 1.692567018474639e-05, + "loss": 1.169, + "step": 7630 + }, + { + "epoch": 2.8410606042977977, + "grad_norm": 0.16508066654205322, + "learning_rate": 1.6924793460886424e-05, + "loss": 1.1711, + "step": 7631 + }, + { + "epoch": 2.8414329094484065, + "grad_norm": 0.1625910848379135, + "learning_rate": 1.692391663474769e-05, + "loss": 1.1557, + "step": 7632 + }, + { + "epoch": 2.841805214599016, + "grad_norm": 0.16758020222187042, + "learning_rate": 1.692303970634314e-05, + "loss": 1.1555, + "step": 7633 + }, + { + "epoch": 2.8421775197496246, + "grad_norm": 0.1603182852268219, + "learning_rate": 1.6922162675685725e-05, + "loss": 1.1575, + "step": 7634 + }, + { + "epoch": 2.842549824900234, + "grad_norm": 0.16572555899620056, + "learning_rate": 1.6921285542788405e-05, + "loss": 1.1687, + "step": 7635 + }, + { + "epoch": 2.8429221300508427, + "grad_norm": 0.16787642240524292, + "learning_rate": 1.6920408307664132e-05, + "loss": 1.1629, + "step": 7636 + }, + { + "epoch": 2.843294435201452, + "grad_norm": 0.16438321769237518, + "learning_rate": 1.6919530970325865e-05, + "loss": 1.1628, + "step": 7637 + }, + { + "epoch": 2.8436667403520612, + "grad_norm": 0.16906222701072693, + "learning_rate": 1.6918653530786555e-05, + "loss": 1.1589, + "step": 7638 + }, + { + "epoch": 2.84403904550267, + "grad_norm": 0.1649426817893982, + "learning_rate": 1.6917775989059167e-05, + "loss": 1.1705, + "step": 7639 + }, + { + "epoch": 2.8444113506532793, + "grad_norm": 0.17663908004760742, + "learning_rate": 1.6916898345156668e-05, + "loss": 1.1638, + "step": 7640 + }, + { + "epoch": 2.844783655803888, + "grad_norm": 0.16318029165267944, + "learning_rate": 1.6916020599092007e-05, + "loss": 1.1592, + "step": 7641 + }, + { + "epoch": 2.8451559609544974, + "grad_norm": 0.17562070488929749, + "learning_rate": 1.691514275087816e-05, + "loss": 1.1606, + "step": 7642 + }, + { + "epoch": 2.8455282661051067, + "grad_norm": 0.1714848279953003, + "learning_rate": 1.6914264800528087e-05, + "loss": 1.1647, + "step": 7643 + }, + { + "epoch": 2.8459005712557155, + "grad_norm": 0.1683172881603241, + "learning_rate": 1.6913386748054757e-05, + "loss": 1.175, + "step": 7644 + }, + { + "epoch": 2.8462728764063243, + "grad_norm": 0.18591049313545227, + "learning_rate": 1.6912508593471137e-05, + "loss": 1.1501, + "step": 7645 + }, + { + "epoch": 2.8466451815569336, + "grad_norm": 0.16155651211738586, + "learning_rate": 1.69116303367902e-05, + "loss": 1.1764, + "step": 7646 + }, + { + "epoch": 2.847017486707543, + "grad_norm": 0.1677398383617401, + "learning_rate": 1.691075197802492e-05, + "loss": 1.1649, + "step": 7647 + }, + { + "epoch": 2.8473897918581517, + "grad_norm": 0.16577932238578796, + "learning_rate": 1.6909873517188263e-05, + "loss": 1.1761, + "step": 7648 + }, + { + "epoch": 2.847762097008761, + "grad_norm": 0.161788210272789, + "learning_rate": 1.690899495429321e-05, + "loss": 1.1662, + "step": 7649 + }, + { + "epoch": 2.8481344021593697, + "grad_norm": 0.20268653333187103, + "learning_rate": 1.6908116289352735e-05, + "loss": 1.1871, + "step": 7650 + }, + { + "epoch": 2.848506707309979, + "grad_norm": 0.18162259459495544, + "learning_rate": 1.6907237522379816e-05, + "loss": 1.1549, + "step": 7651 + }, + { + "epoch": 2.8488790124605883, + "grad_norm": 0.16151395440101624, + "learning_rate": 1.690635865338743e-05, + "loss": 1.1815, + "step": 7652 + }, + { + "epoch": 2.849251317611197, + "grad_norm": 0.1718040108680725, + "learning_rate": 1.6905479682388565e-05, + "loss": 1.1562, + "step": 7653 + }, + { + "epoch": 2.849623622761806, + "grad_norm": 0.165380597114563, + "learning_rate": 1.690460060939619e-05, + "loss": 1.1713, + "step": 7654 + }, + { + "epoch": 2.849995927912415, + "grad_norm": 0.18731936812400818, + "learning_rate": 1.6903721434423306e-05, + "loss": 1.1732, + "step": 7655 + }, + { + "epoch": 2.8503682330630244, + "grad_norm": 0.16210754215717316, + "learning_rate": 1.6902842157482885e-05, + "loss": 1.1712, + "step": 7656 + }, + { + "epoch": 2.8507405382136333, + "grad_norm": 0.1721230000257492, + "learning_rate": 1.690196277858792e-05, + "loss": 1.1572, + "step": 7657 + }, + { + "epoch": 2.8511128433642425, + "grad_norm": 0.16690939664840698, + "learning_rate": 1.6901083297751397e-05, + "loss": 1.1777, + "step": 7658 + }, + { + "epoch": 2.8514851485148514, + "grad_norm": 0.16586491465568542, + "learning_rate": 1.6900203714986307e-05, + "loss": 1.1747, + "step": 7659 + }, + { + "epoch": 2.8518574536654606, + "grad_norm": 0.17058177292346954, + "learning_rate": 1.689932403030564e-05, + "loss": 1.1755, + "step": 7660 + }, + { + "epoch": 2.85222975881607, + "grad_norm": 0.16409049928188324, + "learning_rate": 1.6898444243722395e-05, + "loss": 1.1629, + "step": 7661 + }, + { + "epoch": 2.8526020639666787, + "grad_norm": 0.16023750603199005, + "learning_rate": 1.6897564355249556e-05, + "loss": 1.1642, + "step": 7662 + }, + { + "epoch": 2.8529743691172875, + "grad_norm": 0.17429199814796448, + "learning_rate": 1.6896684364900127e-05, + "loss": 1.1746, + "step": 7663 + }, + { + "epoch": 2.853346674267897, + "grad_norm": 0.1672469824552536, + "learning_rate": 1.68958042726871e-05, + "loss": 1.1697, + "step": 7664 + }, + { + "epoch": 2.853718979418506, + "grad_norm": 0.16756999492645264, + "learning_rate": 1.689492407862348e-05, + "loss": 1.1722, + "step": 7665 + }, + { + "epoch": 2.854091284569115, + "grad_norm": 0.16315622627735138, + "learning_rate": 1.689404378272226e-05, + "loss": 1.1669, + "step": 7666 + }, + { + "epoch": 2.854463589719724, + "grad_norm": 0.16442465782165527, + "learning_rate": 1.6893163384996453e-05, + "loss": 1.1892, + "step": 7667 + }, + { + "epoch": 2.854835894870333, + "grad_norm": 0.16709788143634796, + "learning_rate": 1.689228288545905e-05, + "loss": 1.1729, + "step": 7668 + }, + { + "epoch": 2.855208200020942, + "grad_norm": 0.17159411311149597, + "learning_rate": 1.689140228412306e-05, + "loss": 1.1619, + "step": 7669 + }, + { + "epoch": 2.8555805051715515, + "grad_norm": 0.16300630569458008, + "learning_rate": 1.68905215810015e-05, + "loss": 1.1589, + "step": 7670 + }, + { + "epoch": 2.8559528103221603, + "grad_norm": 0.17560981214046478, + "learning_rate": 1.6889640776107356e-05, + "loss": 1.1749, + "step": 7671 + }, + { + "epoch": 2.856325115472769, + "grad_norm": 0.16075876355171204, + "learning_rate": 1.688875986945366e-05, + "loss": 1.1659, + "step": 7672 + }, + { + "epoch": 2.8566974206233784, + "grad_norm": 0.16523872315883636, + "learning_rate": 1.6887878861053407e-05, + "loss": 1.175, + "step": 7673 + }, + { + "epoch": 2.8570697257739877, + "grad_norm": 0.1663205772638321, + "learning_rate": 1.688699775091962e-05, + "loss": 1.1689, + "step": 7674 + }, + { + "epoch": 2.8574420309245965, + "grad_norm": 0.17149294912815094, + "learning_rate": 1.68861165390653e-05, + "loss": 1.1637, + "step": 7675 + }, + { + "epoch": 2.8578143360752057, + "grad_norm": 0.15809527039527893, + "learning_rate": 1.688523522550348e-05, + "loss": 1.1621, + "step": 7676 + }, + { + "epoch": 2.8581866412258146, + "grad_norm": 0.16826729476451874, + "learning_rate": 1.6884353810247166e-05, + "loss": 1.1671, + "step": 7677 + }, + { + "epoch": 2.858558946376424, + "grad_norm": 0.1609870195388794, + "learning_rate": 1.6883472293309375e-05, + "loss": 1.1648, + "step": 7678 + }, + { + "epoch": 2.858931251527033, + "grad_norm": 0.16899265348911285, + "learning_rate": 1.6882590674703135e-05, + "loss": 1.1532, + "step": 7679 + }, + { + "epoch": 2.859303556677642, + "grad_norm": 0.1640852838754654, + "learning_rate": 1.6881708954441458e-05, + "loss": 1.1584, + "step": 7680 + }, + { + "epoch": 2.8596758618282507, + "grad_norm": 0.16974326968193054, + "learning_rate": 1.6880827132537373e-05, + "loss": 1.1733, + "step": 7681 + }, + { + "epoch": 2.86004816697886, + "grad_norm": 0.16964828968048096, + "learning_rate": 1.6879945209003903e-05, + "loss": 1.1698, + "step": 7682 + }, + { + "epoch": 2.8604204721294693, + "grad_norm": 0.17090067267417908, + "learning_rate": 1.6879063183854076e-05, + "loss": 1.1671, + "step": 7683 + }, + { + "epoch": 2.860792777280078, + "grad_norm": 0.1564168483018875, + "learning_rate": 1.6878181057100915e-05, + "loss": 1.1604, + "step": 7684 + }, + { + "epoch": 2.8611650824306873, + "grad_norm": 0.1685391068458557, + "learning_rate": 1.6877298828757452e-05, + "loss": 1.1765, + "step": 7685 + }, + { + "epoch": 2.861537387581296, + "grad_norm": 0.16214828193187714, + "learning_rate": 1.6876416498836717e-05, + "loss": 1.166, + "step": 7686 + }, + { + "epoch": 2.8619096927319054, + "grad_norm": 0.16030354797840118, + "learning_rate": 1.6875534067351744e-05, + "loss": 1.1634, + "step": 7687 + }, + { + "epoch": 2.8622819978825147, + "grad_norm": 0.17029841244220734, + "learning_rate": 1.687465153431556e-05, + "loss": 1.1719, + "step": 7688 + }, + { + "epoch": 2.8626543030331235, + "grad_norm": 0.1636890470981598, + "learning_rate": 1.6873768899741212e-05, + "loss": 1.1582, + "step": 7689 + }, + { + "epoch": 2.8630266081837323, + "grad_norm": 0.16757601499557495, + "learning_rate": 1.687288616364172e-05, + "loss": 1.1557, + "step": 7690 + }, + { + "epoch": 2.8633989133343416, + "grad_norm": 0.18567247688770294, + "learning_rate": 1.6872003326030136e-05, + "loss": 1.1699, + "step": 7691 + }, + { + "epoch": 2.863771218484951, + "grad_norm": 0.1703827977180481, + "learning_rate": 1.6871120386919493e-05, + "loss": 1.1579, + "step": 7692 + }, + { + "epoch": 2.8641435236355597, + "grad_norm": 0.16367879509925842, + "learning_rate": 1.6870237346322832e-05, + "loss": 1.156, + "step": 7693 + }, + { + "epoch": 2.864515828786169, + "grad_norm": 0.1624721884727478, + "learning_rate": 1.6869354204253195e-05, + "loss": 1.1659, + "step": 7694 + }, + { + "epoch": 2.8648881339367778, + "grad_norm": 0.16088762879371643, + "learning_rate": 1.686847096072363e-05, + "loss": 1.1753, + "step": 7695 + }, + { + "epoch": 2.865260439087387, + "grad_norm": 0.17531664669513702, + "learning_rate": 1.6867587615747184e-05, + "loss": 1.183, + "step": 7696 + }, + { + "epoch": 2.8656327442379963, + "grad_norm": 0.18076026439666748, + "learning_rate": 1.6866704169336895e-05, + "loss": 1.1681, + "step": 7697 + }, + { + "epoch": 2.866005049388605, + "grad_norm": 0.20938239991664886, + "learning_rate": 1.6865820621505812e-05, + "loss": 1.1816, + "step": 7698 + }, + { + "epoch": 2.866377354539214, + "grad_norm": 0.23691917955875397, + "learning_rate": 1.6864936972266996e-05, + "loss": 1.1581, + "step": 7699 + }, + { + "epoch": 2.866749659689823, + "grad_norm": 0.19211941957473755, + "learning_rate": 1.686405322163349e-05, + "loss": 1.1664, + "step": 7700 + }, + { + "epoch": 2.8671219648404325, + "grad_norm": 0.17298676073551178, + "learning_rate": 1.686316936961835e-05, + "loss": 1.1663, + "step": 7701 + }, + { + "epoch": 2.8674942699910413, + "grad_norm": 0.17014887928962708, + "learning_rate": 1.6862285416234628e-05, + "loss": 1.162, + "step": 7702 + }, + { + "epoch": 2.8678665751416506, + "grad_norm": 0.1860615760087967, + "learning_rate": 1.686140136149538e-05, + "loss": 1.1707, + "step": 7703 + }, + { + "epoch": 2.8682388802922594, + "grad_norm": 0.1729809045791626, + "learning_rate": 1.6860517205413667e-05, + "loss": 1.1786, + "step": 7704 + }, + { + "epoch": 2.8686111854428686, + "grad_norm": 0.16746124625205994, + "learning_rate": 1.6859632948002542e-05, + "loss": 1.164, + "step": 7705 + }, + { + "epoch": 2.868983490593478, + "grad_norm": 0.176571786403656, + "learning_rate": 1.685874858927507e-05, + "loss": 1.152, + "step": 7706 + }, + { + "epoch": 2.8693557957440867, + "grad_norm": 0.17990010976791382, + "learning_rate": 1.6857864129244314e-05, + "loss": 1.1736, + "step": 7707 + }, + { + "epoch": 2.8697281008946955, + "grad_norm": 0.16373847424983978, + "learning_rate": 1.6856979567923333e-05, + "loss": 1.1595, + "step": 7708 + }, + { + "epoch": 2.870100406045305, + "grad_norm": 0.16468669474124908, + "learning_rate": 1.6856094905325195e-05, + "loss": 1.1652, + "step": 7709 + }, + { + "epoch": 2.870472711195914, + "grad_norm": 0.1700790673494339, + "learning_rate": 1.6855210141462964e-05, + "loss": 1.1577, + "step": 7710 + }, + { + "epoch": 2.870845016346523, + "grad_norm": 0.17594699561595917, + "learning_rate": 1.685432527634971e-05, + "loss": 1.1672, + "step": 7711 + }, + { + "epoch": 2.871217321497132, + "grad_norm": 0.16046394407749176, + "learning_rate": 1.68534403099985e-05, + "loss": 1.1777, + "step": 7712 + }, + { + "epoch": 2.871589626647741, + "grad_norm": 0.1609521210193634, + "learning_rate": 1.685255524242241e-05, + "loss": 1.173, + "step": 7713 + }, + { + "epoch": 2.8719619317983502, + "grad_norm": 0.17106173932552338, + "learning_rate": 1.6851670073634513e-05, + "loss": 1.1632, + "step": 7714 + }, + { + "epoch": 2.8723342369489595, + "grad_norm": 0.16623558104038239, + "learning_rate": 1.685078480364787e-05, + "loss": 1.1556, + "step": 7715 + }, + { + "epoch": 2.8727065420995683, + "grad_norm": 0.1585904061794281, + "learning_rate": 1.684989943247557e-05, + "loss": 1.1718, + "step": 7716 + }, + { + "epoch": 2.8730788472501776, + "grad_norm": 0.1625899374485016, + "learning_rate": 1.6849013960130687e-05, + "loss": 1.1638, + "step": 7717 + }, + { + "epoch": 2.8734511524007864, + "grad_norm": 0.1680610626935959, + "learning_rate": 1.6848128386626297e-05, + "loss": 1.175, + "step": 7718 + }, + { + "epoch": 2.8738234575513957, + "grad_norm": 0.16707110404968262, + "learning_rate": 1.6847242711975477e-05, + "loss": 1.1724, + "step": 7719 + }, + { + "epoch": 2.8741957627020045, + "grad_norm": 0.16035796701908112, + "learning_rate": 1.684635693619131e-05, + "loss": 1.1504, + "step": 7720 + }, + { + "epoch": 2.8745680678526138, + "grad_norm": 0.1676149070262909, + "learning_rate": 1.684547105928689e-05, + "loss": 1.1657, + "step": 7721 + }, + { + "epoch": 2.8749403730032226, + "grad_norm": 0.17349836230278015, + "learning_rate": 1.6844585081275285e-05, + "loss": 1.1534, + "step": 7722 + }, + { + "epoch": 2.875312678153832, + "grad_norm": 0.16714976727962494, + "learning_rate": 1.684369900216959e-05, + "loss": 1.1686, + "step": 7723 + }, + { + "epoch": 2.875684983304441, + "grad_norm": 0.168970987200737, + "learning_rate": 1.684281282198289e-05, + "loss": 1.1644, + "step": 7724 + }, + { + "epoch": 2.87605728845505, + "grad_norm": 0.16321352124214172, + "learning_rate": 1.6841926540728276e-05, + "loss": 1.1585, + "step": 7725 + }, + { + "epoch": 2.876429593605659, + "grad_norm": 0.16680431365966797, + "learning_rate": 1.684104015841883e-05, + "loss": 1.1496, + "step": 7726 + }, + { + "epoch": 2.876801898756268, + "grad_norm": 0.1670006364583969, + "learning_rate": 1.6840153675067658e-05, + "loss": 1.1815, + "step": 7727 + }, + { + "epoch": 2.8771742039068773, + "grad_norm": 0.1647917628288269, + "learning_rate": 1.683926709068784e-05, + "loss": 1.1795, + "step": 7728 + }, + { + "epoch": 2.877546509057486, + "grad_norm": 0.16818176209926605, + "learning_rate": 1.683838040529248e-05, + "loss": 1.181, + "step": 7729 + }, + { + "epoch": 2.8779188142080954, + "grad_norm": 0.16495051980018616, + "learning_rate": 1.6837493618894666e-05, + "loss": 1.151, + "step": 7730 + }, + { + "epoch": 2.878291119358704, + "grad_norm": 0.15997083485126495, + "learning_rate": 1.6836606731507506e-05, + "loss": 1.1581, + "step": 7731 + }, + { + "epoch": 2.8786634245093135, + "grad_norm": 0.16573216021060944, + "learning_rate": 1.683571974314409e-05, + "loss": 1.1754, + "step": 7732 + }, + { + "epoch": 2.8790357296599227, + "grad_norm": 0.16260533034801483, + "learning_rate": 1.683483265381752e-05, + "loss": 1.1807, + "step": 7733 + }, + { + "epoch": 2.8794080348105315, + "grad_norm": 0.16376620531082153, + "learning_rate": 1.6833945463540906e-05, + "loss": 1.1666, + "step": 7734 + }, + { + "epoch": 2.879780339961141, + "grad_norm": 0.1692168414592743, + "learning_rate": 1.6833058172327344e-05, + "loss": 1.1674, + "step": 7735 + }, + { + "epoch": 2.8801526451117496, + "grad_norm": 0.16318558156490326, + "learning_rate": 1.683217078018994e-05, + "loss": 1.1756, + "step": 7736 + }, + { + "epoch": 2.880524950262359, + "grad_norm": 0.1628124862909317, + "learning_rate": 1.6831283287141807e-05, + "loss": 1.1631, + "step": 7737 + }, + { + "epoch": 2.8808972554129677, + "grad_norm": 0.1683143526315689, + "learning_rate": 1.6830395693196043e-05, + "loss": 1.1762, + "step": 7738 + }, + { + "epoch": 2.881269560563577, + "grad_norm": 0.16499009728431702, + "learning_rate": 1.682950799836577e-05, + "loss": 1.1689, + "step": 7739 + }, + { + "epoch": 2.881641865714186, + "grad_norm": 0.16890759766101837, + "learning_rate": 1.6828620202664086e-05, + "loss": 1.1914, + "step": 7740 + }, + { + "epoch": 2.882014170864795, + "grad_norm": 0.16511119902133942, + "learning_rate": 1.6827732306104113e-05, + "loss": 1.1627, + "step": 7741 + }, + { + "epoch": 2.8823864760154043, + "grad_norm": 0.16963976621627808, + "learning_rate": 1.6826844308698962e-05, + "loss": 1.1626, + "step": 7742 + }, + { + "epoch": 2.882758781166013, + "grad_norm": 0.17037193477153778, + "learning_rate": 1.682595621046175e-05, + "loss": 1.1666, + "step": 7743 + }, + { + "epoch": 2.8831310863166224, + "grad_norm": 0.16597189009189606, + "learning_rate": 1.682506801140559e-05, + "loss": 1.1835, + "step": 7744 + }, + { + "epoch": 2.8835033914672312, + "grad_norm": 0.2021397054195404, + "learning_rate": 1.6824179711543607e-05, + "loss": 1.1815, + "step": 7745 + }, + { + "epoch": 2.8838756966178405, + "grad_norm": 0.1731014996767044, + "learning_rate": 1.6823291310888916e-05, + "loss": 1.1642, + "step": 7746 + }, + { + "epoch": 2.8842480017684493, + "grad_norm": 0.17293405532836914, + "learning_rate": 1.682240280945464e-05, + "loss": 1.1538, + "step": 7747 + }, + { + "epoch": 2.8846203069190586, + "grad_norm": 0.1841578483581543, + "learning_rate": 1.6821514207253905e-05, + "loss": 1.1458, + "step": 7748 + }, + { + "epoch": 2.8849926120696674, + "grad_norm": 0.16726046800613403, + "learning_rate": 1.6820625504299833e-05, + "loss": 1.1729, + "step": 7749 + }, + { + "epoch": 2.8853649172202767, + "grad_norm": 0.1666422039270401, + "learning_rate": 1.6819736700605548e-05, + "loss": 1.16, + "step": 7750 + }, + { + "epoch": 2.885737222370886, + "grad_norm": 0.1761844903230667, + "learning_rate": 1.6818847796184185e-05, + "loss": 1.1787, + "step": 7751 + }, + { + "epoch": 2.8861095275214947, + "grad_norm": 0.16153545677661896, + "learning_rate": 1.6817958791048866e-05, + "loss": 1.1775, + "step": 7752 + }, + { + "epoch": 2.886481832672104, + "grad_norm": 0.16363756358623505, + "learning_rate": 1.6817069685212717e-05, + "loss": 1.1761, + "step": 7753 + }, + { + "epoch": 2.886854137822713, + "grad_norm": 0.176415354013443, + "learning_rate": 1.6816180478688885e-05, + "loss": 1.1782, + "step": 7754 + }, + { + "epoch": 2.887226442973322, + "grad_norm": 0.17358264327049255, + "learning_rate": 1.681529117149049e-05, + "loss": 1.1663, + "step": 7755 + }, + { + "epoch": 2.887598748123931, + "grad_norm": 0.1688726395368576, + "learning_rate": 1.6814401763630674e-05, + "loss": 1.1724, + "step": 7756 + }, + { + "epoch": 2.88797105327454, + "grad_norm": 0.20944684743881226, + "learning_rate": 1.6813512255122573e-05, + "loss": 1.1587, + "step": 7757 + }, + { + "epoch": 2.888343358425149, + "grad_norm": 0.20509681105613708, + "learning_rate": 1.681262264597932e-05, + "loss": 1.1472, + "step": 7758 + }, + { + "epoch": 2.8887156635757583, + "grad_norm": 0.17619788646697998, + "learning_rate": 1.6811732936214063e-05, + "loss": 1.1787, + "step": 7759 + }, + { + "epoch": 2.8890879687263675, + "grad_norm": 0.16343960165977478, + "learning_rate": 1.6810843125839934e-05, + "loss": 1.1768, + "step": 7760 + }, + { + "epoch": 2.8894602738769763, + "grad_norm": 0.1722916066646576, + "learning_rate": 1.680995321487008e-05, + "loss": 1.1698, + "step": 7761 + }, + { + "epoch": 2.8898325790275856, + "grad_norm": 0.1792430579662323, + "learning_rate": 1.6809063203317645e-05, + "loss": 1.1665, + "step": 7762 + }, + { + "epoch": 2.8902048841781944, + "grad_norm": 0.17468638718128204, + "learning_rate": 1.6808173091195774e-05, + "loss": 1.1726, + "step": 7763 + }, + { + "epoch": 2.8905771893288037, + "grad_norm": 0.1607234925031662, + "learning_rate": 1.6807282878517614e-05, + "loss": 1.1714, + "step": 7764 + }, + { + "epoch": 2.890949494479413, + "grad_norm": 0.15706735849380493, + "learning_rate": 1.680639256529631e-05, + "loss": 1.1607, + "step": 7765 + }, + { + "epoch": 2.891321799630022, + "grad_norm": 0.15637610852718353, + "learning_rate": 1.6805502151545022e-05, + "loss": 1.1658, + "step": 7766 + }, + { + "epoch": 2.8916941047806306, + "grad_norm": 0.16825786232948303, + "learning_rate": 1.6804611637276888e-05, + "loss": 1.1706, + "step": 7767 + }, + { + "epoch": 2.89206640993124, + "grad_norm": 0.18059858679771423, + "learning_rate": 1.680372102250507e-05, + "loss": 1.1694, + "step": 7768 + }, + { + "epoch": 2.892438715081849, + "grad_norm": 0.18463246524333954, + "learning_rate": 1.6802830307242716e-05, + "loss": 1.1592, + "step": 7769 + }, + { + "epoch": 2.892811020232458, + "grad_norm": 0.16244830191135406, + "learning_rate": 1.680193949150299e-05, + "loss": 1.1693, + "step": 7770 + }, + { + "epoch": 2.893183325383067, + "grad_norm": 0.1961405873298645, + "learning_rate": 1.680104857529904e-05, + "loss": 1.184, + "step": 7771 + }, + { + "epoch": 2.893555630533676, + "grad_norm": 0.19858300685882568, + "learning_rate": 1.6800157558644034e-05, + "loss": 1.1683, + "step": 7772 + }, + { + "epoch": 2.8939279356842853, + "grad_norm": 0.18224839866161346, + "learning_rate": 1.6799266441551124e-05, + "loss": 1.1785, + "step": 7773 + }, + { + "epoch": 2.8943002408348946, + "grad_norm": 0.2373683899641037, + "learning_rate": 1.679837522403348e-05, + "loss": 1.1623, + "step": 7774 + }, + { + "epoch": 2.8946725459855034, + "grad_norm": 0.16270266473293304, + "learning_rate": 1.6797483906104256e-05, + "loss": 1.1622, + "step": 7775 + }, + { + "epoch": 2.895044851136112, + "grad_norm": 0.16937381029129028, + "learning_rate": 1.679659248777662e-05, + "loss": 1.1632, + "step": 7776 + }, + { + "epoch": 2.8954171562867215, + "grad_norm": 0.16399821639060974, + "learning_rate": 1.6795700969063743e-05, + "loss": 1.1614, + "step": 7777 + }, + { + "epoch": 2.8957894614373307, + "grad_norm": 0.16400204598903656, + "learning_rate": 1.679480934997879e-05, + "loss": 1.1718, + "step": 7778 + }, + { + "epoch": 2.8961617665879396, + "grad_norm": 0.16825588047504425, + "learning_rate": 1.679391763053493e-05, + "loss": 1.1615, + "step": 7779 + }, + { + "epoch": 2.896534071738549, + "grad_norm": 0.16895346343517303, + "learning_rate": 1.679302581074533e-05, + "loss": 1.1646, + "step": 7780 + }, + { + "epoch": 2.8969063768891576, + "grad_norm": 0.16163942217826843, + "learning_rate": 1.6792133890623162e-05, + "loss": 1.1582, + "step": 7781 + }, + { + "epoch": 2.897278682039767, + "grad_norm": 0.1691490262746811, + "learning_rate": 1.6791241870181607e-05, + "loss": 1.1644, + "step": 7782 + }, + { + "epoch": 2.897650987190376, + "grad_norm": 0.16696293652057648, + "learning_rate": 1.6790349749433835e-05, + "loss": 1.1835, + "step": 7783 + }, + { + "epoch": 2.898023292340985, + "grad_norm": 0.16553637385368347, + "learning_rate": 1.678945752839302e-05, + "loss": 1.175, + "step": 7784 + }, + { + "epoch": 2.898395597491594, + "grad_norm": 0.1722661852836609, + "learning_rate": 1.678856520707235e-05, + "loss": 1.1799, + "step": 7785 + }, + { + "epoch": 2.898767902642203, + "grad_norm": 0.16318213939666748, + "learning_rate": 1.6787672785484998e-05, + "loss": 1.1511, + "step": 7786 + }, + { + "epoch": 2.8991402077928123, + "grad_norm": 0.16159263253211975, + "learning_rate": 1.678678026364414e-05, + "loss": 1.1715, + "step": 7787 + }, + { + "epoch": 2.899512512943421, + "grad_norm": 0.16238828003406525, + "learning_rate": 1.6785887641562967e-05, + "loss": 1.1655, + "step": 7788 + }, + { + "epoch": 2.8998848180940304, + "grad_norm": 0.16337160766124725, + "learning_rate": 1.6784994919254652e-05, + "loss": 1.1697, + "step": 7789 + }, + { + "epoch": 2.9002571232446392, + "grad_norm": 0.16441784799098969, + "learning_rate": 1.67841020967324e-05, + "loss": 1.1774, + "step": 7790 + }, + { + "epoch": 2.9006294283952485, + "grad_norm": 0.16245967149734497, + "learning_rate": 1.6783209174009377e-05, + "loss": 1.1751, + "step": 7791 + }, + { + "epoch": 2.9010017335458578, + "grad_norm": 0.17327164113521576, + "learning_rate": 1.678231615109878e-05, + "loss": 1.1608, + "step": 7792 + }, + { + "epoch": 2.9013740386964666, + "grad_norm": 0.16266658902168274, + "learning_rate": 1.6781423028013803e-05, + "loss": 1.1599, + "step": 7793 + }, + { + "epoch": 2.9017463438470754, + "grad_norm": 0.17774546146392822, + "learning_rate": 1.6780529804767632e-05, + "loss": 1.1655, + "step": 7794 + }, + { + "epoch": 2.9021186489976847, + "grad_norm": 0.18671812117099762, + "learning_rate": 1.6779636481373462e-05, + "loss": 1.1759, + "step": 7795 + }, + { + "epoch": 2.902490954148294, + "grad_norm": 0.1647380143404007, + "learning_rate": 1.6778743057844487e-05, + "loss": 1.1736, + "step": 7796 + }, + { + "epoch": 2.9028632592989028, + "grad_norm": 0.23281651735305786, + "learning_rate": 1.67778495341939e-05, + "loss": 1.1769, + "step": 7797 + }, + { + "epoch": 2.903235564449512, + "grad_norm": 0.1884678155183792, + "learning_rate": 1.67769559104349e-05, + "loss": 1.1746, + "step": 7798 + }, + { + "epoch": 2.903607869600121, + "grad_norm": 0.18494202196598053, + "learning_rate": 1.677606218658069e-05, + "loss": 1.1601, + "step": 7799 + }, + { + "epoch": 2.90398017475073, + "grad_norm": 0.1650589555501938, + "learning_rate": 1.6775168362644465e-05, + "loss": 1.1903, + "step": 7800 + }, + { + "epoch": 2.9043524799013394, + "grad_norm": 0.2014658898115158, + "learning_rate": 1.677427443863943e-05, + "loss": 1.1632, + "step": 7801 + }, + { + "epoch": 2.904724785051948, + "grad_norm": 0.16059400141239166, + "learning_rate": 1.6773380414578785e-05, + "loss": 1.1651, + "step": 7802 + }, + { + "epoch": 2.905097090202557, + "grad_norm": 0.1685784012079239, + "learning_rate": 1.6772486290475737e-05, + "loss": 1.1869, + "step": 7803 + }, + { + "epoch": 2.9054693953531663, + "grad_norm": 0.16995343565940857, + "learning_rate": 1.677159206634349e-05, + "loss": 1.1793, + "step": 7804 + }, + { + "epoch": 2.9058417005037755, + "grad_norm": 0.16553527116775513, + "learning_rate": 1.6770697742195256e-05, + "loss": 1.151, + "step": 7805 + }, + { + "epoch": 2.9062140056543844, + "grad_norm": 0.16535979509353638, + "learning_rate": 1.676980331804424e-05, + "loss": 1.1719, + "step": 7806 + }, + { + "epoch": 2.9065863108049936, + "grad_norm": 0.172554612159729, + "learning_rate": 1.6768908793903653e-05, + "loss": 1.1577, + "step": 7807 + }, + { + "epoch": 2.9069586159556025, + "grad_norm": 0.1674085408449173, + "learning_rate": 1.6768014169786712e-05, + "loss": 1.1716, + "step": 7808 + }, + { + "epoch": 2.9073309211062117, + "grad_norm": 0.17498281598091125, + "learning_rate": 1.676711944570662e-05, + "loss": 1.1595, + "step": 7809 + }, + { + "epoch": 2.907703226256821, + "grad_norm": 0.1705126017332077, + "learning_rate": 1.67662246216766e-05, + "loss": 1.1731, + "step": 7810 + }, + { + "epoch": 2.90807553140743, + "grad_norm": 0.1687796264886856, + "learning_rate": 1.676532969770987e-05, + "loss": 1.18, + "step": 7811 + }, + { + "epoch": 2.9084478365580386, + "grad_norm": 0.17635595798492432, + "learning_rate": 1.6764434673819644e-05, + "loss": 1.1541, + "step": 7812 + }, + { + "epoch": 2.908820141708648, + "grad_norm": 0.17208701372146606, + "learning_rate": 1.6763539550019143e-05, + "loss": 1.1639, + "step": 7813 + }, + { + "epoch": 2.909192446859257, + "grad_norm": 0.172771617770195, + "learning_rate": 1.6762644326321586e-05, + "loss": 1.1729, + "step": 7814 + }, + { + "epoch": 2.909564752009866, + "grad_norm": 0.16505715250968933, + "learning_rate": 1.6761749002740195e-05, + "loss": 1.1502, + "step": 7815 + }, + { + "epoch": 2.9099370571604752, + "grad_norm": 0.17262002825737, + "learning_rate": 1.6760853579288196e-05, + "loss": 1.1625, + "step": 7816 + }, + { + "epoch": 2.910309362311084, + "grad_norm": 0.17353248596191406, + "learning_rate": 1.675995805597882e-05, + "loss": 1.1678, + "step": 7817 + }, + { + "epoch": 2.9106816674616933, + "grad_norm": 0.18139515817165375, + "learning_rate": 1.675906243282528e-05, + "loss": 1.169, + "step": 7818 + }, + { + "epoch": 2.9110539726123026, + "grad_norm": 0.1768987476825714, + "learning_rate": 1.6758166709840815e-05, + "loss": 1.1685, + "step": 7819 + }, + { + "epoch": 2.9114262777629114, + "grad_norm": 0.15893658995628357, + "learning_rate": 1.6757270887038653e-05, + "loss": 1.1696, + "step": 7820 + }, + { + "epoch": 2.9117985829135202, + "grad_norm": 0.19786134362220764, + "learning_rate": 1.6756374964432022e-05, + "loss": 1.1686, + "step": 7821 + }, + { + "epoch": 2.9121708880641295, + "grad_norm": 0.16756854951381683, + "learning_rate": 1.675547894203416e-05, + "loss": 1.1556, + "step": 7822 + }, + { + "epoch": 2.9125431932147388, + "grad_norm": 0.17230384051799774, + "learning_rate": 1.6754582819858295e-05, + "loss": 1.19, + "step": 7823 + }, + { + "epoch": 2.9129154983653476, + "grad_norm": 0.17054490745067596, + "learning_rate": 1.6753686597917668e-05, + "loss": 1.161, + "step": 7824 + }, + { + "epoch": 2.913287803515957, + "grad_norm": 0.16075944900512695, + "learning_rate": 1.675279027622551e-05, + "loss": 1.1755, + "step": 7825 + }, + { + "epoch": 2.9136601086665657, + "grad_norm": 0.17305156588554382, + "learning_rate": 1.6751893854795068e-05, + "loss": 1.1624, + "step": 7826 + }, + { + "epoch": 2.914032413817175, + "grad_norm": 0.17909590899944305, + "learning_rate": 1.6750997333639574e-05, + "loss": 1.1666, + "step": 7827 + }, + { + "epoch": 2.914404718967784, + "grad_norm": 0.16497522592544556, + "learning_rate": 1.6750100712772276e-05, + "loss": 1.1747, + "step": 7828 + }, + { + "epoch": 2.914777024118393, + "grad_norm": 0.17517150938510895, + "learning_rate": 1.6749203992206412e-05, + "loss": 1.1543, + "step": 7829 + }, + { + "epoch": 2.915149329269002, + "grad_norm": 0.17387013137340546, + "learning_rate": 1.6748307171955226e-05, + "loss": 1.1769, + "step": 7830 + }, + { + "epoch": 2.915521634419611, + "grad_norm": 0.1653532087802887, + "learning_rate": 1.674741025203197e-05, + "loss": 1.1586, + "step": 7831 + }, + { + "epoch": 2.9158939395702204, + "grad_norm": 0.17375896871089935, + "learning_rate": 1.6746513232449888e-05, + "loss": 1.1712, + "step": 7832 + }, + { + "epoch": 2.916266244720829, + "grad_norm": 0.16700343787670135, + "learning_rate": 1.6745616113222228e-05, + "loss": 1.1565, + "step": 7833 + }, + { + "epoch": 2.9166385498714384, + "grad_norm": 0.15995582938194275, + "learning_rate": 1.6744718894362243e-05, + "loss": 1.1726, + "step": 7834 + }, + { + "epoch": 2.9170108550220473, + "grad_norm": 0.16936053335666656, + "learning_rate": 1.674382157588318e-05, + "loss": 1.1607, + "step": 7835 + }, + { + "epoch": 2.9173831601726565, + "grad_norm": 0.16181506216526031, + "learning_rate": 1.6742924157798302e-05, + "loss": 1.1635, + "step": 7836 + }, + { + "epoch": 2.917755465323266, + "grad_norm": 0.1655145138502121, + "learning_rate": 1.674202664012085e-05, + "loss": 1.1594, + "step": 7837 + }, + { + "epoch": 2.9181277704738746, + "grad_norm": 0.1637190282344818, + "learning_rate": 1.674112902286409e-05, + "loss": 1.1595, + "step": 7838 + }, + { + "epoch": 2.918500075624484, + "grad_norm": 0.16163207590579987, + "learning_rate": 1.674023130604128e-05, + "loss": 1.1678, + "step": 7839 + }, + { + "epoch": 2.9188723807750927, + "grad_norm": 0.1664661020040512, + "learning_rate": 1.6739333489665672e-05, + "loss": 1.1553, + "step": 7840 + }, + { + "epoch": 2.919244685925702, + "grad_norm": 0.1619381159543991, + "learning_rate": 1.6738435573750535e-05, + "loss": 1.1539, + "step": 7841 + }, + { + "epoch": 2.919616991076311, + "grad_norm": 0.16180214285850525, + "learning_rate": 1.6737537558309128e-05, + "loss": 1.1754, + "step": 7842 + }, + { + "epoch": 2.91998929622692, + "grad_norm": 0.17024828493595123, + "learning_rate": 1.6736639443354712e-05, + "loss": 1.1597, + "step": 7843 + }, + { + "epoch": 2.920361601377529, + "grad_norm": 0.16076843440532684, + "learning_rate": 1.6735741228900556e-05, + "loss": 1.169, + "step": 7844 + }, + { + "epoch": 2.920733906528138, + "grad_norm": 0.16919641196727753, + "learning_rate": 1.673484291495992e-05, + "loss": 1.179, + "step": 7845 + }, + { + "epoch": 2.9211062116787474, + "grad_norm": 0.1667402684688568, + "learning_rate": 1.673394450154608e-05, + "loss": 1.1683, + "step": 7846 + }, + { + "epoch": 2.921478516829356, + "grad_norm": 0.16661867499351501, + "learning_rate": 1.6733045988672306e-05, + "loss": 1.1734, + "step": 7847 + }, + { + "epoch": 2.9218508219799655, + "grad_norm": 0.16996459662914276, + "learning_rate": 1.673214737635186e-05, + "loss": 1.169, + "step": 7848 + }, + { + "epoch": 2.9222231271305743, + "grad_norm": 0.16732630133628845, + "learning_rate": 1.6731248664598023e-05, + "loss": 1.157, + "step": 7849 + }, + { + "epoch": 2.9225954322811836, + "grad_norm": 0.1656404733657837, + "learning_rate": 1.6730349853424064e-05, + "loss": 1.1684, + "step": 7850 + }, + { + "epoch": 2.9229677374317924, + "grad_norm": 0.16698972880840302, + "learning_rate": 1.6729450942843256e-05, + "loss": 1.1607, + "step": 7851 + }, + { + "epoch": 2.9233400425824017, + "grad_norm": 0.18779774010181427, + "learning_rate": 1.6728551932868885e-05, + "loss": 1.1719, + "step": 7852 + }, + { + "epoch": 2.9237123477330105, + "grad_norm": 0.1793741136789322, + "learning_rate": 1.6727652823514225e-05, + "loss": 1.1573, + "step": 7853 + }, + { + "epoch": 2.9240846528836197, + "grad_norm": 0.17160984873771667, + "learning_rate": 1.6726753614792555e-05, + "loss": 1.1786, + "step": 7854 + }, + { + "epoch": 2.924456958034229, + "grad_norm": 0.16810734570026398, + "learning_rate": 1.6725854306717155e-05, + "loss": 1.1771, + "step": 7855 + }, + { + "epoch": 2.924829263184838, + "grad_norm": 0.1664050966501236, + "learning_rate": 1.6724954899301308e-05, + "loss": 1.1654, + "step": 7856 + }, + { + "epoch": 2.925201568335447, + "grad_norm": 0.19625960290431976, + "learning_rate": 1.6724055392558302e-05, + "loss": 1.1529, + "step": 7857 + }, + { + "epoch": 2.925573873486056, + "grad_norm": 0.20255877077579498, + "learning_rate": 1.6723155786501414e-05, + "loss": 1.1741, + "step": 7858 + }, + { + "epoch": 2.925946178636665, + "grad_norm": 0.1825208216905594, + "learning_rate": 1.672225608114394e-05, + "loss": 1.1686, + "step": 7859 + }, + { + "epoch": 2.926318483787274, + "grad_norm": 0.3680337071418762, + "learning_rate": 1.672135627649917e-05, + "loss": 1.1604, + "step": 7860 + }, + { + "epoch": 2.9266907889378833, + "grad_norm": 0.18671971559524536, + "learning_rate": 1.672045637258038e-05, + "loss": 1.166, + "step": 7861 + }, + { + "epoch": 2.927063094088492, + "grad_norm": 0.1835385113954544, + "learning_rate": 1.671955636940088e-05, + "loss": 1.1751, + "step": 7862 + }, + { + "epoch": 2.9274353992391013, + "grad_norm": 0.15858596563339233, + "learning_rate": 1.6718656266973952e-05, + "loss": 1.1597, + "step": 7863 + }, + { + "epoch": 2.9278077043897106, + "grad_norm": 0.16256222128868103, + "learning_rate": 1.6717756065312892e-05, + "loss": 1.1723, + "step": 7864 + }, + { + "epoch": 2.9281800095403194, + "grad_norm": 0.17425408959388733, + "learning_rate": 1.6716855764430995e-05, + "loss": 1.1597, + "step": 7865 + }, + { + "epoch": 2.9285523146909287, + "grad_norm": 0.16741712391376495, + "learning_rate": 1.6715955364341563e-05, + "loss": 1.168, + "step": 7866 + }, + { + "epoch": 2.9289246198415375, + "grad_norm": 0.1664438098669052, + "learning_rate": 1.671505486505789e-05, + "loss": 1.1784, + "step": 7867 + }, + { + "epoch": 2.9292969249921468, + "grad_norm": 0.15664342045783997, + "learning_rate": 1.6714154266593277e-05, + "loss": 1.1586, + "step": 7868 + }, + { + "epoch": 2.9296692301427556, + "grad_norm": 0.1611543446779251, + "learning_rate": 1.671325356896103e-05, + "loss": 1.1643, + "step": 7869 + }, + { + "epoch": 2.930041535293365, + "grad_norm": 0.167099729180336, + "learning_rate": 1.6712352772174444e-05, + "loss": 1.1622, + "step": 7870 + }, + { + "epoch": 2.9304138404439737, + "grad_norm": 0.16537703573703766, + "learning_rate": 1.6711451876246833e-05, + "loss": 1.1692, + "step": 7871 + }, + { + "epoch": 2.930786145594583, + "grad_norm": 0.16481894254684448, + "learning_rate": 1.6710550881191498e-05, + "loss": 1.1521, + "step": 7872 + }, + { + "epoch": 2.931158450745192, + "grad_norm": 0.1629692018032074, + "learning_rate": 1.6709649787021748e-05, + "loss": 1.1602, + "step": 7873 + }, + { + "epoch": 2.931530755895801, + "grad_norm": 0.16847184300422668, + "learning_rate": 1.6708748593750888e-05, + "loss": 1.1644, + "step": 7874 + }, + { + "epoch": 2.9319030610464103, + "grad_norm": 0.1654757410287857, + "learning_rate": 1.6707847301392237e-05, + "loss": 1.1596, + "step": 7875 + }, + { + "epoch": 2.932275366197019, + "grad_norm": 0.16103285551071167, + "learning_rate": 1.67069459099591e-05, + "loss": 1.1816, + "step": 7876 + }, + { + "epoch": 2.9326476713476284, + "grad_norm": 0.15957196056842804, + "learning_rate": 1.6706044419464792e-05, + "loss": 1.1675, + "step": 7877 + }, + { + "epoch": 2.933019976498237, + "grad_norm": 0.16220717132091522, + "learning_rate": 1.670514282992263e-05, + "loss": 1.1758, + "step": 7878 + }, + { + "epoch": 2.9333922816488465, + "grad_norm": 0.16329532861709595, + "learning_rate": 1.670424114134593e-05, + "loss": 1.1742, + "step": 7879 + }, + { + "epoch": 2.9337645867994553, + "grad_norm": 0.16387712955474854, + "learning_rate": 1.6703339353748006e-05, + "loss": 1.1657, + "step": 7880 + }, + { + "epoch": 2.9341368919500646, + "grad_norm": 0.15758801996707916, + "learning_rate": 1.6702437467142186e-05, + "loss": 1.1537, + "step": 7881 + }, + { + "epoch": 2.934509197100674, + "grad_norm": 0.16278088092803955, + "learning_rate": 1.6701535481541783e-05, + "loss": 1.1738, + "step": 7882 + }, + { + "epoch": 2.9348815022512826, + "grad_norm": 0.16199669241905212, + "learning_rate": 1.670063339696012e-05, + "loss": 1.1741, + "step": 7883 + }, + { + "epoch": 2.935253807401892, + "grad_norm": 0.164675772190094, + "learning_rate": 1.6699731213410524e-05, + "loss": 1.1584, + "step": 7884 + }, + { + "epoch": 2.9356261125525007, + "grad_norm": 0.16404102742671967, + "learning_rate": 1.6698828930906316e-05, + "loss": 1.1697, + "step": 7885 + }, + { + "epoch": 2.93599841770311, + "grad_norm": 0.16526629030704498, + "learning_rate": 1.6697926549460826e-05, + "loss": 1.1623, + "step": 7886 + }, + { + "epoch": 2.9363707228537193, + "grad_norm": 0.15449416637420654, + "learning_rate": 1.669702406908738e-05, + "loss": 1.1646, + "step": 7887 + }, + { + "epoch": 2.936743028004328, + "grad_norm": 0.1667950302362442, + "learning_rate": 1.6696121489799314e-05, + "loss": 1.1745, + "step": 7888 + }, + { + "epoch": 2.937115333154937, + "grad_norm": 0.16033466160297394, + "learning_rate": 1.669521881160995e-05, + "loss": 1.1658, + "step": 7889 + }, + { + "epoch": 2.937487638305546, + "grad_norm": 0.1629643738269806, + "learning_rate": 1.6694316034532626e-05, + "loss": 1.177, + "step": 7890 + }, + { + "epoch": 2.9378599434561554, + "grad_norm": 0.15993858873844147, + "learning_rate": 1.6693413158580672e-05, + "loss": 1.1587, + "step": 7891 + }, + { + "epoch": 2.9382322486067642, + "grad_norm": 0.16693513095378876, + "learning_rate": 1.6692510183767424e-05, + "loss": 1.1738, + "step": 7892 + }, + { + "epoch": 2.9386045537573735, + "grad_norm": 0.1641031950712204, + "learning_rate": 1.6691607110106223e-05, + "loss": 1.1536, + "step": 7893 + }, + { + "epoch": 2.9389768589079823, + "grad_norm": 0.15620005130767822, + "learning_rate": 1.6690703937610406e-05, + "loss": 1.1768, + "step": 7894 + }, + { + "epoch": 2.9393491640585916, + "grad_norm": 0.16095943748950958, + "learning_rate": 1.668980066629331e-05, + "loss": 1.1693, + "step": 7895 + }, + { + "epoch": 2.939721469209201, + "grad_norm": 0.1613335907459259, + "learning_rate": 1.668889729616828e-05, + "loss": 1.1716, + "step": 7896 + }, + { + "epoch": 2.9400937743598097, + "grad_norm": 0.1632033735513687, + "learning_rate": 1.6687993827248657e-05, + "loss": 1.1618, + "step": 7897 + }, + { + "epoch": 2.9404660795104185, + "grad_norm": 0.16224397718906403, + "learning_rate": 1.6687090259547782e-05, + "loss": 1.1716, + "step": 7898 + }, + { + "epoch": 2.9408383846610278, + "grad_norm": 0.16363045573234558, + "learning_rate": 1.6686186593079003e-05, + "loss": 1.1671, + "step": 7899 + }, + { + "epoch": 2.941210689811637, + "grad_norm": 0.1638343632221222, + "learning_rate": 1.6685282827855672e-05, + "loss": 1.1756, + "step": 7900 + }, + { + "epoch": 2.941582994962246, + "grad_norm": 0.16511334478855133, + "learning_rate": 1.668437896389113e-05, + "loss": 1.1802, + "step": 7901 + }, + { + "epoch": 2.941955300112855, + "grad_norm": 0.16726627945899963, + "learning_rate": 1.6683475001198733e-05, + "loss": 1.1729, + "step": 7902 + }, + { + "epoch": 2.942327605263464, + "grad_norm": 0.15701860189437866, + "learning_rate": 1.6682570939791827e-05, + "loss": 1.1546, + "step": 7903 + }, + { + "epoch": 2.942699910414073, + "grad_norm": 0.16227418184280396, + "learning_rate": 1.668166677968377e-05, + "loss": 1.1613, + "step": 7904 + }, + { + "epoch": 2.9430722155646825, + "grad_norm": 0.16237196326255798, + "learning_rate": 1.668076252088791e-05, + "loss": 1.1529, + "step": 7905 + }, + { + "epoch": 2.9434445207152913, + "grad_norm": 0.15888628363609314, + "learning_rate": 1.6679858163417607e-05, + "loss": 1.1825, + "step": 7906 + }, + { + "epoch": 2.9438168258659, + "grad_norm": 0.17086133360862732, + "learning_rate": 1.667895370728622e-05, + "loss": 1.1891, + "step": 7907 + }, + { + "epoch": 2.9441891310165094, + "grad_norm": 0.16408465802669525, + "learning_rate": 1.667804915250711e-05, + "loss": 1.1658, + "step": 7908 + }, + { + "epoch": 2.9445614361671186, + "grad_norm": 0.15986572206020355, + "learning_rate": 1.6677144499093626e-05, + "loss": 1.1581, + "step": 7909 + }, + { + "epoch": 2.9449337413177274, + "grad_norm": 0.164417564868927, + "learning_rate": 1.667623974705914e-05, + "loss": 1.1553, + "step": 7910 + }, + { + "epoch": 2.9453060464683367, + "grad_norm": 0.16091735661029816, + "learning_rate": 1.6675334896417014e-05, + "loss": 1.1755, + "step": 7911 + }, + { + "epoch": 2.9456783516189455, + "grad_norm": 0.16419173777103424, + "learning_rate": 1.6674429947180607e-05, + "loss": 1.169, + "step": 7912 + }, + { + "epoch": 2.946050656769555, + "grad_norm": 0.17175257205963135, + "learning_rate": 1.667352489936329e-05, + "loss": 1.1745, + "step": 7913 + }, + { + "epoch": 2.946422961920164, + "grad_norm": 0.16759838163852692, + "learning_rate": 1.6672619752978428e-05, + "loss": 1.1677, + "step": 7914 + }, + { + "epoch": 2.946795267070773, + "grad_norm": 0.16285474598407745, + "learning_rate": 1.6671714508039394e-05, + "loss": 1.1712, + "step": 7915 + }, + { + "epoch": 2.9471675722213817, + "grad_norm": 0.16049782931804657, + "learning_rate": 1.6670809164559553e-05, + "loss": 1.1513, + "step": 7916 + }, + { + "epoch": 2.947539877371991, + "grad_norm": 0.16227254271507263, + "learning_rate": 1.666990372255228e-05, + "loss": 1.1794, + "step": 7917 + }, + { + "epoch": 2.9479121825226002, + "grad_norm": 0.16385634243488312, + "learning_rate": 1.6668998182030945e-05, + "loss": 1.179, + "step": 7918 + }, + { + "epoch": 2.948284487673209, + "grad_norm": 0.15982592105865479, + "learning_rate": 1.666809254300893e-05, + "loss": 1.1648, + "step": 7919 + }, + { + "epoch": 2.9486567928238183, + "grad_norm": 0.16695238649845123, + "learning_rate": 1.6667186805499605e-05, + "loss": 1.1793, + "step": 7920 + }, + { + "epoch": 2.949029097974427, + "grad_norm": 0.1569126844406128, + "learning_rate": 1.666628096951635e-05, + "loss": 1.165, + "step": 7921 + }, + { + "epoch": 2.9494014031250364, + "grad_norm": 0.1580849289894104, + "learning_rate": 1.666537503507254e-05, + "loss": 1.1664, + "step": 7922 + }, + { + "epoch": 2.9497737082756457, + "grad_norm": 0.16597579419612885, + "learning_rate": 1.6664469002181562e-05, + "loss": 1.1624, + "step": 7923 + }, + { + "epoch": 2.9501460134262545, + "grad_norm": 0.16480037569999695, + "learning_rate": 1.6663562870856793e-05, + "loss": 1.1796, + "step": 7924 + }, + { + "epoch": 2.9505183185768633, + "grad_norm": 0.16459214687347412, + "learning_rate": 1.6662656641111623e-05, + "loss": 1.1583, + "step": 7925 + }, + { + "epoch": 2.9508906237274726, + "grad_norm": 0.1645277440547943, + "learning_rate": 1.666175031295943e-05, + "loss": 1.173, + "step": 7926 + }, + { + "epoch": 2.951262928878082, + "grad_norm": 0.16501055657863617, + "learning_rate": 1.66608438864136e-05, + "loss": 1.1613, + "step": 7927 + }, + { + "epoch": 2.9516352340286907, + "grad_norm": 0.16335533559322357, + "learning_rate": 1.6659937361487527e-05, + "loss": 1.1732, + "step": 7928 + }, + { + "epoch": 2.9520075391793, + "grad_norm": 0.16024471819400787, + "learning_rate": 1.6659030738194594e-05, + "loss": 1.1738, + "step": 7929 + }, + { + "epoch": 2.9523798443299087, + "grad_norm": 0.15945962071418762, + "learning_rate": 1.66581240165482e-05, + "loss": 1.1485, + "step": 7930 + }, + { + "epoch": 2.952752149480518, + "grad_norm": 0.15847419202327728, + "learning_rate": 1.6657217196561727e-05, + "loss": 1.1606, + "step": 7931 + }, + { + "epoch": 2.9531244546311273, + "grad_norm": 0.1631278246641159, + "learning_rate": 1.6656310278248577e-05, + "loss": 1.1659, + "step": 7932 + }, + { + "epoch": 2.953496759781736, + "grad_norm": 0.1575775444507599, + "learning_rate": 1.6655403261622143e-05, + "loss": 1.1682, + "step": 7933 + }, + { + "epoch": 2.953869064932345, + "grad_norm": 0.16149340569972992, + "learning_rate": 1.6654496146695817e-05, + "loss": 1.1578, + "step": 7934 + }, + { + "epoch": 2.954241370082954, + "grad_norm": 0.16209939122200012, + "learning_rate": 1.6653588933483003e-05, + "loss": 1.1566, + "step": 7935 + }, + { + "epoch": 2.9546136752335634, + "grad_norm": 0.15919779241085052, + "learning_rate": 1.6652681621997095e-05, + "loss": 1.1714, + "step": 7936 + }, + { + "epoch": 2.9549859803841723, + "grad_norm": 0.15968568623065948, + "learning_rate": 1.66517742122515e-05, + "loss": 1.1729, + "step": 7937 + }, + { + "epoch": 2.9553582855347815, + "grad_norm": 0.16550806164741516, + "learning_rate": 1.6650866704259615e-05, + "loss": 1.1591, + "step": 7938 + }, + { + "epoch": 2.9557305906853903, + "grad_norm": 0.16333597898483276, + "learning_rate": 1.664995909803485e-05, + "loss": 1.1571, + "step": 7939 + }, + { + "epoch": 2.9561028958359996, + "grad_norm": 0.1608911156654358, + "learning_rate": 1.6649051393590605e-05, + "loss": 1.1608, + "step": 7940 + }, + { + "epoch": 2.956475200986609, + "grad_norm": 0.1644187867641449, + "learning_rate": 1.6648143590940286e-05, + "loss": 1.1719, + "step": 7941 + }, + { + "epoch": 2.9568475061372177, + "grad_norm": 0.1601906716823578, + "learning_rate": 1.6647235690097303e-05, + "loss": 1.1627, + "step": 7942 + }, + { + "epoch": 2.9572198112878265, + "grad_norm": 0.15772992372512817, + "learning_rate": 1.6646327691075067e-05, + "loss": 1.1456, + "step": 7943 + }, + { + "epoch": 2.957592116438436, + "grad_norm": 0.15940575301647186, + "learning_rate": 1.664541959388699e-05, + "loss": 1.1709, + "step": 7944 + }, + { + "epoch": 2.957964421589045, + "grad_norm": 0.16511191427707672, + "learning_rate": 1.664451139854648e-05, + "loss": 1.1746, + "step": 7945 + }, + { + "epoch": 2.958336726739654, + "grad_norm": 0.15787945687770844, + "learning_rate": 1.6643603105066955e-05, + "loss": 1.1639, + "step": 7946 + }, + { + "epoch": 2.958709031890263, + "grad_norm": 0.15846529603004456, + "learning_rate": 1.664269471346183e-05, + "loss": 1.1778, + "step": 7947 + }, + { + "epoch": 2.959081337040872, + "grad_norm": 0.16118839383125305, + "learning_rate": 1.6641786223744518e-05, + "loss": 1.1771, + "step": 7948 + }, + { + "epoch": 2.959453642191481, + "grad_norm": 0.16540107131004333, + "learning_rate": 1.6640877635928446e-05, + "loss": 1.1726, + "step": 7949 + }, + { + "epoch": 2.9598259473420905, + "grad_norm": 0.15974678099155426, + "learning_rate": 1.6639968950027023e-05, + "loss": 1.1715, + "step": 7950 + }, + { + "epoch": 2.9601982524926993, + "grad_norm": 0.15811410546302795, + "learning_rate": 1.663906016605368e-05, + "loss": 1.1567, + "step": 7951 + }, + { + "epoch": 2.960570557643308, + "grad_norm": 0.16194650530815125, + "learning_rate": 1.6638151284021828e-05, + "loss": 1.1735, + "step": 7952 + }, + { + "epoch": 2.9609428627939174, + "grad_norm": 0.1610521525144577, + "learning_rate": 1.66372423039449e-05, + "loss": 1.1847, + "step": 7953 + }, + { + "epoch": 2.9613151679445266, + "grad_norm": 0.16063009202480316, + "learning_rate": 1.6636333225836323e-05, + "loss": 1.1642, + "step": 7954 + }, + { + "epoch": 2.9616874730951355, + "grad_norm": 0.16281422972679138, + "learning_rate": 1.663542404970952e-05, + "loss": 1.1756, + "step": 7955 + }, + { + "epoch": 2.9620597782457447, + "grad_norm": 0.15930494666099548, + "learning_rate": 1.663451477557792e-05, + "loss": 1.1409, + "step": 7956 + }, + { + "epoch": 2.9624320833963536, + "grad_norm": 0.16165930032730103, + "learning_rate": 1.6633605403454952e-05, + "loss": 1.1662, + "step": 7957 + }, + { + "epoch": 2.962804388546963, + "grad_norm": 0.16098619997501373, + "learning_rate": 1.6632695933354052e-05, + "loss": 1.1678, + "step": 7958 + }, + { + "epoch": 2.963176693697572, + "grad_norm": 0.16247297823429108, + "learning_rate": 1.663178636528864e-05, + "loss": 1.1676, + "step": 7959 + }, + { + "epoch": 2.963548998848181, + "grad_norm": 0.16938212513923645, + "learning_rate": 1.663087669927217e-05, + "loss": 1.1518, + "step": 7960 + }, + { + "epoch": 2.96392130399879, + "grad_norm": 0.16098153591156006, + "learning_rate": 1.6629966935318062e-05, + "loss": 1.1678, + "step": 7961 + }, + { + "epoch": 2.964293609149399, + "grad_norm": 0.16554594039916992, + "learning_rate": 1.662905707343976e-05, + "loss": 1.1628, + "step": 7962 + }, + { + "epoch": 2.9646659143000083, + "grad_norm": 0.16836988925933838, + "learning_rate": 1.6628147113650703e-05, + "loss": 1.1883, + "step": 7963 + }, + { + "epoch": 2.965038219450617, + "grad_norm": 0.16309542953968048, + "learning_rate": 1.6627237055964324e-05, + "loss": 1.1766, + "step": 7964 + }, + { + "epoch": 2.9654105246012263, + "grad_norm": 0.16153202950954437, + "learning_rate": 1.6626326900394073e-05, + "loss": 1.1642, + "step": 7965 + }, + { + "epoch": 2.965782829751835, + "grad_norm": 0.16180694103240967, + "learning_rate": 1.662541664695339e-05, + "loss": 1.1684, + "step": 7966 + }, + { + "epoch": 2.9661551349024444, + "grad_norm": 0.16211725771427155, + "learning_rate": 1.662450629565572e-05, + "loss": 1.1634, + "step": 7967 + }, + { + "epoch": 2.9665274400530537, + "grad_norm": 0.16405652463436127, + "learning_rate": 1.6623595846514503e-05, + "loss": 1.1675, + "step": 7968 + }, + { + "epoch": 2.9668997452036625, + "grad_norm": 0.1562814861536026, + "learning_rate": 1.662268529954319e-05, + "loss": 1.1668, + "step": 7969 + }, + { + "epoch": 2.9672720503542718, + "grad_norm": 0.161887064576149, + "learning_rate": 1.6621774654755238e-05, + "loss": 1.1572, + "step": 7970 + }, + { + "epoch": 2.9676443555048806, + "grad_norm": 0.16800396144390106, + "learning_rate": 1.6620863912164086e-05, + "loss": 1.1691, + "step": 7971 + }, + { + "epoch": 2.96801666065549, + "grad_norm": 0.16673104465007782, + "learning_rate": 1.661995307178319e-05, + "loss": 1.1746, + "step": 7972 + }, + { + "epoch": 2.9683889658060987, + "grad_norm": 0.17055685818195343, + "learning_rate": 1.6619042133626003e-05, + "loss": 1.1713, + "step": 7973 + }, + { + "epoch": 2.968761270956708, + "grad_norm": 0.16628359258174896, + "learning_rate": 1.661813109770598e-05, + "loss": 1.1812, + "step": 7974 + }, + { + "epoch": 2.9691335761073168, + "grad_norm": 0.16214534640312195, + "learning_rate": 1.6617219964036572e-05, + "loss": 1.1605, + "step": 7975 + }, + { + "epoch": 2.969505881257926, + "grad_norm": 0.16540850698947906, + "learning_rate": 1.6616308732631245e-05, + "loss": 1.1694, + "step": 7976 + }, + { + "epoch": 2.9698781864085353, + "grad_norm": 0.16304931044578552, + "learning_rate": 1.6615397403503452e-05, + "loss": 1.1558, + "step": 7977 + }, + { + "epoch": 2.970250491559144, + "grad_norm": 0.16456028819084167, + "learning_rate": 1.661448597666665e-05, + "loss": 1.1587, + "step": 7978 + }, + { + "epoch": 2.9706227967097534, + "grad_norm": 0.1654767096042633, + "learning_rate": 1.6613574452134314e-05, + "loss": 1.1568, + "step": 7979 + }, + { + "epoch": 2.970995101860362, + "grad_norm": 0.16024528443813324, + "learning_rate": 1.6612662829919894e-05, + "loss": 1.1774, + "step": 7980 + }, + { + "epoch": 2.9713674070109715, + "grad_norm": 0.1553923636674881, + "learning_rate": 1.6611751110036856e-05, + "loss": 1.1664, + "step": 7981 + }, + { + "epoch": 2.9717397121615803, + "grad_norm": 0.16233742237091064, + "learning_rate": 1.661083929249867e-05, + "loss": 1.166, + "step": 7982 + }, + { + "epoch": 2.9721120173121895, + "grad_norm": 0.16320884227752686, + "learning_rate": 1.6609927377318804e-05, + "loss": 1.1617, + "step": 7983 + }, + { + "epoch": 2.9724843224627984, + "grad_norm": 0.16230836510658264, + "learning_rate": 1.6609015364510726e-05, + "loss": 1.157, + "step": 7984 + }, + { + "epoch": 2.9728566276134076, + "grad_norm": 0.16287636756896973, + "learning_rate": 1.6608103254087905e-05, + "loss": 1.1753, + "step": 7985 + }, + { + "epoch": 2.973228932764017, + "grad_norm": 0.16399414837360382, + "learning_rate": 1.6607191046063815e-05, + "loss": 1.1643, + "step": 7986 + }, + { + "epoch": 2.9736012379146257, + "grad_norm": 0.158194899559021, + "learning_rate": 1.6606278740451927e-05, + "loss": 1.1791, + "step": 7987 + }, + { + "epoch": 2.973973543065235, + "grad_norm": 0.16402703523635864, + "learning_rate": 1.660536633726572e-05, + "loss": 1.1695, + "step": 7988 + }, + { + "epoch": 2.974345848215844, + "grad_norm": 0.1630365401506424, + "learning_rate": 1.6604453836518658e-05, + "loss": 1.1654, + "step": 7989 + }, + { + "epoch": 2.974718153366453, + "grad_norm": 0.16490447521209717, + "learning_rate": 1.6603541238224235e-05, + "loss": 1.1718, + "step": 7990 + }, + { + "epoch": 2.975090458517062, + "grad_norm": 0.15666000545024872, + "learning_rate": 1.660262854239592e-05, + "loss": 1.1588, + "step": 7991 + }, + { + "epoch": 2.975462763667671, + "grad_norm": 0.16078460216522217, + "learning_rate": 1.6601715749047195e-05, + "loss": 1.1669, + "step": 7992 + }, + { + "epoch": 2.97583506881828, + "grad_norm": 0.16264961659908295, + "learning_rate": 1.6600802858191543e-05, + "loss": 1.1733, + "step": 7993 + }, + { + "epoch": 2.9762073739688892, + "grad_norm": 0.16248923540115356, + "learning_rate": 1.6599889869842447e-05, + "loss": 1.1615, + "step": 7994 + }, + { + "epoch": 2.9765796791194985, + "grad_norm": 0.16207122802734375, + "learning_rate": 1.6598976784013394e-05, + "loss": 1.1652, + "step": 7995 + }, + { + "epoch": 2.9769519842701073, + "grad_norm": 0.1700456589460373, + "learning_rate": 1.6598063600717865e-05, + "loss": 1.1734, + "step": 7996 + }, + { + "epoch": 2.9773242894207166, + "grad_norm": 0.1640198975801468, + "learning_rate": 1.659715031996935e-05, + "loss": 1.1636, + "step": 7997 + }, + { + "epoch": 2.9776965945713254, + "grad_norm": 0.16763059794902802, + "learning_rate": 1.6596236941781342e-05, + "loss": 1.1581, + "step": 7998 + }, + { + "epoch": 2.9780688997219347, + "grad_norm": 0.16505372524261475, + "learning_rate": 1.6595323466167327e-05, + "loss": 1.1632, + "step": 7999 + }, + { + "epoch": 2.978441204872544, + "grad_norm": 0.16153225302696228, + "learning_rate": 1.6594409893140796e-05, + "loss": 1.1629, + "step": 8000 + }, + { + "epoch": 2.978441204872544, + "eval_loss": 1.2953892946243286, + "eval_runtime": 16.6947, + "eval_samples_per_second": 103.865, + "eval_steps_per_second": 5.211, + "step": 8000 + }, + { + "epoch": 2.9788135100231528, + "grad_norm": 0.163248211145401, + "learning_rate": 1.659349622271525e-05, + "loss": 1.1506, + "step": 8001 + }, + { + "epoch": 2.9791858151737616, + "grad_norm": 0.16736340522766113, + "learning_rate": 1.6592582454904176e-05, + "loss": 1.164, + "step": 8002 + }, + { + "epoch": 2.979558120324371, + "grad_norm": 0.164361834526062, + "learning_rate": 1.659166858972107e-05, + "loss": 1.1693, + "step": 8003 + }, + { + "epoch": 2.97993042547498, + "grad_norm": 0.16543278098106384, + "learning_rate": 1.6590754627179438e-05, + "loss": 1.1647, + "step": 8004 + }, + { + "epoch": 2.980302730625589, + "grad_norm": 0.1678323745727539, + "learning_rate": 1.658984056729277e-05, + "loss": 1.1628, + "step": 8005 + }, + { + "epoch": 2.980675035776198, + "grad_norm": 0.16372528672218323, + "learning_rate": 1.6588926410074573e-05, + "loss": 1.1664, + "step": 8006 + }, + { + "epoch": 2.981047340926807, + "grad_norm": 0.1635918915271759, + "learning_rate": 1.6588012155538343e-05, + "loss": 1.1571, + "step": 8007 + }, + { + "epoch": 2.9814196460774163, + "grad_norm": 0.1657022386789322, + "learning_rate": 1.658709780369759e-05, + "loss": 1.174, + "step": 8008 + }, + { + "epoch": 2.9817919512280255, + "grad_norm": 0.1680508553981781, + "learning_rate": 1.6586183354565814e-05, + "loss": 1.1591, + "step": 8009 + }, + { + "epoch": 2.9821642563786344, + "grad_norm": 0.16020500659942627, + "learning_rate": 1.658526880815652e-05, + "loss": 1.1726, + "step": 8010 + }, + { + "epoch": 2.982536561529243, + "grad_norm": 0.16022956371307373, + "learning_rate": 1.6584354164483225e-05, + "loss": 1.1781, + "step": 8011 + }, + { + "epoch": 2.9829088666798524, + "grad_norm": 0.16404716670513153, + "learning_rate": 1.658343942355943e-05, + "loss": 1.1735, + "step": 8012 + }, + { + "epoch": 2.9832811718304617, + "grad_norm": 0.1646728813648224, + "learning_rate": 1.6582524585398647e-05, + "loss": 1.1829, + "step": 8013 + }, + { + "epoch": 2.9836534769810705, + "grad_norm": 0.16547395288944244, + "learning_rate": 1.658160965001439e-05, + "loss": 1.1801, + "step": 8014 + }, + { + "epoch": 2.98402578213168, + "grad_norm": 0.16899387538433075, + "learning_rate": 1.6580694617420173e-05, + "loss": 1.1598, + "step": 8015 + }, + { + "epoch": 2.9843980872822886, + "grad_norm": 0.1721050888299942, + "learning_rate": 1.6579779487629508e-05, + "loss": 1.165, + "step": 8016 + }, + { + "epoch": 2.984770392432898, + "grad_norm": 0.166593998670578, + "learning_rate": 1.657886426065591e-05, + "loss": 1.1621, + "step": 8017 + }, + { + "epoch": 2.985142697583507, + "grad_norm": 0.16592232882976532, + "learning_rate": 1.6577948936512905e-05, + "loss": 1.1688, + "step": 8018 + }, + { + "epoch": 2.985515002734116, + "grad_norm": 0.16835105419158936, + "learning_rate": 1.6577033515214e-05, + "loss": 1.1917, + "step": 8019 + }, + { + "epoch": 2.985887307884725, + "grad_norm": 0.16211645305156708, + "learning_rate": 1.6576117996772728e-05, + "loss": 1.1697, + "step": 8020 + }, + { + "epoch": 2.986259613035334, + "grad_norm": 0.16470582783222198, + "learning_rate": 1.657520238120261e-05, + "loss": 1.1691, + "step": 8021 + }, + { + "epoch": 2.9866319181859433, + "grad_norm": 0.16480398178100586, + "learning_rate": 1.6574286668517155e-05, + "loss": 1.1564, + "step": 8022 + }, + { + "epoch": 2.987004223336552, + "grad_norm": 0.15897324681282043, + "learning_rate": 1.6573370858729907e-05, + "loss": 1.1565, + "step": 8023 + }, + { + "epoch": 2.9873765284871614, + "grad_norm": 0.1609337478876114, + "learning_rate": 1.657245495185438e-05, + "loss": 1.1762, + "step": 8024 + }, + { + "epoch": 2.98774883363777, + "grad_norm": 0.16375748813152313, + "learning_rate": 1.6571538947904105e-05, + "loss": 1.172, + "step": 8025 + }, + { + "epoch": 2.9881211387883795, + "grad_norm": 0.16461491584777832, + "learning_rate": 1.6570622846892615e-05, + "loss": 1.1717, + "step": 8026 + }, + { + "epoch": 2.9884934439389887, + "grad_norm": 0.1679835021495819, + "learning_rate": 1.6569706648833436e-05, + "loss": 1.1607, + "step": 8027 + }, + { + "epoch": 2.9888657490895976, + "grad_norm": 0.1627042144536972, + "learning_rate": 1.6568790353740104e-05, + "loss": 1.1534, + "step": 8028 + }, + { + "epoch": 2.9892380542402064, + "grad_norm": 0.1635371893644333, + "learning_rate": 1.656787396162615e-05, + "loss": 1.1718, + "step": 8029 + }, + { + "epoch": 2.9896103593908157, + "grad_norm": 0.16765522956848145, + "learning_rate": 1.656695747250511e-05, + "loss": 1.1629, + "step": 8030 + }, + { + "epoch": 2.989982664541425, + "grad_norm": 0.16241970658302307, + "learning_rate": 1.656604088639052e-05, + "loss": 1.1618, + "step": 8031 + }, + { + "epoch": 2.9903549696920337, + "grad_norm": 0.16690513491630554, + "learning_rate": 1.6565124203295918e-05, + "loss": 1.1722, + "step": 8032 + }, + { + "epoch": 2.990727274842643, + "grad_norm": 0.15712475776672363, + "learning_rate": 1.656420742323484e-05, + "loss": 1.1419, + "step": 8033 + }, + { + "epoch": 2.991099579993252, + "grad_norm": 0.16157348453998566, + "learning_rate": 1.6563290546220835e-05, + "loss": 1.1554, + "step": 8034 + }, + { + "epoch": 2.991471885143861, + "grad_norm": 0.16115577518939972, + "learning_rate": 1.6562373572267438e-05, + "loss": 1.167, + "step": 8035 + }, + { + "epoch": 2.9918441902944704, + "grad_norm": 0.16360346972942352, + "learning_rate": 1.6561456501388197e-05, + "loss": 1.1597, + "step": 8036 + }, + { + "epoch": 2.992216495445079, + "grad_norm": 0.15973906219005585, + "learning_rate": 1.6560539333596657e-05, + "loss": 1.1556, + "step": 8037 + }, + { + "epoch": 2.992588800595688, + "grad_norm": 0.16520273685455322, + "learning_rate": 1.6559622068906357e-05, + "loss": 1.1799, + "step": 8038 + }, + { + "epoch": 2.9929611057462973, + "grad_norm": 0.1652621626853943, + "learning_rate": 1.6558704707330857e-05, + "loss": 1.1635, + "step": 8039 + }, + { + "epoch": 2.9933334108969065, + "grad_norm": 0.16183042526245117, + "learning_rate": 1.6557787248883698e-05, + "loss": 1.1567, + "step": 8040 + }, + { + "epoch": 2.9937057160475153, + "grad_norm": 0.1610204577445984, + "learning_rate": 1.655686969357843e-05, + "loss": 1.1709, + "step": 8041 + }, + { + "epoch": 2.9940780211981246, + "grad_norm": 0.162316232919693, + "learning_rate": 1.655595204142861e-05, + "loss": 1.165, + "step": 8042 + }, + { + "epoch": 2.9944503263487334, + "grad_norm": 0.1625785082578659, + "learning_rate": 1.655503429244779e-05, + "loss": 1.1799, + "step": 8043 + }, + { + "epoch": 2.9948226314993427, + "grad_norm": 0.15980488061904907, + "learning_rate": 1.6554116446649528e-05, + "loss": 1.1653, + "step": 8044 + }, + { + "epoch": 2.995194936649952, + "grad_norm": 0.16220425069332123, + "learning_rate": 1.655319850404737e-05, + "loss": 1.1635, + "step": 8045 + }, + { + "epoch": 2.9955672418005608, + "grad_norm": 0.1602453589439392, + "learning_rate": 1.6552280464654888e-05, + "loss": 1.1805, + "step": 8046 + }, + { + "epoch": 2.9959395469511696, + "grad_norm": 0.16323374211788177, + "learning_rate": 1.6551362328485633e-05, + "loss": 1.1555, + "step": 8047 + }, + { + "epoch": 2.996311852101779, + "grad_norm": 0.16577033698558807, + "learning_rate": 1.6550444095553167e-05, + "loss": 1.1722, + "step": 8048 + }, + { + "epoch": 2.996684157252388, + "grad_norm": 0.15781056880950928, + "learning_rate": 1.654952576587105e-05, + "loss": 1.1596, + "step": 8049 + }, + { + "epoch": 2.997056462402997, + "grad_norm": 0.16249702870845795, + "learning_rate": 1.6548607339452853e-05, + "loss": 1.1499, + "step": 8050 + }, + { + "epoch": 2.997428767553606, + "grad_norm": 0.16227367520332336, + "learning_rate": 1.6547688816312134e-05, + "loss": 1.1898, + "step": 8051 + }, + { + "epoch": 2.997801072704215, + "grad_norm": 0.16416963934898376, + "learning_rate": 1.6546770196462462e-05, + "loss": 1.1614, + "step": 8052 + }, + { + "epoch": 2.9981733778548243, + "grad_norm": 0.1586388200521469, + "learning_rate": 1.654585147991741e-05, + "loss": 1.175, + "step": 8053 + }, + { + "epoch": 2.9985456830054336, + "grad_norm": 0.16161870956420898, + "learning_rate": 1.6544932666690538e-05, + "loss": 1.1595, + "step": 8054 + }, + { + "epoch": 2.9989179881560424, + "grad_norm": 0.16111552715301514, + "learning_rate": 1.654401375679542e-05, + "loss": 1.1749, + "step": 8055 + }, + { + "epoch": 2.999290293306651, + "grad_norm": 0.16045445203781128, + "learning_rate": 1.654309475024563e-05, + "loss": 1.1532, + "step": 8056 + }, + { + "epoch": 2.9996625984572605, + "grad_norm": 0.16429480910301208, + "learning_rate": 1.654217564705474e-05, + "loss": 1.1617, + "step": 8057 + }, + { + "epoch": 3.0000349036078697, + "grad_norm": 0.16927365958690643, + "learning_rate": 1.654125644723633e-05, + "loss": 1.1758, + "step": 8058 + }, + { + "epoch": 3.0004072087584785, + "grad_norm": 0.15990622341632843, + "learning_rate": 1.654033715080397e-05, + "loss": 1.1513, + "step": 8059 + }, + { + "epoch": 3.000779513909088, + "grad_norm": 0.16184860467910767, + "learning_rate": 1.6539417757771246e-05, + "loss": 1.1631, + "step": 8060 + }, + { + "epoch": 3.0011518190596966, + "grad_norm": 0.16141143441200256, + "learning_rate": 1.6538498268151728e-05, + "loss": 1.153, + "step": 8061 + }, + { + "epoch": 3.001524124210306, + "grad_norm": 0.16540926694869995, + "learning_rate": 1.6537578681958998e-05, + "loss": 1.1469, + "step": 8062 + }, + { + "epoch": 3.0018964293609147, + "grad_norm": 0.18034324049949646, + "learning_rate": 1.6536658999206643e-05, + "loss": 1.1609, + "step": 8063 + }, + { + "epoch": 3.002268734511524, + "grad_norm": 0.15943139791488647, + "learning_rate": 1.653573921990825e-05, + "loss": 1.1609, + "step": 8064 + }, + { + "epoch": 3.0026410396621332, + "grad_norm": 0.16149209439754486, + "learning_rate": 1.6534819344077392e-05, + "loss": 1.1507, + "step": 8065 + }, + { + "epoch": 3.003013344812742, + "grad_norm": 0.1729980707168579, + "learning_rate": 1.6533899371727668e-05, + "loss": 1.1611, + "step": 8066 + }, + { + "epoch": 3.0033856499633513, + "grad_norm": 0.16380807757377625, + "learning_rate": 1.6532979302872654e-05, + "loss": 1.1694, + "step": 8067 + }, + { + "epoch": 3.00375795511396, + "grad_norm": 0.16485844552516937, + "learning_rate": 1.653205913752595e-05, + "loss": 1.1572, + "step": 8068 + }, + { + "epoch": 3.0041302602645694, + "grad_norm": 0.1658172309398651, + "learning_rate": 1.6531138875701142e-05, + "loss": 1.1508, + "step": 8069 + }, + { + "epoch": 3.0045025654151782, + "grad_norm": 0.15986262261867523, + "learning_rate": 1.6530218517411823e-05, + "loss": 1.1413, + "step": 8070 + }, + { + "epoch": 3.0048748705657875, + "grad_norm": 0.16682755947113037, + "learning_rate": 1.6529298062671587e-05, + "loss": 1.1601, + "step": 8071 + }, + { + "epoch": 3.0052471757163968, + "grad_norm": 0.16259485483169556, + "learning_rate": 1.6528377511494028e-05, + "loss": 1.1513, + "step": 8072 + }, + { + "epoch": 3.0056194808670056, + "grad_norm": 0.1639915257692337, + "learning_rate": 1.652745686389274e-05, + "loss": 1.1732, + "step": 8073 + }, + { + "epoch": 3.005991786017615, + "grad_norm": 0.16997599601745605, + "learning_rate": 1.6526536119881325e-05, + "loss": 1.1559, + "step": 8074 + }, + { + "epoch": 3.0063640911682237, + "grad_norm": 0.17218388617038727, + "learning_rate": 1.6525615279473385e-05, + "loss": 1.1643, + "step": 8075 + }, + { + "epoch": 3.006736396318833, + "grad_norm": 0.16318227350711823, + "learning_rate": 1.652469434268251e-05, + "loss": 1.1595, + "step": 8076 + }, + { + "epoch": 3.0071087014694418, + "grad_norm": 0.1637173295021057, + "learning_rate": 1.6523773309522314e-05, + "loss": 1.1586, + "step": 8077 + }, + { + "epoch": 3.007481006620051, + "grad_norm": 0.1644509881734848, + "learning_rate": 1.6522852180006396e-05, + "loss": 1.168, + "step": 8078 + }, + { + "epoch": 3.00785331177066, + "grad_norm": 0.16260574758052826, + "learning_rate": 1.6521930954148358e-05, + "loss": 1.1534, + "step": 8079 + }, + { + "epoch": 3.008225616921269, + "grad_norm": 0.16555775701999664, + "learning_rate": 1.652100963196181e-05, + "loss": 1.1544, + "step": 8080 + }, + { + "epoch": 3.0085979220718784, + "grad_norm": 0.1621171087026596, + "learning_rate": 1.652008821346036e-05, + "loss": 1.1601, + "step": 8081 + }, + { + "epoch": 3.008970227222487, + "grad_norm": 0.15849411487579346, + "learning_rate": 1.6519166698657616e-05, + "loss": 1.1589, + "step": 8082 + }, + { + "epoch": 3.0093425323730965, + "grad_norm": 0.16716186702251434, + "learning_rate": 1.6518245087567188e-05, + "loss": 1.154, + "step": 8083 + }, + { + "epoch": 3.0097148375237053, + "grad_norm": 0.1690070927143097, + "learning_rate": 1.6517323380202693e-05, + "loss": 1.1483, + "step": 8084 + }, + { + "epoch": 3.0100871426743145, + "grad_norm": 0.16490869224071503, + "learning_rate": 1.6516401576577736e-05, + "loss": 1.1596, + "step": 8085 + }, + { + "epoch": 3.0104594478249234, + "grad_norm": 0.16512571275234222, + "learning_rate": 1.6515479676705935e-05, + "loss": 1.1574, + "step": 8086 + }, + { + "epoch": 3.0108317529755326, + "grad_norm": 0.17184092104434967, + "learning_rate": 1.6514557680600912e-05, + "loss": 1.1579, + "step": 8087 + }, + { + "epoch": 3.0112040581261414, + "grad_norm": 0.17025062441825867, + "learning_rate": 1.651363558827628e-05, + "loss": 1.1661, + "step": 8088 + }, + { + "epoch": 3.0115763632767507, + "grad_norm": 0.17025823891162872, + "learning_rate": 1.651271339974566e-05, + "loss": 1.1601, + "step": 8089 + }, + { + "epoch": 3.01194866842736, + "grad_norm": 0.16634193062782288, + "learning_rate": 1.6511791115022672e-05, + "loss": 1.1519, + "step": 8090 + }, + { + "epoch": 3.012320973577969, + "grad_norm": 0.17068715393543243, + "learning_rate": 1.6510868734120935e-05, + "loss": 1.171, + "step": 8091 + }, + { + "epoch": 3.012693278728578, + "grad_norm": 0.1623985916376114, + "learning_rate": 1.6509946257054078e-05, + "loss": 1.1518, + "step": 8092 + }, + { + "epoch": 3.013065583879187, + "grad_norm": 0.16179263591766357, + "learning_rate": 1.650902368383572e-05, + "loss": 1.1498, + "step": 8093 + }, + { + "epoch": 3.013437889029796, + "grad_norm": 0.1658586859703064, + "learning_rate": 1.6508101014479494e-05, + "loss": 1.1601, + "step": 8094 + }, + { + "epoch": 3.013810194180405, + "grad_norm": 0.16824504733085632, + "learning_rate": 1.6507178248999026e-05, + "loss": 1.1453, + "step": 8095 + }, + { + "epoch": 3.0141824993310142, + "grad_norm": 0.16324131190776825, + "learning_rate": 1.6506255387407942e-05, + "loss": 1.16, + "step": 8096 + }, + { + "epoch": 3.014554804481623, + "grad_norm": 0.16698867082595825, + "learning_rate": 1.6505332429719872e-05, + "loss": 1.1527, + "step": 8097 + }, + { + "epoch": 3.0149271096322323, + "grad_norm": 0.16737176477909088, + "learning_rate": 1.650440937594845e-05, + "loss": 1.162, + "step": 8098 + }, + { + "epoch": 3.0152994147828416, + "grad_norm": 0.16450172662734985, + "learning_rate": 1.650348622610731e-05, + "loss": 1.1524, + "step": 8099 + }, + { + "epoch": 3.0156717199334504, + "grad_norm": 0.16533337533473969, + "learning_rate": 1.650256298021009e-05, + "loss": 1.1577, + "step": 8100 + }, + { + "epoch": 3.0160440250840597, + "grad_norm": 0.16812501847743988, + "learning_rate": 1.650163963827042e-05, + "loss": 1.1594, + "step": 8101 + }, + { + "epoch": 3.0164163302346685, + "grad_norm": 0.16462981700897217, + "learning_rate": 1.6500716200301943e-05, + "loss": 1.1554, + "step": 8102 + }, + { + "epoch": 3.0167886353852778, + "grad_norm": 0.1637783944606781, + "learning_rate": 1.6499792666318294e-05, + "loss": 1.1647, + "step": 8103 + }, + { + "epoch": 3.0171609405358866, + "grad_norm": 0.16835294663906097, + "learning_rate": 1.6498869036333116e-05, + "loss": 1.1474, + "step": 8104 + }, + { + "epoch": 3.017533245686496, + "grad_norm": 0.16287805140018463, + "learning_rate": 1.649794531036005e-05, + "loss": 1.1588, + "step": 8105 + }, + { + "epoch": 3.0179055508371047, + "grad_norm": 0.1584957391023636, + "learning_rate": 1.649702148841274e-05, + "loss": 1.1543, + "step": 8106 + }, + { + "epoch": 3.018277855987714, + "grad_norm": 0.15747535228729248, + "learning_rate": 1.6496097570504826e-05, + "loss": 1.1669, + "step": 8107 + }, + { + "epoch": 3.018650161138323, + "grad_norm": 0.16498741507530212, + "learning_rate": 1.6495173556649965e-05, + "loss": 1.1697, + "step": 8108 + }, + { + "epoch": 3.019022466288932, + "grad_norm": 0.16772598028182983, + "learning_rate": 1.6494249446861795e-05, + "loss": 1.1731, + "step": 8109 + }, + { + "epoch": 3.0193947714395413, + "grad_norm": 0.16565538942813873, + "learning_rate": 1.6493325241153968e-05, + "loss": 1.1572, + "step": 8110 + }, + { + "epoch": 3.01976707659015, + "grad_norm": 0.16811548173427582, + "learning_rate": 1.6492400939540134e-05, + "loss": 1.1409, + "step": 8111 + }, + { + "epoch": 3.0201393817407594, + "grad_norm": 0.17630524933338165, + "learning_rate": 1.6491476542033948e-05, + "loss": 1.1761, + "step": 8112 + }, + { + "epoch": 3.020511686891368, + "grad_norm": 0.17578953504562378, + "learning_rate": 1.649055204864906e-05, + "loss": 1.1435, + "step": 8113 + }, + { + "epoch": 3.0208839920419774, + "grad_norm": 0.17409630119800568, + "learning_rate": 1.6489627459399123e-05, + "loss": 1.1627, + "step": 8114 + }, + { + "epoch": 3.0212562971925863, + "grad_norm": 0.16359230875968933, + "learning_rate": 1.64887027742978e-05, + "loss": 1.1507, + "step": 8115 + }, + { + "epoch": 3.0216286023431955, + "grad_norm": 0.16680704057216644, + "learning_rate": 1.648777799335874e-05, + "loss": 1.1732, + "step": 8116 + }, + { + "epoch": 3.022000907493805, + "grad_norm": 0.1649051010608673, + "learning_rate": 1.6486853116595608e-05, + "loss": 1.1658, + "step": 8117 + }, + { + "epoch": 3.0223732126444136, + "grad_norm": 0.1658775508403778, + "learning_rate": 1.6485928144022066e-05, + "loss": 1.1569, + "step": 8118 + }, + { + "epoch": 3.022745517795023, + "grad_norm": 0.17610371112823486, + "learning_rate": 1.648500307565177e-05, + "loss": 1.1667, + "step": 8119 + }, + { + "epoch": 3.0231178229456317, + "grad_norm": 0.17497166991233826, + "learning_rate": 1.6484077911498383e-05, + "loss": 1.146, + "step": 8120 + }, + { + "epoch": 3.023490128096241, + "grad_norm": 0.17234422266483307, + "learning_rate": 1.6483152651575575e-05, + "loss": 1.1553, + "step": 8121 + }, + { + "epoch": 3.02386243324685, + "grad_norm": 0.166042760014534, + "learning_rate": 1.6482227295897008e-05, + "loss": 1.161, + "step": 8122 + }, + { + "epoch": 3.024234738397459, + "grad_norm": 0.16150951385498047, + "learning_rate": 1.648130184447635e-05, + "loss": 1.1666, + "step": 8123 + }, + { + "epoch": 3.024607043548068, + "grad_norm": 0.16276511549949646, + "learning_rate": 1.648037629732727e-05, + "loss": 1.1454, + "step": 8124 + }, + { + "epoch": 3.024979348698677, + "grad_norm": 0.16008317470550537, + "learning_rate": 1.6479450654463443e-05, + "loss": 1.152, + "step": 8125 + }, + { + "epoch": 3.0253516538492864, + "grad_norm": 0.1709553301334381, + "learning_rate": 1.6478524915898532e-05, + "loss": 1.1614, + "step": 8126 + }, + { + "epoch": 3.025723958999895, + "grad_norm": 0.16601483523845673, + "learning_rate": 1.6477599081646217e-05, + "loss": 1.1536, + "step": 8127 + }, + { + "epoch": 3.0260962641505045, + "grad_norm": 0.16305498778820038, + "learning_rate": 1.647667315172017e-05, + "loss": 1.1581, + "step": 8128 + }, + { + "epoch": 3.0264685693011133, + "grad_norm": 0.17328190803527832, + "learning_rate": 1.6475747126134066e-05, + "loss": 1.1502, + "step": 8129 + }, + { + "epoch": 3.0268408744517226, + "grad_norm": 0.20106814801692963, + "learning_rate": 1.647482100490158e-05, + "loss": 1.1654, + "step": 8130 + }, + { + "epoch": 3.0272131796023314, + "grad_norm": 0.18390576541423798, + "learning_rate": 1.64738947880364e-05, + "loss": 1.1635, + "step": 8131 + }, + { + "epoch": 3.0275854847529406, + "grad_norm": 0.16346316039562225, + "learning_rate": 1.6472968475552197e-05, + "loss": 1.177, + "step": 8132 + }, + { + "epoch": 3.02795778990355, + "grad_norm": 0.18944934010505676, + "learning_rate": 1.647204206746266e-05, + "loss": 1.1598, + "step": 8133 + }, + { + "epoch": 3.0283300950541587, + "grad_norm": 0.1777556836605072, + "learning_rate": 1.6471115563781467e-05, + "loss": 1.1578, + "step": 8134 + }, + { + "epoch": 3.028702400204768, + "grad_norm": 0.17949633300304413, + "learning_rate": 1.6470188964522296e-05, + "loss": 1.1698, + "step": 8135 + }, + { + "epoch": 3.029074705355377, + "grad_norm": 0.18838725984096527, + "learning_rate": 1.6469262269698846e-05, + "loss": 1.1585, + "step": 8136 + }, + { + "epoch": 3.029447010505986, + "grad_norm": 0.1662994623184204, + "learning_rate": 1.6468335479324796e-05, + "loss": 1.1597, + "step": 8137 + }, + { + "epoch": 3.029819315656595, + "grad_norm": 0.172524094581604, + "learning_rate": 1.646740859341384e-05, + "loss": 1.1683, + "step": 8138 + }, + { + "epoch": 3.030191620807204, + "grad_norm": 0.17046119272708893, + "learning_rate": 1.6466481611979665e-05, + "loss": 1.1582, + "step": 8139 + }, + { + "epoch": 3.030563925957813, + "grad_norm": 0.18434128165245056, + "learning_rate": 1.646555453503596e-05, + "loss": 1.1576, + "step": 8140 + }, + { + "epoch": 3.0309362311084223, + "grad_norm": 0.16541865468025208, + "learning_rate": 1.646462736259642e-05, + "loss": 1.1453, + "step": 8141 + }, + { + "epoch": 3.0313085362590315, + "grad_norm": 0.1774558275938034, + "learning_rate": 1.646370009467474e-05, + "loss": 1.1619, + "step": 8142 + }, + { + "epoch": 3.0316808414096403, + "grad_norm": 0.16616548597812653, + "learning_rate": 1.6462772731284615e-05, + "loss": 1.1659, + "step": 8143 + }, + { + "epoch": 3.0320531465602496, + "grad_norm": 0.17070108652114868, + "learning_rate": 1.6461845272439743e-05, + "loss": 1.1528, + "step": 8144 + }, + { + "epoch": 3.0324254517108584, + "grad_norm": 0.17470189929008484, + "learning_rate": 1.646091771815382e-05, + "loss": 1.1824, + "step": 8145 + }, + { + "epoch": 3.0327977568614677, + "grad_norm": 0.17306390404701233, + "learning_rate": 1.645999006844055e-05, + "loss": 1.1623, + "step": 8146 + }, + { + "epoch": 3.0331700620120765, + "grad_norm": 0.16506531834602356, + "learning_rate": 1.6459062323313634e-05, + "loss": 1.1725, + "step": 8147 + }, + { + "epoch": 3.0335423671626858, + "grad_norm": 0.17168426513671875, + "learning_rate": 1.645813448278677e-05, + "loss": 1.1475, + "step": 8148 + }, + { + "epoch": 3.0339146723132946, + "grad_norm": 0.16225890815258026, + "learning_rate": 1.6457206546873665e-05, + "loss": 1.1614, + "step": 8149 + }, + { + "epoch": 3.034286977463904, + "grad_norm": 0.16611915826797485, + "learning_rate": 1.6456278515588023e-05, + "loss": 1.1497, + "step": 8150 + }, + { + "epoch": 3.034659282614513, + "grad_norm": 0.174469992518425, + "learning_rate": 1.6455350388943555e-05, + "loss": 1.17, + "step": 8151 + }, + { + "epoch": 3.035031587765122, + "grad_norm": 0.16496939957141876, + "learning_rate": 1.6454422166953968e-05, + "loss": 1.1392, + "step": 8152 + }, + { + "epoch": 3.035403892915731, + "grad_norm": 0.17337270081043243, + "learning_rate": 1.6453493849632968e-05, + "loss": 1.1587, + "step": 8153 + }, + { + "epoch": 3.03577619806634, + "grad_norm": 0.1641514003276825, + "learning_rate": 1.6452565436994272e-05, + "loss": 1.1646, + "step": 8154 + }, + { + "epoch": 3.0361485032169493, + "grad_norm": 0.17081360518932343, + "learning_rate": 1.6451636929051587e-05, + "loss": 1.145, + "step": 8155 + }, + { + "epoch": 3.036520808367558, + "grad_norm": 0.16759049892425537, + "learning_rate": 1.645070832581863e-05, + "loss": 1.171, + "step": 8156 + }, + { + "epoch": 3.0368931135181674, + "grad_norm": 0.170254647731781, + "learning_rate": 1.6449779627309113e-05, + "loss": 1.142, + "step": 8157 + }, + { + "epoch": 3.037265418668776, + "grad_norm": 0.16546176373958588, + "learning_rate": 1.644885083353676e-05, + "loss": 1.1524, + "step": 8158 + }, + { + "epoch": 3.0376377238193855, + "grad_norm": 0.16441422700881958, + "learning_rate": 1.6447921944515285e-05, + "loss": 1.1519, + "step": 8159 + }, + { + "epoch": 3.0380100289699947, + "grad_norm": 0.17892083525657654, + "learning_rate": 1.6446992960258404e-05, + "loss": 1.169, + "step": 8160 + }, + { + "epoch": 3.0383823341206035, + "grad_norm": 0.1632370948791504, + "learning_rate": 1.6446063880779845e-05, + "loss": 1.1567, + "step": 8161 + }, + { + "epoch": 3.038754639271213, + "grad_norm": 0.17492267489433289, + "learning_rate": 1.6445134706093325e-05, + "loss": 1.1518, + "step": 8162 + }, + { + "epoch": 3.0391269444218216, + "grad_norm": 0.17203693091869354, + "learning_rate": 1.6444205436212567e-05, + "loss": 1.1601, + "step": 8163 + }, + { + "epoch": 3.039499249572431, + "grad_norm": 0.17056483030319214, + "learning_rate": 1.6443276071151303e-05, + "loss": 1.1711, + "step": 8164 + }, + { + "epoch": 3.0398715547230397, + "grad_norm": 0.17935492098331451, + "learning_rate": 1.6442346610923258e-05, + "loss": 1.1439, + "step": 8165 + }, + { + "epoch": 3.040243859873649, + "grad_norm": 0.16036418080329895, + "learning_rate": 1.6441417055542154e-05, + "loss": 1.1444, + "step": 8166 + }, + { + "epoch": 3.040616165024258, + "grad_norm": 0.1764068752527237, + "learning_rate": 1.6440487405021727e-05, + "loss": 1.1411, + "step": 8167 + }, + { + "epoch": 3.040988470174867, + "grad_norm": 0.16799296438694, + "learning_rate": 1.6439557659375705e-05, + "loss": 1.1551, + "step": 8168 + }, + { + "epoch": 3.0413607753254763, + "grad_norm": 0.17432555556297302, + "learning_rate": 1.6438627818617817e-05, + "loss": 1.1627, + "step": 8169 + }, + { + "epoch": 3.041733080476085, + "grad_norm": 0.18049269914627075, + "learning_rate": 1.6437697882761802e-05, + "loss": 1.1532, + "step": 8170 + }, + { + "epoch": 3.0421053856266944, + "grad_norm": 0.16729703545570374, + "learning_rate": 1.6436767851821395e-05, + "loss": 1.1795, + "step": 8171 + }, + { + "epoch": 3.0424776907773032, + "grad_norm": 0.1749987155199051, + "learning_rate": 1.6435837725810326e-05, + "loss": 1.1612, + "step": 8172 + }, + { + "epoch": 3.0428499959279125, + "grad_norm": 0.16448847949504852, + "learning_rate": 1.6434907504742342e-05, + "loss": 1.1616, + "step": 8173 + }, + { + "epoch": 3.0432223010785213, + "grad_norm": 0.1609332412481308, + "learning_rate": 1.6433977188631177e-05, + "loss": 1.1641, + "step": 8174 + }, + { + "epoch": 3.0435946062291306, + "grad_norm": 0.1693718433380127, + "learning_rate": 1.6433046777490576e-05, + "loss": 1.1719, + "step": 8175 + }, + { + "epoch": 3.0439669113797394, + "grad_norm": 0.16598600149154663, + "learning_rate": 1.643211627133427e-05, + "loss": 1.1562, + "step": 8176 + }, + { + "epoch": 3.0443392165303487, + "grad_norm": 0.16353359818458557, + "learning_rate": 1.6431185670176017e-05, + "loss": 1.1626, + "step": 8177 + }, + { + "epoch": 3.044711521680958, + "grad_norm": 0.1780644953250885, + "learning_rate": 1.6430254974029554e-05, + "loss": 1.1502, + "step": 8178 + }, + { + "epoch": 3.0450838268315668, + "grad_norm": 0.1959240734577179, + "learning_rate": 1.6429324182908628e-05, + "loss": 1.1409, + "step": 8179 + }, + { + "epoch": 3.045456131982176, + "grad_norm": 0.18989701569080353, + "learning_rate": 1.6428393296826987e-05, + "loss": 1.1457, + "step": 8180 + }, + { + "epoch": 3.045828437132785, + "grad_norm": 0.16791072487831116, + "learning_rate": 1.642746231579838e-05, + "loss": 1.1521, + "step": 8181 + }, + { + "epoch": 3.046200742283394, + "grad_norm": 0.17417898774147034, + "learning_rate": 1.642653123983656e-05, + "loss": 1.1476, + "step": 8182 + }, + { + "epoch": 3.046573047434003, + "grad_norm": 0.20077449083328247, + "learning_rate": 1.6425600068955272e-05, + "loss": 1.1731, + "step": 8183 + }, + { + "epoch": 3.046945352584612, + "grad_norm": 0.1830679327249527, + "learning_rate": 1.642466880316828e-05, + "loss": 1.166, + "step": 8184 + }, + { + "epoch": 3.0473176577352215, + "grad_norm": 0.16485272347927094, + "learning_rate": 1.642373744248933e-05, + "loss": 1.1681, + "step": 8185 + }, + { + "epoch": 3.0476899628858303, + "grad_norm": 0.1660563200712204, + "learning_rate": 1.6422805986932184e-05, + "loss": 1.1524, + "step": 8186 + }, + { + "epoch": 3.0480622680364395, + "grad_norm": 0.16424550116062164, + "learning_rate": 1.642187443651059e-05, + "loss": 1.1539, + "step": 8187 + }, + { + "epoch": 3.0484345731870484, + "grad_norm": 0.1737472116947174, + "learning_rate": 1.642094279123832e-05, + "loss": 1.1597, + "step": 8188 + }, + { + "epoch": 3.0488068783376576, + "grad_norm": 0.1679793894290924, + "learning_rate": 1.6420011051129127e-05, + "loss": 1.1495, + "step": 8189 + }, + { + "epoch": 3.0491791834882664, + "grad_norm": 0.16718783974647522, + "learning_rate": 1.641907921619677e-05, + "loss": 1.1653, + "step": 8190 + }, + { + "epoch": 3.0495514886388757, + "grad_norm": 0.17275018990039825, + "learning_rate": 1.641814728645502e-05, + "loss": 1.1521, + "step": 8191 + }, + { + "epoch": 3.0499237937894845, + "grad_norm": 0.17378568649291992, + "learning_rate": 1.6417215261917638e-05, + "loss": 1.1681, + "step": 8192 + }, + { + "epoch": 3.050296098940094, + "grad_norm": 0.19507519900798798, + "learning_rate": 1.6416283142598387e-05, + "loss": 1.1686, + "step": 8193 + }, + { + "epoch": 3.050668404090703, + "grad_norm": 0.1943422257900238, + "learning_rate": 1.6415350928511037e-05, + "loss": 1.1595, + "step": 8194 + }, + { + "epoch": 3.051040709241312, + "grad_norm": 0.16781660914421082, + "learning_rate": 1.6414418619669354e-05, + "loss": 1.1471, + "step": 8195 + }, + { + "epoch": 3.051413014391921, + "grad_norm": 0.1694183200597763, + "learning_rate": 1.6413486216087114e-05, + "loss": 1.1439, + "step": 8196 + }, + { + "epoch": 3.05178531954253, + "grad_norm": 0.17168377339839935, + "learning_rate": 1.6412553717778085e-05, + "loss": 1.159, + "step": 8197 + }, + { + "epoch": 3.0521576246931392, + "grad_norm": 0.16123619675636292, + "learning_rate": 1.6411621124756035e-05, + "loss": 1.1616, + "step": 8198 + }, + { + "epoch": 3.052529929843748, + "grad_norm": 0.18184486031532288, + "learning_rate": 1.641068843703475e-05, + "loss": 1.1613, + "step": 8199 + }, + { + "epoch": 3.0529022349943573, + "grad_norm": 0.2111905813217163, + "learning_rate": 1.6409755654627994e-05, + "loss": 1.165, + "step": 8200 + }, + { + "epoch": 3.053274540144966, + "grad_norm": 0.1873953491449356, + "learning_rate": 1.6408822777549552e-05, + "loss": 1.1603, + "step": 8201 + }, + { + "epoch": 3.0536468452955754, + "grad_norm": 0.1657257080078125, + "learning_rate": 1.64078898058132e-05, + "loss": 1.1615, + "step": 8202 + }, + { + "epoch": 3.0540191504461847, + "grad_norm": 0.18995431065559387, + "learning_rate": 1.6406956739432716e-05, + "loss": 1.1451, + "step": 8203 + }, + { + "epoch": 3.0543914555967935, + "grad_norm": 0.1653064638376236, + "learning_rate": 1.6406023578421884e-05, + "loss": 1.1514, + "step": 8204 + }, + { + "epoch": 3.0547637607474027, + "grad_norm": 0.19428938627243042, + "learning_rate": 1.6405090322794484e-05, + "loss": 1.1483, + "step": 8205 + }, + { + "epoch": 3.0551360658980116, + "grad_norm": 0.18280021846294403, + "learning_rate": 1.6404156972564305e-05, + "loss": 1.1456, + "step": 8206 + }, + { + "epoch": 3.055508371048621, + "grad_norm": 0.18493548035621643, + "learning_rate": 1.6403223527745127e-05, + "loss": 1.152, + "step": 8207 + }, + { + "epoch": 3.0558806761992297, + "grad_norm": 0.16537559032440186, + "learning_rate": 1.6402289988350742e-05, + "loss": 1.1563, + "step": 8208 + }, + { + "epoch": 3.056252981349839, + "grad_norm": 0.19156776368618011, + "learning_rate": 1.6401356354394934e-05, + "loss": 1.1445, + "step": 8209 + }, + { + "epoch": 3.0566252865004477, + "grad_norm": 0.17017942667007446, + "learning_rate": 1.6400422625891493e-05, + "loss": 1.1643, + "step": 8210 + }, + { + "epoch": 3.056997591651057, + "grad_norm": 0.1759144514799118, + "learning_rate": 1.6399488802854214e-05, + "loss": 1.1651, + "step": 8211 + }, + { + "epoch": 3.0573698968016663, + "grad_norm": 0.17706505954265594, + "learning_rate": 1.6398554885296888e-05, + "loss": 1.1491, + "step": 8212 + }, + { + "epoch": 3.057742201952275, + "grad_norm": 0.16222470998764038, + "learning_rate": 1.6397620873233304e-05, + "loss": 1.1636, + "step": 8213 + }, + { + "epoch": 3.0581145071028843, + "grad_norm": 0.21946337819099426, + "learning_rate": 1.6396686766677263e-05, + "loss": 1.1623, + "step": 8214 + }, + { + "epoch": 3.058486812253493, + "grad_norm": 0.16888217628002167, + "learning_rate": 1.639575256564256e-05, + "loss": 1.1594, + "step": 8215 + }, + { + "epoch": 3.0588591174041024, + "grad_norm": 0.16997283697128296, + "learning_rate": 1.6394818270142995e-05, + "loss": 1.1507, + "step": 8216 + }, + { + "epoch": 3.0592314225547113, + "grad_norm": 0.16864870488643646, + "learning_rate": 1.6393883880192362e-05, + "loss": 1.1613, + "step": 8217 + }, + { + "epoch": 3.0596037277053205, + "grad_norm": 0.16163980960845947, + "learning_rate": 1.6392949395804464e-05, + "loss": 1.1675, + "step": 8218 + }, + { + "epoch": 3.0599760328559293, + "grad_norm": 0.16427217423915863, + "learning_rate": 1.639201481699311e-05, + "loss": 1.1476, + "step": 8219 + }, + { + "epoch": 3.0603483380065386, + "grad_norm": 0.16542796790599823, + "learning_rate": 1.6391080143772094e-05, + "loss": 1.1446, + "step": 8220 + }, + { + "epoch": 3.060720643157148, + "grad_norm": 0.1682531237602234, + "learning_rate": 1.639014537615523e-05, + "loss": 1.1564, + "step": 8221 + }, + { + "epoch": 3.0610929483077567, + "grad_norm": 0.16446895897388458, + "learning_rate": 1.6389210514156317e-05, + "loss": 1.1661, + "step": 8222 + }, + { + "epoch": 3.061465253458366, + "grad_norm": 0.16606532037258148, + "learning_rate": 1.6388275557789165e-05, + "loss": 1.1658, + "step": 8223 + }, + { + "epoch": 3.0618375586089748, + "grad_norm": 0.16892418265342712, + "learning_rate": 1.6387340507067584e-05, + "loss": 1.1503, + "step": 8224 + }, + { + "epoch": 3.062209863759584, + "grad_norm": 0.16336201131343842, + "learning_rate": 1.6386405362005385e-05, + "loss": 1.1647, + "step": 8225 + }, + { + "epoch": 3.062582168910193, + "grad_norm": 0.15720415115356445, + "learning_rate": 1.638547012261638e-05, + "loss": 1.151, + "step": 8226 + }, + { + "epoch": 3.062954474060802, + "grad_norm": 0.1673574596643448, + "learning_rate": 1.6384534788914383e-05, + "loss": 1.1508, + "step": 8227 + }, + { + "epoch": 3.063326779211411, + "grad_norm": 0.1641821265220642, + "learning_rate": 1.6383599360913204e-05, + "loss": 1.1522, + "step": 8228 + }, + { + "epoch": 3.06369908436202, + "grad_norm": 0.17019106447696686, + "learning_rate": 1.6382663838626667e-05, + "loss": 1.1652, + "step": 8229 + }, + { + "epoch": 3.0640713895126295, + "grad_norm": 0.17099004983901978, + "learning_rate": 1.6381728222068585e-05, + "loss": 1.1716, + "step": 8230 + }, + { + "epoch": 3.0644436946632383, + "grad_norm": 0.16094860434532166, + "learning_rate": 1.6380792511252775e-05, + "loss": 1.1498, + "step": 8231 + }, + { + "epoch": 3.0648159998138476, + "grad_norm": 0.16481037437915802, + "learning_rate": 1.6379856706193064e-05, + "loss": 1.1479, + "step": 8232 + }, + { + "epoch": 3.0651883049644564, + "grad_norm": 0.16703465580940247, + "learning_rate": 1.6378920806903265e-05, + "loss": 1.1656, + "step": 8233 + }, + { + "epoch": 3.0655606101150656, + "grad_norm": 0.17205815017223358, + "learning_rate": 1.6377984813397212e-05, + "loss": 1.1599, + "step": 8234 + }, + { + "epoch": 3.0659329152656745, + "grad_norm": 0.1638091653585434, + "learning_rate": 1.637704872568872e-05, + "loss": 1.1548, + "step": 8235 + }, + { + "epoch": 3.0663052204162837, + "grad_norm": 0.16245143115520477, + "learning_rate": 1.6376112543791622e-05, + "loss": 1.1547, + "step": 8236 + }, + { + "epoch": 3.0666775255668925, + "grad_norm": 0.17442138493061066, + "learning_rate": 1.6375176267719735e-05, + "loss": 1.1526, + "step": 8237 + }, + { + "epoch": 3.067049830717502, + "grad_norm": 0.1698945164680481, + "learning_rate": 1.63742398974869e-05, + "loss": 1.1514, + "step": 8238 + }, + { + "epoch": 3.067422135868111, + "grad_norm": 0.17009849846363068, + "learning_rate": 1.6373303433106936e-05, + "loss": 1.1399, + "step": 8239 + }, + { + "epoch": 3.06779444101872, + "grad_norm": 0.1673266440629959, + "learning_rate": 1.6372366874593688e-05, + "loss": 1.1445, + "step": 8240 + }, + { + "epoch": 3.068166746169329, + "grad_norm": 0.16925260424613953, + "learning_rate": 1.6371430221960975e-05, + "loss": 1.1528, + "step": 8241 + }, + { + "epoch": 3.068539051319938, + "grad_norm": 0.17465613782405853, + "learning_rate": 1.637049347522264e-05, + "loss": 1.159, + "step": 8242 + }, + { + "epoch": 3.0689113564705472, + "grad_norm": 0.17698585987091064, + "learning_rate": 1.636955663439252e-05, + "loss": 1.1602, + "step": 8243 + }, + { + "epoch": 3.069283661621156, + "grad_norm": 0.17667868733406067, + "learning_rate": 1.6368619699484446e-05, + "loss": 1.158, + "step": 8244 + }, + { + "epoch": 3.0696559667717653, + "grad_norm": 0.17002210021018982, + "learning_rate": 1.6367682670512253e-05, + "loss": 1.1616, + "step": 8245 + }, + { + "epoch": 3.070028271922374, + "grad_norm": 0.18032929301261902, + "learning_rate": 1.636674554748979e-05, + "loss": 1.1825, + "step": 8246 + }, + { + "epoch": 3.0704005770729834, + "grad_norm": 0.16729597747325897, + "learning_rate": 1.6365808330430897e-05, + "loss": 1.161, + "step": 8247 + }, + { + "epoch": 3.0707728822235927, + "grad_norm": 0.188649520277977, + "learning_rate": 1.6364871019349414e-05, + "loss": 1.1605, + "step": 8248 + }, + { + "epoch": 3.0711451873742015, + "grad_norm": 0.1881103366613388, + "learning_rate": 1.6363933614259184e-05, + "loss": 1.1416, + "step": 8249 + }, + { + "epoch": 3.0715174925248108, + "grad_norm": 0.1650039553642273, + "learning_rate": 1.6362996115174056e-05, + "loss": 1.1556, + "step": 8250 + }, + { + "epoch": 3.0718897976754196, + "grad_norm": 0.19066570699214935, + "learning_rate": 1.6362058522107872e-05, + "loss": 1.1541, + "step": 8251 + }, + { + "epoch": 3.072262102826029, + "grad_norm": 0.19495618343353271, + "learning_rate": 1.6361120835074485e-05, + "loss": 1.1702, + "step": 8252 + }, + { + "epoch": 3.0726344079766377, + "grad_norm": 0.1677856743335724, + "learning_rate": 1.636018305408774e-05, + "loss": 1.1482, + "step": 8253 + }, + { + "epoch": 3.073006713127247, + "grad_norm": 0.22052864730358124, + "learning_rate": 1.6359245179161492e-05, + "loss": 1.1543, + "step": 8254 + }, + { + "epoch": 3.073379018277856, + "grad_norm": 0.17813050746917725, + "learning_rate": 1.6358307210309595e-05, + "loss": 1.1654, + "step": 8255 + }, + { + "epoch": 3.073751323428465, + "grad_norm": 0.18808187544345856, + "learning_rate": 1.6357369147545894e-05, + "loss": 1.165, + "step": 8256 + }, + { + "epoch": 3.0741236285790743, + "grad_norm": 0.16268914937973022, + "learning_rate": 1.635643099088425e-05, + "loss": 1.1691, + "step": 8257 + }, + { + "epoch": 3.074495933729683, + "grad_norm": 0.19553907215595245, + "learning_rate": 1.6355492740338523e-05, + "loss": 1.1465, + "step": 8258 + }, + { + "epoch": 3.0748682388802924, + "grad_norm": 0.1695629507303238, + "learning_rate": 1.6354554395922564e-05, + "loss": 1.1672, + "step": 8259 + }, + { + "epoch": 3.075240544030901, + "grad_norm": 0.1807146817445755, + "learning_rate": 1.635361595765024e-05, + "loss": 1.1486, + "step": 8260 + }, + { + "epoch": 3.0756128491815105, + "grad_norm": 0.1665555089712143, + "learning_rate": 1.63526774255354e-05, + "loss": 1.1478, + "step": 8261 + }, + { + "epoch": 3.0759851543321193, + "grad_norm": 0.17835910618305206, + "learning_rate": 1.6351738799591918e-05, + "loss": 1.1594, + "step": 8262 + }, + { + "epoch": 3.0763574594827285, + "grad_norm": 0.17160485684871674, + "learning_rate": 1.635080007983365e-05, + "loss": 1.1574, + "step": 8263 + }, + { + "epoch": 3.076729764633338, + "grad_norm": 0.16415171325206757, + "learning_rate": 1.6349861266274467e-05, + "loss": 1.1511, + "step": 8264 + }, + { + "epoch": 3.0771020697839466, + "grad_norm": 0.16749624907970428, + "learning_rate": 1.6348922358928228e-05, + "loss": 1.148, + "step": 8265 + }, + { + "epoch": 3.077474374934556, + "grad_norm": 0.1690567582845688, + "learning_rate": 1.6347983357808804e-05, + "loss": 1.1539, + "step": 8266 + }, + { + "epoch": 3.0778466800851647, + "grad_norm": 0.17391081154346466, + "learning_rate": 1.6347044262930067e-05, + "loss": 1.1471, + "step": 8267 + }, + { + "epoch": 3.078218985235774, + "grad_norm": 0.17175255715847015, + "learning_rate": 1.6346105074305884e-05, + "loss": 1.1618, + "step": 8268 + }, + { + "epoch": 3.078591290386383, + "grad_norm": 0.1740272045135498, + "learning_rate": 1.6345165791950125e-05, + "loss": 1.1542, + "step": 8269 + }, + { + "epoch": 3.078963595536992, + "grad_norm": 0.19044634699821472, + "learning_rate": 1.634422641587667e-05, + "loss": 1.1599, + "step": 8270 + }, + { + "epoch": 3.079335900687601, + "grad_norm": 0.1639656275510788, + "learning_rate": 1.6343286946099385e-05, + "loss": 1.1623, + "step": 8271 + }, + { + "epoch": 3.07970820583821, + "grad_norm": 0.17656481266021729, + "learning_rate": 1.6342347382632155e-05, + "loss": 1.1523, + "step": 8272 + }, + { + "epoch": 3.0800805109888194, + "grad_norm": 0.15982165932655334, + "learning_rate": 1.6341407725488844e-05, + "loss": 1.1475, + "step": 8273 + }, + { + "epoch": 3.0804528161394282, + "grad_norm": 0.17611384391784668, + "learning_rate": 1.6340467974683344e-05, + "loss": 1.1676, + "step": 8274 + }, + { + "epoch": 3.0808251212900375, + "grad_norm": 0.17255373299121857, + "learning_rate": 1.633952813022953e-05, + "loss": 1.1819, + "step": 8275 + }, + { + "epoch": 3.0811974264406463, + "grad_norm": 0.17750908434391022, + "learning_rate": 1.633858819214128e-05, + "loss": 1.1745, + "step": 8276 + }, + { + "epoch": 3.0815697315912556, + "grad_norm": 0.21422599256038666, + "learning_rate": 1.6337648160432484e-05, + "loss": 1.1535, + "step": 8277 + }, + { + "epoch": 3.0819420367418644, + "grad_norm": 0.17324034869670868, + "learning_rate": 1.633670803511702e-05, + "loss": 1.1591, + "step": 8278 + }, + { + "epoch": 3.0823143418924737, + "grad_norm": 0.18860980868339539, + "learning_rate": 1.6335767816208775e-05, + "loss": 1.1505, + "step": 8279 + }, + { + "epoch": 3.0826866470430825, + "grad_norm": 0.20710186660289764, + "learning_rate": 1.633482750372164e-05, + "loss": 1.1695, + "step": 8280 + }, + { + "epoch": 3.0830589521936917, + "grad_norm": 0.16789516806602478, + "learning_rate": 1.63338870976695e-05, + "loss": 1.1677, + "step": 8281 + }, + { + "epoch": 3.083431257344301, + "grad_norm": 0.1649201661348343, + "learning_rate": 1.6332946598066244e-05, + "loss": 1.1482, + "step": 8282 + }, + { + "epoch": 3.08380356249491, + "grad_norm": 0.16981996595859528, + "learning_rate": 1.6332006004925763e-05, + "loss": 1.1704, + "step": 8283 + }, + { + "epoch": 3.084175867645519, + "grad_norm": 0.16521765291690826, + "learning_rate": 1.6331065318261955e-05, + "loss": 1.1556, + "step": 8284 + }, + { + "epoch": 3.084548172796128, + "grad_norm": 0.16958005726337433, + "learning_rate": 1.6330124538088705e-05, + "loss": 1.1542, + "step": 8285 + }, + { + "epoch": 3.084920477946737, + "grad_norm": 0.1606731116771698, + "learning_rate": 1.6329183664419918e-05, + "loss": 1.1508, + "step": 8286 + }, + { + "epoch": 3.085292783097346, + "grad_norm": 0.166421040892601, + "learning_rate": 1.6328242697269478e-05, + "loss": 1.1642, + "step": 8287 + }, + { + "epoch": 3.0856650882479553, + "grad_norm": 0.1679898500442505, + "learning_rate": 1.6327301636651296e-05, + "loss": 1.1504, + "step": 8288 + }, + { + "epoch": 3.086037393398564, + "grad_norm": 0.1639619916677475, + "learning_rate": 1.6326360482579265e-05, + "loss": 1.158, + "step": 8289 + }, + { + "epoch": 3.0864096985491734, + "grad_norm": 0.16645826399326324, + "learning_rate": 1.6325419235067286e-05, + "loss": 1.151, + "step": 8290 + }, + { + "epoch": 3.0867820036997826, + "grad_norm": 0.1687537580728531, + "learning_rate": 1.6324477894129263e-05, + "loss": 1.1579, + "step": 8291 + }, + { + "epoch": 3.0871543088503914, + "grad_norm": 0.16648247838020325, + "learning_rate": 1.6323536459779098e-05, + "loss": 1.1513, + "step": 8292 + }, + { + "epoch": 3.0875266140010007, + "grad_norm": 0.16747455298900604, + "learning_rate": 1.6322594932030697e-05, + "loss": 1.1528, + "step": 8293 + }, + { + "epoch": 3.0878989191516095, + "grad_norm": 0.20737069845199585, + "learning_rate": 1.632165331089796e-05, + "loss": 1.1452, + "step": 8294 + }, + { + "epoch": 3.088271224302219, + "grad_norm": 0.3523947298526764, + "learning_rate": 1.6320711596394805e-05, + "loss": 1.1623, + "step": 8295 + }, + { + "epoch": 3.0886435294528276, + "grad_norm": 0.21294717490673065, + "learning_rate": 1.6319769788535135e-05, + "loss": 1.1556, + "step": 8296 + }, + { + "epoch": 3.089015834603437, + "grad_norm": 0.18315008282661438, + "learning_rate": 1.6318827887332865e-05, + "loss": 1.1677, + "step": 8297 + }, + { + "epoch": 3.089388139754046, + "grad_norm": 0.1660303920507431, + "learning_rate": 1.6317885892801902e-05, + "loss": 1.164, + "step": 8298 + }, + { + "epoch": 3.089760444904655, + "grad_norm": 0.1686762571334839, + "learning_rate": 1.631694380495616e-05, + "loss": 1.1713, + "step": 8299 + }, + { + "epoch": 3.090132750055264, + "grad_norm": 0.1759401112794876, + "learning_rate": 1.6316001623809557e-05, + "loss": 1.1509, + "step": 8300 + }, + { + "epoch": 3.090505055205873, + "grad_norm": 0.17869412899017334, + "learning_rate": 1.6315059349376002e-05, + "loss": 1.1608, + "step": 8301 + }, + { + "epoch": 3.0908773603564823, + "grad_norm": 0.17332401871681213, + "learning_rate": 1.6314116981669418e-05, + "loss": 1.1478, + "step": 8302 + }, + { + "epoch": 3.091249665507091, + "grad_norm": 0.1675678789615631, + "learning_rate": 1.6313174520703727e-05, + "loss": 1.166, + "step": 8303 + }, + { + "epoch": 3.0916219706577004, + "grad_norm": 0.1689613312482834, + "learning_rate": 1.631223196649284e-05, + "loss": 1.1671, + "step": 8304 + }, + { + "epoch": 3.091994275808309, + "grad_norm": 0.16656659543514252, + "learning_rate": 1.631128931905068e-05, + "loss": 1.1671, + "step": 8305 + }, + { + "epoch": 3.0923665809589185, + "grad_norm": 0.16922014951705933, + "learning_rate": 1.631034657839118e-05, + "loss": 1.1645, + "step": 8306 + }, + { + "epoch": 3.0927388861095277, + "grad_norm": 0.169945627450943, + "learning_rate": 1.6309403744528254e-05, + "loss": 1.1565, + "step": 8307 + }, + { + "epoch": 3.0931111912601366, + "grad_norm": 0.1612582504749298, + "learning_rate": 1.630846081747583e-05, + "loss": 1.1604, + "step": 8308 + }, + { + "epoch": 3.093483496410746, + "grad_norm": 0.16429011523723602, + "learning_rate": 1.6307517797247836e-05, + "loss": 1.1472, + "step": 8309 + }, + { + "epoch": 3.0938558015613546, + "grad_norm": 0.1718926727771759, + "learning_rate": 1.63065746838582e-05, + "loss": 1.1568, + "step": 8310 + }, + { + "epoch": 3.094228106711964, + "grad_norm": 0.16393272578716278, + "learning_rate": 1.6305631477320853e-05, + "loss": 1.1553, + "step": 8311 + }, + { + "epoch": 3.0946004118625727, + "grad_norm": 0.166838601231575, + "learning_rate": 1.6304688177649725e-05, + "loss": 1.1584, + "step": 8312 + }, + { + "epoch": 3.094972717013182, + "grad_norm": 0.16786356270313263, + "learning_rate": 1.6303744784858745e-05, + "loss": 1.1563, + "step": 8313 + }, + { + "epoch": 3.095345022163791, + "grad_norm": 0.16941292583942413, + "learning_rate": 1.6302801298961853e-05, + "loss": 1.158, + "step": 8314 + }, + { + "epoch": 3.0957173273144, + "grad_norm": 0.16948345303535461, + "learning_rate": 1.6301857719972977e-05, + "loss": 1.1635, + "step": 8315 + }, + { + "epoch": 3.0960896324650093, + "grad_norm": 0.16545285284519196, + "learning_rate": 1.6300914047906063e-05, + "loss": 1.1388, + "step": 8316 + }, + { + "epoch": 3.096461937615618, + "grad_norm": 0.1651238054037094, + "learning_rate": 1.6299970282775046e-05, + "loss": 1.1709, + "step": 8317 + }, + { + "epoch": 3.0968342427662274, + "grad_norm": 0.16604574024677277, + "learning_rate": 1.6299026424593858e-05, + "loss": 1.1495, + "step": 8318 + }, + { + "epoch": 3.0972065479168363, + "grad_norm": 0.16288162767887115, + "learning_rate": 1.6298082473376444e-05, + "loss": 1.149, + "step": 8319 + }, + { + "epoch": 3.0975788530674455, + "grad_norm": 0.16391494870185852, + "learning_rate": 1.629713842913675e-05, + "loss": 1.1536, + "step": 8320 + }, + { + "epoch": 3.0979511582180543, + "grad_norm": 0.16908983886241913, + "learning_rate": 1.6296194291888718e-05, + "loss": 1.1626, + "step": 8321 + }, + { + "epoch": 3.0983234633686636, + "grad_norm": 0.16984610259532928, + "learning_rate": 1.6295250061646292e-05, + "loss": 1.1611, + "step": 8322 + }, + { + "epoch": 3.0986957685192724, + "grad_norm": 0.16279767453670502, + "learning_rate": 1.6294305738423413e-05, + "loss": 1.1429, + "step": 8323 + }, + { + "epoch": 3.0990680736698817, + "grad_norm": 0.16995452344417572, + "learning_rate": 1.6293361322234036e-05, + "loss": 1.154, + "step": 8324 + }, + { + "epoch": 3.099440378820491, + "grad_norm": 0.1628962606191635, + "learning_rate": 1.6292416813092107e-05, + "loss": 1.1576, + "step": 8325 + }, + { + "epoch": 3.0998126839710998, + "grad_norm": 0.16195490956306458, + "learning_rate": 1.6291472211011575e-05, + "loss": 1.1549, + "step": 8326 + }, + { + "epoch": 3.100184989121709, + "grad_norm": 0.16598109900951385, + "learning_rate": 1.6290527516006396e-05, + "loss": 1.1605, + "step": 8327 + }, + { + "epoch": 3.100557294272318, + "grad_norm": 0.16446641087532043, + "learning_rate": 1.628958272809052e-05, + "loss": 1.163, + "step": 8328 + }, + { + "epoch": 3.100929599422927, + "grad_norm": 0.1585415005683899, + "learning_rate": 1.62886378472779e-05, + "loss": 1.1575, + "step": 8329 + }, + { + "epoch": 3.101301904573536, + "grad_norm": 0.16187165677547455, + "learning_rate": 1.6287692873582495e-05, + "loss": 1.1461, + "step": 8330 + }, + { + "epoch": 3.101674209724145, + "grad_norm": 0.1652093529701233, + "learning_rate": 1.628674780701826e-05, + "loss": 1.1509, + "step": 8331 + }, + { + "epoch": 3.102046514874754, + "grad_norm": 0.16078272461891174, + "learning_rate": 1.6285802647599156e-05, + "loss": 1.1402, + "step": 8332 + }, + { + "epoch": 3.1024188200253633, + "grad_norm": 0.1659494936466217, + "learning_rate": 1.6284857395339143e-05, + "loss": 1.1555, + "step": 8333 + }, + { + "epoch": 3.1027911251759726, + "grad_norm": 0.16813765466213226, + "learning_rate": 1.6283912050252176e-05, + "loss": 1.1623, + "step": 8334 + }, + { + "epoch": 3.1031634303265814, + "grad_norm": 0.16813194751739502, + "learning_rate": 1.6282966612352224e-05, + "loss": 1.1508, + "step": 8335 + }, + { + "epoch": 3.1035357354771906, + "grad_norm": 0.16411879658699036, + "learning_rate": 1.628202108165325e-05, + "loss": 1.1743, + "step": 8336 + }, + { + "epoch": 3.1039080406277995, + "grad_norm": 0.17190620303153992, + "learning_rate": 1.628107545816922e-05, + "loss": 1.1595, + "step": 8337 + }, + { + "epoch": 3.1042803457784087, + "grad_norm": 0.16098536550998688, + "learning_rate": 1.6280129741914098e-05, + "loss": 1.1521, + "step": 8338 + }, + { + "epoch": 3.1046526509290175, + "grad_norm": 0.16388416290283203, + "learning_rate": 1.6279183932901853e-05, + "loss": 1.164, + "step": 8339 + }, + { + "epoch": 3.105024956079627, + "grad_norm": 0.16218237578868866, + "learning_rate": 1.627823803114646e-05, + "loss": 1.1579, + "step": 8340 + }, + { + "epoch": 3.1053972612302356, + "grad_norm": 0.16982564330101013, + "learning_rate": 1.627729203666188e-05, + "loss": 1.1598, + "step": 8341 + }, + { + "epoch": 3.105769566380845, + "grad_norm": 0.16258504986763, + "learning_rate": 1.627634594946209e-05, + "loss": 1.1694, + "step": 8342 + }, + { + "epoch": 3.106141871531454, + "grad_norm": 0.16323330998420715, + "learning_rate": 1.6275399769561068e-05, + "loss": 1.1582, + "step": 8343 + }, + { + "epoch": 3.106514176682063, + "grad_norm": 0.16780774295330048, + "learning_rate": 1.6274453496972783e-05, + "loss": 1.1605, + "step": 8344 + }, + { + "epoch": 3.1068864818326722, + "grad_norm": 0.1661432832479477, + "learning_rate": 1.6273507131711216e-05, + "loss": 1.1553, + "step": 8345 + }, + { + "epoch": 3.107258786983281, + "grad_norm": 0.16441033780574799, + "learning_rate": 1.627256067379034e-05, + "loss": 1.1619, + "step": 8346 + }, + { + "epoch": 3.1076310921338903, + "grad_norm": 0.16335833072662354, + "learning_rate": 1.6271614123224137e-05, + "loss": 1.151, + "step": 8347 + }, + { + "epoch": 3.108003397284499, + "grad_norm": 0.16474972665309906, + "learning_rate": 1.6270667480026588e-05, + "loss": 1.1616, + "step": 8348 + }, + { + "epoch": 3.1083757024351084, + "grad_norm": 0.17309875786304474, + "learning_rate": 1.6269720744211675e-05, + "loss": 1.1736, + "step": 8349 + }, + { + "epoch": 3.1087480075857172, + "grad_norm": 0.16838182508945465, + "learning_rate": 1.6268773915793376e-05, + "loss": 1.1496, + "step": 8350 + }, + { + "epoch": 3.1091203127363265, + "grad_norm": 0.1622922271490097, + "learning_rate": 1.6267826994785683e-05, + "loss": 1.1562, + "step": 8351 + }, + { + "epoch": 3.1094926178869358, + "grad_norm": 0.16335055232048035, + "learning_rate": 1.6266879981202577e-05, + "loss": 1.1565, + "step": 8352 + }, + { + "epoch": 3.1098649230375446, + "grad_norm": 0.16523607075214386, + "learning_rate": 1.626593287505805e-05, + "loss": 1.1497, + "step": 8353 + }, + { + "epoch": 3.110237228188154, + "grad_norm": 0.16475920379161835, + "learning_rate": 1.6264985676366085e-05, + "loss": 1.1481, + "step": 8354 + }, + { + "epoch": 3.1106095333387627, + "grad_norm": 0.16444498300552368, + "learning_rate": 1.6264038385140676e-05, + "loss": 1.1676, + "step": 8355 + }, + { + "epoch": 3.110981838489372, + "grad_norm": 0.16481232643127441, + "learning_rate": 1.6263091001395808e-05, + "loss": 1.1595, + "step": 8356 + }, + { + "epoch": 3.1113541436399808, + "grad_norm": 0.16143718361854553, + "learning_rate": 1.6262143525145485e-05, + "loss": 1.143, + "step": 8357 + }, + { + "epoch": 3.11172644879059, + "grad_norm": 0.16786842048168182, + "learning_rate": 1.6261195956403694e-05, + "loss": 1.159, + "step": 8358 + }, + { + "epoch": 3.112098753941199, + "grad_norm": 0.163263738155365, + "learning_rate": 1.626024829518443e-05, + "loss": 1.1516, + "step": 8359 + }, + { + "epoch": 3.112471059091808, + "grad_norm": 0.16883710026741028, + "learning_rate": 1.6259300541501694e-05, + "loss": 1.1651, + "step": 8360 + }, + { + "epoch": 3.1128433642424174, + "grad_norm": 0.16147491335868835, + "learning_rate": 1.6258352695369478e-05, + "loss": 1.1586, + "step": 8361 + }, + { + "epoch": 3.113215669393026, + "grad_norm": 0.15874330699443817, + "learning_rate": 1.625740475680179e-05, + "loss": 1.1513, + "step": 8362 + }, + { + "epoch": 3.1135879745436355, + "grad_norm": 0.16957154870033264, + "learning_rate": 1.6256456725812625e-05, + "loss": 1.1636, + "step": 8363 + }, + { + "epoch": 3.1139602796942443, + "grad_norm": 0.1629595011472702, + "learning_rate": 1.6255508602415987e-05, + "loss": 1.1588, + "step": 8364 + }, + { + "epoch": 3.1143325848448535, + "grad_norm": 0.1647290140390396, + "learning_rate": 1.6254560386625874e-05, + "loss": 1.1537, + "step": 8365 + }, + { + "epoch": 3.1147048899954624, + "grad_norm": 0.1659194529056549, + "learning_rate": 1.6253612078456304e-05, + "loss": 1.1602, + "step": 8366 + }, + { + "epoch": 3.1150771951460716, + "grad_norm": 0.16474999487400055, + "learning_rate": 1.625266367792127e-05, + "loss": 1.152, + "step": 8367 + }, + { + "epoch": 3.1154495002966804, + "grad_norm": 0.1607973873615265, + "learning_rate": 1.6251715185034795e-05, + "loss": 1.1555, + "step": 8368 + }, + { + "epoch": 3.1158218054472897, + "grad_norm": 0.16143812239170074, + "learning_rate": 1.625076659981087e-05, + "loss": 1.149, + "step": 8369 + }, + { + "epoch": 3.116194110597899, + "grad_norm": 0.16376273334026337, + "learning_rate": 1.6249817922263518e-05, + "loss": 1.1621, + "step": 8370 + }, + { + "epoch": 3.116566415748508, + "grad_norm": 0.1660854071378708, + "learning_rate": 1.6248869152406745e-05, + "loss": 1.1475, + "step": 8371 + }, + { + "epoch": 3.116938720899117, + "grad_norm": 0.1652403473854065, + "learning_rate": 1.624792029025457e-05, + "loss": 1.1496, + "step": 8372 + }, + { + "epoch": 3.117311026049726, + "grad_norm": 0.1686633676290512, + "learning_rate": 1.6246971335821004e-05, + "loss": 1.1589, + "step": 8373 + }, + { + "epoch": 3.117683331200335, + "grad_norm": 0.1678876131772995, + "learning_rate": 1.6246022289120063e-05, + "loss": 1.1533, + "step": 8374 + }, + { + "epoch": 3.118055636350944, + "grad_norm": 0.1635279506444931, + "learning_rate": 1.6245073150165766e-05, + "loss": 1.156, + "step": 8375 + }, + { + "epoch": 3.1184279415015532, + "grad_norm": 0.16207703948020935, + "learning_rate": 1.624412391897213e-05, + "loss": 1.1726, + "step": 8376 + }, + { + "epoch": 3.1188002466521625, + "grad_norm": 0.1617921143770218, + "learning_rate": 1.6243174595553174e-05, + "loss": 1.1607, + "step": 8377 + }, + { + "epoch": 3.1191725518027713, + "grad_norm": 0.16754458844661713, + "learning_rate": 1.624222517992292e-05, + "loss": 1.163, + "step": 8378 + }, + { + "epoch": 3.1195448569533806, + "grad_norm": 0.15687395632266998, + "learning_rate": 1.6241275672095397e-05, + "loss": 1.1651, + "step": 8379 + }, + { + "epoch": 3.1199171621039894, + "grad_norm": 0.1605893075466156, + "learning_rate": 1.6240326072084617e-05, + "loss": 1.1457, + "step": 8380 + }, + { + "epoch": 3.1202894672545987, + "grad_norm": 0.16542315483093262, + "learning_rate": 1.6239376379904618e-05, + "loss": 1.165, + "step": 8381 + }, + { + "epoch": 3.1206617724052075, + "grad_norm": 0.16830827295780182, + "learning_rate": 1.623842659556942e-05, + "loss": 1.1516, + "step": 8382 + }, + { + "epoch": 3.1210340775558167, + "grad_norm": 0.1646735519170761, + "learning_rate": 1.623747671909305e-05, + "loss": 1.1631, + "step": 8383 + }, + { + "epoch": 3.1214063827064256, + "grad_norm": 0.16516734659671783, + "learning_rate": 1.6236526750489542e-05, + "loss": 1.1675, + "step": 8384 + }, + { + "epoch": 3.121778687857035, + "grad_norm": 0.16348835825920105, + "learning_rate": 1.6235576689772927e-05, + "loss": 1.1596, + "step": 8385 + }, + { + "epoch": 3.122150993007644, + "grad_norm": 0.17139267921447754, + "learning_rate": 1.6234626536957235e-05, + "loss": 1.1434, + "step": 8386 + }, + { + "epoch": 3.122523298158253, + "grad_norm": 0.16735519468784332, + "learning_rate": 1.62336762920565e-05, + "loss": 1.1433, + "step": 8387 + }, + { + "epoch": 3.122895603308862, + "grad_norm": 0.16513751447200775, + "learning_rate": 1.6232725955084756e-05, + "loss": 1.1627, + "step": 8388 + }, + { + "epoch": 3.123267908459471, + "grad_norm": 0.16176214814186096, + "learning_rate": 1.6231775526056044e-05, + "loss": 1.1773, + "step": 8389 + }, + { + "epoch": 3.1236402136100803, + "grad_norm": 0.16931404173374176, + "learning_rate": 1.6230825004984395e-05, + "loss": 1.1647, + "step": 8390 + }, + { + "epoch": 3.124012518760689, + "grad_norm": 0.16403864324092865, + "learning_rate": 1.6229874391883856e-05, + "loss": 1.1753, + "step": 8391 + }, + { + "epoch": 3.1243848239112983, + "grad_norm": 0.16192291676998138, + "learning_rate": 1.6228923686768458e-05, + "loss": 1.1377, + "step": 8392 + }, + { + "epoch": 3.124757129061907, + "grad_norm": 0.16515354812145233, + "learning_rate": 1.622797288965225e-05, + "loss": 1.1599, + "step": 8393 + }, + { + "epoch": 3.1251294342125164, + "grad_norm": 0.1676434427499771, + "learning_rate": 1.6227022000549276e-05, + "loss": 1.163, + "step": 8394 + }, + { + "epoch": 3.1255017393631257, + "grad_norm": 0.16849081218242645, + "learning_rate": 1.6226071019473577e-05, + "loss": 1.1566, + "step": 8395 + }, + { + "epoch": 3.1258740445137345, + "grad_norm": 0.16453030705451965, + "learning_rate": 1.6225119946439196e-05, + "loss": 1.1531, + "step": 8396 + }, + { + "epoch": 3.126246349664344, + "grad_norm": 0.166966512799263, + "learning_rate": 1.622416878146019e-05, + "loss": 1.1641, + "step": 8397 + }, + { + "epoch": 3.1266186548149526, + "grad_norm": 0.1629679799079895, + "learning_rate": 1.6223217524550595e-05, + "loss": 1.16, + "step": 8398 + }, + { + "epoch": 3.126990959965562, + "grad_norm": 0.16159994900226593, + "learning_rate": 1.6222266175724472e-05, + "loss": 1.1775, + "step": 8399 + }, + { + "epoch": 3.1273632651161707, + "grad_norm": 0.16298353672027588, + "learning_rate": 1.6221314734995867e-05, + "loss": 1.1664, + "step": 8400 + }, + { + "epoch": 3.12773557026678, + "grad_norm": 0.16081856191158295, + "learning_rate": 1.622036320237883e-05, + "loss": 1.1433, + "step": 8401 + }, + { + "epoch": 3.1281078754173888, + "grad_norm": 0.17165522277355194, + "learning_rate": 1.6219411577887428e-05, + "loss": 1.1592, + "step": 8402 + }, + { + "epoch": 3.128480180567998, + "grad_norm": 0.16671797633171082, + "learning_rate": 1.62184598615357e-05, + "loss": 1.1508, + "step": 8403 + }, + { + "epoch": 3.1288524857186073, + "grad_norm": 0.16210219264030457, + "learning_rate": 1.6217508053337713e-05, + "loss": 1.1595, + "step": 8404 + }, + { + "epoch": 3.129224790869216, + "grad_norm": 0.17077292501926422, + "learning_rate": 1.6216556153307518e-05, + "loss": 1.1484, + "step": 8405 + }, + { + "epoch": 3.1295970960198254, + "grad_norm": 0.16143833100795746, + "learning_rate": 1.6215604161459183e-05, + "loss": 1.1785, + "step": 8406 + }, + { + "epoch": 3.129969401170434, + "grad_norm": 0.18213441967964172, + "learning_rate": 1.6214652077806764e-05, + "loss": 1.1635, + "step": 8407 + }, + { + "epoch": 3.1303417063210435, + "grad_norm": 0.16605854034423828, + "learning_rate": 1.6213699902364325e-05, + "loss": 1.1576, + "step": 8408 + }, + { + "epoch": 3.1307140114716523, + "grad_norm": 0.17813484370708466, + "learning_rate": 1.6212747635145928e-05, + "loss": 1.1615, + "step": 8409 + }, + { + "epoch": 3.1310863166222616, + "grad_norm": 0.16783683001995087, + "learning_rate": 1.6211795276165635e-05, + "loss": 1.1513, + "step": 8410 + }, + { + "epoch": 3.131458621772871, + "grad_norm": 0.1840137392282486, + "learning_rate": 1.621084282543752e-05, + "loss": 1.1596, + "step": 8411 + }, + { + "epoch": 3.1318309269234796, + "grad_norm": 0.1757200062274933, + "learning_rate": 1.6209890282975644e-05, + "loss": 1.1677, + "step": 8412 + }, + { + "epoch": 3.132203232074089, + "grad_norm": 0.17295728623867035, + "learning_rate": 1.6208937648794076e-05, + "loss": 1.1536, + "step": 8413 + }, + { + "epoch": 3.1325755372246977, + "grad_norm": 0.1669415831565857, + "learning_rate": 1.6207984922906893e-05, + "loss": 1.1625, + "step": 8414 + }, + { + "epoch": 3.132947842375307, + "grad_norm": 0.18558120727539062, + "learning_rate": 1.620703210532816e-05, + "loss": 1.1613, + "step": 8415 + }, + { + "epoch": 3.133320147525916, + "grad_norm": 0.16660122573375702, + "learning_rate": 1.6206079196071952e-05, + "loss": 1.1513, + "step": 8416 + }, + { + "epoch": 3.133692452676525, + "grad_norm": 0.17779971659183502, + "learning_rate": 1.6205126195152345e-05, + "loss": 1.1553, + "step": 8417 + }, + { + "epoch": 3.134064757827134, + "grad_norm": 0.17245428264141083, + "learning_rate": 1.620417310258341e-05, + "loss": 1.1591, + "step": 8418 + }, + { + "epoch": 3.134437062977743, + "grad_norm": 0.179290309548378, + "learning_rate": 1.620321991837923e-05, + "loss": 1.1569, + "step": 8419 + }, + { + "epoch": 3.1348093681283524, + "grad_norm": 0.16428595781326294, + "learning_rate": 1.6202266642553884e-05, + "loss": 1.1522, + "step": 8420 + }, + { + "epoch": 3.1351816732789612, + "grad_norm": 0.1769377738237381, + "learning_rate": 1.6201313275121447e-05, + "loss": 1.1554, + "step": 8421 + }, + { + "epoch": 3.1355539784295705, + "grad_norm": 0.16289122402668, + "learning_rate": 1.6200359816096e-05, + "loss": 1.1597, + "step": 8422 + }, + { + "epoch": 3.1359262835801793, + "grad_norm": 0.1719742715358734, + "learning_rate": 1.619940626549163e-05, + "loss": 1.1637, + "step": 8423 + }, + { + "epoch": 3.1362985887307886, + "grad_norm": 0.16820377111434937, + "learning_rate": 1.619845262332242e-05, + "loss": 1.1467, + "step": 8424 + }, + { + "epoch": 3.1366708938813974, + "grad_norm": 0.16198331117630005, + "learning_rate": 1.619749888960245e-05, + "loss": 1.1575, + "step": 8425 + }, + { + "epoch": 3.1370431990320067, + "grad_norm": 0.17046979069709778, + "learning_rate": 1.6196545064345813e-05, + "loss": 1.1583, + "step": 8426 + }, + { + "epoch": 3.1374155041826155, + "grad_norm": 0.15980608761310577, + "learning_rate": 1.619559114756659e-05, + "loss": 1.1496, + "step": 8427 + }, + { + "epoch": 3.1377878093332248, + "grad_norm": 0.1674344688653946, + "learning_rate": 1.619463713927888e-05, + "loss": 1.1611, + "step": 8428 + }, + { + "epoch": 3.138160114483834, + "grad_norm": 0.17162348330020905, + "learning_rate": 1.6193683039496768e-05, + "loss": 1.1675, + "step": 8429 + }, + { + "epoch": 3.138532419634443, + "grad_norm": 0.16896787285804749, + "learning_rate": 1.6192728848234343e-05, + "loss": 1.1662, + "step": 8430 + }, + { + "epoch": 3.138904724785052, + "grad_norm": 0.17117677628993988, + "learning_rate": 1.6191774565505703e-05, + "loss": 1.1653, + "step": 8431 + }, + { + "epoch": 3.139277029935661, + "grad_norm": 0.16469299793243408, + "learning_rate": 1.619082019132494e-05, + "loss": 1.1709, + "step": 8432 + }, + { + "epoch": 3.13964933508627, + "grad_norm": 0.18532411754131317, + "learning_rate": 1.618986572570615e-05, + "loss": 1.1446, + "step": 8433 + }, + { + "epoch": 3.140021640236879, + "grad_norm": 0.16900207102298737, + "learning_rate": 1.6188911168663433e-05, + "loss": 1.1662, + "step": 8434 + }, + { + "epoch": 3.1403939453874883, + "grad_norm": 0.17637377977371216, + "learning_rate": 1.6187956520210893e-05, + "loss": 1.1558, + "step": 8435 + }, + { + "epoch": 3.140766250538097, + "grad_norm": 0.16088640689849854, + "learning_rate": 1.6187001780362613e-05, + "loss": 1.1539, + "step": 8436 + }, + { + "epoch": 3.1411385556887064, + "grad_norm": 0.18611860275268555, + "learning_rate": 1.6186046949132713e-05, + "loss": 1.1575, + "step": 8437 + }, + { + "epoch": 3.1415108608393156, + "grad_norm": 0.17658965289592743, + "learning_rate": 1.6185092026535286e-05, + "loss": 1.1652, + "step": 8438 + }, + { + "epoch": 3.1418831659899245, + "grad_norm": 0.19135300815105438, + "learning_rate": 1.6184137012584434e-05, + "loss": 1.1432, + "step": 8439 + }, + { + "epoch": 3.1422554711405337, + "grad_norm": 0.16602368652820587, + "learning_rate": 1.618318190729427e-05, + "loss": 1.1599, + "step": 8440 + }, + { + "epoch": 3.1426277762911425, + "grad_norm": 0.20455126464366913, + "learning_rate": 1.6182226710678898e-05, + "loss": 1.1602, + "step": 8441 + }, + { + "epoch": 3.143000081441752, + "grad_norm": 0.18047231435775757, + "learning_rate": 1.6181271422752424e-05, + "loss": 1.1531, + "step": 8442 + }, + { + "epoch": 3.1433723865923606, + "grad_norm": 0.17739862203598022, + "learning_rate": 1.6180316043528957e-05, + "loss": 1.1533, + "step": 8443 + }, + { + "epoch": 3.14374469174297, + "grad_norm": 0.16755236685276031, + "learning_rate": 1.617936057302261e-05, + "loss": 1.1491, + "step": 8444 + }, + { + "epoch": 3.1441169968935787, + "grad_norm": 0.18111351132392883, + "learning_rate": 1.61784050112475e-05, + "loss": 1.156, + "step": 8445 + }, + { + "epoch": 3.144489302044188, + "grad_norm": 0.1638672947883606, + "learning_rate": 1.617744935821773e-05, + "loss": 1.1649, + "step": 8446 + }, + { + "epoch": 3.1448616071947972, + "grad_norm": 0.17401021718978882, + "learning_rate": 1.6176493613947425e-05, + "loss": 1.1624, + "step": 8447 + }, + { + "epoch": 3.145233912345406, + "grad_norm": 0.16585645079612732, + "learning_rate": 1.6175537778450694e-05, + "loss": 1.1537, + "step": 8448 + }, + { + "epoch": 3.1456062174960153, + "grad_norm": 0.18722714483737946, + "learning_rate": 1.6174581851741658e-05, + "loss": 1.1663, + "step": 8449 + }, + { + "epoch": 3.145978522646624, + "grad_norm": 0.16669632494449615, + "learning_rate": 1.6173625833834438e-05, + "loss": 1.143, + "step": 8450 + }, + { + "epoch": 3.1463508277972334, + "grad_norm": 0.1624676138162613, + "learning_rate": 1.617266972474315e-05, + "loss": 1.1485, + "step": 8451 + }, + { + "epoch": 3.1467231329478422, + "grad_norm": 0.16883505880832672, + "learning_rate": 1.617171352448192e-05, + "loss": 1.1538, + "step": 8452 + }, + { + "epoch": 3.1470954380984515, + "grad_norm": 0.16919681429862976, + "learning_rate": 1.6170757233064863e-05, + "loss": 1.1483, + "step": 8453 + }, + { + "epoch": 3.1474677432490603, + "grad_norm": 0.18415765464305878, + "learning_rate": 1.6169800850506113e-05, + "loss": 1.1692, + "step": 8454 + }, + { + "epoch": 3.1478400483996696, + "grad_norm": 0.16374878585338593, + "learning_rate": 1.616884437681979e-05, + "loss": 1.146, + "step": 8455 + }, + { + "epoch": 3.148212353550279, + "grad_norm": 0.16538135707378387, + "learning_rate": 1.6167887812020023e-05, + "loss": 1.1619, + "step": 8456 + }, + { + "epoch": 3.1485846587008877, + "grad_norm": 0.1735272854566574, + "learning_rate": 1.6166931156120943e-05, + "loss": 1.1598, + "step": 8457 + }, + { + "epoch": 3.148956963851497, + "grad_norm": 0.16497787833213806, + "learning_rate": 1.6165974409136673e-05, + "loss": 1.1705, + "step": 8458 + }, + { + "epoch": 3.1493292690021057, + "grad_norm": 0.1695416122674942, + "learning_rate": 1.6165017571081348e-05, + "loss": 1.1711, + "step": 8459 + }, + { + "epoch": 3.149701574152715, + "grad_norm": 0.1687697172164917, + "learning_rate": 1.6164060641969104e-05, + "loss": 1.1484, + "step": 8460 + }, + { + "epoch": 3.150073879303324, + "grad_norm": 0.16305190324783325, + "learning_rate": 1.6163103621814065e-05, + "loss": 1.1711, + "step": 8461 + }, + { + "epoch": 3.150446184453933, + "grad_norm": 0.16032063961029053, + "learning_rate": 1.616214651063038e-05, + "loss": 1.1599, + "step": 8462 + }, + { + "epoch": 3.150818489604542, + "grad_norm": 0.17928291857242584, + "learning_rate": 1.6161189308432174e-05, + "loss": 1.1642, + "step": 8463 + }, + { + "epoch": 3.151190794755151, + "grad_norm": 0.170723557472229, + "learning_rate": 1.616023201523359e-05, + "loss": 1.1557, + "step": 8464 + }, + { + "epoch": 3.1515630999057604, + "grad_norm": 0.17493624985218048, + "learning_rate": 1.6159274631048763e-05, + "loss": 1.137, + "step": 8465 + }, + { + "epoch": 3.1519354050563693, + "grad_norm": 0.19886091351509094, + "learning_rate": 1.6158317155891837e-05, + "loss": 1.1471, + "step": 8466 + }, + { + "epoch": 3.1523077102069785, + "grad_norm": 0.1778782457113266, + "learning_rate": 1.6157359589776952e-05, + "loss": 1.1636, + "step": 8467 + }, + { + "epoch": 3.1526800153575874, + "grad_norm": 0.16123712062835693, + "learning_rate": 1.6156401932718258e-05, + "loss": 1.1593, + "step": 8468 + }, + { + "epoch": 3.1530523205081966, + "grad_norm": 0.17915739119052887, + "learning_rate": 1.6155444184729888e-05, + "loss": 1.1455, + "step": 8469 + }, + { + "epoch": 3.1534246256588054, + "grad_norm": 0.19462330639362335, + "learning_rate": 1.6154486345825996e-05, + "loss": 1.1657, + "step": 8470 + }, + { + "epoch": 3.1537969308094147, + "grad_norm": 0.1797885149717331, + "learning_rate": 1.6153528416020724e-05, + "loss": 1.1494, + "step": 8471 + }, + { + "epoch": 3.1541692359600235, + "grad_norm": 0.16456390917301178, + "learning_rate": 1.6152570395328227e-05, + "loss": 1.1628, + "step": 8472 + }, + { + "epoch": 3.154541541110633, + "grad_norm": 0.16568942368030548, + "learning_rate": 1.6151612283762653e-05, + "loss": 1.162, + "step": 8473 + }, + { + "epoch": 3.154913846261242, + "grad_norm": 0.16583888232707977, + "learning_rate": 1.6150654081338143e-05, + "loss": 1.1659, + "step": 8474 + }, + { + "epoch": 3.155286151411851, + "grad_norm": 0.15852290391921997, + "learning_rate": 1.6149695788068868e-05, + "loss": 1.1591, + "step": 8475 + }, + { + "epoch": 3.15565845656246, + "grad_norm": 0.16142380237579346, + "learning_rate": 1.6148737403968967e-05, + "loss": 1.1588, + "step": 8476 + }, + { + "epoch": 3.156030761713069, + "grad_norm": 0.17836064100265503, + "learning_rate": 1.6147778929052602e-05, + "loss": 1.1708, + "step": 8477 + }, + { + "epoch": 3.156403066863678, + "grad_norm": 0.1994626820087433, + "learning_rate": 1.614682036333393e-05, + "loss": 1.153, + "step": 8478 + }, + { + "epoch": 3.156775372014287, + "grad_norm": 0.21441636979579926, + "learning_rate": 1.6145861706827104e-05, + "loss": 1.1498, + "step": 8479 + }, + { + "epoch": 3.1571476771648963, + "grad_norm": 0.19782859086990356, + "learning_rate": 1.6144902959546286e-05, + "loss": 1.1527, + "step": 8480 + }, + { + "epoch": 3.157519982315505, + "grad_norm": 0.16160625219345093, + "learning_rate": 1.614394412150564e-05, + "loss": 1.1526, + "step": 8481 + }, + { + "epoch": 3.1578922874661144, + "grad_norm": 0.20167799293994904, + "learning_rate": 1.614298519271932e-05, + "loss": 1.155, + "step": 8482 + }, + { + "epoch": 3.1582645926167237, + "grad_norm": 0.21661166846752167, + "learning_rate": 1.61420261732015e-05, + "loss": 1.1462, + "step": 8483 + }, + { + "epoch": 3.1586368977673325, + "grad_norm": 0.16825565695762634, + "learning_rate": 1.6141067062966332e-05, + "loss": 1.1623, + "step": 8484 + }, + { + "epoch": 3.1590092029179417, + "grad_norm": 0.2019083946943283, + "learning_rate": 1.6140107862027993e-05, + "loss": 1.1671, + "step": 8485 + }, + { + "epoch": 3.1593815080685506, + "grad_norm": 0.17671798169612885, + "learning_rate": 1.6139148570400647e-05, + "loss": 1.1589, + "step": 8486 + }, + { + "epoch": 3.15975381321916, + "grad_norm": 0.17492499947547913, + "learning_rate": 1.6138189188098463e-05, + "loss": 1.1591, + "step": 8487 + }, + { + "epoch": 3.1601261183697686, + "grad_norm": 0.16377635300159454, + "learning_rate": 1.6137229715135604e-05, + "loss": 1.1596, + "step": 8488 + }, + { + "epoch": 3.160498423520378, + "grad_norm": 0.17293529212474823, + "learning_rate": 1.6136270151526254e-05, + "loss": 1.1644, + "step": 8489 + }, + { + "epoch": 3.1608707286709867, + "grad_norm": 0.15997327864170074, + "learning_rate": 1.6135310497284575e-05, + "loss": 1.141, + "step": 8490 + }, + { + "epoch": 3.161243033821596, + "grad_norm": 0.16692161560058594, + "learning_rate": 1.6134350752424746e-05, + "loss": 1.1528, + "step": 8491 + }, + { + "epoch": 3.1616153389722053, + "grad_norm": 0.17296627163887024, + "learning_rate": 1.613339091696094e-05, + "loss": 1.1472, + "step": 8492 + }, + { + "epoch": 3.161987644122814, + "grad_norm": 0.1687869429588318, + "learning_rate": 1.6132430990907338e-05, + "loss": 1.1503, + "step": 8493 + }, + { + "epoch": 3.1623599492734233, + "grad_norm": 0.18529605865478516, + "learning_rate": 1.6131470974278114e-05, + "loss": 1.1723, + "step": 8494 + }, + { + "epoch": 3.162732254424032, + "grad_norm": 0.1638706773519516, + "learning_rate": 1.6130510867087447e-05, + "loss": 1.1436, + "step": 8495 + }, + { + "epoch": 3.1631045595746414, + "grad_norm": 0.18358993530273438, + "learning_rate": 1.612955066934952e-05, + "loss": 1.1605, + "step": 8496 + }, + { + "epoch": 3.1634768647252502, + "grad_norm": 0.16975806653499603, + "learning_rate": 1.6128590381078516e-05, + "loss": 1.1495, + "step": 8497 + }, + { + "epoch": 3.1638491698758595, + "grad_norm": 0.1780412644147873, + "learning_rate": 1.6127630002288615e-05, + "loss": 1.1628, + "step": 8498 + }, + { + "epoch": 3.1642214750264688, + "grad_norm": 0.17131397128105164, + "learning_rate": 1.6126669532994003e-05, + "loss": 1.1578, + "step": 8499 + }, + { + "epoch": 3.1645937801770776, + "grad_norm": 0.17910616099834442, + "learning_rate": 1.6125708973208868e-05, + "loss": 1.1559, + "step": 8500 + }, + { + "epoch": 3.1645937801770776, + "eval_loss": 1.297155499458313, + "eval_runtime": 16.9178, + "eval_samples_per_second": 102.496, + "eval_steps_per_second": 5.143, + "step": 8500 + }, + { + "epoch": 3.1645937801770776, + "step": 8500, + "total_flos": 1.2178098370248075e+20, + "train_loss": 1.2383465781352099, + "train_runtime": 245543.4426, + "train_samples_per_second": 70.008, + "train_steps_per_second": 0.109 + } + ], + "logging_steps": 1, + "max_steps": 26850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2178098370248075e+20, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}