diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8721 @@ +{ + "best_metric": 0.8285140562248996, + "best_model_checkpoint": "/scratch/camembertv2/runs/results/xnli/camembertv2-base-bf16-p2-17000/max_seq_length-160-gradient_accumulation_steps-4-precision-fp32-learning_rate-1e-05-epochs-10-lr_scheduler-cosine-warmup_steps-0.1/SEED-666/checkpoint-61360", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 122720, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008148631029986962, + "grad_norm": 12.58836841583252, + "learning_rate": 8.148631029986963e-08, + "loss": 1.1012, + "step": 100 + }, + { + "epoch": 0.016297262059973925, + "grad_norm": 1.359410285949707, + "learning_rate": 1.6297262059973925e-07, + "loss": 1.1011, + "step": 200 + }, + { + "epoch": 0.024445893089960886, + "grad_norm": 1.128892183303833, + "learning_rate": 2.4445893089960885e-07, + "loss": 1.0978, + "step": 300 + }, + { + "epoch": 0.03259452411994785, + "grad_norm": 1.3794234991073608, + "learning_rate": 3.259452411994785e-07, + "loss": 1.0999, + "step": 400 + }, + { + "epoch": 0.04074315514993481, + "grad_norm": 1.3247599601745605, + "learning_rate": 4.0743155149934816e-07, + "loss": 1.0984, + "step": 500 + }, + { + "epoch": 0.04889178617992177, + "grad_norm": 0.9611015319824219, + "learning_rate": 4.889178617992177e-07, + "loss": 1.1001, + "step": 600 + }, + { + "epoch": 0.05704041720990873, + "grad_norm": 0.9682479500770569, + "learning_rate": 5.704041720990874e-07, + "loss": 1.0985, + "step": 700 + }, + { + "epoch": 0.0651890482398957, + "grad_norm": 1.950333833694458, + "learning_rate": 6.51890482398957e-07, + "loss": 1.0989, + "step": 800 + }, + { + "epoch": 0.07333767926988266, + "grad_norm": 1.4916733503341675, + "learning_rate": 7.333767926988267e-07, + "loss": 1.0964, + "step": 900 + }, + { + "epoch": 0.08148631029986962, + "grad_norm": 1.1135200262069702, + "learning_rate": 8.148631029986963e-07, + "loss": 1.096, + "step": 1000 + }, + { + "epoch": 0.08963494132985658, + "grad_norm": 1.773497462272644, + "learning_rate": 8.963494132985659e-07, + "loss": 1.094, + "step": 1100 + }, + { + "epoch": 0.09778357235984354, + "grad_norm": 1.5511926412582397, + "learning_rate": 9.778357235984354e-07, + "loss": 1.093, + "step": 1200 + }, + { + "epoch": 0.1059322033898305, + "grad_norm": 1.389298915863037, + "learning_rate": 1.059322033898305e-06, + "loss": 1.0871, + "step": 1300 + }, + { + "epoch": 0.11408083441981746, + "grad_norm": 2.486689329147339, + "learning_rate": 1.1408083441981747e-06, + "loss": 1.0751, + "step": 1400 + }, + { + "epoch": 0.12222946544980444, + "grad_norm": 2.697650194168091, + "learning_rate": 1.2222946544980446e-06, + "loss": 1.0505, + "step": 1500 + }, + { + "epoch": 0.1303780964797914, + "grad_norm": 3.557525157928467, + "learning_rate": 1.303780964797914e-06, + "loss": 1.0393, + "step": 1600 + }, + { + "epoch": 0.13852672750977835, + "grad_norm": 4.691379070281982, + "learning_rate": 1.3852672750977837e-06, + "loss": 1.0147, + "step": 1700 + }, + { + "epoch": 0.14667535853976532, + "grad_norm": 5.234630107879639, + "learning_rate": 1.4667535853976533e-06, + "loss": 0.9971, + "step": 1800 + }, + { + "epoch": 0.15482398956975227, + "grad_norm": 6.027713298797607, + "learning_rate": 1.5482398956975228e-06, + "loss": 1.0007, + "step": 1900 + }, + { + "epoch": 0.16297262059973924, + "grad_norm": 13.33498477935791, + "learning_rate": 1.6297262059973926e-06, + "loss": 0.984, + "step": 2000 + }, + { + "epoch": 0.17112125162972622, + "grad_norm": 9.432430267333984, + "learning_rate": 1.7112125162972623e-06, + "loss": 0.9633, + "step": 2100 + }, + { + "epoch": 0.17926988265971316, + "grad_norm": 7.303864479064941, + "learning_rate": 1.7926988265971317e-06, + "loss": 0.9463, + "step": 2200 + }, + { + "epoch": 0.18741851368970014, + "grad_norm": 6.125274181365967, + "learning_rate": 1.8741851368970016e-06, + "loss": 0.9336, + "step": 2300 + }, + { + "epoch": 0.19556714471968709, + "grad_norm": 6.614850044250488, + "learning_rate": 1.955671447196871e-06, + "loss": 0.9388, + "step": 2400 + }, + { + "epoch": 0.20371577574967406, + "grad_norm": 7.883510589599609, + "learning_rate": 2.037157757496741e-06, + "loss": 0.9122, + "step": 2500 + }, + { + "epoch": 0.211864406779661, + "grad_norm": 6.615538597106934, + "learning_rate": 2.11864406779661e-06, + "loss": 0.8907, + "step": 2600 + }, + { + "epoch": 0.22001303780964798, + "grad_norm": 6.040781021118164, + "learning_rate": 2.20013037809648e-06, + "loss": 0.8725, + "step": 2700 + }, + { + "epoch": 0.22816166883963493, + "grad_norm": 9.688776016235352, + "learning_rate": 2.2816166883963494e-06, + "loss": 0.8674, + "step": 2800 + }, + { + "epoch": 0.2363102998696219, + "grad_norm": 15.747467994689941, + "learning_rate": 2.363102998696219e-06, + "loss": 0.8199, + "step": 2900 + }, + { + "epoch": 0.24445893089960888, + "grad_norm": 9.381732940673828, + "learning_rate": 2.444589308996089e-06, + "loss": 0.831, + "step": 3000 + }, + { + "epoch": 0.2526075619295958, + "grad_norm": 8.603889465332031, + "learning_rate": 2.5260756192959584e-06, + "loss": 0.811, + "step": 3100 + }, + { + "epoch": 0.2607561929595828, + "grad_norm": 11.614546775817871, + "learning_rate": 2.607561929595828e-06, + "loss": 0.789, + "step": 3200 + }, + { + "epoch": 0.2689048239895698, + "grad_norm": 7.733945846557617, + "learning_rate": 2.689048239895698e-06, + "loss": 0.7947, + "step": 3300 + }, + { + "epoch": 0.2770534550195567, + "grad_norm": 14.573506355285645, + "learning_rate": 2.7705345501955674e-06, + "loss": 0.7913, + "step": 3400 + }, + { + "epoch": 0.28520208604954367, + "grad_norm": 11.938140869140625, + "learning_rate": 2.852020860495437e-06, + "loss": 0.793, + "step": 3500 + }, + { + "epoch": 0.29335071707953064, + "grad_norm": 9.235187530517578, + "learning_rate": 2.9335071707953067e-06, + "loss": 0.7538, + "step": 3600 + }, + { + "epoch": 0.3014993481095176, + "grad_norm": 9.092159271240234, + "learning_rate": 3.0149934810951763e-06, + "loss": 0.7547, + "step": 3700 + }, + { + "epoch": 0.30964797913950454, + "grad_norm": 11.72921371459961, + "learning_rate": 3.0964797913950456e-06, + "loss": 0.7461, + "step": 3800 + }, + { + "epoch": 0.3177966101694915, + "grad_norm": 15.118708610534668, + "learning_rate": 3.1779661016949152e-06, + "loss": 0.7171, + "step": 3900 + }, + { + "epoch": 0.3259452411994785, + "grad_norm": 17.719839096069336, + "learning_rate": 3.2594524119947853e-06, + "loss": 0.7027, + "step": 4000 + }, + { + "epoch": 0.33409387222946546, + "grad_norm": 10.063789367675781, + "learning_rate": 3.340938722294655e-06, + "loss": 0.7229, + "step": 4100 + }, + { + "epoch": 0.34224250325945244, + "grad_norm": 8.052227020263672, + "learning_rate": 3.4224250325945246e-06, + "loss": 0.7218, + "step": 4200 + }, + { + "epoch": 0.35039113428943935, + "grad_norm": 9.68342399597168, + "learning_rate": 3.503911342894394e-06, + "loss": 0.6873, + "step": 4300 + }, + { + "epoch": 0.35853976531942633, + "grad_norm": 9.140670776367188, + "learning_rate": 3.5853976531942635e-06, + "loss": 0.702, + "step": 4400 + }, + { + "epoch": 0.3666883963494133, + "grad_norm": 8.805059432983398, + "learning_rate": 3.666883963494133e-06, + "loss": 0.7245, + "step": 4500 + }, + { + "epoch": 0.3748370273794003, + "grad_norm": 7.228201389312744, + "learning_rate": 3.748370273794003e-06, + "loss": 0.6651, + "step": 4600 + }, + { + "epoch": 0.3829856584093872, + "grad_norm": 8.284133911132812, + "learning_rate": 3.829856584093872e-06, + "loss": 0.6956, + "step": 4700 + }, + { + "epoch": 0.39113428943937417, + "grad_norm": 8.938249588012695, + "learning_rate": 3.911342894393742e-06, + "loss": 0.6777, + "step": 4800 + }, + { + "epoch": 0.39928292046936115, + "grad_norm": 10.810254096984863, + "learning_rate": 3.992829204693612e-06, + "loss": 0.6803, + "step": 4900 + }, + { + "epoch": 0.4074315514993481, + "grad_norm": 11.629922866821289, + "learning_rate": 4.074315514993482e-06, + "loss": 0.6659, + "step": 5000 + }, + { + "epoch": 0.4155801825293351, + "grad_norm": 7.82265043258667, + "learning_rate": 4.1558018252933515e-06, + "loss": 0.6842, + "step": 5100 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 9.290712356567383, + "learning_rate": 4.23728813559322e-06, + "loss": 0.6711, + "step": 5200 + }, + { + "epoch": 0.431877444589309, + "grad_norm": 10.643411636352539, + "learning_rate": 4.31877444589309e-06, + "loss": 0.6521, + "step": 5300 + }, + { + "epoch": 0.44002607561929596, + "grad_norm": 8.533503532409668, + "learning_rate": 4.40026075619296e-06, + "loss": 0.6613, + "step": 5400 + }, + { + "epoch": 0.44817470664928294, + "grad_norm": 12.260805130004883, + "learning_rate": 4.48174706649283e-06, + "loss": 0.6512, + "step": 5500 + }, + { + "epoch": 0.45632333767926986, + "grad_norm": 7.977556228637695, + "learning_rate": 4.563233376792699e-06, + "loss": 0.6499, + "step": 5600 + }, + { + "epoch": 0.46447196870925683, + "grad_norm": 7.418649673461914, + "learning_rate": 4.6447196870925686e-06, + "loss": 0.6591, + "step": 5700 + }, + { + "epoch": 0.4726205997392438, + "grad_norm": 10.594202995300293, + "learning_rate": 4.726205997392438e-06, + "loss": 0.6497, + "step": 5800 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 11.133523941040039, + "learning_rate": 4.807692307692308e-06, + "loss": 0.6538, + "step": 5900 + }, + { + "epoch": 0.48891786179921776, + "grad_norm": 12.108560562133789, + "learning_rate": 4.889178617992178e-06, + "loss": 0.6195, + "step": 6000 + }, + { + "epoch": 0.4970664928292047, + "grad_norm": 9.70545482635498, + "learning_rate": 4.970664928292047e-06, + "loss": 0.6351, + "step": 6100 + }, + { + "epoch": 0.5052151238591917, + "grad_norm": 12.699902534484863, + "learning_rate": 5.052151238591917e-06, + "loss": 0.6557, + "step": 6200 + }, + { + "epoch": 0.5133637548891786, + "grad_norm": 10.324420928955078, + "learning_rate": 5.1336375488917865e-06, + "loss": 0.6415, + "step": 6300 + }, + { + "epoch": 0.5215123859191656, + "grad_norm": 10.3858642578125, + "learning_rate": 5.215123859191656e-06, + "loss": 0.624, + "step": 6400 + }, + { + "epoch": 0.5296610169491526, + "grad_norm": 13.573092460632324, + "learning_rate": 5.296610169491526e-06, + "loss": 0.6622, + "step": 6500 + }, + { + "epoch": 0.5378096479791395, + "grad_norm": 8.366503715515137, + "learning_rate": 5.378096479791396e-06, + "loss": 0.6166, + "step": 6600 + }, + { + "epoch": 0.5459582790091264, + "grad_norm": 6.413454532623291, + "learning_rate": 5.459582790091264e-06, + "loss": 0.6315, + "step": 6700 + }, + { + "epoch": 0.5541069100391134, + "grad_norm": 7.670026779174805, + "learning_rate": 5.541069100391135e-06, + "loss": 0.612, + "step": 6800 + }, + { + "epoch": 0.5622555410691004, + "grad_norm": 10.53145694732666, + "learning_rate": 5.622555410691004e-06, + "loss": 0.6167, + "step": 6900 + }, + { + "epoch": 0.5704041720990873, + "grad_norm": 6.5404462814331055, + "learning_rate": 5.704041720990874e-06, + "loss": 0.6226, + "step": 7000 + }, + { + "epoch": 0.5785528031290743, + "grad_norm": 9.084834098815918, + "learning_rate": 5.785528031290744e-06, + "loss": 0.6214, + "step": 7100 + }, + { + "epoch": 0.5867014341590613, + "grad_norm": 9.231087684631348, + "learning_rate": 5.867014341590613e-06, + "loss": 0.6245, + "step": 7200 + }, + { + "epoch": 0.5948500651890483, + "grad_norm": 8.526376724243164, + "learning_rate": 5.948500651890483e-06, + "loss": 0.6205, + "step": 7300 + }, + { + "epoch": 0.6029986962190352, + "grad_norm": 9.337794303894043, + "learning_rate": 6.029986962190353e-06, + "loss": 0.6156, + "step": 7400 + }, + { + "epoch": 0.6111473272490222, + "grad_norm": 8.846671104431152, + "learning_rate": 6.111473272490222e-06, + "loss": 0.6142, + "step": 7500 + }, + { + "epoch": 0.6192959582790091, + "grad_norm": 8.68179988861084, + "learning_rate": 6.192959582790091e-06, + "loss": 0.6218, + "step": 7600 + }, + { + "epoch": 0.627444589308996, + "grad_norm": 9.76940631866455, + "learning_rate": 6.274445893089961e-06, + "loss": 0.587, + "step": 7700 + }, + { + "epoch": 0.635593220338983, + "grad_norm": 7.811220169067383, + "learning_rate": 6.3559322033898304e-06, + "loss": 0.6002, + "step": 7800 + }, + { + "epoch": 0.64374185136897, + "grad_norm": 8.950928688049316, + "learning_rate": 6.4374185136897e-06, + "loss": 0.6032, + "step": 7900 + }, + { + "epoch": 0.651890482398957, + "grad_norm": 6.704097270965576, + "learning_rate": 6.518904823989571e-06, + "loss": 0.5993, + "step": 8000 + }, + { + "epoch": 0.660039113428944, + "grad_norm": 11.18411922454834, + "learning_rate": 6.60039113428944e-06, + "loss": 0.6035, + "step": 8100 + }, + { + "epoch": 0.6681877444589309, + "grad_norm": 8.417338371276855, + "learning_rate": 6.68187744458931e-06, + "loss": 0.624, + "step": 8200 + }, + { + "epoch": 0.6763363754889179, + "grad_norm": 9.916496276855469, + "learning_rate": 6.7633637548891795e-06, + "loss": 0.6275, + "step": 8300 + }, + { + "epoch": 0.6844850065189049, + "grad_norm": 8.701171875, + "learning_rate": 6.844850065189049e-06, + "loss": 0.5773, + "step": 8400 + }, + { + "epoch": 0.6926336375488917, + "grad_norm": 10.245955467224121, + "learning_rate": 6.926336375488918e-06, + "loss": 0.6139, + "step": 8500 + }, + { + "epoch": 0.7007822685788787, + "grad_norm": 6.190640926361084, + "learning_rate": 7.007822685788788e-06, + "loss": 0.5833, + "step": 8600 + }, + { + "epoch": 0.7089308996088657, + "grad_norm": 10.875850677490234, + "learning_rate": 7.089308996088657e-06, + "loss": 0.6, + "step": 8700 + }, + { + "epoch": 0.7170795306388527, + "grad_norm": 8.644452095031738, + "learning_rate": 7.170795306388527e-06, + "loss": 0.6097, + "step": 8800 + }, + { + "epoch": 0.7252281616688396, + "grad_norm": 8.089356422424316, + "learning_rate": 7.252281616688397e-06, + "loss": 0.583, + "step": 8900 + }, + { + "epoch": 0.7333767926988266, + "grad_norm": 12.513883590698242, + "learning_rate": 7.333767926988266e-06, + "loss": 0.5669, + "step": 9000 + }, + { + "epoch": 0.7415254237288136, + "grad_norm": 9.404706001281738, + "learning_rate": 7.415254237288137e-06, + "loss": 0.5833, + "step": 9100 + }, + { + "epoch": 0.7496740547588006, + "grad_norm": 6.789037227630615, + "learning_rate": 7.496740547588006e-06, + "loss": 0.5985, + "step": 9200 + }, + { + "epoch": 0.7578226857887875, + "grad_norm": 7.355409145355225, + "learning_rate": 7.578226857887876e-06, + "loss": 0.5686, + "step": 9300 + }, + { + "epoch": 0.7659713168187744, + "grad_norm": 7.175694465637207, + "learning_rate": 7.659713168187744e-06, + "loss": 0.5991, + "step": 9400 + }, + { + "epoch": 0.7741199478487614, + "grad_norm": 6.2600274085998535, + "learning_rate": 7.741199478487615e-06, + "loss": 0.5803, + "step": 9500 + }, + { + "epoch": 0.7822685788787483, + "grad_norm": 11.514883995056152, + "learning_rate": 7.822685788787483e-06, + "loss": 0.5802, + "step": 9600 + }, + { + "epoch": 0.7904172099087353, + "grad_norm": 6.594653129577637, + "learning_rate": 7.904172099087354e-06, + "loss": 0.5772, + "step": 9700 + }, + { + "epoch": 0.7985658409387223, + "grad_norm": 10.59202766418457, + "learning_rate": 7.985658409387224e-06, + "loss": 0.5848, + "step": 9800 + }, + { + "epoch": 0.8067144719687093, + "grad_norm": 7.8735151290893555, + "learning_rate": 8.067144719687093e-06, + "loss": 0.5813, + "step": 9900 + }, + { + "epoch": 0.8148631029986962, + "grad_norm": 9.064979553222656, + "learning_rate": 8.148631029986964e-06, + "loss": 0.5792, + "step": 10000 + }, + { + "epoch": 0.8230117340286832, + "grad_norm": 10.0288667678833, + "learning_rate": 8.230117340286832e-06, + "loss": 0.5622, + "step": 10100 + }, + { + "epoch": 0.8311603650586702, + "grad_norm": 8.7724609375, + "learning_rate": 8.311603650586703e-06, + "loss": 0.5767, + "step": 10200 + }, + { + "epoch": 0.8393089960886571, + "grad_norm": 8.127886772155762, + "learning_rate": 8.393089960886572e-06, + "loss": 0.5721, + "step": 10300 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 7.77069616317749, + "learning_rate": 8.47457627118644e-06, + "loss": 0.5925, + "step": 10400 + }, + { + "epoch": 0.855606258148631, + "grad_norm": 7.864415645599365, + "learning_rate": 8.556062581486311e-06, + "loss": 0.5805, + "step": 10500 + }, + { + "epoch": 0.863754889178618, + "grad_norm": 7.0319952964782715, + "learning_rate": 8.63754889178618e-06, + "loss": 0.577, + "step": 10600 + }, + { + "epoch": 0.871903520208605, + "grad_norm": 7.513912677764893, + "learning_rate": 8.71903520208605e-06, + "loss": 0.5978, + "step": 10700 + }, + { + "epoch": 0.8800521512385919, + "grad_norm": 8.28197193145752, + "learning_rate": 8.80052151238592e-06, + "loss": 0.5912, + "step": 10800 + }, + { + "epoch": 0.8882007822685789, + "grad_norm": 7.632150650024414, + "learning_rate": 8.88200782268579e-06, + "loss": 0.5706, + "step": 10900 + }, + { + "epoch": 0.8963494132985659, + "grad_norm": 7.691524028778076, + "learning_rate": 8.96349413298566e-06, + "loss": 0.5612, + "step": 11000 + }, + { + "epoch": 0.9044980443285529, + "grad_norm": 8.549062728881836, + "learning_rate": 9.044980443285529e-06, + "loss": 0.5494, + "step": 11100 + }, + { + "epoch": 0.9126466753585397, + "grad_norm": 10.64492416381836, + "learning_rate": 9.126466753585398e-06, + "loss": 0.5629, + "step": 11200 + }, + { + "epoch": 0.9207953063885267, + "grad_norm": 7.610856056213379, + "learning_rate": 9.207953063885268e-06, + "loss": 0.5627, + "step": 11300 + }, + { + "epoch": 0.9289439374185137, + "grad_norm": 10.41044807434082, + "learning_rate": 9.289439374185137e-06, + "loss": 0.5756, + "step": 11400 + }, + { + "epoch": 0.9370925684485006, + "grad_norm": 6.464520454406738, + "learning_rate": 9.370925684485008e-06, + "loss": 0.5817, + "step": 11500 + }, + { + "epoch": 0.9452411994784876, + "grad_norm": 12.031845092773438, + "learning_rate": 9.452411994784876e-06, + "loss": 0.5761, + "step": 11600 + }, + { + "epoch": 0.9533898305084746, + "grad_norm": 8.345417022705078, + "learning_rate": 9.533898305084747e-06, + "loss": 0.5789, + "step": 11700 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 8.58055305480957, + "learning_rate": 9.615384615384616e-06, + "loss": 0.5745, + "step": 11800 + }, + { + "epoch": 0.9696870925684485, + "grad_norm": 5.948461532592773, + "learning_rate": 9.696870925684486e-06, + "loss": 0.5695, + "step": 11900 + }, + { + "epoch": 0.9778357235984355, + "grad_norm": 8.523883819580078, + "learning_rate": 9.778357235984357e-06, + "loss": 0.575, + "step": 12000 + }, + { + "epoch": 0.9859843546284224, + "grad_norm": 8.530996322631836, + "learning_rate": 9.859843546284224e-06, + "loss": 0.5496, + "step": 12100 + }, + { + "epoch": 0.9941329856584094, + "grad_norm": 8.197943687438965, + "learning_rate": 9.941329856584094e-06, + "loss": 0.5929, + "step": 12200 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8028112449799196, + "eval_loss": 0.5100582242012024, + "eval_runtime": 7.5718, + "eval_samples_per_second": 328.853, + "eval_steps_per_second": 41.206, + "step": 12272 + }, + { + "epoch": 1.0022816166883963, + "grad_norm": 8.692102432250977, + "learning_rate": 9.999998414230423e-06, + "loss": 0.5456, + "step": 12300 + }, + { + "epoch": 1.0104302477183833, + "grad_norm": 6.663279056549072, + "learning_rate": 9.999966860686959e-06, + "loss": 0.546, + "step": 12400 + }, + { + "epoch": 1.0185788787483703, + "grad_norm": 7.53484582901001, + "learning_rate": 9.999894854131206e-06, + "loss": 0.5182, + "step": 12500 + }, + { + "epoch": 1.0267275097783573, + "grad_norm": 6.181861877441406, + "learning_rate": 9.999782395145752e-06, + "loss": 0.5093, + "step": 12600 + }, + { + "epoch": 1.0348761408083442, + "grad_norm": 9.323958396911621, + "learning_rate": 9.999629484640457e-06, + "loss": 0.528, + "step": 12700 + }, + { + "epoch": 1.0430247718383312, + "grad_norm": 10.739737510681152, + "learning_rate": 9.999436123852473e-06, + "loss": 0.5423, + "step": 12800 + }, + { + "epoch": 1.0511734028683182, + "grad_norm": 7.653073787689209, + "learning_rate": 9.99920231434621e-06, + "loss": 0.5215, + "step": 12900 + }, + { + "epoch": 1.0593220338983051, + "grad_norm": 6.83660888671875, + "learning_rate": 9.998928058013346e-06, + "loss": 0.5134, + "step": 13000 + }, + { + "epoch": 1.0674706649282921, + "grad_norm": 10.44430923461914, + "learning_rate": 9.99861335707279e-06, + "loss": 0.5466, + "step": 13100 + }, + { + "epoch": 1.075619295958279, + "grad_norm": 7.2710280418396, + "learning_rate": 9.998258214070683e-06, + "loss": 0.5364, + "step": 13200 + }, + { + "epoch": 1.083767926988266, + "grad_norm": 5.829804420471191, + "learning_rate": 9.997862631880362e-06, + "loss": 0.5146, + "step": 13300 + }, + { + "epoch": 1.0919165580182528, + "grad_norm": 8.52145767211914, + "learning_rate": 9.997426613702348e-06, + "loss": 0.5105, + "step": 13400 + }, + { + "epoch": 1.1000651890482398, + "grad_norm": 6.255794525146484, + "learning_rate": 9.996950163064313e-06, + "loss": 0.532, + "step": 13500 + }, + { + "epoch": 1.1082138200782268, + "grad_norm": 8.463394165039062, + "learning_rate": 9.996433283821057e-06, + "loss": 0.5265, + "step": 13600 + }, + { + "epoch": 1.1163624511082137, + "grad_norm": 9.939913749694824, + "learning_rate": 9.995875980154468e-06, + "loss": 0.5297, + "step": 13700 + }, + { + "epoch": 1.1245110821382007, + "grad_norm": 10.322543144226074, + "learning_rate": 9.995278256573504e-06, + "loss": 0.5413, + "step": 13800 + }, + { + "epoch": 1.1326597131681877, + "grad_norm": 7.6662445068359375, + "learning_rate": 9.994640117914139e-06, + "loss": 0.5197, + "step": 13900 + }, + { + "epoch": 1.1408083441981747, + "grad_norm": 12.555916786193848, + "learning_rate": 9.99396156933933e-06, + "loss": 0.5472, + "step": 14000 + }, + { + "epoch": 1.1489569752281616, + "grad_norm": 12.246332168579102, + "learning_rate": 9.993242616338983e-06, + "loss": 0.5296, + "step": 14100 + }, + { + "epoch": 1.1571056062581486, + "grad_norm": 11.406452178955078, + "learning_rate": 9.992483264729902e-06, + "loss": 0.5266, + "step": 14200 + }, + { + "epoch": 1.1652542372881356, + "grad_norm": 7.620953559875488, + "learning_rate": 9.991683520655735e-06, + "loss": 0.5267, + "step": 14300 + }, + { + "epoch": 1.1734028683181226, + "grad_norm": 7.820069789886475, + "learning_rate": 9.990843390586938e-06, + "loss": 0.5384, + "step": 14400 + }, + { + "epoch": 1.1815514993481095, + "grad_norm": 8.187140464782715, + "learning_rate": 9.989962881320714e-06, + "loss": 0.5071, + "step": 14500 + }, + { + "epoch": 1.1897001303780965, + "grad_norm": 8.322758674621582, + "learning_rate": 9.989041999980964e-06, + "loss": 0.5342, + "step": 14600 + }, + { + "epoch": 1.1978487614080835, + "grad_norm": 9.802703857421875, + "learning_rate": 9.988080754018218e-06, + "loss": 0.5205, + "step": 14700 + }, + { + "epoch": 1.2059973924380705, + "grad_norm": 9.249838829040527, + "learning_rate": 9.987079151209588e-06, + "loss": 0.5069, + "step": 14800 + }, + { + "epoch": 1.2141460234680574, + "grad_norm": 4.855494022369385, + "learning_rate": 9.986037199658698e-06, + "loss": 0.5107, + "step": 14900 + }, + { + "epoch": 1.2222946544980444, + "grad_norm": 9.250731468200684, + "learning_rate": 9.984954907795619e-06, + "loss": 0.5093, + "step": 15000 + }, + { + "epoch": 1.2304432855280312, + "grad_norm": 5.86234712600708, + "learning_rate": 9.983832284376804e-06, + "loss": 0.5539, + "step": 15100 + }, + { + "epoch": 1.2385919165580184, + "grad_norm": 13.074224472045898, + "learning_rate": 9.982669338485012e-06, + "loss": 0.5248, + "step": 15200 + }, + { + "epoch": 1.2467405475880051, + "grad_norm": 12.13022518157959, + "learning_rate": 9.981466079529236e-06, + "loss": 0.5415, + "step": 15300 + }, + { + "epoch": 1.254889178617992, + "grad_norm": 9.259481430053711, + "learning_rate": 9.980222517244633e-06, + "loss": 0.5224, + "step": 15400 + }, + { + "epoch": 1.263037809647979, + "grad_norm": 7.281178951263428, + "learning_rate": 9.978938661692439e-06, + "loss": 0.5363, + "step": 15500 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 12.429268836975098, + "learning_rate": 9.977614523259884e-06, + "loss": 0.5257, + "step": 15600 + }, + { + "epoch": 1.279335071707953, + "grad_norm": 8.357499122619629, + "learning_rate": 9.97625011266012e-06, + "loss": 0.5151, + "step": 15700 + }, + { + "epoch": 1.28748370273794, + "grad_norm": 7.741194725036621, + "learning_rate": 9.974845440932121e-06, + "loss": 0.4973, + "step": 15800 + }, + { + "epoch": 1.295632333767927, + "grad_norm": 12.34659481048584, + "learning_rate": 9.973400519440605e-06, + "loss": 0.5275, + "step": 15900 + }, + { + "epoch": 1.303780964797914, + "grad_norm": 7.972919940948486, + "learning_rate": 9.971915359875935e-06, + "loss": 0.5196, + "step": 16000 + }, + { + "epoch": 1.311929595827901, + "grad_norm": 6.398066520690918, + "learning_rate": 9.970389974254025e-06, + "loss": 0.5239, + "step": 16100 + }, + { + "epoch": 1.320078226857888, + "grad_norm": 9.441793441772461, + "learning_rate": 9.968824374916245e-06, + "loss": 0.5141, + "step": 16200 + }, + { + "epoch": 1.3282268578878749, + "grad_norm": 8.154695510864258, + "learning_rate": 9.967218574529323e-06, + "loss": 0.5179, + "step": 16300 + }, + { + "epoch": 1.3363754889178618, + "grad_norm": 9.219006538391113, + "learning_rate": 9.965572586085235e-06, + "loss": 0.4859, + "step": 16400 + }, + { + "epoch": 1.3445241199478488, + "grad_norm": 7.020698070526123, + "learning_rate": 9.96388642290111e-06, + "loss": 0.5128, + "step": 16500 + }, + { + "epoch": 1.3526727509778358, + "grad_norm": 7.134260654449463, + "learning_rate": 9.96216009861911e-06, + "loss": 0.5067, + "step": 16600 + }, + { + "epoch": 1.3608213820078228, + "grad_norm": 6.663614273071289, + "learning_rate": 9.96039362720634e-06, + "loss": 0.5352, + "step": 16700 + }, + { + "epoch": 1.3689700130378095, + "grad_norm": 7.817680358886719, + "learning_rate": 9.958587022954704e-06, + "loss": 0.5143, + "step": 16800 + }, + { + "epoch": 1.3771186440677967, + "grad_norm": 8.092264175415039, + "learning_rate": 9.956740300480818e-06, + "loss": 0.5111, + "step": 16900 + }, + { + "epoch": 1.3852672750977835, + "grad_norm": 7.305174350738525, + "learning_rate": 9.954853474725878e-06, + "loss": 0.5432, + "step": 17000 + }, + { + "epoch": 1.3934159061277707, + "grad_norm": 7.337920188903809, + "learning_rate": 9.952926560955547e-06, + "loss": 0.5279, + "step": 17100 + }, + { + "epoch": 1.4015645371577574, + "grad_norm": 8.824036598205566, + "learning_rate": 9.950959574759815e-06, + "loss": 0.5073, + "step": 17200 + }, + { + "epoch": 1.4097131681877444, + "grad_norm": 5.825498580932617, + "learning_rate": 9.948952532052895e-06, + "loss": 0.5208, + "step": 17300 + }, + { + "epoch": 1.4178617992177314, + "grad_norm": 6.746844291687012, + "learning_rate": 9.946905449073077e-06, + "loss": 0.5245, + "step": 17400 + }, + { + "epoch": 1.4260104302477183, + "grad_norm": 9.570401191711426, + "learning_rate": 9.944818342382607e-06, + "loss": 0.5056, + "step": 17500 + }, + { + "epoch": 1.4341590612777053, + "grad_norm": 8.143331527709961, + "learning_rate": 9.942691228867548e-06, + "loss": 0.5066, + "step": 17600 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 8.18307113647461, + "learning_rate": 9.940524125737641e-06, + "loss": 0.4933, + "step": 17700 + }, + { + "epoch": 1.4504563233376793, + "grad_norm": 9.306159019470215, + "learning_rate": 9.938317050526173e-06, + "loss": 0.5092, + "step": 17800 + }, + { + "epoch": 1.4586049543676662, + "grad_norm": 7.026943206787109, + "learning_rate": 9.936070021089834e-06, + "loss": 0.5071, + "step": 17900 + }, + { + "epoch": 1.4667535853976532, + "grad_norm": 8.45121955871582, + "learning_rate": 9.933783055608562e-06, + "loss": 0.5193, + "step": 18000 + }, + { + "epoch": 1.4749022164276402, + "grad_norm": 5.932709217071533, + "learning_rate": 9.93145617258541e-06, + "loss": 0.5311, + "step": 18100 + }, + { + "epoch": 1.4830508474576272, + "grad_norm": 8.077872276306152, + "learning_rate": 9.929089390846389e-06, + "loss": 0.4887, + "step": 18200 + }, + { + "epoch": 1.4911994784876141, + "grad_norm": 10.298677444458008, + "learning_rate": 9.926682729540313e-06, + "loss": 0.5006, + "step": 18300 + }, + { + "epoch": 1.4993481095176011, + "grad_norm": 7.896773815155029, + "learning_rate": 9.924236208138656e-06, + "loss": 0.4828, + "step": 18400 + }, + { + "epoch": 1.5074967405475879, + "grad_norm": 10.591178894042969, + "learning_rate": 9.921749846435375e-06, + "loss": 0.4936, + "step": 18500 + }, + { + "epoch": 1.515645371577575, + "grad_norm": 8.356033325195312, + "learning_rate": 9.919223664546774e-06, + "loss": 0.5271, + "step": 18600 + }, + { + "epoch": 1.5237940026075618, + "grad_norm": 9.826644897460938, + "learning_rate": 9.916657682911317e-06, + "loss": 0.5115, + "step": 18700 + }, + { + "epoch": 1.531942633637549, + "grad_norm": 7.742495536804199, + "learning_rate": 9.914051922289482e-06, + "loss": 0.5037, + "step": 18800 + }, + { + "epoch": 1.5400912646675358, + "grad_norm": 6.355010032653809, + "learning_rate": 9.91140640376358e-06, + "loss": 0.5047, + "step": 18900 + }, + { + "epoch": 1.548239895697523, + "grad_norm": 11.718524932861328, + "learning_rate": 9.908721148737591e-06, + "loss": 0.5074, + "step": 19000 + }, + { + "epoch": 1.5563885267275097, + "grad_norm": 6.173713207244873, + "learning_rate": 9.905996178936991e-06, + "loss": 0.5367, + "step": 19100 + }, + { + "epoch": 1.5645371577574967, + "grad_norm": 10.962457656860352, + "learning_rate": 9.903231516408576e-06, + "loss": 0.4991, + "step": 19200 + }, + { + "epoch": 1.5726857887874837, + "grad_norm": 6.949578285217285, + "learning_rate": 9.900427183520276e-06, + "loss": 0.4935, + "step": 19300 + }, + { + "epoch": 1.5808344198174706, + "grad_norm": 6.240306854248047, + "learning_rate": 9.897583202960985e-06, + "loss": 0.5136, + "step": 19400 + }, + { + "epoch": 1.5889830508474576, + "grad_norm": 6.609454154968262, + "learning_rate": 9.89469959774037e-06, + "loss": 0.4972, + "step": 19500 + }, + { + "epoch": 1.5971316818774446, + "grad_norm": 8.191039085388184, + "learning_rate": 9.891776391188694e-06, + "loss": 0.5202, + "step": 19600 + }, + { + "epoch": 1.6052803129074316, + "grad_norm": 7.624372959136963, + "learning_rate": 9.888813606956612e-06, + "loss": 0.515, + "step": 19700 + }, + { + "epoch": 1.6134289439374185, + "grad_norm": 8.45014476776123, + "learning_rate": 9.885811269014992e-06, + "loss": 0.517, + "step": 19800 + }, + { + "epoch": 1.6215775749674055, + "grad_norm": 6.690873146057129, + "learning_rate": 9.882769401654719e-06, + "loss": 0.5153, + "step": 19900 + }, + { + "epoch": 1.6297262059973925, + "grad_norm": 6.8720808029174805, + "learning_rate": 9.879688029486496e-06, + "loss": 0.5288, + "step": 20000 + }, + { + "epoch": 1.6378748370273795, + "grad_norm": 9.76561164855957, + "learning_rate": 9.876567177440645e-06, + "loss": 0.509, + "step": 20100 + }, + { + "epoch": 1.6460234680573662, + "grad_norm": 12.810523986816406, + "learning_rate": 9.873406870766906e-06, + "loss": 0.5144, + "step": 20200 + }, + { + "epoch": 1.6541720990873534, + "grad_norm": 6.44625997543335, + "learning_rate": 9.870207135034235e-06, + "loss": 0.5237, + "step": 20300 + }, + { + "epoch": 1.6623207301173402, + "grad_norm": 9.6302490234375, + "learning_rate": 9.86696799613059e-06, + "loss": 0.5094, + "step": 20400 + }, + { + "epoch": 1.6704693611473274, + "grad_norm": 10.308381080627441, + "learning_rate": 9.863689480262734e-06, + "loss": 0.498, + "step": 20500 + }, + { + "epoch": 1.6786179921773141, + "grad_norm": 11.594625473022461, + "learning_rate": 9.860371613956008e-06, + "loss": 0.5224, + "step": 20600 + }, + { + "epoch": 1.6867666232073013, + "grad_norm": 7.823093414306641, + "learning_rate": 9.85701442405413e-06, + "loss": 0.515, + "step": 20700 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 6.978199481964111, + "learning_rate": 9.853617937718966e-06, + "loss": 0.5103, + "step": 20800 + }, + { + "epoch": 1.7030638852672753, + "grad_norm": 9.50684928894043, + "learning_rate": 9.850182182430322e-06, + "loss": 0.4876, + "step": 20900 + }, + { + "epoch": 1.711212516297262, + "grad_norm": 9.167742729187012, + "learning_rate": 9.84670718598571e-06, + "loss": 0.521, + "step": 21000 + }, + { + "epoch": 1.719361147327249, + "grad_norm": 9.103960990905762, + "learning_rate": 9.843192976500131e-06, + "loss": 0.4987, + "step": 21100 + }, + { + "epoch": 1.727509778357236, + "grad_norm": 7.777735233306885, + "learning_rate": 9.83963958240585e-06, + "loss": 0.4838, + "step": 21200 + }, + { + "epoch": 1.735658409387223, + "grad_norm": 3.7518503665924072, + "learning_rate": 9.83604703245215e-06, + "loss": 0.5019, + "step": 21300 + }, + { + "epoch": 1.74380704041721, + "grad_norm": 8.239873886108398, + "learning_rate": 9.832415355705118e-06, + "loss": 0.5119, + "step": 21400 + }, + { + "epoch": 1.7519556714471969, + "grad_norm": 7.265876293182373, + "learning_rate": 9.828744581547407e-06, + "loss": 0.4681, + "step": 21500 + }, + { + "epoch": 1.7601043024771839, + "grad_norm": 9.064807891845703, + "learning_rate": 9.825034739677984e-06, + "loss": 0.4737, + "step": 21600 + }, + { + "epoch": 1.7682529335071708, + "grad_norm": 6.92955207824707, + "learning_rate": 9.821285860111903e-06, + "loss": 0.4968, + "step": 21700 + }, + { + "epoch": 1.7764015645371578, + "grad_norm": 10.282632827758789, + "learning_rate": 9.817497973180062e-06, + "loss": 0.4986, + "step": 21800 + }, + { + "epoch": 1.7845501955671446, + "grad_norm": 5.6930084228515625, + "learning_rate": 9.813671109528949e-06, + "loss": 0.5135, + "step": 21900 + }, + { + "epoch": 1.7926988265971318, + "grad_norm": 6.911000728607178, + "learning_rate": 9.809805300120403e-06, + "loss": 0.5046, + "step": 22000 + }, + { + "epoch": 1.8008474576271185, + "grad_norm": 6.411030292510986, + "learning_rate": 9.805900576231358e-06, + "loss": 0.4926, + "step": 22100 + }, + { + "epoch": 1.8089960886571057, + "grad_norm": 6.620294570922852, + "learning_rate": 9.801956969453592e-06, + "loss": 0.4788, + "step": 22200 + }, + { + "epoch": 1.8171447196870925, + "grad_norm": 6.77543830871582, + "learning_rate": 9.797974511693471e-06, + "loss": 0.4896, + "step": 22300 + }, + { + "epoch": 1.8252933507170797, + "grad_norm": 7.471630573272705, + "learning_rate": 9.793953235171694e-06, + "loss": 0.4979, + "step": 22400 + }, + { + "epoch": 1.8334419817470664, + "grad_norm": 6.550878524780273, + "learning_rate": 9.789893172423021e-06, + "loss": 0.5081, + "step": 22500 + }, + { + "epoch": 1.8415906127770536, + "grad_norm": 9.887825965881348, + "learning_rate": 9.78579435629603e-06, + "loss": 0.5089, + "step": 22600 + }, + { + "epoch": 1.8497392438070404, + "grad_norm": 7.088003158569336, + "learning_rate": 9.781656819952826e-06, + "loss": 0.4811, + "step": 22700 + }, + { + "epoch": 1.8578878748370273, + "grad_norm": 6.524052619934082, + "learning_rate": 9.777480596868796e-06, + "loss": 0.5018, + "step": 22800 + }, + { + "epoch": 1.8660365058670143, + "grad_norm": 7.965360164642334, + "learning_rate": 9.773265720832324e-06, + "loss": 0.5144, + "step": 22900 + }, + { + "epoch": 1.8741851368970013, + "grad_norm": 7.510045051574707, + "learning_rate": 9.769012225944521e-06, + "loss": 0.5002, + "step": 23000 + }, + { + "epoch": 1.8823337679269883, + "grad_norm": 11.717968940734863, + "learning_rate": 9.764720146618955e-06, + "loss": 0.5003, + "step": 23100 + }, + { + "epoch": 1.8904823989569752, + "grad_norm": 5.974288463592529, + "learning_rate": 9.760389517581362e-06, + "loss": 0.4912, + "step": 23200 + }, + { + "epoch": 1.8986310299869622, + "grad_norm": 5.159633159637451, + "learning_rate": 9.75602037386937e-06, + "loss": 0.4861, + "step": 23300 + }, + { + "epoch": 1.9067796610169492, + "grad_norm": 6.651115417480469, + "learning_rate": 9.75161275083222e-06, + "loss": 0.5153, + "step": 23400 + }, + { + "epoch": 1.9149282920469362, + "grad_norm": 7.513479709625244, + "learning_rate": 9.747166684130474e-06, + "loss": 0.4931, + "step": 23500 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 8.77505874633789, + "learning_rate": 9.742682209735727e-06, + "loss": 0.501, + "step": 23600 + }, + { + "epoch": 1.93122555410691, + "grad_norm": 6.932135581970215, + "learning_rate": 9.738159363930324e-06, + "loss": 0.52, + "step": 23700 + }, + { + "epoch": 1.9393741851368969, + "grad_norm": 10.359477996826172, + "learning_rate": 9.73359818330705e-06, + "loss": 0.4877, + "step": 23800 + }, + { + "epoch": 1.947522816166884, + "grad_norm": 8.781031608581543, + "learning_rate": 9.72899870476885e-06, + "loss": 0.4891, + "step": 23900 + }, + { + "epoch": 1.9556714471968708, + "grad_norm": 8.263874053955078, + "learning_rate": 9.724360965528523e-06, + "loss": 0.5061, + "step": 24000 + }, + { + "epoch": 1.963820078226858, + "grad_norm": 7.766465663909912, + "learning_rate": 9.719685003108423e-06, + "loss": 0.4902, + "step": 24100 + }, + { + "epoch": 1.9719687092568448, + "grad_norm": 4.978456974029541, + "learning_rate": 9.714970855340152e-06, + "loss": 0.4873, + "step": 24200 + }, + { + "epoch": 1.980117340286832, + "grad_norm": 7.918380260467529, + "learning_rate": 9.71021856036426e-06, + "loss": 0.4941, + "step": 24300 + }, + { + "epoch": 1.9882659713168187, + "grad_norm": 8.015583038330078, + "learning_rate": 9.705428156629933e-06, + "loss": 0.4833, + "step": 24400 + }, + { + "epoch": 1.996414602346806, + "grad_norm": 7.768013954162598, + "learning_rate": 9.700599682894675e-06, + "loss": 0.4932, + "step": 24500 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.8008032128514057, + "eval_loss": 0.5220404267311096, + "eval_runtime": 6.9482, + "eval_samples_per_second": 358.366, + "eval_steps_per_second": 44.904, + "step": 24544 + }, + { + "epoch": 2.0045632333767927, + "grad_norm": 7.617489337921143, + "learning_rate": 9.695733178224009e-06, + "loss": 0.4491, + "step": 24600 + }, + { + "epoch": 2.01271186440678, + "grad_norm": 8.741541862487793, + "learning_rate": 9.690828681991153e-06, + "loss": 0.4068, + "step": 24700 + }, + { + "epoch": 2.0208604954367666, + "grad_norm": 11.999881744384766, + "learning_rate": 9.685886233876696e-06, + "loss": 0.4138, + "step": 24800 + }, + { + "epoch": 2.029009126466754, + "grad_norm": 9.766683578491211, + "learning_rate": 9.680905873868287e-06, + "loss": 0.3986, + "step": 24900 + }, + { + "epoch": 2.0371577574967406, + "grad_norm": 6.533343315124512, + "learning_rate": 9.675887642260306e-06, + "loss": 0.4024, + "step": 25000 + }, + { + "epoch": 2.0453063885267273, + "grad_norm": 9.137768745422363, + "learning_rate": 9.670831579653539e-06, + "loss": 0.4436, + "step": 25100 + }, + { + "epoch": 2.0534550195567145, + "grad_norm": 9.635496139526367, + "learning_rate": 9.665737726954852e-06, + "loss": 0.4019, + "step": 25200 + }, + { + "epoch": 2.0616036505867013, + "grad_norm": 7.93952751159668, + "learning_rate": 9.66060612537685e-06, + "loss": 0.4221, + "step": 25300 + }, + { + "epoch": 2.0697522816166884, + "grad_norm": 9.508652687072754, + "learning_rate": 9.65543681643756e-06, + "loss": 0.4221, + "step": 25400 + }, + { + "epoch": 2.077900912646675, + "grad_norm": 9.136526107788086, + "learning_rate": 9.650229841960084e-06, + "loss": 0.4239, + "step": 25500 + }, + { + "epoch": 2.0860495436766624, + "grad_norm": 11.71844482421875, + "learning_rate": 9.644985244072258e-06, + "loss": 0.4047, + "step": 25600 + }, + { + "epoch": 2.094198174706649, + "grad_norm": 4.190426826477051, + "learning_rate": 9.639703065206323e-06, + "loss": 0.4209, + "step": 25700 + }, + { + "epoch": 2.1023468057366363, + "grad_norm": 11.736051559448242, + "learning_rate": 9.63438334809857e-06, + "loss": 0.4086, + "step": 25800 + }, + { + "epoch": 2.110495436766623, + "grad_norm": 7.024579048156738, + "learning_rate": 9.629026135789002e-06, + "loss": 0.4346, + "step": 25900 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 10.942073822021484, + "learning_rate": 9.62363147162098e-06, + "loss": 0.4242, + "step": 26000 + }, + { + "epoch": 2.126792698826597, + "grad_norm": 12.155450820922852, + "learning_rate": 9.618199399240876e-06, + "loss": 0.4706, + "step": 26100 + }, + { + "epoch": 2.1349413298565842, + "grad_norm": 6.733283519744873, + "learning_rate": 9.612729962597721e-06, + "loss": 0.4406, + "step": 26200 + }, + { + "epoch": 2.143089960886571, + "grad_norm": 7.309271335601807, + "learning_rate": 9.607223205942845e-06, + "loss": 0.4169, + "step": 26300 + }, + { + "epoch": 2.151238591916558, + "grad_norm": 7.154285907745361, + "learning_rate": 9.601679173829522e-06, + "loss": 0.4406, + "step": 26400 + }, + { + "epoch": 2.159387222946545, + "grad_norm": 8.043559074401855, + "learning_rate": 9.596097911112609e-06, + "loss": 0.4264, + "step": 26500 + }, + { + "epoch": 2.167535853976532, + "grad_norm": 9.203978538513184, + "learning_rate": 9.590479462948185e-06, + "loss": 0.4173, + "step": 26600 + }, + { + "epoch": 2.175684485006519, + "grad_norm": 7.716718673706055, + "learning_rate": 9.58482387479318e-06, + "loss": 0.412, + "step": 26700 + }, + { + "epoch": 2.1838331160365057, + "grad_norm": 10.910019874572754, + "learning_rate": 9.57913119240501e-06, + "loss": 0.3844, + "step": 26800 + }, + { + "epoch": 2.191981747066493, + "grad_norm": 7.980166435241699, + "learning_rate": 9.573401461841218e-06, + "loss": 0.4441, + "step": 26900 + }, + { + "epoch": 2.2001303780964796, + "grad_norm": 7.328435897827148, + "learning_rate": 9.567634729459076e-06, + "loss": 0.4118, + "step": 27000 + }, + { + "epoch": 2.208279009126467, + "grad_norm": 7.026157379150391, + "learning_rate": 9.561831041915238e-06, + "loss": 0.4258, + "step": 27100 + }, + { + "epoch": 2.2164276401564535, + "grad_norm": 10.100348472595215, + "learning_rate": 9.555990446165339e-06, + "loss": 0.4368, + "step": 27200 + }, + { + "epoch": 2.2245762711864407, + "grad_norm": 11.21714973449707, + "learning_rate": 9.550112989463633e-06, + "loss": 0.4253, + "step": 27300 + }, + { + "epoch": 2.2327249022164275, + "grad_norm": 7.6962127685546875, + "learning_rate": 9.5441987193626e-06, + "loss": 0.4273, + "step": 27400 + }, + { + "epoch": 2.2408735332464147, + "grad_norm": 13.219654083251953, + "learning_rate": 9.538247683712567e-06, + "loss": 0.4369, + "step": 27500 + }, + { + "epoch": 2.2490221642764014, + "grad_norm": 5.536248683929443, + "learning_rate": 9.532259930661315e-06, + "loss": 0.4493, + "step": 27600 + }, + { + "epoch": 2.2571707953063886, + "grad_norm": 7.112065315246582, + "learning_rate": 9.526235508653694e-06, + "loss": 0.4325, + "step": 27700 + }, + { + "epoch": 2.2653194263363754, + "grad_norm": 6.064886093139648, + "learning_rate": 9.520174466431235e-06, + "loss": 0.4353, + "step": 27800 + }, + { + "epoch": 2.2734680573663626, + "grad_norm": 7.9532318115234375, + "learning_rate": 9.51407685303174e-06, + "loss": 0.4358, + "step": 27900 + }, + { + "epoch": 2.2816166883963493, + "grad_norm": 8.64626693725586, + "learning_rate": 9.507942717788907e-06, + "loss": 0.4489, + "step": 28000 + }, + { + "epoch": 2.2897653194263365, + "grad_norm": 9.648942947387695, + "learning_rate": 9.50177211033191e-06, + "loss": 0.4498, + "step": 28100 + }, + { + "epoch": 2.2979139504563233, + "grad_norm": 7.498199939727783, + "learning_rate": 9.495565080585017e-06, + "loss": 0.4086, + "step": 28200 + }, + { + "epoch": 2.3060625814863105, + "grad_norm": 8.632119178771973, + "learning_rate": 9.489321678767167e-06, + "loss": 0.4207, + "step": 28300 + }, + { + "epoch": 2.3142112125162972, + "grad_norm": 8.807448387145996, + "learning_rate": 9.48304195539158e-06, + "loss": 0.428, + "step": 28400 + }, + { + "epoch": 2.322359843546284, + "grad_norm": 7.809271812438965, + "learning_rate": 9.476725961265332e-06, + "loss": 0.4546, + "step": 28500 + }, + { + "epoch": 2.330508474576271, + "grad_norm": 8.758193969726562, + "learning_rate": 9.470373747488966e-06, + "loss": 0.432, + "step": 28600 + }, + { + "epoch": 2.3386571056062584, + "grad_norm": 8.046852111816406, + "learning_rate": 9.463985365456057e-06, + "loss": 0.4169, + "step": 28700 + }, + { + "epoch": 2.346805736636245, + "grad_norm": 12.665115356445312, + "learning_rate": 9.457560866852805e-06, + "loss": 0.4242, + "step": 28800 + }, + { + "epoch": 2.354954367666232, + "grad_norm": 10.333826065063477, + "learning_rate": 9.45110030365762e-06, + "loss": 0.4603, + "step": 28900 + }, + { + "epoch": 2.363102998696219, + "grad_norm": 8.857953071594238, + "learning_rate": 9.444603728140698e-06, + "loss": 0.454, + "step": 29000 + }, + { + "epoch": 2.371251629726206, + "grad_norm": 9.135393142700195, + "learning_rate": 9.438071192863596e-06, + "loss": 0.4574, + "step": 29100 + }, + { + "epoch": 2.379400260756193, + "grad_norm": 6.3214921951293945, + "learning_rate": 9.43150275067881e-06, + "loss": 0.4299, + "step": 29200 + }, + { + "epoch": 2.38754889178618, + "grad_norm": 7.322382926940918, + "learning_rate": 9.42489845472935e-06, + "loss": 0.4265, + "step": 29300 + }, + { + "epoch": 2.395697522816167, + "grad_norm": 11.1491060256958, + "learning_rate": 9.418258358448298e-06, + "loss": 0.4233, + "step": 29400 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 7.171163082122803, + "learning_rate": 9.411582515558391e-06, + "loss": 0.4271, + "step": 29500 + }, + { + "epoch": 2.411994784876141, + "grad_norm": 5.758033275604248, + "learning_rate": 9.404870980071579e-06, + "loss": 0.4463, + "step": 29600 + }, + { + "epoch": 2.4201434159061277, + "grad_norm": 11.001411437988281, + "learning_rate": 9.398123806288588e-06, + "loss": 0.42, + "step": 29700 + }, + { + "epoch": 2.428292046936115, + "grad_norm": 6.28535795211792, + "learning_rate": 9.39134104879848e-06, + "loss": 0.4188, + "step": 29800 + }, + { + "epoch": 2.4364406779661016, + "grad_norm": 6.2432861328125, + "learning_rate": 9.38452276247821e-06, + "loss": 0.4242, + "step": 29900 + }, + { + "epoch": 2.444589308996089, + "grad_norm": 9.474976539611816, + "learning_rate": 9.377669002492193e-06, + "loss": 0.43, + "step": 30000 + }, + { + "epoch": 2.4527379400260756, + "grad_norm": 7.984436988830566, + "learning_rate": 9.37077982429184e-06, + "loss": 0.4328, + "step": 30100 + }, + { + "epoch": 2.4608865710560623, + "grad_norm": 8.237207412719727, + "learning_rate": 9.363855283615124e-06, + "loss": 0.4166, + "step": 30200 + }, + { + "epoch": 2.4690352020860495, + "grad_norm": 7.6592936515808105, + "learning_rate": 9.356895436486122e-06, + "loss": 0.4253, + "step": 30300 + }, + { + "epoch": 2.4771838331160367, + "grad_norm": 5.206706523895264, + "learning_rate": 9.349900339214564e-06, + "loss": 0.4414, + "step": 30400 + }, + { + "epoch": 2.4853324641460235, + "grad_norm": 10.161866188049316, + "learning_rate": 9.342870048395376e-06, + "loss": 0.415, + "step": 30500 + }, + { + "epoch": 2.4934810951760102, + "grad_norm": 4.225031852722168, + "learning_rate": 9.335804620908222e-06, + "loss": 0.4243, + "step": 30600 + }, + { + "epoch": 2.5016297262059974, + "grad_norm": 7.489659786224365, + "learning_rate": 9.328704113917046e-06, + "loss": 0.4417, + "step": 30700 + }, + { + "epoch": 2.509778357235984, + "grad_norm": 8.180109977722168, + "learning_rate": 9.32156858486961e-06, + "loss": 0.4217, + "step": 30800 + }, + { + "epoch": 2.5179269882659714, + "grad_norm": 9.16032886505127, + "learning_rate": 9.314398091497024e-06, + "loss": 0.4297, + "step": 30900 + }, + { + "epoch": 2.526075619295958, + "grad_norm": 8.16234302520752, + "learning_rate": 9.307192691813285e-06, + "loss": 0.4319, + "step": 31000 + }, + { + "epoch": 2.5342242503259453, + "grad_norm": 10.111699104309082, + "learning_rate": 9.299952444114802e-06, + "loss": 0.4186, + "step": 31100 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 6.305666923522949, + "learning_rate": 9.29267740697993e-06, + "loss": 0.4382, + "step": 31200 + }, + { + "epoch": 2.5505215123859193, + "grad_norm": 9.985565185546875, + "learning_rate": 9.285367639268492e-06, + "loss": 0.4272, + "step": 31300 + }, + { + "epoch": 2.558670143415906, + "grad_norm": 10.670126914978027, + "learning_rate": 9.278023200121305e-06, + "loss": 0.4228, + "step": 31400 + }, + { + "epoch": 2.5668187744458932, + "grad_norm": 7.42661714553833, + "learning_rate": 9.2706441489597e-06, + "loss": 0.4314, + "step": 31500 + }, + { + "epoch": 2.57496740547588, + "grad_norm": 6.457535266876221, + "learning_rate": 9.263230545485044e-06, + "loss": 0.4401, + "step": 31600 + }, + { + "epoch": 2.583116036505867, + "grad_norm": 11.822875022888184, + "learning_rate": 9.25578244967825e-06, + "loss": 0.3865, + "step": 31700 + }, + { + "epoch": 2.591264667535854, + "grad_norm": 12.4473295211792, + "learning_rate": 9.2482999217993e-06, + "loss": 0.4272, + "step": 31800 + }, + { + "epoch": 2.5994132985658407, + "grad_norm": 5.283376693725586, + "learning_rate": 9.240783022386757e-06, + "loss": 0.4084, + "step": 31900 + }, + { + "epoch": 2.607561929595828, + "grad_norm": 8.190621376037598, + "learning_rate": 9.233231812257266e-06, + "loss": 0.4257, + "step": 32000 + }, + { + "epoch": 2.615710560625815, + "grad_norm": 6.570192813873291, + "learning_rate": 9.225646352505071e-06, + "loss": 0.4464, + "step": 32100 + }, + { + "epoch": 2.623859191655802, + "grad_norm": 10.470175743103027, + "learning_rate": 9.218026704501519e-06, + "loss": 0.4245, + "step": 32200 + }, + { + "epoch": 2.6320078226857886, + "grad_norm": 7.662964820861816, + "learning_rate": 9.210372929894561e-06, + "loss": 0.4265, + "step": 32300 + }, + { + "epoch": 2.640156453715776, + "grad_norm": 7.74278450012207, + "learning_rate": 9.202685090608256e-06, + "loss": 0.4293, + "step": 32400 + }, + { + "epoch": 2.648305084745763, + "grad_norm": 6.661880970001221, + "learning_rate": 9.194963248842266e-06, + "loss": 0.4592, + "step": 32500 + }, + { + "epoch": 2.6564537157757497, + "grad_norm": 8.020112991333008, + "learning_rate": 9.18720746707136e-06, + "loss": 0.4229, + "step": 32600 + }, + { + "epoch": 2.6646023468057365, + "grad_norm": 5.921052932739258, + "learning_rate": 9.179417808044897e-06, + "loss": 0.4141, + "step": 32700 + }, + { + "epoch": 2.6727509778357237, + "grad_norm": 10.444842338562012, + "learning_rate": 9.17159433478633e-06, + "loss": 0.4437, + "step": 32800 + }, + { + "epoch": 2.6808996088657104, + "grad_norm": 7.524814605712891, + "learning_rate": 9.163737110592697e-06, + "loss": 0.4128, + "step": 32900 + }, + { + "epoch": 2.6890482398956976, + "grad_norm": 10.936373710632324, + "learning_rate": 9.155846199034086e-06, + "loss": 0.4273, + "step": 33000 + }, + { + "epoch": 2.6971968709256844, + "grad_norm": 7.02941370010376, + "learning_rate": 9.147921663953157e-06, + "loss": 0.4433, + "step": 33100 + }, + { + "epoch": 2.7053455019556716, + "grad_norm": 10.595579147338867, + "learning_rate": 9.139963569464593e-06, + "loss": 0.4264, + "step": 33200 + }, + { + "epoch": 2.7134941329856583, + "grad_norm": 5.312283992767334, + "learning_rate": 9.131971979954603e-06, + "loss": 0.4149, + "step": 33300 + }, + { + "epoch": 2.7216427640156455, + "grad_norm": 7.464469909667969, + "learning_rate": 9.123946960080387e-06, + "loss": 0.4368, + "step": 33400 + }, + { + "epoch": 2.7297913950456323, + "grad_norm": 7.507636547088623, + "learning_rate": 9.115888574769623e-06, + "loss": 0.4344, + "step": 33500 + }, + { + "epoch": 2.737940026075619, + "grad_norm": 7.984206676483154, + "learning_rate": 9.107796889219933e-06, + "loss": 0.4165, + "step": 33600 + }, + { + "epoch": 2.7460886571056062, + "grad_norm": 9.600481986999512, + "learning_rate": 9.099671968898362e-06, + "loss": 0.4212, + "step": 33700 + }, + { + "epoch": 2.7542372881355934, + "grad_norm": 6.417558670043945, + "learning_rate": 9.091513879540845e-06, + "loss": 0.41, + "step": 33800 + }, + { + "epoch": 2.76238591916558, + "grad_norm": 7.52598762512207, + "learning_rate": 9.08332268715168e-06, + "loss": 0.4443, + "step": 33900 + }, + { + "epoch": 2.770534550195567, + "grad_norm": 8.766283988952637, + "learning_rate": 9.075098458002988e-06, + "loss": 0.4552, + "step": 34000 + }, + { + "epoch": 2.778683181225554, + "grad_norm": 7.127804756164551, + "learning_rate": 9.066841258634177e-06, + "loss": 0.426, + "step": 34100 + }, + { + "epoch": 2.7868318122555413, + "grad_norm": 8.190874099731445, + "learning_rate": 9.058551155851405e-06, + "loss": 0.4374, + "step": 34200 + }, + { + "epoch": 2.794980443285528, + "grad_norm": 7.887624740600586, + "learning_rate": 9.050228216727046e-06, + "loss": 0.437, + "step": 34300 + }, + { + "epoch": 2.803129074315515, + "grad_norm": 10.439249038696289, + "learning_rate": 9.041872508599136e-06, + "loss": 0.4165, + "step": 34400 + }, + { + "epoch": 2.811277705345502, + "grad_norm": 9.891864776611328, + "learning_rate": 9.033484099070839e-06, + "loss": 0.4336, + "step": 34500 + }, + { + "epoch": 2.819426336375489, + "grad_norm": 10.03987979888916, + "learning_rate": 9.025063056009886e-06, + "loss": 0.4365, + "step": 34600 + }, + { + "epoch": 2.827574967405476, + "grad_norm": 6.188653469085693, + "learning_rate": 9.016609447548046e-06, + "loss": 0.41, + "step": 34700 + }, + { + "epoch": 2.8357235984354627, + "grad_norm": 11.486917495727539, + "learning_rate": 9.008123342080553e-06, + "loss": 0.4343, + "step": 34800 + }, + { + "epoch": 2.84387222946545, + "grad_norm": 9.972556114196777, + "learning_rate": 8.99960480826557e-06, + "loss": 0.4282, + "step": 34900 + }, + { + "epoch": 2.8520208604954367, + "grad_norm": 7.771157741546631, + "learning_rate": 8.991053915023625e-06, + "loss": 0.4086, + "step": 35000 + }, + { + "epoch": 2.860169491525424, + "grad_norm": 5.989213943481445, + "learning_rate": 8.982470731537054e-06, + "loss": 0.4647, + "step": 35100 + }, + { + "epoch": 2.8683181225554106, + "grad_norm": 7.19948148727417, + "learning_rate": 8.973855327249442e-06, + "loss": 0.4086, + "step": 35200 + }, + { + "epoch": 2.8764667535853974, + "grad_norm": 7.22706937789917, + "learning_rate": 8.965207771865061e-06, + "loss": 0.4225, + "step": 35300 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 11.344962120056152, + "learning_rate": 8.95652813534831e-06, + "loss": 0.4275, + "step": 35400 + }, + { + "epoch": 2.8927640156453718, + "grad_norm": 10.637499809265137, + "learning_rate": 8.947816487923143e-06, + "loss": 0.4347, + "step": 35500 + }, + { + "epoch": 2.9009126466753585, + "grad_norm": 7.946286678314209, + "learning_rate": 8.939072900072501e-06, + "loss": 0.4218, + "step": 35600 + }, + { + "epoch": 2.9090612777053453, + "grad_norm": 6.058999061584473, + "learning_rate": 8.930297442537747e-06, + "loss": 0.4212, + "step": 35700 + }, + { + "epoch": 2.9172099087353325, + "grad_norm": 10.35421371459961, + "learning_rate": 8.921490186318092e-06, + "loss": 0.4028, + "step": 35800 + }, + { + "epoch": 2.9253585397653197, + "grad_norm": 8.85345458984375, + "learning_rate": 8.912651202670013e-06, + "loss": 0.4455, + "step": 35900 + }, + { + "epoch": 2.9335071707953064, + "grad_norm": 7.476600646972656, + "learning_rate": 8.90378056310669e-06, + "loss": 0.4212, + "step": 36000 + }, + { + "epoch": 2.941655801825293, + "grad_norm": 8.27695369720459, + "learning_rate": 8.894878339397416e-06, + "loss": 0.4186, + "step": 36100 + }, + { + "epoch": 2.9498044328552804, + "grad_norm": 8.344620704650879, + "learning_rate": 8.885944603567023e-06, + "loss": 0.4242, + "step": 36200 + }, + { + "epoch": 2.957953063885267, + "grad_norm": 8.976387023925781, + "learning_rate": 8.876979427895291e-06, + "loss": 0.4359, + "step": 36300 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 10.581543922424316, + "learning_rate": 8.867982884916377e-06, + "loss": 0.4171, + "step": 36400 + }, + { + "epoch": 2.974250325945241, + "grad_norm": 6.423446178436279, + "learning_rate": 8.858955047418217e-06, + "loss": 0.4248, + "step": 36500 + }, + { + "epoch": 2.9823989569752283, + "grad_norm": 6.647116184234619, + "learning_rate": 8.849895988441933e-06, + "loss": 0.4272, + "step": 36600 + }, + { + "epoch": 2.990547588005215, + "grad_norm": 11.199699401855469, + "learning_rate": 8.840805781281261e-06, + "loss": 0.4336, + "step": 36700 + }, + { + "epoch": 2.9986962190352022, + "grad_norm": 6.946083068847656, + "learning_rate": 8.831684499481941e-06, + "loss": 0.4278, + "step": 36800 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.8088353413654619, + "eval_loss": 0.5133101940155029, + "eval_runtime": 6.8742, + "eval_samples_per_second": 362.222, + "eval_steps_per_second": 45.387, + "step": 36816 + }, + { + "epoch": 3.006844850065189, + "grad_norm": 8.117693901062012, + "learning_rate": 8.822532216841124e-06, + "loss": 0.3563, + "step": 36900 + }, + { + "epoch": 3.014993481095176, + "grad_norm": 8.939483642578125, + "learning_rate": 8.813349007406785e-06, + "loss": 0.3693, + "step": 37000 + }, + { + "epoch": 3.023142112125163, + "grad_norm": 5.619213104248047, + "learning_rate": 8.80413494547711e-06, + "loss": 0.359, + "step": 37100 + }, + { + "epoch": 3.03129074315515, + "grad_norm": 7.458463191986084, + "learning_rate": 8.794890105599905e-06, + "loss": 0.3631, + "step": 37200 + }, + { + "epoch": 3.039439374185137, + "grad_norm": 8.206454277038574, + "learning_rate": 8.785614562571991e-06, + "loss": 0.3513, + "step": 37300 + }, + { + "epoch": 3.047588005215124, + "grad_norm": 8.663100242614746, + "learning_rate": 8.776308391438597e-06, + "loss": 0.3348, + "step": 37400 + }, + { + "epoch": 3.055736636245111, + "grad_norm": 8.638208389282227, + "learning_rate": 8.766971667492754e-06, + "loss": 0.3618, + "step": 37500 + }, + { + "epoch": 3.0638852672750976, + "grad_norm": 8.416321754455566, + "learning_rate": 8.757604466274683e-06, + "loss": 0.3671, + "step": 37600 + }, + { + "epoch": 3.0720338983050848, + "grad_norm": 10.002084732055664, + "learning_rate": 8.748206863571188e-06, + "loss": 0.3462, + "step": 37700 + }, + { + "epoch": 3.0801825293350715, + "grad_norm": 8.242202758789062, + "learning_rate": 8.73877893541504e-06, + "loss": 0.3524, + "step": 37800 + }, + { + "epoch": 3.0883311603650587, + "grad_norm": 9.762850761413574, + "learning_rate": 8.729320758084363e-06, + "loss": 0.3844, + "step": 37900 + }, + { + "epoch": 3.0964797913950455, + "grad_norm": 13.008197784423828, + "learning_rate": 8.719832408102017e-06, + "loss": 0.3489, + "step": 38000 + }, + { + "epoch": 3.1046284224250327, + "grad_norm": 9.61468505859375, + "learning_rate": 8.71031396223498e-06, + "loss": 0.3386, + "step": 38100 + }, + { + "epoch": 3.1127770534550194, + "grad_norm": 9.158555030822754, + "learning_rate": 8.700765497493723e-06, + "loss": 0.3542, + "step": 38200 + }, + { + "epoch": 3.1209256844850066, + "grad_norm": 11.94726276397705, + "learning_rate": 8.69118709113159e-06, + "loss": 0.3591, + "step": 38300 + }, + { + "epoch": 3.1290743155149934, + "grad_norm": 9.813300132751465, + "learning_rate": 8.681578820644173e-06, + "loss": 0.3625, + "step": 38400 + }, + { + "epoch": 3.1372229465449806, + "grad_norm": 8.50658130645752, + "learning_rate": 8.671940763768682e-06, + "loss": 0.3789, + "step": 38500 + }, + { + "epoch": 3.1453715775749673, + "grad_norm": 6.037990570068359, + "learning_rate": 8.662272998483323e-06, + "loss": 0.3635, + "step": 38600 + }, + { + "epoch": 3.1535202086049545, + "grad_norm": 11.817001342773438, + "learning_rate": 8.65257560300666e-06, + "loss": 0.3526, + "step": 38700 + }, + { + "epoch": 3.1616688396349413, + "grad_norm": 4.690389156341553, + "learning_rate": 8.642848655796985e-06, + "loss": 0.3634, + "step": 38800 + }, + { + "epoch": 3.1698174706649285, + "grad_norm": 12.257222175598145, + "learning_rate": 8.633092235551679e-06, + "loss": 0.3626, + "step": 38900 + }, + { + "epoch": 3.1779661016949152, + "grad_norm": 7.710871696472168, + "learning_rate": 8.623306421206588e-06, + "loss": 0.3571, + "step": 39000 + }, + { + "epoch": 3.1861147327249024, + "grad_norm": 6.811945915222168, + "learning_rate": 8.613491291935365e-06, + "loss": 0.351, + "step": 39100 + }, + { + "epoch": 3.194263363754889, + "grad_norm": 19.7229061126709, + "learning_rate": 8.60364692714885e-06, + "loss": 0.3348, + "step": 39200 + }, + { + "epoch": 3.2024119947848764, + "grad_norm": 9.32421875, + "learning_rate": 8.59377340649441e-06, + "loss": 0.3437, + "step": 39300 + }, + { + "epoch": 3.210560625814863, + "grad_norm": 9.309675216674805, + "learning_rate": 8.583870809855306e-06, + "loss": 0.3687, + "step": 39400 + }, + { + "epoch": 3.21870925684485, + "grad_norm": 5.458558559417725, + "learning_rate": 8.573939217350043e-06, + "loss": 0.3584, + "step": 39500 + }, + { + "epoch": 3.226857887874837, + "grad_norm": 8.717120170593262, + "learning_rate": 8.563978709331717e-06, + "loss": 0.3473, + "step": 39600 + }, + { + "epoch": 3.235006518904824, + "grad_norm": 6.542947769165039, + "learning_rate": 8.553989366387376e-06, + "loss": 0.3806, + "step": 39700 + }, + { + "epoch": 3.243155149934811, + "grad_norm": 11.504007339477539, + "learning_rate": 8.543971269337355e-06, + "loss": 0.3606, + "step": 39800 + }, + { + "epoch": 3.2513037809647978, + "grad_norm": 9.393417358398438, + "learning_rate": 8.533924499234633e-06, + "loss": 0.3532, + "step": 39900 + }, + { + "epoch": 3.259452411994785, + "grad_norm": 8.129273414611816, + "learning_rate": 8.523849137364175e-06, + "loss": 0.3473, + "step": 40000 + }, + { + "epoch": 3.2676010430247717, + "grad_norm": 12.241875648498535, + "learning_rate": 8.513745265242263e-06, + "loss": 0.3576, + "step": 40100 + }, + { + "epoch": 3.275749674054759, + "grad_norm": 9.895030975341797, + "learning_rate": 8.503612964615858e-06, + "loss": 0.3458, + "step": 40200 + }, + { + "epoch": 3.2838983050847457, + "grad_norm": 5.42219877243042, + "learning_rate": 8.493452317461914e-06, + "loss": 0.3772, + "step": 40300 + }, + { + "epoch": 3.292046936114733, + "grad_norm": 8.165868759155273, + "learning_rate": 8.483263405986735e-06, + "loss": 0.3561, + "step": 40400 + }, + { + "epoch": 3.3001955671447196, + "grad_norm": 13.24457836151123, + "learning_rate": 8.4730463126253e-06, + "loss": 0.3587, + "step": 40500 + }, + { + "epoch": 3.308344198174707, + "grad_norm": 12.287585258483887, + "learning_rate": 8.462801120040595e-06, + "loss": 0.3432, + "step": 40600 + }, + { + "epoch": 3.3164928292046936, + "grad_norm": 8.932402610778809, + "learning_rate": 8.452527911122953e-06, + "loss": 0.3696, + "step": 40700 + }, + { + "epoch": 3.3246414602346808, + "grad_norm": 8.847443580627441, + "learning_rate": 8.442226768989373e-06, + "loss": 0.362, + "step": 40800 + }, + { + "epoch": 3.3327900912646675, + "grad_norm": 13.20019245147705, + "learning_rate": 8.431897776982851e-06, + "loss": 0.3543, + "step": 40900 + }, + { + "epoch": 3.3409387222946547, + "grad_norm": 8.375232696533203, + "learning_rate": 8.421541018671712e-06, + "loss": 0.3741, + "step": 41000 + }, + { + "epoch": 3.3490873533246415, + "grad_norm": 7.601521968841553, + "learning_rate": 8.411156577848927e-06, + "loss": 0.3518, + "step": 41100 + }, + { + "epoch": 3.3572359843546282, + "grad_norm": 5.853700637817383, + "learning_rate": 8.400744538531431e-06, + "loss": 0.3556, + "step": 41200 + }, + { + "epoch": 3.3653846153846154, + "grad_norm": 15.7562837600708, + "learning_rate": 8.390304984959455e-06, + "loss": 0.3591, + "step": 41300 + }, + { + "epoch": 3.373533246414602, + "grad_norm": 7.048288822174072, + "learning_rate": 8.379838001595837e-06, + "loss": 0.3774, + "step": 41400 + }, + { + "epoch": 3.3816818774445894, + "grad_norm": 8.532382011413574, + "learning_rate": 8.369343673125339e-06, + "loss": 0.3482, + "step": 41500 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 5.468735218048096, + "learning_rate": 8.358822084453964e-06, + "loss": 0.3637, + "step": 41600 + }, + { + "epoch": 3.3979791395045633, + "grad_norm": 7.324248313903809, + "learning_rate": 8.348273320708269e-06, + "loss": 0.365, + "step": 41700 + }, + { + "epoch": 3.40612777053455, + "grad_norm": 8.06946849822998, + "learning_rate": 8.33769746723467e-06, + "loss": 0.3661, + "step": 41800 + }, + { + "epoch": 3.4142764015645373, + "grad_norm": 11.85434341430664, + "learning_rate": 8.32709460959876e-06, + "loss": 0.3542, + "step": 41900 + }, + { + "epoch": 3.422425032594524, + "grad_norm": 8.629081726074219, + "learning_rate": 8.316464833584618e-06, + "loss": 0.3476, + "step": 42000 + }, + { + "epoch": 3.430573663624511, + "grad_norm": 7.888760566711426, + "learning_rate": 8.305808225194103e-06, + "loss": 0.3752, + "step": 42100 + }, + { + "epoch": 3.438722294654498, + "grad_norm": 8.756083488464355, + "learning_rate": 8.295124870646168e-06, + "loss": 0.359, + "step": 42200 + }, + { + "epoch": 3.446870925684485, + "grad_norm": 8.682005882263184, + "learning_rate": 8.284414856376161e-06, + "loss": 0.3607, + "step": 42300 + }, + { + "epoch": 3.455019556714472, + "grad_norm": 14.85304069519043, + "learning_rate": 8.273678269035126e-06, + "loss": 0.3417, + "step": 42400 + }, + { + "epoch": 3.463168187744459, + "grad_norm": 10.479057312011719, + "learning_rate": 8.262915195489097e-06, + "loss": 0.3571, + "step": 42500 + }, + { + "epoch": 3.471316818774446, + "grad_norm": 9.107665061950684, + "learning_rate": 8.2521257228184e-06, + "loss": 0.3655, + "step": 42600 + }, + { + "epoch": 3.479465449804433, + "grad_norm": 10.01933765411377, + "learning_rate": 8.241309938316947e-06, + "loss": 0.363, + "step": 42700 + }, + { + "epoch": 3.48761408083442, + "grad_norm": 7.9999189376831055, + "learning_rate": 8.230467929491533e-06, + "loss": 0.3753, + "step": 42800 + }, + { + "epoch": 3.4957627118644066, + "grad_norm": 9.211396217346191, + "learning_rate": 8.219599784061124e-06, + "loss": 0.3389, + "step": 42900 + }, + { + "epoch": 3.5039113428943938, + "grad_norm": 9.140076637268066, + "learning_rate": 8.20870558995614e-06, + "loss": 0.3683, + "step": 43000 + }, + { + "epoch": 3.512059973924381, + "grad_norm": 9.534440040588379, + "learning_rate": 8.197785435317766e-06, + "loss": 0.3585, + "step": 43100 + }, + { + "epoch": 3.5202086049543677, + "grad_norm": 10.818157196044922, + "learning_rate": 8.186839408497213e-06, + "loss": 0.3546, + "step": 43200 + }, + { + "epoch": 3.5283572359843545, + "grad_norm": 11.54218578338623, + "learning_rate": 8.175867598055021e-06, + "loss": 0.3818, + "step": 43300 + }, + { + "epoch": 3.5365058670143417, + "grad_norm": 10.037505149841309, + "learning_rate": 8.164870092760336e-06, + "loss": 0.347, + "step": 43400 + }, + { + "epoch": 3.5446544980443284, + "grad_norm": 11.143013000488281, + "learning_rate": 8.153846981590191e-06, + "loss": 0.3633, + "step": 43500 + }, + { + "epoch": 3.5528031290743156, + "grad_norm": 9.558606147766113, + "learning_rate": 8.142798353728786e-06, + "loss": 0.373, + "step": 43600 + }, + { + "epoch": 3.5609517601043024, + "grad_norm": 13.201570510864258, + "learning_rate": 8.131724298566767e-06, + "loss": 0.3611, + "step": 43700 + }, + { + "epoch": 3.5691003911342896, + "grad_norm": 10.490971565246582, + "learning_rate": 8.120624905700511e-06, + "loss": 0.3292, + "step": 43800 + }, + { + "epoch": 3.5772490221642763, + "grad_norm": 3.778831958770752, + "learning_rate": 8.109500264931387e-06, + "loss": 0.3731, + "step": 43900 + }, + { + "epoch": 3.5853976531942635, + "grad_norm": 10.723892211914062, + "learning_rate": 8.098350466265034e-06, + "loss": 0.3783, + "step": 44000 + }, + { + "epoch": 3.5935462842242503, + "grad_norm": 9.849285125732422, + "learning_rate": 8.087175599910642e-06, + "loss": 0.337, + "step": 44100 + }, + { + "epoch": 3.601694915254237, + "grad_norm": 11.700067520141602, + "learning_rate": 8.07597575628021e-06, + "loss": 0.3639, + "step": 44200 + }, + { + "epoch": 3.609843546284224, + "grad_norm": 37.506065368652344, + "learning_rate": 8.064751025987822e-06, + "loss": 0.3644, + "step": 44300 + }, + { + "epoch": 3.6179921773142114, + "grad_norm": 9.770977973937988, + "learning_rate": 8.053501499848907e-06, + "loss": 0.3838, + "step": 44400 + }, + { + "epoch": 3.626140808344198, + "grad_norm": 14.631871223449707, + "learning_rate": 8.042227268879516e-06, + "loss": 0.3732, + "step": 44500 + }, + { + "epoch": 3.634289439374185, + "grad_norm": 7.656193256378174, + "learning_rate": 8.030928424295572e-06, + "loss": 0.358, + "step": 44600 + }, + { + "epoch": 3.642438070404172, + "grad_norm": 9.974722862243652, + "learning_rate": 8.019605057512144e-06, + "loss": 0.3588, + "step": 44700 + }, + { + "epoch": 3.6505867014341593, + "grad_norm": 12.311222076416016, + "learning_rate": 8.008257260142693e-06, + "loss": 0.362, + "step": 44800 + }, + { + "epoch": 3.658735332464146, + "grad_norm": 11.374334335327148, + "learning_rate": 7.99688512399835e-06, + "loss": 0.385, + "step": 44900 + }, + { + "epoch": 3.666883963494133, + "grad_norm": 7.951153755187988, + "learning_rate": 7.985488741087153e-06, + "loss": 0.352, + "step": 45000 + }, + { + "epoch": 3.67503259452412, + "grad_norm": 5.6287384033203125, + "learning_rate": 7.97406820361332e-06, + "loss": 0.3763, + "step": 45100 + }, + { + "epoch": 3.6831812255541068, + "grad_norm": 9.33438777923584, + "learning_rate": 7.962623603976491e-06, + "loss": 0.3852, + "step": 45200 + }, + { + "epoch": 3.691329856584094, + "grad_norm": 12.365875244140625, + "learning_rate": 7.951155034770983e-06, + "loss": 0.3775, + "step": 45300 + }, + { + "epoch": 3.6994784876140807, + "grad_norm": 9.91942024230957, + "learning_rate": 7.93966258878505e-06, + "loss": 0.3678, + "step": 45400 + }, + { + "epoch": 3.707627118644068, + "grad_norm": 9.160215377807617, + "learning_rate": 7.928146359000117e-06, + "loss": 0.36, + "step": 45500 + }, + { + "epoch": 3.7157757496740547, + "grad_norm": 11.565260887145996, + "learning_rate": 7.91660643859004e-06, + "loss": 0.3531, + "step": 45600 + }, + { + "epoch": 3.723924380704042, + "grad_norm": 4.027003765106201, + "learning_rate": 7.905042920920344e-06, + "loss": 0.3722, + "step": 45700 + }, + { + "epoch": 3.7320730117340286, + "grad_norm": 13.809627532958984, + "learning_rate": 7.893455899547476e-06, + "loss": 0.3524, + "step": 45800 + }, + { + "epoch": 3.740221642764016, + "grad_norm": 13.452054023742676, + "learning_rate": 7.881845468218039e-06, + "loss": 0.375, + "step": 45900 + }, + { + "epoch": 3.7483702737940026, + "grad_norm": 9.63260269165039, + "learning_rate": 7.87021172086804e-06, + "loss": 0.3636, + "step": 46000 + }, + { + "epoch": 3.7565189048239898, + "grad_norm": 8.539379119873047, + "learning_rate": 7.85855475162213e-06, + "loss": 0.3687, + "step": 46100 + }, + { + "epoch": 3.7646675358539765, + "grad_norm": 7.635307788848877, + "learning_rate": 7.846874654792835e-06, + "loss": 0.3709, + "step": 46200 + }, + { + "epoch": 3.7728161668839633, + "grad_norm": 8.707938194274902, + "learning_rate": 7.835171524879805e-06, + "loss": 0.3466, + "step": 46300 + }, + { + "epoch": 3.7809647979139505, + "grad_norm": 6.248547077178955, + "learning_rate": 7.823445456569036e-06, + "loss": 0.3706, + "step": 46400 + }, + { + "epoch": 3.7891134289439377, + "grad_norm": 11.434155464172363, + "learning_rate": 7.811696544732115e-06, + "loss": 0.3907, + "step": 46500 + }, + { + "epoch": 3.7972620599739244, + "grad_norm": 5.250894546508789, + "learning_rate": 7.799924884425447e-06, + "loss": 0.377, + "step": 46600 + }, + { + "epoch": 3.805410691003911, + "grad_norm": 6.875328063964844, + "learning_rate": 7.788130570889488e-06, + "loss": 0.3569, + "step": 46700 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 8.773159980773926, + "learning_rate": 7.776313699547971e-06, + "loss": 0.3635, + "step": 46800 + }, + { + "epoch": 3.821707953063885, + "grad_norm": 4.8134002685546875, + "learning_rate": 7.764474366007138e-06, + "loss": 0.345, + "step": 46900 + }, + { + "epoch": 3.8298565840938723, + "grad_norm": 6.085391998291016, + "learning_rate": 7.752612666054963e-06, + "loss": 0.3699, + "step": 47000 + }, + { + "epoch": 3.838005215123859, + "grad_norm": 8.958887100219727, + "learning_rate": 7.740728695660389e-06, + "loss": 0.3407, + "step": 47100 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 6.2288994789123535, + "learning_rate": 7.728822550972523e-06, + "loss": 0.3633, + "step": 47200 + }, + { + "epoch": 3.854302477183833, + "grad_norm": 9.540541648864746, + "learning_rate": 7.716894328319893e-06, + "loss": 0.3476, + "step": 47300 + }, + { + "epoch": 3.86245110821382, + "grad_norm": 5.929731369018555, + "learning_rate": 7.704944124209645e-06, + "loss": 0.3929, + "step": 47400 + }, + { + "epoch": 3.870599739243807, + "grad_norm": 7.797017574310303, + "learning_rate": 7.692972035326772e-06, + "loss": 0.3728, + "step": 47500 + }, + { + "epoch": 3.878748370273794, + "grad_norm": 14.781734466552734, + "learning_rate": 7.680978158533324e-06, + "loss": 0.3546, + "step": 47600 + }, + { + "epoch": 3.886897001303781, + "grad_norm": 10.41878890991211, + "learning_rate": 7.668962590867636e-06, + "loss": 0.3603, + "step": 47700 + }, + { + "epoch": 3.895045632333768, + "grad_norm": 8.300308227539062, + "learning_rate": 7.656925429543531e-06, + "loss": 0.3546, + "step": 47800 + }, + { + "epoch": 3.903194263363755, + "grad_norm": 9.709467887878418, + "learning_rate": 7.644866771949544e-06, + "loss": 0.3575, + "step": 47900 + }, + { + "epoch": 3.9113428943937416, + "grad_norm": 7.606164455413818, + "learning_rate": 7.632786715648128e-06, + "loss": 0.3658, + "step": 48000 + }, + { + "epoch": 3.919491525423729, + "grad_norm": 11.461851119995117, + "learning_rate": 7.62068535837486e-06, + "loss": 0.3653, + "step": 48100 + }, + { + "epoch": 3.927640156453716, + "grad_norm": 11.35883617401123, + "learning_rate": 7.608562798037662e-06, + "loss": 0.3672, + "step": 48200 + }, + { + "epoch": 3.9357887874837028, + "grad_norm": 9.994701385498047, + "learning_rate": 7.596419132715997e-06, + "loss": 0.3601, + "step": 48300 + }, + { + "epoch": 3.9439374185136895, + "grad_norm": 12.242551803588867, + "learning_rate": 7.584254460660092e-06, + "loss": 0.3552, + "step": 48400 + }, + { + "epoch": 3.9520860495436767, + "grad_norm": 11.628976821899414, + "learning_rate": 7.572068880290118e-06, + "loss": 0.3644, + "step": 48500 + }, + { + "epoch": 3.960234680573664, + "grad_norm": 9.713350296020508, + "learning_rate": 7.559862490195418e-06, + "loss": 0.3463, + "step": 48600 + }, + { + "epoch": 3.9683833116036507, + "grad_norm": 5.648345470428467, + "learning_rate": 7.547635389133694e-06, + "loss": 0.3483, + "step": 48700 + }, + { + "epoch": 3.9765319426336374, + "grad_norm": 15.131999015808105, + "learning_rate": 7.535387676030222e-06, + "loss": 0.366, + "step": 48800 + }, + { + "epoch": 3.9846805736636246, + "grad_norm": 8.72270393371582, + "learning_rate": 7.523119449977028e-06, + "loss": 0.3567, + "step": 48900 + }, + { + "epoch": 3.9928292046936114, + "grad_norm": 10.733074188232422, + "learning_rate": 7.510830810232112e-06, + "loss": 0.37, + "step": 49000 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.8188755020080322, + "eval_loss": 0.529120922088623, + "eval_runtime": 6.8942, + "eval_samples_per_second": 361.175, + "eval_steps_per_second": 45.256, + "step": 49088 + }, + { + "epoch": 4.0009778357235986, + "grad_norm": 7.13838529586792, + "learning_rate": 7.498521856218637e-06, + "loss": 0.355, + "step": 49100 + }, + { + "epoch": 4.009126466753585, + "grad_norm": 5.439541816711426, + "learning_rate": 7.486192687524112e-06, + "loss": 0.3005, + "step": 49200 + }, + { + "epoch": 4.017275097783572, + "grad_norm": 17.687950134277344, + "learning_rate": 7.4738434038996e-06, + "loss": 0.2864, + "step": 49300 + }, + { + "epoch": 4.02542372881356, + "grad_norm": 11.162871360778809, + "learning_rate": 7.461474105258911e-06, + "loss": 0.3025, + "step": 49400 + }, + { + "epoch": 4.0335723598435465, + "grad_norm": 9.104811668395996, + "learning_rate": 7.449084891677785e-06, + "loss": 0.2846, + "step": 49500 + }, + { + "epoch": 4.041720990873533, + "grad_norm": 11.716981887817383, + "learning_rate": 7.436675863393086e-06, + "loss": 0.2984, + "step": 49600 + }, + { + "epoch": 4.04986962190352, + "grad_norm": 6.521731376647949, + "learning_rate": 7.424247120801997e-06, + "loss": 0.2979, + "step": 49700 + }, + { + "epoch": 4.058018252933508, + "grad_norm": 6.5696539878845215, + "learning_rate": 7.4117987644611985e-06, + "loss": 0.2898, + "step": 49800 + }, + { + "epoch": 4.066166883963494, + "grad_norm": 9.98416805267334, + "learning_rate": 7.399330895086061e-06, + "loss": 0.3115, + "step": 49900 + }, + { + "epoch": 4.074315514993481, + "grad_norm": 6.788928985595703, + "learning_rate": 7.386843613549828e-06, + "loss": 0.3158, + "step": 50000 + }, + { + "epoch": 4.082464146023468, + "grad_norm": 9.002969741821289, + "learning_rate": 7.374337020882798e-06, + "loss": 0.2964, + "step": 50100 + }, + { + "epoch": 4.090612777053455, + "grad_norm": 8.216889381408691, + "learning_rate": 7.3618112182715115e-06, + "loss": 0.3194, + "step": 50200 + }, + { + "epoch": 4.098761408083442, + "grad_norm": 17.576051712036133, + "learning_rate": 7.349266307057932e-06, + "loss": 0.3093, + "step": 50300 + }, + { + "epoch": 4.106910039113429, + "grad_norm": 14.113720893859863, + "learning_rate": 7.336702388738619e-06, + "loss": 0.2656, + "step": 50400 + }, + { + "epoch": 4.115058670143416, + "grad_norm": 13.906309127807617, + "learning_rate": 7.324119564963915e-06, + "loss": 0.2977, + "step": 50500 + }, + { + "epoch": 4.1232073011734025, + "grad_norm": 9.152776718139648, + "learning_rate": 7.311517937537122e-06, + "loss": 0.3067, + "step": 50600 + }, + { + "epoch": 4.13135593220339, + "grad_norm": 10.242730140686035, + "learning_rate": 7.29889760841367e-06, + "loss": 0.301, + "step": 50700 + }, + { + "epoch": 4.139504563233377, + "grad_norm": 11.567678451538086, + "learning_rate": 7.2862586797003046e-06, + "loss": 0.2997, + "step": 50800 + }, + { + "epoch": 4.147653194263364, + "grad_norm": 6.842143535614014, + "learning_rate": 7.27360125365425e-06, + "loss": 0.3004, + "step": 50900 + }, + { + "epoch": 4.15580182529335, + "grad_norm": 12.490499496459961, + "learning_rate": 7.260925432682386e-06, + "loss": 0.2959, + "step": 51000 + }, + { + "epoch": 4.163950456323338, + "grad_norm": 7.078547477722168, + "learning_rate": 7.248231319340422e-06, + "loss": 0.2966, + "step": 51100 + }, + { + "epoch": 4.172099087353325, + "grad_norm": 17.07299041748047, + "learning_rate": 7.235519016332064e-06, + "loss": 0.3241, + "step": 51200 + }, + { + "epoch": 4.1802477183833116, + "grad_norm": 14.579496383666992, + "learning_rate": 7.222788626508184e-06, + "loss": 0.294, + "step": 51300 + }, + { + "epoch": 4.188396349413298, + "grad_norm": 16.198028564453125, + "learning_rate": 7.210040252865984e-06, + "loss": 0.3049, + "step": 51400 + }, + { + "epoch": 4.196544980443286, + "grad_norm": 12.001542091369629, + "learning_rate": 7.197273998548174e-06, + "loss": 0.2932, + "step": 51500 + }, + { + "epoch": 4.204693611473273, + "grad_norm": 8.593428611755371, + "learning_rate": 7.184489966842128e-06, + "loss": 0.3147, + "step": 51600 + }, + { + "epoch": 4.2128422425032594, + "grad_norm": 19.119985580444336, + "learning_rate": 7.1716882611790475e-06, + "loss": 0.2929, + "step": 51700 + }, + { + "epoch": 4.220990873533246, + "grad_norm": 12.756973266601562, + "learning_rate": 7.1588689851331305e-06, + "loss": 0.2973, + "step": 51800 + }, + { + "epoch": 4.229139504563233, + "grad_norm": 11.550286293029785, + "learning_rate": 7.146032242420732e-06, + "loss": 0.2996, + "step": 51900 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 8.533171653747559, + "learning_rate": 7.133178136899522e-06, + "loss": 0.3094, + "step": 52000 + }, + { + "epoch": 4.245436766623207, + "grad_norm": 11.978692054748535, + "learning_rate": 7.120306772567647e-06, + "loss": 0.3013, + "step": 52100 + }, + { + "epoch": 4.253585397653194, + "grad_norm": 10.963492393493652, + "learning_rate": 7.107418253562889e-06, + "loss": 0.3081, + "step": 52200 + }, + { + "epoch": 4.261734028683181, + "grad_norm": 11.645411491394043, + "learning_rate": 7.0945126841618225e-06, + "loss": 0.2867, + "step": 52300 + }, + { + "epoch": 4.2698826597131685, + "grad_norm": 11.48385238647461, + "learning_rate": 7.081590168778973e-06, + "loss": 0.3088, + "step": 52400 + }, + { + "epoch": 4.278031290743155, + "grad_norm": 10.083149909973145, + "learning_rate": 7.068650811965967e-06, + "loss": 0.2954, + "step": 52500 + }, + { + "epoch": 4.286179921773142, + "grad_norm": 10.841811180114746, + "learning_rate": 7.055694718410688e-06, + "loss": 0.2944, + "step": 52600 + }, + { + "epoch": 4.294328552803129, + "grad_norm": 12.332331657409668, + "learning_rate": 7.042721992936438e-06, + "loss": 0.2857, + "step": 52700 + }, + { + "epoch": 4.302477183833116, + "grad_norm": 13.689620971679688, + "learning_rate": 7.029732740501073e-06, + "loss": 0.3024, + "step": 52800 + }, + { + "epoch": 4.310625814863103, + "grad_norm": 13.064624786376953, + "learning_rate": 7.016727066196168e-06, + "loss": 0.2917, + "step": 52900 + }, + { + "epoch": 4.31877444589309, + "grad_norm": 8.214381217956543, + "learning_rate": 7.003705075246163e-06, + "loss": 0.3173, + "step": 53000 + }, + { + "epoch": 4.326923076923077, + "grad_norm": 14.797425270080566, + "learning_rate": 6.990666873007506e-06, + "loss": 0.2734, + "step": 53100 + }, + { + "epoch": 4.335071707953064, + "grad_norm": 10.985969543457031, + "learning_rate": 6.977612564967808e-06, + "loss": 0.2958, + "step": 53200 + }, + { + "epoch": 4.343220338983051, + "grad_norm": 12.808884620666504, + "learning_rate": 6.964542256744986e-06, + "loss": 0.3169, + "step": 53300 + }, + { + "epoch": 4.351368970013038, + "grad_norm": 21.643781661987305, + "learning_rate": 6.9514560540864095e-06, + "loss": 0.3154, + "step": 53400 + }, + { + "epoch": 4.3595176010430245, + "grad_norm": 7.609200477600098, + "learning_rate": 6.938354062868041e-06, + "loss": 0.2985, + "step": 53500 + }, + { + "epoch": 4.367666232073011, + "grad_norm": 13.469466209411621, + "learning_rate": 6.925236389093588e-06, + "loss": 0.3063, + "step": 53600 + }, + { + "epoch": 4.375814863102999, + "grad_norm": 12.873883247375488, + "learning_rate": 6.912103138893636e-06, + "loss": 0.2903, + "step": 53700 + }, + { + "epoch": 4.383963494132986, + "grad_norm": 8.953607559204102, + "learning_rate": 6.898954418524797e-06, + "loss": 0.2897, + "step": 53800 + }, + { + "epoch": 4.3921121251629724, + "grad_norm": 21.484949111938477, + "learning_rate": 6.885790334368844e-06, + "loss": 0.2989, + "step": 53900 + }, + { + "epoch": 4.400260756192959, + "grad_norm": 8.624776840209961, + "learning_rate": 6.872610992931857e-06, + "loss": 0.2811, + "step": 54000 + }, + { + "epoch": 4.408409387222947, + "grad_norm": 13.120560646057129, + "learning_rate": 6.859416500843351e-06, + "loss": 0.3003, + "step": 54100 + }, + { + "epoch": 4.416558018252934, + "grad_norm": 8.616204261779785, + "learning_rate": 6.846206964855426e-06, + "loss": 0.3191, + "step": 54200 + }, + { + "epoch": 4.42470664928292, + "grad_norm": 7.0158233642578125, + "learning_rate": 6.832982491841894e-06, + "loss": 0.31, + "step": 54300 + }, + { + "epoch": 4.432855280312907, + "grad_norm": 9.716617584228516, + "learning_rate": 6.819743188797419e-06, + "loss": 0.2949, + "step": 54400 + }, + { + "epoch": 4.441003911342895, + "grad_norm": 10.602276802062988, + "learning_rate": 6.806489162836649e-06, + "loss": 0.3037, + "step": 54500 + }, + { + "epoch": 4.4491525423728815, + "grad_norm": 8.699592590332031, + "learning_rate": 6.793220521193347e-06, + "loss": 0.313, + "step": 54600 + }, + { + "epoch": 4.457301173402868, + "grad_norm": 8.307058334350586, + "learning_rate": 6.779937371219532e-06, + "loss": 0.2924, + "step": 54700 + }, + { + "epoch": 4.465449804432855, + "grad_norm": 10.045998573303223, + "learning_rate": 6.766639820384602e-06, + "loss": 0.3124, + "step": 54800 + }, + { + "epoch": 4.473598435462843, + "grad_norm": 15.478697776794434, + "learning_rate": 6.753327976274467e-06, + "loss": 0.2892, + "step": 54900 + }, + { + "epoch": 4.481747066492829, + "grad_norm": 12.46609878540039, + "learning_rate": 6.740001946590675e-06, + "loss": 0.2809, + "step": 55000 + }, + { + "epoch": 4.489895697522816, + "grad_norm": 11.292198181152344, + "learning_rate": 6.726661839149556e-06, + "loss": 0.2915, + "step": 55100 + }, + { + "epoch": 4.498044328552803, + "grad_norm": 15.23190689086914, + "learning_rate": 6.71330776188133e-06, + "loss": 0.306, + "step": 55200 + }, + { + "epoch": 4.5061929595827905, + "grad_norm": 11.232503890991211, + "learning_rate": 6.69993982282924e-06, + "loss": 0.2979, + "step": 55300 + }, + { + "epoch": 4.514341590612777, + "grad_norm": 11.436495780944824, + "learning_rate": 6.686558130148687e-06, + "loss": 0.2976, + "step": 55400 + }, + { + "epoch": 4.522490221642764, + "grad_norm": 11.90659236907959, + "learning_rate": 6.673162792106341e-06, + "loss": 0.3106, + "step": 55500 + }, + { + "epoch": 4.530638852672751, + "grad_norm": 9.979248046875, + "learning_rate": 6.6597539170792795e-06, + "loss": 0.2948, + "step": 55600 + }, + { + "epoch": 4.5387874837027375, + "grad_norm": 19.104442596435547, + "learning_rate": 6.646331613554094e-06, + "loss": 0.3248, + "step": 55700 + }, + { + "epoch": 4.546936114732725, + "grad_norm": 9.139418601989746, + "learning_rate": 6.632895990126028e-06, + "loss": 0.2996, + "step": 55800 + }, + { + "epoch": 4.555084745762712, + "grad_norm": 9.373650550842285, + "learning_rate": 6.619447155498091e-06, + "loss": 0.3127, + "step": 55900 + }, + { + "epoch": 4.563233376792699, + "grad_norm": 12.213810920715332, + "learning_rate": 6.605985218480179e-06, + "loss": 0.3113, + "step": 56000 + }, + { + "epoch": 4.5713820078226854, + "grad_norm": 9.15962028503418, + "learning_rate": 6.5925102879881915e-06, + "loss": 0.311, + "step": 56100 + }, + { + "epoch": 4.579530638852673, + "grad_norm": 11.712223052978516, + "learning_rate": 6.579022473043159e-06, + "loss": 0.3074, + "step": 56200 + }, + { + "epoch": 4.58767926988266, + "grad_norm": 9.559146881103516, + "learning_rate": 6.565521882770355e-06, + "loss": 0.3065, + "step": 56300 + }, + { + "epoch": 4.595827900912647, + "grad_norm": 8.07590389251709, + "learning_rate": 6.552008626398409e-06, + "loss": 0.3195, + "step": 56400 + }, + { + "epoch": 4.603976531942633, + "grad_norm": 13.063721656799316, + "learning_rate": 6.5384828132584335e-06, + "loss": 0.2778, + "step": 56500 + }, + { + "epoch": 4.612125162972621, + "grad_norm": 13.26430892944336, + "learning_rate": 6.524944552783129e-06, + "loss": 0.3081, + "step": 56600 + }, + { + "epoch": 4.620273794002608, + "grad_norm": 14.221997261047363, + "learning_rate": 6.511393954505906e-06, + "loss": 0.3072, + "step": 56700 + }, + { + "epoch": 4.6284224250325945, + "grad_norm": 10.34438705444336, + "learning_rate": 6.497831128059993e-06, + "loss": 0.3078, + "step": 56800 + }, + { + "epoch": 4.636571056062581, + "grad_norm": 15.65034294128418, + "learning_rate": 6.4842561831775575e-06, + "loss": 0.3035, + "step": 56900 + }, + { + "epoch": 4.644719687092568, + "grad_norm": 10.238895416259766, + "learning_rate": 6.470669229688809e-06, + "loss": 0.2962, + "step": 57000 + }, + { + "epoch": 4.652868318122556, + "grad_norm": 16.671092987060547, + "learning_rate": 6.457070377521111e-06, + "loss": 0.307, + "step": 57100 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 11.118473052978516, + "learning_rate": 6.443459736698106e-06, + "loss": 0.3079, + "step": 57200 + }, + { + "epoch": 4.669165580182529, + "grad_norm": 7.511115550994873, + "learning_rate": 6.429837417338804e-06, + "loss": 0.2959, + "step": 57300 + }, + { + "epoch": 4.677314211212517, + "grad_norm": 14.2573881149292, + "learning_rate": 6.416203529656707e-06, + "loss": 0.2948, + "step": 57400 + }, + { + "epoch": 4.6854628422425035, + "grad_norm": 11.03162956237793, + "learning_rate": 6.40255818395891e-06, + "loss": 0.3095, + "step": 57500 + }, + { + "epoch": 4.69361147327249, + "grad_norm": 11.995973587036133, + "learning_rate": 6.388901490645214e-06, + "loss": 0.3099, + "step": 57600 + }, + { + "epoch": 4.701760104302477, + "grad_norm": 9.43193244934082, + "learning_rate": 6.375233560207229e-06, + "loss": 0.3276, + "step": 57700 + }, + { + "epoch": 4.709908735332464, + "grad_norm": 10.617565155029297, + "learning_rate": 6.361554503227475e-06, + "loss": 0.3149, + "step": 57800 + }, + { + "epoch": 4.718057366362451, + "grad_norm": 16.004545211791992, + "learning_rate": 6.347864430378501e-06, + "loss": 0.2907, + "step": 57900 + }, + { + "epoch": 4.726205997392438, + "grad_norm": 18.075027465820312, + "learning_rate": 6.334163452421978e-06, + "loss": 0.3168, + "step": 58000 + }, + { + "epoch": 4.734354628422425, + "grad_norm": 19.736661911010742, + "learning_rate": 6.320451680207805e-06, + "loss": 0.3077, + "step": 58100 + }, + { + "epoch": 4.742503259452412, + "grad_norm": 6.202484607696533, + "learning_rate": 6.306729224673217e-06, + "loss": 0.3022, + "step": 58200 + }, + { + "epoch": 4.750651890482399, + "grad_norm": 4.973538398742676, + "learning_rate": 6.29299619684188e-06, + "loss": 0.3032, + "step": 58300 + }, + { + "epoch": 4.758800521512386, + "grad_norm": 9.67834186553955, + "learning_rate": 6.2792527078230024e-06, + "loss": 0.2937, + "step": 58400 + }, + { + "epoch": 4.766949152542373, + "grad_norm": 7.5604777336120605, + "learning_rate": 6.265498868810424e-06, + "loss": 0.3132, + "step": 58500 + }, + { + "epoch": 4.77509778357236, + "grad_norm": 11.391521453857422, + "learning_rate": 6.251734791081728e-06, + "loss": 0.3249, + "step": 58600 + }, + { + "epoch": 4.783246414602347, + "grad_norm": 16.40961265563965, + "learning_rate": 6.237960585997334e-06, + "loss": 0.2951, + "step": 58700 + }, + { + "epoch": 4.791395045632334, + "grad_norm": 4.114518165588379, + "learning_rate": 6.224176364999595e-06, + "loss": 0.3091, + "step": 58800 + }, + { + "epoch": 4.799543676662321, + "grad_norm": 9.569024085998535, + "learning_rate": 6.210382239611906e-06, + "loss": 0.3093, + "step": 58900 + }, + { + "epoch": 4.8076923076923075, + "grad_norm": 30.753637313842773, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.2982, + "step": 59000 + }, + { + "epoch": 4.815840938722294, + "grad_norm": 7.500620365142822, + "learning_rate": 6.18276472216e-06, + "loss": 0.2956, + "step": 59100 + }, + { + "epoch": 4.823989569752282, + "grad_norm": 14.710212707519531, + "learning_rate": 6.16894155353962e-06, + "loss": 0.3078, + "step": 59200 + }, + { + "epoch": 4.832138200782269, + "grad_norm": 7.550549507141113, + "learning_rate": 6.1551089274151525e-06, + "loss": 0.3093, + "step": 59300 + }, + { + "epoch": 4.840286831812255, + "grad_norm": 8.313648223876953, + "learning_rate": 6.141266955701616e-06, + "loss": 0.2872, + "step": 59400 + }, + { + "epoch": 4.848435462842242, + "grad_norm": 3.505223274230957, + "learning_rate": 6.127415750389645e-06, + "loss": 0.2991, + "step": 59500 + }, + { + "epoch": 4.85658409387223, + "grad_norm": 10.405817985534668, + "learning_rate": 6.113555423544576e-06, + "loss": 0.3083, + "step": 59600 + }, + { + "epoch": 4.8647327249022165, + "grad_norm": 9.818922996520996, + "learning_rate": 6.0996860873055505e-06, + "loss": 0.3131, + "step": 59700 + }, + { + "epoch": 4.872881355932203, + "grad_norm": 8.345934867858887, + "learning_rate": 6.085807853884595e-06, + "loss": 0.2963, + "step": 59800 + }, + { + "epoch": 4.88102998696219, + "grad_norm": 10.804642677307129, + "learning_rate": 6.071920835565724e-06, + "loss": 0.315, + "step": 59900 + }, + { + "epoch": 4.889178617992178, + "grad_norm": 10.550320625305176, + "learning_rate": 6.058025144704026e-06, + "loss": 0.288, + "step": 60000 + }, + { + "epoch": 4.897327249022164, + "grad_norm": 7.386425018310547, + "learning_rate": 6.044120893724758e-06, + "loss": 0.3175, + "step": 60100 + }, + { + "epoch": 4.905475880052151, + "grad_norm": 16.652528762817383, + "learning_rate": 6.030208195122433e-06, + "loss": 0.3218, + "step": 60200 + }, + { + "epoch": 4.913624511082138, + "grad_norm": 15.053431510925293, + "learning_rate": 6.016287161459907e-06, + "loss": 0.2769, + "step": 60300 + }, + { + "epoch": 4.921773142112125, + "grad_norm": 7.756086349487305, + "learning_rate": 6.002357905367481e-06, + "loss": 0.289, + "step": 60400 + }, + { + "epoch": 4.929921773142112, + "grad_norm": 10.426520347595215, + "learning_rate": 5.9884205395419725e-06, + "loss": 0.3169, + "step": 60500 + }, + { + "epoch": 4.938070404172099, + "grad_norm": 12.334880828857422, + "learning_rate": 5.974475176745813e-06, + "loss": 0.3093, + "step": 60600 + }, + { + "epoch": 4.946219035202086, + "grad_norm": 14.239689826965332, + "learning_rate": 5.960521929806141e-06, + "loss": 0.3036, + "step": 60700 + }, + { + "epoch": 4.9543676662320735, + "grad_norm": 12.593892097473145, + "learning_rate": 5.946560911613877e-06, + "loss": 0.2911, + "step": 60800 + }, + { + "epoch": 4.96251629726206, + "grad_norm": 4.950251579284668, + "learning_rate": 5.9325922351228186e-06, + "loss": 0.2942, + "step": 60900 + }, + { + "epoch": 4.970664928292047, + "grad_norm": 10.60743522644043, + "learning_rate": 5.918616013348719e-06, + "loss": 0.302, + "step": 61000 + }, + { + "epoch": 4.978813559322034, + "grad_norm": 18.459735870361328, + "learning_rate": 5.904632359368388e-06, + "loss": 0.2806, + "step": 61100 + }, + { + "epoch": 4.9869621903520205, + "grad_norm": 10.454113006591797, + "learning_rate": 5.890641386318756e-06, + "loss": 0.3009, + "step": 61200 + }, + { + "epoch": 4.995110821382008, + "grad_norm": 12.8052396774292, + "learning_rate": 5.876643207395976e-06, + "loss": 0.3122, + "step": 61300 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.8285140562248996, + "eval_loss": 0.5347269773483276, + "eval_runtime": 6.815, + "eval_samples_per_second": 365.37, + "eval_steps_per_second": 45.781, + "step": 61360 + }, + { + "epoch": 5.003259452411995, + "grad_norm": 3.523259162902832, + "learning_rate": 5.862637935854502e-06, + "loss": 0.289, + "step": 61400 + }, + { + "epoch": 5.011408083441982, + "grad_norm": 14.498679161071777, + "learning_rate": 5.848625685006164e-06, + "loss": 0.2673, + "step": 61500 + }, + { + "epoch": 5.019556714471968, + "grad_norm": 15.165558815002441, + "learning_rate": 5.834606568219269e-06, + "loss": 0.2499, + "step": 61600 + }, + { + "epoch": 5.027705345501956, + "grad_norm": 12.705721855163574, + "learning_rate": 5.820580698917666e-06, + "loss": 0.2486, + "step": 61700 + }, + { + "epoch": 5.035853976531943, + "grad_norm": 15.987256050109863, + "learning_rate": 5.806548190579842e-06, + "loss": 0.2417, + "step": 61800 + }, + { + "epoch": 5.0440026075619295, + "grad_norm": 8.831116676330566, + "learning_rate": 5.792509156737997e-06, + "loss": 0.2265, + "step": 61900 + }, + { + "epoch": 5.052151238591916, + "grad_norm": 12.182964324951172, + "learning_rate": 5.7784637109771225e-06, + "loss": 0.2538, + "step": 62000 + }, + { + "epoch": 5.060299869621904, + "grad_norm": 10.809981346130371, + "learning_rate": 5.764411966934092e-06, + "loss": 0.2603, + "step": 62100 + }, + { + "epoch": 5.068448500651891, + "grad_norm": 5.705296039581299, + "learning_rate": 5.750354038296733e-06, + "loss": 0.2438, + "step": 62200 + }, + { + "epoch": 5.076597131681877, + "grad_norm": 9.95255184173584, + "learning_rate": 5.736290038802911e-06, + "loss": 0.234, + "step": 62300 + }, + { + "epoch": 5.084745762711864, + "grad_norm": 7.724064350128174, + "learning_rate": 5.722220082239608e-06, + "loss": 0.2488, + "step": 62400 + }, + { + "epoch": 5.092894393741851, + "grad_norm": 10.82822036743164, + "learning_rate": 5.708144282442006e-06, + "loss": 0.2591, + "step": 62500 + }, + { + "epoch": 5.101043024771839, + "grad_norm": 8.642077445983887, + "learning_rate": 5.694062753292559e-06, + "loss": 0.2581, + "step": 62600 + }, + { + "epoch": 5.109191655801825, + "grad_norm": 10.630475044250488, + "learning_rate": 5.679975608720078e-06, + "loss": 0.2408, + "step": 62700 + }, + { + "epoch": 5.117340286831812, + "grad_norm": 10.559286117553711, + "learning_rate": 5.665882962698801e-06, + "loss": 0.2417, + "step": 62800 + }, + { + "epoch": 5.125488917861799, + "grad_norm": 6.505354881286621, + "learning_rate": 5.651784929247486e-06, + "loss": 0.2517, + "step": 62900 + }, + { + "epoch": 5.1336375488917865, + "grad_norm": 10.710380554199219, + "learning_rate": 5.637681622428468e-06, + "loss": 0.235, + "step": 63000 + }, + { + "epoch": 5.141786179921773, + "grad_norm": 4.721646785736084, + "learning_rate": 5.6235731563467535e-06, + "loss": 0.2577, + "step": 63100 + }, + { + "epoch": 5.14993481095176, + "grad_norm": 8.588154792785645, + "learning_rate": 5.609459645149089e-06, + "loss": 0.2728, + "step": 63200 + }, + { + "epoch": 5.158083441981747, + "grad_norm": 19.248777389526367, + "learning_rate": 5.595341203023044e-06, + "loss": 0.2371, + "step": 63300 + }, + { + "epoch": 5.166232073011734, + "grad_norm": 8.45293140411377, + "learning_rate": 5.581217944196071e-06, + "loss": 0.264, + "step": 63400 + }, + { + "epoch": 5.174380704041721, + "grad_norm": 8.333393096923828, + "learning_rate": 5.567089982934605e-06, + "loss": 0.2558, + "step": 63500 + }, + { + "epoch": 5.182529335071708, + "grad_norm": 14.054290771484375, + "learning_rate": 5.552957433543119e-06, + "loss": 0.2524, + "step": 63600 + }, + { + "epoch": 5.190677966101695, + "grad_norm": 12.668076515197754, + "learning_rate": 5.538820410363214e-06, + "loss": 0.2408, + "step": 63700 + }, + { + "epoch": 5.198826597131682, + "grad_norm": 9.344785690307617, + "learning_rate": 5.524679027772676e-06, + "loss": 0.2538, + "step": 63800 + }, + { + "epoch": 5.206975228161669, + "grad_norm": 9.552376747131348, + "learning_rate": 5.510533400184572e-06, + "loss": 0.2535, + "step": 63900 + }, + { + "epoch": 5.215123859191656, + "grad_norm": 10.270748138427734, + "learning_rate": 5.496383642046311e-06, + "loss": 0.2672, + "step": 64000 + }, + { + "epoch": 5.2232724902216425, + "grad_norm": 15.067427635192871, + "learning_rate": 5.4822298678387174e-06, + "loss": 0.2455, + "step": 64100 + }, + { + "epoch": 5.23142112125163, + "grad_norm": 5.5667948722839355, + "learning_rate": 5.468072192075111e-06, + "loss": 0.2539, + "step": 64200 + }, + { + "epoch": 5.239569752281617, + "grad_norm": 11.088788032531738, + "learning_rate": 5.453910729300378e-06, + "loss": 0.2523, + "step": 64300 + }, + { + "epoch": 5.247718383311604, + "grad_norm": 24.676876068115234, + "learning_rate": 5.439745594090042e-06, + "loss": 0.2488, + "step": 64400 + }, + { + "epoch": 5.25586701434159, + "grad_norm": 9.937374114990234, + "learning_rate": 5.425576901049342e-06, + "loss": 0.2575, + "step": 64500 + }, + { + "epoch": 5.264015645371577, + "grad_norm": 13.66021728515625, + "learning_rate": 5.411404764812299e-06, + "loss": 0.2396, + "step": 64600 + }, + { + "epoch": 5.272164276401565, + "grad_norm": 11.568852424621582, + "learning_rate": 5.3972293000407945e-06, + "loss": 0.2398, + "step": 64700 + }, + { + "epoch": 5.280312907431552, + "grad_norm": 9.292428970336914, + "learning_rate": 5.383050621423639e-06, + "loss": 0.2696, + "step": 64800 + }, + { + "epoch": 5.288461538461538, + "grad_norm": 21.01643180847168, + "learning_rate": 5.368868843675642e-06, + "loss": 0.2522, + "step": 64900 + }, + { + "epoch": 5.296610169491525, + "grad_norm": 7.557727813720703, + "learning_rate": 5.354684081536693e-06, + "loss": 0.2709, + "step": 65000 + }, + { + "epoch": 5.304758800521513, + "grad_norm": 7.703597545623779, + "learning_rate": 5.340496449770824e-06, + "loss": 0.2561, + "step": 65100 + }, + { + "epoch": 5.3129074315514995, + "grad_norm": 11.133892059326172, + "learning_rate": 5.3263060631652805e-06, + "loss": 0.2595, + "step": 65200 + }, + { + "epoch": 5.321056062581486, + "grad_norm": 15.144754409790039, + "learning_rate": 5.312113036529604e-06, + "loss": 0.2506, + "step": 65300 + }, + { + "epoch": 5.329204693611473, + "grad_norm": 7.959693431854248, + "learning_rate": 5.297917484694692e-06, + "loss": 0.2644, + "step": 65400 + }, + { + "epoch": 5.337353324641461, + "grad_norm": 15.450654029846191, + "learning_rate": 5.28371952251187e-06, + "loss": 0.2533, + "step": 65500 + }, + { + "epoch": 5.345501955671447, + "grad_norm": 6.035745620727539, + "learning_rate": 5.269519264851967e-06, + "loss": 0.2507, + "step": 65600 + }, + { + "epoch": 5.353650586701434, + "grad_norm": 8.266439437866211, + "learning_rate": 5.255316826604385e-06, + "loss": 0.2588, + "step": 65700 + }, + { + "epoch": 5.361799217731421, + "grad_norm": 9.542835235595703, + "learning_rate": 5.24111232267617e-06, + "loss": 0.2584, + "step": 65800 + }, + { + "epoch": 5.369947848761408, + "grad_norm": 11.433173179626465, + "learning_rate": 5.2269058679910735e-06, + "loss": 0.2451, + "step": 65900 + }, + { + "epoch": 5.378096479791395, + "grad_norm": 12.72153377532959, + "learning_rate": 5.212697577488635e-06, + "loss": 0.2496, + "step": 66000 + }, + { + "epoch": 5.386245110821382, + "grad_norm": 9.416111946105957, + "learning_rate": 5.1984875661232495e-06, + "loss": 0.2567, + "step": 66100 + }, + { + "epoch": 5.394393741851369, + "grad_norm": 15.701902389526367, + "learning_rate": 5.184275948863231e-06, + "loss": 0.2432, + "step": 66200 + }, + { + "epoch": 5.4025423728813555, + "grad_norm": 7.241784572601318, + "learning_rate": 5.1700628406898835e-06, + "loss": 0.2441, + "step": 66300 + }, + { + "epoch": 5.410691003911343, + "grad_norm": 21.102312088012695, + "learning_rate": 5.155848356596581e-06, + "loss": 0.2695, + "step": 66400 + }, + { + "epoch": 5.41883963494133, + "grad_norm": 12.834817886352539, + "learning_rate": 5.1416326115878255e-06, + "loss": 0.2705, + "step": 66500 + }, + { + "epoch": 5.426988265971317, + "grad_norm": 29.203624725341797, + "learning_rate": 5.127415720678319e-06, + "loss": 0.2354, + "step": 66600 + }, + { + "epoch": 5.435136897001303, + "grad_norm": 13.500927925109863, + "learning_rate": 5.113197798892038e-06, + "loss": 0.2508, + "step": 66700 + }, + { + "epoch": 5.443285528031291, + "grad_norm": 7.524002552032471, + "learning_rate": 5.098978961261296e-06, + "loss": 0.2494, + "step": 66800 + }, + { + "epoch": 5.451434159061278, + "grad_norm": 17.00074577331543, + "learning_rate": 5.084759322825821e-06, + "loss": 0.241, + "step": 66900 + }, + { + "epoch": 5.459582790091265, + "grad_norm": 11.755769729614258, + "learning_rate": 5.070538998631813e-06, + "loss": 0.2658, + "step": 67000 + }, + { + "epoch": 5.467731421121251, + "grad_norm": 13.64929485321045, + "learning_rate": 5.056318103731028e-06, + "loss": 0.2515, + "step": 67100 + }, + { + "epoch": 5.475880052151239, + "grad_norm": 6.673364639282227, + "learning_rate": 5.042096753179835e-06, + "loss": 0.2505, + "step": 67200 + }, + { + "epoch": 5.484028683181226, + "grad_norm": 6.764876365661621, + "learning_rate": 5.02787506203829e-06, + "loss": 0.2584, + "step": 67300 + }, + { + "epoch": 5.4921773142112125, + "grad_norm": 11.133795738220215, + "learning_rate": 5.013653145369204e-06, + "loss": 0.2598, + "step": 67400 + }, + { + "epoch": 5.500325945241199, + "grad_norm": 11.689901351928711, + "learning_rate": 4.9994311182372145e-06, + "loss": 0.2397, + "step": 67500 + }, + { + "epoch": 5.508474576271187, + "grad_norm": 18.084266662597656, + "learning_rate": 4.985209095707852e-06, + "loss": 0.265, + "step": 67600 + }, + { + "epoch": 5.516623207301174, + "grad_norm": 12.136353492736816, + "learning_rate": 4.970987192846609e-06, + "loss": 0.2372, + "step": 67700 + }, + { + "epoch": 5.52477183833116, + "grad_norm": 14.060345649719238, + "learning_rate": 4.95676552471801e-06, + "loss": 0.2657, + "step": 67800 + }, + { + "epoch": 5.532920469361147, + "grad_norm": 5.493065357208252, + "learning_rate": 4.942544206384682e-06, + "loss": 0.2377, + "step": 67900 + }, + { + "epoch": 5.541069100391134, + "grad_norm": 13.543553352355957, + "learning_rate": 4.928323352906421e-06, + "loss": 0.2456, + "step": 68000 + }, + { + "epoch": 5.5492177314211215, + "grad_norm": 12.011448860168457, + "learning_rate": 4.9141030793392595e-06, + "loss": 0.2695, + "step": 68100 + }, + { + "epoch": 5.557366362451108, + "grad_norm": 7.862688064575195, + "learning_rate": 4.899883500734542e-06, + "loss": 0.2668, + "step": 68200 + }, + { + "epoch": 5.565514993481095, + "grad_norm": 11.895374298095703, + "learning_rate": 4.885664732137988e-06, + "loss": 0.2581, + "step": 68300 + }, + { + "epoch": 5.573663624511082, + "grad_norm": 19.049335479736328, + "learning_rate": 4.871446888588762e-06, + "loss": 0.2581, + "step": 68400 + }, + { + "epoch": 5.581812255541069, + "grad_norm": 15.173524856567383, + "learning_rate": 4.85723008511855e-06, + "loss": 0.2374, + "step": 68500 + }, + { + "epoch": 5.589960886571056, + "grad_norm": 15.82532024383545, + "learning_rate": 4.84301443675062e-06, + "loss": 0.2548, + "step": 68600 + }, + { + "epoch": 5.598109517601043, + "grad_norm": 9.289793014526367, + "learning_rate": 4.828800058498889e-06, + "loss": 0.2585, + "step": 68700 + }, + { + "epoch": 5.60625814863103, + "grad_norm": 13.010422706604004, + "learning_rate": 4.814587065367009e-06, + "loss": 0.264, + "step": 68800 + }, + { + "epoch": 5.614406779661017, + "grad_norm": 10.556730270385742, + "learning_rate": 4.800375572347414e-06, + "loss": 0.2436, + "step": 68900 + }, + { + "epoch": 5.622555410691004, + "grad_norm": 13.723767280578613, + "learning_rate": 4.786165694420408e-06, + "loss": 0.2477, + "step": 69000 + }, + { + "epoch": 5.630704041720991, + "grad_norm": 11.722618103027344, + "learning_rate": 4.771957546553226e-06, + "loss": 0.2581, + "step": 69100 + }, + { + "epoch": 5.638852672750978, + "grad_norm": 10.373120307922363, + "learning_rate": 4.757751243699109e-06, + "loss": 0.2606, + "step": 69200 + }, + { + "epoch": 5.647001303780964, + "grad_norm": 15.857172966003418, + "learning_rate": 4.743546900796364e-06, + "loss": 0.2723, + "step": 69300 + }, + { + "epoch": 5.655149934810952, + "grad_norm": 22.450532913208008, + "learning_rate": 4.729344632767446e-06, + "loss": 0.235, + "step": 69400 + }, + { + "epoch": 5.663298565840939, + "grad_norm": 15.469109535217285, + "learning_rate": 4.71514455451802e-06, + "loss": 0.2455, + "step": 69500 + }, + { + "epoch": 5.6714471968709255, + "grad_norm": 21.650880813598633, + "learning_rate": 4.7009467809360375e-06, + "loss": 0.2597, + "step": 69600 + }, + { + "epoch": 5.679595827900913, + "grad_norm": 16.47661590576172, + "learning_rate": 4.6867514268907995e-06, + "loss": 0.2555, + "step": 69700 + }, + { + "epoch": 5.6877444589309, + "grad_norm": 16.370121002197266, + "learning_rate": 4.672558607232033e-06, + "loss": 0.2411, + "step": 69800 + }, + { + "epoch": 5.695893089960887, + "grad_norm": 10.867352485656738, + "learning_rate": 4.658368436788963e-06, + "loss": 0.2638, + "step": 69900 + }, + { + "epoch": 5.704041720990873, + "grad_norm": 13.257880210876465, + "learning_rate": 4.644181030369378e-06, + "loss": 0.233, + "step": 70000 + }, + { + "epoch": 5.71219035202086, + "grad_norm": 16.66828155517578, + "learning_rate": 4.629996502758703e-06, + "loss": 0.2549, + "step": 70100 + }, + { + "epoch": 5.720338983050848, + "grad_norm": 8.0230712890625, + "learning_rate": 4.615814968719071e-06, + "loss": 0.251, + "step": 70200 + }, + { + "epoch": 5.7284876140808345, + "grad_norm": 20.61688804626465, + "learning_rate": 4.6016365429884e-06, + "loss": 0.2617, + "step": 70300 + }, + { + "epoch": 5.736636245110821, + "grad_norm": 4.916039943695068, + "learning_rate": 4.587461340279457e-06, + "loss": 0.2772, + "step": 70400 + }, + { + "epoch": 5.744784876140808, + "grad_norm": 13.59726333618164, + "learning_rate": 4.573289475278927e-06, + "loss": 0.2654, + "step": 70500 + }, + { + "epoch": 5.752933507170796, + "grad_norm": 21.178253173828125, + "learning_rate": 4.559121062646499e-06, + "loss": 0.237, + "step": 70600 + }, + { + "epoch": 5.761082138200782, + "grad_norm": 15.958664894104004, + "learning_rate": 4.544956217013927e-06, + "loss": 0.2447, + "step": 70700 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 7.610626220703125, + "learning_rate": 4.530795052984104e-06, + "loss": 0.239, + "step": 70800 + }, + { + "epoch": 5.777379400260756, + "grad_norm": 10.934889793395996, + "learning_rate": 4.5166376851301385e-06, + "loss": 0.2562, + "step": 70900 + }, + { + "epoch": 5.7855280312907436, + "grad_norm": 7.9625244140625, + "learning_rate": 4.502484227994426e-06, + "loss": 0.2606, + "step": 71000 + }, + { + "epoch": 5.79367666232073, + "grad_norm": 15.313315391540527, + "learning_rate": 4.488334796087719e-06, + "loss": 0.2454, + "step": 71100 + }, + { + "epoch": 5.801825293350717, + "grad_norm": 16.183135986328125, + "learning_rate": 4.474189503888207e-06, + "loss": 0.2591, + "step": 71200 + }, + { + "epoch": 5.809973924380704, + "grad_norm": 8.89918041229248, + "learning_rate": 4.4600484658405815e-06, + "loss": 0.2577, + "step": 71300 + }, + { + "epoch": 5.818122555410691, + "grad_norm": 8.31811237335205, + "learning_rate": 4.445911796355119e-06, + "loss": 0.2382, + "step": 71400 + }, + { + "epoch": 5.826271186440678, + "grad_norm": 9.141270637512207, + "learning_rate": 4.431779609806751e-06, + "loss": 0.2401, + "step": 71500 + }, + { + "epoch": 5.834419817470665, + "grad_norm": 8.92165756225586, + "learning_rate": 4.4176520205341365e-06, + "loss": 0.2133, + "step": 71600 + }, + { + "epoch": 5.842568448500652, + "grad_norm": 14.15666675567627, + "learning_rate": 4.403529142838745e-06, + "loss": 0.2536, + "step": 71700 + }, + { + "epoch": 5.8507170795306385, + "grad_norm": 8.742586135864258, + "learning_rate": 4.38941109098392e-06, + "loss": 0.261, + "step": 71800 + }, + { + "epoch": 5.858865710560626, + "grad_norm": 8.7103853225708, + "learning_rate": 4.375297979193965e-06, + "loss": 0.2331, + "step": 71900 + }, + { + "epoch": 5.867014341590613, + "grad_norm": 13.822142601013184, + "learning_rate": 4.361189921653215e-06, + "loss": 0.2583, + "step": 72000 + }, + { + "epoch": 5.8751629726206, + "grad_norm": 9.043753623962402, + "learning_rate": 4.3470870325051084e-06, + "loss": 0.2635, + "step": 72100 + }, + { + "epoch": 5.883311603650586, + "grad_norm": 10.288004875183105, + "learning_rate": 4.332989425851273e-06, + "loss": 0.2644, + "step": 72200 + }, + { + "epoch": 5.891460234680574, + "grad_norm": 18.826217651367188, + "learning_rate": 4.318897215750593e-06, + "loss": 0.2515, + "step": 72300 + }, + { + "epoch": 5.899608865710561, + "grad_norm": 11.778913497924805, + "learning_rate": 4.304810516218298e-06, + "loss": 0.2628, + "step": 72400 + }, + { + "epoch": 5.9077574967405475, + "grad_norm": 16.54121971130371, + "learning_rate": 4.290729441225027e-06, + "loss": 0.2792, + "step": 72500 + }, + { + "epoch": 5.915906127770534, + "grad_norm": 12.631098747253418, + "learning_rate": 4.276654104695915e-06, + "loss": 0.2503, + "step": 72600 + }, + { + "epoch": 5.924054758800521, + "grad_norm": 10.706419944763184, + "learning_rate": 4.262584620509669e-06, + "loss": 0.2564, + "step": 72700 + }, + { + "epoch": 5.932203389830509, + "grad_norm": 8.69650650024414, + "learning_rate": 4.248521102497649e-06, + "loss": 0.2569, + "step": 72800 + }, + { + "epoch": 5.940352020860495, + "grad_norm": 12.438202857971191, + "learning_rate": 4.23446366444294e-06, + "loss": 0.2531, + "step": 72900 + }, + { + "epoch": 5.948500651890482, + "grad_norm": 22.02505874633789, + "learning_rate": 4.220412420079438e-06, + "loss": 0.2692, + "step": 73000 + }, + { + "epoch": 5.95664928292047, + "grad_norm": 13.650114059448242, + "learning_rate": 4.206367483090931e-06, + "loss": 0.2663, + "step": 73100 + }, + { + "epoch": 5.9647979139504566, + "grad_norm": 13.705251693725586, + "learning_rate": 4.192328967110172e-06, + "loss": 0.2295, + "step": 73200 + }, + { + "epoch": 5.972946544980443, + "grad_norm": 7.683305263519287, + "learning_rate": 4.178296985717967e-06, + "loss": 0.2622, + "step": 73300 + }, + { + "epoch": 5.98109517601043, + "grad_norm": 7.798497676849365, + "learning_rate": 4.16427165244225e-06, + "loss": 0.2431, + "step": 73400 + }, + { + "epoch": 5.989243807040417, + "grad_norm": 8.129569053649902, + "learning_rate": 4.150253080757172e-06, + "loss": 0.2372, + "step": 73500 + }, + { + "epoch": 5.9973924380704045, + "grad_norm": 14.516979217529297, + "learning_rate": 4.136241384082174e-06, + "loss": 0.2801, + "step": 73600 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.8168674698795181, + "eval_loss": 0.6053332686424255, + "eval_runtime": 7.0202, + "eval_samples_per_second": 354.691, + "eval_steps_per_second": 44.443, + "step": 73632 + }, + { + "epoch": 6.005541069100391, + "grad_norm": 11.174201965332031, + "learning_rate": 4.122236675781071e-06, + "loss": 0.224, + "step": 73700 + }, + { + "epoch": 6.013689700130378, + "grad_norm": 24.070091247558594, + "learning_rate": 4.108239069161147e-06, + "loss": 0.2289, + "step": 73800 + }, + { + "epoch": 6.021838331160365, + "grad_norm": 14.804594993591309, + "learning_rate": 4.09424867747222e-06, + "loss": 0.2017, + "step": 73900 + }, + { + "epoch": 6.029986962190352, + "grad_norm": 20.014951705932617, + "learning_rate": 4.0802656139057385e-06, + "loss": 0.2203, + "step": 74000 + }, + { + "epoch": 6.038135593220339, + "grad_norm": 11.608116149902344, + "learning_rate": 4.066289991593859e-06, + "loss": 0.1983, + "step": 74100 + }, + { + "epoch": 6.046284224250326, + "grad_norm": 10.88152027130127, + "learning_rate": 4.052321923608539e-06, + "loss": 0.2167, + "step": 74200 + }, + { + "epoch": 6.054432855280313, + "grad_norm": 9.91988754272461, + "learning_rate": 4.038361522960609e-06, + "loss": 0.2114, + "step": 74300 + }, + { + "epoch": 6.0625814863103, + "grad_norm": 10.7438383102417, + "learning_rate": 4.024408902598871e-06, + "loss": 0.2126, + "step": 74400 + }, + { + "epoch": 6.070730117340287, + "grad_norm": 13.341911315917969, + "learning_rate": 4.01046417540918e-06, + "loss": 0.2099, + "step": 74500 + }, + { + "epoch": 6.078878748370274, + "grad_norm": 14.30612564086914, + "learning_rate": 3.996527454213522e-06, + "loss": 0.2159, + "step": 74600 + }, + { + "epoch": 6.0870273794002605, + "grad_norm": 14.352286338806152, + "learning_rate": 3.98259885176912e-06, + "loss": 0.2314, + "step": 74700 + }, + { + "epoch": 6.095176010430248, + "grad_norm": 10.346816062927246, + "learning_rate": 3.968678480767503e-06, + "loss": 0.2111, + "step": 74800 + }, + { + "epoch": 6.103324641460235, + "grad_norm": 16.672042846679688, + "learning_rate": 3.954766453833608e-06, + "loss": 0.199, + "step": 74900 + }, + { + "epoch": 6.111473272490222, + "grad_norm": 14.719056129455566, + "learning_rate": 3.94086288352486e-06, + "loss": 0.1996, + "step": 75000 + }, + { + "epoch": 6.119621903520208, + "grad_norm": 15.159549713134766, + "learning_rate": 3.926967882330262e-06, + "loss": 0.2246, + "step": 75100 + }, + { + "epoch": 6.127770534550195, + "grad_norm": 8.278336524963379, + "learning_rate": 3.913081562669492e-06, + "loss": 0.229, + "step": 75200 + }, + { + "epoch": 6.135919165580183, + "grad_norm": 17.559757232666016, + "learning_rate": 3.899204036891989e-06, + "loss": 0.2012, + "step": 75300 + }, + { + "epoch": 6.1440677966101696, + "grad_norm": 11.502748489379883, + "learning_rate": 3.885335417276037e-06, + "loss": 0.202, + "step": 75400 + }, + { + "epoch": 6.152216427640156, + "grad_norm": 10.84666633605957, + "learning_rate": 3.871475816027868e-06, + "loss": 0.2142, + "step": 75500 + }, + { + "epoch": 6.160365058670143, + "grad_norm": 15.855389595031738, + "learning_rate": 3.857625345280751e-06, + "loss": 0.2287, + "step": 75600 + }, + { + "epoch": 6.168513689700131, + "grad_norm": 12.554780960083008, + "learning_rate": 3.843784117094081e-06, + "loss": 0.1949, + "step": 75700 + }, + { + "epoch": 6.1766623207301175, + "grad_norm": 7.536383628845215, + "learning_rate": 3.829952243452475e-06, + "loss": 0.2062, + "step": 75800 + }, + { + "epoch": 6.184810951760104, + "grad_norm": 13.602145195007324, + "learning_rate": 3.816129836264864e-06, + "loss": 0.2211, + "step": 75900 + }, + { + "epoch": 6.192959582790091, + "grad_norm": 10.88949966430664, + "learning_rate": 3.802317007363593e-06, + "loss": 0.2141, + "step": 76000 + }, + { + "epoch": 6.201108213820079, + "grad_norm": 3.1079776287078857, + "learning_rate": 3.7885138685035113e-06, + "loss": 0.2121, + "step": 76100 + }, + { + "epoch": 6.209256844850065, + "grad_norm": 10.546631813049316, + "learning_rate": 3.774720531361063e-06, + "loss": 0.2272, + "step": 76200 + }, + { + "epoch": 6.217405475880052, + "grad_norm": 22.11454200744629, + "learning_rate": 3.7609371075334e-06, + "loss": 0.2118, + "step": 76300 + }, + { + "epoch": 6.225554106910039, + "grad_norm": 16.33343505859375, + "learning_rate": 3.7471637085374614e-06, + "loss": 0.227, + "step": 76400 + }, + { + "epoch": 6.2337027379400265, + "grad_norm": 14.43807315826416, + "learning_rate": 3.7334004458090833e-06, + "loss": 0.2287, + "step": 76500 + }, + { + "epoch": 6.241851368970013, + "grad_norm": 14.813934326171875, + "learning_rate": 3.719647430702089e-06, + "loss": 0.2064, + "step": 76600 + }, + { + "epoch": 6.25, + "grad_norm": 5.587681770324707, + "learning_rate": 3.705904774487396e-06, + "loss": 0.2051, + "step": 76700 + }, + { + "epoch": 6.258148631029987, + "grad_norm": 7.330463409423828, + "learning_rate": 3.6921725883521087e-06, + "loss": 0.2225, + "step": 76800 + }, + { + "epoch": 6.2662972620599735, + "grad_norm": 19.726444244384766, + "learning_rate": 3.678450983398623e-06, + "loss": 0.2131, + "step": 76900 + }, + { + "epoch": 6.274445893089961, + "grad_norm": 15.526715278625488, + "learning_rate": 3.664740070643723e-06, + "loss": 0.2257, + "step": 77000 + }, + { + "epoch": 6.282594524119948, + "grad_norm": 9.113424301147461, + "learning_rate": 3.6510399610176906e-06, + "loss": 0.2075, + "step": 77100 + }, + { + "epoch": 6.290743155149935, + "grad_norm": 11.527823448181152, + "learning_rate": 3.6373507653634e-06, + "loss": 0.1921, + "step": 77200 + }, + { + "epoch": 6.298891786179921, + "grad_norm": 5.839615345001221, + "learning_rate": 3.6236725944354245e-06, + "loss": 0.2426, + "step": 77300 + }, + { + "epoch": 6.307040417209909, + "grad_norm": 16.31635284423828, + "learning_rate": 3.6100055588991435e-06, + "loss": 0.206, + "step": 77400 + }, + { + "epoch": 6.315189048239896, + "grad_norm": 13.138345718383789, + "learning_rate": 3.5963497693298386e-06, + "loss": 0.2223, + "step": 77500 + }, + { + "epoch": 6.3233376792698825, + "grad_norm": 3.202862024307251, + "learning_rate": 3.5827053362118085e-06, + "loss": 0.2095, + "step": 77600 + }, + { + "epoch": 6.331486310299869, + "grad_norm": 11.949639320373535, + "learning_rate": 3.5690723699374697e-06, + "loss": 0.2176, + "step": 77700 + }, + { + "epoch": 6.339634941329857, + "grad_norm": 17.555377960205078, + "learning_rate": 3.5554509808064602e-06, + "loss": 0.2204, + "step": 77800 + }, + { + "epoch": 6.347783572359844, + "grad_norm": 6.945880889892578, + "learning_rate": 3.5418412790247575e-06, + "loss": 0.2006, + "step": 77900 + }, + { + "epoch": 6.3559322033898304, + "grad_norm": 29.10856056213379, + "learning_rate": 3.528243374703776e-06, + "loss": 0.2089, + "step": 78000 + }, + { + "epoch": 6.364080834419817, + "grad_norm": 21.48233413696289, + "learning_rate": 3.5146573778594855e-06, + "loss": 0.2091, + "step": 78100 + }, + { + "epoch": 6.372229465449805, + "grad_norm": 10.77776050567627, + "learning_rate": 3.5010833984115135e-06, + "loss": 0.1919, + "step": 78200 + }, + { + "epoch": 6.380378096479792, + "grad_norm": 22.155200958251953, + "learning_rate": 3.4875215461822574e-06, + "loss": 0.2269, + "step": 78300 + }, + { + "epoch": 6.388526727509778, + "grad_norm": 12.029594421386719, + "learning_rate": 3.473971930896001e-06, + "loss": 0.2328, + "step": 78400 + }, + { + "epoch": 6.396675358539765, + "grad_norm": 8.563623428344727, + "learning_rate": 3.460434662178024e-06, + "loss": 0.2202, + "step": 78500 + }, + { + "epoch": 6.404823989569753, + "grad_norm": 6.394750118255615, + "learning_rate": 3.4469098495537063e-06, + "loss": 0.2324, + "step": 78600 + }, + { + "epoch": 6.4129726205997395, + "grad_norm": 15.485038757324219, + "learning_rate": 3.433397602447659e-06, + "loss": 0.2152, + "step": 78700 + }, + { + "epoch": 6.421121251629726, + "grad_norm": 15.386170387268066, + "learning_rate": 3.4198980301828256e-06, + "loss": 0.2065, + "step": 78800 + }, + { + "epoch": 6.429269882659713, + "grad_norm": 11.893247604370117, + "learning_rate": 3.406411241979603e-06, + "loss": 0.2235, + "step": 78900 + }, + { + "epoch": 6.4374185136897, + "grad_norm": 12.216060638427734, + "learning_rate": 3.3929373469549554e-06, + "loss": 0.211, + "step": 79000 + }, + { + "epoch": 6.445567144719687, + "grad_norm": 9.018731117248535, + "learning_rate": 3.379476454121533e-06, + "loss": 0.2253, + "step": 79100 + }, + { + "epoch": 6.453715775749674, + "grad_norm": 18.289003372192383, + "learning_rate": 3.366028672386792e-06, + "loss": 0.2265, + "step": 79200 + }, + { + "epoch": 6.461864406779661, + "grad_norm": 6.403520584106445, + "learning_rate": 3.35259411055211e-06, + "loss": 0.2241, + "step": 79300 + }, + { + "epoch": 6.470013037809648, + "grad_norm": 6.311509609222412, + "learning_rate": 3.3391728773119037e-06, + "loss": 0.2204, + "step": 79400 + }, + { + "epoch": 6.478161668839635, + "grad_norm": 16.21648597717285, + "learning_rate": 3.3257650812527566e-06, + "loss": 0.2083, + "step": 79500 + }, + { + "epoch": 6.486310299869622, + "grad_norm": 2.8797686100006104, + "learning_rate": 3.3123708308525354e-06, + "loss": 0.2152, + "step": 79600 + }, + { + "epoch": 6.494458930899609, + "grad_norm": 12.40995979309082, + "learning_rate": 3.298990234479514e-06, + "loss": 0.2061, + "step": 79700 + }, + { + "epoch": 6.5026075619295955, + "grad_norm": 13.1309814453125, + "learning_rate": 3.2856234003914945e-06, + "loss": 0.196, + "step": 79800 + }, + { + "epoch": 6.510756192959583, + "grad_norm": 11.270479202270508, + "learning_rate": 3.2722704367349357e-06, + "loss": 0.1969, + "step": 79900 + }, + { + "epoch": 6.51890482398957, + "grad_norm": 5.54075813293457, + "learning_rate": 3.258931451544075e-06, + "loss": 0.2345, + "step": 80000 + }, + { + "epoch": 6.527053455019557, + "grad_norm": 9.90404987335205, + "learning_rate": 3.245606552740053e-06, + "loss": 0.2223, + "step": 80100 + }, + { + "epoch": 6.5352020860495434, + "grad_norm": 16.18077850341797, + "learning_rate": 3.2322958481300426e-06, + "loss": 0.2163, + "step": 80200 + }, + { + "epoch": 6.54335071707953, + "grad_norm": 6.288787841796875, + "learning_rate": 3.2189994454063776e-06, + "loss": 0.2093, + "step": 80300 + }, + { + "epoch": 6.551499348109518, + "grad_norm": 21.265981674194336, + "learning_rate": 3.205717452145679e-06, + "loss": 0.1972, + "step": 80400 + }, + { + "epoch": 6.559647979139505, + "grad_norm": 14.27213191986084, + "learning_rate": 3.1924499758079863e-06, + "loss": 0.2211, + "step": 80500 + }, + { + "epoch": 6.567796610169491, + "grad_norm": 6.663931369781494, + "learning_rate": 3.1791971237358893e-06, + "loss": 0.2037, + "step": 80600 + }, + { + "epoch": 6.575945241199479, + "grad_norm": 20.920997619628906, + "learning_rate": 3.1659590031536546e-06, + "loss": 0.2016, + "step": 80700 + }, + { + "epoch": 6.584093872229466, + "grad_norm": 5.427749156951904, + "learning_rate": 3.1527357211663647e-06, + "loss": 0.2145, + "step": 80800 + }, + { + "epoch": 6.5922425032594525, + "grad_norm": 5.944066524505615, + "learning_rate": 3.1395273847590444e-06, + "loss": 0.2243, + "step": 80900 + }, + { + "epoch": 6.600391134289439, + "grad_norm": 4.4831366539001465, + "learning_rate": 3.1263341007958015e-06, + "loss": 0.2251, + "step": 81000 + }, + { + "epoch": 6.608539765319426, + "grad_norm": 7.92203950881958, + "learning_rate": 3.113155976018959e-06, + "loss": 0.2202, + "step": 81100 + }, + { + "epoch": 6.616688396349414, + "grad_norm": 7.755978584289551, + "learning_rate": 3.0999931170481922e-06, + "loss": 0.2099, + "step": 81200 + }, + { + "epoch": 6.6248370273794, + "grad_norm": 12.123492240905762, + "learning_rate": 3.086845630379668e-06, + "loss": 0.2279, + "step": 81300 + }, + { + "epoch": 6.632985658409387, + "grad_norm": 8.695425987243652, + "learning_rate": 3.073713622385177e-06, + "loss": 0.2171, + "step": 81400 + }, + { + "epoch": 6.641134289439374, + "grad_norm": 12.858569145202637, + "learning_rate": 3.0605971993112805e-06, + "loss": 0.21, + "step": 81500 + }, + { + "epoch": 6.6492829204693615, + "grad_norm": 20.741817474365234, + "learning_rate": 3.0474964672784456e-06, + "loss": 0.2101, + "step": 81600 + }, + { + "epoch": 6.657431551499348, + "grad_norm": 12.694851875305176, + "learning_rate": 3.034411532280193e-06, + "loss": 0.2119, + "step": 81700 + }, + { + "epoch": 6.665580182529335, + "grad_norm": 11.025914192199707, + "learning_rate": 3.0213425001822266e-06, + "loss": 0.1936, + "step": 81800 + }, + { + "epoch": 6.673728813559322, + "grad_norm": 8.600627899169922, + "learning_rate": 3.008289476721594e-06, + "loss": 0.2239, + "step": 81900 + }, + { + "epoch": 6.681877444589309, + "grad_norm": 5.949343681335449, + "learning_rate": 2.9952525675058175e-06, + "loss": 0.2103, + "step": 82000 + }, + { + "epoch": 6.690026075619296, + "grad_norm": 9.281770706176758, + "learning_rate": 2.9822318780120463e-06, + "loss": 0.2252, + "step": 82100 + }, + { + "epoch": 6.698174706649283, + "grad_norm": 8.222912788391113, + "learning_rate": 2.9692275135862002e-06, + "loss": 0.199, + "step": 82200 + }, + { + "epoch": 6.70632333767927, + "grad_norm": 10.598749160766602, + "learning_rate": 2.9562395794421193e-06, + "loss": 0.2244, + "step": 82300 + }, + { + "epoch": 6.7144719687092564, + "grad_norm": 11.608291625976562, + "learning_rate": 2.9432681806607145e-06, + "loss": 0.2176, + "step": 82400 + }, + { + "epoch": 6.722620599739244, + "grad_norm": 9.24106216430664, + "learning_rate": 2.9303134221891106e-06, + "loss": 0.2222, + "step": 82500 + }, + { + "epoch": 6.730769230769231, + "grad_norm": 2.6706371307373047, + "learning_rate": 2.917375408839803e-06, + "loss": 0.2159, + "step": 82600 + }, + { + "epoch": 6.738917861799218, + "grad_norm": 11.834959030151367, + "learning_rate": 2.904454245289805e-06, + "loss": 0.216, + "step": 82700 + }, + { + "epoch": 6.747066492829204, + "grad_norm": 3.9120168685913086, + "learning_rate": 2.8915500360798117e-06, + "loss": 0.2051, + "step": 82800 + }, + { + "epoch": 6.755215123859192, + "grad_norm": 9.347685813903809, + "learning_rate": 2.8786628856133404e-06, + "loss": 0.238, + "step": 82900 + }, + { + "epoch": 6.763363754889179, + "grad_norm": 7.142603874206543, + "learning_rate": 2.8657928981558926e-06, + "loss": 0.2076, + "step": 83000 + }, + { + "epoch": 6.7715123859191655, + "grad_norm": 15.814796447753906, + "learning_rate": 2.852940177834111e-06, + "loss": 0.2018, + "step": 83100 + }, + { + "epoch": 6.779661016949152, + "grad_norm": 11.722209930419922, + "learning_rate": 2.8401048286349353e-06, + "loss": 0.2275, + "step": 83200 + }, + { + "epoch": 6.78780964797914, + "grad_norm": 10.187668800354004, + "learning_rate": 2.8272869544047622e-06, + "loss": 0.2093, + "step": 83300 + }, + { + "epoch": 6.795958279009127, + "grad_norm": 15.927581787109375, + "learning_rate": 2.814486658848603e-06, + "loss": 0.2065, + "step": 83400 + }, + { + "epoch": 6.804106910039113, + "grad_norm": 12.883095741271973, + "learning_rate": 2.8017040455292465e-06, + "loss": 0.2108, + "step": 83500 + }, + { + "epoch": 6.8122555410691, + "grad_norm": 7.530974864959717, + "learning_rate": 2.788939217866422e-06, + "loss": 0.2139, + "step": 83600 + }, + { + "epoch": 6.820404172099087, + "grad_norm": 20.07868766784668, + "learning_rate": 2.7761922791359596e-06, + "loss": 0.2205, + "step": 83700 + }, + { + "epoch": 6.8285528031290745, + "grad_norm": 7.615067481994629, + "learning_rate": 2.7634633324689563e-06, + "loss": 0.2067, + "step": 83800 + }, + { + "epoch": 6.836701434159061, + "grad_norm": 10.10435962677002, + "learning_rate": 2.7507524808509416e-06, + "loss": 0.2284, + "step": 83900 + }, + { + "epoch": 6.844850065189048, + "grad_norm": 12.469111442565918, + "learning_rate": 2.738059827121046e-06, + "loss": 0.2086, + "step": 84000 + }, + { + "epoch": 6.852998696219036, + "grad_norm": 8.140021324157715, + "learning_rate": 2.7253854739711634e-06, + "loss": 0.2162, + "step": 84100 + }, + { + "epoch": 6.861147327249022, + "grad_norm": 14.818914413452148, + "learning_rate": 2.7127295239451273e-06, + "loss": 0.2153, + "step": 84200 + }, + { + "epoch": 6.869295958279009, + "grad_norm": 8.947492599487305, + "learning_rate": 2.700092079437877e-06, + "loss": 0.2073, + "step": 84300 + }, + { + "epoch": 6.877444589308996, + "grad_norm": 8.173857688903809, + "learning_rate": 2.687473242694629e-06, + "loss": 0.2136, + "step": 84400 + }, + { + "epoch": 6.885593220338983, + "grad_norm": 4.175146579742432, + "learning_rate": 2.6748731158100528e-06, + "loss": 0.2082, + "step": 84500 + }, + { + "epoch": 6.89374185136897, + "grad_norm": 8.696370124816895, + "learning_rate": 2.6622918007274406e-06, + "loss": 0.2128, + "step": 84600 + }, + { + "epoch": 6.901890482398957, + "grad_norm": 8.253527641296387, + "learning_rate": 2.649729399237886e-06, + "loss": 0.1985, + "step": 84700 + }, + { + "epoch": 6.910039113428944, + "grad_norm": 9.825946807861328, + "learning_rate": 2.6371860129794585e-06, + "loss": 0.2084, + "step": 84800 + }, + { + "epoch": 6.918187744458931, + "grad_norm": 21.79430389404297, + "learning_rate": 2.624661743436383e-06, + "loss": 0.2154, + "step": 84900 + }, + { + "epoch": 6.926336375488918, + "grad_norm": 17.554534912109375, + "learning_rate": 2.6121566919382168e-06, + "loss": 0.2073, + "step": 85000 + }, + { + "epoch": 6.934485006518905, + "grad_norm": 14.525189399719238, + "learning_rate": 2.599670959659032e-06, + "loss": 0.2136, + "step": 85100 + }, + { + "epoch": 6.942633637548892, + "grad_norm": 17.66045570373535, + "learning_rate": 2.5872046476165926e-06, + "loss": 0.2259, + "step": 85200 + }, + { + "epoch": 6.9507822685788785, + "grad_norm": 12.12194538116455, + "learning_rate": 2.574757856671542e-06, + "loss": 0.2303, + "step": 85300 + }, + { + "epoch": 6.958930899608866, + "grad_norm": 16.121667861938477, + "learning_rate": 2.5623306875265865e-06, + "loss": 0.209, + "step": 85400 + }, + { + "epoch": 6.967079530638853, + "grad_norm": 37.0359001159668, + "learning_rate": 2.5499232407256764e-06, + "loss": 0.2135, + "step": 85500 + }, + { + "epoch": 6.97522816166884, + "grad_norm": 9.753621101379395, + "learning_rate": 2.5375356166531974e-06, + "loss": 0.2246, + "step": 85600 + }, + { + "epoch": 6.983376792698826, + "grad_norm": 11.933328628540039, + "learning_rate": 2.525167915533153e-06, + "loss": 0.2083, + "step": 85700 + }, + { + "epoch": 6.991525423728813, + "grad_norm": 11.32873821258545, + "learning_rate": 2.512820237428366e-06, + "loss": 0.221, + "step": 85800 + }, + { + "epoch": 6.999674054758801, + "grad_norm": 10.335704803466797, + "learning_rate": 2.5004926822396468e-06, + "loss": 0.218, + "step": 85900 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.8200803212851405, + "eval_loss": 0.6657418608665466, + "eval_runtime": 6.9032, + "eval_samples_per_second": 360.703, + "eval_steps_per_second": 45.196, + "step": 85904 + }, + { + "epoch": 7.0078226857887875, + "grad_norm": 13.04452133178711, + "learning_rate": 2.4881853497050074e-06, + "loss": 0.1828, + "step": 86000 + }, + { + "epoch": 7.015971316818774, + "grad_norm": 11.350065231323242, + "learning_rate": 2.475898339398842e-06, + "loss": 0.1981, + "step": 86100 + }, + { + "epoch": 7.024119947848761, + "grad_norm": 3.5544838905334473, + "learning_rate": 2.463631750731125e-06, + "loss": 0.1873, + "step": 86200 + }, + { + "epoch": 7.032268578878749, + "grad_norm": 6.474255084991455, + "learning_rate": 2.451385682946606e-06, + "loss": 0.205, + "step": 86300 + }, + { + "epoch": 7.040417209908735, + "grad_norm": 10.676136016845703, + "learning_rate": 2.43916023512401e-06, + "loss": 0.1702, + "step": 86400 + }, + { + "epoch": 7.048565840938722, + "grad_norm": 6.142400741577148, + "learning_rate": 2.4269555061752303e-06, + "loss": 0.2017, + "step": 86500 + }, + { + "epoch": 7.056714471968709, + "grad_norm": 16.273656845092773, + "learning_rate": 2.4147715948445323e-06, + "loss": 0.1776, + "step": 86600 + }, + { + "epoch": 7.064863102998697, + "grad_norm": 22.690208435058594, + "learning_rate": 2.4026085997077486e-06, + "loss": 0.1762, + "step": 86700 + }, + { + "epoch": 7.073011734028683, + "grad_norm": 14.49307632446289, + "learning_rate": 2.390466619171492e-06, + "loss": 0.1664, + "step": 86800 + }, + { + "epoch": 7.08116036505867, + "grad_norm": 14.948646545410156, + "learning_rate": 2.378345751472351e-06, + "loss": 0.1953, + "step": 86900 + }, + { + "epoch": 7.089308996088657, + "grad_norm": 12.674484252929688, + "learning_rate": 2.3662460946760962e-06, + "loss": 0.1932, + "step": 87000 + }, + { + "epoch": 7.0974576271186445, + "grad_norm": 14.729815483093262, + "learning_rate": 2.354167746676892e-06, + "loss": 0.1814, + "step": 87100 + }, + { + "epoch": 7.105606258148631, + "grad_norm": 16.739356994628906, + "learning_rate": 2.3421108051964974e-06, + "loss": 0.1761, + "step": 87200 + }, + { + "epoch": 7.113754889178618, + "grad_norm": 16.266368865966797, + "learning_rate": 2.330075367783479e-06, + "loss": 0.1947, + "step": 87300 + }, + { + "epoch": 7.121903520208605, + "grad_norm": 12.137019157409668, + "learning_rate": 2.318061531812422e-06, + "loss": 0.2017, + "step": 87400 + }, + { + "epoch": 7.130052151238592, + "grad_norm": 7.073469161987305, + "learning_rate": 2.3060693944831404e-06, + "loss": 0.1746, + "step": 87500 + }, + { + "epoch": 7.138200782268579, + "grad_norm": 7.888490200042725, + "learning_rate": 2.294099052819893e-06, + "loss": 0.1882, + "step": 87600 + }, + { + "epoch": 7.146349413298566, + "grad_norm": 18.83835792541504, + "learning_rate": 2.282150603670596e-06, + "loss": 0.182, + "step": 87700 + }, + { + "epoch": 7.154498044328553, + "grad_norm": 9.491145133972168, + "learning_rate": 2.2702241437060463e-06, + "loss": 0.1817, + "step": 87800 + }, + { + "epoch": 7.162646675358539, + "grad_norm": 11.629495620727539, + "learning_rate": 2.2583197694191272e-06, + "loss": 0.1737, + "step": 87900 + }, + { + "epoch": 7.170795306388527, + "grad_norm": 3.3986611366271973, + "learning_rate": 2.246437577124038e-06, + "loss": 0.1839, + "step": 88000 + }, + { + "epoch": 7.178943937418514, + "grad_norm": 3.2696523666381836, + "learning_rate": 2.2345776629555085e-06, + "loss": 0.1896, + "step": 88100 + }, + { + "epoch": 7.1870925684485005, + "grad_norm": 9.869660377502441, + "learning_rate": 2.2227401228680275e-06, + "loss": 0.2028, + "step": 88200 + }, + { + "epoch": 7.195241199478487, + "grad_norm": 8.699070930480957, + "learning_rate": 2.2109250526350584e-06, + "loss": 0.2025, + "step": 88300 + }, + { + "epoch": 7.203389830508475, + "grad_norm": 3.9306254386901855, + "learning_rate": 2.1991325478482695e-06, + "loss": 0.1827, + "step": 88400 + }, + { + "epoch": 7.211538461538462, + "grad_norm": 18.14926528930664, + "learning_rate": 2.187362703916766e-06, + "loss": 0.1843, + "step": 88500 + }, + { + "epoch": 7.219687092568448, + "grad_norm": 15.083455085754395, + "learning_rate": 2.175615616066305e-06, + "loss": 0.1932, + "step": 88600 + }, + { + "epoch": 7.227835723598435, + "grad_norm": 14.958844184875488, + "learning_rate": 2.163891379338535e-06, + "loss": 0.1839, + "step": 88700 + }, + { + "epoch": 7.235984354628423, + "grad_norm": 9.219823837280273, + "learning_rate": 2.1521900885902214e-06, + "loss": 0.205, + "step": 88800 + }, + { + "epoch": 7.24413298565841, + "grad_norm": 10.361544609069824, + "learning_rate": 2.1405118384924858e-06, + "loss": 0.1942, + "step": 88900 + }, + { + "epoch": 7.252281616688396, + "grad_norm": 7.847745418548584, + "learning_rate": 2.128856723530033e-06, + "loss": 0.2046, + "step": 89000 + }, + { + "epoch": 7.260430247718383, + "grad_norm": 8.953947067260742, + "learning_rate": 2.1172248380003853e-06, + "loss": 0.1903, + "step": 89100 + }, + { + "epoch": 7.26857887874837, + "grad_norm": 6.825370788574219, + "learning_rate": 2.105616276013133e-06, + "loss": 0.178, + "step": 89200 + }, + { + "epoch": 7.2767275097783575, + "grad_norm": 10.48969554901123, + "learning_rate": 2.0940311314891574e-06, + "loss": 0.1778, + "step": 89300 + }, + { + "epoch": 7.284876140808344, + "grad_norm": 13.994695663452148, + "learning_rate": 2.082469498159879e-06, + "loss": 0.1673, + "step": 89400 + }, + { + "epoch": 7.293024771838331, + "grad_norm": 17.321313858032227, + "learning_rate": 2.0709314695664957e-06, + "loss": 0.2043, + "step": 89500 + }, + { + "epoch": 7.301173402868318, + "grad_norm": 10.52856731414795, + "learning_rate": 2.0594171390592294e-06, + "loss": 0.1942, + "step": 89600 + }, + { + "epoch": 7.309322033898305, + "grad_norm": 23.261329650878906, + "learning_rate": 2.047926599796568e-06, + "loss": 0.1816, + "step": 89700 + }, + { + "epoch": 7.317470664928292, + "grad_norm": 6.534886360168457, + "learning_rate": 2.0364599447445126e-06, + "loss": 0.1808, + "step": 89800 + }, + { + "epoch": 7.325619295958279, + "grad_norm": 12.067914962768555, + "learning_rate": 2.0250172666758267e-06, + "loss": 0.187, + "step": 89900 + }, + { + "epoch": 7.333767926988266, + "grad_norm": 11.018478393554688, + "learning_rate": 2.0135986581692817e-06, + "loss": 0.1865, + "step": 90000 + }, + { + "epoch": 7.341916558018253, + "grad_norm": 9.79710865020752, + "learning_rate": 2.002204211608913e-06, + "loss": 0.1987, + "step": 90100 + }, + { + "epoch": 7.35006518904824, + "grad_norm": 15.164643287658691, + "learning_rate": 1.990834019183268e-06, + "loss": 0.1973, + "step": 90200 + }, + { + "epoch": 7.358213820078227, + "grad_norm": 22.170740127563477, + "learning_rate": 1.9794881728846642e-06, + "loss": 0.1702, + "step": 90300 + }, + { + "epoch": 7.3663624511082135, + "grad_norm": 8.200043678283691, + "learning_rate": 1.968166764508442e-06, + "loss": 0.183, + "step": 90400 + }, + { + "epoch": 7.374511082138201, + "grad_norm": 6.145725250244141, + "learning_rate": 1.9568698856522215e-06, + "loss": 0.1906, + "step": 90500 + }, + { + "epoch": 7.382659713168188, + "grad_norm": 22.14548683166504, + "learning_rate": 1.945597627715166e-06, + "loss": 0.1947, + "step": 90600 + }, + { + "epoch": 7.390808344198175, + "grad_norm": 10.075164794921875, + "learning_rate": 1.934350081897237e-06, + "loss": 0.171, + "step": 90700 + }, + { + "epoch": 7.398956975228161, + "grad_norm": 6.933922290802002, + "learning_rate": 1.923127339198459e-06, + "loss": 0.1845, + "step": 90800 + }, + { + "epoch": 7.407105606258149, + "grad_norm": 26.223041534423828, + "learning_rate": 1.9119294904181847e-06, + "loss": 0.1852, + "step": 90900 + }, + { + "epoch": 7.415254237288136, + "grad_norm": 4.778967380523682, + "learning_rate": 1.900756626154356e-06, + "loss": 0.1958, + "step": 91000 + }, + { + "epoch": 7.423402868318123, + "grad_norm": 29.773698806762695, + "learning_rate": 1.889608836802776e-06, + "loss": 0.1809, + "step": 91100 + }, + { + "epoch": 7.431551499348109, + "grad_norm": 8.9940767288208, + "learning_rate": 1.8784862125563734e-06, + "loss": 0.1869, + "step": 91200 + }, + { + "epoch": 7.439700130378096, + "grad_norm": 15.34753704071045, + "learning_rate": 1.8673888434044756e-06, + "loss": 0.1863, + "step": 91300 + }, + { + "epoch": 7.447848761408084, + "grad_norm": 19.44320297241211, + "learning_rate": 1.8563168191320823e-06, + "loss": 0.1798, + "step": 91400 + }, + { + "epoch": 7.4559973924380705, + "grad_norm": 12.468984603881836, + "learning_rate": 1.8452702293191339e-06, + "loss": 0.1808, + "step": 91500 + }, + { + "epoch": 7.464146023468057, + "grad_norm": 8.79600715637207, + "learning_rate": 1.8342491633397863e-06, + "loss": 0.1823, + "step": 91600 + }, + { + "epoch": 7.472294654498044, + "grad_norm": 15.76307487487793, + "learning_rate": 1.8232537103616953e-06, + "loss": 0.1959, + "step": 91700 + }, + { + "epoch": 7.480443285528032, + "grad_norm": 9.05780029296875, + "learning_rate": 1.8122839593452902e-06, + "loss": 0.1797, + "step": 91800 + }, + { + "epoch": 7.488591916558018, + "grad_norm": 11.826004981994629, + "learning_rate": 1.8013399990430525e-06, + "loss": 0.1639, + "step": 91900 + }, + { + "epoch": 7.496740547588005, + "grad_norm": 20.31383514404297, + "learning_rate": 1.7904219179988007e-06, + "loss": 0.1916, + "step": 92000 + }, + { + "epoch": 7.504889178617992, + "grad_norm": 18.240629196166992, + "learning_rate": 1.7795298045469766e-06, + "loss": 0.1791, + "step": 92100 + }, + { + "epoch": 7.5130378096479795, + "grad_norm": 20.392873764038086, + "learning_rate": 1.7686637468119223e-06, + "loss": 0.2021, + "step": 92200 + }, + { + "epoch": 7.521186440677966, + "grad_norm": 9.732405662536621, + "learning_rate": 1.757823832707175e-06, + "loss": 0.1818, + "step": 92300 + }, + { + "epoch": 7.529335071707953, + "grad_norm": 21.23190689086914, + "learning_rate": 1.7470101499347498e-06, + "loss": 0.1692, + "step": 92400 + }, + { + "epoch": 7.53748370273794, + "grad_norm": 7.4514641761779785, + "learning_rate": 1.736222785984435e-06, + "loss": 0.2084, + "step": 92500 + }, + { + "epoch": 7.5456323337679265, + "grad_norm": 13.29001522064209, + "learning_rate": 1.7254618281330838e-06, + "loss": 0.1897, + "step": 92600 + }, + { + "epoch": 7.553780964797914, + "grad_norm": 9.683525085449219, + "learning_rate": 1.7147273634439021e-06, + "loss": 0.156, + "step": 92700 + }, + { + "epoch": 7.561929595827901, + "grad_norm": 12.022348403930664, + "learning_rate": 1.7040194787657566e-06, + "loss": 0.2136, + "step": 92800 + }, + { + "epoch": 7.570078226857888, + "grad_norm": 11.087843894958496, + "learning_rate": 1.6933382607324572e-06, + "loss": 0.171, + "step": 92900 + }, + { + "epoch": 7.578226857887875, + "grad_norm": 20.101045608520508, + "learning_rate": 1.6826837957620662e-06, + "loss": 0.2131, + "step": 93000 + }, + { + "epoch": 7.586375488917862, + "grad_norm": 13.087589263916016, + "learning_rate": 1.672056170056196e-06, + "loss": 0.1791, + "step": 93100 + }, + { + "epoch": 7.594524119947849, + "grad_norm": 9.458551406860352, + "learning_rate": 1.6614554695993085e-06, + "loss": 0.1746, + "step": 93200 + }, + { + "epoch": 7.602672750977836, + "grad_norm": 12.884553909301758, + "learning_rate": 1.6508817801580268e-06, + "loss": 0.1673, + "step": 93300 + }, + { + "epoch": 7.610821382007822, + "grad_norm": 10.40186595916748, + "learning_rate": 1.6403351872804347e-06, + "loss": 0.1659, + "step": 93400 + }, + { + "epoch": 7.61897001303781, + "grad_norm": 12.832286834716797, + "learning_rate": 1.6298157762953897e-06, + "loss": 0.1693, + "step": 93500 + }, + { + "epoch": 7.627118644067797, + "grad_norm": 13.989652633666992, + "learning_rate": 1.6193236323118283e-06, + "loss": 0.203, + "step": 93600 + }, + { + "epoch": 7.6352672750977835, + "grad_norm": 13.184144020080566, + "learning_rate": 1.6088588402180783e-06, + "loss": 0.1983, + "step": 93700 + }, + { + "epoch": 7.64341590612777, + "grad_norm": 50.71080017089844, + "learning_rate": 1.5984214846811735e-06, + "loss": 0.1837, + "step": 93800 + }, + { + "epoch": 7.651564537157758, + "grad_norm": 8.608222007751465, + "learning_rate": 1.588011650146169e-06, + "loss": 0.1786, + "step": 93900 + }, + { + "epoch": 7.659713168187745, + "grad_norm": 9.973206520080566, + "learning_rate": 1.5776294208354537e-06, + "loss": 0.1873, + "step": 94000 + }, + { + "epoch": 7.667861799217731, + "grad_norm": 3.6279351711273193, + "learning_rate": 1.5672748807480736e-06, + "loss": 0.1754, + "step": 94100 + }, + { + "epoch": 7.676010430247718, + "grad_norm": 13.710479736328125, + "learning_rate": 1.5569481136590554e-06, + "loss": 0.1973, + "step": 94200 + }, + { + "epoch": 7.684159061277706, + "grad_norm": 20.849790573120117, + "learning_rate": 1.5466492031187174e-06, + "loss": 0.1953, + "step": 94300 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 16.05866241455078, + "learning_rate": 1.5363782324520033e-06, + "loss": 0.1834, + "step": 94400 + }, + { + "epoch": 7.700456323337679, + "grad_norm": 10.594083786010742, + "learning_rate": 1.5261352847578044e-06, + "loss": 0.196, + "step": 94500 + }, + { + "epoch": 7.708604954367666, + "grad_norm": 14.200790405273438, + "learning_rate": 1.5159204429082874e-06, + "loss": 0.1793, + "step": 94600 + }, + { + "epoch": 7.716753585397653, + "grad_norm": 3.8873071670532227, + "learning_rate": 1.5057337895482255e-06, + "loss": 0.1865, + "step": 94700 + }, + { + "epoch": 7.72490221642764, + "grad_norm": 13.96704387664795, + "learning_rate": 1.4955754070943268e-06, + "loss": 0.1653, + "step": 94800 + }, + { + "epoch": 7.733050847457627, + "grad_norm": 23.539247512817383, + "learning_rate": 1.48544537773457e-06, + "loss": 0.1713, + "step": 94900 + }, + { + "epoch": 7.741199478487614, + "grad_norm": 14.154293060302734, + "learning_rate": 1.4753437834275397e-06, + "loss": 0.1894, + "step": 95000 + }, + { + "epoch": 7.749348109517601, + "grad_norm": 8.608110427856445, + "learning_rate": 1.4652707059017607e-06, + "loss": 0.1887, + "step": 95100 + }, + { + "epoch": 7.757496740547588, + "grad_norm": 9.453892707824707, + "learning_rate": 1.4552262266550382e-06, + "loss": 0.1769, + "step": 95200 + }, + { + "epoch": 7.765645371577575, + "grad_norm": 12.239083290100098, + "learning_rate": 1.4452104269538009e-06, + "loss": 0.1699, + "step": 95300 + }, + { + "epoch": 7.773794002607562, + "grad_norm": 10.937909126281738, + "learning_rate": 1.4352233878324384e-06, + "loss": 0.1667, + "step": 95400 + }, + { + "epoch": 7.781942633637549, + "grad_norm": 21.223346710205078, + "learning_rate": 1.4252651900926496e-06, + "loss": 0.182, + "step": 95500 + }, + { + "epoch": 7.790091264667536, + "grad_norm": 7.070313453674316, + "learning_rate": 1.4153359143027879e-06, + "loss": 0.1896, + "step": 95600 + }, + { + "epoch": 7.798239895697523, + "grad_norm": 14.346339225769043, + "learning_rate": 1.4054356407972086e-06, + "loss": 0.1743, + "step": 95700 + }, + { + "epoch": 7.80638852672751, + "grad_norm": 15.966556549072266, + "learning_rate": 1.3955644496756199e-06, + "loss": 0.1902, + "step": 95800 + }, + { + "epoch": 7.8145371577574965, + "grad_norm": 16.198644638061523, + "learning_rate": 1.3857224208024345e-06, + "loss": 0.1945, + "step": 95900 + }, + { + "epoch": 7.822685788787483, + "grad_norm": 8.803377151489258, + "learning_rate": 1.3759096338061222e-06, + "loss": 0.1793, + "step": 96000 + }, + { + "epoch": 7.830834419817471, + "grad_norm": 19.771717071533203, + "learning_rate": 1.3661261680785693e-06, + "loss": 0.1809, + "step": 96100 + }, + { + "epoch": 7.838983050847458, + "grad_norm": 11.52552318572998, + "learning_rate": 1.3563721027744309e-06, + "loss": 0.1887, + "step": 96200 + }, + { + "epoch": 7.847131681877444, + "grad_norm": 17.998104095458984, + "learning_rate": 1.3466475168104953e-06, + "loss": 0.2107, + "step": 96300 + }, + { + "epoch": 7.855280312907432, + "grad_norm": 6.081639289855957, + "learning_rate": 1.3369524888650437e-06, + "loss": 0.1849, + "step": 96400 + }, + { + "epoch": 7.863428943937419, + "grad_norm": 6.099484443664551, + "learning_rate": 1.3272870973772118e-06, + "loss": 0.1847, + "step": 96500 + }, + { + "epoch": 7.8715775749674055, + "grad_norm": 19.433902740478516, + "learning_rate": 1.3176514205463586e-06, + "loss": 0.2, + "step": 96600 + }, + { + "epoch": 7.879726205997392, + "grad_norm": 6.365217208862305, + "learning_rate": 1.3080455363314309e-06, + "loss": 0.2062, + "step": 96700 + }, + { + "epoch": 7.887874837027379, + "grad_norm": 9.893994331359863, + "learning_rate": 1.2984695224503351e-06, + "loss": 0.1721, + "step": 96800 + }, + { + "epoch": 7.896023468057367, + "grad_norm": 22.75550079345703, + "learning_rate": 1.2889234563793058e-06, + "loss": 0.204, + "step": 96900 + }, + { + "epoch": 7.904172099087353, + "grad_norm": 2.8168067932128906, + "learning_rate": 1.279407415352279e-06, + "loss": 0.1963, + "step": 97000 + }, + { + "epoch": 7.91232073011734, + "grad_norm": 19.346757888793945, + "learning_rate": 1.2699214763602741e-06, + "loss": 0.1845, + "step": 97100 + }, + { + "epoch": 7.920469361147327, + "grad_norm": 13.861513137817383, + "learning_rate": 1.2604657161507566e-06, + "loss": 0.1934, + "step": 97200 + }, + { + "epoch": 7.9286179921773146, + "grad_norm": 12.996659278869629, + "learning_rate": 1.2510402112270326e-06, + "loss": 0.1808, + "step": 97300 + }, + { + "epoch": 7.936766623207301, + "grad_norm": 16.255569458007812, + "learning_rate": 1.2416450378476196e-06, + "loss": 0.1919, + "step": 97400 + }, + { + "epoch": 7.944915254237288, + "grad_norm": 9.47265625, + "learning_rate": 1.2322802720256355e-06, + "loss": 0.1887, + "step": 97500 + }, + { + "epoch": 7.953063885267275, + "grad_norm": 13.006512641906738, + "learning_rate": 1.2229459895281787e-06, + "loss": 0.1927, + "step": 97600 + }, + { + "epoch": 7.9612125162972625, + "grad_norm": 13.849684715270996, + "learning_rate": 1.213642265875718e-06, + "loss": 0.1906, + "step": 97700 + }, + { + "epoch": 7.969361147327249, + "grad_norm": 25.117225646972656, + "learning_rate": 1.2043691763414844e-06, + "loss": 0.1659, + "step": 97800 + }, + { + "epoch": 7.977509778357236, + "grad_norm": 9.633444786071777, + "learning_rate": 1.1951267959508562e-06, + "loss": 0.1923, + "step": 97900 + }, + { + "epoch": 7.985658409387223, + "grad_norm": 9.853534698486328, + "learning_rate": 1.185915199480751e-06, + "loss": 0.1969, + "step": 98000 + }, + { + "epoch": 7.9938070404172095, + "grad_norm": 12.424792289733887, + "learning_rate": 1.1767344614590303e-06, + "loss": 0.1772, + "step": 98100 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.8200803212851405, + "eval_loss": 0.7215536236763, + "eval_runtime": 7.0555, + "eval_samples_per_second": 352.917, + "eval_steps_per_second": 44.221, + "step": 98176 + }, + { + "epoch": 8.001955671447197, + "grad_norm": 13.437636375427246, + "learning_rate": 1.167584656163887e-06, + "loss": 0.1774, + "step": 98200 + }, + { + "epoch": 8.010104302477183, + "grad_norm": 14.577449798583984, + "learning_rate": 1.1584658576232482e-06, + "loss": 0.1693, + "step": 98300 + }, + { + "epoch": 8.01825293350717, + "grad_norm": 18.45952606201172, + "learning_rate": 1.1493781396141795e-06, + "loss": 0.17, + "step": 98400 + }, + { + "epoch": 8.026401564537158, + "grad_norm": 18.29120635986328, + "learning_rate": 1.1403215756622804e-06, + "loss": 0.178, + "step": 98500 + }, + { + "epoch": 8.034550195567144, + "grad_norm": 11.486896514892578, + "learning_rate": 1.1312962390410954e-06, + "loss": 0.1815, + "step": 98600 + }, + { + "epoch": 8.042698826597132, + "grad_norm": 19.90141487121582, + "learning_rate": 1.1223022027715197e-06, + "loss": 0.1682, + "step": 98700 + }, + { + "epoch": 8.05084745762712, + "grad_norm": 11.248079299926758, + "learning_rate": 1.1133395396212048e-06, + "loss": 0.169, + "step": 98800 + }, + { + "epoch": 8.058996088657105, + "grad_norm": 7.839399814605713, + "learning_rate": 1.104408322103978e-06, + "loss": 0.1684, + "step": 98900 + }, + { + "epoch": 8.067144719687093, + "grad_norm": 8.082372665405273, + "learning_rate": 1.095508622479247e-06, + "loss": 0.1769, + "step": 99000 + }, + { + "epoch": 8.075293350717079, + "grad_norm": 9.952238082885742, + "learning_rate": 1.0866405127514234e-06, + "loss": 0.1866, + "step": 99100 + }, + { + "epoch": 8.083441981747066, + "grad_norm": 5.250309467315674, + "learning_rate": 1.0778040646693316e-06, + "loss": 0.162, + "step": 99200 + }, + { + "epoch": 8.091590612777054, + "grad_norm": 9.988779067993164, + "learning_rate": 1.0689993497256336e-06, + "loss": 0.177, + "step": 99300 + }, + { + "epoch": 8.09973924380704, + "grad_norm": 8.978513717651367, + "learning_rate": 1.0602264391562506e-06, + "loss": 0.151, + "step": 99400 + }, + { + "epoch": 8.107887874837028, + "grad_norm": 23.60556983947754, + "learning_rate": 1.051485403939786e-06, + "loss": 0.1734, + "step": 99500 + }, + { + "epoch": 8.116036505867015, + "grad_norm": 10.938061714172363, + "learning_rate": 1.0427763147969467e-06, + "loss": 0.1733, + "step": 99600 + }, + { + "epoch": 8.124185136897001, + "grad_norm": 5.527510643005371, + "learning_rate": 1.0340992421899776e-06, + "loss": 0.1565, + "step": 99700 + }, + { + "epoch": 8.132333767926989, + "grad_norm": 9.493518829345703, + "learning_rate": 1.0254542563220922e-06, + "loss": 0.181, + "step": 99800 + }, + { + "epoch": 8.140482398956975, + "grad_norm": 7.9793548583984375, + "learning_rate": 1.0168414271368953e-06, + "loss": 0.1837, + "step": 99900 + }, + { + "epoch": 8.148631029986962, + "grad_norm": 11.252303123474121, + "learning_rate": 1.0082608243178276e-06, + "loss": 0.1708, + "step": 100000 + }, + { + "epoch": 8.15677966101695, + "grad_norm": 14.102470397949219, + "learning_rate": 9.997125172875943e-07, + "loss": 0.1884, + "step": 100100 + }, + { + "epoch": 8.164928292046936, + "grad_norm": 38.51998519897461, + "learning_rate": 9.91196575207608e-07, + "loss": 0.184, + "step": 100200 + }, + { + "epoch": 8.173076923076923, + "grad_norm": 7.0270466804504395, + "learning_rate": 9.82713066977427e-07, + "loss": 0.1489, + "step": 100300 + }, + { + "epoch": 8.18122555410691, + "grad_norm": 14.944999694824219, + "learning_rate": 9.742620612341992e-07, + "loss": 0.1835, + "step": 100400 + }, + { + "epoch": 8.189374185136897, + "grad_norm": 7.147238731384277, + "learning_rate": 9.658436263521048e-07, + "loss": 0.1512, + "step": 100500 + }, + { + "epoch": 8.197522816166884, + "grad_norm": 5.465837001800537, + "learning_rate": 9.574578304418063e-07, + "loss": 0.1702, + "step": 100600 + }, + { + "epoch": 8.20567144719687, + "grad_norm": 4.3965630531311035, + "learning_rate": 9.491047413498933e-07, + "loss": 0.1619, + "step": 100700 + }, + { + "epoch": 8.213820078226858, + "grad_norm": 21.602157592773438, + "learning_rate": 9.407844266583377e-07, + "loss": 0.1726, + "step": 100800 + }, + { + "epoch": 8.221968709256846, + "grad_norm": 16.533201217651367, + "learning_rate": 9.324969536839435e-07, + "loss": 0.1564, + "step": 100900 + }, + { + "epoch": 8.230117340286832, + "grad_norm": 17.454898834228516, + "learning_rate": 9.242423894778046e-07, + "loss": 0.1847, + "step": 101000 + }, + { + "epoch": 8.23826597131682, + "grad_norm": 17.726686477661133, + "learning_rate": 9.160208008247618e-07, + "loss": 0.1695, + "step": 101100 + }, + { + "epoch": 8.246414602346805, + "grad_norm": 31.844257354736328, + "learning_rate": 9.078322542428597e-07, + "loss": 0.1698, + "step": 101200 + }, + { + "epoch": 8.254563233376793, + "grad_norm": 9.689949989318848, + "learning_rate": 8.99676815982814e-07, + "loss": 0.153, + "step": 101300 + }, + { + "epoch": 8.26271186440678, + "grad_norm": 13.61907958984375, + "learning_rate": 8.915545520274699e-07, + "loss": 0.177, + "step": 101400 + }, + { + "epoch": 8.270860495436766, + "grad_norm": 11.14121150970459, + "learning_rate": 8.834655280912718e-07, + "loss": 0.1674, + "step": 101500 + }, + { + "epoch": 8.279009126466754, + "grad_norm": 12.197967529296875, + "learning_rate": 8.754098096197312e-07, + "loss": 0.1787, + "step": 101600 + }, + { + "epoch": 8.28715775749674, + "grad_norm": 12.565035820007324, + "learning_rate": 8.67387461788895e-07, + "loss": 0.1679, + "step": 101700 + }, + { + "epoch": 8.295306388526727, + "grad_norm": 21.256549835205078, + "learning_rate": 8.593985495048201e-07, + "loss": 0.1695, + "step": 101800 + }, + { + "epoch": 8.303455019556715, + "grad_norm": 4.485990524291992, + "learning_rate": 8.514431374030496e-07, + "loss": 0.1654, + "step": 101900 + }, + { + "epoch": 8.3116036505867, + "grad_norm": 13.213761329650879, + "learning_rate": 8.435212898480855e-07, + "loss": 0.1626, + "step": 102000 + }, + { + "epoch": 8.319752281616688, + "grad_norm": 19.035646438598633, + "learning_rate": 8.356330709328725e-07, + "loss": 0.1611, + "step": 102100 + }, + { + "epoch": 8.327900912646676, + "grad_norm": 21.1912841796875, + "learning_rate": 8.277785444782765e-07, + "loss": 0.1607, + "step": 102200 + }, + { + "epoch": 8.336049543676662, + "grad_norm": 19.324132919311523, + "learning_rate": 8.199577740325703e-07, + "loss": 0.1741, + "step": 102300 + }, + { + "epoch": 8.34419817470665, + "grad_norm": 8.325228691101074, + "learning_rate": 8.121708228709174e-07, + "loss": 0.1808, + "step": 102400 + }, + { + "epoch": 8.352346805736635, + "grad_norm": 11.028812408447266, + "learning_rate": 8.044177539948617e-07, + "loss": 0.169, + "step": 102500 + }, + { + "epoch": 8.360495436766623, + "grad_norm": 20.587303161621094, + "learning_rate": 7.966986301318158e-07, + "loss": 0.1569, + "step": 102600 + }, + { + "epoch": 8.36864406779661, + "grad_norm": 8.49282455444336, + "learning_rate": 7.890135137345589e-07, + "loss": 0.1584, + "step": 102700 + }, + { + "epoch": 8.376792698826597, + "grad_norm": 14.866241455078125, + "learning_rate": 7.813624669807246e-07, + "loss": 0.1608, + "step": 102800 + }, + { + "epoch": 8.384941329856584, + "grad_norm": 3.761150598526001, + "learning_rate": 7.73745551772298e-07, + "loss": 0.1533, + "step": 102900 + }, + { + "epoch": 8.393089960886572, + "grad_norm": 17.36056900024414, + "learning_rate": 7.66162829735122e-07, + "loss": 0.1723, + "step": 103000 + }, + { + "epoch": 8.401238591916558, + "grad_norm": 14.63774585723877, + "learning_rate": 7.586143622183922e-07, + "loss": 0.1769, + "step": 103100 + }, + { + "epoch": 8.409387222946545, + "grad_norm": 15.453008651733398, + "learning_rate": 7.511002102941639e-07, + "loss": 0.1845, + "step": 103200 + }, + { + "epoch": 8.417535853976531, + "grad_norm": 23.958969116210938, + "learning_rate": 7.436204347568548e-07, + "loss": 0.1829, + "step": 103300 + }, + { + "epoch": 8.425684485006519, + "grad_norm": 22.29449462890625, + "learning_rate": 7.361750961227587e-07, + "loss": 0.1722, + "step": 103400 + }, + { + "epoch": 8.433833116036507, + "grad_norm": 12.636420249938965, + "learning_rate": 7.287642546295487e-07, + "loss": 0.1614, + "step": 103500 + }, + { + "epoch": 8.441981747066492, + "grad_norm": 12.580671310424805, + "learning_rate": 7.213879702357951e-07, + "loss": 0.1713, + "step": 103600 + }, + { + "epoch": 8.45013037809648, + "grad_norm": 9.213543891906738, + "learning_rate": 7.140463026204764e-07, + "loss": 0.1619, + "step": 103700 + }, + { + "epoch": 8.458279009126466, + "grad_norm": 15.926830291748047, + "learning_rate": 7.067393111825016e-07, + "loss": 0.1748, + "step": 103800 + }, + { + "epoch": 8.466427640156454, + "grad_norm": 22.008920669555664, + "learning_rate": 6.994670550402249e-07, + "loss": 0.1926, + "step": 103900 + }, + { + "epoch": 8.474576271186441, + "grad_norm": 4.002703666687012, + "learning_rate": 6.922295930309691e-07, + "loss": 0.1613, + "step": 104000 + }, + { + "epoch": 8.482724902216427, + "grad_norm": 10.932751655578613, + "learning_rate": 6.850269837105522e-07, + "loss": 0.1635, + "step": 104100 + }, + { + "epoch": 8.490873533246415, + "grad_norm": 20.70867347717285, + "learning_rate": 6.778592853528077e-07, + "loss": 0.1708, + "step": 104200 + }, + { + "epoch": 8.499022164276402, + "grad_norm": 9.567403793334961, + "learning_rate": 6.707265559491188e-07, + "loss": 0.1814, + "step": 104300 + }, + { + "epoch": 8.507170795306388, + "grad_norm": 24.9285888671875, + "learning_rate": 6.63628853207946e-07, + "loss": 0.1746, + "step": 104400 + }, + { + "epoch": 8.515319426336376, + "grad_norm": 12.97628402709961, + "learning_rate": 6.565662345543595e-07, + "loss": 0.17, + "step": 104500 + }, + { + "epoch": 8.523468057366362, + "grad_norm": 5.221209526062012, + "learning_rate": 6.495387571295785e-07, + "loss": 0.1726, + "step": 104600 + }, + { + "epoch": 8.53161668839635, + "grad_norm": 12.438835144042969, + "learning_rate": 6.42546477790506e-07, + "loss": 0.1703, + "step": 104700 + }, + { + "epoch": 8.539765319426337, + "grad_norm": 9.98957633972168, + "learning_rate": 6.355894531092705e-07, + "loss": 0.1883, + "step": 104800 + }, + { + "epoch": 8.547913950456323, + "grad_norm": 8.844900131225586, + "learning_rate": 6.286677393727653e-07, + "loss": 0.1623, + "step": 104900 + }, + { + "epoch": 8.55606258148631, + "grad_norm": 5.921658039093018, + "learning_rate": 6.217813925821958e-07, + "loss": 0.16, + "step": 105000 + }, + { + "epoch": 8.564211212516298, + "grad_norm": 12.132319450378418, + "learning_rate": 6.149304684526253e-07, + "loss": 0.1843, + "step": 105100 + }, + { + "epoch": 8.572359843546284, + "grad_norm": 13.31769847869873, + "learning_rate": 6.081150224125254e-07, + "loss": 0.1586, + "step": 105200 + }, + { + "epoch": 8.580508474576272, + "grad_norm": 21.240800857543945, + "learning_rate": 6.013351096033254e-07, + "loss": 0.1783, + "step": 105300 + }, + { + "epoch": 8.588657105606258, + "grad_norm": 9.178833961486816, + "learning_rate": 5.945907848789667e-07, + "loss": 0.1847, + "step": 105400 + }, + { + "epoch": 8.596805736636245, + "grad_norm": 7.893414497375488, + "learning_rate": 5.878821028054637e-07, + "loss": 0.1474, + "step": 105500 + }, + { + "epoch": 8.604954367666233, + "grad_norm": 17.363147735595703, + "learning_rate": 5.812091176604551e-07, + "loss": 0.1567, + "step": 105600 + }, + { + "epoch": 8.613102998696219, + "grad_norm": 7.612610340118408, + "learning_rate": 5.745718834327679e-07, + "loss": 0.158, + "step": 105700 + }, + { + "epoch": 8.621251629726206, + "grad_norm": 12.395828247070312, + "learning_rate": 5.679704538219827e-07, + "loss": 0.1817, + "step": 105800 + }, + { + "epoch": 8.629400260756192, + "grad_norm": 2.951467514038086, + "learning_rate": 5.614048822379947e-07, + "loss": 0.1731, + "step": 105900 + }, + { + "epoch": 8.63754889178618, + "grad_norm": 14.023295402526855, + "learning_rate": 5.548752218005882e-07, + "loss": 0.1638, + "step": 106000 + }, + { + "epoch": 8.645697522816167, + "grad_norm": 21.505937576293945, + "learning_rate": 5.483815253389957e-07, + "loss": 0.1529, + "step": 106100 + }, + { + "epoch": 8.653846153846153, + "grad_norm": 8.31225299835205, + "learning_rate": 5.41923845391486e-07, + "loss": 0.1563, + "step": 106200 + }, + { + "epoch": 8.661994784876141, + "grad_norm": 9.446884155273438, + "learning_rate": 5.355022342049249e-07, + "loss": 0.1622, + "step": 106300 + }, + { + "epoch": 8.670143415906129, + "grad_norm": 21.06761360168457, + "learning_rate": 5.291167437343608e-07, + "loss": 0.1602, + "step": 106400 + }, + { + "epoch": 8.678292046936114, + "grad_norm": 13.025223731994629, + "learning_rate": 5.227674256426002e-07, + "loss": 0.1611, + "step": 106500 + }, + { + "epoch": 8.686440677966102, + "grad_norm": 6.65778923034668, + "learning_rate": 5.164543312997922e-07, + "loss": 0.1677, + "step": 106600 + }, + { + "epoch": 8.694589308996088, + "grad_norm": 25.8751220703125, + "learning_rate": 5.101775117830121e-07, + "loss": 0.1639, + "step": 106700 + }, + { + "epoch": 8.702737940026076, + "grad_norm": 18.437524795532227, + "learning_rate": 5.039370178758485e-07, + "loss": 0.1651, + "step": 106800 + }, + { + "epoch": 8.710886571056063, + "grad_norm": 31.746627807617188, + "learning_rate": 4.977329000679903e-07, + "loss": 0.1758, + "step": 106900 + }, + { + "epoch": 8.719035202086049, + "grad_norm": 12.55679988861084, + "learning_rate": 4.915652085548217e-07, + "loss": 0.1571, + "step": 107000 + }, + { + "epoch": 8.727183833116037, + "grad_norm": 1.4074722528457642, + "learning_rate": 4.854339932370134e-07, + "loss": 0.1526, + "step": 107100 + }, + { + "epoch": 8.735332464146023, + "grad_norm": 5.811018466949463, + "learning_rate": 4.793393037201194e-07, + "loss": 0.1745, + "step": 107200 + }, + { + "epoch": 8.74348109517601, + "grad_norm": 2.8639020919799805, + "learning_rate": 4.7328118931417753e-07, + "loss": 0.1695, + "step": 107300 + }, + { + "epoch": 8.751629726205998, + "grad_norm": 20.180130004882812, + "learning_rate": 4.672596990333073e-07, + "loss": 0.1758, + "step": 107400 + }, + { + "epoch": 8.759778357235984, + "grad_norm": 19.003700256347656, + "learning_rate": 4.6127488159531495e-07, + "loss": 0.1669, + "step": 107500 + }, + { + "epoch": 8.767926988265971, + "grad_norm": 12.393278121948242, + "learning_rate": 4.553267854213017e-07, + "loss": 0.1827, + "step": 107600 + }, + { + "epoch": 8.776075619295959, + "grad_norm": 23.79950714111328, + "learning_rate": 4.494154586352667e-07, + "loss": 0.1571, + "step": 107700 + }, + { + "epoch": 8.784224250325945, + "grad_norm": 21.107633590698242, + "learning_rate": 4.435409490637227e-07, + "loss": 0.1744, + "step": 107800 + }, + { + "epoch": 8.792372881355933, + "grad_norm": 15.573356628417969, + "learning_rate": 4.3770330423530626e-07, + "loss": 0.1675, + "step": 107900 + }, + { + "epoch": 8.800521512385918, + "grad_norm": 14.63633918762207, + "learning_rate": 4.3190257138039313e-07, + "loss": 0.1667, + "step": 108000 + }, + { + "epoch": 8.808670143415906, + "grad_norm": 15.823701858520508, + "learning_rate": 4.2613879743071907e-07, + "loss": 0.164, + "step": 108100 + }, + { + "epoch": 8.816818774445894, + "grad_norm": 7.163984775543213, + "learning_rate": 4.204120290189956e-07, + "loss": 0.1648, + "step": 108200 + }, + { + "epoch": 8.82496740547588, + "grad_norm": 10.87267780303955, + "learning_rate": 4.147223124785366e-07, + "loss": 0.1767, + "step": 108300 + }, + { + "epoch": 8.833116036505867, + "grad_norm": 13.024577140808105, + "learning_rate": 4.0906969384288396e-07, + "loss": 0.1561, + "step": 108400 + }, + { + "epoch": 8.841264667535853, + "grad_norm": 15.831514358520508, + "learning_rate": 4.034542188454282e-07, + "loss": 0.2002, + "step": 108500 + }, + { + "epoch": 8.84941329856584, + "grad_norm": 8.199058532714844, + "learning_rate": 3.9787593291904793e-07, + "loss": 0.1823, + "step": 108600 + }, + { + "epoch": 8.857561929595828, + "grad_norm": 14.69583511352539, + "learning_rate": 3.9233488119573506e-07, + "loss": 0.1779, + "step": 108700 + }, + { + "epoch": 8.865710560625814, + "grad_norm": 12.765257835388184, + "learning_rate": 3.868311085062337e-07, + "loss": 0.1626, + "step": 108800 + }, + { + "epoch": 8.873859191655802, + "grad_norm": 31.990026473999023, + "learning_rate": 3.8136465937967657e-07, + "loss": 0.1856, + "step": 108900 + }, + { + "epoch": 8.88200782268579, + "grad_norm": 24.627126693725586, + "learning_rate": 3.7593557804322167e-07, + "loss": 0.1518, + "step": 109000 + }, + { + "epoch": 8.890156453715775, + "grad_norm": 32.763092041015625, + "learning_rate": 3.705439084217016e-07, + "loss": 0.1526, + "step": 109100 + }, + { + "epoch": 8.898305084745763, + "grad_norm": 14.418821334838867, + "learning_rate": 3.6518969413725905e-07, + "loss": 0.1602, + "step": 109200 + }, + { + "epoch": 8.906453715775749, + "grad_norm": 9.382340431213379, + "learning_rate": 3.5987297850900217e-07, + "loss": 0.1742, + "step": 109300 + }, + { + "epoch": 8.914602346805736, + "grad_norm": 22.482595443725586, + "learning_rate": 3.5459380455264594e-07, + "loss": 0.1737, + "step": 109400 + }, + { + "epoch": 8.922750977835724, + "grad_norm": 18.5339412689209, + "learning_rate": 3.4935221498017316e-07, + "loss": 0.1581, + "step": 109500 + }, + { + "epoch": 8.93089960886571, + "grad_norm": 21.965267181396484, + "learning_rate": 3.4414825219948153e-07, + "loss": 0.1597, + "step": 109600 + }, + { + "epoch": 8.939048239895698, + "grad_norm": 13.353527069091797, + "learning_rate": 3.3898195831404354e-07, + "loss": 0.1747, + "step": 109700 + }, + { + "epoch": 8.947196870925685, + "grad_norm": 7.977973461151123, + "learning_rate": 3.3385337512256863e-07, + "loss": 0.1562, + "step": 109800 + }, + { + "epoch": 8.955345501955671, + "grad_norm": 9.263310432434082, + "learning_rate": 3.287625441186576e-07, + "loss": 0.1772, + "step": 109900 + }, + { + "epoch": 8.963494132985659, + "grad_norm": 13.787714958190918, + "learning_rate": 3.2370950649047383e-07, + "loss": 0.1976, + "step": 110000 + }, + { + "epoch": 8.971642764015645, + "grad_norm": 20.066761016845703, + "learning_rate": 3.1869430312040816e-07, + "loss": 0.1596, + "step": 110100 + }, + { + "epoch": 8.979791395045632, + "grad_norm": 20.64689826965332, + "learning_rate": 3.137169745847435e-07, + "loss": 0.1704, + "step": 110200 + }, + { + "epoch": 8.98794002607562, + "grad_norm": 46.617713928222656, + "learning_rate": 3.08777561153335e-07, + "loss": 0.1889, + "step": 110300 + }, + { + "epoch": 8.996088657105606, + "grad_norm": 14.401327133178711, + "learning_rate": 3.0387610278927725e-07, + "loss": 0.1702, + "step": 110400 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.8196787148594378, + "eval_loss": 0.7465346455574036, + "eval_runtime": 7.168, + "eval_samples_per_second": 347.378, + "eval_steps_per_second": 43.527, + "step": 110448 + }, + { + "epoch": 9.004237288135593, + "grad_norm": 15.593995094299316, + "learning_rate": 2.990126391485848e-07, + "loss": 0.1722, + "step": 110500 + }, + { + "epoch": 9.01238591916558, + "grad_norm": 4.0746636390686035, + "learning_rate": 2.941872095798698e-07, + "loss": 0.1346, + "step": 110600 + }, + { + "epoch": 9.020534550195567, + "grad_norm": 6.78621768951416, + "learning_rate": 2.893998531240222e-07, + "loss": 0.1819, + "step": 110700 + }, + { + "epoch": 9.028683181225555, + "grad_norm": 16.810945510864258, + "learning_rate": 2.8465060851389725e-07, + "loss": 0.152, + "step": 110800 + }, + { + "epoch": 9.03683181225554, + "grad_norm": 2.5170655250549316, + "learning_rate": 2.7993951417400025e-07, + "loss": 0.1737, + "step": 110900 + }, + { + "epoch": 9.044980443285528, + "grad_norm": 5.630674362182617, + "learning_rate": 2.752666082201727e-07, + "loss": 0.1703, + "step": 111000 + }, + { + "epoch": 9.053129074315516, + "grad_norm": 29.249120712280273, + "learning_rate": 2.7063192845929286e-07, + "loss": 0.1648, + "step": 111100 + }, + { + "epoch": 9.061277705345502, + "grad_norm": 7.27542781829834, + "learning_rate": 2.660355123889585e-07, + "loss": 0.1483, + "step": 111200 + }, + { + "epoch": 9.06942633637549, + "grad_norm": 27.242809295654297, + "learning_rate": 2.614773971971929e-07, + "loss": 0.1693, + "step": 111300 + }, + { + "epoch": 9.077574967405475, + "grad_norm": 15.899724006652832, + "learning_rate": 2.5695761976213704e-07, + "loss": 0.1562, + "step": 111400 + }, + { + "epoch": 9.085723598435463, + "grad_norm": 20.975248336791992, + "learning_rate": 2.5247621665175636e-07, + "loss": 0.1558, + "step": 111500 + }, + { + "epoch": 9.09387222946545, + "grad_norm": 17.303001403808594, + "learning_rate": 2.4803322412354227e-07, + "loss": 0.1594, + "step": 111600 + }, + { + "epoch": 9.102020860495436, + "grad_norm": 14.3364839553833, + "learning_rate": 2.436286781242192e-07, + "loss": 0.1558, + "step": 111700 + }, + { + "epoch": 9.110169491525424, + "grad_norm": 18.47357940673828, + "learning_rate": 2.3926261428945386e-07, + "loss": 0.1713, + "step": 111800 + }, + { + "epoch": 9.118318122555412, + "grad_norm": 2.021436929702759, + "learning_rate": 2.3493506794356745e-07, + "loss": 0.1577, + "step": 111900 + }, + { + "epoch": 9.126466753585397, + "grad_norm": 4.512004852294922, + "learning_rate": 2.3064607409924888e-07, + "loss": 0.1552, + "step": 112000 + }, + { + "epoch": 9.134615384615385, + "grad_norm": 21.13969612121582, + "learning_rate": 2.2639566745727203e-07, + "loss": 0.1504, + "step": 112100 + }, + { + "epoch": 9.142764015645371, + "grad_norm": 17.030675888061523, + "learning_rate": 2.2218388240621558e-07, + "loss": 0.1785, + "step": 112200 + }, + { + "epoch": 9.150912646675359, + "grad_norm": 11.586610794067383, + "learning_rate": 2.1801075302218423e-07, + "loss": 0.174, + "step": 112300 + }, + { + "epoch": 9.159061277705346, + "grad_norm": 19.795167922973633, + "learning_rate": 2.1387631306853174e-07, + "loss": 0.1672, + "step": 112400 + }, + { + "epoch": 9.167209908735332, + "grad_norm": 23.909713745117188, + "learning_rate": 2.0978059599559065e-07, + "loss": 0.1684, + "step": 112500 + }, + { + "epoch": 9.17535853976532, + "grad_norm": 5.545074939727783, + "learning_rate": 2.057236349403985e-07, + "loss": 0.165, + "step": 112600 + }, + { + "epoch": 9.183507170795306, + "grad_norm": 12.588091850280762, + "learning_rate": 2.0170546272643256e-07, + "loss": 0.167, + "step": 112700 + }, + { + "epoch": 9.191655801825293, + "grad_norm": 12.73204517364502, + "learning_rate": 1.9772611186334168e-07, + "loss": 0.1535, + "step": 112800 + }, + { + "epoch": 9.19980443285528, + "grad_norm": 11.712594985961914, + "learning_rate": 1.9378561454668598e-07, + "loss": 0.1629, + "step": 112900 + }, + { + "epoch": 9.207953063885267, + "grad_norm": 6.922073841094971, + "learning_rate": 1.8988400265767316e-07, + "loss": 0.1544, + "step": 113000 + }, + { + "epoch": 9.216101694915254, + "grad_norm": 14.258295059204102, + "learning_rate": 1.8602130776290362e-07, + "loss": 0.1575, + "step": 113100 + }, + { + "epoch": 9.224250325945242, + "grad_norm": 20.113460540771484, + "learning_rate": 1.8219756111411357e-07, + "loss": 0.151, + "step": 113200 + }, + { + "epoch": 9.232398956975228, + "grad_norm": 9.496116638183594, + "learning_rate": 1.784127936479213e-07, + "loss": 0.1791, + "step": 113300 + }, + { + "epoch": 9.240547588005215, + "grad_norm": 7.643208026885986, + "learning_rate": 1.7466703598557898e-07, + "loss": 0.1752, + "step": 113400 + }, + { + "epoch": 9.248696219035201, + "grad_norm": 21.511184692382812, + "learning_rate": 1.709603184327241e-07, + "loss": 0.1538, + "step": 113500 + }, + { + "epoch": 9.256844850065189, + "grad_norm": 18.147607803344727, + "learning_rate": 1.6729267097913338e-07, + "loss": 0.1606, + "step": 113600 + }, + { + "epoch": 9.264993481095177, + "grad_norm": 13.48155689239502, + "learning_rate": 1.6366412329848035e-07, + "loss": 0.1661, + "step": 113700 + }, + { + "epoch": 9.273142112125162, + "grad_norm": 21.713895797729492, + "learning_rate": 1.6007470474809772e-07, + "loss": 0.157, + "step": 113800 + }, + { + "epoch": 9.28129074315515, + "grad_norm": 11.30298137664795, + "learning_rate": 1.565244443687347e-07, + "loss": 0.1802, + "step": 113900 + }, + { + "epoch": 9.289439374185136, + "grad_norm": 15.809433937072754, + "learning_rate": 1.5301337088432787e-07, + "loss": 0.1723, + "step": 114000 + }, + { + "epoch": 9.297588005215124, + "grad_norm": 8.747072219848633, + "learning_rate": 1.4954151270176686e-07, + "loss": 0.1616, + "step": 114100 + }, + { + "epoch": 9.305736636245111, + "grad_norm": 1.6549293994903564, + "learning_rate": 1.4610889791066008e-07, + "loss": 0.1732, + "step": 114200 + }, + { + "epoch": 9.313885267275097, + "grad_norm": 13.10067367553711, + "learning_rate": 1.4271555428311323e-07, + "loss": 0.1618, + "step": 114300 + }, + { + "epoch": 9.322033898305085, + "grad_norm": 13.006690979003906, + "learning_rate": 1.39361509273504e-07, + "loss": 0.1806, + "step": 114400 + }, + { + "epoch": 9.330182529335072, + "grad_norm": 23.973905563354492, + "learning_rate": 1.3604679001825605e-07, + "loss": 0.1678, + "step": 114500 + }, + { + "epoch": 9.338331160365058, + "grad_norm": 10.249641418457031, + "learning_rate": 1.3277142333562253e-07, + "loss": 0.1646, + "step": 114600 + }, + { + "epoch": 9.346479791395046, + "grad_norm": 30.132413864135742, + "learning_rate": 1.2953543572546968e-07, + "loss": 0.1635, + "step": 114700 + }, + { + "epoch": 9.354628422425032, + "grad_norm": 13.259139060974121, + "learning_rate": 1.2633885336906014e-07, + "loss": 0.172, + "step": 114800 + }, + { + "epoch": 9.36277705345502, + "grad_norm": 19.1724853515625, + "learning_rate": 1.2318170212884285e-07, + "loss": 0.1633, + "step": 114900 + }, + { + "epoch": 9.370925684485007, + "grad_norm": 14.311450004577637, + "learning_rate": 1.2006400754824177e-07, + "loss": 0.1747, + "step": 115000 + }, + { + "epoch": 9.379074315514993, + "grad_norm": 8.39560317993164, + "learning_rate": 1.1698579485145134e-07, + "loss": 0.1441, + "step": 115100 + }, + { + "epoch": 9.38722294654498, + "grad_norm": 10.600957870483398, + "learning_rate": 1.1394708894323314e-07, + "loss": 0.1923, + "step": 115200 + }, + { + "epoch": 9.395371577574968, + "grad_norm": 9.45894718170166, + "learning_rate": 1.1094791440871e-07, + "loss": 0.1476, + "step": 115300 + }, + { + "epoch": 9.403520208604954, + "grad_norm": 6.497547149658203, + "learning_rate": 1.079882955131728e-07, + "loss": 0.1621, + "step": 115400 + }, + { + "epoch": 9.411668839634942, + "grad_norm": 5.700404644012451, + "learning_rate": 1.0506825620187954e-07, + "loss": 0.1569, + "step": 115500 + }, + { + "epoch": 9.419817470664928, + "grad_norm": 5.055960655212402, + "learning_rate": 1.0218782009986494e-07, + "loss": 0.1439, + "step": 115600 + }, + { + "epoch": 9.427966101694915, + "grad_norm": 0.8036000728607178, + "learning_rate": 9.93470105117461e-08, + "loss": 0.163, + "step": 115700 + }, + { + "epoch": 9.436114732724903, + "grad_norm": 21.1984920501709, + "learning_rate": 9.654585042153663e-08, + "loss": 0.153, + "step": 115800 + }, + { + "epoch": 9.444263363754889, + "grad_norm": 3.3010308742523193, + "learning_rate": 9.378436249245892e-08, + "loss": 0.1584, + "step": 115900 + }, + { + "epoch": 9.452411994784876, + "grad_norm": 9.636171340942383, + "learning_rate": 9.106256906676159e-08, + "loss": 0.1765, + "step": 116000 + }, + { + "epoch": 9.460560625814864, + "grad_norm": 1.7043323516845703, + "learning_rate": 8.838049216554123e-08, + "loss": 0.1604, + "step": 116100 + }, + { + "epoch": 9.46870925684485, + "grad_norm": 9.73293399810791, + "learning_rate": 8.573815348855818e-08, + "loss": 0.1703, + "step": 116200 + }, + { + "epoch": 9.476857887874838, + "grad_norm": 7.777896404266357, + "learning_rate": 8.313557441406606e-08, + "loss": 0.1632, + "step": 116300 + }, + { + "epoch": 9.485006518904823, + "grad_norm": 17.46415901184082, + "learning_rate": 8.057277599863744e-08, + "loss": 0.1536, + "step": 116400 + }, + { + "epoch": 9.493155149934811, + "grad_norm": 10.912395477294922, + "learning_rate": 7.804977897699295e-08, + "loss": 0.1611, + "step": 116500 + }, + { + "epoch": 9.501303780964799, + "grad_norm": 12.858296394348145, + "learning_rate": 7.556660376183301e-08, + "loss": 0.1458, + "step": 116600 + }, + { + "epoch": 9.509452411994785, + "grad_norm": 7.577301025390625, + "learning_rate": 7.312327044367463e-08, + "loss": 0.1408, + "step": 116700 + }, + { + "epoch": 9.517601043024772, + "grad_norm": 13.470318794250488, + "learning_rate": 7.071979879068769e-08, + "loss": 0.1568, + "step": 116800 + }, + { + "epoch": 9.525749674054758, + "grad_norm": 16.199295043945312, + "learning_rate": 6.835620824853451e-08, + "loss": 0.161, + "step": 116900 + }, + { + "epoch": 9.533898305084746, + "grad_norm": 15.154216766357422, + "learning_rate": 6.603251794021381e-08, + "loss": 0.1783, + "step": 117000 + }, + { + "epoch": 9.542046936114733, + "grad_norm": 9.926989555358887, + "learning_rate": 6.374874666590369e-08, + "loss": 0.149, + "step": 117100 + }, + { + "epoch": 9.55019556714472, + "grad_norm": 14.719680786132812, + "learning_rate": 6.15049129028128e-08, + "loss": 0.1459, + "step": 117200 + }, + { + "epoch": 9.558344198174707, + "grad_norm": 23.45909881591797, + "learning_rate": 5.93010348050288e-08, + "loss": 0.1624, + "step": 117300 + }, + { + "epoch": 9.566492829204694, + "grad_norm": 22.256080627441406, + "learning_rate": 5.7137130203370194e-08, + "loss": 0.1536, + "step": 117400 + }, + { + "epoch": 9.57464146023468, + "grad_norm": 5.540316581726074, + "learning_rate": 5.501321660524583e-08, + "loss": 0.1541, + "step": 117500 + }, + { + "epoch": 9.582790091264668, + "grad_norm": 3.839772939682007, + "learning_rate": 5.292931119451006e-08, + "loss": 0.1577, + "step": 117600 + }, + { + "epoch": 9.590938722294654, + "grad_norm": 4.665050029754639, + "learning_rate": 5.088543083132502e-08, + "loss": 0.1547, + "step": 117700 + }, + { + "epoch": 9.599087353324641, + "grad_norm": 18.975759506225586, + "learning_rate": 4.888159205202303e-08, + "loss": 0.1652, + "step": 117800 + }, + { + "epoch": 9.607235984354629, + "grad_norm": 13.844809532165527, + "learning_rate": 4.691781106897497e-08, + "loss": 0.1528, + "step": 117900 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 5.203334331512451, + "learning_rate": 4.499410377045765e-08, + "loss": 0.1484, + "step": 118000 + }, + { + "epoch": 9.623533246414603, + "grad_norm": 17.595108032226562, + "learning_rate": 4.311048572052501e-08, + "loss": 0.1547, + "step": 118100 + }, + { + "epoch": 9.631681877444588, + "grad_norm": 10.652242660522461, + "learning_rate": 4.1266972158883204e-08, + "loss": 0.1658, + "step": 118200 + }, + { + "epoch": 9.639830508474576, + "grad_norm": 15.711381912231445, + "learning_rate": 3.9463578000765724e-08, + "loss": 0.1493, + "step": 118300 + }, + { + "epoch": 9.647979139504564, + "grad_norm": 18.064918518066406, + "learning_rate": 3.7700317836814605e-08, + "loss": 0.1558, + "step": 118400 + }, + { + "epoch": 9.65612777053455, + "grad_norm": 11.699357986450195, + "learning_rate": 3.5977205932962164e-08, + "loss": 0.1465, + "step": 118500 + }, + { + "epoch": 9.664276401564537, + "grad_norm": 9.775052070617676, + "learning_rate": 3.429425623031335e-08, + "loss": 0.1456, + "step": 118600 + }, + { + "epoch": 9.672425032594525, + "grad_norm": 19.886598587036133, + "learning_rate": 3.265148234503579e-08, + "loss": 0.165, + "step": 118700 + }, + { + "epoch": 9.68057366362451, + "grad_norm": 13.31386661529541, + "learning_rate": 3.104889756824825e-08, + "loss": 0.1682, + "step": 118800 + }, + { + "epoch": 9.688722294654498, + "grad_norm": 16.752405166625977, + "learning_rate": 2.9486514865912364e-08, + "loss": 0.1498, + "step": 118900 + }, + { + "epoch": 9.696870925684484, + "grad_norm": 12.920425415039062, + "learning_rate": 2.7964346878729952e-08, + "loss": 0.1573, + "step": 119000 + }, + { + "epoch": 9.705019556714472, + "grad_norm": 5.0780110359191895, + "learning_rate": 2.64824059220381e-08, + "loss": 0.159, + "step": 119100 + }, + { + "epoch": 9.71316818774446, + "grad_norm": 13.475509643554688, + "learning_rate": 2.504070398571201e-08, + "loss": 0.1997, + "step": 119200 + }, + { + "epoch": 9.721316818774445, + "grad_norm": 20.931211471557617, + "learning_rate": 2.3639252734065644e-08, + "loss": 0.1957, + "step": 119300 + }, + { + "epoch": 9.729465449804433, + "grad_norm": 20.29063606262207, + "learning_rate": 2.227806350575956e-08, + "loss": 0.1388, + "step": 119400 + }, + { + "epoch": 9.737614080834419, + "grad_norm": 0.7664732336997986, + "learning_rate": 2.0957147313707127e-08, + "loss": 0.166, + "step": 119500 + }, + { + "epoch": 9.745762711864407, + "grad_norm": 18.868257522583008, + "learning_rate": 1.9676514844987338e-08, + "loss": 0.1618, + "step": 119600 + }, + { + "epoch": 9.753911342894394, + "grad_norm": 15.741533279418945, + "learning_rate": 1.8436176460756572e-08, + "loss": 0.1589, + "step": 119700 + }, + { + "epoch": 9.76205997392438, + "grad_norm": 11.955362319946289, + "learning_rate": 1.723614219616754e-08, + "loss": 0.168, + "step": 119800 + }, + { + "epoch": 9.770208604954368, + "grad_norm": 26.171483993530273, + "learning_rate": 1.6076421760283234e-08, + "loss": 0.157, + "step": 119900 + }, + { + "epoch": 9.778357235984355, + "grad_norm": 14.887884140014648, + "learning_rate": 1.4957024536003674e-08, + "loss": 0.1383, + "step": 120000 + }, + { + "epoch": 9.786505867014341, + "grad_norm": 9.518312454223633, + "learning_rate": 1.3877959579985944e-08, + "loss": 0.1385, + "step": 120100 + }, + { + "epoch": 9.794654498044329, + "grad_norm": 18.155826568603516, + "learning_rate": 1.283923562257483e-08, + "loss": 0.1623, + "step": 120200 + }, + { + "epoch": 9.802803129074315, + "grad_norm": 17.2945613861084, + "learning_rate": 1.1840861067727306e-08, + "loss": 0.1551, + "step": 120300 + }, + { + "epoch": 9.810951760104302, + "grad_norm": 24.658214569091797, + "learning_rate": 1.0882843992949255e-08, + "loss": 0.1499, + "step": 120400 + }, + { + "epoch": 9.81910039113429, + "grad_norm": 6.880736351013184, + "learning_rate": 9.9651921492272e-09, + "loss": 0.1501, + "step": 120500 + }, + { + "epoch": 9.827249022164276, + "grad_norm": 25.12505531311035, + "learning_rate": 9.087912960967227e-09, + "loss": 0.1571, + "step": 120600 + }, + { + "epoch": 9.835397653194264, + "grad_norm": 28.05438995361328, + "learning_rate": 8.251013525932273e-09, + "loss": 0.1637, + "step": 120700 + }, + { + "epoch": 9.843546284224251, + "grad_norm": 10.58689022064209, + "learning_rate": 7.454500615188264e-09, + "loss": 0.1509, + "step": 120800 + }, + { + "epoch": 9.851694915254237, + "grad_norm": 24.10919761657715, + "learning_rate": 6.698380673048066e-09, + "loss": 0.1691, + "step": 120900 + }, + { + "epoch": 9.859843546284225, + "grad_norm": 0.43672606348991394, + "learning_rate": 5.982659817017067e-09, + "loss": 0.1746, + "step": 121000 + }, + { + "epoch": 9.86799217731421, + "grad_norm": 12.899723052978516, + "learning_rate": 5.307343837747115e-09, + "loss": 0.1497, + "step": 121100 + }, + { + "epoch": 9.876140808344198, + "grad_norm": 18.292190551757812, + "learning_rate": 4.672438198987661e-09, + "loss": 0.1594, + "step": 121200 + }, + { + "epoch": 9.884289439374186, + "grad_norm": 18.396923065185547, + "learning_rate": 4.077948037541357e-09, + "loss": 0.1574, + "step": 121300 + }, + { + "epoch": 9.892438070404172, + "grad_norm": 22.605993270874023, + "learning_rate": 3.5238781632240813e-09, + "loss": 0.1642, + "step": 121400 + }, + { + "epoch": 9.90058670143416, + "grad_norm": 23.427574157714844, + "learning_rate": 3.010233058824419e-09, + "loss": 0.1765, + "step": 121500 + }, + { + "epoch": 9.908735332464147, + "grad_norm": 3.2891268730163574, + "learning_rate": 2.5370168800681325e-09, + "loss": 0.1743, + "step": 121600 + }, + { + "epoch": 9.916883963494133, + "grad_norm": 19.58220100402832, + "learning_rate": 2.1042334555848585e-09, + "loss": 0.1596, + "step": 121700 + }, + { + "epoch": 9.92503259452412, + "grad_norm": 15.260977745056152, + "learning_rate": 1.711886286876463e-09, + "loss": 0.1486, + "step": 121800 + }, + { + "epoch": 9.933181225554106, + "grad_norm": 5.988215446472168, + "learning_rate": 1.3599785482881767e-09, + "loss": 0.1518, + "step": 121900 + }, + { + "epoch": 9.941329856584094, + "grad_norm": 5.850574970245361, + "learning_rate": 1.0485130869858362e-09, + "loss": 0.1588, + "step": 122000 + }, + { + "epoch": 9.949478487614082, + "grad_norm": 11.288055419921875, + "learning_rate": 7.774924229281278e-10, + "loss": 0.1734, + "step": 122100 + }, + { + "epoch": 9.957627118644067, + "grad_norm": 10.142143249511719, + "learning_rate": 5.469187488510441e-10, + "loss": 0.173, + "step": 122200 + }, + { + "epoch": 9.965775749674055, + "grad_norm": 14.460721015930176, + "learning_rate": 3.5679393024623533e-10, + "loss": 0.1603, + "step": 122300 + }, + { + "epoch": 9.973924380704041, + "grad_norm": 23.698572158813477, + "learning_rate": 2.071195053482411e-10, + "loss": 0.1616, + "step": 122400 + }, + { + "epoch": 9.982073011734029, + "grad_norm": 9.158120155334473, + "learning_rate": 9.789668512116823e-11, + "loss": 0.1702, + "step": 122500 + }, + { + "epoch": 9.990221642764016, + "grad_norm": 32.79683303833008, + "learning_rate": 2.912635325036384e-11, + "loss": 0.1718, + "step": 122600 + }, + { + "epoch": 9.998370273794002, + "grad_norm": 17.385313034057617, + "learning_rate": 8.090661318682636e-13, + "loss": 0.1656, + "step": 122700 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.7754501700401306, + "eval_runtime": 7.1115, + "eval_samples_per_second": 350.138, + "eval_steps_per_second": 43.873, + "step": 122720 + }, + { + "epoch": 10.0, + "step": 122720, + "total_flos": 1.617427903829713e+17, + "train_loss": 0.3309724763735177, + "train_runtime": 41426.0671, + "train_samples_per_second": 94.796, + "train_steps_per_second": 2.962 + } + ], + "logging_steps": 100, + "max_steps": 122720, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.617427903829713e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}