|
{ |
|
"best_metric": 0.8558232931726908, |
|
"best_model_checkpoint": "/scratch/camembertv2/runs/results/xnli/camembertav2-base-bf16-p2-17000/max_seq_length-160-gradient_accumulation_steps-4-precision-fp32-learning_rate-1e-05-epochs-10-lr_scheduler-polynomial-warmup_steps-0.1/SEED-1/checkpoint-36816", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 122720, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008148631029986962, |
|
"grad_norm": 1.3529162406921387, |
|
"learning_rate": 8.148631029986963e-08, |
|
"loss": 1.1024, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.016297262059973925, |
|
"grad_norm": 1.2678381204605103, |
|
"learning_rate": 1.6297262059973925e-07, |
|
"loss": 1.1006, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.024445893089960886, |
|
"grad_norm": 0.9964377880096436, |
|
"learning_rate": 2.4445893089960885e-07, |
|
"loss": 1.1, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03259452411994785, |
|
"grad_norm": 2.1048221588134766, |
|
"learning_rate": 3.259452411994785e-07, |
|
"loss": 1.1015, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04074315514993481, |
|
"grad_norm": 1.244388222694397, |
|
"learning_rate": 4.0743155149934816e-07, |
|
"loss": 1.0995, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04889178617992177, |
|
"grad_norm": 1.3034368753433228, |
|
"learning_rate": 4.889178617992177e-07, |
|
"loss": 1.0988, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05704041720990873, |
|
"grad_norm": 1.0444238185882568, |
|
"learning_rate": 5.704041720990874e-07, |
|
"loss": 1.0963, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0651890482398957, |
|
"grad_norm": 3.489915370941162, |
|
"learning_rate": 6.51890482398957e-07, |
|
"loss": 1.0967, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07333767926988266, |
|
"grad_norm": 1.0847524404525757, |
|
"learning_rate": 7.333767926988267e-07, |
|
"loss": 1.0952, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08148631029986962, |
|
"grad_norm": 1.0315290689468384, |
|
"learning_rate": 8.148631029986963e-07, |
|
"loss": 1.0935, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08963494132985658, |
|
"grad_norm": 1.655386209487915, |
|
"learning_rate": 8.963494132985659e-07, |
|
"loss": 1.0883, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09778357235984354, |
|
"grad_norm": 1.4588720798492432, |
|
"learning_rate": 9.778357235984354e-07, |
|
"loss": 1.0823, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1059322033898305, |
|
"grad_norm": 1.3579902648925781, |
|
"learning_rate": 1.059322033898305e-06, |
|
"loss": 1.0728, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11408083441981746, |
|
"grad_norm": 1.7215408086776733, |
|
"learning_rate": 1.1408083441981747e-06, |
|
"loss": 1.0528, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12222946544980444, |
|
"grad_norm": 2.2282140254974365, |
|
"learning_rate": 1.2222946544980446e-06, |
|
"loss": 1.0169, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1303780964797914, |
|
"grad_norm": 2.632673501968384, |
|
"learning_rate": 1.303780964797914e-06, |
|
"loss": 0.9727, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.13852672750977835, |
|
"grad_norm": 5.786465167999268, |
|
"learning_rate": 1.3852672750977837e-06, |
|
"loss": 0.9276, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.14667535853976532, |
|
"grad_norm": 5.518361568450928, |
|
"learning_rate": 1.4667535853976533e-06, |
|
"loss": 0.8697, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.15482398956975227, |
|
"grad_norm": 6.2041192054748535, |
|
"learning_rate": 1.5482398956975228e-06, |
|
"loss": 0.8348, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.16297262059973924, |
|
"grad_norm": 4.39878511428833, |
|
"learning_rate": 1.6297262059973926e-06, |
|
"loss": 0.784, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17112125162972622, |
|
"grad_norm": 8.355758666992188, |
|
"learning_rate": 1.7112125162972623e-06, |
|
"loss": 0.7816, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.17926988265971316, |
|
"grad_norm": 7.7773518562316895, |
|
"learning_rate": 1.7926988265971317e-06, |
|
"loss": 0.7391, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.18741851368970014, |
|
"grad_norm": 10.548506736755371, |
|
"learning_rate": 1.8741851368970016e-06, |
|
"loss": 0.7357, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.19556714471968709, |
|
"grad_norm": 7.029742240905762, |
|
"learning_rate": 1.955671447196871e-06, |
|
"loss": 0.7138, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.20371577574967406, |
|
"grad_norm": 7.368396759033203, |
|
"learning_rate": 2.037157757496741e-06, |
|
"loss": 0.6928, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.211864406779661, |
|
"grad_norm": 5.480055809020996, |
|
"learning_rate": 2.11864406779661e-06, |
|
"loss": 0.7032, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.22001303780964798, |
|
"grad_norm": 9.542736053466797, |
|
"learning_rate": 2.20013037809648e-06, |
|
"loss": 0.6752, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.22816166883963493, |
|
"grad_norm": 5.912474632263184, |
|
"learning_rate": 2.2816166883963494e-06, |
|
"loss": 0.6566, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2363102998696219, |
|
"grad_norm": 6.762691020965576, |
|
"learning_rate": 2.363102998696219e-06, |
|
"loss": 0.6671, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.24445893089960888, |
|
"grad_norm": 9.684687614440918, |
|
"learning_rate": 2.444589308996089e-06, |
|
"loss": 0.6612, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2526075619295958, |
|
"grad_norm": 8.38856029510498, |
|
"learning_rate": 2.5260756192959584e-06, |
|
"loss": 0.6441, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2607561929595828, |
|
"grad_norm": 8.438015937805176, |
|
"learning_rate": 2.607561929595828e-06, |
|
"loss": 0.6314, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2689048239895698, |
|
"grad_norm": 8.523892402648926, |
|
"learning_rate": 2.689048239895698e-06, |
|
"loss": 0.6382, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.2770534550195567, |
|
"grad_norm": 8.889211654663086, |
|
"learning_rate": 2.7705345501955674e-06, |
|
"loss": 0.6364, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.28520208604954367, |
|
"grad_norm": 441.1722717285156, |
|
"learning_rate": 2.852020860495437e-06, |
|
"loss": 0.6033, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.29335071707953064, |
|
"grad_norm": 10.166702270507812, |
|
"learning_rate": 2.9335071707953067e-06, |
|
"loss": 0.6282, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3014993481095176, |
|
"grad_norm": 10.398645401000977, |
|
"learning_rate": 3.0149934810951763e-06, |
|
"loss": 0.6072, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.30964797913950454, |
|
"grad_norm": 12.474647521972656, |
|
"learning_rate": 3.0964797913950456e-06, |
|
"loss": 0.6154, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3177966101694915, |
|
"grad_norm": 7.278645038604736, |
|
"learning_rate": 3.1779661016949152e-06, |
|
"loss": 0.6028, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3259452411994785, |
|
"grad_norm": 8.92871379852295, |
|
"learning_rate": 3.2594524119947853e-06, |
|
"loss": 0.61, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.33409387222946546, |
|
"grad_norm": 7.182549476623535, |
|
"learning_rate": 3.340938722294655e-06, |
|
"loss": 0.6053, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.34224250325945244, |
|
"grad_norm": 8.213513374328613, |
|
"learning_rate": 3.4224250325945246e-06, |
|
"loss": 0.5985, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.35039113428943935, |
|
"grad_norm": 8.712896347045898, |
|
"learning_rate": 3.503911342894394e-06, |
|
"loss": 0.6186, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.35853976531942633, |
|
"grad_norm": 11.862103462219238, |
|
"learning_rate": 3.5853976531942635e-06, |
|
"loss": 0.5893, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3666883963494133, |
|
"grad_norm": 10.739442825317383, |
|
"learning_rate": 3.666883963494133e-06, |
|
"loss": 0.5883, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3748370273794003, |
|
"grad_norm": 7.735357761383057, |
|
"learning_rate": 3.748370273794003e-06, |
|
"loss": 0.5942, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.3829856584093872, |
|
"grad_norm": 7.83574914932251, |
|
"learning_rate": 3.829856584093872e-06, |
|
"loss": 0.5715, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.39113428943937417, |
|
"grad_norm": 8.37131118774414, |
|
"learning_rate": 3.911342894393742e-06, |
|
"loss": 0.5481, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.39928292046936115, |
|
"grad_norm": 8.036620140075684, |
|
"learning_rate": 3.992829204693612e-06, |
|
"loss": 0.5712, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4074315514993481, |
|
"grad_norm": 13.716651916503906, |
|
"learning_rate": 4.074315514993482e-06, |
|
"loss": 0.5569, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.4155801825293351, |
|
"grad_norm": 8.926308631896973, |
|
"learning_rate": 4.1558018252933515e-06, |
|
"loss": 0.5404, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.423728813559322, |
|
"grad_norm": 10.423807144165039, |
|
"learning_rate": 4.23728813559322e-06, |
|
"loss": 0.5535, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.431877444589309, |
|
"grad_norm": 7.072977542877197, |
|
"learning_rate": 4.31877444589309e-06, |
|
"loss": 0.5529, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.44002607561929596, |
|
"grad_norm": 9.907496452331543, |
|
"learning_rate": 4.40026075619296e-06, |
|
"loss": 0.5459, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.44817470664928294, |
|
"grad_norm": 10.329442977905273, |
|
"learning_rate": 4.48174706649283e-06, |
|
"loss": 0.5777, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.45632333767926986, |
|
"grad_norm": 9.943466186523438, |
|
"learning_rate": 4.563233376792699e-06, |
|
"loss": 0.5686, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.46447196870925683, |
|
"grad_norm": 6.4508056640625, |
|
"learning_rate": 4.6447196870925686e-06, |
|
"loss": 0.5461, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.4726205997392438, |
|
"grad_norm": 7.132810115814209, |
|
"learning_rate": 4.726205997392438e-06, |
|
"loss": 0.5566, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.4807692307692308, |
|
"grad_norm": 7.723414421081543, |
|
"learning_rate": 4.807692307692308e-06, |
|
"loss": 0.5661, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.48891786179921776, |
|
"grad_norm": 11.582465171813965, |
|
"learning_rate": 4.889178617992178e-06, |
|
"loss": 0.5517, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.4970664928292047, |
|
"grad_norm": 7.484246730804443, |
|
"learning_rate": 4.970664928292047e-06, |
|
"loss": 0.568, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5052151238591917, |
|
"grad_norm": 10.276209831237793, |
|
"learning_rate": 5.052151238591917e-06, |
|
"loss": 0.525, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5133637548891786, |
|
"grad_norm": 8.953728675842285, |
|
"learning_rate": 5.1336375488917865e-06, |
|
"loss": 0.5534, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5215123859191656, |
|
"grad_norm": 8.184313774108887, |
|
"learning_rate": 5.215123859191656e-06, |
|
"loss": 0.5467, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5296610169491526, |
|
"grad_norm": 8.448795318603516, |
|
"learning_rate": 5.296610169491526e-06, |
|
"loss": 0.5411, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5378096479791395, |
|
"grad_norm": 9.622394561767578, |
|
"learning_rate": 5.378096479791396e-06, |
|
"loss": 0.5656, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5459582790091264, |
|
"grad_norm": 13.388031959533691, |
|
"learning_rate": 5.459582790091264e-06, |
|
"loss": 0.5433, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.5541069100391134, |
|
"grad_norm": 8.313009262084961, |
|
"learning_rate": 5.541069100391135e-06, |
|
"loss": 0.5373, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.5622555410691004, |
|
"grad_norm": 6.502901077270508, |
|
"learning_rate": 5.622555410691004e-06, |
|
"loss": 0.5443, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5704041720990873, |
|
"grad_norm": 6.7939863204956055, |
|
"learning_rate": 5.704041720990874e-06, |
|
"loss": 0.5691, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5785528031290743, |
|
"grad_norm": 8.51882266998291, |
|
"learning_rate": 5.785528031290744e-06, |
|
"loss": 0.5477, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.5867014341590613, |
|
"grad_norm": 7.1099324226379395, |
|
"learning_rate": 5.867014341590613e-06, |
|
"loss": 0.5287, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.5948500651890483, |
|
"grad_norm": 12.773337364196777, |
|
"learning_rate": 5.948500651890483e-06, |
|
"loss": 0.5188, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6029986962190352, |
|
"grad_norm": 7.300539016723633, |
|
"learning_rate": 6.029986962190353e-06, |
|
"loss": 0.5259, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6111473272490222, |
|
"grad_norm": 8.644438743591309, |
|
"learning_rate": 6.111473272490222e-06, |
|
"loss": 0.5299, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6192959582790091, |
|
"grad_norm": 10.571152687072754, |
|
"learning_rate": 6.192959582790091e-06, |
|
"loss": 0.5238, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.627444589308996, |
|
"grad_norm": 7.20805549621582, |
|
"learning_rate": 6.274445893089961e-06, |
|
"loss": 0.5184, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.635593220338983, |
|
"grad_norm": 6.403540134429932, |
|
"learning_rate": 6.3559322033898304e-06, |
|
"loss": 0.5069, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.64374185136897, |
|
"grad_norm": 5.379425525665283, |
|
"learning_rate": 6.4374185136897e-06, |
|
"loss": 0.5149, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.651890482398957, |
|
"grad_norm": 10.648305892944336, |
|
"learning_rate": 6.518904823989571e-06, |
|
"loss": 0.5319, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.660039113428944, |
|
"grad_norm": 7.570769786834717, |
|
"learning_rate": 6.60039113428944e-06, |
|
"loss": 0.5556, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.6681877444589309, |
|
"grad_norm": 7.133244037628174, |
|
"learning_rate": 6.68187744458931e-06, |
|
"loss": 0.5016, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.6763363754889179, |
|
"grad_norm": 9.345438003540039, |
|
"learning_rate": 6.7633637548891795e-06, |
|
"loss": 0.515, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.6844850065189049, |
|
"grad_norm": 7.831772327423096, |
|
"learning_rate": 6.844850065189049e-06, |
|
"loss": 0.5035, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.6926336375488917, |
|
"grad_norm": 4.8923468589782715, |
|
"learning_rate": 6.926336375488918e-06, |
|
"loss": 0.5379, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7007822685788787, |
|
"grad_norm": 5.273585319519043, |
|
"learning_rate": 7.007822685788788e-06, |
|
"loss": 0.5299, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7089308996088657, |
|
"grad_norm": 5.884276390075684, |
|
"learning_rate": 7.089308996088657e-06, |
|
"loss": 0.5212, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7170795306388527, |
|
"grad_norm": 11.059792518615723, |
|
"learning_rate": 7.170795306388527e-06, |
|
"loss": 0.5156, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.7252281616688396, |
|
"grad_norm": 9.073988914489746, |
|
"learning_rate": 7.252281616688397e-06, |
|
"loss": 0.516, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.7333767926988266, |
|
"grad_norm": 7.554878234863281, |
|
"learning_rate": 7.333767926988266e-06, |
|
"loss": 0.5189, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7415254237288136, |
|
"grad_norm": 7.445247173309326, |
|
"learning_rate": 7.415254237288137e-06, |
|
"loss": 0.5257, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.7496740547588006, |
|
"grad_norm": 6.337717056274414, |
|
"learning_rate": 7.496740547588006e-06, |
|
"loss": 0.5297, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.7578226857887875, |
|
"grad_norm": 8.99385929107666, |
|
"learning_rate": 7.578226857887876e-06, |
|
"loss": 0.516, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.7659713168187744, |
|
"grad_norm": 6.633099555969238, |
|
"learning_rate": 7.659713168187744e-06, |
|
"loss": 0.5012, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.7741199478487614, |
|
"grad_norm": 5.951461315155029, |
|
"learning_rate": 7.741199478487615e-06, |
|
"loss": 0.5004, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7822685788787483, |
|
"grad_norm": 8.544897079467773, |
|
"learning_rate": 7.822685788787483e-06, |
|
"loss": 0.5181, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.7904172099087353, |
|
"grad_norm": 9.489747047424316, |
|
"learning_rate": 7.904172099087354e-06, |
|
"loss": 0.4967, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.7985658409387223, |
|
"grad_norm": 8.092077255249023, |
|
"learning_rate": 7.985658409387224e-06, |
|
"loss": 0.5054, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.8067144719687093, |
|
"grad_norm": 6.389376163482666, |
|
"learning_rate": 8.067144719687093e-06, |
|
"loss": 0.4826, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.8148631029986962, |
|
"grad_norm": 8.780875205993652, |
|
"learning_rate": 8.148631029986964e-06, |
|
"loss": 0.5211, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8230117340286832, |
|
"grad_norm": 5.853449821472168, |
|
"learning_rate": 8.230117340286832e-06, |
|
"loss": 0.4993, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.8311603650586702, |
|
"grad_norm": 6.378355026245117, |
|
"learning_rate": 8.311603650586703e-06, |
|
"loss": 0.4901, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.8393089960886571, |
|
"grad_norm": 6.240468502044678, |
|
"learning_rate": 8.393089960886572e-06, |
|
"loss": 0.5026, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.847457627118644, |
|
"grad_norm": 8.406333923339844, |
|
"learning_rate": 8.47457627118644e-06, |
|
"loss": 0.5139, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.855606258148631, |
|
"grad_norm": 9.096386909484863, |
|
"learning_rate": 8.556062581486311e-06, |
|
"loss": 0.4982, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.863754889178618, |
|
"grad_norm": 8.424356460571289, |
|
"learning_rate": 8.63754889178618e-06, |
|
"loss": 0.5216, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.871903520208605, |
|
"grad_norm": 5.484040260314941, |
|
"learning_rate": 8.71903520208605e-06, |
|
"loss": 0.4975, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.8800521512385919, |
|
"grad_norm": 6.343160152435303, |
|
"learning_rate": 8.80052151238592e-06, |
|
"loss": 0.5016, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.8882007822685789, |
|
"grad_norm": 4.8916497230529785, |
|
"learning_rate": 8.88200782268579e-06, |
|
"loss": 0.5241, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.8963494132985659, |
|
"grad_norm": 11.608317375183105, |
|
"learning_rate": 8.96349413298566e-06, |
|
"loss": 0.5009, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9044980443285529, |
|
"grad_norm": 6.503841876983643, |
|
"learning_rate": 9.044980443285529e-06, |
|
"loss": 0.491, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9126466753585397, |
|
"grad_norm": 5.118412494659424, |
|
"learning_rate": 9.126466753585398e-06, |
|
"loss": 0.5199, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.9207953063885267, |
|
"grad_norm": 4.837841510772705, |
|
"learning_rate": 9.207953063885268e-06, |
|
"loss": 0.5021, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9289439374185137, |
|
"grad_norm": 6.082594394683838, |
|
"learning_rate": 9.289439374185137e-06, |
|
"loss": 0.4881, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.9370925684485006, |
|
"grad_norm": 8.028156280517578, |
|
"learning_rate": 9.370925684485008e-06, |
|
"loss": 0.4741, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.9452411994784876, |
|
"grad_norm": 7.433921813964844, |
|
"learning_rate": 9.452411994784876e-06, |
|
"loss": 0.5031, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.9533898305084746, |
|
"grad_norm": 6.140247821807861, |
|
"learning_rate": 9.533898305084747e-06, |
|
"loss": 0.4971, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 9.514617919921875, |
|
"learning_rate": 9.615384615384616e-06, |
|
"loss": 0.4782, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.9696870925684485, |
|
"grad_norm": 6.993307590484619, |
|
"learning_rate": 9.696870925684486e-06, |
|
"loss": 0.481, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.9778357235984355, |
|
"grad_norm": 5.9556989669799805, |
|
"learning_rate": 9.778357235984357e-06, |
|
"loss": 0.4979, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.9859843546284224, |
|
"grad_norm": 9.89676570892334, |
|
"learning_rate": 9.859843546284224e-06, |
|
"loss": 0.489, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.9941329856584094, |
|
"grad_norm": 6.464805603027344, |
|
"learning_rate": 9.941329856584094e-06, |
|
"loss": 0.5016, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.8433734939759037, |
|
"eval_loss": 0.42833781242370605, |
|
"eval_runtime": 6.2968, |
|
"eval_samples_per_second": 395.439, |
|
"eval_steps_per_second": 49.549, |
|
"step": 12272 |
|
}, |
|
{ |
|
"epoch": 1.0022816166883963, |
|
"grad_norm": 5.668147563934326, |
|
"learning_rate": 9.997490221642765e-06, |
|
"loss": 0.4942, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.0104302477183833, |
|
"grad_norm": 6.818889141082764, |
|
"learning_rate": 9.988526727509779e-06, |
|
"loss": 0.4668, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.0185788787483703, |
|
"grad_norm": 8.424429893493652, |
|
"learning_rate": 9.979563233376793e-06, |
|
"loss": 0.4502, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0267275097783573, |
|
"grad_norm": 6.944746971130371, |
|
"learning_rate": 9.970599739243808e-06, |
|
"loss": 0.4248, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.0348761408083442, |
|
"grad_norm": 7.109178066253662, |
|
"learning_rate": 9.961636245110821e-06, |
|
"loss": 0.4612, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.0430247718383312, |
|
"grad_norm": 7.571024417877197, |
|
"learning_rate": 9.952672750977836e-06, |
|
"loss": 0.4519, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.0511734028683182, |
|
"grad_norm": 5.856245517730713, |
|
"learning_rate": 9.943709256844851e-06, |
|
"loss": 0.4469, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.0593220338983051, |
|
"grad_norm": 7.169976234436035, |
|
"learning_rate": 9.934745762711866e-06, |
|
"loss": 0.4542, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.0674706649282921, |
|
"grad_norm": 7.9827351570129395, |
|
"learning_rate": 9.92578226857888e-06, |
|
"loss": 0.4486, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.075619295958279, |
|
"grad_norm": 5.570741176605225, |
|
"learning_rate": 9.916818774445894e-06, |
|
"loss": 0.4432, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.083767926988266, |
|
"grad_norm": 5.507940769195557, |
|
"learning_rate": 9.907855280312908e-06, |
|
"loss": 0.4817, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.0919165580182528, |
|
"grad_norm": 4.281393527984619, |
|
"learning_rate": 9.898891786179923e-06, |
|
"loss": 0.444, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.1000651890482398, |
|
"grad_norm": 8.615004539489746, |
|
"learning_rate": 9.889928292046936e-06, |
|
"loss": 0.4556, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.1082138200782268, |
|
"grad_norm": 6.27708625793457, |
|
"learning_rate": 9.880964797913951e-06, |
|
"loss": 0.4776, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.1163624511082137, |
|
"grad_norm": 7.115694999694824, |
|
"learning_rate": 9.872001303780966e-06, |
|
"loss": 0.4491, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.1245110821382007, |
|
"grad_norm": 9.465932846069336, |
|
"learning_rate": 9.86303780964798e-06, |
|
"loss": 0.4242, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.1326597131681877, |
|
"grad_norm": 6.352951526641846, |
|
"learning_rate": 9.854074315514994e-06, |
|
"loss": 0.4598, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.1408083441981747, |
|
"grad_norm": 6.058132171630859, |
|
"learning_rate": 9.845110821382009e-06, |
|
"loss": 0.4477, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.1489569752281616, |
|
"grad_norm": 11.479142189025879, |
|
"learning_rate": 9.836147327249023e-06, |
|
"loss": 0.451, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.1571056062581486, |
|
"grad_norm": 6.964011192321777, |
|
"learning_rate": 9.827183833116037e-06, |
|
"loss": 0.4676, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.1652542372881356, |
|
"grad_norm": 5.015002250671387, |
|
"learning_rate": 9.818220338983051e-06, |
|
"loss": 0.4485, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.1734028683181226, |
|
"grad_norm": 9.673796653747559, |
|
"learning_rate": 9.809256844850066e-06, |
|
"loss": 0.457, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.1815514993481095, |
|
"grad_norm": 7.423110485076904, |
|
"learning_rate": 9.80029335071708e-06, |
|
"loss": 0.4452, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.1897001303780965, |
|
"grad_norm": 7.399952411651611, |
|
"learning_rate": 9.791329856584096e-06, |
|
"loss": 0.445, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.1978487614080835, |
|
"grad_norm": 7.837041854858398, |
|
"learning_rate": 9.782366362451109e-06, |
|
"loss": 0.4348, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.2059973924380705, |
|
"grad_norm": 5.745814800262451, |
|
"learning_rate": 9.773402868318124e-06, |
|
"loss": 0.4383, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.2141460234680574, |
|
"grad_norm": 5.527169227600098, |
|
"learning_rate": 9.764439374185138e-06, |
|
"loss": 0.4304, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.2222946544980444, |
|
"grad_norm": 5.521317005157471, |
|
"learning_rate": 9.755475880052152e-06, |
|
"loss": 0.4348, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.2304432855280312, |
|
"grad_norm": 6.435342788696289, |
|
"learning_rate": 9.746512385919166e-06, |
|
"loss": 0.4407, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.2385919165580184, |
|
"grad_norm": 7.2671637535095215, |
|
"learning_rate": 9.737548891786181e-06, |
|
"loss": 0.4543, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.2467405475880051, |
|
"grad_norm": 7.949999809265137, |
|
"learning_rate": 9.728585397653194e-06, |
|
"loss": 0.4422, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.254889178617992, |
|
"grad_norm": 6.410602569580078, |
|
"learning_rate": 9.719621903520209e-06, |
|
"loss": 0.4526, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.263037809647979, |
|
"grad_norm": 5.39459753036499, |
|
"learning_rate": 9.710658409387224e-06, |
|
"loss": 0.4536, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.271186440677966, |
|
"grad_norm": 6.411767482757568, |
|
"learning_rate": 9.701694915254239e-06, |
|
"loss": 0.4434, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.279335071707953, |
|
"grad_norm": 4.861929416656494, |
|
"learning_rate": 9.692731421121252e-06, |
|
"loss": 0.4281, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.28748370273794, |
|
"grad_norm": 5.593100547790527, |
|
"learning_rate": 9.683767926988267e-06, |
|
"loss": 0.4557, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.295632333767927, |
|
"grad_norm": 8.539257049560547, |
|
"learning_rate": 9.674804432855281e-06, |
|
"loss": 0.4487, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.303780964797914, |
|
"grad_norm": 6.900200366973877, |
|
"learning_rate": 9.665840938722295e-06, |
|
"loss": 0.4285, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.311929595827901, |
|
"grad_norm": 5.426370143890381, |
|
"learning_rate": 9.65687744458931e-06, |
|
"loss": 0.434, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.320078226857888, |
|
"grad_norm": 8.011603355407715, |
|
"learning_rate": 9.647913950456324e-06, |
|
"loss": 0.429, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.3282268578878749, |
|
"grad_norm": 5.003970146179199, |
|
"learning_rate": 9.638950456323339e-06, |
|
"loss": 0.4046, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.3363754889178618, |
|
"grad_norm": 6.891529560089111, |
|
"learning_rate": 9.629986962190354e-06, |
|
"loss": 0.439, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.3445241199478488, |
|
"grad_norm": 7.005788803100586, |
|
"learning_rate": 9.621023468057367e-06, |
|
"loss": 0.4422, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.3526727509778358, |
|
"grad_norm": 9.583046913146973, |
|
"learning_rate": 9.612059973924382e-06, |
|
"loss": 0.4351, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.3608213820078228, |
|
"grad_norm": 6.386882305145264, |
|
"learning_rate": 9.603096479791396e-06, |
|
"loss": 0.4435, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.3689700130378095, |
|
"grad_norm": 6.294077396392822, |
|
"learning_rate": 9.59413298565841e-06, |
|
"loss": 0.4404, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.3771186440677967, |
|
"grad_norm": 9.978828430175781, |
|
"learning_rate": 9.585169491525424e-06, |
|
"loss": 0.4334, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.3852672750977835, |
|
"grad_norm": 9.329121589660645, |
|
"learning_rate": 9.57620599739244e-06, |
|
"loss": 0.4593, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.3934159061277707, |
|
"grad_norm": 6.96733283996582, |
|
"learning_rate": 9.567242503259454e-06, |
|
"loss": 0.4398, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.4015645371577574, |
|
"grad_norm": 6.355087757110596, |
|
"learning_rate": 9.558279009126467e-06, |
|
"loss": 0.4198, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.4097131681877444, |
|
"grad_norm": 4.224381446838379, |
|
"learning_rate": 9.549315514993482e-06, |
|
"loss": 0.4598, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.4178617992177314, |
|
"grad_norm": 5.903212070465088, |
|
"learning_rate": 9.540352020860497e-06, |
|
"loss": 0.4474, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.4260104302477183, |
|
"grad_norm": 7.5130438804626465, |
|
"learning_rate": 9.53138852672751e-06, |
|
"loss": 0.4576, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.4341590612777053, |
|
"grad_norm": 7.3006086349487305, |
|
"learning_rate": 9.522425032594525e-06, |
|
"loss": 0.4133, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.4423076923076923, |
|
"grad_norm": 4.964842796325684, |
|
"learning_rate": 9.51346153846154e-06, |
|
"loss": 0.4291, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.4504563233376793, |
|
"grad_norm": 5.327952861785889, |
|
"learning_rate": 9.504498044328554e-06, |
|
"loss": 0.4257, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.4586049543676662, |
|
"grad_norm": 8.153892517089844, |
|
"learning_rate": 9.495534550195569e-06, |
|
"loss": 0.4369, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.4667535853976532, |
|
"grad_norm": 6.842040061950684, |
|
"learning_rate": 9.486571056062582e-06, |
|
"loss": 0.434, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.4749022164276402, |
|
"grad_norm": 10.316542625427246, |
|
"learning_rate": 9.477607561929597e-06, |
|
"loss": 0.4263, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.4830508474576272, |
|
"grad_norm": 7.404006481170654, |
|
"learning_rate": 9.468644067796612e-06, |
|
"loss": 0.4419, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.4911994784876141, |
|
"grad_norm": 7.187209129333496, |
|
"learning_rate": 9.459680573663625e-06, |
|
"loss": 0.4371, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.4993481095176011, |
|
"grad_norm": 6.297025203704834, |
|
"learning_rate": 9.45071707953064e-06, |
|
"loss": 0.4355, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.5074967405475879, |
|
"grad_norm": 5.6740875244140625, |
|
"learning_rate": 9.441753585397653e-06, |
|
"loss": 0.4562, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.515645371577575, |
|
"grad_norm": 6.485991954803467, |
|
"learning_rate": 9.43279009126467e-06, |
|
"loss": 0.4446, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.5237940026075618, |
|
"grad_norm": 6.18002986907959, |
|
"learning_rate": 9.423826597131682e-06, |
|
"loss": 0.4388, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.531942633637549, |
|
"grad_norm": 7.745771884918213, |
|
"learning_rate": 9.414863102998697e-06, |
|
"loss": 0.4376, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.5400912646675358, |
|
"grad_norm": 10.22498607635498, |
|
"learning_rate": 9.405899608865712e-06, |
|
"loss": 0.4334, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.548239895697523, |
|
"grad_norm": 8.568212509155273, |
|
"learning_rate": 9.396936114732725e-06, |
|
"loss": 0.4492, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.5563885267275097, |
|
"grad_norm": 7.228001117706299, |
|
"learning_rate": 9.38797262059974e-06, |
|
"loss": 0.4232, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.5645371577574967, |
|
"grad_norm": 8.333541870117188, |
|
"learning_rate": 9.379009126466755e-06, |
|
"loss": 0.4303, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.5726857887874837, |
|
"grad_norm": 7.778701305389404, |
|
"learning_rate": 9.370045632333768e-06, |
|
"loss": 0.4518, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.5808344198174706, |
|
"grad_norm": 6.472428798675537, |
|
"learning_rate": 9.361082138200784e-06, |
|
"loss": 0.4354, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.5889830508474576, |
|
"grad_norm": 8.317501068115234, |
|
"learning_rate": 9.352118644067797e-06, |
|
"loss": 0.4554, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.5971316818774446, |
|
"grad_norm": 8.868863105773926, |
|
"learning_rate": 9.343155149934812e-06, |
|
"loss": 0.435, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.6052803129074316, |
|
"grad_norm": 6.662153244018555, |
|
"learning_rate": 9.334191655801827e-06, |
|
"loss": 0.4498, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.6134289439374185, |
|
"grad_norm": 4.642838001251221, |
|
"learning_rate": 9.32522816166884e-06, |
|
"loss": 0.4496, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.6215775749674055, |
|
"grad_norm": 6.16021203994751, |
|
"learning_rate": 9.316264667535855e-06, |
|
"loss": 0.4477, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.6297262059973925, |
|
"grad_norm": 6.225278377532959, |
|
"learning_rate": 9.307301173402868e-06, |
|
"loss": 0.4356, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.6378748370273795, |
|
"grad_norm": 5.955763339996338, |
|
"learning_rate": 9.298337679269883e-06, |
|
"loss": 0.439, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.6460234680573662, |
|
"grad_norm": 6.927220344543457, |
|
"learning_rate": 9.289374185136898e-06, |
|
"loss": 0.4371, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.6541720990873534, |
|
"grad_norm": 7.151987075805664, |
|
"learning_rate": 9.280410691003912e-06, |
|
"loss": 0.4334, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.6623207301173402, |
|
"grad_norm": 7.74386739730835, |
|
"learning_rate": 9.271447196870927e-06, |
|
"loss": 0.425, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.6704693611473274, |
|
"grad_norm": 5.687450408935547, |
|
"learning_rate": 9.26248370273794e-06, |
|
"loss": 0.412, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.6786179921773141, |
|
"grad_norm": 6.604902744293213, |
|
"learning_rate": 9.253520208604955e-06, |
|
"loss": 0.4534, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.6867666232073013, |
|
"grad_norm": 6.3008623123168945, |
|
"learning_rate": 9.24455671447197e-06, |
|
"loss": 0.443, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.694915254237288, |
|
"grad_norm": 4.483551502227783, |
|
"learning_rate": 9.235593220338983e-06, |
|
"loss": 0.4397, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.7030638852672753, |
|
"grad_norm": 6.908230304718018, |
|
"learning_rate": 9.226629726205998e-06, |
|
"loss": 0.4533, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.711212516297262, |
|
"grad_norm": 6.523016929626465, |
|
"learning_rate": 9.217666232073013e-06, |
|
"loss": 0.4352, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.719361147327249, |
|
"grad_norm": 5.211183547973633, |
|
"learning_rate": 9.208702737940028e-06, |
|
"loss": 0.4485, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.727509778357236, |
|
"grad_norm": 4.074219703674316, |
|
"learning_rate": 9.199739243807042e-06, |
|
"loss": 0.4536, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.735658409387223, |
|
"grad_norm": 5.244410037994385, |
|
"learning_rate": 9.190775749674055e-06, |
|
"loss": 0.4311, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.74380704041721, |
|
"grad_norm": 5.48605489730835, |
|
"learning_rate": 9.18181225554107e-06, |
|
"loss": 0.4464, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.7519556714471969, |
|
"grad_norm": 6.30487060546875, |
|
"learning_rate": 9.172848761408083e-06, |
|
"loss": 0.4119, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.7601043024771839, |
|
"grad_norm": 7.300415992736816, |
|
"learning_rate": 9.163885267275098e-06, |
|
"loss": 0.4118, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.7682529335071708, |
|
"grad_norm": 7.198819160461426, |
|
"learning_rate": 9.154921773142113e-06, |
|
"loss": 0.425, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.7764015645371578, |
|
"grad_norm": 7.625805854797363, |
|
"learning_rate": 9.145958279009126e-06, |
|
"loss": 0.454, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.7845501955671446, |
|
"grad_norm": 5.602317810058594, |
|
"learning_rate": 9.136994784876143e-06, |
|
"loss": 0.4336, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.7926988265971318, |
|
"grad_norm": 4.4945478439331055, |
|
"learning_rate": 9.128031290743156e-06, |
|
"loss": 0.4295, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.8008474576271185, |
|
"grad_norm": 8.505398750305176, |
|
"learning_rate": 9.11906779661017e-06, |
|
"loss": 0.4481, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.8089960886571057, |
|
"grad_norm": 6.470762729644775, |
|
"learning_rate": 9.110104302477185e-06, |
|
"loss": 0.4142, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.8171447196870925, |
|
"grad_norm": 9.113410949707031, |
|
"learning_rate": 9.101140808344198e-06, |
|
"loss": 0.4169, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.8252933507170797, |
|
"grad_norm": 5.110347270965576, |
|
"learning_rate": 9.092177314211213e-06, |
|
"loss": 0.4283, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.8334419817470664, |
|
"grad_norm": 7.161343574523926, |
|
"learning_rate": 9.083213820078228e-06, |
|
"loss": 0.4499, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.8415906127770536, |
|
"grad_norm": 7.881711959838867, |
|
"learning_rate": 9.074250325945243e-06, |
|
"loss": 0.4069, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.8497392438070404, |
|
"grad_norm": 6.091899871826172, |
|
"learning_rate": 9.065286831812258e-06, |
|
"loss": 0.4315, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.8578878748370273, |
|
"grad_norm": 6.5380167961120605, |
|
"learning_rate": 9.05632333767927e-06, |
|
"loss": 0.431, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.8660365058670143, |
|
"grad_norm": 6.479442119598389, |
|
"learning_rate": 9.047359843546286e-06, |
|
"loss": 0.4299, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.8741851368970013, |
|
"grad_norm": 8.060819625854492, |
|
"learning_rate": 9.038396349413299e-06, |
|
"loss": 0.4204, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.8823337679269883, |
|
"grad_norm": 4.994590759277344, |
|
"learning_rate": 9.029432855280313e-06, |
|
"loss": 0.44, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.8904823989569752, |
|
"grad_norm": 6.113391876220703, |
|
"learning_rate": 9.020469361147328e-06, |
|
"loss": 0.448, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.8986310299869622, |
|
"grad_norm": 5.091489315032959, |
|
"learning_rate": 9.011505867014341e-06, |
|
"loss": 0.4615, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.9067796610169492, |
|
"grad_norm": 5.466245651245117, |
|
"learning_rate": 9.002542372881356e-06, |
|
"loss": 0.4456, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.9149282920469362, |
|
"grad_norm": 7.680745601654053, |
|
"learning_rate": 8.993578878748371e-06, |
|
"loss": 0.4207, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 6.748586177825928, |
|
"learning_rate": 8.984615384615386e-06, |
|
"loss": 0.4234, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.93122555410691, |
|
"grad_norm": 7.408194541931152, |
|
"learning_rate": 8.9756518904824e-06, |
|
"loss": 0.4345, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.9393741851368969, |
|
"grad_norm": 9.398754119873047, |
|
"learning_rate": 8.966688396349414e-06, |
|
"loss": 0.4267, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.947522816166884, |
|
"grad_norm": 8.72804069519043, |
|
"learning_rate": 8.957724902216428e-06, |
|
"loss": 0.4222, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.9556714471968708, |
|
"grad_norm": 3.8285768032073975, |
|
"learning_rate": 8.948761408083443e-06, |
|
"loss": 0.4295, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.963820078226858, |
|
"grad_norm": 7.150258541107178, |
|
"learning_rate": 8.939797913950456e-06, |
|
"loss": 0.4308, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.9719687092568448, |
|
"grad_norm": 5.6762495040893555, |
|
"learning_rate": 8.930834419817473e-06, |
|
"loss": 0.4315, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.980117340286832, |
|
"grad_norm": 6.01600980758667, |
|
"learning_rate": 8.921870925684486e-06, |
|
"loss": 0.4479, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.9882659713168187, |
|
"grad_norm": 5.277040958404541, |
|
"learning_rate": 8.9129074315515e-06, |
|
"loss": 0.4357, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.996414602346806, |
|
"grad_norm": 7.153831958770752, |
|
"learning_rate": 8.903943937418514e-06, |
|
"loss": 0.427, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8325301204819278, |
|
"eval_loss": 0.4437418580055237, |
|
"eval_runtime": 5.5267, |
|
"eval_samples_per_second": 450.541, |
|
"eval_steps_per_second": 56.453, |
|
"step": 24544 |
|
}, |
|
{ |
|
"epoch": 2.0045632333767927, |
|
"grad_norm": 8.098788261413574, |
|
"learning_rate": 8.894980443285529e-06, |
|
"loss": 0.3801, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 2.01271186440678, |
|
"grad_norm": 6.156032085418701, |
|
"learning_rate": 8.886016949152544e-06, |
|
"loss": 0.3587, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 2.0208604954367666, |
|
"grad_norm": 3.556076765060425, |
|
"learning_rate": 8.877053455019557e-06, |
|
"loss": 0.3615, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 2.029009126466754, |
|
"grad_norm": 7.757440090179443, |
|
"learning_rate": 8.868089960886571e-06, |
|
"loss": 0.3457, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 2.0371577574967406, |
|
"grad_norm": 3.515918493270874, |
|
"learning_rate": 8.859126466753586e-06, |
|
"loss": 0.363, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.0453063885267273, |
|
"grad_norm": 4.51107931137085, |
|
"learning_rate": 8.850162972620601e-06, |
|
"loss": 0.3352, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 2.0534550195567145, |
|
"grad_norm": 5.263424396514893, |
|
"learning_rate": 8.841199478487616e-06, |
|
"loss": 0.3589, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 2.0616036505867013, |
|
"grad_norm": 8.238348960876465, |
|
"learning_rate": 8.832235984354629e-06, |
|
"loss": 0.3312, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 2.0697522816166884, |
|
"grad_norm": 6.862210750579834, |
|
"learning_rate": 8.823272490221644e-06, |
|
"loss": 0.3555, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 2.077900912646675, |
|
"grad_norm": 11.949597358703613, |
|
"learning_rate": 8.814308996088659e-06, |
|
"loss": 0.3522, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.0860495436766624, |
|
"grad_norm": 6.46090030670166, |
|
"learning_rate": 8.805345501955672e-06, |
|
"loss": 0.3373, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 2.094198174706649, |
|
"grad_norm": 6.2601637840271, |
|
"learning_rate": 8.796382007822686e-06, |
|
"loss": 0.3587, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 2.1023468057366363, |
|
"grad_norm": 6.387700080871582, |
|
"learning_rate": 8.787418513689701e-06, |
|
"loss": 0.3746, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 2.110495436766623, |
|
"grad_norm": 8.692789077758789, |
|
"learning_rate": 8.778455019556714e-06, |
|
"loss": 0.3499, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 2.1186440677966103, |
|
"grad_norm": 4.821929454803467, |
|
"learning_rate": 8.76949152542373e-06, |
|
"loss": 0.3605, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.126792698826597, |
|
"grad_norm": 8.666610717773438, |
|
"learning_rate": 8.760528031290744e-06, |
|
"loss": 0.3462, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 2.1349413298565842, |
|
"grad_norm": 7.047764301300049, |
|
"learning_rate": 8.751564537157759e-06, |
|
"loss": 0.3573, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 2.143089960886571, |
|
"grad_norm": 6.5750837326049805, |
|
"learning_rate": 8.742601043024772e-06, |
|
"loss": 0.3625, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 2.151238591916558, |
|
"grad_norm": 8.66965389251709, |
|
"learning_rate": 8.733637548891787e-06, |
|
"loss": 0.3656, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 2.159387222946545, |
|
"grad_norm": 8.956038475036621, |
|
"learning_rate": 8.724674054758802e-06, |
|
"loss": 0.3553, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.167535853976532, |
|
"grad_norm": 9.39869499206543, |
|
"learning_rate": 8.715710560625815e-06, |
|
"loss": 0.3388, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 2.175684485006519, |
|
"grad_norm": 7.84521484375, |
|
"learning_rate": 8.706747066492831e-06, |
|
"loss": 0.3521, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 2.1838331160365057, |
|
"grad_norm": 8.594714164733887, |
|
"learning_rate": 8.697783572359844e-06, |
|
"loss": 0.3451, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 2.191981747066493, |
|
"grad_norm": 5.132546424865723, |
|
"learning_rate": 8.688820078226859e-06, |
|
"loss": 0.3568, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 2.2001303780964796, |
|
"grad_norm": 6.885127067565918, |
|
"learning_rate": 8.679856584093874e-06, |
|
"loss": 0.3603, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.208279009126467, |
|
"grad_norm": 3.2750186920166016, |
|
"learning_rate": 8.670893089960887e-06, |
|
"loss": 0.3548, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 2.2164276401564535, |
|
"grad_norm": 6.619633674621582, |
|
"learning_rate": 8.661929595827902e-06, |
|
"loss": 0.3442, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 2.2245762711864407, |
|
"grad_norm": 5.827129364013672, |
|
"learning_rate": 8.652966101694917e-06, |
|
"loss": 0.3641, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 2.2327249022164275, |
|
"grad_norm": 5.905134677886963, |
|
"learning_rate": 8.644002607561931e-06, |
|
"loss": 0.3738, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 2.2408735332464147, |
|
"grad_norm": 7.653305530548096, |
|
"learning_rate": 8.635039113428944e-06, |
|
"loss": 0.3315, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.2490221642764014, |
|
"grad_norm": 4.101062297821045, |
|
"learning_rate": 8.62607561929596e-06, |
|
"loss": 0.3647, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 2.2571707953063886, |
|
"grad_norm": 5.449923992156982, |
|
"learning_rate": 8.617112125162974e-06, |
|
"loss": 0.3537, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 2.2653194263363754, |
|
"grad_norm": 7.924961566925049, |
|
"learning_rate": 8.608148631029987e-06, |
|
"loss": 0.3494, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 2.2734680573663626, |
|
"grad_norm": 7.311993598937988, |
|
"learning_rate": 8.599185136897002e-06, |
|
"loss": 0.3791, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 2.2816166883963493, |
|
"grad_norm": 9.550232887268066, |
|
"learning_rate": 8.590221642764017e-06, |
|
"loss": 0.3758, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.2897653194263365, |
|
"grad_norm": 8.781049728393555, |
|
"learning_rate": 8.58125814863103e-06, |
|
"loss": 0.3508, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 2.2979139504563233, |
|
"grad_norm": 12.556912422180176, |
|
"learning_rate": 8.572294654498045e-06, |
|
"loss": 0.3781, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 2.3060625814863105, |
|
"grad_norm": 7.472509860992432, |
|
"learning_rate": 8.56333116036506e-06, |
|
"loss": 0.3532, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 2.3142112125162972, |
|
"grad_norm": 7.974531173706055, |
|
"learning_rate": 8.554367666232074e-06, |
|
"loss": 0.3736, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 2.322359843546284, |
|
"grad_norm": 3.605121374130249, |
|
"learning_rate": 8.545404172099089e-06, |
|
"loss": 0.3735, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.330508474576271, |
|
"grad_norm": 6.731493949890137, |
|
"learning_rate": 8.536440677966102e-06, |
|
"loss": 0.3744, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 2.3386571056062584, |
|
"grad_norm": 4.8290839195251465, |
|
"learning_rate": 8.527477183833117e-06, |
|
"loss": 0.3313, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 2.346805736636245, |
|
"grad_norm": 7.126736640930176, |
|
"learning_rate": 8.518513689700132e-06, |
|
"loss": 0.3748, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 2.354954367666232, |
|
"grad_norm": 8.92186450958252, |
|
"learning_rate": 8.509550195567145e-06, |
|
"loss": 0.3766, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 2.363102998696219, |
|
"grad_norm": 4.0069146156311035, |
|
"learning_rate": 8.50058670143416e-06, |
|
"loss": 0.3667, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.371251629726206, |
|
"grad_norm": 4.4215168952941895, |
|
"learning_rate": 8.491623207301173e-06, |
|
"loss": 0.366, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 2.379400260756193, |
|
"grad_norm": 6.785125255584717, |
|
"learning_rate": 8.48265971316819e-06, |
|
"loss": 0.3718, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 2.38754889178618, |
|
"grad_norm": 5.78675651550293, |
|
"learning_rate": 8.473696219035202e-06, |
|
"loss": 0.3529, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 2.395697522816167, |
|
"grad_norm": 8.054706573486328, |
|
"learning_rate": 8.464732724902217e-06, |
|
"loss": 0.3755, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 2.4038461538461537, |
|
"grad_norm": 6.803676605224609, |
|
"learning_rate": 8.455769230769232e-06, |
|
"loss": 0.3157, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 2.411994784876141, |
|
"grad_norm": 8.307502746582031, |
|
"learning_rate": 8.446805736636245e-06, |
|
"loss": 0.3489, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 2.4201434159061277, |
|
"grad_norm": 8.928215026855469, |
|
"learning_rate": 8.43784224250326e-06, |
|
"loss": 0.3793, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 2.428292046936115, |
|
"grad_norm": 6.0352396965026855, |
|
"learning_rate": 8.428878748370275e-06, |
|
"loss": 0.3612, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 2.4364406779661016, |
|
"grad_norm": 8.676813125610352, |
|
"learning_rate": 8.41991525423729e-06, |
|
"loss": 0.3548, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 2.444589308996089, |
|
"grad_norm": 8.409051895141602, |
|
"learning_rate": 8.410951760104304e-06, |
|
"loss": 0.3733, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 2.4527379400260756, |
|
"grad_norm": 12.27320671081543, |
|
"learning_rate": 8.401988265971318e-06, |
|
"loss": 0.3656, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 2.4608865710560623, |
|
"grad_norm": 5.953177452087402, |
|
"learning_rate": 8.393024771838332e-06, |
|
"loss": 0.3513, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 2.4690352020860495, |
|
"grad_norm": 3.4694623947143555, |
|
"learning_rate": 8.384061277705347e-06, |
|
"loss": 0.3439, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 2.4771838331160367, |
|
"grad_norm": 7.406796455383301, |
|
"learning_rate": 8.37509778357236e-06, |
|
"loss": 0.3431, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 2.4853324641460235, |
|
"grad_norm": 11.077946662902832, |
|
"learning_rate": 8.366134289439375e-06, |
|
"loss": 0.3387, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 2.4934810951760102, |
|
"grad_norm": 5.576053142547607, |
|
"learning_rate": 8.357170795306388e-06, |
|
"loss": 0.3635, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 2.5016297262059974, |
|
"grad_norm": 5.933220863342285, |
|
"learning_rate": 8.348207301173403e-06, |
|
"loss": 0.3555, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 2.509778357235984, |
|
"grad_norm": 5.958785533905029, |
|
"learning_rate": 8.339243807040418e-06, |
|
"loss": 0.3586, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 2.5179269882659714, |
|
"grad_norm": 6.1259446144104, |
|
"learning_rate": 8.330280312907433e-06, |
|
"loss": 0.3526, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 2.526075619295958, |
|
"grad_norm": 4.747119426727295, |
|
"learning_rate": 8.321316818774447e-06, |
|
"loss": 0.3525, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.5342242503259453, |
|
"grad_norm": 5.299834251403809, |
|
"learning_rate": 8.31235332464146e-06, |
|
"loss": 0.3445, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 2.542372881355932, |
|
"grad_norm": 8.24792194366455, |
|
"learning_rate": 8.303389830508475e-06, |
|
"loss": 0.3562, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 2.5505215123859193, |
|
"grad_norm": 4.663332462310791, |
|
"learning_rate": 8.29442633637549e-06, |
|
"loss": 0.3497, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 2.558670143415906, |
|
"grad_norm": 6.58770227432251, |
|
"learning_rate": 8.285462842242503e-06, |
|
"loss": 0.3528, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 2.5668187744458932, |
|
"grad_norm": 6.949512481689453, |
|
"learning_rate": 8.27649934810952e-06, |
|
"loss": 0.3743, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 2.57496740547588, |
|
"grad_norm": 7.117619514465332, |
|
"learning_rate": 8.267535853976533e-06, |
|
"loss": 0.369, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 2.583116036505867, |
|
"grad_norm": 5.4686689376831055, |
|
"learning_rate": 8.258572359843548e-06, |
|
"loss": 0.3577, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 2.591264667535854, |
|
"grad_norm": 7.302579879760742, |
|
"learning_rate": 8.249608865710562e-06, |
|
"loss": 0.3454, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 2.5994132985658407, |
|
"grad_norm": 3.9816572666168213, |
|
"learning_rate": 8.240645371577576e-06, |
|
"loss": 0.3506, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 2.607561929595828, |
|
"grad_norm": 11.21993350982666, |
|
"learning_rate": 8.23168187744459e-06, |
|
"loss": 0.3461, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.615710560625815, |
|
"grad_norm": 6.287281513214111, |
|
"learning_rate": 8.222718383311603e-06, |
|
"loss": 0.336, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 2.623859191655802, |
|
"grad_norm": 6.645712852478027, |
|
"learning_rate": 8.213754889178618e-06, |
|
"loss": 0.3675, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 2.6320078226857886, |
|
"grad_norm": 4.677251815795898, |
|
"learning_rate": 8.204791395045633e-06, |
|
"loss": 0.3517, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 2.640156453715776, |
|
"grad_norm": 5.261903285980225, |
|
"learning_rate": 8.195827900912648e-06, |
|
"loss": 0.3595, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 2.648305084745763, |
|
"grad_norm": 6.755844593048096, |
|
"learning_rate": 8.186864406779663e-06, |
|
"loss": 0.3592, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.6564537157757497, |
|
"grad_norm": 9.253024101257324, |
|
"learning_rate": 8.177900912646676e-06, |
|
"loss": 0.3655, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 2.6646023468057365, |
|
"grad_norm": 7.2145490646362305, |
|
"learning_rate": 8.16893741851369e-06, |
|
"loss": 0.3573, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 2.6727509778357237, |
|
"grad_norm": 5.882996082305908, |
|
"learning_rate": 8.159973924380705e-06, |
|
"loss": 0.3555, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 2.6808996088657104, |
|
"grad_norm": 13.761658668518066, |
|
"learning_rate": 8.151010430247718e-06, |
|
"loss": 0.3631, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 2.6890482398956976, |
|
"grad_norm": 6.742971420288086, |
|
"learning_rate": 8.142046936114733e-06, |
|
"loss": 0.3714, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.6971968709256844, |
|
"grad_norm": 5.441837787628174, |
|
"learning_rate": 8.133083441981748e-06, |
|
"loss": 0.35, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 2.7053455019556716, |
|
"grad_norm": 8.888238906860352, |
|
"learning_rate": 8.124119947848761e-06, |
|
"loss": 0.349, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 2.7134941329856583, |
|
"grad_norm": 6.0085368156433105, |
|
"learning_rate": 8.115156453715778e-06, |
|
"loss": 0.3555, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 2.7216427640156455, |
|
"grad_norm": 8.16164493560791, |
|
"learning_rate": 8.10619295958279e-06, |
|
"loss": 0.361, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 2.7297913950456323, |
|
"grad_norm": 7.598118782043457, |
|
"learning_rate": 8.097229465449806e-06, |
|
"loss": 0.3655, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.737940026075619, |
|
"grad_norm": 7.139162540435791, |
|
"learning_rate": 8.088265971316819e-06, |
|
"loss": 0.3604, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 2.7460886571056062, |
|
"grad_norm": 6.311704158782959, |
|
"learning_rate": 8.079302477183834e-06, |
|
"loss": 0.354, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 2.7542372881355934, |
|
"grad_norm": 5.32002592086792, |
|
"learning_rate": 8.070338983050848e-06, |
|
"loss": 0.3655, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 2.76238591916558, |
|
"grad_norm": 4.203720569610596, |
|
"learning_rate": 8.061375488917861e-06, |
|
"loss": 0.3611, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 2.770534550195567, |
|
"grad_norm": 9.162140846252441, |
|
"learning_rate": 8.052411994784878e-06, |
|
"loss": 0.3633, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.778683181225554, |
|
"grad_norm": 5.322396755218506, |
|
"learning_rate": 8.043448500651891e-06, |
|
"loss": 0.3614, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 2.7868318122555413, |
|
"grad_norm": 8.66873550415039, |
|
"learning_rate": 8.034485006518906e-06, |
|
"loss": 0.3522, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 2.794980443285528, |
|
"grad_norm": 7.859837055206299, |
|
"learning_rate": 8.02552151238592e-06, |
|
"loss": 0.3562, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 2.803129074315515, |
|
"grad_norm": 6.7927398681640625, |
|
"learning_rate": 8.016558018252934e-06, |
|
"loss": 0.3662, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 2.811277705345502, |
|
"grad_norm": 5.269212245941162, |
|
"learning_rate": 8.007594524119949e-06, |
|
"loss": 0.3509, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.819426336375489, |
|
"grad_norm": 5.3910017013549805, |
|
"learning_rate": 7.998631029986963e-06, |
|
"loss": 0.3618, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 2.827574967405476, |
|
"grad_norm": 6.497340202331543, |
|
"learning_rate": 7.989667535853978e-06, |
|
"loss": 0.338, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 2.8357235984354627, |
|
"grad_norm": 9.944086074829102, |
|
"learning_rate": 7.980704041720991e-06, |
|
"loss": 0.3523, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.84387222946545, |
|
"grad_norm": 4.036046504974365, |
|
"learning_rate": 7.971740547588006e-06, |
|
"loss": 0.3458, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 2.8520208604954367, |
|
"grad_norm": 7.9690327644348145, |
|
"learning_rate": 7.962777053455021e-06, |
|
"loss": 0.3469, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.860169491525424, |
|
"grad_norm": 5.498697757720947, |
|
"learning_rate": 7.953813559322034e-06, |
|
"loss": 0.3703, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 2.8683181225554106, |
|
"grad_norm": 8.990461349487305, |
|
"learning_rate": 7.944850065189049e-06, |
|
"loss": 0.3463, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.8764667535853974, |
|
"grad_norm": 5.683210372924805, |
|
"learning_rate": 7.935886571056064e-06, |
|
"loss": 0.3285, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"grad_norm": 11.34899616241455, |
|
"learning_rate": 7.926923076923077e-06, |
|
"loss": 0.3638, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 2.8927640156453718, |
|
"grad_norm": 8.744894027709961, |
|
"learning_rate": 7.917959582790092e-06, |
|
"loss": 0.3575, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.9009126466753585, |
|
"grad_norm": 8.961909294128418, |
|
"learning_rate": 7.908996088657106e-06, |
|
"loss": 0.3367, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.9090612777053453, |
|
"grad_norm": 9.418771743774414, |
|
"learning_rate": 7.900032594524121e-06, |
|
"loss": 0.3575, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 2.9172099087353325, |
|
"grad_norm": 10.538002967834473, |
|
"learning_rate": 7.891069100391136e-06, |
|
"loss": 0.376, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 2.9253585397653197, |
|
"grad_norm": 4.822318077087402, |
|
"learning_rate": 7.882105606258149e-06, |
|
"loss": 0.3468, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 2.9335071707953064, |
|
"grad_norm": 9.379559516906738, |
|
"learning_rate": 7.873142112125164e-06, |
|
"loss": 0.3506, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.941655801825293, |
|
"grad_norm": 11.871332168579102, |
|
"learning_rate": 7.864178617992179e-06, |
|
"loss": 0.3494, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 2.9498044328552804, |
|
"grad_norm": 5.834349632263184, |
|
"learning_rate": 7.855215123859192e-06, |
|
"loss": 0.3951, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 2.957953063885267, |
|
"grad_norm": 9.025989532470703, |
|
"learning_rate": 7.846251629726208e-06, |
|
"loss": 0.3642, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 2.9661016949152543, |
|
"grad_norm": 7.520548343658447, |
|
"learning_rate": 7.837288135593221e-06, |
|
"loss": 0.3668, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.974250325945241, |
|
"grad_norm": 7.14879035949707, |
|
"learning_rate": 7.828324641460236e-06, |
|
"loss": 0.3696, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.9823989569752283, |
|
"grad_norm": 8.017966270446777, |
|
"learning_rate": 7.81936114732725e-06, |
|
"loss": 0.3637, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 2.990547588005215, |
|
"grad_norm": 7.470760345458984, |
|
"learning_rate": 7.810397653194264e-06, |
|
"loss": 0.3586, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 2.9986962190352022, |
|
"grad_norm": 4.955636501312256, |
|
"learning_rate": 7.801434159061279e-06, |
|
"loss": 0.3493, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8558232931726908, |
|
"eval_loss": 0.4229515790939331, |
|
"eval_runtime": 5.5172, |
|
"eval_samples_per_second": 451.314, |
|
"eval_steps_per_second": 56.55, |
|
"step": 36816 |
|
}, |
|
{ |
|
"epoch": 3.006844850065189, |
|
"grad_norm": 11.195860862731934, |
|
"learning_rate": 7.792470664928292e-06, |
|
"loss": 0.303, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 3.014993481095176, |
|
"grad_norm": 8.619274139404297, |
|
"learning_rate": 7.783507170795307e-06, |
|
"loss": 0.2999, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.023142112125163, |
|
"grad_norm": 7.2149200439453125, |
|
"learning_rate": 7.774543676662322e-06, |
|
"loss": 0.2997, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 3.03129074315515, |
|
"grad_norm": 10.171060562133789, |
|
"learning_rate": 7.765580182529336e-06, |
|
"loss": 0.302, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 3.039439374185137, |
|
"grad_norm": 10.720246315002441, |
|
"learning_rate": 7.756616688396351e-06, |
|
"loss": 0.2884, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 3.047588005215124, |
|
"grad_norm": 7.545303821563721, |
|
"learning_rate": 7.747653194263364e-06, |
|
"loss": 0.3076, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 3.055736636245111, |
|
"grad_norm": 7.973757266998291, |
|
"learning_rate": 7.73868970013038e-06, |
|
"loss": 0.2762, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.0638852672750976, |
|
"grad_norm": 7.613004207611084, |
|
"learning_rate": 7.729726205997394e-06, |
|
"loss": 0.285, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 3.0720338983050848, |
|
"grad_norm": 5.3108367919921875, |
|
"learning_rate": 7.720762711864407e-06, |
|
"loss": 0.2777, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 3.0801825293350715, |
|
"grad_norm": 4.986673355102539, |
|
"learning_rate": 7.711799217731422e-06, |
|
"loss": 0.2718, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 3.0883311603650587, |
|
"grad_norm": 8.627853393554688, |
|
"learning_rate": 7.702835723598437e-06, |
|
"loss": 0.3053, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 3.0964797913950455, |
|
"grad_norm": 5.650912284851074, |
|
"learning_rate": 7.69387222946545e-06, |
|
"loss": 0.2761, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.1046284224250327, |
|
"grad_norm": 8.650264739990234, |
|
"learning_rate": 7.684908735332465e-06, |
|
"loss": 0.3097, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 3.1127770534550194, |
|
"grad_norm": 3.7856078147888184, |
|
"learning_rate": 7.67594524119948e-06, |
|
"loss": 0.2875, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 3.1209256844850066, |
|
"grad_norm": 10.533018112182617, |
|
"learning_rate": 7.666981747066493e-06, |
|
"loss": 0.2876, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 3.1290743155149934, |
|
"grad_norm": 5.538301467895508, |
|
"learning_rate": 7.658018252933507e-06, |
|
"loss": 0.2934, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 3.1372229465449806, |
|
"grad_norm": 12.393535614013672, |
|
"learning_rate": 7.649054758800522e-06, |
|
"loss": 0.2992, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 3.1453715775749673, |
|
"grad_norm": 6.047006130218506, |
|
"learning_rate": 7.640091264667535e-06, |
|
"loss": 0.2723, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 3.1535202086049545, |
|
"grad_norm": 7.563205718994141, |
|
"learning_rate": 7.63112777053455e-06, |
|
"loss": 0.2927, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 3.1616688396349413, |
|
"grad_norm": 10.292147636413574, |
|
"learning_rate": 7.622164276401565e-06, |
|
"loss": 0.267, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 3.1698174706649285, |
|
"grad_norm": 8.638325691223145, |
|
"learning_rate": 7.613200782268578e-06, |
|
"loss": 0.292, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 3.1779661016949152, |
|
"grad_norm": 8.556358337402344, |
|
"learning_rate": 7.6042372881355944e-06, |
|
"loss": 0.2935, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 3.1861147327249024, |
|
"grad_norm": 9.609768867492676, |
|
"learning_rate": 7.5952737940026075e-06, |
|
"loss": 0.2804, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 3.194263363754889, |
|
"grad_norm": 7.0451507568359375, |
|
"learning_rate": 7.5863102998696215e-06, |
|
"loss": 0.315, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 3.2024119947848764, |
|
"grad_norm": 13.499476432800293, |
|
"learning_rate": 7.577346805736636e-06, |
|
"loss": 0.2866, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 3.210560625814863, |
|
"grad_norm": 12.766288757324219, |
|
"learning_rate": 7.568383311603649e-06, |
|
"loss": 0.2743, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 3.21870925684485, |
|
"grad_norm": 5.220301151275635, |
|
"learning_rate": 7.559419817470664e-06, |
|
"loss": 0.2879, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 3.226857887874837, |
|
"grad_norm": 5.3805060386657715, |
|
"learning_rate": 7.550456323337679e-06, |
|
"loss": 0.2884, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 3.235006518904824, |
|
"grad_norm": 5.386475086212158, |
|
"learning_rate": 7.541492829204694e-06, |
|
"loss": 0.2806, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 3.243155149934811, |
|
"grad_norm": 9.031757354736328, |
|
"learning_rate": 7.532529335071708e-06, |
|
"loss": 0.286, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 3.2513037809647978, |
|
"grad_norm": 6.262955665588379, |
|
"learning_rate": 7.523565840938723e-06, |
|
"loss": 0.2869, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 3.259452411994785, |
|
"grad_norm": 8.278800964355469, |
|
"learning_rate": 7.514602346805736e-06, |
|
"loss": 0.3113, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 3.2676010430247717, |
|
"grad_norm": 6.366271495819092, |
|
"learning_rate": 7.5056388526727505e-06, |
|
"loss": 0.2828, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 3.275749674054759, |
|
"grad_norm": 9.194992065429688, |
|
"learning_rate": 7.496675358539765e-06, |
|
"loss": 0.2878, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 3.2838983050847457, |
|
"grad_norm": 10.264578819274902, |
|
"learning_rate": 7.487711864406779e-06, |
|
"loss": 0.3111, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 3.292046936114733, |
|
"grad_norm": 10.604935646057129, |
|
"learning_rate": 7.478748370273795e-06, |
|
"loss": 0.2721, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 3.3001955671447196, |
|
"grad_norm": 7.705831050872803, |
|
"learning_rate": 7.469784876140808e-06, |
|
"loss": 0.3051, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 3.308344198174707, |
|
"grad_norm": 10.131879806518555, |
|
"learning_rate": 7.460821382007822e-06, |
|
"loss": 0.2912, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 3.3164928292046936, |
|
"grad_norm": 7.966042995452881, |
|
"learning_rate": 7.451857887874837e-06, |
|
"loss": 0.3067, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 3.3246414602346808, |
|
"grad_norm": 7.855635643005371, |
|
"learning_rate": 7.442894393741852e-06, |
|
"loss": 0.2991, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 3.3327900912646675, |
|
"grad_norm": 6.461732864379883, |
|
"learning_rate": 7.4339308996088655e-06, |
|
"loss": 0.2724, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 3.3409387222946547, |
|
"grad_norm": 7.149626731872559, |
|
"learning_rate": 7.4249674054758795e-06, |
|
"loss": 0.3029, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 3.3490873533246415, |
|
"grad_norm": 8.23928451538086, |
|
"learning_rate": 7.416003911342895e-06, |
|
"loss": 0.2957, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 3.3572359843546282, |
|
"grad_norm": 7.156587600708008, |
|
"learning_rate": 7.407040417209908e-06, |
|
"loss": 0.3053, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 3.3653846153846154, |
|
"grad_norm": 9.947956085205078, |
|
"learning_rate": 7.398076923076923e-06, |
|
"loss": 0.308, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 3.373533246414602, |
|
"grad_norm": 11.746365547180176, |
|
"learning_rate": 7.389113428943936e-06, |
|
"loss": 0.2818, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 3.3816818774445894, |
|
"grad_norm": 11.161590576171875, |
|
"learning_rate": 7.380149934810952e-06, |
|
"loss": 0.2969, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 3.389830508474576, |
|
"grad_norm": 7.389200687408447, |
|
"learning_rate": 7.371186440677966e-06, |
|
"loss": 0.2892, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 3.3979791395045633, |
|
"grad_norm": 8.784459114074707, |
|
"learning_rate": 7.36222294654498e-06, |
|
"loss": 0.2976, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 3.40612777053455, |
|
"grad_norm": 7.9105072021484375, |
|
"learning_rate": 7.3532594524119945e-06, |
|
"loss": 0.2904, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 3.4142764015645373, |
|
"grad_norm": 4.8031229972839355, |
|
"learning_rate": 7.3442959582790085e-06, |
|
"loss": 0.2928, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 3.422425032594524, |
|
"grad_norm": 7.598479747772217, |
|
"learning_rate": 7.335332464146024e-06, |
|
"loss": 0.315, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 3.430573663624511, |
|
"grad_norm": 4.397596836090088, |
|
"learning_rate": 7.326368970013037e-06, |
|
"loss": 0.278, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 3.438722294654498, |
|
"grad_norm": 5.5082316398620605, |
|
"learning_rate": 7.317405475880052e-06, |
|
"loss": 0.3155, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 3.446870925684485, |
|
"grad_norm": 1.8329218626022339, |
|
"learning_rate": 7.308441981747066e-06, |
|
"loss": 0.2834, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 3.455019556714472, |
|
"grad_norm": 5.782153129577637, |
|
"learning_rate": 7.299478487614081e-06, |
|
"loss": 0.2999, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 3.463168187744459, |
|
"grad_norm": 6.500859260559082, |
|
"learning_rate": 7.290514993481095e-06, |
|
"loss": 0.3001, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 3.471316818774446, |
|
"grad_norm": 5.2227396965026855, |
|
"learning_rate": 7.28155149934811e-06, |
|
"loss": 0.3155, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 3.479465449804433, |
|
"grad_norm": 7.104318618774414, |
|
"learning_rate": 7.272588005215124e-06, |
|
"loss": 0.3103, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 3.48761408083442, |
|
"grad_norm": 8.62105941772461, |
|
"learning_rate": 7.263624511082138e-06, |
|
"loss": 0.2892, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 3.4957627118644066, |
|
"grad_norm": 11.45067310333252, |
|
"learning_rate": 7.254661016949152e-06, |
|
"loss": 0.2889, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 3.5039113428943938, |
|
"grad_norm": 11.803699493408203, |
|
"learning_rate": 7.245697522816166e-06, |
|
"loss": 0.2755, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 3.512059973924381, |
|
"grad_norm": 6.8339762687683105, |
|
"learning_rate": 7.236734028683181e-06, |
|
"loss": 0.3083, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 3.5202086049543677, |
|
"grad_norm": 5.603298187255859, |
|
"learning_rate": 7.227770534550196e-06, |
|
"loss": 0.3034, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 3.5283572359843545, |
|
"grad_norm": 8.332951545715332, |
|
"learning_rate": 7.21880704041721e-06, |
|
"loss": 0.3187, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 3.5365058670143417, |
|
"grad_norm": 10.166672706604004, |
|
"learning_rate": 7.209843546284225e-06, |
|
"loss": 0.2966, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 3.5446544980443284, |
|
"grad_norm": 8.069250106811523, |
|
"learning_rate": 7.200880052151239e-06, |
|
"loss": 0.3033, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 3.5528031290743156, |
|
"grad_norm": 6.117469787597656, |
|
"learning_rate": 7.191916558018253e-06, |
|
"loss": 0.3169, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 3.5609517601043024, |
|
"grad_norm": 9.44518756866455, |
|
"learning_rate": 7.1829530638852665e-06, |
|
"loss": 0.3097, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 3.5691003911342896, |
|
"grad_norm": 12.208475112915039, |
|
"learning_rate": 7.173989569752282e-06, |
|
"loss": 0.2995, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 3.5772490221642763, |
|
"grad_norm": 3.2542121410369873, |
|
"learning_rate": 7.165026075619296e-06, |
|
"loss": 0.2853, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 3.5853976531942635, |
|
"grad_norm": 5.231081962585449, |
|
"learning_rate": 7.156062581486311e-06, |
|
"loss": 0.2811, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 3.5935462842242503, |
|
"grad_norm": 8.1747407913208, |
|
"learning_rate": 7.147099087353325e-06, |
|
"loss": 0.2902, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 3.601694915254237, |
|
"grad_norm": 5.907063961029053, |
|
"learning_rate": 7.138135593220339e-06, |
|
"loss": 0.2955, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 3.609843546284224, |
|
"grad_norm": 8.774656295776367, |
|
"learning_rate": 7.129172099087354e-06, |
|
"loss": 0.2976, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 3.6179921773142114, |
|
"grad_norm": 6.138618469238281, |
|
"learning_rate": 7.120208604954368e-06, |
|
"loss": 0.3101, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 3.626140808344198, |
|
"grad_norm": 3.3446667194366455, |
|
"learning_rate": 7.111245110821382e-06, |
|
"loss": 0.2896, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 3.634289439374185, |
|
"grad_norm": 4.759154319763184, |
|
"learning_rate": 7.1022816166883955e-06, |
|
"loss": 0.2952, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 3.642438070404172, |
|
"grad_norm": 5.970653533935547, |
|
"learning_rate": 7.093318122555411e-06, |
|
"loss": 0.2943, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 3.6505867014341593, |
|
"grad_norm": 7.213112831115723, |
|
"learning_rate": 7.084354628422425e-06, |
|
"loss": 0.304, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 3.658735332464146, |
|
"grad_norm": 6.11391019821167, |
|
"learning_rate": 7.075391134289439e-06, |
|
"loss": 0.2935, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 3.666883963494133, |
|
"grad_norm": 6.258280277252197, |
|
"learning_rate": 7.066427640156454e-06, |
|
"loss": 0.2933, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 3.67503259452412, |
|
"grad_norm": 11.640277862548828, |
|
"learning_rate": 7.057464146023469e-06, |
|
"loss": 0.318, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 3.6831812255541068, |
|
"grad_norm": 6.384451866149902, |
|
"learning_rate": 7.048500651890483e-06, |
|
"loss": 0.3106, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 3.691329856584094, |
|
"grad_norm": 6.5114054679870605, |
|
"learning_rate": 7.039537157757496e-06, |
|
"loss": 0.3065, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 3.6994784876140807, |
|
"grad_norm": 7.666008949279785, |
|
"learning_rate": 7.030573663624511e-06, |
|
"loss": 0.2934, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 3.707627118644068, |
|
"grad_norm": 8.854146003723145, |
|
"learning_rate": 7.021610169491525e-06, |
|
"loss": 0.2986, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 3.7157757496740547, |
|
"grad_norm": 8.994945526123047, |
|
"learning_rate": 7.01264667535854e-06, |
|
"loss": 0.2904, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 3.723924380704042, |
|
"grad_norm": 6.301919460296631, |
|
"learning_rate": 7.003683181225555e-06, |
|
"loss": 0.3042, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 3.7320730117340286, |
|
"grad_norm": 6.444594860076904, |
|
"learning_rate": 6.994719687092568e-06, |
|
"loss": 0.2934, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 3.740221642764016, |
|
"grad_norm": 7.342471122741699, |
|
"learning_rate": 6.985756192959584e-06, |
|
"loss": 0.2573, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 3.7483702737940026, |
|
"grad_norm": 10.449690818786621, |
|
"learning_rate": 6.976792698826597e-06, |
|
"loss": 0.2992, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 3.7565189048239898, |
|
"grad_norm": 6.386083602905273, |
|
"learning_rate": 6.967829204693612e-06, |
|
"loss": 0.3197, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 3.7646675358539765, |
|
"grad_norm": 5.020893573760986, |
|
"learning_rate": 6.958865710560625e-06, |
|
"loss": 0.2898, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 3.7728161668839633, |
|
"grad_norm": 13.005038261413574, |
|
"learning_rate": 6.949902216427641e-06, |
|
"loss": 0.2775, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 3.7809647979139505, |
|
"grad_norm": 9.009956359863281, |
|
"learning_rate": 6.940938722294654e-06, |
|
"loss": 0.3111, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 3.7891134289439377, |
|
"grad_norm": 11.799562454223633, |
|
"learning_rate": 6.931975228161669e-06, |
|
"loss": 0.3086, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 3.7972620599739244, |
|
"grad_norm": 5.661113262176514, |
|
"learning_rate": 6.923011734028683e-06, |
|
"loss": 0.2922, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 3.805410691003911, |
|
"grad_norm": 7.668259143829346, |
|
"learning_rate": 6.914048239895698e-06, |
|
"loss": 0.3089, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 3.8135593220338984, |
|
"grad_norm": 5.768048286437988, |
|
"learning_rate": 6.905084745762713e-06, |
|
"loss": 0.316, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 3.821707953063885, |
|
"grad_norm": 10.127528190612793, |
|
"learning_rate": 6.896121251629726e-06, |
|
"loss": 0.3055, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 3.8298565840938723, |
|
"grad_norm": 8.772488594055176, |
|
"learning_rate": 6.887157757496741e-06, |
|
"loss": 0.3021, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 3.838005215123859, |
|
"grad_norm": 5.210601806640625, |
|
"learning_rate": 6.8781942633637554e-06, |
|
"loss": 0.2871, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 6.755288124084473, |
|
"learning_rate": 6.869230769230769e-06, |
|
"loss": 0.3067, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 3.854302477183833, |
|
"grad_norm": 5.8384833335876465, |
|
"learning_rate": 6.860267275097784e-06, |
|
"loss": 0.3026, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 3.86245110821382, |
|
"grad_norm": 10.123713493347168, |
|
"learning_rate": 6.851303780964797e-06, |
|
"loss": 0.2925, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 3.870599739243807, |
|
"grad_norm": 6.272671699523926, |
|
"learning_rate": 6.842340286831814e-06, |
|
"loss": 0.3202, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 3.878748370273794, |
|
"grad_norm": 6.816521167755127, |
|
"learning_rate": 6.833376792698826e-06, |
|
"loss": 0.3031, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 3.886897001303781, |
|
"grad_norm": 6.4613752365112305, |
|
"learning_rate": 6.824413298565842e-06, |
|
"loss": 0.3067, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 3.895045632333768, |
|
"grad_norm": 10.189191818237305, |
|
"learning_rate": 6.815449804432854e-06, |
|
"loss": 0.2946, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 3.903194263363755, |
|
"grad_norm": 12.38447093963623, |
|
"learning_rate": 6.8064863102998705e-06, |
|
"loss": 0.3023, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 3.9113428943937416, |
|
"grad_norm": 9.731588363647461, |
|
"learning_rate": 6.797522816166885e-06, |
|
"loss": 0.2932, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.919491525423729, |
|
"grad_norm": 9.75956916809082, |
|
"learning_rate": 6.788559322033898e-06, |
|
"loss": 0.3004, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 3.927640156453716, |
|
"grad_norm": 9.017115592956543, |
|
"learning_rate": 6.779595827900912e-06, |
|
"loss": 0.3083, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 3.9357887874837028, |
|
"grad_norm": 4.012237071990967, |
|
"learning_rate": 6.7706323337679255e-06, |
|
"loss": 0.2975, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 3.9439374185136895, |
|
"grad_norm": 9.703700065612793, |
|
"learning_rate": 6.761668839634942e-06, |
|
"loss": 0.3007, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 3.9520860495436767, |
|
"grad_norm": 5.018248558044434, |
|
"learning_rate": 6.752705345501955e-06, |
|
"loss": 0.2835, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 3.960234680573664, |
|
"grad_norm": 8.908330917358398, |
|
"learning_rate": 6.7437418513689716e-06, |
|
"loss": 0.2885, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 3.9683833116036507, |
|
"grad_norm": 7.982217788696289, |
|
"learning_rate": 6.734778357235984e-06, |
|
"loss": 0.2939, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 3.9765319426336374, |
|
"grad_norm": 6.468784809112549, |
|
"learning_rate": 6.725814863102999e-06, |
|
"loss": 0.2973, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 3.9846805736636246, |
|
"grad_norm": 11.741927146911621, |
|
"learning_rate": 6.716851368970012e-06, |
|
"loss": 0.3115, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 3.9928292046936114, |
|
"grad_norm": 6.260389804840088, |
|
"learning_rate": 6.7078878748370266e-06, |
|
"loss": 0.3033, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.8365461847389558, |
|
"eval_loss": 0.49297481775283813, |
|
"eval_runtime": 5.5068, |
|
"eval_samples_per_second": 452.166, |
|
"eval_steps_per_second": 56.657, |
|
"step": 49088 |
|
}, |
|
{ |
|
"epoch": 4.0009778357235986, |
|
"grad_norm": 6.392615795135498, |
|
"learning_rate": 6.698924380704041e-06, |
|
"loss": 0.29, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 4.009126466753585, |
|
"grad_norm": 8.285822868347168, |
|
"learning_rate": 6.689960886571055e-06, |
|
"loss": 0.2284, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 4.017275097783572, |
|
"grad_norm": 8.578091621398926, |
|
"learning_rate": 6.680997392438071e-06, |
|
"loss": 0.2389, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 4.02542372881356, |
|
"grad_norm": 7.808691024780273, |
|
"learning_rate": 6.672033898305083e-06, |
|
"loss": 0.237, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 4.0335723598435465, |
|
"grad_norm": 11.781246185302734, |
|
"learning_rate": 6.6630704041721e-06, |
|
"loss": 0.2207, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 4.041720990873533, |
|
"grad_norm": 11.581729888916016, |
|
"learning_rate": 6.654106910039113e-06, |
|
"loss": 0.2456, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 4.04986962190352, |
|
"grad_norm": 11.857730865478516, |
|
"learning_rate": 6.645143415906128e-06, |
|
"loss": 0.2381, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 4.058018252933508, |
|
"grad_norm": 5.7172722816467285, |
|
"learning_rate": 6.636179921773142e-06, |
|
"loss": 0.2306, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 4.066166883963494, |
|
"grad_norm": 9.22181224822998, |
|
"learning_rate": 6.627216427640156e-06, |
|
"loss": 0.2418, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 4.074315514993481, |
|
"grad_norm": 9.777718544006348, |
|
"learning_rate": 6.618252933507171e-06, |
|
"loss": 0.2391, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 4.082464146023468, |
|
"grad_norm": 11.828337669372559, |
|
"learning_rate": 6.609289439374184e-06, |
|
"loss": 0.234, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 4.090612777053455, |
|
"grad_norm": 8.534415245056152, |
|
"learning_rate": 6.600325945241201e-06, |
|
"loss": 0.2199, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 4.098761408083442, |
|
"grad_norm": 8.329660415649414, |
|
"learning_rate": 6.591362451108213e-06, |
|
"loss": 0.261, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 4.106910039113429, |
|
"grad_norm": 6.748033046722412, |
|
"learning_rate": 6.582398956975228e-06, |
|
"loss": 0.2412, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 4.115058670143416, |
|
"grad_norm": 3.2746846675872803, |
|
"learning_rate": 6.573435462842243e-06, |
|
"loss": 0.2509, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 4.1232073011734025, |
|
"grad_norm": 5.985006809234619, |
|
"learning_rate": 6.564471968709256e-06, |
|
"loss": 0.2412, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 4.13135593220339, |
|
"grad_norm": 10.339166641235352, |
|
"learning_rate": 6.555508474576271e-06, |
|
"loss": 0.2439, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 4.139504563233377, |
|
"grad_norm": 3.1601479053497314, |
|
"learning_rate": 6.546544980443285e-06, |
|
"loss": 0.2332, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 4.147653194263364, |
|
"grad_norm": 7.7636308670043945, |
|
"learning_rate": 6.5375814863103e-06, |
|
"loss": 0.2428, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 4.15580182529335, |
|
"grad_norm": 5.945723056793213, |
|
"learning_rate": 6.528617992177313e-06, |
|
"loss": 0.235, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 4.163950456323338, |
|
"grad_norm": 6.053178787231445, |
|
"learning_rate": 6.519654498044329e-06, |
|
"loss": 0.258, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 4.172099087353325, |
|
"grad_norm": 7.7220845222473145, |
|
"learning_rate": 6.510691003911342e-06, |
|
"loss": 0.2202, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 4.1802477183833116, |
|
"grad_norm": 10.34273910522461, |
|
"learning_rate": 6.501727509778357e-06, |
|
"loss": 0.2308, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 4.188396349413298, |
|
"grad_norm": 11.372400283813477, |
|
"learning_rate": 6.492764015645372e-06, |
|
"loss": 0.2403, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 4.196544980443286, |
|
"grad_norm": 5.102409839630127, |
|
"learning_rate": 6.483800521512386e-06, |
|
"loss": 0.2423, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 4.204693611473273, |
|
"grad_norm": 5.650859832763672, |
|
"learning_rate": 6.474837027379401e-06, |
|
"loss": 0.2239, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 4.2128422425032594, |
|
"grad_norm": 10.150152206420898, |
|
"learning_rate": 6.4658735332464136e-06, |
|
"loss": 0.2458, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 4.220990873533246, |
|
"grad_norm": 13.111183166503906, |
|
"learning_rate": 6.456910039113428e-06, |
|
"loss": 0.2542, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 4.229139504563233, |
|
"grad_norm": 10.616249084472656, |
|
"learning_rate": 6.447946544980442e-06, |
|
"loss": 0.2643, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 4.237288135593221, |
|
"grad_norm": 8.26252269744873, |
|
"learning_rate": 6.438983050847458e-06, |
|
"loss": 0.2476, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 4.245436766623207, |
|
"grad_norm": 3.8655807971954346, |
|
"learning_rate": 6.430019556714472e-06, |
|
"loss": 0.2421, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 4.253585397653194, |
|
"grad_norm": 12.139904975891113, |
|
"learning_rate": 6.421056062581486e-06, |
|
"loss": 0.2498, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 4.261734028683181, |
|
"grad_norm": 6.828706741333008, |
|
"learning_rate": 6.4120925684485e-06, |
|
"loss": 0.2607, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 4.2698826597131685, |
|
"grad_norm": 6.083545684814453, |
|
"learning_rate": 6.403129074315515e-06, |
|
"loss": 0.2371, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 4.278031290743155, |
|
"grad_norm": 11.325626373291016, |
|
"learning_rate": 6.3941655801825294e-06, |
|
"loss": 0.2487, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 4.286179921773142, |
|
"grad_norm": 5.367312431335449, |
|
"learning_rate": 6.385202086049543e-06, |
|
"loss": 0.2254, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 4.294328552803129, |
|
"grad_norm": 14.246549606323242, |
|
"learning_rate": 6.376238591916558e-06, |
|
"loss": 0.2237, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 4.302477183833116, |
|
"grad_norm": 11.854809761047363, |
|
"learning_rate": 6.367275097783572e-06, |
|
"loss": 0.2446, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 4.310625814863103, |
|
"grad_norm": 10.6870698928833, |
|
"learning_rate": 6.358311603650586e-06, |
|
"loss": 0.2219, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 4.31877444589309, |
|
"grad_norm": 11.290742874145508, |
|
"learning_rate": 6.349348109517601e-06, |
|
"loss": 0.2757, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 4.326923076923077, |
|
"grad_norm": 5.060425758361816, |
|
"learning_rate": 6.340384615384615e-06, |
|
"loss": 0.2279, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 4.335071707953064, |
|
"grad_norm": 11.129130363464355, |
|
"learning_rate": 6.3314211212516305e-06, |
|
"loss": 0.2352, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 4.343220338983051, |
|
"grad_norm": 9.912896156311035, |
|
"learning_rate": 6.322457627118644e-06, |
|
"loss": 0.2378, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 4.351368970013038, |
|
"grad_norm": 6.451144218444824, |
|
"learning_rate": 6.3134941329856584e-06, |
|
"loss": 0.229, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 4.3595176010430245, |
|
"grad_norm": 8.81298828125, |
|
"learning_rate": 6.3045306388526716e-06, |
|
"loss": 0.248, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 4.367666232073011, |
|
"grad_norm": 10.046360969543457, |
|
"learning_rate": 6.295567144719687e-06, |
|
"loss": 0.2236, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 4.375814863102999, |
|
"grad_norm": 7.006089687347412, |
|
"learning_rate": 6.286603650586701e-06, |
|
"loss": 0.2522, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 4.383963494132986, |
|
"grad_norm": 8.601024627685547, |
|
"learning_rate": 6.277640156453716e-06, |
|
"loss": 0.2595, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 4.3921121251629724, |
|
"grad_norm": 7.744350433349609, |
|
"learning_rate": 6.26867666232073e-06, |
|
"loss": 0.2433, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 4.400260756192959, |
|
"grad_norm": 9.870165824890137, |
|
"learning_rate": 6.259713168187745e-06, |
|
"loss": 0.256, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 4.408409387222947, |
|
"grad_norm": 16.055753707885742, |
|
"learning_rate": 6.250749674054759e-06, |
|
"loss": 0.2626, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 4.416558018252934, |
|
"grad_norm": 7.4111104011535645, |
|
"learning_rate": 6.241786179921773e-06, |
|
"loss": 0.2403, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 4.42470664928292, |
|
"grad_norm": 9.383624076843262, |
|
"learning_rate": 6.2328226857887874e-06, |
|
"loss": 0.2528, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 4.432855280312907, |
|
"grad_norm": 12.037814140319824, |
|
"learning_rate": 6.223859191655802e-06, |
|
"loss": 0.2618, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 4.441003911342895, |
|
"grad_norm": 11.180290222167969, |
|
"learning_rate": 6.214895697522816e-06, |
|
"loss": 0.2619, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 4.4491525423728815, |
|
"grad_norm": 16.229679107666016, |
|
"learning_rate": 6.205932203389831e-06, |
|
"loss": 0.2113, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 4.457301173402868, |
|
"grad_norm": 13.413905143737793, |
|
"learning_rate": 6.196968709256845e-06, |
|
"loss": 0.2294, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 4.465449804432855, |
|
"grad_norm": 4.0000739097595215, |
|
"learning_rate": 6.18800521512386e-06, |
|
"loss": 0.2492, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 4.473598435462843, |
|
"grad_norm": 6.232034206390381, |
|
"learning_rate": 6.179041720990873e-06, |
|
"loss": 0.2257, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 4.481747066492829, |
|
"grad_norm": 10.516459465026855, |
|
"learning_rate": 6.1700782268578885e-06, |
|
"loss": 0.2411, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 4.489895697522816, |
|
"grad_norm": 1.9097963571548462, |
|
"learning_rate": 6.1611147327249025e-06, |
|
"loss": 0.2397, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 4.498044328552803, |
|
"grad_norm": 8.531264305114746, |
|
"learning_rate": 6.152151238591917e-06, |
|
"loss": 0.2496, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 4.5061929595827905, |
|
"grad_norm": 11.344954490661621, |
|
"learning_rate": 6.143187744458931e-06, |
|
"loss": 0.2376, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 4.514341590612777, |
|
"grad_norm": 11.570788383483887, |
|
"learning_rate": 6.134224250325945e-06, |
|
"loss": 0.238, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 4.522490221642764, |
|
"grad_norm": 7.503643035888672, |
|
"learning_rate": 6.125260756192959e-06, |
|
"loss": 0.2492, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 4.530638852672751, |
|
"grad_norm": 10.721003532409668, |
|
"learning_rate": 6.116297262059974e-06, |
|
"loss": 0.2508, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 4.5387874837027375, |
|
"grad_norm": 7.251551151275635, |
|
"learning_rate": 6.107333767926989e-06, |
|
"loss": 0.2661, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 4.546936114732725, |
|
"grad_norm": 5.292377948760986, |
|
"learning_rate": 6.098370273794002e-06, |
|
"loss": 0.2253, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 4.555084745762712, |
|
"grad_norm": 8.165968894958496, |
|
"learning_rate": 6.0894067796610175e-06, |
|
"loss": 0.2401, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 4.563233376792699, |
|
"grad_norm": 5.5696001052856445, |
|
"learning_rate": 6.0804432855280315e-06, |
|
"loss": 0.2565, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 4.5713820078226854, |
|
"grad_norm": 4.321511745452881, |
|
"learning_rate": 6.0714797913950455e-06, |
|
"loss": 0.2468, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 4.579530638852673, |
|
"grad_norm": 7.747223854064941, |
|
"learning_rate": 6.06251629726206e-06, |
|
"loss": 0.2677, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 4.58767926988266, |
|
"grad_norm": 6.845622539520264, |
|
"learning_rate": 6.053552803129075e-06, |
|
"loss": 0.2399, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 4.595827900912647, |
|
"grad_norm": 7.972433567047119, |
|
"learning_rate": 6.044589308996088e-06, |
|
"loss": 0.2538, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 4.603976531942633, |
|
"grad_norm": 7.560089111328125, |
|
"learning_rate": 6.035625814863102e-06, |
|
"loss": 0.2417, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 4.612125162972621, |
|
"grad_norm": 12.906600952148438, |
|
"learning_rate": 6.026662320730118e-06, |
|
"loss": 0.2534, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 4.620273794002608, |
|
"grad_norm": 6.646966934204102, |
|
"learning_rate": 6.01769882659713e-06, |
|
"loss": 0.2229, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 4.6284224250325945, |
|
"grad_norm": 4.299482822418213, |
|
"learning_rate": 6.0087353324641465e-06, |
|
"loss": 0.2451, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 4.636571056062581, |
|
"grad_norm": 2.2282984256744385, |
|
"learning_rate": 5.999771838331161e-06, |
|
"loss": 0.2441, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 4.644719687092568, |
|
"grad_norm": 6.4274210929870605, |
|
"learning_rate": 5.9908083441981745e-06, |
|
"loss": 0.248, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 4.652868318122556, |
|
"grad_norm": 11.95504379272461, |
|
"learning_rate": 5.981844850065189e-06, |
|
"loss": 0.2479, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 4.661016949152542, |
|
"grad_norm": 5.159088134765625, |
|
"learning_rate": 5.972881355932203e-06, |
|
"loss": 0.2408, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 4.669165580182529, |
|
"grad_norm": 13.74659252166748, |
|
"learning_rate": 5.963917861799218e-06, |
|
"loss": 0.2596, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 4.677314211212517, |
|
"grad_norm": 7.052792072296143, |
|
"learning_rate": 5.954954367666231e-06, |
|
"loss": 0.251, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 4.6854628422425035, |
|
"grad_norm": 7.998805999755859, |
|
"learning_rate": 5.945990873533248e-06, |
|
"loss": 0.2517, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 4.69361147327249, |
|
"grad_norm": 8.009773254394531, |
|
"learning_rate": 5.93702737940026e-06, |
|
"loss": 0.2549, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 4.701760104302477, |
|
"grad_norm": 16.998640060424805, |
|
"learning_rate": 5.9280638852672755e-06, |
|
"loss": 0.2387, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 4.709908735332464, |
|
"grad_norm": 5.554251670837402, |
|
"learning_rate": 5.9191003911342895e-06, |
|
"loss": 0.2342, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 4.718057366362451, |
|
"grad_norm": 11.730195999145508, |
|
"learning_rate": 5.910136897001304e-06, |
|
"loss": 0.2536, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 4.726205997392438, |
|
"grad_norm": 8.52701473236084, |
|
"learning_rate": 5.901173402868317e-06, |
|
"loss": 0.2552, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 4.734354628422425, |
|
"grad_norm": 8.105571746826172, |
|
"learning_rate": 5.892209908735332e-06, |
|
"loss": 0.2512, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 4.742503259452412, |
|
"grad_norm": 8.212213516235352, |
|
"learning_rate": 5.883246414602347e-06, |
|
"loss": 0.2517, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 4.750651890482399, |
|
"grad_norm": 7.248411178588867, |
|
"learning_rate": 5.874282920469359e-06, |
|
"loss": 0.243, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 4.758800521512386, |
|
"grad_norm": 7.534228801727295, |
|
"learning_rate": 5.865319426336376e-06, |
|
"loss": 0.2345, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 4.766949152542373, |
|
"grad_norm": 9.132428169250488, |
|
"learning_rate": 5.856355932203391e-06, |
|
"loss": 0.2378, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 4.77509778357236, |
|
"grad_norm": 10.025369644165039, |
|
"learning_rate": 5.847392438070405e-06, |
|
"loss": 0.2674, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 4.783246414602347, |
|
"grad_norm": 10.384153366088867, |
|
"learning_rate": 5.8384289439374185e-06, |
|
"loss": 0.2391, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 4.791395045632334, |
|
"grad_norm": 15.859262466430664, |
|
"learning_rate": 5.8294654498044325e-06, |
|
"loss": 0.2417, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 4.799543676662321, |
|
"grad_norm": 7.405439376831055, |
|
"learning_rate": 5.820501955671448e-06, |
|
"loss": 0.2475, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 4.8076923076923075, |
|
"grad_norm": 9.894548416137695, |
|
"learning_rate": 5.81153846153846e-06, |
|
"loss": 0.242, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 4.815840938722294, |
|
"grad_norm": 4.984899997711182, |
|
"learning_rate": 5.802574967405477e-06, |
|
"loss": 0.2367, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 4.823989569752282, |
|
"grad_norm": 8.233868598937988, |
|
"learning_rate": 5.793611473272489e-06, |
|
"loss": 0.2422, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 4.832138200782269, |
|
"grad_norm": 3.405486583709717, |
|
"learning_rate": 5.784647979139505e-06, |
|
"loss": 0.2363, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 4.840286831812255, |
|
"grad_norm": 10.75444221496582, |
|
"learning_rate": 5.775684485006519e-06, |
|
"loss": 0.2387, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 4.848435462842242, |
|
"grad_norm": 8.05445671081543, |
|
"learning_rate": 5.766720990873532e-06, |
|
"loss": 0.2501, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 4.85658409387223, |
|
"grad_norm": 9.405356407165527, |
|
"learning_rate": 5.757757496740547e-06, |
|
"loss": 0.2535, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 4.8647327249022165, |
|
"grad_norm": 7.889901638031006, |
|
"learning_rate": 5.7487940026075615e-06, |
|
"loss": 0.2472, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 4.872881355932203, |
|
"grad_norm": 12.93803596496582, |
|
"learning_rate": 5.739830508474578e-06, |
|
"loss": 0.2484, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 4.88102998696219, |
|
"grad_norm": 10.640684127807617, |
|
"learning_rate": 5.73086701434159e-06, |
|
"loss": 0.257, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 4.889178617992178, |
|
"grad_norm": 6.97421932220459, |
|
"learning_rate": 5.721903520208605e-06, |
|
"loss": 0.2271, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 4.897327249022164, |
|
"grad_norm": 8.209015846252441, |
|
"learning_rate": 5.71294002607562e-06, |
|
"loss": 0.261, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 4.905475880052151, |
|
"grad_norm": 10.535201072692871, |
|
"learning_rate": 5.703976531942633e-06, |
|
"loss": 0.2282, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 4.913624511082138, |
|
"grad_norm": 7.5695013999938965, |
|
"learning_rate": 5.695013037809648e-06, |
|
"loss": 0.2598, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 4.921773142112125, |
|
"grad_norm": 6.751789569854736, |
|
"learning_rate": 5.686049543676662e-06, |
|
"loss": 0.2683, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 4.929921773142112, |
|
"grad_norm": 8.825233459472656, |
|
"learning_rate": 5.677086049543677e-06, |
|
"loss": 0.2468, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 4.938070404172099, |
|
"grad_norm": 6.470550060272217, |
|
"learning_rate": 5.66812255541069e-06, |
|
"loss": 0.2296, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 4.946219035202086, |
|
"grad_norm": 6.545186996459961, |
|
"learning_rate": 5.659159061277706e-06, |
|
"loss": 0.2595, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 4.9543676662320735, |
|
"grad_norm": 11.159351348876953, |
|
"learning_rate": 5.650195567144719e-06, |
|
"loss": 0.2515, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 4.96251629726206, |
|
"grad_norm": 7.022935390472412, |
|
"learning_rate": 5.641232073011734e-06, |
|
"loss": 0.2569, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 4.970664928292047, |
|
"grad_norm": 6.251514911651611, |
|
"learning_rate": 5.632268578878748e-06, |
|
"loss": 0.2317, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 4.978813559322034, |
|
"grad_norm": 5.2566657066345215, |
|
"learning_rate": 5.623305084745763e-06, |
|
"loss": 0.2413, |
|
"step": 61100 |
|
}, |
|
{ |
|
"epoch": 4.9869621903520205, |
|
"grad_norm": 8.16510009765625, |
|
"learning_rate": 5.614341590612776e-06, |
|
"loss": 0.2392, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 4.995110821382008, |
|
"grad_norm": 4.102538585662842, |
|
"learning_rate": 5.605378096479791e-06, |
|
"loss": 0.257, |
|
"step": 61300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.8502008032128514, |
|
"eval_loss": 0.5025619864463806, |
|
"eval_runtime": 5.5086, |
|
"eval_samples_per_second": 452.021, |
|
"eval_steps_per_second": 56.639, |
|
"step": 61360 |
|
}, |
|
{ |
|
"epoch": 5.003259452411995, |
|
"grad_norm": 3.8134822845458984, |
|
"learning_rate": 5.596414602346807e-06, |
|
"loss": 0.2316, |
|
"step": 61400 |
|
}, |
|
{ |
|
"epoch": 5.011408083441982, |
|
"grad_norm": 6.683708667755127, |
|
"learning_rate": 5.5874511082138195e-06, |
|
"loss": 0.1937, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 5.019556714471968, |
|
"grad_norm": 9.566272735595703, |
|
"learning_rate": 5.578487614080835e-06, |
|
"loss": 0.2288, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 5.027705345501956, |
|
"grad_norm": 10.972317695617676, |
|
"learning_rate": 5.569524119947849e-06, |
|
"loss": 0.2247, |
|
"step": 61700 |
|
}, |
|
{ |
|
"epoch": 5.035853976531943, |
|
"grad_norm": 7.630614280700684, |
|
"learning_rate": 5.560560625814862e-06, |
|
"loss": 0.1842, |
|
"step": 61800 |
|
}, |
|
{ |
|
"epoch": 5.0440026075619295, |
|
"grad_norm": 9.422782897949219, |
|
"learning_rate": 5.551597131681877e-06, |
|
"loss": 0.1899, |
|
"step": 61900 |
|
}, |
|
{ |
|
"epoch": 5.052151238591916, |
|
"grad_norm": 8.5192289352417, |
|
"learning_rate": 5.542633637548892e-06, |
|
"loss": 0.1951, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 5.060299869621904, |
|
"grad_norm": 8.805126190185547, |
|
"learning_rate": 5.533670143415907e-06, |
|
"loss": 0.2086, |
|
"step": 62100 |
|
}, |
|
{ |
|
"epoch": 5.068448500651891, |
|
"grad_norm": 20.683149337768555, |
|
"learning_rate": 5.52470664928292e-06, |
|
"loss": 0.183, |
|
"step": 62200 |
|
}, |
|
{ |
|
"epoch": 5.076597131681877, |
|
"grad_norm": 4.523620128631592, |
|
"learning_rate": 5.515743155149935e-06, |
|
"loss": 0.2044, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 5.084745762711864, |
|
"grad_norm": 9.480749130249023, |
|
"learning_rate": 5.5067796610169485e-06, |
|
"loss": 0.2056, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 5.092894393741851, |
|
"grad_norm": 5.888172149658203, |
|
"learning_rate": 5.497816166883963e-06, |
|
"loss": 0.2056, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 5.101043024771839, |
|
"grad_norm": 5.114003658294678, |
|
"learning_rate": 5.488852672750978e-06, |
|
"loss": 0.2073, |
|
"step": 62600 |
|
}, |
|
{ |
|
"epoch": 5.109191655801825, |
|
"grad_norm": 2.9696881771087646, |
|
"learning_rate": 5.479889178617992e-06, |
|
"loss": 0.2026, |
|
"step": 62700 |
|
}, |
|
{ |
|
"epoch": 5.117340286831812, |
|
"grad_norm": 3.7792017459869385, |
|
"learning_rate": 5.470925684485006e-06, |
|
"loss": 0.2032, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 5.125488917861799, |
|
"grad_norm": 8.12082290649414, |
|
"learning_rate": 5.46196219035202e-06, |
|
"loss": 0.2016, |
|
"step": 62900 |
|
}, |
|
{ |
|
"epoch": 5.1336375488917865, |
|
"grad_norm": 10.067122459411621, |
|
"learning_rate": 5.452998696219035e-06, |
|
"loss": 0.2199, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 5.141786179921773, |
|
"grad_norm": 8.625639915466309, |
|
"learning_rate": 5.444035202086049e-06, |
|
"loss": 0.197, |
|
"step": 63100 |
|
}, |
|
{ |
|
"epoch": 5.14993481095176, |
|
"grad_norm": 5.854794502258301, |
|
"learning_rate": 5.435071707953064e-06, |
|
"loss": 0.2087, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 5.158083441981747, |
|
"grad_norm": 25.03852653503418, |
|
"learning_rate": 5.426108213820078e-06, |
|
"loss": 0.198, |
|
"step": 63300 |
|
}, |
|
{ |
|
"epoch": 5.166232073011734, |
|
"grad_norm": 8.568482398986816, |
|
"learning_rate": 5.417144719687092e-06, |
|
"loss": 0.2056, |
|
"step": 63400 |
|
}, |
|
{ |
|
"epoch": 5.174380704041721, |
|
"grad_norm": 13.770308494567871, |
|
"learning_rate": 5.408181225554106e-06, |
|
"loss": 0.2177, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 5.182529335071708, |
|
"grad_norm": 5.927313804626465, |
|
"learning_rate": 5.399217731421121e-06, |
|
"loss": 0.2116, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 5.190677966101695, |
|
"grad_norm": 8.017006874084473, |
|
"learning_rate": 5.390254237288136e-06, |
|
"loss": 0.2024, |
|
"step": 63700 |
|
}, |
|
{ |
|
"epoch": 5.198826597131682, |
|
"grad_norm": 12.88976001739502, |
|
"learning_rate": 5.38129074315515e-06, |
|
"loss": 0.2093, |
|
"step": 63800 |
|
}, |
|
{ |
|
"epoch": 5.206975228161669, |
|
"grad_norm": 7.503077507019043, |
|
"learning_rate": 5.372327249022165e-06, |
|
"loss": 0.2086, |
|
"step": 63900 |
|
}, |
|
{ |
|
"epoch": 5.215123859191656, |
|
"grad_norm": 5.918655872344971, |
|
"learning_rate": 5.3633637548891785e-06, |
|
"loss": 0.2132, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 5.2232724902216425, |
|
"grad_norm": 14.097025871276855, |
|
"learning_rate": 5.3544002607561925e-06, |
|
"loss": 0.1944, |
|
"step": 64100 |
|
}, |
|
{ |
|
"epoch": 5.23142112125163, |
|
"grad_norm": 5.404310703277588, |
|
"learning_rate": 5.345436766623207e-06, |
|
"loss": 0.2086, |
|
"step": 64200 |
|
}, |
|
{ |
|
"epoch": 5.239569752281617, |
|
"grad_norm": 12.598578453063965, |
|
"learning_rate": 5.336473272490221e-06, |
|
"loss": 0.201, |
|
"step": 64300 |
|
}, |
|
{ |
|
"epoch": 5.247718383311604, |
|
"grad_norm": 7.65584135055542, |
|
"learning_rate": 5.327509778357236e-06, |
|
"loss": 0.1958, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 5.25586701434159, |
|
"grad_norm": 6.702199459075928, |
|
"learning_rate": 5.31854628422425e-06, |
|
"loss": 0.2, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 5.264015645371577, |
|
"grad_norm": 7.551068305969238, |
|
"learning_rate": 5.309582790091266e-06, |
|
"loss": 0.2015, |
|
"step": 64600 |
|
}, |
|
{ |
|
"epoch": 5.272164276401565, |
|
"grad_norm": 7.219007968902588, |
|
"learning_rate": 5.300619295958279e-06, |
|
"loss": 0.2151, |
|
"step": 64700 |
|
}, |
|
{ |
|
"epoch": 5.280312907431552, |
|
"grad_norm": 11.942194938659668, |
|
"learning_rate": 5.291655801825294e-06, |
|
"loss": 0.2096, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 5.288461538461538, |
|
"grad_norm": 11.055607795715332, |
|
"learning_rate": 5.2826923076923075e-06, |
|
"loss": 0.1832, |
|
"step": 64900 |
|
}, |
|
{ |
|
"epoch": 5.296610169491525, |
|
"grad_norm": 9.625134468078613, |
|
"learning_rate": 5.273728813559322e-06, |
|
"loss": 0.2222, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 5.304758800521513, |
|
"grad_norm": 8.117315292358398, |
|
"learning_rate": 5.264765319426336e-06, |
|
"loss": 0.2024, |
|
"step": 65100 |
|
}, |
|
{ |
|
"epoch": 5.3129074315514995, |
|
"grad_norm": 16.055788040161133, |
|
"learning_rate": 5.255801825293351e-06, |
|
"loss": 0.2045, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 5.321056062581486, |
|
"grad_norm": 11.91893482208252, |
|
"learning_rate": 5.246838331160365e-06, |
|
"loss": 0.1999, |
|
"step": 65300 |
|
}, |
|
{ |
|
"epoch": 5.329204693611473, |
|
"grad_norm": 8.290375709533691, |
|
"learning_rate": 5.237874837027379e-06, |
|
"loss": 0.2037, |
|
"step": 65400 |
|
}, |
|
{ |
|
"epoch": 5.337353324641461, |
|
"grad_norm": 9.186257362365723, |
|
"learning_rate": 5.228911342894394e-06, |
|
"loss": 0.2167, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 5.345501955671447, |
|
"grad_norm": 13.397846221923828, |
|
"learning_rate": 5.219947848761408e-06, |
|
"loss": 0.2201, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 5.353650586701434, |
|
"grad_norm": 13.287492752075195, |
|
"learning_rate": 5.210984354628423e-06, |
|
"loss": 0.2168, |
|
"step": 65700 |
|
}, |
|
{ |
|
"epoch": 5.361799217731421, |
|
"grad_norm": 14.4291410446167, |
|
"learning_rate": 5.202020860495437e-06, |
|
"loss": 0.2131, |
|
"step": 65800 |
|
}, |
|
{ |
|
"epoch": 5.369947848761408, |
|
"grad_norm": 12.810196876525879, |
|
"learning_rate": 5.193057366362451e-06, |
|
"loss": 0.1926, |
|
"step": 65900 |
|
}, |
|
{ |
|
"epoch": 5.378096479791395, |
|
"grad_norm": 7.368422985076904, |
|
"learning_rate": 5.184093872229465e-06, |
|
"loss": 0.2103, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 5.386245110821382, |
|
"grad_norm": 7.721367359161377, |
|
"learning_rate": 5.175130378096479e-06, |
|
"loss": 0.1999, |
|
"step": 66100 |
|
}, |
|
{ |
|
"epoch": 5.394393741851369, |
|
"grad_norm": 12.670356750488281, |
|
"learning_rate": 5.166166883963495e-06, |
|
"loss": 0.216, |
|
"step": 66200 |
|
}, |
|
{ |
|
"epoch": 5.4025423728813555, |
|
"grad_norm": 11.775321960449219, |
|
"learning_rate": 5.157203389830509e-06, |
|
"loss": 0.1863, |
|
"step": 66300 |
|
}, |
|
{ |
|
"epoch": 5.410691003911343, |
|
"grad_norm": 3.6226017475128174, |
|
"learning_rate": 5.148239895697523e-06, |
|
"loss": 0.1994, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 5.41883963494133, |
|
"grad_norm": 19.766469955444336, |
|
"learning_rate": 5.139276401564536e-06, |
|
"loss": 0.2046, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 5.426988265971317, |
|
"grad_norm": 14.729117393493652, |
|
"learning_rate": 5.130312907431552e-06, |
|
"loss": 0.2117, |
|
"step": 66600 |
|
}, |
|
{ |
|
"epoch": 5.435136897001303, |
|
"grad_norm": 5.150448322296143, |
|
"learning_rate": 5.1213494132985655e-06, |
|
"loss": 0.2051, |
|
"step": 66700 |
|
}, |
|
{ |
|
"epoch": 5.443285528031291, |
|
"grad_norm": 17.367290496826172, |
|
"learning_rate": 5.11238591916558e-06, |
|
"loss": 0.2148, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 5.451434159061278, |
|
"grad_norm": 10.725879669189453, |
|
"learning_rate": 5.103422425032595e-06, |
|
"loss": 0.2168, |
|
"step": 66900 |
|
}, |
|
{ |
|
"epoch": 5.459582790091265, |
|
"grad_norm": 12.010479927062988, |
|
"learning_rate": 5.094458930899608e-06, |
|
"loss": 0.2032, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 5.467731421121251, |
|
"grad_norm": 10.557023048400879, |
|
"learning_rate": 5.085495436766624e-06, |
|
"loss": 0.2118, |
|
"step": 67100 |
|
}, |
|
{ |
|
"epoch": 5.475880052151239, |
|
"grad_norm": 14.533101081848145, |
|
"learning_rate": 5.076531942633637e-06, |
|
"loss": 0.2186, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 5.484028683181226, |
|
"grad_norm": 21.979625701904297, |
|
"learning_rate": 5.067568448500652e-06, |
|
"loss": 0.1842, |
|
"step": 67300 |
|
}, |
|
{ |
|
"epoch": 5.4921773142112125, |
|
"grad_norm": 12.857218742370605, |
|
"learning_rate": 5.058604954367667e-06, |
|
"loss": 0.2051, |
|
"step": 67400 |
|
}, |
|
{ |
|
"epoch": 5.500325945241199, |
|
"grad_norm": 9.391901969909668, |
|
"learning_rate": 5.0496414602346814e-06, |
|
"loss": 0.21, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 5.508474576271187, |
|
"grad_norm": 11.181014060974121, |
|
"learning_rate": 5.0406779661016945e-06, |
|
"loss": 0.207, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 5.516623207301174, |
|
"grad_norm": 11.92509651184082, |
|
"learning_rate": 5.031714471968709e-06, |
|
"loss": 0.191, |
|
"step": 67700 |
|
}, |
|
{ |
|
"epoch": 5.52477183833116, |
|
"grad_norm": 11.574973106384277, |
|
"learning_rate": 5.022750977835723e-06, |
|
"loss": 0.2046, |
|
"step": 67800 |
|
}, |
|
{ |
|
"epoch": 5.532920469361147, |
|
"grad_norm": 10.391201972961426, |
|
"learning_rate": 5.013787483702736e-06, |
|
"loss": 0.1916, |
|
"step": 67900 |
|
}, |
|
{ |
|
"epoch": 5.541069100391134, |
|
"grad_norm": 11.391480445861816, |
|
"learning_rate": 5.004823989569753e-06, |
|
"loss": 0.2191, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 5.5492177314211215, |
|
"grad_norm": 4.8835015296936035, |
|
"learning_rate": 4.995860495436767e-06, |
|
"loss": 0.1916, |
|
"step": 68100 |
|
}, |
|
{ |
|
"epoch": 5.557366362451108, |
|
"grad_norm": 15.419282913208008, |
|
"learning_rate": 4.986897001303781e-06, |
|
"loss": 0.2146, |
|
"step": 68200 |
|
}, |
|
{ |
|
"epoch": 5.565514993481095, |
|
"grad_norm": 7.127319812774658, |
|
"learning_rate": 4.977933507170796e-06, |
|
"loss": 0.2111, |
|
"step": 68300 |
|
}, |
|
{ |
|
"epoch": 5.573663624511082, |
|
"grad_norm": 12.268532752990723, |
|
"learning_rate": 4.96897001303781e-06, |
|
"loss": 0.1966, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 5.581812255541069, |
|
"grad_norm": 8.935240745544434, |
|
"learning_rate": 4.9600065189048235e-06, |
|
"loss": 0.2063, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 5.589960886571056, |
|
"grad_norm": 7.016481876373291, |
|
"learning_rate": 4.9510430247718375e-06, |
|
"loss": 0.2156, |
|
"step": 68600 |
|
}, |
|
{ |
|
"epoch": 5.598109517601043, |
|
"grad_norm": 12.118579864501953, |
|
"learning_rate": 4.942079530638852e-06, |
|
"loss": 0.2162, |
|
"step": 68700 |
|
}, |
|
{ |
|
"epoch": 5.60625814863103, |
|
"grad_norm": 15.719651222229004, |
|
"learning_rate": 4.933116036505867e-06, |
|
"loss": 0.1974, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 5.614406779661017, |
|
"grad_norm": 1.1470224857330322, |
|
"learning_rate": 4.924152542372882e-06, |
|
"loss": 0.1947, |
|
"step": 68900 |
|
}, |
|
{ |
|
"epoch": 5.622555410691004, |
|
"grad_norm": 9.488736152648926, |
|
"learning_rate": 4.915189048239896e-06, |
|
"loss": 0.207, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 5.630704041720991, |
|
"grad_norm": 11.961145401000977, |
|
"learning_rate": 4.90622555410691e-06, |
|
"loss": 0.2143, |
|
"step": 69100 |
|
}, |
|
{ |
|
"epoch": 5.638852672750978, |
|
"grad_norm": 4.816403388977051, |
|
"learning_rate": 4.897262059973925e-06, |
|
"loss": 0.2119, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 5.647001303780964, |
|
"grad_norm": 9.472099304199219, |
|
"learning_rate": 4.888298565840939e-06, |
|
"loss": 0.1907, |
|
"step": 69300 |
|
}, |
|
{ |
|
"epoch": 5.655149934810952, |
|
"grad_norm": 12.95845890045166, |
|
"learning_rate": 4.8793350717079525e-06, |
|
"loss": 0.2015, |
|
"step": 69400 |
|
}, |
|
{ |
|
"epoch": 5.663298565840939, |
|
"grad_norm": 17.84122657775879, |
|
"learning_rate": 4.8703715775749665e-06, |
|
"loss": 0.1871, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 5.6714471968709255, |
|
"grad_norm": 12.41882610321045, |
|
"learning_rate": 4.861408083441982e-06, |
|
"loss": 0.2017, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 5.679595827900913, |
|
"grad_norm": 2.849573850631714, |
|
"learning_rate": 4.852444589308996e-06, |
|
"loss": 0.1933, |
|
"step": 69700 |
|
}, |
|
{ |
|
"epoch": 5.6877444589309, |
|
"grad_norm": 8.450750350952148, |
|
"learning_rate": 4.843481095176011e-06, |
|
"loss": 0.2051, |
|
"step": 69800 |
|
}, |
|
{ |
|
"epoch": 5.695893089960887, |
|
"grad_norm": 2.862558126449585, |
|
"learning_rate": 4.834517601043025e-06, |
|
"loss": 0.217, |
|
"step": 69900 |
|
}, |
|
{ |
|
"epoch": 5.704041720990873, |
|
"grad_norm": 7.673337936401367, |
|
"learning_rate": 4.825554106910039e-06, |
|
"loss": 0.1976, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 5.71219035202086, |
|
"grad_norm": 12.854650497436523, |
|
"learning_rate": 4.816590612777053e-06, |
|
"loss": 0.2092, |
|
"step": 70100 |
|
}, |
|
{ |
|
"epoch": 5.720338983050848, |
|
"grad_norm": 14.666707038879395, |
|
"learning_rate": 4.807627118644068e-06, |
|
"loss": 0.2147, |
|
"step": 70200 |
|
}, |
|
{ |
|
"epoch": 5.7284876140808345, |
|
"grad_norm": 12.781336784362793, |
|
"learning_rate": 4.7986636245110815e-06, |
|
"loss": 0.2145, |
|
"step": 70300 |
|
}, |
|
{ |
|
"epoch": 5.736636245110821, |
|
"grad_norm": 15.079524993896484, |
|
"learning_rate": 4.789700130378097e-06, |
|
"loss": 0.1991, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 5.744784876140808, |
|
"grad_norm": 10.632801055908203, |
|
"learning_rate": 4.780736636245111e-06, |
|
"loss": 0.2046, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 5.752933507170796, |
|
"grad_norm": 4.691617488861084, |
|
"learning_rate": 4.771773142112125e-06, |
|
"loss": 0.2143, |
|
"step": 70600 |
|
}, |
|
{ |
|
"epoch": 5.761082138200782, |
|
"grad_norm": 5.9435648918151855, |
|
"learning_rate": 4.76280964797914e-06, |
|
"loss": 0.211, |
|
"step": 70700 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 4.910902500152588, |
|
"learning_rate": 4.753846153846154e-06, |
|
"loss": 0.2491, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 5.777379400260756, |
|
"grad_norm": 6.0848708152771, |
|
"learning_rate": 4.744882659713168e-06, |
|
"loss": 0.1952, |
|
"step": 70900 |
|
}, |
|
{ |
|
"epoch": 5.7855280312907436, |
|
"grad_norm": 7.89244270324707, |
|
"learning_rate": 4.735919165580182e-06, |
|
"loss": 0.2118, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 5.79367666232073, |
|
"grad_norm": 3.871981382369995, |
|
"learning_rate": 4.726955671447197e-06, |
|
"loss": 0.2104, |
|
"step": 71100 |
|
}, |
|
{ |
|
"epoch": 5.801825293350717, |
|
"grad_norm": 13.02302074432373, |
|
"learning_rate": 4.717992177314211e-06, |
|
"loss": 0.203, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 5.809973924380704, |
|
"grad_norm": 6.594838619232178, |
|
"learning_rate": 4.709028683181226e-06, |
|
"loss": 0.2154, |
|
"step": 71300 |
|
}, |
|
{ |
|
"epoch": 5.818122555410691, |
|
"grad_norm": 12.507122993469238, |
|
"learning_rate": 4.70006518904824e-06, |
|
"loss": 0.223, |
|
"step": 71400 |
|
}, |
|
{ |
|
"epoch": 5.826271186440678, |
|
"grad_norm": 3.6633384227752686, |
|
"learning_rate": 4.691101694915254e-06, |
|
"loss": 0.2032, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 5.834419817470665, |
|
"grad_norm": 9.926380157470703, |
|
"learning_rate": 4.682138200782268e-06, |
|
"loss": 0.2046, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 5.842568448500652, |
|
"grad_norm": 14.155366897583008, |
|
"learning_rate": 4.673174706649283e-06, |
|
"loss": 0.1853, |
|
"step": 71700 |
|
}, |
|
{ |
|
"epoch": 5.8507170795306385, |
|
"grad_norm": 9.108920097351074, |
|
"learning_rate": 4.664211212516297e-06, |
|
"loss": 0.2252, |
|
"step": 71800 |
|
}, |
|
{ |
|
"epoch": 5.858865710560626, |
|
"grad_norm": 7.253769397735596, |
|
"learning_rate": 4.655247718383311e-06, |
|
"loss": 0.196, |
|
"step": 71900 |
|
}, |
|
{ |
|
"epoch": 5.867014341590613, |
|
"grad_norm": 9.013677597045898, |
|
"learning_rate": 4.6462842242503264e-06, |
|
"loss": 0.2264, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 5.8751629726206, |
|
"grad_norm": 8.925735473632812, |
|
"learning_rate": 4.63732073011734e-06, |
|
"loss": 0.2142, |
|
"step": 72100 |
|
}, |
|
{ |
|
"epoch": 5.883311603650586, |
|
"grad_norm": 9.083677291870117, |
|
"learning_rate": 4.628357235984355e-06, |
|
"loss": 0.225, |
|
"step": 72200 |
|
}, |
|
{ |
|
"epoch": 5.891460234680574, |
|
"grad_norm": 12.813594818115234, |
|
"learning_rate": 4.619393741851369e-06, |
|
"loss": 0.2093, |
|
"step": 72300 |
|
}, |
|
{ |
|
"epoch": 5.899608865710561, |
|
"grad_norm": 13.768078804016113, |
|
"learning_rate": 4.610430247718383e-06, |
|
"loss": 0.2031, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 5.9077574967405475, |
|
"grad_norm": 8.680143356323242, |
|
"learning_rate": 4.601466753585397e-06, |
|
"loss": 0.2147, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 5.915906127770534, |
|
"grad_norm": 17.846006393432617, |
|
"learning_rate": 4.592503259452412e-06, |
|
"loss": 0.2096, |
|
"step": 72600 |
|
}, |
|
{ |
|
"epoch": 5.924054758800521, |
|
"grad_norm": 8.560224533081055, |
|
"learning_rate": 4.583539765319426e-06, |
|
"loss": 0.1881, |
|
"step": 72700 |
|
}, |
|
{ |
|
"epoch": 5.932203389830509, |
|
"grad_norm": 5.689293384552002, |
|
"learning_rate": 4.5745762711864415e-06, |
|
"loss": 0.1971, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 5.940352020860495, |
|
"grad_norm": 3.36490535736084, |
|
"learning_rate": 4.5656127770534554e-06, |
|
"loss": 0.2107, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 5.948500651890482, |
|
"grad_norm": 6.769710063934326, |
|
"learning_rate": 4.556649282920469e-06, |
|
"loss": 0.2062, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 5.95664928292047, |
|
"grad_norm": 4.0076775550842285, |
|
"learning_rate": 4.547685788787483e-06, |
|
"loss": 0.202, |
|
"step": 73100 |
|
}, |
|
{ |
|
"epoch": 5.9647979139504566, |
|
"grad_norm": 15.15523910522461, |
|
"learning_rate": 4.538722294654498e-06, |
|
"loss": 0.2186, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 5.972946544980443, |
|
"grad_norm": 8.619361877441406, |
|
"learning_rate": 4.529758800521512e-06, |
|
"loss": 0.2087, |
|
"step": 73300 |
|
}, |
|
{ |
|
"epoch": 5.98109517601043, |
|
"grad_norm": 13.271175384521484, |
|
"learning_rate": 4.520795306388526e-06, |
|
"loss": 0.2069, |
|
"step": 73400 |
|
}, |
|
{ |
|
"epoch": 5.989243807040417, |
|
"grad_norm": 10.295662879943848, |
|
"learning_rate": 4.511831812255541e-06, |
|
"loss": 0.2219, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 5.9973924380704045, |
|
"grad_norm": 11.802294731140137, |
|
"learning_rate": 4.502868318122556e-06, |
|
"loss": 0.1998, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.840562248995984, |
|
"eval_loss": 0.6011542677879333, |
|
"eval_runtime": 5.4949, |
|
"eval_samples_per_second": 453.151, |
|
"eval_steps_per_second": 56.78, |
|
"step": 73632 |
|
}, |
|
{ |
|
"epoch": 6.005541069100391, |
|
"grad_norm": 6.388237953186035, |
|
"learning_rate": 4.4939048239895705e-06, |
|
"loss": 0.1839, |
|
"step": 73700 |
|
}, |
|
{ |
|
"epoch": 6.013689700130378, |
|
"grad_norm": 10.506056785583496, |
|
"learning_rate": 4.4849413298565844e-06, |
|
"loss": 0.1734, |
|
"step": 73800 |
|
}, |
|
{ |
|
"epoch": 6.021838331160365, |
|
"grad_norm": 8.927146911621094, |
|
"learning_rate": 4.475977835723598e-06, |
|
"loss": 0.19, |
|
"step": 73900 |
|
}, |
|
{ |
|
"epoch": 6.029986962190352, |
|
"grad_norm": 7.502388000488281, |
|
"learning_rate": 4.467014341590612e-06, |
|
"loss": 0.1928, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 6.038135593220339, |
|
"grad_norm": 3.7142844200134277, |
|
"learning_rate": 4.458050847457627e-06, |
|
"loss": 0.1615, |
|
"step": 74100 |
|
}, |
|
{ |
|
"epoch": 6.046284224250326, |
|
"grad_norm": 8.377524375915527, |
|
"learning_rate": 4.449087353324641e-06, |
|
"loss": 0.1725, |
|
"step": 74200 |
|
}, |
|
{ |
|
"epoch": 6.054432855280313, |
|
"grad_norm": 12.744202613830566, |
|
"learning_rate": 4.440123859191655e-06, |
|
"loss": 0.1858, |
|
"step": 74300 |
|
}, |
|
{ |
|
"epoch": 6.0625814863103, |
|
"grad_norm": 6.460068225860596, |
|
"learning_rate": 4.43116036505867e-06, |
|
"loss": 0.1941, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 6.070730117340287, |
|
"grad_norm": 5.896920204162598, |
|
"learning_rate": 4.422196870925685e-06, |
|
"loss": 0.1656, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 6.078878748370274, |
|
"grad_norm": 4.367869853973389, |
|
"learning_rate": 4.413233376792699e-06, |
|
"loss": 0.1582, |
|
"step": 74600 |
|
}, |
|
{ |
|
"epoch": 6.0870273794002605, |
|
"grad_norm": 18.771028518676758, |
|
"learning_rate": 4.4042698826597134e-06, |
|
"loss": 0.1585, |
|
"step": 74700 |
|
}, |
|
{ |
|
"epoch": 6.095176010430248, |
|
"grad_norm": 9.596067428588867, |
|
"learning_rate": 4.395306388526727e-06, |
|
"loss": 0.1734, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 6.103324641460235, |
|
"grad_norm": 5.74586820602417, |
|
"learning_rate": 4.386342894393741e-06, |
|
"loss": 0.1721, |
|
"step": 74900 |
|
}, |
|
{ |
|
"epoch": 6.111473272490222, |
|
"grad_norm": 14.977295875549316, |
|
"learning_rate": 4.377379400260756e-06, |
|
"loss": 0.1664, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 6.119621903520208, |
|
"grad_norm": 15.801685333251953, |
|
"learning_rate": 4.36841590612777e-06, |
|
"loss": 0.1658, |
|
"step": 75100 |
|
}, |
|
{ |
|
"epoch": 6.127770534550195, |
|
"grad_norm": 9.82263469696045, |
|
"learning_rate": 4.359452411994784e-06, |
|
"loss": 0.1708, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 6.135919165580183, |
|
"grad_norm": 5.305476188659668, |
|
"learning_rate": 4.3504889178618e-06, |
|
"loss": 0.1879, |
|
"step": 75300 |
|
}, |
|
{ |
|
"epoch": 6.1440677966101696, |
|
"grad_norm": 11.85977554321289, |
|
"learning_rate": 4.341525423728814e-06, |
|
"loss": 0.1725, |
|
"step": 75400 |
|
}, |
|
{ |
|
"epoch": 6.152216427640156, |
|
"grad_norm": 12.436184883117676, |
|
"learning_rate": 4.332561929595828e-06, |
|
"loss": 0.1645, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 6.160365058670143, |
|
"grad_norm": 4.3290324211120605, |
|
"learning_rate": 4.3235984354628424e-06, |
|
"loss": 0.1893, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 6.168513689700131, |
|
"grad_norm": 3.737994432449341, |
|
"learning_rate": 4.314634941329856e-06, |
|
"loss": 0.189, |
|
"step": 75700 |
|
}, |
|
{ |
|
"epoch": 6.1766623207301175, |
|
"grad_norm": 7.4377241134643555, |
|
"learning_rate": 4.30567144719687e-06, |
|
"loss": 0.1878, |
|
"step": 75800 |
|
}, |
|
{ |
|
"epoch": 6.184810951760104, |
|
"grad_norm": 16.438753128051758, |
|
"learning_rate": 4.296707953063885e-06, |
|
"loss": 0.1941, |
|
"step": 75900 |
|
}, |
|
{ |
|
"epoch": 6.192959582790091, |
|
"grad_norm": 14.083968162536621, |
|
"learning_rate": 4.287744458930899e-06, |
|
"loss": 0.1814, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 6.201108213820079, |
|
"grad_norm": 12.704230308532715, |
|
"learning_rate": 4.278780964797914e-06, |
|
"loss": 0.1782, |
|
"step": 76100 |
|
}, |
|
{ |
|
"epoch": 6.209256844850065, |
|
"grad_norm": 7.3511481285095215, |
|
"learning_rate": 4.269817470664929e-06, |
|
"loss": 0.1882, |
|
"step": 76200 |
|
}, |
|
{ |
|
"epoch": 6.217405475880052, |
|
"grad_norm": 9.752764701843262, |
|
"learning_rate": 4.260853976531943e-06, |
|
"loss": 0.1596, |
|
"step": 76300 |
|
}, |
|
{ |
|
"epoch": 6.225554106910039, |
|
"grad_norm": 9.257549285888672, |
|
"learning_rate": 4.251890482398957e-06, |
|
"loss": 0.1569, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 6.2337027379400265, |
|
"grad_norm": 11.098052978515625, |
|
"learning_rate": 4.2429269882659714e-06, |
|
"loss": 0.1709, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 6.241851368970013, |
|
"grad_norm": 14.279288291931152, |
|
"learning_rate": 4.233963494132985e-06, |
|
"loss": 0.1708, |
|
"step": 76600 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 10.652061462402344, |
|
"learning_rate": 4.224999999999999e-06, |
|
"loss": 0.1838, |
|
"step": 76700 |
|
}, |
|
{ |
|
"epoch": 6.258148631029987, |
|
"grad_norm": 19.59062385559082, |
|
"learning_rate": 4.216036505867014e-06, |
|
"loss": 0.1601, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 6.2662972620599735, |
|
"grad_norm": 11.830570220947266, |
|
"learning_rate": 4.207073011734029e-06, |
|
"loss": 0.1888, |
|
"step": 76900 |
|
}, |
|
{ |
|
"epoch": 6.274445893089961, |
|
"grad_norm": 12.277135848999023, |
|
"learning_rate": 4.198109517601043e-06, |
|
"loss": 0.1573, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 6.282594524119948, |
|
"grad_norm": 12.045758247375488, |
|
"learning_rate": 4.189146023468058e-06, |
|
"loss": 0.2031, |
|
"step": 77100 |
|
}, |
|
{ |
|
"epoch": 6.290743155149935, |
|
"grad_norm": 9.055584907531738, |
|
"learning_rate": 4.180182529335072e-06, |
|
"loss": 0.1797, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 6.298891786179921, |
|
"grad_norm": 16.008596420288086, |
|
"learning_rate": 4.171219035202086e-06, |
|
"loss": 0.1665, |
|
"step": 77300 |
|
}, |
|
{ |
|
"epoch": 6.307040417209909, |
|
"grad_norm": 2.769876480102539, |
|
"learning_rate": 4.1622555410691004e-06, |
|
"loss": 0.1788, |
|
"step": 77400 |
|
}, |
|
{ |
|
"epoch": 6.315189048239896, |
|
"grad_norm": 13.493385314941406, |
|
"learning_rate": 4.153292046936114e-06, |
|
"loss": 0.1727, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 6.3233376792698825, |
|
"grad_norm": 17.533010482788086, |
|
"learning_rate": 4.144328552803128e-06, |
|
"loss": 0.1961, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 6.331486310299869, |
|
"grad_norm": 9.357499122619629, |
|
"learning_rate": 4.135365058670144e-06, |
|
"loss": 0.1772, |
|
"step": 77700 |
|
}, |
|
{ |
|
"epoch": 6.339634941329857, |
|
"grad_norm": 13.053030967712402, |
|
"learning_rate": 4.126401564537158e-06, |
|
"loss": 0.1685, |
|
"step": 77800 |
|
}, |
|
{ |
|
"epoch": 6.347783572359844, |
|
"grad_norm": 7.006876468658447, |
|
"learning_rate": 4.117438070404172e-06, |
|
"loss": 0.17, |
|
"step": 77900 |
|
}, |
|
{ |
|
"epoch": 6.3559322033898304, |
|
"grad_norm": 14.491143226623535, |
|
"learning_rate": 4.108474576271187e-06, |
|
"loss": 0.1748, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 6.364080834419817, |
|
"grad_norm": 6.185390472412109, |
|
"learning_rate": 4.099511082138201e-06, |
|
"loss": 0.1654, |
|
"step": 78100 |
|
}, |
|
{ |
|
"epoch": 6.372229465449805, |
|
"grad_norm": 23.27703857421875, |
|
"learning_rate": 4.090547588005215e-06, |
|
"loss": 0.1704, |
|
"step": 78200 |
|
}, |
|
{ |
|
"epoch": 6.380378096479792, |
|
"grad_norm": 6.468029022216797, |
|
"learning_rate": 4.0815840938722294e-06, |
|
"loss": 0.1781, |
|
"step": 78300 |
|
}, |
|
{ |
|
"epoch": 6.388526727509778, |
|
"grad_norm": 12.720208168029785, |
|
"learning_rate": 4.072620599739243e-06, |
|
"loss": 0.1678, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 6.396675358539765, |
|
"grad_norm": 2.81718373298645, |
|
"learning_rate": 4.063657105606258e-06, |
|
"loss": 0.1785, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 6.404823989569753, |
|
"grad_norm": 18.40774154663086, |
|
"learning_rate": 4.054693611473273e-06, |
|
"loss": 0.1648, |
|
"step": 78600 |
|
}, |
|
{ |
|
"epoch": 6.4129726205997395, |
|
"grad_norm": 4.896127223968506, |
|
"learning_rate": 4.045730117340287e-06, |
|
"loss": 0.1961, |
|
"step": 78700 |
|
}, |
|
{ |
|
"epoch": 6.421121251629726, |
|
"grad_norm": 7.5092387199401855, |
|
"learning_rate": 4.036766623207301e-06, |
|
"loss": 0.1703, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 6.429269882659713, |
|
"grad_norm": 6.800247669219971, |
|
"learning_rate": 4.027803129074316e-06, |
|
"loss": 0.1715, |
|
"step": 78900 |
|
}, |
|
{ |
|
"epoch": 6.4374185136897, |
|
"grad_norm": 12.569574356079102, |
|
"learning_rate": 4.01883963494133e-06, |
|
"loss": 0.1771, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 6.445567144719687, |
|
"grad_norm": 7.636009216308594, |
|
"learning_rate": 4.009876140808344e-06, |
|
"loss": 0.1736, |
|
"step": 79100 |
|
}, |
|
{ |
|
"epoch": 6.453715775749674, |
|
"grad_norm": 7.997006893157959, |
|
"learning_rate": 4.000912646675358e-06, |
|
"loss": 0.169, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 6.461864406779661, |
|
"grad_norm": 7.453688144683838, |
|
"learning_rate": 3.991949152542373e-06, |
|
"loss": 0.1636, |
|
"step": 79300 |
|
}, |
|
{ |
|
"epoch": 6.470013037809648, |
|
"grad_norm": 9.414899826049805, |
|
"learning_rate": 3.982985658409387e-06, |
|
"loss": 0.1678, |
|
"step": 79400 |
|
}, |
|
{ |
|
"epoch": 6.478161668839635, |
|
"grad_norm": 13.018781661987305, |
|
"learning_rate": 3.974022164276402e-06, |
|
"loss": 0.1668, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 6.486310299869622, |
|
"grad_norm": 10.45935344696045, |
|
"learning_rate": 3.965058670143416e-06, |
|
"loss": 0.1866, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 6.494458930899609, |
|
"grad_norm": 19.200838088989258, |
|
"learning_rate": 3.95609517601043e-06, |
|
"loss": 0.2034, |
|
"step": 79700 |
|
}, |
|
{ |
|
"epoch": 6.5026075619295955, |
|
"grad_norm": 15.825751304626465, |
|
"learning_rate": 3.947131681877445e-06, |
|
"loss": 0.1656, |
|
"step": 79800 |
|
}, |
|
{ |
|
"epoch": 6.510756192959583, |
|
"grad_norm": 2.095191240310669, |
|
"learning_rate": 3.938168187744459e-06, |
|
"loss": 0.1774, |
|
"step": 79900 |
|
}, |
|
{ |
|
"epoch": 6.51890482398957, |
|
"grad_norm": 3.431582450866699, |
|
"learning_rate": 3.929204693611473e-06, |
|
"loss": 0.1812, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 6.527053455019557, |
|
"grad_norm": 10.681953430175781, |
|
"learning_rate": 3.920241199478488e-06, |
|
"loss": 0.1935, |
|
"step": 80100 |
|
}, |
|
{ |
|
"epoch": 6.5352020860495434, |
|
"grad_norm": 15.143997192382812, |
|
"learning_rate": 3.911277705345502e-06, |
|
"loss": 0.1792, |
|
"step": 80200 |
|
}, |
|
{ |
|
"epoch": 6.54335071707953, |
|
"grad_norm": 7.197531223297119, |
|
"learning_rate": 3.902314211212517e-06, |
|
"loss": 0.1718, |
|
"step": 80300 |
|
}, |
|
{ |
|
"epoch": 6.551499348109518, |
|
"grad_norm": 6.244622707366943, |
|
"learning_rate": 3.893350717079531e-06, |
|
"loss": 0.1591, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 6.559647979139505, |
|
"grad_norm": 8.115144729614258, |
|
"learning_rate": 3.884387222946545e-06, |
|
"loss": 0.1619, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 6.567796610169491, |
|
"grad_norm": 4.486932754516602, |
|
"learning_rate": 3.875423728813559e-06, |
|
"loss": 0.192, |
|
"step": 80600 |
|
}, |
|
{ |
|
"epoch": 6.575945241199479, |
|
"grad_norm": 12.3147611618042, |
|
"learning_rate": 3.866460234680574e-06, |
|
"loss": 0.1869, |
|
"step": 80700 |
|
}, |
|
{ |
|
"epoch": 6.584093872229466, |
|
"grad_norm": 8.49996280670166, |
|
"learning_rate": 3.857496740547588e-06, |
|
"loss": 0.1689, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 6.5922425032594525, |
|
"grad_norm": 19.81438446044922, |
|
"learning_rate": 3.848533246414603e-06, |
|
"loss": 0.1918, |
|
"step": 80900 |
|
}, |
|
{ |
|
"epoch": 6.600391134289439, |
|
"grad_norm": 8.60556697845459, |
|
"learning_rate": 3.839569752281617e-06, |
|
"loss": 0.1951, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 6.608539765319426, |
|
"grad_norm": 11.23202133178711, |
|
"learning_rate": 3.830606258148631e-06, |
|
"loss": 0.1531, |
|
"step": 81100 |
|
}, |
|
{ |
|
"epoch": 6.616688396349414, |
|
"grad_norm": 6.247717380523682, |
|
"learning_rate": 3.821642764015645e-06, |
|
"loss": 0.17, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 6.6248370273794, |
|
"grad_norm": 5.568502902984619, |
|
"learning_rate": 3.8126792698826596e-06, |
|
"loss": 0.1765, |
|
"step": 81300 |
|
}, |
|
{ |
|
"epoch": 6.632985658409387, |
|
"grad_norm": 9.164602279663086, |
|
"learning_rate": 3.803715775749674e-06, |
|
"loss": 0.1467, |
|
"step": 81400 |
|
}, |
|
{ |
|
"epoch": 6.641134289439374, |
|
"grad_norm": 26.45659065246582, |
|
"learning_rate": 3.7947522816166884e-06, |
|
"loss": 0.1778, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 6.6492829204693615, |
|
"grad_norm": 15.123627662658691, |
|
"learning_rate": 3.7857887874837023e-06, |
|
"loss": 0.171, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 6.657431551499348, |
|
"grad_norm": 14.836517333984375, |
|
"learning_rate": 3.7768252933507175e-06, |
|
"loss": 0.175, |
|
"step": 81700 |
|
}, |
|
{ |
|
"epoch": 6.665580182529335, |
|
"grad_norm": 7.995541572570801, |
|
"learning_rate": 3.767861799217732e-06, |
|
"loss": 0.2045, |
|
"step": 81800 |
|
}, |
|
{ |
|
"epoch": 6.673728813559322, |
|
"grad_norm": 15.063085556030273, |
|
"learning_rate": 3.758898305084746e-06, |
|
"loss": 0.1797, |
|
"step": 81900 |
|
}, |
|
{ |
|
"epoch": 6.681877444589309, |
|
"grad_norm": 8.916457176208496, |
|
"learning_rate": 3.7499348109517603e-06, |
|
"loss": 0.1525, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 6.690026075619296, |
|
"grad_norm": 8.451703071594238, |
|
"learning_rate": 3.7409713168187746e-06, |
|
"loss": 0.1755, |
|
"step": 82100 |
|
}, |
|
{ |
|
"epoch": 6.698174706649283, |
|
"grad_norm": 7.143068790435791, |
|
"learning_rate": 3.7320078226857886e-06, |
|
"loss": 0.1613, |
|
"step": 82200 |
|
}, |
|
{ |
|
"epoch": 6.70632333767927, |
|
"grad_norm": 12.28380012512207, |
|
"learning_rate": 3.723044328552803e-06, |
|
"loss": 0.1797, |
|
"step": 82300 |
|
}, |
|
{ |
|
"epoch": 6.7144719687092564, |
|
"grad_norm": 7.404759407043457, |
|
"learning_rate": 3.714080834419817e-06, |
|
"loss": 0.193, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 6.722620599739244, |
|
"grad_norm": 14.824822425842285, |
|
"learning_rate": 3.705117340286832e-06, |
|
"loss": 0.1669, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 6.730769230769231, |
|
"grad_norm": 8.087990760803223, |
|
"learning_rate": 3.6961538461538465e-06, |
|
"loss": 0.1909, |
|
"step": 82600 |
|
}, |
|
{ |
|
"epoch": 6.738917861799218, |
|
"grad_norm": 8.700764656066895, |
|
"learning_rate": 3.687190352020861e-06, |
|
"loss": 0.1634, |
|
"step": 82700 |
|
}, |
|
{ |
|
"epoch": 6.747066492829204, |
|
"grad_norm": 15.996380805969238, |
|
"learning_rate": 3.678226857887875e-06, |
|
"loss": 0.1715, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 6.755215123859192, |
|
"grad_norm": 4.605205059051514, |
|
"learning_rate": 3.6692633637548893e-06, |
|
"loss": 0.1875, |
|
"step": 82900 |
|
}, |
|
{ |
|
"epoch": 6.763363754889179, |
|
"grad_norm": 12.218436241149902, |
|
"learning_rate": 3.660299869621903e-06, |
|
"loss": 0.1471, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 6.7715123859191655, |
|
"grad_norm": 7.971900939941406, |
|
"learning_rate": 3.6513363754889176e-06, |
|
"loss": 0.1613, |
|
"step": 83100 |
|
}, |
|
{ |
|
"epoch": 6.779661016949152, |
|
"grad_norm": 15.262982368469238, |
|
"learning_rate": 3.6423728813559315e-06, |
|
"loss": 0.1711, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 6.78780964797914, |
|
"grad_norm": 14.941751480102539, |
|
"learning_rate": 3.633409387222946e-06, |
|
"loss": 0.17, |
|
"step": 83300 |
|
}, |
|
{ |
|
"epoch": 6.795958279009127, |
|
"grad_norm": 18.507015228271484, |
|
"learning_rate": 3.624445893089961e-06, |
|
"loss": 0.1555, |
|
"step": 83400 |
|
}, |
|
{ |
|
"epoch": 6.804106910039113, |
|
"grad_norm": 9.77978801727295, |
|
"learning_rate": 3.6154823989569755e-06, |
|
"loss": 0.189, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 6.8122555410691, |
|
"grad_norm": 6.868803977966309, |
|
"learning_rate": 3.6065189048239895e-06, |
|
"loss": 0.1675, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 6.820404172099087, |
|
"grad_norm": 7.371277809143066, |
|
"learning_rate": 3.597555410691004e-06, |
|
"loss": 0.196, |
|
"step": 83700 |
|
}, |
|
{ |
|
"epoch": 6.8285528031290745, |
|
"grad_norm": 2.901946783065796, |
|
"learning_rate": 3.5885919165580187e-06, |
|
"loss": 0.1852, |
|
"step": 83800 |
|
}, |
|
{ |
|
"epoch": 6.836701434159061, |
|
"grad_norm": 15.448030471801758, |
|
"learning_rate": 3.5796284224250322e-06, |
|
"loss": 0.1678, |
|
"step": 83900 |
|
}, |
|
{ |
|
"epoch": 6.844850065189048, |
|
"grad_norm": 9.891990661621094, |
|
"learning_rate": 3.570664928292046e-06, |
|
"loss": 0.1885, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 6.852998696219036, |
|
"grad_norm": 13.63137149810791, |
|
"learning_rate": 3.5617014341590605e-06, |
|
"loss": 0.1902, |
|
"step": 84100 |
|
}, |
|
{ |
|
"epoch": 6.861147327249022, |
|
"grad_norm": 4.282588481903076, |
|
"learning_rate": 3.5527379400260758e-06, |
|
"loss": 0.1888, |
|
"step": 84200 |
|
}, |
|
{ |
|
"epoch": 6.869295958279009, |
|
"grad_norm": 7.632056713104248, |
|
"learning_rate": 3.54377444589309e-06, |
|
"loss": 0.1703, |
|
"step": 84300 |
|
}, |
|
{ |
|
"epoch": 6.877444589308996, |
|
"grad_norm": 2.5631256103515625, |
|
"learning_rate": 3.534810951760105e-06, |
|
"loss": 0.1863, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 6.885593220338983, |
|
"grad_norm": 11.351937294006348, |
|
"learning_rate": 3.5258474576271185e-06, |
|
"loss": 0.1753, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 6.89374185136897, |
|
"grad_norm": 9.421881675720215, |
|
"learning_rate": 3.5168839634941325e-06, |
|
"loss": 0.1752, |
|
"step": 84600 |
|
}, |
|
{ |
|
"epoch": 6.901890482398957, |
|
"grad_norm": 7.395664691925049, |
|
"learning_rate": 3.507920469361147e-06, |
|
"loss": 0.1636, |
|
"step": 84700 |
|
}, |
|
{ |
|
"epoch": 6.910039113428944, |
|
"grad_norm": 10.84174633026123, |
|
"learning_rate": 3.4989569752281612e-06, |
|
"loss": 0.1781, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 6.918187744458931, |
|
"grad_norm": 6.427243709564209, |
|
"learning_rate": 3.4899934810951756e-06, |
|
"loss": 0.1751, |
|
"step": 84900 |
|
}, |
|
{ |
|
"epoch": 6.926336375488918, |
|
"grad_norm": 5.036814212799072, |
|
"learning_rate": 3.4810299869621912e-06, |
|
"loss": 0.1758, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 6.934485006518905, |
|
"grad_norm": 12.158060073852539, |
|
"learning_rate": 3.4720664928292048e-06, |
|
"loss": 0.1846, |
|
"step": 85100 |
|
}, |
|
{ |
|
"epoch": 6.942633637548892, |
|
"grad_norm": 9.157272338867188, |
|
"learning_rate": 3.4631029986962187e-06, |
|
"loss": 0.1807, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 6.9507822685788785, |
|
"grad_norm": 19.283143997192383, |
|
"learning_rate": 3.4541395045632335e-06, |
|
"loss": 0.1778, |
|
"step": 85300 |
|
}, |
|
{ |
|
"epoch": 6.958930899608866, |
|
"grad_norm": 25.128320693969727, |
|
"learning_rate": 3.4451760104302475e-06, |
|
"loss": 0.1785, |
|
"step": 85400 |
|
}, |
|
{ |
|
"epoch": 6.967079530638853, |
|
"grad_norm": 16.265281677246094, |
|
"learning_rate": 3.436212516297262e-06, |
|
"loss": 0.1577, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 6.97522816166884, |
|
"grad_norm": 26.714344024658203, |
|
"learning_rate": 3.427249022164276e-06, |
|
"loss": 0.1859, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 6.983376792698826, |
|
"grad_norm": 6.852247714996338, |
|
"learning_rate": 3.4182855280312906e-06, |
|
"loss": 0.1724, |
|
"step": 85700 |
|
}, |
|
{ |
|
"epoch": 6.991525423728813, |
|
"grad_norm": 12.064892768859863, |
|
"learning_rate": 3.409322033898305e-06, |
|
"loss": 0.1944, |
|
"step": 85800 |
|
}, |
|
{ |
|
"epoch": 6.999674054758801, |
|
"grad_norm": 8.689165115356445, |
|
"learning_rate": 3.40035853976532e-06, |
|
"loss": 0.1737, |
|
"step": 85900 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.8393574297188755, |
|
"eval_loss": 0.636776864528656, |
|
"eval_runtime": 5.4695, |
|
"eval_samples_per_second": 455.252, |
|
"eval_steps_per_second": 57.044, |
|
"step": 85904 |
|
}, |
|
{ |
|
"epoch": 7.0078226857887875, |
|
"grad_norm": 7.449398517608643, |
|
"learning_rate": 3.3913950456323338e-06, |
|
"loss": 0.1534, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 7.015971316818774, |
|
"grad_norm": 4.026369571685791, |
|
"learning_rate": 3.382431551499348e-06, |
|
"loss": 0.1476, |
|
"step": 86100 |
|
}, |
|
{ |
|
"epoch": 7.024119947848761, |
|
"grad_norm": 12.344401359558105, |
|
"learning_rate": 3.373468057366362e-06, |
|
"loss": 0.1352, |
|
"step": 86200 |
|
}, |
|
{ |
|
"epoch": 7.032268578878749, |
|
"grad_norm": 2.723181962966919, |
|
"learning_rate": 3.364504563233377e-06, |
|
"loss": 0.1449, |
|
"step": 86300 |
|
}, |
|
{ |
|
"epoch": 7.040417209908735, |
|
"grad_norm": 13.284101486206055, |
|
"learning_rate": 3.355541069100391e-06, |
|
"loss": 0.1424, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 7.048565840938722, |
|
"grad_norm": 7.289649963378906, |
|
"learning_rate": 3.3465775749674053e-06, |
|
"loss": 0.1556, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 7.056714471968709, |
|
"grad_norm": 11.415511131286621, |
|
"learning_rate": 3.3376140808344196e-06, |
|
"loss": 0.1537, |
|
"step": 86600 |
|
}, |
|
{ |
|
"epoch": 7.064863102998697, |
|
"grad_norm": 10.688644409179688, |
|
"learning_rate": 3.3286505867014344e-06, |
|
"loss": 0.1539, |
|
"step": 86700 |
|
}, |
|
{ |
|
"epoch": 7.073011734028683, |
|
"grad_norm": 0.7387442588806152, |
|
"learning_rate": 3.319687092568449e-06, |
|
"loss": 0.152, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 7.08116036505867, |
|
"grad_norm": 11.108467102050781, |
|
"learning_rate": 3.3107235984354628e-06, |
|
"loss": 0.1561, |
|
"step": 86900 |
|
}, |
|
{ |
|
"epoch": 7.089308996088657, |
|
"grad_norm": 3.977663040161133, |
|
"learning_rate": 3.301760104302477e-06, |
|
"loss": 0.1588, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 7.0974576271186445, |
|
"grad_norm": 15.751721382141113, |
|
"learning_rate": 3.2927966101694915e-06, |
|
"loss": 0.1555, |
|
"step": 87100 |
|
}, |
|
{ |
|
"epoch": 7.105606258148631, |
|
"grad_norm": 8.952390670776367, |
|
"learning_rate": 3.2838331160365055e-06, |
|
"loss": 0.1501, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 7.113754889178618, |
|
"grad_norm": 8.698328971862793, |
|
"learning_rate": 3.274869621903519e-06, |
|
"loss": 0.1579, |
|
"step": 87300 |
|
}, |
|
{ |
|
"epoch": 7.121903520208605, |
|
"grad_norm": 6.493157386779785, |
|
"learning_rate": 3.265906127770535e-06, |
|
"loss": 0.1585, |
|
"step": 87400 |
|
}, |
|
{ |
|
"epoch": 7.130052151238592, |
|
"grad_norm": 12.06603717803955, |
|
"learning_rate": 3.256942633637549e-06, |
|
"loss": 0.1498, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 7.138200782268579, |
|
"grad_norm": 8.994619369506836, |
|
"learning_rate": 3.2479791395045634e-06, |
|
"loss": 0.1561, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 7.146349413298566, |
|
"grad_norm": 8.27117919921875, |
|
"learning_rate": 3.239015645371578e-06, |
|
"loss": 0.1532, |
|
"step": 87700 |
|
}, |
|
{ |
|
"epoch": 7.154498044328553, |
|
"grad_norm": 14.190399169921875, |
|
"learning_rate": 3.2300521512385918e-06, |
|
"loss": 0.1447, |
|
"step": 87800 |
|
}, |
|
{ |
|
"epoch": 7.162646675358539, |
|
"grad_norm": 5.721526145935059, |
|
"learning_rate": 3.221088657105606e-06, |
|
"loss": 0.1525, |
|
"step": 87900 |
|
}, |
|
{ |
|
"epoch": 7.170795306388527, |
|
"grad_norm": 9.777070999145508, |
|
"learning_rate": 3.21212516297262e-06, |
|
"loss": 0.154, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 7.178943937418514, |
|
"grad_norm": 9.652997016906738, |
|
"learning_rate": 3.2031616688396345e-06, |
|
"loss": 0.1634, |
|
"step": 88100 |
|
}, |
|
{ |
|
"epoch": 7.1870925684485005, |
|
"grad_norm": 18.047399520874023, |
|
"learning_rate": 3.1941981747066497e-06, |
|
"loss": 0.1412, |
|
"step": 88200 |
|
}, |
|
{ |
|
"epoch": 7.195241199478487, |
|
"grad_norm": 28.265060424804688, |
|
"learning_rate": 3.185234680573664e-06, |
|
"loss": 0.1419, |
|
"step": 88300 |
|
}, |
|
{ |
|
"epoch": 7.203389830508475, |
|
"grad_norm": 13.160205841064453, |
|
"learning_rate": 3.176271186440678e-06, |
|
"loss": 0.1427, |
|
"step": 88400 |
|
}, |
|
{ |
|
"epoch": 7.211538461538462, |
|
"grad_norm": 10.281450271606445, |
|
"learning_rate": 3.1673076923076924e-06, |
|
"loss": 0.1447, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 7.219687092568448, |
|
"grad_norm": 13.10381031036377, |
|
"learning_rate": 3.1583441981747064e-06, |
|
"loss": 0.1358, |
|
"step": 88600 |
|
}, |
|
{ |
|
"epoch": 7.227835723598435, |
|
"grad_norm": 6.493006706237793, |
|
"learning_rate": 3.1493807040417208e-06, |
|
"loss": 0.1317, |
|
"step": 88700 |
|
}, |
|
{ |
|
"epoch": 7.235984354628423, |
|
"grad_norm": 3.7273976802825928, |
|
"learning_rate": 3.1404172099087347e-06, |
|
"loss": 0.1464, |
|
"step": 88800 |
|
}, |
|
{ |
|
"epoch": 7.24413298565841, |
|
"grad_norm": 17.353458404541016, |
|
"learning_rate": 3.131453715775749e-06, |
|
"loss": 0.1413, |
|
"step": 88900 |
|
}, |
|
{ |
|
"epoch": 7.252281616688396, |
|
"grad_norm": 17.094566345214844, |
|
"learning_rate": 3.1224902216427643e-06, |
|
"loss": 0.163, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 7.260430247718383, |
|
"grad_norm": 17.34813117980957, |
|
"learning_rate": 3.1135267275097787e-06, |
|
"loss": 0.1454, |
|
"step": 89100 |
|
}, |
|
{ |
|
"epoch": 7.26857887874837, |
|
"grad_norm": 2.7473742961883545, |
|
"learning_rate": 3.1045632333767927e-06, |
|
"loss": 0.1515, |
|
"step": 89200 |
|
}, |
|
{ |
|
"epoch": 7.2767275097783575, |
|
"grad_norm": 10.833642959594727, |
|
"learning_rate": 3.095599739243807e-06, |
|
"loss": 0.1445, |
|
"step": 89300 |
|
}, |
|
{ |
|
"epoch": 7.284876140808344, |
|
"grad_norm": 10.2700777053833, |
|
"learning_rate": 3.086636245110822e-06, |
|
"loss": 0.1595, |
|
"step": 89400 |
|
}, |
|
{ |
|
"epoch": 7.293024771838331, |
|
"grad_norm": 7.091324329376221, |
|
"learning_rate": 3.0776727509778354e-06, |
|
"loss": 0.1601, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 7.301173402868318, |
|
"grad_norm": 14.033100128173828, |
|
"learning_rate": 3.0687092568448494e-06, |
|
"loss": 0.1565, |
|
"step": 89600 |
|
}, |
|
{ |
|
"epoch": 7.309322033898305, |
|
"grad_norm": 1.7879611253738403, |
|
"learning_rate": 3.0597457627118637e-06, |
|
"loss": 0.1334, |
|
"step": 89700 |
|
}, |
|
{ |
|
"epoch": 7.317470664928292, |
|
"grad_norm": 15.966545104980469, |
|
"learning_rate": 3.05078226857888e-06, |
|
"loss": 0.1591, |
|
"step": 89800 |
|
}, |
|
{ |
|
"epoch": 7.325619295958279, |
|
"grad_norm": 17.925477981567383, |
|
"learning_rate": 3.0418187744458933e-06, |
|
"loss": 0.1611, |
|
"step": 89900 |
|
}, |
|
{ |
|
"epoch": 7.333767926988266, |
|
"grad_norm": 28.116106033325195, |
|
"learning_rate": 3.032855280312908e-06, |
|
"loss": 0.1397, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 7.341916558018253, |
|
"grad_norm": 22.7088565826416, |
|
"learning_rate": 3.0238917861799217e-06, |
|
"loss": 0.1345, |
|
"step": 90100 |
|
}, |
|
{ |
|
"epoch": 7.35006518904824, |
|
"grad_norm": 7.538867473602295, |
|
"learning_rate": 3.0149282920469356e-06, |
|
"loss": 0.1332, |
|
"step": 90200 |
|
}, |
|
{ |
|
"epoch": 7.358213820078227, |
|
"grad_norm": 15.904698371887207, |
|
"learning_rate": 3.00596479791395e-06, |
|
"loss": 0.1734, |
|
"step": 90300 |
|
}, |
|
{ |
|
"epoch": 7.3663624511082135, |
|
"grad_norm": 17.251426696777344, |
|
"learning_rate": 2.9970013037809644e-06, |
|
"loss": 0.1524, |
|
"step": 90400 |
|
}, |
|
{ |
|
"epoch": 7.374511082138201, |
|
"grad_norm": 18.56187629699707, |
|
"learning_rate": 2.9880378096479788e-06, |
|
"loss": 0.1484, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 7.382659713168188, |
|
"grad_norm": 9.472875595092773, |
|
"learning_rate": 2.9790743155149944e-06, |
|
"loss": 0.1491, |
|
"step": 90600 |
|
}, |
|
{ |
|
"epoch": 7.390808344198175, |
|
"grad_norm": 11.260825157165527, |
|
"learning_rate": 2.970110821382008e-06, |
|
"loss": 0.1522, |
|
"step": 90700 |
|
}, |
|
{ |
|
"epoch": 7.398956975228161, |
|
"grad_norm": 23.518583297729492, |
|
"learning_rate": 2.961147327249022e-06, |
|
"loss": 0.1507, |
|
"step": 90800 |
|
}, |
|
{ |
|
"epoch": 7.407105606258149, |
|
"grad_norm": 6.331607818603516, |
|
"learning_rate": 2.9521838331160367e-06, |
|
"loss": 0.1562, |
|
"step": 90900 |
|
}, |
|
{ |
|
"epoch": 7.415254237288136, |
|
"grad_norm": 23.582700729370117, |
|
"learning_rate": 2.9432203389830507e-06, |
|
"loss": 0.1432, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 7.423402868318123, |
|
"grad_norm": 14.635252952575684, |
|
"learning_rate": 2.934256844850065e-06, |
|
"loss": 0.1496, |
|
"step": 91100 |
|
}, |
|
{ |
|
"epoch": 7.431551499348109, |
|
"grad_norm": 8.255620002746582, |
|
"learning_rate": 2.925293350717079e-06, |
|
"loss": 0.1602, |
|
"step": 91200 |
|
}, |
|
{ |
|
"epoch": 7.439700130378096, |
|
"grad_norm": 3.5353052616119385, |
|
"learning_rate": 2.916329856584094e-06, |
|
"loss": 0.1508, |
|
"step": 91300 |
|
}, |
|
{ |
|
"epoch": 7.447848761408084, |
|
"grad_norm": 10.22754192352295, |
|
"learning_rate": 2.9073663624511078e-06, |
|
"loss": 0.1417, |
|
"step": 91400 |
|
}, |
|
{ |
|
"epoch": 7.4559973924380705, |
|
"grad_norm": 8.802057266235352, |
|
"learning_rate": 2.898402868318123e-06, |
|
"loss": 0.1555, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 7.464146023468057, |
|
"grad_norm": 7.474407196044922, |
|
"learning_rate": 2.889439374185137e-06, |
|
"loss": 0.1658, |
|
"step": 91600 |
|
}, |
|
{ |
|
"epoch": 7.472294654498044, |
|
"grad_norm": 18.78350830078125, |
|
"learning_rate": 2.8804758800521513e-06, |
|
"loss": 0.1482, |
|
"step": 91700 |
|
}, |
|
{ |
|
"epoch": 7.480443285528032, |
|
"grad_norm": 8.468660354614258, |
|
"learning_rate": 2.8715123859191653e-06, |
|
"loss": 0.1704, |
|
"step": 91800 |
|
}, |
|
{ |
|
"epoch": 7.488591916558018, |
|
"grad_norm": 0.4872637391090393, |
|
"learning_rate": 2.8625488917861797e-06, |
|
"loss": 0.1515, |
|
"step": 91900 |
|
}, |
|
{ |
|
"epoch": 7.496740547588005, |
|
"grad_norm": 8.285701751708984, |
|
"learning_rate": 2.853585397653194e-06, |
|
"loss": 0.1685, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 7.504889178617992, |
|
"grad_norm": 19.152118682861328, |
|
"learning_rate": 2.8446219035202084e-06, |
|
"loss": 0.1515, |
|
"step": 92100 |
|
}, |
|
{ |
|
"epoch": 7.5130378096479795, |
|
"grad_norm": 22.134498596191406, |
|
"learning_rate": 2.8356584093872224e-06, |
|
"loss": 0.1412, |
|
"step": 92200 |
|
}, |
|
{ |
|
"epoch": 7.521186440677966, |
|
"grad_norm": 22.581012725830078, |
|
"learning_rate": 2.8266949152542376e-06, |
|
"loss": 0.155, |
|
"step": 92300 |
|
}, |
|
{ |
|
"epoch": 7.529335071707953, |
|
"grad_norm": 9.176002502441406, |
|
"learning_rate": 2.817731421121252e-06, |
|
"loss": 0.154, |
|
"step": 92400 |
|
}, |
|
{ |
|
"epoch": 7.53748370273794, |
|
"grad_norm": 12.96983814239502, |
|
"learning_rate": 2.808767926988266e-06, |
|
"loss": 0.1656, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 7.5456323337679265, |
|
"grad_norm": 6.628359317779541, |
|
"learning_rate": 2.7998044328552803e-06, |
|
"loss": 0.1413, |
|
"step": 92600 |
|
}, |
|
{ |
|
"epoch": 7.553780964797914, |
|
"grad_norm": 4.591996192932129, |
|
"learning_rate": 2.7908409387222947e-06, |
|
"loss": 0.1544, |
|
"step": 92700 |
|
}, |
|
{ |
|
"epoch": 7.561929595827901, |
|
"grad_norm": 13.306089401245117, |
|
"learning_rate": 2.7818774445893087e-06, |
|
"loss": 0.1523, |
|
"step": 92800 |
|
}, |
|
{ |
|
"epoch": 7.570078226857888, |
|
"grad_norm": 13.552392959594727, |
|
"learning_rate": 2.772913950456323e-06, |
|
"loss": 0.1417, |
|
"step": 92900 |
|
}, |
|
{ |
|
"epoch": 7.578226857887875, |
|
"grad_norm": 10.100014686584473, |
|
"learning_rate": 2.763950456323337e-06, |
|
"loss": 0.1491, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 7.586375488917862, |
|
"grad_norm": 13.423697471618652, |
|
"learning_rate": 2.7549869621903523e-06, |
|
"loss": 0.1638, |
|
"step": 93100 |
|
}, |
|
{ |
|
"epoch": 7.594524119947849, |
|
"grad_norm": 16.027891159057617, |
|
"learning_rate": 2.7460234680573666e-06, |
|
"loss": 0.1548, |
|
"step": 93200 |
|
}, |
|
{ |
|
"epoch": 7.602672750977836, |
|
"grad_norm": 15.648063659667969, |
|
"learning_rate": 2.737059973924381e-06, |
|
"loss": 0.1601, |
|
"step": 93300 |
|
}, |
|
{ |
|
"epoch": 7.610821382007822, |
|
"grad_norm": 17.28276824951172, |
|
"learning_rate": 2.728096479791395e-06, |
|
"loss": 0.1455, |
|
"step": 93400 |
|
}, |
|
{ |
|
"epoch": 7.61897001303781, |
|
"grad_norm": 1.7760809659957886, |
|
"learning_rate": 2.7191329856584093e-06, |
|
"loss": 0.1523, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 7.627118644067797, |
|
"grad_norm": 12.409880638122559, |
|
"learning_rate": 2.7101694915254233e-06, |
|
"loss": 0.17, |
|
"step": 93600 |
|
}, |
|
{ |
|
"epoch": 7.6352672750977835, |
|
"grad_norm": 8.9781494140625, |
|
"learning_rate": 2.7012059973924377e-06, |
|
"loss": 0.1628, |
|
"step": 93700 |
|
}, |
|
{ |
|
"epoch": 7.64341590612777, |
|
"grad_norm": 11.374709129333496, |
|
"learning_rate": 2.6922425032594516e-06, |
|
"loss": 0.1584, |
|
"step": 93800 |
|
}, |
|
{ |
|
"epoch": 7.651564537157758, |
|
"grad_norm": 8.572677612304688, |
|
"learning_rate": 2.6832790091264673e-06, |
|
"loss": 0.1581, |
|
"step": 93900 |
|
}, |
|
{ |
|
"epoch": 7.659713168187745, |
|
"grad_norm": 15.630659103393555, |
|
"learning_rate": 2.6743155149934813e-06, |
|
"loss": 0.1547, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 7.667861799217731, |
|
"grad_norm": 2.3736350536346436, |
|
"learning_rate": 2.6653520208604956e-06, |
|
"loss": 0.1588, |
|
"step": 94100 |
|
}, |
|
{ |
|
"epoch": 7.676010430247718, |
|
"grad_norm": 6.438554286956787, |
|
"learning_rate": 2.6563885267275096e-06, |
|
"loss": 0.1483, |
|
"step": 94200 |
|
}, |
|
{ |
|
"epoch": 7.684159061277706, |
|
"grad_norm": 3.4226784706115723, |
|
"learning_rate": 2.647425032594524e-06, |
|
"loss": 0.1543, |
|
"step": 94300 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 0.7672721147537231, |
|
"learning_rate": 2.638461538461538e-06, |
|
"loss": 0.1519, |
|
"step": 94400 |
|
}, |
|
{ |
|
"epoch": 7.700456323337679, |
|
"grad_norm": 14.213494300842285, |
|
"learning_rate": 2.6294980443285523e-06, |
|
"loss": 0.1406, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 7.708604954367666, |
|
"grad_norm": 0.7124353647232056, |
|
"learning_rate": 2.6205345501955663e-06, |
|
"loss": 0.144, |
|
"step": 94600 |
|
}, |
|
{ |
|
"epoch": 7.716753585397653, |
|
"grad_norm": 23.662931442260742, |
|
"learning_rate": 2.611571056062582e-06, |
|
"loss": 0.1617, |
|
"step": 94700 |
|
}, |
|
{ |
|
"epoch": 7.72490221642764, |
|
"grad_norm": 11.896281242370605, |
|
"learning_rate": 2.602607561929596e-06, |
|
"loss": 0.1404, |
|
"step": 94800 |
|
}, |
|
{ |
|
"epoch": 7.733050847457627, |
|
"grad_norm": 21.615901947021484, |
|
"learning_rate": 2.5936440677966103e-06, |
|
"loss": 0.156, |
|
"step": 94900 |
|
}, |
|
{ |
|
"epoch": 7.741199478487614, |
|
"grad_norm": 1.9456912279129028, |
|
"learning_rate": 2.584680573663625e-06, |
|
"loss": 0.1591, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 7.749348109517601, |
|
"grad_norm": 12.462078094482422, |
|
"learning_rate": 2.5757170795306386e-06, |
|
"loss": 0.1598, |
|
"step": 95100 |
|
}, |
|
{ |
|
"epoch": 7.757496740547588, |
|
"grad_norm": 6.921799182891846, |
|
"learning_rate": 2.5667535853976525e-06, |
|
"loss": 0.1479, |
|
"step": 95200 |
|
}, |
|
{ |
|
"epoch": 7.765645371577575, |
|
"grad_norm": 5.263434410095215, |
|
"learning_rate": 2.557790091264667e-06, |
|
"loss": 0.1522, |
|
"step": 95300 |
|
}, |
|
{ |
|
"epoch": 7.773794002607562, |
|
"grad_norm": 6.944530963897705, |
|
"learning_rate": 2.5488265971316813e-06, |
|
"loss": 0.1598, |
|
"step": 95400 |
|
}, |
|
{ |
|
"epoch": 7.781942633637549, |
|
"grad_norm": 13.571086883544922, |
|
"learning_rate": 2.5398631029986965e-06, |
|
"loss": 0.1611, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 7.790091264667536, |
|
"grad_norm": 17.36863899230957, |
|
"learning_rate": 2.5308996088657113e-06, |
|
"loss": 0.153, |
|
"step": 95600 |
|
}, |
|
{ |
|
"epoch": 7.798239895697523, |
|
"grad_norm": 8.735814094543457, |
|
"learning_rate": 2.521936114732725e-06, |
|
"loss": 0.138, |
|
"step": 95700 |
|
}, |
|
{ |
|
"epoch": 7.80638852672751, |
|
"grad_norm": 15.301424980163574, |
|
"learning_rate": 2.512972620599739e-06, |
|
"loss": 0.1572, |
|
"step": 95800 |
|
}, |
|
{ |
|
"epoch": 7.8145371577574965, |
|
"grad_norm": 15.560503005981445, |
|
"learning_rate": 2.5040091264667536e-06, |
|
"loss": 0.1446, |
|
"step": 95900 |
|
}, |
|
{ |
|
"epoch": 7.822685788787483, |
|
"grad_norm": 16.19012451171875, |
|
"learning_rate": 2.4950456323337676e-06, |
|
"loss": 0.1742, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 7.830834419817471, |
|
"grad_norm": 4.544064998626709, |
|
"learning_rate": 2.486082138200782e-06, |
|
"loss": 0.1516, |
|
"step": 96100 |
|
}, |
|
{ |
|
"epoch": 7.838983050847458, |
|
"grad_norm": 5.693359375, |
|
"learning_rate": 2.477118644067796e-06, |
|
"loss": 0.1588, |
|
"step": 96200 |
|
}, |
|
{ |
|
"epoch": 7.847131681877444, |
|
"grad_norm": 14.035545349121094, |
|
"learning_rate": 2.468155149934811e-06, |
|
"loss": 0.1373, |
|
"step": 96300 |
|
}, |
|
{ |
|
"epoch": 7.855280312907432, |
|
"grad_norm": 15.699686050415039, |
|
"learning_rate": 2.4591916558018255e-06, |
|
"loss": 0.143, |
|
"step": 96400 |
|
}, |
|
{ |
|
"epoch": 7.863428943937419, |
|
"grad_norm": 17.58596420288086, |
|
"learning_rate": 2.45022816166884e-06, |
|
"loss": 0.1504, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 7.8715775749674055, |
|
"grad_norm": 12.520322799682617, |
|
"learning_rate": 2.441264667535854e-06, |
|
"loss": 0.1642, |
|
"step": 96600 |
|
}, |
|
{ |
|
"epoch": 7.879726205997392, |
|
"grad_norm": 6.03079080581665, |
|
"learning_rate": 2.4323011734028683e-06, |
|
"loss": 0.1493, |
|
"step": 96700 |
|
}, |
|
{ |
|
"epoch": 7.887874837027379, |
|
"grad_norm": 11.687211990356445, |
|
"learning_rate": 2.423337679269882e-06, |
|
"loss": 0.1567, |
|
"step": 96800 |
|
}, |
|
{ |
|
"epoch": 7.896023468057367, |
|
"grad_norm": 16.79334259033203, |
|
"learning_rate": 2.4143741851368966e-06, |
|
"loss": 0.1635, |
|
"step": 96900 |
|
}, |
|
{ |
|
"epoch": 7.904172099087353, |
|
"grad_norm": 8.044095039367676, |
|
"learning_rate": 2.405410691003911e-06, |
|
"loss": 0.1485, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 7.91232073011734, |
|
"grad_norm": 10.20647144317627, |
|
"learning_rate": 2.396447196870926e-06, |
|
"loss": 0.163, |
|
"step": 97100 |
|
}, |
|
{ |
|
"epoch": 7.920469361147327, |
|
"grad_norm": 7.316319465637207, |
|
"learning_rate": 2.38748370273794e-06, |
|
"loss": 0.165, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 7.9286179921773146, |
|
"grad_norm": 6.296102523803711, |
|
"learning_rate": 2.3785202086049545e-06, |
|
"loss": 0.1644, |
|
"step": 97300 |
|
}, |
|
{ |
|
"epoch": 7.936766623207301, |
|
"grad_norm": 7.60091495513916, |
|
"learning_rate": 2.369556714471969e-06, |
|
"loss": 0.1559, |
|
"step": 97400 |
|
}, |
|
{ |
|
"epoch": 7.944915254237288, |
|
"grad_norm": 0.3591231405735016, |
|
"learning_rate": 2.360593220338983e-06, |
|
"loss": 0.1637, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 7.953063885267275, |
|
"grad_norm": 13.358650207519531, |
|
"learning_rate": 2.3516297262059973e-06, |
|
"loss": 0.1576, |
|
"step": 97600 |
|
}, |
|
{ |
|
"epoch": 7.9612125162972625, |
|
"grad_norm": 3.420527696609497, |
|
"learning_rate": 2.3426662320730112e-06, |
|
"loss": 0.1362, |
|
"step": 97700 |
|
}, |
|
{ |
|
"epoch": 7.969361147327249, |
|
"grad_norm": 14.245848655700684, |
|
"learning_rate": 2.3337027379400256e-06, |
|
"loss": 0.1487, |
|
"step": 97800 |
|
}, |
|
{ |
|
"epoch": 7.977509778357236, |
|
"grad_norm": 12.376141548156738, |
|
"learning_rate": 2.324739243807041e-06, |
|
"loss": 0.14, |
|
"step": 97900 |
|
}, |
|
{ |
|
"epoch": 7.985658409387223, |
|
"grad_norm": 0.6416822671890259, |
|
"learning_rate": 2.315775749674055e-06, |
|
"loss": 0.1541, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 7.9938070404172095, |
|
"grad_norm": 10.687934875488281, |
|
"learning_rate": 2.306812255541069e-06, |
|
"loss": 0.1485, |
|
"step": 98100 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8417670682730923, |
|
"eval_loss": 0.6814994215965271, |
|
"eval_runtime": 5.5348, |
|
"eval_samples_per_second": 449.877, |
|
"eval_steps_per_second": 56.37, |
|
"step": 98176 |
|
}, |
|
{ |
|
"epoch": 8.001955671447197, |
|
"grad_norm": 1.1167025566101074, |
|
"learning_rate": 2.2978487614080835e-06, |
|
"loss": 0.1672, |
|
"step": 98200 |
|
}, |
|
{ |
|
"epoch": 8.010104302477183, |
|
"grad_norm": 18.145008087158203, |
|
"learning_rate": 2.2888852672750975e-06, |
|
"loss": 0.116, |
|
"step": 98300 |
|
}, |
|
{ |
|
"epoch": 8.01825293350717, |
|
"grad_norm": 9.317021369934082, |
|
"learning_rate": 2.279921773142112e-06, |
|
"loss": 0.138, |
|
"step": 98400 |
|
}, |
|
{ |
|
"epoch": 8.026401564537158, |
|
"grad_norm": 9.0396089553833, |
|
"learning_rate": 2.2709582790091263e-06, |
|
"loss": 0.1428, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 8.034550195567144, |
|
"grad_norm": 13.613592147827148, |
|
"learning_rate": 2.2619947848761402e-06, |
|
"loss": 0.1222, |
|
"step": 98600 |
|
}, |
|
{ |
|
"epoch": 8.042698826597132, |
|
"grad_norm": 8.281913757324219, |
|
"learning_rate": 2.2530312907431554e-06, |
|
"loss": 0.1309, |
|
"step": 98700 |
|
}, |
|
{ |
|
"epoch": 8.05084745762712, |
|
"grad_norm": 2.797579050064087, |
|
"learning_rate": 2.24406779661017e-06, |
|
"loss": 0.1313, |
|
"step": 98800 |
|
}, |
|
{ |
|
"epoch": 8.058996088657105, |
|
"grad_norm": 13.423441886901855, |
|
"learning_rate": 2.235104302477184e-06, |
|
"loss": 0.1223, |
|
"step": 98900 |
|
}, |
|
{ |
|
"epoch": 8.067144719687093, |
|
"grad_norm": 2.5191502571105957, |
|
"learning_rate": 2.226140808344198e-06, |
|
"loss": 0.1175, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 8.075293350717079, |
|
"grad_norm": 9.729893684387207, |
|
"learning_rate": 2.2171773142112125e-06, |
|
"loss": 0.1281, |
|
"step": 99100 |
|
}, |
|
{ |
|
"epoch": 8.083441981747066, |
|
"grad_norm": 10.935778617858887, |
|
"learning_rate": 2.2082138200782265e-06, |
|
"loss": 0.1274, |
|
"step": 99200 |
|
}, |
|
{ |
|
"epoch": 8.091590612777054, |
|
"grad_norm": 14.475391387939453, |
|
"learning_rate": 2.199250325945241e-06, |
|
"loss": 0.1434, |
|
"step": 99300 |
|
}, |
|
{ |
|
"epoch": 8.09973924380704, |
|
"grad_norm": 5.676713466644287, |
|
"learning_rate": 2.190286831812255e-06, |
|
"loss": 0.1282, |
|
"step": 99400 |
|
}, |
|
{ |
|
"epoch": 8.107887874837028, |
|
"grad_norm": 3.881793260574341, |
|
"learning_rate": 2.1813233376792705e-06, |
|
"loss": 0.113, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 8.116036505867015, |
|
"grad_norm": 18.958267211914062, |
|
"learning_rate": 2.1723598435462844e-06, |
|
"loss": 0.1462, |
|
"step": 99600 |
|
}, |
|
{ |
|
"epoch": 8.124185136897001, |
|
"grad_norm": 3.9751124382019043, |
|
"learning_rate": 2.163396349413299e-06, |
|
"loss": 0.1304, |
|
"step": 99700 |
|
}, |
|
{ |
|
"epoch": 8.132333767926989, |
|
"grad_norm": 20.424734115600586, |
|
"learning_rate": 2.1544328552803128e-06, |
|
"loss": 0.1509, |
|
"step": 99800 |
|
}, |
|
{ |
|
"epoch": 8.140482398956975, |
|
"grad_norm": 17.55602264404297, |
|
"learning_rate": 2.145469361147327e-06, |
|
"loss": 0.1458, |
|
"step": 99900 |
|
}, |
|
{ |
|
"epoch": 8.148631029986962, |
|
"grad_norm": 9.35766315460205, |
|
"learning_rate": 2.1365058670143415e-06, |
|
"loss": 0.1335, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 8.15677966101695, |
|
"grad_norm": 0.5167970657348633, |
|
"learning_rate": 2.1275423728813555e-06, |
|
"loss": 0.1336, |
|
"step": 100100 |
|
}, |
|
{ |
|
"epoch": 8.164928292046936, |
|
"grad_norm": 9.043828964233398, |
|
"learning_rate": 2.11857887874837e-06, |
|
"loss": 0.1242, |
|
"step": 100200 |
|
}, |
|
{ |
|
"epoch": 8.173076923076923, |
|
"grad_norm": 8.647052764892578, |
|
"learning_rate": 2.109615384615384e-06, |
|
"loss": 0.131, |
|
"step": 100300 |
|
}, |
|
{ |
|
"epoch": 8.18122555410691, |
|
"grad_norm": 16.887643814086914, |
|
"learning_rate": 2.1006518904823995e-06, |
|
"loss": 0.1475, |
|
"step": 100400 |
|
}, |
|
{ |
|
"epoch": 8.189374185136897, |
|
"grad_norm": 24.23810386657715, |
|
"learning_rate": 2.0916883963494134e-06, |
|
"loss": 0.1256, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 8.197522816166884, |
|
"grad_norm": 13.14070987701416, |
|
"learning_rate": 2.082724902216428e-06, |
|
"loss": 0.137, |
|
"step": 100600 |
|
}, |
|
{ |
|
"epoch": 8.20567144719687, |
|
"grad_norm": 6.199899673461914, |
|
"learning_rate": 2.0737614080834418e-06, |
|
"loss": 0.1363, |
|
"step": 100700 |
|
}, |
|
{ |
|
"epoch": 8.213820078226858, |
|
"grad_norm": 3.194542646408081, |
|
"learning_rate": 2.064797913950456e-06, |
|
"loss": 0.1584, |
|
"step": 100800 |
|
}, |
|
{ |
|
"epoch": 8.221968709256846, |
|
"grad_norm": 16.422868728637695, |
|
"learning_rate": 2.05583441981747e-06, |
|
"loss": 0.1464, |
|
"step": 100900 |
|
}, |
|
{ |
|
"epoch": 8.230117340286832, |
|
"grad_norm": 18.549264907836914, |
|
"learning_rate": 2.0468709256844845e-06, |
|
"loss": 0.1392, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 8.23826597131682, |
|
"grad_norm": 3.2117178440093994, |
|
"learning_rate": 2.037907431551499e-06, |
|
"loss": 0.1431, |
|
"step": 101100 |
|
}, |
|
{ |
|
"epoch": 8.246414602346805, |
|
"grad_norm": 23.774818420410156, |
|
"learning_rate": 2.028943937418514e-06, |
|
"loss": 0.1308, |
|
"step": 101200 |
|
}, |
|
{ |
|
"epoch": 8.254563233376793, |
|
"grad_norm": 1.8825786113739014, |
|
"learning_rate": 2.019980443285528e-06, |
|
"loss": 0.1409, |
|
"step": 101300 |
|
}, |
|
{ |
|
"epoch": 8.26271186440678, |
|
"grad_norm": 4.661303520202637, |
|
"learning_rate": 2.0110169491525424e-06, |
|
"loss": 0.1357, |
|
"step": 101400 |
|
}, |
|
{ |
|
"epoch": 8.270860495436766, |
|
"grad_norm": 18.980335235595703, |
|
"learning_rate": 2.002053455019557e-06, |
|
"loss": 0.1446, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 8.279009126466754, |
|
"grad_norm": 9.78714656829834, |
|
"learning_rate": 1.9930899608865708e-06, |
|
"loss": 0.1197, |
|
"step": 101600 |
|
}, |
|
{ |
|
"epoch": 8.28715775749674, |
|
"grad_norm": 9.437408447265625, |
|
"learning_rate": 1.984126466753585e-06, |
|
"loss": 0.1296, |
|
"step": 101700 |
|
}, |
|
{ |
|
"epoch": 8.295306388526727, |
|
"grad_norm": 11.14543628692627, |
|
"learning_rate": 1.9751629726205995e-06, |
|
"loss": 0.1402, |
|
"step": 101800 |
|
}, |
|
{ |
|
"epoch": 8.303455019556715, |
|
"grad_norm": 10.319045066833496, |
|
"learning_rate": 1.9661994784876135e-06, |
|
"loss": 0.1278, |
|
"step": 101900 |
|
}, |
|
{ |
|
"epoch": 8.3116036505867, |
|
"grad_norm": 20.33431053161621, |
|
"learning_rate": 1.9572359843546287e-06, |
|
"loss": 0.1667, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 8.319752281616688, |
|
"grad_norm": 6.3067145347595215, |
|
"learning_rate": 1.948272490221643e-06, |
|
"loss": 0.1346, |
|
"step": 102100 |
|
}, |
|
{ |
|
"epoch": 8.327900912646676, |
|
"grad_norm": 15.225107192993164, |
|
"learning_rate": 1.9393089960886575e-06, |
|
"loss": 0.1464, |
|
"step": 102200 |
|
}, |
|
{ |
|
"epoch": 8.336049543676662, |
|
"grad_norm": 17.693836212158203, |
|
"learning_rate": 1.9303455019556714e-06, |
|
"loss": 0.1479, |
|
"step": 102300 |
|
}, |
|
{ |
|
"epoch": 8.34419817470665, |
|
"grad_norm": 28.224361419677734, |
|
"learning_rate": 1.921382007822686e-06, |
|
"loss": 0.1315, |
|
"step": 102400 |
|
}, |
|
{ |
|
"epoch": 8.352346805736635, |
|
"grad_norm": 1.448736548423767, |
|
"learning_rate": 1.9124185136896998e-06, |
|
"loss": 0.1433, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 8.360495436766623, |
|
"grad_norm": 24.642520904541016, |
|
"learning_rate": 1.9034550195567142e-06, |
|
"loss": 0.1155, |
|
"step": 102600 |
|
}, |
|
{ |
|
"epoch": 8.36864406779661, |
|
"grad_norm": 18.223909378051758, |
|
"learning_rate": 1.8944915254237281e-06, |
|
"loss": 0.147, |
|
"step": 102700 |
|
}, |
|
{ |
|
"epoch": 8.376792698826597, |
|
"grad_norm": 14.59261417388916, |
|
"learning_rate": 1.8855280312907438e-06, |
|
"loss": 0.1158, |
|
"step": 102800 |
|
}, |
|
{ |
|
"epoch": 8.384941329856584, |
|
"grad_norm": 7.161222457885742, |
|
"learning_rate": 1.8765645371577575e-06, |
|
"loss": 0.1316, |
|
"step": 102900 |
|
}, |
|
{ |
|
"epoch": 8.393089960886572, |
|
"grad_norm": 6.764099597930908, |
|
"learning_rate": 1.8676010430247721e-06, |
|
"loss": 0.1377, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 8.401238591916558, |
|
"grad_norm": 3.5225605964660645, |
|
"learning_rate": 1.858637548891786e-06, |
|
"loss": 0.1354, |
|
"step": 103100 |
|
}, |
|
{ |
|
"epoch": 8.409387222946545, |
|
"grad_norm": 28.121694564819336, |
|
"learning_rate": 1.8496740547588004e-06, |
|
"loss": 0.1351, |
|
"step": 103200 |
|
}, |
|
{ |
|
"epoch": 8.417535853976531, |
|
"grad_norm": 8.749202728271484, |
|
"learning_rate": 1.840710560625815e-06, |
|
"loss": 0.1318, |
|
"step": 103300 |
|
}, |
|
{ |
|
"epoch": 8.425684485006519, |
|
"grad_norm": 18.74785804748535, |
|
"learning_rate": 1.8317470664928288e-06, |
|
"loss": 0.1453, |
|
"step": 103400 |
|
}, |
|
{ |
|
"epoch": 8.433833116036507, |
|
"grad_norm": 29.5967960357666, |
|
"learning_rate": 1.822783572359843e-06, |
|
"loss": 0.1347, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 8.441981747066492, |
|
"grad_norm": 13.819513320922852, |
|
"learning_rate": 1.8138200782268584e-06, |
|
"loss": 0.1453, |
|
"step": 103600 |
|
}, |
|
{ |
|
"epoch": 8.45013037809648, |
|
"grad_norm": 14.125680923461914, |
|
"learning_rate": 1.8048565840938728e-06, |
|
"loss": 0.1325, |
|
"step": 103700 |
|
}, |
|
{ |
|
"epoch": 8.458279009126466, |
|
"grad_norm": 20.32880973815918, |
|
"learning_rate": 1.7958930899608867e-06, |
|
"loss": 0.1489, |
|
"step": 103800 |
|
}, |
|
{ |
|
"epoch": 8.466427640156454, |
|
"grad_norm": 1.8261317014694214, |
|
"learning_rate": 1.7869295958279011e-06, |
|
"loss": 0.1209, |
|
"step": 103900 |
|
}, |
|
{ |
|
"epoch": 8.474576271186441, |
|
"grad_norm": 18.03254508972168, |
|
"learning_rate": 1.777966101694915e-06, |
|
"loss": 0.1549, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 8.482724902216427, |
|
"grad_norm": 14.922330856323242, |
|
"learning_rate": 1.7690026075619292e-06, |
|
"loss": 0.1532, |
|
"step": 104100 |
|
}, |
|
{ |
|
"epoch": 8.490873533246415, |
|
"grad_norm": 8.974449157714844, |
|
"learning_rate": 1.7600391134289436e-06, |
|
"loss": 0.1596, |
|
"step": 104200 |
|
}, |
|
{ |
|
"epoch": 8.499022164276402, |
|
"grad_norm": 13.620256423950195, |
|
"learning_rate": 1.7510756192959578e-06, |
|
"loss": 0.1236, |
|
"step": 104300 |
|
}, |
|
{ |
|
"epoch": 8.507170795306388, |
|
"grad_norm": 9.534696578979492, |
|
"learning_rate": 1.742112125162973e-06, |
|
"loss": 0.1421, |
|
"step": 104400 |
|
}, |
|
{ |
|
"epoch": 8.515319426336376, |
|
"grad_norm": 12.891791343688965, |
|
"learning_rate": 1.7331486310299874e-06, |
|
"loss": 0.1395, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 8.523468057366362, |
|
"grad_norm": 0.4351995587348938, |
|
"learning_rate": 1.7241851368970013e-06, |
|
"loss": 0.1237, |
|
"step": 104600 |
|
}, |
|
{ |
|
"epoch": 8.53161668839635, |
|
"grad_norm": 7.187576770782471, |
|
"learning_rate": 1.7152216427640155e-06, |
|
"loss": 0.1346, |
|
"step": 104700 |
|
}, |
|
{ |
|
"epoch": 8.539765319426337, |
|
"grad_norm": 8.591465950012207, |
|
"learning_rate": 1.7062581486310301e-06, |
|
"loss": 0.1565, |
|
"step": 104800 |
|
}, |
|
{ |
|
"epoch": 8.547913950456323, |
|
"grad_norm": 17.407093048095703, |
|
"learning_rate": 1.697294654498044e-06, |
|
"loss": 0.1426, |
|
"step": 104900 |
|
}, |
|
{ |
|
"epoch": 8.55606258148631, |
|
"grad_norm": 11.454471588134766, |
|
"learning_rate": 1.6883311603650584e-06, |
|
"loss": 0.1436, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 8.564211212516298, |
|
"grad_norm": 21.674543380737305, |
|
"learning_rate": 1.6793676662320724e-06, |
|
"loss": 0.1309, |
|
"step": 105100 |
|
}, |
|
{ |
|
"epoch": 8.572359843546284, |
|
"grad_norm": 16.86087417602539, |
|
"learning_rate": 1.670404172099088e-06, |
|
"loss": 0.1348, |
|
"step": 105200 |
|
}, |
|
{ |
|
"epoch": 8.580508474576272, |
|
"grad_norm": 9.330849647521973, |
|
"learning_rate": 1.6614406779661022e-06, |
|
"loss": 0.1409, |
|
"step": 105300 |
|
}, |
|
{ |
|
"epoch": 8.588657105606258, |
|
"grad_norm": 1.9600093364715576, |
|
"learning_rate": 1.6524771838331164e-06, |
|
"loss": 0.1344, |
|
"step": 105400 |
|
}, |
|
{ |
|
"epoch": 8.596805736636245, |
|
"grad_norm": 3.9729743003845215, |
|
"learning_rate": 1.6435136897001303e-06, |
|
"loss": 0.1456, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 8.604954367666233, |
|
"grad_norm": 13.668156623840332, |
|
"learning_rate": 1.6345501955671447e-06, |
|
"loss": 0.1293, |
|
"step": 105600 |
|
}, |
|
{ |
|
"epoch": 8.613102998696219, |
|
"grad_norm": 13.970529556274414, |
|
"learning_rate": 1.6255867014341587e-06, |
|
"loss": 0.1527, |
|
"step": 105700 |
|
}, |
|
{ |
|
"epoch": 8.621251629726206, |
|
"grad_norm": 8.949882507324219, |
|
"learning_rate": 1.616623207301173e-06, |
|
"loss": 0.1363, |
|
"step": 105800 |
|
}, |
|
{ |
|
"epoch": 8.629400260756192, |
|
"grad_norm": 15.386001586914062, |
|
"learning_rate": 1.6076597131681872e-06, |
|
"loss": 0.1318, |
|
"step": 105900 |
|
}, |
|
{ |
|
"epoch": 8.63754889178618, |
|
"grad_norm": 11.052765846252441, |
|
"learning_rate": 1.5986962190352027e-06, |
|
"loss": 0.1407, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 8.645697522816167, |
|
"grad_norm": 21.316442489624023, |
|
"learning_rate": 1.5897327249022166e-06, |
|
"loss": 0.1389, |
|
"step": 106100 |
|
}, |
|
{ |
|
"epoch": 8.653846153846153, |
|
"grad_norm": 14.646417617797852, |
|
"learning_rate": 1.580769230769231e-06, |
|
"loss": 0.1378, |
|
"step": 106200 |
|
}, |
|
{ |
|
"epoch": 8.661994784876141, |
|
"grad_norm": 20.964698791503906, |
|
"learning_rate": 1.5718057366362454e-06, |
|
"loss": 0.1391, |
|
"step": 106300 |
|
}, |
|
{ |
|
"epoch": 8.670143415906129, |
|
"grad_norm": 8.166078567504883, |
|
"learning_rate": 1.5628422425032593e-06, |
|
"loss": 0.1551, |
|
"step": 106400 |
|
}, |
|
{ |
|
"epoch": 8.678292046936114, |
|
"grad_norm": 20.075531005859375, |
|
"learning_rate": 1.553878748370274e-06, |
|
"loss": 0.1457, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 8.686440677966102, |
|
"grad_norm": 19.019393920898438, |
|
"learning_rate": 1.5449152542372877e-06, |
|
"loss": 0.1311, |
|
"step": 106600 |
|
}, |
|
{ |
|
"epoch": 8.694589308996088, |
|
"grad_norm": 7.855512619018555, |
|
"learning_rate": 1.535951760104302e-06, |
|
"loss": 0.134, |
|
"step": 106700 |
|
}, |
|
{ |
|
"epoch": 8.702737940026076, |
|
"grad_norm": 15.56972885131836, |
|
"learning_rate": 1.5269882659713173e-06, |
|
"loss": 0.1322, |
|
"step": 106800 |
|
}, |
|
{ |
|
"epoch": 8.710886571056063, |
|
"grad_norm": 7.912954807281494, |
|
"learning_rate": 1.5180247718383317e-06, |
|
"loss": 0.1264, |
|
"step": 106900 |
|
}, |
|
{ |
|
"epoch": 8.719035202086049, |
|
"grad_norm": 12.312883377075195, |
|
"learning_rate": 1.5090612777053456e-06, |
|
"loss": 0.1357, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 8.727183833116037, |
|
"grad_norm": 7.040205001831055, |
|
"learning_rate": 1.5000977835723602e-06, |
|
"loss": 0.1411, |
|
"step": 107100 |
|
}, |
|
{ |
|
"epoch": 8.735332464146023, |
|
"grad_norm": 16.882474899291992, |
|
"learning_rate": 1.491134289439374e-06, |
|
"loss": 0.1258, |
|
"step": 107200 |
|
}, |
|
{ |
|
"epoch": 8.74348109517601, |
|
"grad_norm": 13.230895042419434, |
|
"learning_rate": 1.4821707953063883e-06, |
|
"loss": 0.1371, |
|
"step": 107300 |
|
}, |
|
{ |
|
"epoch": 8.751629726205998, |
|
"grad_norm": 1.2307300567626953, |
|
"learning_rate": 1.4732073011734027e-06, |
|
"loss": 0.1303, |
|
"step": 107400 |
|
}, |
|
{ |
|
"epoch": 8.759778357235984, |
|
"grad_norm": 2.0919406414031982, |
|
"learning_rate": 1.4642438070404167e-06, |
|
"loss": 0.1358, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 8.767926988265971, |
|
"grad_norm": 9.384042739868164, |
|
"learning_rate": 1.455280312907432e-06, |
|
"loss": 0.1434, |
|
"step": 107600 |
|
}, |
|
{ |
|
"epoch": 8.776075619295959, |
|
"grad_norm": 3.808473587036133, |
|
"learning_rate": 1.4463168187744465e-06, |
|
"loss": 0.1337, |
|
"step": 107700 |
|
}, |
|
{ |
|
"epoch": 8.784224250325945, |
|
"grad_norm": 9.139209747314453, |
|
"learning_rate": 1.4373533246414607e-06, |
|
"loss": 0.1237, |
|
"step": 107800 |
|
}, |
|
{ |
|
"epoch": 8.792372881355933, |
|
"grad_norm": 23.689847946166992, |
|
"learning_rate": 1.4283898305084744e-06, |
|
"loss": 0.1506, |
|
"step": 107900 |
|
}, |
|
{ |
|
"epoch": 8.800521512385918, |
|
"grad_norm": 17.127538681030273, |
|
"learning_rate": 1.419426336375489e-06, |
|
"loss": 0.1308, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 8.808670143415906, |
|
"grad_norm": 10.87045669555664, |
|
"learning_rate": 1.410462842242503e-06, |
|
"loss": 0.1366, |
|
"step": 108100 |
|
}, |
|
{ |
|
"epoch": 8.816818774445894, |
|
"grad_norm": 8.058269500732422, |
|
"learning_rate": 1.4014993481095174e-06, |
|
"loss": 0.122, |
|
"step": 108200 |
|
}, |
|
{ |
|
"epoch": 8.82496740547588, |
|
"grad_norm": 30.747976303100586, |
|
"learning_rate": 1.392535853976532e-06, |
|
"loss": 0.1476, |
|
"step": 108300 |
|
}, |
|
{ |
|
"epoch": 8.833116036505867, |
|
"grad_norm": 2.1181864738464355, |
|
"learning_rate": 1.383572359843547e-06, |
|
"loss": 0.1389, |
|
"step": 108400 |
|
}, |
|
{ |
|
"epoch": 8.841264667535853, |
|
"grad_norm": 16.122636795043945, |
|
"learning_rate": 1.3746088657105607e-06, |
|
"loss": 0.1494, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 8.84941329856584, |
|
"grad_norm": 1.7896504402160645, |
|
"learning_rate": 1.3656453715775753e-06, |
|
"loss": 0.1429, |
|
"step": 108600 |
|
}, |
|
{ |
|
"epoch": 8.857561929595828, |
|
"grad_norm": 14.50307559967041, |
|
"learning_rate": 1.3566818774445893e-06, |
|
"loss": 0.1391, |
|
"step": 108700 |
|
}, |
|
{ |
|
"epoch": 8.865710560625814, |
|
"grad_norm": 12.862863540649414, |
|
"learning_rate": 1.3477183833116036e-06, |
|
"loss": 0.1449, |
|
"step": 108800 |
|
}, |
|
{ |
|
"epoch": 8.873859191655802, |
|
"grad_norm": 4.946181774139404, |
|
"learning_rate": 1.3387548891786182e-06, |
|
"loss": 0.1547, |
|
"step": 108900 |
|
}, |
|
{ |
|
"epoch": 8.88200782268579, |
|
"grad_norm": 10.275555610656738, |
|
"learning_rate": 1.329791395045632e-06, |
|
"loss": 0.1231, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 8.890156453715775, |
|
"grad_norm": 12.205680847167969, |
|
"learning_rate": 1.3208279009126461e-06, |
|
"loss": 0.1273, |
|
"step": 109100 |
|
}, |
|
{ |
|
"epoch": 8.898305084745763, |
|
"grad_norm": 9.211130142211914, |
|
"learning_rate": 1.3118644067796605e-06, |
|
"loss": 0.1376, |
|
"step": 109200 |
|
}, |
|
{ |
|
"epoch": 8.906453715775749, |
|
"grad_norm": 3.3791849613189697, |
|
"learning_rate": 1.302900912646676e-06, |
|
"loss": 0.1403, |
|
"step": 109300 |
|
}, |
|
{ |
|
"epoch": 8.914602346805736, |
|
"grad_norm": 1.7787587642669678, |
|
"learning_rate": 1.29393741851369e-06, |
|
"loss": 0.1497, |
|
"step": 109400 |
|
}, |
|
{ |
|
"epoch": 8.922750977835724, |
|
"grad_norm": 19.89299964904785, |
|
"learning_rate": 1.2849739243807043e-06, |
|
"loss": 0.1231, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 8.93089960886571, |
|
"grad_norm": 1.886210322380066, |
|
"learning_rate": 1.2760104302477183e-06, |
|
"loss": 0.1336, |
|
"step": 109600 |
|
}, |
|
{ |
|
"epoch": 8.939048239895698, |
|
"grad_norm": 8.365442276000977, |
|
"learning_rate": 1.2670469361147324e-06, |
|
"loss": 0.1502, |
|
"step": 109700 |
|
}, |
|
{ |
|
"epoch": 8.947196870925685, |
|
"grad_norm": 7.866243839263916, |
|
"learning_rate": 1.258083441981747e-06, |
|
"loss": 0.1369, |
|
"step": 109800 |
|
}, |
|
{ |
|
"epoch": 8.955345501955671, |
|
"grad_norm": 13.674020767211914, |
|
"learning_rate": 1.249119947848761e-06, |
|
"loss": 0.1232, |
|
"step": 109900 |
|
}, |
|
{ |
|
"epoch": 8.963494132985659, |
|
"grad_norm": 10.142361640930176, |
|
"learning_rate": 1.2401564537157754e-06, |
|
"loss": 0.123, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 8.971642764015645, |
|
"grad_norm": 9.111577987670898, |
|
"learning_rate": 1.2311929595827906e-06, |
|
"loss": 0.1184, |
|
"step": 110100 |
|
}, |
|
{ |
|
"epoch": 8.979791395045632, |
|
"grad_norm": 5.888847827911377, |
|
"learning_rate": 1.2222294654498045e-06, |
|
"loss": 0.1449, |
|
"step": 110200 |
|
}, |
|
{ |
|
"epoch": 8.98794002607562, |
|
"grad_norm": 11.367358207702637, |
|
"learning_rate": 1.213265971316819e-06, |
|
"loss": 0.1326, |
|
"step": 110300 |
|
}, |
|
{ |
|
"epoch": 8.996088657105606, |
|
"grad_norm": 7.464927673339844, |
|
"learning_rate": 1.2043024771838333e-06, |
|
"loss": 0.1315, |
|
"step": 110400 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.8365461847389558, |
|
"eval_loss": 0.7411776185035706, |
|
"eval_runtime": 5.509, |
|
"eval_samples_per_second": 451.985, |
|
"eval_steps_per_second": 56.634, |
|
"step": 110448 |
|
}, |
|
{ |
|
"epoch": 9.004237288135593, |
|
"grad_norm": 15.54743766784668, |
|
"learning_rate": 1.1953389830508473e-06, |
|
"loss": 0.133, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 9.01238591916558, |
|
"grad_norm": 2.1653244495391846, |
|
"learning_rate": 1.1863754889178616e-06, |
|
"loss": 0.1328, |
|
"step": 110600 |
|
}, |
|
{ |
|
"epoch": 9.020534550195567, |
|
"grad_norm": 0.2608337104320526, |
|
"learning_rate": 1.1774119947848756e-06, |
|
"loss": 0.1013, |
|
"step": 110700 |
|
}, |
|
{ |
|
"epoch": 9.028683181225555, |
|
"grad_norm": 4.152002334594727, |
|
"learning_rate": 1.16844850065189e-06, |
|
"loss": 0.1265, |
|
"step": 110800 |
|
}, |
|
{ |
|
"epoch": 9.03683181225554, |
|
"grad_norm": 6.718245029449463, |
|
"learning_rate": 1.1594850065189052e-06, |
|
"loss": 0.1255, |
|
"step": 110900 |
|
}, |
|
{ |
|
"epoch": 9.044980443285528, |
|
"grad_norm": 19.890954971313477, |
|
"learning_rate": 1.1505215123859196e-06, |
|
"loss": 0.1221, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 9.053129074315516, |
|
"grad_norm": 11.731401443481445, |
|
"learning_rate": 1.1415580182529335e-06, |
|
"loss": 0.127, |
|
"step": 111100 |
|
}, |
|
{ |
|
"epoch": 9.061277705345502, |
|
"grad_norm": 11.024049758911133, |
|
"learning_rate": 1.132594524119948e-06, |
|
"loss": 0.1247, |
|
"step": 111200 |
|
}, |
|
{ |
|
"epoch": 9.06942633637549, |
|
"grad_norm": 20.952821731567383, |
|
"learning_rate": 1.1236310299869623e-06, |
|
"loss": 0.1189, |
|
"step": 111300 |
|
}, |
|
{ |
|
"epoch": 9.077574967405475, |
|
"grad_norm": 20.7772216796875, |
|
"learning_rate": 1.1146675358539763e-06, |
|
"loss": 0.1381, |
|
"step": 111400 |
|
}, |
|
{ |
|
"epoch": 9.085723598435463, |
|
"grad_norm": 6.1934027671813965, |
|
"learning_rate": 1.1057040417209906e-06, |
|
"loss": 0.134, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 9.09387222946545, |
|
"grad_norm": 10.067231178283691, |
|
"learning_rate": 1.0967405475880046e-06, |
|
"loss": 0.1136, |
|
"step": 111600 |
|
}, |
|
{ |
|
"epoch": 9.102020860495436, |
|
"grad_norm": 7.726376533508301, |
|
"learning_rate": 1.0877770534550198e-06, |
|
"loss": 0.1192, |
|
"step": 111700 |
|
}, |
|
{ |
|
"epoch": 9.110169491525424, |
|
"grad_norm": 11.030654907226562, |
|
"learning_rate": 1.0788135593220342e-06, |
|
"loss": 0.1129, |
|
"step": 111800 |
|
}, |
|
{ |
|
"epoch": 9.118318122555412, |
|
"grad_norm": 0.4144195020198822, |
|
"learning_rate": 1.0698500651890486e-06, |
|
"loss": 0.1309, |
|
"step": 111900 |
|
}, |
|
{ |
|
"epoch": 9.126466753585397, |
|
"grad_norm": 2.212282180786133, |
|
"learning_rate": 1.0608865710560625e-06, |
|
"loss": 0.1237, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 9.134615384615385, |
|
"grad_norm": 8.832420349121094, |
|
"learning_rate": 1.051923076923077e-06, |
|
"loss": 0.1244, |
|
"step": 112100 |
|
}, |
|
{ |
|
"epoch": 9.142764015645371, |
|
"grad_norm": 4.031465530395508, |
|
"learning_rate": 1.042959582790091e-06, |
|
"loss": 0.1374, |
|
"step": 112200 |
|
}, |
|
{ |
|
"epoch": 9.150912646675359, |
|
"grad_norm": 4.740699768066406, |
|
"learning_rate": 1.0339960886571053e-06, |
|
"loss": 0.1416, |
|
"step": 112300 |
|
}, |
|
{ |
|
"epoch": 9.159061277705346, |
|
"grad_norm": 8.297629356384277, |
|
"learning_rate": 1.0250325945241194e-06, |
|
"loss": 0.1216, |
|
"step": 112400 |
|
}, |
|
{ |
|
"epoch": 9.167209908735332, |
|
"grad_norm": 20.47978401184082, |
|
"learning_rate": 1.0160691003911349e-06, |
|
"loss": 0.1124, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 9.17535853976532, |
|
"grad_norm": 6.031740665435791, |
|
"learning_rate": 1.0071056062581488e-06, |
|
"loss": 0.1074, |
|
"step": 112600 |
|
}, |
|
{ |
|
"epoch": 9.183507170795306, |
|
"grad_norm": 14.264923095703125, |
|
"learning_rate": 9.981421121251632e-07, |
|
"loss": 0.1197, |
|
"step": 112700 |
|
}, |
|
{ |
|
"epoch": 9.191655801825293, |
|
"grad_norm": 11.424922943115234, |
|
"learning_rate": 9.891786179921774e-07, |
|
"loss": 0.132, |
|
"step": 112800 |
|
}, |
|
{ |
|
"epoch": 9.19980443285528, |
|
"grad_norm": 8.677164077758789, |
|
"learning_rate": 9.802151238591915e-07, |
|
"loss": 0.1338, |
|
"step": 112900 |
|
}, |
|
{ |
|
"epoch": 9.207953063885267, |
|
"grad_norm": 12.700013160705566, |
|
"learning_rate": 9.71251629726206e-07, |
|
"loss": 0.1312, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 9.216101694915254, |
|
"grad_norm": 4.308102607727051, |
|
"learning_rate": 9.622881355932199e-07, |
|
"loss": 0.1241, |
|
"step": 113100 |
|
}, |
|
{ |
|
"epoch": 9.224250325945242, |
|
"grad_norm": 2.213650703430176, |
|
"learning_rate": 9.533246414602343e-07, |
|
"loss": 0.1362, |
|
"step": 113200 |
|
}, |
|
{ |
|
"epoch": 9.232398956975228, |
|
"grad_norm": 30.006113052368164, |
|
"learning_rate": 9.443611473272495e-07, |
|
"loss": 0.129, |
|
"step": 113300 |
|
}, |
|
{ |
|
"epoch": 9.240547588005215, |
|
"grad_norm": 9.764779090881348, |
|
"learning_rate": 9.353976531942638e-07, |
|
"loss": 0.1289, |
|
"step": 113400 |
|
}, |
|
{ |
|
"epoch": 9.248696219035201, |
|
"grad_norm": 3.6789746284484863, |
|
"learning_rate": 9.26434159061278e-07, |
|
"loss": 0.1192, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 9.256844850065189, |
|
"grad_norm": 11.824788093566895, |
|
"learning_rate": 9.17470664928292e-07, |
|
"loss": 0.1263, |
|
"step": 113600 |
|
}, |
|
{ |
|
"epoch": 9.264993481095177, |
|
"grad_norm": 8.798393249511719, |
|
"learning_rate": 9.085071707953064e-07, |
|
"loss": 0.1135, |
|
"step": 113700 |
|
}, |
|
{ |
|
"epoch": 9.273142112125162, |
|
"grad_norm": 2.8183655738830566, |
|
"learning_rate": 8.995436766623205e-07, |
|
"loss": 0.1376, |
|
"step": 113800 |
|
}, |
|
{ |
|
"epoch": 9.28129074315515, |
|
"grad_norm": 3.4520905017852783, |
|
"learning_rate": 8.905801825293347e-07, |
|
"loss": 0.1314, |
|
"step": 113900 |
|
}, |
|
{ |
|
"epoch": 9.289439374185136, |
|
"grad_norm": 10.228925704956055, |
|
"learning_rate": 8.81616688396349e-07, |
|
"loss": 0.1292, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 9.297588005215124, |
|
"grad_norm": 0.910640299320221, |
|
"learning_rate": 8.726531942633642e-07, |
|
"loss": 0.1238, |
|
"step": 114100 |
|
}, |
|
{ |
|
"epoch": 9.305736636245111, |
|
"grad_norm": 1.382246732711792, |
|
"learning_rate": 8.636897001303785e-07, |
|
"loss": 0.1037, |
|
"step": 114200 |
|
}, |
|
{ |
|
"epoch": 9.313885267275097, |
|
"grad_norm": 23.052778244018555, |
|
"learning_rate": 8.547262059973927e-07, |
|
"loss": 0.1124, |
|
"step": 114300 |
|
}, |
|
{ |
|
"epoch": 9.322033898305085, |
|
"grad_norm": 33.121097564697266, |
|
"learning_rate": 8.457627118644068e-07, |
|
"loss": 0.1403, |
|
"step": 114400 |
|
}, |
|
{ |
|
"epoch": 9.330182529335072, |
|
"grad_norm": 13.370987892150879, |
|
"learning_rate": 8.36799217731421e-07, |
|
"loss": 0.1258, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 9.338331160365058, |
|
"grad_norm": 21.812013626098633, |
|
"learning_rate": 8.278357235984353e-07, |
|
"loss": 0.1288, |
|
"step": 114600 |
|
}, |
|
{ |
|
"epoch": 9.346479791395046, |
|
"grad_norm": 2.375351667404175, |
|
"learning_rate": 8.188722294654495e-07, |
|
"loss": 0.1349, |
|
"step": 114700 |
|
}, |
|
{ |
|
"epoch": 9.354628422425032, |
|
"grad_norm": 14.761313438415527, |
|
"learning_rate": 8.099087353324637e-07, |
|
"loss": 0.1334, |
|
"step": 114800 |
|
}, |
|
{ |
|
"epoch": 9.36277705345502, |
|
"grad_norm": 14.228364944458008, |
|
"learning_rate": 8.00945241199479e-07, |
|
"loss": 0.1239, |
|
"step": 114900 |
|
}, |
|
{ |
|
"epoch": 9.370925684485007, |
|
"grad_norm": 7.746068954467773, |
|
"learning_rate": 7.919817470664932e-07, |
|
"loss": 0.1049, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 9.379074315514993, |
|
"grad_norm": 0.5430285334587097, |
|
"learning_rate": 7.830182529335075e-07, |
|
"loss": 0.1293, |
|
"step": 115100 |
|
}, |
|
{ |
|
"epoch": 9.38722294654498, |
|
"grad_norm": 9.228418350219727, |
|
"learning_rate": 7.740547588005217e-07, |
|
"loss": 0.1234, |
|
"step": 115200 |
|
}, |
|
{ |
|
"epoch": 9.395371577574968, |
|
"grad_norm": 22.232402801513672, |
|
"learning_rate": 7.650912646675358e-07, |
|
"loss": 0.141, |
|
"step": 115300 |
|
}, |
|
{ |
|
"epoch": 9.403520208604954, |
|
"grad_norm": 2.0123260021209717, |
|
"learning_rate": 7.5612777053455e-07, |
|
"loss": 0.1063, |
|
"step": 115400 |
|
}, |
|
{ |
|
"epoch": 9.411668839634942, |
|
"grad_norm": 0.9207943081855774, |
|
"learning_rate": 7.471642764015643e-07, |
|
"loss": 0.1377, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 9.419817470664928, |
|
"grad_norm": 7.680292129516602, |
|
"learning_rate": 7.382007822685784e-07, |
|
"loss": 0.1131, |
|
"step": 115600 |
|
}, |
|
{ |
|
"epoch": 9.427966101694915, |
|
"grad_norm": 10.872476577758789, |
|
"learning_rate": 7.292372881355938e-07, |
|
"loss": 0.1266, |
|
"step": 115700 |
|
}, |
|
{ |
|
"epoch": 9.436114732724903, |
|
"grad_norm": 18.4665584564209, |
|
"learning_rate": 7.202737940026079e-07, |
|
"loss": 0.131, |
|
"step": 115800 |
|
}, |
|
{ |
|
"epoch": 9.444263363754889, |
|
"grad_norm": 23.102298736572266, |
|
"learning_rate": 7.113102998696221e-07, |
|
"loss": 0.1099, |
|
"step": 115900 |
|
}, |
|
{ |
|
"epoch": 9.452411994784876, |
|
"grad_norm": 1.6242234706878662, |
|
"learning_rate": 7.023468057366365e-07, |
|
"loss": 0.1231, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 9.460560625814864, |
|
"grad_norm": 8.302433013916016, |
|
"learning_rate": 6.933833116036504e-07, |
|
"loss": 0.1121, |
|
"step": 116100 |
|
}, |
|
{ |
|
"epoch": 9.46870925684485, |
|
"grad_norm": 25.818700790405273, |
|
"learning_rate": 6.844198174706648e-07, |
|
"loss": 0.1175, |
|
"step": 116200 |
|
}, |
|
{ |
|
"epoch": 9.476857887874838, |
|
"grad_norm": 18.826948165893555, |
|
"learning_rate": 6.75456323337679e-07, |
|
"loss": 0.1205, |
|
"step": 116300 |
|
}, |
|
{ |
|
"epoch": 9.485006518904823, |
|
"grad_norm": 0.6583182215690613, |
|
"learning_rate": 6.664928292046933e-07, |
|
"loss": 0.1358, |
|
"step": 116400 |
|
}, |
|
{ |
|
"epoch": 9.493155149934811, |
|
"grad_norm": 12.578688621520996, |
|
"learning_rate": 6.575293350717085e-07, |
|
"loss": 0.1096, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 9.501303780964799, |
|
"grad_norm": 13.54981803894043, |
|
"learning_rate": 6.485658409387227e-07, |
|
"loss": 0.1368, |
|
"step": 116600 |
|
}, |
|
{ |
|
"epoch": 9.509452411994785, |
|
"grad_norm": 6.809188365936279, |
|
"learning_rate": 6.396023468057369e-07, |
|
"loss": 0.1188, |
|
"step": 116700 |
|
}, |
|
{ |
|
"epoch": 9.517601043024772, |
|
"grad_norm": 11.08338737487793, |
|
"learning_rate": 6.306388526727511e-07, |
|
"loss": 0.1203, |
|
"step": 116800 |
|
}, |
|
{ |
|
"epoch": 9.525749674054758, |
|
"grad_norm": 4.479588031768799, |
|
"learning_rate": 6.216753585397653e-07, |
|
"loss": 0.1297, |
|
"step": 116900 |
|
}, |
|
{ |
|
"epoch": 9.533898305084746, |
|
"grad_norm": 7.722095966339111, |
|
"learning_rate": 6.127118644067795e-07, |
|
"loss": 0.1192, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 9.542046936114733, |
|
"grad_norm": 10.053470611572266, |
|
"learning_rate": 6.037483702737937e-07, |
|
"loss": 0.1222, |
|
"step": 117100 |
|
}, |
|
{ |
|
"epoch": 9.55019556714472, |
|
"grad_norm": 12.395706176757812, |
|
"learning_rate": 5.947848761408079e-07, |
|
"loss": 0.1207, |
|
"step": 117200 |
|
}, |
|
{ |
|
"epoch": 9.558344198174707, |
|
"grad_norm": 7.065868854522705, |
|
"learning_rate": 5.858213820078232e-07, |
|
"loss": 0.1262, |
|
"step": 117300 |
|
}, |
|
{ |
|
"epoch": 9.566492829204694, |
|
"grad_norm": 5.896761417388916, |
|
"learning_rate": 5.768578878748374e-07, |
|
"loss": 0.1258, |
|
"step": 117400 |
|
}, |
|
{ |
|
"epoch": 9.57464146023468, |
|
"grad_norm": 11.613407135009766, |
|
"learning_rate": 5.678943937418517e-07, |
|
"loss": 0.1234, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 9.582790091264668, |
|
"grad_norm": 20.267986297607422, |
|
"learning_rate": 5.589308996088658e-07, |
|
"loss": 0.1136, |
|
"step": 117600 |
|
}, |
|
{ |
|
"epoch": 9.590938722294654, |
|
"grad_norm": 7.217312335968018, |
|
"learning_rate": 5.499674054758801e-07, |
|
"loss": 0.1125, |
|
"step": 117700 |
|
}, |
|
{ |
|
"epoch": 9.599087353324641, |
|
"grad_norm": 13.285292625427246, |
|
"learning_rate": 5.410039113428943e-07, |
|
"loss": 0.1157, |
|
"step": 117800 |
|
}, |
|
{ |
|
"epoch": 9.607235984354629, |
|
"grad_norm": 22.7592716217041, |
|
"learning_rate": 5.320404172099084e-07, |
|
"loss": 0.1219, |
|
"step": 117900 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"grad_norm": 17.77228546142578, |
|
"learning_rate": 5.230769230769226e-07, |
|
"loss": 0.1423, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 9.623533246414603, |
|
"grad_norm": 35.28404235839844, |
|
"learning_rate": 5.141134289439369e-07, |
|
"loss": 0.1254, |
|
"step": 118100 |
|
}, |
|
{ |
|
"epoch": 9.631681877444588, |
|
"grad_norm": 5.466562747955322, |
|
"learning_rate": 5.051499348109522e-07, |
|
"loss": 0.1481, |
|
"step": 118200 |
|
}, |
|
{ |
|
"epoch": 9.639830508474576, |
|
"grad_norm": 14.727246284484863, |
|
"learning_rate": 4.961864406779664e-07, |
|
"loss": 0.1369, |
|
"step": 118300 |
|
}, |
|
{ |
|
"epoch": 9.647979139504564, |
|
"grad_norm": 15.746825218200684, |
|
"learning_rate": 4.872229465449806e-07, |
|
"loss": 0.1176, |
|
"step": 118400 |
|
}, |
|
{ |
|
"epoch": 9.65612777053455, |
|
"grad_norm": 19.238407135009766, |
|
"learning_rate": 4.782594524119947e-07, |
|
"loss": 0.1178, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 9.664276401564537, |
|
"grad_norm": 13.003064155578613, |
|
"learning_rate": 4.69295958279009e-07, |
|
"loss": 0.1273, |
|
"step": 118600 |
|
}, |
|
{ |
|
"epoch": 9.672425032594525, |
|
"grad_norm": 5.110761642456055, |
|
"learning_rate": 4.603324641460232e-07, |
|
"loss": 0.1242, |
|
"step": 118700 |
|
}, |
|
{ |
|
"epoch": 9.68057366362451, |
|
"grad_norm": 12.598255157470703, |
|
"learning_rate": 4.513689700130374e-07, |
|
"loss": 0.1186, |
|
"step": 118800 |
|
}, |
|
{ |
|
"epoch": 9.688722294654498, |
|
"grad_norm": 7.440487384796143, |
|
"learning_rate": 4.424054758800516e-07, |
|
"loss": 0.1145, |
|
"step": 118900 |
|
}, |
|
{ |
|
"epoch": 9.696870925684484, |
|
"grad_norm": 10.29443073272705, |
|
"learning_rate": 4.33441981747067e-07, |
|
"loss": 0.1179, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 9.705019556714472, |
|
"grad_norm": 0.5482966899871826, |
|
"learning_rate": 4.244784876140811e-07, |
|
"loss": 0.1214, |
|
"step": 119100 |
|
}, |
|
{ |
|
"epoch": 9.71316818774446, |
|
"grad_norm": 8.483821868896484, |
|
"learning_rate": 4.1551499348109534e-07, |
|
"loss": 0.1443, |
|
"step": 119200 |
|
}, |
|
{ |
|
"epoch": 9.721316818774445, |
|
"grad_norm": 12.351343154907227, |
|
"learning_rate": 4.0655149934810956e-07, |
|
"loss": 0.13, |
|
"step": 119300 |
|
}, |
|
{ |
|
"epoch": 9.729465449804433, |
|
"grad_norm": 11.550556182861328, |
|
"learning_rate": 3.9758800521512373e-07, |
|
"loss": 0.1387, |
|
"step": 119400 |
|
}, |
|
{ |
|
"epoch": 9.737614080834419, |
|
"grad_norm": 7.8005690574646, |
|
"learning_rate": 3.8862451108213795e-07, |
|
"loss": 0.1144, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 9.745762711864407, |
|
"grad_norm": 1.0686675310134888, |
|
"learning_rate": 3.796610169491521e-07, |
|
"loss": 0.1166, |
|
"step": 119600 |
|
}, |
|
{ |
|
"epoch": 9.753911342894394, |
|
"grad_norm": 27.589384078979492, |
|
"learning_rate": 3.706975228161664e-07, |
|
"loss": 0.1322, |
|
"step": 119700 |
|
}, |
|
{ |
|
"epoch": 9.76205997392438, |
|
"grad_norm": 4.429134845733643, |
|
"learning_rate": 3.6173402868318173e-07, |
|
"loss": 0.1331, |
|
"step": 119800 |
|
}, |
|
{ |
|
"epoch": 9.770208604954368, |
|
"grad_norm": 24.076099395751953, |
|
"learning_rate": 3.527705345501959e-07, |
|
"loss": 0.1159, |
|
"step": 119900 |
|
}, |
|
{ |
|
"epoch": 9.778357235984355, |
|
"grad_norm": 11.259610176086426, |
|
"learning_rate": 3.438070404172101e-07, |
|
"loss": 0.1492, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 9.786505867014341, |
|
"grad_norm": 4.419936656951904, |
|
"learning_rate": 3.348435462842243e-07, |
|
"loss": 0.1174, |
|
"step": 120100 |
|
}, |
|
{ |
|
"epoch": 9.794654498044329, |
|
"grad_norm": 16.751590728759766, |
|
"learning_rate": 3.258800521512385e-07, |
|
"loss": 0.1293, |
|
"step": 120200 |
|
}, |
|
{ |
|
"epoch": 9.802803129074315, |
|
"grad_norm": 21.935205459594727, |
|
"learning_rate": 3.1691655801825273e-07, |
|
"loss": 0.1432, |
|
"step": 120300 |
|
}, |
|
{ |
|
"epoch": 9.810951760104302, |
|
"grad_norm": 8.330902099609375, |
|
"learning_rate": 3.079530638852669e-07, |
|
"loss": 0.1162, |
|
"step": 120400 |
|
}, |
|
{ |
|
"epoch": 9.81910039113429, |
|
"grad_norm": 2.7674975395202637, |
|
"learning_rate": 2.989895697522811e-07, |
|
"loss": 0.1237, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 9.827249022164276, |
|
"grad_norm": 13.894972801208496, |
|
"learning_rate": 2.900260756192964e-07, |
|
"loss": 0.1323, |
|
"step": 120600 |
|
}, |
|
{ |
|
"epoch": 9.835397653194264, |
|
"grad_norm": 15.896739959716797, |
|
"learning_rate": 2.810625814863106e-07, |
|
"loss": 0.1213, |
|
"step": 120700 |
|
}, |
|
{ |
|
"epoch": 9.843546284224251, |
|
"grad_norm": 14.188883781433105, |
|
"learning_rate": 2.7209908735332484e-07, |
|
"loss": 0.1383, |
|
"step": 120800 |
|
}, |
|
{ |
|
"epoch": 9.851694915254237, |
|
"grad_norm": 17.167747497558594, |
|
"learning_rate": 2.63135593220339e-07, |
|
"loss": 0.1198, |
|
"step": 120900 |
|
}, |
|
{ |
|
"epoch": 9.859843546284225, |
|
"grad_norm": 18.93288803100586, |
|
"learning_rate": 2.5417209908735323e-07, |
|
"loss": 0.1223, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 9.86799217731421, |
|
"grad_norm": 8.871918678283691, |
|
"learning_rate": 2.4520860495436746e-07, |
|
"loss": 0.1253, |
|
"step": 121100 |
|
}, |
|
{ |
|
"epoch": 9.876140808344198, |
|
"grad_norm": 23.355209350585938, |
|
"learning_rate": 2.3624511082138162e-07, |
|
"loss": 0.1208, |
|
"step": 121200 |
|
}, |
|
{ |
|
"epoch": 9.884289439374186, |
|
"grad_norm": 8.497678756713867, |
|
"learning_rate": 2.2728161668839582e-07, |
|
"loss": 0.1173, |
|
"step": 121300 |
|
}, |
|
{ |
|
"epoch": 9.892438070404172, |
|
"grad_norm": 7.700555801391602, |
|
"learning_rate": 2.1831812255541118e-07, |
|
"loss": 0.1193, |
|
"step": 121400 |
|
}, |
|
{ |
|
"epoch": 9.90058670143416, |
|
"grad_norm": 7.455496788024902, |
|
"learning_rate": 2.0935462842242537e-07, |
|
"loss": 0.1155, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 9.908735332464147, |
|
"grad_norm": 24.733013153076172, |
|
"learning_rate": 2.0039113428943957e-07, |
|
"loss": 0.1084, |
|
"step": 121600 |
|
}, |
|
{ |
|
"epoch": 9.916883963494133, |
|
"grad_norm": 8.922455787658691, |
|
"learning_rate": 1.914276401564538e-07, |
|
"loss": 0.1094, |
|
"step": 121700 |
|
}, |
|
{ |
|
"epoch": 9.92503259452412, |
|
"grad_norm": 9.614680290222168, |
|
"learning_rate": 1.8246414602346799e-07, |
|
"loss": 0.1221, |
|
"step": 121800 |
|
}, |
|
{ |
|
"epoch": 9.933181225554106, |
|
"grad_norm": 16.748014450073242, |
|
"learning_rate": 1.7350065189048218e-07, |
|
"loss": 0.1274, |
|
"step": 121900 |
|
}, |
|
{ |
|
"epoch": 9.941329856584094, |
|
"grad_norm": 5.290912628173828, |
|
"learning_rate": 1.645371577574964e-07, |
|
"loss": 0.1304, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 9.949478487614082, |
|
"grad_norm": 16.15484619140625, |
|
"learning_rate": 1.555736636245106e-07, |
|
"loss": 0.1335, |
|
"step": 122100 |
|
}, |
|
{ |
|
"epoch": 9.957627118644067, |
|
"grad_norm": 12.658509254455566, |
|
"learning_rate": 1.466101694915259e-07, |
|
"loss": 0.1219, |
|
"step": 122200 |
|
}, |
|
{ |
|
"epoch": 9.965775749674055, |
|
"grad_norm": 12.304189682006836, |
|
"learning_rate": 1.3764667535854013e-07, |
|
"loss": 0.1119, |
|
"step": 122300 |
|
}, |
|
{ |
|
"epoch": 9.973924380704041, |
|
"grad_norm": 21.67201805114746, |
|
"learning_rate": 1.2868318122555432e-07, |
|
"loss": 0.135, |
|
"step": 122400 |
|
}, |
|
{ |
|
"epoch": 9.982073011734029, |
|
"grad_norm": 18.46645164489746, |
|
"learning_rate": 1.1971968709256852e-07, |
|
"loss": 0.1222, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 9.990221642764016, |
|
"grad_norm": 8.396200180053711, |
|
"learning_rate": 1.1075619295958273e-07, |
|
"loss": 0.1213, |
|
"step": 122600 |
|
}, |
|
{ |
|
"epoch": 9.998370273794002, |
|
"grad_norm": 15.41945743560791, |
|
"learning_rate": 1.0179269882659693e-07, |
|
"loss": 0.1102, |
|
"step": 122700 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.8369477911646587, |
|
"eval_loss": 0.7886391282081604, |
|
"eval_runtime": 5.5306, |
|
"eval_samples_per_second": 450.222, |
|
"eval_steps_per_second": 56.413, |
|
"step": 122720 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 122720, |
|
"total_flos": 1.6169329248859574e+17, |
|
"train_loss": 0.2765254594356482, |
|
"train_runtime": 31314.1287, |
|
"train_samples_per_second": 125.407, |
|
"train_steps_per_second": 3.919 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 122720, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6169329248859574e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|