{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.849372384937238,
  "eval_steps": 500,
  "global_step": 708,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.17, "grad_norm": 5.769251346588135, "learning_rate": 4.9975392245612254e-05, "loss": 2.1374, "step": 10 },
    { "epoch": 0.33, "grad_norm": 5.128970623016357, "learning_rate": 4.9901617425775067e-05, "loss": 0.4576, "step": 20 },
    { "epoch": 0.5, "grad_norm": 4.582453727722168, "learning_rate": 4.9778820775100185e-05, "loss": 0.2497, "step": 30 },
    { "epoch": 0.67, "grad_norm": 1.4966882467269897, "learning_rate": 4.9607244033573156e-05, "loss": 0.1604, "step": 40 },
    { "epoch": 0.84, "grad_norm": 3.9401819705963135, "learning_rate": 4.93872249706591e-05, "loss": 0.1206, "step": 50 },
    { "epoch": 1.0, "grad_norm": 1.5695924758911133, "learning_rate": 4.91191967203629e-05, "loss": 0.1109, "step": 60 },
    { "epoch": 1.17, "grad_norm": 1.77476167678833, "learning_rate": 4.8803686928552736e-05, "loss": 0.057, "step": 70 },
    { "epoch": 1.34, "grad_norm": 2.8595950603485107, "learning_rate": 4.84413167142257e-05, "loss": 0.0569, "step": 80 },
    { "epoch": 1.51, "grad_norm": 3.9191925525665283, "learning_rate": 4.803279944676032e-05, "loss": 0.0634, "step": 90 },
    { "epoch": 1.67, "grad_norm": 2.3463213443756104, "learning_rate": 4.7578939341563095e-05, "loss": 0.0634, "step": 100 },
    { "epoch": 1.84, "grad_norm": 0.22609496116638184, "learning_rate": 4.70806298768736e-05, "loss": 0.0437, "step": 110 },
    { "epoch": 2.01, "grad_norm": 2.9491143226623535, "learning_rate": 4.653885203484515e-05, "loss": 0.0627, "step": 120 },
    { "epoch": 2.18, "grad_norm": 1.0097993612289429, "learning_rate": 4.595467237036329e-05, "loss": 0.0361, "step": 130 },
    { "epoch": 2.34, "grad_norm": 2.7911064624786377, "learning_rate": 4.532924091140417e-05, "loss": 0.0267, "step": 140 },
    { "epoch": 2.51, "grad_norm": 1.8213822841644287, "learning_rate": 4.466378889506607e-05, "loss": 0.0362, "step": 150 },
    { "epoch": 2.68, "grad_norm": 0.4782440960407257, "learning_rate": 4.395962634373097e-05, "loss": 0.0403, "step": 160 },
    { "epoch": 2.85, "grad_norm": 1.6825015544891357, "learning_rate": 4.3218139486127854e-05, "loss": 0.0302, "step": 170 },
    { "epoch": 3.01, "grad_norm": 0.773872971534729, "learning_rate": 4.2440788028374624e-05, "loss": 0.048, "step": 180 },
    { "epoch": 3.18, "grad_norm": 0.590453565120697, "learning_rate": 4.1629102280370904e-05, "loss": 0.0219, "step": 190 },
    { "epoch": 3.35, "grad_norm": 0.30445072054862976, "learning_rate": 4.0784680143198836e-05, "loss": 0.0215, "step": 200 },
    { "epoch": 3.51, "grad_norm": 0.2872461676597595, "learning_rate": 3.990918396346254e-05, "loss": 0.0264, "step": 210 },
    { "epoch": 3.68, "grad_norm": 0.9185941219329834, "learning_rate": 3.900433726075865e-05, "loss": 0.029, "step": 220 },
    { "epoch": 3.85, "grad_norm": 1.8721823692321777, "learning_rate": 3.8071921334720696e-05, "loss": 0.031, "step": 230 },
    { "epoch": 4.02, "grad_norm": 0.34600716829299927, "learning_rate": 3.711377175831626e-05, "loss": 0.0205, "step": 240 },
    { "epoch": 4.18, "grad_norm": 0.4261312484741211, "learning_rate": 3.613177476430079e-05, "loss": 0.0172, "step": 250 },
    { "epoch": 4.35, "grad_norm": 1.4647105932235718, "learning_rate": 3.512786353194134e-05, "loss": 0.0162, "step": 260 },
    { "epoch": 4.52, "grad_norm": 0.12821082770824432, "learning_rate": 3.410401438132056e-05, "loss": 0.0136, "step": 270 },
    { "epoch": 4.69, "grad_norm": 0.6188161373138428, "learning_rate": 3.3062242882712724e-05, "loss": 0.0186, "step": 280 },
    { "epoch": 4.85, "grad_norm": 0.3787616789340973, "learning_rate": 3.200459988869111e-05, "loss": 0.0147, "step": 290 },
    { "epoch": 5.02, "grad_norm": 0.27198734879493713, "learning_rate": 3.093316749677788e-05, "loss": 0.0117, "step": 300 },
    { "epoch": 5.19, "grad_norm": 0.38122686743736267, "learning_rate": 2.985005495058446e-05, "loss": 0.0099, "step": 310 },
    { "epoch": 5.36, "grad_norm": 0.4904286861419678, "learning_rate": 2.875739448751176e-05, "loss": 0.013, "step": 320 },
    { "epoch": 5.52, "grad_norm": 5.21450662612915, "learning_rate": 2.7657337141184138e-05, "loss": 0.0178, "step": 330 },
    { "epoch": 5.69, "grad_norm": 0.09181027114391327, "learning_rate": 2.655204850688085e-05, "loss": 0.006, "step": 340 },
    { "epoch": 5.86, "grad_norm": 0.7764429450035095, "learning_rate": 2.5443704478301154e-05, "loss": 0.0132, "step": 350 },
    { "epoch": 6.03, "grad_norm": 0.3942791819572449, "learning_rate": 2.433448696405563e-05, "loss": 0.0086, "step": 360 },
    { "epoch": 6.19, "grad_norm": 0.32931485772132874, "learning_rate": 2.3226579592316538e-05, "loss": 0.0073, "step": 370 },
    { "epoch": 6.36, "grad_norm": 0.0025393522810190916, "learning_rate": 2.2122163412082927e-05, "loss": 0.0068, "step": 380 },
    { "epoch": 6.53, "grad_norm": 0.17522938549518585, "learning_rate": 2.1023412599523204e-05, "loss": 0.0055, "step": 390 },
    { "epoch": 6.69, "grad_norm": 0.07497023046016693, "learning_rate": 1.993249017784766e-05, "loss": 0.0104, "step": 400 },
    { "epoch": 6.86, "grad_norm": 0.34479033946990967, "learning_rate": 1.8851543759137007e-05, "loss": 0.0086, "step": 410 },
    { "epoch": 7.03, "grad_norm": 0.4811187982559204, "learning_rate": 1.778270131650948e-05, "loss": 0.0077, "step": 420 },
    { "epoch": 7.2, "grad_norm": 0.33907610177993774, "learning_rate": 1.672806699494966e-05, "loss": 0.0028, "step": 430 },
    { "epoch": 7.36, "grad_norm": 0.12760841846466064, "learning_rate": 1.5689716969045848e-05, "loss": 0.0049, "step": 440 },
    { "epoch": 7.53, "grad_norm": 0.10119038820266724, "learning_rate": 1.4669695355790552e-05, "loss": 0.0041, "step": 450 },
    { "epoch": 7.7, "grad_norm": 0.1975838541984558, "learning_rate": 1.3670010190490073e-05, "loss": 0.0051, "step": 460 },
    { "epoch": 7.87, "grad_norm": 0.0027128455694764853, "learning_rate": 1.2692629473705453e-05, "loss": 0.0062, "step": 470 },
    { "epoch": 8.03, "grad_norm": 0.45006152987480164, "learning_rate": 1.173947729700644e-05, "loss": 0.0036, "step": 480 },
    { "epoch": 8.2, "grad_norm": 0.0031273181084543467, "learning_rate": 1.081243005516571e-05, "loss": 0.0027, "step": 490 },
    { "epoch": 8.37, "grad_norm": 0.11013814806938171, "learning_rate": 9.913312752249903e-06, "loss": 0.0025, "step": 500 },
    { "epoch": 8.54, "grad_norm": 0.16879281401634216, "learning_rate": 9.043895408879505e-06, "loss": 0.0029, "step": 510 },
    { "epoch": 8.7, "grad_norm": 0.4711211323738098, "learning_rate": 8.20588957773018e-06, "loss": 0.0026, "step": 520 },
    { "epoch": 8.87, "grad_norm": 0.4484475553035736, "learning_rate": 7.400944974135427e-06, "loss": 0.0029, "step": 530 },
    { "epoch": 9.04, "grad_norm": 0.0020081661641597748, "learning_rate": 6.6306462284233234e-06, "loss": 0.0022, "step": 540 },
    { "epoch": 9.21, "grad_norm": 0.051797155290842056, "learning_rate": 5.896509766381028e-06, "loss": 0.0011, "step": 550 },
    { "epoch": 9.37, "grad_norm": 0.05627438426017761, "learning_rate": 5.199980823988157e-06, "loss": 0.0016, "step": 560 },
    { "epoch": 9.54, "grad_norm": 0.054639093577861786, "learning_rate": 4.542430602295774e-06, "loss": 0.0012, "step": 570 },
    { "epoch": 9.71, "grad_norm": 0.09268685430288315, "learning_rate": 3.925153568052123e-06, "loss": 0.0011, "step": 580 },
    { "epoch": 9.87, "grad_norm": 0.3072189688682556, "learning_rate": 3.3493649053890326e-06, "loss": 0.0018, "step": 590 },
    { "epoch": 10.04, "grad_norm": 0.002750764600932598, "learning_rate": 2.8161981235857143e-06, "loss": 0.001, "step": 600 },
    { "epoch": 10.21, "grad_norm": 0.14409799873828888, "learning_rate": 2.3267028256193036e-06, "loss": 0.0011, "step": 610 },
    { "epoch": 10.38, "grad_norm": 0.0033888304606080055, "learning_rate": 1.881842641895104e-06, "loss": 0.0014, "step": 620 },
    { "epoch": 10.54, "grad_norm": 0.05172213539481163, "learning_rate": 1.4824933332241692e-06, "loss": 0.0004, "step": 630 },
    { "epoch": 10.71, "grad_norm": 0.26695069670677185, "learning_rate": 1.129441066782702e-06, "loss": 0.001, "step": 640 },
    { "epoch": 10.88, "grad_norm": 0.06269415467977524, "learning_rate": 8.233808684473959e-07, "loss": 0.0008, "step": 650 },
    { "epoch": 11.05, "grad_norm": 0.1800147444009781, "learning_rate": 5.649152545533332e-07, "loss": 0.0013, "step": 660 },
    { "epoch": 11.21, "grad_norm": 0.033635422587394714, "learning_rate": 3.5455304576806347e-07, "loss": 0.0008, "step": 670 },
    { "epoch": 11.38, "grad_norm": 0.011946323327720165, "learning_rate": 1.927083654168854e-07, "loss": 0.0004, "step": 680 },
    { "epoch": 11.55, "grad_norm": 0.1268051713705063, "learning_rate": 7.969982423124689e-08, "loss": 0.0007, "step": 690 },
    { "epoch": 11.72, "grad_norm": 0.20600733160972595, "learning_rate": 1.5749893125160954e-08, "loss": 0.0011, "step": 700 },
    { "epoch": 11.85, "step": 708, "total_flos": 2.9001892500996096e+16, "train_loss": 0.05958240834198955, "train_runtime": 2189.2208, "train_samples_per_second": 2.62, "train_steps_per_second": 0.323 }
  ],
  "logging_steps": 10,
  "max_steps": 708,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 1000,
  "total_flos": 2.9001892500996096e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}