|
{ |
|
"best_metric": 1.1765093803405762, |
|
"best_model_checkpoint": "/root/finetuning_executions/finetuning_04_utg4java_src_fm_fc_dctx/checkpoint-52644", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 87740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.022568941116333, |
|
"learning_rate": 2.45625e-05, |
|
"loss": 3.9615, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4656506776809692, |
|
"learning_rate": 4.95625e-05, |
|
"loss": 1.7529, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1944507360458374, |
|
"learning_rate": 4.977398205659076e-05, |
|
"loss": 1.544, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3468198776245117, |
|
"learning_rate": 4.954393834828618e-05, |
|
"loss": 1.4706, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.04371178150177, |
|
"learning_rate": 4.93138946399816e-05, |
|
"loss": 1.4141, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.1084234714508057, |
|
"learning_rate": 4.908385093167702e-05, |
|
"loss": 1.376, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1234780550003052, |
|
"learning_rate": 4.885380722337244e-05, |
|
"loss": 1.3424, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.5502243041992188, |
|
"learning_rate": 4.8623763515067866e-05, |
|
"loss": 1.3219, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.0266711711883545, |
|
"learning_rate": 4.839371980676329e-05, |
|
"loss": 1.2912, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1499156951904297, |
|
"learning_rate": 4.816367609845871e-05, |
|
"loss": 1.2708, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2277108430862427, |
|
"learning_rate": 4.7933632390154135e-05, |
|
"loss": 1.266, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.151228427886963, |
|
"learning_rate": 4.770358868184955e-05, |
|
"loss": 1.2418, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.230625867843628, |
|
"learning_rate": 4.7473544973544974e-05, |
|
"loss": 1.2383, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9033710360527039, |
|
"learning_rate": 4.72435012652404e-05, |
|
"loss": 1.2078, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9645096659660339, |
|
"learning_rate": 4.701345755693582e-05, |
|
"loss": 1.1893, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1617854833602905, |
|
"learning_rate": 4.678341384863124e-05, |
|
"loss": 1.1773, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.9844051003456116, |
|
"learning_rate": 4.6553370140326666e-05, |
|
"loss": 1.1652, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.9416425824165344, |
|
"learning_rate": 4.632332643202208e-05, |
|
"loss": 1.1752, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.103413701057434, |
|
"learning_rate": 4.6093282723717505e-05, |
|
"loss": 1.1423, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.1644244194030762, |
|
"learning_rate": 4.586323901541293e-05, |
|
"loss": 1.1412, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.9481434226036072, |
|
"learning_rate": 4.563319530710835e-05, |
|
"loss": 1.1406, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.1336678266525269, |
|
"learning_rate": 4.5403151598803774e-05, |
|
"loss": 1.1247, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1613398790359497, |
|
"learning_rate": 4.51731078904992e-05, |
|
"loss": 1.1139, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9106454849243164, |
|
"learning_rate": 4.494306418219462e-05, |
|
"loss": 1.1018, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.121297001838684, |
|
"learning_rate": 4.471302047389004e-05, |
|
"loss": 1.093, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.2510058879852295, |
|
"learning_rate": 4.4482976765585466e-05, |
|
"loss": 1.0772, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0042085647583008, |
|
"learning_rate": 4.425293305728089e-05, |
|
"loss": 1.0952, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9691294431686401, |
|
"learning_rate": 4.402288934897631e-05, |
|
"loss": 1.0728, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.900873064994812, |
|
"learning_rate": 4.379284564067173e-05, |
|
"loss": 1.0683, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.942535936832428, |
|
"learning_rate": 4.356280193236715e-05, |
|
"loss": 1.0574, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.9630743265151978, |
|
"learning_rate": 4.3332758224062574e-05, |
|
"loss": 1.0626, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.9657124280929565, |
|
"learning_rate": 4.3102714515758e-05, |
|
"loss": 1.0664, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0142230987548828, |
|
"learning_rate": 4.287267080745342e-05, |
|
"loss": 1.0421, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.021824598312378, |
|
"learning_rate": 4.264262709914884e-05, |
|
"loss": 1.0393, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.866675853729248, |
|
"learning_rate": 4.241258339084426e-05, |
|
"loss": 1.0398, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0492345094680786, |
|
"learning_rate": 4.218253968253968e-05, |
|
"loss": 1.0266, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.1229575872421265, |
|
"learning_rate": 4.1953071083505865e-05, |
|
"loss": 1.0328, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.9247739315032959, |
|
"learning_rate": 4.172302737520129e-05, |
|
"loss": 1.033, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8983958959579468, |
|
"learning_rate": 4.149298366689671e-05, |
|
"loss": 1.0075, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9420925974845886, |
|
"learning_rate": 4.1262939958592134e-05, |
|
"loss": 1.0005, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.9218415021896362, |
|
"learning_rate": 4.103289625028756e-05, |
|
"loss": 1.0117, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9700437784194946, |
|
"learning_rate": 4.080285254198298e-05, |
|
"loss": 1.0018, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.9360381960868835, |
|
"learning_rate": 4.0572808833678396e-05, |
|
"loss": 0.9986, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.2005890607833862, |
|
"eval_runtime": 368.0591, |
|
"eval_samples_per_second": 163.764, |
|
"eval_steps_per_second": 2.559, |
|
"step": 17548 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8369306921958923, |
|
"learning_rate": 4.034276512537382e-05, |
|
"loss": 0.9929, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.9470133781433105, |
|
"learning_rate": 4.011272141706924e-05, |
|
"loss": 0.9462, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.9572529196739197, |
|
"learning_rate": 3.9882677708764665e-05, |
|
"loss": 0.95, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.0236432552337646, |
|
"learning_rate": 3.965263400046009e-05, |
|
"loss": 0.947, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.105034589767456, |
|
"learning_rate": 3.942259029215551e-05, |
|
"loss": 0.9382, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.322292447090149, |
|
"learning_rate": 3.9192546583850934e-05, |
|
"loss": 0.941, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.137208104133606, |
|
"learning_rate": 3.896250287554636e-05, |
|
"loss": 0.9247, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.1079697608947754, |
|
"learning_rate": 3.873245916724178e-05, |
|
"loss": 0.9383, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.3894723653793335, |
|
"learning_rate": 3.85024154589372e-05, |
|
"loss": 0.9381, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.900830864906311, |
|
"learning_rate": 3.8272371750632626e-05, |
|
"loss": 0.9175, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.0307693481445312, |
|
"learning_rate": 3.804232804232805e-05, |
|
"loss": 0.924, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0316811800003052, |
|
"learning_rate": 3.7812284334023465e-05, |
|
"loss": 0.9165, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.1016398668289185, |
|
"learning_rate": 3.758224062571889e-05, |
|
"loss": 0.9168, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.0044025182724, |
|
"learning_rate": 3.735277202668507e-05, |
|
"loss": 0.9146, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.1868786811828613, |
|
"learning_rate": 3.7122728318380494e-05, |
|
"loss": 0.9253, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.041280746459961, |
|
"learning_rate": 3.689268461007592e-05, |
|
"loss": 0.9003, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.0607842206954956, |
|
"learning_rate": 3.666264090177134e-05, |
|
"loss": 0.9075, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.8795559406280518, |
|
"learning_rate": 3.643259719346676e-05, |
|
"loss": 0.907, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.0717073678970337, |
|
"learning_rate": 3.6203128594432946e-05, |
|
"loss": 0.9081, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.9691916108131409, |
|
"learning_rate": 3.597308488612837e-05, |
|
"loss": 0.8907, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.1249741315841675, |
|
"learning_rate": 3.574304117782379e-05, |
|
"loss": 0.8942, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.8723602890968323, |
|
"learning_rate": 3.551299746951921e-05, |
|
"loss": 0.9014, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.1265227794647217, |
|
"learning_rate": 3.528295376121463e-05, |
|
"loss": 0.8862, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.8394224047660828, |
|
"learning_rate": 3.5052910052910054e-05, |
|
"loss": 0.8823, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.1697936058044434, |
|
"learning_rate": 3.482344145387624e-05, |
|
"loss": 0.8786, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.8561883568763733, |
|
"learning_rate": 3.459339774557166e-05, |
|
"loss": 0.8816, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.054382562637329, |
|
"learning_rate": 3.436335403726708e-05, |
|
"loss": 0.8862, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.1177219152450562, |
|
"learning_rate": 3.4133310328962506e-05, |
|
"loss": 0.8867, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.9827663898468018, |
|
"learning_rate": 3.390326662065793e-05, |
|
"loss": 0.8893, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.9570045471191406, |
|
"learning_rate": 3.3673222912353345e-05, |
|
"loss": 0.8839, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 3.5448367595672607, |
|
"learning_rate": 3.3444904531861054e-05, |
|
"loss": 0.8851, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 1.0004465579986572, |
|
"learning_rate": 3.321543593282724e-05, |
|
"loss": 0.8824, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.9247887134552002, |
|
"learning_rate": 3.298539222452266e-05, |
|
"loss": 0.8633, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.1364272832870483, |
|
"learning_rate": 3.275534851621808e-05, |
|
"loss": 0.8755, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.057548999786377, |
|
"learning_rate": 3.2525304807913506e-05, |
|
"loss": 0.867, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.9271708130836487, |
|
"learning_rate": 3.229526109960893e-05, |
|
"loss": 0.8795, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.1106973886489868, |
|
"learning_rate": 3.2065217391304345e-05, |
|
"loss": 0.8636, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 1.1093181371688843, |
|
"learning_rate": 3.183517368299977e-05, |
|
"loss": 0.8633, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.1222974061965942, |
|
"learning_rate": 3.160512997469519e-05, |
|
"loss": 0.8637, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.9591478109359741, |
|
"learning_rate": 3.1375086266390614e-05, |
|
"loss": 0.8632, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.9238194823265076, |
|
"learning_rate": 3.1145042558086044e-05, |
|
"loss": 0.8489, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.0500926971435547, |
|
"learning_rate": 3.091499884978146e-05, |
|
"loss": 0.8581, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.459464430809021, |
|
"learning_rate": 3.068495514147688e-05, |
|
"loss": 0.8522, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.1386796236038208, |
|
"learning_rate": 3.045548654244307e-05, |
|
"loss": 0.8615, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.1808243989944458, |
|
"eval_runtime": 367.8213, |
|
"eval_samples_per_second": 163.87, |
|
"eval_steps_per_second": 2.561, |
|
"step": 35096 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.064687728881836, |
|
"learning_rate": 3.022544283413849e-05, |
|
"loss": 0.8377, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.260934591293335, |
|
"learning_rate": 2.999539912583391e-05, |
|
"loss": 0.8038, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.0460811853408813, |
|
"learning_rate": 2.9765355417529335e-05, |
|
"loss": 0.7998, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 1.0479072332382202, |
|
"learning_rate": 2.9535311709224754e-05, |
|
"loss": 0.8006, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.1579115390777588, |
|
"learning_rate": 2.9305268000920177e-05, |
|
"loss": 0.7927, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.0748111009597778, |
|
"learning_rate": 2.90752242926156e-05, |
|
"loss": 0.7942, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.0448018312454224, |
|
"learning_rate": 2.884518058431102e-05, |
|
"loss": 0.8047, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.122674822807312, |
|
"learning_rate": 2.8615136876006443e-05, |
|
"loss": 0.8034, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.0929648876190186, |
|
"learning_rate": 2.8385093167701866e-05, |
|
"loss": 0.7982, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.0236926078796387, |
|
"learning_rate": 2.8155049459397285e-05, |
|
"loss": 0.804, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.4479929208755493, |
|
"learning_rate": 2.792558086036347e-05, |
|
"loss": 0.7986, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.0635368824005127, |
|
"learning_rate": 2.769553715205889e-05, |
|
"loss": 0.793, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.9539445638656616, |
|
"learning_rate": 2.7466068553025077e-05, |
|
"loss": 0.7891, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.0995815992355347, |
|
"learning_rate": 2.723659995399126e-05, |
|
"loss": 0.8014, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.9494262337684631, |
|
"learning_rate": 2.7006556245686683e-05, |
|
"loss": 0.7913, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.9940603375434875, |
|
"learning_rate": 2.6776512537382103e-05, |
|
"loss": 0.7967, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.0324065685272217, |
|
"learning_rate": 2.6546468829077526e-05, |
|
"loss": 0.783, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.9601799845695496, |
|
"learning_rate": 2.631642512077295e-05, |
|
"loss": 0.7871, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.0334033966064453, |
|
"learning_rate": 2.6086381412468368e-05, |
|
"loss": 0.793, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.070986270904541, |
|
"learning_rate": 2.5856912813434555e-05, |
|
"loss": 0.7908, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.9531447887420654, |
|
"learning_rate": 2.5626869105129974e-05, |
|
"loss": 0.7806, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.2823644876480103, |
|
"learning_rate": 2.5397400506096157e-05, |
|
"loss": 0.7953, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.8898524641990662, |
|
"learning_rate": 2.516735679779158e-05, |
|
"loss": 0.7926, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 1.2231683731079102, |
|
"learning_rate": 2.4937313089487006e-05, |
|
"loss": 0.7895, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.9554850459098816, |
|
"learning_rate": 2.4707269381182426e-05, |
|
"loss": 0.7871, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 1.007906436920166, |
|
"learning_rate": 2.447722567287785e-05, |
|
"loss": 0.7713, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.0699195861816406, |
|
"learning_rate": 2.4247181964573272e-05, |
|
"loss": 0.7817, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.1465818881988525, |
|
"learning_rate": 2.401713825626869e-05, |
|
"loss": 0.7779, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.8878755569458008, |
|
"learning_rate": 2.3787094547964114e-05, |
|
"loss": 0.7759, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.0451282262802124, |
|
"learning_rate": 2.3557050839659537e-05, |
|
"loss": 0.775, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.015073299407959, |
|
"learning_rate": 2.3327007131354957e-05, |
|
"loss": 0.7799, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.1620949506759644, |
|
"learning_rate": 2.309696342305038e-05, |
|
"loss": 0.7732, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.8986193537712097, |
|
"learning_rate": 2.2866919714745803e-05, |
|
"loss": 0.7819, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.1293410062789917, |
|
"learning_rate": 2.2636876006441222e-05, |
|
"loss": 0.7709, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.0725595951080322, |
|
"learning_rate": 2.2406832298136645e-05, |
|
"loss": 0.7635, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.98313307762146, |
|
"learning_rate": 2.2176788589832072e-05, |
|
"loss": 0.7692, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.0612819194793701, |
|
"learning_rate": 2.194674488152749e-05, |
|
"loss": 0.7626, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.1520133018493652, |
|
"learning_rate": 2.1717276282493678e-05, |
|
"loss": 0.7608, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.0242923498153687, |
|
"learning_rate": 2.1487232574189097e-05, |
|
"loss": 0.7787, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.09690523147583, |
|
"learning_rate": 2.125718886588452e-05, |
|
"loss": 0.7716, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.1755789518356323, |
|
"learning_rate": 2.1027145157579943e-05, |
|
"loss": 0.7713, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.9996896386146545, |
|
"learning_rate": 2.0797101449275363e-05, |
|
"loss": 0.7733, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.0416631698608398, |
|
"learning_rate": 2.0567057740970786e-05, |
|
"loss": 0.7707, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.2188984155654907, |
|
"learning_rate": 2.033701403266621e-05, |
|
"loss": 0.7639, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.1765093803405762, |
|
"eval_runtime": 367.9851, |
|
"eval_samples_per_second": 163.797, |
|
"eval_steps_per_second": 2.56, |
|
"step": 52644 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.4339423179626465, |
|
"learning_rate": 2.0106970324361628e-05, |
|
"loss": 0.7487, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.022152304649353, |
|
"learning_rate": 1.987692661605705e-05, |
|
"loss": 0.7295, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.0018941164016724, |
|
"learning_rate": 1.9646882907752474e-05, |
|
"loss": 0.7181, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.9470273852348328, |
|
"learning_rate": 1.9416839199447894e-05, |
|
"loss": 0.7245, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.9430755972862244, |
|
"learning_rate": 1.918737060041408e-05, |
|
"loss": 0.7293, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.1047594547271729, |
|
"learning_rate": 1.89573268921095e-05, |
|
"loss": 0.7243, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.011172890663147, |
|
"learning_rate": 1.8727858293075682e-05, |
|
"loss": 0.734, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.0907740592956543, |
|
"learning_rate": 1.849781458477111e-05, |
|
"loss": 0.7139, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.9432533979415894, |
|
"learning_rate": 1.8267770876466532e-05, |
|
"loss": 0.7144, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.0588159561157227, |
|
"learning_rate": 1.803772716816195e-05, |
|
"loss": 0.7205, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.0993419885635376, |
|
"learning_rate": 1.7807683459857374e-05, |
|
"loss": 0.7245, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.082985520362854, |
|
"learning_rate": 1.7577639751552797e-05, |
|
"loss": 0.72, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.021746039390564, |
|
"learning_rate": 1.7347596043248217e-05, |
|
"loss": 0.7165, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.167556643486023, |
|
"learning_rate": 1.7118127444214403e-05, |
|
"loss": 0.7177, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.1957422494888306, |
|
"learning_rate": 1.6888083735909823e-05, |
|
"loss": 0.7199, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.0189940929412842, |
|
"learning_rate": 1.6658040027605246e-05, |
|
"loss": 0.7142, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.7046924829483032, |
|
"learning_rate": 1.642799631930067e-05, |
|
"loss": 0.7187, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.220874309539795, |
|
"learning_rate": 1.6197952610996088e-05, |
|
"loss": 0.7252, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.0041950941085815, |
|
"learning_rate": 1.596790890269151e-05, |
|
"loss": 0.7053, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.0260080099105835, |
|
"learning_rate": 1.5737865194386934e-05, |
|
"loss": 0.7118, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.1826013326644897, |
|
"learning_rate": 1.5507821486082354e-05, |
|
"loss": 0.7176, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.9184863567352295, |
|
"learning_rate": 1.527835288704854e-05, |
|
"loss": 0.7227, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.1603368520736694, |
|
"learning_rate": 1.5048309178743963e-05, |
|
"loss": 0.7162, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.2338982820510864, |
|
"learning_rate": 1.4818265470439386e-05, |
|
"loss": 0.718, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.1764419078826904, |
|
"learning_rate": 1.4588221762134807e-05, |
|
"loss": 0.7213, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.0851845741271973, |
|
"learning_rate": 1.4358178053830229e-05, |
|
"loss": 0.717, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.0550730228424072, |
|
"learning_rate": 1.4128134345525652e-05, |
|
"loss": 0.7037, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.1784164905548096, |
|
"learning_rate": 1.3898665746491834e-05, |
|
"loss": 0.7194, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.9505347609519958, |
|
"learning_rate": 1.3668622038187256e-05, |
|
"loss": 0.7105, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 1.266885757446289, |
|
"learning_rate": 1.3438578329882679e-05, |
|
"loss": 0.7101, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.092987060546875, |
|
"learning_rate": 1.32085346215781e-05, |
|
"loss": 0.7089, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.2043814659118652, |
|
"learning_rate": 1.2978490913273523e-05, |
|
"loss": 0.6956, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 1.230975866317749, |
|
"learning_rate": 1.2748447204968944e-05, |
|
"loss": 0.7118, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.2574669122695923, |
|
"learning_rate": 1.2518978605935127e-05, |
|
"loss": 0.7102, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 1.1570420265197754, |
|
"learning_rate": 1.2288934897630552e-05, |
|
"loss": 0.696, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 1.3489502668380737, |
|
"learning_rate": 1.2058891189325973e-05, |
|
"loss": 0.7108, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 1.2118040323257446, |
|
"learning_rate": 1.1828847481021394e-05, |
|
"loss": 0.7211, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.1167511940002441, |
|
"learning_rate": 1.1599378881987579e-05, |
|
"loss": 0.7044, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.0914174318313599, |
|
"learning_rate": 1.1369335173683e-05, |
|
"loss": 0.7051, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.059415340423584, |
|
"learning_rate": 1.1139291465378423e-05, |
|
"loss": 0.7142, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.94761723279953, |
|
"learning_rate": 1.0909247757073844e-05, |
|
"loss": 0.7072, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.2550498247146606, |
|
"learning_rate": 1.0679204048769266e-05, |
|
"loss": 0.7034, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 1.0935131311416626, |
|
"learning_rate": 1.0449160340464689e-05, |
|
"loss": 0.6976, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 1.8066422939300537, |
|
"learning_rate": 1.0219116632160112e-05, |
|
"loss": 0.698, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.1811184883117676, |
|
"eval_runtime": 368.0007, |
|
"eval_samples_per_second": 163.79, |
|
"eval_steps_per_second": 2.56, |
|
"step": 70192 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 1.1222364902496338, |
|
"learning_rate": 9.989072923855533e-06, |
|
"loss": 0.6825, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 1.1273374557495117, |
|
"learning_rate": 9.759029215550954e-06, |
|
"loss": 0.6736, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.1399248838424683, |
|
"learning_rate": 9.528985507246377e-06, |
|
"loss": 0.6667, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 1.0817917585372925, |
|
"learning_rate": 9.298941798941798e-06, |
|
"loss": 0.6826, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 1.085259199142456, |
|
"learning_rate": 9.068898090637221e-06, |
|
"loss": 0.6699, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 1.1712231636047363, |
|
"learning_rate": 8.838854382332644e-06, |
|
"loss": 0.677, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 1.1486331224441528, |
|
"learning_rate": 8.608810674028066e-06, |
|
"loss": 0.674, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 1.0690157413482666, |
|
"learning_rate": 8.378766965723487e-06, |
|
"loss": 0.6712, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 1.0190378427505493, |
|
"learning_rate": 8.14872325741891e-06, |
|
"loss": 0.666, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 1.2103822231292725, |
|
"learning_rate": 7.919254658385093e-06, |
|
"loss": 0.673, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.1462812423706055, |
|
"learning_rate": 7.689210950080516e-06, |
|
"loss": 0.669, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 1.0003070831298828, |
|
"learning_rate": 7.459167241775939e-06, |
|
"loss": 0.6806, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 1.0154987573623657, |
|
"learning_rate": 7.22912353347136e-06, |
|
"loss": 0.6838, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 1.0686919689178467, |
|
"learning_rate": 6.999079825166782e-06, |
|
"loss": 0.6677, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 1.0061548948287964, |
|
"learning_rate": 6.769611226132966e-06, |
|
"loss": 0.6696, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 4.049505710601807, |
|
"learning_rate": 6.539567517828388e-06, |
|
"loss": 0.6625, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.1459605693817139, |
|
"learning_rate": 6.30952380952381e-06, |
|
"loss": 0.6747, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.0267729759216309, |
|
"learning_rate": 6.079480101219232e-06, |
|
"loss": 0.684, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 1.109384298324585, |
|
"learning_rate": 5.849436392914654e-06, |
|
"loss": 0.6695, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.1579252481460571, |
|
"learning_rate": 5.619392684610076e-06, |
|
"loss": 0.6727, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 1.2438113689422607, |
|
"learning_rate": 5.3893489763054985e-06, |
|
"loss": 0.6778, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 1.1091639995574951, |
|
"learning_rate": 5.159305268000921e-06, |
|
"loss": 0.6665, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 1.0381008386611938, |
|
"learning_rate": 4.929261559696342e-06, |
|
"loss": 0.6641, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 1.0532902479171753, |
|
"learning_rate": 4.699217851391765e-06, |
|
"loss": 0.6629, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 1.0946133136749268, |
|
"learning_rate": 4.469174143087187e-06, |
|
"loss": 0.6765, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.119805932044983, |
|
"learning_rate": 4.239130434782608e-06, |
|
"loss": 0.6707, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.2089961767196655, |
|
"learning_rate": 4.009086726478031e-06, |
|
"loss": 0.6753, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 1.2102470397949219, |
|
"learning_rate": 3.779618127444214e-06, |
|
"loss": 0.6644, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 1.0439101457595825, |
|
"learning_rate": 3.5495744191396367e-06, |
|
"loss": 0.6713, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 1.1744842529296875, |
|
"learning_rate": 3.319530710835059e-06, |
|
"loss": 0.6599, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.9925849437713623, |
|
"learning_rate": 3.089487002530481e-06, |
|
"loss": 0.6656, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 1.2233749628067017, |
|
"learning_rate": 2.8600184034966646e-06, |
|
"loss": 0.6813, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 1.1348555088043213, |
|
"learning_rate": 2.6299746951920863e-06, |
|
"loss": 0.6738, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.0024689435958862, |
|
"learning_rate": 2.399930986887509e-06, |
|
"loss": 0.6654, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 1.049222469329834, |
|
"learning_rate": 2.1704623878536926e-06, |
|
"loss": 0.6782, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 1.2224096059799194, |
|
"learning_rate": 1.9404186795491143e-06, |
|
"loss": 0.6623, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 1.0780987739562988, |
|
"learning_rate": 1.7103749712445366e-06, |
|
"loss": 0.6633, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 1.0614657402038574, |
|
"learning_rate": 1.4803312629399585e-06, |
|
"loss": 0.6748, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.1810928583145142, |
|
"learning_rate": 1.2502875546353809e-06, |
|
"loss": 0.6623, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 1.1799181699752808, |
|
"learning_rate": 1.0202438463308028e-06, |
|
"loss": 0.6653, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.012217402458191, |
|
"learning_rate": 7.902001380262249e-07, |
|
"loss": 0.6632, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 1.2088865041732788, |
|
"learning_rate": 5.601564297216472e-07, |
|
"loss": 0.6719, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 1.1404399871826172, |
|
"learning_rate": 3.301127214170693e-07, |
|
"loss": 0.6636, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 1.2054311037063599, |
|
"learning_rate": 1.0006901311249138e-07, |
|
"loss": 0.6656, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.1946403980255127, |
|
"eval_runtime": 367.8686, |
|
"eval_samples_per_second": 163.849, |
|
"eval_steps_per_second": 2.561, |
|
"step": 87740 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 87740, |
|
"total_flos": 1.7097588901675008e+18, |
|
"train_loss": 0.858644689469069, |
|
"train_runtime": 51993.6603, |
|
"train_samples_per_second": 54.003, |
|
"train_steps_per_second": 1.688 |
|
} |
|
], |
|
"logging_steps": 400, |
|
"max_steps": 87740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.7097588901675008e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|