|
{ |
|
"best_metric": 0.2451845407485962, |
|
"best_model_checkpoint": "./output/checkpoint-1950", |
|
"epoch": 4.20353982300885, |
|
"eval_steps": 150, |
|
"global_step": 2850, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014749262536873156, |
|
"grad_norm": 5.5810041427612305, |
|
"learning_rate": 4.125e-06, |
|
"loss": 0.6821, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.029498525073746312, |
|
"grad_norm": 5.33672571182251, |
|
"learning_rate": 8.25e-06, |
|
"loss": 0.6731, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04424778761061947, |
|
"grad_norm": 3.1765787601470947, |
|
"learning_rate": 1.2375e-05, |
|
"loss": 0.6863, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.058997050147492625, |
|
"grad_norm": 7.523784637451172, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.5096, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07374631268436578, |
|
"grad_norm": 8.258329391479492, |
|
"learning_rate": 2.0625e-05, |
|
"loss": 0.552, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08849557522123894, |
|
"grad_norm": 4.990320205688477, |
|
"learning_rate": 2.475e-05, |
|
"loss": 0.5211, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10324483775811209, |
|
"grad_norm": 7.771764278411865, |
|
"learning_rate": 2.8874999999999997e-05, |
|
"loss": 0.4423, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11799410029498525, |
|
"grad_norm": 4.934092044830322, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.5025, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13274336283185842, |
|
"grad_norm": 5.5473551750183105, |
|
"learning_rate": 3.7125e-05, |
|
"loss": 0.5095, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14749262536873156, |
|
"grad_norm": 5.446054458618164, |
|
"learning_rate": 4.125e-05, |
|
"loss": 0.439, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16224188790560473, |
|
"grad_norm": 5.696810245513916, |
|
"learning_rate": 4.12495760935163e-05, |
|
"loss": 0.4491, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.17699115044247787, |
|
"grad_norm": 4.427632808685303, |
|
"learning_rate": 4.1248304391490334e-05, |
|
"loss": 0.3156, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19174041297935104, |
|
"grad_norm": 3.307356357574463, |
|
"learning_rate": 4.1246184946196796e-05, |
|
"loss": 0.3194, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.20648967551622419, |
|
"grad_norm": 1.1623802185058594, |
|
"learning_rate": 4.124321784475777e-05, |
|
"loss": 0.3677, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.22123893805309736, |
|
"grad_norm": 4.190874099731445, |
|
"learning_rate": 4.123940320913919e-05, |
|
"loss": 0.42, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22123893805309736, |
|
"eval_loss": 0.36307621002197266, |
|
"eval_runtime": 43.6353, |
|
"eval_samples_per_second": 6.921, |
|
"eval_steps_per_second": 6.921, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2359882005899705, |
|
"grad_norm": 1.6925841569900513, |
|
"learning_rate": 4.123474119614577e-05, |
|
"loss": 0.4331, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.25073746312684364, |
|
"grad_norm": 2.9325969219207764, |
|
"learning_rate": 4.1229231997414614e-05, |
|
"loss": 0.4183, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.26548672566371684, |
|
"grad_norm": 2.1130642890930176, |
|
"learning_rate": 4.1222875839407306e-05, |
|
"loss": 0.3555, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.28023598820059, |
|
"grad_norm": 3.514279365539551, |
|
"learning_rate": 4.121567298340059e-05, |
|
"loss": 0.4134, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2949852507374631, |
|
"grad_norm": 3.123249053955078, |
|
"learning_rate": 4.120762372547569e-05, |
|
"loss": 0.5256, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.30973451327433627, |
|
"grad_norm": 2.3070755004882812, |
|
"learning_rate": 4.119872839650605e-05, |
|
"loss": 0.3679, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.32448377581120946, |
|
"grad_norm": 2.9008662700653076, |
|
"learning_rate": 4.118898736214381e-05, |
|
"loss": 0.3923, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3392330383480826, |
|
"grad_norm": 2.0514321327209473, |
|
"learning_rate": 4.117840102280475e-05, |
|
"loss": 0.3725, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.35398230088495575, |
|
"grad_norm": 3.542119264602661, |
|
"learning_rate": 4.116696981365181e-05, |
|
"loss": 0.4415, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3687315634218289, |
|
"grad_norm": 1.6755036115646362, |
|
"learning_rate": 4.115469420457721e-05, |
|
"loss": 0.2841, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3834808259587021, |
|
"grad_norm": 2.9505436420440674, |
|
"learning_rate": 4.1141574700183186e-05, |
|
"loss": 0.3778, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.39823008849557523, |
|
"grad_norm": 2.982377052307129, |
|
"learning_rate": 4.1127611839761155e-05, |
|
"loss": 0.3422, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.41297935103244837, |
|
"grad_norm": 1.809727668762207, |
|
"learning_rate": 4.111280619726964e-05, |
|
"loss": 0.3351, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4277286135693215, |
|
"grad_norm": 2.7590482234954834, |
|
"learning_rate": 4.109715838131059e-05, |
|
"loss": 0.3718, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4424778761061947, |
|
"grad_norm": 1.9859437942504883, |
|
"learning_rate": 4.108066903510445e-05, |
|
"loss": 0.3772, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4424778761061947, |
|
"eval_loss": 0.3334226906299591, |
|
"eval_runtime": 43.4221, |
|
"eval_samples_per_second": 6.955, |
|
"eval_steps_per_second": 6.955, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.45722713864306785, |
|
"grad_norm": 2.9821550846099854, |
|
"learning_rate": 4.106333883646366e-05, |
|
"loss": 0.3968, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.471976401179941, |
|
"grad_norm": 3.6904613971710205, |
|
"learning_rate": 4.104516849776479e-05, |
|
"loss": 0.3127, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.48672566371681414, |
|
"grad_norm": 1.0732004642486572, |
|
"learning_rate": 4.1026158765919306e-05, |
|
"loss": 0.4087, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5014749262536873, |
|
"grad_norm": 1.7305632829666138, |
|
"learning_rate": 4.100631042234283e-05, |
|
"loss": 0.4596, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5162241887905604, |
|
"grad_norm": 2.5850343704223633, |
|
"learning_rate": 4.098562428292304e-05, |
|
"loss": 0.3444, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5309734513274337, |
|
"grad_norm": 2.5205276012420654, |
|
"learning_rate": 4.096410119798607e-05, |
|
"loss": 0.4583, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5457227138643068, |
|
"grad_norm": 2.062127113342285, |
|
"learning_rate": 4.094174205226167e-05, |
|
"loss": 0.4003, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.56047197640118, |
|
"grad_norm": 3.1239662170410156, |
|
"learning_rate": 4.0918547764846736e-05, |
|
"loss": 0.3674, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5752212389380531, |
|
"grad_norm": 2.7132530212402344, |
|
"learning_rate": 4.089451928916758e-05, |
|
"loss": 0.3639, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5899705014749262, |
|
"grad_norm": 3.0152759552001953, |
|
"learning_rate": 4.0869657612940723e-05, |
|
"loss": 0.2698, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6047197640117994, |
|
"grad_norm": 2.3073341846466064, |
|
"learning_rate": 4.08439637581323e-05, |
|
"loss": 0.439, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6194690265486725, |
|
"grad_norm": 1.7864807844161987, |
|
"learning_rate": 4.081743878091604e-05, |
|
"loss": 0.2919, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6342182890855457, |
|
"grad_norm": 2.861272096633911, |
|
"learning_rate": 4.079008377162988e-05, |
|
"loss": 0.4066, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6489675516224189, |
|
"grad_norm": 1.9505175352096558, |
|
"learning_rate": 4.0761899854731085e-05, |
|
"loss": 0.4823, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6637168141592921, |
|
"grad_norm": 1.8906564712524414, |
|
"learning_rate": 4.073288818875011e-05, |
|
"loss": 0.3265, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6637168141592921, |
|
"eval_loss": 0.30214831233024597, |
|
"eval_runtime": 43.7717, |
|
"eval_samples_per_second": 6.899, |
|
"eval_steps_per_second": 6.899, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6784660766961652, |
|
"grad_norm": 2.3650407791137695, |
|
"learning_rate": 4.070304996624291e-05, |
|
"loss": 0.5034, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6932153392330384, |
|
"grad_norm": 1.9324402809143066, |
|
"learning_rate": 4.067238641374194e-05, |
|
"loss": 0.349, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7079646017699115, |
|
"grad_norm": 2.0116679668426514, |
|
"learning_rate": 4.0640898791705745e-05, |
|
"loss": 0.5409, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7227138643067846, |
|
"grad_norm": 1.6090418100357056, |
|
"learning_rate": 4.060858839446713e-05, |
|
"loss": 0.3821, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7374631268436578, |
|
"grad_norm": 2.2912044525146484, |
|
"learning_rate": 4.057545655017998e-05, |
|
"loss": 0.2578, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7522123893805309, |
|
"grad_norm": 1.67880117893219, |
|
"learning_rate": 4.054150462076465e-05, |
|
"loss": 0.3137, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7669616519174042, |
|
"grad_norm": 1.5085221529006958, |
|
"learning_rate": 4.0506734001851976e-05, |
|
"loss": 0.2617, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7817109144542773, |
|
"grad_norm": 1.5776855945587158, |
|
"learning_rate": 4.0471146122725904e-05, |
|
"loss": 0.3693, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.7964601769911505, |
|
"grad_norm": 2.036801815032959, |
|
"learning_rate": 4.043474244626477e-05, |
|
"loss": 0.2956, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8112094395280236, |
|
"grad_norm": 2.230562686920166, |
|
"learning_rate": 4.0397524468881125e-05, |
|
"loss": 0.3842, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8259587020648967, |
|
"grad_norm": 1.4315614700317383, |
|
"learning_rate": 4.0359493720460244e-05, |
|
"loss": 0.3418, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8407079646017699, |
|
"grad_norm": 3.4652326107025146, |
|
"learning_rate": 4.032065176429724e-05, |
|
"loss": 0.3102, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.855457227138643, |
|
"grad_norm": 1.9565349817276, |
|
"learning_rate": 4.0281000197032795e-05, |
|
"loss": 0.186, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8702064896755162, |
|
"grad_norm": 2.6647768020629883, |
|
"learning_rate": 4.0240540648587546e-05, |
|
"loss": 0.4584, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.8849557522123894, |
|
"grad_norm": 1.2911219596862793, |
|
"learning_rate": 4.019927478209504e-05, |
|
"loss": 0.2314, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8849557522123894, |
|
"eval_loss": 0.29636457562446594, |
|
"eval_runtime": 43.2686, |
|
"eval_samples_per_second": 6.98, |
|
"eval_steps_per_second": 6.98, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8997050147492626, |
|
"grad_norm": 1.422271728515625, |
|
"learning_rate": 4.015720429383344e-05, |
|
"loss": 0.226, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9144542772861357, |
|
"grad_norm": 2.965221881866455, |
|
"learning_rate": 4.0114330913155726e-05, |
|
"loss": 0.4821, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9292035398230089, |
|
"grad_norm": 4.0180158615112305, |
|
"learning_rate": 4.007065640241867e-05, |
|
"loss": 0.2919, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.943952802359882, |
|
"grad_norm": 4.945089817047119, |
|
"learning_rate": 4.002618255691033e-05, |
|
"loss": 0.3667, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9587020648967551, |
|
"grad_norm": 1.7440930604934692, |
|
"learning_rate": 3.9980911204776306e-05, |
|
"loss": 0.3945, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9734513274336283, |
|
"grad_norm": 1.846238374710083, |
|
"learning_rate": 3.993484420694458e-05, |
|
"loss": 0.3624, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.9882005899705014, |
|
"grad_norm": 2.105642795562744, |
|
"learning_rate": 3.988798345704899e-05, |
|
"loss": 0.3291, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0029498525073746, |
|
"grad_norm": 0.9437566995620728, |
|
"learning_rate": 3.984033088135143e-05, |
|
"loss": 0.2252, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0176991150442478, |
|
"grad_norm": 2.24537992477417, |
|
"learning_rate": 3.979188843866263e-05, |
|
"loss": 0.2462, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0324483775811208, |
|
"grad_norm": 1.486444354057312, |
|
"learning_rate": 3.97426581202617e-05, |
|
"loss": 0.2597, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.047197640117994, |
|
"grad_norm": 3.2096400260925293, |
|
"learning_rate": 3.969264194981418e-05, |
|
"loss": 0.2238, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0619469026548674, |
|
"grad_norm": 2.7134766578674316, |
|
"learning_rate": 3.9641841983288953e-05, |
|
"loss": 0.2689, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.0766961651917404, |
|
"grad_norm": 0.9915286898612976, |
|
"learning_rate": 3.959026030887367e-05, |
|
"loss": 0.2326, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.0914454277286136, |
|
"grad_norm": 2.3304569721221924, |
|
"learning_rate": 3.953789904688893e-05, |
|
"loss": 0.2508, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1061946902654867, |
|
"grad_norm": 0.8963961005210876, |
|
"learning_rate": 3.948476034970113e-05, |
|
"loss": 0.165, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1061946902654867, |
|
"eval_loss": 0.29341065883636475, |
|
"eval_runtime": 43.6734, |
|
"eval_samples_per_second": 6.915, |
|
"eval_steps_per_second": 6.915, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.12094395280236, |
|
"grad_norm": 1.2763745784759521, |
|
"learning_rate": 3.943084640163398e-05, |
|
"loss": 0.2356, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.135693215339233, |
|
"grad_norm": 1.333479642868042, |
|
"learning_rate": 3.937615941887873e-05, |
|
"loss": 0.2668, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1504424778761062, |
|
"grad_norm": 2.042940378189087, |
|
"learning_rate": 3.932070164940304e-05, |
|
"loss": 0.2435, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1651917404129795, |
|
"grad_norm": 2.6705069541931152, |
|
"learning_rate": 3.926447537285859e-05, |
|
"loss": 0.1938, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.1799410029498525, |
|
"grad_norm": 2.413623571395874, |
|
"learning_rate": 3.920748290048739e-05, |
|
"loss": 0.1981, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.1946902654867257, |
|
"grad_norm": 1.9013822078704834, |
|
"learning_rate": 3.914972657502677e-05, |
|
"loss": 0.3461, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2094395280235988, |
|
"grad_norm": 2.037879228591919, |
|
"learning_rate": 3.9091208770613036e-05, |
|
"loss": 0.2506, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.224188790560472, |
|
"grad_norm": 1.8921838998794556, |
|
"learning_rate": 3.9031931892683937e-05, |
|
"loss": 0.2074, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.238938053097345, |
|
"grad_norm": 1.2936807870864868, |
|
"learning_rate": 3.897189837787975e-05, |
|
"loss": 0.2762, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2536873156342183, |
|
"grad_norm": 1.600098967552185, |
|
"learning_rate": 3.891111069394313e-05, |
|
"loss": 0.2381, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.2684365781710913, |
|
"grad_norm": 1.6837131977081299, |
|
"learning_rate": 3.884957133961768e-05, |
|
"loss": 0.1811, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.2831858407079646, |
|
"grad_norm": 4.942287921905518, |
|
"learning_rate": 3.878728284454522e-05, |
|
"loss": 0.2511, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.2979351032448379, |
|
"grad_norm": 1.305611491203308, |
|
"learning_rate": 3.872424776916183e-05, |
|
"loss": 0.2289, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3126843657817109, |
|
"grad_norm": 2.5036911964416504, |
|
"learning_rate": 3.866046870459253e-05, |
|
"loss": 0.4063, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3274336283185841, |
|
"grad_norm": 2.1621756553649902, |
|
"learning_rate": 3.8595948272544905e-05, |
|
"loss": 0.2515, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3274336283185841, |
|
"eval_loss": 0.278595894575119, |
|
"eval_runtime": 43.7764, |
|
"eval_samples_per_second": 6.899, |
|
"eval_steps_per_second": 6.899, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3421828908554572, |
|
"grad_norm": 0.7632271647453308, |
|
"learning_rate": 3.8530689125201184e-05, |
|
"loss": 0.1884, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.3569321533923304, |
|
"grad_norm": 1.29710054397583, |
|
"learning_rate": 3.8464693945109305e-05, |
|
"loss": 0.225, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3716814159292037, |
|
"grad_norm": 2.357658863067627, |
|
"learning_rate": 3.839796544507265e-05, |
|
"loss": 0.3185, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.3864306784660767, |
|
"grad_norm": 1.8373112678527832, |
|
"learning_rate": 3.833050636803849e-05, |
|
"loss": 0.29, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4011799410029497, |
|
"grad_norm": 1.9975624084472656, |
|
"learning_rate": 3.826231948698527e-05, |
|
"loss": 0.3203, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.415929203539823, |
|
"grad_norm": 3.1040427684783936, |
|
"learning_rate": 3.819340760480859e-05, |
|
"loss": 0.2454, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4306784660766962, |
|
"grad_norm": 1.599753737449646, |
|
"learning_rate": 3.812377355420602e-05, |
|
"loss": 0.2825, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4454277286135693, |
|
"grad_norm": 1.1874561309814453, |
|
"learning_rate": 3.805342019756065e-05, |
|
"loss": 0.1932, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.4601769911504425, |
|
"grad_norm": 1.8277095556259155, |
|
"learning_rate": 3.7982350426823406e-05, |
|
"loss": 0.2014, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.4749262536873156, |
|
"grad_norm": 0.8104329109191895, |
|
"learning_rate": 3.791056716339421e-05, |
|
"loss": 0.2486, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.4896755162241888, |
|
"grad_norm": 2.973177194595337, |
|
"learning_rate": 3.783807335800187e-05, |
|
"loss": 0.2373, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.504424778761062, |
|
"grad_norm": 1.6856945753097534, |
|
"learning_rate": 3.776487199058277e-05, |
|
"loss": 0.2203, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.519174041297935, |
|
"grad_norm": 2.4095230102539062, |
|
"learning_rate": 3.769096607015843e-05, |
|
"loss": 0.2813, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.5339233038348081, |
|
"grad_norm": 2.658792495727539, |
|
"learning_rate": 3.761635863471175e-05, |
|
"loss": 0.2552, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.5486725663716814, |
|
"grad_norm": 2.120602607727051, |
|
"learning_rate": 3.754105275106222e-05, |
|
"loss": 0.3001, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.5486725663716814, |
|
"eval_loss": 0.2679256796836853, |
|
"eval_runtime": 43.6015, |
|
"eval_samples_per_second": 6.926, |
|
"eval_steps_per_second": 6.926, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.5634218289085546, |
|
"grad_norm": 2.71559476852417, |
|
"learning_rate": 3.746505151473972e-05, |
|
"loss": 0.3008, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.5781710914454279, |
|
"grad_norm": 2.8023104667663574, |
|
"learning_rate": 3.738835804985743e-05, |
|
"loss": 0.215, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.592920353982301, |
|
"grad_norm": 1.7893186807632446, |
|
"learning_rate": 3.731097550898329e-05, |
|
"loss": 0.1894, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.607669616519174, |
|
"grad_norm": 1.8876107931137085, |
|
"learning_rate": 3.723290707301047e-05, |
|
"loss": 0.1824, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.6224188790560472, |
|
"grad_norm": 1.5161499977111816, |
|
"learning_rate": 3.7154155951026605e-05, |
|
"loss": 0.192, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6371681415929205, |
|
"grad_norm": 2.110395908355713, |
|
"learning_rate": 3.707472538018187e-05, |
|
"loss": 0.2047, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.6519174041297935, |
|
"grad_norm": 2.1229217052459717, |
|
"learning_rate": 3.6994618625555925e-05, |
|
"loss": 0.1545, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 2.7134130001068115, |
|
"learning_rate": 3.691383898002368e-05, |
|
"loss": 0.2392, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.6814159292035398, |
|
"grad_norm": 1.5704232454299927, |
|
"learning_rate": 3.683238976412e-05, |
|
"loss": 0.1984, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.696165191740413, |
|
"grad_norm": 1.9206628799438477, |
|
"learning_rate": 3.675027432590312e-05, |
|
"loss": 0.2669, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7109144542772863, |
|
"grad_norm": 2.9219446182250977, |
|
"learning_rate": 3.666749604081707e-05, |
|
"loss": 0.1978, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.7256637168141593, |
|
"grad_norm": 0.8861755728721619, |
|
"learning_rate": 3.6584058311552954e-05, |
|
"loss": 0.1588, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.7404129793510323, |
|
"grad_norm": 3.052067279815674, |
|
"learning_rate": 3.6499964567909e-05, |
|
"loss": 0.1948, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.7551622418879056, |
|
"grad_norm": 2.0168213844299316, |
|
"learning_rate": 3.641521826664964e-05, |
|
"loss": 0.2793, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.7699115044247788, |
|
"grad_norm": 3.0683932304382324, |
|
"learning_rate": 3.63298228913634e-05, |
|
"loss": 0.2384, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7699115044247788, |
|
"eval_loss": 0.25286030769348145, |
|
"eval_runtime": 43.7965, |
|
"eval_samples_per_second": 6.896, |
|
"eval_steps_per_second": 6.896, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.7846607669616519, |
|
"grad_norm": 2.649764060974121, |
|
"learning_rate": 3.624378195231967e-05, |
|
"loss": 0.3089, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.799410029498525, |
|
"grad_norm": 1.9995458126068115, |
|
"learning_rate": 3.615709898632448e-05, |
|
"loss": 0.2291, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8141592920353982, |
|
"grad_norm": 2.075753927230835, |
|
"learning_rate": 3.606977755657502e-05, |
|
"loss": 0.2188, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.8289085545722714, |
|
"grad_norm": 3.202075958251953, |
|
"learning_rate": 3.5981821252513274e-05, |
|
"loss": 0.3073, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.8436578171091447, |
|
"grad_norm": 2.0560741424560547, |
|
"learning_rate": 3.5893233689678384e-05, |
|
"loss": 0.2288, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.8584070796460177, |
|
"grad_norm": 2.9390640258789062, |
|
"learning_rate": 3.5804018509558095e-05, |
|
"loss": 0.3001, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.8731563421828907, |
|
"grad_norm": 0.6466512084007263, |
|
"learning_rate": 3.571417937943903e-05, |
|
"loss": 0.1617, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.887905604719764, |
|
"grad_norm": 1.9550355672836304, |
|
"learning_rate": 3.562371999225594e-05, |
|
"loss": 0.2687, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9026548672566372, |
|
"grad_norm": 2.3754589557647705, |
|
"learning_rate": 3.553264406643995e-05, |
|
"loss": 0.181, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.9174041297935103, |
|
"grad_norm": 1.620802640914917, |
|
"learning_rate": 3.544095534576563e-05, |
|
"loss": 0.2422, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.9321533923303835, |
|
"grad_norm": 1.5539398193359375, |
|
"learning_rate": 3.534865759919718e-05, |
|
"loss": 0.1669, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.9469026548672566, |
|
"grad_norm": 2.4959328174591064, |
|
"learning_rate": 3.525575462073344e-05, |
|
"loss": 0.2058, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.9616519174041298, |
|
"grad_norm": 2.10261607170105, |
|
"learning_rate": 3.516225022925199e-05, |
|
"loss": 0.2412, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.976401179941003, |
|
"grad_norm": 2.3935513496398926, |
|
"learning_rate": 3.5068148268352135e-05, |
|
"loss": 0.221, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.991150442477876, |
|
"grad_norm": 1.9170893430709839, |
|
"learning_rate": 3.497345260619691e-05, |
|
"loss": 0.1804, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.991150442477876, |
|
"eval_loss": 0.24877212941646576, |
|
"eval_runtime": 43.1775, |
|
"eval_samples_per_second": 6.994, |
|
"eval_steps_per_second": 6.994, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.005899705014749, |
|
"grad_norm": 1.3050181865692139, |
|
"learning_rate": 3.487816713535409e-05, |
|
"loss": 0.1889, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.0206489675516224, |
|
"grad_norm": 6.0348219871521, |
|
"learning_rate": 3.478229577263617e-05, |
|
"loss": 0.1382, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.0353982300884956, |
|
"grad_norm": 2.732297897338867, |
|
"learning_rate": 3.4685842458939365e-05, |
|
"loss": 0.1052, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.050147492625369, |
|
"grad_norm": 2.3690683841705322, |
|
"learning_rate": 3.458881115908164e-05, |
|
"loss": 0.1049, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.0648967551622417, |
|
"grad_norm": 1.0060967206954956, |
|
"learning_rate": 3.449120586163966e-05, |
|
"loss": 0.1413, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.079646017699115, |
|
"grad_norm": 2.224308967590332, |
|
"learning_rate": 3.439303057878493e-05, |
|
"loss": 0.1162, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.094395280235988, |
|
"grad_norm": 1.902258038520813, |
|
"learning_rate": 3.429428934611879e-05, |
|
"loss": 0.1231, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.1091445427728615, |
|
"grad_norm": 3.415437936782837, |
|
"learning_rate": 3.419498622250657e-05, |
|
"loss": 0.1914, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.1238938053097347, |
|
"grad_norm": 2.006056070327759, |
|
"learning_rate": 3.409512528991075e-05, |
|
"loss": 0.1494, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.1386430678466075, |
|
"grad_norm": 2.260693311691284, |
|
"learning_rate": 3.399471065322314e-05, |
|
"loss": 0.1251, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.1533923303834808, |
|
"grad_norm": 2.281259298324585, |
|
"learning_rate": 3.3893746440096144e-05, |
|
"loss": 0.1238, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.168141592920354, |
|
"grad_norm": 1.1317875385284424, |
|
"learning_rate": 3.3792236800773114e-05, |
|
"loss": 0.1144, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.1828908554572273, |
|
"grad_norm": 2.4440741539001465, |
|
"learning_rate": 3.369018590791776e-05, |
|
"loss": 0.1123, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.1976401179941005, |
|
"grad_norm": 1.8043389320373535, |
|
"learning_rate": 3.358759795644255e-05, |
|
"loss": 0.1347, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.2123893805309733, |
|
"grad_norm": 3.1604647636413574, |
|
"learning_rate": 3.3484477163336383e-05, |
|
"loss": 0.1287, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.2123893805309733, |
|
"eval_loss": 0.25868239998817444, |
|
"eval_runtime": 43.8975, |
|
"eval_samples_per_second": 6.88, |
|
"eval_steps_per_second": 6.88, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.2271386430678466, |
|
"grad_norm": 2.021027088165283, |
|
"learning_rate": 3.338082776749115e-05, |
|
"loss": 0.1872, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.24188790560472, |
|
"grad_norm": 1.5057893991470337, |
|
"learning_rate": 3.327665402952756e-05, |
|
"loss": 0.1177, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.256637168141593, |
|
"grad_norm": 1.7474180459976196, |
|
"learning_rate": 3.317196023161996e-05, |
|
"loss": 0.1443, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.271386430678466, |
|
"grad_norm": 1.8646279573440552, |
|
"learning_rate": 3.306675067732031e-05, |
|
"loss": 0.1386, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.286135693215339, |
|
"grad_norm": 0.44584158062934875, |
|
"learning_rate": 3.296102969138133e-05, |
|
"loss": 0.1612, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.3008849557522124, |
|
"grad_norm": 0.7199454307556152, |
|
"learning_rate": 3.285480161957865e-05, |
|
"loss": 0.1217, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.3156342182890857, |
|
"grad_norm": 1.7795122861862183, |
|
"learning_rate": 3.274807082853226e-05, |
|
"loss": 0.1666, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.330383480825959, |
|
"grad_norm": 1.9658058881759644, |
|
"learning_rate": 3.264084170552692e-05, |
|
"loss": 0.1036, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.3451327433628317, |
|
"grad_norm": 0.996110200881958, |
|
"learning_rate": 3.25331186583319e-05, |
|
"loss": 0.1381, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.359882005899705, |
|
"grad_norm": 5.947177410125732, |
|
"learning_rate": 3.242490611501975e-05, |
|
"loss": 0.1579, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.3746312684365782, |
|
"grad_norm": 3.3651394844055176, |
|
"learning_rate": 3.231620852378428e-05, |
|
"loss": 0.165, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.3893805309734515, |
|
"grad_norm": 1.959834098815918, |
|
"learning_rate": 3.220703035275773e-05, |
|
"loss": 0.1379, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.4041297935103243, |
|
"grad_norm": 2.7289717197418213, |
|
"learning_rate": 3.209737608982709e-05, |
|
"loss": 0.196, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.4188790560471976, |
|
"grad_norm": 4.184900283813477, |
|
"learning_rate": 3.1987250242449616e-05, |
|
"loss": 0.1701, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.433628318584071, |
|
"grad_norm": 2.6607248783111572, |
|
"learning_rate": 3.1876657337467564e-05, |
|
"loss": 0.1057, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.433628318584071, |
|
"eval_loss": 0.2500734031200409, |
|
"eval_runtime": 44.0, |
|
"eval_samples_per_second": 6.864, |
|
"eval_steps_per_second": 6.864, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.448377581120944, |
|
"grad_norm": 3.242337465286255, |
|
"learning_rate": 3.176560192092211e-05, |
|
"loss": 0.1478, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.4631268436578173, |
|
"grad_norm": 2.5350615978240967, |
|
"learning_rate": 3.165408855786642e-05, |
|
"loss": 0.1988, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.47787610619469, |
|
"grad_norm": 2.062612295150757, |
|
"learning_rate": 3.154212183217812e-05, |
|
"loss": 0.1228, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.4926253687315634, |
|
"grad_norm": 2.87460994720459, |
|
"learning_rate": 3.142970634637072e-05, |
|
"loss": 0.1057, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.5073746312684366, |
|
"grad_norm": 1.2329528331756592, |
|
"learning_rate": 3.131684672140458e-05, |
|
"loss": 0.1142, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.52212389380531, |
|
"grad_norm": 2.1753523349761963, |
|
"learning_rate": 3.120354759649682e-05, |
|
"loss": 0.1065, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.5368731563421827, |
|
"grad_norm": 2.7612011432647705, |
|
"learning_rate": 3.1089813628930695e-05, |
|
"loss": 0.189, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.551622418879056, |
|
"grad_norm": 1.8067593574523926, |
|
"learning_rate": 3.097564949386416e-05, |
|
"loss": 0.1085, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.566371681415929, |
|
"grad_norm": 2.544847011566162, |
|
"learning_rate": 3.086105988413766e-05, |
|
"loss": 0.2134, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.5811209439528024, |
|
"grad_norm": 0.837935209274292, |
|
"learning_rate": 3.074604951008122e-05, |
|
"loss": 0.0964, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.5958702064896757, |
|
"grad_norm": 1.2727034091949463, |
|
"learning_rate": 3.063062309932086e-05, |
|
"loss": 0.1179, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.6106194690265485, |
|
"grad_norm": 2.784679651260376, |
|
"learning_rate": 3.0514785396584238e-05, |
|
"loss": 0.1062, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.6253687315634218, |
|
"grad_norm": 1.9828710556030273, |
|
"learning_rate": 3.0398541163505598e-05, |
|
"loss": 0.1325, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.640117994100295, |
|
"grad_norm": 1.7931956052780151, |
|
"learning_rate": 3.028189517843007e-05, |
|
"loss": 0.1326, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.6548672566371683, |
|
"grad_norm": 2.979793071746826, |
|
"learning_rate": 3.0164852236217233e-05, |
|
"loss": 0.1903, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.6548672566371683, |
|
"eval_loss": 0.2538958191871643, |
|
"eval_runtime": 43.8985, |
|
"eval_samples_per_second": 6.879, |
|
"eval_steps_per_second": 6.879, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.669616519174041, |
|
"grad_norm": 1.796676516532898, |
|
"learning_rate": 3.0047417148044e-05, |
|
"loss": 0.125, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.6843657817109143, |
|
"grad_norm": 3.8405046463012695, |
|
"learning_rate": 2.99295947412069e-05, |
|
"loss": 0.119, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.6991150442477876, |
|
"grad_norm": 2.2203421592712402, |
|
"learning_rate": 2.9811389858923593e-05, |
|
"loss": 0.1104, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.713864306784661, |
|
"grad_norm": 1.727965235710144, |
|
"learning_rate": 2.9692807360133822e-05, |
|
"loss": 0.1523, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.728613569321534, |
|
"grad_norm": 2.0604889392852783, |
|
"learning_rate": 2.9573852119299634e-05, |
|
"loss": 0.1517, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.7433628318584073, |
|
"grad_norm": 2.538522243499756, |
|
"learning_rate": 2.9454529026205092e-05, |
|
"loss": 0.1528, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.75811209439528, |
|
"grad_norm": 1.8873978853225708, |
|
"learning_rate": 2.9334842985755173e-05, |
|
"loss": 0.1234, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.7728613569321534, |
|
"grad_norm": 1.9439120292663574, |
|
"learning_rate": 2.921479891777423e-05, |
|
"loss": 0.1629, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.7876106194690267, |
|
"grad_norm": 1.4135169982910156, |
|
"learning_rate": 2.9094401756803725e-05, |
|
"loss": 0.1266, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.8023598820058995, |
|
"grad_norm": 3.5280425548553467, |
|
"learning_rate": 2.8973656451899372e-05, |
|
"loss": 0.1209, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.8171091445427727, |
|
"grad_norm": 3.0947368144989014, |
|
"learning_rate": 2.8852567966427735e-05, |
|
"loss": 0.1104, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.831858407079646, |
|
"grad_norm": 2.14790940284729, |
|
"learning_rate": 2.8731141277862174e-05, |
|
"loss": 0.1489, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.8466076696165192, |
|
"grad_norm": 2.191667079925537, |
|
"learning_rate": 2.8609381377578267e-05, |
|
"loss": 0.0923, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.8613569321533925, |
|
"grad_norm": 1.2369720935821533, |
|
"learning_rate": 2.848729327064861e-05, |
|
"loss": 0.0907, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.8761061946902657, |
|
"grad_norm": 3.589946746826172, |
|
"learning_rate": 2.8364881975637094e-05, |
|
"loss": 0.1721, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.8761061946902657, |
|
"eval_loss": 0.2451845407485962, |
|
"eval_runtime": 43.8481, |
|
"eval_samples_per_second": 6.887, |
|
"eval_steps_per_second": 6.887, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.8908554572271385, |
|
"grad_norm": 2.7611799240112305, |
|
"learning_rate": 2.82421525243926e-05, |
|
"loss": 0.1254, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.905604719764012, |
|
"grad_norm": 1.5316367149353027, |
|
"learning_rate": 2.8119109961842176e-05, |
|
"loss": 0.1056, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.920353982300885, |
|
"grad_norm": 1.325637698173523, |
|
"learning_rate": 2.799575934578365e-05, |
|
"loss": 0.1512, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.935103244837758, |
|
"grad_norm": 2.2634294033050537, |
|
"learning_rate": 2.7872105746677694e-05, |
|
"loss": 0.1763, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.949852507374631, |
|
"grad_norm": 1.410277009010315, |
|
"learning_rate": 2.774815424743947e-05, |
|
"loss": 0.2031, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.9646017699115044, |
|
"grad_norm": 3.4232828617095947, |
|
"learning_rate": 2.762390994322962e-05, |
|
"loss": 0.1413, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.9793510324483776, |
|
"grad_norm": 1.2643831968307495, |
|
"learning_rate": 2.749937794124486e-05, |
|
"loss": 0.1268, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.994100294985251, |
|
"grad_norm": 3.378948926925659, |
|
"learning_rate": 2.7374563360508036e-05, |
|
"loss": 0.1299, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.0088495575221237, |
|
"grad_norm": 2.396994113922119, |
|
"learning_rate": 2.7249471331657693e-05, |
|
"loss": 0.0974, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.023598820058997, |
|
"grad_norm": 1.482927918434143, |
|
"learning_rate": 2.712410699673718e-05, |
|
"loss": 0.098, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.03834808259587, |
|
"grad_norm": 1.5194566249847412, |
|
"learning_rate": 2.699847550898329e-05, |
|
"loss": 0.0639, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.0530973451327434, |
|
"grad_norm": 1.4694433212280273, |
|
"learning_rate": 2.6872582032614426e-05, |
|
"loss": 0.1063, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.0678466076696167, |
|
"grad_norm": 0.5857545137405396, |
|
"learning_rate": 2.6746431742618305e-05, |
|
"loss": 0.0709, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.0825958702064895, |
|
"grad_norm": 2.3842179775238037, |
|
"learning_rate": 2.6620029824539257e-05, |
|
"loss": 0.1083, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.0973451327433628, |
|
"grad_norm": 1.8553909063339233, |
|
"learning_rate": 2.6493381474265044e-05, |
|
"loss": 0.0786, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.0973451327433628, |
|
"eval_loss": 0.2644558846950531, |
|
"eval_runtime": 43.8036, |
|
"eval_samples_per_second": 6.894, |
|
"eval_steps_per_second": 6.894, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.112094395280236, |
|
"grad_norm": 0.7660773992538452, |
|
"learning_rate": 2.636649189781331e-05, |
|
"loss": 0.0781, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.1268436578171093, |
|
"grad_norm": 2.0916807651519775, |
|
"learning_rate": 2.6239366311117528e-05, |
|
"loss": 0.129, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.1415929203539825, |
|
"grad_norm": 1.9852261543273926, |
|
"learning_rate": 2.6112009939812672e-05, |
|
"loss": 0.0631, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.1563421828908553, |
|
"grad_norm": 1.6749731302261353, |
|
"learning_rate": 2.5984428019020343e-05, |
|
"loss": 0.0932, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.1710914454277286, |
|
"grad_norm": 2.2517542839050293, |
|
"learning_rate": 2.5856625793133585e-05, |
|
"loss": 0.0752, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.185840707964602, |
|
"grad_norm": 3.476151466369629, |
|
"learning_rate": 2.5728608515601357e-05, |
|
"loss": 0.0657, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.200589970501475, |
|
"grad_norm": 8.088579177856445, |
|
"learning_rate": 2.560038144871252e-05, |
|
"loss": 0.1153, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.215339233038348, |
|
"grad_norm": 2.3388564586639404, |
|
"learning_rate": 2.547194986337956e-05, |
|
"loss": 0.0684, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.230088495575221, |
|
"grad_norm": 2.223625421524048, |
|
"learning_rate": 2.5343319038921927e-05, |
|
"loss": 0.0712, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.2448377581120944, |
|
"grad_norm": 1.7080127000808716, |
|
"learning_rate": 2.521449426284898e-05, |
|
"loss": 0.0701, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.2595870206489677, |
|
"grad_norm": 1.1411865949630737, |
|
"learning_rate": 2.5085480830642722e-05, |
|
"loss": 0.0773, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.274336283185841, |
|
"grad_norm": 3.657041072845459, |
|
"learning_rate": 2.4956284045540015e-05, |
|
"loss": 0.0779, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.2890855457227137, |
|
"grad_norm": 1.847219705581665, |
|
"learning_rate": 2.4826909218314684e-05, |
|
"loss": 0.0656, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.303834808259587, |
|
"grad_norm": 1.5679715871810913, |
|
"learning_rate": 2.4697361667059132e-05, |
|
"loss": 0.0741, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.3185840707964602, |
|
"grad_norm": 2.4843060970306396, |
|
"learning_rate": 2.4567646716965808e-05, |
|
"loss": 0.077, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.3185840707964602, |
|
"eval_loss": 0.2629007399082184, |
|
"eval_runtime": 43.9182, |
|
"eval_samples_per_second": 6.876, |
|
"eval_steps_per_second": 6.876, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 1.421711802482605, |
|
"learning_rate": 2.443776970010823e-05, |
|
"loss": 0.0612, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.3480825958702063, |
|
"grad_norm": 1.3979884386062622, |
|
"learning_rate": 2.430773595522188e-05, |
|
"loss": 0.0962, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.3628318584070795, |
|
"grad_norm": 2.23537540435791, |
|
"learning_rate": 2.4177550827484704e-05, |
|
"loss": 0.1082, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.377581120943953, |
|
"grad_norm": 0.7765734195709229, |
|
"learning_rate": 2.4047219668297402e-05, |
|
"loss": 0.1427, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.392330383480826, |
|
"grad_norm": 1.6800991296768188, |
|
"learning_rate": 2.3916747835063446e-05, |
|
"loss": 0.0701, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.4070796460176993, |
|
"grad_norm": 3.2461161613464355, |
|
"learning_rate": 2.3786140690968887e-05, |
|
"loss": 0.0997, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.421828908554572, |
|
"grad_norm": 6.6169257164001465, |
|
"learning_rate": 2.3655403604761872e-05, |
|
"loss": 0.0774, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.4365781710914454, |
|
"grad_norm": 0.5882118940353394, |
|
"learning_rate": 2.3524541950531934e-05, |
|
"loss": 0.0857, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.4513274336283186, |
|
"grad_norm": 1.4368208646774292, |
|
"learning_rate": 2.3393561107489144e-05, |
|
"loss": 0.1152, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.466076696165192, |
|
"grad_norm": 1.8174480199813843, |
|
"learning_rate": 2.3262466459742938e-05, |
|
"loss": 0.0788, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.4808259587020647, |
|
"grad_norm": 4.503509044647217, |
|
"learning_rate": 2.313126339608082e-05, |
|
"loss": 0.0961, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.495575221238938, |
|
"grad_norm": 1.568178415298462, |
|
"learning_rate": 2.2999957309746853e-05, |
|
"loss": 0.0868, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.510324483775811, |
|
"grad_norm": 2.2628488540649414, |
|
"learning_rate": 2.286855359821995e-05, |
|
"loss": 0.0766, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.5250737463126844, |
|
"grad_norm": 2.249336004257202, |
|
"learning_rate": 2.273705766299202e-05, |
|
"loss": 0.1004, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.5398230088495577, |
|
"grad_norm": 2.4996466636657715, |
|
"learning_rate": 2.2605474909345937e-05, |
|
"loss": 0.0864, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.5398230088495577, |
|
"eval_loss": 0.2607187330722809, |
|
"eval_runtime": 43.5839, |
|
"eval_samples_per_second": 6.929, |
|
"eval_steps_per_second": 6.929, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.554572271386431, |
|
"grad_norm": 2.188922643661499, |
|
"learning_rate": 2.2473810746133318e-05, |
|
"loss": 0.1021, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.5693215339233038, |
|
"grad_norm": 1.4103885889053345, |
|
"learning_rate": 2.234207058555222e-05, |
|
"loss": 0.1025, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.584070796460177, |
|
"grad_norm": 3.289332389831543, |
|
"learning_rate": 2.221025984292466e-05, |
|
"loss": 0.0835, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.5988200589970503, |
|
"grad_norm": 1.7993652820587158, |
|
"learning_rate": 2.2078383936473987e-05, |
|
"loss": 0.0776, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.613569321533923, |
|
"grad_norm": 2.0649101734161377, |
|
"learning_rate": 2.1946448287102206e-05, |
|
"loss": 0.0973, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.6283185840707963, |
|
"grad_norm": 1.656886339187622, |
|
"learning_rate": 2.18144583181671e-05, |
|
"loss": 0.0863, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.6430678466076696, |
|
"grad_norm": 2.353787899017334, |
|
"learning_rate": 2.168241945525932e-05, |
|
"loss": 0.0962, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.657817109144543, |
|
"grad_norm": 2.442674398422241, |
|
"learning_rate": 2.1550337125979373e-05, |
|
"loss": 0.0828, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.672566371681416, |
|
"grad_norm": 1.9212013483047485, |
|
"learning_rate": 2.1418216759714467e-05, |
|
"loss": 0.0729, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.6873156342182893, |
|
"grad_norm": 3.9996650218963623, |
|
"learning_rate": 2.1286063787415392e-05, |
|
"loss": 0.1083, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.702064896755162, |
|
"grad_norm": 1.5336638689041138, |
|
"learning_rate": 2.115388364137322e-05, |
|
"loss": 0.0758, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.7168141592920354, |
|
"grad_norm": 0.8267903923988342, |
|
"learning_rate": 2.1021681754996045e-05, |
|
"loss": 0.0739, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.7315634218289087, |
|
"grad_norm": 0.9670748114585876, |
|
"learning_rate": 2.0889463562585625e-05, |
|
"loss": 0.085, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.7463126843657815, |
|
"grad_norm": 1.723310947418213, |
|
"learning_rate": 2.075723449911398e-05, |
|
"loss": 0.066, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.7610619469026547, |
|
"grad_norm": 2.4782888889312744, |
|
"learning_rate": 2.0625e-05, |
|
"loss": 0.0905, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.7610619469026547, |
|
"eval_loss": 0.2644849419593811, |
|
"eval_runtime": 43.9466, |
|
"eval_samples_per_second": 6.872, |
|
"eval_steps_per_second": 6.872, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.775811209439528, |
|
"grad_norm": 2.117515802383423, |
|
"learning_rate": 2.049276550088602e-05, |
|
"loss": 0.0745, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.7905604719764012, |
|
"grad_norm": 2.7794859409332275, |
|
"learning_rate": 2.0360536437414375e-05, |
|
"loss": 0.0939, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.8053097345132745, |
|
"grad_norm": 1.0744003057479858, |
|
"learning_rate": 2.0228318245003955e-05, |
|
"loss": 0.0675, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.8200589970501477, |
|
"grad_norm": 1.0996911525726318, |
|
"learning_rate": 2.0096116358626783e-05, |
|
"loss": 0.0662, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.8348082595870205, |
|
"grad_norm": 0.7364205718040466, |
|
"learning_rate": 1.996393621258461e-05, |
|
"loss": 0.0553, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.849557522123894, |
|
"grad_norm": 1.5476138591766357, |
|
"learning_rate": 1.9831783240285533e-05, |
|
"loss": 0.0942, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.864306784660767, |
|
"grad_norm": 1.6006284952163696, |
|
"learning_rate": 1.9699662874020637e-05, |
|
"loss": 0.0969, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.87905604719764, |
|
"grad_norm": 0.5195714831352234, |
|
"learning_rate": 1.9567580544740682e-05, |
|
"loss": 0.0875, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.893805309734513, |
|
"grad_norm": 0.9594073295593262, |
|
"learning_rate": 1.9435541681832906e-05, |
|
"loss": 0.0713, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.9085545722713864, |
|
"grad_norm": 0.8206641674041748, |
|
"learning_rate": 1.9303551712897798e-05, |
|
"loss": 0.1369, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.9233038348082596, |
|
"grad_norm": 2.239541530609131, |
|
"learning_rate": 1.9171616063526012e-05, |
|
"loss": 0.0837, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.938053097345133, |
|
"grad_norm": 1.1113237142562866, |
|
"learning_rate": 1.9039740157075348e-05, |
|
"loss": 0.0671, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.952802359882006, |
|
"grad_norm": 2.035332441329956, |
|
"learning_rate": 1.8907929414447785e-05, |
|
"loss": 0.0955, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.967551622418879, |
|
"grad_norm": 0.7038524150848389, |
|
"learning_rate": 1.8776189253866686e-05, |
|
"loss": 0.0708, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.982300884955752, |
|
"grad_norm": 3.023179054260254, |
|
"learning_rate": 1.8644525090654063e-05, |
|
"loss": 0.0863, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.982300884955752, |
|
"eval_loss": 0.27085980772972107, |
|
"eval_runtime": 43.594, |
|
"eval_samples_per_second": 6.928, |
|
"eval_steps_per_second": 6.928, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.9970501474926254, |
|
"grad_norm": 0.3948822021484375, |
|
"learning_rate": 1.851294233700798e-05, |
|
"loss": 0.058, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.011799410029498, |
|
"grad_norm": 2.1759111881256104, |
|
"learning_rate": 1.8381446401780052e-05, |
|
"loss": 0.0544, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.0265486725663715, |
|
"grad_norm": 1.0792875289916992, |
|
"learning_rate": 1.825004269025315e-05, |
|
"loss": 0.0438, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.041297935103245, |
|
"grad_norm": 0.6035469770431519, |
|
"learning_rate": 1.811873660391918e-05, |
|
"loss": 0.068, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.056047197640118, |
|
"grad_norm": 2.127488613128662, |
|
"learning_rate": 1.7987533540257062e-05, |
|
"loss": 0.0842, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.070796460176991, |
|
"grad_norm": 0.9804584980010986, |
|
"learning_rate": 1.7856438892510862e-05, |
|
"loss": 0.0512, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.0855457227138645, |
|
"grad_norm": 0.3105282187461853, |
|
"learning_rate": 1.772545804946807e-05, |
|
"loss": 0.0632, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.100294985250738, |
|
"grad_norm": 0.9263339638710022, |
|
"learning_rate": 1.759459639523813e-05, |
|
"loss": 0.0495, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.115044247787611, |
|
"grad_norm": 0.42594772577285767, |
|
"learning_rate": 1.7463859309031106e-05, |
|
"loss": 0.0467, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.129793510324483, |
|
"grad_norm": 1.5321959257125854, |
|
"learning_rate": 1.7333252164936557e-05, |
|
"loss": 0.0622, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.144542772861357, |
|
"grad_norm": 0.47825008630752563, |
|
"learning_rate": 1.7202780331702608e-05, |
|
"loss": 0.0667, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.15929203539823, |
|
"grad_norm": 2.095520496368408, |
|
"learning_rate": 1.70724491725153e-05, |
|
"loss": 0.063, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.174041297935103, |
|
"grad_norm": 0.4359021782875061, |
|
"learning_rate": 1.694226404477812e-05, |
|
"loss": 0.0429, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.188790560471976, |
|
"grad_norm": 2.7264065742492676, |
|
"learning_rate": 1.681223029989177e-05, |
|
"loss": 0.0501, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.20353982300885, |
|
"grad_norm": 0.8709071278572083, |
|
"learning_rate": 1.66823532830342e-05, |
|
"loss": 0.0464, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.20353982300885, |
|
"eval_loss": 0.29095226526260376, |
|
"eval_runtime": 43.6544, |
|
"eval_samples_per_second": 6.918, |
|
"eval_steps_per_second": 6.918, |
|
"step": 2850 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.313588061300654e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
}