{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9952305246422894,
  "eval_steps": 500,
  "global_step": 705,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04239533651298357,
      "grad_norm": 2.2316667982103406,
      "learning_rate": 5e-06,
      "loss": 0.7812,
      "step": 10
    },
    {
      "epoch": 0.08479067302596714,
      "grad_norm": 1.1993746776909373,
      "learning_rate": 5e-06,
      "loss": 0.7056,
      "step": 20
    },
    {
      "epoch": 0.1271860095389507,
      "grad_norm": 3.286114874502613,
      "learning_rate": 5e-06,
      "loss": 0.6864,
      "step": 30
    },
    {
      "epoch": 0.16958134605193428,
      "grad_norm": 2.561839816370102,
      "learning_rate": 5e-06,
      "loss": 0.667,
      "step": 40
    },
    {
      "epoch": 0.21197668256491786,
      "grad_norm": 1.9169151537205058,
      "learning_rate": 5e-06,
      "loss": 0.6614,
      "step": 50
    },
    {
      "epoch": 0.2543720190779014,
      "grad_norm": 1.0973181669446288,
      "learning_rate": 5e-06,
      "loss": 0.6494,
      "step": 60
    },
    {
      "epoch": 0.296767355590885,
      "grad_norm": 1.034933885223678,
      "learning_rate": 5e-06,
      "loss": 0.6484,
      "step": 70
    },
    {
      "epoch": 0.33916269210386857,
      "grad_norm": 0.7865191243331952,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 80
    },
    {
      "epoch": 0.3815580286168522,
      "grad_norm": 0.7056553103349794,
      "learning_rate": 5e-06,
      "loss": 0.6235,
      "step": 90
    },
    {
      "epoch": 0.4239533651298357,
      "grad_norm": 0.4937582745636324,
      "learning_rate": 5e-06,
      "loss": 0.6281,
      "step": 100
    },
    {
      "epoch": 0.4663487016428193,
      "grad_norm": 0.6432738707069521,
      "learning_rate": 5e-06,
      "loss": 0.6178,
      "step": 110
    },
    {
      "epoch": 0.5087440381558028,
      "grad_norm": 0.5657152783676671,
      "learning_rate": 5e-06,
      "loss": 0.6124,
      "step": 120
    },
    {
      "epoch": 0.5511393746687865,
      "grad_norm": 0.5728894423804287,
      "learning_rate": 5e-06,
      "loss": 0.6141,
      "step": 130
    },
    {
      "epoch": 0.59353471118177,
      "grad_norm": 0.9000827049341398,
      "learning_rate": 5e-06,
      "loss": 0.6075,
      "step": 140
    },
    {
      "epoch": 0.6359300476947536,
      "grad_norm": 0.5732431292908422,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 150
    },
    {
      "epoch": 0.6783253842077371,
      "grad_norm": 0.48964161415179625,
      "learning_rate": 5e-06,
      "loss": 0.6085,
      "step": 160
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 0.668747901477211,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 170
    },
    {
      "epoch": 0.7631160572337043,
      "grad_norm": 0.5771529616338592,
      "learning_rate": 5e-06,
      "loss": 0.5979,
      "step": 180
    },
    {
      "epoch": 0.8055113937466879,
      "grad_norm": 0.6888697208557462,
      "learning_rate": 5e-06,
      "loss": 0.6059,
      "step": 190
    },
    {
      "epoch": 0.8479067302596714,
      "grad_norm": 0.6166461854075483,
      "learning_rate": 5e-06,
      "loss": 0.6003,
      "step": 200
    },
    {
      "epoch": 0.890302066772655,
      "grad_norm": 0.5434935025631478,
      "learning_rate": 5e-06,
      "loss": 0.603,
      "step": 210
    },
    {
      "epoch": 0.9326974032856385,
      "grad_norm": 0.5264272989814844,
      "learning_rate": 5e-06,
      "loss": 0.5992,
      "step": 220
    },
    {
      "epoch": 0.9750927397986221,
      "grad_norm": 0.46541558208627687,
      "learning_rate": 5e-06,
      "loss": 0.5889,
      "step": 230
    },
    {
      "epoch": 0.9962904080551139,
      "eval_loss": 0.6008393168449402,
      "eval_runtime": 163.6579,
      "eval_samples_per_second": 38.837,
      "eval_steps_per_second": 0.611,
      "step": 235
    },
    {
      "epoch": 1.0206677265500794,
      "grad_norm": 0.89166603085697,
      "learning_rate": 5e-06,
      "loss": 0.6229,
      "step": 240
    },
    {
      "epoch": 1.063063063063063,
      "grad_norm": 0.581419858002089,
      "learning_rate": 5e-06,
      "loss": 0.5427,
      "step": 250
    },
    {
      "epoch": 1.1054583995760465,
      "grad_norm": 0.5191215066708207,
      "learning_rate": 5e-06,
      "loss": 0.539,
      "step": 260
    },
    {
      "epoch": 1.1478537360890302,
      "grad_norm": 0.5196872625454427,
      "learning_rate": 5e-06,
      "loss": 0.5482,
      "step": 270
    },
    {
      "epoch": 1.1902490726020138,
      "grad_norm": 0.5298687346745908,
      "learning_rate": 5e-06,
      "loss": 0.5468,
      "step": 280
    },
    {
      "epoch": 1.2326444091149973,
      "grad_norm": 0.6561812896623879,
      "learning_rate": 5e-06,
      "loss": 0.5423,
      "step": 290
    },
    {
      "epoch": 1.275039745627981,
      "grad_norm": 0.6261936873756472,
      "learning_rate": 5e-06,
      "loss": 0.5456,
      "step": 300
    },
    {
      "epoch": 1.3174350821409644,
      "grad_norm": 0.5047293271651395,
      "learning_rate": 5e-06,
      "loss": 0.5459,
      "step": 310
    },
    {
      "epoch": 1.359830418653948,
      "grad_norm": 0.5753008881404583,
      "learning_rate": 5e-06,
      "loss": 0.5489,
      "step": 320
    },
    {
      "epoch": 1.4022257551669317,
      "grad_norm": 0.6703542472092969,
      "learning_rate": 5e-06,
      "loss": 0.543,
      "step": 330
    },
    {
      "epoch": 1.4446210916799151,
      "grad_norm": 0.5393615274555416,
      "learning_rate": 5e-06,
      "loss": 0.5441,
      "step": 340
    },
    {
      "epoch": 1.4870164281928988,
      "grad_norm": 0.48296715455359296,
      "learning_rate": 5e-06,
      "loss": 0.5439,
      "step": 350
    },
    {
      "epoch": 1.5294117647058822,
      "grad_norm": 0.5809446548698705,
      "learning_rate": 5e-06,
      "loss": 0.5354,
      "step": 360
    },
    {
      "epoch": 1.571807101218866,
      "grad_norm": 0.5933872940021416,
      "learning_rate": 5e-06,
      "loss": 0.5399,
      "step": 370
    },
    {
      "epoch": 1.6142024377318496,
      "grad_norm": 0.524152859239676,
      "learning_rate": 5e-06,
      "loss": 0.5481,
      "step": 380
    },
    {
      "epoch": 1.6565977742448332,
      "grad_norm": 0.4982586627664606,
      "learning_rate": 5e-06,
      "loss": 0.5411,
      "step": 390
    },
    {
      "epoch": 1.6989931107578167,
      "grad_norm": 0.5456970956451752,
      "learning_rate": 5e-06,
      "loss": 0.5459,
      "step": 400
    },
    {
      "epoch": 1.7413884472708,
      "grad_norm": 0.5202754837524779,
      "learning_rate": 5e-06,
      "loss": 0.5402,
      "step": 410
    },
    {
      "epoch": 1.7837837837837838,
      "grad_norm": 0.6711734454080579,
      "learning_rate": 5e-06,
      "loss": 0.5364,
      "step": 420
    },
    {
      "epoch": 1.8261791202967674,
      "grad_norm": 0.5418495560274942,
      "learning_rate": 5e-06,
      "loss": 0.5339,
      "step": 430
    },
    {
      "epoch": 1.8685744568097509,
      "grad_norm": 0.5620460678800686,
      "learning_rate": 5e-06,
      "loss": 0.5395,
      "step": 440
    },
    {
      "epoch": 1.9109697933227345,
      "grad_norm": 0.5212758597038694,
      "learning_rate": 5e-06,
      "loss": 0.5362,
      "step": 450
    },
    {
      "epoch": 1.953365129835718,
      "grad_norm": 0.795613654180602,
      "learning_rate": 5e-06,
      "loss": 0.5468,
      "step": 460
    },
    {
      "epoch": 1.9957604663487016,
      "grad_norm": 0.7263533155733923,
      "learning_rate": 5e-06,
      "loss": 0.5378,
      "step": 470
    },
    {
      "epoch": 1.9957604663487016,
      "eval_loss": 0.5872675180435181,
      "eval_runtime": 165.6457,
      "eval_samples_per_second": 38.371,
      "eval_steps_per_second": 0.604,
      "step": 470
    },
    {
      "epoch": 2.041335453100159,
      "grad_norm": 0.7145538182476606,
      "learning_rate": 5e-06,
      "loss": 0.5322,
      "step": 480
    },
    {
      "epoch": 2.0837307896131425,
      "grad_norm": 0.6510543332489417,
      "learning_rate": 5e-06,
      "loss": 0.4803,
      "step": 490
    },
    {
      "epoch": 2.126126126126126,
      "grad_norm": 0.6504640941347611,
      "learning_rate": 5e-06,
      "loss": 0.4798,
      "step": 500
    },
    {
      "epoch": 2.16852146263911,
      "grad_norm": 0.5437306589456042,
      "learning_rate": 5e-06,
      "loss": 0.4835,
      "step": 510
    },
    {
      "epoch": 2.210916799152093,
      "grad_norm": 0.8607377487125576,
      "learning_rate": 5e-06,
      "loss": 0.4882,
      "step": 520
    },
    {
      "epoch": 2.2533121356650767,
      "grad_norm": 0.5714248182452936,
      "learning_rate": 5e-06,
      "loss": 0.4915,
      "step": 530
    },
    {
      "epoch": 2.2957074721780604,
      "grad_norm": 0.5730044528705454,
      "learning_rate": 5e-06,
      "loss": 0.4829,
      "step": 540
    },
    {
      "epoch": 2.338102808691044,
      "grad_norm": 0.49043187057510523,
      "learning_rate": 5e-06,
      "loss": 0.4853,
      "step": 550
    },
    {
      "epoch": 2.3804981452040277,
      "grad_norm": 0.5123480132951224,
      "learning_rate": 5e-06,
      "loss": 0.4841,
      "step": 560
    },
    {
      "epoch": 2.4228934817170114,
      "grad_norm": 0.4937742679933037,
      "learning_rate": 5e-06,
      "loss": 0.4821,
      "step": 570
    },
    {
      "epoch": 2.4652888182299946,
      "grad_norm": 0.621960122914475,
      "learning_rate": 5e-06,
      "loss": 0.4951,
      "step": 580
    },
    {
      "epoch": 2.5076841547429782,
      "grad_norm": 0.502322262981531,
      "learning_rate": 5e-06,
      "loss": 0.4808,
      "step": 590
    },
    {
      "epoch": 2.550079491255962,
      "grad_norm": 0.5670786920010145,
      "learning_rate": 5e-06,
      "loss": 0.4858,
      "step": 600
    },
    {
      "epoch": 2.5924748277689456,
      "grad_norm": 0.5292848554703771,
      "learning_rate": 5e-06,
      "loss": 0.4862,
      "step": 610
    },
    {
      "epoch": 2.6348701642819288,
      "grad_norm": 0.5545077584631773,
      "learning_rate": 5e-06,
      "loss": 0.4837,
      "step": 620
    },
    {
      "epoch": 2.6772655007949124,
      "grad_norm": 0.5419256647496618,
      "learning_rate": 5e-06,
      "loss": 0.4863,
      "step": 630
    },
    {
      "epoch": 2.719660837307896,
      "grad_norm": 0.5900403862641418,
      "learning_rate": 5e-06,
      "loss": 0.4851,
      "step": 640
    },
    {
      "epoch": 2.7620561738208798,
      "grad_norm": 0.5678206357556943,
      "learning_rate": 5e-06,
      "loss": 0.4834,
      "step": 650
    },
    {
      "epoch": 2.8044515103338634,
      "grad_norm": 0.48537231678832665,
      "learning_rate": 5e-06,
      "loss": 0.4813,
      "step": 660
    },
    {
      "epoch": 2.846846846846847,
      "grad_norm": 0.6710551648607432,
      "learning_rate": 5e-06,
      "loss": 0.4839,
      "step": 670
    },
    {
      "epoch": 2.8892421833598303,
      "grad_norm": 0.5984418858931433,
      "learning_rate": 5e-06,
      "loss": 0.4889,
      "step": 680
    },
    {
      "epoch": 2.931637519872814,
      "grad_norm": 0.525238351520099,
      "learning_rate": 5e-06,
      "loss": 0.4859,
      "step": 690
    },
    {
      "epoch": 2.9740328563857976,
      "grad_norm": 0.5831756326510696,
      "learning_rate": 5e-06,
      "loss": 0.4895,
      "step": 700
    },
    {
      "epoch": 2.9952305246422894,
      "eval_loss": 0.586609423160553,
      "eval_runtime": 162.6024,
      "eval_samples_per_second": 39.089,
      "eval_steps_per_second": 0.615,
      "step": 705
    },
    {
      "epoch": 2.9952305246422894,
      "step": 705,
      "total_flos": 1180691878379520.0,
      "train_loss": 0.5545286432225653,
      "train_runtime": 23340.2695,
      "train_samples_per_second": 15.52,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 705,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1180691878379520.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}