|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.23094688221709006, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004618937644341801, |
|
"grad_norm": 10.975909233093262, |
|
"learning_rate": 3.4642032332563515e-07, |
|
"loss": 2.4942, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009237875288683603, |
|
"grad_norm": 9.186948776245117, |
|
"learning_rate": 9.237875288683603e-07, |
|
"loss": 2.7582, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013856812933025405, |
|
"grad_norm": 10.907584190368652, |
|
"learning_rate": 1.443418013856813e-06, |
|
"loss": 2.6814, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018475750577367205, |
|
"grad_norm": 9.531168937683105, |
|
"learning_rate": 2.0207852193995383e-06, |
|
"loss": 2.6982, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.023094688221709007, |
|
"grad_norm": 14.727725982666016, |
|
"learning_rate": 2.5981524249422633e-06, |
|
"loss": 2.0218, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02771362586605081, |
|
"grad_norm": 8.314309120178223, |
|
"learning_rate": 3.117782909930716e-06, |
|
"loss": 1.5595, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03233256351039261, |
|
"grad_norm": 4.944284915924072, |
|
"learning_rate": 3.6951501154734412e-06, |
|
"loss": 1.02, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03695150115473441, |
|
"grad_norm": 10.882843971252441, |
|
"learning_rate": 4.272517321016166e-06, |
|
"loss": 1.0419, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04157043879907621, |
|
"grad_norm": 12.310320854187012, |
|
"learning_rate": 4.849884526558892e-06, |
|
"loss": 1.0801, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.046189376443418015, |
|
"grad_norm": 6.49992036819458, |
|
"learning_rate": 5.427251732101616e-06, |
|
"loss": 0.8444, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.050808314087759814, |
|
"grad_norm": 2.89493465423584, |
|
"learning_rate": 6.004618937644342e-06, |
|
"loss": 0.8884, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05542725173210162, |
|
"grad_norm": 3.966763734817505, |
|
"learning_rate": 6.581986143187067e-06, |
|
"loss": 0.8672, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06004618937644342, |
|
"grad_norm": 4.442293167114258, |
|
"learning_rate": 7.159353348729793e-06, |
|
"loss": 0.8037, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06466512702078522, |
|
"grad_norm": 2.506918430328369, |
|
"learning_rate": 7.736720554272519e-06, |
|
"loss": 0.724, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06928406466512702, |
|
"grad_norm": 5.733686447143555, |
|
"learning_rate": 8.314087759815242e-06, |
|
"loss": 0.7692, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07390300230946882, |
|
"grad_norm": 4.161188125610352, |
|
"learning_rate": 8.891454965357968e-06, |
|
"loss": 0.7413, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07852193995381063, |
|
"grad_norm": 3.9434962272644043, |
|
"learning_rate": 9.468822170900693e-06, |
|
"loss": 0.7386, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08314087759815242, |
|
"grad_norm": 2.9100701808929443, |
|
"learning_rate": 1.0046189376443418e-05, |
|
"loss": 0.6942, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08775981524249422, |
|
"grad_norm": 5.367318153381348, |
|
"learning_rate": 1.0623556581986144e-05, |
|
"loss": 0.8011, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09237875288683603, |
|
"grad_norm": 3.1690614223480225, |
|
"learning_rate": 1.1200923787528869e-05, |
|
"loss": 0.6816, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09699769053117784, |
|
"grad_norm": 4.35976505279541, |
|
"learning_rate": 1.1778290993071595e-05, |
|
"loss": 0.7408, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10161662817551963, |
|
"grad_norm": 3.330937623977661, |
|
"learning_rate": 1.235565819861432e-05, |
|
"loss": 0.7159, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10623556581986143, |
|
"grad_norm": 5.761129379272461, |
|
"learning_rate": 1.2933025404157046e-05, |
|
"loss": 0.6838, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11085450346420324, |
|
"grad_norm": 7.05668830871582, |
|
"learning_rate": 1.351039260969977e-05, |
|
"loss": 0.7135, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11547344110854503, |
|
"grad_norm": 3.7135939598083496, |
|
"learning_rate": 1.4087759815242497e-05, |
|
"loss": 0.6385, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12009237875288684, |
|
"grad_norm": 5.477907657623291, |
|
"learning_rate": 1.4665127020785218e-05, |
|
"loss": 0.6292, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12471131639722864, |
|
"grad_norm": 6.577059268951416, |
|
"learning_rate": 1.5242494226327944e-05, |
|
"loss": 0.6921, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.12933025404157045, |
|
"grad_norm": 3.6328892707824707, |
|
"learning_rate": 1.581986143187067e-05, |
|
"loss": 0.6621, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13394919168591224, |
|
"grad_norm": 4.084783554077148, |
|
"learning_rate": 1.6397228637413393e-05, |
|
"loss": 0.6667, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.13856812933025403, |
|
"grad_norm": 3.8719701766967773, |
|
"learning_rate": 1.697459584295612e-05, |
|
"loss": 0.6692, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14318706697459585, |
|
"grad_norm": 7.860931873321533, |
|
"learning_rate": 1.7551963048498846e-05, |
|
"loss": 0.6251, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.14780600461893764, |
|
"grad_norm": 4.381837368011475, |
|
"learning_rate": 1.812933025404157e-05, |
|
"loss": 0.6297, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15242494226327943, |
|
"grad_norm": 3.7145886421203613, |
|
"learning_rate": 1.8706697459584295e-05, |
|
"loss": 0.6483, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.15704387990762125, |
|
"grad_norm": 2.609006643295288, |
|
"learning_rate": 1.9284064665127023e-05, |
|
"loss": 0.6149, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16166281755196305, |
|
"grad_norm": 4.774081230163574, |
|
"learning_rate": 1.9861431870669748e-05, |
|
"loss": 0.6034, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16628175519630484, |
|
"grad_norm": 7.305100440979004, |
|
"learning_rate": 2.0438799076212473e-05, |
|
"loss": 0.6496, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.17090069284064666, |
|
"grad_norm": 5.507181644439697, |
|
"learning_rate": 2.1016166281755197e-05, |
|
"loss": 0.643, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.17551963048498845, |
|
"grad_norm": 4.033135890960693, |
|
"learning_rate": 2.1593533487297922e-05, |
|
"loss": 0.6186, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18013856812933027, |
|
"grad_norm": 3.903007745742798, |
|
"learning_rate": 2.217090069284065e-05, |
|
"loss": 0.6041, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18475750577367206, |
|
"grad_norm": 4.785562992095947, |
|
"learning_rate": 2.2748267898383374e-05, |
|
"loss": 0.5527, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18937644341801385, |
|
"grad_norm": 3.4289231300354004, |
|
"learning_rate": 2.3325635103926096e-05, |
|
"loss": 0.5936, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19399538106235567, |
|
"grad_norm": 2.384840965270996, |
|
"learning_rate": 2.3903002309468824e-05, |
|
"loss": 0.5421, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19861431870669746, |
|
"grad_norm": 4.025755882263184, |
|
"learning_rate": 2.448036951501155e-05, |
|
"loss": 0.5839, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.20323325635103925, |
|
"grad_norm": 4.832013130187988, |
|
"learning_rate": 2.5057736720554276e-05, |
|
"loss": 0.5938, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.20785219399538107, |
|
"grad_norm": 3.66886305809021, |
|
"learning_rate": 2.5635103926096998e-05, |
|
"loss": 0.5607, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21247113163972287, |
|
"grad_norm": 3.7285852432250977, |
|
"learning_rate": 2.6212471131639726e-05, |
|
"loss": 0.5457, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21709006928406466, |
|
"grad_norm": 3.755711555480957, |
|
"learning_rate": 2.678983833718245e-05, |
|
"loss": 0.5721, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22170900692840648, |
|
"grad_norm": 4.016116619110107, |
|
"learning_rate": 2.7367205542725178e-05, |
|
"loss": 0.59, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.22632794457274827, |
|
"grad_norm": 6.123377799987793, |
|
"learning_rate": 2.79445727482679e-05, |
|
"loss": 0.6236, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23094688221709006, |
|
"grad_norm": 3.77093505859375, |
|
"learning_rate": 2.8521939953810624e-05, |
|
"loss": 0.6306, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8660, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.346142800487383e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|