|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 21873, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06857769853243725, |
|
"grad_norm": 1.0945861339569092, |
|
"learning_rate": 4.8857038357792714e-05, |
|
"loss": 0.318, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1371553970648745, |
|
"grad_norm": 0.6745380163192749, |
|
"learning_rate": 4.7714076715585427e-05, |
|
"loss": 0.1574, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20573309559731176, |
|
"grad_norm": 0.7351590991020203, |
|
"learning_rate": 4.657111507337814e-05, |
|
"loss": 0.1403, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.274310794129749, |
|
"grad_norm": 0.8816856145858765, |
|
"learning_rate": 4.542815343117085e-05, |
|
"loss": 0.1308, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.34288849266218624, |
|
"grad_norm": 0.8210085034370422, |
|
"learning_rate": 4.428519178896356e-05, |
|
"loss": 0.1228, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4114661911946235, |
|
"grad_norm": 0.6685318350791931, |
|
"learning_rate": 4.314223014675628e-05, |
|
"loss": 0.1219, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48004388972706075, |
|
"grad_norm": 0.7579106092453003, |
|
"learning_rate": 4.199926850454899e-05, |
|
"loss": 0.117, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.548621588259498, |
|
"grad_norm": 1.059991478919983, |
|
"learning_rate": 4.0856306862341706e-05, |
|
"loss": 0.1127, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6171992867919353, |
|
"grad_norm": 0.9921385049819946, |
|
"learning_rate": 3.971334522013441e-05, |
|
"loss": 0.1126, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6857769853243725, |
|
"grad_norm": 0.6036443114280701, |
|
"learning_rate": 3.857038357792713e-05, |
|
"loss": 0.1099, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7543546838568098, |
|
"grad_norm": 0.88544762134552, |
|
"learning_rate": 3.7427421935719835e-05, |
|
"loss": 0.1063, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.822932382389247, |
|
"grad_norm": 0.6532144546508789, |
|
"learning_rate": 3.6284460293512554e-05, |
|
"loss": 0.1065, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8915100809216843, |
|
"grad_norm": 0.755144476890564, |
|
"learning_rate": 3.5141498651305266e-05, |
|
"loss": 0.106, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9600877794541215, |
|
"grad_norm": 0.9388787150382996, |
|
"learning_rate": 3.399853700909797e-05, |
|
"loss": 0.1029, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0286654779865587, |
|
"grad_norm": 0.8779000043869019, |
|
"learning_rate": 3.285557536689069e-05, |
|
"loss": 0.1011, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.097243176518996, |
|
"grad_norm": 0.8181456923484802, |
|
"learning_rate": 3.1712613724683396e-05, |
|
"loss": 0.0941, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.1658208750514332, |
|
"grad_norm": 0.7053276300430298, |
|
"learning_rate": 3.0569652082476115e-05, |
|
"loss": 0.0943, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.2343985735838705, |
|
"grad_norm": 0.9002987146377563, |
|
"learning_rate": 2.9426690440268827e-05, |
|
"loss": 0.0942, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.3029762721163078, |
|
"grad_norm": 0.5714060664176941, |
|
"learning_rate": 2.8283728798061536e-05, |
|
"loss": 0.0937, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.371553970648745, |
|
"grad_norm": 0.7014051079750061, |
|
"learning_rate": 2.714076715585425e-05, |
|
"loss": 0.0925, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.4401316691811823, |
|
"grad_norm": 0.9592730402946472, |
|
"learning_rate": 2.599780551364696e-05, |
|
"loss": 0.0949, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.5087093677136196, |
|
"grad_norm": 0.6313921213150024, |
|
"learning_rate": 2.4854843871439675e-05, |
|
"loss": 0.0923, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.5772870662460567, |
|
"grad_norm": 0.8395763039588928, |
|
"learning_rate": 2.3711882229232387e-05, |
|
"loss": 0.0911, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.645864764778494, |
|
"grad_norm": 0.7302150130271912, |
|
"learning_rate": 2.25689205870251e-05, |
|
"loss": 0.0911, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.7144424633109314, |
|
"grad_norm": 0.5918404459953308, |
|
"learning_rate": 2.142595894481781e-05, |
|
"loss": 0.0896, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.7830201618433685, |
|
"grad_norm": 0.7551326155662537, |
|
"learning_rate": 2.0282997302610527e-05, |
|
"loss": 0.0905, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.8515978603758056, |
|
"grad_norm": 1.6809072494506836, |
|
"learning_rate": 1.914003566040324e-05, |
|
"loss": 0.0891, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.920175558908243, |
|
"grad_norm": 0.6433506011962891, |
|
"learning_rate": 1.799707401819595e-05, |
|
"loss": 0.0894, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.9887532574406803, |
|
"grad_norm": 0.7609830498695374, |
|
"learning_rate": 1.6854112375988663e-05, |
|
"loss": 0.0892, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.0573309559731174, |
|
"grad_norm": 0.611824095249176, |
|
"learning_rate": 1.5711150733781376e-05, |
|
"loss": 0.0847, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.125908654505555, |
|
"grad_norm": 0.7075466513633728, |
|
"learning_rate": 1.456818909157409e-05, |
|
"loss": 0.0841, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.194486353037992, |
|
"grad_norm": 0.6217673420906067, |
|
"learning_rate": 1.3425227449366801e-05, |
|
"loss": 0.0827, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.2630640515704292, |
|
"grad_norm": 0.862660825252533, |
|
"learning_rate": 1.2282265807159512e-05, |
|
"loss": 0.0841, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.3316417501028663, |
|
"grad_norm": 0.832482635974884, |
|
"learning_rate": 1.1139304164952224e-05, |
|
"loss": 0.0832, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.400219448635304, |
|
"grad_norm": 0.8499016761779785, |
|
"learning_rate": 9.996342522744938e-06, |
|
"loss": 0.0835, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.468797147167741, |
|
"grad_norm": 0.5063202977180481, |
|
"learning_rate": 8.85338088053765e-06, |
|
"loss": 0.082, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.537374845700178, |
|
"grad_norm": 0.8636651635169983, |
|
"learning_rate": 7.710419238330362e-06, |
|
"loss": 0.0816, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.6059525442326157, |
|
"grad_norm": 0.7150306105613708, |
|
"learning_rate": 6.567457596123075e-06, |
|
"loss": 0.0827, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.674530242765053, |
|
"grad_norm": 0.6820867657661438, |
|
"learning_rate": 5.424495953915787e-06, |
|
"loss": 0.0829, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.74310794129749, |
|
"grad_norm": 0.8032534718513489, |
|
"learning_rate": 4.2815343117085e-06, |
|
"loss": 0.0819, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.8116856398299275, |
|
"grad_norm": 0.6814680099487305, |
|
"learning_rate": 3.1385726695012116e-06, |
|
"loss": 0.0825, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.8802633383623646, |
|
"grad_norm": 0.5674154162406921, |
|
"learning_rate": 1.995611027293924e-06, |
|
"loss": 0.0791, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.9488410368948017, |
|
"grad_norm": 0.8843555450439453, |
|
"learning_rate": 8.526493850866365e-07, |
|
"loss": 0.0823, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 21873, |
|
"total_flos": 797101335136512.0, |
|
"train_loss": 0.09493603817759208, |
|
"train_runtime": 3874.7393, |
|
"train_samples_per_second": 45.156, |
|
"train_steps_per_second": 5.645 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 21873, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 797101335136512.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|