{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.536,
"eval_steps": 500,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.032,
"grad_norm": 0.047607421875,
"learning_rate": 0.0001,
"loss": 0.3836,
"step": 5
},
{
"epoch": 0.064,
"grad_norm": 0.04541015625,
"learning_rate": 0.0001,
"loss": 0.2449,
"step": 10
},
{
"epoch": 0.096,
"grad_norm": 0.056396484375,
"learning_rate": 0.0001,
"loss": 0.1513,
"step": 15
},
{
"epoch": 0.128,
"grad_norm": 0.03857421875,
"learning_rate": 0.0001,
"loss": 0.0705,
"step": 20
},
{
"epoch": 0.16,
"grad_norm": 0.0286865234375,
"learning_rate": 0.0001,
"loss": 0.0488,
"step": 25
},
{
"epoch": 0.192,
"grad_norm": 0.03173828125,
"learning_rate": 0.0001,
"loss": 0.0391,
"step": 30
},
{
"epoch": 0.224,
"grad_norm": 0.054931640625,
"learning_rate": 0.0001,
"loss": 0.0278,
"step": 35
},
{
"epoch": 0.256,
"grad_norm": 0.0888671875,
"learning_rate": 0.0001,
"loss": 0.1414,
"step": 40
},
{
"epoch": 0.288,
"grad_norm": 0.015625,
"learning_rate": 0.0001,
"loss": 0.0371,
"step": 45
},
{
"epoch": 0.32,
"grad_norm": 0.0257568359375,
"learning_rate": 0.0001,
"loss": 0.0118,
"step": 50
},
{
"epoch": 0.352,
"grad_norm": 0.02197265625,
"learning_rate": 0.0001,
"loss": 0.0101,
"step": 55
},
{
"epoch": 0.384,
"grad_norm": 0.020751953125,
"learning_rate": 0.0001,
"loss": 0.0098,
"step": 60
},
{
"epoch": 0.416,
"grad_norm": 0.0164794921875,
"learning_rate": 0.0001,
"loss": 0.0088,
"step": 65
},
{
"epoch": 0.448,
"grad_norm": 0.0120849609375,
"learning_rate": 0.0001,
"loss": 0.0086,
"step": 70
},
{
"epoch": 0.48,
"grad_norm": 0.0269775390625,
"learning_rate": 0.0001,
"loss": 0.0168,
"step": 75
},
{
"epoch": 0.512,
"grad_norm": 0.05078125,
"learning_rate": 0.0001,
"loss": 0.0572,
"step": 80
},
{
"epoch": 0.544,
"grad_norm": 0.031982421875,
"learning_rate": 0.0001,
"loss": 0.0092,
"step": 85
},
{
"epoch": 0.576,
"grad_norm": 0.0196533203125,
"learning_rate": 0.0001,
"loss": 0.0077,
"step": 90
},
{
"epoch": 0.608,
"grad_norm": 0.0238037109375,
"learning_rate": 0.0001,
"loss": 0.0054,
"step": 95
},
{
"epoch": 0.64,
"grad_norm": 0.0108642578125,
"learning_rate": 0.0001,
"loss": 0.0043,
"step": 100
},
{
"epoch": 0.672,
"grad_norm": 0.0091552734375,
"learning_rate": 0.0001,
"loss": 0.004,
"step": 105
},
{
"epoch": 0.704,
"grad_norm": 0.01336669921875,
"learning_rate": 0.0001,
"loss": 0.0043,
"step": 110
},
{
"epoch": 0.736,
"grad_norm": 0.033203125,
"learning_rate": 0.0001,
"loss": 0.0122,
"step": 115
},
{
"epoch": 0.768,
"grad_norm": 0.0169677734375,
"learning_rate": 0.0001,
"loss": 0.0173,
"step": 120
},
{
"epoch": 0.8,
"grad_norm": 0.00909423828125,
"learning_rate": 0.0001,
"loss": 0.0031,
"step": 125
},
{
"epoch": 0.832,
"grad_norm": 0.01171875,
"learning_rate": 0.0001,
"loss": 0.0038,
"step": 130
},
{
"epoch": 0.864,
"grad_norm": 0.00946044921875,
"learning_rate": 0.0001,
"loss": 0.0036,
"step": 135
},
{
"epoch": 0.896,
"grad_norm": 0.014892578125,
"learning_rate": 0.0001,
"loss": 0.0047,
"step": 140
},
{
"epoch": 0.928,
"grad_norm": 0.01239013671875,
"learning_rate": 0.0001,
"loss": 0.006,
"step": 145
},
{
"epoch": 0.96,
"grad_norm": 0.00982666015625,
"learning_rate": 0.0001,
"loss": 0.0032,
"step": 150
},
{
"epoch": 0.992,
"grad_norm": 0.01031494140625,
"learning_rate": 0.0001,
"loss": 0.0037,
"step": 155
},
{
"epoch": 1.024,
"grad_norm": 0.006927490234375,
"learning_rate": 0.0001,
"loss": 0.0036,
"step": 160
},
{
"epoch": 1.056,
"grad_norm": 0.0084228515625,
"learning_rate": 0.0001,
"loss": 0.0017,
"step": 165
},
{
"epoch": 1.088,
"grad_norm": 0.005584716796875,
"learning_rate": 0.0001,
"loss": 0.0018,
"step": 170
},
{
"epoch": 1.12,
"grad_norm": 0.006683349609375,
"learning_rate": 0.0001,
"loss": 0.0017,
"step": 175
},
{
"epoch": 1.152,
"grad_norm": 0.004486083984375,
"learning_rate": 0.0001,
"loss": 0.0016,
"step": 180
},
{
"epoch": 1.184,
"grad_norm": 0.0087890625,
"learning_rate": 0.0001,
"loss": 0.0026,
"step": 185
},
{
"epoch": 1.216,
"grad_norm": 0.0062255859375,
"learning_rate": 0.0001,
"loss": 0.0015,
"step": 190
},
{
"epoch": 1.248,
"grad_norm": 0.0128173828125,
"learning_rate": 0.0001,
"loss": 0.0026,
"step": 195
},
{
"epoch": 1.28,
"grad_norm": 0.006683349609375,
"learning_rate": 0.0001,
"loss": 0.0039,
"step": 200
},
{
"epoch": 1.312,
"grad_norm": 0.00787353515625,
"learning_rate": 0.0001,
"loss": 0.0019,
"step": 205
},
{
"epoch": 1.344,
"grad_norm": 0.0096435546875,
"learning_rate": 0.0001,
"loss": 0.0011,
"step": 210
},
{
"epoch": 1.376,
"grad_norm": 0.0096435546875,
"learning_rate": 0.0001,
"loss": 0.0016,
"step": 215
},
{
"epoch": 1.408,
"grad_norm": 0.005859375,
"learning_rate": 0.0001,
"loss": 0.0014,
"step": 220
},
{
"epoch": 1.44,
"grad_norm": 0.00848388671875,
"learning_rate": 0.0001,
"loss": 0.0014,
"step": 225
},
{
"epoch": 1.472,
"grad_norm": 0.015625,
"learning_rate": 0.0001,
"loss": 0.002,
"step": 230
},
{
"epoch": 1.504,
"grad_norm": 0.03857421875,
"learning_rate": 0.0001,
"loss": 0.0067,
"step": 235
},
{
"epoch": 1.536,
"grad_norm": 0.00811767578125,
"learning_rate": 0.0001,
"loss": 0.0062,
"step": 240
}
],
"logging_steps": 5,
"max_steps": 240,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 90,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.355246833433805e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}