{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.673469387755102,
  "eval_steps": 500,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "learning_rate": 6.666666666666667e-05,
      "loss": 3.1608,
      "step": 1
    },
    {
      "epoch": 0.08,
      "learning_rate": 0.00013333333333333334,
      "loss": 3.7941,
      "step": 2
    },
    {
      "epoch": 0.12,
      "learning_rate": 0.0002,
      "loss": 3.2615,
      "step": 3
    },
    {
      "epoch": 0.16,
      "learning_rate": 0.00019784946236559142,
      "loss": 2.493,
      "step": 4
    },
    {
      "epoch": 0.2,
      "learning_rate": 0.0001956989247311828,
      "loss": 2.478,
      "step": 5
    },
    {
      "epoch": 0.24,
      "learning_rate": 0.00019354838709677422,
      "loss": 2.4586,
      "step": 6
    },
    {
      "epoch": 0.29,
      "learning_rate": 0.0001913978494623656,
      "loss": 1.8358,
      "step": 7
    },
    {
      "epoch": 0.33,
      "learning_rate": 0.000189247311827957,
      "loss": 1.8871,
      "step": 8
    },
    {
      "epoch": 0.37,
      "learning_rate": 0.0001870967741935484,
      "loss": 1.8396,
      "step": 9
    },
    {
      "epoch": 0.41,
      "learning_rate": 0.00018494623655913978,
      "loss": 1.9403,
      "step": 10
    },
    {
      "epoch": 0.45,
      "learning_rate": 0.0001827956989247312,
      "loss": 1.9696,
      "step": 11
    },
    {
      "epoch": 0.49,
      "learning_rate": 0.00018064516129032257,
      "loss": 2.0119,
      "step": 12
    },
    {
      "epoch": 0.53,
      "learning_rate": 0.00017849462365591398,
      "loss": 1.7039,
      "step": 13
    },
    {
      "epoch": 0.57,
      "learning_rate": 0.0001763440860215054,
      "loss": 1.5358,
      "step": 14
    },
    {
      "epoch": 0.61,
      "learning_rate": 0.00017419354838709678,
      "loss": 1.4563,
      "step": 15
    },
    {
      "epoch": 0.65,
      "learning_rate": 0.0001720430107526882,
      "loss": 1.4858,
      "step": 16
    },
    {
      "epoch": 0.69,
      "learning_rate": 0.00016989247311827957,
      "loss": 1.6904,
      "step": 17
    },
    {
      "epoch": 0.73,
      "learning_rate": 0.00016774193548387098,
      "loss": 1.9223,
      "step": 18
    },
    {
      "epoch": 0.78,
      "learning_rate": 0.0001655913978494624,
      "loss": 1.4318,
      "step": 19
    },
    {
      "epoch": 0.82,
      "learning_rate": 0.00016344086021505378,
      "loss": 1.3245,
      "step": 20
    },
    {
      "epoch": 0.86,
      "learning_rate": 0.00016129032258064516,
      "loss": 1.2831,
      "step": 21
    },
    {
      "epoch": 0.9,
      "learning_rate": 0.00015913978494623657,
      "loss": 1.4268,
      "step": 22
    },
    {
      "epoch": 0.94,
      "learning_rate": 0.00015698924731182796,
      "loss": 1.4465,
      "step": 23
    },
    {
      "epoch": 0.98,
      "learning_rate": 0.00015483870967741937,
      "loss": 1.5686,
      "step": 24
    },
    {
      "epoch": 1.02,
      "learning_rate": 0.00015268817204301075,
      "loss": 1.3262,
      "step": 25
    },
    {
      "epoch": 1.06,
      "learning_rate": 0.00015053763440860216,
      "loss": 1.1648,
      "step": 26
    },
    {
      "epoch": 1.1,
      "learning_rate": 0.00014838709677419355,
      "loss": 1.1263,
      "step": 27
    },
    {
      "epoch": 1.14,
      "learning_rate": 0.00014623655913978496,
      "loss": 1.119,
      "step": 28
    },
    {
      "epoch": 1.18,
      "learning_rate": 0.00014408602150537637,
      "loss": 1.1306,
      "step": 29
    },
    {
      "epoch": 1.22,
      "learning_rate": 0.00014193548387096775,
      "loss": 1.2271,
      "step": 30
    },
    {
      "epoch": 1.27,
      "learning_rate": 0.00013978494623655916,
      "loss": 1.269,
      "step": 31
    },
    {
      "epoch": 1.31,
      "learning_rate": 0.00013763440860215055,
      "loss": 0.9227,
      "step": 32
    },
    {
      "epoch": 1.35,
      "learning_rate": 0.00013548387096774193,
      "loss": 0.9733,
      "step": 33
    },
    {
      "epoch": 1.39,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.8932,
      "step": 34
    },
    {
      "epoch": 1.43,
      "learning_rate": 0.00013118279569892472,
      "loss": 0.9639,
      "step": 35
    },
    {
      "epoch": 1.47,
      "learning_rate": 0.00012903225806451613,
      "loss": 1.0789,
      "step": 36
    },
    {
      "epoch": 1.51,
      "learning_rate": 0.00012688172043010752,
      "loss": 1.2016,
      "step": 37
    },
    {
      "epoch": 1.55,
      "learning_rate": 0.00012473118279569893,
      "loss": 0.8486,
      "step": 38
    },
    {
      "epoch": 1.59,
      "learning_rate": 0.00012258064516129034,
      "loss": 0.8141,
      "step": 39
    },
    {
      "epoch": 1.63,
      "learning_rate": 0.00012043010752688172,
      "loss": 0.8986,
      "step": 40
    },
    {
      "epoch": 1.67,
      "learning_rate": 0.00011827956989247313,
      "loss": 0.9075,
      "step": 41
    },
    {
      "epoch": 1.71,
      "learning_rate": 0.00011612903225806453,
      "loss": 0.9939,
      "step": 42
    },
    {
      "epoch": 1.76,
      "learning_rate": 0.00011397849462365593,
      "loss": 1.1734,
      "step": 43
    },
    {
      "epoch": 1.8,
      "learning_rate": 0.00011182795698924731,
      "loss": 0.7703,
      "step": 44
    },
    {
      "epoch": 1.84,
      "learning_rate": 0.00010967741935483871,
      "loss": 0.7249,
      "step": 45
    },
    {
      "epoch": 1.88,
      "learning_rate": 0.00010752688172043011,
      "loss": 0.6638,
      "step": 46
    },
    {
      "epoch": 1.92,
      "learning_rate": 0.0001053763440860215,
      "loss": 0.754,
      "step": 47
    },
    {
      "epoch": 1.96,
      "learning_rate": 0.0001032258064516129,
      "loss": 1.1539,
      "step": 48
    },
    {
      "epoch": 2.0,
      "learning_rate": 0.0001010752688172043,
      "loss": 0.8483,
      "step": 49
    },
    {
      "epoch": 2.04,
      "learning_rate": 9.892473118279571e-05,
      "loss": 0.8251,
      "step": 50
    },
    {
      "epoch": 2.08,
      "learning_rate": 9.677419354838711e-05,
      "loss": 0.5105,
      "step": 51
    },
    {
      "epoch": 2.12,
      "learning_rate": 9.46236559139785e-05,
      "loss": 0.4452,
      "step": 52
    },
    {
      "epoch": 2.16,
      "learning_rate": 9.247311827956989e-05,
      "loss": 0.5524,
      "step": 53
    },
    {
      "epoch": 2.2,
      "learning_rate": 9.032258064516129e-05,
      "loss": 0.6445,
      "step": 54
    },
    {
      "epoch": 2.24,
      "learning_rate": 8.81720430107527e-05,
      "loss": 0.6312,
      "step": 55
    },
    {
      "epoch": 2.29,
      "learning_rate": 8.60215053763441e-05,
      "loss": 0.8308,
      "step": 56
    },
    {
      "epoch": 2.33,
      "learning_rate": 8.387096774193549e-05,
      "loss": 0.4779,
      "step": 57
    },
    {
      "epoch": 2.37,
      "learning_rate": 8.172043010752689e-05,
      "loss": 0.4451,
      "step": 58
    },
    {
      "epoch": 2.41,
      "learning_rate": 7.956989247311829e-05,
      "loss": 0.4612,
      "step": 59
    },
    {
      "epoch": 2.45,
      "learning_rate": 7.741935483870968e-05,
      "loss": 0.5381,
      "step": 60
    },
    {
      "epoch": 2.49,
      "learning_rate": 7.526881720430108e-05,
      "loss": 0.6287,
      "step": 61
    },
    {
      "epoch": 2.53,
      "learning_rate": 7.311827956989248e-05,
      "loss": 0.6218,
      "step": 62
    },
    {
      "epoch": 2.57,
      "learning_rate": 7.096774193548388e-05,
      "loss": 0.44,
      "step": 63
    },
    {
      "epoch": 2.61,
      "learning_rate": 6.881720430107527e-05,
      "loss": 0.453,
      "step": 64
    },
    {
      "epoch": 2.65,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.4534,
      "step": 65
    },
    {
      "epoch": 2.69,
      "learning_rate": 6.451612903225807e-05,
      "loss": 0.5502,
      "step": 66
    },
    {
      "epoch": 2.73,
      "learning_rate": 6.236559139784946e-05,
      "loss": 0.5771,
      "step": 67
    },
    {
      "epoch": 2.78,
      "learning_rate": 6.021505376344086e-05,
      "loss": 0.5361,
      "step": 68
    },
    {
      "epoch": 2.82,
      "learning_rate": 5.8064516129032266e-05,
      "loss": 0.4565,
      "step": 69
    },
    {
      "epoch": 2.86,
      "learning_rate": 5.5913978494623656e-05,
      "loss": 0.4856,
      "step": 70
    },
    {
      "epoch": 2.9,
      "learning_rate": 5.3763440860215054e-05,
      "loss": 0.4482,
      "step": 71
    },
    {
      "epoch": 2.94,
      "learning_rate": 5.161290322580645e-05,
      "loss": 0.545,
      "step": 72
    },
    {
      "epoch": 2.98,
      "learning_rate": 4.9462365591397855e-05,
      "loss": 0.59,
      "step": 73
    },
    {
      "epoch": 3.02,
      "learning_rate": 4.731182795698925e-05,
      "loss": 0.6918,
      "step": 74
    },
    {
      "epoch": 3.06,
      "learning_rate": 4.516129032258064e-05,
      "loss": 0.3778,
      "step": 75
    },
    {
      "epoch": 3.1,
      "learning_rate": 4.301075268817205e-05,
      "loss": 0.3713,
      "step": 76
    },
    {
      "epoch": 3.14,
      "learning_rate": 4.0860215053763444e-05,
      "loss": 0.3689,
      "step": 77
    },
    {
      "epoch": 3.18,
      "learning_rate": 3.870967741935484e-05,
      "loss": 0.3884,
      "step": 78
    },
    {
      "epoch": 3.22,
      "learning_rate": 3.655913978494624e-05,
      "loss": 0.4363,
      "step": 79
    },
    {
      "epoch": 3.27,
      "learning_rate": 3.4408602150537636e-05,
      "loss": 0.627,
      "step": 80
    },
    {
      "epoch": 3.31,
      "learning_rate": 3.2258064516129034e-05,
      "loss": 0.357,
      "step": 81
    },
    {
      "epoch": 3.35,
      "learning_rate": 3.010752688172043e-05,
      "loss": 0.3098,
      "step": 82
    },
    {
      "epoch": 3.39,
      "learning_rate": 2.7956989247311828e-05,
      "loss": 0.341,
      "step": 83
    },
    {
      "epoch": 3.43,
      "learning_rate": 2.5806451612903226e-05,
      "loss": 0.3881,
      "step": 84
    },
    {
      "epoch": 3.47,
      "learning_rate": 2.3655913978494626e-05,
      "loss": 0.4509,
      "step": 85
    },
    {
      "epoch": 3.51,
      "learning_rate": 2.1505376344086024e-05,
      "loss": 0.6519,
      "step": 86
    },
    {
      "epoch": 3.55,
      "learning_rate": 1.935483870967742e-05,
      "loss": 0.3646,
      "step": 87
    },
    {
      "epoch": 3.59,
      "learning_rate": 1.7204301075268818e-05,
      "loss": 0.2933,
      "step": 88
    },
    {
      "epoch": 3.63,
      "learning_rate": 1.5053763440860215e-05,
      "loss": 0.3629,
      "step": 89
    },
    {
      "epoch": 3.67,
      "learning_rate": 1.2903225806451613e-05,
      "loss": 0.3859,
      "step": 90
    }
  ],
  "logging_steps": 1,
  "max_steps": 96,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 10,
  "total_flos": 1.2623208115765248e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}