human_heavy / trainer_state.json
tadsatlawa-na's picture
Add human_heavy_model
3f43d0c
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.991040318566451,
"eval_steps": 10000,
"global_step": 500000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"learning_rate": 9.83937751004016e-06,
"loss": 1.3677,
"step": 10000
},
{
"epoch": 0.04,
"eval_loss": 0.8635309934616089,
"eval_runtime": 127.2968,
"eval_samples_per_second": 1932.492,
"eval_steps_per_second": 20.134,
"step": 10000
},
{
"epoch": 0.08,
"learning_rate": 9.638674698795182e-06,
"loss": 0.7723,
"step": 20000
},
{
"epoch": 0.08,
"eval_loss": 0.6709557175636292,
"eval_runtime": 127.3309,
"eval_samples_per_second": 1931.974,
"eval_steps_per_second": 20.129,
"step": 20000
},
{
"epoch": 0.12,
"learning_rate": 9.437951807228917e-06,
"loss": 0.6635,
"step": 30000
},
{
"epoch": 0.12,
"eval_loss": 0.6188660264015198,
"eval_runtime": 127.3716,
"eval_samples_per_second": 1931.357,
"eval_steps_per_second": 20.122,
"step": 30000
},
{
"epoch": 0.16,
"learning_rate": 9.237248995983937e-06,
"loss": 0.62,
"step": 40000
},
{
"epoch": 0.16,
"eval_loss": 0.5857706069946289,
"eval_runtime": 128.3689,
"eval_samples_per_second": 1916.353,
"eval_steps_per_second": 19.966,
"step": 40000
},
{
"epoch": 0.2,
"learning_rate": 9.036465863453816e-06,
"loss": 0.5933,
"step": 50000
},
{
"epoch": 0.2,
"eval_loss": 0.5672558546066284,
"eval_runtime": 128.5129,
"eval_samples_per_second": 1914.205,
"eval_steps_per_second": 19.944,
"step": 50000
},
{
"epoch": 0.24,
"learning_rate": 8.835763052208836e-06,
"loss": 0.5755,
"step": 60000
},
{
"epoch": 0.24,
"eval_loss": 0.5512102842330933,
"eval_runtime": 127.3167,
"eval_samples_per_second": 1932.19,
"eval_steps_per_second": 20.131,
"step": 60000
},
{
"epoch": 0.28,
"learning_rate": 8.63504016064257e-06,
"loss": 0.5622,
"step": 70000
},
{
"epoch": 0.28,
"eval_loss": 0.5416693687438965,
"eval_runtime": 127.5393,
"eval_samples_per_second": 1928.817,
"eval_steps_per_second": 20.096,
"step": 70000
},
{
"epoch": 0.32,
"learning_rate": 8.434337349397592e-06,
"loss": 0.5522,
"step": 80000
},
{
"epoch": 0.32,
"eval_loss": 0.5347551107406616,
"eval_runtime": 127.3132,
"eval_samples_per_second": 1932.243,
"eval_steps_per_second": 20.131,
"step": 80000
},
{
"epoch": 0.36,
"learning_rate": 8.23363453815261e-06,
"loss": 0.5437,
"step": 90000
},
{
"epoch": 0.36,
"eval_loss": 0.5291008353233337,
"eval_runtime": 127.3464,
"eval_samples_per_second": 1931.739,
"eval_steps_per_second": 20.126,
"step": 90000
},
{
"epoch": 0.4,
"learning_rate": 8.032931726907631e-06,
"loss": 0.5365,
"step": 100000
},
{
"epoch": 0.4,
"eval_loss": 0.5225369334220886,
"eval_runtime": 127.5532,
"eval_samples_per_second": 1928.608,
"eval_steps_per_second": 20.094,
"step": 100000
},
{
"epoch": 0.44,
"learning_rate": 7.832228915662651e-06,
"loss": 0.5309,
"step": 110000
},
{
"epoch": 0.44,
"eval_loss": 0.5173077583312988,
"eval_runtime": 127.5475,
"eval_samples_per_second": 1928.693,
"eval_steps_per_second": 20.094,
"step": 110000
},
{
"epoch": 0.48,
"learning_rate": 7.63152610441767e-06,
"loss": 0.5252,
"step": 120000
},
{
"epoch": 0.48,
"eval_loss": 0.513536274433136,
"eval_runtime": 130.7886,
"eval_samples_per_second": 1880.897,
"eval_steps_per_second": 19.597,
"step": 120000
},
{
"epoch": 0.52,
"learning_rate": 7.430823293172691e-06,
"loss": 0.5204,
"step": 130000
},
{
"epoch": 0.52,
"eval_loss": 0.5111202001571655,
"eval_runtime": 127.6816,
"eval_samples_per_second": 1926.668,
"eval_steps_per_second": 20.073,
"step": 130000
},
{
"epoch": 0.56,
"learning_rate": 7.230100401606426e-06,
"loss": 0.518,
"step": 140000
},
{
"epoch": 0.56,
"eval_loss": 0.5084987282752991,
"eval_runtime": 128.2814,
"eval_samples_per_second": 1917.659,
"eval_steps_per_second": 19.98,
"step": 140000
},
{
"epoch": 0.6,
"learning_rate": 7.029397590361447e-06,
"loss": 0.5135,
"step": 150000
},
{
"epoch": 0.6,
"eval_loss": 0.5029130578041077,
"eval_runtime": 130.8657,
"eval_samples_per_second": 1879.79,
"eval_steps_per_second": 19.585,
"step": 150000
},
{
"epoch": 0.64,
"learning_rate": 6.828694779116466e-06,
"loss": 0.5101,
"step": 160000
},
{
"epoch": 0.64,
"eval_loss": 0.5005983710289001,
"eval_runtime": 128.9427,
"eval_samples_per_second": 1907.824,
"eval_steps_per_second": 19.877,
"step": 160000
},
{
"epoch": 0.68,
"learning_rate": 6.627991967871487e-06,
"loss": 0.5065,
"step": 170000
},
{
"epoch": 0.68,
"eval_loss": 0.4987814128398895,
"eval_runtime": 127.8532,
"eval_samples_per_second": 1924.081,
"eval_steps_per_second": 20.046,
"step": 170000
},
{
"epoch": 0.72,
"learning_rate": 6.427289156626506e-06,
"loss": 0.5052,
"step": 180000
},
{
"epoch": 0.72,
"eval_loss": 0.49448052048683167,
"eval_runtime": 128.0108,
"eval_samples_per_second": 1921.713,
"eval_steps_per_second": 20.022,
"step": 180000
},
{
"epoch": 0.76,
"learning_rate": 6.226586345381527e-06,
"loss": 0.5025,
"step": 190000
},
{
"epoch": 0.76,
"eval_loss": 0.49206921458244324,
"eval_runtime": 127.1399,
"eval_samples_per_second": 1934.877,
"eval_steps_per_second": 20.159,
"step": 190000
},
{
"epoch": 0.8,
"learning_rate": 6.025883534136546e-06,
"loss": 0.4998,
"step": 200000
},
{
"epoch": 0.8,
"eval_loss": 0.4929586946964264,
"eval_runtime": 127.6742,
"eval_samples_per_second": 1926.779,
"eval_steps_per_second": 20.075,
"step": 200000
},
{
"epoch": 0.84,
"learning_rate": 5.825180722891567e-06,
"loss": 0.4982,
"step": 210000
},
{
"epoch": 0.84,
"eval_loss": 0.48860839009284973,
"eval_runtime": 127.3878,
"eval_samples_per_second": 1931.112,
"eval_steps_per_second": 20.12,
"step": 210000
},
{
"epoch": 0.88,
"learning_rate": 5.6244578313253014e-06,
"loss": 0.4969,
"step": 220000
},
{
"epoch": 0.88,
"eval_loss": 0.4888823628425598,
"eval_runtime": 127.2533,
"eval_samples_per_second": 1933.152,
"eval_steps_per_second": 20.141,
"step": 220000
},
{
"epoch": 0.92,
"learning_rate": 5.423755020080321e-06,
"loss": 0.495,
"step": 230000
},
{
"epoch": 0.92,
"eval_loss": 0.4841912090778351,
"eval_runtime": 126.968,
"eval_samples_per_second": 1937.496,
"eval_steps_per_second": 20.186,
"step": 230000
},
{
"epoch": 0.96,
"learning_rate": 5.223052208835342e-06,
"loss": 0.4927,
"step": 240000
},
{
"epoch": 0.96,
"eval_loss": 0.4853549897670746,
"eval_runtime": 127.4501,
"eval_samples_per_second": 1930.167,
"eval_steps_per_second": 20.11,
"step": 240000
},
{
"epoch": 1.0,
"learning_rate": 5.022349397590361e-06,
"loss": 0.4914,
"step": 250000
},
{
"epoch": 1.0,
"eval_loss": 0.4826248586177826,
"eval_runtime": 127.3161,
"eval_samples_per_second": 1932.198,
"eval_steps_per_second": 20.131,
"step": 250000
},
{
"epoch": 1.04,
"learning_rate": 4.821646586345382e-06,
"loss": 0.4902,
"step": 260000
},
{
"epoch": 1.04,
"eval_loss": 0.48145654797554016,
"eval_runtime": 127.4143,
"eval_samples_per_second": 1930.709,
"eval_steps_per_second": 20.115,
"step": 260000
},
{
"epoch": 1.08,
"learning_rate": 4.620943775100402e-06,
"loss": 0.4894,
"step": 270000
},
{
"epoch": 1.08,
"eval_loss": 0.47896286845207214,
"eval_runtime": 127.419,
"eval_samples_per_second": 1930.638,
"eval_steps_per_second": 20.115,
"step": 270000
},
{
"epoch": 1.11,
"learning_rate": 4.420240963855422e-06,
"loss": 0.4881,
"step": 280000
},
{
"epoch": 1.11,
"eval_loss": 0.48297473788261414,
"eval_runtime": 127.4472,
"eval_samples_per_second": 1930.211,
"eval_steps_per_second": 20.11,
"step": 280000
},
{
"epoch": 1.15,
"learning_rate": 4.219538152610443e-06,
"loss": 0.487,
"step": 290000
},
{
"epoch": 1.15,
"eval_loss": 0.47816893458366394,
"eval_runtime": 127.1599,
"eval_samples_per_second": 1934.573,
"eval_steps_per_second": 20.156,
"step": 290000
},
{
"epoch": 1.19,
"learning_rate": 4.018835341365462e-06,
"loss": 0.4859,
"step": 300000
},
{
"epoch": 1.19,
"eval_loss": 0.4779074192047119,
"eval_runtime": 127.96,
"eval_samples_per_second": 1922.476,
"eval_steps_per_second": 20.03,
"step": 300000
},
{
"epoch": 1.23,
"learning_rate": 3.818132530120483e-06,
"loss": 0.4845,
"step": 310000
},
{
"epoch": 1.23,
"eval_loss": 0.47683581709861755,
"eval_runtime": 127.9081,
"eval_samples_per_second": 1923.256,
"eval_steps_per_second": 20.038,
"step": 310000
},
{
"epoch": 1.27,
"learning_rate": 3.6174297188755025e-06,
"loss": 0.4835,
"step": 320000
},
{
"epoch": 1.27,
"eval_loss": 0.4755454957485199,
"eval_runtime": 127.7316,
"eval_samples_per_second": 1925.914,
"eval_steps_per_second": 20.066,
"step": 320000
},
{
"epoch": 1.31,
"learning_rate": 3.4167068273092375e-06,
"loss": 0.483,
"step": 330000
},
{
"epoch": 1.31,
"eval_loss": 0.4744352400302887,
"eval_runtime": 128.197,
"eval_samples_per_second": 1918.921,
"eval_steps_per_second": 19.993,
"step": 330000
},
{
"epoch": 1.35,
"learning_rate": 3.2160040160642576e-06,
"loss": 0.4819,
"step": 340000
},
{
"epoch": 1.35,
"eval_loss": 0.4745638966560364,
"eval_runtime": 128.0584,
"eval_samples_per_second": 1920.999,
"eval_steps_per_second": 20.014,
"step": 340000
},
{
"epoch": 1.39,
"learning_rate": 3.0153012048192777e-06,
"loss": 0.481,
"step": 350000
},
{
"epoch": 1.39,
"eval_loss": 0.4744107723236084,
"eval_runtime": 128.1739,
"eval_samples_per_second": 1919.268,
"eval_steps_per_second": 19.996,
"step": 350000
},
{
"epoch": 1.43,
"learning_rate": 2.8145983935742978e-06,
"loss": 0.481,
"step": 360000
},
{
"epoch": 1.43,
"eval_loss": 0.472385436296463,
"eval_runtime": 128.2659,
"eval_samples_per_second": 1917.89,
"eval_steps_per_second": 19.982,
"step": 360000
},
{
"epoch": 1.47,
"learning_rate": 2.613895582329318e-06,
"loss": 0.4799,
"step": 370000
},
{
"epoch": 1.47,
"eval_loss": 0.4733026921749115,
"eval_runtime": 127.698,
"eval_samples_per_second": 1926.419,
"eval_steps_per_second": 20.071,
"step": 370000
},
{
"epoch": 1.51,
"learning_rate": 2.4131927710843376e-06,
"loss": 0.4795,
"step": 380000
},
{
"epoch": 1.51,
"eval_loss": 0.4719351530075073,
"eval_runtime": 128.2337,
"eval_samples_per_second": 1918.372,
"eval_steps_per_second": 19.987,
"step": 380000
},
{
"epoch": 1.55,
"learning_rate": 2.2124899598393577e-06,
"loss": 0.4784,
"step": 390000
},
{
"epoch": 1.55,
"eval_loss": 0.4699419438838959,
"eval_runtime": 127.9847,
"eval_samples_per_second": 1922.105,
"eval_steps_per_second": 20.026,
"step": 390000
},
{
"epoch": 1.59,
"learning_rate": 2.0117871485943778e-06,
"loss": 0.4785,
"step": 400000
},
{
"epoch": 1.59,
"eval_loss": 0.4711839556694031,
"eval_runtime": 127.89,
"eval_samples_per_second": 1923.527,
"eval_steps_per_second": 20.041,
"step": 400000
},
{
"epoch": 1.63,
"learning_rate": 1.8110843373493979e-06,
"loss": 0.4777,
"step": 410000
},
{
"epoch": 1.63,
"eval_loss": 0.46987083554267883,
"eval_runtime": 128.85,
"eval_samples_per_second": 1909.197,
"eval_steps_per_second": 19.891,
"step": 410000
},
{
"epoch": 1.67,
"learning_rate": 1.6103614457831327e-06,
"loss": 0.477,
"step": 420000
},
{
"epoch": 1.67,
"eval_loss": 0.46960577368736267,
"eval_runtime": 130.3922,
"eval_samples_per_second": 1886.616,
"eval_steps_per_second": 19.656,
"step": 420000
},
{
"epoch": 1.71,
"learning_rate": 1.4096586345381528e-06,
"loss": 0.4771,
"step": 430000
},
{
"epoch": 1.71,
"eval_loss": 0.47003933787345886,
"eval_runtime": 129.4605,
"eval_samples_per_second": 1900.193,
"eval_steps_per_second": 19.798,
"step": 430000
},
{
"epoch": 1.75,
"learning_rate": 1.2089558232931729e-06,
"loss": 0.4766,
"step": 440000
},
{
"epoch": 1.75,
"eval_loss": 0.47017282247543335,
"eval_runtime": 129.4902,
"eval_samples_per_second": 1899.758,
"eval_steps_per_second": 19.793,
"step": 440000
},
{
"epoch": 1.79,
"learning_rate": 1.008253012048193e-06,
"loss": 0.476,
"step": 450000
},
{
"epoch": 1.79,
"eval_loss": 0.46954795718193054,
"eval_runtime": 129.5407,
"eval_samples_per_second": 1899.017,
"eval_steps_per_second": 19.785,
"step": 450000
},
{
"epoch": 1.83,
"learning_rate": 8.07550200803213e-07,
"loss": 0.4757,
"step": 460000
},
{
"epoch": 1.83,
"eval_loss": 0.4694086015224457,
"eval_runtime": 129.2469,
"eval_samples_per_second": 1903.334,
"eval_steps_per_second": 19.83,
"step": 460000
},
{
"epoch": 1.87,
"learning_rate": 6.068273092369479e-07,
"loss": 0.4758,
"step": 470000
},
{
"epoch": 1.87,
"eval_loss": 0.4685874581336975,
"eval_runtime": 129.0023,
"eval_samples_per_second": 1906.943,
"eval_steps_per_second": 19.868,
"step": 470000
},
{
"epoch": 1.91,
"learning_rate": 4.061244979919679e-07,
"loss": 0.4754,
"step": 480000
},
{
"epoch": 1.91,
"eval_loss": 0.46817249059677124,
"eval_runtime": 130.8406,
"eval_samples_per_second": 1880.15,
"eval_steps_per_second": 19.589,
"step": 480000
},
{
"epoch": 1.95,
"learning_rate": 2.0542168674698798e-07,
"loss": 0.475,
"step": 490000
},
{
"epoch": 1.95,
"eval_loss": 0.4691283404827118,
"eval_runtime": 129.0096,
"eval_samples_per_second": 1906.836,
"eval_steps_per_second": 19.867,
"step": 490000
},
{
"epoch": 1.99,
"learning_rate": 4.718875502008032e-09,
"loss": 0.4756,
"step": 500000
},
{
"epoch": 1.99,
"eval_loss": 0.46795061230659485,
"eval_runtime": 129.0896,
"eval_samples_per_second": 1905.653,
"eval_steps_per_second": 19.854,
"step": 500000
}
],
"logging_steps": 10000,
"max_steps": 500000,
"num_train_epochs": 2,
"save_steps": 10000,
"total_flos": 8.290835482935528e+17,
"trial_name": null,
"trial_params": null
}