{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.991040318566451,
  "eval_steps": 10000,
  "global_step": 500000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "learning_rate": 9.83937751004016e-06,
      "loss": 1.3677,
      "step": 10000
    },
    {
      "epoch": 0.04,
      "eval_loss": 0.8635309934616089,
      "eval_runtime": 127.2968,
      "eval_samples_per_second": 1932.492,
      "eval_steps_per_second": 20.134,
      "step": 10000
    },
    {
      "epoch": 0.08,
      "learning_rate": 9.638674698795182e-06,
      "loss": 0.7723,
      "step": 20000
    },
    {
      "epoch": 0.08,
      "eval_loss": 0.6709557175636292,
      "eval_runtime": 127.3309,
      "eval_samples_per_second": 1931.974,
      "eval_steps_per_second": 20.129,
      "step": 20000
    },
    {
      "epoch": 0.12,
      "learning_rate": 9.437951807228917e-06,
      "loss": 0.6635,
      "step": 30000
    },
    {
      "epoch": 0.12,
      "eval_loss": 0.6188660264015198,
      "eval_runtime": 127.3716,
      "eval_samples_per_second": 1931.357,
      "eval_steps_per_second": 20.122,
      "step": 30000
    },
    {
      "epoch": 0.16,
      "learning_rate": 9.237248995983937e-06,
      "loss": 0.62,
      "step": 40000
    },
    {
      "epoch": 0.16,
      "eval_loss": 0.5857706069946289,
      "eval_runtime": 128.3689,
      "eval_samples_per_second": 1916.353,
      "eval_steps_per_second": 19.966,
      "step": 40000
    },
    {
      "epoch": 0.2,
      "learning_rate": 9.036465863453816e-06,
      "loss": 0.5933,
      "step": 50000
    },
    {
      "epoch": 0.2,
      "eval_loss": 0.5672558546066284,
      "eval_runtime": 128.5129,
      "eval_samples_per_second": 1914.205,
      "eval_steps_per_second": 19.944,
      "step": 50000
    },
    {
      "epoch": 0.24,
      "learning_rate": 8.835763052208836e-06,
      "loss": 0.5755,
      "step": 60000
    },
    {
      "epoch": 0.24,
      "eval_loss": 0.5512102842330933,
      "eval_runtime": 127.3167,
      "eval_samples_per_second": 1932.19,
      "eval_steps_per_second": 20.131,
      "step": 60000
    },
    {
      "epoch": 0.28,
      "learning_rate": 8.63504016064257e-06,
      "loss": 0.5622,
      "step": 70000
    },
    {
      "epoch": 0.28,
      "eval_loss": 0.5416693687438965,
      "eval_runtime": 127.5393,
      "eval_samples_per_second": 1928.817,
      "eval_steps_per_second": 20.096,
      "step": 70000
    },
    {
      "epoch": 0.32,
      "learning_rate": 8.434337349397592e-06,
      "loss": 0.5522,
      "step": 80000
    },
    {
      "epoch": 0.32,
      "eval_loss": 0.5347551107406616,
      "eval_runtime": 127.3132,
      "eval_samples_per_second": 1932.243,
      "eval_steps_per_second": 20.131,
      "step": 80000
    },
    {
      "epoch": 0.36,
      "learning_rate": 8.23363453815261e-06,
      "loss": 0.5437,
      "step": 90000
    },
    {
      "epoch": 0.36,
      "eval_loss": 0.5291008353233337,
      "eval_runtime": 127.3464,
      "eval_samples_per_second": 1931.739,
      "eval_steps_per_second": 20.126,
      "step": 90000
    },
    {
      "epoch": 0.4,
      "learning_rate": 8.032931726907631e-06,
      "loss": 0.5365,
      "step": 100000
    },
    {
      "epoch": 0.4,
      "eval_loss": 0.5225369334220886,
      "eval_runtime": 127.5532,
      "eval_samples_per_second": 1928.608,
      "eval_steps_per_second": 20.094,
      "step": 100000
    },
    {
      "epoch": 0.44,
      "learning_rate": 7.832228915662651e-06,
      "loss": 0.5309,
      "step": 110000
    },
    {
      "epoch": 0.44,
      "eval_loss": 0.5173077583312988,
      "eval_runtime": 127.5475,
      "eval_samples_per_second": 1928.693,
      "eval_steps_per_second": 20.094,
      "step": 110000
    },
    {
      "epoch": 0.48,
      "learning_rate": 7.63152610441767e-06,
      "loss": 0.5252,
      "step": 120000
    },
    {
      "epoch": 0.48,
      "eval_loss": 0.513536274433136,
      "eval_runtime": 130.7886,
      "eval_samples_per_second": 1880.897,
      "eval_steps_per_second": 19.597,
      "step": 120000
    },
    {
      "epoch": 0.52,
      "learning_rate": 7.430823293172691e-06,
      "loss": 0.5204,
      "step": 130000
    },
    {
      "epoch": 0.52,
      "eval_loss": 0.5111202001571655,
      "eval_runtime": 127.6816,
      "eval_samples_per_second": 1926.668,
      "eval_steps_per_second": 20.073,
      "step": 130000
    },
    {
      "epoch": 0.56,
      "learning_rate": 7.230100401606426e-06,
      "loss": 0.518,
      "step": 140000
    },
    {
      "epoch": 0.56,
      "eval_loss": 0.5084987282752991,
      "eval_runtime": 128.2814,
      "eval_samples_per_second": 1917.659,
      "eval_steps_per_second": 19.98,
      "step": 140000
    },
    {
      "epoch": 0.6,
      "learning_rate": 7.029397590361447e-06,
      "loss": 0.5135,
      "step": 150000
    },
    {
      "epoch": 0.6,
      "eval_loss": 0.5029130578041077,
      "eval_runtime": 130.8657,
      "eval_samples_per_second": 1879.79,
      "eval_steps_per_second": 19.585,
      "step": 150000
    },
    {
      "epoch": 0.64,
      "learning_rate": 6.828694779116466e-06,
      "loss": 0.5101,
      "step": 160000
    },
    {
      "epoch": 0.64,
      "eval_loss": 0.5005983710289001,
      "eval_runtime": 128.9427,
      "eval_samples_per_second": 1907.824,
      "eval_steps_per_second": 19.877,
      "step": 160000
    },
    {
      "epoch": 0.68,
      "learning_rate": 6.627991967871487e-06,
      "loss": 0.5065,
      "step": 170000
    },
    {
      "epoch": 0.68,
      "eval_loss": 0.4987814128398895,
      "eval_runtime": 127.8532,
      "eval_samples_per_second": 1924.081,
      "eval_steps_per_second": 20.046,
      "step": 170000
    },
    {
      "epoch": 0.72,
      "learning_rate": 6.427289156626506e-06,
      "loss": 0.5052,
      "step": 180000
    },
    {
      "epoch": 0.72,
      "eval_loss": 0.49448052048683167,
      "eval_runtime": 128.0108,
      "eval_samples_per_second": 1921.713,
      "eval_steps_per_second": 20.022,
      "step": 180000
    },
    {
      "epoch": 0.76,
      "learning_rate": 6.226586345381527e-06,
      "loss": 0.5025,
      "step": 190000
    },
    {
      "epoch": 0.76,
      "eval_loss": 0.49206921458244324,
      "eval_runtime": 127.1399,
      "eval_samples_per_second": 1934.877,
      "eval_steps_per_second": 20.159,
      "step": 190000
    },
    {
      "epoch": 0.8,
      "learning_rate": 6.025883534136546e-06,
      "loss": 0.4998,
      "step": 200000
    },
    {
      "epoch": 0.8,
      "eval_loss": 0.4929586946964264,
      "eval_runtime": 127.6742,
      "eval_samples_per_second": 1926.779,
      "eval_steps_per_second": 20.075,
      "step": 200000
    },
    {
      "epoch": 0.84,
      "learning_rate": 5.825180722891567e-06,
      "loss": 0.4982,
      "step": 210000
    },
    {
      "epoch": 0.84,
      "eval_loss": 0.48860839009284973,
      "eval_runtime": 127.3878,
      "eval_samples_per_second": 1931.112,
      "eval_steps_per_second": 20.12,
      "step": 210000
    },
    {
      "epoch": 0.88,
      "learning_rate": 5.6244578313253014e-06,
      "loss": 0.4969,
      "step": 220000
    },
    {
      "epoch": 0.88,
      "eval_loss": 0.4888823628425598,
      "eval_runtime": 127.2533,
      "eval_samples_per_second": 1933.152,
      "eval_steps_per_second": 20.141,
      "step": 220000
    },
    {
      "epoch": 0.92,
      "learning_rate": 5.423755020080321e-06,
      "loss": 0.495,
      "step": 230000
    },
    {
      "epoch": 0.92,
      "eval_loss": 0.4841912090778351,
      "eval_runtime": 126.968,
      "eval_samples_per_second": 1937.496,
      "eval_steps_per_second": 20.186,
      "step": 230000
    },
    {
      "epoch": 0.96,
      "learning_rate": 5.223052208835342e-06,
      "loss": 0.4927,
      "step": 240000
    },
    {
      "epoch": 0.96,
      "eval_loss": 0.4853549897670746,
      "eval_runtime": 127.4501,
      "eval_samples_per_second": 1930.167,
      "eval_steps_per_second": 20.11,
      "step": 240000
    },
    {
      "epoch": 1.0,
      "learning_rate": 5.022349397590361e-06,
      "loss": 0.4914,
      "step": 250000
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.4826248586177826,
      "eval_runtime": 127.3161,
      "eval_samples_per_second": 1932.198,
      "eval_steps_per_second": 20.131,
      "step": 250000
    },
    {
      "epoch": 1.04,
      "learning_rate": 4.821646586345382e-06,
      "loss": 0.4902,
      "step": 260000
    },
    {
      "epoch": 1.04,
      "eval_loss": 0.48145654797554016,
      "eval_runtime": 127.4143,
      "eval_samples_per_second": 1930.709,
      "eval_steps_per_second": 20.115,
      "step": 260000
    },
    {
      "epoch": 1.08,
      "learning_rate": 4.620943775100402e-06,
      "loss": 0.4894,
      "step": 270000
    },
    {
      "epoch": 1.08,
      "eval_loss": 0.47896286845207214,
      "eval_runtime": 127.419,
      "eval_samples_per_second": 1930.638,
      "eval_steps_per_second": 20.115,
      "step": 270000
    },
    {
      "epoch": 1.11,
      "learning_rate": 4.420240963855422e-06,
      "loss": 0.4881,
      "step": 280000
    },
    {
      "epoch": 1.11,
      "eval_loss": 0.48297473788261414,
      "eval_runtime": 127.4472,
      "eval_samples_per_second": 1930.211,
      "eval_steps_per_second": 20.11,
      "step": 280000
    },
    {
      "epoch": 1.15,
      "learning_rate": 4.219538152610443e-06,
      "loss": 0.487,
      "step": 290000
    },
    {
      "epoch": 1.15,
      "eval_loss": 0.47816893458366394,
      "eval_runtime": 127.1599,
      "eval_samples_per_second": 1934.573,
      "eval_steps_per_second": 20.156,
      "step": 290000
    },
    {
      "epoch": 1.19,
      "learning_rate": 4.018835341365462e-06,
      "loss": 0.4859,
      "step": 300000
    },
    {
      "epoch": 1.19,
      "eval_loss": 0.4779074192047119,
      "eval_runtime": 127.96,
      "eval_samples_per_second": 1922.476,
      "eval_steps_per_second": 20.03,
      "step": 300000
    },
    {
      "epoch": 1.23,
      "learning_rate": 3.818132530120483e-06,
      "loss": 0.4845,
      "step": 310000
    },
    {
      "epoch": 1.23,
      "eval_loss": 0.47683581709861755,
      "eval_runtime": 127.9081,
      "eval_samples_per_second": 1923.256,
      "eval_steps_per_second": 20.038,
      "step": 310000
    },
    {
      "epoch": 1.27,
      "learning_rate": 3.6174297188755025e-06,
      "loss": 0.4835,
      "step": 320000
    },
    {
      "epoch": 1.27,
      "eval_loss": 0.4755454957485199,
      "eval_runtime": 127.7316,
      "eval_samples_per_second": 1925.914,
      "eval_steps_per_second": 20.066,
      "step": 320000
    },
    {
      "epoch": 1.31,
      "learning_rate": 3.4167068273092375e-06,
      "loss": 0.483,
      "step": 330000
    },
    {
      "epoch": 1.31,
      "eval_loss": 0.4744352400302887,
      "eval_runtime": 128.197,
      "eval_samples_per_second": 1918.921,
      "eval_steps_per_second": 19.993,
      "step": 330000
    },
    {
      "epoch": 1.35,
      "learning_rate": 3.2160040160642576e-06,
      "loss": 0.4819,
      "step": 340000
    },
    {
      "epoch": 1.35,
      "eval_loss": 0.4745638966560364,
      "eval_runtime": 128.0584,
      "eval_samples_per_second": 1920.999,
      "eval_steps_per_second": 20.014,
      "step": 340000
    },
    {
      "epoch": 1.39,
      "learning_rate": 3.0153012048192777e-06,
      "loss": 0.481,
      "step": 350000
    },
    {
      "epoch": 1.39,
      "eval_loss": 0.4744107723236084,
      "eval_runtime": 128.1739,
      "eval_samples_per_second": 1919.268,
      "eval_steps_per_second": 19.996,
      "step": 350000
    },
    {
      "epoch": 1.43,
      "learning_rate": 2.8145983935742978e-06,
      "loss": 0.481,
      "step": 360000
    },
    {
      "epoch": 1.43,
      "eval_loss": 0.472385436296463,
      "eval_runtime": 128.2659,
      "eval_samples_per_second": 1917.89,
      "eval_steps_per_second": 19.982,
      "step": 360000
    },
    {
      "epoch": 1.47,
      "learning_rate": 2.613895582329318e-06,
      "loss": 0.4799,
      "step": 370000
    },
    {
      "epoch": 1.47,
      "eval_loss": 0.4733026921749115,
      "eval_runtime": 127.698,
      "eval_samples_per_second": 1926.419,
      "eval_steps_per_second": 20.071,
      "step": 370000
    },
    {
      "epoch": 1.51,
      "learning_rate": 2.4131927710843376e-06,
      "loss": 0.4795,
      "step": 380000
    },
    {
      "epoch": 1.51,
      "eval_loss": 0.4719351530075073,
      "eval_runtime": 128.2337,
      "eval_samples_per_second": 1918.372,
      "eval_steps_per_second": 19.987,
      "step": 380000
    },
    {
      "epoch": 1.55,
      "learning_rate": 2.2124899598393577e-06,
      "loss": 0.4784,
      "step": 390000
    },
    {
      "epoch": 1.55,
      "eval_loss": 0.4699419438838959,
      "eval_runtime": 127.9847,
      "eval_samples_per_second": 1922.105,
      "eval_steps_per_second": 20.026,
      "step": 390000
    },
    {
      "epoch": 1.59,
      "learning_rate": 2.0117871485943778e-06,
      "loss": 0.4785,
      "step": 400000
    },
    {
      "epoch": 1.59,
      "eval_loss": 0.4711839556694031,
      "eval_runtime": 127.89,
      "eval_samples_per_second": 1923.527,
      "eval_steps_per_second": 20.041,
      "step": 400000
    },
    {
      "epoch": 1.63,
      "learning_rate": 1.8110843373493979e-06,
      "loss": 0.4777,
      "step": 410000
    },
    {
      "epoch": 1.63,
      "eval_loss": 0.46987083554267883,
      "eval_runtime": 128.85,
      "eval_samples_per_second": 1909.197,
      "eval_steps_per_second": 19.891,
      "step": 410000
    },
    {
      "epoch": 1.67,
      "learning_rate": 1.6103614457831327e-06,
      "loss": 0.477,
      "step": 420000
    },
    {
      "epoch": 1.67,
      "eval_loss": 0.46960577368736267,
      "eval_runtime": 130.3922,
      "eval_samples_per_second": 1886.616,
      "eval_steps_per_second": 19.656,
      "step": 420000
    },
    {
      "epoch": 1.71,
      "learning_rate": 1.4096586345381528e-06,
      "loss": 0.4771,
      "step": 430000
    },
    {
      "epoch": 1.71,
      "eval_loss": 0.47003933787345886,
      "eval_runtime": 129.4605,
      "eval_samples_per_second": 1900.193,
      "eval_steps_per_second": 19.798,
      "step": 430000
    },
    {
      "epoch": 1.75,
      "learning_rate": 1.2089558232931729e-06,
      "loss": 0.4766,
      "step": 440000
    },
    {
      "epoch": 1.75,
      "eval_loss": 0.47017282247543335,
      "eval_runtime": 129.4902,
      "eval_samples_per_second": 1899.758,
      "eval_steps_per_second": 19.793,
      "step": 440000
    },
    {
      "epoch": 1.79,
      "learning_rate": 1.008253012048193e-06,
      "loss": 0.476,
      "step": 450000
    },
    {
      "epoch": 1.79,
      "eval_loss": 0.46954795718193054,
      "eval_runtime": 129.5407,
      "eval_samples_per_second": 1899.017,
      "eval_steps_per_second": 19.785,
      "step": 450000
    },
    {
      "epoch": 1.83,
      "learning_rate": 8.07550200803213e-07,
      "loss": 0.4757,
      "step": 460000
    },
    {
      "epoch": 1.83,
      "eval_loss": 0.4694086015224457,
      "eval_runtime": 129.2469,
      "eval_samples_per_second": 1903.334,
      "eval_steps_per_second": 19.83,
      "step": 460000
    },
    {
      "epoch": 1.87,
      "learning_rate": 6.068273092369479e-07,
      "loss": 0.4758,
      "step": 470000
    },
    {
      "epoch": 1.87,
      "eval_loss": 0.4685874581336975,
      "eval_runtime": 129.0023,
      "eval_samples_per_second": 1906.943,
      "eval_steps_per_second": 19.868,
      "step": 470000
    },
    {
      "epoch": 1.91,
      "learning_rate": 4.061244979919679e-07,
      "loss": 0.4754,
      "step": 480000
    },
    {
      "epoch": 1.91,
      "eval_loss": 0.46817249059677124,
      "eval_runtime": 130.8406,
      "eval_samples_per_second": 1880.15,
      "eval_steps_per_second": 19.589,
      "step": 480000
    },
    {
      "epoch": 1.95,
      "learning_rate": 2.0542168674698798e-07,
      "loss": 0.475,
      "step": 490000
    },
    {
      "epoch": 1.95,
      "eval_loss": 0.4691283404827118,
      "eval_runtime": 129.0096,
      "eval_samples_per_second": 1906.836,
      "eval_steps_per_second": 19.867,
      "step": 490000
    },
    {
      "epoch": 1.99,
      "learning_rate": 4.718875502008032e-09,
      "loss": 0.4756,
      "step": 500000
    },
    {
      "epoch": 1.99,
      "eval_loss": 0.46795061230659485,
      "eval_runtime": 129.0896,
      "eval_samples_per_second": 1905.653,
      "eval_steps_per_second": 19.854,
      "step": 500000
    }
  ],
  "logging_steps": 10000,
  "max_steps": 500000,
  "num_train_epochs": 2,
  "save_steps": 10000,
  "total_flos": 8.290835482935528e+17,
  "trial_name": null,
  "trial_params": null
}