|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5037783375314862, |
|
"eval_steps": 500, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00559753708368318, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0001, |
|
"loss": 2.647, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01119507416736636, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.9886, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.016792611251049538, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7588, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02239014833473272, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7681, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.027987685418415897, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.746, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.033585222502099076, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6599, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.039182759585782254, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6387, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04478029666946544, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6317, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05037783375314862, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6049, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.055975370836831795, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5222, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06157290792051497, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5276, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06717044500419815, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.532, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07276798208788134, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5667, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07836551917156451, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5785, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08396305625524769, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5983, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08956059333893088, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5038, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09515813042261405, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5087, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.10075566750629723, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.563, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1063532045899804, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5014, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11195074167366359, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4986, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11754827875734676, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5194, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.12314581584102995, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5515, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12874335292471312, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5236, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1343408900083963, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4187, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1399384270920795, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4352, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.14553596417576267, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4618, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.15113350125944586, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5728, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.15673103834312901, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.422, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1623285754268122, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5431, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.16792611251049538, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3112, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17352364959417857, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5344, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.17912118667786175, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5224, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1847187237615449, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5421, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.1903162608452281, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4925, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.19591379792891128, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5017, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.20151133501259447, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4552, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20710887209627762, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5017, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2127064091799608, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4128, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.218303946263644, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4197, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.22390148334732718, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3101, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22949902043101036, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4507, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.23509655751469352, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5322, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2406940945983767, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4553, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2462916316820599, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5025, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2518891687657431, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5067, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.25748670584942623, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5141, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.26308424293310945, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5031, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2686817800167926, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5061, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2742793171004758, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4099, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.279876854184159, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3343, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.28547439126784213, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4818, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.29107192835152534, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5145, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2966694654352085, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5582, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3022670025188917, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4479, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.30786453960257487, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5605, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.31346207668625803, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4777, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.31905961376994124, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4301, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3246571508536244, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5234, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3302546879373076, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4054, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.33585222502099077, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3038, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3414497621046739, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4683, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.34704729918835714, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.519, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3526448362720403, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4532, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.3582423733557235, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4129, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.36383991043940667, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4434, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.3694374475230898, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4407, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.37503498460677304, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4702, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.3806325216904562, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4599, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.38623005877413935, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3855, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.39182759585782256, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3064, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3974251329415057, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3561, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.40302267002518893, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5276, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4086202071088721, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4405, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.41421774419255525, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5209, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.41981528127623846, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.459, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4254128183599216, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3911, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.43101035544360483, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3857, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.436607892527288, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4294, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.44220542961097115, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3572, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.44780296669465436, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2168, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4534005037783375, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4701, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.45899804086202073, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4556, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4645955779457039, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4448, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.47019311502938704, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4272, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.47579065211307026, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4135, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4813881891967534, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5029, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4869857262804366, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5365, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4925832633641198, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3895, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.49818080044780294, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3481, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5037783375314862, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2994, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5037783375314862, |
|
"step": 450, |
|
"total_flos": 4.334613097500672e+17, |
|
"train_loss": 1.499400347603692, |
|
"train_runtime": 17883.1844, |
|
"train_samples_per_second": 1.61, |
|
"train_steps_per_second": 0.025 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 90, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.334613097500672e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|