|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5037783375314862, |
|
"eval_steps": 500, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00559753708368318, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.0001, |
|
"loss": 4.5193, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01119507416736636, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001, |
|
"loss": 2.8387, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.016792611251049538, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0001, |
|
"loss": 2.1966, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02239014833473272, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.0024, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.027987685418415897, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7735, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.033585222502099076, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6781, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.039182759585782254, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6619, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04478029666946544, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6361, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05037783375314862, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6153, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.055975370836831795, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5201, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06157290792051497, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5211, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06717044500419815, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5359, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07276798208788134, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5686, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07836551917156451, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5732, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08396305625524769, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5958, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08956059333893088, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5006, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09515813042261405, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5051, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.10075566750629723, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5649, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1063532045899804, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11195074167366359, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4951, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11754827875734676, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5258, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.12314581584102995, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.556, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12874335292471312, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5306, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1343408900083963, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4242, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1399384270920795, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4403, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.14553596417576267, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4604, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.15113350125944586, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5809, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.15673103834312901, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4282, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1623285754268122, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5452, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.16792611251049538, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3127, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17352364959417857, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5287, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.17912118667786175, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5197, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1847187237615449, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5512, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.1903162608452281, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4973, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.19591379792891128, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.503, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.20151133501259447, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4571, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.20710887209627762, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5066, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2127064091799608, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.42, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.218303946263644, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4306, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.22390148334732718, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3198, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22949902043101036, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4567, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.23509655751469352, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5331, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2406940945983767, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4561, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2462916316820599, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5067, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2518891687657431, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5058, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.25748670584942623, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5166, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.26308424293310945, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5097, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2686817800167926, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5102, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2742793171004758, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4113, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.279876854184159, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3356, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.28547439126784213, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4804, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.29107192835152534, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5235, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2966694654352085, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5687, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3022670025188917, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4548, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.30786453960257487, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5667, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.31346207668625803, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4787, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.31905961376994124, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4374, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3246571508536244, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5262, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3302546879373076, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4109, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.33585222502099077, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.307, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3414497621046739, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4678, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.34704729918835714, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5243, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3526448362720403, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4596, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.3582423733557235, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4231, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.36383991043940667, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4536, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.3694374475230898, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4464, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.37503498460677304, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4785, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.3806325216904562, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4717, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.38623005877413935, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3935, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.39182759585782256, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3039, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3974251329415057, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3593, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.40302267002518893, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5417, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4086202071088721, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4483, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.41421774419255525, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5291, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.41981528127623846, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4624, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4254128183599216, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3895, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.43101035544360483, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3898, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.436607892527288, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4392, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.44220542961097115, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3673, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.44780296669465436, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2251, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4534005037783375, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4745, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.45899804086202073, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4641, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4645955779457039, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.447, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.47019311502938704, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4333, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.47579065211307026, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.4172, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4813881891967534, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5103, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4869857262804366, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5454, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4925832633641198, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3954, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.49818080044780294, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3549, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5037783375314862, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3045, |
|
"step": 450 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 90, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.334613097500672e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|