{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.015384615384615, "eval_steps": 500000000, "global_step": 65, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "crossentropy": 3.136348009109497, "epoch": 0.015384615384615385, "grad_norm": 2.609375, "learning_rate": 0.0006000000000000001, "loss": 62.727, "step": 1 }, { "crossentropy": 3.124629259109497, "epoch": 0.03076923076923077, "grad_norm": 2.640625, "learning_rate": 0.0012000000000000001, "loss": 62.4926, "step": 2 }, { "crossentropy": 3.071411609649658, "epoch": 0.046153846153846156, "grad_norm": 1.671875, "learning_rate": 0.0018, "loss": 61.4282, "step": 3 }, { "crossentropy": 2.8995280265808105, "epoch": 0.06153846153846154, "grad_norm": 2.203125, "learning_rate": 0.0024000000000000002, "loss": 57.9906, "step": 4 }, { "crossentropy": 2.881728172302246, "epoch": 0.07692307692307693, "grad_norm": 2.9375, "learning_rate": 0.003, "loss": 57.6346, "step": 5 }, { "crossentropy": 2.9400224685668945, "epoch": 0.09230769230769231, "grad_norm": 2.265625, "learning_rate": 0.0036, "loss": 58.8004, "step": 6 }, { "crossentropy": 3.066098690032959, "epoch": 0.1076923076923077, "grad_norm": 2.375, "learning_rate": 0.0042, "loss": 61.322, "step": 7 }, { "crossentropy": 3.1193857192993164, "epoch": 0.12307692307692308, "grad_norm": 2.046875, "learning_rate": 0.0048000000000000004, "loss": 62.3877, "step": 8 }, { "crossentropy": 3.0263144969940186, "epoch": 1.0153846153846153, "grad_norm": 1.828125, "learning_rate": 0.0054, "loss": 60.5263, "step": 9 }, { "crossentropy": 2.8756701946258545, "epoch": 1.0307692307692307, "grad_norm": 2.15625, "learning_rate": 0.006, "loss": 57.5134, "step": 10 }, { "crossentropy": 2.9259140491485596, "epoch": 1.0461538461538462, "grad_norm": 2.40625, "learning_rate": 0.005995107311778406, "loss": 58.5183, "step": 11 }, { "crossentropy": 2.937499761581421, "epoch": 1.0615384615384615, "grad_norm": 2.828125, "learning_rate": 0.00598044520604565, "loss": 58.75, "step": 12 }, { "crossentropy": 2.806440830230713, "epoch": 1.0769230769230769, "grad_norm": 2.109375, "learning_rate": 0.005956061507543079, "loss": 56.1288, "step": 13 }, { "crossentropy": 2.9169092178344727, "epoch": 1.0923076923076924, "grad_norm": 2.1875, "learning_rate": 0.005922035750827001, "loss": 58.3382, "step": 14 }, { "crossentropy": 2.781325101852417, "epoch": 1.1076923076923078, "grad_norm": 2.609375, "grad_norm_var": 0.12228902180989583, "learning_rate": 0.005878478920843493, "loss": 55.6265, "step": 15 }, { "crossentropy": 2.775790214538574, "epoch": 1.123076923076923, "grad_norm": 1.9140625, "grad_norm_var": 0.12452977498372396, "learning_rate": 0.005825533090918575, "loss": 55.5158, "step": 16 }, { "crossentropy": 2.780381202697754, "epoch": 2.0153846153846153, "grad_norm": 2.40625, "grad_norm_var": 0.1012115478515625, "learning_rate": 0.00576337095934455, "loss": 55.6076, "step": 17 }, { "crossentropy": 2.7276179790496826, "epoch": 2.0307692307692307, "grad_norm": 2.109375, "grad_norm_var": 0.1025299072265625, "learning_rate": 0.005692195286074075, "loss": 54.5524, "step": 18 }, { "crossentropy": 2.514774799346924, "epoch": 2.046153846153846, "grad_norm": 2.0625, "grad_norm_var": 0.07119852701822917, "learning_rate": 0.005612238231359332, "loss": 50.2955, "step": 19 }, { "crossentropy": 2.735562801361084, "epoch": 2.0615384615384613, "grad_norm": 2.375, "grad_norm_var": 0.07284342447916667, "learning_rate": 0.005523760598493544, "loss": 54.7113, "step": 20 }, { "crossentropy": 2.6968579292297363, "epoch": 2.076923076923077, "grad_norm": 1.9453125, "grad_norm_var": 0.0749834696451823, "learning_rate": 0.005427050983124842, "loss": 53.9372, "step": 21 }, { "crossentropy": 2.7723076343536377, "epoch": 2.0923076923076924, "grad_norm": 1.8046875, "grad_norm_var": 0.08308003743489584, "learning_rate": 0.005322424831917248, "loss": 55.4462, "step": 22 }, { "crossentropy": 2.6105570793151855, "epoch": 2.1076923076923078, "grad_norm": 1.53125, "grad_norm_var": 0.10808919270833334, "learning_rate": 0.005210223413629215, "loss": 52.2111, "step": 23 }, { "crossentropy": 2.627753257751465, "epoch": 2.123076923076923, "grad_norm": 1.8046875, "grad_norm_var": 0.10908788045247396, "learning_rate": 0.005090812705965881, "loss": 52.5551, "step": 24 }, { "crossentropy": 2.449847459793091, "epoch": 3.0153846153846153, "grad_norm": 2.140625, "grad_norm_var": 0.10986226399739583, "learning_rate": 0.004964582201835856, "loss": 48.9969, "step": 25 }, { "crossentropy": 2.566053867340088, "epoch": 3.0307692307692307, "grad_norm": 1.9453125, "grad_norm_var": 0.07247492472330729, "learning_rate": 0.004831943638906315, "loss": 51.3211, "step": 26 }, { "crossentropy": 2.5103166103363037, "epoch": 3.046153846153846, "grad_norm": 2.40625, "grad_norm_var": 0.08066991170247396, "learning_rate": 0.004693329656600308, "loss": 50.2063, "step": 27 }, { "crossentropy": 2.586671829223633, "epoch": 3.0615384615384613, "grad_norm": 1.8671875, "grad_norm_var": 0.08163960774739583, "learning_rate": 0.004549192384916886, "loss": 51.7334, "step": 28 }, { "crossentropy": 2.383892297744751, "epoch": 3.076923076923077, "grad_norm": 1.7890625, "grad_norm_var": 0.06142552693684896, "learning_rate": 0.004400001969677022, "loss": 47.6778, "step": 29 }, { "crossentropy": 2.6217358112335205, "epoch": 3.0923076923076924, "grad_norm": 1.890625, "grad_norm_var": 0.06169331868489583, "learning_rate": 0.00424624503900566, "loss": 52.4347, "step": 30 }, { "crossentropy": 2.494020700454712, "epoch": 3.1076923076923078, "grad_norm": 1.75, "grad_norm_var": 0.06497777303059896, "learning_rate": 0.004088423116051923, "loss": 49.8804, "step": 31 }, { "crossentropy": 2.6089510917663574, "epoch": 3.123076923076923, "grad_norm": 1.859375, "grad_norm_var": 0.052374013264973956, "learning_rate": 0.0039270509831248425, "loss": 52.179, "step": 32 }, { "crossentropy": 2.3228707313537598, "epoch": 4.015384615384615, "grad_norm": 2.015625, "grad_norm_var": 0.050022125244140625, "learning_rate": 0.0037626550025804616, "loss": 46.4574, "step": 33 }, { "crossentropy": 2.5025362968444824, "epoch": 4.030769230769231, "grad_norm": 1.9765625, "grad_norm_var": 0.0360015869140625, "learning_rate": 0.0035957713999372363, "loss": 50.0507, "step": 34 }, { "crossentropy": 2.3985209465026855, "epoch": 4.046153846153846, "grad_norm": 2.109375, "grad_norm_var": 0.03868789672851562, "learning_rate": 0.003426944514819856, "loss": 47.9704, "step": 35 }, { "crossentropy": 2.4309520721435547, "epoch": 4.061538461538461, "grad_norm": 1.9140625, "grad_norm_var": 0.037904612223307294, "learning_rate": 0.0032567250254365195, "loss": 48.619, "step": 36 }, { "crossentropy": 2.406186103820801, "epoch": 4.076923076923077, "grad_norm": 1.6875, "grad_norm_var": 0.031404368082682294, "learning_rate": 0.0030856681523810884, "loss": 48.1237, "step": 37 }, { "crossentropy": 2.3365566730499268, "epoch": 4.092307692307692, "grad_norm": 1.75, "grad_norm_var": 0.03247782389322917, "learning_rate": 0.0029143318476189117, "loss": 46.7311, "step": 38 }, { "crossentropy": 2.3258492946624756, "epoch": 4.107692307692307, "grad_norm": 1.6484375, "grad_norm_var": 0.0364654541015625, "learning_rate": 0.0027432749745634815, "loss": 46.517, "step": 39 }, { "crossentropy": 2.4440741539001465, "epoch": 4.123076923076923, "grad_norm": 1.8203125, "grad_norm_var": 0.03316014607747396, "learning_rate": 0.002573055485180145, "loss": 48.8815, "step": 40 }, { "crossentropy": 2.299198865890503, "epoch": 5.015384615384615, "grad_norm": 1.8515625, "grad_norm_var": 0.013963826497395833, "learning_rate": 0.0024042286000627642, "loss": 45.984, "step": 41 }, { "crossentropy": 2.284895658493042, "epoch": 5.030769230769231, "grad_norm": 1.71875, "grad_norm_var": 0.015012359619140625, "learning_rate": 0.00223734499741954, "loss": 45.6979, "step": 42 }, { "crossentropy": 2.3056931495666504, "epoch": 5.046153846153846, "grad_norm": 1.7109375, "grad_norm_var": 0.01593805948893229, "learning_rate": 0.002072949016875158, "loss": 46.1139, "step": 43 }, { "crossentropy": 2.3087401390075684, "epoch": 5.061538461538461, "grad_norm": 1.765625, "grad_norm_var": 0.01601130167643229, "learning_rate": 0.0019115768839480774, "loss": 46.1748, "step": 44 }, { "crossentropy": 2.3146181106567383, "epoch": 5.076923076923077, "grad_norm": 1.7890625, "grad_norm_var": 0.015697224934895834, "learning_rate": 0.0017537549609943411, "loss": 46.2924, "step": 45 }, { "crossentropy": 2.21163010597229, "epoch": 5.092307692307692, "grad_norm": 1.5078125, "grad_norm_var": 0.02209447224934896, "learning_rate": 0.0015999980303229788, "loss": 44.2326, "step": 46 }, { "crossentropy": 2.4418914318084717, "epoch": 5.107692307692307, "grad_norm": 1.609375, "grad_norm_var": 0.02432428995768229, "learning_rate": 0.0014508076150831146, "loss": 48.8378, "step": 47 }, { "crossentropy": 2.319216251373291, "epoch": 5.123076923076923, "grad_norm": 1.6328125, "grad_norm_var": 0.0221435546875, "learning_rate": 0.0013066703433996932, "loss": 46.3843, "step": 48 }, { "crossentropy": 2.3361504077911377, "epoch": 6.015384615384615, "grad_norm": 1.4765625, "grad_norm_var": 0.014581044514973959, "learning_rate": 0.0011680563610936853, "loss": 46.723, "step": 49 }, { "crossentropy": 2.2992982864379883, "epoch": 6.030769230769231, "grad_norm": 1.5625, "grad_norm_var": 0.012669881184895834, "learning_rate": 0.001035417798164145, "loss": 45.986, "step": 50 }, { "crossentropy": 2.2670037746429443, "epoch": 6.046153846153846, "grad_norm": 1.5, "grad_norm_var": 0.014842732747395834, "learning_rate": 0.0009091872940341201, "loss": 45.3401, "step": 51 }, { "crossentropy": 2.3044376373291016, "epoch": 6.061538461538461, "grad_norm": 1.5625, "grad_norm_var": 0.015160115559895833, "learning_rate": 0.0007897765863707849, "loss": 46.0888, "step": 52 }, { "crossentropy": 2.183406352996826, "epoch": 6.076923076923077, "grad_norm": 1.484375, "grad_norm_var": 0.017162831624348958, "learning_rate": 0.0006775751680827525, "loss": 43.6681, "step": 53 }, { "crossentropy": 2.323101758956909, "epoch": 6.092307692307692, "grad_norm": 1.515625, "grad_norm_var": 0.016161092122395835, "learning_rate": 0.000572949016875158, "loss": 46.462, "step": 54 }, { "crossentropy": 2.234001398086548, "epoch": 6.107692307692307, "grad_norm": 1.5703125, "grad_norm_var": 0.013849894205729166, "learning_rate": 0.00047623940150645606, "loss": 44.68, "step": 55 }, { "crossentropy": 2.326664447784424, "epoch": 6.123076923076923, "grad_norm": 1.5625, "grad_norm_var": 0.010076649983723958, "learning_rate": 0.00038776176864066894, "loss": 46.5333, "step": 56 }, { "crossentropy": 2.2117629051208496, "epoch": 7.015384615384615, "grad_norm": 1.359375, "grad_norm_var": 0.011201985677083333, "learning_rate": 0.00030780471392592533, "loss": 44.2353, "step": 57 }, { "crossentropy": 2.2985191345214844, "epoch": 7.030769230769231, "grad_norm": 1.421875, "grad_norm_var": 0.0095458984375, "learning_rate": 0.00023662904065544998, "loss": 45.9704, "step": 58 }, { "crossentropy": 2.245023012161255, "epoch": 7.046153846153846, "grad_norm": 1.390625, "grad_norm_var": 0.006601715087890625, "learning_rate": 0.00017446690908142615, "loss": 44.9005, "step": 59 }, { "crossentropy": 2.174379825592041, "epoch": 7.061538461538461, "grad_norm": 1.390625, "grad_norm_var": 0.0076812744140625, "learning_rate": 0.00012152107915650823, "loss": 43.4876, "step": 60 }, { "crossentropy": 2.481731414794922, "epoch": 7.076923076923077, "grad_norm": 1.5078125, "grad_norm_var": 0.007043202718098958, "learning_rate": 7.79642491729996e-05, "loss": 49.6346, "step": 61 }, { "crossentropy": 2.240917444229126, "epoch": 7.092307692307692, "grad_norm": 1.375, "grad_norm_var": 0.006917317708333333, "learning_rate": 4.393849245692105e-05, "loss": 44.8183, "step": 62 }, { "crossentropy": 2.343332052230835, "epoch": 7.107692307692307, "grad_norm": 1.453125, "grad_norm_var": 0.005566151936848959, "learning_rate": 1.9554793954349337e-05, "loss": 46.8666, "step": 63 }, { "crossentropy": 2.241647958755493, "epoch": 7.123076923076923, "grad_norm": 1.40625, "grad_norm_var": 0.0059163411458333336, "learning_rate": 4.892688221593055e-06, "loss": 44.833, "step": 64 }, { "crossentropy": 2.247157335281372, "epoch": 8.015384615384615, "grad_norm": 1.3515625, "grad_norm_var": 0.006371815999348958, "learning_rate": 0.0, "loss": 44.9431, "step": 65 } ], "logging_steps": 1, "max_steps": 65, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.359728188850176e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }