{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.04, "eval_steps": 500000000, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "crossentropy": 2.9291462898254395, "epoch": 0.02, "grad_norm": 2.53125, "learning_rate": 0.001, "loss": 58.5829, "step": 1 }, { "crossentropy": 2.953669786453247, "epoch": 0.04, "grad_norm": 2.703125, "learning_rate": 0.002, "loss": 59.0734, "step": 2 }, { "crossentropy": 2.6353204250335693, "epoch": 0.06, "grad_norm": 1.5703125, "learning_rate": 0.003, "loss": 52.7064, "step": 3 }, { "crossentropy": 2.8646180629730225, "epoch": 0.08, "grad_norm": 3.796875, "learning_rate": 0.004, "loss": 57.2924, "step": 4 }, { "crossentropy": 2.8073368072509766, "epoch": 0.1, "grad_norm": 2.09375, "learning_rate": 0.005, "loss": 56.1467, "step": 5 }, { "crossentropy": 2.7765023708343506, "epoch": 0.12, "grad_norm": 1.953125, "learning_rate": 0.006, "loss": 55.53, "step": 6 }, { "crossentropy": 2.821621894836426, "epoch": 0.14, "grad_norm": 2.09375, "learning_rate": 0.006999999999999999, "loss": 56.4324, "step": 7 }, { "crossentropy": 2.7881150245666504, "epoch": 0.16, "grad_norm": 2.984375, "learning_rate": 0.008, "loss": 55.7623, "step": 8 }, { "crossentropy": 2.7072463035583496, "epoch": 1.02, "grad_norm": 2.359375, "learning_rate": 0.009000000000000001, "loss": 54.1449, "step": 9 }, { "crossentropy": 2.682814598083496, "epoch": 1.04, "grad_norm": 2.546875, "learning_rate": 0.01, "loss": 53.6563, "step": 10 }, { "crossentropy": 2.7697486877441406, "epoch": 1.06, "grad_norm": 3.015625, "learning_rate": 0.00998458666866564, "loss": 55.395, "step": 11 }, { "crossentropy": 2.5579347610473633, "epoch": 1.08, "grad_norm": 3.0, "learning_rate": 0.009938441702975689, "loss": 51.1587, "step": 12 }, { "crossentropy": 2.638291835784912, "epoch": 1.1, "grad_norm": 2.828125, "learning_rate": 0.009861849601988383, "loss": 52.7658, "step": 13 }, { "crossentropy": 2.4984238147735596, "epoch": 1.12, "grad_norm": 2.578125, "learning_rate": 0.009755282581475769, "loss": 49.9685, "step": 14 }, { "crossentropy": 2.607607841491699, "epoch": 1.1400000000000001, "grad_norm": 2.375, "grad_norm_var": 0.281939442952474, "learning_rate": 0.009619397662556433, "loss": 52.1522, "step": 15 }, { "crossentropy": 2.4619388580322266, "epoch": 1.16, "grad_norm": 2.359375, "grad_norm_var": 0.28509496053059896, "learning_rate": 0.00945503262094184, "loss": 49.2388, "step": 16 }, { "crossentropy": 2.4099559783935547, "epoch": 2.02, "grad_norm": 2.28125, "grad_norm_var": 0.2248687744140625, "learning_rate": 0.009263200821770462, "loss": 48.1991, "step": 17 }, { "crossentropy": 2.372147798538208, "epoch": 2.04, "grad_norm": 2.375, "grad_norm_var": 0.12443033854166667, "learning_rate": 0.009045084971874737, "loss": 47.443, "step": 18 }, { "crossentropy": 2.2578866481781006, "epoch": 2.06, "grad_norm": 3.078125, "grad_norm_var": 0.13013407389322917, "learning_rate": 0.008802029828000156, "loss": 45.1577, "step": 19 }, { "crossentropy": 2.375894069671631, "epoch": 2.08, "grad_norm": 2.75, "grad_norm_var": 0.10393473307291666, "learning_rate": 0.008535533905932738, "loss": 47.5179, "step": 20 }, { "crossentropy": 2.3451476097106934, "epoch": 2.1, "grad_norm": 2.421875, "grad_norm_var": 0.08750712076822917, "learning_rate": 0.008247240241650917, "loss": 46.903, "step": 21 }, { "crossentropy": 2.360849142074585, "epoch": 2.12, "grad_norm": 2.40625, "grad_norm_var": 0.08212483723958333, "learning_rate": 0.007938926261462366, "loss": 47.217, "step": 22 }, { "crossentropy": 2.291487455368042, "epoch": 2.14, "grad_norm": 2.03125, "grad_norm_var": 0.09099833170572917, "learning_rate": 0.0076124928235797445, "loss": 45.8298, "step": 23 }, { "crossentropy": 2.2348930835723877, "epoch": 2.16, "grad_norm": 2.390625, "grad_norm_var": 0.0902740478515625, "learning_rate": 0.007269952498697734, "loss": 44.6979, "step": 24 }, { "crossentropy": 2.0260097980499268, "epoch": 3.02, "grad_norm": 2.15625, "grad_norm_var": 0.08352457682291667, "learning_rate": 0.00691341716182545, "loss": 40.5202, "step": 25 }, { "crossentropy": 2.2256264686584473, "epoch": 3.04, "grad_norm": 2.390625, "grad_norm_var": 0.06499735514322917, "learning_rate": 0.006545084971874737, "loss": 44.5125, "step": 26 }, { "crossentropy": 2.1248207092285156, "epoch": 3.06, "grad_norm": 2.71875, "grad_norm_var": 0.060205078125, "learning_rate": 0.0061672268192795275, "loss": 42.4964, "step": 27 }, { "crossentropy": 1.990981936454773, "epoch": 3.08, "grad_norm": 2.15625, "grad_norm_var": 0.0636383056640625, "learning_rate": 0.0057821723252011546, "loss": 39.8196, "step": 28 }, { "crossentropy": 2.1135897636413574, "epoch": 3.1, "grad_norm": 2.125, "grad_norm_var": 0.06887919108072917, "learning_rate": 0.0053922954786392256, "loss": 42.2718, "step": 29 }, { "crossentropy": 1.9957607984542847, "epoch": 3.12, "grad_norm": 2.140625, "grad_norm_var": 0.0730377197265625, "learning_rate": 0.005, "loss": 39.9152, "step": 30 }, { "crossentropy": 2.0707690715789795, "epoch": 3.14, "grad_norm": 2.03125, "grad_norm_var": 0.08092041015625, "learning_rate": 0.004607704521360776, "loss": 41.4154, "step": 31 }, { "crossentropy": 2.187530040740967, "epoch": 3.16, "grad_norm": 1.9765625, "grad_norm_var": 0.09013442993164063, "learning_rate": 0.004217827674798845, "loss": 43.7506, "step": 32 }, { "crossentropy": 2.0033135414123535, "epoch": 4.02, "grad_norm": 2.265625, "grad_norm_var": 0.057795206705729164, "learning_rate": 0.003832773180720475, "loss": 40.0663, "step": 33 }, { "crossentropy": 1.8232710361480713, "epoch": 4.04, "grad_norm": 2.140625, "grad_norm_var": 0.04204508463541667, "learning_rate": 0.003454915028125263, "loss": 36.4654, "step": 34 }, { "crossentropy": 1.9771795272827148, "epoch": 4.06, "grad_norm": 2.203125, "grad_norm_var": 0.039510091145833336, "learning_rate": 0.0030865828381745515, "loss": 39.5436, "step": 35 }, { "crossentropy": 1.8496342897415161, "epoch": 4.08, "grad_norm": 1.9765625, "grad_norm_var": 0.04030736287434896, "learning_rate": 0.0027300475013022664, "loss": 36.9927, "step": 36 }, { "crossentropy": 1.87111234664917, "epoch": 4.1, "grad_norm": 2.0625, "grad_norm_var": 0.03969904581705729, "learning_rate": 0.002387507176420256, "loss": 37.4222, "step": 37 }, { "crossentropy": 1.9682161808013916, "epoch": 4.12, "grad_norm": 2.046875, "grad_norm_var": 0.038065338134765626, "learning_rate": 0.0020610737385376348, "loss": 39.3643, "step": 38 }, { "crossentropy": 1.920502781867981, "epoch": 4.14, "grad_norm": 1.8984375, "grad_norm_var": 0.0388824462890625, "learning_rate": 0.0017527597583490823, "loss": 38.4101, "step": 39 }, { "crossentropy": 1.86055588722229, "epoch": 4.16, "grad_norm": 1.90625, "grad_norm_var": 0.0423004150390625, "learning_rate": 0.0014644660940672626, "loss": 37.2111, "step": 40 }, { "crossentropy": 1.9031261205673218, "epoch": 5.02, "grad_norm": 1.8671875, "grad_norm_var": 0.014452870686848958, "learning_rate": 0.0011979701719998454, "loss": 38.0625, "step": 41 }, { "crossentropy": 1.7550657987594604, "epoch": 5.04, "grad_norm": 1.671875, "grad_norm_var": 0.021768951416015626, "learning_rate": 0.0009549150281252633, "loss": 35.1013, "step": 42 }, { "crossentropy": 1.8313697576522827, "epoch": 5.06, "grad_norm": 1.765625, "grad_norm_var": 0.024436187744140626, "learning_rate": 0.0007367991782295391, "loss": 36.6274, "step": 43 }, { "crossentropy": 1.9601967334747314, "epoch": 5.08, "grad_norm": 1.875, "grad_norm_var": 0.023502349853515625, "learning_rate": 0.0005449673790581611, "loss": 39.2039, "step": 44 }, { "crossentropy": 1.7513489723205566, "epoch": 5.1, "grad_norm": 1.765625, "grad_norm_var": 0.02585423787434896, "learning_rate": 0.0003806023374435663, "loss": 35.027, "step": 45 }, { "crossentropy": 1.912644624710083, "epoch": 5.12, "grad_norm": 1.84375, "grad_norm_var": 0.0266021728515625, "learning_rate": 0.00024471741852423234, "loss": 38.2529, "step": 46 }, { "crossentropy": 1.789371132850647, "epoch": 5.14, "grad_norm": 1.7890625, "grad_norm_var": 0.0280914306640625, "learning_rate": 0.0001381503980116172, "loss": 35.7874, "step": 47 }, { "crossentropy": 1.7998543977737427, "epoch": 5.16, "grad_norm": 1.859375, "grad_norm_var": 0.020580037434895834, "learning_rate": 6.15582970243117e-05, "loss": 35.9971, "step": 48 }, { "crossentropy": 1.7249021530151367, "epoch": 6.02, "grad_norm": 1.6484375, "grad_norm_var": 0.013252512613932291, "learning_rate": 1.541333133436018e-05, "loss": 34.498, "step": 49 }, { "crossentropy": 1.7648383378982544, "epoch": 6.04, "grad_norm": 1.78125, "grad_norm_var": 0.012572224934895833, "learning_rate": 0.0, "loss": 35.2968, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.29220314955776e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }