{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04740459824602986, "eval_steps": 20, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_accuracy": 0.9098228663446055, "eval_f1": 0.3, "eval_loss": 0.39109358191490173, "eval_precision": 0.2222222222222222, "eval_recall": 0.46153846153846156, "eval_runtime": 50.0411, "eval_samples_per_second": 5.416, "eval_steps_per_second": 0.18, "step": 0 }, { "epoch": 0.00047404598246029864, "grad_norm": 2.736264228820801, "learning_rate": 9.478672985781992e-08, "loss": 0.6485, "step": 1 }, { "epoch": 0.0009480919649205973, "grad_norm": 2.5090606212615967, "learning_rate": 1.8957345971563984e-07, "loss": 0.6663, "step": 2 }, { "epoch": 0.001422137947380896, "grad_norm": 2.8418514728546143, "learning_rate": 2.843601895734597e-07, "loss": 0.6669, "step": 3 }, { "epoch": 0.0018961839298411946, "grad_norm": 3.081920862197876, "learning_rate": 3.791469194312797e-07, "loss": 0.7195, "step": 4 }, { "epoch": 0.002370229912301493, "grad_norm": 2.9263253211975098, "learning_rate": 4.7393364928909956e-07, "loss": 0.699, "step": 5 }, { "epoch": 0.002844275894761792, "grad_norm": 2.3481531143188477, "learning_rate": 5.687203791469194e-07, "loss": 0.6352, "step": 6 }, { "epoch": 0.0033183218772220905, "grad_norm": 2.840491533279419, "learning_rate": 6.635071090047394e-07, "loss": 0.6859, "step": 7 }, { "epoch": 0.003792367859682389, "grad_norm": 2.8396637439727783, "learning_rate": 7.582938388625594e-07, "loss": 0.6912, "step": 8 }, { "epoch": 0.004266413842142688, "grad_norm": 3.1124157905578613, "learning_rate": 8.530805687203792e-07, "loss": 0.6481, "step": 9 }, { "epoch": 0.004740459824602986, "grad_norm": 2.768177032470703, "learning_rate": 9.478672985781991e-07, "loss": 0.6911, "step": 10 }, { "epoch": 0.0052145058070632855, "grad_norm": 2.4753313064575195, "learning_rate": 1.042654028436019e-06, "loss": 0.5654, "step": 11 }, { "epoch": 0.005688551789523584, "grad_norm": 3.73299241065979, "learning_rate": 1.1374407582938388e-06, "loss": 0.7249, "step": 12 }, { "epoch": 0.006162597771983883, "grad_norm": 2.8647408485412598, "learning_rate": 1.2322274881516587e-06, "loss": 0.571, "step": 13 }, { "epoch": 0.006636643754444181, "grad_norm": 1.9949300289154053, "learning_rate": 1.3270142180094788e-06, "loss": 0.5458, "step": 14 }, { "epoch": 0.00711068973690448, "grad_norm": 2.8095905780792236, "learning_rate": 1.4218009478672987e-06, "loss": 0.6974, "step": 15 }, { "epoch": 0.007584735719364778, "grad_norm": 2.3465747833251953, "learning_rate": 1.5165876777251187e-06, "loss": 0.5798, "step": 16 }, { "epoch": 0.008058781701825076, "grad_norm": 2.2376415729522705, "learning_rate": 1.6113744075829384e-06, "loss": 0.6219, "step": 17 }, { "epoch": 0.008532827684285376, "grad_norm": 2.2321646213531494, "learning_rate": 1.7061611374407585e-06, "loss": 0.6327, "step": 18 }, { "epoch": 0.009006873666745675, "grad_norm": 2.9532177448272705, "learning_rate": 1.8009478672985784e-06, "loss": 0.7287, "step": 19 }, { "epoch": 0.009480919649205973, "grad_norm": 2.7521305084228516, "learning_rate": 1.8957345971563982e-06, "loss": 0.7904, "step": 20 }, { "epoch": 0.009480919649205973, "eval_accuracy": 0.9162640901771336, "eval_f1": 0.3157894736842105, "eval_loss": 0.3854508101940155, "eval_precision": 0.24, "eval_recall": 0.46153846153846156, "eval_runtime": 50.1605, "eval_samples_per_second": 5.403, "eval_steps_per_second": 0.179, "step": 20 }, { "epoch": 0.009954965631666271, "grad_norm": 2.5812387466430664, "learning_rate": 1.990521327014218e-06, "loss": 0.6582, "step": 21 }, { "epoch": 0.010429011614126571, "grad_norm": 2.5878043174743652, "learning_rate": 2.085308056872038e-06, "loss": 0.5975, "step": 22 }, { "epoch": 0.01090305759658687, "grad_norm": 2.4602837562561035, "learning_rate": 2.180094786729858e-06, "loss": 0.6356, "step": 23 }, { "epoch": 0.011377103579047167, "grad_norm": 2.857377290725708, "learning_rate": 2.2748815165876777e-06, "loss": 0.6933, "step": 24 }, { "epoch": 0.011851149561507466, "grad_norm": 2.478761911392212, "learning_rate": 2.369668246445498e-06, "loss": 0.6806, "step": 25 }, { "epoch": 0.012325195543967766, "grad_norm": 2.6150331497192383, "learning_rate": 2.4644549763033174e-06, "loss": 0.6727, "step": 26 }, { "epoch": 0.012799241526428064, "grad_norm": 2.4646215438842773, "learning_rate": 2.5592417061611373e-06, "loss": 0.7231, "step": 27 }, { "epoch": 0.013273287508888362, "grad_norm": 2.3204421997070312, "learning_rate": 2.6540284360189576e-06, "loss": 0.715, "step": 28 }, { "epoch": 0.01374733349134866, "grad_norm": 2.1901276111602783, "learning_rate": 2.7488151658767775e-06, "loss": 0.6207, "step": 29 }, { "epoch": 0.01422137947380896, "grad_norm": 2.659156322479248, "learning_rate": 2.8436018957345973e-06, "loss": 0.6465, "step": 30 }, { "epoch": 0.014695425456269258, "grad_norm": 3.0104305744171143, "learning_rate": 2.938388625592417e-06, "loss": 0.6848, "step": 31 }, { "epoch": 0.015169471438729557, "grad_norm": 3.2612526416778564, "learning_rate": 3.0331753554502375e-06, "loss": 0.6094, "step": 32 }, { "epoch": 0.015643517421189856, "grad_norm": 2.8630073070526123, "learning_rate": 3.1279620853080574e-06, "loss": 0.6679, "step": 33 }, { "epoch": 0.016117563403650153, "grad_norm": 3.1366546154022217, "learning_rate": 3.222748815165877e-06, "loss": 0.6961, "step": 34 }, { "epoch": 0.016591609386110453, "grad_norm": 2.5289793014526367, "learning_rate": 3.3175355450236967e-06, "loss": 0.6363, "step": 35 }, { "epoch": 0.017065655368570753, "grad_norm": 1.996009111404419, "learning_rate": 3.412322274881517e-06, "loss": 0.6137, "step": 36 }, { "epoch": 0.01753970135103105, "grad_norm": 2.309265613555908, "learning_rate": 3.507109004739337e-06, "loss": 0.5873, "step": 37 }, { "epoch": 0.01801374733349135, "grad_norm": 2.2232859134674072, "learning_rate": 3.6018957345971567e-06, "loss": 0.6081, "step": 38 }, { "epoch": 0.018487793315951646, "grad_norm": 2.608635902404785, "learning_rate": 3.6966824644549766e-06, "loss": 0.5946, "step": 39 }, { "epoch": 0.018961839298411946, "grad_norm": 2.9958667755126953, "learning_rate": 3.7914691943127964e-06, "loss": 0.6496, "step": 40 }, { "epoch": 0.018961839298411946, "eval_accuracy": 0.9299516908212561, "eval_f1": 0.304, "eval_loss": 0.357759952545166, "eval_precision": 0.2602739726027397, "eval_recall": 0.36538461538461536, "eval_runtime": 50.7819, "eval_samples_per_second": 5.337, "eval_steps_per_second": 0.177, "step": 40 }, { "epoch": 0.019435885280872246, "grad_norm": 2.9960222244262695, "learning_rate": 3.886255924170616e-06, "loss": 0.7012, "step": 41 }, { "epoch": 0.019909931263332542, "grad_norm": 2.469219923019409, "learning_rate": 3.981042654028436e-06, "loss": 0.5172, "step": 42 }, { "epoch": 0.020383977245792842, "grad_norm": 2.2367403507232666, "learning_rate": 4.075829383886256e-06, "loss": 0.6086, "step": 43 }, { "epoch": 0.020858023228253142, "grad_norm": 2.455852746963501, "learning_rate": 4.170616113744076e-06, "loss": 0.6616, "step": 44 }, { "epoch": 0.02133206921071344, "grad_norm": 2.6048426628112793, "learning_rate": 4.265402843601897e-06, "loss": 0.6319, "step": 45 }, { "epoch": 0.02180611519317374, "grad_norm": 2.633476495742798, "learning_rate": 4.360189573459716e-06, "loss": 0.5807, "step": 46 }, { "epoch": 0.022280161175634035, "grad_norm": 2.525595188140869, "learning_rate": 4.4549763033175355e-06, "loss": 0.566, "step": 47 }, { "epoch": 0.022754207158094335, "grad_norm": 2.116396427154541, "learning_rate": 4.549763033175355e-06, "loss": 0.5222, "step": 48 }, { "epoch": 0.023228253140554635, "grad_norm": 2.2869620323181152, "learning_rate": 4.644549763033176e-06, "loss": 0.6677, "step": 49 }, { "epoch": 0.02370229912301493, "grad_norm": 3.656646966934204, "learning_rate": 4.739336492890996e-06, "loss": 0.6329, "step": 50 }, { "epoch": 0.02417634510547523, "grad_norm": 2.4779574871063232, "learning_rate": 4.834123222748816e-06, "loss": 0.6179, "step": 51 }, { "epoch": 0.02465039108793553, "grad_norm": 2.9239354133605957, "learning_rate": 4.928909952606635e-06, "loss": 0.5679, "step": 52 }, { "epoch": 0.025124437070395828, "grad_norm": 2.596090793609619, "learning_rate": 5.023696682464455e-06, "loss": 0.5907, "step": 53 }, { "epoch": 0.025598483052856127, "grad_norm": 2.4275245666503906, "learning_rate": 5.118483412322275e-06, "loss": 0.5432, "step": 54 }, { "epoch": 0.026072529035316427, "grad_norm": 3.1805362701416016, "learning_rate": 5.213270142180096e-06, "loss": 0.6221, "step": 55 }, { "epoch": 0.026546575017776724, "grad_norm": 2.3142030239105225, "learning_rate": 5.308056872037915e-06, "loss": 0.6459, "step": 56 }, { "epoch": 0.027020621000237024, "grad_norm": 2.3154592514038086, "learning_rate": 5.402843601895735e-06, "loss": 0.5481, "step": 57 }, { "epoch": 0.02749466698269732, "grad_norm": 2.70127272605896, "learning_rate": 5.497630331753555e-06, "loss": 0.5592, "step": 58 }, { "epoch": 0.02796871296515762, "grad_norm": 2.5554442405700684, "learning_rate": 5.592417061611375e-06, "loss": 0.587, "step": 59 }, { "epoch": 0.02844275894761792, "grad_norm": 2.4974448680877686, "learning_rate": 5.687203791469195e-06, "loss": 0.5209, "step": 60 }, { "epoch": 0.02844275894761792, "eval_accuracy": 0.9468599033816425, "eval_f1": 0.23255813953488372, "eval_loss": 0.30114662647247314, "eval_precision": 0.29411764705882354, "eval_recall": 0.19230769230769232, "eval_runtime": 50.6864, "eval_samples_per_second": 5.347, "eval_steps_per_second": 0.178, "step": 60 }, { "epoch": 0.028916804930078217, "grad_norm": 2.1992790699005127, "learning_rate": 5.7819905213270145e-06, "loss": 0.6134, "step": 61 }, { "epoch": 0.029390850912538517, "grad_norm": 2.135422468185425, "learning_rate": 5.876777251184834e-06, "loss": 0.5917, "step": 62 }, { "epoch": 0.029864896894998817, "grad_norm": 1.9710865020751953, "learning_rate": 5.971563981042654e-06, "loss": 0.5341, "step": 63 }, { "epoch": 0.030338942877459113, "grad_norm": 2.6831486225128174, "learning_rate": 6.066350710900475e-06, "loss": 0.5878, "step": 64 }, { "epoch": 0.030812988859919413, "grad_norm": 2.277893543243408, "learning_rate": 6.161137440758295e-06, "loss": 0.5407, "step": 65 }, { "epoch": 0.03128703484237971, "grad_norm": 2.153470993041992, "learning_rate": 6.255924170616115e-06, "loss": 0.5109, "step": 66 }, { "epoch": 0.03176108082484001, "grad_norm": 2.458293914794922, "learning_rate": 6.350710900473935e-06, "loss": 0.5687, "step": 67 }, { "epoch": 0.032235126807300306, "grad_norm": 1.6730012893676758, "learning_rate": 6.445497630331754e-06, "loss": 0.511, "step": 68 }, { "epoch": 0.032709172789760606, "grad_norm": 2.294477939605713, "learning_rate": 6.5402843601895735e-06, "loss": 0.5163, "step": 69 }, { "epoch": 0.033183218772220906, "grad_norm": 1.931765079498291, "learning_rate": 6.635071090047393e-06, "loss": 0.5463, "step": 70 }, { "epoch": 0.033657264754681206, "grad_norm": 1.9582473039627075, "learning_rate": 6.729857819905213e-06, "loss": 0.5404, "step": 71 }, { "epoch": 0.034131310737141506, "grad_norm": 2.352447986602783, "learning_rate": 6.824644549763034e-06, "loss": 0.5004, "step": 72 }, { "epoch": 0.0346053567196018, "grad_norm": 2.5306575298309326, "learning_rate": 6.919431279620854e-06, "loss": 0.491, "step": 73 }, { "epoch": 0.0350794027020621, "grad_norm": 1.954287052154541, "learning_rate": 7.014218009478674e-06, "loss": 0.5096, "step": 74 }, { "epoch": 0.0355534486845224, "grad_norm": 1.7585203647613525, "learning_rate": 7.1090047393364935e-06, "loss": 0.4437, "step": 75 }, { "epoch": 0.0360274946669827, "grad_norm": 1.9448845386505127, "learning_rate": 7.203791469194313e-06, "loss": 0.4902, "step": 76 }, { "epoch": 0.036501540649443, "grad_norm": 2.1417629718780518, "learning_rate": 7.298578199052133e-06, "loss": 0.5599, "step": 77 }, { "epoch": 0.03697558663190329, "grad_norm": 1.9677048921585083, "learning_rate": 7.393364928909953e-06, "loss": 0.5196, "step": 78 }, { "epoch": 0.03744963261436359, "grad_norm": 4.773871421813965, "learning_rate": 7.488151658767773e-06, "loss": 0.7193, "step": 79 }, { "epoch": 0.03792367859682389, "grad_norm": 1.7716329097747803, "learning_rate": 7.582938388625593e-06, "loss": 0.482, "step": 80 }, { "epoch": 0.03792367859682389, "eval_accuracy": 0.9524959742351047, "eval_f1": 0.21333333333333335, "eval_loss": 0.2597336769104004, "eval_precision": 0.34782608695652173, "eval_recall": 0.15384615384615385, "eval_runtime": 50.3051, "eval_samples_per_second": 5.387, "eval_steps_per_second": 0.179, "step": 80 }, { "epoch": 0.03839772457928419, "grad_norm": 1.7248247861862183, "learning_rate": 7.677725118483414e-06, "loss": 0.4988, "step": 81 }, { "epoch": 0.03887177056174449, "grad_norm": 2.6806564331054688, "learning_rate": 7.772511848341233e-06, "loss": 0.6173, "step": 82 }, { "epoch": 0.03934581654420479, "grad_norm": 3.3090500831604004, "learning_rate": 7.867298578199053e-06, "loss": 0.6747, "step": 83 }, { "epoch": 0.039819862526665084, "grad_norm": 1.7768396139144897, "learning_rate": 7.962085308056872e-06, "loss": 0.4246, "step": 84 }, { "epoch": 0.040293908509125384, "grad_norm": 2.553398847579956, "learning_rate": 8.056872037914693e-06, "loss": 0.5885, "step": 85 }, { "epoch": 0.040767954491585684, "grad_norm": 2.223745107650757, "learning_rate": 8.151658767772512e-06, "loss": 0.631, "step": 86 }, { "epoch": 0.041242000474045984, "grad_norm": 2.303098440170288, "learning_rate": 8.246445497630333e-06, "loss": 0.4689, "step": 87 }, { "epoch": 0.041716046456506284, "grad_norm": 1.8970552682876587, "learning_rate": 8.341232227488152e-06, "loss": 0.523, "step": 88 }, { "epoch": 0.04219009243896658, "grad_norm": 2.505955934524536, "learning_rate": 8.436018957345973e-06, "loss": 0.4935, "step": 89 }, { "epoch": 0.04266413842142688, "grad_norm": 1.875301718711853, "learning_rate": 8.530805687203793e-06, "loss": 0.4522, "step": 90 }, { "epoch": 0.04313818440388718, "grad_norm": 1.900534749031067, "learning_rate": 8.625592417061612e-06, "loss": 0.4667, "step": 91 }, { "epoch": 0.04361223038634748, "grad_norm": 3.142495632171631, "learning_rate": 8.720379146919431e-06, "loss": 0.7367, "step": 92 }, { "epoch": 0.04408627636880778, "grad_norm": 2.096675395965576, "learning_rate": 8.815165876777252e-06, "loss": 0.5286, "step": 93 }, { "epoch": 0.04456032235126807, "grad_norm": 2.4111526012420654, "learning_rate": 8.909952606635071e-06, "loss": 0.4165, "step": 94 }, { "epoch": 0.04503436833372837, "grad_norm": 2.4553468227386475, "learning_rate": 9.004739336492892e-06, "loss": 0.5336, "step": 95 }, { "epoch": 0.04550841431618867, "grad_norm": 2.3772170543670654, "learning_rate": 9.09952606635071e-06, "loss": 0.455, "step": 96 }, { "epoch": 0.04598246029864897, "grad_norm": 2.652953863143921, "learning_rate": 9.194312796208532e-06, "loss": 0.5995, "step": 97 }, { "epoch": 0.04645650628110927, "grad_norm": 1.7384384870529175, "learning_rate": 9.289099526066352e-06, "loss": 0.4296, "step": 98 }, { "epoch": 0.04693055226356957, "grad_norm": 2.1251447200775146, "learning_rate": 9.383886255924171e-06, "loss": 0.5505, "step": 99 }, { "epoch": 0.04740459824602986, "grad_norm": 1.7407325506210327, "learning_rate": 9.478672985781992e-06, "loss": 0.4165, "step": 100 }, { "epoch": 0.04740459824602986, "eval_accuracy": 0.9549114331723028, "eval_f1": 0.40425531914893614, "eval_loss": 0.2602430284023285, "eval_precision": 0.4523809523809524, "eval_recall": 0.36538461538461536, "eval_runtime": 50.5287, "eval_samples_per_second": 5.363, "eval_steps_per_second": 0.178, "step": 100 } ], "logging_steps": 1, "max_steps": 2109, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.677803521880883e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }