{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 20, "global_step": 135, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 2e-05, "loss": 0.9768, "step": 1 }, { "epoch": 0.07, "learning_rate": 4e-05, "loss": 1.0553, "step": 2 }, { "epoch": 0.11, "learning_rate": 6e-05, "loss": 0.9074, "step": 3 }, { "epoch": 0.15, "learning_rate": 8e-05, "loss": 1.0351, "step": 4 }, { "epoch": 0.19, "learning_rate": 0.0001, "loss": 0.9918, "step": 5 }, { "epoch": 0.22, "learning_rate": 0.00012, "loss": 0.9872, "step": 6 }, { "epoch": 0.26, "learning_rate": 0.00014, "loss": 0.9573, "step": 7 }, { "epoch": 0.3, "learning_rate": 0.00016, "loss": 1.0466, "step": 8 }, { "epoch": 0.33, "learning_rate": 0.00018, "loss": 0.8995, "step": 9 }, { "epoch": 0.37, "learning_rate": 0.0002, "loss": 0.9041, "step": 10 }, { "epoch": 0.41, "learning_rate": 0.00019996841892833, "loss": 0.936, "step": 11 }, { "epoch": 0.44, "learning_rate": 0.00019987369566060176, "loss": 0.8254, "step": 12 }, { "epoch": 0.48, "learning_rate": 0.0001997158900260614, "loss": 0.9508, "step": 13 }, { "epoch": 0.52, "learning_rate": 0.00019949510169813003, "loss": 0.929, "step": 14 }, { "epoch": 0.56, "learning_rate": 0.0001992114701314478, "loss": 0.9618, "step": 15 }, { "epoch": 0.59, "learning_rate": 0.0001988651744737914, "loss": 0.9317, "step": 16 }, { "epoch": 0.63, "learning_rate": 0.00019845643345292054, "loss": 0.9399, "step": 17 }, { "epoch": 0.67, "learning_rate": 0.0001979855052384247, "loss": 0.9377, "step": 18 }, { "epoch": 0.7, "learning_rate": 0.00019745268727865774, "loss": 0.9048, "step": 19 }, { "epoch": 0.74, "learning_rate": 0.0001968583161128631, "loss": 0.9311, "step": 20 }, { "epoch": 0.74, "eval_loss": 0.8045752644538879, "eval_runtime": 2.684, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 20 }, { "epoch": 0.78, "learning_rate": 0.0001962027671586086, "loss": 0.9376, "step": 21 }, { "epoch": 0.81, "learning_rate": 0.00019548645447466431, "loss": 0.8598, "step": 22 }, { "epoch": 0.85, "learning_rate": 0.00019470983049947444, "loss": 0.991, "step": 23 }, { "epoch": 0.89, "learning_rate": 0.00019387338576538744, "loss": 0.8472, "step": 24 }, { "epoch": 0.93, "learning_rate": 0.00019297764858882514, "loss": 0.8818, "step": 25 }, { "epoch": 0.96, "learning_rate": 0.00019202318473658705, "loss": 0.8879, "step": 26 }, { "epoch": 1.0, "learning_rate": 0.00019101059706849957, "loss": 0.8483, "step": 27 }, { "epoch": 1.04, "learning_rate": 0.0001899405251566371, "loss": 0.9505, "step": 28 }, { "epoch": 1.07, "learning_rate": 0.00018881364488135448, "loss": 0.9116, "step": 29 }, { "epoch": 1.11, "learning_rate": 0.00018763066800438636, "loss": 0.8575, "step": 30 }, { "epoch": 1.15, "learning_rate": 0.00018639234171928353, "loss": 0.8093, "step": 31 }, { "epoch": 1.19, "learning_rate": 0.00018509944817946922, "loss": 0.7966, "step": 32 }, { "epoch": 1.22, "learning_rate": 0.0001837528040042142, "loss": 0.8263, "step": 33 }, { "epoch": 1.26, "learning_rate": 0.00018235325976284275, "loss": 0.7951, "step": 34 }, { "epoch": 1.3, "learning_rate": 0.00018090169943749476, "loss": 0.849, "step": 35 }, { "epoch": 1.33, "learning_rate": 0.00017939903986478355, "loss": 0.863, "step": 36 }, { "epoch": 1.37, "learning_rate": 0.00017784623015670238, "loss": 0.8144, "step": 37 }, { "epoch": 1.41, "learning_rate": 0.0001762442511011448, "loss": 0.8078, "step": 38 }, { "epoch": 1.44, "learning_rate": 0.00017459411454241822, "loss": 0.7997, "step": 39 }, { "epoch": 1.48, "learning_rate": 0.00017289686274214118, "loss": 0.9322, "step": 40 }, { "epoch": 1.48, "eval_loss": 0.7793169617652893, "eval_runtime": 2.6811, "eval_samples_per_second": 1.119, "eval_steps_per_second": 0.746, "step": 40 }, { "epoch": 1.52, "learning_rate": 0.00017115356772092857, "loss": 0.8279, "step": 41 }, { "epoch": 1.56, "learning_rate": 0.0001693653305812805, "loss": 0.8759, "step": 42 }, { "epoch": 1.59, "learning_rate": 0.00016753328081210245, "loss": 0.8748, "step": 43 }, { "epoch": 1.63, "learning_rate": 0.00016565857557529566, "loss": 0.7638, "step": 44 }, { "epoch": 1.67, "learning_rate": 0.000163742398974869, "loss": 0.7941, "step": 45 }, { "epoch": 1.7, "learning_rate": 0.00016178596130903344, "loss": 0.8321, "step": 46 }, { "epoch": 1.74, "learning_rate": 0.0001597904983057519, "loss": 0.894, "step": 47 }, { "epoch": 1.78, "learning_rate": 0.00015775727034222675, "loss": 0.9176, "step": 48 }, { "epoch": 1.81, "learning_rate": 0.00015568756164881882, "loss": 0.8286, "step": 49 }, { "epoch": 1.85, "learning_rate": 0.00015358267949789966, "loss": 0.9328, "step": 50 }, { "epoch": 1.89, "learning_rate": 0.00015144395337815064, "loss": 0.8644, "step": 51 }, { "epoch": 1.93, "learning_rate": 0.00014927273415482915, "loss": 0.7769, "step": 52 }, { "epoch": 1.96, "learning_rate": 0.0001470703932165333, "loss": 0.8, "step": 53 }, { "epoch": 2.0, "learning_rate": 0.00014483832160900326, "loss": 0.7781, "step": 54 }, { "epoch": 2.04, "learning_rate": 0.00014257792915650728, "loss": 0.7852, "step": 55 }, { "epoch": 2.07, "learning_rate": 0.00014029064357136628, "loss": 0.7796, "step": 56 }, { "epoch": 2.11, "learning_rate": 0.00013797790955218014, "loss": 0.8287, "step": 57 }, { "epoch": 2.15, "learning_rate": 0.00013564118787132506, "loss": 0.6845, "step": 58 }, { "epoch": 2.19, "learning_rate": 0.00013328195445229868, "loss": 0.7821, "step": 59 }, { "epoch": 2.22, "learning_rate": 0.00013090169943749476, "loss": 0.708, "step": 60 }, { "epoch": 2.22, "eval_loss": 0.7880761027336121, "eval_runtime": 2.6843, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 60 }, { "epoch": 2.26, "learning_rate": 0.0001285019262469976, "loss": 0.8098, "step": 61 }, { "epoch": 2.3, "learning_rate": 0.00012608415062898972, "loss": 0.82, "step": 62 }, { "epoch": 2.33, "learning_rate": 0.00012364989970237248, "loss": 0.7187, "step": 63 }, { "epoch": 2.37, "learning_rate": 0.00012120071099220549, "loss": 0.7802, "step": 64 }, { "epoch": 2.41, "learning_rate": 0.00011873813145857249, "loss": 0.6834, "step": 65 }, { "epoch": 2.44, "learning_rate": 0.00011626371651948838, "loss": 0.6808, "step": 66 }, { "epoch": 2.48, "learning_rate": 0.0001137790290684638, "loss": 0.7881, "step": 67 }, { "epoch": 2.52, "learning_rate": 0.00011128563848734816, "loss": 0.7281, "step": 68 }, { "epoch": 2.56, "learning_rate": 0.00010878511965507434, "loss": 0.7231, "step": 69 }, { "epoch": 2.59, "learning_rate": 0.00010627905195293135, "loss": 0.6938, "step": 70 }, { "epoch": 2.63, "learning_rate": 0.00010376901826699348, "loss": 0.7633, "step": 71 }, { "epoch": 2.67, "learning_rate": 0.00010125660398833528, "loss": 0.8253, "step": 72 }, { "epoch": 2.7, "learning_rate": 9.874339601166473e-05, "loss": 0.8197, "step": 73 }, { "epoch": 2.74, "learning_rate": 9.623098173300654e-05, "loss": 0.7403, "step": 74 }, { "epoch": 2.78, "learning_rate": 9.372094804706867e-05, "loss": 0.8175, "step": 75 }, { "epoch": 2.81, "learning_rate": 9.121488034492569e-05, "loss": 0.7249, "step": 76 }, { "epoch": 2.85, "learning_rate": 8.871436151265184e-05, "loss": 0.7029, "step": 77 }, { "epoch": 2.89, "learning_rate": 8.62209709315362e-05, "loss": 0.8081, "step": 78 }, { "epoch": 2.93, "learning_rate": 8.373628348051165e-05, "loss": 0.7087, "step": 79 }, { "epoch": 2.96, "learning_rate": 8.126186854142752e-05, "loss": 0.762, "step": 80 }, { "epoch": 2.96, "eval_loss": 0.7806326746940613, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 80 }, { "epoch": 3.0, "learning_rate": 7.879928900779456e-05, "loss": 0.6724, "step": 81 }, { "epoch": 3.04, "learning_rate": 7.635010029762756e-05, "loss": 0.578, "step": 82 }, { "epoch": 3.07, "learning_rate": 7.391584937101033e-05, "loss": 0.6599, "step": 83 }, { "epoch": 3.11, "learning_rate": 7.149807375300239e-05, "loss": 0.732, "step": 84 }, { "epoch": 3.15, "learning_rate": 6.909830056250527e-05, "loss": 0.6144, "step": 85 }, { "epoch": 3.19, "learning_rate": 6.671804554770135e-05, "loss": 0.6812, "step": 86 }, { "epoch": 3.22, "learning_rate": 6.435881212867493e-05, "loss": 0.6753, "step": 87 }, { "epoch": 3.26, "learning_rate": 6.20220904478199e-05, "loss": 0.6341, "step": 88 }, { "epoch": 3.3, "learning_rate": 5.9709356428633746e-05, "loss": 0.6752, "step": 89 }, { "epoch": 3.33, "learning_rate": 5.7422070843492734e-05, "loss": 0.6995, "step": 90 }, { "epoch": 3.37, "learning_rate": 5.5161678390996796e-05, "loss": 0.6411, "step": 91 }, { "epoch": 3.41, "learning_rate": 5.292960678346675e-05, "loss": 0.6527, "step": 92 }, { "epoch": 3.44, "learning_rate": 5.072726584517086e-05, "loss": 0.7026, "step": 93 }, { "epoch": 3.48, "learning_rate": 4.8556046621849346e-05, "loss": 0.6603, "step": 94 }, { "epoch": 3.52, "learning_rate": 4.6417320502100316e-05, "loss": 0.6798, "step": 95 }, { "epoch": 3.56, "learning_rate": 4.431243835118124e-05, "loss": 0.623, "step": 96 }, { "epoch": 3.59, "learning_rate": 4.224272965777326e-05, "loss": 0.685, "step": 97 }, { "epoch": 3.63, "learning_rate": 4.020950169424815e-05, "loss": 0.7674, "step": 98 }, { "epoch": 3.67, "learning_rate": 3.821403869096658e-05, "loss": 0.7068, "step": 99 }, { "epoch": 3.7, "learning_rate": 3.6257601025131026e-05, "loss": 0.6724, "step": 100 }, { "epoch": 3.7, "eval_loss": 0.811485767364502, "eval_runtime": 2.6837, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 100 }, { "epoch": 3.74, "learning_rate": 3.4341424424704375e-05, "loss": 0.7169, "step": 101 }, { "epoch": 3.78, "learning_rate": 3.246671918789755e-05, "loss": 0.6499, "step": 102 }, { "epoch": 3.81, "learning_rate": 3.063466941871952e-05, "loss": 0.7342, "step": 103 }, { "epoch": 3.85, "learning_rate": 2.8846432279071467e-05, "loss": 0.6587, "step": 104 }, { "epoch": 3.89, "learning_rate": 2.7103137257858868e-05, "loss": 0.6042, "step": 105 }, { "epoch": 3.93, "learning_rate": 2.540588545758179e-05, "loss": 0.6507, "step": 106 }, { "epoch": 3.96, "learning_rate": 2.37557488988552e-05, "loss": 0.6646, "step": 107 }, { "epoch": 4.0, "learning_rate": 2.2153769843297667e-05, "loss": 0.6783, "step": 108 }, { "epoch": 4.04, "learning_rate": 2.0600960135216462e-05, "loss": 0.6036, "step": 109 }, { "epoch": 4.07, "learning_rate": 1.9098300562505266e-05, "loss": 0.6631, "step": 110 }, { "epoch": 4.11, "learning_rate": 1.7646740237157256e-05, "loss": 0.5743, "step": 111 }, { "epoch": 4.15, "learning_rate": 1.6247195995785837e-05, "loss": 0.6115, "step": 112 }, { "epoch": 4.19, "learning_rate": 1.4900551820530828e-05, "loss": 0.6387, "step": 113 }, { "epoch": 4.22, "learning_rate": 1.3607658280716473e-05, "loss": 0.6785, "step": 114 }, { "epoch": 4.26, "learning_rate": 1.2369331995613665e-05, "loss": 0.6049, "step": 115 }, { "epoch": 4.3, "learning_rate": 1.1186355118645554e-05, "loss": 0.5745, "step": 116 }, { "epoch": 4.33, "learning_rate": 1.0059474843362892e-05, "loss": 0.5218, "step": 117 }, { "epoch": 4.37, "learning_rate": 8.989402931500434e-06, "loss": 0.5492, "step": 118 }, { "epoch": 4.41, "learning_rate": 7.976815263412963e-06, "loss": 0.6314, "step": 119 }, { "epoch": 4.44, "learning_rate": 7.022351411174866e-06, "loss": 0.6404, "step": 120 }, { "epoch": 4.44, "eval_loss": 0.830319344997406, "eval_runtime": 2.6841, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.745, "step": 120 }, { "epoch": 4.48, "learning_rate": 6.126614234612593e-06, "loss": 0.6818, "step": 121 }, { "epoch": 4.52, "learning_rate": 5.290169500525577e-06, "loss": 0.6437, "step": 122 }, { "epoch": 4.56, "learning_rate": 4.513545525335705e-06, "loss": 0.6208, "step": 123 }, { "epoch": 4.59, "learning_rate": 3.797232841391407e-06, "loss": 0.5599, "step": 124 }, { "epoch": 4.63, "learning_rate": 3.1416838871368924e-06, "loss": 0.6858, "step": 125 }, { "epoch": 4.67, "learning_rate": 2.5473127213422763e-06, "loss": 0.7139, "step": 126 }, { "epoch": 4.7, "learning_rate": 2.014494761575314e-06, "loss": 0.6138, "step": 127 }, { "epoch": 4.74, "learning_rate": 1.543566547079467e-06, "loss": 0.631, "step": 128 }, { "epoch": 4.78, "learning_rate": 1.134825526208605e-06, "loss": 0.5295, "step": 129 }, { "epoch": 4.81, "learning_rate": 7.885298685522235e-07, "loss": 0.6071, "step": 130 }, { "epoch": 4.85, "learning_rate": 5.048983018699827e-07, "loss": 0.665, "step": 131 }, { "epoch": 4.89, "learning_rate": 2.841099739386066e-07, "loss": 0.6381, "step": 132 }, { "epoch": 4.93, "learning_rate": 1.2630433939825327e-07, "loss": 0.6353, "step": 133 }, { "epoch": 4.96, "learning_rate": 3.1581071670006015e-08, "loss": 0.7291, "step": 134 }, { "epoch": 5.0, "learning_rate": 0.0, "loss": 0.6235, "step": 135 } ], "logging_steps": 1, "max_steps": 135, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.638301223755776e+17, "trial_name": null, "trial_params": null }