{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968454258675079, "eval_steps": 60, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004206098843322818, "grad_norm": 0.5299676656723022, "learning_rate": 2e-05, "loss": 1.77, "step": 1 }, { "epoch": 0.004206098843322818, "eval_loss": 1.9898090362548828, "eval_runtime": 65.4901, "eval_samples_per_second": 41.167, "eval_steps_per_second": 20.583, "step": 1 }, { "epoch": 0.008412197686645636, "grad_norm": 0.5349143743515015, "learning_rate": 4e-05, "loss": 1.745, "step": 2 }, { "epoch": 0.012618296529968454, "grad_norm": 0.5094612240791321, "learning_rate": 6e-05, "loss": 1.7007, "step": 3 }, { "epoch": 0.016824395373291272, "grad_norm": 0.5268917083740234, "learning_rate": 8e-05, "loss": 1.6582, "step": 4 }, { "epoch": 0.02103049421661409, "grad_norm": 0.5398459434509277, "learning_rate": 0.0001, "loss": 1.7391, "step": 5 }, { "epoch": 0.025236593059936908, "grad_norm": 0.5613242983818054, "learning_rate": 0.00012, "loss": 1.6436, "step": 6 }, { "epoch": 0.029442691903259727, "grad_norm": 0.45200833678245544, "learning_rate": 0.00014, "loss": 1.5859, "step": 7 }, { "epoch": 0.033648790746582544, "grad_norm": 0.3515471816062927, "learning_rate": 0.00016, "loss": 1.4385, "step": 8 }, { "epoch": 0.03785488958990536, "grad_norm": 0.2859397530555725, "learning_rate": 0.00018, "loss": 1.427, "step": 9 }, { "epoch": 0.04206098843322818, "grad_norm": 0.3456244170665741, "learning_rate": 0.0002, "loss": 1.408, "step": 10 }, { "epoch": 0.046267087276550996, "grad_norm": 0.42806366086006165, "learning_rate": 0.0001999904234053922, "loss": 1.4541, "step": 11 }, { "epoch": 0.050473186119873815, "grad_norm": 0.5130056142807007, "learning_rate": 0.00019996169545579207, "loss": 1.3664, "step": 12 }, { "epoch": 0.054679284963196635, "grad_norm": 0.39732199907302856, "learning_rate": 0.00019991382165351814, "loss": 1.3276, "step": 13 }, { "epoch": 0.058885383806519455, "grad_norm": 0.3794059157371521, "learning_rate": 0.00019984681116793038, "loss": 1.3153, "step": 14 }, { "epoch": 0.06309148264984227, "grad_norm": 0.27593305706977844, "learning_rate": 0.00019976067683367385, "loss": 1.2554, "step": 15 }, { "epoch": 0.06729758149316509, "grad_norm": 0.28591713309288025, "learning_rate": 0.00019965543514822062, "loss": 1.283, "step": 16 }, { "epoch": 0.07150368033648791, "grad_norm": 0.26724520325660706, "learning_rate": 0.00019953110626870979, "loss": 1.1646, "step": 17 }, { "epoch": 0.07570977917981073, "grad_norm": 0.24611811339855194, "learning_rate": 0.0001993877140080869, "loss": 1.1762, "step": 18 }, { "epoch": 0.07991587802313355, "grad_norm": 0.2281356304883957, "learning_rate": 0.000199225285830543, "loss": 1.1467, "step": 19 }, { "epoch": 0.08412197686645637, "grad_norm": 0.22052225470542908, "learning_rate": 0.00019904385284625424, "loss": 1.1377, "step": 20 }, { "epoch": 0.08832807570977919, "grad_norm": 0.23453611135482788, "learning_rate": 0.00019884344980542338, "loss": 1.1162, "step": 21 }, { "epoch": 0.09253417455310199, "grad_norm": 0.22467325627803802, "learning_rate": 0.00019862411509162406, "loss": 1.155, "step": 22 }, { "epoch": 0.09674027339642481, "grad_norm": 0.2170630544424057, "learning_rate": 0.00019838589071444903, "loss": 1.1279, "step": 23 }, { "epoch": 0.10094637223974763, "grad_norm": 0.21346993744373322, "learning_rate": 0.00019812882230146398, "loss": 1.0946, "step": 24 }, { "epoch": 0.10515247108307045, "grad_norm": 0.21408380568027496, "learning_rate": 0.00019785295908946848, "loss": 1.0889, "step": 25 }, { "epoch": 0.10935856992639327, "grad_norm": 0.22000430524349213, "learning_rate": 0.0001975583539150655, "loss": 1.0476, "step": 26 }, { "epoch": 0.11356466876971609, "grad_norm": 0.20778758823871613, "learning_rate": 0.00019724506320454153, "loss": 1.0954, "step": 27 }, { "epoch": 0.11777076761303891, "grad_norm": 0.22037693858146667, "learning_rate": 0.00019691314696305913, "loss": 1.055, "step": 28 }, { "epoch": 0.12197686645636173, "grad_norm": 0.20428280532360077, "learning_rate": 0.0001965626687631641, "loss": 1.0159, "step": 29 }, { "epoch": 0.12618296529968454, "grad_norm": 0.20502522587776184, "learning_rate": 0.00019619369573260924, "loss": 1.0254, "step": 30 }, { "epoch": 0.13038906414300735, "grad_norm": 0.2062043696641922, "learning_rate": 0.0001958062985414972, "loss": 0.9779, "step": 31 }, { "epoch": 0.13459516298633017, "grad_norm": 0.22229152917861938, "learning_rate": 0.00019540055138874505, "loss": 1.0201, "step": 32 }, { "epoch": 0.138801261829653, "grad_norm": 0.21910454332828522, "learning_rate": 0.00019497653198787264, "loss": 0.9958, "step": 33 }, { "epoch": 0.14300736067297581, "grad_norm": 0.22630847990512848, "learning_rate": 0.0001945343215521182, "loss": 0.9892, "step": 34 }, { "epoch": 0.14721345951629863, "grad_norm": 0.21370179951190948, "learning_rate": 0.00019407400477888315, "loss": 0.9409, "step": 35 }, { "epoch": 0.15141955835962145, "grad_norm": 0.22368259727954865, "learning_rate": 0.00019359566983351013, "loss": 0.9626, "step": 36 }, { "epoch": 0.15562565720294427, "grad_norm": 0.24231955409049988, "learning_rate": 0.00019309940833239626, "loss": 0.9914, "step": 37 }, { "epoch": 0.1598317560462671, "grad_norm": 0.24762062728405, "learning_rate": 0.00019258531532544585, "loss": 0.9311, "step": 38 }, { "epoch": 0.1640378548895899, "grad_norm": 0.21248659491539001, "learning_rate": 0.00019205348927786532, "loss": 0.9399, "step": 39 }, { "epoch": 0.16824395373291273, "grad_norm": 0.2374017834663391, "learning_rate": 0.00019150403205130383, "loss": 0.9664, "step": 40 }, { "epoch": 0.17245005257623555, "grad_norm": 0.25241079926490784, "learning_rate": 0.0001909370488843436, "loss": 0.9475, "step": 41 }, { "epoch": 0.17665615141955837, "grad_norm": 0.24083252251148224, "learning_rate": 0.00019035264837234347, "loss": 0.9602, "step": 42 }, { "epoch": 0.1808622502628812, "grad_norm": 0.24024806916713715, "learning_rate": 0.0001897509424466393, "loss": 0.9167, "step": 43 }, { "epoch": 0.18506834910620398, "grad_norm": 0.2538228929042816, "learning_rate": 0.0001891320463531055, "loss": 0.904, "step": 44 }, { "epoch": 0.1892744479495268, "grad_norm": 0.2393723875284195, "learning_rate": 0.00018849607863008193, "loss": 0.8927, "step": 45 }, { "epoch": 0.19348054679284962, "grad_norm": 0.2394389659166336, "learning_rate": 0.00018784316108566996, "loss": 0.8675, "step": 46 }, { "epoch": 0.19768664563617244, "grad_norm": 0.24351197481155396, "learning_rate": 0.00018717341877440226, "loss": 0.873, "step": 47 }, { "epoch": 0.20189274447949526, "grad_norm": 0.2396727055311203, "learning_rate": 0.000186486979973291, "loss": 0.8972, "step": 48 }, { "epoch": 0.20609884332281808, "grad_norm": 0.2674885392189026, "learning_rate": 0.0001857839761572586, "loss": 0.8613, "step": 49 }, { "epoch": 0.2103049421661409, "grad_norm": 0.25012922286987305, "learning_rate": 0.00018506454197395606, "loss": 0.8481, "step": 50 }, { "epoch": 0.21451104100946372, "grad_norm": 0.23941218852996826, "learning_rate": 0.0001843288152179739, "loss": 0.8638, "step": 51 }, { "epoch": 0.21871713985278654, "grad_norm": 0.25679612159729004, "learning_rate": 0.00018357693680444976, "loss": 0.8928, "step": 52 }, { "epoch": 0.22292323869610936, "grad_norm": 0.25766387581825256, "learning_rate": 0.00018280905074207884, "loss": 0.8997, "step": 53 }, { "epoch": 0.22712933753943218, "grad_norm": 0.24009671807289124, "learning_rate": 0.00018202530410553163, "loss": 0.8536, "step": 54 }, { "epoch": 0.231335436382755, "grad_norm": 0.24763701856136322, "learning_rate": 0.00018122584700728443, "loss": 0.8581, "step": 55 }, { "epoch": 0.23554153522607782, "grad_norm": 0.2651236653327942, "learning_rate": 0.0001804108325688679, "loss": 0.8164, "step": 56 }, { "epoch": 0.23974763406940064, "grad_norm": 0.23978441953659058, "learning_rate": 0.0001795804168915396, "loss": 0.8321, "step": 57 }, { "epoch": 0.24395373291272346, "grad_norm": 0.2508217394351959, "learning_rate": 0.00017873475902638553, "loss": 0.815, "step": 58 }, { "epoch": 0.24815983175604628, "grad_norm": 0.2765346169471741, "learning_rate": 0.00017787402094385666, "loss": 0.8674, "step": 59 }, { "epoch": 0.25236593059936907, "grad_norm": 0.27468088269233704, "learning_rate": 0.00017699836750274662, "loss": 0.8841, "step": 60 }, { "epoch": 0.25236593059936907, "eval_loss": 1.075273036956787, "eval_runtime": 66.5472, "eval_samples_per_second": 40.513, "eval_steps_per_second": 20.256, "step": 60 }, { "epoch": 0.2565720294426919, "grad_norm": 0.27056071162223816, "learning_rate": 0.00017610796641861581, "loss": 0.8459, "step": 61 }, { "epoch": 0.2607781282860147, "grad_norm": 0.2631956934928894, "learning_rate": 0.00017520298823166873, "loss": 0.8853, "step": 62 }, { "epoch": 0.26498422712933756, "grad_norm": 0.28352680802345276, "learning_rate": 0.00017428360627408978, "loss": 0.8625, "step": 63 }, { "epoch": 0.26919032597266035, "grad_norm": 0.24897028505802155, "learning_rate": 0.00017334999663684504, "loss": 0.8627, "step": 64 }, { "epoch": 0.2733964248159832, "grad_norm": 0.2620624303817749, "learning_rate": 0.00017240233813595478, "loss": 0.8088, "step": 65 }, { "epoch": 0.277602523659306, "grad_norm": 0.24983716011047363, "learning_rate": 0.0001714408122782448, "loss": 0.8318, "step": 66 }, { "epoch": 0.28180862250262884, "grad_norm": 0.2667708694934845, "learning_rate": 0.000170465603226582, "loss": 0.8368, "step": 67 }, { "epoch": 0.28601472134595163, "grad_norm": 0.2828388214111328, "learning_rate": 0.0001694768977646013, "loss": 0.8282, "step": 68 }, { "epoch": 0.2902208201892745, "grad_norm": 0.2581498324871063, "learning_rate": 0.0001684748852609306, "loss": 0.8375, "step": 69 }, { "epoch": 0.29442691903259727, "grad_norm": 0.27101799845695496, "learning_rate": 0.0001674597576329207, "loss": 0.818, "step": 70 }, { "epoch": 0.29863301787592006, "grad_norm": 0.27231255173683167, "learning_rate": 0.00016643170930988698, "loss": 0.843, "step": 71 }, { "epoch": 0.3028391167192429, "grad_norm": 0.2566690444946289, "learning_rate": 0.00016539093719586994, "loss": 0.8348, "step": 72 }, { "epoch": 0.3070452155625657, "grad_norm": 0.2482360601425171, "learning_rate": 0.00016433764063192194, "loss": 0.8122, "step": 73 }, { "epoch": 0.31125131440588855, "grad_norm": 0.25742995738983154, "learning_rate": 0.00016327202135792685, "loss": 0.776, "step": 74 }, { "epoch": 0.31545741324921134, "grad_norm": 0.25104233622550964, "learning_rate": 0.00016219428347396053, "loss": 0.7823, "step": 75 }, { "epoch": 0.3196635120925342, "grad_norm": 0.2921640872955322, "learning_rate": 0.00016110463340119913, "loss": 0.8127, "step": 76 }, { "epoch": 0.323869610935857, "grad_norm": 0.26554426550865173, "learning_rate": 0.00016000327984238292, "loss": 0.7716, "step": 77 }, { "epoch": 0.3280757097791798, "grad_norm": 0.24784542620182037, "learning_rate": 0.00015889043374184286, "loss": 0.7714, "step": 78 }, { "epoch": 0.3322818086225026, "grad_norm": 0.26592087745666504, "learning_rate": 0.0001577663082450984, "loss": 0.7397, "step": 79 }, { "epoch": 0.33648790746582546, "grad_norm": 0.3072431683540344, "learning_rate": 0.00015663111865803285, "loss": 0.7579, "step": 80 }, { "epoch": 0.34069400630914826, "grad_norm": 0.29445305466651917, "learning_rate": 0.00015548508240565583, "loss": 0.7998, "step": 81 }, { "epoch": 0.3449001051524711, "grad_norm": 0.26053521037101746, "learning_rate": 0.0001543284189904592, "loss": 0.7832, "step": 82 }, { "epoch": 0.3491062039957939, "grad_norm": 0.2956802248954773, "learning_rate": 0.00015316134995037545, "loss": 0.8054, "step": 83 }, { "epoch": 0.35331230283911674, "grad_norm": 0.2673921287059784, "learning_rate": 0.00015198409881634617, "loss": 0.8061, "step": 84 }, { "epoch": 0.35751840168243953, "grad_norm": 0.2793889045715332, "learning_rate": 0.00015079689106950854, "loss": 0.7902, "step": 85 }, { "epoch": 0.3617245005257624, "grad_norm": 0.26718223094940186, "learning_rate": 0.00014959995409800873, "loss": 0.7769, "step": 86 }, { "epoch": 0.3659305993690852, "grad_norm": 0.300536572933197, "learning_rate": 0.00014839351715344968, "loss": 0.8245, "step": 87 }, { "epoch": 0.37013669821240797, "grad_norm": 0.2824515998363495, "learning_rate": 0.00014717781130698212, "loss": 0.8122, "step": 88 }, { "epoch": 0.3743427970557308, "grad_norm": 0.28050506114959717, "learning_rate": 0.00014595306940504716, "loss": 0.778, "step": 89 }, { "epoch": 0.3785488958990536, "grad_norm": 0.2906787395477295, "learning_rate": 0.00014471952602477866, "loss": 0.7703, "step": 90 }, { "epoch": 0.38275499474237645, "grad_norm": 0.298177033662796, "learning_rate": 0.00014347741742907433, "loss": 0.7672, "step": 91 }, { "epoch": 0.38696109358569925, "grad_norm": 0.27583765983581543, "learning_rate": 0.00014222698152134374, "loss": 0.7784, "step": 92 }, { "epoch": 0.3911671924290221, "grad_norm": 0.28834670782089233, "learning_rate": 0.0001409684577999423, "loss": 0.8278, "step": 93 }, { "epoch": 0.3953732912723449, "grad_norm": 0.29721811413764954, "learning_rate": 0.00013970208731229974, "loss": 0.7997, "step": 94 }, { "epoch": 0.39957939011566773, "grad_norm": 0.2688146233558655, "learning_rate": 0.00013842811260875168, "loss": 0.7465, "step": 95 }, { "epoch": 0.4037854889589905, "grad_norm": 0.27095234394073486, "learning_rate": 0.0001371467776960837, "loss": 0.757, "step": 96 }, { "epoch": 0.40799158780231337, "grad_norm": 0.30743858218193054, "learning_rate": 0.0001358583279907961, "loss": 0.7882, "step": 97 }, { "epoch": 0.41219768664563616, "grad_norm": 0.274873822927475, "learning_rate": 0.00013456301027209882, "loss": 0.7737, "step": 98 }, { "epoch": 0.416403785488959, "grad_norm": 0.25485867261886597, "learning_rate": 0.00013326107263464558, "loss": 0.7454, "step": 99 }, { "epoch": 0.4206098843322818, "grad_norm": 0.2994694709777832, "learning_rate": 0.00013195276444101547, "loss": 0.8133, "step": 100 }, { "epoch": 0.42481598317560465, "grad_norm": 0.2943129241466522, "learning_rate": 0.0001306383362739523, "loss": 0.7501, "step": 101 }, { "epoch": 0.42902208201892744, "grad_norm": 0.2888595163822174, "learning_rate": 0.0001293180398883701, "loss": 0.7522, "step": 102 }, { "epoch": 0.4332281808622503, "grad_norm": 0.28455743193626404, "learning_rate": 0.00012799212816313376, "loss": 0.7278, "step": 103 }, { "epoch": 0.4374342797055731, "grad_norm": 0.32477039098739624, "learning_rate": 0.00012666085505262485, "loss": 0.819, "step": 104 }, { "epoch": 0.4416403785488959, "grad_norm": 0.28067031502723694, "learning_rate": 0.00012532447553810126, "loss": 0.7979, "step": 105 }, { "epoch": 0.4458464773922187, "grad_norm": 0.26430413126945496, "learning_rate": 0.00012398324557885994, "loss": 0.7497, "step": 106 }, { "epoch": 0.4500525762355415, "grad_norm": 0.27110588550567627, "learning_rate": 0.00012263742206321287, "loss": 0.7937, "step": 107 }, { "epoch": 0.45425867507886436, "grad_norm": 0.287041574716568, "learning_rate": 0.0001212872627592845, "loss": 0.7897, "step": 108 }, { "epoch": 0.45846477392218715, "grad_norm": 0.28561776876449585, "learning_rate": 0.00011993302626564102, "loss": 0.8011, "step": 109 }, { "epoch": 0.46267087276551, "grad_norm": 0.2852155566215515, "learning_rate": 0.00011857497196176049, "loss": 0.7426, "step": 110 }, { "epoch": 0.4668769716088328, "grad_norm": 0.2712121903896332, "learning_rate": 0.00011721335995835336, "loss": 0.7277, "step": 111 }, { "epoch": 0.47108307045215564, "grad_norm": 0.2779647409915924, "learning_rate": 0.00011584845104754304, "loss": 0.7698, "step": 112 }, { "epoch": 0.47528916929547843, "grad_norm": 0.2774654030799866, "learning_rate": 0.00011448050665291587, "loss": 0.7583, "step": 113 }, { "epoch": 0.4794952681388013, "grad_norm": 0.3046507239341736, "learning_rate": 0.00011310978877945007, "loss": 0.7987, "step": 114 }, { "epoch": 0.48370136698212407, "grad_norm": 0.2816363573074341, "learning_rate": 0.00011173655996333357, "loss": 0.7898, "step": 115 }, { "epoch": 0.4879074658254469, "grad_norm": 0.27383196353912354, "learning_rate": 0.00011036108322167988, "loss": 0.7248, "step": 116 }, { "epoch": 0.4921135646687697, "grad_norm": 0.28104445338249207, "learning_rate": 0.00010898362200215197, "loss": 0.7144, "step": 117 }, { "epoch": 0.49631966351209256, "grad_norm": 0.28643152117729187, "learning_rate": 0.0001076044401325036, "loss": 0.7856, "step": 118 }, { "epoch": 0.5005257623554153, "grad_norm": 0.261483371257782, "learning_rate": 0.0001062238017700478, "loss": 0.7429, "step": 119 }, { "epoch": 0.5047318611987381, "grad_norm": 0.2796306908130646, "learning_rate": 0.00010484197135106263, "loss": 0.7772, "step": 120 }, { "epoch": 0.5047318611987381, "eval_loss": 0.9960550665855408, "eval_runtime": 65.8413, "eval_samples_per_second": 40.947, "eval_steps_per_second": 20.473, "step": 120 }, { "epoch": 0.508937960042061, "grad_norm": 0.3079998791217804, "learning_rate": 0.00010345921354014279, "loss": 0.7497, "step": 121 }, { "epoch": 0.5131440588853838, "grad_norm": 0.3106074929237366, "learning_rate": 0.00010207579317950827, "loss": 0.7568, "step": 122 }, { "epoch": 0.5173501577287066, "grad_norm": 0.27859166264533997, "learning_rate": 0.00010069197523827833, "loss": 0.7695, "step": 123 }, { "epoch": 0.5215562565720294, "grad_norm": 0.2840277850627899, "learning_rate": 9.930802476172169e-05, "loss": 0.7815, "step": 124 }, { "epoch": 0.5257623554153522, "grad_norm": 0.28042981028556824, "learning_rate": 9.792420682049174e-05, "loss": 0.7546, "step": 125 }, { "epoch": 0.5299684542586751, "grad_norm": 0.2857164144515991, "learning_rate": 9.654078645985722e-05, "loss": 0.7617, "step": 126 }, { "epoch": 0.5341745531019979, "grad_norm": 0.29590827226638794, "learning_rate": 9.515802864893739e-05, "loss": 0.748, "step": 127 }, { "epoch": 0.5383806519453207, "grad_norm": 0.29375162720680237, "learning_rate": 9.377619822995219e-05, "loss": 0.7532, "step": 128 }, { "epoch": 0.5425867507886435, "grad_norm": 0.28436464071273804, "learning_rate": 9.239555986749645e-05, "loss": 0.7511, "step": 129 }, { "epoch": 0.5467928496319664, "grad_norm": 0.29677248001098633, "learning_rate": 9.101637799784804e-05, "loss": 0.7456, "step": 130 }, { "epoch": 0.5509989484752892, "grad_norm": 0.27983585000038147, "learning_rate": 8.963891677832011e-05, "loss": 0.6888, "step": 131 }, { "epoch": 0.555205047318612, "grad_norm": 0.27517008781433105, "learning_rate": 8.826344003666647e-05, "loss": 0.7431, "step": 132 }, { "epoch": 0.5594111461619348, "grad_norm": 0.28197160363197327, "learning_rate": 8.689021122054996e-05, "loss": 0.7379, "step": 133 }, { "epoch": 0.5636172450052577, "grad_norm": 0.29125264286994934, "learning_rate": 8.551949334708415e-05, "loss": 0.7639, "step": 134 }, { "epoch": 0.5678233438485805, "grad_norm": 0.2851899266242981, "learning_rate": 8.415154895245697e-05, "loss": 0.7764, "step": 135 }, { "epoch": 0.5720294426919033, "grad_norm": 0.2771802544593811, "learning_rate": 8.278664004164665e-05, "loss": 0.6961, "step": 136 }, { "epoch": 0.576235541535226, "grad_norm": 0.27956414222717285, "learning_rate": 8.142502803823955e-05, "loss": 0.7454, "step": 137 }, { "epoch": 0.580441640378549, "grad_norm": 0.30068668723106384, "learning_rate": 8.0066973734359e-05, "loss": 0.7683, "step": 138 }, { "epoch": 0.5846477392218717, "grad_norm": 0.2820778489112854, "learning_rate": 7.871273724071553e-05, "loss": 0.7412, "step": 139 }, { "epoch": 0.5888538380651945, "grad_norm": 0.2672085165977478, "learning_rate": 7.736257793678714e-05, "loss": 0.716, "step": 140 }, { "epoch": 0.5930599369085173, "grad_norm": 0.27900293469429016, "learning_rate": 7.601675442114009e-05, "loss": 0.7259, "step": 141 }, { "epoch": 0.5972660357518401, "grad_norm": 0.2954063415527344, "learning_rate": 7.46755244618988e-05, "loss": 0.7047, "step": 142 }, { "epoch": 0.601472134595163, "grad_norm": 0.3212134838104248, "learning_rate": 7.333914494737514e-05, "loss": 0.7657, "step": 143 }, { "epoch": 0.6056782334384858, "grad_norm": 0.30651283264160156, "learning_rate": 7.200787183686625e-05, "loss": 0.7489, "step": 144 }, { "epoch": 0.6098843322818086, "grad_norm": 0.26834797859191895, "learning_rate": 7.068196011162994e-05, "loss": 0.7484, "step": 145 }, { "epoch": 0.6140904311251314, "grad_norm": 0.2777973711490631, "learning_rate": 6.936166372604773e-05, "loss": 0.7245, "step": 146 }, { "epoch": 0.6182965299684543, "grad_norm": 0.293694406747818, "learning_rate": 6.804723555898458e-05, "loss": 0.7211, "step": 147 }, { "epoch": 0.6225026288117771, "grad_norm": 0.28515610098838806, "learning_rate": 6.673892736535448e-05, "loss": 0.7439, "step": 148 }, { "epoch": 0.6267087276550999, "grad_norm": 0.2929891049861908, "learning_rate": 6.543698972790117e-05, "loss": 0.7434, "step": 149 }, { "epoch": 0.6309148264984227, "grad_norm": 0.29031944274902344, "learning_rate": 6.414167200920391e-05, "loss": 0.7176, "step": 150 }, { "epoch": 0.6351209253417456, "grad_norm": 0.2764637768268585, "learning_rate": 6.28532223039163e-05, "loss": 0.7503, "step": 151 }, { "epoch": 0.6393270241850684, "grad_norm": 0.2900468707084656, "learning_rate": 6.157188739124834e-05, "loss": 0.6879, "step": 152 }, { "epoch": 0.6435331230283912, "grad_norm": 0.2989012897014618, "learning_rate": 6.029791268770029e-05, "loss": 0.7135, "step": 153 }, { "epoch": 0.647739221871714, "grad_norm": 0.2998535931110382, "learning_rate": 5.903154220005771e-05, "loss": 0.7171, "step": 154 }, { "epoch": 0.6519453207150369, "grad_norm": 0.27283868193626404, "learning_rate": 5.777301847865629e-05, "loss": 0.7112, "step": 155 }, { "epoch": 0.6561514195583596, "grad_norm": 0.2988041341304779, "learning_rate": 5.652258257092569e-05, "loss": 0.7444, "step": 156 }, { "epoch": 0.6603575184016824, "grad_norm": 0.2845938205718994, "learning_rate": 5.528047397522133e-05, "loss": 0.716, "step": 157 }, { "epoch": 0.6645636172450052, "grad_norm": 0.29695218801498413, "learning_rate": 5.404693059495285e-05, "loss": 0.7585, "step": 158 }, { "epoch": 0.668769716088328, "grad_norm": 0.28558245301246643, "learning_rate": 5.282218869301788e-05, "loss": 0.6908, "step": 159 }, { "epoch": 0.6729758149316509, "grad_norm": 0.280200719833374, "learning_rate": 5.160648284655032e-05, "loss": 0.7508, "step": 160 }, { "epoch": 0.6771819137749737, "grad_norm": 0.2981257438659668, "learning_rate": 5.040004590199128e-05, "loss": 0.7147, "step": 161 }, { "epoch": 0.6813880126182965, "grad_norm": 0.2873106598854065, "learning_rate": 4.920310893049146e-05, "loss": 0.7011, "step": 162 }, { "epoch": 0.6855941114616193, "grad_norm": 0.2717635929584503, "learning_rate": 4.801590118365383e-05, "loss": 0.6668, "step": 163 }, { "epoch": 0.6898002103049422, "grad_norm": 0.27607038617134094, "learning_rate": 4.683865004962452e-05, "loss": 0.7033, "step": 164 }, { "epoch": 0.694006309148265, "grad_norm": 0.2881218194961548, "learning_rate": 4.567158100954083e-05, "loss": 0.7275, "step": 165 }, { "epoch": 0.6982124079915878, "grad_norm": 0.2758018672466278, "learning_rate": 4.4514917594344184e-05, "loss": 0.737, "step": 166 }, { "epoch": 0.7024185068349106, "grad_norm": 0.29527172446250916, "learning_rate": 4.3368881341967135e-05, "loss": 0.7433, "step": 167 }, { "epoch": 0.7066246056782335, "grad_norm": 0.2847643792629242, "learning_rate": 4.223369175490162e-05, "loss": 0.7471, "step": 168 }, { "epoch": 0.7108307045215563, "grad_norm": 0.2958676815032959, "learning_rate": 4.110956625815713e-05, "loss": 0.6838, "step": 169 }, { "epoch": 0.7150368033648791, "grad_norm": 0.28350576758384705, "learning_rate": 3.9996720157617094e-05, "loss": 0.7306, "step": 170 }, { "epoch": 0.7192429022082019, "grad_norm": 0.2808986008167267, "learning_rate": 3.8895366598800896e-05, "loss": 0.6823, "step": 171 }, { "epoch": 0.7234490010515248, "grad_norm": 0.2684039771556854, "learning_rate": 3.780571652603949e-05, "loss": 0.7105, "step": 172 }, { "epoch": 0.7276550998948476, "grad_norm": 0.28138425946235657, "learning_rate": 3.672797864207316e-05, "loss": 0.7221, "step": 173 }, { "epoch": 0.7318611987381703, "grad_norm": 0.2772335708141327, "learning_rate": 3.566235936807808e-05, "loss": 0.6835, "step": 174 }, { "epoch": 0.7360672975814931, "grad_norm": 0.27244430780410767, "learning_rate": 3.460906280413007e-05, "loss": 0.6577, "step": 175 }, { "epoch": 0.7402733964248159, "grad_norm": 0.2977088689804077, "learning_rate": 3.3568290690113034e-05, "loss": 0.7213, "step": 176 }, { "epoch": 0.7444794952681388, "grad_norm": 0.289736270904541, "learning_rate": 3.25402423670793e-05, "loss": 0.7154, "step": 177 }, { "epoch": 0.7486855941114616, "grad_norm": 0.287818044424057, "learning_rate": 3.1525114739069415e-05, "loss": 0.6977, "step": 178 }, { "epoch": 0.7528916929547844, "grad_norm": 0.31408464908599854, "learning_rate": 3.0523102235398714e-05, "loss": 0.781, "step": 179 }, { "epoch": 0.7570977917981072, "grad_norm": 0.27790582180023193, "learning_rate": 2.9534396773417994e-05, "loss": 0.7169, "step": 180 }, { "epoch": 0.7570977917981072, "eval_loss": 0.9679059386253357, "eval_runtime": 66.127, "eval_samples_per_second": 40.77, "eval_steps_per_second": 20.385, "step": 180 }, { "epoch": 0.7613038906414301, "grad_norm": 0.28392866253852844, "learning_rate": 2.855918772175522e-05, "loss": 0.6662, "step": 181 }, { "epoch": 0.7655099894847529, "grad_norm": 0.2941664159297943, "learning_rate": 2.7597661864045233e-05, "loss": 0.6816, "step": 182 }, { "epoch": 0.7697160883280757, "grad_norm": 0.2740324139595032, "learning_rate": 2.6650003363154963e-05, "loss": 0.7046, "step": 183 }, { "epoch": 0.7739221871713985, "grad_norm": 0.2933352291584015, "learning_rate": 2.5716393725910215e-05, "loss": 0.7208, "step": 184 }, { "epoch": 0.7781282860147214, "grad_norm": 0.2843799591064453, "learning_rate": 2.47970117683313e-05, "loss": 0.685, "step": 185 }, { "epoch": 0.7823343848580442, "grad_norm": 0.27152329683303833, "learning_rate": 2.389203358138419e-05, "loss": 0.7176, "step": 186 }, { "epoch": 0.786540483701367, "grad_norm": 0.2916063964366913, "learning_rate": 2.3001632497253424e-05, "loss": 0.7439, "step": 187 }, { "epoch": 0.7907465825446898, "grad_norm": 0.27915897965431213, "learning_rate": 2.2125979056143364e-05, "loss": 0.7, "step": 188 }, { "epoch": 0.7949526813880127, "grad_norm": 0.30191752314567566, "learning_rate": 2.1265240973614486e-05, "loss": 0.7377, "step": 189 }, { "epoch": 0.7991587802313355, "grad_norm": 0.286101758480072, "learning_rate": 2.0419583108460418e-05, "loss": 0.6916, "step": 190 }, { "epoch": 0.8033648790746583, "grad_norm": 0.2800692319869995, "learning_rate": 1.958916743113214e-05, "loss": 0.7374, "step": 191 }, { "epoch": 0.807570977917981, "grad_norm": 0.27292168140411377, "learning_rate": 1.877415299271561e-05, "loss": 0.6757, "step": 192 }, { "epoch": 0.8117770767613038, "grad_norm": 0.28094640374183655, "learning_rate": 1.7974695894468384e-05, "loss": 0.7024, "step": 193 }, { "epoch": 0.8159831756046267, "grad_norm": 0.2871862053871155, "learning_rate": 1.7190949257921196e-05, "loss": 0.7173, "step": 194 }, { "epoch": 0.8201892744479495, "grad_norm": 0.27189600467681885, "learning_rate": 1.642306319555027e-05, "loss": 0.7019, "step": 195 }, { "epoch": 0.8243953732912723, "grad_norm": 0.28526559472084045, "learning_rate": 1.5671184782026106e-05, "loss": 0.7113, "step": 196 }, { "epoch": 0.8286014721345951, "grad_norm": 0.2855590283870697, "learning_rate": 1.4935458026043959e-05, "loss": 0.6977, "step": 197 }, { "epoch": 0.832807570977918, "grad_norm": 0.28118449449539185, "learning_rate": 1.4216023842741455e-05, "loss": 0.7241, "step": 198 }, { "epoch": 0.8370136698212408, "grad_norm": 0.28818827867507935, "learning_rate": 1.3513020026709023e-05, "loss": 0.6964, "step": 199 }, { "epoch": 0.8412197686645636, "grad_norm": 0.3235337436199188, "learning_rate": 1.2826581225597767e-05, "loss": 0.7406, "step": 200 }, { "epoch": 0.8454258675078864, "grad_norm": 0.2899198830127716, "learning_rate": 1.2156838914330072e-05, "loss": 0.7374, "step": 201 }, { "epoch": 0.8496319663512093, "grad_norm": 0.28662335872650146, "learning_rate": 1.1503921369918091e-05, "loss": 0.7039, "step": 202 }, { "epoch": 0.8538380651945321, "grad_norm": 0.2748032510280609, "learning_rate": 1.0867953646894525e-05, "loss": 0.7517, "step": 203 }, { "epoch": 0.8580441640378549, "grad_norm": 0.27125102281570435, "learning_rate": 1.0249057553360742e-05, "loss": 0.6948, "step": 204 }, { "epoch": 0.8622502628811777, "grad_norm": 0.2795623242855072, "learning_rate": 9.647351627656543e-06, "loss": 0.7123, "step": 205 }, { "epoch": 0.8664563617245006, "grad_norm": 0.28939002752304077, "learning_rate": 9.062951115656403e-06, "loss": 0.7266, "step": 206 }, { "epoch": 0.8706624605678234, "grad_norm": 0.2878707945346832, "learning_rate": 8.495967948696192e-06, "loss": 0.7335, "step": 207 }, { "epoch": 0.8748685594111462, "grad_norm": 0.27489086985588074, "learning_rate": 7.946510722134692e-06, "loss": 0.692, "step": 208 }, { "epoch": 0.879074658254469, "grad_norm": 0.2869216799736023, "learning_rate": 7.4146846745541506e-06, "loss": 0.7193, "step": 209 }, { "epoch": 0.8832807570977917, "grad_norm": 0.2801933288574219, "learning_rate": 6.900591667603751e-06, "loss": 0.7178, "step": 210 }, { "epoch": 0.8874868559411146, "grad_norm": 0.2767332196235657, "learning_rate": 6.40433016648988e-06, "loss": 0.7499, "step": 211 }, { "epoch": 0.8916929547844374, "grad_norm": 0.2783336043357849, "learning_rate": 5.925995221116853e-06, "loss": 0.7152, "step": 212 }, { "epoch": 0.8958990536277602, "grad_norm": 0.27832481265068054, "learning_rate": 5.465678447881828e-06, "loss": 0.6977, "step": 213 }, { "epoch": 0.900105152471083, "grad_norm": 0.2835717499256134, "learning_rate": 5.023468012127364e-06, "loss": 0.7251, "step": 214 }, { "epoch": 0.9043112513144059, "grad_norm": 0.27503538131713867, "learning_rate": 4.599448611254964e-06, "loss": 0.7166, "step": 215 }, { "epoch": 0.9085173501577287, "grad_norm": 0.26619476079940796, "learning_rate": 4.193701458502807e-06, "loss": 0.7095, "step": 216 }, { "epoch": 0.9127234490010515, "grad_norm": 0.2752280533313751, "learning_rate": 3.80630426739077e-06, "loss": 0.7412, "step": 217 }, { "epoch": 0.9169295478443743, "grad_norm": 0.281093567609787, "learning_rate": 3.4373312368358944e-06, "loss": 0.7592, "step": 218 }, { "epoch": 0.9211356466876972, "grad_norm": 0.28015753626823425, "learning_rate": 3.086853036940862e-06, "loss": 0.7104, "step": 219 }, { "epoch": 0.92534174553102, "grad_norm": 0.2644014358520508, "learning_rate": 2.754936795458485e-06, "loss": 0.6985, "step": 220 }, { "epoch": 0.9295478443743428, "grad_norm": 0.2755027413368225, "learning_rate": 2.4416460849345123e-06, "loss": 0.7159, "step": 221 }, { "epoch": 0.9337539432176656, "grad_norm": 0.28020283579826355, "learning_rate": 2.1470409105315283e-06, "loss": 0.7389, "step": 222 }, { "epoch": 0.9379600420609885, "grad_norm": 0.2773683965206146, "learning_rate": 1.8711776985360308e-06, "loss": 0.686, "step": 223 }, { "epoch": 0.9421661409043113, "grad_norm": 0.2784758508205414, "learning_rate": 1.61410928555098e-06, "loss": 0.6857, "step": 224 }, { "epoch": 0.9463722397476341, "grad_norm": 0.2857016623020172, "learning_rate": 1.3758849083759352e-06, "loss": 0.6982, "step": 225 }, { "epoch": 0.9505783385909569, "grad_norm": 0.27618998289108276, "learning_rate": 1.1565501945766222e-06, "loss": 0.7328, "step": 226 }, { "epoch": 0.9547844374342797, "grad_norm": 0.273423969745636, "learning_rate": 9.56147153745779e-07, "loss": 0.6762, "step": 227 }, { "epoch": 0.9589905362776026, "grad_norm": 0.2603454291820526, "learning_rate": 7.747141694570026e-07, "loss": 0.6784, "step": 228 }, { "epoch": 0.9631966351209253, "grad_norm": 0.2638219892978668, "learning_rate": 6.122859919130974e-07, "loss": 0.731, "step": 229 }, { "epoch": 0.9674027339642481, "grad_norm": 0.28604456782341003, "learning_rate": 4.6889373129022085e-07, "loss": 0.6937, "step": 230 }, { "epoch": 0.9716088328075709, "grad_norm": 0.2867179811000824, "learning_rate": 3.445648517793942e-07, "loss": 0.7492, "step": 231 }, { "epoch": 0.9758149316508938, "grad_norm": 0.27991774678230286, "learning_rate": 2.3932316632614416e-07, "loss": 0.7411, "step": 232 }, { "epoch": 0.9800210304942166, "grad_norm": 0.2658878266811371, "learning_rate": 1.5318883206962842e-07, "loss": 0.7317, "step": 233 }, { "epoch": 0.9842271293375394, "grad_norm": 0.26533135771751404, "learning_rate": 8.617834648185774e-08, "loss": 0.6636, "step": 234 }, { "epoch": 0.9884332281808622, "grad_norm": 0.26577314734458923, "learning_rate": 3.8304544207945495e-08, "loss": 0.7273, "step": 235 }, { "epoch": 0.9926393270241851, "grad_norm": 0.2715383768081665, "learning_rate": 9.576594607807465e-09, "loss": 0.7253, "step": 236 }, { "epoch": 0.9968454258675079, "grad_norm": 0.28140708804130554, "learning_rate": 0.0, "loss": 0.6756, "step": 237 } ], "logging_steps": 1, "max_steps": 237, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.319764496895181e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }