{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0048543689320388345, "grad_norm": 6.210430663947802, "learning_rate": 9.999854640567861e-06, "loss": 0.3997, "step": 1 }, { "epoch": 0.009708737864077669, "grad_norm": 4.227189490011747, "learning_rate": 9.999418570723189e-06, "loss": 0.2524, "step": 2 }, { "epoch": 0.014563106796116505, "grad_norm": 7.973290607748759, "learning_rate": 9.998691815820732e-06, "loss": 0.3552, "step": 3 }, { "epoch": 0.019417475728155338, "grad_norm": 4.819894715953605, "learning_rate": 9.997674418116759e-06, "loss": 0.3119, "step": 4 }, { "epoch": 0.024271844660194174, "grad_norm": 4.09159126654525, "learning_rate": 9.996366436766612e-06, "loss": 0.3151, "step": 5 }, { "epoch": 0.02912621359223301, "grad_norm": 4.187126010756727, "learning_rate": 9.994767947821261e-06, "loss": 0.329, "step": 6 }, { "epoch": 0.03398058252427184, "grad_norm": 4.383597220390925, "learning_rate": 9.992879044222887e-06, "loss": 0.34, "step": 7 }, { "epoch": 0.038834951456310676, "grad_norm": 4.116613521137449, "learning_rate": 9.99069983579947e-06, "loss": 0.334, "step": 8 }, { "epoch": 0.043689320388349516, "grad_norm": 3.984906208195154, "learning_rate": 9.988230449258409e-06, "loss": 0.2341, "step": 9 }, { "epoch": 0.04854368932038835, "grad_norm": 3.339049042995617, "learning_rate": 9.985471028179155e-06, "loss": 0.1869, "step": 10 }, { "epoch": 0.05339805825242718, "grad_norm": 4.311519074818221, "learning_rate": 9.982421733004857e-06, "loss": 0.2406, "step": 11 }, { "epoch": 0.05825242718446602, "grad_norm": 4.233565022056961, "learning_rate": 9.979082741033047e-06, "loss": 0.2962, "step": 12 }, { "epoch": 0.06310679611650485, "grad_norm": 3.7179082049339316, "learning_rate": 9.975454246405312e-06, "loss": 0.2562, "step": 13 }, { "epoch": 0.06796116504854369, "grad_norm": 5.263472581117849, "learning_rate": 9.971536460096021e-06, "loss": 0.3807, "step": 14 }, { "epoch": 0.07281553398058252, "grad_norm": 5.2976417442365005, "learning_rate": 9.96732960990005e-06, "loss": 0.3832, "step": 15 }, { "epoch": 0.07766990291262135, "grad_norm": 3.6983893698309545, "learning_rate": 9.96283394041954e-06, "loss": 0.2894, "step": 16 }, { "epoch": 0.0825242718446602, "grad_norm": 4.524293189671337, "learning_rate": 9.95804971304968e-06, "loss": 0.2956, "step": 17 }, { "epoch": 0.08737864077669903, "grad_norm": 3.386669066277188, "learning_rate": 9.952977205963496e-06, "loss": 0.238, "step": 18 }, { "epoch": 0.09223300970873786, "grad_norm": 3.752398939649026, "learning_rate": 9.94761671409569e-06, "loss": 0.2737, "step": 19 }, { "epoch": 0.0970873786407767, "grad_norm": 3.689729761320351, "learning_rate": 9.941968549125481e-06, "loss": 0.3675, "step": 20 }, { "epoch": 0.10194174757281553, "grad_norm": 3.4111593550487314, "learning_rate": 9.936033039458494e-06, "loss": 0.2982, "step": 21 }, { "epoch": 0.10679611650485436, "grad_norm": 3.491057365985366, "learning_rate": 9.929810530207651e-06, "loss": 0.2903, "step": 22 }, { "epoch": 0.11165048543689321, "grad_norm": 3.634709283896408, "learning_rate": 9.923301383173119e-06, "loss": 0.3121, "step": 23 }, { "epoch": 0.11650485436893204, "grad_norm": 3.5935371403839604, "learning_rate": 9.916505976821262e-06, "loss": 0.282, "step": 24 }, { "epoch": 0.12135922330097088, "grad_norm": 3.30903147043253, "learning_rate": 9.909424706262647e-06, "loss": 0.2639, "step": 25 }, { "epoch": 0.1262135922330097, "grad_norm": 3.684588740468831, "learning_rate": 9.902057983229059e-06, "loss": 0.2612, "step": 26 }, { "epoch": 0.13106796116504854, "grad_norm": 3.3783312419992444, "learning_rate": 9.894406236049569e-06, "loss": 0.265, "step": 27 }, { "epoch": 0.13592233009708737, "grad_norm": 3.817639326362559, "learning_rate": 9.886469909625624e-06, "loss": 0.2991, "step": 28 }, { "epoch": 0.1407766990291262, "grad_norm": 4.031888162738911, "learning_rate": 9.87824946540519e-06, "loss": 0.3011, "step": 29 }, { "epoch": 0.14563106796116504, "grad_norm": 3.917822140150248, "learning_rate": 9.869745381355906e-06, "loss": 0.3071, "step": 30 }, { "epoch": 0.15048543689320387, "grad_norm": 3.604163323292, "learning_rate": 9.860958151937303e-06, "loss": 0.2693, "step": 31 }, { "epoch": 0.1553398058252427, "grad_norm": 3.708521598565974, "learning_rate": 9.851888288072053e-06, "loss": 0.2279, "step": 32 }, { "epoch": 0.16019417475728157, "grad_norm": 3.993300009629637, "learning_rate": 9.842536317116262e-06, "loss": 0.3392, "step": 33 }, { "epoch": 0.1650485436893204, "grad_norm": 3.964577385827282, "learning_rate": 9.832902782828801e-06, "loss": 0.3298, "step": 34 }, { "epoch": 0.16990291262135923, "grad_norm": 4.016692923561901, "learning_rate": 9.822988245339701e-06, "loss": 0.3701, "step": 35 }, { "epoch": 0.17475728155339806, "grad_norm": 4.2484018963482475, "learning_rate": 9.81279328111758e-06, "loss": 0.2648, "step": 36 }, { "epoch": 0.1796116504854369, "grad_norm": 4.042034687249592, "learning_rate": 9.802318482936121e-06, "loss": 0.3095, "step": 37 }, { "epoch": 0.18446601941747573, "grad_norm": 3.6743225094002923, "learning_rate": 9.791564459839609e-06, "loss": 0.2352, "step": 38 }, { "epoch": 0.18932038834951456, "grad_norm": 4.630316840096387, "learning_rate": 9.780531837107519e-06, "loss": 0.3897, "step": 39 }, { "epoch": 0.1941747572815534, "grad_norm": 4.828099520767686, "learning_rate": 9.769221256218165e-06, "loss": 0.3759, "step": 40 }, { "epoch": 0.19902912621359223, "grad_norm": 4.118845743261451, "learning_rate": 9.75763337481139e-06, "loss": 0.3365, "step": 41 }, { "epoch": 0.20388349514563106, "grad_norm": 4.344934491144676, "learning_rate": 9.745768866650339e-06, "loss": 0.4009, "step": 42 }, { "epoch": 0.2087378640776699, "grad_norm": 3.487694704812952, "learning_rate": 9.73362842158228e-06, "loss": 0.23, "step": 43 }, { "epoch": 0.21359223300970873, "grad_norm": 3.275779389897151, "learning_rate": 9.721212745498493e-06, "loss": 0.2448, "step": 44 }, { "epoch": 0.21844660194174756, "grad_norm": 4.291101533529537, "learning_rate": 9.70852256029323e-06, "loss": 0.3191, "step": 45 }, { "epoch": 0.22330097087378642, "grad_norm": 3.9239967632055475, "learning_rate": 9.695558603821735e-06, "loss": 0.3397, "step": 46 }, { "epoch": 0.22815533980582525, "grad_norm": 3.7298463420645165, "learning_rate": 9.682321629857348e-06, "loss": 0.2493, "step": 47 }, { "epoch": 0.23300970873786409, "grad_norm": 3.7705025843040114, "learning_rate": 9.66881240804768e-06, "loss": 0.2968, "step": 48 }, { "epoch": 0.23786407766990292, "grad_norm": 4.057040213379552, "learning_rate": 9.655031723869848e-06, "loss": 0.3169, "step": 49 }, { "epoch": 0.24271844660194175, "grad_norm": 4.781317338104395, "learning_rate": 9.64098037858483e-06, "loss": 0.3263, "step": 50 }, { "epoch": 0.24757281553398058, "grad_norm": 3.1648895858694015, "learning_rate": 9.626659189190852e-06, "loss": 0.2311, "step": 51 }, { "epoch": 0.2524271844660194, "grad_norm": 3.7360417663244663, "learning_rate": 9.612068988375898e-06, "loss": 0.3026, "step": 52 }, { "epoch": 0.25728155339805825, "grad_norm": 4.086739937648807, "learning_rate": 9.597210624469288e-06, "loss": 0.2589, "step": 53 }, { "epoch": 0.2621359223300971, "grad_norm": 3.8729560040865914, "learning_rate": 9.582084961392358e-06, "loss": 0.2784, "step": 54 }, { "epoch": 0.2669902912621359, "grad_norm": 3.749215458840012, "learning_rate": 9.566692878608229e-06, "loss": 0.3099, "step": 55 }, { "epoch": 0.27184466019417475, "grad_norm": 4.1230178138502085, "learning_rate": 9.551035271070665e-06, "loss": 0.3656, "step": 56 }, { "epoch": 0.2766990291262136, "grad_norm": 4.4592643364121285, "learning_rate": 9.53511304917204e-06, "loss": 0.3012, "step": 57 }, { "epoch": 0.2815533980582524, "grad_norm": 3.7722197604334218, "learning_rate": 9.51892713869041e-06, "loss": 0.2482, "step": 58 }, { "epoch": 0.28640776699029125, "grad_norm": 4.094725061396738, "learning_rate": 9.502478480735678e-06, "loss": 0.3385, "step": 59 }, { "epoch": 0.2912621359223301, "grad_norm": 3.8598094632718403, "learning_rate": 9.485768031694872e-06, "loss": 0.3278, "step": 60 }, { "epoch": 0.2961165048543689, "grad_norm": 5.038792104412311, "learning_rate": 9.468796763176549e-06, "loss": 0.433, "step": 61 }, { "epoch": 0.30097087378640774, "grad_norm": 3.7014546313533665, "learning_rate": 9.45156566195429e-06, "loss": 0.2728, "step": 62 }, { "epoch": 0.3058252427184466, "grad_norm": 3.7160657221842377, "learning_rate": 9.43407572990933e-06, "loss": 0.3043, "step": 63 }, { "epoch": 0.3106796116504854, "grad_norm": 3.9919360840768365, "learning_rate": 9.416327983972304e-06, "loss": 0.3023, "step": 64 }, { "epoch": 0.3155339805825243, "grad_norm": 4.5660474365815125, "learning_rate": 9.398323456064124e-06, "loss": 0.3423, "step": 65 }, { "epoch": 0.32038834951456313, "grad_norm": 4.238176345863259, "learning_rate": 9.380063193035968e-06, "loss": 0.2929, "step": 66 }, { "epoch": 0.32524271844660196, "grad_norm": 3.5077492554102037, "learning_rate": 9.361548256608421e-06, "loss": 0.2808, "step": 67 }, { "epoch": 0.3300970873786408, "grad_norm": 4.032578683152206, "learning_rate": 9.342779723309746e-06, "loss": 0.4029, "step": 68 }, { "epoch": 0.33495145631067963, "grad_norm": 3.3554051090761865, "learning_rate": 9.323758684413272e-06, "loss": 0.243, "step": 69 }, { "epoch": 0.33980582524271846, "grad_norm": 3.879585933471807, "learning_rate": 9.304486245873973e-06, "loss": 0.3128, "step": 70 }, { "epoch": 0.3446601941747573, "grad_norm": 3.731946111523948, "learning_rate": 9.284963528264133e-06, "loss": 0.2941, "step": 71 }, { "epoch": 0.34951456310679613, "grad_norm": 3.9721144674445967, "learning_rate": 9.26519166670821e-06, "loss": 0.312, "step": 72 }, { "epoch": 0.35436893203883496, "grad_norm": 3.9998847309911576, "learning_rate": 9.24517181081683e-06, "loss": 0.3064, "step": 73 }, { "epoch": 0.3592233009708738, "grad_norm": 3.374711639163579, "learning_rate": 9.22490512461995e-06, "loss": 0.253, "step": 74 }, { "epoch": 0.3640776699029126, "grad_norm": 3.9483890668891553, "learning_rate": 9.204392786499168e-06, "loss": 0.3359, "step": 75 }, { "epoch": 0.36893203883495146, "grad_norm": 3.531679899433064, "learning_rate": 9.183635989119211e-06, "loss": 0.292, "step": 76 }, { "epoch": 0.3737864077669903, "grad_norm": 4.14965051528395, "learning_rate": 9.162635939358593e-06, "loss": 0.2823, "step": 77 }, { "epoch": 0.3786407766990291, "grad_norm": 3.8580644890753533, "learning_rate": 9.141393858239435e-06, "loss": 0.2937, "step": 78 }, { "epoch": 0.38349514563106796, "grad_norm": 3.73083376723211, "learning_rate": 9.119910980856477e-06, "loss": 0.29, "step": 79 }, { "epoch": 0.3883495145631068, "grad_norm": 3.915057853049899, "learning_rate": 9.098188556305262e-06, "loss": 0.3841, "step": 80 }, { "epoch": 0.3932038834951456, "grad_norm": 3.499805881729913, "learning_rate": 9.076227847609513e-06, "loss": 0.256, "step": 81 }, { "epoch": 0.39805825242718446, "grad_norm": 2.891131262737878, "learning_rate": 9.054030131647682e-06, "loss": 0.2297, "step": 82 }, { "epoch": 0.4029126213592233, "grad_norm": 4.170596553603535, "learning_rate": 9.031596699078727e-06, "loss": 0.3718, "step": 83 }, { "epoch": 0.4077669902912621, "grad_norm": 4.079068599431899, "learning_rate": 9.008928854267054e-06, "loss": 0.3218, "step": 84 }, { "epoch": 0.41262135922330095, "grad_norm": 4.129363271952187, "learning_rate": 8.986027915206686e-06, "loss": 0.3648, "step": 85 }, { "epoch": 0.4174757281553398, "grad_norm": 4.365523852178193, "learning_rate": 8.962895213444618e-06, "loss": 0.3164, "step": 86 }, { "epoch": 0.4223300970873786, "grad_norm": 3.1374692240956423, "learning_rate": 8.939532094003409e-06, "loss": 0.273, "step": 87 }, { "epoch": 0.42718446601941745, "grad_norm": 4.043352386493659, "learning_rate": 8.91593991530297e-06, "loss": 0.3396, "step": 88 }, { "epoch": 0.4320388349514563, "grad_norm": 4.842730025251133, "learning_rate": 8.892120049081577e-06, "loss": 0.4713, "step": 89 }, { "epoch": 0.4368932038834951, "grad_norm": 3.814985084448428, "learning_rate": 8.868073880316125e-06, "loss": 0.3214, "step": 90 }, { "epoch": 0.441747572815534, "grad_norm": 3.635214826761109, "learning_rate": 8.843802807141584e-06, "loss": 0.3041, "step": 91 }, { "epoch": 0.44660194174757284, "grad_norm": 3.612227280899933, "learning_rate": 8.819308240769726e-06, "loss": 0.2984, "step": 92 }, { "epoch": 0.45145631067961167, "grad_norm": 3.7193763496451004, "learning_rate": 8.794591605407047e-06, "loss": 0.3337, "step": 93 }, { "epoch": 0.4563106796116505, "grad_norm": 4.097372819073227, "learning_rate": 8.769654338171986e-06, "loss": 0.3472, "step": 94 }, { "epoch": 0.46116504854368934, "grad_norm": 3.4942002457161347, "learning_rate": 8.744497889011344e-06, "loss": 0.2732, "step": 95 }, { "epoch": 0.46601941747572817, "grad_norm": 3.8438416337426995, "learning_rate": 8.71912372061598e-06, "loss": 0.3228, "step": 96 }, { "epoch": 0.470873786407767, "grad_norm": 3.304127854585822, "learning_rate": 8.693533308335786e-06, "loss": 0.2455, "step": 97 }, { "epoch": 0.47572815533980584, "grad_norm": 3.3694307387503835, "learning_rate": 8.667728140093876e-06, "loss": 0.3073, "step": 98 }, { "epoch": 0.48058252427184467, "grad_norm": 3.86344762718617, "learning_rate": 8.641709716300092e-06, "loss": 0.3285, "step": 99 }, { "epoch": 0.4854368932038835, "grad_norm": 3.8116089280754233, "learning_rate": 8.615479549763756e-06, "loss": 0.319, "step": 100 }, { "epoch": 0.49029126213592233, "grad_norm": 3.086354260448708, "learning_rate": 8.589039165605716e-06, "loss": 0.2178, "step": 101 }, { "epoch": 0.49514563106796117, "grad_norm": 3.7476551353810437, "learning_rate": 8.56239010116966e-06, "loss": 0.2919, "step": 102 }, { "epoch": 0.5, "grad_norm": 3.785093046897589, "learning_rate": 8.535533905932739e-06, "loss": 0.2571, "step": 103 }, { "epoch": 0.5048543689320388, "grad_norm": 4.0231917874093, "learning_rate": 8.508472141415468e-06, "loss": 0.3036, "step": 104 }, { "epoch": 0.5097087378640777, "grad_norm": 3.7385347116069565, "learning_rate": 8.481206381090934e-06, "loss": 0.2467, "step": 105 }, { "epoch": 0.5145631067961165, "grad_norm": 3.602545705608682, "learning_rate": 8.453738210293316e-06, "loss": 0.2186, "step": 106 }, { "epoch": 0.5194174757281553, "grad_norm": 4.1007833695819675, "learning_rate": 8.426069226125695e-06, "loss": 0.2737, "step": 107 }, { "epoch": 0.5242718446601942, "grad_norm": 3.80868624453031, "learning_rate": 8.398201037367202e-06, "loss": 0.2791, "step": 108 }, { "epoch": 0.529126213592233, "grad_norm": 3.5951751438189734, "learning_rate": 8.370135264379475e-06, "loss": 0.2632, "step": 109 }, { "epoch": 0.5339805825242718, "grad_norm": 3.5348844873169227, "learning_rate": 8.341873539012443e-06, "loss": 0.2909, "step": 110 }, { "epoch": 0.5388349514563107, "grad_norm": 3.8559278281037503, "learning_rate": 8.313417504509446e-06, "loss": 0.2859, "step": 111 }, { "epoch": 0.5436893203883495, "grad_norm": 3.6567434562276953, "learning_rate": 8.284768815411693e-06, "loss": 0.3081, "step": 112 }, { "epoch": 0.5485436893203883, "grad_norm": 4.349634463147372, "learning_rate": 8.255929137462049e-06, "loss": 0.3138, "step": 113 }, { "epoch": 0.5533980582524272, "grad_norm": 4.140565405065645, "learning_rate": 8.226900147508205e-06, "loss": 0.2776, "step": 114 }, { "epoch": 0.558252427184466, "grad_norm": 3.4696274936350897, "learning_rate": 8.197683533405156e-06, "loss": 0.2594, "step": 115 }, { "epoch": 0.5631067961165048, "grad_norm": 3.9661109477485432, "learning_rate": 8.168280993917078e-06, "loss": 0.3351, "step": 116 }, { "epoch": 0.5679611650485437, "grad_norm": 3.497321004086631, "learning_rate": 8.138694238618543e-06, "loss": 0.291, "step": 117 }, { "epoch": 0.5728155339805825, "grad_norm": 3.564655666582997, "learning_rate": 8.108924987795137e-06, "loss": 0.3024, "step": 118 }, { "epoch": 0.5776699029126213, "grad_norm": 3.278540066507871, "learning_rate": 8.078974972343414e-06, "loss": 0.282, "step": 119 }, { "epoch": 0.5825242718446602, "grad_norm": 4.048208611405343, "learning_rate": 8.048845933670274e-06, "loss": 0.2921, "step": 120 }, { "epoch": 0.587378640776699, "grad_norm": 3.6600519017648185, "learning_rate": 8.01853962359169e-06, "loss": 0.2478, "step": 121 }, { "epoch": 0.5922330097087378, "grad_norm": 4.471503427628977, "learning_rate": 7.988057804230878e-06, "loss": 0.2817, "step": 122 }, { "epoch": 0.5970873786407767, "grad_norm": 3.2904708922601773, "learning_rate": 7.957402247915817e-06, "loss": 0.197, "step": 123 }, { "epoch": 0.6019417475728155, "grad_norm": 3.1357747776476255, "learning_rate": 7.92657473707621e-06, "loss": 0.2145, "step": 124 }, { "epoch": 0.6067961165048543, "grad_norm": 3.6177514931559265, "learning_rate": 7.895577064139847e-06, "loss": 0.325, "step": 125 }, { "epoch": 0.6116504854368932, "grad_norm": 4.1754135650788635, "learning_rate": 7.864411031428379e-06, "loss": 0.3291, "step": 126 }, { "epoch": 0.616504854368932, "grad_norm": 3.338577622252916, "learning_rate": 7.833078451052537e-06, "loss": 0.2303, "step": 127 }, { "epoch": 0.6213592233009708, "grad_norm": 3.6885915388120294, "learning_rate": 7.801581144806752e-06, "loss": 0.3228, "step": 128 }, { "epoch": 0.6262135922330098, "grad_norm": 3.0719056875110073, "learning_rate": 7.769920944063244e-06, "loss": 0.2162, "step": 129 }, { "epoch": 0.6310679611650486, "grad_norm": 3.6862260675136285, "learning_rate": 7.73809968966554e-06, "loss": 0.2106, "step": 130 }, { "epoch": 0.6359223300970874, "grad_norm": 3.7442123547218875, "learning_rate": 7.706119231821423e-06, "loss": 0.3161, "step": 131 }, { "epoch": 0.6407766990291263, "grad_norm": 3.697501889309697, "learning_rate": 7.673981429995372e-06, "loss": 0.2502, "step": 132 }, { "epoch": 0.6456310679611651, "grad_norm": 3.5123393681025474, "learning_rate": 7.641688152800433e-06, "loss": 0.2766, "step": 133 }, { "epoch": 0.6504854368932039, "grad_norm": 3.9392716071246174, "learning_rate": 7.609241277889583e-06, "loss": 0.296, "step": 134 }, { "epoch": 0.6553398058252428, "grad_norm": 3.6767767960640056, "learning_rate": 7.5766426918465455e-06, "loss": 0.3197, "step": 135 }, { "epoch": 0.6601941747572816, "grad_norm": 3.7839155942019223, "learning_rate": 7.5438942900761035e-06, "loss": 0.2802, "step": 136 }, { "epoch": 0.6650485436893204, "grad_norm": 3.5941291157313353, "learning_rate": 7.51099797669389e-06, "loss": 0.2465, "step": 137 }, { "epoch": 0.6699029126213593, "grad_norm": 3.6060034184126213, "learning_rate": 7.477955664415678e-06, "loss": 0.2695, "step": 138 }, { "epoch": 0.6747572815533981, "grad_norm": 3.3929342003666, "learning_rate": 7.444769274446168e-06, "loss": 0.2645, "step": 139 }, { "epoch": 0.6796116504854369, "grad_norm": 4.0533204570914965, "learning_rate": 7.411440736367281e-06, "loss": 0.2825, "step": 140 }, { "epoch": 0.6844660194174758, "grad_norm": 3.0832937155475793, "learning_rate": 7.377971988025964e-06, "loss": 0.2237, "step": 141 }, { "epoch": 0.6893203883495146, "grad_norm": 3.9971945752703517, "learning_rate": 7.3443649754215175e-06, "loss": 0.2797, "step": 142 }, { "epoch": 0.6941747572815534, "grad_norm": 3.8928124235357906, "learning_rate": 7.310621652592449e-06, "loss": 0.2815, "step": 143 }, { "epoch": 0.6990291262135923, "grad_norm": 3.5346035227407087, "learning_rate": 7.276743981502856e-06, "loss": 0.2498, "step": 144 }, { "epoch": 0.7038834951456311, "grad_norm": 3.4944240536899684, "learning_rate": 7.242733931928352e-06, "loss": 0.2945, "step": 145 }, { "epoch": 0.7087378640776699, "grad_norm": 3.5146865308836572, "learning_rate": 7.208593481341536e-06, "loss": 0.3818, "step": 146 }, { "epoch": 0.7135922330097088, "grad_norm": 3.211250797034866, "learning_rate": 7.1743246147970095e-06, "loss": 0.232, "step": 147 }, { "epoch": 0.7184466019417476, "grad_norm": 3.7238705095588824, "learning_rate": 7.139929324815965e-06, "loss": 0.352, "step": 148 }, { "epoch": 0.7233009708737864, "grad_norm": 2.9895962744160967, "learning_rate": 7.105409611270332e-06, "loss": 0.203, "step": 149 }, { "epoch": 0.7281553398058253, "grad_norm": 3.101083280074575, "learning_rate": 7.070767481266493e-06, "loss": 0.3131, "step": 150 }, { "epoch": 0.7330097087378641, "grad_norm": 3.949644807643613, "learning_rate": 7.036004949028587e-06, "loss": 0.2611, "step": 151 }, { "epoch": 0.7378640776699029, "grad_norm": 3.9175533296673164, "learning_rate": 7.00112403578139e-06, "loss": 0.3112, "step": 152 }, { "epoch": 0.7427184466019418, "grad_norm": 3.7852971264684054, "learning_rate": 6.9661267696328015e-06, "loss": 0.3705, "step": 153 }, { "epoch": 0.7475728155339806, "grad_norm": 3.9359500023027403, "learning_rate": 6.931015185455915e-06, "loss": 0.2571, "step": 154 }, { "epoch": 0.7524271844660194, "grad_norm": 3.9650169395954764, "learning_rate": 6.895791324770702e-06, "loss": 0.3296, "step": 155 }, { "epoch": 0.7572815533980582, "grad_norm": 3.7958450244217086, "learning_rate": 6.860457235625322e-06, "loss": 0.2469, "step": 156 }, { "epoch": 0.7621359223300971, "grad_norm": 3.496168181560017, "learning_rate": 6.825014972477024e-06, "loss": 0.2956, "step": 157 }, { "epoch": 0.7669902912621359, "grad_norm": 4.071530235783808, "learning_rate": 6.7894665960727105e-06, "loss": 0.3203, "step": 158 }, { "epoch": 0.7718446601941747, "grad_norm": 3.8783000918837347, "learning_rate": 6.7538141733291e-06, "loss": 0.2717, "step": 159 }, { "epoch": 0.7766990291262136, "grad_norm": 4.46769398061441, "learning_rate": 6.7180597772125665e-06, "loss": 0.3234, "step": 160 }, { "epoch": 0.7815533980582524, "grad_norm": 4.010952710998782, "learning_rate": 6.682205486618592e-06, "loss": 0.3088, "step": 161 }, { "epoch": 0.7864077669902912, "grad_norm": 3.9302028177879405, "learning_rate": 6.646253386250909e-06, "loss": 0.4054, "step": 162 }, { "epoch": 0.7912621359223301, "grad_norm": 3.4582904819046916, "learning_rate": 6.610205566500272e-06, "loss": 0.2608, "step": 163 }, { "epoch": 0.7961165048543689, "grad_norm": 3.199594645551734, "learning_rate": 6.574064123322925e-06, "loss": 0.2666, "step": 164 }, { "epoch": 0.8009708737864077, "grad_norm": 3.320187675357568, "learning_rate": 6.537831158118733e-06, "loss": 0.2775, "step": 165 }, { "epoch": 0.8058252427184466, "grad_norm": 3.7609182752449537, "learning_rate": 6.50150877760899e-06, "loss": 0.281, "step": 166 }, { "epoch": 0.8106796116504854, "grad_norm": 5.024679499746106, "learning_rate": 6.465099093713944e-06, "loss": 0.3759, "step": 167 }, { "epoch": 0.8155339805825242, "grad_norm": 2.948077935889249, "learning_rate": 6.42860422342998e-06, "loss": 0.2413, "step": 168 }, { "epoch": 0.8203883495145631, "grad_norm": 3.2850125733214726, "learning_rate": 6.392026288706549e-06, "loss": 0.1929, "step": 169 }, { "epoch": 0.8252427184466019, "grad_norm": 3.0226385324656526, "learning_rate": 6.3553674163227786e-06, "loss": 0.2279, "step": 170 }, { "epoch": 0.8300970873786407, "grad_norm": 2.936287369104951, "learning_rate": 6.318629737763818e-06, "loss": 0.2668, "step": 171 }, { "epoch": 0.8349514563106796, "grad_norm": 3.8997788814405396, "learning_rate": 6.281815389096903e-06, "loss": 0.3259, "step": 172 }, { "epoch": 0.8398058252427184, "grad_norm": 3.530614371411849, "learning_rate": 6.244926510847162e-06, "loss": 0.2793, "step": 173 }, { "epoch": 0.8446601941747572, "grad_norm": 3.7867057004141755, "learning_rate": 6.207965247873151e-06, "loss": 0.255, "step": 174 }, { "epoch": 0.8495145631067961, "grad_norm": 3.0038206006812365, "learning_rate": 6.1709337492421515e-06, "loss": 0.1763, "step": 175 }, { "epoch": 0.8543689320388349, "grad_norm": 3.8658961879244633, "learning_rate": 6.133834168105206e-06, "loss": 0.2924, "step": 176 }, { "epoch": 0.8592233009708737, "grad_norm": 3.3929339966964687, "learning_rate": 6.096668661571934e-06, "loss": 0.2814, "step": 177 }, { "epoch": 0.8640776699029126, "grad_norm": 3.9053323690312154, "learning_rate": 6.0594393905851065e-06, "loss": 0.3558, "step": 178 }, { "epoch": 0.8689320388349514, "grad_norm": 4.065248166642409, "learning_rate": 6.0221485197949995e-06, "loss": 0.321, "step": 179 }, { "epoch": 0.8737864077669902, "grad_norm": 4.018682087536964, "learning_rate": 5.9847982174335314e-06, "loss": 0.3449, "step": 180 }, { "epoch": 0.8786407766990292, "grad_norm": 3.7072669540254424, "learning_rate": 5.9473906551881985e-06, "loss": 0.3404, "step": 181 }, { "epoch": 0.883495145631068, "grad_norm": 3.956319759837972, "learning_rate": 5.9099280080758085e-06, "loss": 0.3168, "step": 182 }, { "epoch": 0.8883495145631068, "grad_norm": 3.5236623822055972, "learning_rate": 5.872412454315999e-06, "loss": 0.2937, "step": 183 }, { "epoch": 0.8932038834951457, "grad_norm": 4.08661843731342, "learning_rate": 5.834846175204612e-06, "loss": 0.3673, "step": 184 }, { "epoch": 0.8980582524271845, "grad_norm": 4.079532903207089, "learning_rate": 5.797231354986842e-06, "loss": 0.3794, "step": 185 }, { "epoch": 0.9029126213592233, "grad_norm": 2.988013828916688, "learning_rate": 5.759570180730255e-06, "loss": 0.2965, "step": 186 }, { "epoch": 0.9077669902912622, "grad_norm": 3.818186698005153, "learning_rate": 5.721864842197612e-06, "loss": 0.3281, "step": 187 }, { "epoch": 0.912621359223301, "grad_norm": 4.10589448860243, "learning_rate": 5.684117531719552e-06, "loss": 0.362, "step": 188 }, { "epoch": 0.9174757281553398, "grad_norm": 3.955293523567552, "learning_rate": 5.646330444067121e-06, "loss": 0.2393, "step": 189 }, { "epoch": 0.9223300970873787, "grad_norm": 3.175363653603869, "learning_rate": 5.608505776324158e-06, "loss": 0.2132, "step": 190 }, { "epoch": 0.9271844660194175, "grad_norm": 3.6675370281933524, "learning_rate": 5.570645727759558e-06, "loss": 0.3137, "step": 191 }, { "epoch": 0.9320388349514563, "grad_norm": 3.3657417279174933, "learning_rate": 5.532752499699381e-06, "loss": 0.3126, "step": 192 }, { "epoch": 0.9368932038834952, "grad_norm": 3.237869833517625, "learning_rate": 5.494828295398874e-06, "loss": 0.2674, "step": 193 }, { "epoch": 0.941747572815534, "grad_norm": 2.68498880740092, "learning_rate": 5.456875319914355e-06, "loss": 0.2058, "step": 194 }, { "epoch": 0.9466019417475728, "grad_norm": 3.819629336681082, "learning_rate": 5.4188957799750145e-06, "loss": 0.243, "step": 195 }, { "epoch": 0.9514563106796117, "grad_norm": 3.8190776716072534, "learning_rate": 5.380891883854591e-06, "loss": 0.2602, "step": 196 }, { "epoch": 0.9563106796116505, "grad_norm": 2.9642710722914516, "learning_rate": 5.34286584124299e-06, "loss": 0.2278, "step": 197 }, { "epoch": 0.9611650485436893, "grad_norm": 3.3934382925472657, "learning_rate": 5.304819863117796e-06, "loss": 0.2339, "step": 198 }, { "epoch": 0.9660194174757282, "grad_norm": 3.271431326651924, "learning_rate": 5.266756161615719e-06, "loss": 0.2711, "step": 199 }, { "epoch": 0.970873786407767, "grad_norm": 3.8927044313728536, "learning_rate": 5.228676949903974e-06, "loss": 0.2698, "step": 200 }, { "epoch": 0.9757281553398058, "grad_norm": 3.7275186678956187, "learning_rate": 5.190584442051594e-06, "loss": 0.2928, "step": 201 }, { "epoch": 0.9805825242718447, "grad_norm": 3.2966754991822804, "learning_rate": 5.1524808529007075e-06, "loss": 0.3131, "step": 202 }, { "epoch": 0.9854368932038835, "grad_norm": 3.7881489155550474, "learning_rate": 5.114368397937744e-06, "loss": 0.3464, "step": 203 }, { "epoch": 0.9902912621359223, "grad_norm": 2.998833922835965, "learning_rate": 5.07624929316463e-06, "loss": 0.2758, "step": 204 }, { "epoch": 0.9951456310679612, "grad_norm": 3.6967922664311335, "learning_rate": 5.038125754969933e-06, "loss": 0.2301, "step": 205 }, { "epoch": 1.0, "grad_norm": 2.9155000406381153, "learning_rate": 5e-06, "loss": 0.1539, "step": 206 }, { "epoch": 1.0048543689320388, "grad_norm": 3.160746824251009, "learning_rate": 4.9618742450300675e-06, "loss": 0.2132, "step": 207 }, { "epoch": 1.0097087378640777, "grad_norm": 3.0418689426792342, "learning_rate": 4.923750706835371e-06, "loss": 0.1767, "step": 208 }, { "epoch": 1.0145631067961165, "grad_norm": 2.6849992545863173, "learning_rate": 4.8856316020622564e-06, "loss": 0.1792, "step": 209 }, { "epoch": 1.0194174757281553, "grad_norm": 2.2979064230600192, "learning_rate": 4.847519147099294e-06, "loss": 0.1343, "step": 210 }, { "epoch": 1.0242718446601942, "grad_norm": 2.6056612623080615, "learning_rate": 4.809415557948407e-06, "loss": 0.1668, "step": 211 }, { "epoch": 1.029126213592233, "grad_norm": 2.3802666974571083, "learning_rate": 4.771323050096028e-06, "loss": 0.1512, "step": 212 }, { "epoch": 1.0339805825242718, "grad_norm": 3.332832921104355, "learning_rate": 4.733243838384282e-06, "loss": 0.179, "step": 213 }, { "epoch": 1.0388349514563107, "grad_norm": 3.1500911182937634, "learning_rate": 4.6951801368822055e-06, "loss": 0.2166, "step": 214 }, { "epoch": 1.0436893203883495, "grad_norm": 3.18376749194062, "learning_rate": 4.6571341587570114e-06, "loss": 0.1907, "step": 215 }, { "epoch": 1.0485436893203883, "grad_norm": 2.816023208795076, "learning_rate": 4.619108116145411e-06, "loss": 0.2005, "step": 216 }, { "epoch": 1.0533980582524272, "grad_norm": 2.689617289657561, "learning_rate": 4.581104220024988e-06, "loss": 0.1813, "step": 217 }, { "epoch": 1.058252427184466, "grad_norm": 3.3163262564747673, "learning_rate": 4.5431246800856455e-06, "loss": 0.1906, "step": 218 }, { "epoch": 1.0631067961165048, "grad_norm": 3.8653401148954725, "learning_rate": 4.505171704601128e-06, "loss": 0.1612, "step": 219 }, { "epoch": 1.0679611650485437, "grad_norm": 2.5726914256941726, "learning_rate": 4.467247500300621e-06, "loss": 0.1417, "step": 220 }, { "epoch": 1.0728155339805825, "grad_norm": 3.0988358997625545, "learning_rate": 4.4293542722404435e-06, "loss": 0.1351, "step": 221 }, { "epoch": 1.0776699029126213, "grad_norm": 5.097669072695037, "learning_rate": 4.391494223675843e-06, "loss": 0.2117, "step": 222 }, { "epoch": 1.0825242718446602, "grad_norm": 3.157416027324999, "learning_rate": 4.3536695559328816e-06, "loss": 0.1451, "step": 223 }, { "epoch": 1.087378640776699, "grad_norm": 3.975842856248664, "learning_rate": 4.31588246828045e-06, "loss": 0.2091, "step": 224 }, { "epoch": 1.0922330097087378, "grad_norm": 3.597822643374925, "learning_rate": 4.278135157802389e-06, "loss": 0.1894, "step": 225 }, { "epoch": 1.0970873786407767, "grad_norm": 3.218034380893191, "learning_rate": 4.240429819269746e-06, "loss": 0.1474, "step": 226 }, { "epoch": 1.1019417475728155, "grad_norm": 3.5515662293472476, "learning_rate": 4.20276864501316e-06, "loss": 0.1401, "step": 227 }, { "epoch": 1.1067961165048543, "grad_norm": 3.772426002613653, "learning_rate": 4.165153824795391e-06, "loss": 0.1716, "step": 228 }, { "epoch": 1.1116504854368932, "grad_norm": 2.9968999443352495, "learning_rate": 4.127587545684002e-06, "loss": 0.1477, "step": 229 }, { "epoch": 1.116504854368932, "grad_norm": 2.7933599680953582, "learning_rate": 4.090071991924194e-06, "loss": 0.1312, "step": 230 }, { "epoch": 1.1213592233009708, "grad_norm": 3.195519529407199, "learning_rate": 4.052609344811802e-06, "loss": 0.1402, "step": 231 }, { "epoch": 1.1262135922330097, "grad_norm": 3.025602057001244, "learning_rate": 4.015201782566471e-06, "loss": 0.1638, "step": 232 }, { "epoch": 1.1310679611650485, "grad_norm": 3.3907224396342843, "learning_rate": 3.977851480205003e-06, "loss": 0.194, "step": 233 }, { "epoch": 1.1359223300970873, "grad_norm": 2.9844220687429956, "learning_rate": 3.940560609414894e-06, "loss": 0.1192, "step": 234 }, { "epoch": 1.1407766990291262, "grad_norm": 3.061417871679989, "learning_rate": 3.903331338428067e-06, "loss": 0.1441, "step": 235 }, { "epoch": 1.145631067961165, "grad_norm": 2.743553105714314, "learning_rate": 3.866165831894796e-06, "loss": 0.1386, "step": 236 }, { "epoch": 1.1504854368932038, "grad_norm": 3.0920897077283933, "learning_rate": 3.829066250757851e-06, "loss": 0.1746, "step": 237 }, { "epoch": 1.1553398058252426, "grad_norm": 3.386823524425614, "learning_rate": 3.7920347521268514e-06, "loss": 0.1631, "step": 238 }, { "epoch": 1.1601941747572815, "grad_norm": 3.107558545193331, "learning_rate": 3.7550734891528413e-06, "loss": 0.1573, "step": 239 }, { "epoch": 1.1650485436893203, "grad_norm": 3.6654578779660074, "learning_rate": 3.7181846109031007e-06, "loss": 0.139, "step": 240 }, { "epoch": 1.1699029126213591, "grad_norm": 2.650567446300332, "learning_rate": 3.6813702622361858e-06, "loss": 0.1299, "step": 241 }, { "epoch": 1.174757281553398, "grad_norm": 2.972969049243631, "learning_rate": 3.6446325836772244e-06, "loss": 0.1272, "step": 242 }, { "epoch": 1.1796116504854368, "grad_norm": 3.534971712900453, "learning_rate": 3.6079737112934533e-06, "loss": 0.1935, "step": 243 }, { "epoch": 1.1844660194174756, "grad_norm": 3.6281821640978857, "learning_rate": 3.5713957765700224e-06, "loss": 0.1467, "step": 244 }, { "epoch": 1.1893203883495145, "grad_norm": 3.088842207924769, "learning_rate": 3.5349009062860586e-06, "loss": 0.1561, "step": 245 }, { "epoch": 1.1941747572815533, "grad_norm": 3.652766813505308, "learning_rate": 3.4984912223910105e-06, "loss": 0.155, "step": 246 }, { "epoch": 1.1990291262135921, "grad_norm": 3.321232797822304, "learning_rate": 3.46216884188127e-06, "loss": 0.1561, "step": 247 }, { "epoch": 1.203883495145631, "grad_norm": 3.30905090399823, "learning_rate": 3.425935876677077e-06, "loss": 0.1601, "step": 248 }, { "epoch": 1.2087378640776698, "grad_norm": 3.103878919477987, "learning_rate": 3.38979443349973e-06, "loss": 0.1252, "step": 249 }, { "epoch": 1.2135922330097086, "grad_norm": 3.356888557035591, "learning_rate": 3.3537466137490937e-06, "loss": 0.1448, "step": 250 }, { "epoch": 1.2184466019417475, "grad_norm": 3.0108989952760807, "learning_rate": 3.3177945133814093e-06, "loss": 0.1873, "step": 251 }, { "epoch": 1.2233009708737863, "grad_norm": 3.442016897902175, "learning_rate": 3.2819402227874364e-06, "loss": 0.1653, "step": 252 }, { "epoch": 1.2281553398058254, "grad_norm": 3.5087016303375957, "learning_rate": 3.2461858266709017e-06, "loss": 0.1502, "step": 253 }, { "epoch": 1.233009708737864, "grad_norm": 2.43670383738927, "learning_rate": 3.2105334039272924e-06, "loss": 0.1352, "step": 254 }, { "epoch": 1.237864077669903, "grad_norm": 2.8425962923403625, "learning_rate": 3.1749850275229777e-06, "loss": 0.1594, "step": 255 }, { "epoch": 1.2427184466019416, "grad_norm": 3.077957685699968, "learning_rate": 3.1395427643746802e-06, "loss": 0.183, "step": 256 }, { "epoch": 1.2475728155339807, "grad_norm": 2.5809573718739824, "learning_rate": 3.1042086752292995e-06, "loss": 0.1136, "step": 257 }, { "epoch": 1.2524271844660193, "grad_norm": 3.571997073430392, "learning_rate": 3.068984814544087e-06, "loss": 0.1882, "step": 258 }, { "epoch": 1.2572815533980584, "grad_norm": 3.4145265255516066, "learning_rate": 3.0338732303671993e-06, "loss": 0.141, "step": 259 }, { "epoch": 1.262135922330097, "grad_norm": 3.4004738904261487, "learning_rate": 2.99887596421861e-06, "loss": 0.1603, "step": 260 }, { "epoch": 1.266990291262136, "grad_norm": 3.4843136793524963, "learning_rate": 2.9639950509714138e-06, "loss": 0.2057, "step": 261 }, { "epoch": 1.2718446601941746, "grad_norm": 3.3436744946616113, "learning_rate": 2.929232518733507e-06, "loss": 0.2166, "step": 262 }, { "epoch": 1.2766990291262137, "grad_norm": 3.344656422814064, "learning_rate": 2.8945903887296686e-06, "loss": 0.1758, "step": 263 }, { "epoch": 1.2815533980582523, "grad_norm": 2.8475576506179743, "learning_rate": 2.860070675184036e-06, "loss": 0.133, "step": 264 }, { "epoch": 1.2864077669902914, "grad_norm": 3.5534592241145377, "learning_rate": 2.8256753852029917e-06, "loss": 0.1742, "step": 265 }, { "epoch": 1.29126213592233, "grad_norm": 3.1200922849065855, "learning_rate": 2.7914065186584637e-06, "loss": 0.1483, "step": 266 }, { "epoch": 1.296116504854369, "grad_norm": 3.780533229871125, "learning_rate": 2.757266068071648e-06, "loss": 0.1625, "step": 267 }, { "epoch": 1.3009708737864076, "grad_norm": 4.3083395598114125, "learning_rate": 2.7232560184971437e-06, "loss": 0.2069, "step": 268 }, { "epoch": 1.3058252427184467, "grad_norm": 3.1950473988546557, "learning_rate": 2.689378347407553e-06, "loss": 0.1695, "step": 269 }, { "epoch": 1.3106796116504853, "grad_norm": 3.0979133207909606, "learning_rate": 2.6556350245784833e-06, "loss": 0.1309, "step": 270 }, { "epoch": 1.3155339805825244, "grad_norm": 3.2528404783896336, "learning_rate": 2.6220280119740376e-06, "loss": 0.1952, "step": 271 }, { "epoch": 1.3203883495145632, "grad_norm": 3.310872943853271, "learning_rate": 2.588559263632719e-06, "loss": 0.2049, "step": 272 }, { "epoch": 1.325242718446602, "grad_norm": 3.0721231775877813, "learning_rate": 2.555230725553832e-06, "loss": 0.142, "step": 273 }, { "epoch": 1.3300970873786409, "grad_norm": 2.9785535387157247, "learning_rate": 2.522044335584322e-06, "loss": 0.1528, "step": 274 }, { "epoch": 1.3349514563106797, "grad_norm": 4.282120496591909, "learning_rate": 2.489002023306112e-06, "loss": 0.1974, "step": 275 }, { "epoch": 1.3398058252427185, "grad_norm": 3.7181553620275825, "learning_rate": 2.4561057099238973e-06, "loss": 0.2049, "step": 276 }, { "epoch": 1.3446601941747574, "grad_norm": 4.210693735284902, "learning_rate": 2.423357308153454e-06, "loss": 0.1696, "step": 277 }, { "epoch": 1.3495145631067962, "grad_norm": 2.3867800030064004, "learning_rate": 2.390758722110418e-06, "loss": 0.1194, "step": 278 }, { "epoch": 1.354368932038835, "grad_norm": 2.909759922700865, "learning_rate": 2.358311847199567e-06, "loss": 0.1197, "step": 279 }, { "epoch": 1.3592233009708738, "grad_norm": 2.98018554930524, "learning_rate": 2.3260185700046295e-06, "loss": 0.1513, "step": 280 }, { "epoch": 1.3640776699029127, "grad_norm": 2.892740767816702, "learning_rate": 2.2938807681785764e-06, "loss": 0.1511, "step": 281 }, { "epoch": 1.3689320388349515, "grad_norm": 3.473855584309103, "learning_rate": 2.2619003103344607e-06, "loss": 0.182, "step": 282 }, { "epoch": 1.3737864077669903, "grad_norm": 3.1980427152381665, "learning_rate": 2.2300790559367553e-06, "loss": 0.1432, "step": 283 }, { "epoch": 1.3786407766990292, "grad_norm": 3.5392346079919945, "learning_rate": 2.1984188551932513e-06, "loss": 0.1345, "step": 284 }, { "epoch": 1.383495145631068, "grad_norm": 4.016339646721385, "learning_rate": 2.166921548947466e-06, "loss": 0.2307, "step": 285 }, { "epoch": 1.3883495145631068, "grad_norm": 3.570422748813042, "learning_rate": 2.1355889685716225e-06, "loss": 0.1729, "step": 286 }, { "epoch": 1.3932038834951457, "grad_norm": 4.156547877651326, "learning_rate": 2.1044229358601543e-06, "loss": 0.1883, "step": 287 }, { "epoch": 1.3980582524271845, "grad_norm": 3.1064366188701977, "learning_rate": 2.0734252629237892e-06, "loss": 0.1423, "step": 288 }, { "epoch": 1.4029126213592233, "grad_norm": 2.782671582823821, "learning_rate": 2.0425977520841837e-06, "loss": 0.1241, "step": 289 }, { "epoch": 1.4077669902912622, "grad_norm": 3.5498417247773224, "learning_rate": 2.011942195769122e-06, "loss": 0.1754, "step": 290 }, { "epoch": 1.412621359223301, "grad_norm": 3.071004935839173, "learning_rate": 1.9814603764083112e-06, "loss": 0.1557, "step": 291 }, { "epoch": 1.4174757281553398, "grad_norm": 4.178610662239688, "learning_rate": 1.9511540663297284e-06, "loss": 0.2131, "step": 292 }, { "epoch": 1.4223300970873787, "grad_norm": 2.9982512910179913, "learning_rate": 1.921025027656587e-06, "loss": 0.1728, "step": 293 }, { "epoch": 1.4271844660194175, "grad_norm": 3.2315558751551055, "learning_rate": 1.8910750122048638e-06, "loss": 0.1412, "step": 294 }, { "epoch": 1.4320388349514563, "grad_norm": 3.1567815069996255, "learning_rate": 1.8613057613814584e-06, "loss": 0.1491, "step": 295 }, { "epoch": 1.4368932038834952, "grad_norm": 3.3124376510065288, "learning_rate": 1.8317190060829242e-06, "loss": 0.1508, "step": 296 }, { "epoch": 1.441747572815534, "grad_norm": 3.323200849660782, "learning_rate": 1.8023164665948455e-06, "loss": 0.1769, "step": 297 }, { "epoch": 1.4466019417475728, "grad_norm": 3.493426451745033, "learning_rate": 1.773099852491796e-06, "loss": 0.2209, "step": 298 }, { "epoch": 1.4514563106796117, "grad_norm": 3.3002947612510223, "learning_rate": 1.7440708625379503e-06, "loss": 0.1763, "step": 299 }, { "epoch": 1.4563106796116505, "grad_norm": 4.0483215116936115, "learning_rate": 1.7152311845883096e-06, "loss": 0.2317, "step": 300 }, { "epoch": 1.4611650485436893, "grad_norm": 3.60771860269156, "learning_rate": 1.686582495490554e-06, "loss": 0.2104, "step": 301 }, { "epoch": 1.4660194174757282, "grad_norm": 3.217357555194152, "learning_rate": 1.658126460987558e-06, "loss": 0.1243, "step": 302 }, { "epoch": 1.470873786407767, "grad_norm": 3.7823925040159305, "learning_rate": 1.6298647356205255e-06, "loss": 0.2052, "step": 303 }, { "epoch": 1.4757281553398058, "grad_norm": 2.9934670845224405, "learning_rate": 1.601798962632799e-06, "loss": 0.1361, "step": 304 }, { "epoch": 1.4805825242718447, "grad_norm": 3.2821411131872353, "learning_rate": 1.573930773874306e-06, "loss": 0.133, "step": 305 }, { "epoch": 1.4854368932038835, "grad_norm": 3.7773232759540267, "learning_rate": 1.5462617897066863e-06, "loss": 0.1646, "step": 306 }, { "epoch": 1.4902912621359223, "grad_norm": 2.638001155212348, "learning_rate": 1.5187936189090668e-06, "loss": 0.1514, "step": 307 }, { "epoch": 1.4951456310679612, "grad_norm": 3.2252721259012813, "learning_rate": 1.491527858584535e-06, "loss": 0.167, "step": 308 }, { "epoch": 1.5, "grad_norm": 3.0181049914072755, "learning_rate": 1.4644660940672628e-06, "loss": 0.1381, "step": 309 }, { "epoch": 1.5048543689320388, "grad_norm": 3.1248799414839463, "learning_rate": 1.4376098988303406e-06, "loss": 0.1348, "step": 310 }, { "epoch": 1.5097087378640777, "grad_norm": 2.908615485767901, "learning_rate": 1.4109608343942855e-06, "loss": 0.1372, "step": 311 }, { "epoch": 1.5145631067961165, "grad_norm": 3.121802512544766, "learning_rate": 1.3845204502362442e-06, "loss": 0.1667, "step": 312 }, { "epoch": 1.5194174757281553, "grad_norm": 2.9993671273260976, "learning_rate": 1.35829028369991e-06, "loss": 0.1464, "step": 313 }, { "epoch": 1.5242718446601942, "grad_norm": 3.00801424867595, "learning_rate": 1.3322718599061252e-06, "loss": 0.1428, "step": 314 }, { "epoch": 1.529126213592233, "grad_norm": 4.127922499278938, "learning_rate": 1.306466691664216e-06, "loss": 0.1783, "step": 315 }, { "epoch": 1.5339805825242718, "grad_norm": 3.592347640448891, "learning_rate": 1.28087627938402e-06, "loss": 0.1598, "step": 316 }, { "epoch": 1.5388349514563107, "grad_norm": 3.558225491994999, "learning_rate": 1.2555021109886589e-06, "loss": 0.1952, "step": 317 }, { "epoch": 1.5436893203883495, "grad_norm": 3.127420311053151, "learning_rate": 1.2303456618280141e-06, "loss": 0.1511, "step": 318 }, { "epoch": 1.5485436893203883, "grad_norm": 2.876495501889026, "learning_rate": 1.2054083945929534e-06, "loss": 0.1576, "step": 319 }, { "epoch": 1.5533980582524272, "grad_norm": 2.5492208265128453, "learning_rate": 1.1806917592302763e-06, "loss": 0.134, "step": 320 }, { "epoch": 1.558252427184466, "grad_norm": 3.3584196006751066, "learning_rate": 1.1561971928584158e-06, "loss": 0.1472, "step": 321 }, { "epoch": 1.5631067961165048, "grad_norm": 2.770182811864934, "learning_rate": 1.1319261196838782e-06, "loss": 0.1324, "step": 322 }, { "epoch": 1.5679611650485437, "grad_norm": 4.2223887187459646, "learning_rate": 1.1078799509184246e-06, "loss": 0.1668, "step": 323 }, { "epoch": 1.5728155339805825, "grad_norm": 4.620366826697862, "learning_rate": 1.0840600846970333e-06, "loss": 0.1799, "step": 324 }, { "epoch": 1.5776699029126213, "grad_norm": 3.0809327209146664, "learning_rate": 1.0604679059965923e-06, "loss": 0.1589, "step": 325 }, { "epoch": 1.5825242718446602, "grad_norm": 3.2154407663447078, "learning_rate": 1.0371047865553847e-06, "loss": 0.1799, "step": 326 }, { "epoch": 1.587378640776699, "grad_norm": 2.9047733485894742, "learning_rate": 1.0139720847933166e-06, "loss": 0.1316, "step": 327 }, { "epoch": 1.5922330097087378, "grad_norm": 3.186537335786956, "learning_rate": 9.91071145732948e-07, "loss": 0.1642, "step": 328 }, { "epoch": 1.5970873786407767, "grad_norm": 3.3463671892570055, "learning_rate": 9.684033009212752e-07, "loss": 0.1673, "step": 329 }, { "epoch": 1.6019417475728155, "grad_norm": 3.326248524005417, "learning_rate": 9.459698683523205e-07, "loss": 0.1689, "step": 330 }, { "epoch": 1.6067961165048543, "grad_norm": 3.925279416638482, "learning_rate": 9.237721523904891e-07, "loss": 0.1927, "step": 331 }, { "epoch": 1.6116504854368932, "grad_norm": 2.42951528739867, "learning_rate": 9.018114436947373e-07, "loss": 0.1466, "step": 332 }, { "epoch": 1.616504854368932, "grad_norm": 3.615506822019659, "learning_rate": 8.80089019143524e-07, "loss": 0.1884, "step": 333 }, { "epoch": 1.6213592233009708, "grad_norm": 3.066801562501715, "learning_rate": 8.586061417605668e-07, "loss": 0.1003, "step": 334 }, { "epoch": 1.6262135922330097, "grad_norm": 3.1728658929193037, "learning_rate": 8.373640606414097e-07, "loss": 0.1365, "step": 335 }, { "epoch": 1.6310679611650487, "grad_norm": 3.5080640459425783, "learning_rate": 8.163640108807897e-07, "loss": 0.1955, "step": 336 }, { "epoch": 1.6359223300970873, "grad_norm": 3.0612094441044784, "learning_rate": 7.956072135008336e-07, "loss": 0.1867, "step": 337 }, { "epoch": 1.6407766990291264, "grad_norm": 3.2371155663756745, "learning_rate": 7.750948753800508e-07, "loss": 0.1652, "step": 338 }, { "epoch": 1.645631067961165, "grad_norm": 3.5749411765431556, "learning_rate": 7.548281891831715e-07, "loss": 0.1496, "step": 339 }, { "epoch": 1.650485436893204, "grad_norm": 3.4791888939963185, "learning_rate": 7.348083332917927e-07, "loss": 0.1601, "step": 340 }, { "epoch": 1.6553398058252426, "grad_norm": 4.358974152025139, "learning_rate": 7.150364717358699e-07, "loss": 0.1873, "step": 341 }, { "epoch": 1.6601941747572817, "grad_norm": 3.0456599088485574, "learning_rate": 6.955137541260287e-07, "loss": 0.1445, "step": 342 }, { "epoch": 1.6650485436893203, "grad_norm": 2.580646507167624, "learning_rate": 6.762413155867276e-07, "loss": 0.1344, "step": 343 }, { "epoch": 1.6699029126213594, "grad_norm": 4.024482777021098, "learning_rate": 6.572202766902569e-07, "loss": 0.1893, "step": 344 }, { "epoch": 1.674757281553398, "grad_norm": 2.735113527380298, "learning_rate": 6.384517433915794e-07, "loss": 0.1289, "step": 345 }, { "epoch": 1.679611650485437, "grad_norm": 3.350448090422375, "learning_rate": 6.199368069640343e-07, "loss": 0.1888, "step": 346 }, { "epoch": 1.6844660194174756, "grad_norm": 3.39806465405521, "learning_rate": 6.016765439358774e-07, "loss": 0.2168, "step": 347 }, { "epoch": 1.6893203883495147, "grad_norm": 3.725698517147329, "learning_rate": 5.836720160276971e-07, "loss": 0.1857, "step": 348 }, { "epoch": 1.6941747572815533, "grad_norm": 3.2035012408970065, "learning_rate": 5.659242700906719e-07, "loss": 0.1848, "step": 349 }, { "epoch": 1.6990291262135924, "grad_norm": 3.1518355692904216, "learning_rate": 5.484343380457124e-07, "loss": 0.1715, "step": 350 }, { "epoch": 1.703883495145631, "grad_norm": 2.8978576944112064, "learning_rate": 5.312032368234527e-07, "loss": 0.1409, "step": 351 }, { "epoch": 1.70873786407767, "grad_norm": 3.079722015900227, "learning_rate": 5.1423196830513e-07, "loss": 0.1422, "step": 352 }, { "epoch": 1.7135922330097086, "grad_norm": 2.5366819603410353, "learning_rate": 4.975215192643246e-07, "loss": 0.1131, "step": 353 }, { "epoch": 1.7184466019417477, "grad_norm": 3.0419811158465437, "learning_rate": 4.81072861309591e-07, "loss": 0.1143, "step": 354 }, { "epoch": 1.7233009708737863, "grad_norm": 3.150078523799839, "learning_rate": 4.648869508279613e-07, "loss": 0.1529, "step": 355 }, { "epoch": 1.7281553398058254, "grad_norm": 3.207876915280642, "learning_rate": 4.4896472892933693e-07, "loss": 0.1787, "step": 356 }, { "epoch": 1.733009708737864, "grad_norm": 2.8136660182381577, "learning_rate": 4.333071213917722e-07, "loss": 0.135, "step": 357 }, { "epoch": 1.737864077669903, "grad_norm": 3.1058142254527223, "learning_rate": 4.179150386076425e-07, "loss": 0.1645, "step": 358 }, { "epoch": 1.7427184466019416, "grad_norm": 3.3736526733724643, "learning_rate": 4.027893755307144e-07, "loss": 0.1869, "step": 359 }, { "epoch": 1.7475728155339807, "grad_norm": 4.3267872425935225, "learning_rate": 3.8793101162410417e-07, "loss": 0.1727, "step": 360 }, { "epoch": 1.7524271844660193, "grad_norm": 3.3826748188563527, "learning_rate": 3.733408108091485e-07, "loss": 0.1516, "step": 361 }, { "epoch": 1.7572815533980584, "grad_norm": 3.5055071787105483, "learning_rate": 3.5901962141516975e-07, "loss": 0.185, "step": 362 }, { "epoch": 1.762135922330097, "grad_norm": 3.796717480941089, "learning_rate": 3.4496827613015206e-07, "loss": 0.1845, "step": 363 }, { "epoch": 1.766990291262136, "grad_norm": 3.4468868106396133, "learning_rate": 3.3118759195232273e-07, "loss": 0.1764, "step": 364 }, { "epoch": 1.7718446601941746, "grad_norm": 3.1691105859680317, "learning_rate": 3.176783701426528e-07, "loss": 0.1804, "step": 365 }, { "epoch": 1.7766990291262137, "grad_norm": 3.488051937713229, "learning_rate": 3.0444139617826605e-07, "loss": 0.1636, "step": 366 }, { "epoch": 1.7815533980582523, "grad_norm": 3.334948140025472, "learning_rate": 2.91477439706771e-07, "loss": 0.1932, "step": 367 }, { "epoch": 1.7864077669902914, "grad_norm": 3.311234660558749, "learning_rate": 2.787872545015069e-07, "loss": 0.1556, "step": 368 }, { "epoch": 1.79126213592233, "grad_norm": 3.2388837538588384, "learning_rate": 2.663715784177201e-07, "loss": 0.1729, "step": 369 }, { "epoch": 1.796116504854369, "grad_norm": 3.620040323770401, "learning_rate": 2.542311333496622e-07, "loss": 0.1464, "step": 370 }, { "epoch": 1.8009708737864076, "grad_norm": 3.3680326988023572, "learning_rate": 2.423666251886114e-07, "loss": 0.1732, "step": 371 }, { "epoch": 1.8058252427184467, "grad_norm": 3.328616374220285, "learning_rate": 2.307787437818365e-07, "loss": 0.1381, "step": 372 }, { "epoch": 1.8106796116504853, "grad_norm": 3.23043677982351, "learning_rate": 2.1946816289248163e-07, "loss": 0.1878, "step": 373 }, { "epoch": 1.8155339805825244, "grad_norm": 3.470034744928561, "learning_rate": 2.0843554016039326e-07, "loss": 0.1723, "step": 374 }, { "epoch": 1.820388349514563, "grad_norm": 3.7061503742223145, "learning_rate": 1.9768151706388016e-07, "loss": 0.1369, "step": 375 }, { "epoch": 1.825242718446602, "grad_norm": 3.7497130384555573, "learning_rate": 1.8720671888242058e-07, "loss": 0.2012, "step": 376 }, { "epoch": 1.8300970873786406, "grad_norm": 3.436876594074953, "learning_rate": 1.7701175466029895e-07, "loss": 0.1426, "step": 377 }, { "epoch": 1.8349514563106797, "grad_norm": 3.624404980222278, "learning_rate": 1.6709721717120042e-07, "loss": 0.2063, "step": 378 }, { "epoch": 1.8398058252427183, "grad_norm": 2.8620406311750344, "learning_rate": 1.574636828837395e-07, "loss": 0.1487, "step": 379 }, { "epoch": 1.8446601941747574, "grad_norm": 2.9906794498227254, "learning_rate": 1.4811171192794628e-07, "loss": 0.1396, "step": 380 }, { "epoch": 1.849514563106796, "grad_norm": 3.332118360639753, "learning_rate": 1.3904184806269705e-07, "loss": 0.1593, "step": 381 }, { "epoch": 1.854368932038835, "grad_norm": 2.7710018577039652, "learning_rate": 1.3025461864409395e-07, "loss": 0.1258, "step": 382 }, { "epoch": 1.8592233009708736, "grad_norm": 3.565313470385255, "learning_rate": 1.2175053459481e-07, "loss": 0.1895, "step": 383 }, { "epoch": 1.8640776699029127, "grad_norm": 3.2115052286975843, "learning_rate": 1.1353009037437523e-07, "loss": 0.1219, "step": 384 }, { "epoch": 1.8689320388349513, "grad_norm": 3.512534626882796, "learning_rate": 1.0559376395043285e-07, "loss": 0.1899, "step": 385 }, { "epoch": 1.8737864077669903, "grad_norm": 3.141848757745345, "learning_rate": 9.794201677094162e-08, "loss": 0.1516, "step": 386 }, { "epoch": 1.8786407766990292, "grad_norm": 4.0880722934982, "learning_rate": 9.05752937373533e-08, "loss": 0.1863, "step": 387 }, { "epoch": 1.883495145631068, "grad_norm": 3.0345709611176197, "learning_rate": 8.34940231787379e-08, "loss": 0.1491, "step": 388 }, { "epoch": 1.8883495145631068, "grad_norm": 3.4285625856714135, "learning_rate": 7.66986168268824e-08, "loss": 0.1817, "step": 389 }, { "epoch": 1.8932038834951457, "grad_norm": 3.23634948457969, "learning_rate": 7.018946979234997e-08, "loss": 0.1681, "step": 390 }, { "epoch": 1.8980582524271845, "grad_norm": 3.3264857052085337, "learning_rate": 6.396696054150719e-08, "loss": 0.1488, "step": 391 }, { "epoch": 1.9029126213592233, "grad_norm": 3.1331261049021975, "learning_rate": 5.803145087451945e-08, "loss": 0.1248, "step": 392 }, { "epoch": 1.9077669902912622, "grad_norm": 3.431209996004934, "learning_rate": 5.238328590431163e-08, "loss": 0.1978, "step": 393 }, { "epoch": 1.912621359223301, "grad_norm": 2.8856763633904268, "learning_rate": 4.702279403650534e-08, "loss": 0.1296, "step": 394 }, { "epoch": 1.9174757281553398, "grad_norm": 3.6015875108429922, "learning_rate": 4.195028695032133e-08, "loss": 0.2237, "step": 395 }, { "epoch": 1.9223300970873787, "grad_norm": 3.3839669781066184, "learning_rate": 3.716605958046071e-08, "loss": 0.1435, "step": 396 }, { "epoch": 1.9271844660194175, "grad_norm": 3.2936879549180573, "learning_rate": 3.2670390099951985e-08, "loss": 0.1352, "step": 397 }, { "epoch": 1.9320388349514563, "grad_norm": 4.0482909263485585, "learning_rate": 2.846353990398065e-08, "loss": 0.2036, "step": 398 }, { "epoch": 1.9368932038834952, "grad_norm": 3.2181834732822345, "learning_rate": 2.4545753594688582e-08, "loss": 0.1736, "step": 399 }, { "epoch": 1.941747572815534, "grad_norm": 3.256056432958188, "learning_rate": 2.0917258966953735e-08, "loss": 0.1658, "step": 400 }, { "epoch": 1.9466019417475728, "grad_norm": 3.5416592185969034, "learning_rate": 1.757826699514298e-08, "loss": 0.1358, "step": 401 }, { "epoch": 1.9514563106796117, "grad_norm": 2.974287238806288, "learning_rate": 1.4528971820846894e-08, "loss": 0.1333, "step": 402 }, { "epoch": 1.9563106796116505, "grad_norm": 2.7794520133162792, "learning_rate": 1.176955074159214e-08, "loss": 0.1228, "step": 403 }, { "epoch": 1.9611650485436893, "grad_norm": 4.025922943332189, "learning_rate": 9.300164200530815e-09, "loss": 0.146, "step": 404 }, { "epoch": 1.9660194174757282, "grad_norm": 3.2620039027016556, "learning_rate": 7.120955777112915e-09, "loss": 0.158, "step": 405 }, { "epoch": 1.970873786407767, "grad_norm": 3.0803478429162516, "learning_rate": 5.232052178738567e-09, "loss": 0.1403, "step": 406 }, { "epoch": 1.9757281553398058, "grad_norm": 3.604305626982771, "learning_rate": 3.633563233388926e-09, "loss": 0.1779, "step": 407 }, { "epoch": 1.9805825242718447, "grad_norm": 3.6073829998551448, "learning_rate": 2.3255818832423894e-09, "loss": 0.1472, "step": 408 }, { "epoch": 1.9854368932038835, "grad_norm": 3.517454720715475, "learning_rate": 1.3081841792694783e-09, "loss": 0.1463, "step": 409 }, { "epoch": 1.9902912621359223, "grad_norm": 2.582664875235327, "learning_rate": 5.814292768108187e-10, "loss": 0.1139, "step": 410 }, { "epoch": 1.9951456310679612, "grad_norm": 3.5271977721643184, "learning_rate": 1.453594321393359e-10, "loss": 0.1961, "step": 411 }, { "epoch": 2.0, "grad_norm": 2.3817418169608273, "learning_rate": 0.0, "loss": 0.1162, "step": 412 }, { "epoch": 2.0, "step": 412, "total_flos": 924937248768.0, "train_loss": 0.22817527616703973, "train_runtime": 256.4713, "train_samples_per_second": 12.797, "train_steps_per_second": 1.606 } ], "logging_steps": 1, "max_steps": 412, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 924937248768.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }