{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.98812351543943, "eval_steps": 100, "global_step": 525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 4.008147055910771, "learning_rate": 1.25e-05, "loss": 4.2415, "step": 1 }, { "epoch": 0.02, "grad_norm": 4.04569203441769, "learning_rate": 2.5e-05, "loss": 4.3121, "step": 2 }, { "epoch": 0.03, "grad_norm": 3.865746651377984, "learning_rate": 3.7500000000000003e-05, "loss": 4.3208, "step": 3 }, { "epoch": 0.04, "grad_norm": 2.6407193073379105, "learning_rate": 5e-05, "loss": 3.8848, "step": 4 }, { "epoch": 0.05, "grad_norm": 2.451159328560232, "learning_rate": 6.25e-05, "loss": 3.4391, "step": 5 }, { "epoch": 0.06, "grad_norm": 1.8259504797317525, "learning_rate": 7.500000000000001e-05, "loss": 3.0656, "step": 6 }, { "epoch": 0.07, "grad_norm": 1.1881779175566867, "learning_rate": 8.75e-05, "loss": 2.8135, "step": 7 }, { "epoch": 0.08, "grad_norm": 1.614839668966139, "learning_rate": 0.0001, "loss": 2.7319, "step": 8 }, { "epoch": 0.09, "grad_norm": 1.5198673994210212, "learning_rate": 0.00011250000000000001, "loss": 2.6903, "step": 9 }, { "epoch": 0.1, "grad_norm": 1.0044025931610727, "learning_rate": 0.000125, "loss": 2.584, "step": 10 }, { "epoch": 0.1, "grad_norm": 1.1531821793787296, "learning_rate": 0.0001375, "loss": 2.586, "step": 11 }, { "epoch": 0.11, "grad_norm": 0.6210600474209341, "learning_rate": 0.00015000000000000001, "loss": 2.5298, "step": 12 }, { "epoch": 0.12, "grad_norm": 0.5025244204180619, "learning_rate": 0.00016250000000000002, "loss": 2.4665, "step": 13 }, { "epoch": 0.13, "grad_norm": 0.5058788641352842, "learning_rate": 0.000175, "loss": 2.4194, "step": 14 }, { "epoch": 0.14, "grad_norm": 0.44571801666869537, "learning_rate": 0.0001875, "loss": 2.3531, "step": 15 }, { "epoch": 0.15, "grad_norm": 0.44028009268534757, "learning_rate": 0.0002, "loss": 2.2749, "step": 16 }, { "epoch": 0.16, "grad_norm": 0.42473118020142525, "learning_rate": 0.00019999809527270051, "loss": 2.2587, "step": 17 }, { "epoch": 0.17, "grad_norm": 0.465029302165452, "learning_rate": 0.0001999923811633618, "loss": 2.2196, "step": 18 }, { "epoch": 0.18, "grad_norm": 0.49040381415815754, "learning_rate": 0.00019998285788966027, "loss": 2.2061, "step": 19 }, { "epoch": 0.19, "grad_norm": 0.4160855034634493, "learning_rate": 0.00019996952581438068, "loss": 2.1173, "step": 20 }, { "epoch": 0.2, "grad_norm": 0.45625369964232165, "learning_rate": 0.00019995238544540241, "loss": 2.1267, "step": 21 }, { "epoch": 0.21, "grad_norm": 0.42551849567803673, "learning_rate": 0.00019993143743568, "loss": 2.0976, "step": 22 }, { "epoch": 0.22, "grad_norm": 0.5100052595965069, "learning_rate": 0.0001999066825832184, "loss": 2.0428, "step": 23 }, { "epoch": 0.23, "grad_norm": 0.4717525078599394, "learning_rate": 0.00019987812183104247, "loss": 2.0068, "step": 24 }, { "epoch": 0.24, "grad_norm": 0.5596905853419681, "learning_rate": 0.0001998457562671611, "loss": 2.0303, "step": 25 }, { "epoch": 0.25, "grad_norm": 0.4931645550169434, "learning_rate": 0.00019980958712452577, "loss": 1.9722, "step": 26 }, { "epoch": 0.26, "grad_norm": 0.4433810930704678, "learning_rate": 0.0001997696157809835, "loss": 1.957, "step": 27 }, { "epoch": 0.27, "grad_norm": 0.5522396650266582, "learning_rate": 0.0001997258437592245, "loss": 1.915, "step": 28 }, { "epoch": 0.28, "grad_norm": 0.49861222066728145, "learning_rate": 0.00019967827272672408, "loss": 1.8303, "step": 29 }, { "epoch": 0.29, "grad_norm": 0.6169911964169147, "learning_rate": 0.00019962690449567912, "loss": 1.8454, "step": 30 }, { "epoch": 0.29, "grad_norm": 0.5639780725078123, "learning_rate": 0.000199571741022939, "loss": 1.8068, "step": 31 }, { "epoch": 0.3, "grad_norm": 0.6302805853808786, "learning_rate": 0.0001995127844099313, "loss": 1.7166, "step": 32 }, { "epoch": 0.31, "grad_norm": 0.6494693483139545, "learning_rate": 0.00019945003690258125, "loss": 1.6433, "step": 33 }, { "epoch": 0.32, "grad_norm": 0.7598443409498918, "learning_rate": 0.00019938350089122682, "loss": 1.7081, "step": 34 }, { "epoch": 0.33, "grad_norm": 0.6512764391881087, "learning_rate": 0.00019931317891052708, "loss": 1.6436, "step": 35 }, { "epoch": 0.34, "grad_norm": 0.6953537359048508, "learning_rate": 0.00019923907363936593, "loss": 1.5862, "step": 36 }, { "epoch": 0.35, "grad_norm": 0.6011387829084072, "learning_rate": 0.00019916118790075008, "loss": 1.5432, "step": 37 }, { "epoch": 0.36, "grad_norm": 0.659130437748028, "learning_rate": 0.00019907952466170138, "loss": 1.5132, "step": 38 }, { "epoch": 0.37, "grad_norm": 0.7211467253555573, "learning_rate": 0.00019899408703314385, "loss": 1.506, "step": 39 }, { "epoch": 0.38, "grad_norm": 0.7006890038987398, "learning_rate": 0.0001989048782697851, "loss": 1.4498, "step": 40 }, { "epoch": 0.39, "grad_norm": 0.64642158324997, "learning_rate": 0.00019881190176999255, "loss": 1.4478, "step": 41 }, { "epoch": 0.4, "grad_norm": 0.6608085069521318, "learning_rate": 0.00019871516107566366, "loss": 1.3542, "step": 42 }, { "epoch": 0.41, "grad_norm": 0.7707478188072372, "learning_rate": 0.0001986146598720913, "loss": 1.3309, "step": 43 }, { "epoch": 0.42, "grad_norm": 0.8119298049916807, "learning_rate": 0.00019851040198782326, "loss": 1.345, "step": 44 }, { "epoch": 0.43, "grad_norm": 0.7712308653234212, "learning_rate": 0.0001984023913945162, "loss": 1.3076, "step": 45 }, { "epoch": 0.44, "grad_norm": 0.682341709525683, "learning_rate": 0.0001982906322067847, "loss": 1.2565, "step": 46 }, { "epoch": 0.45, "grad_norm": 0.7071991083514119, "learning_rate": 0.00019817512868204425, "loss": 1.1796, "step": 47 }, { "epoch": 0.46, "grad_norm": 0.745222014713615, "learning_rate": 0.00019805588522034916, "loss": 1.1649, "step": 48 }, { "epoch": 0.47, "grad_norm": 0.7158459299510994, "learning_rate": 0.00019793290636422505, "loss": 1.2109, "step": 49 }, { "epoch": 0.48, "grad_norm": 0.7335821144549012, "learning_rate": 0.00019780619679849552, "loss": 1.1475, "step": 50 }, { "epoch": 0.48, "grad_norm": 0.7804306024320766, "learning_rate": 0.000197675761350104, "loss": 1.1068, "step": 51 }, { "epoch": 0.49, "grad_norm": 0.8274924156959725, "learning_rate": 0.00019754160498792965, "loss": 1.1839, "step": 52 }, { "epoch": 0.5, "grad_norm": 0.8840482383868431, "learning_rate": 0.0001974037328225982, "loss": 1.0928, "step": 53 }, { "epoch": 0.51, "grad_norm": 0.7224652999279871, "learning_rate": 0.00019726215010628718, "loss": 1.0299, "step": 54 }, { "epoch": 0.52, "grad_norm": 0.7109288879933862, "learning_rate": 0.0001971168622325259, "loss": 1.0436, "step": 55 }, { "epoch": 0.53, "grad_norm": 0.7650325966583326, "learning_rate": 0.00019696787473598993, "loss": 1.041, "step": 56 }, { "epoch": 0.54, "grad_norm": 0.7307809391946058, "learning_rate": 0.00019681519329229033, "loss": 1.0195, "step": 57 }, { "epoch": 0.55, "grad_norm": 0.6873943623441443, "learning_rate": 0.00019665882371775733, "loss": 0.972, "step": 58 }, { "epoch": 0.56, "grad_norm": 0.8185924734616268, "learning_rate": 0.00019649877196921896, "loss": 0.9986, "step": 59 }, { "epoch": 0.57, "grad_norm": 0.7907558585543373, "learning_rate": 0.00019633504414377388, "loss": 0.9201, "step": 60 }, { "epoch": 0.58, "grad_norm": 0.7216280408288712, "learning_rate": 0.00019616764647855926, "loss": 0.9976, "step": 61 }, { "epoch": 0.59, "grad_norm": 0.6946470891456141, "learning_rate": 0.00019599658535051314, "loss": 0.9008, "step": 62 }, { "epoch": 0.6, "grad_norm": 0.6470248283451219, "learning_rate": 0.00019582186727613152, "loss": 0.8226, "step": 63 }, { "epoch": 0.61, "grad_norm": 0.8297915622585336, "learning_rate": 0.00019564349891122018, "loss": 0.8825, "step": 64 }, { "epoch": 0.62, "grad_norm": 0.7018515834126928, "learning_rate": 0.00019546148705064097, "loss": 0.8521, "step": 65 }, { "epoch": 0.63, "grad_norm": 0.6119835758734723, "learning_rate": 0.00019527583862805303, "loss": 0.7872, "step": 66 }, { "epoch": 0.64, "grad_norm": 0.6396036538427098, "learning_rate": 0.00019508656071564882, "loss": 0.7887, "step": 67 }, { "epoch": 0.65, "grad_norm": 0.6712059239435435, "learning_rate": 0.00019489366052388441, "loss": 0.8406, "step": 68 }, { "epoch": 0.66, "grad_norm": 0.6498227189328728, "learning_rate": 0.00019469714540120507, "loss": 0.7109, "step": 69 }, { "epoch": 0.67, "grad_norm": 0.6950957852561941, "learning_rate": 0.00019449702283376517, "loss": 0.7008, "step": 70 }, { "epoch": 0.67, "grad_norm": 0.6415745385783075, "learning_rate": 0.00019429330044514305, "loss": 0.6808, "step": 71 }, { "epoch": 0.68, "grad_norm": 0.6774461765802887, "learning_rate": 0.0001940859859960506, "loss": 0.7122, "step": 72 }, { "epoch": 0.69, "grad_norm": 0.6335543398879422, "learning_rate": 0.00019387508738403768, "loss": 0.6826, "step": 73 }, { "epoch": 0.7, "grad_norm": 0.6455659601218003, "learning_rate": 0.0001936606126431911, "loss": 0.7342, "step": 74 }, { "epoch": 0.71, "grad_norm": 0.6804108080708727, "learning_rate": 0.00019344256994382878, "loss": 0.6983, "step": 75 }, { "epoch": 0.72, "grad_norm": 0.6233570198373359, "learning_rate": 0.00019322096759218836, "loss": 0.6426, "step": 76 }, { "epoch": 0.73, "grad_norm": 0.6354196060962453, "learning_rate": 0.00019299581403011082, "loss": 0.6978, "step": 77 }, { "epoch": 0.74, "grad_norm": 0.6723728632702363, "learning_rate": 0.0001927671178347189, "loss": 0.6449, "step": 78 }, { "epoch": 0.75, "grad_norm": 0.6055794839258588, "learning_rate": 0.00019253488771809024, "loss": 0.6608, "step": 79 }, { "epoch": 0.76, "grad_norm": 0.6032563228830964, "learning_rate": 0.0001922991325269258, "loss": 0.6691, "step": 80 }, { "epoch": 0.77, "grad_norm": 0.5917538532836075, "learning_rate": 0.00019205986124221251, "loss": 0.6418, "step": 81 }, { "epoch": 0.78, "grad_norm": 0.6558132078005496, "learning_rate": 0.00019181708297888133, "loss": 0.6562, "step": 82 }, { "epoch": 0.79, "grad_norm": 0.6110330049943966, "learning_rate": 0.00019157080698546, "loss": 0.5855, "step": 83 }, { "epoch": 0.8, "grad_norm": 0.6481622083495842, "learning_rate": 0.00019132104264372063, "loss": 0.628, "step": 84 }, { "epoch": 0.81, "grad_norm": 0.5730813607452849, "learning_rate": 0.0001910677994683225, "loss": 0.5476, "step": 85 }, { "epoch": 0.82, "grad_norm": 0.6938507563801335, "learning_rate": 0.00019081108710644932, "loss": 0.6018, "step": 86 }, { "epoch": 0.83, "grad_norm": 0.625439427503205, "learning_rate": 0.00019055091533744202, "loss": 0.5735, "step": 87 }, { "epoch": 0.84, "grad_norm": 0.6628596764324554, "learning_rate": 0.00019028729407242597, "loss": 0.5389, "step": 88 }, { "epoch": 0.85, "grad_norm": 0.6112099968245533, "learning_rate": 0.00019002023335393364, "loss": 0.5235, "step": 89 }, { "epoch": 0.86, "grad_norm": 0.6098216223216336, "learning_rate": 0.0001897497433555218, "loss": 0.6058, "step": 90 }, { "epoch": 0.86, "grad_norm": 0.6469247467013166, "learning_rate": 0.0001894758343813842, "loss": 0.5524, "step": 91 }, { "epoch": 0.87, "grad_norm": 0.6344920759870597, "learning_rate": 0.00018919851686595874, "loss": 0.5605, "step": 92 }, { "epoch": 0.88, "grad_norm": 0.6756355159547938, "learning_rate": 0.00018891780137353034, "loss": 0.5096, "step": 93 }, { "epoch": 0.89, "grad_norm": 0.6439314455537293, "learning_rate": 0.00018863369859782825, "loss": 0.5516, "step": 94 }, { "epoch": 0.9, "grad_norm": 0.5567728554741562, "learning_rate": 0.0001883462193616187, "loss": 0.4576, "step": 95 }, { "epoch": 0.91, "grad_norm": 0.553595533418767, "learning_rate": 0.00018805537461629265, "loss": 0.4947, "step": 96 }, { "epoch": 0.92, "grad_norm": 0.6200223910647112, "learning_rate": 0.00018776117544144863, "loss": 0.5073, "step": 97 }, { "epoch": 0.93, "grad_norm": 0.6294322114297511, "learning_rate": 0.00018746363304447073, "loss": 0.4938, "step": 98 }, { "epoch": 0.94, "grad_norm": 0.6000145257745209, "learning_rate": 0.00018716275876010135, "loss": 0.473, "step": 99 }, { "epoch": 0.95, "grad_norm": 0.5927861897994469, "learning_rate": 0.00018685856405000983, "loss": 0.4724, "step": 100 }, { "epoch": 0.95, "eval_blimp_filtered_avg": 0.7155223880597015, "eval_blimp_filtered_std": 0.005000433138834185, "step": 100 }, { "epoch": 0.95, "eval_blimp_supplement_avg": 0.8405172413793104, "eval_blimp_supplement_std": 0.016486001732879434, "step": 100 }, { "epoch": 0.95, "eval_vqa_filtered_avg": 0.52, "eval_vqa_filtered_std": 0.05021167315686779, "step": 100 }, { "epoch": 0.95, "eval_winoground_filtered_avg": 0.64, "eval_winoground_filtered_std": 0.04824181513244218, "step": 100 }, { "epoch": 0.96, "grad_norm": 0.5504516732077648, "learning_rate": 0.00018655106050235548, "loss": 0.4393, "step": 101 }, { "epoch": 0.97, "grad_norm": 0.5801589113252366, "learning_rate": 0.00018624025983134644, "loss": 0.468, "step": 102 }, { "epoch": 0.98, "grad_norm": 0.5273944337529535, "learning_rate": 0.00018592617387679306, "loss": 0.439, "step": 103 }, { "epoch": 0.99, "grad_norm": 0.508609381383424, "learning_rate": 0.00018560881460365724, "loss": 0.4272, "step": 104 }, { "epoch": 1.0, "grad_norm": 0.5396859577867195, "learning_rate": 0.0001852881941015964, "loss": 0.4362, "step": 105 }, { "epoch": 1.01, "grad_norm": 0.5122858999271028, "learning_rate": 0.00018496432458450294, "loss": 0.3893, "step": 106 }, { "epoch": 1.02, "grad_norm": 0.49626561438760436, "learning_rate": 0.00018463721839003915, "loss": 0.3498, "step": 107 }, { "epoch": 1.03, "grad_norm": 0.48748413013987063, "learning_rate": 0.000184306887979167, "loss": 0.3256, "step": 108 }, { "epoch": 1.04, "grad_norm": 0.5310280563857716, "learning_rate": 0.00018397334593567348, "loss": 0.3225, "step": 109 }, { "epoch": 1.05, "grad_norm": 0.6232514021230662, "learning_rate": 0.00018363660496569127, "loss": 0.3489, "step": 110 }, { "epoch": 1.05, "grad_norm": 0.5274577320762, "learning_rate": 0.00018329667789721485, "loss": 0.3123, "step": 111 }, { "epoch": 1.06, "grad_norm": 0.5096311315676365, "learning_rate": 0.00018295357767961144, "loss": 0.3325, "step": 112 }, { "epoch": 1.07, "grad_norm": 0.4613577097438129, "learning_rate": 0.00018260731738312818, "loss": 0.2936, "step": 113 }, { "epoch": 1.08, "grad_norm": 0.4997938044342101, "learning_rate": 0.00018225791019839375, "loss": 0.3351, "step": 114 }, { "epoch": 1.09, "grad_norm": 0.538085494988463, "learning_rate": 0.00018190536943591624, "loss": 0.329, "step": 115 }, { "epoch": 1.1, "grad_norm": 0.5567068979809859, "learning_rate": 0.00018154970852557603, "loss": 0.318, "step": 116 }, { "epoch": 1.11, "grad_norm": 0.5548141608588357, "learning_rate": 0.0001811909410161139, "loss": 0.3289, "step": 117 }, { "epoch": 1.12, "grad_norm": 0.47326466614968965, "learning_rate": 0.0001808290805746153, "loss": 0.3076, "step": 118 }, { "epoch": 1.13, "grad_norm": 0.47629585466918467, "learning_rate": 0.00018046414098598948, "loss": 0.3016, "step": 119 }, { "epoch": 1.14, "grad_norm": 0.44135735344426463, "learning_rate": 0.00018009613615244436, "loss": 0.2704, "step": 120 }, { "epoch": 1.15, "grad_norm": 0.5127645747027901, "learning_rate": 0.000179725080092957, "loss": 0.2887, "step": 121 }, { "epoch": 1.16, "grad_norm": 0.5209981172771183, "learning_rate": 0.0001793509869427395, "loss": 0.2938, "step": 122 }, { "epoch": 1.17, "grad_norm": 0.5481082193558409, "learning_rate": 0.00017897387095270058, "loss": 0.3191, "step": 123 }, { "epoch": 1.18, "grad_norm": 0.4770065158307258, "learning_rate": 0.0001785937464889027, "loss": 0.2795, "step": 124 }, { "epoch": 1.19, "grad_norm": 0.44845204938493194, "learning_rate": 0.0001782106280320147, "loss": 0.2667, "step": 125 }, { "epoch": 1.2, "grad_norm": 0.47824147005907164, "learning_rate": 0.00017782453017676025, "loss": 0.267, "step": 126 }, { "epoch": 1.21, "grad_norm": 0.501015317452837, "learning_rate": 0.00017743546763136187, "loss": 0.2831, "step": 127 }, { "epoch": 1.22, "grad_norm": 0.5232536606095718, "learning_rate": 0.00017704345521698058, "loss": 0.2769, "step": 128 }, { "epoch": 1.23, "grad_norm": 0.5495388553709665, "learning_rate": 0.00017664850786715136, "loss": 0.3031, "step": 129 }, { "epoch": 1.24, "grad_norm": 0.5371555106361774, "learning_rate": 0.00017625064062721415, "loss": 0.2955, "step": 130 }, { "epoch": 1.24, "grad_norm": 0.4716773551397148, "learning_rate": 0.00017584986865374082, "loss": 0.2666, "step": 131 }, { "epoch": 1.25, "grad_norm": 0.5089124561646106, "learning_rate": 0.00017544620721395777, "loss": 0.3379, "step": 132 }, { "epoch": 1.26, "grad_norm": 0.4715340007422714, "learning_rate": 0.00017503967168516426, "loss": 0.2771, "step": 133 }, { "epoch": 1.27, "grad_norm": 0.43502563576445413, "learning_rate": 0.0001746302775541467, "loss": 0.2423, "step": 134 }, { "epoch": 1.28, "grad_norm": 0.4967705692007805, "learning_rate": 0.00017421804041658863, "loss": 0.2498, "step": 135 }, { "epoch": 1.29, "grad_norm": 0.49127370733051945, "learning_rate": 0.00017380297597647667, "loss": 0.2616, "step": 136 }, { "epoch": 1.3, "grad_norm": 0.47835649282708265, "learning_rate": 0.00017338510004550223, "loss": 0.241, "step": 137 }, { "epoch": 1.31, "grad_norm": 0.4843464174553606, "learning_rate": 0.00017296442854245915, "loss": 0.2458, "step": 138 }, { "epoch": 1.32, "grad_norm": 0.5209405133977896, "learning_rate": 0.00017254097749263734, "loss": 0.2452, "step": 139 }, { "epoch": 1.33, "grad_norm": 0.4709574288825739, "learning_rate": 0.0001721147630272123, "loss": 0.2627, "step": 140 }, { "epoch": 1.34, "grad_norm": 0.4752105435022234, "learning_rate": 0.00017168580138263062, "loss": 0.2527, "step": 141 }, { "epoch": 1.35, "grad_norm": 0.48781843284289905, "learning_rate": 0.00017125410889999134, "loss": 0.2356, "step": 142 }, { "epoch": 1.36, "grad_norm": 0.5731736183258567, "learning_rate": 0.00017081970202442362, "loss": 0.2668, "step": 143 }, { "epoch": 1.37, "grad_norm": 0.48105126464697834, "learning_rate": 0.0001703825973044602, "loss": 0.2454, "step": 144 }, { "epoch": 1.38, "grad_norm": 0.5280645599674879, "learning_rate": 0.00016994281139140688, "loss": 0.2454, "step": 145 }, { "epoch": 1.39, "grad_norm": 0.47876489284248624, "learning_rate": 0.0001695003610387084, "loss": 0.2463, "step": 146 }, { "epoch": 1.4, "grad_norm": 0.48826354198860017, "learning_rate": 0.00016905526310130999, "loss": 0.2295, "step": 147 }, { "epoch": 1.41, "grad_norm": 0.47715494831436517, "learning_rate": 0.0001686075345350156, "loss": 0.252, "step": 148 }, { "epoch": 1.42, "grad_norm": 0.5152105233009641, "learning_rate": 0.0001681571923958416, "loss": 0.2771, "step": 149 }, { "epoch": 1.43, "grad_norm": 0.4990883717055415, "learning_rate": 0.00016770425383936735, "loss": 0.2497, "step": 150 }, { "epoch": 1.43, "grad_norm": 0.4674093996422124, "learning_rate": 0.00016724873612008155, "loss": 0.2441, "step": 151 }, { "epoch": 1.44, "grad_norm": 0.4432102664091143, "learning_rate": 0.00016679065659072487, "loss": 0.2418, "step": 152 }, { "epoch": 1.45, "grad_norm": 0.4677926556162063, "learning_rate": 0.00016633003270162902, "loss": 0.2483, "step": 153 }, { "epoch": 1.46, "grad_norm": 0.5050389021999718, "learning_rate": 0.00016586688200005193, "loss": 0.225, "step": 154 }, { "epoch": 1.47, "grad_norm": 0.538150442089787, "learning_rate": 0.00016540122212950934, "loss": 0.2629, "step": 155 }, { "epoch": 1.48, "grad_norm": 0.4831894197759429, "learning_rate": 0.00016493307082910249, "loss": 0.2539, "step": 156 }, { "epoch": 1.49, "grad_norm": 0.4864294249801108, "learning_rate": 0.00016446244593284277, "loss": 0.2638, "step": 157 }, { "epoch": 1.5, "grad_norm": 0.46236092553249764, "learning_rate": 0.00016398936536897183, "loss": 0.2255, "step": 158 }, { "epoch": 1.51, "grad_norm": 0.4963120760517666, "learning_rate": 0.00016351384715927898, "loss": 0.2524, "step": 159 }, { "epoch": 1.52, "grad_norm": 0.5210286477375989, "learning_rate": 0.00016303590941841458, "loss": 0.225, "step": 160 }, { "epoch": 1.53, "grad_norm": 0.5288475623534257, "learning_rate": 0.0001625555703531998, "loss": 0.2428, "step": 161 }, { "epoch": 1.54, "grad_norm": 0.4973215047467683, "learning_rate": 0.00016207284826193335, "loss": 0.2522, "step": 162 }, { "epoch": 1.55, "grad_norm": 0.44826317640998203, "learning_rate": 0.00016158776153369402, "loss": 0.2019, "step": 163 }, { "epoch": 1.56, "grad_norm": 0.45392654459830534, "learning_rate": 0.0001611003286476406, "loss": 0.2338, "step": 164 }, { "epoch": 1.57, "grad_norm": 0.4430521150056381, "learning_rate": 0.00016061056817230754, "loss": 0.2273, "step": 165 }, { "epoch": 1.58, "grad_norm": 0.44345119147374473, "learning_rate": 0.00016011849876489776, "loss": 0.211, "step": 166 }, { "epoch": 1.59, "grad_norm": 0.4808061249544928, "learning_rate": 0.000159624139170572, "loss": 0.2104, "step": 167 }, { "epoch": 1.6, "grad_norm": 0.5573402749682285, "learning_rate": 0.00015912750822173445, "loss": 0.2492, "step": 168 }, { "epoch": 1.61, "grad_norm": 0.5334950652460796, "learning_rate": 0.00015862862483731574, "loss": 0.2187, "step": 169 }, { "epoch": 1.62, "grad_norm": 0.49497739813798797, "learning_rate": 0.00015812750802205187, "loss": 0.2097, "step": 170 }, { "epoch": 1.62, "grad_norm": 0.44446540691990566, "learning_rate": 0.00015762417686576038, "loss": 0.204, "step": 171 }, { "epoch": 1.63, "grad_norm": 0.42142200135464725, "learning_rate": 0.0001571186505426132, "loss": 0.1989, "step": 172 }, { "epoch": 1.64, "grad_norm": 0.4328533901196503, "learning_rate": 0.00015661094831040598, "loss": 0.2173, "step": 173 }, { "epoch": 1.65, "grad_norm": 0.43093996542664664, "learning_rate": 0.00015610108950982494, "loss": 0.1865, "step": 174 }, { "epoch": 1.66, "grad_norm": 0.4850613308932528, "learning_rate": 0.00015558909356370944, "loss": 0.2181, "step": 175 }, { "epoch": 1.67, "grad_norm": 0.47485870685329246, "learning_rate": 0.00015507497997631266, "loss": 0.2223, "step": 176 }, { "epoch": 1.68, "grad_norm": 0.42085147271583295, "learning_rate": 0.0001545587683325583, "loss": 0.1845, "step": 177 }, { "epoch": 1.69, "grad_norm": 0.4479801309419239, "learning_rate": 0.00015404047829729457, "loss": 0.1987, "step": 178 }, { "epoch": 1.7, "grad_norm": 0.4624584058381783, "learning_rate": 0.00015352012961454507, "loss": 0.217, "step": 179 }, { "epoch": 1.71, "grad_norm": 0.44005765649196454, "learning_rate": 0.00015299774210675657, "loss": 0.1837, "step": 180 }, { "epoch": 1.72, "grad_norm": 0.4508346255489124, "learning_rate": 0.00015247333567404406, "loss": 0.2007, "step": 181 }, { "epoch": 1.73, "grad_norm": 0.40396006791211914, "learning_rate": 0.00015194693029343248, "loss": 0.1866, "step": 182 }, { "epoch": 1.74, "grad_norm": 0.44558839018398966, "learning_rate": 0.00015141854601809581, "loss": 0.1967, "step": 183 }, { "epoch": 1.75, "grad_norm": 0.4337334328022437, "learning_rate": 0.00015088820297659314, "loss": 0.1891, "step": 184 }, { "epoch": 1.76, "grad_norm": 0.4636781912221849, "learning_rate": 0.00015035592137210187, "loss": 0.193, "step": 185 }, { "epoch": 1.77, "grad_norm": 0.47955885394967973, "learning_rate": 0.00014982172148164804, "loss": 0.1793, "step": 186 }, { "epoch": 1.78, "grad_norm": 0.4721310395975314, "learning_rate": 0.00014928562365533392, "loss": 0.186, "step": 187 }, { "epoch": 1.79, "grad_norm": 0.4737141537120664, "learning_rate": 0.00014874764831556285, "loss": 0.2058, "step": 188 }, { "epoch": 1.8, "grad_norm": 0.40830849621087567, "learning_rate": 0.00014820781595626116, "loss": 0.1822, "step": 189 }, { "epoch": 1.81, "grad_norm": 0.4272142710058541, "learning_rate": 0.0001476661471420975, "loss": 0.2057, "step": 190 }, { "epoch": 1.81, "grad_norm": 0.4212227727031309, "learning_rate": 0.0001471226625076993, "loss": 0.1845, "step": 191 }, { "epoch": 1.82, "grad_norm": 0.39660108389275345, "learning_rate": 0.0001465773827568671, "loss": 0.1769, "step": 192 }, { "epoch": 1.83, "grad_norm": 0.38828383424285384, "learning_rate": 0.00014603032866178538, "loss": 0.1699, "step": 193 }, { "epoch": 1.84, "grad_norm": 0.3681031142044674, "learning_rate": 0.00014548152106223157, "loss": 0.1456, "step": 194 }, { "epoch": 1.85, "grad_norm": 0.46248659870169556, "learning_rate": 0.00014493098086478196, "loss": 0.1846, "step": 195 }, { "epoch": 1.86, "grad_norm": 0.4437664820090981, "learning_rate": 0.00014437872904201542, "loss": 0.1706, "step": 196 }, { "epoch": 1.87, "grad_norm": 0.4410375026146085, "learning_rate": 0.0001438247866317145, "loss": 0.1757, "step": 197 }, { "epoch": 1.88, "grad_norm": 0.4290870801703047, "learning_rate": 0.00014326917473606366, "loss": 0.1777, "step": 198 }, { "epoch": 1.89, "grad_norm": 0.4812130220306999, "learning_rate": 0.00014271191452084597, "loss": 0.2013, "step": 199 }, { "epoch": 1.9, "grad_norm": 0.4314920290891278, "learning_rate": 0.00014215302721463623, "loss": 0.1857, "step": 200 }, { "epoch": 1.9, "eval_blimp_filtered_avg": 0.7161194029850746, "eval_blimp_filtered_std": 0.005001692965803923, "step": 200 }, { "epoch": 1.9, "eval_blimp_supplement_avg": 0.8211206896551724, "eval_blimp_supplement_std": 0.016785621805327337, "step": 200 }, { "epoch": 1.9, "eval_vqa_filtered_avg": 0.51, "eval_vqa_filtered_std": 0.05024183937956912, "step": 200 }, { "epoch": 1.9, "eval_winoground_filtered_avg": 0.62, "eval_winoground_filtered_std": 0.04878317312145633, "step": 200 }, { "epoch": 1.91, "grad_norm": 0.41562514975066434, "learning_rate": 0.0001415925341079927, "loss": 0.21, "step": 201 }, { "epoch": 1.92, "grad_norm": 0.37833993286875955, "learning_rate": 0.00014103045655264576, "loss": 0.1659, "step": 202 }, { "epoch": 1.93, "grad_norm": 0.3880529818353851, "learning_rate": 0.00014046681596068466, "loss": 0.1638, "step": 203 }, { "epoch": 1.94, "grad_norm": 0.40159118156434603, "learning_rate": 0.00013990163380374194, "loss": 0.1768, "step": 204 }, { "epoch": 1.95, "grad_norm": 0.4086449128732129, "learning_rate": 0.00013933493161217523, "loss": 0.1544, "step": 205 }, { "epoch": 1.96, "grad_norm": 0.3808287729283849, "learning_rate": 0.0001387667309742472, "loss": 0.1366, "step": 206 }, { "epoch": 1.97, "grad_norm": 0.39609061286446773, "learning_rate": 0.0001381970535353032, "loss": 0.1494, "step": 207 }, { "epoch": 1.98, "grad_norm": 0.40847272653729905, "learning_rate": 0.00013762592099694665, "loss": 0.1615, "step": 208 }, { "epoch": 1.99, "grad_norm": 0.4334994696681873, "learning_rate": 0.00013705335511621228, "loss": 0.1542, "step": 209 }, { "epoch": 2.0, "grad_norm": 0.4546384761691546, "learning_rate": 0.00013647937770473737, "loss": 0.1834, "step": 210 }, { "epoch": 2.0, "grad_norm": 0.36130610610645814, "learning_rate": 0.00013590401062793083, "loss": 0.123, "step": 211 }, { "epoch": 2.01, "grad_norm": 0.29975302946848653, "learning_rate": 0.0001353272758041402, "loss": 0.0824, "step": 212 }, { "epoch": 2.02, "grad_norm": 0.29392603086414587, "learning_rate": 0.00013474919520381671, "loss": 0.0836, "step": 213 }, { "epoch": 2.03, "grad_norm": 0.33169221984700814, "learning_rate": 0.00013416979084867852, "loss": 0.0683, "step": 214 }, { "epoch": 2.04, "grad_norm": 0.39192700338704206, "learning_rate": 0.00013358908481087134, "loss": 0.0804, "step": 215 }, { "epoch": 2.05, "grad_norm": 0.42443737109460977, "learning_rate": 0.0001330070992121281, "loss": 0.0797, "step": 216 }, { "epoch": 2.06, "grad_norm": 0.42848813761714244, "learning_rate": 0.00013242385622292592, "loss": 0.0776, "step": 217 }, { "epoch": 2.07, "grad_norm": 0.37448633759803696, "learning_rate": 0.00013183937806164172, "loss": 0.0739, "step": 218 }, { "epoch": 2.08, "grad_norm": 0.3437440816482259, "learning_rate": 0.00013125368699370567, "loss": 0.0652, "step": 219 }, { "epoch": 2.09, "grad_norm": 0.356415907025676, "learning_rate": 0.0001306668053307531, "loss": 0.0778, "step": 220 }, { "epoch": 2.1, "grad_norm": 0.30675625825005026, "learning_rate": 0.00013007875542977448, "loss": 0.0665, "step": 221 }, { "epoch": 2.11, "grad_norm": 0.29794655672460485, "learning_rate": 0.00012948955969226383, "loss": 0.0696, "step": 222 }, { "epoch": 2.12, "grad_norm": 0.30163505061461343, "learning_rate": 0.00012889924056336532, "loss": 0.0705, "step": 223 }, { "epoch": 2.13, "grad_norm": 0.32541739323213426, "learning_rate": 0.00012830782053101805, "loss": 0.0733, "step": 224 }, { "epoch": 2.14, "grad_norm": 0.31121536090331003, "learning_rate": 0.00012771532212509974, "loss": 0.0711, "step": 225 }, { "epoch": 2.15, "grad_norm": 0.34593292210442944, "learning_rate": 0.00012712176791656807, "loss": 0.0788, "step": 226 }, { "epoch": 2.16, "grad_norm": 0.33946278651997686, "learning_rate": 0.0001265271805166012, "loss": 0.0677, "step": 227 }, { "epoch": 2.17, "grad_norm": 0.3400898219352628, "learning_rate": 0.0001259315825757362, "loss": 0.0643, "step": 228 }, { "epoch": 2.18, "grad_norm": 0.3813085350755264, "learning_rate": 0.00012533499678300618, "loss": 0.0761, "step": 229 }, { "epoch": 2.19, "grad_norm": 0.3523012248149677, "learning_rate": 0.00012473744586507604, "loss": 0.0648, "step": 230 }, { "epoch": 2.19, "grad_norm": 0.37842862853695125, "learning_rate": 0.00012413895258537675, "loss": 0.0812, "step": 231 }, { "epoch": 2.2, "grad_norm": 0.39475455813661525, "learning_rate": 0.00012353953974323807, "loss": 0.0801, "step": 232 }, { "epoch": 2.21, "grad_norm": 0.3205081471986943, "learning_rate": 0.00012293923017302002, "loss": 0.0677, "step": 233 }, { "epoch": 2.22, "grad_norm": 0.31006899448135294, "learning_rate": 0.0001223380467432432, "loss": 0.07, "step": 234 }, { "epoch": 2.23, "grad_norm": 0.3048520942780853, "learning_rate": 0.00012173601235571742, "loss": 0.0615, "step": 235 }, { "epoch": 2.24, "grad_norm": 0.3425413653893973, "learning_rate": 0.0001211331499446693, "loss": 0.0658, "step": 236 }, { "epoch": 2.25, "grad_norm": 0.31929344956491607, "learning_rate": 0.00012052948247586873, "loss": 0.0653, "step": 237 }, { "epoch": 2.26, "grad_norm": 0.3414359773691709, "learning_rate": 0.00011992503294575383, "loss": 0.0723, "step": 238 }, { "epoch": 2.27, "grad_norm": 0.32978160245312554, "learning_rate": 0.00011931982438055505, "loss": 0.07, "step": 239 }, { "epoch": 2.28, "grad_norm": 0.33271868205929617, "learning_rate": 0.00011871387983541789, "loss": 0.0672, "step": 240 }, { "epoch": 2.29, "grad_norm": 0.29862145989444433, "learning_rate": 0.00011810722239352467, "loss": 0.0603, "step": 241 }, { "epoch": 2.3, "grad_norm": 0.34485364985513034, "learning_rate": 0.00011749987516521523, "loss": 0.0632, "step": 242 }, { "epoch": 2.31, "grad_norm": 0.3299899118013224, "learning_rate": 0.00011689186128710654, "loss": 0.0601, "step": 243 }, { "epoch": 2.32, "grad_norm": 0.29635972892096896, "learning_rate": 0.00011628320392121117, "loss": 0.0558, "step": 244 }, { "epoch": 2.33, "grad_norm": 0.3414458592363874, "learning_rate": 0.0001156739262540552, "loss": 0.0703, "step": 245 }, { "epoch": 2.34, "grad_norm": 0.3280087622706941, "learning_rate": 0.00011506405149579468, "loss": 0.0657, "step": 246 }, { "epoch": 2.35, "grad_norm": 0.373086375777386, "learning_rate": 0.00011445360287933165, "loss": 0.0668, "step": 247 }, { "epoch": 2.36, "grad_norm": 0.2937645914714354, "learning_rate": 0.00011384260365942904, "loss": 0.0612, "step": 248 }, { "epoch": 2.37, "grad_norm": 0.39022311054047737, "learning_rate": 0.00011323107711182473, "loss": 0.0762, "step": 249 }, { "epoch": 2.38, "grad_norm": 0.3345521008714258, "learning_rate": 0.00011261904653234485, "loss": 0.0711, "step": 250 }, { "epoch": 2.38, "grad_norm": 0.30608871062806836, "learning_rate": 0.00011200653523601652, "loss": 0.0617, "step": 251 }, { "epoch": 2.39, "grad_norm": 0.30714147902477945, "learning_rate": 0.00011139356655617945, "loss": 0.063, "step": 252 }, { "epoch": 2.4, "grad_norm": 0.31051190204375445, "learning_rate": 0.00011078016384359724, "loss": 0.0659, "step": 253 }, { "epoch": 2.41, "grad_norm": 0.3071085278813772, "learning_rate": 0.00011016635046556772, "loss": 0.061, "step": 254 }, { "epoch": 2.42, "grad_norm": 0.3045837343462885, "learning_rate": 0.00010955214980503284, "loss": 0.0597, "step": 255 }, { "epoch": 2.43, "grad_norm": 0.3049959198680976, "learning_rate": 0.00010893758525968789, "loss": 0.0587, "step": 256 }, { "epoch": 2.44, "grad_norm": 0.3168437149994661, "learning_rate": 0.00010832268024109025, "loss": 0.0559, "step": 257 }, { "epoch": 2.45, "grad_norm": 0.3024342626013227, "learning_rate": 0.00010770745817376742, "loss": 0.0583, "step": 258 }, { "epoch": 2.46, "grad_norm": 0.3188509232471995, "learning_rate": 0.0001070919424943247, "loss": 0.061, "step": 259 }, { "epoch": 2.47, "grad_norm": 0.3381945814712772, "learning_rate": 0.0001064761566505525, "loss": 0.0648, "step": 260 }, { "epoch": 2.48, "grad_norm": 0.3131931451431926, "learning_rate": 0.00010586012410053292, "loss": 0.0624, "step": 261 }, { "epoch": 2.49, "grad_norm": 0.32809637984753304, "learning_rate": 0.00010524386831174628, "loss": 0.0627, "step": 262 }, { "epoch": 2.5, "grad_norm": 0.2832796499168925, "learning_rate": 0.00010462741276017711, "loss": 0.0535, "step": 263 }, { "epoch": 2.51, "grad_norm": 0.3334141162384235, "learning_rate": 0.00010401078092941971, "loss": 0.061, "step": 264 }, { "epoch": 2.52, "grad_norm": 0.27653747850590626, "learning_rate": 0.00010339399630978373, "loss": 0.0497, "step": 265 }, { "epoch": 2.53, "grad_norm": 0.32205480409336124, "learning_rate": 0.00010277708239739924, "loss": 0.0658, "step": 266 }, { "epoch": 2.54, "grad_norm": 0.310079147965717, "learning_rate": 0.0001021600626933217, "loss": 0.0525, "step": 267 }, { "epoch": 2.55, "grad_norm": 0.31094425691461797, "learning_rate": 0.00010154296070263649, "loss": 0.0619, "step": 268 }, { "epoch": 2.56, "grad_norm": 0.33419799536496597, "learning_rate": 0.00010092579993356386, "loss": 0.0615, "step": 269 }, { "epoch": 2.57, "grad_norm": 0.3343121767672678, "learning_rate": 0.00010030860389656305, "loss": 0.0663, "step": 270 }, { "epoch": 2.57, "grad_norm": 0.3516117623617434, "learning_rate": 9.969139610343696e-05, "loss": 0.0662, "step": 271 }, { "epoch": 2.58, "grad_norm": 0.31796912631433194, "learning_rate": 9.907420006643619e-05, "loss": 0.0624, "step": 272 }, { "epoch": 2.59, "grad_norm": 0.29219460425245597, "learning_rate": 9.845703929736351e-05, "loss": 0.0596, "step": 273 }, { "epoch": 2.6, "grad_norm": 0.316635170830544, "learning_rate": 9.783993730667831e-05, "loss": 0.0659, "step": 274 }, { "epoch": 2.61, "grad_norm": 0.33766616368603597, "learning_rate": 9.722291760260077e-05, "loss": 0.0646, "step": 275 }, { "epoch": 2.62, "grad_norm": 0.31287192455811574, "learning_rate": 9.66060036902163e-05, "loss": 0.0585, "step": 276 }, { "epoch": 2.63, "grad_norm": 0.28964582015181484, "learning_rate": 9.598921907058033e-05, "loss": 0.0543, "step": 277 }, { "epoch": 2.64, "grad_norm": 0.3037919396698326, "learning_rate": 9.53725872398229e-05, "loss": 0.0512, "step": 278 }, { "epoch": 2.65, "grad_norm": 0.3229974938313004, "learning_rate": 9.475613168825374e-05, "loss": 0.0531, "step": 279 }, { "epoch": 2.66, "grad_norm": 0.29881091304580676, "learning_rate": 9.413987589946711e-05, "loss": 0.0569, "step": 280 }, { "epoch": 2.67, "grad_norm": 0.29692909307641674, "learning_rate": 9.352384334944753e-05, "loss": 0.0547, "step": 281 }, { "epoch": 2.68, "grad_norm": 0.33439942628885455, "learning_rate": 9.290805750567532e-05, "loss": 0.0622, "step": 282 }, { "epoch": 2.69, "grad_norm": 0.2991141437988068, "learning_rate": 9.22925418262326e-05, "loss": 0.0464, "step": 283 }, { "epoch": 2.7, "grad_norm": 0.3171911760038229, "learning_rate": 9.167731975890976e-05, "loss": 0.059, "step": 284 }, { "epoch": 2.71, "grad_norm": 0.30072460150102115, "learning_rate": 9.106241474031212e-05, "loss": 0.0559, "step": 285 }, { "epoch": 2.72, "grad_norm": 0.3301896190647226, "learning_rate": 9.04478501949672e-05, "loss": 0.0514, "step": 286 }, { "epoch": 2.73, "grad_norm": 0.3298071637508188, "learning_rate": 8.983364953443227e-05, "loss": 0.0618, "step": 287 }, { "epoch": 2.74, "grad_norm": 0.3497185839244567, "learning_rate": 8.921983615640277e-05, "loss": 0.065, "step": 288 }, { "epoch": 2.75, "grad_norm": 0.33084725547728233, "learning_rate": 8.860643344382056e-05, "loss": 0.0527, "step": 289 }, { "epoch": 2.76, "grad_norm": 0.33012822636415956, "learning_rate": 8.79934647639835e-05, "loss": 0.0666, "step": 290 }, { "epoch": 2.76, "grad_norm": 0.3151687548518561, "learning_rate": 8.738095346765518e-05, "loss": 0.0573, "step": 291 }, { "epoch": 2.77, "grad_norm": 0.30346203875619676, "learning_rate": 8.676892288817531e-05, "loss": 0.0491, "step": 292 }, { "epoch": 2.78, "grad_norm": 0.3133369298353677, "learning_rate": 8.615739634057098e-05, "loss": 0.0595, "step": 293 }, { "epoch": 2.79, "grad_norm": 0.28715782085999497, "learning_rate": 8.554639712066836e-05, "loss": 0.0542, "step": 294 }, { "epoch": 2.8, "grad_norm": 0.2815995010771035, "learning_rate": 8.493594850420537e-05, "loss": 0.0551, "step": 295 }, { "epoch": 2.81, "grad_norm": 0.280576878443274, "learning_rate": 8.432607374594484e-05, "loss": 0.0488, "step": 296 }, { "epoch": 2.82, "grad_norm": 0.298809991890747, "learning_rate": 8.371679607878884e-05, "loss": 0.0544, "step": 297 }, { "epoch": 2.83, "grad_norm": 0.30088222272143067, "learning_rate": 8.310813871289348e-05, "loss": 0.0591, "step": 298 }, { "epoch": 2.84, "grad_norm": 0.3237358977236424, "learning_rate": 8.250012483478478e-05, "loss": 0.0547, "step": 299 }, { "epoch": 2.85, "grad_norm": 0.34075237005827885, "learning_rate": 8.189277760647537e-05, "loss": 0.0566, "step": 300 }, { "epoch": 2.85, "eval_blimp_filtered_avg": 0.7037313432835821, "eval_blimp_filtered_std": 0.005058972315437875, "step": 300 }, { "epoch": 2.85, "eval_blimp_supplement_avg": 0.8103448275862069, "eval_blimp_supplement_std": 0.017321145118445798, "step": 300 }, { "epoch": 2.85, "eval_vqa_filtered_avg": 0.53, "eval_vqa_filtered_std": 0.0501613558046592, "step": 300 }, { "epoch": 2.85, "eval_winoground_filtered_avg": 0.68, "eval_winoground_filtered_std": 0.046882617226215034, "step": 300 }, { "epoch": 2.86, "grad_norm": 0.3237263865460515, "learning_rate": 8.128612016458215e-05, "loss": 0.059, "step": 301 }, { "epoch": 2.87, "grad_norm": 0.2977357286247905, "learning_rate": 8.068017561944499e-05, "loss": 0.0492, "step": 302 }, { "epoch": 2.88, "grad_norm": 0.29591506818063545, "learning_rate": 8.00749670542462e-05, "loss": 0.052, "step": 303 }, { "epoch": 2.89, "grad_norm": 0.2789469075911483, "learning_rate": 7.94705175241313e-05, "loss": 0.0455, "step": 304 }, { "epoch": 2.9, "grad_norm": 0.2997082343784124, "learning_rate": 7.886685005533072e-05, "loss": 0.0498, "step": 305 }, { "epoch": 2.91, "grad_norm": 0.30157528073661777, "learning_rate": 7.82639876442826e-05, "loss": 0.0567, "step": 306 }, { "epoch": 2.92, "grad_norm": 0.32803298910194756, "learning_rate": 7.76619532567568e-05, "loss": 0.0622, "step": 307 }, { "epoch": 2.93, "grad_norm": 0.28556449374878695, "learning_rate": 7.706076982697999e-05, "loss": 0.0489, "step": 308 }, { "epoch": 2.94, "grad_norm": 0.32287162854623286, "learning_rate": 7.646046025676198e-05, "loss": 0.066, "step": 309 }, { "epoch": 2.95, "grad_norm": 0.3384064716667544, "learning_rate": 7.586104741462325e-05, "loss": 0.0629, "step": 310 }, { "epoch": 2.95, "grad_norm": 0.3005901634146794, "learning_rate": 7.526255413492395e-05, "loss": 0.051, "step": 311 }, { "epoch": 2.96, "grad_norm": 0.2907146546357962, "learning_rate": 7.466500321699383e-05, "loss": 0.0546, "step": 312 }, { "epoch": 2.97, "grad_norm": 0.30779520364750435, "learning_rate": 7.40684174242638e-05, "loss": 0.058, "step": 313 }, { "epoch": 2.98, "grad_norm": 0.29074373091101263, "learning_rate": 7.347281948339879e-05, "loss": 0.0463, "step": 314 }, { "epoch": 2.99, "grad_norm": 0.32970798475445445, "learning_rate": 7.287823208343192e-05, "loss": 0.0589, "step": 315 }, { "epoch": 3.0, "grad_norm": 0.2798345327195924, "learning_rate": 7.228467787490028e-05, "loss": 0.0438, "step": 316 }, { "epoch": 3.01, "grad_norm": 0.18326848967204043, "learning_rate": 7.169217946898197e-05, "loss": 0.0225, "step": 317 }, { "epoch": 3.02, "grad_norm": 0.18022372679373735, "learning_rate": 7.110075943663472e-05, "loss": 0.0161, "step": 318 }, { "epoch": 3.03, "grad_norm": 0.1633153575928502, "learning_rate": 7.051044030773618e-05, "loss": 0.0153, "step": 319 }, { "epoch": 3.04, "grad_norm": 0.17802284328446474, "learning_rate": 6.992124457022553e-05, "loss": 0.0176, "step": 320 }, { "epoch": 3.05, "grad_norm": 0.17359891604740127, "learning_rate": 6.933319466924693e-05, "loss": 0.0162, "step": 321 }, { "epoch": 3.06, "grad_norm": 0.2202987501804585, "learning_rate": 6.874631300629435e-05, "loss": 0.0162, "step": 322 }, { "epoch": 3.07, "grad_norm": 0.22277821921264357, "learning_rate": 6.81606219383583e-05, "loss": 0.0187, "step": 323 }, { "epoch": 3.08, "grad_norm": 0.18724963681022663, "learning_rate": 6.757614377707409e-05, "loss": 0.0153, "step": 324 }, { "epoch": 3.09, "grad_norm": 0.21995220887794256, "learning_rate": 6.699290078787193e-05, "loss": 0.0188, "step": 325 }, { "epoch": 3.1, "grad_norm": 0.1967935793635855, "learning_rate": 6.641091518912867e-05, "loss": 0.0156, "step": 326 }, { "epoch": 3.11, "grad_norm": 0.20661934683104752, "learning_rate": 6.583020915132152e-05, "loss": 0.0158, "step": 327 }, { "epoch": 3.12, "grad_norm": 0.2422474266231083, "learning_rate": 6.525080479618331e-05, "loss": 0.0177, "step": 328 }, { "epoch": 3.13, "grad_norm": 0.18354685059507367, "learning_rate": 6.467272419585984e-05, "loss": 0.013, "step": 329 }, { "epoch": 3.14, "grad_norm": 0.22423754187379397, "learning_rate": 6.40959893720692e-05, "loss": 0.0188, "step": 330 }, { "epoch": 3.14, "grad_norm": 0.18994008796265852, "learning_rate": 6.352062229526266e-05, "loss": 0.0132, "step": 331 }, { "epoch": 3.15, "grad_norm": 0.24715301748493912, "learning_rate": 6.294664488378776e-05, "loss": 0.015, "step": 332 }, { "epoch": 3.16, "grad_norm": 0.17280498203848704, "learning_rate": 6.237407900305335e-05, "loss": 0.0138, "step": 333 }, { "epoch": 3.17, "grad_norm": 0.21773200395950232, "learning_rate": 6.180294646469679e-05, "loss": 0.0155, "step": 334 }, { "epoch": 3.18, "grad_norm": 0.2144971485793242, "learning_rate": 6.123326902575282e-05, "loss": 0.0158, "step": 335 }, { "epoch": 3.19, "grad_norm": 0.18331926033535073, "learning_rate": 6.06650683878248e-05, "loss": 0.013, "step": 336 }, { "epoch": 3.2, "grad_norm": 0.1788180130126268, "learning_rate": 6.009836619625809e-05, "loss": 0.0133, "step": 337 }, { "epoch": 3.21, "grad_norm": 0.20337677688861636, "learning_rate": 5.953318403931532e-05, "loss": 0.0129, "step": 338 }, { "epoch": 3.22, "grad_norm": 0.20853998405220736, "learning_rate": 5.896954344735426e-05, "loss": 0.0176, "step": 339 }, { "epoch": 3.23, "grad_norm": 0.1919639102705018, "learning_rate": 5.840746589200732e-05, "loss": 0.0144, "step": 340 }, { "epoch": 3.24, "grad_norm": 0.2134469059873606, "learning_rate": 5.784697278536379e-05, "loss": 0.0138, "step": 341 }, { "epoch": 3.25, "grad_norm": 0.18435084201272836, "learning_rate": 5.728808547915405e-05, "loss": 0.0135, "step": 342 }, { "epoch": 3.26, "grad_norm": 0.19554570393158438, "learning_rate": 5.673082526393634e-05, "loss": 0.015, "step": 343 }, { "epoch": 3.27, "grad_norm": 0.18522448379098544, "learning_rate": 5.617521336828556e-05, "loss": 0.0129, "step": 344 }, { "epoch": 3.28, "grad_norm": 0.190207008998555, "learning_rate": 5.5621270957984573e-05, "loss": 0.0161, "step": 345 }, { "epoch": 3.29, "grad_norm": 0.19594053008897275, "learning_rate": 5.506901913521808e-05, "loss": 0.0162, "step": 346 }, { "epoch": 3.3, "grad_norm": 0.20111569255746164, "learning_rate": 5.451847893776845e-05, "loss": 0.0147, "step": 347 }, { "epoch": 3.31, "grad_norm": 0.20867562278084897, "learning_rate": 5.396967133821461e-05, "loss": 0.0154, "step": 348 }, { "epoch": 3.32, "grad_norm": 0.16028325232055693, "learning_rate": 5.342261724313292e-05, "loss": 0.0117, "step": 349 }, { "epoch": 3.33, "grad_norm": 0.14992620939570764, "learning_rate": 5.28773374923007e-05, "loss": 0.0106, "step": 350 }, { "epoch": 3.33, "grad_norm": 0.20669460754401175, "learning_rate": 5.2333852857902575e-05, "loss": 0.0161, "step": 351 }, { "epoch": 3.34, "grad_norm": 0.21934716169620833, "learning_rate": 5.1792184043738855e-05, "loss": 0.0128, "step": 352 }, { "epoch": 3.35, "grad_norm": 0.18204794157825063, "learning_rate": 5.1252351684437136e-05, "loss": 0.0129, "step": 353 }, { "epoch": 3.36, "grad_norm": 0.21363608639584963, "learning_rate": 5.071437634466609e-05, "loss": 0.0105, "step": 354 }, { "epoch": 3.37, "grad_norm": 0.15881770971724649, "learning_rate": 5.0178278518351983e-05, "loss": 0.0096, "step": 355 }, { "epoch": 3.38, "grad_norm": 0.1980006966366768, "learning_rate": 4.964407862789817e-05, "loss": 0.0119, "step": 356 }, { "epoch": 3.39, "grad_norm": 0.21004802159627842, "learning_rate": 4.911179702340688e-05, "loss": 0.0119, "step": 357 }, { "epoch": 3.4, "grad_norm": 0.20419756258161648, "learning_rate": 4.85814539819042e-05, "loss": 0.0145, "step": 358 }, { "epoch": 3.41, "grad_norm": 0.1565818058300373, "learning_rate": 4.8053069706567554e-05, "loss": 0.0105, "step": 359 }, { "epoch": 3.42, "grad_norm": 0.19501698471957343, "learning_rate": 4.752666432595596e-05, "loss": 0.0126, "step": 360 }, { "epoch": 3.43, "grad_norm": 0.20941486180216556, "learning_rate": 4.700225789324343e-05, "loss": 0.0105, "step": 361 }, { "epoch": 3.44, "grad_norm": 0.18304197382791004, "learning_rate": 4.647987038545496e-05, "loss": 0.011, "step": 362 }, { "epoch": 3.45, "grad_norm": 0.16720171411001336, "learning_rate": 4.595952170270542e-05, "loss": 0.0112, "step": 363 }, { "epoch": 3.46, "grad_norm": 0.22478251297433013, "learning_rate": 4.544123166744172e-05, "loss": 0.0118, "step": 364 }, { "epoch": 3.47, "grad_norm": 0.1598572948562243, "learning_rate": 4.492502002368738e-05, "loss": 0.0107, "step": 365 }, { "epoch": 3.48, "grad_norm": 0.22373563049772874, "learning_rate": 4.4410906436290566e-05, "loss": 0.0104, "step": 366 }, { "epoch": 3.49, "grad_norm": 0.16802667132434534, "learning_rate": 4.38989104901751e-05, "loss": 0.0114, "step": 367 }, { "epoch": 3.5, "grad_norm": 0.24550738449688075, "learning_rate": 4.3389051689594e-05, "loss": 0.0121, "step": 368 }, { "epoch": 3.51, "grad_norm": 0.1660066244443363, "learning_rate": 4.288134945738684e-05, "loss": 0.0099, "step": 369 }, { "epoch": 3.52, "grad_norm": 0.1783889244909253, "learning_rate": 4.237582313423962e-05, "loss": 0.0094, "step": 370 }, { "epoch": 3.52, "grad_norm": 0.17141038466777303, "learning_rate": 4.187249197794813e-05, "loss": 0.0095, "step": 371 }, { "epoch": 3.53, "grad_norm": 0.1893721805088239, "learning_rate": 4.137137516268426e-05, "loss": 0.013, "step": 372 }, { "epoch": 3.54, "grad_norm": 0.16935951673752134, "learning_rate": 4.0872491778265535e-05, "loss": 0.0091, "step": 373 }, { "epoch": 3.55, "grad_norm": 0.13309068523326859, "learning_rate": 4.037586082942805e-05, "loss": 0.0091, "step": 374 }, { "epoch": 3.56, "grad_norm": 0.18791651271841342, "learning_rate": 3.988150123510224e-05, "loss": 0.0121, "step": 375 }, { "epoch": 3.57, "grad_norm": 0.1559825545952661, "learning_rate": 3.938943182769246e-05, "loss": 0.0102, "step": 376 }, { "epoch": 3.58, "grad_norm": 0.2261919531211638, "learning_rate": 3.88996713523594e-05, "loss": 0.0127, "step": 377 }, { "epoch": 3.59, "grad_norm": 0.20792420146527377, "learning_rate": 3.841223846630599e-05, "loss": 0.013, "step": 378 }, { "epoch": 3.6, "grad_norm": 0.16486082885129608, "learning_rate": 3.792715173806669e-05, "loss": 0.0105, "step": 379 }, { "epoch": 3.61, "grad_norm": 0.1549020176177142, "learning_rate": 3.74444296468002e-05, "loss": 0.0098, "step": 380 }, { "epoch": 3.62, "grad_norm": 0.17250200199106172, "learning_rate": 3.696409058158544e-05, "loss": 0.0109, "step": 381 }, { "epoch": 3.63, "grad_norm": 0.1415293330470341, "learning_rate": 3.6486152840721046e-05, "loss": 0.0084, "step": 382 }, { "epoch": 3.64, "grad_norm": 0.14461810975420877, "learning_rate": 3.6010634631028226e-05, "loss": 0.0084, "step": 383 }, { "epoch": 3.65, "grad_norm": 0.1557012557289619, "learning_rate": 3.553755406715724e-05, "loss": 0.0089, "step": 384 }, { "epoch": 3.66, "grad_norm": 0.15752891661687976, "learning_rate": 3.506692917089751e-05, "loss": 0.0109, "step": 385 }, { "epoch": 3.67, "grad_norm": 0.1694876915505117, "learning_rate": 3.459877787049072e-05, "loss": 0.009, "step": 386 }, { "epoch": 3.68, "grad_norm": 0.1582663784415179, "learning_rate": 3.413311799994808e-05, "loss": 0.0095, "step": 387 }, { "epoch": 3.69, "grad_norm": 0.13693031068741818, "learning_rate": 3.366996729837102e-05, "loss": 0.0092, "step": 388 }, { "epoch": 3.7, "grad_norm": 0.14543112940410688, "learning_rate": 3.320934340927513e-05, "loss": 0.0108, "step": 389 }, { "epoch": 3.71, "grad_norm": 0.19389482832864774, "learning_rate": 3.275126387991847e-05, "loss": 0.0098, "step": 390 }, { "epoch": 3.71, "grad_norm": 0.15797165592004603, "learning_rate": 3.229574616063268e-05, "loss": 0.0076, "step": 391 }, { "epoch": 3.72, "grad_norm": 0.21281942854700847, "learning_rate": 3.184280760415843e-05, "loss": 0.0142, "step": 392 }, { "epoch": 3.73, "grad_norm": 0.12498130411986656, "learning_rate": 3.1392465464984455e-05, "loss": 0.0081, "step": 393 }, { "epoch": 3.74, "grad_norm": 0.1152125429659436, "learning_rate": 3.094473689869002e-05, "loss": 0.0058, "step": 394 }, { "epoch": 3.75, "grad_norm": 0.1567733530080216, "learning_rate": 3.0499638961291623e-05, "loss": 0.011, "step": 395 }, { "epoch": 3.76, "grad_norm": 0.14500898906990572, "learning_rate": 3.0057188608593147e-05, "loss": 0.0085, "step": 396 }, { "epoch": 3.77, "grad_norm": 0.16163974543952728, "learning_rate": 2.9617402695539808e-05, "loss": 0.013, "step": 397 }, { "epoch": 3.78, "grad_norm": 0.13868168811451842, "learning_rate": 2.9180297975576364e-05, "loss": 0.0084, "step": 398 }, { "epoch": 3.79, "grad_norm": 0.17847032901949134, "learning_rate": 2.8745891100008683e-05, "loss": 0.0121, "step": 399 }, { "epoch": 3.8, "grad_norm": 0.17527442252411723, "learning_rate": 2.83141986173694e-05, "loss": 0.0084, "step": 400 }, { "epoch": 3.8, "eval_blimp_filtered_avg": 0.7053731343283582, "eval_blimp_filtered_std": 0.005043001462199571, "step": 400 }, { "epoch": 3.8, "eval_blimp_supplement_avg": 0.8125, "eval_blimp_supplement_std": 0.01736311122127593, "step": 400 }, { "epoch": 3.8, "eval_vqa_filtered_avg": 0.52, "eval_vqa_filtered_std": 0.05021167315686779, "step": 400 }, { "epoch": 3.8, "eval_winoground_filtered_avg": 0.64, "eval_winoground_filtered_std": 0.048241815132442176, "step": 400 }, { "epoch": 3.81, "grad_norm": 0.14598157841040266, "learning_rate": 2.788523697278773e-05, "loss": 0.0093, "step": 401 }, { "epoch": 3.82, "grad_norm": 0.20150542514971506, "learning_rate": 2.7459022507362686e-05, "loss": 0.0122, "step": 402 }, { "epoch": 3.83, "grad_norm": 0.18255123614923588, "learning_rate": 2.7035571457540865e-05, "loss": 0.0103, "step": 403 }, { "epoch": 3.84, "grad_norm": 0.16704045474943452, "learning_rate": 2.6614899954497795e-05, "loss": 0.0114, "step": 404 }, { "epoch": 3.85, "grad_norm": 0.14683721625679494, "learning_rate": 2.619702402352332e-05, "loss": 0.01, "step": 405 }, { "epoch": 3.86, "grad_norm": 0.18144743721435366, "learning_rate": 2.5781959583411374e-05, "loss": 0.0129, "step": 406 }, { "epoch": 3.87, "grad_norm": 0.19646570441433073, "learning_rate": 2.5369722445853304e-05, "loss": 0.0143, "step": 407 }, { "epoch": 3.88, "grad_norm": 0.1668088181727681, "learning_rate": 2.4960328314835745e-05, "loss": 0.0089, "step": 408 }, { "epoch": 3.89, "grad_norm": 0.16111476451284476, "learning_rate": 2.4553792786042262e-05, "loss": 0.0091, "step": 409 }, { "epoch": 3.9, "grad_norm": 0.17729690845562673, "learning_rate": 2.4150131346259197e-05, "loss": 0.0103, "step": 410 }, { "epoch": 3.9, "grad_norm": 0.15155895346947004, "learning_rate": 2.3749359372785883e-05, "loss": 0.0096, "step": 411 }, { "epoch": 3.91, "grad_norm": 0.15041370885333255, "learning_rate": 2.3351492132848664e-05, "loss": 0.0085, "step": 412 }, { "epoch": 3.92, "grad_norm": 0.12197907148956355, "learning_rate": 2.2956544783019418e-05, "loss": 0.0067, "step": 413 }, { "epoch": 3.93, "grad_norm": 0.1788434056496877, "learning_rate": 2.2564532368638146e-05, "loss": 0.01, "step": 414 }, { "epoch": 3.94, "grad_norm": 0.19269466130772045, "learning_rate": 2.2175469823239768e-05, "loss": 0.0117, "step": 415 }, { "epoch": 3.95, "grad_norm": 0.15780826445252463, "learning_rate": 2.1789371967985338e-05, "loss": 0.0101, "step": 416 }, { "epoch": 3.96, "grad_norm": 0.19229144408434373, "learning_rate": 2.140625351109733e-05, "loss": 0.0084, "step": 417 }, { "epoch": 3.97, "grad_norm": 0.15474486143047034, "learning_rate": 2.1026129047299436e-05, "loss": 0.0067, "step": 418 }, { "epoch": 3.98, "grad_norm": 0.15864166155594778, "learning_rate": 2.0649013057260546e-05, "loss": 0.0098, "step": 419 }, { "epoch": 3.99, "grad_norm": 0.22515244613844015, "learning_rate": 2.0274919907043033e-05, "loss": 0.0094, "step": 420 }, { "epoch": 4.0, "grad_norm": 0.18684872878382638, "learning_rate": 1.9903863847555648e-05, "loss": 0.0127, "step": 421 }, { "epoch": 4.01, "grad_norm": 0.06270483785922072, "learning_rate": 1.9535859014010526e-05, "loss": 0.0028, "step": 422 }, { "epoch": 4.02, "grad_norm": 0.09948637260912774, "learning_rate": 1.917091942538469e-05, "loss": 0.0037, "step": 423 }, { "epoch": 4.03, "grad_norm": 0.07530065845248647, "learning_rate": 1.880905898388612e-05, "loss": 0.0039, "step": 424 }, { "epoch": 4.04, "grad_norm": 0.054461890750773165, "learning_rate": 1.8450291474423998e-05, "loss": 0.0025, "step": 425 }, { "epoch": 4.05, "grad_norm": 0.08002877578075594, "learning_rate": 1.8094630564083736e-05, "loss": 0.0035, "step": 426 }, { "epoch": 4.06, "grad_norm": 0.05746226463965698, "learning_rate": 1.7742089801606276e-05, "loss": 0.0025, "step": 427 }, { "epoch": 4.07, "grad_norm": 0.0633358139605444, "learning_rate": 1.7392682616871837e-05, "loss": 0.0027, "step": 428 }, { "epoch": 4.08, "grad_norm": 0.06509683268742919, "learning_rate": 1.7046422320388556e-05, "loss": 0.0027, "step": 429 }, { "epoch": 4.09, "grad_norm": 0.054571154616853274, "learning_rate": 1.6703322102785168e-05, "loss": 0.0026, "step": 430 }, { "epoch": 4.1, "grad_norm": 0.06888564779650448, "learning_rate": 1.6363395034308703e-05, "loss": 0.0027, "step": 431 }, { "epoch": 4.1, "grad_norm": 0.05307117129834359, "learning_rate": 1.6026654064326553e-05, "loss": 0.0025, "step": 432 }, { "epoch": 4.11, "grad_norm": 0.06598879328529111, "learning_rate": 1.5693112020833013e-05, "loss": 0.003, "step": 433 }, { "epoch": 4.12, "grad_norm": 0.054752236275106794, "learning_rate": 1.5362781609960852e-05, "loss": 0.0025, "step": 434 }, { "epoch": 4.13, "grad_norm": 0.07106963888787232, "learning_rate": 1.5035675415497063e-05, "loss": 0.0031, "step": 435 }, { "epoch": 4.14, "grad_norm": 0.052548572683446884, "learning_rate": 1.471180589840363e-05, "loss": 0.0025, "step": 436 }, { "epoch": 4.15, "grad_norm": 0.08828036910254508, "learning_rate": 1.4391185396342789e-05, "loss": 0.0038, "step": 437 }, { "epoch": 4.16, "grad_norm": 0.09463459893212552, "learning_rate": 1.4073826123206946e-05, "loss": 0.0038, "step": 438 }, { "epoch": 4.17, "grad_norm": 0.08002928457971342, "learning_rate": 1.375974016865359e-05, "loss": 0.0031, "step": 439 }, { "epoch": 4.18, "grad_norm": 0.07631532690730236, "learning_rate": 1.3448939497644509e-05, "loss": 0.0031, "step": 440 }, { "epoch": 4.19, "grad_norm": 0.04831761603516682, "learning_rate": 1.3141435949990188e-05, "loss": 0.0027, "step": 441 }, { "epoch": 4.2, "grad_norm": 0.07344003153336562, "learning_rate": 1.2837241239898667e-05, "loss": 0.0032, "step": 442 }, { "epoch": 4.21, "grad_norm": 0.08305075630986966, "learning_rate": 1.253636695552931e-05, "loss": 0.003, "step": 443 }, { "epoch": 4.22, "grad_norm": 0.1034575433958594, "learning_rate": 1.2238824558551365e-05, "loss": 0.0039, "step": 444 }, { "epoch": 4.23, "grad_norm": 0.06655324788558148, "learning_rate": 1.1944625383707374e-05, "loss": 0.003, "step": 445 }, { "epoch": 4.24, "grad_norm": 0.0790599253839735, "learning_rate": 1.1653780638381328e-05, "loss": 0.0029, "step": 446 }, { "epoch": 4.25, "grad_norm": 0.04198685628145689, "learning_rate": 1.1366301402171775e-05, "loss": 0.0017, "step": 447 }, { "epoch": 4.26, "grad_norm": 0.06439353264983554, "learning_rate": 1.1082198626469686e-05, "loss": 0.0024, "step": 448 }, { "epoch": 4.27, "grad_norm": 0.07762450043477247, "learning_rate": 1.0801483134041268e-05, "loss": 0.0027, "step": 449 }, { "epoch": 4.28, "grad_norm": 0.07856883953783565, "learning_rate": 1.0524165618615845e-05, "loss": 0.0033, "step": 450 }, { "epoch": 4.29, "grad_norm": 0.07929308057852809, "learning_rate": 1.0250256644478195e-05, "loss": 0.003, "step": 451 }, { "epoch": 4.29, "grad_norm": 0.0587512154822952, "learning_rate": 9.979766646066368e-06, "loss": 0.0027, "step": 452 }, { "epoch": 4.3, "grad_norm": 0.06109551507247056, "learning_rate": 9.71270592757404e-06, "loss": 0.0032, "step": 453 }, { "epoch": 4.31, "grad_norm": 0.05909029031199419, "learning_rate": 9.449084662557982e-06, "loss": 0.0026, "step": 454 }, { "epoch": 4.32, "grad_norm": 0.0814055458144323, "learning_rate": 9.188912893550695e-06, "loss": 0.0026, "step": 455 }, { "epoch": 4.33, "grad_norm": 0.07735385332942207, "learning_rate": 8.932200531677537e-06, "loss": 0.0028, "step": 456 }, { "epoch": 4.34, "grad_norm": 0.08519595591969155, "learning_rate": 8.678957356279371e-06, "loss": 0.0024, "step": 457 }, { "epoch": 4.35, "grad_norm": 0.055031384326470804, "learning_rate": 8.429193014540015e-06, "loss": 0.0026, "step": 458 }, { "epoch": 4.36, "grad_norm": 0.05387324401647046, "learning_rate": 8.182917021118663e-06, "loss": 0.0026, "step": 459 }, { "epoch": 4.37, "grad_norm": 0.07168879976269556, "learning_rate": 7.940138757787507e-06, "loss": 0.0032, "step": 460 }, { "epoch": 4.38, "grad_norm": 0.07661756681904786, "learning_rate": 7.700867473074224e-06, "loss": 0.0035, "step": 461 }, { "epoch": 4.39, "grad_norm": 0.09486930411075328, "learning_rate": 7.46511228190977e-06, "loss": 0.0049, "step": 462 }, { "epoch": 4.4, "grad_norm": 0.0679530025111762, "learning_rate": 7.232882165281141e-06, "loss": 0.0026, "step": 463 }, { "epoch": 4.41, "grad_norm": 0.06514922044267304, "learning_rate": 7.004185969889187e-06, "loss": 0.0027, "step": 464 }, { "epoch": 4.42, "grad_norm": 0.06706026131022384, "learning_rate": 6.7790324078116364e-06, "loss": 0.0027, "step": 465 }, { "epoch": 4.43, "grad_norm": 0.07709046890424658, "learning_rate": 6.557430056171221e-06, "loss": 0.0033, "step": 466 }, { "epoch": 4.44, "grad_norm": 0.051443041020356704, "learning_rate": 6.339387356808912e-06, "loss": 0.0026, "step": 467 }, { "epoch": 4.45, "grad_norm": 0.060318722923432995, "learning_rate": 6.124912615962341e-06, "loss": 0.0028, "step": 468 }, { "epoch": 4.46, "grad_norm": 0.062212012735137795, "learning_rate": 5.9140140039494084e-06, "loss": 0.0025, "step": 469 }, { "epoch": 4.47, "grad_norm": 0.06556299474776538, "learning_rate": 5.706699554856964e-06, "loss": 0.0023, "step": 470 }, { "epoch": 4.48, "grad_norm": 0.08649267044276539, "learning_rate": 5.502977166234857e-06, "loss": 0.0035, "step": 471 }, { "epoch": 4.48, "grad_norm": 0.08526822145924882, "learning_rate": 5.302854598794937e-06, "loss": 0.003, "step": 472 }, { "epoch": 4.49, "grad_norm": 0.04133711118453636, "learning_rate": 5.106339476115596e-06, "loss": 0.0019, "step": 473 }, { "epoch": 4.5, "grad_norm": 0.05708577094578342, "learning_rate": 4.913439284351207e-06, "loss": 0.0026, "step": 474 }, { "epoch": 4.51, "grad_norm": 0.07367912633186298, "learning_rate": 4.724161371946978e-06, "loss": 0.0029, "step": 475 }, { "epoch": 4.52, "grad_norm": 0.08135320771271103, "learning_rate": 4.538512949359075e-06, "loss": 0.0027, "step": 476 }, { "epoch": 4.53, "grad_norm": 0.0849858165893086, "learning_rate": 4.356501088779841e-06, "loss": 0.0027, "step": 477 }, { "epoch": 4.54, "grad_norm": 0.05260609110954984, "learning_rate": 4.178132723868477e-06, "loss": 0.0019, "step": 478 }, { "epoch": 4.55, "grad_norm": 0.0795477617292828, "learning_rate": 4.003414649486892e-06, "loss": 0.0032, "step": 479 }, { "epoch": 4.56, "grad_norm": 0.08161922179718771, "learning_rate": 3.832353521440768e-06, "loss": 0.0026, "step": 480 }, { "epoch": 4.57, "grad_norm": 0.06830643544893618, "learning_rate": 3.6649558562261375e-06, "loss": 0.0032, "step": 481 }, { "epoch": 4.58, "grad_norm": 0.08641205617098656, "learning_rate": 3.501228030781034e-06, "loss": 0.0028, "step": 482 }, { "epoch": 4.59, "grad_norm": 0.04921706287498077, "learning_rate": 3.341176282242653e-06, "loss": 0.0021, "step": 483 }, { "epoch": 4.6, "grad_norm": 0.05901589705081983, "learning_rate": 3.184806707709698e-06, "loss": 0.0027, "step": 484 }, { "epoch": 4.61, "grad_norm": 0.08562934355546689, "learning_rate": 3.0321252640100885e-06, "loss": 0.0035, "step": 485 }, { "epoch": 4.62, "grad_norm": 0.056139936545776606, "learning_rate": 2.88313776747412e-06, "loss": 0.0027, "step": 486 }, { "epoch": 4.63, "grad_norm": 0.06574452787357139, "learning_rate": 2.7378498937128404e-06, "loss": 0.0031, "step": 487 }, { "epoch": 4.64, "grad_norm": 0.06295208396607756, "learning_rate": 2.5962671774018234e-06, "loss": 0.0029, "step": 488 }, { "epoch": 4.65, "grad_norm": 0.06348707610420529, "learning_rate": 2.458395012070369e-06, "loss": 0.0027, "step": 489 }, { "epoch": 4.66, "grad_norm": 0.06438459591992919, "learning_rate": 2.3242386498960266e-06, "loss": 0.003, "step": 490 }, { "epoch": 4.67, "grad_norm": 0.0936033257355208, "learning_rate": 2.1938032015044964e-06, "loss": 0.0053, "step": 491 }, { "epoch": 4.67, "grad_norm": 0.0712704009642112, "learning_rate": 2.067093635774975e-06, "loss": 0.0033, "step": 492 }, { "epoch": 4.68, "grad_norm": 0.05278839840964536, "learning_rate": 1.9441147796508407e-06, "loss": 0.0025, "step": 493 }, { "epoch": 4.69, "grad_norm": 0.05158800004403027, "learning_rate": 1.8248713179557786e-06, "loss": 0.002, "step": 494 }, { "epoch": 4.7, "grad_norm": 0.06302315225352234, "learning_rate": 1.7093677932153218e-06, "loss": 0.002, "step": 495 }, { "epoch": 4.71, "grad_norm": 0.09014451602286425, "learning_rate": 1.5976086054838025e-06, "loss": 0.0031, "step": 496 }, { "epoch": 4.72, "grad_norm": 0.08249201483869177, "learning_rate": 1.4895980121767627e-06, "loss": 0.0029, "step": 497 }, { "epoch": 4.73, "grad_norm": 0.07887788932672342, "learning_rate": 1.3853401279086854e-06, "loss": 0.0028, "step": 498 }, { "epoch": 4.74, "grad_norm": 0.09271365227044996, "learning_rate": 1.2848389243363512e-06, "loss": 0.0026, "step": 499 }, { "epoch": 4.75, "grad_norm": 0.05191622392926365, "learning_rate": 1.1880982300074838e-06, "loss": 0.0027, "step": 500 }, { "epoch": 4.75, "eval_blimp_filtered_avg": 0.7105970149253731, "eval_blimp_filtered_std": 0.005015059082306442, "step": 500 }, { "epoch": 4.75, "eval_blimp_supplement_avg": 0.8146551724137931, "eval_blimp_supplement_std": 0.01739418193453382, "step": 500 }, { "epoch": 4.75, "eval_vqa_filtered_avg": 0.52, "eval_vqa_filtered_std": 0.05021167315686779, "step": 500 }, { "epoch": 4.75, "eval_winoground_filtered_avg": 0.64, "eval_winoground_filtered_std": 0.048241815132442176, "step": 500 }, { "epoch": 4.76, "grad_norm": 0.06004512680198857, "learning_rate": 1.0951217302148986e-06, "loss": 0.0021, "step": 501 }, { "epoch": 4.77, "grad_norm": 0.07576379765393293, "learning_rate": 1.0059129668561707e-06, "loss": 0.0027, "step": 502 }, { "epoch": 4.78, "grad_norm": 0.0655321501764931, "learning_rate": 9.204753382986097e-07, "loss": 0.0029, "step": 503 }, { "epoch": 4.79, "grad_norm": 0.06668565079155468, "learning_rate": 8.388120992499083e-07, "loss": 0.0024, "step": 504 }, { "epoch": 4.8, "grad_norm": 0.08295379764022878, "learning_rate": 7.609263606340622e-07, "loss": 0.003, "step": 505 }, { "epoch": 4.81, "grad_norm": 0.05830372848137469, "learning_rate": 6.868210894729332e-07, "loss": 0.0027, "step": 506 }, { "epoch": 4.82, "grad_norm": 0.04555270319966449, "learning_rate": 6.164991087731831e-07, "loss": 0.0021, "step": 507 }, { "epoch": 4.83, "grad_norm": 0.057930715171302063, "learning_rate": 5.499630974187375e-07, "loss": 0.0024, "step": 508 }, { "epoch": 4.84, "grad_norm": 0.09648171217668358, "learning_rate": 4.872155900687347e-07, "loss": 0.0032, "step": 509 }, { "epoch": 4.85, "grad_norm": 0.08324119499167887, "learning_rate": 4.2825897706100235e-07, "loss": 0.0018, "step": 510 }, { "epoch": 4.86, "grad_norm": 0.05280884195269513, "learning_rate": 3.7309550432090835e-07, "loss": 0.003, "step": 511 }, { "epoch": 4.86, "grad_norm": 0.06550697689715686, "learning_rate": 3.217272732759402e-07, "loss": 0.0029, "step": 512 }, { "epoch": 4.87, "grad_norm": 0.07122072726515956, "learning_rate": 2.741562407755138e-07, "loss": 0.0026, "step": 513 }, { "epoch": 4.88, "grad_norm": 0.08524060823989948, "learning_rate": 2.3038421901651064e-07, "loss": 0.0032, "step": 514 }, { "epoch": 4.89, "grad_norm": 0.06597786647502633, "learning_rate": 1.9041287547424403e-07, "loss": 0.0026, "step": 515 }, { "epoch": 4.9, "grad_norm": 0.0640231658570345, "learning_rate": 1.5424373283889904e-07, "loss": 0.0025, "step": 516 }, { "epoch": 4.91, "grad_norm": 0.06369562949011548, "learning_rate": 1.2187816895752324e-07, "loss": 0.003, "step": 517 }, { "epoch": 4.92, "grad_norm": 0.05040741786604575, "learning_rate": 9.3317416781602e-08, "loss": 0.0021, "step": 518 }, { "epoch": 4.93, "grad_norm": 0.059709231647531516, "learning_rate": 6.856256432000718e-08, "loss": 0.0024, "step": 519 }, { "epoch": 4.94, "grad_norm": 0.07196915675318658, "learning_rate": 4.7614554597608105e-08, "loss": 0.0033, "step": 520 }, { "epoch": 4.95, "grad_norm": 0.057403114857655216, "learning_rate": 3.047418561933357e-08, "loss": 0.002, "step": 521 }, { "epoch": 4.96, "grad_norm": 0.08027211044033893, "learning_rate": 1.7142110339740668e-08, "loss": 0.003, "step": 522 }, { "epoch": 4.97, "grad_norm": 0.09851204603686081, "learning_rate": 7.618836638190186e-09, "loss": 0.0028, "step": 523 }, { "epoch": 4.98, "grad_norm": 0.0505999817391235, "learning_rate": 1.904727299473219e-09, "loss": 0.0023, "step": 524 }, { "epoch": 4.99, "grad_norm": 0.059018226862226256, "learning_rate": 0.0, "loss": 0.0029, "step": 525 }, { "epoch": 4.99, "step": 525, "total_flos": 415734656204800.0, "train_loss": 0.33918485829939266, "train_runtime": 37482.4854, "train_samples_per_second": 8.975, "train_steps_per_second": 0.014 } ], "logging_steps": 1.0, "max_steps": 525, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 415734656204800.0, "train_batch_size": 40, "trial_name": null, "trial_params": null }