|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.750593824228028, |
|
"eval_steps": 100, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.008147055910771, |
|
"learning_rate": 1.25e-05, |
|
"loss": 4.2415, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.04569203441769, |
|
"learning_rate": 2.5e-05, |
|
"loss": 4.3121, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.865746651377984, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 4.3208, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.6407193073379105, |
|
"learning_rate": 5e-05, |
|
"loss": 3.8848, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.451159328560232, |
|
"learning_rate": 6.25e-05, |
|
"loss": 3.4391, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.8259504797317525, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 3.0656, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1881779175566867, |
|
"learning_rate": 8.75e-05, |
|
"loss": 2.8135, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.614839668966139, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7319, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5198673994210212, |
|
"learning_rate": 0.00011250000000000001, |
|
"loss": 2.6903, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.0044025931610727, |
|
"learning_rate": 0.000125, |
|
"loss": 2.584, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.1531821793787296, |
|
"learning_rate": 0.0001375, |
|
"loss": 2.586, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6210600474209341, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 2.5298, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5025244204180619, |
|
"learning_rate": 0.00016250000000000002, |
|
"loss": 2.4665, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5058788641352842, |
|
"learning_rate": 0.000175, |
|
"loss": 2.4194, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.44571801666869537, |
|
"learning_rate": 0.0001875, |
|
"loss": 2.3531, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.44028009268534757, |
|
"learning_rate": 0.0002, |
|
"loss": 2.2749, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.42473118020142525, |
|
"learning_rate": 0.00019999809527270051, |
|
"loss": 2.2587, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.465029302165452, |
|
"learning_rate": 0.0001999923811633618, |
|
"loss": 2.2196, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.49040381415815754, |
|
"learning_rate": 0.00019998285788966027, |
|
"loss": 2.2061, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.4160855034634493, |
|
"learning_rate": 0.00019996952581438068, |
|
"loss": 2.1173, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.45625369964232165, |
|
"learning_rate": 0.00019995238544540241, |
|
"loss": 2.1267, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.42551849567803673, |
|
"learning_rate": 0.00019993143743568, |
|
"loss": 2.0976, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5100052595965069, |
|
"learning_rate": 0.0001999066825832184, |
|
"loss": 2.0428, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.4717525078599394, |
|
"learning_rate": 0.00019987812183104247, |
|
"loss": 2.0068, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5596905853419681, |
|
"learning_rate": 0.0001998457562671611, |
|
"loss": 2.0303, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4931645550169434, |
|
"learning_rate": 0.00019980958712452577, |
|
"loss": 1.9722, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4433810930704678, |
|
"learning_rate": 0.0001997696157809835, |
|
"loss": 1.957, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5522396650266582, |
|
"learning_rate": 0.0001997258437592245, |
|
"loss": 1.915, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.49861222066728145, |
|
"learning_rate": 0.00019967827272672408, |
|
"loss": 1.8303, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6169911964169147, |
|
"learning_rate": 0.00019962690449567912, |
|
"loss": 1.8454, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5639780725078123, |
|
"learning_rate": 0.000199571741022939, |
|
"loss": 1.8068, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6302805853808786, |
|
"learning_rate": 0.0001995127844099313, |
|
"loss": 1.7166, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6494693483139545, |
|
"learning_rate": 0.00019945003690258125, |
|
"loss": 1.6433, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7598443409498918, |
|
"learning_rate": 0.00019938350089122682, |
|
"loss": 1.7081, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6512764391881087, |
|
"learning_rate": 0.00019931317891052708, |
|
"loss": 1.6436, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6953537359048508, |
|
"learning_rate": 0.00019923907363936593, |
|
"loss": 1.5862, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6011387829084072, |
|
"learning_rate": 0.00019916118790075008, |
|
"loss": 1.5432, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.659130437748028, |
|
"learning_rate": 0.00019907952466170138, |
|
"loss": 1.5132, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7211467253555573, |
|
"learning_rate": 0.00019899408703314385, |
|
"loss": 1.506, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7006890038987398, |
|
"learning_rate": 0.0001989048782697851, |
|
"loss": 1.4498, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.64642158324997, |
|
"learning_rate": 0.00019881190176999255, |
|
"loss": 1.4478, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6608085069521318, |
|
"learning_rate": 0.00019871516107566366, |
|
"loss": 1.3542, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7707478188072372, |
|
"learning_rate": 0.0001986146598720913, |
|
"loss": 1.3309, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8119298049916807, |
|
"learning_rate": 0.00019851040198782326, |
|
"loss": 1.345, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7712308653234212, |
|
"learning_rate": 0.0001984023913945162, |
|
"loss": 1.3076, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.682341709525683, |
|
"learning_rate": 0.0001982906322067847, |
|
"loss": 1.2565, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7071991083514119, |
|
"learning_rate": 0.00019817512868204425, |
|
"loss": 1.1796, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.745222014713615, |
|
"learning_rate": 0.00019805588522034916, |
|
"loss": 1.1649, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7158459299510994, |
|
"learning_rate": 0.00019793290636422505, |
|
"loss": 1.2109, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7335821144549012, |
|
"learning_rate": 0.00019780619679849552, |
|
"loss": 1.1475, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7804306024320766, |
|
"learning_rate": 0.000197675761350104, |
|
"loss": 1.1068, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8274924156959725, |
|
"learning_rate": 0.00019754160498792965, |
|
"loss": 1.1839, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8840482383868431, |
|
"learning_rate": 0.0001974037328225982, |
|
"loss": 1.0928, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7224652999279871, |
|
"learning_rate": 0.00019726215010628718, |
|
"loss": 1.0299, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7109288879933862, |
|
"learning_rate": 0.0001971168622325259, |
|
"loss": 1.0436, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7650325966583326, |
|
"learning_rate": 0.00019696787473598993, |
|
"loss": 1.041, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7307809391946058, |
|
"learning_rate": 0.00019681519329229033, |
|
"loss": 1.0195, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6873943623441443, |
|
"learning_rate": 0.00019665882371775733, |
|
"loss": 0.972, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8185924734616268, |
|
"learning_rate": 0.00019649877196921896, |
|
"loss": 0.9986, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7907558585543373, |
|
"learning_rate": 0.00019633504414377388, |
|
"loss": 0.9201, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7216280408288712, |
|
"learning_rate": 0.00019616764647855926, |
|
"loss": 0.9976, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6946470891456141, |
|
"learning_rate": 0.00019599658535051314, |
|
"loss": 0.9008, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6470248283451219, |
|
"learning_rate": 0.00019582186727613152, |
|
"loss": 0.8226, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8297915622585336, |
|
"learning_rate": 0.00019564349891122018, |
|
"loss": 0.8825, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7018515834126928, |
|
"learning_rate": 0.00019546148705064097, |
|
"loss": 0.8521, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6119835758734723, |
|
"learning_rate": 0.00019527583862805303, |
|
"loss": 0.7872, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6396036538427098, |
|
"learning_rate": 0.00019508656071564882, |
|
"loss": 0.7887, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6712059239435435, |
|
"learning_rate": 0.00019489366052388441, |
|
"loss": 0.8406, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6498227189328728, |
|
"learning_rate": 0.00019469714540120507, |
|
"loss": 0.7109, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6950957852561941, |
|
"learning_rate": 0.00019449702283376517, |
|
"loss": 0.7008, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6415745385783075, |
|
"learning_rate": 0.00019429330044514305, |
|
"loss": 0.6808, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6774461765802887, |
|
"learning_rate": 0.0001940859859960506, |
|
"loss": 0.7122, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6335543398879422, |
|
"learning_rate": 0.00019387508738403768, |
|
"loss": 0.6826, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6455659601218003, |
|
"learning_rate": 0.0001936606126431911, |
|
"loss": 0.7342, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6804108080708727, |
|
"learning_rate": 0.00019344256994382878, |
|
"loss": 0.6983, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6233570198373359, |
|
"learning_rate": 0.00019322096759218836, |
|
"loss": 0.6426, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6354196060962453, |
|
"learning_rate": 0.00019299581403011082, |
|
"loss": 0.6978, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6723728632702363, |
|
"learning_rate": 0.0001927671178347189, |
|
"loss": 0.6449, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6055794839258588, |
|
"learning_rate": 0.00019253488771809024, |
|
"loss": 0.6608, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6032563228830964, |
|
"learning_rate": 0.0001922991325269258, |
|
"loss": 0.6691, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5917538532836075, |
|
"learning_rate": 0.00019205986124221251, |
|
"loss": 0.6418, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6558132078005496, |
|
"learning_rate": 0.00019181708297888133, |
|
"loss": 0.6562, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6110330049943966, |
|
"learning_rate": 0.00019157080698546, |
|
"loss": 0.5855, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6481622083495842, |
|
"learning_rate": 0.00019132104264372063, |
|
"loss": 0.628, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5730813607452849, |
|
"learning_rate": 0.0001910677994683225, |
|
"loss": 0.5476, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6938507563801335, |
|
"learning_rate": 0.00019081108710644932, |
|
"loss": 0.6018, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.625439427503205, |
|
"learning_rate": 0.00019055091533744202, |
|
"loss": 0.5735, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.6628596764324554, |
|
"learning_rate": 0.00019028729407242597, |
|
"loss": 0.5389, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.6112099968245533, |
|
"learning_rate": 0.00019002023335393364, |
|
"loss": 0.5235, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.6098216223216336, |
|
"learning_rate": 0.0001897497433555218, |
|
"loss": 0.6058, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.6469247467013166, |
|
"learning_rate": 0.0001894758343813842, |
|
"loss": 0.5524, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6344920759870597, |
|
"learning_rate": 0.00018919851686595874, |
|
"loss": 0.5605, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6756355159547938, |
|
"learning_rate": 0.00018891780137353034, |
|
"loss": 0.5096, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6439314455537293, |
|
"learning_rate": 0.00018863369859782825, |
|
"loss": 0.5516, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5567728554741562, |
|
"learning_rate": 0.0001883462193616187, |
|
"loss": 0.4576, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.553595533418767, |
|
"learning_rate": 0.00018805537461629265, |
|
"loss": 0.4947, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6200223910647112, |
|
"learning_rate": 0.00018776117544144863, |
|
"loss": 0.5073, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6294322114297511, |
|
"learning_rate": 0.00018746363304447073, |
|
"loss": 0.4938, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6000145257745209, |
|
"learning_rate": 0.00018716275876010135, |
|
"loss": 0.473, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5927861897994469, |
|
"learning_rate": 0.00018685856405000983, |
|
"loss": 0.4724, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_blimp_filtered_avg": 0.7155223880597015, |
|
"eval_blimp_filtered_std": 0.005000433138834185, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_blimp_supplement_avg": 0.8405172413793104, |
|
"eval_blimp_supplement_std": 0.016486001732879434, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_vqa_filtered_avg": 0.52, |
|
"eval_vqa_filtered_std": 0.05021167315686779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_winoground_filtered_avg": 0.64, |
|
"eval_winoground_filtered_std": 0.04824181513244218, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5504516732077648, |
|
"learning_rate": 0.00018655106050235548, |
|
"loss": 0.4393, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5801589113252366, |
|
"learning_rate": 0.00018624025983134644, |
|
"loss": 0.468, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5273944337529535, |
|
"learning_rate": 0.00018592617387679306, |
|
"loss": 0.439, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.508609381383424, |
|
"learning_rate": 0.00018560881460365724, |
|
"loss": 0.4272, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5396859577867195, |
|
"learning_rate": 0.0001852881941015964, |
|
"loss": 0.4362, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.5122858999271028, |
|
"learning_rate": 0.00018496432458450294, |
|
"loss": 0.3893, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.49626561438760436, |
|
"learning_rate": 0.00018463721839003915, |
|
"loss": 0.3498, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.48748413013987063, |
|
"learning_rate": 0.000184306887979167, |
|
"loss": 0.3256, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.5310280563857716, |
|
"learning_rate": 0.00018397334593567348, |
|
"loss": 0.3225, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.6232514021230662, |
|
"learning_rate": 0.00018363660496569127, |
|
"loss": 0.3489, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.5274577320762, |
|
"learning_rate": 0.00018329667789721485, |
|
"loss": 0.3123, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.5096311315676365, |
|
"learning_rate": 0.00018295357767961144, |
|
"loss": 0.3325, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.4613577097438129, |
|
"learning_rate": 0.00018260731738312818, |
|
"loss": 0.2936, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.4997938044342101, |
|
"learning_rate": 0.00018225791019839375, |
|
"loss": 0.3351, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.538085494988463, |
|
"learning_rate": 0.00018190536943591624, |
|
"loss": 0.329, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.5567068979809859, |
|
"learning_rate": 0.00018154970852557603, |
|
"loss": 0.318, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.5548141608588357, |
|
"learning_rate": 0.0001811909410161139, |
|
"loss": 0.3289, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.47326466614968965, |
|
"learning_rate": 0.0001808290805746153, |
|
"loss": 0.3076, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.47629585466918467, |
|
"learning_rate": 0.00018046414098598948, |
|
"loss": 0.3016, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.44135735344426463, |
|
"learning_rate": 0.00018009613615244436, |
|
"loss": 0.2704, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.5127645747027901, |
|
"learning_rate": 0.000179725080092957, |
|
"loss": 0.2887, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.5209981172771183, |
|
"learning_rate": 0.0001793509869427395, |
|
"loss": 0.2938, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.5481082193558409, |
|
"learning_rate": 0.00017897387095270058, |
|
"loss": 0.3191, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.4770065158307258, |
|
"learning_rate": 0.0001785937464889027, |
|
"loss": 0.2795, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.44845204938493194, |
|
"learning_rate": 0.0001782106280320147, |
|
"loss": 0.2667, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.47824147005907164, |
|
"learning_rate": 0.00017782453017676025, |
|
"loss": 0.267, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.501015317452837, |
|
"learning_rate": 0.00017743546763136187, |
|
"loss": 0.2831, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.5232536606095718, |
|
"learning_rate": 0.00017704345521698058, |
|
"loss": 0.2769, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.5495388553709665, |
|
"learning_rate": 0.00017664850786715136, |
|
"loss": 0.3031, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.5371555106361774, |
|
"learning_rate": 0.00017625064062721415, |
|
"loss": 0.2955, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.4716773551397148, |
|
"learning_rate": 0.00017584986865374082, |
|
"loss": 0.2666, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.5089124561646106, |
|
"learning_rate": 0.00017544620721395777, |
|
"loss": 0.3379, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.4715340007422714, |
|
"learning_rate": 0.00017503967168516426, |
|
"loss": 0.2771, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.43502563576445413, |
|
"learning_rate": 0.0001746302775541467, |
|
"loss": 0.2423, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.4967705692007805, |
|
"learning_rate": 0.00017421804041658863, |
|
"loss": 0.2498, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.49127370733051945, |
|
"learning_rate": 0.00017380297597647667, |
|
"loss": 0.2616, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.47835649282708265, |
|
"learning_rate": 0.00017338510004550223, |
|
"loss": 0.241, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.4843464174553606, |
|
"learning_rate": 0.00017296442854245915, |
|
"loss": 0.2458, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.5209405133977896, |
|
"learning_rate": 0.00017254097749263734, |
|
"loss": 0.2452, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.4709574288825739, |
|
"learning_rate": 0.0001721147630272123, |
|
"loss": 0.2627, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.4752105435022234, |
|
"learning_rate": 0.00017168580138263062, |
|
"loss": 0.2527, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.48781843284289905, |
|
"learning_rate": 0.00017125410889999134, |
|
"loss": 0.2356, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.5731736183258567, |
|
"learning_rate": 0.00017081970202442362, |
|
"loss": 0.2668, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.48105126464697834, |
|
"learning_rate": 0.0001703825973044602, |
|
"loss": 0.2454, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.5280645599674879, |
|
"learning_rate": 0.00016994281139140688, |
|
"loss": 0.2454, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.47876489284248624, |
|
"learning_rate": 0.0001695003610387084, |
|
"loss": 0.2463, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.48826354198860017, |
|
"learning_rate": 0.00016905526310130999, |
|
"loss": 0.2295, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.47715494831436517, |
|
"learning_rate": 0.0001686075345350156, |
|
"loss": 0.252, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.5152105233009641, |
|
"learning_rate": 0.0001681571923958416, |
|
"loss": 0.2771, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.4990883717055415, |
|
"learning_rate": 0.00016770425383936735, |
|
"loss": 0.2497, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.4674093996422124, |
|
"learning_rate": 0.00016724873612008155, |
|
"loss": 0.2441, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.4432102664091143, |
|
"learning_rate": 0.00016679065659072487, |
|
"loss": 0.2418, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.4677926556162063, |
|
"learning_rate": 0.00016633003270162902, |
|
"loss": 0.2483, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.5050389021999718, |
|
"learning_rate": 0.00016586688200005193, |
|
"loss": 0.225, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.538150442089787, |
|
"learning_rate": 0.00016540122212950934, |
|
"loss": 0.2629, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.4831894197759429, |
|
"learning_rate": 0.00016493307082910249, |
|
"loss": 0.2539, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.4864294249801108, |
|
"learning_rate": 0.00016446244593284277, |
|
"loss": 0.2638, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.46236092553249764, |
|
"learning_rate": 0.00016398936536897183, |
|
"loss": 0.2255, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.4963120760517666, |
|
"learning_rate": 0.00016351384715927898, |
|
"loss": 0.2524, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.5210286477375989, |
|
"learning_rate": 0.00016303590941841458, |
|
"loss": 0.225, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.5288475623534257, |
|
"learning_rate": 0.0001625555703531998, |
|
"loss": 0.2428, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.4973215047467683, |
|
"learning_rate": 0.00016207284826193335, |
|
"loss": 0.2522, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.44826317640998203, |
|
"learning_rate": 0.00016158776153369402, |
|
"loss": 0.2019, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.45392654459830534, |
|
"learning_rate": 0.0001611003286476406, |
|
"loss": 0.2338, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.4430521150056381, |
|
"learning_rate": 0.00016061056817230754, |
|
"loss": 0.2273, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.44345119147374473, |
|
"learning_rate": 0.00016011849876489776, |
|
"loss": 0.211, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.4808061249544928, |
|
"learning_rate": 0.000159624139170572, |
|
"loss": 0.2104, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.5573402749682285, |
|
"learning_rate": 0.00015912750822173445, |
|
"loss": 0.2492, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.5334950652460796, |
|
"learning_rate": 0.00015862862483731574, |
|
"loss": 0.2187, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.49497739813798797, |
|
"learning_rate": 0.00015812750802205187, |
|
"loss": 0.2097, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.44446540691990566, |
|
"learning_rate": 0.00015762417686576038, |
|
"loss": 0.204, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.42142200135464725, |
|
"learning_rate": 0.0001571186505426132, |
|
"loss": 0.1989, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.4328533901196503, |
|
"learning_rate": 0.00015661094831040598, |
|
"loss": 0.2173, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.43093996542664664, |
|
"learning_rate": 0.00015610108950982494, |
|
"loss": 0.1865, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.4850613308932528, |
|
"learning_rate": 0.00015558909356370944, |
|
"loss": 0.2181, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.47485870685329246, |
|
"learning_rate": 0.00015507497997631266, |
|
"loss": 0.2223, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.42085147271583295, |
|
"learning_rate": 0.0001545587683325583, |
|
"loss": 0.1845, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.4479801309419239, |
|
"learning_rate": 0.00015404047829729457, |
|
"loss": 0.1987, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.4624584058381783, |
|
"learning_rate": 0.00015352012961454507, |
|
"loss": 0.217, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.44005765649196454, |
|
"learning_rate": 0.00015299774210675657, |
|
"loss": 0.1837, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.4508346255489124, |
|
"learning_rate": 0.00015247333567404406, |
|
"loss": 0.2007, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.40396006791211914, |
|
"learning_rate": 0.00015194693029343248, |
|
"loss": 0.1866, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.44558839018398966, |
|
"learning_rate": 0.00015141854601809581, |
|
"loss": 0.1967, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.4337334328022437, |
|
"learning_rate": 0.00015088820297659314, |
|
"loss": 0.1891, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.4636781912221849, |
|
"learning_rate": 0.00015035592137210187, |
|
"loss": 0.193, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.47955885394967973, |
|
"learning_rate": 0.00014982172148164804, |
|
"loss": 0.1793, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.4721310395975314, |
|
"learning_rate": 0.00014928562365533392, |
|
"loss": 0.186, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.4737141537120664, |
|
"learning_rate": 0.00014874764831556285, |
|
"loss": 0.2058, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.40830849621087567, |
|
"learning_rate": 0.00014820781595626116, |
|
"loss": 0.1822, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.4272142710058541, |
|
"learning_rate": 0.0001476661471420975, |
|
"loss": 0.2057, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.4212227727031309, |
|
"learning_rate": 0.0001471226625076993, |
|
"loss": 0.1845, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.39660108389275345, |
|
"learning_rate": 0.0001465773827568671, |
|
"loss": 0.1769, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.38828383424285384, |
|
"learning_rate": 0.00014603032866178538, |
|
"loss": 0.1699, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.3681031142044674, |
|
"learning_rate": 0.00014548152106223157, |
|
"loss": 0.1456, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.46248659870169556, |
|
"learning_rate": 0.00014493098086478196, |
|
"loss": 0.1846, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.4437664820090981, |
|
"learning_rate": 0.00014437872904201542, |
|
"loss": 0.1706, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.4410375026146085, |
|
"learning_rate": 0.0001438247866317145, |
|
"loss": 0.1757, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.4290870801703047, |
|
"learning_rate": 0.00014326917473606366, |
|
"loss": 0.1777, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.4812130220306999, |
|
"learning_rate": 0.00014271191452084597, |
|
"loss": 0.2013, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.4314920290891278, |
|
"learning_rate": 0.00014215302721463623, |
|
"loss": 0.1857, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_blimp_filtered_avg": 0.7161194029850746, |
|
"eval_blimp_filtered_std": 0.005001692965803923, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_blimp_supplement_avg": 0.8211206896551724, |
|
"eval_blimp_supplement_std": 0.016785621805327337, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_vqa_filtered_avg": 0.51, |
|
"eval_vqa_filtered_std": 0.05024183937956912, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_winoground_filtered_avg": 0.62, |
|
"eval_winoground_filtered_std": 0.04878317312145633, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.41562514975066434, |
|
"learning_rate": 0.0001415925341079927, |
|
"loss": 0.21, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.37833993286875955, |
|
"learning_rate": 0.00014103045655264576, |
|
"loss": 0.1659, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.3880529818353851, |
|
"learning_rate": 0.00014046681596068466, |
|
"loss": 0.1638, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.40159118156434603, |
|
"learning_rate": 0.00013990163380374194, |
|
"loss": 0.1768, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.4086449128732129, |
|
"learning_rate": 0.00013933493161217523, |
|
"loss": 0.1544, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.3808287729283849, |
|
"learning_rate": 0.0001387667309742472, |
|
"loss": 0.1366, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.39609061286446773, |
|
"learning_rate": 0.0001381970535353032, |
|
"loss": 0.1494, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.40847272653729905, |
|
"learning_rate": 0.00013762592099694665, |
|
"loss": 0.1615, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.4334994696681873, |
|
"learning_rate": 0.00013705335511621228, |
|
"loss": 0.1542, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.4546384761691546, |
|
"learning_rate": 0.00013647937770473737, |
|
"loss": 0.1834, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.36130610610645814, |
|
"learning_rate": 0.00013590401062793083, |
|
"loss": 0.123, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.29975302946848653, |
|
"learning_rate": 0.0001353272758041402, |
|
"loss": 0.0824, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.29392603086414587, |
|
"learning_rate": 0.00013474919520381671, |
|
"loss": 0.0836, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.33169221984700814, |
|
"learning_rate": 0.00013416979084867852, |
|
"loss": 0.0683, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.39192700338704206, |
|
"learning_rate": 0.00013358908481087134, |
|
"loss": 0.0804, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.42443737109460977, |
|
"learning_rate": 0.0001330070992121281, |
|
"loss": 0.0797, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.42848813761714244, |
|
"learning_rate": 0.00013242385622292592, |
|
"loss": 0.0776, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.37448633759803696, |
|
"learning_rate": 0.00013183937806164172, |
|
"loss": 0.0739, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.3437440816482259, |
|
"learning_rate": 0.00013125368699370567, |
|
"loss": 0.0652, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.356415907025676, |
|
"learning_rate": 0.0001306668053307531, |
|
"loss": 0.0778, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.30675625825005026, |
|
"learning_rate": 0.00013007875542977448, |
|
"loss": 0.0665, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.29794655672460485, |
|
"learning_rate": 0.00012948955969226383, |
|
"loss": 0.0696, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.30163505061461343, |
|
"learning_rate": 0.00012889924056336532, |
|
"loss": 0.0705, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.32541739323213426, |
|
"learning_rate": 0.00012830782053101805, |
|
"loss": 0.0733, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.31121536090331003, |
|
"learning_rate": 0.00012771532212509974, |
|
"loss": 0.0711, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.34593292210442944, |
|
"learning_rate": 0.00012712176791656807, |
|
"loss": 0.0788, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.33946278651997686, |
|
"learning_rate": 0.0001265271805166012, |
|
"loss": 0.0677, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.3400898219352628, |
|
"learning_rate": 0.0001259315825757362, |
|
"loss": 0.0643, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.3813085350755264, |
|
"learning_rate": 0.00012533499678300618, |
|
"loss": 0.0761, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.3523012248149677, |
|
"learning_rate": 0.00012473744586507604, |
|
"loss": 0.0648, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.37842862853695125, |
|
"learning_rate": 0.00012413895258537675, |
|
"loss": 0.0812, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.39475455813661525, |
|
"learning_rate": 0.00012353953974323807, |
|
"loss": 0.0801, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.3205081471986943, |
|
"learning_rate": 0.00012293923017302002, |
|
"loss": 0.0677, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.31006899448135294, |
|
"learning_rate": 0.0001223380467432432, |
|
"loss": 0.07, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.3048520942780853, |
|
"learning_rate": 0.00012173601235571742, |
|
"loss": 0.0615, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.3425413653893973, |
|
"learning_rate": 0.0001211331499446693, |
|
"loss": 0.0658, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.31929344956491607, |
|
"learning_rate": 0.00012052948247586873, |
|
"loss": 0.0653, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.3414359773691709, |
|
"learning_rate": 0.00011992503294575383, |
|
"loss": 0.0723, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.32978160245312554, |
|
"learning_rate": 0.00011931982438055505, |
|
"loss": 0.07, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.33271868205929617, |
|
"learning_rate": 0.00011871387983541789, |
|
"loss": 0.0672, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.29862145989444433, |
|
"learning_rate": 0.00011810722239352467, |
|
"loss": 0.0603, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.34485364985513034, |
|
"learning_rate": 0.00011749987516521523, |
|
"loss": 0.0632, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.3299899118013224, |
|
"learning_rate": 0.00011689186128710654, |
|
"loss": 0.0601, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.29635972892096896, |
|
"learning_rate": 0.00011628320392121117, |
|
"loss": 0.0558, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.3414458592363874, |
|
"learning_rate": 0.0001156739262540552, |
|
"loss": 0.0703, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.3280087622706941, |
|
"learning_rate": 0.00011506405149579468, |
|
"loss": 0.0657, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.373086375777386, |
|
"learning_rate": 0.00011445360287933165, |
|
"loss": 0.0668, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.2937645914714354, |
|
"learning_rate": 0.00011384260365942904, |
|
"loss": 0.0612, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.39022311054047737, |
|
"learning_rate": 0.00011323107711182473, |
|
"loss": 0.0762, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.3345521008714258, |
|
"learning_rate": 0.00011261904653234485, |
|
"loss": 0.0711, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.30608871062806836, |
|
"learning_rate": 0.00011200653523601652, |
|
"loss": 0.0617, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.30714147902477945, |
|
"learning_rate": 0.00011139356655617945, |
|
"loss": 0.063, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.31051190204375445, |
|
"learning_rate": 0.00011078016384359724, |
|
"loss": 0.0659, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.3071085278813772, |
|
"learning_rate": 0.00011016635046556772, |
|
"loss": 0.061, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.3045837343462885, |
|
"learning_rate": 0.00010955214980503284, |
|
"loss": 0.0597, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.3049959198680976, |
|
"learning_rate": 0.00010893758525968789, |
|
"loss": 0.0587, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.3168437149994661, |
|
"learning_rate": 0.00010832268024109025, |
|
"loss": 0.0559, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.3024342626013227, |
|
"learning_rate": 0.00010770745817376742, |
|
"loss": 0.0583, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.3188509232471995, |
|
"learning_rate": 0.0001070919424943247, |
|
"loss": 0.061, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.3381945814712772, |
|
"learning_rate": 0.0001064761566505525, |
|
"loss": 0.0648, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.3131931451431926, |
|
"learning_rate": 0.00010586012410053292, |
|
"loss": 0.0624, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.32809637984753304, |
|
"learning_rate": 0.00010524386831174628, |
|
"loss": 0.0627, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.2832796499168925, |
|
"learning_rate": 0.00010462741276017711, |
|
"loss": 0.0535, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.3334141162384235, |
|
"learning_rate": 0.00010401078092941971, |
|
"loss": 0.061, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.27653747850590626, |
|
"learning_rate": 0.00010339399630978373, |
|
"loss": 0.0497, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.32205480409336124, |
|
"learning_rate": 0.00010277708239739924, |
|
"loss": 0.0658, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.310079147965717, |
|
"learning_rate": 0.0001021600626933217, |
|
"loss": 0.0525, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.31094425691461797, |
|
"learning_rate": 0.00010154296070263649, |
|
"loss": 0.0619, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.33419799536496597, |
|
"learning_rate": 0.00010092579993356386, |
|
"loss": 0.0615, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.3343121767672678, |
|
"learning_rate": 0.00010030860389656305, |
|
"loss": 0.0663, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.3516117623617434, |
|
"learning_rate": 9.969139610343696e-05, |
|
"loss": 0.0662, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.31796912631433194, |
|
"learning_rate": 9.907420006643619e-05, |
|
"loss": 0.0624, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.29219460425245597, |
|
"learning_rate": 9.845703929736351e-05, |
|
"loss": 0.0596, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.316635170830544, |
|
"learning_rate": 9.783993730667831e-05, |
|
"loss": 0.0659, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.33766616368603597, |
|
"learning_rate": 9.722291760260077e-05, |
|
"loss": 0.0646, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.31287192455811574, |
|
"learning_rate": 9.66060036902163e-05, |
|
"loss": 0.0585, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.28964582015181484, |
|
"learning_rate": 9.598921907058033e-05, |
|
"loss": 0.0543, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.3037919396698326, |
|
"learning_rate": 9.53725872398229e-05, |
|
"loss": 0.0512, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.3229974938313004, |
|
"learning_rate": 9.475613168825374e-05, |
|
"loss": 0.0531, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.29881091304580676, |
|
"learning_rate": 9.413987589946711e-05, |
|
"loss": 0.0569, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.29692909307641674, |
|
"learning_rate": 9.352384334944753e-05, |
|
"loss": 0.0547, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.33439942628885455, |
|
"learning_rate": 9.290805750567532e-05, |
|
"loss": 0.0622, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.2991141437988068, |
|
"learning_rate": 9.22925418262326e-05, |
|
"loss": 0.0464, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.3171911760038229, |
|
"learning_rate": 9.167731975890976e-05, |
|
"loss": 0.059, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.30072460150102115, |
|
"learning_rate": 9.106241474031212e-05, |
|
"loss": 0.0559, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.3301896190647226, |
|
"learning_rate": 9.04478501949672e-05, |
|
"loss": 0.0514, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.3298071637508188, |
|
"learning_rate": 8.983364953443227e-05, |
|
"loss": 0.0618, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.3497185839244567, |
|
"learning_rate": 8.921983615640277e-05, |
|
"loss": 0.065, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.33084725547728233, |
|
"learning_rate": 8.860643344382056e-05, |
|
"loss": 0.0527, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.33012822636415956, |
|
"learning_rate": 8.79934647639835e-05, |
|
"loss": 0.0666, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.3151687548518561, |
|
"learning_rate": 8.738095346765518e-05, |
|
"loss": 0.0573, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.30346203875619676, |
|
"learning_rate": 8.676892288817531e-05, |
|
"loss": 0.0491, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.3133369298353677, |
|
"learning_rate": 8.615739634057098e-05, |
|
"loss": 0.0595, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.28715782085999497, |
|
"learning_rate": 8.554639712066836e-05, |
|
"loss": 0.0542, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.2815995010771035, |
|
"learning_rate": 8.493594850420537e-05, |
|
"loss": 0.0551, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.280576878443274, |
|
"learning_rate": 8.432607374594484e-05, |
|
"loss": 0.0488, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.298809991890747, |
|
"learning_rate": 8.371679607878884e-05, |
|
"loss": 0.0544, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.30088222272143067, |
|
"learning_rate": 8.310813871289348e-05, |
|
"loss": 0.0591, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.3237358977236424, |
|
"learning_rate": 8.250012483478478e-05, |
|
"loss": 0.0547, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.34075237005827885, |
|
"learning_rate": 8.189277760647537e-05, |
|
"loss": 0.0566, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"eval_blimp_filtered_avg": 0.7037313432835821, |
|
"eval_blimp_filtered_std": 0.005058972315437875, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"eval_blimp_supplement_avg": 0.8103448275862069, |
|
"eval_blimp_supplement_std": 0.017321145118445798, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"eval_vqa_filtered_avg": 0.53, |
|
"eval_vqa_filtered_std": 0.0501613558046592, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"eval_winoground_filtered_avg": 0.68, |
|
"eval_winoground_filtered_std": 0.046882617226215034, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.3237263865460515, |
|
"learning_rate": 8.128612016458215e-05, |
|
"loss": 0.059, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.2977357286247905, |
|
"learning_rate": 8.068017561944499e-05, |
|
"loss": 0.0492, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.29591506818063545, |
|
"learning_rate": 8.00749670542462e-05, |
|
"loss": 0.052, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.2789469075911483, |
|
"learning_rate": 7.94705175241313e-05, |
|
"loss": 0.0455, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.2997082343784124, |
|
"learning_rate": 7.886685005533072e-05, |
|
"loss": 0.0498, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.30157528073661777, |
|
"learning_rate": 7.82639876442826e-05, |
|
"loss": 0.0567, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.32803298910194756, |
|
"learning_rate": 7.76619532567568e-05, |
|
"loss": 0.0622, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.28556449374878695, |
|
"learning_rate": 7.706076982697999e-05, |
|
"loss": 0.0489, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.32287162854623286, |
|
"learning_rate": 7.646046025676198e-05, |
|
"loss": 0.066, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.3384064716667544, |
|
"learning_rate": 7.586104741462325e-05, |
|
"loss": 0.0629, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.3005901634146794, |
|
"learning_rate": 7.526255413492395e-05, |
|
"loss": 0.051, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.2907146546357962, |
|
"learning_rate": 7.466500321699383e-05, |
|
"loss": 0.0546, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.30779520364750435, |
|
"learning_rate": 7.40684174242638e-05, |
|
"loss": 0.058, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.29074373091101263, |
|
"learning_rate": 7.347281948339879e-05, |
|
"loss": 0.0463, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.32970798475445445, |
|
"learning_rate": 7.287823208343192e-05, |
|
"loss": 0.0589, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.2798345327195924, |
|
"learning_rate": 7.228467787490028e-05, |
|
"loss": 0.0438, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.18326848967204043, |
|
"learning_rate": 7.169217946898197e-05, |
|
"loss": 0.0225, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.18022372679373735, |
|
"learning_rate": 7.110075943663472e-05, |
|
"loss": 0.0161, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.1633153575928502, |
|
"learning_rate": 7.051044030773618e-05, |
|
"loss": 0.0153, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.17802284328446474, |
|
"learning_rate": 6.992124457022553e-05, |
|
"loss": 0.0176, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.17359891604740127, |
|
"learning_rate": 6.933319466924693e-05, |
|
"loss": 0.0162, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.2202987501804585, |
|
"learning_rate": 6.874631300629435e-05, |
|
"loss": 0.0162, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.22277821921264357, |
|
"learning_rate": 6.81606219383583e-05, |
|
"loss": 0.0187, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.18724963681022663, |
|
"learning_rate": 6.757614377707409e-05, |
|
"loss": 0.0153, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.21995220887794256, |
|
"learning_rate": 6.699290078787193e-05, |
|
"loss": 0.0188, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.1967935793635855, |
|
"learning_rate": 6.641091518912867e-05, |
|
"loss": 0.0156, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.20661934683104752, |
|
"learning_rate": 6.583020915132152e-05, |
|
"loss": 0.0158, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.2422474266231083, |
|
"learning_rate": 6.525080479618331e-05, |
|
"loss": 0.0177, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.18354685059507367, |
|
"learning_rate": 6.467272419585984e-05, |
|
"loss": 0.013, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.22423754187379397, |
|
"learning_rate": 6.40959893720692e-05, |
|
"loss": 0.0188, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.18994008796265852, |
|
"learning_rate": 6.352062229526266e-05, |
|
"loss": 0.0132, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.24715301748493912, |
|
"learning_rate": 6.294664488378776e-05, |
|
"loss": 0.015, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.17280498203848704, |
|
"learning_rate": 6.237407900305335e-05, |
|
"loss": 0.0138, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.21773200395950232, |
|
"learning_rate": 6.180294646469679e-05, |
|
"loss": 0.0155, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.2144971485793242, |
|
"learning_rate": 6.123326902575282e-05, |
|
"loss": 0.0158, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.18331926033535073, |
|
"learning_rate": 6.06650683878248e-05, |
|
"loss": 0.013, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.1788180130126268, |
|
"learning_rate": 6.009836619625809e-05, |
|
"loss": 0.0133, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.20337677688861636, |
|
"learning_rate": 5.953318403931532e-05, |
|
"loss": 0.0129, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.20853998405220736, |
|
"learning_rate": 5.896954344735426e-05, |
|
"loss": 0.0176, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.1919639102705018, |
|
"learning_rate": 5.840746589200732e-05, |
|
"loss": 0.0144, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.2134469059873606, |
|
"learning_rate": 5.784697278536379e-05, |
|
"loss": 0.0138, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.18435084201272836, |
|
"learning_rate": 5.728808547915405e-05, |
|
"loss": 0.0135, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.19554570393158438, |
|
"learning_rate": 5.673082526393634e-05, |
|
"loss": 0.015, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.18522448379098544, |
|
"learning_rate": 5.617521336828556e-05, |
|
"loss": 0.0129, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.190207008998555, |
|
"learning_rate": 5.5621270957984573e-05, |
|
"loss": 0.0161, |
|
"step": 345 |
|
    },
    {
      "epoch": 3.29,
      "grad_norm": 0.19594053008897275,
      "learning_rate": 5.506901913521808e-05,
      "loss": 0.0162,
      "step": 346
    },
    {
      "epoch": 3.3,
      "grad_norm": 0.20111569255746164,
      "learning_rate": 5.451847893776845e-05,
      "loss": 0.0147,
      "step": 347
    },
    {
      "epoch": 3.31,
      "grad_norm": 0.20867562278084897,
      "learning_rate": 5.396967133821461e-05,
      "loss": 0.0154,
      "step": 348
    },
    {
      "epoch": 3.32,
      "grad_norm": 0.16028325232055693,
      "learning_rate": 5.342261724313292e-05,
      "loss": 0.0117,
      "step": 349
    },
    {
      "epoch": 3.33,
      "grad_norm": 0.14992620939570764,
      "learning_rate": 5.28773374923007e-05,
      "loss": 0.0106,
      "step": 350
    },
    {
      "epoch": 3.33,
      "grad_norm": 0.20669460754401175,
      "learning_rate": 5.2333852857902575e-05,
      "loss": 0.0161,
      "step": 351
    },
    {
      "epoch": 3.34,
      "grad_norm": 0.21934716169620833,
      "learning_rate": 5.1792184043738855e-05,
      "loss": 0.0128,
      "step": 352
    },
    {
      "epoch": 3.35,
      "grad_norm": 0.18204794157825063,
      "learning_rate": 5.1252351684437136e-05,
      "loss": 0.0129,
      "step": 353
    },
    {
      "epoch": 3.36,
      "grad_norm": 0.21363608639584963,
      "learning_rate": 5.071437634466609e-05,
      "loss": 0.0105,
      "step": 354
    },
    {
      "epoch": 3.37,
      "grad_norm": 0.15881770971724649,
      "learning_rate": 5.0178278518351983e-05,
      "loss": 0.0096,
      "step": 355
    },
    {
      "epoch": 3.38,
      "grad_norm": 0.1980006966366768,
      "learning_rate": 4.964407862789817e-05,
      "loss": 0.0119,
      "step": 356
    },
    {
      "epoch": 3.39,
      "grad_norm": 0.21004802159627842,
      "learning_rate": 4.911179702340688e-05,
      "loss": 0.0119,
      "step": 357
    },
    {
      "epoch": 3.4,
      "grad_norm": 0.20419756258161648,
      "learning_rate": 4.85814539819042e-05,
      "loss": 0.0145,
      "step": 358
    },
    {
      "epoch": 3.41,
      "grad_norm": 0.1565818058300373,
      "learning_rate": 4.8053069706567554e-05,
      "loss": 0.0105,
      "step": 359
    },
    {
      "epoch": 3.42,
      "grad_norm": 0.19501698471957343,
      "learning_rate": 4.752666432595596e-05,
      "loss": 0.0126,
      "step": 360
    },
    {
      "epoch": 3.43,
      "grad_norm": 0.20941486180216556,
      "learning_rate": 4.700225789324343e-05,
      "loss": 0.0105,
      "step": 361
    },
    {
      "epoch": 3.44,
      "grad_norm": 0.18304197382791004,
      "learning_rate": 4.647987038545496e-05,
      "loss": 0.011,
      "step": 362
    },
    {
      "epoch": 3.45,
      "grad_norm": 0.16720171411001336,
      "learning_rate": 4.595952170270542e-05,
      "loss": 0.0112,
      "step": 363
    },
    {
      "epoch": 3.46,
      "grad_norm": 0.22478251297433013,
      "learning_rate": 4.544123166744172e-05,
      "loss": 0.0118,
      "step": 364
    },
    {
      "epoch": 3.47,
      "grad_norm": 0.1598572948562243,
      "learning_rate": 4.492502002368738e-05,
      "loss": 0.0107,
      "step": 365
    },
    {
      "epoch": 3.48,
      "grad_norm": 0.22373563049772874,
      "learning_rate": 4.4410906436290566e-05,
      "loss": 0.0104,
      "step": 366
    },
    {
      "epoch": 3.49,
      "grad_norm": 0.16802667132434534,
      "learning_rate": 4.38989104901751e-05,
      "loss": 0.0114,
      "step": 367
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.24550738449688075,
      "learning_rate": 4.3389051689594e-05,
      "loss": 0.0121,
      "step": 368
    },
    {
      "epoch": 3.51,
      "grad_norm": 0.1660066244443363,
      "learning_rate": 4.288134945738684e-05,
      "loss": 0.0099,
      "step": 369
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.1783889244909253,
      "learning_rate": 4.237582313423962e-05,
      "loss": 0.0094,
      "step": 370
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.17141038466777303,
      "learning_rate": 4.187249197794813e-05,
      "loss": 0.0095,
      "step": 371
    },
    {
      "epoch": 3.53,
      "grad_norm": 0.1893721805088239,
      "learning_rate": 4.137137516268426e-05,
      "loss": 0.013,
      "step": 372
    },
    {
      "epoch": 3.54,
      "grad_norm": 0.16935951673752134,
      "learning_rate": 4.0872491778265535e-05,
      "loss": 0.0091,
      "step": 373
    },
    {
      "epoch": 3.55,
      "grad_norm": 0.13309068523326859,
      "learning_rate": 4.037586082942805e-05,
      "loss": 0.0091,
      "step": 374
    },
    {
      "epoch": 3.56,
      "grad_norm": 0.18791651271841342,
      "learning_rate": 3.988150123510224e-05,
      "loss": 0.0121,
      "step": 375
    },
    {
      "epoch": 3.57,
      "grad_norm": 0.1559825545952661,
      "learning_rate": 3.938943182769246e-05,
      "loss": 0.0102,
      "step": 376
    },
    {
      "epoch": 3.58,
      "grad_norm": 0.2261919531211638,
      "learning_rate": 3.88996713523594e-05,
      "loss": 0.0127,
      "step": 377
    },
    {
      "epoch": 3.59,
      "grad_norm": 0.20792420146527377,
      "learning_rate": 3.841223846630599e-05,
      "loss": 0.013,
      "step": 378
    },
    {
      "epoch": 3.6,
      "grad_norm": 0.16486082885129608,
      "learning_rate": 3.792715173806669e-05,
      "loss": 0.0105,
      "step": 379
    },
    {
      "epoch": 3.61,
      "grad_norm": 0.1549020176177142,
      "learning_rate": 3.74444296468002e-05,
      "loss": 0.0098,
      "step": 380
    },
    {
      "epoch": 3.62,
      "grad_norm": 0.17250200199106172,
      "learning_rate": 3.696409058158544e-05,
      "loss": 0.0109,
      "step": 381
    },
    {
      "epoch": 3.63,
      "grad_norm": 0.1415293330470341,
      "learning_rate": 3.6486152840721046e-05,
      "loss": 0.0084,
      "step": 382
    },
    {
      "epoch": 3.64,
      "grad_norm": 0.14461810975420877,
      "learning_rate": 3.6010634631028226e-05,
      "loss": 0.0084,
      "step": 383
    },
    {
      "epoch": 3.65,
      "grad_norm": 0.1557012557289619,
      "learning_rate": 3.553755406715724e-05,
      "loss": 0.0089,
      "step": 384
    },
    {
      "epoch": 3.66,
      "grad_norm": 0.15752891661687976,
      "learning_rate": 3.506692917089751e-05,
      "loss": 0.0109,
      "step": 385
    },
    {
      "epoch": 3.67,
      "grad_norm": 0.1694876915505117,
      "learning_rate": 3.459877787049072e-05,
      "loss": 0.009,
      "step": 386
    },
    {
      "epoch": 3.68,
      "grad_norm": 0.1582663784415179,
      "learning_rate": 3.413311799994808e-05,
      "loss": 0.0095,
      "step": 387
    },
    {
      "epoch": 3.69,
      "grad_norm": 0.13693031068741818,
      "learning_rate": 3.366996729837102e-05,
      "loss": 0.0092,
      "step": 388
    },
    {
      "epoch": 3.7,
      "grad_norm": 0.14543112940410688,
      "learning_rate": 3.320934340927513e-05,
      "loss": 0.0108,
      "step": 389
    },
    {
      "epoch": 3.71,
      "grad_norm": 0.19389482832864774,
      "learning_rate": 3.275126387991847e-05,
      "loss": 0.0098,
      "step": 390
    },
    {
      "epoch": 3.71,
      "grad_norm": 0.15797165592004603,
      "learning_rate": 3.229574616063268e-05,
      "loss": 0.0076,
      "step": 391
    },
    {
      "epoch": 3.72,
      "grad_norm": 0.21281942854700847,
      "learning_rate": 3.184280760415843e-05,
      "loss": 0.0142,
      "step": 392
    },
    {
      "epoch": 3.73,
      "grad_norm": 0.12498130411986656,
      "learning_rate": 3.1392465464984455e-05,
      "loss": 0.0081,
      "step": 393
    },
    {
      "epoch": 3.74,
      "grad_norm": 0.1152125429659436,
      "learning_rate": 3.094473689869002e-05,
      "loss": 0.0058,
      "step": 394
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.1567733530080216,
      "learning_rate": 3.0499638961291623e-05,
      "loss": 0.011,
      "step": 395
    },
    {
      "epoch": 3.76,
      "grad_norm": 0.14500898906990572,
      "learning_rate": 3.0057188608593147e-05,
      "loss": 0.0085,
      "step": 396
    },
    {
      "epoch": 3.77,
      "grad_norm": 0.16163974543952728,
      "learning_rate": 2.9617402695539808e-05,
      "loss": 0.013,
      "step": 397
    },
    {
      "epoch": 3.78,
      "grad_norm": 0.13868168811451842,
      "learning_rate": 2.9180297975576364e-05,
      "loss": 0.0084,
      "step": 398
    },
    {
      "epoch": 3.79,
      "grad_norm": 0.17847032901949134,
      "learning_rate": 2.8745891100008683e-05,
      "loss": 0.0121,
      "step": 399
    },
    {
      "epoch": 3.8,
      "grad_norm": 0.17527442252411723,
      "learning_rate": 2.83141986173694e-05,
      "loss": 0.0084,
      "step": 400
    },
    {
      "epoch": 3.8,
      "eval_blimp_filtered_avg": 0.7053731343283582,
      "eval_blimp_filtered_std": 0.005043001462199571,
      "step": 400
    },
    {
      "epoch": 3.8,
      "eval_blimp_supplement_avg": 0.8125,
      "eval_blimp_supplement_std": 0.01736311122127593,
      "step": 400
    },
    {
      "epoch": 3.8,
      "eval_vqa_filtered_avg": 0.52,
      "eval_vqa_filtered_std": 0.05021167315686779,
      "step": 400
    },
    {
      "epoch": 3.8,
      "eval_winoground_filtered_avg": 0.64,
      "eval_winoground_filtered_std": 0.048241815132442176,
      "step": 400
    },
    {
      "epoch": 3.81,
      "grad_norm": 0.14598157841040266,
      "learning_rate": 2.788523697278773e-05,
      "loss": 0.0093,
      "step": 401
    },
    {
      "epoch": 3.82,
      "grad_norm": 0.20150542514971506,
      "learning_rate": 2.7459022507362686e-05,
      "loss": 0.0122,
      "step": 402
    },
    {
      "epoch": 3.83,
      "grad_norm": 0.18255123614923588,
      "learning_rate": 2.7035571457540865e-05,
      "loss": 0.0103,
      "step": 403
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.16704045474943452,
      "learning_rate": 2.6614899954497795e-05,
      "loss": 0.0114,
      "step": 404
    },
    {
      "epoch": 3.85,
      "grad_norm": 0.14683721625679494,
      "learning_rate": 2.619702402352332e-05,
      "loss": 0.01,
      "step": 405
    },
    {
      "epoch": 3.86,
      "grad_norm": 0.18144743721435366,
      "learning_rate": 2.5781959583411374e-05,
      "loss": 0.0129,
      "step": 406
    },
    {
      "epoch": 3.87,
      "grad_norm": 0.19646570441433073,
      "learning_rate": 2.5369722445853304e-05,
      "loss": 0.0143,
      "step": 407
    },
    {
      "epoch": 3.88,
      "grad_norm": 0.1668088181727681,
      "learning_rate": 2.4960328314835745e-05,
      "loss": 0.0089,
      "step": 408
    },
    {
      "epoch": 3.89,
      "grad_norm": 0.16111476451284476,
      "learning_rate": 2.4553792786042262e-05,
      "loss": 0.0091,
      "step": 409
    },
    {
      "epoch": 3.9,
      "grad_norm": 0.17729690845562673,
      "learning_rate": 2.4150131346259197e-05,
      "loss": 0.0103,
      "step": 410
    },
    {
      "epoch": 3.9,
      "grad_norm": 0.15155895346947004,
      "learning_rate": 2.3749359372785883e-05,
      "loss": 0.0096,
      "step": 411
    },
    {
      "epoch": 3.91,
      "grad_norm": 0.15041370885333255,
      "learning_rate": 2.3351492132848664e-05,
      "loss": 0.0085,
      "step": 412
    },
    {
      "epoch": 3.92,
      "grad_norm": 0.12197907148956355,
      "learning_rate": 2.2956544783019418e-05,
      "loss": 0.0067,
      "step": 413
    },
    {
      "epoch": 3.93,
      "grad_norm": 0.1788434056496877,
      "learning_rate": 2.2564532368638146e-05,
      "loss": 0.01,
      "step": 414
    },
    {
      "epoch": 3.94,
      "grad_norm": 0.19269466130772045,
      "learning_rate": 2.2175469823239768e-05,
      "loss": 0.0117,
      "step": 415
    },
    {
      "epoch": 3.95,
      "grad_norm": 0.15780826445252463,
      "learning_rate": 2.1789371967985338e-05,
      "loss": 0.0101,
      "step": 416
    },
    {
      "epoch": 3.96,
      "grad_norm": 0.19229144408434373,
      "learning_rate": 2.140625351109733e-05,
      "loss": 0.0084,
      "step": 417
    },
    {
      "epoch": 3.97,
      "grad_norm": 0.15474486143047034,
      "learning_rate": 2.1026129047299436e-05,
      "loss": 0.0067,
      "step": 418
    },
    {
      "epoch": 3.98,
      "grad_norm": 0.15864166155594778,
      "learning_rate": 2.0649013057260546e-05,
      "loss": 0.0098,
      "step": 419
    },
    {
      "epoch": 3.99,
      "grad_norm": 0.22515244613844015,
      "learning_rate": 2.0274919907043033e-05,
      "loss": 0.0094,
      "step": 420
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.18684872878382638,
      "learning_rate": 1.9903863847555648e-05,
      "loss": 0.0127,
      "step": 421
    },
    {
      "epoch": 4.01,
      "grad_norm": 0.06270483785922072,
      "learning_rate": 1.9535859014010526e-05,
      "loss": 0.0028,
      "step": 422
    },
    {
      "epoch": 4.02,
      "grad_norm": 0.09948637260912774,
      "learning_rate": 1.917091942538469e-05,
      "loss": 0.0037,
      "step": 423
    },
    {
      "epoch": 4.03,
      "grad_norm": 0.07530065845248647,
      "learning_rate": 1.880905898388612e-05,
      "loss": 0.0039,
      "step": 424
    },
    {
      "epoch": 4.04,
      "grad_norm": 0.054461890750773165,
      "learning_rate": 1.8450291474423998e-05,
      "loss": 0.0025,
      "step": 425
    },
    {
      "epoch": 4.05,
      "grad_norm": 0.08002877578075594,
      "learning_rate": 1.8094630564083736e-05,
      "loss": 0.0035,
      "step": 426
    },
    {
      "epoch": 4.06,
      "grad_norm": 0.05746226463965698,
      "learning_rate": 1.7742089801606276e-05,
      "loss": 0.0025,
      "step": 427
    },
    {
      "epoch": 4.07,
      "grad_norm": 0.0633358139605444,
      "learning_rate": 1.7392682616871837e-05,
      "loss": 0.0027,
      "step": 428
    },
    {
      "epoch": 4.08,
      "grad_norm": 0.06509683268742919,
      "learning_rate": 1.7046422320388556e-05,
      "loss": 0.0027,
      "step": 429
    },
    {
      "epoch": 4.09,
      "grad_norm": 0.054571154616853274,
      "learning_rate": 1.6703322102785168e-05,
      "loss": 0.0026,
      "step": 430
    },
    {
      "epoch": 4.1,
      "grad_norm": 0.06888564779650448,
      "learning_rate": 1.6363395034308703e-05,
      "loss": 0.0027,
      "step": 431
    },
    {
      "epoch": 4.1,
      "grad_norm": 0.05307117129834359,
      "learning_rate": 1.6026654064326553e-05,
      "loss": 0.0025,
      "step": 432
    },
    {
      "epoch": 4.11,
      "grad_norm": 0.06598879328529111,
      "learning_rate": 1.5693112020833013e-05,
      "loss": 0.003,
      "step": 433
    },
    {
      "epoch": 4.12,
      "grad_norm": 0.054752236275106794,
      "learning_rate": 1.5362781609960852e-05,
      "loss": 0.0025,
      "step": 434
    },
    {
      "epoch": 4.13,
      "grad_norm": 0.07106963888787232,
      "learning_rate": 1.5035675415497063e-05,
      "loss": 0.0031,
      "step": 435
    },
    {
      "epoch": 4.14,
      "grad_norm": 0.052548572683446884,
      "learning_rate": 1.471180589840363e-05,
      "loss": 0.0025,
      "step": 436
    },
    {
      "epoch": 4.15,
      "grad_norm": 0.08828036910254508,
      "learning_rate": 1.4391185396342789e-05,
      "loss": 0.0038,
      "step": 437
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.09463459893212552,
      "learning_rate": 1.4073826123206946e-05,
      "loss": 0.0038,
      "step": 438
    },
    {
      "epoch": 4.17,
      "grad_norm": 0.08002928457971342,
      "learning_rate": 1.375974016865359e-05,
      "loss": 0.0031,
      "step": 439
    },
    {
      "epoch": 4.18,
      "grad_norm": 0.07631532690730236,
      "learning_rate": 1.3448939497644509e-05,
      "loss": 0.0031,
      "step": 440
    },
    {
      "epoch": 4.19,
      "grad_norm": 0.04831761603516682,
      "learning_rate": 1.3141435949990188e-05,
      "loss": 0.0027,
      "step": 441
    },
    {
      "epoch": 4.2,
      "grad_norm": 0.07344003153336562,
      "learning_rate": 1.2837241239898667e-05,
      "loss": 0.0032,
      "step": 442
    },
    {
      "epoch": 4.21,
      "grad_norm": 0.08305075630986966,
      "learning_rate": 1.253636695552931e-05,
      "loss": 0.003,
      "step": 443
    },
    {
      "epoch": 4.22,
      "grad_norm": 0.1034575433958594,
      "learning_rate": 1.2238824558551365e-05,
      "loss": 0.0039,
      "step": 444
    },
    {
      "epoch": 4.23,
      "grad_norm": 0.06655324788558148,
      "learning_rate": 1.1944625383707374e-05,
      "loss": 0.003,
      "step": 445
    },
    {
      "epoch": 4.24,
      "grad_norm": 0.0790599253839735,
      "learning_rate": 1.1653780638381328e-05,
      "loss": 0.0029,
      "step": 446
    },
    {
      "epoch": 4.25,
      "grad_norm": 0.04198685628145689,
      "learning_rate": 1.1366301402171775e-05,
      "loss": 0.0017,
      "step": 447
    },
    {
      "epoch": 4.26,
      "grad_norm": 0.06439353264983554,
      "learning_rate": 1.1082198626469686e-05,
      "loss": 0.0024,
      "step": 448
    },
    {
      "epoch": 4.27,
      "grad_norm": 0.07762450043477247,
      "learning_rate": 1.0801483134041268e-05,
      "loss": 0.0027,
      "step": 449
    },
    {
      "epoch": 4.28,
      "grad_norm": 0.07856883953783565,
      "learning_rate": 1.0524165618615845e-05,
      "loss": 0.0033,
      "step": 450
    },
    {
      "epoch": 4.29,
      "grad_norm": 0.07929308057852809,
      "learning_rate": 1.0250256644478195e-05,
      "loss": 0.003,
      "step": 451
    },
    {
      "epoch": 4.29,
      "grad_norm": 0.0587512154822952,
      "learning_rate": 9.979766646066368e-06,
      "loss": 0.0027,
      "step": 452
    },
    {
      "epoch": 4.3,
      "grad_norm": 0.06109551507247056,
      "learning_rate": 9.71270592757404e-06,
      "loss": 0.0032,
      "step": 453
    },
    {
      "epoch": 4.31,
      "grad_norm": 0.05909029031199419,
      "learning_rate": 9.449084662557982e-06,
      "loss": 0.0026,
      "step": 454
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.0814055458144323,
      "learning_rate": 9.188912893550695e-06,
      "loss": 0.0026,
      "step": 455
    },
    {
      "epoch": 4.33,
      "grad_norm": 0.07735385332942207,
      "learning_rate": 8.932200531677537e-06,
      "loss": 0.0028,
      "step": 456
    },
    {
      "epoch": 4.34,
      "grad_norm": 0.08519595591969155,
      "learning_rate": 8.678957356279371e-06,
      "loss": 0.0024,
      "step": 457
    },
    {
      "epoch": 4.35,
      "grad_norm": 0.055031384326470804,
      "learning_rate": 8.429193014540015e-06,
      "loss": 0.0026,
      "step": 458
    },
    {
      "epoch": 4.36,
      "grad_norm": 0.05387324401647046,
      "learning_rate": 8.182917021118663e-06,
      "loss": 0.0026,
      "step": 459
    },
    {
      "epoch": 4.37,
      "grad_norm": 0.07168879976269556,
      "learning_rate": 7.940138757787507e-06,
      "loss": 0.0032,
      "step": 460
    },
    {
      "epoch": 4.38,
      "grad_norm": 0.07661756681904786,
      "learning_rate": 7.700867473074224e-06,
      "loss": 0.0035,
      "step": 461
    },
    {
      "epoch": 4.39,
      "grad_norm": 0.09486930411075328,
      "learning_rate": 7.46511228190977e-06,
      "loss": 0.0049,
      "step": 462
    },
    {
      "epoch": 4.4,
      "grad_norm": 0.0679530025111762,
      "learning_rate": 7.232882165281141e-06,
      "loss": 0.0026,
      "step": 463
    },
    {
      "epoch": 4.41,
      "grad_norm": 0.06514922044267304,
      "learning_rate": 7.004185969889187e-06,
      "loss": 0.0027,
      "step": 464
    },
    {
      "epoch": 4.42,
      "grad_norm": 0.06706026131022384,
      "learning_rate": 6.7790324078116364e-06,
      "loss": 0.0027,
      "step": 465
    },
    {
      "epoch": 4.43,
      "grad_norm": 0.07709046890424658,
      "learning_rate": 6.557430056171221e-06,
      "loss": 0.0033,
      "step": 466
    },
    {
      "epoch": 4.44,
      "grad_norm": 0.051443041020356704,
      "learning_rate": 6.339387356808912e-06,
      "loss": 0.0026,
      "step": 467
    },
    {
      "epoch": 4.45,
      "grad_norm": 0.060318722923432995,
      "learning_rate": 6.124912615962341e-06,
      "loss": 0.0028,
      "step": 468
    },
    {
      "epoch": 4.46,
      "grad_norm": 0.062212012735137795,
      "learning_rate": 5.9140140039494084e-06,
      "loss": 0.0025,
      "step": 469
    },
    {
      "epoch": 4.47,
      "grad_norm": 0.06556299474776538,
      "learning_rate": 5.706699554856964e-06,
      "loss": 0.0023,
      "step": 470
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.08649267044276539,
      "learning_rate": 5.502977166234857e-06,
      "loss": 0.0035,
      "step": 471
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.08526822145924882,
      "learning_rate": 5.302854598794937e-06,
      "loss": 0.003,
      "step": 472
    },
    {
      "epoch": 4.49,
      "grad_norm": 0.04133711118453636,
      "learning_rate": 5.106339476115596e-06,
      "loss": 0.0019,
      "step": 473
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.05708577094578342,
      "learning_rate": 4.913439284351207e-06,
      "loss": 0.0026,
      "step": 474
    },
    {
      "epoch": 4.51,
      "grad_norm": 0.07367912633186298,
      "learning_rate": 4.724161371946978e-06,
      "loss": 0.0029,
      "step": 475
    },
    {
      "epoch": 4.52,
      "grad_norm": 0.08135320771271103,
      "learning_rate": 4.538512949359075e-06,
      "loss": 0.0027,
      "step": 476
    },
    {
      "epoch": 4.53,
      "grad_norm": 0.0849858165893086,
      "learning_rate": 4.356501088779841e-06,
      "loss": 0.0027,
      "step": 477
    },
    {
      "epoch": 4.54,
      "grad_norm": 0.05260609110954984,
      "learning_rate": 4.178132723868477e-06,
      "loss": 0.0019,
      "step": 478
    },
    {
      "epoch": 4.55,
      "grad_norm": 0.0795477617292828,
      "learning_rate": 4.003414649486892e-06,
      "loss": 0.0032,
      "step": 479
    },
    {
      "epoch": 4.56,
      "grad_norm": 0.08161922179718771,
      "learning_rate": 3.832353521440768e-06,
      "loss": 0.0026,
      "step": 480
    },
    {
      "epoch": 4.57,
      "grad_norm": 0.06830643544893618,
      "learning_rate": 3.6649558562261375e-06,
      "loss": 0.0032,
      "step": 481
    },
    {
      "epoch": 4.58,
      "grad_norm": 0.08641205617098656,
      "learning_rate": 3.501228030781034e-06,
      "loss": 0.0028,
      "step": 482
    },
    {
      "epoch": 4.59,
      "grad_norm": 0.04921706287498077,
      "learning_rate": 3.341176282242653e-06,
      "loss": 0.0021,
      "step": 483
    },
    {
      "epoch": 4.6,
      "grad_norm": 0.05901589705081983,
      "learning_rate": 3.184806707709698e-06,
      "loss": 0.0027,
      "step": 484
    },
    {
      "epoch": 4.61,
      "grad_norm": 0.08562934355546689,
      "learning_rate": 3.0321252640100885e-06,
      "loss": 0.0035,
      "step": 485
    },
    {
      "epoch": 4.62,
      "grad_norm": 0.056139936545776606,
      "learning_rate": 2.88313776747412e-06,
      "loss": 0.0027,
      "step": 486
    },
    {
      "epoch": 4.63,
      "grad_norm": 0.06574452787357139,
      "learning_rate": 2.7378498937128404e-06,
      "loss": 0.0031,
      "step": 487
    },
    {
      "epoch": 4.64,
      "grad_norm": 0.06295208396607756,
      "learning_rate": 2.5962671774018234e-06,
      "loss": 0.0029,
      "step": 488
    },
    {
      "epoch": 4.65,
      "grad_norm": 0.06348707610420529,
      "learning_rate": 2.458395012070369e-06,
      "loss": 0.0027,
      "step": 489
    },
    {
      "epoch": 4.66,
      "grad_norm": 0.06438459591992919,
      "learning_rate": 2.3242386498960266e-06,
      "loss": 0.003,
      "step": 490
    },
    {
      "epoch": 4.67,
      "grad_norm": 0.0936033257355208,
      "learning_rate": 2.1938032015044964e-06,
      "loss": 0.0053,
      "step": 491
    },
    {
      "epoch": 4.67,
      "grad_norm": 0.0712704009642112,
      "learning_rate": 2.067093635774975e-06,
      "loss": 0.0033,
      "step": 492
    },
    {
      "epoch": 4.68,
      "grad_norm": 0.05278839840964536,
      "learning_rate": 1.9441147796508407e-06,
      "loss": 0.0025,
      "step": 493
    },
    {
      "epoch": 4.69,
      "grad_norm": 0.05158800004403027,
      "learning_rate": 1.8248713179557786e-06,
      "loss": 0.002,
      "step": 494
    },
    {
      "epoch": 4.7,
      "grad_norm": 0.06302315225352234,
      "learning_rate": 1.7093677932153218e-06,
      "loss": 0.002,
      "step": 495
    },
    {
      "epoch": 4.71,
      "grad_norm": 0.09014451602286425,
      "learning_rate": 1.5976086054838025e-06,
      "loss": 0.0031,
      "step": 496
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.08249201483869177,
      "learning_rate": 1.4895980121767627e-06,
      "loss": 0.0029,
      "step": 497
    },
    {
      "epoch": 4.73,
      "grad_norm": 0.07887788932672342,
      "learning_rate": 1.3853401279086854e-06,
      "loss": 0.0028,
      "step": 498
    },
    {
      "epoch": 4.74,
      "grad_norm": 0.09271365227044996,
      "learning_rate": 1.2848389243363512e-06,
      "loss": 0.0026,
      "step": 499
    },
    {
      "epoch": 4.75,
      "grad_norm": 0.05191622392926365,
      "learning_rate": 1.1880982300074838e-06,
      "loss": 0.0027,
      "step": 500
    },
    {
      "epoch": 4.75,
      "eval_blimp_filtered_avg": 0.7105970149253731,
      "eval_blimp_filtered_std": 0.005015059082306442,
      "step": 500
    },
    {
      "epoch": 4.75,
      "eval_blimp_supplement_avg": 0.8146551724137931,
      "eval_blimp_supplement_std": 0.01739418193453382,
      "step": 500
    },
    {
      "epoch": 4.75,
      "eval_vqa_filtered_avg": 0.52,
      "eval_vqa_filtered_std": 0.05021167315686779,
      "step": 500
    },
    {
      "epoch": 4.75,
      "eval_winoground_filtered_avg": 0.64,
      "eval_winoground_filtered_std": 0.048241815132442176,
      "step": 500
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 525,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 394333829201920.0,
  "train_batch_size": 40,
  "trial_name": null,
  "trial_params": null
}