|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 39, |
|
"global_step": 384, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0026041666666666665, |
|
"grad_norm": 0.3952034496250024, |
|
"learning_rate": 5e-06, |
|
"loss": 2.4903, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0026041666666666665, |
|
"eval_loss": 2.554558753967285, |
|
"eval_runtime": 65.1161, |
|
"eval_samples_per_second": 1.229, |
|
"eval_steps_per_second": 0.154, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005208333333333333, |
|
"grad_norm": 0.3885918424668888, |
|
"learning_rate": 1e-05, |
|
"loss": 2.4825, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0078125, |
|
"grad_norm": 0.3847099050493279, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.5035, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010416666666666666, |
|
"grad_norm": 0.4235545463515579, |
|
"learning_rate": 2e-05, |
|
"loss": 2.5577, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013020833333333334, |
|
"grad_norm": 0.3801745772810421, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.5297, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015625, |
|
"grad_norm": 0.3512277500716236, |
|
"learning_rate": 3e-05, |
|
"loss": 2.4777, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.018229166666666668, |
|
"grad_norm": 0.28888015786994375, |
|
"learning_rate": 3.5e-05, |
|
"loss": 2.4556, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.020833333333333332, |
|
"grad_norm": 0.25350975750236143, |
|
"learning_rate": 4e-05, |
|
"loss": 2.5022, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0234375, |
|
"grad_norm": 0.22691977505165736, |
|
"learning_rate": 4.5e-05, |
|
"loss": 2.4143, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.026041666666666668, |
|
"grad_norm": 0.20864896327745933, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4545, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.028645833333333332, |
|
"grad_norm": 0.20008113121425344, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 2.4939, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 0.24785418789288546, |
|
"learning_rate": 6e-05, |
|
"loss": 2.3184, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.033854166666666664, |
|
"grad_norm": 0.2153715911656377, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 2.4738, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.036458333333333336, |
|
"grad_norm": 0.22249287774026938, |
|
"learning_rate": 7e-05, |
|
"loss": 2.4814, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0390625, |
|
"grad_norm": 0.2076426443768558, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 2.3529, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.041666666666666664, |
|
"grad_norm": 0.2246776060857838, |
|
"learning_rate": 8e-05, |
|
"loss": 2.2371, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.044270833333333336, |
|
"grad_norm": 0.18383862266336323, |
|
"learning_rate": 8.5e-05, |
|
"loss": 2.4787, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.046875, |
|
"grad_norm": 0.20631688287583885, |
|
"learning_rate": 9e-05, |
|
"loss": 2.6178, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.049479166666666664, |
|
"grad_norm": 0.18016643348558348, |
|
"learning_rate": 9.5e-05, |
|
"loss": 2.4891, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.052083333333333336, |
|
"grad_norm": 0.14534618682097772, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4522, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0546875, |
|
"grad_norm": 0.15428537805246717, |
|
"learning_rate": 9.999832398924833e-05, |
|
"loss": 2.4111, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.057291666666666664, |
|
"grad_norm": 0.1631103718894274, |
|
"learning_rate": 9.999329608183822e-05, |
|
"loss": 2.387, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.059895833333333336, |
|
"grad_norm": 0.1552602669275131, |
|
"learning_rate": 9.998491665229539e-05, |
|
"loss": 2.3648, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.1704183704935776, |
|
"learning_rate": 9.997318632479817e-05, |
|
"loss": 2.4997, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06510416666666667, |
|
"grad_norm": 0.1585395745370806, |
|
"learning_rate": 9.995810597313128e-05, |
|
"loss": 2.5326, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06770833333333333, |
|
"grad_norm": 0.1344528253347993, |
|
"learning_rate": 9.993967672062052e-05, |
|
"loss": 2.5174, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0703125, |
|
"grad_norm": 0.14705487245235002, |
|
"learning_rate": 9.991789994004929e-05, |
|
"loss": 2.3446, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07291666666666667, |
|
"grad_norm": 0.1461054029585455, |
|
"learning_rate": 9.989277725355615e-05, |
|
"loss": 2.4945, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07552083333333333, |
|
"grad_norm": 0.14508526627249885, |
|
"learning_rate": 9.986431053251411e-05, |
|
"loss": 2.3801, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.078125, |
|
"grad_norm": 0.14473824067661442, |
|
"learning_rate": 9.983250189739119e-05, |
|
"loss": 2.4877, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08072916666666667, |
|
"grad_norm": 0.18945595196350265, |
|
"learning_rate": 9.97973537175925e-05, |
|
"loss": 2.5833, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 0.1541697712272958, |
|
"learning_rate": 9.975886861128368e-05, |
|
"loss": 2.4619, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0859375, |
|
"grad_norm": 0.16052736172586615, |
|
"learning_rate": 9.971704944519594e-05, |
|
"loss": 2.5037, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08854166666666667, |
|
"grad_norm": 0.15543243937875972, |
|
"learning_rate": 9.967189933441243e-05, |
|
"loss": 2.4374, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09114583333333333, |
|
"grad_norm": 0.16705335802827087, |
|
"learning_rate": 9.962342164213639e-05, |
|
"loss": 2.5955, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 0.14789348505066385, |
|
"learning_rate": 9.957161997944034e-05, |
|
"loss": 2.4322, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09635416666666667, |
|
"grad_norm": 0.15495422007897772, |
|
"learning_rate": 9.95164982049974e-05, |
|
"loss": 2.4307, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09895833333333333, |
|
"grad_norm": 0.15049524925558916, |
|
"learning_rate": 9.94580604247936e-05, |
|
"loss": 2.3423, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1015625, |
|
"grad_norm": 0.1444251980172912, |
|
"learning_rate": 9.939631099182219e-05, |
|
"loss": 2.2882, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1015625, |
|
"eval_loss": 2.4301750659942627, |
|
"eval_runtime": 65.3323, |
|
"eval_samples_per_second": 1.225, |
|
"eval_steps_per_second": 0.153, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10416666666666667, |
|
"grad_norm": 0.1475367612200888, |
|
"learning_rate": 9.933125450575932e-05, |
|
"loss": 2.4509, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10677083333333333, |
|
"grad_norm": 0.16356380426266176, |
|
"learning_rate": 9.926289581262147e-05, |
|
"loss": 2.5307, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.109375, |
|
"grad_norm": 0.166302003601292, |
|
"learning_rate": 9.919124000440438e-05, |
|
"loss": 2.526, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11197916666666667, |
|
"grad_norm": 0.14645402434215465, |
|
"learning_rate": 9.91162924187038e-05, |
|
"loss": 2.2958, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11458333333333333, |
|
"grad_norm": 0.2119539720051403, |
|
"learning_rate": 9.903805863831799e-05, |
|
"loss": 2.3299, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1171875, |
|
"grad_norm": 0.151653634781893, |
|
"learning_rate": 9.895654449083166e-05, |
|
"loss": 2.4133, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11979166666666667, |
|
"grad_norm": 0.1547044553169797, |
|
"learning_rate": 9.887175604818206e-05, |
|
"loss": 2.5952, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12239583333333333, |
|
"grad_norm": 0.16664262013704687, |
|
"learning_rate": 9.87836996262066e-05, |
|
"loss": 2.4714, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.15551192268579045, |
|
"learning_rate": 9.869238178417235e-05, |
|
"loss": 2.3432, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12760416666666666, |
|
"grad_norm": 0.14968525834314664, |
|
"learning_rate": 9.859780932428756e-05, |
|
"loss": 2.4037, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13020833333333334, |
|
"grad_norm": 0.1685655914159011, |
|
"learning_rate": 9.849998929119488e-05, |
|
"loss": 2.4032, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1328125, |
|
"grad_norm": 0.13644233785328275, |
|
"learning_rate": 9.839892897144663e-05, |
|
"loss": 2.4095, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13541666666666666, |
|
"grad_norm": 0.14364815720124613, |
|
"learning_rate": 9.829463589296203e-05, |
|
"loss": 2.4398, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13802083333333334, |
|
"grad_norm": 0.14915929369602085, |
|
"learning_rate": 9.818711782446645e-05, |
|
"loss": 2.4816, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.140625, |
|
"grad_norm": 0.14935461072359194, |
|
"learning_rate": 9.80763827749127e-05, |
|
"loss": 2.3426, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14322916666666666, |
|
"grad_norm": 0.15190176296605723, |
|
"learning_rate": 9.796243899288456e-05, |
|
"loss": 2.4705, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14583333333333334, |
|
"grad_norm": 0.14303795896023264, |
|
"learning_rate": 9.784529496598214e-05, |
|
"loss": 2.5108, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1484375, |
|
"grad_norm": 0.1556316812368993, |
|
"learning_rate": 9.772495942018985e-05, |
|
"loss": 2.3112, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15104166666666666, |
|
"grad_norm": 0.1483632196038671, |
|
"learning_rate": 9.760144131922628e-05, |
|
"loss": 2.2325, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.15364583333333334, |
|
"grad_norm": 0.12410345584803052, |
|
"learning_rate": 9.747474986387654e-05, |
|
"loss": 2.3749, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.16335694957610425, |
|
"learning_rate": 9.734489449130695e-05, |
|
"loss": 2.4645, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15885416666666666, |
|
"grad_norm": 0.1479375196060614, |
|
"learning_rate": 9.721188487436195e-05, |
|
"loss": 2.3834, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.16145833333333334, |
|
"grad_norm": 0.13750323689269844, |
|
"learning_rate": 9.707573092084368e-05, |
|
"loss": 2.3592, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1640625, |
|
"grad_norm": 0.16027916915470333, |
|
"learning_rate": 9.693644277277391e-05, |
|
"loss": 2.4954, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.14229365410212697, |
|
"learning_rate": 9.679403080563861e-05, |
|
"loss": 2.4404, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.16927083333333334, |
|
"grad_norm": 0.153686499622388, |
|
"learning_rate": 9.6648505627615e-05, |
|
"loss": 2.2282, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.171875, |
|
"grad_norm": 0.16590644992700104, |
|
"learning_rate": 9.649987807878148e-05, |
|
"loss": 2.5126, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17447916666666666, |
|
"grad_norm": 0.15502972088699477, |
|
"learning_rate": 9.634815923030997e-05, |
|
"loss": 2.3749, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17708333333333334, |
|
"grad_norm": 0.14756465898580834, |
|
"learning_rate": 9.61933603836415e-05, |
|
"loss": 2.3476, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1796875, |
|
"grad_norm": 0.14919525754908758, |
|
"learning_rate": 9.603549306964407e-05, |
|
"loss": 2.3509, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.18229166666666666, |
|
"grad_norm": 0.13861494211266717, |
|
"learning_rate": 9.587456904775393e-05, |
|
"loss": 2.2449, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18489583333333334, |
|
"grad_norm": 0.14821518394829722, |
|
"learning_rate": 9.57106003050996e-05, |
|
"loss": 2.3813, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.16324527196573113, |
|
"learning_rate": 9.554359905560886e-05, |
|
"loss": 2.477, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19010416666666666, |
|
"grad_norm": 0.14892748528676938, |
|
"learning_rate": 9.537357773909906e-05, |
|
"loss": 2.264, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.19270833333333334, |
|
"grad_norm": 0.1284546434475265, |
|
"learning_rate": 9.520054902035035e-05, |
|
"loss": 2.4267, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1953125, |
|
"grad_norm": 0.14293551215488642, |
|
"learning_rate": 9.502452578816244e-05, |
|
"loss": 2.5131, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19791666666666666, |
|
"grad_norm": 0.14775108859068506, |
|
"learning_rate": 9.484552115439445e-05, |
|
"loss": 2.4076, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.20052083333333334, |
|
"grad_norm": 0.15478800535673032, |
|
"learning_rate": 9.466354845298817e-05, |
|
"loss": 2.5762, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.203125, |
|
"grad_norm": 0.16835921991226063, |
|
"learning_rate": 9.44786212389749e-05, |
|
"loss": 2.3752, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.203125, |
|
"eval_loss": 2.4171054363250732, |
|
"eval_runtime": 65.4109, |
|
"eval_samples_per_second": 1.223, |
|
"eval_steps_per_second": 0.153, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.20572916666666666, |
|
"grad_norm": 0.14467889640535875, |
|
"learning_rate": 9.42907532874657e-05, |
|
"loss": 2.4769, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 0.13503613492125413, |
|
"learning_rate": 9.40999585926253e-05, |
|
"loss": 2.4346, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2109375, |
|
"grad_norm": 0.1517728973900508, |
|
"learning_rate": 9.390625136662972e-05, |
|
"loss": 2.261, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.21354166666666666, |
|
"grad_norm": 0.1466563383653839, |
|
"learning_rate": 9.370964603860753e-05, |
|
"loss": 2.5261, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.21614583333333334, |
|
"grad_norm": 0.14618631155344838, |
|
"learning_rate": 9.351015725356514e-05, |
|
"loss": 2.4269, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.15601390292939937, |
|
"learning_rate": 9.33077998712958e-05, |
|
"loss": 2.3773, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22135416666666666, |
|
"grad_norm": 0.14415725413526403, |
|
"learning_rate": 9.310258896527278e-05, |
|
"loss": 2.3473, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.22395833333333334, |
|
"grad_norm": 0.13470412653239092, |
|
"learning_rate": 9.289453982152653e-05, |
|
"loss": 2.3472, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2265625, |
|
"grad_norm": 0.15570776289510233, |
|
"learning_rate": 9.2683667937506e-05, |
|
"loss": 2.4414, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.22916666666666666, |
|
"grad_norm": 0.14612626399487072, |
|
"learning_rate": 9.246998902092428e-05, |
|
"loss": 2.4655, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.23177083333333334, |
|
"grad_norm": 0.18267625807633459, |
|
"learning_rate": 9.22535189885886e-05, |
|
"loss": 2.2771, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.234375, |
|
"grad_norm": 0.16065010725957032, |
|
"learning_rate": 9.203427396521454e-05, |
|
"loss": 2.4194, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23697916666666666, |
|
"grad_norm": 0.18321820032984756, |
|
"learning_rate": 9.181227028222508e-05, |
|
"loss": 2.4456, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.23958333333333334, |
|
"grad_norm": 0.13929464732035163, |
|
"learning_rate": 9.158752447653397e-05, |
|
"loss": 2.3989, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2421875, |
|
"grad_norm": 0.15585898969003706, |
|
"learning_rate": 9.136005328931395e-05, |
|
"loss": 2.4777, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.24479166666666666, |
|
"grad_norm": 0.14793073723102618, |
|
"learning_rate": 9.112987366474972e-05, |
|
"loss": 2.379, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.24739583333333334, |
|
"grad_norm": 0.14075868568691305, |
|
"learning_rate": 9.089700274877574e-05, |
|
"loss": 2.3281, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.15327339815586916, |
|
"learning_rate": 9.066145788779908e-05, |
|
"loss": 2.4517, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2526041666666667, |
|
"grad_norm": 0.15505433166839452, |
|
"learning_rate": 9.042325662740726e-05, |
|
"loss": 2.2919, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2552083333333333, |
|
"grad_norm": 0.1468856057426848, |
|
"learning_rate": 9.018241671106134e-05, |
|
"loss": 2.3889, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2578125, |
|
"grad_norm": 0.1478488710337695, |
|
"learning_rate": 8.993895607877418e-05, |
|
"loss": 2.3595, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2604166666666667, |
|
"grad_norm": 0.175359520279898, |
|
"learning_rate": 8.969289286577408e-05, |
|
"loss": 2.553, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2630208333333333, |
|
"grad_norm": 0.14503182442700818, |
|
"learning_rate": 8.9444245401154e-05, |
|
"loss": 2.3573, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.265625, |
|
"grad_norm": 0.14177870868430487, |
|
"learning_rate": 8.919303220650606e-05, |
|
"loss": 2.3274, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2682291666666667, |
|
"grad_norm": 0.1779007031093982, |
|
"learning_rate": 8.893927199454207e-05, |
|
"loss": 2.2008, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2708333333333333, |
|
"grad_norm": 0.14578869126284813, |
|
"learning_rate": 8.868298366769954e-05, |
|
"loss": 2.444, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2734375, |
|
"grad_norm": 0.14046914929392473, |
|
"learning_rate": 8.842418631673365e-05, |
|
"loss": 2.2844, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2760416666666667, |
|
"grad_norm": 0.13720708755401478, |
|
"learning_rate": 8.816289921929516e-05, |
|
"loss": 2.5047, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2786458333333333, |
|
"grad_norm": 0.13901967393067377, |
|
"learning_rate": 8.789914183849449e-05, |
|
"loss": 2.4602, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.14165507438668587, |
|
"learning_rate": 8.763293382145195e-05, |
|
"loss": 2.3135, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2838541666666667, |
|
"grad_norm": 0.1588574639328753, |
|
"learning_rate": 8.73642949978341e-05, |
|
"loss": 2.3731, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2864583333333333, |
|
"grad_norm": 0.1905241886636867, |
|
"learning_rate": 8.709324537837684e-05, |
|
"loss": 2.4152, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2890625, |
|
"grad_norm": 0.16919809232424363, |
|
"learning_rate": 8.681980515339464e-05, |
|
"loss": 2.2948, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2916666666666667, |
|
"grad_norm": 0.15114015395920236, |
|
"learning_rate": 8.654399469127673e-05, |
|
"loss": 2.4309, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2942708333333333, |
|
"grad_norm": 0.13890547261939354, |
|
"learning_rate": 8.626583453696976e-05, |
|
"loss": 2.4866, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.296875, |
|
"grad_norm": 0.1490334985258922, |
|
"learning_rate": 8.598534541044747e-05, |
|
"loss": 2.4748, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2994791666666667, |
|
"grad_norm": 0.15113023404371403, |
|
"learning_rate": 8.570254820516728e-05, |
|
"loss": 2.3583, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3020833333333333, |
|
"grad_norm": 0.14358757342098682, |
|
"learning_rate": 8.541746398651395e-05, |
|
"loss": 2.4005, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3046875, |
|
"grad_norm": 0.1494340804949735, |
|
"learning_rate": 8.513011399023036e-05, |
|
"loss": 2.3249, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3046875, |
|
"eval_loss": 2.4119415283203125, |
|
"eval_runtime": 65.4329, |
|
"eval_samples_per_second": 1.223, |
|
"eval_steps_per_second": 0.153, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3072916666666667, |
|
"grad_norm": 0.14857253164474957, |
|
"learning_rate": 8.484051962083579e-05, |
|
"loss": 2.3431, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3098958333333333, |
|
"grad_norm": 0.14191988741229736, |
|
"learning_rate": 8.454870245003141e-05, |
|
"loss": 2.3861, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.1516024307266096, |
|
"learning_rate": 8.425468421509349e-05, |
|
"loss": 2.411, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3151041666666667, |
|
"grad_norm": 0.15002982823803926, |
|
"learning_rate": 8.395848681725416e-05, |
|
"loss": 2.49, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3177083333333333, |
|
"grad_norm": 0.14904411125984457, |
|
"learning_rate": 8.366013232007002e-05, |
|
"loss": 2.2998, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3203125, |
|
"grad_norm": 0.15513959182528284, |
|
"learning_rate": 8.335964294777862e-05, |
|
"loss": 2.4748, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3229166666666667, |
|
"grad_norm": 0.16028597088352084, |
|
"learning_rate": 8.305704108364301e-05, |
|
"loss": 2.5415, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3255208333333333, |
|
"grad_norm": 0.1382845005964039, |
|
"learning_rate": 8.275234926828446e-05, |
|
"loss": 2.3863, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.328125, |
|
"grad_norm": 0.13146327268638525, |
|
"learning_rate": 8.244559019800328e-05, |
|
"loss": 2.2718, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3307291666666667, |
|
"grad_norm": 0.14918540441469405, |
|
"learning_rate": 8.213678672308841e-05, |
|
"loss": 2.29, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.14572964949235084, |
|
"learning_rate": 8.182596184611514e-05, |
|
"loss": 2.3865, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3359375, |
|
"grad_norm": 0.1467709610422986, |
|
"learning_rate": 8.151313872023172e-05, |
|
"loss": 2.3566, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3385416666666667, |
|
"grad_norm": 0.1696428030171741, |
|
"learning_rate": 8.119834064743469e-05, |
|
"loss": 2.4145, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3411458333333333, |
|
"grad_norm": 0.1567212274267596, |
|
"learning_rate": 8.088159107683314e-05, |
|
"loss": 2.3996, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.16059171688564705, |
|
"learning_rate": 8.056291360290201e-05, |
|
"loss": 2.4796, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3463541666666667, |
|
"grad_norm": 0.17528470844982455, |
|
"learning_rate": 8.024233196372453e-05, |
|
"loss": 2.3711, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3489583333333333, |
|
"grad_norm": 0.15733889602776618, |
|
"learning_rate": 7.9919870039224e-05, |
|
"loss": 2.2677, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3515625, |
|
"grad_norm": 0.14873870088112393, |
|
"learning_rate": 7.959555184938495e-05, |
|
"loss": 2.4515, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3541666666666667, |
|
"grad_norm": 0.15417965735327782, |
|
"learning_rate": 7.926940155246397e-05, |
|
"loss": 2.4285, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3567708333333333, |
|
"grad_norm": 0.1431617281765595, |
|
"learning_rate": 7.894144344319014e-05, |
|
"loss": 2.2096, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.359375, |
|
"grad_norm": 0.1582812354896834, |
|
"learning_rate": 7.861170195095537e-05, |
|
"loss": 2.5397, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3619791666666667, |
|
"grad_norm": 0.17372786439002758, |
|
"learning_rate": 7.828020163799455e-05, |
|
"loss": 2.4293, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3645833333333333, |
|
"grad_norm": 0.12985922414383083, |
|
"learning_rate": 7.794696719755612e-05, |
|
"loss": 2.3064, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3671875, |
|
"grad_norm": 0.14444453543114258, |
|
"learning_rate": 7.761202345206249e-05, |
|
"loss": 2.4924, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3697916666666667, |
|
"grad_norm": 0.14361378320872797, |
|
"learning_rate": 7.727539535126118e-05, |
|
"loss": 2.485, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3723958333333333, |
|
"grad_norm": 0.14506765397834698, |
|
"learning_rate": 7.69371079703662e-05, |
|
"loss": 2.4081, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.15082242594669004, |
|
"learning_rate": 7.65971865081904e-05, |
|
"loss": 2.4042, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3776041666666667, |
|
"grad_norm": 0.14811116555183387, |
|
"learning_rate": 7.625565628526818e-05, |
|
"loss": 2.5335, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3802083333333333, |
|
"grad_norm": 0.14726265807121236, |
|
"learning_rate": 7.591254274196959e-05, |
|
"loss": 2.2424, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3828125, |
|
"grad_norm": 0.1582839530592079, |
|
"learning_rate": 7.556787143660521e-05, |
|
"loss": 2.3499, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.3854166666666667, |
|
"grad_norm": 0.15670436913091046, |
|
"learning_rate": 7.522166804352226e-05, |
|
"loss": 2.4295, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3880208333333333, |
|
"grad_norm": 0.14730612216140554, |
|
"learning_rate": 7.487395835119231e-05, |
|
"loss": 2.3101, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 0.15120580103738476, |
|
"learning_rate": 7.452476826029011e-05, |
|
"loss": 2.4888, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3932291666666667, |
|
"grad_norm": 0.15081791485599214, |
|
"learning_rate": 7.417412378176446e-05, |
|
"loss": 2.3946, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.3958333333333333, |
|
"grad_norm": 0.13244742107918075, |
|
"learning_rate": 7.382205103490043e-05, |
|
"loss": 2.2704, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3984375, |
|
"grad_norm": 0.14571632653572605, |
|
"learning_rate": 7.346857624537407e-05, |
|
"loss": 2.4644, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.4010416666666667, |
|
"grad_norm": 0.15252862923031207, |
|
"learning_rate": 7.311372574329854e-05, |
|
"loss": 2.554, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4036458333333333, |
|
"grad_norm": 0.14927980854491554, |
|
"learning_rate": 7.275752596126308e-05, |
|
"loss": 2.3804, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.1406590744757482, |
|
"learning_rate": 7.240000343236385e-05, |
|
"loss": 2.2504, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_loss": 2.408146619796753, |
|
"eval_runtime": 65.4856, |
|
"eval_samples_per_second": 1.222, |
|
"eval_steps_per_second": 0.153, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4088541666666667, |
|
"grad_norm": 0.13753178031418029, |
|
"learning_rate": 7.204118478822766e-05, |
|
"loss": 2.3063, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.4114583333333333, |
|
"grad_norm": 0.13719355865812055, |
|
"learning_rate": 7.168109675702806e-05, |
|
"loss": 2.1826, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4140625, |
|
"grad_norm": 0.14916916897375052, |
|
"learning_rate": 7.131976616149445e-05, |
|
"loss": 2.3635, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.1544520288957668, |
|
"learning_rate": 7.095721991691411e-05, |
|
"loss": 2.6067, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4192708333333333, |
|
"grad_norm": 0.1466811081471814, |
|
"learning_rate": 7.05934850291272e-05, |
|
"loss": 2.2636, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.421875, |
|
"grad_norm": 0.14245354657490636, |
|
"learning_rate": 7.022858859251517e-05, |
|
"loss": 2.3278, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4244791666666667, |
|
"grad_norm": 0.14310350882179557, |
|
"learning_rate": 6.986255778798253e-05, |
|
"loss": 2.2951, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4270833333333333, |
|
"grad_norm": 0.15028525220062852, |
|
"learning_rate": 6.949541988093208e-05, |
|
"loss": 2.3647, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4296875, |
|
"grad_norm": 0.13536590220383313, |
|
"learning_rate": 6.912720221923405e-05, |
|
"loss": 2.3635, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4322916666666667, |
|
"grad_norm": 0.14965390120450028, |
|
"learning_rate": 6.875793223118888e-05, |
|
"loss": 2.3191, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4348958333333333, |
|
"grad_norm": 0.15696846625410496, |
|
"learning_rate": 6.838763742348415e-05, |
|
"loss": 2.4342, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.16266249005182767, |
|
"learning_rate": 6.801634537914555e-05, |
|
"loss": 2.4487, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4401041666666667, |
|
"grad_norm": 0.15099304315918932, |
|
"learning_rate": 6.764408375548237e-05, |
|
"loss": 2.4774, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4427083333333333, |
|
"grad_norm": 0.1437502300792439, |
|
"learning_rate": 6.727088028202723e-05, |
|
"loss": 2.4369, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4453125, |
|
"grad_norm": 0.13339677555582233, |
|
"learning_rate": 6.68967627584705e-05, |
|
"loss": 2.3039, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4479166666666667, |
|
"grad_norm": 0.1416720700803862, |
|
"learning_rate": 6.652175905258963e-05, |
|
"loss": 2.2844, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4505208333333333, |
|
"grad_norm": 0.14502909820943563, |
|
"learning_rate": 6.614589709817317e-05, |
|
"loss": 2.4192, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.453125, |
|
"grad_norm": 0.14499193575249233, |
|
"learning_rate": 6.576920489294011e-05, |
|
"loss": 2.3057, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4557291666666667, |
|
"grad_norm": 0.15038851044223103, |
|
"learning_rate": 6.539171049645426e-05, |
|
"loss": 2.4237, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4583333333333333, |
|
"grad_norm": 0.14553942382157312, |
|
"learning_rate": 6.501344202803414e-05, |
|
"loss": 2.4642, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4609375, |
|
"grad_norm": 0.13896484063095102, |
|
"learning_rate": 6.463442766465847e-05, |
|
"loss": 2.4132, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.4635416666666667, |
|
"grad_norm": 0.14978646915320232, |
|
"learning_rate": 6.425469563886715e-05, |
|
"loss": 2.3692, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4661458333333333, |
|
"grad_norm": 0.13794418734075914, |
|
"learning_rate": 6.387427423665829e-05, |
|
"loss": 2.3524, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.1467143240226456, |
|
"learning_rate": 6.349319179538126e-05, |
|
"loss": 2.359, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4713541666666667, |
|
"grad_norm": 0.14326937356779176, |
|
"learning_rate": 6.311147670162576e-05, |
|
"loss": 2.5547, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4739583333333333, |
|
"grad_norm": 0.141220074236358, |
|
"learning_rate": 6.272915738910743e-05, |
|
"loss": 2.4299, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4765625, |
|
"grad_norm": 0.148901889828269, |
|
"learning_rate": 6.234626233654973e-05, |
|
"loss": 2.5296, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4791666666666667, |
|
"grad_norm": 0.15238094995154358, |
|
"learning_rate": 6.196282006556266e-05, |
|
"loss": 2.2642, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4817708333333333, |
|
"grad_norm": 0.14562487124583376, |
|
"learning_rate": 6.157885913851818e-05, |
|
"loss": 2.2485, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.484375, |
|
"grad_norm": 0.1557482340822006, |
|
"learning_rate": 6.119440815642258e-05, |
|
"loss": 2.5018, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4869791666666667, |
|
"grad_norm": 0.15557904288598579, |
|
"learning_rate": 6.080949575678606e-05, |
|
"loss": 2.3121, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4895833333333333, |
|
"grad_norm": 0.132624820989215, |
|
"learning_rate": 6.042415061148954e-05, |
|
"loss": 2.4456, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4921875, |
|
"grad_norm": 0.14540093060758663, |
|
"learning_rate": 6.003840142464886e-05, |
|
"loss": 2.4166, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.4947916666666667, |
|
"grad_norm": 0.13698817848218522, |
|
"learning_rate": 5.9652276930476656e-05, |
|
"loss": 2.4569, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4973958333333333, |
|
"grad_norm": 0.13530133541335626, |
|
"learning_rate": 5.926580589114201e-05, |
|
"loss": 2.521, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.13985911131684764, |
|
"learning_rate": 5.8879017094627874e-05, |
|
"loss": 2.4112, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5026041666666666, |
|
"grad_norm": 0.13203633038116566, |
|
"learning_rate": 5.849193935258679e-05, |
|
"loss": 2.1839, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5052083333333334, |
|
"grad_norm": 0.16221179568567085, |
|
"learning_rate": 5.810460149819462e-05, |
|
"loss": 2.2793, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5078125, |
|
"grad_norm": 0.15402021583934355, |
|
"learning_rate": 5.771703238400288e-05, |
|
"loss": 2.3905, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5078125, |
|
"eval_loss": 2.4029593467712402, |
|
"eval_runtime": 65.2935, |
|
"eval_samples_per_second": 1.225, |
|
"eval_steps_per_second": 0.153, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5104166666666666, |
|
"grad_norm": 0.15606553454493674, |
|
"learning_rate": 5.7329260879789437e-05, |
|
"loss": 2.3709, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5130208333333334, |
|
"grad_norm": 0.1448236972696976, |
|
"learning_rate": 5.6941315870408066e-05, |
|
"loss": 2.4256, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.515625, |
|
"grad_norm": 0.15924118657730604, |
|
"learning_rate": 5.655322625363687e-05, |
|
"loss": 2.5228, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5182291666666666, |
|
"grad_norm": 0.17454487852950232, |
|
"learning_rate": 5.616502093802565e-05, |
|
"loss": 2.2945, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 0.14248471717781994, |
|
"learning_rate": 5.577672884074249e-05, |
|
"loss": 2.3514, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5234375, |
|
"grad_norm": 0.14873123082787126, |
|
"learning_rate": 5.538837888541986e-05, |
|
"loss": 2.4918, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5260416666666666, |
|
"grad_norm": 0.1600074911775423, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 2.3231, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5286458333333334, |
|
"grad_norm": 0.15123679319262007, |
|
"learning_rate": 5.461162111458016e-05, |
|
"loss": 2.3613, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.13067031369476279, |
|
"learning_rate": 5.422327115925753e-05, |
|
"loss": 2.3407, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5338541666666666, |
|
"grad_norm": 0.1419685152456832, |
|
"learning_rate": 5.3834979061974376e-05, |
|
"loss": 2.4375, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5364583333333334, |
|
"grad_norm": 0.13456464948427768, |
|
"learning_rate": 5.3446773746363153e-05, |
|
"loss": 2.3542, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5390625, |
|
"grad_norm": 0.1394757337185396, |
|
"learning_rate": 5.305868412959195e-05, |
|
"loss": 2.6189, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5416666666666666, |
|
"grad_norm": 0.14661264347416875, |
|
"learning_rate": 5.2670739120210574e-05, |
|
"loss": 2.4666, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5442708333333334, |
|
"grad_norm": 0.15772002651226175, |
|
"learning_rate": 5.2282967615997125e-05, |
|
"loss": 2.2665, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 0.1436207432226202, |
|
"learning_rate": 5.1895398501805383e-05, |
|
"loss": 2.4412, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5494791666666666, |
|
"grad_norm": 0.1627045485383769, |
|
"learning_rate": 5.150806064741323e-05, |
|
"loss": 2.4241, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5520833333333334, |
|
"grad_norm": 0.1608053633868765, |
|
"learning_rate": 5.112098290537213e-05, |
|
"loss": 2.2647, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5546875, |
|
"grad_norm": 0.15608204954248603, |
|
"learning_rate": 5.0734194108858e-05, |
|
"loss": 2.4768, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5572916666666666, |
|
"grad_norm": 0.1576523104437354, |
|
"learning_rate": 5.0347723069523355e-05, |
|
"loss": 2.2488, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5598958333333334, |
|
"grad_norm": 0.16267311756196431, |
|
"learning_rate": 4.9961598575351155e-05, |
|
"loss": 2.2745, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.14597656332784, |
|
"learning_rate": 4.9575849388510473e-05, |
|
"loss": 2.2226, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5651041666666666, |
|
"grad_norm": 0.12952717920836662, |
|
"learning_rate": 4.919050424321395e-05, |
|
"loss": 2.2863, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5677083333333334, |
|
"grad_norm": 0.14126095486957518, |
|
"learning_rate": 4.880559184357745e-05, |
|
"loss": 2.2773, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5703125, |
|
"grad_norm": 0.15894975244045567, |
|
"learning_rate": 4.842114086148185e-05, |
|
"loss": 2.4144, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5729166666666666, |
|
"grad_norm": 0.15187930552412898, |
|
"learning_rate": 4.803717993443734e-05, |
|
"loss": 2.2729, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5755208333333334, |
|
"grad_norm": 0.15567668429445186, |
|
"learning_rate": 4.765373766345028e-05, |
|
"loss": 2.4327, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.578125, |
|
"grad_norm": 0.1577838508745425, |
|
"learning_rate": 4.727084261089257e-05, |
|
"loss": 2.5534, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5807291666666666, |
|
"grad_norm": 0.1447851641278385, |
|
"learning_rate": 4.688852329837424e-05, |
|
"loss": 2.2928, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 0.13963925711509403, |
|
"learning_rate": 4.6506808204618754e-05, |
|
"loss": 2.3612, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5859375, |
|
"grad_norm": 0.17944687510007068, |
|
"learning_rate": 4.612572576334171e-05, |
|
"loss": 2.4195, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5885416666666666, |
|
"grad_norm": 0.15388075071309162, |
|
"learning_rate": 4.574530436113286e-05, |
|
"loss": 2.5277, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5911458333333334, |
|
"grad_norm": 0.12718481374460847, |
|
"learning_rate": 4.536557233534153e-05, |
|
"loss": 2.2991, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.14948263293785125, |
|
"learning_rate": 4.498655797196586e-05, |
|
"loss": 2.4404, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5963541666666666, |
|
"grad_norm": 0.13608773847019048, |
|
"learning_rate": 4.460828950354577e-05, |
|
"loss": 2.2945, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5989583333333334, |
|
"grad_norm": 0.14126010771163286, |
|
"learning_rate": 4.423079510705992e-05, |
|
"loss": 2.2835, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6015625, |
|
"grad_norm": 0.15458611014177978, |
|
"learning_rate": 4.3854102901826834e-05, |
|
"loss": 2.4491, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6041666666666666, |
|
"grad_norm": 0.17078246755698387, |
|
"learning_rate": 4.3478240947410386e-05, |
|
"loss": 2.5331, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6067708333333334, |
|
"grad_norm": 0.14807435693469648, |
|
"learning_rate": 4.3103237241529506e-05, |
|
"loss": 2.4154, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.609375, |
|
"grad_norm": 0.15605308853260608, |
|
"learning_rate": 4.272911971797279e-05, |
|
"loss": 2.3354, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.609375, |
|
"eval_loss": 2.401761531829834, |
|
"eval_runtime": 65.3141, |
|
"eval_samples_per_second": 1.225, |
|
"eval_steps_per_second": 0.153, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6119791666666666, |
|
"grad_norm": 0.15343655698839206, |
|
"learning_rate": 4.235591624451763e-05, |
|
"loss": 2.5605, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6145833333333334, |
|
"grad_norm": 0.14304569592672528, |
|
"learning_rate": 4.198365462085446e-05, |
|
"loss": 2.5812, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6171875, |
|
"grad_norm": 0.15320835548074707, |
|
"learning_rate": 4.161236257651587e-05, |
|
"loss": 2.5275, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6197916666666666, |
|
"grad_norm": 0.15452569712518102, |
|
"learning_rate": 4.1242067768811134e-05, |
|
"loss": 2.4707, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6223958333333334, |
|
"grad_norm": 0.16320750887974791, |
|
"learning_rate": 4.0872797780765946e-05, |
|
"loss": 2.3996, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.14194397858802227, |
|
"learning_rate": 4.0504580119067933e-05, |
|
"loss": 2.2431, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6276041666666666, |
|
"grad_norm": 0.1437781011206105, |
|
"learning_rate": 4.01374422120175e-05, |
|
"loss": 2.3929, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6302083333333334, |
|
"grad_norm": 0.14383943780586664, |
|
"learning_rate": 3.977141140748484e-05, |
|
"loss": 2.3989, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6328125, |
|
"grad_norm": 0.16948819133813695, |
|
"learning_rate": 3.94065149708728e-05, |
|
"loss": 2.4256, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6354166666666666, |
|
"grad_norm": 0.14755837584042042, |
|
"learning_rate": 3.904278008308589e-05, |
|
"loss": 2.2711, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6380208333333334, |
|
"grad_norm": 0.15519908616035058, |
|
"learning_rate": 3.868023383850556e-05, |
|
"loss": 2.4623, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.640625, |
|
"grad_norm": 0.16021538951276384, |
|
"learning_rate": 3.831890324297197e-05, |
|
"loss": 2.3857, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6432291666666666, |
|
"grad_norm": 0.15886505001684953, |
|
"learning_rate": 3.795881521177236e-05, |
|
"loss": 2.5196, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6458333333333334, |
|
"grad_norm": 0.16005244689800305, |
|
"learning_rate": 3.7599996567636156e-05, |
|
"loss": 2.406, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6484375, |
|
"grad_norm": 0.1470078515999776, |
|
"learning_rate": 3.724247403873694e-05, |
|
"loss": 2.4975, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6510416666666666, |
|
"grad_norm": 0.1560311928142992, |
|
"learning_rate": 3.688627425670147e-05, |
|
"loss": 2.374, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6536458333333334, |
|
"grad_norm": 0.15349615074542047, |
|
"learning_rate": 3.653142375462596e-05, |
|
"loss": 2.4155, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.20641136824203335, |
|
"learning_rate": 3.6177948965099585e-05, |
|
"loss": 2.4358, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6588541666666666, |
|
"grad_norm": 0.14326347914064022, |
|
"learning_rate": 3.582587621823558e-05, |
|
"loss": 2.4528, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6614583333333334, |
|
"grad_norm": 0.13097442902507145, |
|
"learning_rate": 3.547523173970989e-05, |
|
"loss": 2.3682, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6640625, |
|
"grad_norm": 0.14938841182330287, |
|
"learning_rate": 3.51260416488077e-05, |
|
"loss": 2.5273, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.1532472206020292, |
|
"learning_rate": 3.477833195647773e-05, |
|
"loss": 2.5301, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6692708333333334, |
|
"grad_norm": 0.12783586158700921, |
|
"learning_rate": 3.443212856339481e-05, |
|
"loss": 2.3279, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.671875, |
|
"grad_norm": 0.15547617846106007, |
|
"learning_rate": 3.408745725803042e-05, |
|
"loss": 2.4209, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6744791666666666, |
|
"grad_norm": 0.13531260542176757, |
|
"learning_rate": 3.3744343714731835e-05, |
|
"loss": 2.3595, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6770833333333334, |
|
"grad_norm": 0.1488109854224464, |
|
"learning_rate": 3.3402813491809623e-05, |
|
"loss": 2.2631, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6796875, |
|
"grad_norm": 0.1753652780376821, |
|
"learning_rate": 3.3062892029633817e-05, |
|
"loss": 2.4748, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6822916666666666, |
|
"grad_norm": 0.1593205802273226, |
|
"learning_rate": 3.272460464873884e-05, |
|
"loss": 2.4484, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6848958333333334, |
|
"grad_norm": 0.1401885264986934, |
|
"learning_rate": 3.238797654793752e-05, |
|
"loss": 2.4234, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.15910838717602993, |
|
"learning_rate": 3.205303280244389e-05, |
|
"loss": 2.4679, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6901041666666666, |
|
"grad_norm": 0.16179488061734165, |
|
"learning_rate": 3.1719798362005444e-05, |
|
"loss": 2.4883, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6927083333333334, |
|
"grad_norm": 0.15848452909780508, |
|
"learning_rate": 3.138829804904464e-05, |
|
"loss": 2.4583, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6953125, |
|
"grad_norm": 0.16307212504652477, |
|
"learning_rate": 3.105855655680986e-05, |
|
"loss": 2.3327, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6979166666666666, |
|
"grad_norm": 0.1467535420565889, |
|
"learning_rate": 3.073059844753604e-05, |
|
"loss": 2.4382, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7005208333333334, |
|
"grad_norm": 0.14201414432531673, |
|
"learning_rate": 3.0404448150615063e-05, |
|
"loss": 2.3501, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.703125, |
|
"grad_norm": 0.1549923554458448, |
|
"learning_rate": 3.0080129960776017e-05, |
|
"loss": 2.396, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7057291666666666, |
|
"grad_norm": 0.1524034184779795, |
|
"learning_rate": 2.9757668036275477e-05, |
|
"loss": 2.2784, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7083333333333334, |
|
"grad_norm": 0.16256677012412982, |
|
"learning_rate": 2.9437086397097995e-05, |
|
"loss": 2.3027, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7109375, |
|
"grad_norm": 0.15633557288864075, |
|
"learning_rate": 2.9118408923166875e-05, |
|
"loss": 2.5473, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7109375, |
|
"eval_loss": 2.3996334075927734, |
|
"eval_runtime": 65.6177, |
|
"eval_samples_per_second": 1.219, |
|
"eval_steps_per_second": 0.152, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7135416666666666, |
|
"grad_norm": 0.14577207953482688, |
|
"learning_rate": 2.8801659352565335e-05, |
|
"loss": 2.3106, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7161458333333334, |
|
"grad_norm": 0.12720217228966463, |
|
"learning_rate": 2.848686127976829e-05, |
|
"loss": 2.4048, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.1727762156703378, |
|
"learning_rate": 2.8174038153884862e-05, |
|
"loss": 2.3996, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7213541666666666, |
|
"grad_norm": 0.20872850658454511, |
|
"learning_rate": 2.786321327691158e-05, |
|
"loss": 2.5836, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7239583333333334, |
|
"grad_norm": 0.1591486153830447, |
|
"learning_rate": 2.7554409801996723e-05, |
|
"loss": 2.4478, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7265625, |
|
"grad_norm": 0.14032674083135735, |
|
"learning_rate": 2.7247650731715564e-05, |
|
"loss": 2.2852, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7291666666666666, |
|
"grad_norm": 0.15626590715798697, |
|
"learning_rate": 2.6942958916356998e-05, |
|
"loss": 2.525, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7317708333333334, |
|
"grad_norm": 0.16623699130944486, |
|
"learning_rate": 2.66403570522214e-05, |
|
"loss": 2.3565, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.734375, |
|
"grad_norm": 0.1449533445649136, |
|
"learning_rate": 2.6339867679929997e-05, |
|
"loss": 2.2674, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7369791666666666, |
|
"grad_norm": 0.14290681116465945, |
|
"learning_rate": 2.6041513182745837e-05, |
|
"loss": 2.1482, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7395833333333334, |
|
"grad_norm": 0.13123439654719476, |
|
"learning_rate": 2.574531578490651e-05, |
|
"loss": 2.2889, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7421875, |
|
"grad_norm": 0.1510144014374668, |
|
"learning_rate": 2.54512975499686e-05, |
|
"loss": 2.3577, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7447916666666666, |
|
"grad_norm": 0.17738379736772392, |
|
"learning_rate": 2.515948037916423e-05, |
|
"loss": 2.3591, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7473958333333334, |
|
"grad_norm": 0.14040918825453863, |
|
"learning_rate": 2.4869886009769657e-05, |
|
"loss": 2.3019, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.15309282350105588, |
|
"learning_rate": 2.4582536013486054e-05, |
|
"loss": 2.3621, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7526041666666666, |
|
"grad_norm": 0.15908796843842757, |
|
"learning_rate": 2.429745179483272e-05, |
|
"loss": 2.451, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7552083333333334, |
|
"grad_norm": 0.15934279432082493, |
|
"learning_rate": 2.4014654589552526e-05, |
|
"loss": 2.4775, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7578125, |
|
"grad_norm": 0.15682626436856428, |
|
"learning_rate": 2.3734165463030244e-05, |
|
"loss": 2.4, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7604166666666666, |
|
"grad_norm": 0.1688230501684105, |
|
"learning_rate": 2.345600530872328e-05, |
|
"loss": 2.4442, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7630208333333334, |
|
"grad_norm": 0.16115250940484133, |
|
"learning_rate": 2.3180194846605367e-05, |
|
"loss": 2.4088, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.765625, |
|
"grad_norm": 0.14208493985226064, |
|
"learning_rate": 2.290675462162318e-05, |
|
"loss": 2.2623, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7682291666666666, |
|
"grad_norm": 0.13703687813086513, |
|
"learning_rate": 2.263570500216591e-05, |
|
"loss": 2.486, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7708333333333334, |
|
"grad_norm": 0.16480865487948104, |
|
"learning_rate": 2.2367066178548072e-05, |
|
"loss": 2.321, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7734375, |
|
"grad_norm": 0.15426178206689278, |
|
"learning_rate": 2.2100858161505506e-05, |
|
"loss": 2.2805, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7760416666666666, |
|
"grad_norm": 0.14570862119709826, |
|
"learning_rate": 2.183710078070485e-05, |
|
"loss": 2.551, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7786458333333334, |
|
"grad_norm": 0.14473284935283884, |
|
"learning_rate": 2.157581368326635e-05, |
|
"loss": 2.1132, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.15820239745667702, |
|
"learning_rate": 2.1317016332300447e-05, |
|
"loss": 2.4685, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7838541666666666, |
|
"grad_norm": 0.14299466522368254, |
|
"learning_rate": 2.106072800545793e-05, |
|
"loss": 2.4417, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7864583333333334, |
|
"grad_norm": 0.13794584283858527, |
|
"learning_rate": 2.080696779349396e-05, |
|
"loss": 2.3209, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7890625, |
|
"grad_norm": 0.14676514709546462, |
|
"learning_rate": 2.0555754598846027e-05, |
|
"loss": 2.3155, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7916666666666666, |
|
"grad_norm": 0.1551110645608698, |
|
"learning_rate": 2.030710713422592e-05, |
|
"loss": 2.5221, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7942708333333334, |
|
"grad_norm": 0.15076270062351085, |
|
"learning_rate": 2.0061043921225828e-05, |
|
"loss": 2.3056, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.796875, |
|
"grad_norm": 0.14884431911568607, |
|
"learning_rate": 1.981758328893866e-05, |
|
"loss": 2.4404, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7994791666666666, |
|
"grad_norm": 0.14993854232241055, |
|
"learning_rate": 1.9576743372592747e-05, |
|
"loss": 2.2778, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8020833333333334, |
|
"grad_norm": 0.13916032144027837, |
|
"learning_rate": 1.933854211220094e-05, |
|
"loss": 2.3168, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8046875, |
|
"grad_norm": 0.1567961212337068, |
|
"learning_rate": 1.9102997251224282e-05, |
|
"loss": 2.3217, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8072916666666666, |
|
"grad_norm": 0.14185229006162023, |
|
"learning_rate": 1.8870126335250293e-05, |
|
"loss": 2.3056, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8098958333333334, |
|
"grad_norm": 0.14083323135716713, |
|
"learning_rate": 1.8639946710686064e-05, |
|
"loss": 2.5101, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.1428611888153054, |
|
"learning_rate": 1.841247552346603e-05, |
|
"loss": 2.4123, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_loss": 2.398231029510498, |
|
"eval_runtime": 65.4627, |
|
"eval_samples_per_second": 1.222, |
|
"eval_steps_per_second": 0.153, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8151041666666666, |
|
"grad_norm": 0.13210051743324183, |
|
"learning_rate": 1.8187729717774925e-05, |
|
"loss": 2.2753, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8177083333333334, |
|
"grad_norm": 0.1474646492548143, |
|
"learning_rate": 1.7965726034785466e-05, |
|
"loss": 2.3933, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8203125, |
|
"grad_norm": 0.16871013478557104, |
|
"learning_rate": 1.7746481011411416e-05, |
|
"loss": 2.3614, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8229166666666666, |
|
"grad_norm": 0.15193041380102754, |
|
"learning_rate": 1.753001097907572e-05, |
|
"loss": 2.3525, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8255208333333334, |
|
"grad_norm": 0.14955167945389547, |
|
"learning_rate": 1.7316332062494016e-05, |
|
"loss": 2.4257, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.828125, |
|
"grad_norm": 0.1572944020219996, |
|
"learning_rate": 1.710546017847347e-05, |
|
"loss": 2.5182, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8307291666666666, |
|
"grad_norm": 0.1493341360803509, |
|
"learning_rate": 1.6897411034727218e-05, |
|
"loss": 2.3794, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.13561727835199758, |
|
"learning_rate": 1.66922001287042e-05, |
|
"loss": 2.2277, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8359375, |
|
"grad_norm": 0.13052163343368234, |
|
"learning_rate": 1.648984274643487e-05, |
|
"loss": 2.309, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8385416666666666, |
|
"grad_norm": 0.1578566178286769, |
|
"learning_rate": 1.629035396139247e-05, |
|
"loss": 2.4832, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8411458333333334, |
|
"grad_norm": 0.16163117665249763, |
|
"learning_rate": 1.6093748633370295e-05, |
|
"loss": 2.3515, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.16589478114748782, |
|
"learning_rate": 1.5900041407374708e-05, |
|
"loss": 2.5438, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8463541666666666, |
|
"grad_norm": 0.13933316472296842, |
|
"learning_rate": 1.5709246712534315e-05, |
|
"loss": 2.4306, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8489583333333334, |
|
"grad_norm": 0.1606181863885322, |
|
"learning_rate": 1.5521378761025113e-05, |
|
"loss": 2.5248, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8515625, |
|
"grad_norm": 0.15583835916491617, |
|
"learning_rate": 1.5336451547011838e-05, |
|
"loss": 2.57, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8541666666666666, |
|
"grad_norm": 0.13760770777048018, |
|
"learning_rate": 1.515447884560556e-05, |
|
"loss": 2.4217, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8567708333333334, |
|
"grad_norm": 0.14254277680719865, |
|
"learning_rate": 1.4975474211837561e-05, |
|
"loss": 2.5538, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 0.15022329502854323, |
|
"learning_rate": 1.479945097964967e-05, |
|
"loss": 2.4875, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8619791666666666, |
|
"grad_norm": 0.14442101795365278, |
|
"learning_rate": 1.4626422260900962e-05, |
|
"loss": 2.4053, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8645833333333334, |
|
"grad_norm": 0.1339112894355035, |
|
"learning_rate": 1.4456400944391146e-05, |
|
"loss": 2.3938, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8671875, |
|
"grad_norm": 0.15065063782655086, |
|
"learning_rate": 1.4289399694900398e-05, |
|
"loss": 2.4288, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8697916666666666, |
|
"grad_norm": 0.15054714911892625, |
|
"learning_rate": 1.4125430952246071e-05, |
|
"loss": 2.4214, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8723958333333334, |
|
"grad_norm": 0.1560638225083581, |
|
"learning_rate": 1.3964506930355947e-05, |
|
"loss": 2.3693, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.17170476718899333, |
|
"learning_rate": 1.380663961635852e-05, |
|
"loss": 2.3902, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8776041666666666, |
|
"grad_norm": 0.14853468036224993, |
|
"learning_rate": 1.3651840769690028e-05, |
|
"loss": 2.5229, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8802083333333334, |
|
"grad_norm": 0.13862935132651075, |
|
"learning_rate": 1.350012192121854e-05, |
|
"loss": 2.272, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8828125, |
|
"grad_norm": 0.16359619179880716, |
|
"learning_rate": 1.3351494372384995e-05, |
|
"loss": 2.5031, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8854166666666666, |
|
"grad_norm": 0.15198032362712752, |
|
"learning_rate": 1.3205969194361395e-05, |
|
"loss": 2.355, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8880208333333334, |
|
"grad_norm": 0.12892013869101834, |
|
"learning_rate": 1.3063557227226094e-05, |
|
"loss": 2.3602, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.890625, |
|
"grad_norm": 0.1498170376596413, |
|
"learning_rate": 1.292426907915634e-05, |
|
"loss": 2.3912, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8932291666666666, |
|
"grad_norm": 0.15638364831022664, |
|
"learning_rate": 1.2788115125638068e-05, |
|
"loss": 2.4359, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.8958333333333334, |
|
"grad_norm": 0.151291537071441, |
|
"learning_rate": 1.2655105508693065e-05, |
|
"loss": 2.3082, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8984375, |
|
"grad_norm": 0.14865959228520678, |
|
"learning_rate": 1.252525013612346e-05, |
|
"loss": 2.3877, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9010416666666666, |
|
"grad_norm": 0.1459248998632338, |
|
"learning_rate": 1.2398558680773736e-05, |
|
"loss": 2.3293, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.9036458333333334, |
|
"grad_norm": 0.14220792400443616, |
|
"learning_rate": 1.227504057981016e-05, |
|
"loss": 2.4427, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.15415554255450412, |
|
"learning_rate": 1.2154705034017866e-05, |
|
"loss": 2.383, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9088541666666666, |
|
"grad_norm": 0.1457198339110242, |
|
"learning_rate": 1.203756100711545e-05, |
|
"loss": 2.3751, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.9114583333333334, |
|
"grad_norm": 0.15986391066488098, |
|
"learning_rate": 1.1923617225087293e-05, |
|
"loss": 2.3891, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9140625, |
|
"grad_norm": 0.13681607679501942, |
|
"learning_rate": 1.1812882175533564e-05, |
|
"loss": 2.2878, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9140625, |
|
"eval_loss": 2.397103786468506, |
|
"eval_runtime": 65.1202, |
|
"eval_samples_per_second": 1.228, |
|
"eval_steps_per_second": 0.154, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9166666666666666, |
|
"grad_norm": 0.17312639183632889, |
|
"learning_rate": 1.1705364107037981e-05, |
|
"loss": 2.4481, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9192708333333334, |
|
"grad_norm": 0.17233940633090827, |
|
"learning_rate": 1.1601071028553371e-05, |
|
"loss": 2.4747, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.921875, |
|
"grad_norm": 0.13057391845213812, |
|
"learning_rate": 1.1500010708805123e-05, |
|
"loss": 2.3276, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9244791666666666, |
|
"grad_norm": 0.15218005054913405, |
|
"learning_rate": 1.1402190675712448e-05, |
|
"loss": 2.4584, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9270833333333334, |
|
"grad_norm": 0.1405908251620449, |
|
"learning_rate": 1.130761821582766e-05, |
|
"loss": 2.3576, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9296875, |
|
"grad_norm": 0.15487043477366738, |
|
"learning_rate": 1.1216300373793417e-05, |
|
"loss": 2.3773, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9322916666666666, |
|
"grad_norm": 0.15092606814376955, |
|
"learning_rate": 1.1128243951817937e-05, |
|
"loss": 2.2986, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9348958333333334, |
|
"grad_norm": 0.1484173556255145, |
|
"learning_rate": 1.1043455509168339e-05, |
|
"loss": 2.2237, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.14235753907847776, |
|
"learning_rate": 1.0961941361682013e-05, |
|
"loss": 2.4375, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9401041666666666, |
|
"grad_norm": 0.14142558045399545, |
|
"learning_rate": 1.0883707581296196e-05, |
|
"loss": 2.5165, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9427083333333334, |
|
"grad_norm": 0.13890140092039985, |
|
"learning_rate": 1.080875999559564e-05, |
|
"loss": 2.477, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9453125, |
|
"grad_norm": 0.149033997441562, |
|
"learning_rate": 1.0737104187378542e-05, |
|
"loss": 2.386, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9479166666666666, |
|
"grad_norm": 0.15829792856388608, |
|
"learning_rate": 1.066874549424068e-05, |
|
"loss": 2.2997, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9505208333333334, |
|
"grad_norm": 0.14793151985278888, |
|
"learning_rate": 1.0603689008177822e-05, |
|
"loss": 2.4599, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.953125, |
|
"grad_norm": 0.1556260763161562, |
|
"learning_rate": 1.0541939575206412e-05, |
|
"loss": 2.2611, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9557291666666666, |
|
"grad_norm": 0.16684067808748887, |
|
"learning_rate": 1.0483501795002612e-05, |
|
"loss": 2.4216, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9583333333333334, |
|
"grad_norm": 0.14213262998853762, |
|
"learning_rate": 1.0428380020559658e-05, |
|
"loss": 2.4624, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9609375, |
|
"grad_norm": 0.16542015576320396, |
|
"learning_rate": 1.0376578357863627e-05, |
|
"loss": 2.087, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9635416666666666, |
|
"grad_norm": 0.16312011726686781, |
|
"learning_rate": 1.0328100665587574e-05, |
|
"loss": 2.3865, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9661458333333334, |
|
"grad_norm": 0.14758591702174165, |
|
"learning_rate": 1.0282950554804085e-05, |
|
"loss": 2.3726, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.1316294327416778, |
|
"learning_rate": 1.0241131388716332e-05, |
|
"loss": 2.4155, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9713541666666666, |
|
"grad_norm": 0.14988844894688882, |
|
"learning_rate": 1.0202646282407505e-05, |
|
"loss": 2.3134, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9739583333333334, |
|
"grad_norm": 0.16468705638317635, |
|
"learning_rate": 1.016749810260881e-05, |
|
"loss": 2.3345, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9765625, |
|
"grad_norm": 0.16639080943133064, |
|
"learning_rate": 1.01356894674859e-05, |
|
"loss": 2.1808, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9791666666666666, |
|
"grad_norm": 0.1537739755991523, |
|
"learning_rate": 1.0107222746443862e-05, |
|
"loss": 2.3806, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9817708333333334, |
|
"grad_norm": 0.1582641580303379, |
|
"learning_rate": 1.0082100059950713e-05, |
|
"loss": 2.4064, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.984375, |
|
"grad_norm": 0.16490051463401306, |
|
"learning_rate": 1.0060323279379476e-05, |
|
"loss": 2.3932, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9869791666666666, |
|
"grad_norm": 0.14677990127698765, |
|
"learning_rate": 1.0041894026868732e-05, |
|
"loss": 2.3932, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9895833333333334, |
|
"grad_norm": 0.17686734833911588, |
|
"learning_rate": 1.0026813675201832e-05, |
|
"loss": 2.2527, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9921875, |
|
"grad_norm": 0.15542797613175577, |
|
"learning_rate": 1.0015083347704623e-05, |
|
"loss": 2.4111, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9947916666666666, |
|
"grad_norm": 0.1559166935312072, |
|
"learning_rate": 1.0006703918161775e-05, |
|
"loss": 2.4522, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9973958333333334, |
|
"grad_norm": 0.14284208961058514, |
|
"learning_rate": 1.000167601075169e-05, |
|
"loss": 2.3575, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.14031951339304571, |
|
"learning_rate": 1e-05, |
|
"loss": 2.3152, |
|
"step": 384 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 384, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 39, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 126942053400576.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|