typhoon_cosmetic / trainer_state.json
SADATO's picture
Upload 11 files
b996f24 verified
{
"best_metric": 0.49299946427345276,
"best_model_checkpoint": "model/E3/typhoon_E3_shuffle_augment_gpt4/checkpoint-7179",
"epoch": 9.995821145006268,
"eval_steps": 500,
"global_step": 11960,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.7644,
"step": 20
},
{
"epoch": 0.03,
"learning_rate": 4.99999133180605e-05,
"loss": 1.8649,
"step": 40
},
{
"epoch": 0.05,
"learning_rate": 4.999921986615105e-05,
"loss": 1.6283,
"step": 60
},
{
"epoch": 0.07,
"learning_rate": 4.999783298156723e-05,
"loss": 1.6308,
"step": 80
},
{
"epoch": 0.08,
"learning_rate": 4.99957527027787e-05,
"loss": 1.5728,
"step": 100
},
{
"epoch": 0.1,
"learning_rate": 4.999297908748858e-05,
"loss": 1.5868,
"step": 120
},
{
"epoch": 0.12,
"learning_rate": 4.998951221263189e-05,
"loss": 1.5358,
"step": 140
},
{
"epoch": 0.13,
"learning_rate": 4.9985352174373414e-05,
"loss": 1.5481,
"step": 160
},
{
"epoch": 0.15,
"learning_rate": 4.998049908810499e-05,
"loss": 1.5296,
"step": 180
},
{
"epoch": 0.17,
"learning_rate": 4.997495308844239e-05,
"loss": 1.5304,
"step": 200
},
{
"epoch": 0.18,
"learning_rate": 4.9968714329221486e-05,
"loss": 1.5055,
"step": 220
},
{
"epoch": 0.2,
"learning_rate": 4.996178298349407e-05,
"loss": 1.4966,
"step": 240
},
{
"epoch": 0.22,
"learning_rate": 4.9954159243523e-05,
"loss": 1.4697,
"step": 260
},
{
"epoch": 0.23,
"learning_rate": 4.9945843320776896e-05,
"loss": 1.4877,
"step": 280
},
{
"epoch": 0.25,
"learning_rate": 4.993683544592426e-05,
"loss": 1.4842,
"step": 300
},
{
"epoch": 0.27,
"learning_rate": 4.9927135868827065e-05,
"loss": 1.4808,
"step": 320
},
{
"epoch": 0.28,
"learning_rate": 4.991674485853387e-05,
"loss": 1.4508,
"step": 340
},
{
"epoch": 0.3,
"learning_rate": 4.99056627032723e-05,
"loss": 1.458,
"step": 360
},
{
"epoch": 0.32,
"learning_rate": 4.98938897104411e-05,
"loss": 1.4582,
"step": 380
},
{
"epoch": 0.33,
"learning_rate": 4.9881426206601566e-05,
"loss": 1.4379,
"step": 400
},
{
"epoch": 0.35,
"learning_rate": 4.986827253746853e-05,
"loss": 1.4557,
"step": 420
},
{
"epoch": 0.37,
"learning_rate": 4.9854429067900723e-05,
"loss": 1.4214,
"step": 440
},
{
"epoch": 0.38,
"learning_rate": 4.98398961818907e-05,
"loss": 1.4331,
"step": 460
},
{
"epoch": 0.4,
"learning_rate": 4.9824674282554165e-05,
"loss": 1.4215,
"step": 480
},
{
"epoch": 0.42,
"learning_rate": 4.980876379211879e-05,
"loss": 1.4167,
"step": 500
},
{
"epoch": 0.43,
"learning_rate": 4.9792165151912484e-05,
"loss": 1.4046,
"step": 520
},
{
"epoch": 0.45,
"learning_rate": 4.977487882235121e-05,
"loss": 1.4067,
"step": 540
},
{
"epoch": 0.47,
"learning_rate": 4.975690528292617e-05,
"loss": 1.413,
"step": 560
},
{
"epoch": 0.48,
"learning_rate": 4.973824503219048e-05,
"loss": 1.4029,
"step": 580
},
{
"epoch": 0.5,
"learning_rate": 4.97188985877454e-05,
"loss": 1.3601,
"step": 600
},
{
"epoch": 0.52,
"learning_rate": 4.9698866486225955e-05,
"loss": 1.3631,
"step": 620
},
{
"epoch": 0.53,
"learning_rate": 4.9678149283286024e-05,
"loss": 1.3568,
"step": 640
},
{
"epoch": 0.55,
"learning_rate": 4.965674755358296e-05,
"loss": 1.378,
"step": 660
},
{
"epoch": 0.57,
"learning_rate": 4.9634661890761634e-05,
"loss": 1.3617,
"step": 680
},
{
"epoch": 0.59,
"learning_rate": 4.9611892907437974e-05,
"loss": 1.3748,
"step": 700
},
{
"epoch": 0.6,
"learning_rate": 4.958844123518197e-05,
"loss": 1.329,
"step": 720
},
{
"epoch": 0.62,
"learning_rate": 4.956430752450014e-05,
"loss": 1.3485,
"step": 740
},
{
"epoch": 0.64,
"learning_rate": 4.953949244481754e-05,
"loss": 1.3259,
"step": 760
},
{
"epoch": 0.65,
"learning_rate": 4.9513996684459105e-05,
"loss": 1.3245,
"step": 780
},
{
"epoch": 0.67,
"learning_rate": 4.948782095063066e-05,
"loss": 1.3413,
"step": 800
},
{
"epoch": 0.69,
"learning_rate": 4.946096596939921e-05,
"loss": 1.302,
"step": 820
},
{
"epoch": 0.7,
"learning_rate": 4.9433432485672864e-05,
"loss": 1.2948,
"step": 840
},
{
"epoch": 0.72,
"learning_rate": 4.940522126318015e-05,
"loss": 1.3077,
"step": 860
},
{
"epoch": 0.74,
"learning_rate": 4.9376333084448806e-05,
"loss": 1.3172,
"step": 880
},
{
"epoch": 0.75,
"learning_rate": 4.934676875078414e-05,
"loss": 1.3117,
"step": 900
},
{
"epoch": 0.77,
"learning_rate": 4.931652908224673e-05,
"loss": 1.3061,
"step": 920
},
{
"epoch": 0.79,
"learning_rate": 4.928561491762973e-05,
"loss": 1.2932,
"step": 940
},
{
"epoch": 0.8,
"learning_rate": 4.9254027114435554e-05,
"loss": 1.2772,
"step": 960
},
{
"epoch": 0.82,
"learning_rate": 4.922176654885215e-05,
"loss": 1.2484,
"step": 980
},
{
"epoch": 0.84,
"learning_rate": 4.9188834115728653e-05,
"loss": 1.2507,
"step": 1000
},
{
"epoch": 0.85,
"learning_rate": 4.9155230728550584e-05,
"loss": 1.242,
"step": 1020
},
{
"epoch": 0.87,
"learning_rate": 4.912095731941447e-05,
"loss": 1.231,
"step": 1040
},
{
"epoch": 0.89,
"learning_rate": 4.908601483900207e-05,
"loss": 1.2293,
"step": 1060
},
{
"epoch": 0.9,
"learning_rate": 4.9050404256553925e-05,
"loss": 1.2514,
"step": 1080
},
{
"epoch": 0.92,
"learning_rate": 4.901412655984252e-05,
"loss": 1.2267,
"step": 1100
},
{
"epoch": 0.94,
"learning_rate": 4.897718275514487e-05,
"loss": 1.2314,
"step": 1120
},
{
"epoch": 0.95,
"learning_rate": 4.893957386721459e-05,
"loss": 1.2156,
"step": 1140
},
{
"epoch": 0.97,
"learning_rate": 4.8901300939253516e-05,
"loss": 1.199,
"step": 1160
},
{
"epoch": 0.99,
"learning_rate": 4.8862365032882726e-05,
"loss": 1.2056,
"step": 1180
},
{
"epoch": 1.0,
"eval_loss": 1.2155479192733765,
"eval_runtime": 137.6539,
"eval_samples_per_second": 34.376,
"eval_steps_per_second": 8.594,
"step": 1196
},
{
"epoch": 1.0,
"learning_rate": 4.882276722811311e-05,
"loss": 1.1928,
"step": 1200
},
{
"epoch": 1.02,
"learning_rate": 4.8782508623315396e-05,
"loss": 1.1303,
"step": 1220
},
{
"epoch": 1.04,
"learning_rate": 4.874159033518973e-05,
"loss": 1.1062,
"step": 1240
},
{
"epoch": 1.05,
"learning_rate": 4.870001349873464e-05,
"loss": 1.0967,
"step": 1260
},
{
"epoch": 1.07,
"learning_rate": 4.865777926721559e-05,
"loss": 1.0976,
"step": 1280
},
{
"epoch": 1.09,
"learning_rate": 4.8614888812132976e-05,
"loss": 1.0931,
"step": 1300
},
{
"epoch": 1.1,
"learning_rate": 4.8571343323189654e-05,
"loss": 1.0678,
"step": 1320
},
{
"epoch": 1.12,
"learning_rate": 4.85271440082579e-05,
"loss": 1.0681,
"step": 1340
},
{
"epoch": 1.14,
"learning_rate": 4.8482292093345944e-05,
"loss": 1.0742,
"step": 1360
},
{
"epoch": 1.15,
"learning_rate": 4.843678882256394e-05,
"loss": 1.0473,
"step": 1380
},
{
"epoch": 1.17,
"learning_rate": 4.8390635458089465e-05,
"loss": 1.0621,
"step": 1400
},
{
"epoch": 1.19,
"learning_rate": 4.83438332801325e-05,
"loss": 1.0477,
"step": 1420
},
{
"epoch": 1.2,
"learning_rate": 4.829638358689995e-05,
"loss": 1.0378,
"step": 1440
},
{
"epoch": 1.22,
"learning_rate": 4.824828769455957e-05,
"loss": 1.0426,
"step": 1460
},
{
"epoch": 1.24,
"learning_rate": 4.8199546937203546e-05,
"loss": 1.0268,
"step": 1480
},
{
"epoch": 1.25,
"learning_rate": 4.8150162666811395e-05,
"loss": 1.0189,
"step": 1500
},
{
"epoch": 1.27,
"learning_rate": 4.810013625321253e-05,
"loss": 1.0067,
"step": 1520
},
{
"epoch": 1.29,
"learning_rate": 4.804946908404827e-05,
"loss": 0.998,
"step": 1540
},
{
"epoch": 1.3,
"learning_rate": 4.799816256473327e-05,
"loss": 1.0112,
"step": 1560
},
{
"epoch": 1.32,
"learning_rate": 4.794621811841663e-05,
"loss": 1.0129,
"step": 1580
},
{
"epoch": 1.34,
"learning_rate": 4.789363718594235e-05,
"loss": 1.002,
"step": 1600
},
{
"epoch": 1.35,
"learning_rate": 4.784042122580943e-05,
"loss": 0.9852,
"step": 1620
},
{
"epoch": 1.37,
"learning_rate": 4.778657171413133e-05,
"loss": 0.9952,
"step": 1640
},
{
"epoch": 1.39,
"learning_rate": 4.773209014459512e-05,
"loss": 1.0058,
"step": 1660
},
{
"epoch": 1.4,
"learning_rate": 4.767697802841996e-05,
"loss": 0.9679,
"step": 1680
},
{
"epoch": 1.42,
"learning_rate": 4.7621236894315244e-05,
"loss": 0.9707,
"step": 1700
},
{
"epoch": 1.44,
"learning_rate": 4.756486828843818e-05,
"loss": 0.9704,
"step": 1720
},
{
"epoch": 1.45,
"learning_rate": 4.7507873774350865e-05,
"loss": 0.9588,
"step": 1740
},
{
"epoch": 1.47,
"learning_rate": 4.7450254932976965e-05,
"loss": 0.9598,
"step": 1760
},
{
"epoch": 1.49,
"learning_rate": 4.739201336255786e-05,
"loss": 0.9409,
"step": 1780
},
{
"epoch": 1.5,
"learning_rate": 4.733315067860828e-05,
"loss": 0.9543,
"step": 1800
},
{
"epoch": 1.52,
"learning_rate": 4.727366851387149e-05,
"loss": 0.9643,
"step": 1820
},
{
"epoch": 1.54,
"learning_rate": 4.721356851827407e-05,
"loss": 0.9293,
"step": 1840
},
{
"epoch": 1.55,
"learning_rate": 4.715285235888003e-05,
"loss": 0.9214,
"step": 1860
},
{
"epoch": 1.57,
"learning_rate": 4.709152171984471e-05,
"loss": 0.9292,
"step": 1880
},
{
"epoch": 1.59,
"learning_rate": 4.702957830236794e-05,
"loss": 0.9117,
"step": 1900
},
{
"epoch": 1.6,
"learning_rate": 4.696702382464692e-05,
"loss": 0.9049,
"step": 1920
},
{
"epoch": 1.62,
"learning_rate": 4.690386002182856e-05,
"loss": 0.9069,
"step": 1940
},
{
"epoch": 1.64,
"learning_rate": 4.6840088645961325e-05,
"loss": 0.8863,
"step": 1960
},
{
"epoch": 1.65,
"learning_rate": 4.6775711465946614e-05,
"loss": 0.8913,
"step": 1980
},
{
"epoch": 1.67,
"learning_rate": 4.671073026748979e-05,
"loss": 0.8946,
"step": 2000
},
{
"epoch": 1.69,
"learning_rate": 4.6645146853050524e-05,
"loss": 0.8652,
"step": 2020
},
{
"epoch": 1.7,
"learning_rate": 4.65789630417929e-05,
"loss": 0.8654,
"step": 2040
},
{
"epoch": 1.72,
"learning_rate": 4.6512180669534886e-05,
"loss": 0.865,
"step": 2060
},
{
"epoch": 1.74,
"learning_rate": 4.644480158869744e-05,
"loss": 0.8613,
"step": 2080
},
{
"epoch": 1.76,
"learning_rate": 4.6376827668253145e-05,
"loss": 0.8624,
"step": 2100
},
{
"epoch": 1.77,
"learning_rate": 4.630826079367433e-05,
"loss": 0.8536,
"step": 2120
},
{
"epoch": 1.79,
"learning_rate": 4.62391028668808e-05,
"loss": 0.8413,
"step": 2140
},
{
"epoch": 1.81,
"learning_rate": 4.6169355806187056e-05,
"loss": 0.872,
"step": 2160
},
{
"epoch": 1.82,
"learning_rate": 4.609902154624909e-05,
"loss": 0.8331,
"step": 2180
},
{
"epoch": 1.84,
"learning_rate": 4.6028102038010766e-05,
"loss": 0.8309,
"step": 2200
},
{
"epoch": 1.86,
"learning_rate": 4.595659924864962e-05,
"loss": 0.8177,
"step": 2220
},
{
"epoch": 1.87,
"learning_rate": 4.588451516152238e-05,
"loss": 0.8042,
"step": 2240
},
{
"epoch": 1.89,
"learning_rate": 4.581185177610988e-05,
"loss": 0.8168,
"step": 2260
},
{
"epoch": 1.91,
"learning_rate": 4.573861110796165e-05,
"loss": 0.7986,
"step": 2280
},
{
"epoch": 1.92,
"learning_rate": 4.5664795188639967e-05,
"loss": 0.7797,
"step": 2300
},
{
"epoch": 1.94,
"learning_rate": 4.559040606566355e-05,
"loss": 0.7756,
"step": 2320
},
{
"epoch": 1.96,
"learning_rate": 4.5515445802450735e-05,
"loss": 0.7965,
"step": 2340
},
{
"epoch": 1.97,
"learning_rate": 4.543991647826222e-05,
"loss": 0.786,
"step": 2360
},
{
"epoch": 1.99,
"learning_rate": 4.536382018814345e-05,
"loss": 0.7767,
"step": 2380
},
{
"epoch": 2.0,
"eval_loss": 0.853377640247345,
"eval_runtime": 137.5007,
"eval_samples_per_second": 34.414,
"eval_steps_per_second": 8.604,
"step": 2393
},
{
"epoch": 2.01,
"learning_rate": 4.528715904286644e-05,
"loss": 0.7362,
"step": 2400
},
{
"epoch": 2.02,
"learning_rate": 4.520993516887126e-05,
"loss": 0.6641,
"step": 2420
},
{
"epoch": 2.04,
"learning_rate": 4.513215070820708e-05,
"loss": 0.6559,
"step": 2440
},
{
"epoch": 2.06,
"learning_rate": 4.505380781847266e-05,
"loss": 0.6527,
"step": 2460
},
{
"epoch": 2.07,
"learning_rate": 4.497490867275661e-05,
"loss": 0.6507,
"step": 2480
},
{
"epoch": 2.09,
"learning_rate": 4.489545545957704e-05,
"loss": 0.6531,
"step": 2500
},
{
"epoch": 2.11,
"learning_rate": 4.481545038282089e-05,
"loss": 0.6294,
"step": 2520
},
{
"epoch": 2.12,
"learning_rate": 4.4734895661682796e-05,
"loss": 0.6504,
"step": 2540
},
{
"epoch": 2.14,
"learning_rate": 4.465379353060349e-05,
"loss": 0.6653,
"step": 2560
},
{
"epoch": 2.16,
"learning_rate": 4.4572146239207904e-05,
"loss": 0.6271,
"step": 2580
},
{
"epoch": 2.17,
"learning_rate": 4.448995605224268e-05,
"loss": 0.6393,
"step": 2600
},
{
"epoch": 2.19,
"learning_rate": 4.440722524951341e-05,
"loss": 0.654,
"step": 2620
},
{
"epoch": 2.21,
"learning_rate": 4.432395612582136e-05,
"loss": 0.6366,
"step": 2640
},
{
"epoch": 2.22,
"learning_rate": 4.424015099089989e-05,
"loss": 0.6333,
"step": 2660
},
{
"epoch": 2.24,
"learning_rate": 4.415581216935025e-05,
"loss": 0.6414,
"step": 2680
},
{
"epoch": 2.26,
"learning_rate": 4.4070942000577256e-05,
"loss": 0.6278,
"step": 2700
},
{
"epoch": 2.27,
"learning_rate": 4.398554283872428e-05,
"loss": 0.6251,
"step": 2720
},
{
"epoch": 2.29,
"learning_rate": 4.389961705260801e-05,
"loss": 0.6243,
"step": 2740
},
{
"epoch": 2.31,
"learning_rate": 4.381316702565274e-05,
"loss": 0.6392,
"step": 2760
},
{
"epoch": 2.32,
"learning_rate": 4.3726195155824214e-05,
"loss": 0.6393,
"step": 2780
},
{
"epoch": 2.34,
"learning_rate": 4.363870385556318e-05,
"loss": 0.6157,
"step": 2800
},
{
"epoch": 2.36,
"learning_rate": 4.355069555171841e-05,
"loss": 0.6021,
"step": 2820
},
{
"epoch": 2.37,
"learning_rate": 4.346217268547944e-05,
"loss": 0.6122,
"step": 2840
},
{
"epoch": 2.39,
"learning_rate": 4.3373137712308794e-05,
"loss": 0.6098,
"step": 2860
},
{
"epoch": 2.41,
"learning_rate": 4.328359310187393e-05,
"loss": 0.6026,
"step": 2880
},
{
"epoch": 2.42,
"learning_rate": 4.3193541337978693e-05,
"loss": 0.5976,
"step": 2900
},
{
"epoch": 2.44,
"learning_rate": 4.3102984918494454e-05,
"loss": 0.6003,
"step": 2920
},
{
"epoch": 2.46,
"learning_rate": 4.301192635529081e-05,
"loss": 0.6012,
"step": 2940
},
{
"epoch": 2.47,
"learning_rate": 4.292036817416589e-05,
"loss": 0.5924,
"step": 2960
},
{
"epoch": 2.49,
"learning_rate": 4.282831291477632e-05,
"loss": 0.5935,
"step": 2980
},
{
"epoch": 2.51,
"learning_rate": 4.273576313056678e-05,
"loss": 0.5941,
"step": 3000
},
{
"epoch": 2.52,
"learning_rate": 4.2642721388699145e-05,
"loss": 0.5985,
"step": 3020
},
{
"epoch": 2.54,
"learning_rate": 4.254919026998131e-05,
"loss": 0.591,
"step": 3040
},
{
"epoch": 2.56,
"learning_rate": 4.245517236879558e-05,
"loss": 0.5781,
"step": 3060
},
{
"epoch": 2.57,
"learning_rate": 4.2360670293026725e-05,
"loss": 0.583,
"step": 3080
},
{
"epoch": 2.59,
"learning_rate": 4.2265686663989635e-05,
"loss": 0.5912,
"step": 3100
},
{
"epoch": 2.61,
"learning_rate": 4.217022411635658e-05,
"loss": 0.5907,
"step": 3120
},
{
"epoch": 2.62,
"learning_rate": 4.207428529808421e-05,
"loss": 0.5739,
"step": 3140
},
{
"epoch": 2.64,
"learning_rate": 4.197787287034001e-05,
"loss": 0.5788,
"step": 3160
},
{
"epoch": 2.66,
"learning_rate": 4.188098950742852e-05,
"loss": 0.5712,
"step": 3180
},
{
"epoch": 2.67,
"learning_rate": 4.1783637896717195e-05,
"loss": 0.5631,
"step": 3200
},
{
"epoch": 2.69,
"learning_rate": 4.16858207385618e-05,
"loss": 0.5647,
"step": 3220
},
{
"epoch": 2.71,
"learning_rate": 4.1587540746231565e-05,
"loss": 0.5668,
"step": 3240
},
{
"epoch": 2.72,
"learning_rate": 4.148880064583386e-05,
"loss": 0.5581,
"step": 3260
},
{
"epoch": 2.74,
"learning_rate": 4.138960317623863e-05,
"loss": 0.5662,
"step": 3280
},
{
"epoch": 2.76,
"learning_rate": 4.128995108900241e-05,
"loss": 0.5677,
"step": 3300
},
{
"epoch": 2.77,
"learning_rate": 4.118984714829199e-05,
"loss": 0.5616,
"step": 3320
},
{
"epoch": 2.79,
"learning_rate": 4.108929413080774e-05,
"loss": 0.5549,
"step": 3340
},
{
"epoch": 2.81,
"learning_rate": 4.098829482570662e-05,
"loss": 0.5511,
"step": 3360
},
{
"epoch": 2.82,
"learning_rate": 4.088685203452479e-05,
"loss": 0.5455,
"step": 3380
},
{
"epoch": 2.84,
"learning_rate": 4.078496857109987e-05,
"loss": 0.5507,
"step": 3400
},
{
"epoch": 2.86,
"learning_rate": 4.068264726149298e-05,
"loss": 0.5459,
"step": 3420
},
{
"epoch": 2.88,
"learning_rate": 4.057989094391024e-05,
"loss": 0.5203,
"step": 3440
},
{
"epoch": 2.89,
"learning_rate": 4.0476702468624126e-05,
"loss": 0.5319,
"step": 3460
},
{
"epoch": 2.91,
"learning_rate": 4.037308469789437e-05,
"loss": 0.5322,
"step": 3480
},
{
"epoch": 2.93,
"learning_rate": 4.026904050588858e-05,
"loss": 0.5491,
"step": 3500
},
{
"epoch": 2.94,
"learning_rate": 4.01645727786025e-05,
"loss": 0.5423,
"step": 3520
},
{
"epoch": 2.96,
"learning_rate": 4.005968441377998e-05,
"loss": 0.5381,
"step": 3540
},
{
"epoch": 2.98,
"learning_rate": 3.9954378320832585e-05,
"loss": 0.5304,
"step": 3560
},
{
"epoch": 2.99,
"learning_rate": 3.9848657420758886e-05,
"loss": 0.5227,
"step": 3580
},
{
"epoch": 3.0,
"eval_loss": 0.6265327334403992,
"eval_runtime": 137.828,
"eval_samples_per_second": 34.333,
"eval_steps_per_second": 8.583,
"step": 3589
},
{
"epoch": 3.01,
"learning_rate": 3.974252464606345e-05,
"loss": 0.493,
"step": 3600
},
{
"epoch": 3.03,
"learning_rate": 3.963598294067551e-05,
"loss": 0.4674,
"step": 3620
},
{
"epoch": 3.04,
"learning_rate": 3.9529035259867265e-05,
"loss": 0.4711,
"step": 3640
},
{
"epoch": 3.06,
"learning_rate": 3.9421684570171926e-05,
"loss": 0.4701,
"step": 3660
},
{
"epoch": 3.08,
"learning_rate": 3.931393384930148e-05,
"loss": 0.4661,
"step": 3680
},
{
"epoch": 3.09,
"learning_rate": 3.920578608606398e-05,
"loss": 0.4685,
"step": 3700
},
{
"epoch": 3.11,
"learning_rate": 3.909724428028076e-05,
"loss": 0.4553,
"step": 3720
},
{
"epoch": 3.13,
"learning_rate": 3.898831144270316e-05,
"loss": 0.4608,
"step": 3740
},
{
"epoch": 3.14,
"learning_rate": 3.8878990594929024e-05,
"loss": 0.4529,
"step": 3760
},
{
"epoch": 3.16,
"learning_rate": 3.876928476931889e-05,
"loss": 0.4775,
"step": 3780
},
{
"epoch": 3.18,
"learning_rate": 3.865919700891188e-05,
"loss": 0.459,
"step": 3800
},
{
"epoch": 3.19,
"learning_rate": 3.854873036734129e-05,
"loss": 0.4726,
"step": 3820
},
{
"epoch": 3.21,
"learning_rate": 3.843788790874988e-05,
"loss": 0.4669,
"step": 3840
},
{
"epoch": 3.23,
"learning_rate": 3.8326672707704894e-05,
"loss": 0.4542,
"step": 3860
},
{
"epoch": 3.24,
"learning_rate": 3.8215087849112776e-05,
"loss": 0.4652,
"step": 3880
},
{
"epoch": 3.26,
"learning_rate": 3.810313642813358e-05,
"loss": 0.4536,
"step": 3900
},
{
"epoch": 3.28,
"learning_rate": 3.7990821550095146e-05,
"loss": 0.4605,
"step": 3920
},
{
"epoch": 3.29,
"learning_rate": 3.7878146330406924e-05,
"loss": 0.4696,
"step": 3940
},
{
"epoch": 3.31,
"learning_rate": 3.7765113894473634e-05,
"loss": 0.4621,
"step": 3960
},
{
"epoch": 3.33,
"learning_rate": 3.765172737760846e-05,
"loss": 0.4594,
"step": 3980
},
{
"epoch": 3.34,
"learning_rate": 3.753798992494617e-05,
"loss": 0.4504,
"step": 4000
},
{
"epoch": 3.36,
"learning_rate": 3.742390469135587e-05,
"loss": 0.4668,
"step": 4020
},
{
"epoch": 3.38,
"learning_rate": 3.7309474841353444e-05,
"loss": 0.4498,
"step": 4040
},
{
"epoch": 3.39,
"learning_rate": 3.7194703549013823e-05,
"loss": 0.4546,
"step": 4060
},
{
"epoch": 3.41,
"learning_rate": 3.707959399788291e-05,
"loss": 0.4583,
"step": 4080
},
{
"epoch": 3.43,
"learning_rate": 3.6964149380889305e-05,
"loss": 0.4635,
"step": 4100
},
{
"epoch": 3.44,
"learning_rate": 3.6848372900255715e-05,
"loss": 0.4637,
"step": 4120
},
{
"epoch": 3.46,
"learning_rate": 3.6732267767410126e-05,
"loss": 0.4491,
"step": 4140
},
{
"epoch": 3.48,
"learning_rate": 3.661583720289676e-05,
"loss": 0.45,
"step": 4160
},
{
"epoch": 3.49,
"learning_rate": 3.64990844362867e-05,
"loss": 0.4603,
"step": 4180
},
{
"epoch": 3.51,
"learning_rate": 3.638201270608833e-05,
"loss": 0.4581,
"step": 4200
},
{
"epoch": 3.53,
"learning_rate": 3.6264625259657516e-05,
"loss": 0.4469,
"step": 4220
},
{
"epoch": 3.54,
"learning_rate": 3.614692535310748e-05,
"loss": 0.4482,
"step": 4240
},
{
"epoch": 3.56,
"learning_rate": 3.602891625121856e-05,
"loss": 0.4529,
"step": 4260
},
{
"epoch": 3.58,
"learning_rate": 3.591060122734758e-05,
"loss": 0.4647,
"step": 4280
},
{
"epoch": 3.59,
"learning_rate": 3.579198356333709e-05,
"loss": 0.4585,
"step": 4300
},
{
"epoch": 3.61,
"learning_rate": 3.567306654942432e-05,
"loss": 0.4556,
"step": 4320
},
{
"epoch": 3.63,
"learning_rate": 3.5553853484149914e-05,
"loss": 0.4525,
"step": 4340
},
{
"epoch": 3.64,
"learning_rate": 3.5434347674266465e-05,
"loss": 0.4485,
"step": 4360
},
{
"epoch": 3.66,
"learning_rate": 3.531455243464673e-05,
"loss": 0.4514,
"step": 4380
},
{
"epoch": 3.68,
"learning_rate": 3.5194471088191746e-05,
"loss": 0.4458,
"step": 4400
},
{
"epoch": 3.69,
"learning_rate": 3.507410696573863e-05,
"loss": 0.4472,
"step": 4420
},
{
"epoch": 3.71,
"learning_rate": 3.495346340596817e-05,
"loss": 0.4528,
"step": 4440
},
{
"epoch": 3.73,
"learning_rate": 3.483254375531224e-05,
"loss": 0.4397,
"step": 4460
},
{
"epoch": 3.74,
"learning_rate": 3.471135136786098e-05,
"loss": 0.4482,
"step": 4480
},
{
"epoch": 3.76,
"learning_rate": 3.458988960526974e-05,
"loss": 0.4404,
"step": 4500
},
{
"epoch": 3.78,
"learning_rate": 3.446816183666588e-05,
"loss": 0.4404,
"step": 4520
},
{
"epoch": 3.79,
"learning_rate": 3.43461714385552e-05,
"loss": 0.441,
"step": 4540
},
{
"epoch": 3.81,
"learning_rate": 3.422392179472845e-05,
"loss": 0.4285,
"step": 4560
},
{
"epoch": 3.83,
"learning_rate": 3.410141629616733e-05,
"loss": 0.4444,
"step": 4580
},
{
"epoch": 3.84,
"learning_rate": 3.39786583409505e-05,
"loss": 0.4494,
"step": 4600
},
{
"epoch": 3.86,
"learning_rate": 3.38556513341593e-05,
"loss": 0.4552,
"step": 4620
},
{
"epoch": 3.88,
"learning_rate": 3.373239868778333e-05,
"loss": 0.4424,
"step": 4640
},
{
"epoch": 3.89,
"learning_rate": 3.360890382062574e-05,
"loss": 0.4427,
"step": 4660
},
{
"epoch": 3.91,
"learning_rate": 3.348517015820847e-05,
"loss": 0.4451,
"step": 4680
},
{
"epoch": 3.93,
"learning_rate": 3.33612011326772e-05,
"loss": 0.4482,
"step": 4700
},
{
"epoch": 3.94,
"learning_rate": 3.323700018270616e-05,
"loss": 0.447,
"step": 4720
},
{
"epoch": 3.96,
"learning_rate": 3.3112570753402715e-05,
"loss": 0.4452,
"step": 4740
},
{
"epoch": 3.98,
"learning_rate": 3.298791629621187e-05,
"loss": 0.4327,
"step": 4760
},
{
"epoch": 3.99,
"learning_rate": 3.2863040268820444e-05,
"loss": 0.4294,
"step": 4780
},
{
"epoch": 4.0,
"eval_loss": 0.5381744503974915,
"eval_runtime": 137.6658,
"eval_samples_per_second": 34.373,
"eval_steps_per_second": 8.593,
"step": 4786
},
{
"epoch": 4.01,
"learning_rate": 3.2737946135061236e-05,
"loss": 0.409,
"step": 4800
},
{
"epoch": 4.03,
"learning_rate": 3.2612637364816936e-05,
"loss": 0.3919,
"step": 4820
},
{
"epoch": 4.05,
"learning_rate": 3.248711743392381e-05,
"loss": 0.3898,
"step": 4840
},
{
"epoch": 4.06,
"learning_rate": 3.2361389824075374e-05,
"loss": 0.3802,
"step": 4860
},
{
"epoch": 4.08,
"learning_rate": 3.2235458022725764e-05,
"loss": 0.3892,
"step": 4880
},
{
"epoch": 4.1,
"learning_rate": 3.210932552299301e-05,
"loss": 0.3876,
"step": 4900
},
{
"epoch": 4.11,
"learning_rate": 3.198299582356215e-05,
"loss": 0.3938,
"step": 4920
},
{
"epoch": 4.13,
"learning_rate": 3.1856472428588194e-05,
"loss": 0.3928,
"step": 4940
},
{
"epoch": 4.15,
"learning_rate": 3.172975884759891e-05,
"loss": 0.3891,
"step": 4960
},
{
"epoch": 4.16,
"learning_rate": 3.160285859539745e-05,
"loss": 0.3902,
"step": 4980
},
{
"epoch": 4.18,
"learning_rate": 3.147577519196493e-05,
"loss": 0.3909,
"step": 5000
},
{
"epoch": 4.2,
"learning_rate": 3.134851216236272e-05,
"loss": 0.384,
"step": 5020
},
{
"epoch": 4.21,
"learning_rate": 3.122107303663468e-05,
"loss": 0.3997,
"step": 5040
},
{
"epoch": 4.23,
"learning_rate": 3.1093461349709285e-05,
"loss": 0.3894,
"step": 5060
},
{
"epoch": 4.25,
"learning_rate": 3.096568064130151e-05,
"loss": 0.3863,
"step": 5080
},
{
"epoch": 4.26,
"learning_rate": 3.083773445581472e-05,
"loss": 0.3955,
"step": 5100
},
{
"epoch": 4.28,
"learning_rate": 3.0709626342242266e-05,
"loss": 0.3937,
"step": 5120
},
{
"epoch": 4.3,
"learning_rate": 3.05813598540691e-05,
"loss": 0.3912,
"step": 5140
},
{
"epoch": 4.31,
"learning_rate": 3.0452938549173234e-05,
"loss": 0.3903,
"step": 5160
},
{
"epoch": 4.33,
"learning_rate": 3.0324365989726948e-05,
"loss": 0.3821,
"step": 5180
},
{
"epoch": 4.35,
"learning_rate": 3.019564574209811e-05,
"loss": 0.3985,
"step": 5200
},
{
"epoch": 4.36,
"learning_rate": 3.006678137675114e-05,
"loss": 0.3954,
"step": 5220
},
{
"epoch": 4.38,
"learning_rate": 2.9937776468148053e-05,
"loss": 0.383,
"step": 5240
},
{
"epoch": 4.4,
"learning_rate": 2.9808634594649266e-05,
"loss": 0.4008,
"step": 5260
},
{
"epoch": 4.41,
"learning_rate": 2.9679359338414335e-05,
"loss": 0.3937,
"step": 5280
},
{
"epoch": 4.43,
"learning_rate": 2.9549954285302632e-05,
"loss": 0.3953,
"step": 5300
},
{
"epoch": 4.45,
"learning_rate": 2.9420423024773854e-05,
"loss": 0.3832,
"step": 5320
},
{
"epoch": 4.46,
"learning_rate": 2.929076914978845e-05,
"loss": 0.3982,
"step": 5340
},
{
"epoch": 4.48,
"learning_rate": 2.9160996256707985e-05,
"loss": 0.4004,
"step": 5360
},
{
"epoch": 4.5,
"learning_rate": 2.9031107945195345e-05,
"loss": 0.3879,
"step": 5380
},
{
"epoch": 4.51,
"learning_rate": 2.8901107818114947e-05,
"loss": 0.3907,
"step": 5400
},
{
"epoch": 4.53,
"learning_rate": 2.8770999481432738e-05,
"loss": 0.3882,
"step": 5420
},
{
"epoch": 4.55,
"learning_rate": 2.8640786544116205e-05,
"loss": 0.3857,
"step": 5440
},
{
"epoch": 4.56,
"learning_rate": 2.851047261803429e-05,
"loss": 0.3863,
"step": 5460
},
{
"epoch": 4.58,
"learning_rate": 2.8380061317857136e-05,
"loss": 0.3819,
"step": 5480
},
{
"epoch": 4.6,
"learning_rate": 2.8249556260955924e-05,
"loss": 0.3991,
"step": 5500
},
{
"epoch": 4.61,
"learning_rate": 2.8118961067302402e-05,
"loss": 0.386,
"step": 5520
},
{
"epoch": 4.63,
"learning_rate": 2.7988279359368612e-05,
"loss": 0.381,
"step": 5540
},
{
"epoch": 4.65,
"learning_rate": 2.7857514762026317e-05,
"loss": 0.3746,
"step": 5560
},
{
"epoch": 4.66,
"learning_rate": 2.772667090244647e-05,
"loss": 0.384,
"step": 5580
},
{
"epoch": 4.68,
"learning_rate": 2.7595751409998638e-05,
"loss": 0.3981,
"step": 5600
},
{
"epoch": 4.7,
"learning_rate": 2.7464759916150283e-05,
"loss": 0.3829,
"step": 5620
},
{
"epoch": 4.71,
"learning_rate": 2.733370005436608e-05,
"loss": 0.3837,
"step": 5640
},
{
"epoch": 4.73,
"learning_rate": 2.7202575460007067e-05,
"loss": 0.3872,
"step": 5660
},
{
"epoch": 4.75,
"learning_rate": 2.7071389770229895e-05,
"loss": 0.3916,
"step": 5680
},
{
"epoch": 4.76,
"learning_rate": 2.6940146623885836e-05,
"loss": 0.3811,
"step": 5700
},
{
"epoch": 4.78,
"learning_rate": 2.6808849661419955e-05,
"loss": 0.3858,
"step": 5720
},
{
"epoch": 4.8,
"learning_rate": 2.667750252477004e-05,
"loss": 0.3937,
"step": 5740
},
{
"epoch": 4.81,
"learning_rate": 2.654610885726563e-05,
"loss": 0.3903,
"step": 5760
},
{
"epoch": 4.83,
"learning_rate": 2.6414672303526938e-05,
"loss": 0.3869,
"step": 5780
},
{
"epoch": 4.85,
"learning_rate": 2.6283196509363807e-05,
"loss": 0.3899,
"step": 5800
},
{
"epoch": 4.86,
"learning_rate": 2.6151685121674458e-05,
"loss": 0.3909,
"step": 5820
},
{
"epoch": 4.88,
"learning_rate": 2.6020141788344495e-05,
"loss": 0.382,
"step": 5840
},
{
"epoch": 4.9,
"learning_rate": 2.588857015814556e-05,
"loss": 0.3937,
"step": 5860
},
{
"epoch": 4.91,
"learning_rate": 2.5756973880634257e-05,
"loss": 0.3847,
"step": 5880
},
{
"epoch": 4.93,
"learning_rate": 2.5625356606050837e-05,
"loss": 0.3799,
"step": 5900
},
{
"epoch": 4.95,
"learning_rate": 2.5493721985217974e-05,
"loss": 0.3861,
"step": 5920
},
{
"epoch": 4.96,
"learning_rate": 2.5362073669439485e-05,
"loss": 0.3929,
"step": 5940
},
{
"epoch": 4.98,
"learning_rate": 2.5230415310399068e-05,
"loss": 0.3755,
"step": 5960
},
{
"epoch": 5.0,
"learning_rate": 2.5098750560059026e-05,
"loss": 0.3806,
"step": 5980
},
{
"epoch": 5.0,
"eval_loss": 0.5340477228164673,
"eval_runtime": 138.035,
"eval_samples_per_second": 34.281,
"eval_steps_per_second": 8.57,
"step": 5982
},
{
"epoch": 5.01,
"learning_rate": 2.4967083070558905e-05,
"loss": 0.3509,
"step": 6000
},
{
"epoch": 5.03,
"learning_rate": 2.4835416494114254e-05,
"loss": 0.3382,
"step": 6020
},
{
"epoch": 5.05,
"learning_rate": 2.470375448291529e-05,
"loss": 0.3409,
"step": 6040
},
{
"epoch": 5.06,
"learning_rate": 2.45721006890256e-05,
"loss": 0.3385,
"step": 6060
},
{
"epoch": 5.08,
"learning_rate": 2.444045876428082e-05,
"loss": 0.3422,
"step": 6080
},
{
"epoch": 5.1,
"learning_rate": 2.43088323601874e-05,
"loss": 0.3492,
"step": 6100
},
{
"epoch": 5.11,
"learning_rate": 2.417722512782123e-05,
"loss": 0.3397,
"step": 6120
},
{
"epoch": 5.13,
"learning_rate": 2.404564071772644e-05,
"loss": 0.3414,
"step": 6140
},
{
"epoch": 5.15,
"learning_rate": 2.3914082779814103e-05,
"loss": 0.3418,
"step": 6160
},
{
"epoch": 5.17,
"learning_rate": 2.3782554963260995e-05,
"loss": 0.3417,
"step": 6180
},
{
"epoch": 5.18,
"learning_rate": 2.3651060916408386e-05,
"loss": 0.3535,
"step": 6200
},
{
"epoch": 5.2,
"learning_rate": 2.3519604286660857e-05,
"loss": 0.3423,
"step": 6220
},
{
"epoch": 5.22,
"learning_rate": 2.3388188720385063e-05,
"loss": 0.3353,
"step": 6240
},
{
"epoch": 5.23,
"learning_rate": 2.3256817862808672e-05,
"loss": 0.3479,
"step": 6260
},
{
"epoch": 5.25,
"learning_rate": 2.3125495357919187e-05,
"loss": 0.3297,
"step": 6280
},
{
"epoch": 5.27,
"learning_rate": 2.299422484836292e-05,
"loss": 0.3388,
"step": 6300
},
{
"epoch": 5.28,
"learning_rate": 2.2863009975343926e-05,
"loss": 0.343,
"step": 6320
},
{
"epoch": 5.3,
"learning_rate": 2.2731854378522964e-05,
"loss": 0.3506,
"step": 6340
},
{
"epoch": 5.32,
"learning_rate": 2.260076169591664e-05,
"loss": 0.3358,
"step": 6360
},
{
"epoch": 5.33,
"learning_rate": 2.246973556379639e-05,
"loss": 0.3534,
"step": 6380
},
{
"epoch": 5.35,
"learning_rate": 2.2338779616587672e-05,
"loss": 0.3379,
"step": 6400
},
{
"epoch": 5.37,
"learning_rate": 2.2207897486769164e-05,
"loss": 0.3414,
"step": 6420
},
{
"epoch": 5.38,
"learning_rate": 2.2077092804771957e-05,
"loss": 0.3374,
"step": 6440
},
{
"epoch": 5.4,
"learning_rate": 2.1946369198878886e-05,
"loss": 0.3551,
"step": 6460
},
{
"epoch": 5.42,
"learning_rate": 2.1815730295123888e-05,
"loss": 0.3448,
"step": 6480
},
{
"epoch": 5.43,
"learning_rate": 2.168517971719143e-05,
"loss": 0.3462,
"step": 6500
},
{
"epoch": 5.45,
"learning_rate": 2.1554721086315957e-05,
"loss": 0.3485,
"step": 6520
},
{
"epoch": 5.47,
"learning_rate": 2.1424358021181485e-05,
"loss": 0.3464,
"step": 6540
},
{
"epoch": 5.48,
"learning_rate": 2.1294094137821226e-05,
"loss": 0.3489,
"step": 6560
},
{
"epoch": 5.5,
"learning_rate": 2.116393304951723e-05,
"loss": 0.3457,
"step": 6580
},
{
"epoch": 5.52,
"learning_rate": 2.103387836670024e-05,
"loss": 0.3313,
"step": 6600
},
{
"epoch": 5.53,
"learning_rate": 2.0903933696849474e-05,
"loss": 0.3482,
"step": 6620
},
{
"epoch": 5.55,
"learning_rate": 2.0774102644392627e-05,
"loss": 0.3366,
"step": 6640
},
{
"epoch": 5.57,
"learning_rate": 2.0644388810605833e-05,
"loss": 0.3378,
"step": 6660
},
{
"epoch": 5.58,
"learning_rate": 2.051479579351377e-05,
"loss": 0.3414,
"step": 6680
},
{
"epoch": 5.6,
"learning_rate": 2.0385327187789942e-05,
"loss": 0.3283,
"step": 6700
},
{
"epoch": 5.62,
"learning_rate": 2.0255986584656854e-05,
"loss": 0.336,
"step": 6720
},
{
"epoch": 5.63,
"learning_rate": 2.0126777571786473e-05,
"loss": 0.3403,
"step": 6740
},
{
"epoch": 5.65,
"learning_rate": 1.9997703733200706e-05,
"loss": 0.3495,
"step": 6760
},
{
"epoch": 5.67,
"learning_rate": 1.986876864917196e-05,
"loss": 0.351,
"step": 6780
},
{
"epoch": 5.68,
"learning_rate": 1.973997589612383e-05,
"loss": 0.3406,
"step": 6800
},
{
"epoch": 5.7,
"learning_rate": 1.961132904653193e-05,
"loss": 0.3371,
"step": 6820
},
{
"epoch": 5.72,
"learning_rate": 1.9482831668824778e-05,
"loss": 0.3451,
"step": 6840
},
{
"epoch": 5.73,
"learning_rate": 1.9354487327284803e-05,
"loss": 0.3379,
"step": 6860
},
{
"epoch": 5.75,
"learning_rate": 1.9226299581949497e-05,
"loss": 0.3488,
"step": 6880
},
{
"epoch": 5.77,
"learning_rate": 1.9098271988512664e-05,
"loss": 0.344,
"step": 6900
},
{
"epoch": 5.78,
"learning_rate": 1.897040809822579e-05,
"loss": 0.3401,
"step": 6920
},
{
"epoch": 5.8,
"learning_rate": 1.8842711457799504e-05,
"loss": 0.349,
"step": 6940
},
{
"epoch": 5.82,
"learning_rate": 1.8715185609305272e-05,
"loss": 0.3371,
"step": 6960
},
{
"epoch": 5.83,
"learning_rate": 1.8587834090077077e-05,
"loss": 0.3329,
"step": 6980
},
{
"epoch": 5.85,
"learning_rate": 1.8460660432613348e-05,
"loss": 0.3426,
"step": 7000
},
{
"epoch": 5.87,
"learning_rate": 1.8333668164478907e-05,
"loss": 0.3379,
"step": 7020
},
{
"epoch": 5.88,
"learning_rate": 1.8206860808207238e-05,
"loss": 0.3367,
"step": 7040
},
{
"epoch": 5.9,
"learning_rate": 1.808024188120265e-05,
"loss": 0.3415,
"step": 7060
},
{
"epoch": 5.92,
"learning_rate": 1.7953814895642788e-05,
"loss": 0.3347,
"step": 7080
},
{
"epoch": 5.93,
"learning_rate": 1.7827583358381207e-05,
"loss": 0.3442,
"step": 7100
},
{
"epoch": 5.95,
"learning_rate": 1.770155077085006e-05,
"loss": 0.3347,
"step": 7120
},
{
"epoch": 5.97,
"learning_rate": 1.7575720628963004e-05,
"loss": 0.3514,
"step": 7140
},
{
"epoch": 5.98,
"learning_rate": 1.7450096423018225e-05,
"loss": 0.3283,
"step": 7160
},
{
"epoch": 6.0,
"eval_loss": 0.49299946427345276,
"eval_runtime": 137.9962,
"eval_samples_per_second": 34.291,
"eval_steps_per_second": 8.573,
"step": 7179
},
{
"epoch": 6.0,
"learning_rate": 1.7324681637601637e-05,
"loss": 0.3392,
"step": 7180
},
{
"epoch": 6.02,
"learning_rate": 1.719947975149019e-05,
"loss": 0.2997,
"step": 7200
},
{
"epoch": 6.03,
"learning_rate": 1.7074494237555405e-05,
"loss": 0.2942,
"step": 7220
},
{
"epoch": 6.05,
"learning_rate": 1.6949728562667037e-05,
"loss": 0.2994,
"step": 7240
},
{
"epoch": 6.07,
"learning_rate": 1.6825186187596915e-05,
"loss": 0.3034,
"step": 7260
},
{
"epoch": 6.08,
"learning_rate": 1.6700870566922905e-05,
"loss": 0.2958,
"step": 7280
},
{
"epoch": 6.1,
"learning_rate": 1.6576785148933165e-05,
"loss": 0.3023,
"step": 7300
},
{
"epoch": 6.12,
"learning_rate": 1.645293337553042e-05,
"loss": 0.3043,
"step": 7320
},
{
"epoch": 6.13,
"learning_rate": 1.632931868213654e-05,
"loss": 0.3053,
"step": 7340
},
{
"epoch": 6.15,
"learning_rate": 1.6205944497597203e-05,
"loss": 0.3047,
"step": 7360
},
{
"epoch": 6.17,
"learning_rate": 1.6082814244086858e-05,
"loss": 0.2986,
"step": 7380
},
{
"epoch": 6.18,
"learning_rate": 1.5959931337013696e-05,
"loss": 0.3075,
"step": 7400
},
{
"epoch": 6.2,
"learning_rate": 1.5837299184925004e-05,
"loss": 0.3081,
"step": 7420
},
{
"epoch": 6.22,
"learning_rate": 1.571492118941259e-05,
"loss": 0.293,
"step": 7440
},
{
"epoch": 6.23,
"learning_rate": 1.5592800745018397e-05,
"loss": 0.3063,
"step": 7460
},
{
"epoch": 6.25,
"learning_rate": 1.547094123914039e-05,
"loss": 0.3125,
"step": 7480
},
{
"epoch": 6.27,
"learning_rate": 1.5349346051938574e-05,
"loss": 0.3032,
"step": 7500
},
{
"epoch": 6.28,
"learning_rate": 1.5228018556241222e-05,
"loss": 0.2957,
"step": 7520
},
{
"epoch": 6.3,
"learning_rate": 1.5106962117451354e-05,
"loss": 0.2936,
"step": 7540
},
{
"epoch": 6.32,
"learning_rate": 1.4986180093453351e-05,
"loss": 0.3023,
"step": 7560
},
{
"epoch": 6.34,
"learning_rate": 1.4865675834519844e-05,
"loss": 0.293,
"step": 7580
},
{
"epoch": 6.35,
"learning_rate": 1.474545268321876e-05,
"loss": 0.3002,
"step": 7600
},
{
"epoch": 6.37,
"learning_rate": 1.4625513974320598e-05,
"loss": 0.3056,
"step": 7620
},
{
"epoch": 6.39,
"learning_rate": 1.4505863034705987e-05,
"loss": 0.296,
"step": 7640
},
{
"epoch": 6.4,
"learning_rate": 1.438650318327333e-05,
"loss": 0.2948,
"step": 7660
},
{
"epoch": 6.42,
"learning_rate": 1.4267437730846776e-05,
"loss": 0.3086,
"step": 7680
},
{
"epoch": 6.44,
"learning_rate": 1.4148669980084379e-05,
"loss": 0.2986,
"step": 7700
},
{
"epoch": 6.45,
"learning_rate": 1.4030203225386517e-05,
"loss": 0.3058,
"step": 7720
},
{
"epoch": 6.47,
"learning_rate": 1.3912040752804478e-05,
"loss": 0.3062,
"step": 7740
},
{
"epoch": 6.49,
"learning_rate": 1.3794185839949304e-05,
"loss": 0.3129,
"step": 7760
},
{
"epoch": 6.5,
"learning_rate": 1.3676641755900916e-05,
"loss": 0.3065,
"step": 7780
},
{
"epoch": 6.52,
"learning_rate": 1.3559411761117385e-05,
"loss": 0.3026,
"step": 7800
},
{
"epoch": 6.54,
"learning_rate": 1.3442499107344542e-05,
"loss": 0.313,
"step": 7820
},
{
"epoch": 6.55,
"learning_rate": 1.3325907037525765e-05,
"loss": 0.303,
"step": 7840
},
{
"epoch": 6.57,
"learning_rate": 1.320963878571198e-05,
"loss": 0.2982,
"step": 7860
},
{
"epoch": 6.59,
"learning_rate": 1.3093697576972042e-05,
"loss": 0.3036,
"step": 7880
},
{
"epoch": 6.6,
"learning_rate": 1.2978086627303182e-05,
"loss": 0.3093,
"step": 7900
},
{
"epoch": 6.62,
"learning_rate": 1.2862809143541896e-05,
"loss": 0.2952,
"step": 7920
},
{
"epoch": 6.64,
"learning_rate": 1.2747868323274898e-05,
"loss": 0.3041,
"step": 7940
},
{
"epoch": 6.65,
"learning_rate": 1.26332673547505e-05,
"loss": 0.2994,
"step": 7960
},
{
"epoch": 6.67,
"learning_rate": 1.2519009416790156e-05,
"loss": 0.2969,
"step": 7980
},
{
"epoch": 6.69,
"learning_rate": 1.2405097678700253e-05,
"loss": 0.293,
"step": 8000
},
{
"epoch": 6.7,
"learning_rate": 1.2291535300184221e-05,
"loss": 0.2986,
"step": 8020
},
{
"epoch": 6.72,
"learning_rate": 1.2178325431254948e-05,
"loss": 0.3081,
"step": 8040
},
{
"epoch": 6.74,
"learning_rate": 1.2065471212147295e-05,
"loss": 0.2939,
"step": 8060
},
{
"epoch": 6.75,
"learning_rate": 1.1952975773231052e-05,
"loss": 0.31,
"step": 8080
},
{
"epoch": 6.77,
"learning_rate": 1.1840842234924129e-05,
"loss": 0.3018,
"step": 8100
},
{
"epoch": 6.79,
"learning_rate": 1.1729073707605966e-05,
"loss": 0.2994,
"step": 8120
},
{
"epoch": 6.8,
"learning_rate": 1.1617673291531256e-05,
"loss": 0.3089,
"step": 8140
},
{
"epoch": 6.82,
"learning_rate": 1.150664407674395e-05,
"loss": 0.3099,
"step": 8160
},
{
"epoch": 6.84,
"learning_rate": 1.1395989142991573e-05,
"loss": 0.3016,
"step": 8180
},
{
"epoch": 6.85,
"learning_rate": 1.1285711559639785e-05,
"loss": 0.2972,
"step": 8200
},
{
"epoch": 6.87,
"learning_rate": 1.1175814385587203e-05,
"loss": 0.2955,
"step": 8220
},
{
"epoch": 6.89,
"learning_rate": 1.106630066918061e-05,
"loss": 0.298,
"step": 8240
},
{
"epoch": 6.9,
"learning_rate": 1.095717344813038e-05,
"loss": 0.3031,
"step": 8260
},
{
"epoch": 6.92,
"learning_rate": 1.0848435749426192e-05,
"loss": 0.2963,
"step": 8280
},
{
"epoch": 6.94,
"learning_rate": 1.0740090589253088e-05,
"loss": 0.3035,
"step": 8300
},
{
"epoch": 6.95,
"learning_rate": 1.063214097290782e-05,
"loss": 0.2965,
"step": 8320
},
{
"epoch": 6.97,
"learning_rate": 1.0524589894715489e-05,
"loss": 0.3038,
"step": 8340
},
{
"epoch": 6.99,
"learning_rate": 1.0417440337946438e-05,
"loss": 0.3028,
"step": 8360
},
{
"epoch": 7.0,
"eval_loss": 0.5165771842002869,
"eval_runtime": 137.3802,
"eval_samples_per_second": 34.445,
"eval_steps_per_second": 8.611,
"step": 8375
},
{
"epoch": 7.0,
"learning_rate": 1.031069527473358e-05,
"loss": 0.2909,
"step": 8380
},
{
"epoch": 7.02,
"learning_rate": 1.0204357665989883e-05,
"loss": 0.266,
"step": 8400
},
{
"epoch": 7.04,
"learning_rate": 1.0098430461326303e-05,
"loss": 0.2688,
"step": 8420
},
{
"epoch": 7.05,
"learning_rate": 9.9929165989699e-06,
"loss": 0.2694,
"step": 8440
},
{
"epoch": 7.07,
"learning_rate": 9.887819005682411e-06,
"loss": 0.2697,
"step": 8460
},
{
"epoch": 7.09,
"learning_rate": 9.783140596679016e-06,
"loss": 0.2736,
"step": 8480
},
{
"epoch": 7.1,
"learning_rate": 9.678884275547471e-06,
"loss": 0.2693,
"step": 8500
},
{
"epoch": 7.12,
"learning_rate": 9.575052934167619e-06,
"loss": 0.2658,
"step": 8520
},
{
"epoch": 7.14,
"learning_rate": 9.47164945263111e-06,
"loss": 0.2595,
"step": 8540
},
{
"epoch": 7.15,
"learning_rate": 9.36867669916156e-06,
"loss": 0.2714,
"step": 8560
},
{
"epoch": 7.17,
"learning_rate": 9.266137530034986e-06,
"loss": 0.2624,
"step": 8580
},
{
"epoch": 7.19,
"learning_rate": 9.164034789500542e-06,
"loss": 0.2682,
"step": 8600
},
{
"epoch": 7.2,
"learning_rate": 9.062371309701658e-06,
"loss": 0.2758,
"step": 8620
},
{
"epoch": 7.22,
"learning_rate": 8.961149910597492e-06,
"loss": 0.2723,
"step": 8640
},
{
"epoch": 7.24,
"learning_rate": 8.860373399884675e-06,
"loss": 0.2685,
"step": 8660
},
{
"epoch": 7.25,
"learning_rate": 8.760044572919455e-06,
"loss": 0.2631,
"step": 8680
},
{
"epoch": 7.27,
"learning_rate": 8.66016621264012e-06,
"loss": 0.264,
"step": 8700
},
{
"epoch": 7.29,
"learning_rate": 8.560741089489898e-06,
"loss": 0.2749,
"step": 8720
},
{
"epoch": 7.3,
"learning_rate": 8.461771961339998e-06,
"loss": 0.2617,
"step": 8740
},
{
"epoch": 7.32,
"learning_rate": 8.363261573413163e-06,
"loss": 0.2659,
"step": 8760
},
{
"epoch": 7.34,
"learning_rate": 8.265212658207541e-06,
"loss": 0.2667,
"step": 8780
},
{
"epoch": 7.35,
"learning_rate": 8.16762793542087e-06,
"loss": 0.2813,
"step": 8800
},
{
"epoch": 7.37,
"learning_rate": 8.070510111875015e-06,
"loss": 0.2719,
"step": 8820
},
{
"epoch": 7.39,
"learning_rate": 7.973861881440921e-06,
"loss": 0.2713,
"step": 8840
},
{
"epoch": 7.4,
"learning_rate": 7.87768592496389e-06,
"loss": 0.2709,
"step": 8860
},
{
"epoch": 7.42,
"learning_rate": 7.781984910189202e-06,
"loss": 0.2736,
"step": 8880
},
{
"epoch": 7.44,
"learning_rate": 7.686761491688105e-06,
"loss": 0.273,
"step": 8900
},
{
"epoch": 7.46,
"learning_rate": 7.592018310784219e-06,
"loss": 0.2655,
"step": 8920
},
{
"epoch": 7.47,
"learning_rate": 7.4977579954802565e-06,
"loss": 0.2722,
"step": 8940
},
{
"epoch": 7.49,
"learning_rate": 7.403983160385095e-06,
"loss": 0.2709,
"step": 8960
},
{
"epoch": 7.51,
"learning_rate": 7.3106964066412844e-06,
"loss": 0.2658,
"step": 8980
},
{
"epoch": 7.52,
"learning_rate": 7.217900321852908e-06,
"loss": 0.2709,
"step": 9000
},
{
"epoch": 7.54,
"learning_rate": 7.125597480013785e-06,
"loss": 0.2681,
"step": 9020
},
{
"epoch": 7.56,
"learning_rate": 7.03379044143605e-06,
"loss": 0.2715,
"step": 9040
},
{
"epoch": 7.57,
"learning_rate": 6.942481752679203e-06,
"loss": 0.2689,
"step": 9060
},
{
"epoch": 7.59,
"learning_rate": 6.851673946479387e-06,
"loss": 0.2746,
"step": 9080
},
{
"epoch": 7.61,
"learning_rate": 6.761369541679211e-06,
"loss": 0.2679,
"step": 9100
},
{
"epoch": 7.62,
"learning_rate": 6.671571043157843e-06,
"loss": 0.2648,
"step": 9120
},
{
"epoch": 7.64,
"learning_rate": 6.582280941761518e-06,
"loss": 0.2624,
"step": 9140
},
{
"epoch": 7.66,
"learning_rate": 6.493501714234487e-06,
"loss": 0.2693,
"step": 9160
},
{
"epoch": 7.67,
"learning_rate": 6.405235823150269e-06,
"loss": 0.265,
"step": 9180
},
{
"epoch": 7.69,
"learning_rate": 6.3174857168433995e-06,
"loss": 0.2695,
"step": 9200
},
{
"epoch": 7.71,
"learning_rate": 6.230253829341448e-06,
"loss": 0.2714,
"step": 9220
},
{
"epoch": 7.72,
"learning_rate": 6.143542580297576e-06,
"loss": 0.2704,
"step": 9240
},
{
"epoch": 7.74,
"learning_rate": 6.057354374923374e-06,
"loss": 0.2724,
"step": 9260
},
{
"epoch": 7.76,
"learning_rate": 5.97169160392215e-06,
"loss": 0.265,
"step": 9280
},
{
"epoch": 7.77,
"learning_rate": 5.886556643422611e-06,
"loss": 0.2664,
"step": 9300
},
{
"epoch": 7.79,
"learning_rate": 5.801951854913016e-06,
"loss": 0.2746,
"step": 9320
},
{
"epoch": 7.81,
"learning_rate": 5.717879585175564e-06,
"loss": 0.2631,
"step": 9340
},
{
"epoch": 7.82,
"learning_rate": 5.634342166221382e-06,
"loss": 0.2691,
"step": 9360
},
{
"epoch": 7.84,
"learning_rate": 5.551341915225816e-06,
"loss": 0.2673,
"step": 9380
},
{
"epoch": 7.86,
"learning_rate": 5.468881134464154e-06,
"loss": 0.2717,
"step": 9400
},
{
"epoch": 7.87,
"learning_rate": 5.386962111247756e-06,
"loss": 0.2663,
"step": 9420
},
{
"epoch": 7.89,
"learning_rate": 5.30558711786062e-06,
"loss": 0.2736,
"step": 9440
},
{
"epoch": 7.91,
"learning_rate": 5.224758411496356e-06,
"loss": 0.2672,
"step": 9460
},
{
"epoch": 7.92,
"learning_rate": 5.144478234195579e-06,
"loss": 0.2632,
"step": 9480
},
{
"epoch": 7.94,
"learning_rate": 5.064748812783685e-06,
"loss": 0.2601,
"step": 9500
},
{
"epoch": 7.96,
"learning_rate": 4.985572358809127e-06,
"loss": 0.261,
"step": 9520
},
{
"epoch": 7.97,
"learning_rate": 4.906951068482057e-06,
"loss": 0.2617,
"step": 9540
},
{
"epoch": 7.99,
"learning_rate": 4.8288871226133875e-06,
"loss": 0.2727,
"step": 9560
},
{
"epoch": 8.0,
"eval_loss": 0.5100283622741699,
"eval_runtime": 136.9296,
"eval_samples_per_second": 34.558,
"eval_steps_per_second": 8.639,
"step": 9572
},
{
"epoch": 8.01,
"learning_rate": 4.751382686554309e-06,
"loss": 0.2592,
"step": 9580
},
{
"epoch": 8.02,
"learning_rate": 4.674439910136253e-06,
"loss": 0.2475,
"step": 9600
},
{
"epoch": 8.04,
"learning_rate": 4.598060927611228e-06,
"loss": 0.2477,
"step": 9620
},
{
"epoch": 8.06,
"learning_rate": 4.5222478575926265e-06,
"loss": 0.2484,
"step": 9640
},
{
"epoch": 8.07,
"learning_rate": 4.447002802996475e-06,
"loss": 0.2461,
"step": 9660
},
{
"epoch": 8.09,
"learning_rate": 4.372327850983069e-06,
"loss": 0.2466,
"step": 9680
},
{
"epoch": 8.11,
"learning_rate": 4.298225072899123e-06,
"loss": 0.2433,
"step": 9700
},
{
"epoch": 8.12,
"learning_rate": 4.224696524220276e-06,
"loss": 0.251,
"step": 9720
},
{
"epoch": 8.14,
"learning_rate": 4.151744244494102e-06,
"loss": 0.2447,
"step": 9740
},
{
"epoch": 8.16,
"learning_rate": 4.079370257283529e-06,
"loss": 0.241,
"step": 9760
},
{
"epoch": 8.17,
"learning_rate": 4.007576570110691e-06,
"loss": 0.2421,
"step": 9780
},
{
"epoch": 8.19,
"learning_rate": 3.9363651744012855e-06,
"loss": 0.2486,
"step": 9800
},
{
"epoch": 8.21,
"learning_rate": 3.865738045429279e-06,
"loss": 0.2482,
"step": 9820
},
{
"epoch": 8.22,
"learning_rate": 3.7956971422621723e-06,
"loss": 0.2406,
"step": 9840
},
{
"epoch": 8.24,
"learning_rate": 3.7262444077066248e-06,
"loss": 0.2467,
"step": 9860
},
{
"epoch": 8.26,
"learning_rate": 3.6573817682545667e-06,
"loss": 0.245,
"step": 9880
},
{
"epoch": 8.27,
"learning_rate": 3.589111134029771e-06,
"loss": 0.2437,
"step": 9900
},
{
"epoch": 8.29,
"learning_rate": 3.5214343987348743e-06,
"loss": 0.2405,
"step": 9920
},
{
"epoch": 8.31,
"learning_rate": 3.4543534395988427e-06,
"loss": 0.2432,
"step": 9940
},
{
"epoch": 8.32,
"learning_rate": 3.38787011732489e-06,
"loss": 0.2435,
"step": 9960
},
{
"epoch": 8.34,
"learning_rate": 3.3219862760388715e-06,
"loss": 0.2493,
"step": 9980
},
{
"epoch": 8.36,
"learning_rate": 3.256703743238168e-06,
"loss": 0.251,
"step": 10000
},
{
"epoch": 8.37,
"learning_rate": 3.1920243297409253e-06,
"loss": 0.2441,
"step": 10020
},
{
"epoch": 8.39,
"learning_rate": 3.1279498296358735e-06,
"loss": 0.2436,
"step": 10040
},
{
"epoch": 8.41,
"learning_rate": 3.0644820202325593e-06,
"loss": 0.25,
"step": 10060
},
{
"epoch": 8.42,
"learning_rate": 3.0016226620120296e-06,
"loss": 0.2429,
"step": 10080
},
{
"epoch": 8.44,
"learning_rate": 2.939373498578013e-06,
"loss": 0.2486,
"step": 10100
},
{
"epoch": 8.46,
"learning_rate": 2.877736256608535e-06,
"loss": 0.2485,
"step": 10120
},
{
"epoch": 8.47,
"learning_rate": 2.816712645808056e-06,
"loss": 0.2546,
"step": 10140
},
{
"epoch": 8.49,
"learning_rate": 2.756304358860029e-06,
"loss": 0.2452,
"step": 10160
},
{
"epoch": 8.51,
"learning_rate": 2.6965130713799273e-06,
"loss": 0.2445,
"step": 10180
},
{
"epoch": 8.52,
"learning_rate": 2.6373404418688104e-06,
"loss": 0.2474,
"step": 10200
},
{
"epoch": 8.54,
"learning_rate": 2.5787881116672853e-06,
"loss": 0.2387,
"step": 10220
},
{
"epoch": 8.56,
"learning_rate": 2.52085770490999e-06,
"loss": 0.2419,
"step": 10240
},
{
"epoch": 8.58,
"learning_rate": 2.463550828480535e-06,
"loss": 0.2458,
"step": 10260
},
{
"epoch": 8.59,
"learning_rate": 2.406869071966955e-06,
"loss": 0.236,
"step": 10280
},
{
"epoch": 8.61,
"learning_rate": 2.350814007617597e-06,
"loss": 0.2512,
"step": 10300
},
{
"epoch": 8.63,
"learning_rate": 2.2953871902974987e-06,
"loss": 0.2463,
"step": 10320
},
{
"epoch": 8.64,
"learning_rate": 2.2405901574452907e-06,
"loss": 0.2496,
"step": 10340
},
{
"epoch": 8.66,
"learning_rate": 2.1864244290305202e-06,
"loss": 0.2408,
"step": 10360
},
{
"epoch": 8.68,
"learning_rate": 2.1328915075115085e-06,
"loss": 0.2469,
"step": 10380
},
{
"epoch": 8.69,
"learning_rate": 2.0799928777936795e-06,
"loss": 0.2465,
"step": 10400
},
{
"epoch": 8.71,
"learning_rate": 2.027730007188339e-06,
"loss": 0.2494,
"step": 10420
},
{
"epoch": 8.73,
"learning_rate": 1.9761043453720207e-06,
"loss": 0.2443,
"step": 10440
},
{
"epoch": 8.74,
"learning_rate": 1.9251173243462317e-06,
"loss": 0.2434,
"step": 10460
},
{
"epoch": 8.76,
"learning_rate": 1.8747703583977678e-06,
"loss": 0.243,
"step": 10480
},
{
"epoch": 8.78,
"learning_rate": 1.8250648440594486e-06,
"loss": 0.2446,
"step": 10500
},
{
"epoch": 8.79,
"learning_rate": 1.7760021600714106e-06,
"loss": 0.2455,
"step": 10520
},
{
"epoch": 8.81,
"learning_rate": 1.7275836673428581e-06,
"loss": 0.2483,
"step": 10540
},
{
"epoch": 8.83,
"learning_rate": 1.6798107089142868e-06,
"loss": 0.2491,
"step": 10560
},
{
"epoch": 8.84,
"learning_rate": 1.632684609920254e-06,
"loss": 0.2468,
"step": 10580
},
{
"epoch": 8.86,
"learning_rate": 1.5862066775526457e-06,
"loss": 0.2472,
"step": 10600
},
{
"epoch": 8.88,
"learning_rate": 1.5403782010243589e-06,
"loss": 0.2474,
"step": 10620
},
{
"epoch": 8.89,
"learning_rate": 1.4952004515335794e-06,
"loss": 0.2362,
"step": 10640
},
{
"epoch": 8.91,
"learning_rate": 1.45067468222852e-06,
"loss": 0.2445,
"step": 10660
},
{
"epoch": 8.93,
"learning_rate": 1.4068021281726602e-06,
"loss": 0.2388,
"step": 10680
},
{
"epoch": 8.94,
"learning_rate": 1.36358400631047e-06,
"loss": 0.2415,
"step": 10700
},
{
"epoch": 8.96,
"learning_rate": 1.3210215154336681e-06,
"loss": 0.2481,
"step": 10720
},
{
"epoch": 8.98,
"learning_rate": 1.2791158361479733e-06,
"loss": 0.2422,
"step": 10740
},
{
"epoch": 8.99,
"learning_rate": 1.2378681308403501e-06,
"loss": 0.2478,
"step": 10760
},
{
"epoch": 9.0,
"eval_loss": 0.495464563369751,
"eval_runtime": 137.2277,
"eval_samples_per_second": 34.483,
"eval_steps_per_second": 8.621,
"step": 10768
},
{
"epoch": 9.01,
"learning_rate": 1.1972795436467676e-06,
"loss": 0.247,
"step": 10780
},
{
"epoch": 9.03,
"learning_rate": 1.1573512004204478e-06,
"loss": 0.2322,
"step": 10800
},
{
"epoch": 9.04,
"learning_rate": 1.1180842087006843e-06,
"loss": 0.233,
"step": 10820
},
{
"epoch": 9.06,
"learning_rate": 1.0794796576820587e-06,
"loss": 0.2366,
"step": 10840
},
{
"epoch": 9.08,
"learning_rate": 1.041538618184265e-06,
"loss": 0.23,
"step": 10860
},
{
"epoch": 9.09,
"learning_rate": 1.0042621426224114e-06,
"loss": 0.242,
"step": 10880
},
{
"epoch": 9.11,
"learning_rate": 9.676512649778092e-07,
"loss": 0.2395,
"step": 10900
},
{
"epoch": 9.13,
"learning_rate": 9.317070007692913e-07,
"loss": 0.2322,
"step": 10920
},
{
"epoch": 9.14,
"learning_rate": 8.964303470250707e-07,
"loss": 0.2413,
"step": 10940
},
{
"epoch": 9.16,
"learning_rate": 8.618222822550482e-07,
"loss": 0.2363,
"step": 10960
},
{
"epoch": 9.18,
"learning_rate": 8.27883766423701e-07,
"loss": 0.234,
"step": 10980
},
{
"epoch": 9.19,
"learning_rate": 7.94615740923435e-07,
"loss": 0.2323,
"step": 11000
},
{
"epoch": 9.21,
"learning_rate": 7.620191285484828e-07,
"loss": 0.2361,
"step": 11020
},
{
"epoch": 9.23,
"learning_rate": 7.300948334693053e-07,
"loss": 0.2313,
"step": 11040
},
{
"epoch": 9.24,
"learning_rate": 6.988437412075055e-07,
"loss": 0.2314,
"step": 11060
},
{
"epoch": 9.26,
"learning_rate": 6.68266718611274e-07,
"loss": 0.2285,
"step": 11080
},
{
"epoch": 9.28,
"learning_rate": 6.383646138313381e-07,
"loss": 0.233,
"step": 11100
},
{
"epoch": 9.29,
"learning_rate": 6.091382562974396e-07,
"loss": 0.2354,
"step": 11120
},
{
"epoch": 9.31,
"learning_rate": 5.805884566953329e-07,
"loss": 0.2292,
"step": 11140
},
{
"epoch": 9.33,
"learning_rate": 5.527160069442788e-07,
"loss": 0.233,
"step": 11160
},
{
"epoch": 9.34,
"learning_rate": 5.255216801751006e-07,
"loss": 0.2341,
"step": 11180
},
{
"epoch": 9.36,
"learning_rate": 4.990062307087262e-07,
"loss": 0.2303,
"step": 11200
},
{
"epoch": 9.38,
"learning_rate": 4.731703940352716e-07,
"loss": 0.233,
"step": 11220
},
{
"epoch": 9.39,
"learning_rate": 4.480148867936268e-07,
"loss": 0.2312,
"step": 11240
},
{
"epoch": 9.41,
"learning_rate": 4.2354040675159635e-07,
"loss": 0.2345,
"step": 11260
},
{
"epoch": 9.43,
"learning_rate": 3.997476327865318e-07,
"loss": 0.2365,
"step": 11280
},
{
"epoch": 9.44,
"learning_rate": 3.7663722486649957e-07,
"loss": 0.23,
"step": 11300
},
{
"epoch": 9.46,
"learning_rate": 3.542098240319813e-07,
"loss": 0.2354,
"step": 11320
},
{
"epoch": 9.48,
"learning_rate": 3.3246605237809426e-07,
"loss": 0.2362,
"step": 11340
},
{
"epoch": 9.49,
"learning_rate": 3.114065130373295e-07,
"loss": 0.2345,
"step": 11360
},
{
"epoch": 9.51,
"learning_rate": 2.9103179016282124e-07,
"loss": 0.2318,
"step": 11380
},
{
"epoch": 9.53,
"learning_rate": 2.7134244891214853e-07,
"loss": 0.2329,
"step": 11400
},
{
"epoch": 9.54,
"learning_rate": 2.523390354316535e-07,
"loss": 0.231,
"step": 11420
},
{
"epoch": 9.56,
"learning_rate": 2.3402207684130596e-07,
"loss": 0.2337,
"step": 11440
},
{
"epoch": 9.58,
"learning_rate": 2.1639208122005704e-07,
"loss": 0.2284,
"step": 11460
},
{
"epoch": 9.59,
"learning_rate": 1.9944953759176987e-07,
"loss": 0.2256,
"step": 11480
},
{
"epoch": 9.61,
"learning_rate": 1.8319491591164417e-07,
"loss": 0.2349,
"step": 11500
},
{
"epoch": 9.63,
"learning_rate": 1.676286670531796e-07,
"loss": 0.2381,
"step": 11520
},
{
"epoch": 9.64,
"learning_rate": 1.5275122279567178e-07,
"loss": 0.2405,
"step": 11540
},
{
"epoch": 9.66,
"learning_rate": 1.3856299581223864e-07,
"loss": 0.2249,
"step": 11560
},
{
"epoch": 9.68,
"learning_rate": 1.2506437965836836e-07,
"loss": 0.2315,
"step": 11580
},
{
"epoch": 9.69,
"learning_rate": 1.1225574876100597e-07,
"loss": 0.2382,
"step": 11600
},
{
"epoch": 9.71,
"learning_rate": 1.0013745840816158e-07,
"loss": 0.238,
"step": 11620
},
{
"epoch": 9.73,
"learning_rate": 8.87098447390683e-08,
"loss": 0.2349,
"step": 11640
},
{
"epoch": 9.75,
"learning_rate": 7.797322473484248e-08,
"loss": 0.2438,
"step": 11660
},
{
"epoch": 9.76,
"learning_rate": 6.79278962097074e-08,
"loss": 0.2345,
"step": 11680
},
{
"epoch": 9.78,
"learning_rate": 5.8574137802713814e-08,
"loss": 0.2315,
"step": 11700
},
{
"epoch": 9.8,
"learning_rate": 4.991220897002935e-08,
"loss": 0.24,
"step": 11720
},
{
"epoch": 9.81,
"learning_rate": 4.1942349977727724e-08,
"loss": 0.2405,
"step": 11740
},
{
"epoch": 9.83,
"learning_rate": 3.466478189513567e-08,
"loss": 0.227,
"step": 11760
},
{
"epoch": 9.85,
"learning_rate": 2.807970658869341e-08,
"loss": 0.2397,
"step": 11780
},
{
"epoch": 9.86,
"learning_rate": 2.2187306716353608e-08,
"loss": 0.2357,
"step": 11800
},
{
"epoch": 9.88,
"learning_rate": 1.6987745722521487e-08,
"loss": 0.2363,
"step": 11820
},
{
"epoch": 9.9,
"learning_rate": 1.2481167833516827e-08,
"loss": 0.2381,
"step": 11840
},
{
"epoch": 9.91,
"learning_rate": 8.667698053574369e-09,
"loss": 0.2309,
"step": 11860
},
{
"epoch": 9.93,
"learning_rate": 5.5474421613799235e-09,
"loss": 0.2377,
"step": 11880
},
{
"epoch": 9.95,
"learning_rate": 3.120486707125503e-09,
"loss": 0.2308,
"step": 11900
},
{
"epoch": 9.96,
"learning_rate": 1.3868990101223445e-09,
"loss": 0.2315,
"step": 11920
},
{
"epoch": 9.98,
"learning_rate": 3.467271569246311e-10,
"loss": 0.2337,
"step": 11940
},
{
"epoch": 10.0,
"learning_rate": 0.0,
"loss": 0.2335,
"step": 11960
},
{
"epoch": 10.0,
"eval_loss": 0.5053394436836243,
"eval_runtime": 136.9203,
"eval_samples_per_second": 34.56,
"eval_steps_per_second": 8.64,
"step": 11960
},
{
"epoch": 10.0,
"step": 11960,
"total_flos": 3.434387362032845e+18,
"train_loss": 0.5183154742055912,
"train_runtime": 30461.9007,
"train_samples_per_second": 12.568,
"train_steps_per_second": 0.393
}
],
"logging_steps": 20,
"max_steps": 11960,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 3.434387362032845e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}