{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9871244635193133,
"eval_steps": 500,
"global_step": 522,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02861230329041488,
"grad_norm": 2827.3015022702225,
"learning_rate": 9.766490138119515e-06,
"loss": 6.1313,
"step": 5
},
{
"epoch": 0.05722460658082976,
"grad_norm": 516.297994115725,
"learning_rate": 1.3972688495262568e-05,
"loss": 3.18,
"step": 10
},
{
"epoch": 0.08583690987124463,
"grad_norm": 10.64334537255521,
"learning_rate": 1.6433156804786183e-05,
"loss": 0.6962,
"step": 15
},
{
"epoch": 0.11444921316165951,
"grad_norm": 21.79989086330797,
"learning_rate": 1.8178886852405614e-05,
"loss": 0.4631,
"step": 20
},
{
"epoch": 0.1430615164520744,
"grad_norm": 8.407923664935195,
"learning_rate": 1.953298027623903e-05,
"loss": 0.3384,
"step": 25
},
{
"epoch": 0.17167381974248927,
"grad_norm": 4.980763069359495,
"learning_rate": 2e-05,
"loss": 0.2112,
"step": 30
},
{
"epoch": 0.20028612303290416,
"grad_norm": 3.3050323983887084,
"learning_rate": 2e-05,
"loss": 0.1508,
"step": 35
},
{
"epoch": 0.22889842632331903,
"grad_norm": 2.747487969978562,
"learning_rate": 2e-05,
"loss": 0.1331,
"step": 40
},
{
"epoch": 0.2575107296137339,
"grad_norm": 3.8930367082100776,
"learning_rate": 2e-05,
"loss": 0.1279,
"step": 45
},
{
"epoch": 0.2861230329041488,
"grad_norm": 2.262054738771732,
"learning_rate": 2e-05,
"loss": 0.1245,
"step": 50
},
{
"epoch": 0.3147353361945637,
"grad_norm": 2.1250650940368487,
"learning_rate": 2e-05,
"loss": 0.1133,
"step": 55
},
{
"epoch": 0.34334763948497854,
"grad_norm": 1.770036809289755,
"learning_rate": 2e-05,
"loss": 0.0945,
"step": 60
},
{
"epoch": 0.3719599427753934,
"grad_norm": 1.7938876086769961,
"learning_rate": 2e-05,
"loss": 0.0866,
"step": 65
},
{
"epoch": 0.4005722460658083,
"grad_norm": 1.4764321868179242,
"learning_rate": 2e-05,
"loss": 0.0827,
"step": 70
},
{
"epoch": 0.4291845493562232,
"grad_norm": 1.9128984486999376,
"learning_rate": 2e-05,
"loss": 0.0798,
"step": 75
},
{
"epoch": 0.45779685264663805,
"grad_norm": 1.46483598523261,
"learning_rate": 2e-05,
"loss": 0.0804,
"step": 80
},
{
"epoch": 0.4864091559370529,
"grad_norm": 1.6523142945768603,
"learning_rate": 2e-05,
"loss": 0.0758,
"step": 85
},
{
"epoch": 0.5150214592274678,
"grad_norm": 1.5333568395438437,
"learning_rate": 2e-05,
"loss": 0.0709,
"step": 90
},
{
"epoch": 0.5436337625178826,
"grad_norm": 1.5031573664657636,
"learning_rate": 2e-05,
"loss": 0.0687,
"step": 95
},
{
"epoch": 0.5722460658082976,
"grad_norm": 1.4132321325139292,
"learning_rate": 2e-05,
"loss": 0.0574,
"step": 100
},
{
"epoch": 0.6008583690987125,
"grad_norm": 1.3745064149711035,
"learning_rate": 2e-05,
"loss": 0.0572,
"step": 105
},
{
"epoch": 0.6294706723891274,
"grad_norm": 1.3470725333478024,
"learning_rate": 2e-05,
"loss": 0.0611,
"step": 110
},
{
"epoch": 0.6580829756795422,
"grad_norm": 1.1315562008981583,
"learning_rate": 2e-05,
"loss": 0.05,
"step": 115
},
{
"epoch": 0.6866952789699571,
"grad_norm": 1.2282177741629088,
"learning_rate": 2e-05,
"loss": 0.0525,
"step": 120
},
{
"epoch": 0.7153075822603719,
"grad_norm": 1.3933198446492454,
"learning_rate": 2e-05,
"loss": 0.0519,
"step": 125
},
{
"epoch": 0.7439198855507868,
"grad_norm": 1.7829406158054193,
"learning_rate": 2e-05,
"loss": 0.051,
"step": 130
},
{
"epoch": 0.7725321888412017,
"grad_norm": 1.1948798936363785,
"learning_rate": 2e-05,
"loss": 0.0514,
"step": 135
},
{
"epoch": 0.8011444921316166,
"grad_norm": 1.0816375150345683,
"learning_rate": 2e-05,
"loss": 0.0512,
"step": 140
},
{
"epoch": 0.8297567954220315,
"grad_norm": 1.469354846951377,
"learning_rate": 2e-05,
"loss": 0.0465,
"step": 145
},
{
"epoch": 0.8583690987124464,
"grad_norm": 1.2522970466753844,
"learning_rate": 2e-05,
"loss": 0.0493,
"step": 150
},
{
"epoch": 0.8869814020028612,
"grad_norm": 1.162286189716735,
"learning_rate": 2e-05,
"loss": 0.0474,
"step": 155
},
{
"epoch": 0.9155937052932761,
"grad_norm": 1.0718851830181713,
"learning_rate": 2e-05,
"loss": 0.0415,
"step": 160
},
{
"epoch": 0.944206008583691,
"grad_norm": 1.0733174430217316,
"learning_rate": 2e-05,
"loss": 0.0528,
"step": 165
},
{
"epoch": 0.9728183118741058,
"grad_norm": 1.0644789712973826,
"learning_rate": 2e-05,
"loss": 0.0456,
"step": 170
},
{
"epoch": 1.0014306151645207,
"grad_norm": 1.1496891631410193,
"learning_rate": 2e-05,
"loss": 0.0432,
"step": 175
},
{
"epoch": 1.0300429184549356,
"grad_norm": 1.0200369998966563,
"learning_rate": 2e-05,
"loss": 0.0241,
"step": 180
},
{
"epoch": 1.0586552217453504,
"grad_norm": 1.3931472581826994,
"learning_rate": 2e-05,
"loss": 0.0225,
"step": 185
},
{
"epoch": 1.0872675250357653,
"grad_norm": 1.0114502707229882,
"learning_rate": 2e-05,
"loss": 0.0239,
"step": 190
},
{
"epoch": 1.1158798283261802,
"grad_norm": 0.7510204564859241,
"learning_rate": 2e-05,
"loss": 0.027,
"step": 195
},
{
"epoch": 1.144492131616595,
"grad_norm": 0.7295063245482636,
"learning_rate": 2e-05,
"loss": 0.0235,
"step": 200
},
{
"epoch": 1.17310443490701,
"grad_norm": 0.856671906186716,
"learning_rate": 2e-05,
"loss": 0.0247,
"step": 205
},
{
"epoch": 1.201716738197425,
"grad_norm": 0.7255694844652782,
"learning_rate": 2e-05,
"loss": 0.0258,
"step": 210
},
{
"epoch": 1.2303290414878398,
"grad_norm": 0.9184490713297652,
"learning_rate": 2e-05,
"loss": 0.0276,
"step": 215
},
{
"epoch": 1.2589413447782547,
"grad_norm": 0.818205379161712,
"learning_rate": 2e-05,
"loss": 0.0219,
"step": 220
},
{
"epoch": 1.2875536480686696,
"grad_norm": 0.6654369429209799,
"learning_rate": 2e-05,
"loss": 0.0261,
"step": 225
},
{
"epoch": 1.3161659513590844,
"grad_norm": 0.5829250860946364,
"learning_rate": 2e-05,
"loss": 0.017,
"step": 230
},
{
"epoch": 1.3447782546494993,
"grad_norm": 0.6661015288674467,
"learning_rate": 2e-05,
"loss": 0.0228,
"step": 235
},
{
"epoch": 1.3733905579399142,
"grad_norm": 0.7177595155125043,
"learning_rate": 2e-05,
"loss": 0.0178,
"step": 240
},
{
"epoch": 1.402002861230329,
"grad_norm": 0.8848450057764279,
"learning_rate": 2e-05,
"loss": 0.0147,
"step": 245
},
{
"epoch": 1.4306151645207439,
"grad_norm": 0.6571566830429797,
"learning_rate": 2e-05,
"loss": 0.0216,
"step": 250
},
{
"epoch": 1.4592274678111588,
"grad_norm": 0.812436234834659,
"learning_rate": 2e-05,
"loss": 0.023,
"step": 255
},
{
"epoch": 1.4878397711015736,
"grad_norm": 0.7840598860015469,
"learning_rate": 2e-05,
"loss": 0.0209,
"step": 260
},
{
"epoch": 1.5164520743919887,
"grad_norm": 0.7844249253805873,
"learning_rate": 2e-05,
"loss": 0.0205,
"step": 265
},
{
"epoch": 1.5450643776824036,
"grad_norm": 0.7640044613122257,
"learning_rate": 2e-05,
"loss": 0.0238,
"step": 270
},
{
"epoch": 1.5736766809728184,
"grad_norm": 1.0261564863265702,
"learning_rate": 2e-05,
"loss": 0.0271,
"step": 275
},
{
"epoch": 1.6022889842632333,
"grad_norm": 0.6603554019675723,
"learning_rate": 2e-05,
"loss": 0.0224,
"step": 280
},
{
"epoch": 1.6309012875536482,
"grad_norm": 0.6112434008888443,
"learning_rate": 2e-05,
"loss": 0.0201,
"step": 285
},
{
"epoch": 1.659513590844063,
"grad_norm": 0.6941562759172227,
"learning_rate": 2e-05,
"loss": 0.0209,
"step": 290
},
{
"epoch": 1.688125894134478,
"grad_norm": 0.920122539331784,
"learning_rate": 2e-05,
"loss": 0.0224,
"step": 295
},
{
"epoch": 1.7167381974248928,
"grad_norm": 0.784663743006703,
"learning_rate": 2e-05,
"loss": 0.0204,
"step": 300
},
{
"epoch": 1.7453505007153076,
"grad_norm": 0.5371735092585386,
"learning_rate": 2e-05,
"loss": 0.0175,
"step": 305
},
{
"epoch": 1.7739628040057225,
"grad_norm": 0.4569754495971157,
"learning_rate": 2e-05,
"loss": 0.0191,
"step": 310
},
{
"epoch": 1.8025751072961373,
"grad_norm": 0.5809346149070659,
"learning_rate": 2e-05,
"loss": 0.0163,
"step": 315
},
{
"epoch": 1.8311874105865522,
"grad_norm": 0.9416876917379606,
"learning_rate": 2e-05,
"loss": 0.0186,
"step": 320
},
{
"epoch": 1.859799713876967,
"grad_norm": 0.9128407546360238,
"learning_rate": 2e-05,
"loss": 0.0177,
"step": 325
},
{
"epoch": 1.888412017167382,
"grad_norm": 0.6404265787090032,
"learning_rate": 2e-05,
"loss": 0.0142,
"step": 330
},
{
"epoch": 1.9170243204577968,
"grad_norm": 0.8659777729110113,
"learning_rate": 2e-05,
"loss": 0.0188,
"step": 335
},
{
"epoch": 1.9456366237482117,
"grad_norm": 0.7389926990941214,
"learning_rate": 2e-05,
"loss": 0.0205,
"step": 340
},
{
"epoch": 1.9742489270386265,
"grad_norm": 0.5010753341589564,
"learning_rate": 2e-05,
"loss": 0.0173,
"step": 345
},
{
"epoch": 2.0028612303290414,
"grad_norm": 0.44585995494505515,
"learning_rate": 2e-05,
"loss": 0.0176,
"step": 350
},
{
"epoch": 2.0314735336194563,
"grad_norm": 0.3372647662374521,
"learning_rate": 2e-05,
"loss": 0.0114,
"step": 355
},
{
"epoch": 2.060085836909871,
"grad_norm": 0.5724492338755197,
"learning_rate": 2e-05,
"loss": 0.0144,
"step": 360
},
{
"epoch": 2.088698140200286,
"grad_norm": 0.5187760378661088,
"learning_rate": 2e-05,
"loss": 0.0137,
"step": 365
},
{
"epoch": 2.117310443490701,
"grad_norm": 0.5106154674345083,
"learning_rate": 2e-05,
"loss": 0.0116,
"step": 370
},
{
"epoch": 2.1459227467811157,
"grad_norm": 0.5354245824814097,
"learning_rate": 2e-05,
"loss": 0.0122,
"step": 375
},
{
"epoch": 2.1745350500715306,
"grad_norm": 0.5465913593959528,
"learning_rate": 2e-05,
"loss": 0.0135,
"step": 380
},
{
"epoch": 2.2031473533619454,
"grad_norm": 0.7553346807762494,
"learning_rate": 2e-05,
"loss": 0.0139,
"step": 385
},
{
"epoch": 2.2317596566523603,
"grad_norm": 0.47986647806349203,
"learning_rate": 2e-05,
"loss": 0.0125,
"step": 390
},
{
"epoch": 2.260371959942775,
"grad_norm": 0.6926460223156353,
"learning_rate": 2e-05,
"loss": 0.012,
"step": 395
},
{
"epoch": 2.28898426323319,
"grad_norm": 0.7721328467064986,
"learning_rate": 2e-05,
"loss": 0.0104,
"step": 400
},
{
"epoch": 2.317596566523605,
"grad_norm": 0.3938351782885143,
"learning_rate": 2e-05,
"loss": 0.0094,
"step": 405
},
{
"epoch": 2.34620886981402,
"grad_norm": 0.7956446454256582,
"learning_rate": 2e-05,
"loss": 0.0127,
"step": 410
},
{
"epoch": 2.374821173104435,
"grad_norm": 0.7163390976077646,
"learning_rate": 2e-05,
"loss": 0.0117,
"step": 415
},
{
"epoch": 2.40343347639485,
"grad_norm": 0.40417151839328475,
"learning_rate": 2e-05,
"loss": 0.0116,
"step": 420
},
{
"epoch": 2.432045779685265,
"grad_norm": 0.7406033829214401,
"learning_rate": 2e-05,
"loss": 0.0115,
"step": 425
},
{
"epoch": 2.4606580829756797,
"grad_norm": 1.0372948520488305,
"learning_rate": 2e-05,
"loss": 0.013,
"step": 430
},
{
"epoch": 2.4892703862660945,
"grad_norm": 0.48303247551117084,
"learning_rate": 2e-05,
"loss": 0.0121,
"step": 435
},
{
"epoch": 2.5178826895565094,
"grad_norm": 0.822531770752665,
"learning_rate": 2e-05,
"loss": 0.0091,
"step": 440
},
{
"epoch": 2.5464949928469243,
"grad_norm": 0.5751055353850153,
"learning_rate": 2e-05,
"loss": 0.0117,
"step": 445
},
{
"epoch": 2.575107296137339,
"grad_norm": 0.8111046011909318,
"learning_rate": 2e-05,
"loss": 0.0138,
"step": 450
},
{
"epoch": 2.603719599427754,
"grad_norm": 0.5529988693729204,
"learning_rate": 2e-05,
"loss": 0.0137,
"step": 455
},
{
"epoch": 2.632331902718169,
"grad_norm": 0.6426046706622803,
"learning_rate": 2e-05,
"loss": 0.0128,
"step": 460
},
{
"epoch": 2.6609442060085837,
"grad_norm": 0.5842059243112792,
"learning_rate": 2e-05,
"loss": 0.0097,
"step": 465
},
{
"epoch": 2.6895565092989986,
"grad_norm": 0.9462035289351468,
"learning_rate": 2e-05,
"loss": 0.0111,
"step": 470
},
{
"epoch": 2.7181688125894135,
"grad_norm": 0.47730280851213186,
"learning_rate": 2e-05,
"loss": 0.01,
"step": 475
},
{
"epoch": 2.7467811158798283,
"grad_norm": 0.7829145546901836,
"learning_rate": 2e-05,
"loss": 0.0147,
"step": 480
},
{
"epoch": 2.775393419170243,
"grad_norm": 0.41532702346006606,
"learning_rate": 2e-05,
"loss": 0.0106,
"step": 485
},
{
"epoch": 2.804005722460658,
"grad_norm": 0.42916154288878555,
"learning_rate": 2e-05,
"loss": 0.0135,
"step": 490
},
{
"epoch": 2.832618025751073,
"grad_norm": 0.5117471137135019,
"learning_rate": 2e-05,
"loss": 0.0095,
"step": 495
},
{
"epoch": 2.8612303290414878,
"grad_norm": 0.42607884594383777,
"learning_rate": 2e-05,
"loss": 0.0105,
"step": 500
},
{
"epoch": 2.8898426323319026,
"grad_norm": 0.5078330866142711,
"learning_rate": 2e-05,
"loss": 0.0111,
"step": 505
},
{
"epoch": 2.9184549356223175,
"grad_norm": 0.872560256021644,
"learning_rate": 2e-05,
"loss": 0.0096,
"step": 510
},
{
"epoch": 2.9470672389127324,
"grad_norm": 0.5473131169980906,
"learning_rate": 2e-05,
"loss": 0.0119,
"step": 515
},
{
"epoch": 2.9756795422031472,
"grad_norm": 0.33829761962434335,
"learning_rate": 2e-05,
"loss": 0.0094,
"step": 520
}
],
"logging_steps": 5,
"max_steps": 522,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 10457554665472.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}