Uploaded checkpoint-4000

Browse files

Files changed (5) hide show

adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +719 -3

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a4b05a5b65eb469f401a2ef2bb091ffae27e60f3ab9b87eeccdd1b89d55e021
 size 119975656

 version https://git-lfs.github.com/spec/v1
+oid sha256:972dc6c25a7ba22f961a54b6dfec63f8f71aa505086b3975df6cf082fc13115e
 size 119975656

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:491a365467f19688384c47c41c87716a9585747c16862fa9ba0b732548ebc124
 size 240145026

 version https://git-lfs.github.com/spec/v1
+oid sha256:3143a5365fcf9595164c34006dbd78149b5d0e6391fbf16643ee4c70b2a2e398
 size 240145026

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f7eeee07b40fef8c7bdf027c427b1fc8d6a45d979762d8d637d73e82015e5add
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:b90bc42e5601a089b4f97e9c36e907416b25c070b74e200626385618b0995aae
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:770db92ac44ccb712216aece2abb8a41e68fd6d952c7ae7884e9032fb3cc3f81
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2f80b0441e18382140898e5947e4bf00161c8985bfd13094069daa8dad861cc8
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.018277771770954132,
   "best_model_checkpoint": "runs/deepseek_lora_20240424-122712/checkpoint-2000",
-  "epoch": 0.9399232396020991,
   "eval_steps": 500,
-  "global_step": 3000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2155,6 +2155,722 @@
       "eval_samples_per_second": 16.132,
       "eval_steps_per_second": 16.132,
       "step": 3000
     }
   ],
   "logging_steps": 10,
@@ -2162,7 +2878,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,
-  "total_flos": 4.8306377981952e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.018277771770954132,
   "best_model_checkpoint": "runs/deepseek_lora_20240424-122712/checkpoint-2000",
+  "epoch": 1.2532309861361322,
   "eval_steps": 500,
+  "global_step": 4000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 16.132,
       "eval_steps_per_second": 16.132,
       "step": 3000
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 2.95473575592041,
+      "learning_rate": 8.844444444444445e-06,
+      "loss": 0.1515,
+      "step": 3010
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.005038989707827568,
+      "learning_rate": 8.8e-06,
+      "loss": 0.0355,
+      "step": 3020
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.002464048098772764,
+      "learning_rate": 8.755555555555556e-06,
+      "loss": 0.0157,
+      "step": 3030
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 2.865673065185547,
+      "learning_rate": 8.711111111111111e-06,
+      "loss": 0.0994,
+      "step": 3040
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.023971589282155037,
+      "learning_rate": 8.666666666666668e-06,
+      "loss": 0.0467,
+      "step": 3050
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.0025374030228704214,
+      "learning_rate": 8.622222222222223e-06,
+      "loss": 0.0488,
+      "step": 3060
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.8780492544174194,
+      "learning_rate": 8.577777777777778e-06,
+      "loss": 0.0453,
+      "step": 3070
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.0010841538896784186,
+      "learning_rate": 8.533333333333335e-06,
+      "loss": 0.0473,
+      "step": 3080
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 2.532902240753174,
+      "learning_rate": 8.48888888888889e-06,
+      "loss": 0.0135,
+      "step": 3090
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.0008325451053678989,
+      "learning_rate": 8.444444444444446e-06,
+      "loss": 0.0645,
+      "step": 3100
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.01362746674567461,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 0.107,
+      "step": 3110
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.056719791144132614,
+      "learning_rate": 8.355555555555556e-06,
+      "loss": 0.0187,
+      "step": 3120
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.0013840706087648869,
+      "learning_rate": 8.311111111111111e-06,
+      "loss": 0.0435,
+      "step": 3130
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.0014659567968919873,
+      "learning_rate": 8.266666666666667e-06,
+      "loss": 0.0918,
+      "step": 3140
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 3.019699811935425,
+      "learning_rate": 8.222222222222222e-06,
+      "loss": 0.0166,
+      "step": 3150
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.002052758354693651,
+      "learning_rate": 8.177777777777779e-06,
+      "loss": 0.0373,
+      "step": 3160
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.0011199481086805463,
+      "learning_rate": 8.133333333333334e-06,
+      "loss": 0.0105,
+      "step": 3170
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.0013342432212084532,
+      "learning_rate": 8.08888888888889e-06,
+      "loss": 0.0512,
+      "step": 3180
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.0014090395998209715,
+      "learning_rate": 8.044444444444444e-06,
+      "loss": 0.0338,
+      "step": 3190
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.101834774017334,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.0129,
+      "step": 3200
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.5294092297554016,
+      "learning_rate": 7.955555555555557e-06,
+      "loss": 0.0153,
+      "step": 3210
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 3.8237059116363525,
+      "learning_rate": 7.911111111111112e-06,
+      "loss": 0.0738,
+      "step": 3220
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.0007992366445250809,
+      "learning_rate": 7.866666666666667e-06,
+      "loss": 0.0166,
+      "step": 3230
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.7395054697990417,
+      "learning_rate": 7.822222222222224e-06,
+      "loss": 0.0087,
+      "step": 3240
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.0022702962160110474,
+      "learning_rate": 7.77777777777778e-06,
+      "loss": 0.0137,
+      "step": 3250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.18367303907871246,
+      "learning_rate": 7.733333333333334e-06,
+      "loss": 0.0246,
+      "step": 3260
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 1.4116305112838745,
+      "learning_rate": 7.68888888888889e-06,
+      "loss": 0.0722,
+      "step": 3270
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.0008715521544218063,
+      "learning_rate": 7.644444444444445e-06,
+      "loss": 0.0018,
+      "step": 3280
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.0013335467083379626,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 0.0003,
+      "step": 3290
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.002115165116265416,
+      "learning_rate": 7.555555555555556e-06,
+      "loss": 0.0087,
+      "step": 3300
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.0042011737823486,
+      "learning_rate": 7.511111111111111e-06,
+      "loss": 0.0303,
+      "step": 3310
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.000751888903323561,
+      "learning_rate": 7.4666666666666675e-06,
+      "loss": 0.0214,
+      "step": 3320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.09692036360502243,
+      "learning_rate": 7.422222222222223e-06,
+      "loss": 0.113,
+      "step": 3330
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 4.702492713928223,
+      "learning_rate": 7.377777777777778e-06,
+      "loss": 0.0581,
+      "step": 3340
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.01480321865528822,
+      "learning_rate": 7.333333333333333e-06,
+      "loss": 0.022,
+      "step": 3350
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.0011497796513140202,
+      "learning_rate": 7.28888888888889e-06,
+      "loss": 0.0232,
+      "step": 3360
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.0010983615648001432,
+      "learning_rate": 7.244444444444445e-06,
+      "loss": 0.0375,
+      "step": 3370
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.005294387228786945,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 0.0548,
+      "step": 3380
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 2.251269578933716,
+      "learning_rate": 7.155555555555556e-06,
+      "loss": 0.0205,
+      "step": 3390
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.003230300033465028,
+      "learning_rate": 7.111111111111112e-06,
+      "loss": 0.0147,
+      "step": 3400
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 1.5663217306137085,
+      "learning_rate": 7.066666666666667e-06,
+      "loss": 0.0186,
+      "step": 3410
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.0012102341279387474,
+      "learning_rate": 7.022222222222222e-06,
+      "loss": 0.0016,
+      "step": 3420
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.000960271863732487,
+      "learning_rate": 6.977777777777779e-06,
+      "loss": 0.0351,
+      "step": 3430
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.001627901685424149,
+      "learning_rate": 6.9333333333333344e-06,
+      "loss": 0.0494,
+      "step": 3440
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.0015967305516824126,
+      "learning_rate": 6.88888888888889e-06,
+      "loss": 0.0052,
+      "step": 3450
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.0006052978569641709,
+      "learning_rate": 6.844444444444445e-06,
+      "loss": 0.0198,
+      "step": 3460
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.832760214805603,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 0.0147,
+      "step": 3470
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.8160419464111328,
+      "learning_rate": 6.755555555555556e-06,
+      "loss": 0.0256,
+      "step": 3480
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 1.7934602499008179,
+      "learning_rate": 6.711111111111111e-06,
+      "loss": 0.0471,
+      "step": 3490
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.01658487133681774,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.043,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1,
+      "eval_loss": 0.021265115588903427,
+      "eval_runtime": 61.9415,
+      "eval_samples_per_second": 16.144,
+      "eval_steps_per_second": 16.144,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.002576815662905574,
+      "learning_rate": 6.6222222222222236e-06,
+      "loss": 0.0521,
+      "step": 3510
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.005587077233940363,
+      "learning_rate": 6.577777777777779e-06,
+      "loss": 0.0098,
+      "step": 3520
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.7236731052398682,
+      "learning_rate": 6.533333333333334e-06,
+      "loss": 0.0434,
+      "step": 3530
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.013782077468931675,
+      "learning_rate": 6.488888888888889e-06,
+      "loss": 0.0231,
+      "step": 3540
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.0013029536930844188,
+      "learning_rate": 6.444444444444445e-06,
+      "loss": 0.006,
+      "step": 3550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0017811213620007038,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.0335,
+      "step": 3560
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0008715124567970634,
+      "learning_rate": 6.355555555555556e-06,
+      "loss": 0.0285,
+      "step": 3570
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.002087782369926572,
+      "learning_rate": 6.311111111111111e-06,
+      "loss": 0.0157,
+      "step": 3580
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0018431423231959343,
+      "learning_rate": 6.266666666666668e-06,
+      "loss": 0.007,
+      "step": 3590
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 2.5827839374542236,
+      "learning_rate": 6.222222222222223e-06,
+      "loss": 0.0288,
+      "step": 3600
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.00216556154191494,
+      "learning_rate": 6.177777777777778e-06,
+      "loss": 0.0001,
+      "step": 3610
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.0011645135236904025,
+      "learning_rate": 6.133333333333334e-06,
+      "loss": 0.0384,
+      "step": 3620
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.5017549991607666,
+      "learning_rate": 6.08888888888889e-06,
+      "loss": 0.0133,
+      "step": 3630
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 1.0787444114685059,
+      "learning_rate": 6.044444444444445e-06,
+      "loss": 0.0201,
+      "step": 3640
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.6228489875793457,
+      "learning_rate": 6e-06,
+      "loss": 0.0316,
+      "step": 3650
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 2.6893579959869385,
+      "learning_rate": 5.955555555555555e-06,
+      "loss": 0.0296,
+      "step": 3660
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 2.737757921218872,
+      "learning_rate": 5.911111111111112e-06,
+      "loss": 0.0281,
+      "step": 3670
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.0011677155271172523,
+      "learning_rate": 5.8666666666666675e-06,
+      "loss": 0.0446,
+      "step": 3680
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.590535044670105,
+      "learning_rate": 5.822222222222223e-06,
+      "loss": 0.0668,
+      "step": 3690
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 3.329134464263916,
+      "learning_rate": 5.777777777777778e-06,
+      "loss": 0.023,
+      "step": 3700
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.0014503680868074298,
+      "learning_rate": 5.733333333333334e-06,
+      "loss": 0.0343,
+      "step": 3710
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.03579283133149147,
+      "learning_rate": 5.688888888888889e-06,
+      "loss": 0.0767,
+      "step": 3720
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.0014656345592811704,
+      "learning_rate": 5.6444444444444445e-06,
+      "loss": 0.0202,
+      "step": 3730
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.0010772488312795758,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.0521,
+      "step": 3740
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.007391482125967741,
+      "learning_rate": 5.555555555555557e-06,
+      "loss": 0.0177,
+      "step": 3750
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.0013084843521937728,
+      "learning_rate": 5.511111111111112e-06,
+      "loss": 0.0132,
+      "step": 3760
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.0018328677397221327,
+      "learning_rate": 5.466666666666667e-06,
+      "loss": 0.0833,
+      "step": 3770
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.002266037743538618,
+      "learning_rate": 5.422222222222223e-06,
+      "loss": 0.0165,
+      "step": 3780
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.35491234064102173,
+      "learning_rate": 5.3777777777777784e-06,
+      "loss": 0.0205,
+      "step": 3790
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.08552182465791702,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.0153,
+      "step": 3800
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.0011903179110959172,
+      "learning_rate": 5.288888888888889e-06,
+      "loss": 0.0164,
+      "step": 3810
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.002342939842492342,
+      "learning_rate": 5.244444444444445e-06,
+      "loss": 0.0301,
+      "step": 3820
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.0657195895910263,
+      "learning_rate": 5.2e-06,
+      "loss": 0.0162,
+      "step": 3830
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.0015936404233798385,
+      "learning_rate": 5.155555555555556e-06,
+      "loss": 0.03,
+      "step": 3840
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.0019919448532164097,
+      "learning_rate": 5.1111111111111115e-06,
+      "loss": 0.0003,
+      "step": 3850
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.1910542249679565,
+      "learning_rate": 5.0666666666666676e-06,
+      "loss": 0.0309,
+      "step": 3860
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.0016105415998026729,
+      "learning_rate": 5.022222222222223e-06,
+      "loss": 0.0319,
+      "step": 3870
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.0006896441336721182,
+      "learning_rate": 4.977777777777778e-06,
+      "loss": 0.0187,
+      "step": 3880
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 2.6113905906677246,
+      "learning_rate": 4.933333333333334e-06,
+      "loss": 0.016,
+      "step": 3890
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 1.3897886276245117,
+      "learning_rate": 4.888888888888889e-06,
+      "loss": 0.0575,
+      "step": 3900
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.0009651753352954984,
+      "learning_rate": 4.8444444444444446e-06,
+      "loss": 0.001,
+      "step": 3910
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.003367891302332282,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.0507,
+      "step": 3920
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.4884891510009766,
+      "learning_rate": 4.755555555555556e-06,
+      "loss": 0.0078,
+      "step": 3930
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.0011329209664836526,
+      "learning_rate": 4.711111111111111e-06,
+      "loss": 0.0,
+      "step": 3940
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.0015582548221573234,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.0263,
+      "step": 3950
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.0019160009687766433,
+      "learning_rate": 4.622222222222222e-06,
+      "loss": 0.025,
+      "step": 3960
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.0009318340453319252,
+      "learning_rate": 4.5777777777777785e-06,
+      "loss": 0.0474,
+      "step": 3970
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0015654967864975333,
+      "learning_rate": 4.533333333333334e-06,
+      "loss": 0.025,
+      "step": 3980
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 2.0137476921081543,
+      "learning_rate": 4.488888888888889e-06,
+      "loss": 0.0092,
+      "step": 3990
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.03859930485486984,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.0231,
+      "step": 4000
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.02214735746383667,
+      "eval_runtime": 61.9646,
+      "eval_samples_per_second": 16.138,
+      "eval_steps_per_second": 16.138,
+      "step": 4000
     }
   ],
   "logging_steps": 10,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,
+  "total_flos": 6.4408503975936e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null