rakhman-llm committed on
Commit 4d06f00 · verified · 1 Parent(s): f9c41b6

Training in progress, step 14500, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d6126aaf54fb0a8fa636e61c84c76f38b4601fd61074fb5bca240d2a736f4f66
+oid sha256:ff540b9931ff3b9051aa7ca25e64a8e24ca7e8526e26bc089bc76ac294f8b424
 size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f867dcc57d02d73c1b56b57d7d1b1b32d706b98344ba93376758e87f7b8d3c66
+oid sha256:a063f1bf8d5647653d13d9d67ef96e737ad0d45a0be438a667d109a536c16697
 size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:641d66ee8d5d7452522e196306c4649612954d1a71ab68743aace7ace44a6ed9
+oid sha256:b330fcd1c4b02a2985ceb253ae804bf16171e108be502780a5eab9132fe8fd30
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:01772d8cb8f09c4951eb7361881a716af1bc369413a65c2401d6dc240309cc7c
+oid sha256:8c2d50ca5391f126ee8cc2961637b87c22247c1f8f80ebd7b00bac4d79271141
 size 1064
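
All four files above are Git LFS pointers, so this commit only swaps each pointer's oid sha256 digest; the recorded sizes are unchanged. As a minimal sketch (not part of this commit, and with hypothetical local paths), a pointer's digest can be checked against a locally downloaded payload using only the Python standard library:

```python
import hashlib
from pathlib import Path

def lfs_pointer_oid(pointer_text: str) -> str:
    """Extract the sha256 digest from the text of a git-lfs pointer file."""
    for line in pointer_text.splitlines():
        if line.startswith("oid sha256:"):
            return line.split("oid sha256:", 1)[1].strip()
    raise ValueError("not a git-lfs pointer")

def sha256_of_file(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file in chunks so large checkpoints need not fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Hypothetical paths: adjust to wherever the pointer text and payload live locally.
pointer_text = Path("last-checkpoint/model.safetensors.pointer").read_text()
payload = Path("last-checkpoint/model.safetensors")
assert sha256_of_file(payload) == lfs_pointer_oid(pointer_text), "digest mismatch"
```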
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": 0.08158940076828003,
   "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
-  "epoch": 2.24,
+  "epoch": 2.32,
   "eval_steps": 500,
-  "global_step": 14000,
+  "global_step": 14500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2191,6 +2191,84 @@
       "eval_samples_per_second": 17.136,
       "eval_steps_per_second": 2.142,
       "step": 14000
+    },
+    {
+      "epoch": 2.248,
+      "grad_norm": 5661.556640625,
+      "learning_rate": 7.52e-06,
+      "loss": 0.0516,
+      "step": 14050
+    },
+    {
+      "epoch": 2.2560000000000002,
+      "grad_norm": 6117.46728515625,
+      "learning_rate": 7.44e-06,
+      "loss": 0.0535,
+      "step": 14100
+    },
+    {
+      "epoch": 2.2640000000000002,
+      "grad_norm": 5083.50634765625,
+      "learning_rate": 7.36e-06,
+      "loss": 0.0514,
+      "step": 14150
+    },
+    {
+      "epoch": 2.2720000000000002,
+      "grad_norm": 6597.24365234375,
+      "learning_rate": 7.280000000000001e-06,
+      "loss": 0.0566,
+      "step": 14200
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 7306.90185546875,
+      "learning_rate": 7.2e-06,
+      "loss": 0.0523,
+      "step": 14250
+    },
+    {
+      "epoch": 2.288,
+      "grad_norm": 6694.41552734375,
+      "learning_rate": 7.1200000000000004e-06,
+      "loss": 0.0475,
+      "step": 14300
+    },
+    {
+      "epoch": 2.296,
+      "grad_norm": 3753.303466796875,
+      "learning_rate": 7.04e-06,
+      "loss": 0.0501,
+      "step": 14350
+    },
+    {
+      "epoch": 2.304,
+      "grad_norm": 5714.30078125,
+      "learning_rate": 6.96e-06,
+      "loss": 0.0485,
+      "step": 14400
+    },
+    {
+      "epoch": 2.312,
+      "grad_norm": 7579.119140625,
+      "learning_rate": 6.88e-06,
+      "loss": 0.0504,
+      "step": 14450
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 6103.64599609375,
+      "learning_rate": 6.8e-06,
+      "loss": 0.0531,
+      "step": 14500
+    },
+    {
+      "epoch": 2.32,
+      "eval_loss": 0.08199251443147659,
+      "eval_runtime": 116.661,
+      "eval_samples_per_second": 17.144,
+      "eval_steps_per_second": 2.143,
+      "step": 14500
     }
   ],
   "logging_steps": 50,
@@ -2210,7 +2288,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 6.820328374272e+16,
+  "total_flos": 7.063911530496e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null