rakhman-llm commited on
Commit
7e60b10
·
verified ·
1 Parent(s): a91b3dc

Training in progress, step 15000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff540b9931ff3b9051aa7ca25e64a8e24ca7e8526e26bc089bc76ac294f8b424
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e05637d9fe00567351aebe30b8907548391539066a69466b08d62fb0de2c8b6a
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a063f1bf8d5647653d13d9d67ef96e737ad0d45a0be438a667d109a536c16697
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd2dde3b7d6cb9a958c80e4da86c9ac7e84d7b0aad33d337c26e27372676e0e8
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b330fcd1c4b02a2985ceb253ae804bf16171e108be502780a5eab9132fe8fd30
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ac6d446adeddd129c374743386b9fda911e1104accc0a9ad12d81db0a9913ff
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c2d50ca5391f126ee8cc2961637b87c22247c1f8f80ebd7b00bac4d79271141
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e468fb4a523a6bf7dace3eac71fcc8bc1ed6b95078548573228e864e9505bcd
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
- "epoch": 2.32,
5
  "eval_steps": 500,
6
- "global_step": 14500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2269,6 +2269,84 @@
2269
  "eval_samples_per_second": 17.144,
2270
  "eval_steps_per_second": 2.143,
2271
  "step": 14500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2272
  }
2273
  ],
2274
  "logging_steps": 50,
@@ -2288,7 +2366,7 @@
2288
  "attributes": {}
2289
  }
2290
  },
2291
- "total_flos": 7.063911530496e+16,
2292
  "train_batch_size": 8,
2293
  "trial_name": null,
2294
  "trial_params": null
 
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
+ "epoch": 2.4,
5
  "eval_steps": 500,
6
+ "global_step": 15000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2269
  "eval_samples_per_second": 17.144,
2270
  "eval_steps_per_second": 2.143,
2271
  "step": 14500
2272
+ },
2273
+ {
2274
+ "epoch": 2.328,
2275
+ "grad_norm": 7419.63623046875,
2276
+ "learning_rate": 6.72e-06,
2277
+ "loss": 0.0527,
2278
+ "step": 14550
2279
+ },
2280
+ {
2281
+ "epoch": 2.336,
2282
+ "grad_norm": 6152.6513671875,
2283
+ "learning_rate": 6.64e-06,
2284
+ "loss": 0.048,
2285
+ "step": 14600
2286
+ },
2287
+ {
2288
+ "epoch": 2.344,
2289
+ "grad_norm": 6703.68994140625,
2290
+ "learning_rate": 6.560000000000001e-06,
2291
+ "loss": 0.0537,
2292
+ "step": 14650
2293
+ },
2294
+ {
2295
+ "epoch": 2.352,
2296
+ "grad_norm": 8612.31640625,
2297
+ "learning_rate": 6.48e-06,
2298
+ "loss": 0.0512,
2299
+ "step": 14700
2300
+ },
2301
+ {
2302
+ "epoch": 2.36,
2303
+ "grad_norm": 6183.3798828125,
2304
+ "learning_rate": 6.4000000000000006e-06,
2305
+ "loss": 0.0499,
2306
+ "step": 14750
2307
+ },
2308
+ {
2309
+ "epoch": 2.368,
2310
+ "grad_norm": 7795.396484375,
2311
+ "learning_rate": 6.3200000000000005e-06,
2312
+ "loss": 0.0525,
2313
+ "step": 14800
2314
+ },
2315
+ {
2316
+ "epoch": 2.376,
2317
+ "grad_norm": 6911.2099609375,
2318
+ "learning_rate": 6.2399999999999995e-06,
2319
+ "loss": 0.0503,
2320
+ "step": 14850
2321
+ },
2322
+ {
2323
+ "epoch": 2.384,
2324
+ "grad_norm": 9744.9267578125,
2325
+ "learning_rate": 6.16e-06,
2326
+ "loss": 0.0509,
2327
+ "step": 14900
2328
+ },
2329
+ {
2330
+ "epoch": 2.392,
2331
+ "grad_norm": 4487.8115234375,
2332
+ "learning_rate": 6.08e-06,
2333
+ "loss": 0.0504,
2334
+ "step": 14950
2335
+ },
2336
+ {
2337
+ "epoch": 2.4,
2338
+ "grad_norm": 6276.47607421875,
2339
+ "learning_rate": 6e-06,
2340
+ "loss": 0.0505,
2341
+ "step": 15000
2342
+ },
2343
+ {
2344
+ "epoch": 2.4,
2345
+ "eval_loss": 0.08178989589214325,
2346
+ "eval_runtime": 116.6529,
2347
+ "eval_samples_per_second": 17.145,
2348
+ "eval_steps_per_second": 2.143,
2349
+ "step": 15000
2350
  }
2351
  ],
2352
  "logging_steps": 50,
 
2366
  "attributes": {}
2367
  }
2368
  },
2369
+ "total_flos": 7.30749468672e+16,
2370
  "train_batch_size": 8,
2371
  "trial_name": null,
2372
  "trial_params": null