rakhman-llm committed
Commit a4b917c · verified · 1 Parent(s): 5b9180b

Training in progress, step 16500, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:39065861c3b099700bb000236a9e498a039343f48398b98c2fb9a14bb096b6c1
+oid sha256:bcd4b260e271cf50f226ed5b7b322e87d12ca1eaa8c5c59023d510360785bd75
 size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e12bc163cfe02cd564c638e298b40f4155a4f87b8f01adeb81746cbd797e5ef
+oid sha256:c320206d058917027a9677aae069cffa405a4671b0c52efd251f8eff20644910
 size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4eebbe4891f802a9c170f9968f3ddf4d71a0eff2d959ee8775998aa9a7034155
+oid sha256:55c1d4daeb9503469e608c59ee558a87893ccabfba88dd0e89cafcafa3474077
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6949e36257fd6db2d60885342ebdb2dc7d96352bf8e47bc5271716a48f4cbc6e
+oid sha256:b17e813c289d034a09f864648b82a92fe61bb865901d4d03fe480b494ff98cc3
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": 0.08158940076828003,
   "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
-  "epoch": 2.56,
+  "epoch": 2.64,
   "eval_steps": 500,
-  "global_step": 16000,
+  "global_step": 16500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2503,6 +2503,84 @@
       "eval_samples_per_second": 17.123,
       "eval_steps_per_second": 2.14,
       "step": 16000
+    },
+    {
+      "epoch": 2.568,
+      "grad_norm": 5753.2734375,
+      "learning_rate": 4.32e-06,
+      "loss": 0.0474,
+      "step": 16050
+    },
+    {
+      "epoch": 2.576,
+      "grad_norm": 8457.216796875,
+      "learning_rate": 4.24e-06,
+      "loss": 0.0511,
+      "step": 16100
+    },
+    {
+      "epoch": 2.584,
+      "grad_norm": 8808.5458984375,
+      "learning_rate": 4.16e-06,
+      "loss": 0.0497,
+      "step": 16150
+    },
+    {
+      "epoch": 2.592,
+      "grad_norm": 7374.8994140625,
+      "learning_rate": 4.080000000000001e-06,
+      "loss": 0.0469,
+      "step": 16200
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 8480.7109375,
+      "learning_rate": 4e-06,
+      "loss": 0.0527,
+      "step": 16250
+    },
+    {
+      "epoch": 2.608,
+      "grad_norm": 9268.271484375,
+      "learning_rate": 3.92e-06,
+      "loss": 0.0514,
+      "step": 16300
+    },
+    {
+      "epoch": 2.616,
+      "grad_norm": 7013.30810546875,
+      "learning_rate": 3.8400000000000005e-06,
+      "loss": 0.0529,
+      "step": 16350
+    },
+    {
+      "epoch": 2.624,
+      "grad_norm": 5726.31298828125,
+      "learning_rate": 3.76e-06,
+      "loss": 0.0532,
+      "step": 16400
+    },
+    {
+      "epoch": 2.632,
+      "grad_norm": 6278.4267578125,
+      "learning_rate": 3.68e-06,
+      "loss": 0.0508,
+      "step": 16450
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 7901.65576171875,
+      "learning_rate": 3.6e-06,
+      "loss": 0.0489,
+      "step": 16500
+    },
+    {
+      "epoch": 2.64,
+      "eval_loss": 0.08172949403524399,
+      "eval_runtime": 116.6616,
+      "eval_samples_per_second": 17.144,
+      "eval_steps_per_second": 2.143,
+      "step": 16500
     }
   ],
   "logging_steps": 50,
@@ -2522,7 +2600,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.794660999168e+16,
+  "total_flos": 8.038244155392e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null