rakhman-llm commited on
Commit
f1f13a5
·
verified ·
1 Parent(s): 877acc0

Training in progress, step 13500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84ffb6564333bec2f290dbb25cc2aa16322f854baa8d4d551f3c98e898121f1a
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d7e7ce5d8087ead3443b4f08e582b8deb0371c597dc0bcefa92b27b47cf757b
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55950639d5a01737cdced1ff8ae565003480c956e54966fe29dbcc5ee832bbe0
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b3343f5795c77c59b4597ad652cbd699824ac18f885e075e189a3d182b5959e
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ede19b1f06f575564a160b0c17fcb5315d8660261b38069d03c83f6d06084b12
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:134eebb9d67a08338a927439b5399d422ef15da51b7ad38136806c07d685a893
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ed905996f8c375ddf1e71a02110476c5d9bb4d922dca340182e086437e4a3a1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f1ce7c9ab70607ae470f91da12ff0d4b8f6d4eec2f38f9df9861eec2ba77599
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
- "epoch": 2.08,
5
  "eval_steps": 500,
6
- "global_step": 13000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2035,6 +2035,84 @@
2035
  "eval_samples_per_second": 17.094,
2036
  "eval_steps_per_second": 2.137,
2037
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2038
  }
2039
  ],
2040
  "logging_steps": 50,
@@ -2054,7 +2132,7 @@
2054
  "attributes": {}
2055
  }
2056
  },
2057
- "total_flos": 6.333162061824e+16,
2058
  "train_batch_size": 8,
2059
  "trial_name": null,
2060
  "trial_params": null
 
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
+ "epoch": 2.16,
5
  "eval_steps": 500,
6
+ "global_step": 13500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2035
  "eval_samples_per_second": 17.094,
2036
  "eval_steps_per_second": 2.137,
2037
  "step": 13000
2038
+ },
2039
+ {
2040
+ "epoch": 2.088,
2041
+ "grad_norm": 7875.97900390625,
2042
+ "learning_rate": 9.12e-06,
2043
+ "loss": 0.049,
2044
+ "step": 13050
2045
+ },
2046
+ {
2047
+ "epoch": 2.096,
2048
+ "grad_norm": 6825.78076171875,
2049
+ "learning_rate": 9.04e-06,
2050
+ "loss": 0.0465,
2051
+ "step": 13100
2052
+ },
2053
+ {
2054
+ "epoch": 2.104,
2055
+ "grad_norm": 5515.30322265625,
2056
+ "learning_rate": 8.96e-06,
2057
+ "loss": 0.0535,
2058
+ "step": 13150
2059
+ },
2060
+ {
2061
+ "epoch": 2.112,
2062
+ "grad_norm": 8940.48828125,
2063
+ "learning_rate": 8.88e-06,
2064
+ "loss": 0.0564,
2065
+ "step": 13200
2066
+ },
2067
+ {
2068
+ "epoch": 2.12,
2069
+ "grad_norm": 5110.7119140625,
2070
+ "learning_rate": 8.8e-06,
2071
+ "loss": 0.0509,
2072
+ "step": 13250
2073
+ },
2074
+ {
2075
+ "epoch": 2.128,
2076
+ "grad_norm": 8984.7353515625,
2077
+ "learning_rate": 8.720000000000001e-06,
2078
+ "loss": 0.0479,
2079
+ "step": 13300
2080
+ },
2081
+ {
2082
+ "epoch": 2.136,
2083
+ "grad_norm": 8438.55078125,
2084
+ "learning_rate": 8.64e-06,
2085
+ "loss": 0.0502,
2086
+ "step": 13350
2087
+ },
2088
+ {
2089
+ "epoch": 2.144,
2090
+ "grad_norm": 5724.0849609375,
2091
+ "learning_rate": 8.56e-06,
2092
+ "loss": 0.0501,
2093
+ "step": 13400
2094
+ },
2095
+ {
2096
+ "epoch": 2.152,
2097
+ "grad_norm": 7649.28955078125,
2098
+ "learning_rate": 8.48e-06,
2099
+ "loss": 0.0569,
2100
+ "step": 13450
2101
+ },
2102
+ {
2103
+ "epoch": 2.16,
2104
+ "grad_norm": 8429.0166015625,
2105
+ "learning_rate": 8.400000000000001e-06,
2106
+ "loss": 0.053,
2107
+ "step": 13500
2108
+ },
2109
+ {
2110
+ "epoch": 2.16,
2111
+ "eval_loss": 0.08213882148265839,
2112
+ "eval_runtime": 116.6956,
2113
+ "eval_samples_per_second": 17.139,
2114
+ "eval_steps_per_second": 2.142,
2115
+ "step": 13500
2116
  }
2117
  ],
2118
  "logging_steps": 50,
 
2132
  "attributes": {}
2133
  }
2134
  },
2135
+ "total_flos": 6.576745218048e+16,
2136
  "train_batch_size": 8,
2137
  "trial_name": null,
2138
  "trial_params": null