rakhman-llm commited on
Commit
d118270
·
verified ·
1 Parent(s): d4bde4b

Training in progress, step 14000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d7e7ce5d8087ead3443b4f08e582b8deb0371c597dc0bcefa92b27b47cf757b
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6126aaf54fb0a8fa636e61c84c76f38b4601fd61074fb5bca240d2a736f4f66
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b3343f5795c77c59b4597ad652cbd699824ac18f885e075e189a3d182b5959e
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f867dcc57d02d73c1b56b57d7d1b1b32d706b98344ba93376758e87f7b8d3c66
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:134eebb9d67a08338a927439b5399d422ef15da51b7ad38136806c07d685a893
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:641d66ee8d5d7452522e196306c4649612954d1a71ab68743aace7ace44a6ed9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f1ce7c9ab70607ae470f91da12ff0d4b8f6d4eec2f38f9df9861eec2ba77599
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01772d8cb8f09c4951eb7361881a716af1bc369413a65c2401d6dc240309cc7c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
- "epoch": 2.16,
5
  "eval_steps": 500,
6
- "global_step": 13500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2113,6 +2113,84 @@
2113
  "eval_samples_per_second": 17.139,
2114
  "eval_steps_per_second": 2.142,
2115
  "step": 13500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2116
  }
2117
  ],
2118
  "logging_steps": 50,
@@ -2132,7 +2210,7 @@
2132
  "attributes": {}
2133
  }
2134
  },
2135
- "total_flos": 6.576745218048e+16,
2136
  "train_batch_size": 8,
2137
  "trial_name": null,
2138
  "trial_params": null
 
1
  {
2
  "best_metric": 0.08158940076828003,
3
  "best_model_checkpoint": "./fine-tuned/checkpoint-12500",
4
+ "epoch": 2.24,
5
  "eval_steps": 500,
6
+ "global_step": 14000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2113
  "eval_samples_per_second": 17.139,
2114
  "eval_steps_per_second": 2.142,
2115
  "step": 13500
2116
+ },
2117
+ {
2118
+ "epoch": 2.168,
2119
+ "grad_norm": 4402.388671875,
2120
+ "learning_rate": 8.32e-06,
2121
+ "loss": 0.0499,
2122
+ "step": 13550
2123
+ },
2124
+ {
2125
+ "epoch": 2.176,
2126
+ "grad_norm": 9858.970703125,
2127
+ "learning_rate": 8.24e-06,
2128
+ "loss": 0.0506,
2129
+ "step": 13600
2130
+ },
2131
+ {
2132
+ "epoch": 2.184,
2133
+ "grad_norm": 6748.5732421875,
2134
+ "learning_rate": 8.160000000000001e-06,
2135
+ "loss": 0.05,
2136
+ "step": 13650
2137
+ },
2138
+ {
2139
+ "epoch": 2.192,
2140
+ "grad_norm": 7720.3994140625,
2141
+ "learning_rate": 8.079999999999999e-06,
2142
+ "loss": 0.0504,
2143
+ "step": 13700
2144
+ },
2145
+ {
2146
+ "epoch": 2.2,
2147
+ "grad_norm": 5066.37060546875,
2148
+ "learning_rate": 8e-06,
2149
+ "loss": 0.0533,
2150
+ "step": 13750
2151
+ },
2152
+ {
2153
+ "epoch": 2.208,
2154
+ "grad_norm": 7975.1376953125,
2155
+ "learning_rate": 7.92e-06,
2156
+ "loss": 0.0482,
2157
+ "step": 13800
2158
+ },
2159
+ {
2160
+ "epoch": 2.216,
2161
+ "grad_norm": 6690.85302734375,
2162
+ "learning_rate": 7.84e-06,
2163
+ "loss": 0.0518,
2164
+ "step": 13850
2165
+ },
2166
+ {
2167
+ "epoch": 2.224,
2168
+ "grad_norm": 8501.337890625,
2169
+ "learning_rate": 7.76e-06,
2170
+ "loss": 0.0534,
2171
+ "step": 13900
2172
+ },
2173
+ {
2174
+ "epoch": 2.232,
2175
+ "grad_norm": 15215.427734375,
2176
+ "learning_rate": 7.680000000000001e-06,
2177
+ "loss": 0.0488,
2178
+ "step": 13950
2179
+ },
2180
+ {
2181
+ "epoch": 2.24,
2182
+ "grad_norm": 6265.7568359375,
2183
+ "learning_rate": 7.600000000000001e-06,
2184
+ "loss": 0.0468,
2185
+ "step": 14000
2186
+ },
2187
+ {
2188
+ "epoch": 2.24,
2189
+ "eval_loss": 0.08207998424768448,
2190
+ "eval_runtime": 116.7104,
2191
+ "eval_samples_per_second": 17.136,
2192
+ "eval_steps_per_second": 2.142,
2193
+ "step": 14000
2194
  }
2195
  ],
2196
  "logging_steps": 50,
 
2210
  "attributes": {}
2211
  }
2212
  },
2213
+ "total_flos": 6.820328374272e+16,
2214
  "train_batch_size": 8,
2215
  "trial_name": null,
2216
  "trial_params": null