csikasote commited on
Commit
8d50746
·
verified ·
1 Parent(s): 89b17c6

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,9 @@ library_name: transformers
3
  license: cc-by-nc-4.0
4
  base_model: facebook/mms-1b-all
5
  tags:
 
 
 
6
  - generated_from_trainer
7
  metrics:
8
  - wer
@@ -16,10 +19,10 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # mms-1b-toigen-balanced-model
18
 
19
- This model is a fine-tuned version of [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.3314
22
- - Wer: 0.3856
23
 
24
  ## Model description
25
 
 
3
  license: cc-by-nc-4.0
4
  base_model: facebook/mms-1b-all
5
  tags:
6
+ - automatic-speech-recognition
7
+ - toigen
8
+ - mms
9
  - generated_from_trainer
10
  metrics:
11
  - wer
 
19
 
20
  # mms-1b-toigen-balanced-model
21
 
22
+ This model is a fine-tuned version of [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all) on the TOIGEN - TOI dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 0.3234
25
+ - Wer: 0.3755
26
 
27
  ## Model description
28
 
adapter.toi.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92c459e7b229b8b12b75b104db63cdcae0a13dfde2053a2dbaaf47ec6f45d4a9
3
  size 8793408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4acdbd02780dbdbd87b295da519d4ebe4f7dcdb96a76aa54751353fc32048525
3
  size 8793408
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 12.053571428571429,
3
- "eval_loss": 0.37403130531311035,
4
- "eval_runtime": 17.8093,
5
- "eval_samples": 204,
6
- "eval_samples_per_second": 11.455,
7
- "eval_steps_per_second": 2.864,
8
- "eval_wer": 0.39895882631329865,
9
- "total_flos": 9.163482510982138e+18,
10
- "train_loss": 0.8344545293737341,
11
- "train_runtime": 2390.0152,
12
- "train_samples": 894,
13
- "train_samples_per_second": 11.222,
14
- "train_steps_per_second": 2.812
15
  }
 
1
  {
2
+ "epoch": 21.238938053097346,
3
+ "eval_loss": 0.32341432571411133,
4
+ "eval_runtime": 18.8763,
5
+ "eval_samples": 208,
6
+ "eval_samples_per_second": 11.019,
7
+ "eval_steps_per_second": 2.755,
8
+ "eval_wer": 0.37546296296296294,
9
+ "total_flos": 1.5969122077347269e+19,
10
+ "train_loss": 1.5763085651397706,
11
+ "train_runtime": 3424.9125,
12
+ "train_samples": 901,
13
+ "train_samples_per_second": 657.681,
14
+ "train_steps_per_second": 82.484
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 12.053571428571429,
3
- "eval_loss": 0.37403130531311035,
4
- "eval_runtime": 17.8093,
5
- "eval_samples": 204,
6
- "eval_samples_per_second": 11.455,
7
- "eval_steps_per_second": 2.864,
8
- "eval_wer": 0.39895882631329865
9
  }
 
1
  {
2
+ "epoch": 21.238938053097346,
3
+ "eval_loss": 0.32341432571411133,
4
+ "eval_runtime": 18.8763,
5
+ "eval_samples": 208,
6
+ "eval_samples_per_second": 11.019,
7
+ "eval_steps_per_second": 2.755,
8
+ "eval_wer": 0.37546296296296294
9
  }
runs/Jan03_15-37-18_srvrocgpu011.uct.ac.za/events.out.tfevents.1735915299.srvrocgpu011.uct.ac.za ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92c30fc276364b2ab520b67a01f32b5ae9db7f3b55bf8ec65723bab0383e1cb6
3
+ size 40
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 12.053571428571429,
3
- "total_flos": 9.163482510982138e+18,
4
- "train_loss": 0.8344545293737341,
5
- "train_runtime": 2390.0152,
6
- "train_samples": 894,
7
- "train_samples_per_second": 11.222,
8
- "train_steps_per_second": 2.812
9
  }
 
1
  {
2
+ "epoch": 21.238938053097346,
3
+ "total_flos": 1.5969122077347269e+19,
4
+ "train_loss": 1.5763085651397706,
5
+ "train_runtime": 3424.9125,
6
+ "train_samples": 901,
7
+ "train_samples_per_second": 657.681,
8
+ "train_steps_per_second": 82.484
9
  }
trainer_state.json CHANGED
@@ -1,468 +1,420 @@
1
  {
2
- "best_metric": 0.3741886615753174,
3
- "best_model_checkpoint": "/scratch/skscla001/speech/results/mms-1b-toigen-balanced-model/checkpoint-2400",
4
- "epoch": 12.053571428571429,
5
  "eval_steps": 100,
6
- "global_step": 2700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.44642857142857145,
13
- "grad_norm": 3.282467842102051,
14
- "learning_rate": 0.00028199999999999997,
15
- "loss": 7.7726,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.44642857142857145,
20
- "eval_loss": 3.8109493255615234,
21
- "eval_runtime": 17.8235,
22
- "eval_samples_per_second": 11.446,
23
- "eval_steps_per_second": 2.861,
24
- "eval_wer": 0.9938476100331283,
25
  "step": 100
26
  },
27
  {
28
- "epoch": 0.8928571428571429,
29
- "grad_norm": 2.164923667907715,
30
- "learning_rate": 0.00029574018126888213,
31
- "loss": 2.5726,
32
  "step": 200
33
  },
34
  {
35
- "epoch": 0.8928571428571429,
36
- "eval_loss": 0.8106288313865662,
37
- "eval_runtime": 17.6933,
38
- "eval_samples_per_second": 11.53,
39
- "eval_steps_per_second": 2.882,
40
- "eval_wer": 0.616658778987222,
41
  "step": 200
42
  },
43
  {
44
- "epoch": 1.3392857142857144,
45
- "grad_norm": 1.3534202575683594,
46
- "learning_rate": 0.0002912084592145015,
47
- "loss": 0.7986,
48
  "step": 300
49
  },
50
  {
51
- "epoch": 1.3392857142857144,
52
- "eval_loss": 0.5409455299377441,
53
- "eval_runtime": 17.7815,
54
- "eval_samples_per_second": 11.473,
55
- "eval_steps_per_second": 2.868,
56
- "eval_wer": 0.5257927117841931,
57
  "step": 300
58
  },
59
  {
60
- "epoch": 1.7857142857142856,
61
- "grad_norm": 0.9854668378829956,
62
- "learning_rate": 0.00028667673716012085,
63
- "loss": 0.6324,
64
  "step": 400
65
  },
66
  {
67
- "epoch": 1.7857142857142856,
68
- "eval_loss": 0.5256258845329285,
69
- "eval_runtime": 17.7084,
70
- "eval_samples_per_second": 11.52,
71
- "eval_steps_per_second": 2.88,
72
- "eval_wer": 0.5054424988168481,
73
  "step": 400
74
  },
75
  {
76
- "epoch": 2.232142857142857,
77
- "grad_norm": 18.835981369018555,
78
- "learning_rate": 0.00028214501510574015,
79
- "loss": 0.603,
80
  "step": 500
81
  },
82
  {
83
- "epoch": 2.232142857142857,
84
- "eval_loss": 0.4854464828968048,
85
- "eval_runtime": 17.7046,
86
- "eval_samples_per_second": 11.522,
87
- "eval_steps_per_second": 2.881,
88
- "eval_wer": 0.4831992427827733,
89
  "step": 500
90
  },
91
  {
92
- "epoch": 2.678571428571429,
93
- "grad_norm": 32.54256820678711,
94
- "learning_rate": 0.0002776132930513595,
95
- "loss": 0.59,
96
  "step": 600
97
  },
98
  {
99
- "epoch": 2.678571428571429,
100
- "eval_loss": 0.47332894802093506,
101
- "eval_runtime": 17.6431,
102
- "eval_samples_per_second": 11.563,
103
- "eval_steps_per_second": 2.891,
104
- "eval_wer": 0.4846190250828206,
105
  "step": 600
106
  },
107
  {
108
- "epoch": 3.125,
109
- "grad_norm": 1.3252086639404297,
110
- "learning_rate": 0.0002730815709969788,
111
- "loss": 0.5489,
112
  "step": 700
113
  },
114
  {
115
- "epoch": 3.125,
116
- "eval_loss": 0.4439888894557953,
117
- "eval_runtime": 17.8297,
118
- "eval_samples_per_second": 11.442,
119
- "eval_steps_per_second": 2.86,
120
- "eval_wer": 0.46568859441552296,
121
  "step": 700
122
  },
123
  {
124
- "epoch": 3.571428571428571,
125
- "grad_norm": 0.5452375411987305,
126
- "learning_rate": 0.00026854984894259817,
127
- "loss": 0.5173,
128
  "step": 800
129
  },
130
  {
131
- "epoch": 3.571428571428571,
132
- "eval_loss": 0.43219566345214844,
133
- "eval_runtime": 17.6382,
134
- "eval_samples_per_second": 11.566,
135
- "eval_steps_per_second": 2.891,
136
- "eval_wer": 0.45764316138192146,
137
  "step": 800
138
  },
139
  {
140
- "epoch": 4.017857142857143,
141
- "grad_norm": 0.7151035070419312,
142
- "learning_rate": 0.0002640181268882175,
143
- "loss": 0.5315,
144
  "step": 900
145
  },
146
  {
147
- "epoch": 4.017857142857143,
148
- "eval_loss": 0.4285721480846405,
149
- "eval_runtime": 17.7542,
150
- "eval_samples_per_second": 11.49,
151
- "eval_steps_per_second": 2.873,
152
- "eval_wer": 0.44533838144817794,
153
  "step": 900
154
  },
155
  {
156
- "epoch": 4.464285714285714,
157
- "grad_norm": 1.8268319368362427,
158
- "learning_rate": 0.0002594864048338368,
159
- "loss": 0.4912,
160
  "step": 1000
161
  },
162
  {
163
- "epoch": 4.464285714285714,
164
- "eval_loss": 0.42536306381225586,
165
- "eval_runtime": 17.765,
166
- "eval_samples_per_second": 11.483,
167
- "eval_steps_per_second": 2.871,
168
- "eval_wer": 0.4458116422148604,
169
  "step": 1000
170
  },
171
  {
172
- "epoch": 4.910714285714286,
173
- "grad_norm": 0.850709080696106,
174
- "learning_rate": 0.0002549546827794562,
175
- "loss": 0.4728,
176
  "step": 1100
177
  },
178
  {
179
- "epoch": 4.910714285714286,
180
- "eval_loss": 0.43455594778060913,
181
- "eval_runtime": 17.7563,
182
- "eval_samples_per_second": 11.489,
183
- "eval_steps_per_second": 2.872,
184
- "eval_wer": 0.44297207761476576,
185
  "step": 1100
186
  },
187
  {
188
- "epoch": 5.357142857142857,
189
- "grad_norm": 0.7361202836036682,
190
- "learning_rate": 0.00025042296072507554,
191
- "loss": 0.4989,
192
  "step": 1200
193
  },
194
  {
195
- "epoch": 5.357142857142857,
196
- "eval_loss": 0.40502411127090454,
197
- "eval_runtime": 17.6139,
198
- "eval_samples_per_second": 11.582,
199
- "eval_steps_per_second": 2.895,
200
- "eval_wer": 0.42924751538097494,
201
  "step": 1200
202
  },
203
  {
204
- "epoch": 5.803571428571429,
205
- "grad_norm": 1.305498719215393,
206
- "learning_rate": 0.00024589123867069484,
207
- "loss": 0.4661,
208
  "step": 1300
209
  },
210
  {
211
- "epoch": 5.803571428571429,
212
- "eval_loss": 0.4019148647785187,
213
- "eval_runtime": 17.792,
214
- "eval_samples_per_second": 11.466,
215
- "eval_steps_per_second": 2.866,
216
- "eval_wer": 0.4254614292475154,
217
  "step": 1300
218
  },
219
  {
220
- "epoch": 6.25,
221
- "grad_norm": 1.875386357307434,
222
- "learning_rate": 0.00024135951661631417,
223
- "loss": 0.4755,
224
  "step": 1400
225
  },
226
  {
227
- "epoch": 6.25,
228
- "eval_loss": 0.4128676652908325,
229
- "eval_runtime": 17.7386,
230
- "eval_samples_per_second": 11.5,
231
- "eval_steps_per_second": 2.875,
232
- "eval_wer": 0.44486512068149553,
233
  "step": 1400
234
  },
235
  {
236
- "epoch": 6.696428571428571,
237
- "grad_norm": 1.3318761587142944,
238
- "learning_rate": 0.0002368277945619335,
239
- "loss": 0.4603,
240
  "step": 1500
241
  },
242
  {
243
- "epoch": 6.696428571428571,
244
- "eval_loss": 0.40455254912376404,
245
- "eval_runtime": 17.8115,
246
- "eval_samples_per_second": 11.453,
247
- "eval_steps_per_second": 2.863,
248
- "eval_wer": 0.4254614292475154,
249
  "step": 1500
250
  },
251
  {
252
- "epoch": 7.142857142857143,
253
- "grad_norm": 1.7303593158721924,
254
- "learning_rate": 0.00023229607250755283,
255
- "loss": 0.4229,
256
  "step": 1600
257
  },
258
  {
259
- "epoch": 7.142857142857143,
260
- "eval_loss": 0.3939039707183838,
261
- "eval_runtime": 17.7505,
262
- "eval_samples_per_second": 11.493,
263
- "eval_steps_per_second": 2.873,
264
- "eval_wer": 0.41504969238050166,
265
  "step": 1600
266
  },
267
  {
268
- "epoch": 7.589285714285714,
269
- "grad_norm": 0.9812105894088745,
270
- "learning_rate": 0.0002277643504531722,
271
- "loss": 0.455,
272
  "step": 1700
273
  },
274
  {
275
- "epoch": 7.589285714285714,
276
- "eval_loss": 0.41328728199005127,
277
- "eval_runtime": 17.7719,
278
- "eval_samples_per_second": 11.479,
279
- "eval_steps_per_second": 2.87,
280
- "eval_wer": 0.41552295314718407,
281
  "step": 1700
282
  },
283
  {
284
- "epoch": 8.035714285714286,
285
- "grad_norm": 0.6829022765159607,
286
- "learning_rate": 0.00022323262839879152,
287
- "loss": 0.4501,
288
  "step": 1800
289
  },
290
  {
291
- "epoch": 8.035714285714286,
292
- "eval_loss": 0.3978167176246643,
293
- "eval_runtime": 17.743,
294
- "eval_samples_per_second": 11.497,
295
- "eval_steps_per_second": 2.874,
296
- "eval_wer": 0.4065309985802177,
297
  "step": 1800
298
  },
299
  {
300
- "epoch": 8.482142857142858,
301
- "grad_norm": 1.3150678873062134,
302
- "learning_rate": 0.00021870090634441088,
303
- "loss": 0.45,
304
  "step": 1900
305
  },
306
  {
307
- "epoch": 8.482142857142858,
308
- "eval_loss": 0.3925248682498932,
309
- "eval_runtime": 17.6795,
310
- "eval_samples_per_second": 11.539,
311
- "eval_steps_per_second": 2.885,
312
- "eval_wer": 0.42309512541410316,
313
  "step": 1900
314
  },
315
  {
316
- "epoch": 8.928571428571429,
317
- "grad_norm": 1.0976217985153198,
318
- "learning_rate": 0.0002141691842900302,
319
- "loss": 0.4226,
320
  "step": 2000
321
  },
322
  {
323
- "epoch": 8.928571428571429,
324
- "eval_loss": 0.3901020586490631,
325
- "eval_runtime": 17.7714,
326
- "eval_samples_per_second": 11.479,
327
- "eval_steps_per_second": 2.87,
328
- "eval_wer": 0.4098438239469948,
329
  "step": 2000
330
  },
331
  {
332
- "epoch": 9.375,
333
- "grad_norm": 0.727407693862915,
334
- "learning_rate": 0.00020963746223564954,
335
- "loss": 0.3973,
336
  "step": 2100
337
  },
338
  {
339
- "epoch": 9.375,
340
- "eval_loss": 0.38098010420799255,
341
- "eval_runtime": 17.825,
342
- "eval_samples_per_second": 11.445,
343
- "eval_steps_per_second": 2.861,
344
- "eval_wer": 0.4055844770468528,
345
  "step": 2100
346
  },
347
  {
348
- "epoch": 9.821428571428571,
349
- "grad_norm": 2.031233072280884,
350
- "learning_rate": 0.00020510574018126884,
351
- "loss": 0.4038,
352
  "step": 2200
353
  },
354
  {
355
- "epoch": 9.821428571428571,
356
- "eval_loss": 0.41775575280189514,
357
- "eval_runtime": 17.8559,
358
- "eval_samples_per_second": 11.425,
359
- "eval_steps_per_second": 2.856,
360
- "eval_wer": 0.4117368670137246,
361
  "step": 2200
362
  },
363
  {
364
- "epoch": 10.267857142857142,
365
- "grad_norm": 1.3557627201080322,
366
- "learning_rate": 0.0002005740181268882,
367
- "loss": 0.4559,
368
  "step": 2300
369
  },
370
  {
371
- "epoch": 10.267857142857142,
372
- "eval_loss": 0.38752201199531555,
373
- "eval_runtime": 17.7454,
374
- "eval_samples_per_second": 11.496,
375
- "eval_steps_per_second": 2.874,
376
- "eval_wer": 0.40747752011358257,
377
  "step": 2300
378
  },
379
  {
380
- "epoch": 10.714285714285714,
381
- "grad_norm": 0.7480702996253967,
382
- "learning_rate": 0.00019604229607250753,
383
- "loss": 0.4399,
384
  "step": 2400
385
  },
386
  {
387
- "epoch": 10.714285714285714,
388
- "eval_loss": 0.3741886615753174,
389
- "eval_runtime": 17.7283,
390
- "eval_samples_per_second": 11.507,
391
- "eval_steps_per_second": 2.877,
392
- "eval_wer": 0.39895882631329865,
393
  "step": 2400
394
  },
395
  {
396
- "epoch": 11.160714285714286,
397
- "grad_norm": 1.065514087677002,
398
- "learning_rate": 0.00019151057401812688,
399
- "loss": 0.3545,
400
- "step": 2500
401
- },
402
- {
403
- "epoch": 11.160714285714286,
404
- "eval_loss": 0.38181087374687195,
405
- "eval_runtime": 17.8745,
406
- "eval_samples_per_second": 11.413,
407
- "eval_steps_per_second": 2.853,
408
- "eval_wer": 0.40132513014671084,
409
- "step": 2500
410
- },
411
- {
412
- "epoch": 11.607142857142858,
413
- "grad_norm": 1.19502854347229,
414
- "learning_rate": 0.0001869788519637462,
415
- "loss": 0.4452,
416
- "step": 2600
417
- },
418
- {
419
- "epoch": 11.607142857142858,
420
- "eval_loss": 0.3905617594718933,
421
- "eval_runtime": 17.8748,
422
- "eval_samples_per_second": 11.413,
423
- "eval_steps_per_second": 2.853,
424
- "eval_wer": 0.39801230477993377,
425
- "step": 2600
426
- },
427
- {
428
- "epoch": 12.053571428571429,
429
- "grad_norm": 0.8653120994567871,
430
- "learning_rate": 0.00018244712990936554,
431
- "loss": 0.4014,
432
- "step": 2700
433
- },
434
- {
435
- "epoch": 12.053571428571429,
436
- "eval_loss": 0.3751629889011383,
437
- "eval_runtime": 17.7431,
438
- "eval_samples_per_second": 11.497,
439
- "eval_steps_per_second": 2.874,
440
- "eval_wer": 0.39990534784666354,
441
- "step": 2700
442
- },
443
- {
444
- "epoch": 12.053571428571429,
445
- "step": 2700,
446
- "total_flos": 9.163482510982138e+18,
447
- "train_loss": 0.8344545293737341,
448
- "train_runtime": 2390.0152,
449
- "train_samples_per_second": 11.222,
450
- "train_steps_per_second": 2.812
451
  }
452
  ],
453
  "logging_steps": 100,
454
- "max_steps": 6720,
455
  "num_input_tokens_seen": 0,
456
- "num_train_epochs": 30,
457
  "save_steps": 400,
458
  "stateful_callbacks": {
459
  "EarlyStoppingCallback": {
460
  "args": {
461
- "early_stopping_patience": 3,
462
  "early_stopping_threshold": 0.0
463
  },
464
  "attributes": {
465
- "early_stopping_patience_counter": 0
466
  }
467
  },
468
  "TrainerControl": {
@@ -471,12 +423,12 @@
471
  "should_evaluate": false,
472
  "should_log": false,
473
  "should_save": true,
474
- "should_training_stop": false
475
  },
476
  "attributes": {}
477
  }
478
  },
479
- "total_flos": 9.163482510982138e+18,
480
  "train_batch_size": 4,
481
  "trial_name": null,
482
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.32343700528144836,
3
+ "best_model_checkpoint": "/scratch/skscla001/speech/results/mms-1b-toigen-balanced-model/checkpoint-2000",
4
+ "epoch": 21.238938053097346,
5
  "eval_steps": 100,
6
+ "global_step": 2400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.8849557522123894,
13
+ "grad_norm": 3.8822503089904785,
14
+ "learning_rate": 0.00028799999999999995,
15
+ "loss": 14.2297,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.8849557522123894,
20
+ "eval_loss": 3.483584403991699,
21
+ "eval_runtime": 18.7703,
22
+ "eval_samples_per_second": 11.081,
23
+ "eval_steps_per_second": 2.77,
24
+ "eval_wer": 1.0055555555555555,
25
  "step": 100
26
  },
27
  {
28
+ "epoch": 1.7699115044247788,
29
+ "grad_norm": 3.1678736209869385,
30
+ "learning_rate": 0.0002998980169971671,
31
+ "loss": 4.1389,
32
  "step": 200
33
  },
34
  {
35
+ "epoch": 1.7699115044247788,
36
+ "eval_loss": 0.5561802983283997,
37
+ "eval_runtime": 18.75,
38
+ "eval_samples_per_second": 11.093,
39
+ "eval_steps_per_second": 2.773,
40
+ "eval_wer": 0.5694444444444444,
41
  "step": 200
42
  },
43
  {
44
+ "epoch": 2.6548672566371683,
45
+ "grad_norm": 1.6712620258331299,
46
+ "learning_rate": 0.00029979178470254956,
47
+ "loss": 1.3643,
48
  "step": 300
49
  },
50
  {
51
+ "epoch": 2.6548672566371683,
52
+ "eval_loss": 0.4360348582267761,
53
+ "eval_runtime": 18.7668,
54
+ "eval_samples_per_second": 11.083,
55
+ "eval_steps_per_second": 2.771,
56
+ "eval_wer": 0.49583333333333335,
57
  "step": 300
58
  },
59
  {
60
+ "epoch": 3.5398230088495577,
61
+ "grad_norm": 1.3147025108337402,
62
+ "learning_rate": 0.00029968555240793195,
63
+ "loss": 1.1715,
64
  "step": 400
65
  },
66
  {
67
+ "epoch": 3.5398230088495577,
68
+ "eval_loss": 0.3980385661125183,
69
+ "eval_runtime": 18.8024,
70
+ "eval_samples_per_second": 11.062,
71
+ "eval_steps_per_second": 2.766,
72
+ "eval_wer": 0.4824074074074074,
73
  "step": 400
74
  },
75
  {
76
+ "epoch": 4.424778761061947,
77
+ "grad_norm": 1.7749208211898804,
78
+ "learning_rate": 0.00029957932011331445,
79
+ "loss": 1.1309,
80
  "step": 500
81
  },
82
  {
83
+ "epoch": 4.424778761061947,
84
+ "eval_loss": 0.37851694226264954,
85
+ "eval_runtime": 19.1098,
86
+ "eval_samples_per_second": 10.884,
87
+ "eval_steps_per_second": 2.721,
88
+ "eval_wer": 0.4583333333333333,
89
  "step": 500
90
  },
91
  {
92
+ "epoch": 5.3097345132743365,
93
+ "grad_norm": 1.853244662284851,
94
+ "learning_rate": 0.0002994730878186969,
95
+ "loss": 1.0283,
96
  "step": 600
97
  },
98
  {
99
+ "epoch": 5.3097345132743365,
100
+ "eval_loss": 0.37413156032562256,
101
+ "eval_runtime": 18.6437,
102
+ "eval_samples_per_second": 11.157,
103
+ "eval_steps_per_second": 2.789,
104
+ "eval_wer": 0.4476851851851852,
105
  "step": 600
106
  },
107
  {
108
+ "epoch": 6.1946902654867255,
109
+ "grad_norm": 1.4990218877792358,
110
+ "learning_rate": 0.0002993668555240793,
111
+ "loss": 1.0148,
112
  "step": 700
113
  },
114
  {
115
+ "epoch": 6.1946902654867255,
116
+ "eval_loss": 0.36694276332855225,
117
+ "eval_runtime": 18.6699,
118
+ "eval_samples_per_second": 11.141,
119
+ "eval_steps_per_second": 2.785,
120
+ "eval_wer": 0.44027777777777777,
121
  "step": 700
122
  },
123
  {
124
+ "epoch": 7.079646017699115,
125
+ "grad_norm": 2.7431230545043945,
126
+ "learning_rate": 0.0002992606232294617,
127
+ "loss": 0.9961,
128
  "step": 800
129
  },
130
  {
131
+ "epoch": 7.079646017699115,
132
+ "eval_loss": 0.36071425676345825,
133
+ "eval_runtime": 18.6525,
134
+ "eval_samples_per_second": 11.151,
135
+ "eval_steps_per_second": 2.788,
136
+ "eval_wer": 0.4356481481481482,
137
  "step": 800
138
  },
139
  {
140
+ "epoch": 7.964601769911504,
141
+ "grad_norm": 2.0985348224639893,
142
+ "learning_rate": 0.00029915439093484416,
143
+ "loss": 0.9248,
144
  "step": 900
145
  },
146
  {
147
+ "epoch": 7.964601769911504,
148
+ "eval_loss": 0.3580877482891083,
149
+ "eval_runtime": 18.871,
150
+ "eval_samples_per_second": 11.022,
151
+ "eval_steps_per_second": 2.756,
152
+ "eval_wer": 0.4236111111111111,
153
  "step": 900
154
  },
155
  {
156
+ "epoch": 8.849557522123893,
157
+ "grad_norm": 1.3972795009613037,
158
+ "learning_rate": 0.0002990481586402266,
159
+ "loss": 0.9482,
160
  "step": 1000
161
  },
162
  {
163
+ "epoch": 8.849557522123893,
164
+ "eval_loss": 0.3462725281715393,
165
+ "eval_runtime": 18.815,
166
+ "eval_samples_per_second": 11.055,
167
+ "eval_steps_per_second": 2.764,
168
+ "eval_wer": 0.4356481481481482,
169
  "step": 1000
170
  },
171
  {
172
+ "epoch": 9.734513274336283,
173
+ "grad_norm": 3.017667293548584,
174
+ "learning_rate": 0.00029894192634560905,
175
+ "loss": 0.8815,
176
  "step": 1100
177
  },
178
  {
179
+ "epoch": 9.734513274336283,
180
+ "eval_loss": 0.3487873673439026,
181
+ "eval_runtime": 18.8526,
182
+ "eval_samples_per_second": 11.033,
183
+ "eval_steps_per_second": 2.758,
184
+ "eval_wer": 0.4273148148148148,
185
  "step": 1100
186
  },
187
  {
188
+ "epoch": 10.619469026548673,
189
+ "grad_norm": 1.357649803161621,
190
+ "learning_rate": 0.0002988356940509915,
191
+ "loss": 0.8209,
192
  "step": 1200
193
  },
194
  {
195
+ "epoch": 10.619469026548673,
196
+ "eval_loss": 0.33840110898017883,
197
+ "eval_runtime": 18.6886,
198
+ "eval_samples_per_second": 11.13,
199
+ "eval_steps_per_second": 2.782,
200
+ "eval_wer": 0.4,
201
  "step": 1200
202
  },
203
  {
204
+ "epoch": 11.504424778761061,
205
+ "grad_norm": 1.8951735496520996,
206
+ "learning_rate": 0.00029872946175637393,
207
+ "loss": 0.8754,
208
  "step": 1300
209
  },
210
  {
211
+ "epoch": 11.504424778761061,
212
+ "eval_loss": 0.3459264636039734,
213
+ "eval_runtime": 18.8754,
214
+ "eval_samples_per_second": 11.02,
215
+ "eval_steps_per_second": 2.755,
216
+ "eval_wer": 0.4050925925925926,
217
  "step": 1300
218
  },
219
  {
220
+ "epoch": 12.389380530973451,
221
+ "grad_norm": 1.3216720819473267,
222
+ "learning_rate": 0.0002986232294617563,
223
+ "loss": 0.8454,
224
  "step": 1400
225
  },
226
  {
227
+ "epoch": 12.389380530973451,
228
+ "eval_loss": 0.33166107535362244,
229
+ "eval_runtime": 18.7736,
230
+ "eval_samples_per_second": 11.079,
231
+ "eval_steps_per_second": 2.77,
232
+ "eval_wer": 0.38842592592592595,
233
  "step": 1400
234
  },
235
  {
236
+ "epoch": 13.274336283185841,
237
+ "grad_norm": 2.39943528175354,
238
+ "learning_rate": 0.00029851699716713876,
239
+ "loss": 0.8164,
240
  "step": 1500
241
  },
242
  {
243
+ "epoch": 13.274336283185841,
244
+ "eval_loss": 0.33193060755729675,
245
+ "eval_runtime": 18.8153,
246
+ "eval_samples_per_second": 11.055,
247
+ "eval_steps_per_second": 2.764,
248
+ "eval_wer": 0.40324074074074073,
249
  "step": 1500
250
  },
251
  {
252
+ "epoch": 14.15929203539823,
253
+ "grad_norm": 6.335964202880859,
254
+ "learning_rate": 0.00029841076487252126,
255
+ "loss": 0.7673,
256
  "step": 1600
257
  },
258
  {
259
+ "epoch": 14.15929203539823,
260
+ "eval_loss": 0.33113545179367065,
261
+ "eval_runtime": 18.711,
262
+ "eval_samples_per_second": 11.116,
263
+ "eval_steps_per_second": 2.779,
264
+ "eval_wer": 0.3921296296296296,
265
  "step": 1600
266
  },
267
  {
268
+ "epoch": 15.044247787610619,
269
+ "grad_norm": 1.1695411205291748,
270
+ "learning_rate": 0.00029830559490084984,
271
+ "loss": 0.7953,
272
  "step": 1700
273
  },
274
  {
275
+ "epoch": 15.044247787610619,
276
+ "eval_loss": 0.33329564332962036,
277
+ "eval_runtime": 18.8623,
278
+ "eval_samples_per_second": 11.027,
279
+ "eval_steps_per_second": 2.757,
280
+ "eval_wer": 0.39444444444444443,
281
  "step": 1700
282
  },
283
  {
284
+ "epoch": 15.929203539823009,
285
+ "grad_norm": 13.718667030334473,
286
+ "learning_rate": 0.0002981993626062323,
287
+ "loss": 0.7527,
288
  "step": 1800
289
  },
290
  {
291
+ "epoch": 15.929203539823009,
292
+ "eval_loss": 0.3312545120716095,
293
+ "eval_runtime": 18.8046,
294
+ "eval_samples_per_second": 11.061,
295
+ "eval_steps_per_second": 2.765,
296
+ "eval_wer": 0.39166666666666666,
297
  "step": 1800
298
  },
299
  {
300
+ "epoch": 16.8141592920354,
301
+ "grad_norm": 1.9348554611206055,
302
+ "learning_rate": 0.0002980931303116147,
303
+ "loss": 0.763,
304
  "step": 1900
305
  },
306
  {
307
+ "epoch": 16.8141592920354,
308
+ "eval_loss": 0.3277539610862732,
309
+ "eval_runtime": 18.7599,
310
+ "eval_samples_per_second": 11.087,
311
+ "eval_steps_per_second": 2.772,
312
+ "eval_wer": 0.39305555555555555,
313
  "step": 1900
314
  },
315
  {
316
+ "epoch": 17.699115044247787,
317
+ "grad_norm": 18.476669311523438,
318
+ "learning_rate": 0.0002979868980169971,
319
+ "loss": 0.7319,
320
  "step": 2000
321
  },
322
  {
323
+ "epoch": 17.699115044247787,
324
+ "eval_loss": 0.32343700528144836,
325
+ "eval_runtime": 18.8597,
326
+ "eval_samples_per_second": 11.029,
327
+ "eval_steps_per_second": 2.757,
328
+ "eval_wer": 0.37546296296296294,
329
  "step": 2000
330
  },
331
  {
332
+ "epoch": 18.58407079646018,
333
+ "grad_norm": 2.2555744647979736,
334
+ "learning_rate": 0.00029788066572237955,
335
+ "loss": 0.7352,
336
  "step": 2100
337
  },
338
  {
339
+ "epoch": 18.58407079646018,
340
+ "eval_loss": 0.3248392343521118,
341
+ "eval_runtime": 18.8863,
342
+ "eval_samples_per_second": 11.013,
343
+ "eval_steps_per_second": 2.753,
344
+ "eval_wer": 0.38055555555555554,
345
  "step": 2100
346
  },
347
  {
348
+ "epoch": 19.469026548672566,
349
+ "grad_norm": 2.2022745609283447,
350
+ "learning_rate": 0.00029777443342776205,
351
+ "loss": 0.7017,
352
  "step": 2200
353
  },
354
  {
355
+ "epoch": 19.469026548672566,
356
+ "eval_loss": 0.3333507776260376,
357
+ "eval_runtime": 18.7742,
358
+ "eval_samples_per_second": 11.079,
359
+ "eval_steps_per_second": 2.77,
360
+ "eval_wer": 0.3851851851851852,
361
  "step": 2200
362
  },
363
  {
364
+ "epoch": 20.353982300884955,
365
+ "grad_norm": 7.710162162780762,
366
+ "learning_rate": 0.00029766820113314444,
367
+ "loss": 0.6902,
368
  "step": 2300
369
  },
370
  {
371
+ "epoch": 20.353982300884955,
372
+ "eval_loss": 0.330443412065506,
373
+ "eval_runtime": 18.756,
374
+ "eval_samples_per_second": 11.09,
375
+ "eval_steps_per_second": 2.772,
376
+ "eval_wer": 0.3888888888888889,
377
  "step": 2300
378
  },
379
  {
380
+ "epoch": 21.238938053097346,
381
+ "grad_norm": 3.896944522857666,
382
+ "learning_rate": 0.0002975619688385269,
383
+ "loss": 0.707,
384
  "step": 2400
385
  },
386
  {
387
+ "epoch": 21.238938053097346,
388
+ "eval_loss": 0.3313958942890167,
389
+ "eval_runtime": 18.7923,
390
+ "eval_samples_per_second": 11.068,
391
+ "eval_steps_per_second": 2.767,
392
+ "eval_wer": 0.38564814814814813,
393
  "step": 2400
394
  },
395
  {
396
+ "epoch": 21.238938053097346,
397
+ "step": 2400,
398
+ "total_flos": 1.5969122077347269e+19,
399
+ "train_loss": 1.5763085651397706,
400
+ "train_runtime": 3424.9125,
401
+ "train_samples_per_second": 657.681,
402
+ "train_steps_per_second": 82.484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  }
404
  ],
405
  "logging_steps": 100,
406
+ "max_steps": 282500,
407
  "num_input_tokens_seen": 0,
408
+ "num_train_epochs": 2500,
409
  "save_steps": 400,
410
  "stateful_callbacks": {
411
  "EarlyStoppingCallback": {
412
  "args": {
413
+ "early_stopping_patience": 4,
414
  "early_stopping_threshold": 0.0
415
  },
416
  "attributes": {
417
+ "early_stopping_patience_counter": 4
418
  }
419
  },
420
  "TrainerControl": {
 
423
  "should_evaluate": false,
424
  "should_log": false,
425
  "should_save": true,
426
+ "should_training_stop": true
427
  },
428
  "attributes": {}
429
  }
430
  },
431
+ "total_flos": 1.5969122077347269e+19,
432
  "train_batch_size": 4,
433
  "trial_name": null,
434
  "trial_params": null