Jobaula committed on
Commit
5ff915f
1 Parent(s): ed3f45f

Model save

Browse files
Files changed (2) hide show
  1. README.md +28 -29
  2. generation_config.json +30 -6
README.md CHANGED
@@ -1,43 +1,42 @@
1
  ---
2
- language:
3
- - zh
4
  license: apache-2.0
 
5
  tags:
6
- - whisper-event
7
  - generated_from_trainer
8
  datasets:
9
- - mozilla-foundation/common_voice_11_0
10
  metrics:
11
  - wer
12
- base_model: openai/whisper-medium
13
  model-index:
14
- - name: Whisper medium nan-tw only char
15
  results:
16
  - task:
17
- type: automatic-speech-recognition
18
  name: Automatic Speech Recognition
 
19
  dataset:
20
- name: mozilla-foundation/common_voice_11_0 nan-tw
21
- type: mozilla-foundation/common_voice_11_0
22
  config: nan-tw
23
  split: test
24
  args: nan-tw
25
  metrics:
26
- - type: wer
27
- value: 45.2824427480916
28
- name: Wer
29
  ---
30
 
31
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
32
  should probably proofread and complete it, then remove this comment. -->
33
 
34
- # Whisper medium nan-tw only char
35
 
36
- This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on the mozilla-foundation/common_voice_11_0 nan-tw dataset.
37
  It achieves the following results on the evaluation set:
38
- - Loss: 0.9944
39
- - Wer: 45.2824
40
- - Cer: 45.3667
 
41
 
42
  ## Model description
43
 
@@ -60,7 +59,7 @@ The following hyperparameters were used during training:
60
  - train_batch_size: 2
61
  - eval_batch_size: 2
62
  - seed: 42
63
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
64
  - lr_scheduler_type: linear
65
  - lr_scheduler_warmup_steps: 500
66
  - training_steps: 5000
@@ -68,18 +67,18 @@ The following hyperparameters were used during training:
68
 
69
  ### Training results
70
 
71
- | Training Loss | Epoch | Step | Validation Loss | Wer | Cer |
72
- |:-------------:|:-----:|:----:|:---------------:|:-------:|:-------:|
73
- | 0.5832 | 1.04 | 1000 | 1.0634 | 56.3053 | 56.4745 |
74
- | 0.1467 | 2.08 | 2000 | 1.0407 | 50.9618 | 51.0112 |
75
- | 0.016 | 3.13 | 3000 | 1.0226 | 46.4427 | 46.5137 |
76
- | 0.0001 | 5.01 | 4000 | 0.9974 | 45.4656 | 45.6082 |
77
- | 0.0001 | 6.05 | 5000 | 0.9944 | 45.2824 | 45.3667 |
78
 
79
 
80
  ### Framework versions
81
 
82
- - Transformers 4.27.0.dev0
83
- - Pytorch 1.13.1+cu117
84
- - Datasets 2.8.0
85
- - Tokenizers 0.13.2
 
1
  ---
2
+ library_name: transformers
 
3
  license: apache-2.0
4
+ base_model: openai/whisper-medium
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
+ - audiofolder
9
  metrics:
10
  - wer
 
11
  model-index:
12
+ - name: openai/whisper-medium
13
  results:
14
  - task:
 
15
  name: Automatic Speech Recognition
16
+ type: automatic-speech-recognition
17
  dataset:
18
+ name: audiofolder
19
+ type: audiofolder
20
  config: nan-tw
21
  split: test
22
  args: nan-tw
23
  metrics:
24
+ - name: Wer
25
+ type: wer
26
+ value: 0.9615384615384616
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
30
  should probably proofread and complete it, then remove this comment. -->
31
 
32
+ # openai/whisper-medium
33
 
34
+ This model is a fine-tuned version of [openai/whisper-medium](https://huggingface.co/openai/whisper-medium) on the audiofolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.0141
37
+ - Model Preparation Time: 0.0121
38
+ - Wer: 0.9615
39
+ - Cer: 0.9524
40
 
41
  ## Model description
42
 
 
59
  - train_batch_size: 2
60
  - eval_batch_size: 2
61
  - seed: 42
62
+ - optimizer: Use adamw_bnb_8bit with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
63
  - lr_scheduler_type: linear
64
  - lr_scheduler_warmup_steps: 500
65
  - training_steps: 5000
 
67
 
68
  ### Training results
69
 
70
+ | Training Loss | Epoch | Step | Validation Loss | Model Preparation Time | Wer | Cer |
71
+ |:-------------:|:------:|:----:|:---------------:|:----------------------:|:-------:|:-------:|
72
+ | 0.97 | 0.2 | 1000 | 0.7356 | 0.0121 | 38.1731 | 38.4762 |
73
+ | 0.3044 | 1.0388 | 2000 | 0.3099 | 0.0121 | 23.4615 | 23.9048 |
74
+ | 0.3108 | 1.2388 | 3000 | 0.1153 | 0.0121 | 7.5 | 7.7143 |
75
+ | 0.0544 | 2.0776 | 4000 | 0.0295 | 0.0121 | 2.3077 | 2.2857 |
76
+ | 0.0678 | 2.2776 | 5000 | 0.0141 | 0.0121 | 0.9615 | 0.9524 |
77
 
78
 
79
  ### Framework versions
80
 
81
+ - Transformers 4.47.0.dev0
82
+ - Pytorch 2.5.1+cu121
83
+ - Datasets 3.1.0
84
+ - Tokenizers 0.20.3
generation_config.json CHANGED
@@ -1,4 +1,30 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "begin_suppress_tokens": [
3
  220,
4
  50257
@@ -14,10 +40,6 @@
14
  [
15
  2,
16
  50359
17
- ],
18
- [
19
- 3,
20
- 50363
21
  ]
22
  ],
23
  "is_multilingual": true,
@@ -122,10 +144,12 @@
122
  "<|yo|>": 50325,
123
  "<|zh|>": 50260
124
  },
125
- "max_initial_timestamp_index": 1,
126
  "max_length": 448,
127
  "no_timestamps_token_id": 50363,
128
  "pad_token_id": 50257,
 
 
129
  "suppress_tokens": [
130
  1,
131
  2,
@@ -220,5 +244,5 @@
220
  "transcribe": 50359,
221
  "translate": 50358
222
  },
223
- "transformers_version": "4.27.0.dev0"
224
  }
 
1
  {
2
+ "alignment_heads": [
3
+ [
4
+ 13,
5
+ 15
6
+ ],
7
+ [
8
+ 15,
9
+ 4
10
+ ],
11
+ [
12
+ 15,
13
+ 15
14
+ ],
15
+ [
16
+ 16,
17
+ 1
18
+ ],
19
+ [
20
+ 20,
21
+ 0
22
+ ],
23
+ [
24
+ 23,
25
+ 4
26
+ ]
27
+ ],
28
  "begin_suppress_tokens": [
29
  220,
30
  50257
 
40
  [
41
  2,
42
  50359
 
 
 
 
43
  ]
44
  ],
45
  "is_multilingual": true,
 
144
  "<|yo|>": 50325,
145
  "<|zh|>": 50260
146
  },
147
+ "max_initial_timestamp_index": 50,
148
  "max_length": 448,
149
  "no_timestamps_token_id": 50363,
150
  "pad_token_id": 50257,
151
+ "prev_sot_token_id": 50361,
152
+ "return_timestamps": false,
153
  "suppress_tokens": [
154
  1,
155
  2,
 
244
  "transcribe": 50359,
245
  "translate": 50358
246
  },
247
+ "transformers_version": "4.47.0.dev0"
248
  }