elsayedissa committed on
Commit
6f59045
1 Parent(s): 0e3b9d4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +89 -2
README.md CHANGED
@@ -5,7 +5,7 @@ tags:
5
  metrics:
6
  - wer
7
  model-index:
8
- - name: whisper-small-sp
9
  results: []
10
  ---
11
 
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # whisper-small-sp
16
 
17
- This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the None dataset.
18
  It achieves the following results on the evaluation set:
19
  - Loss: 0.4485
20
  - Wer: 20.6842
@@ -76,6 +76,93 @@ The following hyperparameters were used during training:
76
  | 0.0487 | 3.12 | 24000 | 0.4456 | 20.8617 |
77
  | 0.0401 | 3.25 | 25000 | 0.4485 | 20.6842 |
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  ### Framework versions
81
 
 
5
  metrics:
6
  - wer
7
  model-index:
8
+ - name: whisper-small-spanish
9
  results: []
10
  ---
11
 
 
14
 
15
  # whisper-small-sp
16
 
17
+ This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the Spanish subset of the Common Voice 11.0 dataset (`mozilla-foundation/common_voice_11_0`).
18
  It achieves the following results on the evaluation set:
19
  - Loss: 0.4485
20
  - Wer: 20.6842
 
76
  | 0.0487 | 3.12 | 24000 | 0.4456 | 20.8617 |
77
  | 0.0401 | 3.25 | 25000 | 0.4485 | 20.6842 |
78
 
79
+ ### Transcription:
80
+
81
+ ```python
82
import torch
from datasets import Audio, load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Processor (feature extractor + tokenizer) and the fine-tuned model.
processor = WhisperProcessor.from_pretrained("clu-ling/whisper-small-spanish")
model = WhisperForConditionalGeneration.from_pretrained("clu-ling/whisper-small-spanish")
model = model.to(device)

# Decoder prompt that requests Spanish transcription.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="transcribe")

# Stream the validation split, decode audio at 16 kHz, and take the first clip.
commonvoice_eval = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "es",
    split="validation",
    streaming=True,
).cast_column("audio", Audio(sampling_rate=16000))
sample = next(iter(commonvoice_eval))["audio"]

# Waveform -> input features -> generated token ids.
features = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)
input_features = features.input_features
predicted_ids = model.generate(
    input_features.to(device),
    forced_decoder_ids=forced_decoder_ids,
)

# Token ids -> text, without the special tokens.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print(transcription)
107
+
108
+ ```
109
+
110
+ ### Evaluation:
111
+
112
+ Evaluate this model on the `test` split of `mozilla-foundation/common_voice_11_0` (Spanish subset):
113
+
114
+ ```python
115
import re

import evaluate
import torch
from datasets import Audio, load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# metric
wer_metric = evaluate.load("wer")

# Text normalizer applied to BOTH references and predictions, so the WER is
# computed on text that went through the same normalization.
whisper_norm = BasicTextNormalizer()

# model: moved to the device once, up front, rather than once per example
processor = WhisperProcessor.from_pretrained("clu-ling/whisper-small-spanish")
model = WhisperForConditionalGeneration.from_pretrained("clu-ling/whisper-small-spanish").to(device)

# Decoder prompt requesting Spanish transcription; it is the same for every
# example, so it is computed once here.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="transcribe")

# dataset
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# For a quick debugging run, evaluate on a small shard only:
# dataset = dataset.shard(num_shards=10000, index=0)


def normalize(batch):
    """Store the normalized reference transcript of one example in ``gold_text``."""
    batch["gold_text"] = whisper_norm(batch["sentence"])
    return batch


def map_wer(batch):
    """Transcribe one example and store the normalized output in ``predicted_text``."""
    audio = batch["audio"]
    inputs = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features
    with torch.no_grad():
        # Features are passed positionally, as in the transcription example,
        # so the call does not depend on the keyword name of the first argument.
        generated_ids = model.generate(inputs.to(device), forced_decoder_ids=forced_decoder_ids)
    # One example in, so one decoded string out.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    batch["predicted_text"] = whisper_norm(transcription)
    return batch


# process GOLD text
processed_dataset = dataset.map(normalize)
# get predictions
predicted = processed_dataset.map(map_wer)

# word error rate, reported as a percentage
wer = wer_metric.compute(references=predicted["gold_text"], predictions=predicted["predicted_text"])
wer = round(100 * wer, 2)
print("WER:", wer)
163
+
164
+
165
+ ```
166
 
167
  ### Framework versions
168