jonatasgrosman
/

wav2vec2-large-xlsr-53-dutch

@@ -27,7 +27,7 @@ model-index:
          value: 13.60
        - name: Test CER
          type: cer
-         value: 8.12
 ---
 # Wav2Vec2-Large-XLSR-53-Dutch
@@ -101,12 +101,16 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 LANG_ID = "nl"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-dutch"
 DEVICE = "cuda"
 CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                    "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
 wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
 cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py
@@ -119,9 +123,11 @@ model.to(DEVICE)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
-    speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
     batch["speech"] = speech_array
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
@@ -138,13 +144,13 @@ def evaluate(batch):
 	batch["pred_strings"] = processor.batch_decode(pred_ids)
 	return batch
-result = test_dataset.map(evaluate, batched=True, batch_size=32)
-print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=8000)))
-print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=8000)))
 ```
 **Test Result**:
 - WER: 13.60%
-- CER: 8.12%

          value: 13.60
        - name: Test CER
          type: cer
+         value: 4.45
 ---
 # Wav2Vec2-Large-XLSR-53-Dutch
 LANG_ID = "nl"
 MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-dutch"
 DEVICE = "cuda"
+MAX_SAMPLES = 8000
 CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                    "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                    "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。"]
 test_dataset = load_dataset("common_voice", LANG_ID, split="test")
+if len(test_dataset) > MAX_SAMPLES:  # cap the evaluation at the first MAX_SAMPLES test examples
+    test_dataset = test_dataset.select(range(MAX_SAMPLES))
 wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
 cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")  # silence every warning raised inside this with-block
+        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)  # load the audio at batch["path"], requesting a 16 kHz sample rate
     batch["speech"] = speech_array
+    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()  # drop everything matching chars_to_ignore_regex, then upper-case the transcript
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 	batch["pred_strings"] = processor.batch_decode(pred_ids)
 	return batch
+result = test_dataset.map(evaluate, batched=True, batch_size=8)
+print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=1000)))
+print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"], chunk_size=1000)))
 ```
 **Test Result**:
 - WER: 13.60%
+- CER: 4.45%