Add better performing model

Browse files

Files changed (4) hide show

README.md +19 -19
config.json +2 -2
pytorch_model.bin +1 -1
vocab.json +1 -1

README.md CHANGED Viewed

@@ -23,12 +23,12 @@ model-index:
     metrics:
        - name: Test WER
          type: wer
-         value: 22.84
 ---
 # Wav2Vec2-Large-XLSR-53-Kazakh
-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Kazakh using the [Kazakh Speech Corpus v1.1](https://issai.nu.edu.kz/kz-speech-corpus/?version=1.1)
 When using this model, make sure that your speech input is sampled at 16kHz.
@@ -53,15 +53,15 @@ model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-large-xlsr-kazakh")
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tbatch["speech"] = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()
-\treturn batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
-\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
@@ -72,7 +72,7 @@ print("Reference:", test_dataset["sentence"][:2])
 ## Evaluation
-The model can be evaluated as follows on the test data of [Kazakh Speech Corpus v1.1](https://issai.nu.edu.kz/kz-speech-corpus/?version=1.1). To evaluate, download the [archive](https://www.openslr.org/resources/102/ISSAI_KSC_335RS_v1.1_flac.tar.gz), untar and pass the path to data to `get_test_dataset` as below:
 ```python
 import torch
@@ -94,31 +94,31 @@ model.to("cuda")
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
-\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tbatch["speech"] = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()
-\treturn batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 def evaluate(batch):
-\tinputs = processor(batch["text"], sampling_rate=16_000, return_tensors="pt", padding=True)
-\twith torch.no_grad():
-\t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
-\tpred_ids = torch.argmax(logits, dim=-1)
-\tbatch["pred_strings"] = processor.batch_decode(pred_ids)
-\treturn batch
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
-**Test Result**: 22.84 %
 ## Training
-The Kazakh Speech Corpus v1.1 `train` dataset was used for training,

     metrics:
        - name: Test WER
          type: wer
+         value: 19.65
 ---
 # Wav2Vec2-Large-XLSR-53-Kazakh
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) for Kazakh ASR using the [Kazakh Speech Corpus v1.1](https://issai.nu.edu.kz/kz-speech-corpus/?version=1.1)
 When using this model, make sure that your speech input is sampled at 16kHz.
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()
+    return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 with torch.no_grad():
+    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 predicted_ids = torch.argmax(logits, dim=-1)
 ## Evaluation
+The model can be evaluated as follows on the test set of [Kazakh Speech Corpus v1.1](https://issai.nu.edu.kz/kz-speech-corpus/?version=1.1). To evaluate, download the [archive](https://www.openslr.org/resources/102/ISSAI_KSC_335RS_v1.1_flac.tar.gz), untar and pass the path to data to `get_test_dataset` as below:
 ```python
 import torch
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
+    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()
+    return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 def evaluate(batch):
+    inputs = processor(batch["text"], sampling_rate=16_000, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+    pred_ids = torch.argmax(logits, dim=-1)
+    batch["pred_strings"] = processor.batch_decode(pred_ids)
+    return batch
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
+**Test Result**: 19.65%
 ## Training
+The Kazakh Speech Corpus v1.1 `train` dataset was used for training.

config.json CHANGED Viewed

@@ -42,7 +42,7 @@
   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
-  "feat_proj_dropout": 0.0,
   "final_dropout": 0.0,
   "gradient_checkpointing": true,
   "hidden_act": "gelu",
@@ -62,7 +62,7 @@
   "mask_time_length": 10,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
-  "mask_time_prob": 0,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_attention_heads": 16,

   "feat_extract_activation": "gelu",
   "feat_extract_dropout": 0.0,
   "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0,
   "final_dropout": 0.0,
   "gradient_checkpointing": true,
   "hidden_act": "gelu",
   "mask_time_length": 10,
   "mask_time_min_space": 1,
   "mask_time_other": 0.0,
+  "mask_time_prob": 0.05,
   "mask_time_selection": "static",
   "model_type": "wav2vec2",
   "num_attention_heads": 16,

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ad8b539264b74e769a7115cdc8830ee7024abbfc055befa33747a8a13115ef3a
 size 1262118359

 version https://git-lfs.github.com/spec/v1
+oid sha256:b1e83db5ad4984dbe05208534fd2003de46257634517f054984a09c4d61a1ace
 size 1262118359

vocab.json CHANGED Viewed

@@ -1 +1 @@

- {"и": 0, "й": 1, "у": 2, "ь": 3, "ә": 4, "п": 5, "ы": 6, "щ": 7, "ч": 8, "е": 9, "р": 10, "з": 11, "ү": 12, "қ": 13, "х": 14, "г": 15, "б": 16, "я": 17, "с": 18, "ш": 19, "ө": 20, "ғ": 21, "т": 22, "і": 23, "м": 24, "ц": 25, "һ": 26, "ң": 27, "ж": 28, "ю": 29, "в": 30, "а": 32, "д": 33, "ф": 34, "э": 35, "ъ": 36, "л": 37, "ё": 38, "н": 39, "к": 40, "о": 41, "ұ": 42, "|": 31, "[UNK]": 43, "[PAD]": 44}

+ {"а": 1, "б": 2, "в": 3, "г": 4, "д": 5, "е": 6, "ж": 7, "з": 8, "и": 9, "й": 10, "к": 11, "л": 12, "м": 13, "н": 14, "о": 15, "п": 16, "р": 17, "с": 18, "т": 19, "у": 20, "ф": 21, "х": 22, "ц": 23, "ч": 24, "ш": 25, "щ": 26, "ъ": 27, "ы": 28, "ь": 29, "э": 30, "ю": 31, "я": 32, "ё": 33, "і": 34, "ғ": 35, "қ": 36, "ң": 37, "ү": 38, "ұ": 39, "һ": 40, "ә": 41, "ө": 42, "|": 0, "[UNK]": 43, "[PAD]": 44}