Spaces:

peproject
/

pronounciationevaluation

Runtime error

App Files Files Community

bel32123 committed on Oct 18, 2023

Commit

22efacc

1 Parent(s): 4cb5b41

Create model interface

Browse files

Files changed (2) hide show

wav2vecasr/MispronounciationDetector.py +18 -13
wav2vecasr/PhonemeASRModel.py +101 -0

wav2vecasr/MispronounciationDetector.py CHANGED Viewed

@@ -3,30 +3,34 @@ import torch
 import jiwer
 class MispronounciationDetector:
-  def __init__(self, l2_phoneme_recogniser, l2_phoneme_recogniser_processor, g2p, device):
-    self.l2_phoneme_recogniser = l2_phoneme_recogniser
-    self.l2_phoneme_recogniser_processor = l2_phoneme_recogniser_processor
     self.g2p = g2p
     self.device = device
   def detect(self, audio, text):
-    l2_phones = self.get_l2_phoneme_sequence(audio)
     native_speaker_phones = self.get_native_speaker_phoneme_sequence(text)
-    raw_info = self.get_mispronounciation_output(text, l2_phones, native_speaker_phones)
     return raw_info
-  def get_l2_phoneme_sequence(self, audio):
-    input_dict = self.l2_phoneme_recogniser_processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
-    logits = self.l2_phoneme_recogniser(input_dict.input_values.to(self.device)).logits
-    pred_ids = torch.argmax(logits, dim=-1)[0]
-    pred_phones = [phoneme for phoneme in self.l2_phoneme_recogniser_processor.batch_decode(pred_ids) if phoneme != ""]
-    return pred_phones
   def get_native_speaker_phoneme_sequence(self, text):
     phonemes = self.g2p(text)
     return phonemes
   def get_mispronounciation_output(self, text, pred_phones, org_label_phones):
     # get per
     label_phones = [phone for phone in org_label_phones if phone != " "]
     reference = " ".join(label_phones) # dummy phones
@@ -80,6 +84,7 @@ class MispronounciationDetector:
           space_padding = "-" * (len(label_phones[i]))
           error_bool.append(space_padding)
     delimiter_idx = 0
     for phone in org_label_phones:
       if phone == " ":
@@ -93,7 +98,7 @@ class MispronounciationDetector:
     ref.append("|")
     hyp.append("|")
-    # get mispronounced words
     aligned_word_error_output = ""
     words = text.split(" ")
     word_error_bool = self.get_mispronounced_words(error_bool)

 import jiwer
 class MispronounciationDetector:
+  def __init__(self, l2_phoneme_recogniser, g2p, device):
+    self.phoneme_asr_model = l2_phoneme_recogniser # PhonemeASRModel class
     self.g2p = g2p
     self.device = device
   def detect(self, audio, text):
+    l2_phones = self.phoneme_asr_model.get_l2_phoneme_sequence(audio)
     native_speaker_phones = self.get_native_speaker_phoneme_sequence(text)
+    standardised_native_speaker_phones = self.phoneme_asr_model.standardise_g2p_phoneme_sequence(native_speaker_phones)
+    raw_info = self.get_mispronounciation_output(text, l2_phones, standardised_native_speaker_phones)
     return raw_info
   def get_native_speaker_phoneme_sequence(self, text):
     phonemes = self.g2p(text)
     return phonemes
   def get_mispronounciation_output(self, text, pred_phones, org_label_phones):
+    """
+    Aligns the predicted phones from the L2 speaker and the expected native speaker phone to get the errors
+    :param text: original words read by the user
+    :type text: string
+    :param pred_phones: predicted phonemes by L2 speaker from ASR Model
+    :type pred_phones: array
+    :param org_label_phones: correct, native speaker phonemes from G2P where phonemes of each word is segregated by " "
+    :type org_label_phones: array
+    :return: dictionary containing various mispronounciation information like PER, WER and error boolean arrays at phoneme/word level
+    :rtype: dictionary
+    """
     # get per
     label_phones = [phone for phone in org_label_phones if phone != " "]
     reference = " ".join(label_phones) # dummy phones
           space_padding = "-" * (len(label_phones[i]))
           error_bool.append(space_padding)
+    # insert word delimiters to show user phoneme sections by word
     delimiter_idx = 0
     for phone in org_label_phones:
       if phone == " ":
     ref.append("|")
     hyp.append("|")
+    # get mispronounced words based on if there are phoneme errors present in the phonemes of that word
     aligned_word_error_output = ""
     words = text.split(" ")
     word_error_bool = self.get_mispronounced_words(error_bool)

wav2vecasr/PhonemeASRModel.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import torch
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, \
+  Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
+import pyctcdecode
+import json
+import re
+from sys import platform
+class PhonemeASRModel:
+  """
+  Interface for the phoneme recognition models used for mispronunciation detection.
+  The methods here only define the contract: each one raises NotImplementedError
+  so that a subclass missing an override fails loudly instead of returning None.
+  """
+  def get_l2_phoneme_sequence(self, audio):
+    """
+    Predicts the phonemes spoken by the L2 speaker
+    :param audio: audio sampled at 16k sampling rate with torchaudio
+    :type audio: array
+    :return: predicted phonemes for L2 speaker
+    :rtype: array
+    :raises NotImplementedError: when the subclass does not override this method
+    """
+    raise NotImplementedError("get_l2_phoneme_sequence must be implemented by a subclass")
+  def standardise_g2p_phoneme_sequence(self, phones):
+    """
+    To facilitate mispronunciation detection
+    :param phones: native speaker phones predicted by G2P model
+    :type phones: array
+    :return: standardised native speaker phoneme sequence that aligns with phoneme classes by the model
+    :rtype: array
+    :raises NotImplementedError: when the subclass does not override this method
+    """
+    raise NotImplementedError("standardise_g2p_phoneme_sequence must be implemented by a subclass")
+  def standardise_l2_artic_groundtruth_phoneme_sequence(self, phones):
+    """
+    To facilitate testing
+    :param phones: native speaker phones as annotated in l2 artic
+    :type phones: array
+    :return: standardised native speaker phoneme sequence that aligns with phoneme classes by the model
+    :rtype: array
+    :raises NotImplementedError: when the subclass does not override this method
+    """
+    raise NotImplementedError("standardise_l2_artic_groundtruth_phoneme_sequence must be implemented by a subclass")
+class Wav2Vec2PhonemeASRModel(PhonemeASRModel):
+  """
+  Phoneme recogniser built on Wav2Vec2ForCTC and Wav2Vec2Processor
+  Uses greedy decoding
+  """
+  def __init__(self, model_path, processor_path):
+    """
+    Loads the model onto cuda when available, otherwise onto cpu
+    :param model_path: value passed to Wav2Vec2ForCTC.from_pretrained
+    :param processor_path: value passed to Wav2Vec2Processor.from_pretrained
+    """
+    self.device = "cuda" if torch.cuda.is_available() else "cpu"
+    self.model = Wav2Vec2ForCTC.from_pretrained(model_path).to(self.device)
+    self.processor = Wav2Vec2Processor.from_pretrained(processor_path)
+  def get_l2_phoneme_sequence(self, audio):
+    """
+    Runs the model on the audio and greedily decodes the phonemes
+    :param audio: audio sampled at 16k sampling rate with torchaudio
+    :type audio: array
+    :return: predicted phonemes for L2 speaker, without empty strings
+    :rtype: array
+    """
+    input_dict = self.processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
+    # inference only: no gradients are needed, so skip autograd bookkeeping
+    with torch.no_grad():
+      logits = self.model(input_dict.input_values.to(self.device)).logits
+    # greedy decoding: highest scoring class along the last axis, first item of the batch
+    pred_ids = torch.argmax(logits, dim=-1)[0]
+    # every id is decoded on its own; ids that decode to "" are dropped
+    pred_phones = [phoneme for phoneme in self.processor.batch_decode(pred_ids) if phoneme != ""]
+    return pred_phones
+  def standardise_g2p_phoneme_sequence(self, phones):
+    """
+    Returns the G2P phones unchanged
+    :param phones: native speaker phones predicted by G2P model
+    :type phones: array
+    :return: the same phoneme sequence
+    :rtype: array
+    """
+    return phones
+  def standardise_l2_artic_groundtruth_phoneme_sequence(self, phones):
+    """
+    Removes every digit from each phone string
+    :param phones: native speaker phones as annotated in l2 artic
+    :type phones: array
+    :return: phones with digits removed (presumably stress markers - TODO confirm)
+    :rtype: array
+    """
+    return [re.sub(r'\d', "", phone_str) for phone_str in phones]
+# TODO debug on linux because KenLM is not supported on Windows
+class Wav2Vec2OptimisedPhonemeASRModel(PhonemeASRModel):
+  """
+  Phoneme recogniser built on Wav2Vec2ForCTC with a pyctcdecode decoder
+  Uses beam search and a LM for decoding
+  """
+  def __init__(self, model_path, vocab_json_path, kenlm_model_path):
+    """
+    Builds the tokenizer, feature extractor and decoder, then loads the model
+    :param model_path: value passed to Wav2Vec2ForCTC.from_pretrained
+    :param vocab_json_path: path of the JSON vocabulary; its keys become the decoder labels
+    :param kenlm_model_path: KenLM model path; only used on linux and when truthy
+    """
+    self.device = "cuda" if torch.cuda.is_available() else "cpu"
+    # context manager closes the file; read as UTF-8 instead of the platform default encoding
+    with open(vocab_json_path, encoding="utf-8") as f:
+      vocab_dict = json.load(f)
+    tokenizer = Wav2Vec2CTCTokenizer(vocab_json_path, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
+    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0,
+                                                 do_normalize=True, return_attention_mask=False)
+    # NOTE(review): assumes the key order in the vocab file matches the model's class order - confirm
+    labels = list(vocab_dict.keys())
+    # build the decoder once instead of building a plain one and then replacing it
+    if (platform == "linux" or platform == "linux2") and kenlm_model_path:
+      # beam search + LM
+      decoder = pyctcdecode.decoder.build_ctcdecoder(labels, kenlm_model_path=kenlm_model_path)
+    else:
+      # beam search
+      decoder = pyctcdecode.decoder.build_ctcdecoder(labels)
+    self.model = Wav2Vec2ForCTC.from_pretrained(model_path).to(self.device)
+    self.processor = Wav2Vec2ProcessorWithLM(feature_extractor=feature_extractor, tokenizer=tokenizer, decoder=decoder)
+  def get_l2_phoneme_sequence(self, audio):
+    """
+    Runs the model on the audio and decodes the phonemes with the processor's decoder
+    :param audio: audio sampled at 16k sampling rate with torchaudio
+    :type audio: array
+    :return: predicted phonemes for L2 speaker, without empty strings
+    :rtype: array
+    """
+    input_dict = self.processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
+    # inference only: no gradients are needed, so skip autograd bookkeeping
+    with torch.no_grad():
+      logits = self.model(input_dict.input_values.to(self.device)).logits.cpu().detach()
+    normalised_logits = torch.nn.Softmax(dim=2)(logits)
+    # first item of the batch, as a numpy array for the decoder
+    normalised_logits = normalised_logits.numpy()[0]
+    output = self.processor.decode(normalised_logits)
+    # "".split(" ") gives [""]; drop empty items like the greedy model does
+    pred_phones = [phoneme for phoneme in output.text.split(" ") if phoneme != ""]
+    return pred_phones
+  def standardise_g2p_phoneme_sequence(self, phones):
+    """
+    Returns the G2P phones unchanged
+    :param phones: native speaker phones predicted by G2P model
+    :type phones: array
+    :return: the same phoneme sequence
+    :rtype: array
+    """
+    return phones
+  def standardise_l2_artic_groundtruth_phoneme_sequence(self, phones):
+    """
+    Removes every digit from each phone string
+    :param phones: native speaker phones as annotated in l2 artic
+    :type phones: array
+    :return: phones with digits removed (presumably stress markers - TODO confirm)
+    :rtype: array
+    """
+    return [re.sub(r'\d', "", phone_str) for phone_str in phones]