tdns03
/

whisper-small-korean-pronunciation-scorer-sampledata

@@ -19,7 +19,69 @@ pipeline_tag: audio-classification
 # Whisper Pronunciation Scorer
-This model assesses pronunciation quality for Korean speech.
-The whisper-small model is fined-tuned using the Korea AI-Hub (https://www.aihub.or.kr/) foreigner Korean pronunciation evaluation dataset.
-You need to input the audio and ground truth script to obtain the Korean pronunciation score.
-Scale is 1~5.

 # Whisper Pronunciation Scorer
+This model assesses pronunciation quality for Korean speech. It's based on the openai/whisper-small model, fine-tuned using the Korea AI-Hub (https://www.aihub.or.kr/) foreigner Korean pronunciation evaluation dataset.
+# Model Description
+The Whisper Pronunciation Scorer takes audio input along with its corresponding text transcript and predicts a Korean pronunciation score on a scale of 1 to 5. It runs the audio and transcript through the encoder-decoder architecture of the Whisper model, averages the final decoder hidden states, and passes the result through an additional linear layer to predict the pronunciation score.
+# How to Use
+To use this model, follow these steps:
+1. Install required libraries (torch, torchaudio, transformers)
+2. Load the model and processor
+3. Prepare your audio file and text transcript
+4. Predict the pronunciation score
+Here's a detailed example of how to use the model:
+import torch
+import torchaudio
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import torch.nn as nn
+class WhisperPronunciationScorer(nn.Module):
+    def __init__(self, pretrained_model):
+        super().__init__()
+        self.whisper = pretrained_model
+        self.score_head = nn.Linear(self.whisper.config.d_model, 1)
+    def forward(self, input_features, labels=None):
+        outputs = self.whisper(input_features, labels=labels, output_hidden_states=True)
+        last_hidden_state = outputs.decoder_hidden_states[-1]
+        scores = self.score_head(last_hidden_state.mean(dim=1)).squeeze()
+        return scores
+def load_model(model_path, device):
+    model_name = "openai/whisper-small"
+    processor = WhisperProcessor.from_pretrained(model_name)
+    pretrained_model = WhisperForConditionalGeneration.from_pretrained(model_name)
+    model = WhisperPronunciationScorer(pretrained_model).to(device)
+    model.load_state_dict(torch.load(model_path, map_location=device))
+    model.eval()
+    return model, processor
+def predict_pronunciation_score(model, processor, audio_path, transcript, device):
+    # Load and preprocess audio
+    audio, sr = torchaudio.load(audio_path)
+    if sr != 16000:
+        audio = torchaudio.functional.resample(audio, sr, 16000)
+    input_features = processor(audio.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(device)
+    # Prepare transcript
+    labels = processor(text=transcript, return_tensors="pt").input_ids.to(device)
+    # Predict score
+    with torch.no_grad():
+        score = model(input_features, labels)
+    return score.item()
+# Load model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_path = "path/to/your/model.pth"
+model, processor = load_model(model_path, device)
+# Run prediction
+audio_path = "path/to/your/audio.wav"
+transcript = "안녕하세요"
+score = predict_pronunciation_score(model, processor, audio_path, transcript, device)
+print(f"Predicted pronunciation score: {score:.2f}")