Evaluation on WOLOF Test
import pandas as pd
from datasets import load_dataset, load_metric,Dataset
from tqdm import tqdm
import torch
import soundfile as sf
import torchaudio
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2CTCTokenizer
model_name = "kingabzpro/wav2vec2-large-xlsr-53-wolof"
device = "cuda"
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(model_name)
val =pd.read_csv("../input/automatic-speech-recognition-in-wolof/Test.csv")
val["path"] = "../input/automatic-speech-recognition-in-wolof/Noise Removed/tmp/WOLOF_ASR_dataset/noise_remove/"+val["ID"]+".wav"
val.rename(columns = {'transcription':'sentence'}, inplace = True)
common_voice_val = Dataset.from_pandas(val)
def speech_file_to_array_fn_test(batch):
speech_array, sampling_rate = sf.read(batch["path"])#(.wav) 16000 sample rate
batch["speech"] = speech_array
batch["sampling_rate"] = sampling_rate
return batch
def prepare_dataset_test(batch):
# check that all files have the correct sampling rate
assert (
len(set(batch["sampling_rate"])) == 1
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
batch["input_values"] = processor(batch["speech"], padding=True,sampling_rate=batch["sampling_rate"][0]).input_values
return batch
common_voice_val = common_voice_val.remove_columns([ "ID","age", "down_votes", "gender", "up_votes"]) # Remove columns
common_voice_val = common_voice_val.map(speech_file_to_array_fn_test, remove_columns=common_voice_val.column_names)# Applying speech_file_to_array function
common_voice_val = common_voice_val.map(prepare_dataset_test, remove_columns=common_voice_val.column_names, batch_size=8, num_proc=4, batched=True)# Applying prepare_dataset_test function
final_pred = []
for i in tqdm(range(common_voice_val.shape[0])):# Testing model on Wolof Dataset
input_dict = processor(common_voice_val[i]["input_values"], return_tensors="pt", padding=True)
logits = model(input_dict.input_values.to("cuda")).logits
pred_ids = torch.argmax(logits, dim=-1)[0]
prediction = processor.decode(pred_ids)
final_pred.append(prediction)
You can check my result on Zindi, I got 8th rank in AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF
Result: 7.88 %
- Downloads last month
- 150
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.