model-index:
- name: Fon XLSR Wav2Vec2 Large 53
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: fon
      type: fon_dataset
      args: fon
    metrics:
    - name: Test WER
      type: wer
      value: 14.97

Wav2Vec2-Large-XLSR-53-Fon

Fine-tuned facebook/wav2vec2-large-xlsr-53 on Fon (or Fongbe) using the Fon Dataset.

When using this model, make sure that your speech input is sampled at 16kHz.

Usage

The model can be used directly (without a language model) as follows:

import csv
import json
import os
import random
import re
import zipfile

import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

#This will download the files from Laleye's GitHub to the directory FonAudio.
#NOTE: the `!wget` line is IPython/Jupyter shell syntax, so this snippet only
#runs inside a notebook (e.g. Colab); it is a syntax error in a plain .py file.
#The download is skipped when ./FonAudio already exists.
if not os.path.isdir("./FonAudio"):
  !wget https://github.com/laleye/pyFongbe/archive/master/data.zip
  #Unpack the downloaded archive into ./FonAudio
  with zipfile.ZipFile("data.zip","r") as zip_ref:
    zip_ref.extractall("./FonAudio")
    
#Read the train and test CSV files; the first row of each is skipped
with open('./FonAudio/pyFongbe-master/data/train.csv', newline='',encoding='UTF-8') as f:
    data = list(csv.reader(f))
train_data = data[1:]

with open('./FonAudio/pyFongbe-master/data/test.csv', newline='',encoding='UTF-8') as f:
    data = list(csv.reader(f))
t_data = data[1:]
      
      
#Get valid indices
random.seed(42) #this seed was used specifically to compare
                # with Okwugbe model (https://arxiv.org/abs/2103.07762)


v = 1500
test_list = list(range(len(t_data)))
#random.choices samples WITH replacement, so valid_indices can contain
#duplicates. This is kept as-is so the split stays reproducible with seed 42.
valid_indices = random.choices(test_list, k=v)

#Use a set for the membership tests: checking against the 1500-item list for
#every row of t_data is needlessly quadratic. The resulting split is unchanged.
valid_index_set = set(valid_indices)
test_data = [row for i, row in enumerate(t_data) if i not in valid_index_set]
valid_data = [row for i, row in enumerate(t_data) if i in valid_index_set]

#Final length of validation_dataset -> 1107
#Final length of test_dataset -> 1061

#Please note, the final validation size is smaller than the
#expected (1500) because we used random.choices, which can return duplicates.

#Create JSON files 
def create_json_file(d):
    """Build the JSON-serialisable record for one CSV row.

    ``d`` is a row from the Fon CSV files: index 0 is used as the wav path
    and index 2 as the transcription. The absolute path prefix stored in the
    CSV is rewritten to point at the local ``./FonAudio`` extraction.

    Returns a dict with the keys ``"path"`` and ``"sentence"``.
    """
    sentence, original_path = d[2], d[0]
    local_path = original_path.replace(
        "/home/frejus/Projects/Fongbe_ASR/pyFongbe",
        "./FonAudio/pyFongbe-master",
    )
    return {"path": local_path, "sentence": sentence}

#Convert every CSV row of each split into its {"path", "sentence"} record
train_json = list(map(create_json_file, train_data))
test_json = list(map(create_json_file, test_data))
valid_json = list(map(create_json_file, valid_data))

#Save JSON files to your Google Drive folders
#Make folder in GDrive to store files
train_path = '/content/drive/MyDrive/fon_xlsr/train'
test_path = '/content/drive/MyDrive/fon_xlsr/test' #this is where we save the test files
valid_path = '/content/drive/MyDrive/fon_xlsr/valid'

#Check each folder individually. Testing only train_path would leave the test
#and valid folders uncreated (or raise FileExistsError) after a partial run.
missing_paths = [p for p in (train_path, test_path, valid_path) if not os.path.isdir(p)]
if missing_paths:
    print("Creating paths")
    for p in missing_paths:
        os.makedirs(p, exist_ok=True)
  

#Write one JSON file per sample, split by split: train, then test, then valid
for out_dir, name_template, samples in (
    (train_path, 'train_fon_{}.json', train_json),
    (test_path, 'test_fon_{}.json', test_json),
    (valid_path, 'valid_fon_{}.json', valid_json),
):
    for i, sample in enumerate(samples):
        file_path = os.path.join(out_dir, name_template.format(i))
        with open(file_path, 'w') as outfile:
            json.dump(sample, outfile)
  

#Load test_dataset from saved files in folder
from datasets import load_dataset, load_metric

#for test
#Collect every file under test_path first and load them in a single call.
#Calling load_dataset inside the os.walk loop would rebind test_dataset for
#each directory visited, keeping only the files of the last one.
test_files = [
    os.path.join(root, name)
    for root, dirs, files in os.walk(test_path)
    for name in files
]
test_dataset = load_dataset("json", data_files=test_files, split="train")

#Remove unnecessary chars
#NOTE(review): the pattern was missing on this line (the assignment had no
#right-hand side). The value below is the punctuation set from the standard
#XLSR fine-tuning template -- confirm it against the training script.
chars_to_ignore_regex = r'[,?.!\-;:"“%‘”]'
def remove_special_characters(batch):
    """Normalise the transcription stored in ``batch["sentence"]``.

    Removes every character matched by ``chars_to_ignore_regex``, lowercases
    the result and appends a single trailing space. The batch is modified in
    place and also returned, as ``Dataset.map`` expects.
    """
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

test_dataset = test_dataset.map(remove_special_characters)

#The processor and the model are both loaded from the same Hub repository
model_id = "chrisjay/wav2vec2-large-xlsr-53-fon"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

#No need for resampling because audio dataset already at 16kHz
#resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["path"]`` into ``batch["speech"]``.

    The decoded samples are stored as a NumPy array; the sampling rate
    returned by the loader is not used here. The batch is modified in place
    and returned.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    # torchaudio.load returns a (channels, time) tensor by default. Take the
    # first channel explicitly so the result is always 1-D; squeeze() would
    # leave multi-channel audio 2-D.
    batch["speech"] = speech_array[0].numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
#Run the first two test utterances through the model as one padded batch
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    #The name must be `logits`, because the argmax below reads `logits`
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

#Pick the highest-scoring token id at every frame, then decode to text
predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])

Evaluation

The model can be evaluated as follows on our unique Fon test data.

import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

#Collect every file under test_path first and load them in a single call.
#Calling load_dataset inside the os.walk loop would rebind test_dataset for
#each directory visited, keeping only the files of the last one.
test_files = [
    os.path.join(root, name)
    for root, dirs, files in os.walk(test_path)
    for name in files
]
test_dataset = load_dataset("json", data_files=test_files, split="train")

#NOTE(review): the pattern and the `def` line were missing here. The pattern
#below is the punctuation set from the standard XLSR fine-tuning template --
#confirm it against the training script before comparing WER numbers.
chars_to_ignore_regex = r'[,?.!\-;:"“%‘”]'
def remove_special_characters(batch):
    """Normalise the transcription stored in ``batch["sentence"]``.

    Removes every character matched by ``chars_to_ignore_regex``, lowercases
    the result and appends a single trailing space. The batch is modified in
    place and also returned, as ``Dataset.map`` expects.
    """
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

test_dataset = test_dataset.map(remove_special_characters)
wer = load_metric("wer")

#The processor and the model are both loaded from the same Hub repository
model_id = "chrisjay/wav2vec2-large-xlsr-53-fon"
processor = Wav2Vec2Processor.from_pretrained(model_id)
#use checkpoint-12400 to get our WER test results
model = Wav2Vec2ForCTC.from_pretrained(model_id)
#Evaluation runs on the GPU
model.to("cuda")

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    """Decode the audio file at ``batch["path"]`` and attach it to the batch.

    Adds three entries: ``"speech"`` (first channel of the loaded audio as a
    NumPy array), ``"sampling_rate"`` (as reported by the loader) and
    ``"target_text"`` (a copy of ``batch["sentence"]``). Returns the batch.
    """
    waveform, rate = torchaudio.load(batch["path"])
    first_channel = waveform[0]
    batch["speech"] = first_channel.numpy()
    batch["sampling_rate"] = rate
    batch["target_text"] = batch["sentence"]
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

#Evaluation on test dataset
def evaluate(batch):
    """Transcribe a batch of utterances and store the text in ``batch["pred_strings"]``.

    ``batch["speech"]`` is padded and converted to tensors by the processor,
    moved to the GPU and passed through the model without tracking gradients.
    The highest-scoring token id per frame is decoded to text.
    """
    features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    input_values = features.input_values.to("cuda")
    attention_mask = features.attention_mask.to("cuda")

    with torch.no_grad():
        scores = model(input_values, attention_mask=attention_mask).logits

    batch["pred_strings"] = processor.batch_decode(torch.argmax(scores, dim=-1))
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

#Report WER as a percentage with two decimal places ("{:.2f}"); "{:2f}" is
#only a minimum-width spec and would print six decimals.
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))

Test Result: 14.97 %

Training

The Fon dataset was split into train (8235 samples), validation (1107 samples), and test (1061 samples).

The script used for training can be found here

Collaborators on this project

Chris C. Emezue (Twitter)
Bonaventure F.P. Dossou (Twitter)

This is a joint project continuing our research on OkwuGbé: End-to-End Speech Recognition for Fon and Igbo

Please contact [email protected] for any issues or questions.

model-index: - name: Fon XLSR Wav2Vec2 Large 53 results: - task: name: Speech Recognition type: automatic-speech-recognition dataset: - name: fon - type: fon_dataset - args: fon metrics: - name: Test WER - type: wer - value: 14.97