In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor
from datasets import load_dataset, load_metric, Audio
from pyctcdecode import build_ctcdecoder
from pydub import AudioSegment
from pydub.playback import play

import numpy as np
import torch
import kenlm
import pandas as pd
import random
import soundfile as sf

In [2]:
model = AutoModelForCTC.from_pretrained(".")
processor = Wav2Vec2Processor.from_pretrained(".")

In [28]:
# model = AutoModelForCTC.from_pretrained("vitouphy/xls-r-300m-km").to('cuda')
# processor = Wav2Vec2Processor.from_pretrained("vitouphy/xls-r-300m-km")

In [3]:
common_voice_test  = (load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')
                      .remove_columns(["Unnamed: 0", "drop"])
                      .rename_column('text', 'sentence')
                      .cast_column("path", Audio(sampling_rate=16_000)).rename_column('path', 'audio'))

Using custom data configuration default-36119ec2a15afb82
Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


In [4]:
common_voice_test[0]

{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_3154_2555595821.wav',
  'array': array([ 0.00014737,  0.00016698,  0.00013704, ..., -0.00011244,
         -0.0001059 , -0.00011476], dtype=float32),
  'sampling_rate': 16000},
 'sentence': 'ការ ធ្វើ អាជីវកម្ម រ៉ែ ដំបូង នៅ កម្ពុជា'}

In [5]:
def prepare_dataset(batch):
    audio = batch["audio"]
    
    # batched output is "un-batched"
    batch["input_values"] = processor(np.array(audio["array"]), sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [6]:
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)

Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-081703c0621182da.arrow


In [9]:
i = 25

In [10]:
input_dict = processor(common_voice_test[i]["input_values"], return_tensors="pt", padding=True)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [11]:
input_dict

{'input_values': tensor([[ 2.8537e-04,  2.5043e-04,  2.7738e-04,  ..., -4.8949e-05,
         -1.1382e-04,  2.7166e-04]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}

In [12]:
input_dict = processor(common_voice_test[i]["input_values"], return_tensors="pt", padding=True)
logits = model(input_dict.input_values.to("cuda")).logits
pred_ids = torch.argmax(logits, dim=-1)[0]

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [14]:
pred_ids

tensor([ 1, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,
        72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 10, 70, 70, 70, 10, 72,
        43, 72, 72, 72, 72, 72, 72,  0,  0, 72, 72, 18, 72, 54, 72, 72, 72, 72,
        72,  0, 72, 21, 72, 49, 72, 72, 72, 72, 72, 72, 23, 70, 70, 27, 72, 46,
        72, 72, 72,  1, 72,  0,  0, 30, 72, 72, 72, 72, 25, 70, 70, 72, 72, 11,
        55, 72, 72, 72, 72,  5, 72,  0, 20, 58, 72, 72, 72,  0,  0, 16, 72, 72,
        72, 20, 70, 70, 72, 72, 16, 70, 27, 72, 72, 72, 72, 72, 45,  0,  0, 30,
        30, 70, 70, 27, 72, 43, 72, 72, 72, 72, 72, 72, 21, 72, 53, 72, 72, 72,
        27, 72,  0,  1, 72, 72, 72, 72, 25, 70, 23, 23, 48, 72, 72, 72, 72, 72,
        72,  8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,
        72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,
        72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,
        72, 72, 72, 72, 72, 72, 72, 72, 

In [15]:
print("Prediction:")
pred_ids = pred_ids[pred_ids != processor.tokenizer.pad_token_id]
print(processor.decode(pred_ids))

print("\nReference:")
print(processor.decode(common_voice_test['labels'][i]))
# print(common_voice_test_transcription[0]["sentence"].lower())

Prediction:
កញ្ញា ទេ បូព្រឹក សម្ដែង នៅ តន្ត្រី ស្រាបៀរ កម្ពុជា

Reference:
កញ្ញា ទេព បូព្រឹក្ស សម្ដែង នៅ តន្ត្រី ស្រាបៀរ កម្ពុជា
