In [4]:
!pip install datasets==1.18.3
!pip install transformers
!pip install pyctcdecode
!pip install jiwer
!pip install https://github.com/kpu/kenlm/archive/master.zip
!huggingface-cli login

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Using cached https://github.com/kpu/kenlm/archive/master.zip

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.
        (Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.
        
Token: 
Login successful
Your token has been save

In [5]:
import torch
import torchaudio
from datasets import load_dataset, load_metric,  Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import re
wer = load_metric("wer")
cer = load_metric("cer")


chars_to_ignore_regex = '[\é\！\，\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\’\—\–\·]'

def load_data(dataset_id, language, split='test'):
    test_dataset = load_dataset(dataset_id, language, split=split, use_auth_token=True)
    test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16_000))
    return test_dataset

def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
    batch["sentence"] = re.sub('!', '', batch["sentence"]).lower() + " "
    batch["sentence"] = batch["sentence"].replace('\"',"").replace("&","").replace("'","").replace("(","").lower() + " "
    batch["sentence"] = batch["sentence"].replace('[',"").replace("]","").replace("\\","").replace("«","").replace("»","").replace(")","").lower() + " "
    batch["sentence"] = batch["sentence"].replace("  "," ").replace("  "," ").replace("  "," ").lower() + " "
    
    batch["speech"] = batch["audio"]["array"]
    return batch


def evaluate_with_lm(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs.to('cuda')).logits
    int_result = processor.batch_decode(logits.cpu().numpy())

    batch["pred_strings"] =  int_result.text

    del int_result
    torch.cuda.empty_cache()

    return batch

def evaluate(batch):

    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to('cuda')).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    return batch


Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [6]:
import torch
import torchaudio
from datasets import load_dataset, load_metric,  Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
import re

language="el"
model_id = 'ayameRushia/wav2vec2-large-xls-r-300m-el'
dataset_id = "mozilla-foundation/common_voice_8_0"
split="test"

processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_id,use_auth_token=True)
model = Wav2Vec2ForCTC.from_pretrained(model_id,use_auth_token=True)
model.to('cuda')

test_dataset = load_data(dataset_id, language, split)
test_dataset = test_dataset.map(speech_file_to_array_fn)

result = test_dataset.map(evaluate_with_lm, batched=True, batch_size=4)

print("WER: {:2f}".format(wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
print("CER: {:2f}".format(cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/77.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.1k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/el to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8...


Downloading:   0%|          | 0.00/639M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8. Subsequent calls will reuse this data.


0ex [00:00, ?ex/s]

  0%|          | 0/421 [00:00<?, ?ba/s]

WER: 0.207340
CER: 0.060466


Delete previous model and clear the cuda cache

In [7]:
del model
torch.cuda.empty_cache()
!nvidia-smi

Mon Feb  7 01:05:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P0    47W / 250W |   2435MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
dataset_id = "mozilla-foundation/common_voice_8_0"
language="el"
split="test"
model_id = 'ayameRushia/wav2vec2-large-xls-r-300m-el'

processor = Wav2Vec2Processor.from_pretrained(model_id,use_auth_token=True)
model = Wav2Vec2ForCTC.from_pretrained(model_id,use_auth_token=True)
model.to("cuda")

test_dataset = load_data(dataset_id, language, split)
test_dataset = test_dataset.map(speech_file_to_array_fn)

result = test_dataset.map(evaluate, batched=True, batch_size=4)

print("WER: {:2f}".format(wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
print("CER: {:2f}".format(cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

Reusing dataset common_voice (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8)
Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/8.0.0/b8bc4d453193c06a43269b46cd87f075c70f152ac963b7f28f7a2760c45ec3e8/cache-641b47a82b6cfb24.arrow


  0%|          | 0/421 [00:00<?, ?ba/s]

WER: 0.311294
CER: 0.079509
