# HuggingFace challenge - Debugger notebook
Run this notebook to verify your libraries versions, check GPU config and run a quick training

In [1]:
%%capture
!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode
!pip install datasets==1.18.1
!pip install git+https://github.com/huggingface/transformers.git
!pip install huggingface_hub==0.1
!pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install jiwer

In [2]:
import platform
import multiprocessing

import torch
import transformers
import datasets

import soundfile

## Print main infos

In [3]:
print(f"Platform: {platform.platform()}")
print(f"CPU cores: {multiprocessing.cpu_count()}")

print(f"Python version: {platform.python_version()}")

print(f"PyTorch version: {torch.__version__}")
print(f"GPU is visible: {torch.cuda.is_available()}")

print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")

print(f"soundfile version: {soundfile.__version__}")

Platform: Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic
CPU cores: 2
Python version: 3.7.12
PyTorch version: 1.10.0+cu111
GPU is visible: True
Transformers version: 4.17.0.dev0
Datasets version: 1.18.1
soundfile version: 0.10.3


## Check your GPU informations (if any)
If you launched an AI Training job with GPU resources, they should be listed below (Tesla V100s 32GB).
Driver and CUDA version 

In [4]:
!nvidia-smi

Fri Jan 28 03:31:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-crendential store but this isn't the helper defined on your machine.
You will have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal to set it as the default

git config --global credential.helper store[0m


In [6]:
%%capture
!apt install git-lfs

## Quick training run with a dummy model and data
more information on https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition

In [20]:
!wget -O run_speech_recognition_ctc.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py

--2022-01-28 03:34:16--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30347 (30K) [text/plain]
Saving to: ‘run_speech_recognition_ctc.py’


2022-01-28 03:34:16 (24.7 MB/s) - ‘run_speech_recognition_ctc.py’ saved [30347/30347]



In [21]:
# 	--learning_rate="7.5e-5" \
# 84.5

In [46]:
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="el" \
	--output_dir="./wav2vec2-large-xls-r-300m-greek" \
	--overwrite_output_dir \
	--num_train_epochs="100" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="1" \
	--learning_rate="3e-4" \
	--warmup_steps="500" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \´ \· \« \
	--save_steps="500" \
	--eval_steps="500" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="2" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub

01/28/2022 03:36:33 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0003,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_

In [None]:
!python run_speech_recognition_ctc.py \
	--dataset_name="mozilla-foundation/common_voice_7_0" \
	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
	--dataset_config_name="gn" \
	--output_dir="./wav2vec2-large-xls-r-300m-guarani" \
	--overwrite_output_dir \
	--num_train_epochs="200" \
	--per_device_train_batch_size="32" \
	--per_device_eval_batch_size="32" \
	--gradient_accumulation_steps="1" \
	--learning_rate="7e-5" \
	--warmup_steps="400" \
	--length_column_name="input_length" \
	--evaluation_strategy="steps" \
	--text_column_name="sentence" \
	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
	--save_steps="400" \
	--eval_steps="400" \
	--logging_steps="100" \
	--layerdrop="0.0" \
	--activation_dropout="0.1" \
	--save_total_limit="2" \
	--freeze_feature_encoder \
	--feat_proj_dropout="0.0" \
	--mask_time_prob="0.75" \
	--mask_time_length="10" \
	--mask_feature_prob="0.25" \
	--mask_feature_length="64" \
	--gradient_checkpointing \
	--use_auth_token \
	--fp16 \
	--group_by_length \
	--do_train --do_eval \
    --push_to_hub

01/28/2022 02:46:18 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=400,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
greater_is_better=None,
group_by_length=True,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=7e-05,
length_column_name=input_length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_l

In [None]:
# !rm -rf wav2vec2-large-xls-r-300m-bashkir

In [None]:
!ls -ltr

In [None]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         3.5T  1.2T  2.2T  34% /
tmpfs            64M     0   64M   0% /dev
tmpfs            87G     0   87G   0% /sys/fs/cgroup
tmpfs            87G     0   87G   0% /dev/shm
/dev/md0        3.5T  1.2T  2.2T  34% /etc/group
tmpfs            87G   12K   87G   1% /proc/driver/nvidia
/dev/vda1        49G  6.5G   42G  14% /usr/bin/nvidia-smi
udev             87G     0   87G   0% /dev/nvidia0
tmpfs            87G     0   87G   0% /proc/acpi
tmpfs            87G     0   87G   0% /proc/scsi
tmpfs            87G     0   87G   0% /sys/firmware


In [2]:
from datasets import load_dataset, load_metric, Audio

common_voice_train = load_dataset("mozilla-foundation/common_voice_7_0", "el", use_auth_token=True, split="train+validation")
common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "el", use_auth_token=True, split="test")

print(len(common_voice_train))

Downloading:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.7k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/el to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b...


Downloading:   0%|          | 0.00/580M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b. Subsequent calls will reuse this data.


Reusing dataset common_voice (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)


3601


In [3]:
common_voice_train

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 3601
})

In [4]:
len(common_voice_train) * 100 / 32

11253.125

In [5]:
common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

In [6]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [7]:
show_random_elements(common_voice_train.remove_columns(["path", "audio"]), num_examples=10)

Unnamed: 0,sentence
0,του Δήμου Καρύστου Ευβοίας
1,"Ο Κακομοιρίδης, ακούοντας τις φωνές, βγήκε και αυτός"
2,"και αργά, αλλά με βήμα κανονικό, ανέβηκε την ακροποταμιά"
3,Πήδηξε το Βασιλόπουλο πίσω του
4,"Μεγαλέμπορος στα ξένα, Αφέντη μου, αποκρίθηκε ο πρωτοβεστιάριος."
5,που γύρευε να μας ζημιώσει.
6,«Πολυτρανότατε Βασιλιά και ανεψιέ.
7,Ποιος κλαίει;
8,Ποιος είναι; ρώτησε από μέσα μια γυναικεία φωνή.
9,Οι κύριοι που περίμεναν τη σειρά τους τον κοίταξαν αδιάφορα


In [35]:
import re
chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\´\·\«\»]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [36]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [37]:
# start_with_ar = common_voice_train.filter(lambda example: "'" in example['sentence'])
# start_with_ar[0]

In [38]:
# start_with_ar

In [39]:
def replace_hatted_characters(batch):
#     batch["sentence"] = re.sub('[â]', 'a', batch["sentence"])
#     batch["sentence"] = re.sub('[î]', 'i', batch["sentence"])
#     batch["sentence"] = re.sub('[ô]', 'o', batch["sentence"])
#     batch["sentence"] = re.sub('[û]', 'u', batch["sentence"])
    return batch

In [40]:
common_voice_train = common_voice_train.map(replace_hatted_characters)
common_voice_test = common_voice_test.map(replace_hatted_characters)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [41]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [42]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [43]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [44]:
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 "'": 1,
 'a': 2,
 'e': 3,
 'g': 4,
 'h': 5,
 'm': 6,
 'n': 7,
 'o': 8,
 'r': 9,
 't': 10,
 'v': 11,
 '́': 12,
 'ΐ': 13,
 'ά': 14,
 'έ': 15,
 'ή': 16,
 'ί': 17,
 'α': 18,
 'β': 19,
 'γ': 20,
 'δ': 21,
 'ε': 22,
 'ζ': 23,
 'η': 24,
 'θ': 25,
 'ι': 26,
 'κ': 27,
 'λ': 28,
 'μ': 29,
 'ν': 30,
 'ξ': 31,
 'ο': 32,
 'π': 33,
 'ρ': 34,
 'ς': 35,
 'σ': 36,
 'τ': 37,
 'υ': 38,
 'φ': 39,
 'χ': 40,
 'ψ': 41,
 'ω': 42,
 'ϊ': 43,
 'ϋ': 44,
 'ό': 45,
 'ύ': 46,
 'ώ': 47}

In [45]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

import json
with open('./vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)
    
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

repo_name = "wav2vec2-large-xls-r-300m-greek"

tokenizer.save_pretrained(repo_name)

# tokenizer.push_to_hub(repo_name)

file ./config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


50


('wav2vec2-large-xls-r-300m-greek/tokenizer_config.json',
 'wav2vec2-large-xls-r-300m-greek/special_tokens_map.json',
 'wav2vec2-large-xls-r-300m-greek/vocab.json',
 'wav2vec2-large-xls-r-300m-greek/added_tokens.json')

In [None]:
!wget -O eval.py https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
!cp eval.py wav2vec2-large-xls-r-300m-georgian
!ls -ltr wav2vec2-large-xls-r-300m-georgian

--2022-01-27 16:52:27--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/research_projects/robust-speech-event/eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4738 (4.6K) [text/plain]
Saving to: ‘eval.py’


2022-01-27 16:52:27 (21.9 MB/s) - ‘eval.py’ saved [4738/4738]

total 1232584
-rw-r--r-- 1 ovh ovh        399 Jan 27 12:09 vocab.json
-rw-r--r-- 1 ovh ovh        294 Jan 27 12:09 tokenizer_config.json
-rw-r--r-- 1 ovh ovh        695 Jan 27 12:09 special_tokens_map.json
-rw-r--r-- 1 ovh ovh         23 Jan 27 12:09 added_tokens.json
drwxr-xr-x 2 ovh ovh       4096 Jan 27 16:15 checkpoint-7500
drwxr-xr-x 2 ovh ovh       4096 Jan 27 16:31 checkpoint-8000
-rw-r--r-- 1 ovh ovh        197 Jan 27 16:43 train_results.json
-rw-r--r

In [None]:
!cd wav2vec2-large-xls-r-300m-georgian; python eval.py \
    --model_id ./ --dataset mozilla-foundation/common_voice_7_0 --config ka --split test --log_outputs

Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.
	Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.


In [None]:
!cd wav2vec2-large-xls-r-300m-georgian; python eval.py \
    --model_id infinitejoy/wav2vec2-large-xls-r-300m-georgian --dataset speech-recognition-community-v2/dev_data \
    --config ka --split validation --chunk_length_s 10 --stride_length_s 1

In [None]:
# from transformers import AutoModelForCTC, Wav2Vec2Processor

# model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
# processor = Wav2Vec2Processor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")



Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [None]:
# from transformers import AutoModelForCTC, AutoProcessor
# from datasets import load_dataset

# model = AutoModelForCTC.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")
# processor = AutoProcessor.from_pretrained("infinitejoy/wav2vec2-large-xls-r-300m-bashkir")

# input_values = processor(common_voice_test[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values
# # input_values = input_values.to("cuda")

# logits = model(input_values).logits

# assert logits.shape[-1] == 32, logits.shape[-1]

Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

AssertionError: 55

In [None]:
from datasets import Audio, Dataset, load_dataset, load_metric
from transformers import AutoFeatureExtractor, pipeline

dataset = load_dataset("mozilla-foundation/common_voice_7_0", "el", use_auth_token=True, split="train+validation")

# for testing: only process the first two examples as a test
dataset = dataset.select(range(10))

repo_name = 'infinitejoy/wav2vec2-large-xls-r-300m-greek'

# load processor
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_name)
# feature_extractor = processor_with_lm.feature_extractor
sampling_rate = feature_extractor.sampling_rate

# resample audio
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

# load eval pipeline
asr = pipeline("automatic-speech-recognition", model=repo_name, feature_extractor=feature_extractor)

# map function to decode audio
def map_to_pred(batch):
    prediction = asr(
        batch["audio"]["array"])

    batch["prediction"] = prediction["text"]
    batch["target"] = batch["sentence"]
    return batch

# run inference on all examples
result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
print(result["prediction"])

result[0]['target']

Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/mozilla-foundation___common_voice/el/7.0.0/fe20cac47c166e25b1f096ab661832e3da7cf298ed4a91dcaa1343ad972d175b)


Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]