In [1]:
# !pip list

In [2]:
# !python3 -m pip install --upgrade pip

In [3]:
import numpy as np
from transformers import (
  HubertForCTC,
  Wav2Vec2Processor,
  Wav2Vec2CTCTokenizer,
  Wav2Vec2FeatureExtractor,
  TrainingArguments,
  Trainer,
)
import torch
from transformers import Wav2Vec2Processor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_metric, concatenate_datasets, load_dataset, Audio



In [4]:
@dataclass
class DataCollatorCTCWithPadding:
  processor: Wav2Vec2Processor
  padding: Union[bool, str] = True

  def __call__(
    self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
  ) -> Dict[str, torch.Tensor]:
    # split inputs and labels since they have to be of different lengths and need
    # different padding methods
    input_features = [{"input_values": feature["input_values"]} for feature in features]
    label_features = [{"input_ids": feature["labels"]} for feature in features]

    batch = self.processor.pad(
      input_features,
      padding=self.padding,
      return_tensors="pt",
    )
    with self.processor.as_target_processor():
      labels_batch = self.processor.pad(
        label_features,
        padding=self.padding,
        return_tensors="pt",
      )

    # replace padding with -100 to ignore loss correctly
    labels = labels_batch["input_ids"].masked_fill(
      labels_batch.attention_mask.ne(1), -100
    )

    batch["labels"] = labels
    return batch

In [10]:
from huggingface_hub import notebook_login, login

login("hf_token", True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [11]:
from datasets import load_dataset, load_metric, Audio

common_voice_train = load_dataset("mozilla-foundation/common_voice_17_0", "ba", split="train+validation",keep_in_memory=True, trust_remote_code=True, use_auth_token=True)
common_voice_test = load_dataset("mozilla-foundation/common_voice_17_0", "ba", split="test",keep_in_memory=True, trust_remote_code=True, use_auth_token=True)




Downloading builder script:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/432M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/240M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/228M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 14736it [00:00, 147352.76it/s][A
Reading metadata...: 29699it [00:00, 148687.16it/s][A
Reading metadata...: 44568it [00:00, 148554.45it/s][A
Reading metadata...: 59463it [00:00, 148706.38it/s][A
Reading metadata...: 74334it [00:00, 145886.45it/s][A
Reading metadata...: 88932it [00:00, 144124.52it/s][A
Reading metadata...: 103353it [00:00, 143464.01it/s][A
Reading metadata...: 119180it [00:00, 145082.51it/s][A


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 14495it [00:00, 147067.28it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 14513it [00:00, 154055.46it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 493it [00:00, 128354.55it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 7960it [00:00, 143292.85it/s]


Generating validated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 15618it [00:00, 156171.96it/s][A
Reading metadata...: 31737it [00:00, 159119.31it/s][A
Reading metadata...: 47649it [00:00, 155583.00it/s][A
Reading metadata...: 63217it [00:00, 152091.81it/s][A
Reading metadata...: 78440it [00:00, 151895.76it/s][A
Reading metadata...: 93638it [00:00, 151469.59it/s][A
Reading metadata...: 108790it [00:00, 150521.49it/s][A
Reading metadata...: 123846it [00:00, 149525.41it/s][A
Reading metadata...: 138801it [00:00, 146779.51it/s][A
Reading metadata...: 154009it [00:01, 148376.19it/s][A
Reading metadata...: 168857it [00:01, 146840.37it/s][A
Reading metadata...: 183550it [00:01, 145609.81it/s][A
Reading metadata...: 209653it [00:01, 148701.85it/s][A


In [12]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
# !pip install "soundfile>=0.12.1" -U

In [13]:
# !pip list

In [14]:
show_random_elements(common_voice_train.remove_columns(["path"]), num_examples=10)

Unnamed: 0,client_id,audio,sentence,up_votes,down_votes,age,gender,accent,locale,segment,variant
0,96f58b0274d05b4b890c7e21fab8fab819a27c284ab68b9614a5dc3c435ed553464c27d8fa81720e0e5bca6e7fd13c8d0a3f7bcb258487312d889f4a7317b1f9,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/944fc7c543869badc9d8d8062f5e8e33351fb7826c2bd639f752d45bf1d9f1b9/ba_train_2/common_voice_ba_26163821.mp3', 'array': [9.094947017729282e-13, -7.275957614183426e-12, -1.6370904631912708e-11, -1.0459189070388675e-11, -9.094947017729282e-13, 8.185452315956354e-12, 9.549694368615746e-12, 1.0913936421275139e-11, 6.366462912410498e-12, 5.4569682106375694e-12, 7.275957614183426e-12, 1.4551915228366852e-11, 1.2732925824820995e-11, 8.185452315956354e-12, 9.094947017729282e-13, 0.0, 6.366462912410498e-12, 1.546140993013978e-11, 2.637534635141492e-11, 4.001776687800884e-11, 4.18367562815547e-11, 4.001776687800884e-11, 3.728928277269006e-11, 2.546585164964199e-11, -2.7284841053187847e-12, -3.456079866737127e-11, -6.184563972055912e-11, -7.09405867382884e-11, -7.639755494892597e-11, -8.549250196665525e-11, -8.458300726488233e-11, -6.366462912410498e-11, -1.9099388737231493e-11, 4.547473508864641e-13, -9.094947017729282e-12, -3.865352482534945e-11, -5.275069270282984e-11, -4.774847184307873e-11, -2.2737367544323206e-11, -2.000888343900442e-11, -6.275513442233205e-11, -1.1186784831807017e-10, -1.0868461686186492e-10, -8.185452315956354e-12, 1.532498572487384e-10, 2.864908310584724e-10, 3.510649548843503e-10, 4.1154635255225003e-10, 5.607034836430103e-10, 7.880771590862423e-10, 9.840732673183084e-10, 9.777068044058979e-10, 6.978098099352792e-10, 3.019522409886122e-10, -1.1823431123048067e-11, -2.637534635141492e-10, -5.966285243630409e-10, -1.0741132427938282e-09, -1.6589183360338211e-09, -2.2646418074145913e-09, -2.7303030947223306e-09, -2.9849616112187505e-09, -3.1641320674680173e-09, -3.4297045203857124e-09, -3.7630343285854906e-09, -3.997683961642906e-09, -3.952663973905146e-09, -3.613422450143844e-09, -3.1768649932928383e-09, -2.890374162234366e-09, -2.8194335754960775e-09, -2.8958311304450035e-09, -3.0831870390102267e-09, -3.335117071401328e-09, -3.6325218388810754e-09, -4.101821104995906e-09, -4.895355232292786e-09, -5.923084245296195e-09, -6.874188329675235e-09, -7.451262717950158e-09, -7.58791429689154e-09, -7.504240784328431e-09, -7.523340173065662e-09, -7.736161933280528e-09, -7.905327947810292e-09, -7.670678314752877e-09, -6.804839358665049e-09, -5.420588422566652e-09, -3.939931048080325e-09, -2.7321220841258764e-09, -1.6316334949806333e-09, -1.3369572116062045e-10, 1.9681465346366167e-09, 4.1009116102941334e-09, 5.2159521146677434e-09, 4.790308594238013e-09, 3.4287950256839395e-09, 2.241904439870268e-09, 1.724401954561472e-09, 1.4679244486615062e-09, ...], 'sampling_rate': 48000}",–ë—ã–ª –º–∏–Ω–µ“£ “°—ã“ô—ã–º,2,0,,,,ba,,
1,e160b36a4c95f67d0d0f35c1f0e262e3275c4ce49a92410607bd64f89a96ccfce73003ec9b45f94c1eaf91c358b3728ddcbbe62c970c2a729b39eedc5b44d296,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/944fc7c543869badc9d8d8062f5e8e33351fb7826c2bd639f752d45bf1d9f1b9/ba_train_2/common_voice_ba_26031663.mp3', 'array': [1.9645539548134407e-24, -3.799860938915471e-24, -1.9128551665288765e-24, -4.1359030627651384e-25, -6.72084247699335e-25, -5.686866711302065e-25, -2.274746684520826e-24, -4.497794580757088e-24, -4.446095792472524e-24, -3.308722450212111e-24, 2.481541837659083e-24, -8.271806125530277e-25, -8.06501097239202e-24, -2.0679515313825692e-24, -2.5332406259436473e-24, -1.2149215246872594e-24, 1.5800442169469943e-24, 2.1196503196671334e-24, -2.6366382025127757e-24, -1.1632227364026952e-24, 2.7788098702953273e-25, -6.462348535570529e-25, 1.7771458472818954e-25, 4.937234281175884e-24, 8.271806125530277e-24, 4.226375942263126e-24, 1.173562494059608e-23, 6.2038545941477076e-24, 9.228233708794715e-24, 3.677076316739631e-24, 7.005185812558453e-24, 6.814546530759123e-24, 5.686866711302065e-25, -2.3264454728053903e-25, 5.169878828456423e-26, -5.790264287871194e-24, -7.858215819253763e-24, -4.963083675318166e-24, -8.271806125530277e-25, -4.1359030627651384e-25, -3.7223127564886245e-24, -7.444625512977249e-24, -1.6543612251060553e-24, -1.6543612251060553e-23, -1.6543612251060553e-24, 1.2407709188295415e-23, 2.895132143935597e-24, -1.809457589959748e-24, -8.788794008375919e-25, -1.0081263715490025e-24, -3.5155176033503676e-24, -5.118180040171859e-24, 1.6672859221771964e-24, 3.166550782429559e-24, -1.9128551665288765e-24, -4.7748677742196744e-24, 3.7481621506309067e-25, 1.9387045606711586e-26, 2.859589226989959e-24, 1.0766272660260501e-23, -2.1584244108805566e-24, -3.457356466530233e-24, -2.8046592644376095e-24, 7.94868869875175e-25, -1.2569267901684678e-24, -1.124448645189272e-24, 2.0097903945624344e-24, 4.9695460238537366e-24, -9.17653492051015e-25, -6.526972020926234e-25, 1.2149215246872594e-24, 2.8822074468644558e-24, -1.0662875083691372e-25, -1.8611563782443123e-24, -1.1813173123022926e-23, -1.5037481145489147e-23, -1.4359338446037715e-23, -9.65474871214237e-24, -1.6285118309637732e-24, -5.428372769879244e-25, -6.371875656072541e-24, -5.2603517079544104e-24, 3.903258515484599e-24, 8.019774532643026e-24, -3.360421238496675e-24, 2.0679515313825692e-25, 5.450990989753741e-24, 8.271806125530277e-25, -1.83530698410203e-24, -1.357093192469811e-24, 0.0, 3.748162150630907e-24, 4.872610795820179e-24, 1.7060600133906196e-24, -3.799860938915471e-24, -1.9516292577422997e-24, -1.0339757656912846e-25, 2.0679515313825692e-24, 1.8611563782443123e-24, 8.271806125530277e-24, ...], 'sampling_rate': 48000}","- –®”ô“£–≥”ô—Ä”ô–π —ç—à“ª–µ“ô-—à”©“ì”©–ª“ª”©“ô —è—Ç—ã—É“ô–∞–Ω –∞“ì–∞—Ä—ã–ø, –Ω”ô“ô–µ–≥”ô–π–µ–ø “°–∞–ª“ì–∞–Ω –±–∞—Ä–º–∞“°—Ç–∞—Ä—ã –º–µ–Ω”ô–Ω “Ø“£”ô—Å–µ–Ω —ã—à“°—ã–Ω—ã.",2,0,sixties,male_masculine,,ba,,
2,3024459e489b395b30fe5221c17d7c0c4d287cca2a69df62c78155d6167a423e173bfa262b7e1ce379b6c0a3cbd3b904b25364980cb92b5dbcfee5061f163d5c,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/c2c22acf185d9d78077a9485cc6687145cc56b7e1780652a4e8aa43aff5cd21f/ba_train_1/common_voice_ba_26226950.mp3', 'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], 'sampling_rate': 48000}",‚Äî “Æ–ª–µ–º–¥–µ –∫“Ø—Ä–≥”ô–Ω–µ–º –±–∞—Ä...,2,0,thirties,female_feminine,,ba,,
3,8861729e0f44d9e23ab916bd84784292790a58ce9d7059f6b58f6429815c16e5516a68561a95872da770523482b21141b0a5a384340604ad0f63d8f371de4df7,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/aba413718ed4fe3f7cc687eda89d7f07fad095a757c628c75c723633e666fd56/ba_train_0/common_voice_ba_26258558.mp3', 'array': [4.263256414560601e-14, -1.2079226507921703e-13, 1.7763568394002505e-14, 2.842170943040401e-14, -5.542233338928781e-13, -1.3358203432289883e-12, -2.0179413695586845e-12, -2.3021584638627246e-12, -1.7195134205394424e-12, -1.0516032489249483e-12, -6.536993168992922e-13, -8.810729923425242e-13, -1.2931877790833823e-12, -1.9610979506978765e-12, -1.9610979506978765e-12, -9.947598300641403e-13, 8.810729923425242e-13, 2.0179413695586845e-12, 2.8137492336099967e-12, 3.240074875066057e-12, 2.7995383788947947e-12, 2.1884716261411086e-12, 2.0179413695586845e-12, 1.4779288903810084e-12, 1.5631940186722204e-13, -1.3358203432289883e-12, -2.4655832930875476e-12, -3.588240815588506e-12, -4.675371201301459e-12, -5.9259264162392356e-12, -7.51754214434186e-12, -6.494360604847316e-12, -2.0534685063466895e-12, 4.206412995699793e-12, 9.350742402602918e-12, 1.1141310096718371e-11, 8.469669410260394e-12, 4.803268893738277e-12, 6.707523425575346e-12, 1.6228796084760688e-11, 2.9245939003885724e-11, 3.9193537304527126e-11, 3.929301328753354e-11, 2.744826588241267e-11, 1.0658141036401503e-11, -9.805489753489383e-13, -2.9558577807620168e-12, 5.400124791776761e-13, 1.5774048733874224e-12, -4.526157226791838e-12, -1.3805845355818747e-11, -1.5219825399981346e-11, -4.831690603168681e-13, 2.6005864128819667e-11, 4.7265302782761864e-11, 4.6213699533836916e-11, 1.956834694283316e-11, -1.8317791727895383e-11, -4.249045559845399e-11, -3.171862772433087e-11, 1.3500311979441904e-11, 6.716049938404467e-11, 9.689671287560486e-11, 8.910205906431656e-11, 5.297806637827307e-11, 1.1084466677857563e-11, -1.0544454198679887e-11, 1.6768808563938364e-12, 3.809930149145657e-11, 7.831602033547824e-11, 1.1007728062395472e-10, 1.3307044355315156e-10, 1.4998136066424195e-10, 1.5910472939140163e-10, 1.5052137314341962e-10, 1.1198153515579179e-10, 4.4508396968012676e-11, -2.9956481739645824e-11, -7.315748007385992e-11, -5.0818016461562365e-11, 3.8824055081931874e-11, 1.5234036254696548e-10, 2.233377927041147e-10, 2.0662582755903713e-10, 1.0723510968091432e-10, -1.659827830735594e-11, -8.427036846114788e-11, -5.17843545821961e-11, 5.061906449554954e-11, 1.4171064321999438e-10, 1.4918555280019064e-10, 5.88329385209363e-11, -7.281641956069507e-11, -1.5580781109747477e-10, -1.4165379980113357e-10, -6.110667527536862e-11, 1.0828671292983927e-11, 1.5148771126405336e-11, -4.480682491703192e-11, -9.902123565552756e-11, ...], 'sampling_rate': 48000}","‚Äî “∫–∞“£“ì—ã—Ä–∞—É–ª—ã“° —Ç–∞ –±–∞—Ä“ô—ã—Ä, —é“ì–∏“ª”ô —Ñ–∏–∞–ª —ã“ì—ã-–∑—ã“ì—ã“ª—ã–Ω –∏—à–µ—Ç–µ—Ä –∏–Ω–µ–º.",2,0,thirties,female_feminine,,ba,,
4,21328583ed4c54d1c02eb9acb67e2480a97a2bc0425a5aa56c8408c6ffc65bd2647b13af168e0f9484a36a69cf256984618b9dab49dac4d08d911617df135a30,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/aba413718ed4fe3f7cc687eda89d7f07fad095a757c628c75c723633e666fd56/ba_train_0/common_voice_ba_26091873.mp3', 'array': [1.0658141036401503e-14, 1.7763568394002505e-14, -1.7763568394002505e-14, -2.842170943040401e-14, -8.881784197001252e-14, -1.971756091734278e-13, -2.984279490192421e-13, -1.6342482922482304e-13, 3.197442310920451e-14, 1.2079226507921703e-13, -1.0658141036401503e-14, -2.2026824808563106e-13, -1.1013412404281553e-13, 2.899902540320909e-13, 5.364597654988756e-13, 3.232969447708456e-13, 3.197442310920451e-14, 1.0658141036401503e-14, 5.329070518200751e-15, -3.7125857943465235e-13, -7.922551503725117e-13, -6.536993168992922e-13, 1.0835776720341528e-13, 5.400124791776761e-13, 0.0, -6.217248937900877e-13, -8.171241461241152e-14, 1.1262102361797588e-12, 1.4450662888521038e-12, 6.181721801112872e-13, -7.105427357601002e-14, -5.3290705182007514e-14, -3.197442310920451e-13, -1.5862866575844237e-12, -2.5863755581667647e-12, -1.588063014423824e-12, 7.638334409421077e-13, 1.4086509736443986e-12, -5.613287612504791e-13, -2.1458390619955026e-12, -4.476419235288631e-13, 3.282707439211663e-12, 5.258016244624741e-12, 3.808509063674137e-12, 1.2541079286165768e-12, 2.1316282072803006e-13, 3.765876499528531e-13, 1.687538997430238e-14, -1.2079226507921703e-12, -2.632560835991171e-12, -3.6841640849161195e-12, -3.112177182629239e-12, -2.2737367544323206e-13, 2.5224267119483557e-12, 1.9610979506978765e-12, -3.019806626980426e-13, 8.952838470577262e-13, 5.9685589803848416e-12, 7.617018127348274e-12, 7.389644451905042e-13, -8.08242361927114e-12, -7.565503779005667e-12, 7.709388682997087e-13, 3.623767952376511e-12, -3.4852121189032914e-12, -7.315037464650231e-12, 1.815436689867056e-12, 1.1496581464598421e-11, 4.533262654149439e-12, -1.0409451078885468e-11, -6.206590796864475e-12, 2.148858868622483e-11, 4.149569576838985e-11, 2.8137492336099967e-11, -1.6413537196058314e-12, -1.1908696251339279e-11, 2.0268231537556858e-12, 1.1610268302320037e-11, 1.5916157281026244e-12, -9.00435281891987e-12, -1.6484591469634324e-12, 9.386269539390923e-12, -7.105427357601002e-15, -2.2637891561316792e-11, -2.7490898446558276e-11, -4.192202140984591e-12, 2.220446049250313e-11, 2.559730205575761e-11, 1.0189182830799837e-11, -2.0037305148434825e-12, -1.4495071809506044e-12, 2.618349981275969e-12, -1.6413537196058314e-12, -1.6218137943724287e-11, -3.108269197582558e-11, -3.000977244482783e-11, -8.903100479074055e-12, 1.1230127938688383e-11, 6.778577699151356e-12, -8.881784197001252e-12, ...], 'sampling_rate': 48000}","“†–∞—Ç—ã–Ω—ã“£ —è–º–∞–Ω –±—É–ª“ª–∞, “ª–∞—Ä“ì–∞–π—ã–ø –∫–∏–±–µ—Ä“ª–µ“£.",2,0,twenties,female_feminine,,ba,,
5,e160b36a4c95f67d0d0f35c1f0e262e3275c4ce49a92410607bd64f89a96ccfce73003ec9b45f94c1eaf91c358b3728ddcbbe62c970c2a729b39eedc5b44d296,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/944fc7c543869badc9d8d8062f5e8e33351fb7826c2bd639f752d45bf1d9f1b9/ba_train_2/common_voice_ba_25876729.mp3', 'array': [-1.990403348955723e-24, 2.68833699079734e-24, -4.1359030627651384e-25, 3.9291079096268815e-24, -3.101927297073854e-25, -4.342698215903395e-24, -2.042102137240287e-24, 1.34416849539867e-24, 1.7512964531396133e-24, 1.809457589959748e-24, 7.237830359838992e-25, -2.0679515313825692e-25, -1.2795450100429647e-24, -3.7223127564886245e-24, -2.3781442610899546e-24, 7.302453845194697e-25, 2.4492300949812304e-24, 4.426708746865812e-25, 1.6414365280349143e-24, -1.5057272087879332e-24, -1.1002148381808825e-24, -3.2440989648564054e-24, 1.34416849539867e-24, 2.235972593307403e-24, 2.1842738050228387e-24, 1.6802106192483375e-24, -2.0808762284537102e-24, -2.055026834311428e-24, -8.077935669463161e-25, -5.62224322594636e-25, -1.6026624368214911e-24, 2.6495628995839168e-24, 1.7642211502107543e-24, -2.9985297205047253e-24, -1.5509636485369269e-24, 2.274746684520826e-24, -2.0679515313825692e-24, -1.2407709188295415e-24, -2.481541837659083e-24, 2.68833699079734e-24, -2.714186384939622e-25, 2.0679515313825692e-25, -4.1359030627651384e-25, -4.963083675318166e-24, -6.462348535570529e-26, 8.271806125530277e-24, 3.308722450212111e-24, 1.2924697071141057e-25, -2.0679515313825692e-24, 3.101927297073854e-25, 4.3943970041879595e-25, -3.2053248736429822e-24, 2.5849394142282115e-25, 3.2570236619275465e-24, 2.3522948669476725e-24, 7.884065213396045e-25, -1.783608195817466e-24, -9.822769774067204e-25, 2.946830932220161e-24, 4.1746771539785615e-24, -2.5849394142282115e-25, -6.898557061721539e-25, -2.0679515313825692e-24, -3.101927297073854e-25, -1.822382287030889e-24, -1.932242212135588e-24, 1.4217166778255163e-25, 2.772347521759757e-24, 2.959755629291302e-24, 2.1196503196671334e-24, 3.1172753748458338e-24, 1.550963648536927e-25, -1.4604907690389395e-24, -1.6349741794993438e-24, -3.360421238496675e-24, -1.770683498746325e-24, 7.754818242684634e-26, 7.754818242684634e-24, 5.906586561511463e-24, -1.0469004627624257e-24, -1.9645539548134407e-24, -1.4411037234322279e-24, 2.5849394142282115e-26, 2.895132143935597e-24, 3.0986961228060685e-24, 8.013312184107456e-25, -7.754818242684634e-25, -8.659547037664508e-25, 1.8611563782443123e-24, 7.237830359838992e-25, -8.271806125530277e-25, 2.5849394142282115e-25, 5.945360652724886e-25, -2.4039936552322367e-24, -2.895132143935597e-24, 1.2407709188295415e-24, -1.2924697071141057e-24, -1.2407709188295415e-24, -3.308722450212111e-24, 0.0, ...], 'sampling_rate': 48000}",- ”ò–ø—Ç–µ–ª”ô—Ö”ô—Ç –∫“Ø“ô“ô”ô—Ä–µ–Ω –∞—Å—Ç—ã –ª–∞ —Ç“Ø—à”ô–º–≥”ô “°–∞—Ä–∞–ø —è—Ç–∞ –±–∏—Ä“ô–µ.,2,0,sixties,male_masculine,,ba,,
6,4b7938fa6ff8aae5574194abc152b8905e72203d160a8920ab44c550eabfd9baae1932cd3170fd503efe5fd16051c568b90f201f290aba4245072e9ab8dd3dc5,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/e024c6558d9897612fd8f5a6b3c29f7c888176fe04022cbd549f5518312bfd53/ba_dev_0/common_voice_ba_26054132.mp3', 'array': [6.815758221109542e-28, -6.058451752097371e-28, -3.0292258760486853e-28, -1.2116903504194741e-27, 1.5146129380243427e-28, -6.310887241768094e-28, -1.2874209973206913e-27, -2.0952145642670074e-27, -7.573064690121713e-28, -7.068193710780266e-28, 6.058451752097371e-28, 1.5146129380243427e-28, -2.0194839173657902e-28, -1.1107161545511846e-27, -1.6408306828597046e-28, -5.679798517591285e-29, -2.4296915880807164e-28, 5.806016262426647e-28, 7.9201634884189585e-28, 6.68954047627418e-28, 2.9030081312133234e-28, 1.3142422680982057e-27, 2.865142807762715e-27, 1.5777218104420236e-27, -2.477023242393977e-28, 1.4956802762990384e-27, 2.5180440094654697e-27, 2.1125695041818696e-27, 7.573064690121713e-28, 2.524354896707238e-29, 3.281661365719409e-28, 7.573064690121713e-28, 6.310887241768094e-28, 6.058451752097371e-28, -1.0097419586828951e-27, 0.0, -1.0097419586828951e-27, -1.4136387421560532e-27, -1.4136387421560532e-27, -1.4136387421560532e-27, -1.8175355256292112e-27, -4.0389678347315804e-28, -2.4233807008389483e-27, -1.6155871338926322e-27, -1.2116903504194741e-27, 2.0194839173657902e-28, -1.2116903504194741e-27, -3.0292258760486853e-28, 3.0292258760486853e-28, -3.0292258760486853e-28, -2.1078363387505435e-27, -1.7039395552773855e-27, -6.437104986603456e-28, 3.34477023813709e-28, -1.1675141397270975e-27, -1.0196027199981578e-27, 2.524354896707238e-29, -2.524354896707238e-28, -7.667727998748235e-28, -4.165185579566942e-28, -7.9201634884189585e-28, -8.456588903969247e-28, -1.6218980211344003e-27, -8.993014319519535e-29, 1.1296488162764889e-27, -3.7865323450608567e-28, -1.606909663935201e-27, 3.7234234726431757e-28, -6.500213859021137e-28, -7.809722961688017e-28, -8.330371159133885e-28, 1.4515040656066617e-28, -3.9758589623138995e-28, -2.0194839173657902e-28, -1.9753077066734136e-27, -1.4136387421560532e-27, -2.865142807762715e-27, -2.328717392212427e-27, -1.2495556738700827e-27, 2.398137151871876e-28, -8.393480031551566e-28, -1.0349855076499675e-27, 5.427363027920561e-28, 6.184669496932733e-28, 1.2116903504194741e-27, -5.301145283085199e-28, 1.539856486991415e-27, 5.806016262426647e-28, 3.9127500898962186e-28, 8.077935669463161e-28, 1.792291976662139e-27, -1.135959703518257e-28, 1.2621774483536189e-27, 1.666074231826777e-27, 1.5651000359584874e-27, 6.058451752097371e-28, 1.7670484276950664e-27, 1.6155871338926322e-27, 1.6155871338926322e-27, 8.077935669463161e-28, ...], 'sampling_rate': 48000}","–§”ô“°–∏—Ä–ª–µ–∫ “ì”ô–π–µ–ø —Ç“Ø–≥–µ–ª, —è–ª“°–∞—É–ª—ã“° “ì”ô–π–µ–ø.",2,0,twenties,male_masculine,,ba,,
7,8535f9f4ecb07a3756bb1d71743cc8bb29c29fb868c41b32d3d67929aae8f2d34a481cd8340cd267ecd9d156b1473a60ac1d46b8e1e9080b01a09ce6fea8e374,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/c2c22acf185d9d78077a9485cc6687145cc56b7e1780652a4e8aa43aff5cd21f/ba_train_1/common_voice_ba_26190836.mp3', 'array': [1.1368683772161603e-13, 4.149569576838985e-12, 6.707523425575346e-12, 5.059064278611913e-12, 3.694822225952521e-12, 4.547473508864641e-12, 3.751665644813329e-12, 2.7284841053187847e-12, 1.8189894035458565e-12, 7.673861546209082e-13, -8.526512829121202e-13, -9.094947017729282e-13, -1.9042545318370685e-12, -4.263256414560601e-12, -4.888534022029489e-12, -5.7980287238024175e-12, -8.58335624798201e-12, -8.810729923425242e-12, -5.4569682106375694e-12, -6.252776074688882e-13, 5.9117155615240335e-12, 1.2732925824820995e-11, 1.3528733688872308e-11, 8.753886504564434e-12, 1.1368683772161603e-13, -9.094947017729282e-12, -1.7280399333685637e-11, -1.878674993349705e-11, -1.0686562745831907e-11, 3.183231456205249e-12, 1.5802470443304628e-11, 2.3703705664956942e-11, 2.8819613362429664e-11, 3.12070369545836e-11, 2.9331204132176936e-11, 2.2680524125462398e-11, 1.5688783605583012e-11, 1.0913936421275139e-11, 5.7980287238024175e-12, -7.958078640513122e-12, -3.2514435588382185e-11, -5.843503458891064e-11, -6.917844075360335e-11, -7.102585186657961e-11, -8.495248948747758e-11, -1.1465317584224977e-10, -1.4443912732531317e-10, -1.581099695613375e-10, -1.4358647604240105e-10, -9.515588317299262e-11, -1.91562321560923e-11, 6.650680006714538e-11, 1.2937562132719904e-10, 1.2812506611226127e-10, 4.6497916628140956e-11, -7.821654435247183e-11, -1.8451373762218282e-10, -2.276010491186753e-10, -2.0406787371030077e-10, -1.475655153626576e-10, -9.947598300641403e-11, -7.844391802791506e-11, -8.662937034387141e-11, -1.1436895874794573e-10, -1.3147882782504894e-10, -1.1590373105718754e-10, -7.992184691829607e-11, -4.4565240386873484e-11, -2.5920599000528455e-11, -3.956301952712238e-11, -7.742073648842052e-11, -1.007265382213518e-10, -7.707967597525567e-11, 1.8758328224066645e-12, 1.0203393685515039e-10, 1.4983925211708993e-10, 8.799361239653081e-11, -4.922640073345974e-11, -1.7757884052116424e-10, -2.362412487855181e-10, -2.15550244320184e-10, -1.4506440493278205e-10, -8.29913915367797e-11, -8.174083632184193e-11, -1.8309265215066262e-10, -4.0688519220566377e-10, -6.920117812114768e-10, -8.89428974915063e-10, -8.675442586536519e-10, -6.17319528828375e-10, -2.710294211283326e-10, -3.956301952712238e-11, -5.275069270282984e-11, -2.53749021794647e-10, -5.029505700804293e-10, -7.360085874097422e-10, -9.46556610870175e-10, -1.1033307600882836e-09, -1.163925844593905e-09, -1.136641003540717e-09, ...], 'sampling_rate': 48000}",”®—Å-–¥“Ø—Ä—Ç –∫”©–Ω–¥”ô–Ω —è—Ä“ô–∞–º –∫–∏–ª–µ–ø –µ—Ç–µ—Ä–≥”ô —Ç–µ–π–µ—à.,2,0,,,,ba,,
8,9d5f6665597f4ddc6a310d5323b128cc1336a83a3ae499c21039f1c634bdf6ab06e029e7d7ded7462137a61433af2cbc70d34e5ef86cb0cfc01d08a1588993a1,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/944fc7c543869badc9d8d8062f5e8e33351fb7826c2bd639f752d45bf1d9f1b9/ba_train_2/common_voice_ba_26096270.mp3', 'array': [-3.680953725860973e-23, 1.075334796318936e-23, 2.481541837659083e-23, 4.9217246446905147e-23, 1.3855275260263214e-23, -3.639594695233322e-23, -2.936491174563248e-23, 1.613002194478404e-23, -2.68833699079734e-23, -4.963083675318166e-24, -1.6543612251060553e-23, 3.970466940254533e-23, 1.7887780746459223e-23, -2.150669592637872e-23, -4.880365614062863e-23, -1.34416849539867e-23, 2.2333876538931747e-23, 2.9158116592494226e-23, -9.822769774067204e-25, -1.8301371052735737e-23, -1.5509636485369269e-24, -2.016252743098005e-23, -4.8596860987490376e-23, 3.5361971186641933e-23, 2.3290304122196185e-23, -6.2038545941477076e-24, 1.3648480107124957e-23, -1.7577588016751838e-24, 3.019209235818551e-23, 2.088631046696395e-23, 1.6543612251060553e-24, 2.0679515313825692e-23, -2.4608623223452573e-23, -4.963083675318166e-23, 1.819797347616661e-23, 4.1359030627651384e-24, -1.9852334701272664e-23, 1.6543612251060553e-23, -2.481541837659083e-23, -3.680953725860973e-23, -5.790264287871194e-23, 1.3234889800848443e-23, -2.6469779601696886e-23, -5.293955920339377e-23, 9.926167350636332e-24, 3.970466940254533e-23, -5.955700410381799e-23, -2.285086442177739e-23, -1.3648480107124957e-23, 0.0, 1.406207041340147e-23, -1.111523948118131e-23, 3.4638188150658034e-23, 2.212708138579349e-23, -1.9102702271146483e-23, -2.750375536738817e-23, -2.2023683809224362e-23, 1.760343741089412e-23, 3.474158572722716e-23, 6.162495563520056e-23, 3.618915179919496e-24, 1.8404768629304866e-23, 1.002956492720546e-23, -2.223047896236262e-23, -5.997059441009451e-24, -8.478601278668534e-24, -8.375203702099405e-24, 1.075334796318936e-23, -1.7784383169890095e-23, -2.750375536738817e-23, 2.6056189295420372e-23, 2.90547190159251e-23, -1.2407709188295415e-24, -4.487454823100175e-23, -9.952016744778614e-23, -7.982292911136717e-23, -1.406207041340147e-23, -8.271806125530277e-25, 4.880365614062863e-23, -3.908428394313056e-23, -3.288042934898285e-23, -1.5302841332231012e-23, 1.675040740419881e-23, 6.493367808541267e-23, 3.1846453583291565e-23, 7.651420666115506e-24, -9.512577044359818e-24, -4.1359030627651384e-24, 2.6469779601696886e-23, 2.9778502051908996e-23, -3.970466940254533e-23, -3.308722450212111e-24, 8.271806125530277e-24, 1.9025154088719637e-23, -6.617444900424222e-24, 1.3234889800848443e-23, -5.790264287871194e-24, 3.3087224502121107e-23, 1.3234889800848443e-23, 1.9852334701272664e-23, ...], 'sampling_rate': 48000}",–¢—É–π—ã“ì—ã“ô “°–æ—Ç–ª–æ –±—É–ª“ª—ã–Ω!,2,0,,,,ba,,
9,2708e2fb869bc54793cf48dd35de1f5c15a613e160eb915e0d6508145f03d906e467db8b6933ce3e028ee9efc39e5bbf70011ad653be23ad1c483e913a4b3c23,"{'path': '/root/.cache/huggingface/datasets/downloads/extracted/aba413718ed4fe3f7cc687eda89d7f07fad095a757c628c75c723633e666fd56/ba_train_0/common_voice_ba_25938224.mp3', 'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], 'sampling_rate': 48000}","–ô”ô—à, “ª—ã–ª—ã—É “°–∞—Ç—ã–Ω—ã–Ω –∫”©–Ω–ª”ô—à–µ“Ø —Ç–æ–π“ì–æ“ª–æ “°–∞–ø—ã–ª —Ç—ã–Ω—ã–Ω “°—É—Ä“ª–∞, —à–∞“°—à—ã —Ö–∏—Å—Ç”ô—Ä“ô–µ “Ø“ô–µ–Ω”ô–Ω —Ç–∏“ô–µ—Ä”ô–∫ “°—ã—É—ã—Ä“ì–∞ —Ç—ã—Ä—ã—à—Ç—ã.",2,0,twenties,female_feminine,,ba,,


In [15]:
import re
chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\‚Äú\%\‚Äò\‚Äù\ÔøΩ\'\¬´\¬ª\‚Äì\‚Äîaijno]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, ' ', batch["sentence"]).lower()
    return batch

In [16]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

Map:   0%|          | 0/133675 [00:00<?, ? examples/s]

Map:   0%|          | 0/14513 [00:00<?, ? examples/s]

In [17]:
import re
chars_to_remove_regex = {'–∏ÃÜ':'–π','i':''}

def replace_bad_characters(batch):
    for k in chars_to_remove_regex:
        batch["sentence"] = re.sub(k, chars_to_remove_regex[k], batch["sentence"]).lower()
    return batch

In [18]:
common_voice_train = common_voice_train.map(replace_bad_characters)
common_voice_test = common_voice_test.map(replace_bad_characters)

Map:   0%|          | 0/133675 [00:00<?, ? examples/s]

Map:   0%|          | 0/14513 [00:00<?, ? examples/s]

In [19]:
# show_random_elements(common_voice_train.remove_columns(["path"]))

In [20]:
def replace_hatted_characters(batch):
    batch["sentence"] = re.sub('[—è]', '–π–∞', batch["sentence"])
    batch["sentence"] = re.sub('[—é]', '–π—É', batch["sentence"])
    batch["sentence"] = re.sub('[—ë]', '–π–æ', batch["sentence"])
    batch["sentence"] = re.sub('[—ä]', '', batch["sentence"])
    batch["sentence"] = re.sub('[—å]', '', batch["sentence"])
    if '–µ' in batch["sentence"]:
        words=batch["sentence"].split(' ')
        new_list=[]
        for word in words:
            if len(word)==0:
                continue
            new_word=word
            if word[0]=='–µ':
                new_word='–π—ç'+word[1:]
            new_word=re.sub('[–µ]', '—ç', new_word)
            new_list.append(new_word)

        batch["sentence"] = " ".join(new_list)
        
    words=batch["sentence"].split(' ')
    while '' in words:
        words.remove('')
    
    batch["sentence"]=" ".join(words)
    
    return batch

In [21]:
common_voice_train = common_voice_train.map(replace_hatted_characters)
common_voice_test = common_voice_test.map(replace_hatted_characters)

Map:   0%|          | 0/133675 [00:00<?, ? examples/s]

Map:   0%|          | 0/14513 [00:00<?, ? examples/s]

In [22]:
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

In [23]:
common_voice_train.column_names

['client_id',
 'path',
 'audio',
 'sentence',
 'up_votes',
 'down_votes',
 'age',
 'gender',
 'accent',
 'locale',
 'segment',
 'variant']

In [24]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/133675 [00:00<?, ? examples/s]

Map:   0%|          | 0/14513 [00:00<?, ? examples/s]

In [25]:
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

In [26]:
# import IPython.display as ipd
# import numpy as np
# import random

# rand_int = random.randint(0, len(common_voice_train)-1)

# print(common_voice_train[rand_int]["sentence"])
# ipd.Audio(data=common_voice_train[rand_int]["audio"]["array"], autoplay=True, rate=16000)

In [27]:
# rand_int = random.randint(0, len(common_voice_train)-1)

# print("Target text:", common_voice_train[rand_int]["sentence"])
# print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
# print("Sampling rate:", common_voice_train[rand_int]["audio"]["sampling_rate"])

In [28]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [29]:
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 '–∞': 1,
 '–±': 2,
 '–≤': 3,
 '–≥': 4,
 '–¥': 5,
 '–∂': 6,
 '–∑': 7,
 '–∏': 8,
 '–π': 9,
 '–∫': 10,
 '–ª': 11,
 '–º': 12,
 '–Ω': 13,
 '–æ': 14,
 '–ø': 15,
 '—Ä': 16,
 '—Å': 17,
 '—Ç': 18,
 '—É': 19,
 '—Ñ': 20,
 '—Ö': 21,
 '—Ü': 22,
 '—á': 23,
 '—à': 24,
 '—â': 25,
 '—ã': 26,
 '—ç': 27,
 '“ì': 28,
 '“ô': 29,
 '“°': 30,
 '“£': 31,
 '“´': 32,
 '“Ø': 33,
 '“ª': 34,
 '”ô': 35,
 '”©': 36}

In [30]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [31]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

39

In [32]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [33]:
model_id = "utter-project/mHuBERT-147"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

model = HubertForCTC.from_pretrained(
  model_id,
  attention_dropout=0.1,
  hidden_dropout=0.1,
  feat_proj_dropout=0.1,
  mask_time_prob=0.05,
  layerdrop=0.1,
  final_dropout=0.3,
  ctc_loss_reduction="mean",
  pad_token_id=processor.tokenizer.pad_token_id,
  vocab_size=len(processor.tokenizer),
)

model.freeze_feature_encoder()

preprocessor_config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForCTC were not initialized from the model checkpoint at utter-project/mHuBERT-147 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [35]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, num_proc=1)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, num_proc=1)

Map:   0%|          | 0/133675 [00:00<?, ? examples/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [36]:
# from datasets import Dataset
# import pickle
# import os

# def prepare_item(item):
#     try:
#         audio = item["audio"]
        
#         # Process audio
#         input_values = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
#         input_length = len(input_values)
        
#         # Process text
#         with processor.as_target_processor():
#             labels = processor(item["sentence"]).input_ids
        
#         return {
#             "input_values": input_values,
#             "input_length": input_length,
#             "labels": labels
#         }
#     except Exception as e:
#         print(f"Error processing item: {e}")
#         return None

# def process_dataset(dataset):
#     processed_data = []
#     ind=0
#     for item in dataset:
#         ind+=1
#         if ind%1000==0:
#             print(ind)
#         processed_item = prepare_item(item)
#         if processed_item is not None:
#             processed_data.append(processed_item)
    
#     return Dataset.from_list(processed_data)

# def save_to_pickle(prefix, data, index):
#     filename = f"processed_dataset/{prefix}_{index}.pkl"
#     with open(filename, 'wb') as f:
#         pickle.dump(data, f)
#     print(f"Saved {len(data)} items to {filename}")

# def process_dataset(prefix, dataset):
#     processed_data = []
#     total_processed = 0
#     for index, item in enumerate(dataset, start=1):
#         if index % 1000 == 0:
#             print(f"Processed {index} items")
        
#         processed_item = prepare_item(item)
#         if processed_item is not None:
#             processed_data.append(processed_item)
#             total_processed += 1
        
#         if len(processed_data) == 1000:
#             save_to_pickle(prefix, processed_data, total_processed // 1000)
#             processed_data = []  # Clear the list after saving
    
#     # Save any remaining items
#     if processed_data:
#         save_to_pickle(prefix, processed_data, (total_processed // 1000) + 1)
    
#     print(f"Total processed items: {total_processed}")
#     return total_processed

# process_dataset("test", common_voice_test)
# process_dataset("train", common_voice_train)
# # # Process the dataset
# # common_voice_test = process_dataset(common_voice_test)
# # common_voice_train = process_dataset(common_voice_train)

In [37]:
# def load_processed_dataset(prefix, directory="processed_dataset"):
#     all_data = []
#     for filename in sorted(os.listdir(directory)):
#         if filename.startswith(f"{prefix}_") and filename.endswith(".pkl"):
#             with open(os.path.join(directory, filename), 'rb') as f:
#                 all_data.extend(pickle.load(f))
#     return Dataset.from_list(all_data)

# # Load test and train datasets
# common_voice_test = load_processed_dataset("test")


In [38]:
# common_voice_train = load_processed_dataset("train")

In [39]:
wer_metric = load_metric("wer", trust_remote_code=True)

def compute_metrics(pred):
  pred_logits = pred.predictions
  pred_ids = np.argmax(pred_logits, axis=-1)
  pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
  pred_str = processor.batch_decode(pred_ids)
  # we do not want to group tokens when computing the metrics
  label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
  wer = wer_metric.compute(predictions=pred_str, references=label_str)
  return {"wer": wer}

  wer_metric = load_metric("wer", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [43]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

training_args = TrainingArguments(
  output_dir="training",
  report_to="tensorboard",
  per_device_train_batch_size=32,
  per_device_eval_batch_size=4,
  eval_strategy="steps",
  num_train_epochs=60,
  fp16=False,
  save_steps=1000,
  eval_steps=1000,
  logging_steps=100,
  learning_rate=1e-5,
  adam_beta1=0.9,
  adam_beta2=0.98,
  adam_epsilon=1e-08,
  warmup_ratio=0.1,
  save_total_limit=5,
  push_to_hub=False,
  load_best_model_at_end=True,
)

trainer = Trainer(
  model=model,
  data_collator=data_collator,
  args=training_args,
  compute_metrics=compute_metrics,
  train_dataset=common_voice_train,
  eval_dataset=common_voice_test,
  tokenizer=processor.feature_extractor,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Wer
1000,51.2746,38.144913,1.003135
2000,31.0695,22.275505,1.0
3000,25.9185,18.99543,1.0
4000,22.7995,16.684248,1.0
5000,18.799,13.793333,1.0
6000,14.7246,10.601075,1.0
7000,10.0615,7.508481,1.0
8000,6.4954,5.195969,1.0
9000,4.3991,3.93689,1.0
10000,3.5461,3.437078,1.0


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [46]:
trainer.push_to_hub()

events.out.tfevents.1720638005.sciencebrick.3579461.1:   0%|          | 0.00/625k [00:00<?, ?B/s]

events.out.tfevents.1720637180.sciencebrick.3579461.0:   0%|          | 0.00/6.25k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/AigizK/training/commit/9515ed057312efdbd933737b5bb3d04ab16a7a69', commit_message='End of training', commit_description='', oid='9515ed057312efdbd933737b5bb3d04ab16a7a69', pr_url=None, pr_revision=None, pr_num=None)

In [47]:
tokenizer.push_to_hub("AigizK/mHuBERT-147-bashkort")

README.md:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AigizK/mHuBERT-147-bashkort/commit/e6f2ad28f7a01efaea1dbd21697268fea5771e32', commit_message='Upload tokenizer', commit_description='', oid='e6f2ad28f7a01efaea1dbd21697268fea5771e32', pr_url=None, pr_revision=None, pr_num=None)