mesolitica
/

wav2vec2-xls-r-300m-mixed

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "02b2d284",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4966a667",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !wget https://huggingface.co/huseinzol05/language-model-bahasa-manglish-combined/resolve/main/model.klm\n",
+    "# !pip3 install pyctcdecode==0.1.0 pypi-kenlm==0.1.20210121"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "42d8d861",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/.local/lib/python3.8/site-packages/apex/pyprof/__init__.py:5: FutureWarning: pyprof will be removed by the end of June, 2022\n",
+      "  warnings.warn(\"pyprof will be removed by the end of June, 2022\", FutureWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import transformers\n",
+    "from transformers import (\n",
+    "    HfArgumentParser,\n",
+    "    Trainer,\n",
+    "    TrainingArguments,\n",
+    "    Wav2Vec2CTCTokenizer,\n",
+    "    Wav2Vec2FeatureExtractor,\n",
+    "    Wav2Vec2ForCTC,\n",
+    "    Wav2Vec2Processor,\n",
+    "    is_apex_available,\n",
+    "    set_seed,\n",
+    "    AutoModelForCTC,\n",
+    "    TFWav2Vec2ForCTC,\n",
+    "    TFWav2Vec2PreTrainedModel,\n",
+    "    Wav2Vec2PreTrainedModel,\n",
+    ")\n",
+    "from scipy.special import log_softmax"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "0d6b421c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "060fb120",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import string\n",
+    "import json\n",
+    "\n",
+    "# Character-level CTC vocabulary: '' at id 0, then a-z, then 0-9, then the space.\n",
+    "CTC_VOCAB = [''] + list(string.ascii_lowercase + string.digits) + [' ']\n",
+    "\n",
+    "# Each symbol keeps its position in CTC_VOCAB as id; the space is stored under the '|' key.\n",
+    "vocab_dict = {\n",
+    "    ('|' if symbol == ' ' else symbol): position\n",
+    "    for position, symbol in enumerate(CTC_VOCAB)\n",
+    "}\n",
+    "# The two special tokens take the next free ids, in this order.\n",
+    "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
+    "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
+    "\n",
+    "# Persist the mapping: the tokenizer is constructed from a vocabulary file path.\n",
+    "with open(\"ctc-vocab.json\", \"w\") as vocab_file:\n",
+    "    json.dump(vocab_dict, vocab_file)\n",
+    "\n",
+    "tokenizer = Wav2Vec2CTCTokenizer(\n",
+    "    \"ctc-vocab.json\",\n",
+    "    word_delimiter_token=\"|\",\n",
+    "    pad_token=\"[PAD]\",\n",
+    "    unk_token=\"[UNK]\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c16b890f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(765, 3579, 614)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from glob import glob\n",
+    "\n",
+    "def _clip_number(path):\n",
+    "    \"\"\"Sort key: the integer file stem of a `<folder>/<number>.wav` path.\"\"\"\n",
+    "    return int(path.split('/')[1].replace('.wav', ''))\n",
+    "\n",
+    "# Numeric order (1, 2, ..., 10) instead of the lexicographic order of the file names.\n",
+    "# NOTE(review): presumably this lines the clips up with the label lists in the *-test.json files - confirm.\n",
+    "malay = sorted(glob('malay-test/*.wav'), key = _clip_number)\n",
+    "singlish = sorted(glob('singlish-test/*.wav'), key = _clip_number)\n",
+    "mandarin = sorted(glob('mandarin-test/*.wav'), key = _clip_number)\n",
+    "len(malay), len(singlish), len(mandarin)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "29568a5f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(765, 3579, 614)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def _read_json(path):\n",
+    "    \"\"\"Return the parsed content of the JSON file at `path`.\"\"\"\n",
+    "    with open(path) as handle:\n",
+    "        return json.load(handle)\n",
+    "\n",
+    "# One list of reference transcripts per test set.\n",
+    "malay_label = _read_json('malay-test.json')\n",
+    "singlish_label = _read_json('singlish-test.json')\n",
+    "mandarin_label = _read_json('mandarin-test.json')\n",
+    "\n",
+    "len(malay_label), len(singlish_label), len(mandarin_label)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "bdac1296",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('mandarin-test/460.wav', 'ting qi lai you dian xiang zai chang de na zhong'),\n",
+       " ('mandarin-test/256.wav', 'zai jia hao wu liao a'),\n",
+       " ('singlish-test/2169.wav', 'controlling our environment is important'),\n",
+       " ('mandarin-test/400.wav', 'bo fang gu zheng de ge qu'),\n",
+       " ('singlish-test/1001.wav', 'because they are the one that badly need it'),\n",
+       " ('singlish-test/4.wav',\n",
+       "  'rescuers who used what appeared to be makeshift stretchers to carry the injured'),\n",
+       " ('singlish-test/392.wav', 'i attached a mirror to my closet door'),\n",
+       " ('singlish-test/2563.wav', 'do you know the answer'),\n",
+       " ('singlish-test/799.wav',\n",
+       "  'this kind of packaging can pose a danger to animals'),\n",
+       " ('singlish-test/1165.wav',\n",
+       "  'a lot of parents ive spoken to say they dont have the luxury to do that')]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.utils import shuffle\n",
+    "\n",
+    "# Fixed seed so the shuffled order, and every index-based result derived from it, is the same on each run.\n",
+    "SHUFFLE_SEED = 42\n",
+    "\n",
+    "# Concatenate the three test sets; paths and transcripts are paired by position.\n",
+    "audio = malay + singlish + mandarin\n",
+    "labels = malay_label + singlish_label + mandarin_label\n",
+    "# Shuffle both lists with the same permutation so each clip stays next to its transcript.\n",
+    "audio, labels = shuffle(audio, labels, random_state=SHUFFLE_SEED)\n",
+    "test_set = list(zip(audio, labels))\n",
+    "test_set[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "69cb17cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import soundfile as sf\n",
+    "import numpy as np\n",
+    "\n",
+    "def norm_audio(x):\n",
+    "    return (x - x.mean()) / np.sqrt(x.var() + 1e-7)\n",
+    "\n",
+    "def sequence_1d(\n",
+    "    seq, maxlen=None, padding: str = 'post', pad_int=0, return_len=False\n",
+    "):\n",
+    "    if padding not in ['post', 'pre']:\n",
+    "        raise ValueError('padding only supported [`post`, `pre`]')\n",
+    "\n",
+    "    if not maxlen:\n",
+    "        maxlen = max([len(s) for s in seq])\n",
+    "\n",
+    "    padded_seqs, length = [], []\n",
+    "    for s in seq:\n",
+    "        if isinstance(s, np.ndarray):\n",
+    "            s = s.tolist()\n",
+    "        if padding == 'post':\n",
+    "            padded_seqs.append(s + [pad_int] * (maxlen - len(s)))\n",
+    "        if padding == 'pre':\n",
+    "            padded_seqs.append([pad_int] * (maxlen - len(s)) + s)\n",
+    "        length.append(len(s))\n",
+    "    if return_len:\n",
+    "        return np.array(padded_seqs), length\n",
+    "    return np.array(padded_seqs)\n",
+    "\n",
+    "def batching(audios):\n",
+    "    \"\"\"\n",
+    "    Read the audio files in `audios` and build one padded, normalised batch.\n",
+    "\n",
+    "    Returns `(inputs, attentions)`: `inputs` is a float32 array in which each row is\n",
+    "    normalised with the mean and variance of its unpadded samples and the padded tail\n",
+    "    is set to 0.0; `attentions` has 1 for real samples and 0 for padding.\n",
+    "    \"\"\"\n",
+    "    # sf.read returns (data, samplerate); only the samples are used here.\n",
+    "    waveforms = [sf.read(path)[0] for path in audios]\n",
+    "    padded, sample_counts = sequence_1d(waveforms, return_len=True)\n",
+    "    attentions = sequence_1d([[1] * count for count in sample_counts])\n",
+    "\n",
+    "    normalised = []\n",
+    "    for row, valid in zip(padded, attentions.sum(-1)):\n",
+    "        real_part = row[:valid]\n",
+    "        scaled = (row - real_part.mean()) / np.sqrt(real_part.var() + 1e-7)\n",
+    "        # The padding was shifted by the mean as well; reset it to exact zeros.\n",
+    "        if valid < scaled.shape[0]:\n",
+    "            scaled[valid:] = 0.0\n",
+    "        normalised.append(scaled)\n",
+    "\n",
+    "    return np.array(normalised).astype(np.float32), attentions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f97f22e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = AutoModelForCTC.from_pretrained(\n",
+    "    './wav2vec2-mixed-v3/checkpoint-55000',\n",
+    "    ctc_loss_reduction=\"mean\",\n",
+    "    pad_token_id=tokenizer.pad_token_id,\n",
+    "    vocab_size=len(tokenizer),\n",
+    ").cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "20fee479",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ = model.eval()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "51703510",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 4\n",
+    "batch_x = audio[:batch_size]\n",
+    "normed_input_values, attentions = batching(batch_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "065fce75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inference only: run the forward pass under no_grad so no autograd graph is built or kept on the GPU.\n",
+    "with torch.no_grad():\n",
+    "    o_pt = model(\n",
+    "        torch.from_numpy(normed_input_values.astype(np.float32)).cuda(),\n",
+    "        attention_mask = torch.from_numpy(attentions).cuda(),\n",
+    "    )\n",
+    "# Move the logits to the CPU and turn them into log-probabilities over the vocabulary axis.\n",
+    "o_pt = o_pt.logits.detach().cpu().numpy()\n",
+    "o_pt = log_softmax(o_pt, axis = -1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "b7851fc9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ting qi lai you dian xiang zai chang de na zhong',\n",
+       " 'zai jia hao wu liao wa',\n",
+       " 'controlling our environment is important',\n",
+       " 'bo fang gu zheng de ge qu']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pred_ids = np.argmax(o_pt, axis = -1)\n",
+    "tokenizer.batch_decode(pred_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "3efd715e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Decoder alphabet in vocab_dict key order, with the last three keys ('|', '[UNK]', '[PAD]')\n",
+    "# replaced by ' ', '?' and '_' respectively.\n",
+    "unique_vocab = list(vocab_dict)[:-3] + [' ', '?', '_']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "3024298f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyctcdecode import build_ctcdecoder\n",
+    "import kenlm\n",
+    "\n",
+    "kenlm_model = kenlm.Model('model.klm')\n",
+    "decoder = build_ctcdecoder(\n",
+    "    unique_vocab,\n",
+    "    kenlm_model,\n",
+    "    alpha=0.2,\n",
+    "    beta=1.0,\n",
+    "    ctc_token_idx=tokenizer.pad_token_id\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "6100ea60",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 ting qi lai you dian xiang zai chang de na zhong\n",
+      "1 zai jia hao wu liao wa\n",
+      "2 controlling our environment is important\n",
+      "3 bo fang gu zheng de ge qu\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Decode each clip of the smoke-test batch with the LM-backed beam search and print the best beam's text.\n",
+    "for k, clip_log_probs in enumerate(o_pt):\n",
+    "    best_beam = decoder.decode_beams(clip_log_probs, prune_history=True)[0]\n",
+    "    d_lm2, lm_state, timesteps, logit_score, lm_score = best_beam\n",
+    "    print(k, d_lm2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "4672ac73",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ting qi lai you dian xiang zai chang de na zhong',\n",
+       " 'zai jia hao wu liao a',\n",
+       " 'controlling our environment is important',\n",
+       " 'bo fang gu zheng de ge qu']"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "labels[:batch_size]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "5d47692d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_cer(actual, hyp):\n",
+    "    \"\"\"\n",
+    "    Character error rate of `hyp` against the reference `actual`, using `python-Levenshtein`.\n",
+    "\n",
+    "    Spaces are removed from both strings first; the edit distance is then divided by the\n",
+    "    number of remaining reference characters. Raises ZeroDivisionError when the reference\n",
+    "    has no non-space characters.\n",
+    "    \"\"\"\n",
+    "    import Levenshtein as Lev\n",
+    "\n",
+    "    reference_chars = actual.replace(' ', '')\n",
+    "    hypothesis_chars = hyp.replace(' ', '')\n",
+    "    edits = Lev.distance(reference_chars, hypothesis_chars)\n",
+    "    return edits / len(reference_chars)\n",
+    "\n",
+    "\n",
+    "def calculate_wer(actual, hyp):\n",
+    "    \"\"\"\n",
+    "    Word error rate of `hyp` against the reference `actual`, using `python-Levenshtein`.\n",
+    "\n",
+    "    Every distinct word is mapped to one character so that the character-level edit\n",
+    "    distance counts word edits; the distance is divided by the number of reference words.\n",
+    "    Raises ZeroDivisionError when the reference contains no words.\n",
+    "    \"\"\"\n",
+    "    import Levenshtein as Lev\n",
+    "\n",
+    "    reference_words = actual.split()\n",
+    "    hypothesis_words = hyp.split()\n",
+    "\n",
+    "    # Any one-to-one word -> character mapping gives the same distance.\n",
+    "    distinct_words = set(reference_words + hypothesis_words)\n",
+    "    symbol_for = {word: chr(code) for code, word in enumerate(distinct_words)}\n",
+    "\n",
+    "    encoded_reference = ''.join(symbol_for[word] for word in reference_words)\n",
+    "    encoded_hypothesis = ''.join(symbol_for[word] for word in hypothesis_words)\n",
+    "\n",
+    "    return Lev.distance(encoded_reference, encoded_hypothesis) / len(reference_words)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c01ea2e4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1240/1240 [04:27<00:00,  4.63it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "\n",
+    "# Per-utterance scores, in the same order as `audio` / `labels`:\n",
+    "# greedy decoding (wer, cer) and beam search with the language model (wer_lm, cer_lm).\n",
+    "wer, cer = [], []\n",
+    "wer_lm, cer_lm = [], []\n",
+    "\n",
+    "for i in tqdm(range(0, len(audio), batch_size)):\n",
+    "    torch.cuda.empty_cache()\n",
+    "\n",
+    "    batch_x = audio[i: i + batch_size]\n",
+    "    batch_y = labels[i: i + batch_size]\n",
+    "    normed_input_values, attentions = batching(batch_x)\n",
+    "    inputs = torch.from_numpy(normed_input_values.astype(np.float32)).cuda()\n",
+    "    attention_mask = torch.from_numpy(attentions).cuda()\n",
+    "    # Inference only: no_grad avoids building and holding an autograd graph for every batch.\n",
+    "    with torch.no_grad():\n",
+    "        o_pt = model(inputs, attention_mask = attention_mask)\n",
+    "    o_pt = o_pt.logits.detach().cpu().numpy()\n",
+    "    o_pt = log_softmax(o_pt, axis = -1)\n",
+    "\n",
+    "    # Greedy hypothesis for every clip of the batch.\n",
+    "    pred_ids = np.argmax(o_pt, axis = -1)\n",
+    "    pred = tokenizer.batch_decode(pred_ids)\n",
+    "    for k in range(len(o_pt)):\n",
+    "        # Best beam of the LM-backed beam search for the same clip.\n",
+    "        out = decoder.decode_beams(o_pt[k], prune_history=True)\n",
+    "        d_lm2, lm_state, timesteps, logit_score, lm_score = out[0]\n",
+    "\n",
+    "        wer.append(calculate_wer(batch_y[k], pred[k]))\n",
+    "        cer.append(calculate_cer(batch_y[k], pred[k]))\n",
+    "\n",
+    "        wer_lm.append(calculate_wer(batch_y[k], d_lm2))\n",
+    "        cer_lm.append(calculate_cer(batch_y[k], d_lm2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "6c6ce8ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.14251665517797765,\n",
+       " 0.05082346216269688,\n",
+       " 0.10380217528405207,\n",
+       " 0.042868860764264445)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(wer), np.mean(cer), np.mean(wer_lm), np.mean(cer_lm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "cf53914e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _positions_containing(paths, folder):\n",
+    "    \"\"\"Return the positions in `paths` whose entry contains the substring `folder`.\"\"\"\n",
+    "    return [position for position, path in enumerate(paths) if folder in path]\n",
+    "\n",
+    "# Positions of each language's clips inside the shuffled `audio` list (and so inside the score lists).\n",
+    "index_malay = _positions_containing(audio, 'malay-test/')\n",
+    "index_singlish = _positions_containing(audio, 'singlish-test/')\n",
+    "index_mandarin = _positions_containing(audio, 'mandarin-test/')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "b1558987",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.21723938552369926,\n",
+       " 0.05027226867066105,\n",
+       " 0.13593624603428525,\n",
+       " 0.03601546154013878)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(np.array(wer)[index_malay]), np.mean(np.array(cer)[index_malay]), np.mean(np.array(wer_lm)[index_malay]), np.mean(np.array(cer_lm)[index_malay])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "f340cde7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.1331819722523124,\n",
+       " 0.05161275767676772,\n",
+       " 0.09859626021111582,\n",
+       " 0.04419848182804781)"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(np.array(wer)[index_singlish]), np.mean(np.array(cer)[index_singlish]), np.mean(np.array(wer_lm)[index_singlish]), np.mean(np.array(cer_lm)[index_singlish])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "cbc2539f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.10382926344585862,\n",
+       " 0.04690941391603209,\n",
+       " 0.09411065398455744,\n",
+       " 0.0436573568867001)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(np.array(wer)[index_mandarin]), np.mean(np.array(cer)[index_mandarin]), np.mean(np.array(wer_lm)[index_mandarin]), np.mean(np.array(cer_lm)[index_mandarin])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "4c543d0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/.local/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:39: FutureWarning: Pass token='wav2vec2-xls-r-300m-mixed' as keyword args. From version 0.7 passing these as positional arguments will result in an error\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:79: FutureWarning: `name` and `organization` input arguments are deprecated and will be removed in v0.7. Pass `repo_id` instead.\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:596: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!\n",
+      "  warnings.warn(\n",
+      "Cloning https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed into local empty directory.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "10c88b815c83447b9d04f297f54fe1d9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload file pytorch_model.bin:   0%|          | 4.00k/1.18G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "remote: Enforcing permissions...        \n",
+      "remote: Allowed refs: all        \n",
+      "To https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed\n",
+      "   33df917..7044629  main -> main\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed/commit/7044629625df853dec50f463f6b794afe61d391f'"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='mesolitica')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "05ec385e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-06-01 09:29:07.148431: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.191068: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.192882: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.194967: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2022-06-01 09:29:07.196435: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.197071: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.197672: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.199082: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.199700: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.200318: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
+      "2022-06-01 09:29:07.201032: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n",
+      "2022-06-01 09:29:07.201159: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17325 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6\n",
+      "\n",
+      "TFWav2Vec2ForCTC has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU\n",
+      "2022-06-01 09:29:09.085113: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8100\n",
+      "2022-06-01 09:29:09.930887: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory\n",
+      "2022-06-01 09:29:10.708302: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.\n",
+      "All PyTorch model weights were used when initializing TFWav2Vec2ForCTC.\n",
+      "\n",
+      "All the weights of TFWav2Vec2ForCTC were initialized from the PyTorch model.\n",
+      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWav2Vec2ForCTC for predictions without further training.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_tf = TFWav2Vec2ForCTC.from_pretrained(\n",
+    "    './wav2vec2-mixed-v3/checkpoint-55000',\n",
+    "    ctc_loss_reduction=\"mean\",\n",
+    "    pad_token_id=tokenizer.pad_token_id,\n",
+    "    vocab_size=len(tokenizer),\n",
+    "    from_pt=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "e0f3f749",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-06-01 09:29:38.885075: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 33554432 exceeds 10% of free system memory.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f70276237419473091977e7dcd4da591",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload file tf_model.h5:   0%|          | 4.00k/1.18G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "remote: Enforcing permissions...        \n",
+      "remote: Allowed refs: all        \n",
+      "To https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed\n",
+      "   7044629..86e9f45  main -> main\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed/commit/86e9f450fa80b3f51175f04f694b35f342a6a09e'"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_tf.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='mesolitica')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "999b8b28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = Wav2Vec2CTCTokenizer(\n",
+    "    \"ctc-vocab.json\",\n",
+    "    unk_token=\"[UNK]\",\n",
+    "    pad_token=\"[PAD]\",\n",
+    "    word_delimiter_token=\"|\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "54a3285e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_extractor = Wav2Vec2FeatureExtractor(\n",
+    "    feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True\n",
+    ")\n",
+    "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "b4bf1a21",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "remote: Enforcing permissions...        \n",
+      "remote: Allowed refs: all        \n",
+      "To https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed\n",
+      "   86e9f45..adf6534  main -> main\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'https://huggingface.co/mesolitica/wav2vec2-xls-r-300m-mixed/commit/adf65347379e5902f7488753aef24d4e9d16daff'"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "processor.push_to_hub('wav2vec2-xls-r-300m-mixed', organization='mesolitica')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}