marinone94
/

xls-r-300m-sv-robust

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c9526c52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "from datasets import DatasetDict, load_dataset, load_metric"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "663ff92e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "cc9f1c45",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n",
+    "dataset_config_name = \"sv-SE\"\n",
+    "train_split_name = \"train+validation\"\n",
+    "use_auth_token = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "21fd7030",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_datasets = DatasetDict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "81a27912",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
+     ]
+    }
+   ],
+   "source": [
+    "raw_datasets[\"train\"] = load_dataset(\n",
+    "    dataset_name,\n",
+    "    dataset_config_name,\n",
+    "    split=train_split_name,\n",
+    "    use_auth_token=use_auth_token,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "7945cada",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
+     ]
+    }
+   ],
+   "source": [
+    "raw_datasets[\"test\"] = load_dataset(\n",
+    "    dataset_name,\n",
+    "    dataset_config_name,\n",
+    "    split=\"test\",\n",
+    "    use_auth_token=use_auth_token,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "c98cb649",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_data = raw_datasets[\"train\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "1aead6a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_data = raw_datasets[\"test\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "97e9a626",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
+       "    num_rows: 11030\n",
+       "})"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "training_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "fc794e39",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
+       "    num_rows: 4620\n",
+       "})"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "31b328fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map each speaker (client_id) to the clip paths they recorded in the training split.\n",
+    "train_speakers_dict = {}\n",
+    "for record in training_data:\n",
+    "    # setdefault creates the per-speaker list on first sight, so no bare except is needed\n",
+    "    # and the entries land in train_speakers_dict rather than an unrelated leftover name.\n",
+    "    train_speakers_dict.setdefault(record[\"client_id\"], []).append(record[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "7eba5861",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Report the number of distinct speakers, not the length of the formatted message.\n",
+    "print(f\"Speakers in training set: {len(train_speakers_dict)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "17905c39",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map each speaker (client_id) to the clip paths they recorded in the test split.\n",
+    "test_speakers_dict = {}\n",
+    "for record in test_data:\n",
+    "    # setdefault creates the per-speaker list on first sight, so no bare except is needed\n",
+    "    # and the entries land in test_speakers_dict rather than an unrelated leftover name.\n",
+    "    test_speakers_dict.setdefault(record[\"client_id\"], []).append(record[\"path\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "25a25454",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "24"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Report the number of distinct speakers, not the length of the formatted message.\n",
+    "print(f\"Speakers in test set: {len(test_speakers_dict)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "f72bdb7a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Speakers in both training and test sets: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Speakers present in both splits: intersect the key views of the two speaker maps.\n",
+    "c = len(test_speakers_dict.keys() & train_speakers_dict.keys())\n",
+    "print(f\"Speakers in both training and test sets: {c}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "ed6bc20b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Raw string: the backslashes inside the character class are regex syntax, not Python string escapes.\n",
+    "chars_to_ignore_regex = r'[,?.!\\-\\;\\:\"“%‘”�—’…–]'\n",
+    "\n",
+    "\n",
+    "def clean_text(text):\n",
+    "    \"\"\"Lowercase `text` and remove every character matched by `chars_to_ignore_regex`.\"\"\"\n",
+    "    return re.sub(chars_to_ignore_regex, \"\", text.lower())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "16b289be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Avg tokens training data: 7.243336355394379\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Total whitespace-separated tokens over the cleaned training transcripts.\n",
+    "num_tokens_train = sum(\n",
+    "    len(clean_text(record[\"sentence\"]).split()) for record in training_data\n",
+    ")\n",
+    "avg_tokens_train = num_tokens_train / training_data.num_rows\n",
+    "print(f\"Avg tokens training data: {avg_tokens_train}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "364aff29",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Avg tokens training data: 7.074891774891775\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Average number of whitespace-separated tokens per cleaned test transcript.\n",
+    "num_tokens_test = 0\n",
+    "for record in test_data:\n",
+    "    num_tokens_test += len(clean_text(record[\"sentence\"]).split())\n",
+    "avg_tokens_test = num_tokens_test / test_data.num_rows\n",
+    "# The label names the test split; it previously repeated the training-data wording.\n",
+    "print(f\"Avg tokens test data: {avg_tokens_test}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

run_speech_recognition_ctc.py CHANGED Viewed

@@ -43,7 +43,6 @@ from transformers import (
     Trainer,
     TrainingArguments,
     Wav2Vec2Processor,
-    Wav2Vec2ProcessorWithLM,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process

     Trainer,
     TrainingArguments,
     Wav2Vec2Processor,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process

train_n_gram_lm_with_KenLM.ipynb CHANGED Viewed

@@ -1,2262 +1,301 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Train n-gram language model with KenLM on Colab"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "PtkgQE7--Ufg"
-   },
-   "source": [
-    "See https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Boosting_Wav2Vec2_with_n_grams_in_Transformers.ipynb#scrollTo=X9qg4FPt2zi8 for detailed explanation on how to use KenLM to boost wav2vec2 fine-tuned models on 🤗"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "VBCqCboC6Soc"
-   },
-   "source": [
-    "Install KenLM"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "-CKLr9bI6GPE",
-    "outputId": "0c6d917e-4896-4e35-c92f-4b085f77c893"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The operation couldn’t be completed. Unable to locate a Java Runtime that supports apt.\r\n",
-      "Please visit http://www.java.com for information on installing Java.\r\n",
-      "\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
     "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "TIlrFi3M6XO4",
-    "outputId": "7c986a6f-f84c-4d29-b3f8-941cb85e6e8d"
-   },
-   "outputs": [],
-   "source": [
-    "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "KGwSg6Bl6a8Y",
-    "outputId": "025562ab-679d-4986-9474-334dc8bd834e"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "-- The C compiler identification is GNU 7.5.0\n",
-      "-- The CXX compiler identification is GNU 7.5.0\n",
-      "-- Check for working C compiler: /usr/bin/cc\n",
-      "-- Check for working C compiler: /usr/bin/cc -- works\n",
-      "-- Detecting C compiler ABI info\n",
-      "-- Detecting C compiler ABI info - done\n",
-      "-- Detecting C compile features\n",
-      "-- Detecting C compile features - done\n",
-      "-- Check for working CXX compiler: /usr/bin/c++\n",
-      "-- Check for working CXX compiler: /usr/bin/c++ -- works\n",
-      "-- Detecting CXX compiler ABI info\n",
-      "-- Detecting CXX compiler ABI info - done\n",
-      "-- Detecting CXX compile features\n",
-      "-- Detecting CXX compile features - done\n",
-      "-- Looking for pthread.h\n",
-      "-- Looking for pthread.h - found\n",
-      "-- Looking for pthread_create\n",
-      "-- Looking for pthread_create - not found\n",
-      "-- Looking for pthread_create in pthreads\n",
-      "-- Looking for pthread_create in pthreads - not found\n",
-      "-- Looking for pthread_create in pthread\n",
-      "-- Looking for pthread_create in pthread - found\n",
-      "-- Found Threads: TRUE  \n",
-      "-- Boost version: 1.65.1\n",
-      "-- Found the following Boost libraries:\n",
-      "--   program_options\n",
-      "--   system\n",
-      "--   thread\n",
-      "--   unit_test_framework\n",
-      "--   chrono\n",
-      "--   date_time\n",
-      "--   atomic\n",
-      "-- Check if compiler accepts -pthread\n",
-      "-- Check if compiler accepts -pthread - yes\n",
-      "-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version \"1.2.11\") \n",
-      "-- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version \"1.0.6\") \n",
-      "-- Looking for BZ2_bzCompressInit\n",
-      "-- Looking for BZ2_bzCompressInit - found\n",
-      "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
-      "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
-      "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
-      "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
-      "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
-      "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
-      "-- Found LibLZMA: /usr/include (found version \"5.2.2\") \n",
-      "-- Found OpenMP_C: -fopenmp (found version \"4.5\") \n",
-      "-- Found OpenMP_CXX: -fopenmp (found version \"4.5\") \n",
-      "-- Found OpenMP: TRUE (found version \"4.5\")  \n",
-      "-- Configuring done\n",
-      "-- Generating done\n",
-      "-- Build files have been written to: /content/kenlm/build\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target kenlm_util\u001b[0m\n",
-      "[  2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o\u001b[0m\n",
-      "[  2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o\u001b[0m\n",
-      "[  3%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o\u001b[0m\n",
-      "[  4%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/diy-fp.cc.o\u001b[0m\n",
-      "[  5%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-conversion.cc.o\u001b[0m\n",
-      "[  6%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o\u001b[0m\n",
-      "[  7%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o\u001b[0m\n",
-      "[  8%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o\u001b[0m\n",
-      "[  9%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o\u001b[0m\n",
-      "[ 10%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o\u001b[0m\n",
-      "[ 11%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/io.cc.o\u001b[0m\n",
-      "[ 12%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o\u001b[0m\n",
-      "[ 13%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o\u001b[0m\n",
-      "[ 14%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o\u001b[0m\n",
-      "[ 15%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o\u001b[0m\n",
-      "[ 16%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o\u001b[0m\n",
-      "[ 17%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o\u001b[0m\n",
-      "[ 18%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o\u001b[0m\n",
-      "[ 19%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o\u001b[0m\n",
-      "[ 20%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o\u001b[0m\n",
-      "[ 21%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o\u001b[0m\n",
-      "[ 22%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o\u001b[0m\n",
-      "[ 23%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o\u001b[0m\n",
-      "[ 25%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o\u001b[0m\n",
-      "[ 26%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o\u001b[0m\n",
-      "[ 27%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o\u001b[0m\n",
-      "[ 28%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o\u001b[0m\n",
-      "[ 29%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o\u001b[0m\n",
-      "[ 30%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o\u001b[0m\n",
-      "[ 31%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o\u001b[0m\n",
-      "[ 32%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm_util.a\u001b[0m\n",
-      "[ 32%] Built target kenlm_util\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target probing_hash_table_benchmark\u001b[0m\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target kenlm\u001b[0m\n",
-      "[ 33%] \u001b[32mBuilding CXX object util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o\u001b[0m\n",
-      "[ 34%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o\u001b[0m\n",
-      "[ 35%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o\u001b[0m\n",
-      "[ 36%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o\u001b[0m\n",
-      "[ 37%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o\u001b[0m\n",
-      "[ 38%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o\u001b[0m\n",
-      "[ 39%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o\u001b[0m\n",
-      "[ 40%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o\u001b[0m\n",
-      "[ 41%] \u001b[32m\u001b[1mLinking CXX executable ../bin/probing_hash_table_benchmark\u001b[0m\n",
-      "[ 41%] Built target probing_hash_table_benchmark\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target kenlm_filter\u001b[0m\n",
-      "[ 42%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o\u001b[0m\n",
-      "[ 43%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o\u001b[0m\n",
-      "[ 44%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o\u001b[0m\n",
-      "[ 45%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o\u001b[0m\n",
-      "[ 46%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o\u001b[0m\n",
-      "[ 47%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_filter.a\u001b[0m\n",
-      "[ 47%] Built target kenlm_filter\n",
-      "[ 48%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o\u001b[0m\n",
-      "[ 50%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o\u001b[0m\n",
-      "[ 51%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o\u001b[0m\n",
-      "[ 52%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o\u001b[0m\n",
-      "[ 53%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o\u001b[0m\n",
-      "[ 54%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o\u001b[0m\n",
-      "[ 55%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o\u001b[0m\n",
-      "[ 56%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o\u001b[0m\n",
-      "[ 57%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o\u001b[0m\n",
-      "[ 58%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/size_option.cc.o\u001b[0m\n",
-      "[ 59%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm.a\u001b[0m\n",
-      "[ 59%] Built target kenlm\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target build_binary\u001b[0m\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target fragment\u001b[0m\n",
-      "[ 60%] \u001b[32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o\u001b[0m\n",
-      "[ 61%] \u001b[32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o\u001b[0m\n",
-      "[ 62%] \u001b[32m\u001b[1mLinking CXX executable ../bin/fragment\u001b[0m\n",
-      "[ 63%] \u001b[32m\u001b[1mLinking CXX executable ../bin/build_binary\u001b[0m\n",
-      "[ 63%] Built target fragment\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target kenlm_benchmark\u001b[0m\n",
-      "[ 64%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o\u001b[0m\n",
-      "[ 64%] Built target build_binary\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target query\u001b[0m\n",
-      "[ 65%] \u001b[32mBuilding CXX object lm/CMakeFiles/query.dir/query_main.cc.o\u001b[0m\n",
-      "[ 66%] \u001b[32m\u001b[1mLinking CXX executable ../bin/query\u001b[0m\n",
-      "[ 66%] Built target query\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target kenlm_builder\u001b[0m\n",
-      "[ 67%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o\u001b[0m\n",
-      "[ 68%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o\u001b[0m\n",
-      "[ 69%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o\u001b[0m\n",
-      "[ 70%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o\u001b[0m\n",
-      "[ 71%] \u001b[32m\u001b[1mLinking CXX executable ../bin/kenlm_benchmark\u001b[0m\n",
-      "[ 72%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o\u001b[0m\n",
-      "[ 72%] Built target kenlm_benchmark\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target phrase_table_vocab\u001b[0m\n",
-      "[ 73%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o\u001b[0m\n",
-      "[ 75%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/phrase_table_vocab\u001b[0m\n",
-      "[ 75%] Built target phrase_table_vocab\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target filter\u001b[0m\n",
-      "[ 76%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/filter.dir/filter_main.cc.o\u001b[0m\n",
-      "[ 77%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o\u001b[0m\n",
-      "[ 78%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_builder.a\u001b[0m\n",
-      "[ 78%] Built target kenlm_builder\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target kenlm_interpolate\u001b[0m\n",
-      "[ 79%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/backoff_reunification.cc.o\u001b[0m\n",
-      "[ 80%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/bounded_sequence_encoding.cc.o\u001b[0m\n",
-      "[ 81%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_probabilities.cc.o\u001b[0m\n",
-      "[ 82%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/filter\u001b[0m\n",
-      "[ 82%] Built target filter\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target count_ngrams\u001b[0m\n",
-      "[ 83%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o\u001b[0m\n",
-      "[ 84%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_vocab.cc.o\u001b[0m\n",
-      "[ 85%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/normalize.cc.o\u001b[0m\n",
-      "[ 86%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/pipeline.cc.o\u001b[0m\n",
-      "[ 87%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/count_ngrams\u001b[0m\n",
-      "[ 87%] Built target count_ngrams\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target lmplz\u001b[0m\n",
-      "[ 88%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o\u001b[0m\n",
-      "[ 89%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/lmplz\u001b[0m\n",
-      "[ 89%] Built target lmplz\n",
-      "[ 90%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/split_worker.cc.o\u001b[0m\n",
-      "[ 91%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_derivatives.cc.o\u001b[0m\n",
-      "[ 92%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_instances.cc.o\u001b[0m\n",
-      "[ 93%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_weights.cc.o\u001b[0m\n",
-      "[ 94%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/universal_vocab.cc.o\u001b[0m\n",
-      "[ 95%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_interpolate.a\u001b[0m\n",
-      "[ 95%] Built target kenlm_interpolate\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target streaming_example\u001b[0m\n",
-      "\u001b[35m\u001b[1mScanning dependencies of target interpolate\u001b[0m\n",
-      "[ 96%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/streaming_example.dir/streaming_example_main.cc.o\u001b[0m\n",
-      "[ 97%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/interpolate.dir/interpolate_main.cc.o\u001b[0m\n",
-      "[ 98%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/interpolate\u001b[0m\n",
-      "[ 98%] Built target interpolate\n",
-      "[100%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/streaming_example\u001b[0m\n",
-      "[100%] Built target streaming_example\n",
-      "build_binary  fragment\t       lmplz\t\t\t     query\n",
-      "count_ngrams  interpolate      phrase_table_vocab\t     streaming_example\n",
-      "filter\t      kenlm_benchmark  probing_hash_table_benchmark\n"
-     ]
     }
-   ],
-   "source": [
-    "!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n",
-    "!ls kenlm/build/bin"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "rUUGXbDy6x7r"
-   },
-   "source": [
-    "Install 🤗 dependencies"
-   ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "Gs8LAZKr6wF8",
-    "outputId": "2a1785bb-f254-487a-ef4c-e496f037145a"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collecting datasets\n",
-      "  Downloading datasets-1.18.0-py3-none-any.whl (311 kB)\n",
-      "\u001b[K     |████████████████████████████████| 311 kB 5.3 MB/s \n",
-      "\u001b[?25hCollecting transformers\n",
-      "  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)\n",
-      "\u001b[K     |████████████████████████████████| 3.4 MB 39.8 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n",
-      "Collecting aiohttp\n",
-      "  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n",
-      "\u001b[K     |████████████████████████████████| 1.1 MB 54.7 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (3.0.0)\n",
-      "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.19.5)\n",
-      "Collecting xxhash\n",
-      "  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)\n",
-      "\u001b[K     |████████████████████████████████| 243 kB 41.5 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n",
-      "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.62.3)\n",
-      "Collecting huggingface-hub<1.0.0,>=0.1.0\n",
-      "  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)\n",
-      "\u001b[K     |████████████████████████████████| 67 kB 5.1 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n",
-      "Collecting fsspec[http]>=2021.05.0\n",
-      "  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)\n",
-      "\u001b[K     |████████████████████████████████| 133 kB 50.2 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n",
-      "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.10.0)\n",
-      "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.1.5)\n",
-      "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.4.2)\n",
-      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.10.0.2)\n",
-      "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.13)\n",
-      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.6)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
-      "Collecting pyyaml\n",
-      "  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
-      "\u001b[K     |████████████████████████████████| 596 kB 56.2 MB/s \n",
-      "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n",
-      "  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
-      "\u001b[K     |████████████████████████████████| 3.3 MB 46.1 MB/s \n",
-      "\u001b[?25hCollecting sacremoses\n",
-      "  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)\n",
-      "\u001b[K     |████████████████████████████████| 895 kB 51.6 MB/s \n",
-      "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
-      "Collecting frozenlist>=1.1.1\n",
-      "  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n",
-      "\u001b[K     |████████████████████████████████| 144 kB 51.5 MB/s \n",
-      "\u001b[?25hCollecting yarl<2.0,>=1.0\n",
-      "  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n",
-      "\u001b[K     |████████████████████████████████| 271 kB 56.7 MB/s \n",
-      "\u001b[?25hCollecting asynctest==0.13.0\n",
-      "  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n",
-      "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.10)\n",
-      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n",
-      "Collecting multidict<7.0,>=4.5\n",
-      "  Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n",
-      "\u001b[K     |████████████████████████████████| 94 kB 3.0 MB/s \n",
-      "\u001b[?25hCollecting aiosignal>=1.1.2\n",
-      "  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n",
-      "Collecting async-timeout<5.0,>=4.0.0a3\n",
-      "  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
-      "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.7.0)\n",
-      "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2018.9)\n",
-      "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n",
-      "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n",
-      "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
-      "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n",
-      "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, pyyaml, fsspec, aiohttp, xxhash, tokenizers, sacremoses, huggingface-hub, transformers, datasets\n",
-      "  Attempting uninstall: pyyaml\n",
-      "    Found existing installation: PyYAML 3.13\n",
-      "    Uninstalling PyYAML-3.13:\n",
-      "      Successfully uninstalled PyYAML-3.13\n",
-      "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.18.0 frozenlist-1.3.0 fsspec-2022.1.0 huggingface-hub-0.4.0 multidict-6.0.2 pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.15.0 xxhash-2.0.2 yarl-1.7.2\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install datasets transformers"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "6RoHBmOz66fz"
-   },
-   "source": [
-    "Load preprocessed dataset from 🤗 and write it to file as required by KenLM"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 216,
-     "referenced_widgets": [
-      "ad5d7b0bc9ad4e228b3bc76bc975cc47",
-      "5925567ffea2436691c4ed3b7b147c17",
-      "799acae3451445f0a3616b8932f2e3f3",
-      "1714ea91694842339756f26b2fa9c725",
-      "b5d6b069468246abbb3207f3df6f9dde",
-      "5a9c9d4b60e54a3bb64c576707bd9736",
-      "789d5845a82e48fe9c629af743b5b1f0",
-      "e3414cc0456241eca109f4e9e115d16a",
-      "d3e6acd54d024d6791aab76232557721",
-      "6b43ea2d93c04965a4539b3ef839893b",
-      "4958b4c72d0c48af9a77974fc4ed449c",
-      "d722bbfffeaf4ea7a1060d10dc3a06db",
-      "e81b0bf92adc4aadaafce4ee7d36421e",
-      "a4b5b93b88f549e8a4f37f3d48834ca9",
-      "82692c41501c487fad27c6b19836f46f",
-      "af8f433ef2f540c9bd70d14421904d83",
-      "5e469744bf6a4813983ae8ee727c1c5e",
-      "34c5f87238cb4f13a03b207aa7dc1d18",
-      "c7955974289a4f448b422d7e4640131a",
-      "db7ee45589e04749b80376e25ee377bb",
-      "34d9460b112c419885bbff5211674cb3",
-      "033cb43d32314d279a7b9e1e86bbccdc",
-      "6a7e3547dc4141e7b5937f2baff58cbf",
-      "921a3c1f50a24979838fd560c2cea9e0",
-      "e33033ecda374ed4966ae5fccf6efe37",
-      "52a852c0f98c49aa9e5edfdd4f91e4ca",
-      "d1ff84cb5591449abcc7dd3e37f9a2df",
-      "f479a9629c414cb495a97b0741b0fe4b",
-      "a41cf7f5121a4068842bb5c7d2bc4d62",
-      "31b2d7d8d9054c8fb47bf1b58043aee1",
-      "23c9da8dd7bf4be9a23357806ebfc036",
-      "e084d47529ca4131b233ea3514a6344f",
-      "a6e3c5ce0a3c49ffb3d7cbf92568fe47",
-      "a1eca879a11f414f8173b0c2c260f4c3",
-      "75130a60f93b49c8bee0986665121d02",
-      "328cea1a2aac4fb58bceeaf126b99371",
-      "662d61fdd89d434785e74a7038427fbc",
-      "670d4f16a7e44144afc0ac70eea59325",
-      "7f92331b29fd49a68815b6d7389c1005",
-      "c710ba94fd65486cbcbe1d402919e27f",
-      "5951d1bafdd548b6b835b28cf9960533",
-      "adcffda7f78c4a1c8bdc6010c8704292",
-      "2c37aaee1f524837b477dc584209733a",
-      "0bee4735e017471fa8679ad984b88633"
-     ]
     },
-    "id": "0bDpNg9c6mUu",
-    "outputId": "677d294f-2e37-48d5-bab0-6e21d1b4fe30"
-   },
-   "outputs": [
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ad5d7b0bc9ad4e228b3bc76bc975cc47",
-       "version_major": 2,
-       "version_minor": 0
       },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using custom data configuration hf-test--swedish_corpora_parliament_processed-56ded20e2faa0852\n"
-     ]
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset europarl_bilingual/en-sv (download: 151.40 MiB, generated: 278.82 MiB, post-processed: Unknown size, total: 430.21 MiB) to /root/.cache/huggingface/datasets/parquet/hf-test--swedish_corpora_parliament_processed-56ded20e2faa0852/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
-     ]
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d722bbfffeaf4ea7a1060d10dc3a06db",
-       "version_major": 2,
-       "version_minor": 0
       },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6a7e3547dc4141e7b5937f2baff58cbf",
-       "version_major": 2,
-       "version_minor": 0
       },
-      "text/plain": [
-       "Downloading:   0%|          | 0.00/159M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a1eca879a11f414f8173b0c2c260f4c3",
-       "version_major": 2,
-       "version_minor": 0
       },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/hf-test--swedish_corpora_parliament_processed-56ded20e2faa0852/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
-     ]
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "# change to your dataset path\n",
-    "username = \"hf-test\"\n",
-    "target_lang = \"sv\"\n",
-    "\n",
-    "# Only the \"text\" column of the train split is used below.\n",
-    "dataset = load_dataset(f\"{username}/{target_lang}_corpora_parliament_processed\", split=\"train\")\n",
-    "\n",
-    "# KenLM is fed this plain-text file in the next step.\n",
-    "# The encoding is set explicitly so the non-ASCII Swedish characters are written\n",
-    "# identically on every platform instead of depending on the locale default.\n",
-    "with open(\"text.txt\", \"w\", encoding=\"utf-8\") as file:\n",
-    "  file.write(\" \".join(dataset[\"text\"]))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "z8PqeGC17jD8"
-   },
-   "source": [
-    "Train 5-gram language model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "_8KoINuj7h-1",
-    "outputId": "26e0622d-6cb6-4329-e722-91ae9df263c7"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "=== 1/5 Counting and sorting n-grams ===\n",
-      "Reading /content/text.txt\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "tcmalloc: large alloc 1918697472 bytes == 0x5623caa4e000 @  0x7fe627aa41e7 0x5623c8d517a2 0x5623c8cec51e 0x5623c8ccb2eb 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
-      "tcmalloc: large alloc 8953896960 bytes == 0x56243d01e000 @  0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccb308 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
-      "****************************************************************************************************\n",
-      "Unigram tokens 42153890 types 360209\n",
-      "=== 2/5 Calculating and sorting adjusted counts ===\n",
-      "Chain sizes: 1:4322508 2:1062773568 3:1992700672 4:3188320768 5:4649634816\n",
-      "tcmalloc: large alloc 4649639936 bytes == 0x5623caa4e000 @  0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccb8d7 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
-      "tcmalloc: large alloc 1992704000 bytes == 0x56251f640000 @  0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccbcdd 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
-      "tcmalloc: large alloc 3188326400 bytes == 0x5626533e4000 @  0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccbcdd 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
-      "Statistics:\n",
-      "1 360208 D1=0.686222 D2=1.01595 D3+=1.33685\n",
-      "2 5476741 D1=0.761523 D2=1.06735 D3+=1.32559\n",
-      "3 18177681 D1=0.839918 D2=1.12061 D3+=1.33794\n",
-      "4 30374983 D1=0.909146 D2=1.20496 D3+=1.37235\n",
-      "5 37231651 D1=0.944104 D2=1.25164 D3+=1.344\n",
-      "Memory estimate for binary LM:\n",
-      "type      MB\n",
-      "probing 1884 assuming -p 1.5\n",
-      "probing 2195 assuming -r models -p 1.5\n",
-      "trie     922 without quantization\n",
-      "trie     518 assuming -q 8 -b 8 quantization \n",
-      "trie     806 assuming -a 22 array pointer compression\n",
-      "trie     401 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
-      "=== 3/5 Calculating and sorting initial probabilities ===\n",
-      "Chain sizes: 1:4322496 2:87627856 3:363553620 4:728999592 5:1042486228\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "####################################################################################################\n",
-      "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
-      "Chain sizes: 1:4322496 2:87627856 3:363553620 4:728999592 5:1042486228\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "####################################################################################################\n",
-      "=== 5/5 Writing ARPA model ===\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "****************************************************************************************************\n",
-      "Name:lmplz\tVmPeak:14181536 kB\tVmRSS:2199072 kB\tRSSMax:4117540 kB\tuser:125.411\tsys:25.1745\tCPU:150.586\treal:290.479\n"
-     ]
-    }
-   ],
-   "source": [
-    "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "ZJ5OKh358nwR"
-   },
-   "source": [
-    "Check the head of the generated ARPA file (`5gram.arpa`)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "pv93ZCR68s4m",
-    "outputId": "9489b8a8-789d-4779-85f4-f4aa4e0b3392"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\\data\\\n",
-      "ngram 1=360208\n",
-      "ngram 2=5476741\n",
-      "ngram 3=18177681\n",
-      "ngram 4=30374983\n",
-      "ngram 5=37231651\n",
-      "\n",
-      "\\1-grams:\n",
-      "-6.770219\t<unk>\t0\n",
-      "0\t<s>\t-0.11831701\n",
-      "-4.6095004\tåterupptagande\t-1.2174699\n",
-      "-2.2361007\tav\t-0.79668784\n",
-      "-4.8163533\tsessionen\t-0.37327805\n",
-      "-2.2251768\tjag\t-1.4205662\n",
-      "-4.181505\tförklarar\t-0.56261665\n",
-      "-3.5790775\teuropaparlamentets\t-0.63611007\n",
-      "-4.771945\tsession\t-0.3647111\n",
-      "-5.8043895\tåterupptagen\t-0.3058712\n",
-      "-2.8580177\tefter\t-0.7557702\n",
-      "-5.199537\tavbrottet\t-0.43322718\n"
-     ]
-    }
-   ],
-   "source": [
-    "!head -20 5gram.arpa"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "FEcPijF77mPY"
-   },
-   "source": [
-    "Add the end-of-sentence token \"\\</s>\" to the ARPA file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "id": "Sktd-U5a7yZL"
-   },
-   "outputs": [],
-   "source": [
-    "# Copy \"5gram.arpa\" to \"5gram_sv_lm.arpa\", adding a \"</s>\" unigram entry on the way.\n",
-    "# Both files are opened as UTF-8 explicitly so the non-ASCII Swedish words survive\n",
-    "# the round trip regardless of the platform's default encoding.\n",
-    "with open(\"5gram.arpa\", \"r\", encoding=\"utf-8\") as read_file, open(\"5gram_sv_lm.arpa\", \"w\", encoding=\"utf-8\") as write_file:\n",
-    "  has_added_eos = False\n",
-    "  for line in read_file:\n",
-    "    if not has_added_eos and \"ngram 1=\" in line:\n",
-    "      # One unigram is inserted below, so the declared 1-gram count must grow by one.\n",
-    "      # The replacement is anchored on \"=\" so only the count is rewritten, never the\n",
-    "      # order digit in \"ngram 1\" (a bare str.replace on the digits could hit both).\n",
-    "      count = line.strip().split(\"=\")[-1]\n",
-    "      write_file.write(line.replace(f\"={count}\", f\"={int(count) + 1}\"))\n",
-    "    elif not has_added_eos and \"<s>\" in line:\n",
-    "      # The first line containing \"<s>\" is expected to be its 1-gram entry;\n",
-    "      # it is kept and then duplicated as \"</s>\" with the same scores.\n",
-    "      write_file.write(line)\n",
-    "      write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
-    "      has_added_eos = True\n",
-    "    else:\n",
-    "      write_file.write(line)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "hqXHYY-K760Q"
-   },
-   "source": [
-    "Check the head of the corrected ARPA file (`5gram_sv_lm.arpa`) to confirm that `</s>` was added"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "0QuHk3AY8Hax",
-    "outputId": "090d065f-95c7-48e5-bc0c-01069f69c619"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\\data\\\n",
-      "ngram 1=360209\n",
-      "ngram 2=5476741\n",
-      "ngram 3=18177681\n",
-      "ngram 4=30374983\n",
-      "ngram 5=37231651\n",
-      "\n",
-      "\\1-grams:\n",
-      "-6.770219\t<unk>\t0\n",
-      "0\t<s>\t-0.11831701\n",
-      "0\t</s>\t-0.11831701\n",
-      "-4.6095004\tåterupptagande\t-1.2174699\n",
-      "-2.2361007\tav\t-0.79668784\n",
-      "-4.8163533\tsessionen\t-0.37327805\n",
-      "-2.2251768\tjag\t-1.4205662\n",
-      "-4.181505\tförklarar\t-0.56261665\n",
-      "-3.5790775\teuropaparlamentets\t-0.63611007\n",
-      "-4.771945\tsession\t-0.3647111\n",
-      "-5.8043895\tåterupptagen\t-0.3058712\n",
-      "-2.8580177\tefter\t-0.7557702\n"
-     ]
-    }
-   ],
-   "source": [
-    "!head -20 5gram_sv_lm.arpa"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "kTvRntrZ9-uq"
-   },
-   "source": [
-    "Compress the ARPA file by converting it to KenLM's binary format"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
     },
-    "id": "DnmOlNZ5-ClT",
-    "outputId": "c380c05a-e335-4e9d-98b2-c015645a2d40"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Reading 5gram_sv_lm.arpa\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "****************************************************************************************************\n",
-      "SUCCESS\n"
-     ]
-    }
-   ],
-   "source": [
-    "!kenlm/build/bin/build_binary 5gram_sv_lm.arpa 5gram_sv_lm.bin"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "Xra-pM-M8MZj"
-   },
-   "source": [
-    "Download file to local machine (use Chrome if it fails on another browser)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 34
     },
-    "id": "M7b5x8Hr8Yuo",
-    "outputId": "5fbedff6-4a41-47c5-903c-2ad3b59983e1"
-   },
-   "outputs": [
     {
-     "data": {
-      "application/javascript": [
-       "\n",
-       "    async function download(id, filename, size) {\n",
-       "      if (!google.colab.kernel.accessAllowed) {\n",
-       "        return;\n",
-       "      }\n",
-       "      const div = document.createElement('div');\n",
-       "      const label = document.createElement('label');\n",
-       "      label.textContent = `Downloading \"${filename}\": `;\n",
-       "      div.appendChild(label);\n",
-       "      const progress = document.createElement('progress');\n",
-       "      progress.max = size;\n",
-       "      div.appendChild(progress);\n",
-       "      document.body.appendChild(div);\n",
-       "\n",
-       "      const buffers = [];\n",
-       "      let downloaded = 0;\n",
-       "\n",
-       "      const channel = await google.colab.kernel.comms.open(id);\n",
-       "      // Send a message to notify the kernel that we're ready.\n",
-       "      channel.send({})\n",
-       "\n",
-       "      for await (const message of channel.messages) {\n",
-       "        // Send a message to notify the kernel that we're ready.\n",
-       "        channel.send({})\n",
-       "        if (message.buffers) {\n",
-       "          for (const buffer of message.buffers) {\n",
-       "            buffers.push(buffer);\n",
-       "            downloaded += buffer.byteLength;\n",
-       "            progress.value = downloaded;\n",
-       "          }\n",
-       "        }\n",
-       "      }\n",
-       "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
-       "      const a = document.createElement('a');\n",
-       "      a.href = window.URL.createObjectURL(blob);\n",
-       "      a.download = filename;\n",
-       "      div.appendChild(a);\n",
-       "      a.click();\n",
-       "      div.remove();\n",
-       "    }\n",
-       "  "
       ],
-      "text/plain": [
-       "<IPython.core.display.Javascript object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     },
     {
-     "data": {
-      "application/javascript": [
-       "download(\"download_82154b0d-c2b7-4fbd-8f04-e987f3406e7e\", \"5gram_sv_lm.bin\", 1981380707)"
       ],
-      "text/plain": [
-       "<IPython.core.display.Javascript object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from google.colab import files\n",
-    "files.download(\"5gram_sv_lm.bin\") "
-   ]
-  }
- ],
- "metadata": {
-  "colab": {
-   "name": "train_n-gram_lm_with_KenLM",
-   "provenance": []
-  },
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.6"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "033cb43d32314d279a7b9e1e86bbccdc": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "0bee4735e017471fa8679ad984b88633": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "1714ea91694842339756f26b2fa9c725": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "FloatProgressModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "FloatProgressModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "ProgressView",
-      "bar_style": "success",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_d3e6acd54d024d6791aab76232557721",
-      "max": 1157,
-      "min": 0,
-      "orientation": "horizontal",
-      "style": "IPY_MODEL_e3414cc0456241eca109f4e9e115d16a",
-      "value": 1157
-     }
-    },
-    "23c9da8dd7bf4be9a23357806ebfc036": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "2c37aaee1f524837b477dc584209733a": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
-    },
-    "31b2d7d8d9054c8fb47bf1b58043aee1": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "ProgressStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "ProgressStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "bar_color": null,
-      "description_width": ""
-     }
-    },
-    "328cea1a2aac4fb58bceeaf126b99371": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_c710ba94fd65486cbcbe1d402919e27f",
-      "placeholder": "",
-      "style": "IPY_MODEL_7f92331b29fd49a68815b6d7389c1005",
-      "value": "100%"
-     }
-    },
-    "34c5f87238cb4f13a03b207aa7dc1d18": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "34d9460b112c419885bbff5211674cb3": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
-    },
-    "4958b4c72d0c48af9a77974fc4ed449c": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "52a852c0f98c49aa9e5edfdd4f91e4ca": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "FloatProgressModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "FloatProgressModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "ProgressView",
-      "bar_style": "success",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_23c9da8dd7bf4be9a23357806ebfc036",
-      "max": 158752204,
-      "min": 0,
-      "orientation": "horizontal",
-      "style": "IPY_MODEL_31b2d7d8d9054c8fb47bf1b58043aee1",
-      "value": 158752204
-     }
-    },
-    "5925567ffea2436691c4ed3b7b147c17": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "5951d1bafdd548b6b835b28cf9960533": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "ProgressStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "ProgressStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "bar_color": null,
-      "description_width": ""
-     }
-    },
-    "5a9c9d4b60e54a3bb64c576707bd9736": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
-    },
-    "5e469744bf6a4813983ae8ee727c1c5e": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
-    },
-    "662d61fdd89d434785e74a7038427fbc": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "FloatProgressModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "FloatProgressModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "ProgressView",
-      "bar_style": "success",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_adcffda7f78c4a1c8bdc6010c8704292",
-      "max": 1,
-      "min": 0,
-      "orientation": "horizontal",
-      "style": "IPY_MODEL_5951d1bafdd548b6b835b28cf9960533",
-      "value": 1
-     }
-    },
-    "670d4f16a7e44144afc0ac70eea59325": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_0bee4735e017471fa8679ad984b88633",
-      "placeholder": "",
-      "style": "IPY_MODEL_2c37aaee1f524837b477dc584209733a",
-      "value": " 1/1 [00:00&lt;00:00, 21.15it/s]"
-     }
     },
-    "6a7e3547dc4141e7b5937f2baff58cbf": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HBoxModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HBoxModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HBoxView",
-      "box_style": "",
-      "children": [
-       "IPY_MODEL_e33033ecda374ed4966ae5fccf6efe37",
-       "IPY_MODEL_52a852c0f98c49aa9e5edfdd4f91e4ca",
-       "IPY_MODEL_d1ff84cb5591449abcc7dd3e37f9a2df"
       ],
-      "layout": "IPY_MODEL_921a3c1f50a24979838fd560c2cea9e0"
-     }
-    },
-    "6b43ea2d93c04965a4539b3ef839893b": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
-    },
-    "75130a60f93b49c8bee0986665121d02": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "789d5845a82e48fe9c629af743b5b1f0": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "799acae3451445f0a3616b8932f2e3f3": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_789d5845a82e48fe9c629af743b5b1f0",
-      "placeholder": "",
-      "style": "IPY_MODEL_5a9c9d4b60e54a3bb64c576707bd9736",
-      "value": "Downloading: 100%"
-     }
-    },
-    "7f92331b29fd49a68815b6d7389c1005": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
-    },
-    "82692c41501c487fad27c6b19836f46f": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "FloatProgressModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "FloatProgressModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "ProgressView",
-      "bar_style": "success",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_db7ee45589e04749b80376e25ee377bb",
-      "max": 1,
-      "min": 0,
-      "orientation": "horizontal",
-      "style": "IPY_MODEL_c7955974289a4f448b422d7e4640131a",
-      "value": 1
-     }
-    },
-    "921a3c1f50a24979838fd560c2cea9e0": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
     },
-    "a1eca879a11f414f8173b0c2c260f4c3": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HBoxModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HBoxModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HBoxView",
-      "box_style": "",
-      "children": [
-       "IPY_MODEL_328cea1a2aac4fb58bceeaf126b99371",
-       "IPY_MODEL_662d61fdd89d434785e74a7038427fbc",
-       "IPY_MODEL_670d4f16a7e44144afc0ac70eea59325"
       ],
-      "layout": "IPY_MODEL_75130a60f93b49c8bee0986665121d02"
-     }
-    },
-    "a41cf7f5121a4068842bb5c7d2bc4d62": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "a4b5b93b88f549e8a4f37f3d48834ca9": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_34c5f87238cb4f13a03b207aa7dc1d18",
-      "placeholder": "",
-      "style": "IPY_MODEL_5e469744bf6a4813983ae8ee727c1c5e",
-      "value": "100%"
-     }
-    },
-    "a6e3c5ce0a3c49ffb3d7cbf92568fe47": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
     },
-    "ad5d7b0bc9ad4e228b3bc76bc975cc47": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HBoxModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HBoxModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HBoxView",
-      "box_style": "",
-      "children": [
-       "IPY_MODEL_799acae3451445f0a3616b8932f2e3f3",
-       "IPY_MODEL_1714ea91694842339756f26b2fa9c725",
-       "IPY_MODEL_b5d6b069468246abbb3207f3df6f9dde"
       ],
-      "layout": "IPY_MODEL_5925567ffea2436691c4ed3b7b147c17"
-     }
-    },
-    "adcffda7f78c4a1c8bdc6010c8704292": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "af8f433ef2f540c9bd70d14421904d83": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_033cb43d32314d279a7b9e1e86bbccdc",
-      "placeholder": "",
-      "style": "IPY_MODEL_34d9460b112c419885bbff5211674cb3",
-      "value": " 1/1 [00:05&lt;00:00,  5.67s/it]"
-     }
-    },
-    "b5d6b069468246abbb3207f3df6f9dde": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_4958b4c72d0c48af9a77974fc4ed449c",
-      "placeholder": "",
-      "style": "IPY_MODEL_6b43ea2d93c04965a4539b3ef839893b",
-      "value": " 1.16k/1.16k [00:00&lt;00:00, 24.4kB/s]"
-     }
-    },
-    "c710ba94fd65486cbcbe1d402919e27f": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "c7955974289a4f448b422d7e4640131a": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "ProgressStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "ProgressStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "bar_color": null,
-      "description_width": ""
-     }
-    },
-    "d1ff84cb5591449abcc7dd3e37f9a2df": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_a6e3c5ce0a3c49ffb3d7cbf92568fe47",
-      "placeholder": "",
-      "style": "IPY_MODEL_e084d47529ca4131b233ea3514a6344f",
-      "value": " 159M/159M [00:04&lt;00:00, 26.9MB/s]"
-     }
-    },
-    "d3e6acd54d024d6791aab76232557721": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
     },
-    "d722bbfffeaf4ea7a1060d10dc3a06db": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HBoxModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HBoxModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HBoxView",
-      "box_style": "",
-      "children": [
-       "IPY_MODEL_a4b5b93b88f549e8a4f37f3d48834ca9",
-       "IPY_MODEL_82692c41501c487fad27c6b19836f46f",
-       "IPY_MODEL_af8f433ef2f540c9bd70d14421904d83"
       ],
-      "layout": "IPY_MODEL_e81b0bf92adc4aadaafce4ee7d36421e"
-     }
-    },
-    "db7ee45589e04749b80376e25ee377bb": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
-    },
-    "e084d47529ca4131b233ea3514a6344f": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
     },
-    "e33033ecda374ed4966ae5fccf6efe37": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "HTMLModel",
-     "state": {
-      "_dom_classes": [],
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "HTMLModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/controls",
-      "_view_module_version": "1.5.0",
-      "_view_name": "HTMLView",
-      "description": "",
-      "description_tooltip": null,
-      "layout": "IPY_MODEL_a41cf7f5121a4068842bb5c7d2bc4d62",
-      "placeholder": "",
-      "style": "IPY_MODEL_f479a9629c414cb495a97b0741b0fe4b",
-      "value": "Downloading: 100%"
-     }
     },
-    "e3414cc0456241eca109f4e9e115d16a": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "ProgressStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "ProgressStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "bar_color": null,
-      "description_width": ""
-     }
     },
-    "e81b0bf92adc4aadaafce4ee7d36421e": {
-     "model_module": "@jupyter-widgets/base",
-     "model_module_version": "1.2.0",
-     "model_name": "LayoutModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/base",
-      "_model_module_version": "1.2.0",
-      "_model_name": "LayoutModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "LayoutView",
-      "align_content": null,
-      "align_items": null,
-      "align_self": null,
-      "border": null,
-      "bottom": null,
-      "display": null,
-      "flex": null,
-      "flex_flow": null,
-      "grid_area": null,
-      "grid_auto_columns": null,
-      "grid_auto_flow": null,
-      "grid_auto_rows": null,
-      "grid_column": null,
-      "grid_gap": null,
-      "grid_row": null,
-      "grid_template_areas": null,
-      "grid_template_columns": null,
-      "grid_template_rows": null,
-      "height": null,
-      "justify_content": null,
-      "justify_items": null,
-      "left": null,
-      "margin": null,
-      "max_height": null,
-      "max_width": null,
-      "min_height": null,
-      "min_width": null,
-      "object_fit": null,
-      "object_position": null,
-      "order": null,
-      "overflow": null,
-      "overflow_x": null,
-      "overflow_y": null,
-      "padding": null,
-      "right": null,
-      "top": null,
-      "visibility": null,
-      "width": null
-     }
     },
-    "f479a9629c414cb495a97b0741b0fe4b": {
-     "model_module": "@jupyter-widgets/controls",
-     "model_module_version": "1.5.0",
-     "model_name": "DescriptionStyleModel",
-     "state": {
-      "_model_module": "@jupyter-widgets/controls",
-      "_model_module_version": "1.5.0",
-      "_model_name": "DescriptionStyleModel",
-      "_view_count": null,
-      "_view_module": "@jupyter-widgets/base",
-      "_view_module_version": "1.2.0",
-      "_view_name": "StyleView",
-      "description_width": ""
-     }
     }
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}

 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
     "colab": {
+      "name": "Boosting_Wav2Vec2_with_n_grams_in_🤗_Transformers.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
     },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
     },
+    "language_info": {
+      "name": "python"
     }
   },
+  "cells": [
     {
+      "cell_type": "code",
+      "source": [
+        "!pip install datasets transformers"
+      ],
+      "metadata": {
+        "id": "OWGc_zfyq5_T"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode"
+      ],
+      "metadata": {
+        "id": "TvDJ7CYpzSJQ"
       },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "from huggingface_hub import notebook_login\n",
+        "\n",
+        "notebook_login()"
+      ],
+      "metadata": {
+        "id": "JHTeonOGXiGq"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "# -y pre-answers apt's confirmation prompt; a notebook shell cell generally cannot answer it interactively.\n",
+        "!sudo apt install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev"
+      ],
+      "metadata": {
+        "id": "FKMMWfVQp_gP"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz"
+      ],
+      "metadata": {
+        "id": "J8mm4ExzqIaZ"
       },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "# mkdir -p keeps this cell re-runnable: a plain mkdir fails once kenlm/build exists,\n",
+        "# which short-circuits the && chain so cmake and make never run.\n",
+        "!mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j2\n",
+        "# List the built binaries (lmplz is invoked from this directory later on).\n",
+        "!ls kenlm/build/bin"
+      ],
+      "metadata": {
+        "id": "MS4mqMyZqVAI"
       },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "from datasets import load_dataset\n",
+        "\n",
+        "username = \"hf-test\"  # change to your username\n",
+        "target_lang = \"sv\"\n",
+        "\n",
+        "dataset = load_dataset(f\"{username}/{target_lang}_corpora_parliament_processed\", split=\"train\")\n",
+        "\n",
+        "with open(\"text.txt\", \"w\") as file:\n",
+        "  file.write(\" \".join(dataset[\"text\"]))"
+      ],
+      "metadata": {
+        "id": "VIgErMqApENm"
       },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "\n",
+        "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
+      ],
+      "metadata": {
+        "id": "_MdDNBlZrPOm"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "!head -20 5gram.arpa"
+      ],
+      "metadata": {
+        "id": "TRnV8Miusl--"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
+        "  has_added_eos = False\n",
+        "  for line in read_file:\n",
+        "    if not has_added_eos and \"ngram 1=\" in line:\n",
+        "      count=line.strip().split(\"=\")[-1]\n",
+        "      write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
+        "    elif not has_added_eos and \"<s>\" in line:\n",
+        "      write_file.write(line)\n",
+        "      write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
+        "      has_added_eos = True\n",
+        "    else:\n",
+        "      write_file.write(line)"
+      ],
+      "metadata": {
+        "id": "_7u7dVPkvyRZ"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "!head -20 5gram_correct.arpa"
+      ],
+      "metadata": {
+        "id": "YF1RSm-Pxst5"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "from transformers import AutoProcessor\n",
+        "\n",
+        "processor = AutoProcessor.from_pretrained(\"marinone94/xls-r-300m-sv-robust\")"
+      ],
+      "metadata": {
+        "id": "paV71gdAtkDC"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "vocab_dict = processor.tokenizer.get_vocab()\n",
+        "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
       ],
+      "metadata": {
+        "id": "ZKwKxMoitoGS"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
+      "cell_type": "code",
+      "source": [
+        "from pyctcdecode import build_ctcdecoder\n",
+        "\n",
+        "decoder = build_ctcdecoder(\n",
+        "    labels=list(sorted_vocab_dict.keys()),\n",
+        "    kenlm_model_path=\"5gram_correct.arpa\",\n",
+        ")"
       ],
+      "metadata": {
+        "id": "zTLzCLB2tQP7"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "from transformers import Wav2Vec2ProcessorWithLM\n",
+        "\n",
+        "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
+        "    feature_extractor=processor.feature_extractor,\n",
+        "    tokenizer=processor.tokenizer,\n",
+        "    decoder=decoder\n",
+        ")"
       ],
+      "metadata": {
+        "id": "VBVf50EzZgAQ"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "!sudo apt-get install git-lfs tree"
       ],
+      "metadata": {
+        "id": "BZZm3ECc5TMP"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "from huggingface_hub import Repository\n",
+        "\n",
+        "repo = Repository(local_dir=\"xls-r-300m-sv-robust\", clone_from=\"marinone94/xls-r-300m-sv-robust\")"
       ],
+      "metadata": {
+        "id": "fIfcunhF4YM6"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "processor_with_lm.save_pretrained(\"xls-r-300m-sv-robust\")"
       ],
+      "metadata": {
+        "id": "UZ1sWfPH2oce"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "!tree -h xls-r-300m-sv-robust/"
+      ],
+      "metadata": {
+        "id": "ClyENOYFcC_C"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "!kenlm/build/bin/build_binary xls-r-300m-sv-robust/language_model/5gram_correct.arpa xls-r-300m-sv-robust/language_model/5gram.bin"
+      ],
+      "metadata": {
+        "id": "X9qg4FPt2zi8"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "!rm xls-r-300m-sv-robust/language_model/5gram_correct.arpa && tree -h xls-r-300m-sv-robust/"
+      ],
+      "metadata": {
+        "id": "Zn4J-4OZdMPc"
+      },
+      "execution_count": null,
+      "outputs": []
     },
+    {
+      "cell_type": "code",
+      "source": [
+        "repo.push_to_hub(commit_message=\"Upload 5-gram lm-boosted decoder\")"
+      ],
+      "metadata": {
+        "id": "WEV1sx6ee3aT"
+      },
+      "execution_count": null,
+      "outputs": []
     }
+  ]
+}