File size: 19,963 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5205c0d3-2272-4a43-9345-9553af479fe6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7afc85b57ea24d31a2fdcc2b1f5c9ace",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "38bdf299-f60d-43ea-9230-df1be861e406",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration sharpcoder--bjorn_training-8c32a3534606a113\n",
      "Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-8c32a3534606a113/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "18cae671f8fd4f9baac804c91fee03bf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from datasets import load_dataset, load_metric\n",
    "ds = load_dataset(\"sharpcoder/bjorn_training\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "75b32151-eb53-4476-8c1f-7e6da72e173e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0611b2fa6cf740d6925d03cf3ba525a2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def extract_all_chars(batch):\n",
    "  all_text = \" \".join(batch[\"text\"])\n",
    "  vocab = list(set(all_text))\n",
    "  return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
    "\n",
    "vocabs = ds.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=ds.column_names[\"train\"])\n",
    "vocab_list = list(set(vocabs[\"train\"][\"vocab\"][0]))\n",
    "vocab_dict = {v: k for k, v in enumerate(vocab_list)}\n",
    "vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
    "del vocab_dict[\" \"]\n",
    "vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
    "vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
    "len(vocab_dict)\n",
    "import json\n",
    "with open('vocab.json', 'w') as vocab_file:\n",
    "    json.dump(vocab_dict, vocab_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d214872e-d4b1-4aa7-be07-8a1591961968",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Wav2Vec2CTCTokenizer\n",
    "from transformers import Wav2Vec2FeatureExtractor\n",
    "from transformers import Wav2Vec2Processor\n",
    "\n",
    "tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\" \")\n",
    "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)\n",
    "processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e906c45f-6971-43c3-ad0a-b13363100bdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_dataset(batch):\n",
    "    audio = batch[\"audio\"]\n",
    "\n",
    "    # batched output is \"un-batched\" to ensure mapping is correct\n",
    "    batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sample_rate\"]).input_values[0]\n",
    "    batch[\"input_length\"] = len(batch[\"input_values\"])\n",
    "    \n",
    "    with processor.as_target_processor():\n",
    "        batch[\"labels\"] = processor(batch[\"text\"]).input_ids\n",
    "    return batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ae21f7b6a50241e4ab4dd2b5c7c5689c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "50c9a6ad-9e79-4a1c-a5ce-6e1f73a96e4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "from dataclasses import dataclass, field\n",
    "from typing import Any, Dict, List, Optional, Union\n",
    "\n",
    "@dataclass\n",
    "class DataCollatorCTCWithPadding:\n",
    "    \"\"\"\n",
    "    Data collator that will dynamically pad the inputs received.\n",
    "    Args:\n",
    "        processor (:class:`~transformers.Wav2Vec2Processor`)\n",
    "            The processor used for proccessing the data.\n",
    "        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):\n",
    "            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)\n",
    "            among:\n",
    "            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n",
    "              sequence if provided).\n",
    "            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the\n",
    "              maximum acceptable input length for the model if that argument is not provided.\n",
    "            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of\n",
    "              different lengths).\n",
    "    \"\"\"\n",
    "\n",
    "    processor: Wav2Vec2Processor\n",
    "    padding: Union[bool, str] = True\n",
    "\n",
    "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
    "        # split inputs and labels since they have to be of different lenghts and need\n",
    "        # different padding methods\n",
    "        input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n",
    "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
    "\n",
    "        batch = self.processor.pad(\n",
    "            input_features,\n",
    "            padding=self.padding,\n",
    "            return_tensors=\"pt\",\n",
    "        )\n",
    "        with self.processor.as_target_processor():\n",
    "            labels_batch = self.processor.pad(\n",
    "                label_features,\n",
    "                padding=self.padding,\n",
    "                return_tensors=\"pt\",\n",
    "            )\n",
    "\n",
    "        # replace padding with -100 to ignore loss correctly\n",
    "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
    "\n",
    "        batch[\"labels\"] = labels\n",
    "\n",
    "        return batch\n",
    "    \n",
    "def compute_metrics(pred):\n",
    "    pred_logits = pred.predictions\n",
    "    pred_ids = np.argmax(pred_logits, axis=-1)\n",
    "\n",
    "    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
    "\n",
    "    pred_str = processor.batch_decode(pred_ids)\n",
    "    # we do not want to group tokens when computing the metrics\n",
    "    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
    "\n",
    "    wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
    "\n",
    "    return {\"wer\": wer}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "1025ffdf-cb83-4895-89ab-a98bc3fab642",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)\n",
    "wer_metric = load_metric(\"wer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "71351cf4-6d00-40ae-89cc-cedb87073625",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/config.json from cache at /home/sharpcoder/.cache/huggingface/transformers/cbb3014bb9f03ead9b94f4a791ff8e777465307670e85079d35e28cbc5d88727.0e2d739358c9b58747bd19db5f9f4320dacabbeb1e6282f5cc1069c5c55a82d2\n",
      "Model config Wav2Vec2Config {\n",
      "  \"_name_or_path\": \"facebook/wav2vec2-base-960h\",\n",
      "  \"activation_dropout\": 0.1,\n",
      "  \"apply_spec_augment\": true,\n",
      "  \"architectures\": [\n",
      "    \"Wav2Vec2ForCTC\"\n",
      "  ],\n",
      "  \"attention_dropout\": 0.1,\n",
      "  \"bos_token_id\": 1,\n",
      "  \"classifier_proj_size\": 256,\n",
      "  \"codevector_dim\": 256,\n",
      "  \"contrastive_logits_temperature\": 0.1,\n",
      "  \"conv_bias\": false,\n",
      "  \"conv_dim\": [\n",
      "    512,\n",
      "    512,\n",
      "    512,\n",
      "    512,\n",
      "    512,\n",
      "    512,\n",
      "    512\n",
      "  ],\n",
      "  \"conv_kernel\": [\n",
      "    10,\n",
      "    3,\n",
      "    3,\n",
      "    3,\n",
      "    3,\n",
      "    2,\n",
      "    2\n",
      "  ],\n",
      "  \"conv_stride\": [\n",
      "    5,\n",
      "    2,\n",
      "    2,\n",
      "    2,\n",
      "    2,\n",
      "    2,\n",
      "    2\n",
      "  ],\n",
      "  \"ctc_loss_reduction\": \"mean\",\n",
      "  \"ctc_zero_infinity\": false,\n",
      "  \"diversity_loss_weight\": 0.1,\n",
      "  \"do_stable_layer_norm\": false,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"feat_extract_activation\": \"gelu\",\n",
      "  \"feat_extract_dropout\": 0.0,\n",
      "  \"feat_extract_norm\": \"group\",\n",
      "  \"feat_proj_dropout\": 0.1,\n",
      "  \"feat_quantizer_dropout\": 0.0,\n",
      "  \"final_dropout\": 0.1,\n",
      "  \"gradient_checkpointing\": false,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout\": 0.1,\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"layerdrop\": 0.1,\n",
      "  \"mask_feature_length\": 10,\n",
      "  \"mask_feature_prob\": 0.0,\n",
      "  \"mask_time_length\": 10,\n",
      "  \"mask_time_prob\": 0.05,\n",
      "  \"model_type\": \"wav2vec2\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_codevector_groups\": 2,\n",
      "  \"num_codevectors_per_group\": 320,\n",
      "  \"num_conv_pos_embedding_groups\": 16,\n",
      "  \"num_conv_pos_embeddings\": 128,\n",
      "  \"num_feat_extract_layers\": 7,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"num_negatives\": 100,\n",
      "  \"pad_token_id\": 19,\n",
      "  \"proj_codevector_dim\": 256,\n",
      "  \"transformers_version\": \"4.11.3\",\n",
      "  \"use_weighted_layer_sum\": false,\n",
      "  \"vocab_size\": 32\n",
      "}\n",
      "\n",
      "loading weights file https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/pytorch_model.bin from cache at /home/sharpcoder/.cache/huggingface/transformers/4cb133d3cf3e58e8a4e088b1fc826611a3bcf3d98b20a0bb49ce8cd5362411b7.beeaccfa4baf44ba6123c23938d8a17f48344361a5e7041782e537dfd78a2037\n",
      "All model checkpoint weights were used when initializing Wav2Vec2ForCTC.\n",
      "\n",
      "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from transformers import Wav2Vec2ForCTC\n",
    "\n",
    "model = Wav2Vec2ForCTC.from_pretrained(\n",
    "    #\"facebook/wav2vec2-base\",\n",
    "    \"facebook/wav2vec2-base-960h\",\n",
    "    ctc_loss_reduction=\"mean\", \n",
    "    pad_token_id=processor.tokenizer.pad_token_id,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PyTorch: setting up devices\n",
      "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n"
     ]
    }
   ],
   "source": [
    "from transformers import TrainingArguments\n",
    "from transformers import Trainer\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "  output_dir=\"./\",\n",
    "  group_by_length=True,\n",
    "  per_device_train_batch_size=8,\n",
    "  evaluation_strategy=\"steps\",\n",
    "  num_train_epochs=30,\n",
    "  fp16=False,\n",
    "  gradient_checkpointing=True,\n",
    "  save_steps=500,\n",
    "  eval_steps=500,\n",
    "  logging_steps=500,\n",
    "  learning_rate=1e-4,\n",
    "  weight_decay=0.005,\n",
    "  warmup_steps=1000,\n",
    "  save_total_limit=2,\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    data_collator=data_collator,\n",
    "    args=training_args,\n",
    "    compute_metrics=compute_metrics,\n",
    "    train_dataset=ds_prepared[\"train\"],\n",
    "    eval_dataset=ds_prepared[\"train\"],\n",
    "    tokenizer=processor.feature_extractor,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "d58f6b8c-441c-4fa9-a308-e687948875e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
      "***** Running training *****\n",
      "  Num examples = 1\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 8\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 8\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 3\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='3' max='3' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [3/3 00:02, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=3, training_loss=15.702210744222006, metrics={'train_runtime': 3.157, 'train_samples_per_second': 0.95, 'train_steps_per_second': 0.95, 'total_flos': 94374986431680.0, 'train_loss': 15.702210744222006, 'epoch': 3.0})"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "333d43cf-add3-4d78-bbca-b44c638519fe",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to ./\n",
      "Configuration saved in ./config.json\n",
      "Model weights saved in ./pytorch_model.bin\n",
      "Configuration saved in ./preprocessor_config.json\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "'Trainer' object has no attribute 'repo'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Input \u001b[0;32mIn [47]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhub_model_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msharpcoder/wav2vec2_bjorn\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m   2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m   2675\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m   2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m   2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'"
     ]
    }
   ],
   "source": [
    "trainer.push_to_hub(hub_model_id=\"sharpcoder/wav2vec2_bjorn\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5cb9a88-2443-4bd9-85ac-12bf80a9e325",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}