{ "cells": [ { "cell_type": "markdown", "id": "715a402a-44b9-4fa2-abf0-b0cfd2f3d80b", "metadata": {}, "source": [ "## Recording voice in Real Time" ] }, { "cell_type": "code", "execution_count": null, "id": "dbdf6bab-7418-4a6f-8b75-c31f98a6ada5", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Sprints:\n", "- [ ] Do Inference optimization of ASR LM\n", "- [ ] Train on train.other.500\n", "- [ ] Generate dataset for prompting\n", "\n", "Evaluation Dates: 20th - 21st June, 2023, 3:30 - 5:30pm\n", "Sharpen PPT Skills: 20th June, 3:30pm - 4:45pm\n", "Flow of the PPT:\n", "Demo -> Datasets -> Techniques -> Evaluation -> Q&A\n", "- [ Done ] Update the one pager deck slide\n", "https://sprinklr-my.sharepoint.com/:p:/r/personal/sricharan_narayanam_sprinklr_com/_layouts/15/Doc.aspx?sourcedoc=%7B84811f56-5fc7-4eaa-87d2-db4a3588d18c%7D&action=edit&wdPreviousSession=948ccc35-dc05-f1f9-612d-9a22300e25ba\n", "My PPT:\n", "https://sprinklr-my.sharepoint.com/:p:/p/darshan_makwana/Ec4jCiyMWhxMproH625msc8BClFVceNQ8o4kS3EhZBO9MA?e=YCSDxm&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718703689001&web=1\n", "Intern Tracker:\n", "https://sprinklr.sharepoint.com/:x:/s/AIIntuition/EbRhHPIAIw9MlZ5PpXbztmABde1LFbaSoSHJAo9qU8ggDg?e=xiLkRt&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718692666812&web=1\n", "\"\"\"" ] }, { "cell_type": "markdown", "id": "150aca01-4098-4ab2-809a-25775ec52069", "metadata": {}, "source": [ "## ASR LM Inference" ] }, { "cell_type": "code", "execution_count": null, "id": "804a58af-beb2-48c1-9530-98024e27c0d6", "metadata": {}, "outputs": [], "source": [ "from audio_tokenizer import Data2vecFeatureReader\n", "from repcodec.RepCodec import RepCodec\n", "import torch.nn.functional as F\n", "import torch\n", "import yaml\n", "\n", "reader = Data2vecFeatureReader(\"./../prompting/models/vox_pretrained.pt\", 18, device=\"cuda:0\", max_chunk=1600000)\n", "\n", "config = \"./repcodec/configs/repcodec_dim1024.yaml\"\n", "with open(config) as fp:\n", " conf = yaml.load(fp, Loader=yaml.FullLoader)\n", "\n", "audio_model = RepCodec(**conf)\n", "audio_model.load_state_dict(torch.load(\"./../prompting/models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n", "audio_model.quantizer.initial()\n", "audio_model.to(\"cuda:0\")\n", "audio_model.eval()\n", "\n", "print(\"Successfully Loaded Audio Tokenizer\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7d8da397-2030-4b36-9a42-97862488797b", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "cache_dir = \"./../cache\"\n", "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "bb8016b2-fc9d-4c23-9e85-b6e1c5ca164c", "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. 
Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[2], line 33\u001b[0m\n\u001b[1;32m 30\u001b[0m eot_token \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|endoftranscript|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 31\u001b[0m pad_token \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|padding|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m---> 33\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mGPT2LMHeadModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./../out/checkpoint-10000\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattn_implementation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mflash_attention_2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39meval()\n\u001b[1;32m 34\u001b[0m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mpad_token_id \u001b[38;5;241m=\u001b[39m pad_token\n\u001b[1;32m 35\u001b[0m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39meos_token_id \u001b[38;5;241m=\u001b[39m eot_token\n", "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:3620\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 3617\u001b[0m init_contexts\u001b[38;5;241m.\u001b[39mappend(init_empty_weights())\n\u001b[1;32m 3619\u001b[0m config \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(config) \u001b[38;5;66;03m# We do not want to modify the config inplace in from_pretrained.\u001b[39;00m\n\u001b[0;32m-> 3620\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_autoset_attn_implementation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3621\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\n\u001b[1;32m 3622\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3624\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[1;32m 
3625\u001b[0m \u001b[38;5;66;03m# Let's make sure we don't run the init function of buffer modules\u001b[39;00m\n\u001b[1;32m 3626\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(config, \u001b[38;5;241m*\u001b[39mmodel_args, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs)\n", "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1469\u001b[0m, in \u001b[0;36mPreTrainedModel._autoset_attn_implementation\u001b[0;34m(cls, config, use_flash_attention_2, torch_dtype, device_map, check_device_map)\u001b[0m\n\u001b[1;32m 1466\u001b[0m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m-> 1469\u001b[0m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_and_enable_flash_attn_2\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1470\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1471\u001b[0m \u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1472\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1473\u001b[0m \u001b[43m \u001b[49m\u001b[43mhard_check_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1474\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_device_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_device_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1475\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1476\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msdpa\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available():\n\u001b[1;32m 1477\u001b[0m \u001b[38;5;66;03m# use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif.\u001b[39;00m\n\u001b[1;32m 1478\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_sdpa(\n\u001b[1;32m 1479\u001b[0m config,\n\u001b[1;32m 1480\u001b[0m hard_check_only\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 1481\u001b[0m )\n", "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1571\u001b[0m, in \u001b[0;36mPreTrainedModel._check_and_enable_flash_attn_2\u001b[0;34m(cls, config, torch_dtype, device_map, check_device_map, hard_check_only)\u001b[0m\n\u001b[1;32m 1568\u001b[0m install_message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease refer to the documentation of 
https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1570\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mutil\u001b[38;5;241m.\u001b[39mfind_spec(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1571\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpreface\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m the package flash_attn seems to be not installed. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minstall_message\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1573\u001b[0m flash_attention_version \u001b[38;5;241m=\u001b[39m version\u001b[38;5;241m.\u001b[39mparse(importlib\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mversion(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 1574\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mversion\u001b[38;5;241m.\u001b[39mcuda:\n", "\u001b[0;31mImportError\u001b[0m: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2." ] } ], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n", "import torch\n", "import string\n", "\n", "def process(text):\n", "\n", " # Lower case every letter\n", " text = text.lower()\n", "\n", " # Remove punctuation\n", " punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n", " translation_table = str.maketrans('', '', punctuation_to_remove)\n", " text = text.translate(translation_table)\n", "\n", " # Remove whitespaces from front and behind\n", " while text[0] == ' ' or text[-1] == ' ':\n", " if text[0] == ' ':\n", " text = text[1:]\n", " if text[-1] == ' ':\n", " text = text[:-1]\n", " \n", " return text\n", "\n", "device = \"cuda:0\"\n", "dtype = torch.float16\n", "context_length = 1877\n", "\n", "# Load tokenizer and add audio tokens\n", "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n", "eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n", "pad_token = tokenizer.encode(\"<|padding|>\")[0]\n", "\n", "model = GPT2LMHeadModel.from_pretrained(\"./../out/checkpoint-10000\", attn_implementation=\"flash_attention_2\", device_map=device, torch_dtype=dtype).eval()\n", "model.config.pad_token_id = pad_token\n", "model.config.eos_token_id = eot_token\n", "# model = torch.compile(model)" ] }, { "cell_type": "code", "execution_count": null, "id": "7cabe9dc-bbbf-41b4-918f-3f60ee5582f2", "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm\n", "from math import ceil\n", "import torch\n", "import time\n", "\n", "sample = dataset[\"train.clean.100\"][5]\n", "\n", "x = sample[\"audio\"][\"array\"]\n", "\n", "start_time = time.time()\n", "\n", "with torch.no_grad():\n", " x = torch.from_numpy(x).float().to(reader.device)\n", " if reader.task.cfg.normalize:\n", " x = F.layer_norm(x, x.shape)\n", " x = x.view(1, -1)\n", "\n", " feat = []\n", " for start in range(0, x.size(1), reader.max_chunk):\n", " x_chunk = x[:, start: 
start + reader.max_chunk]\n", " res = reader.model.extract_features(\n", " source=x_chunk,\n", " padding_mask=None,\n", " mask=False,\n", " layer=reader.layer,\n", " )\n", " feat_chunk = res[\"x\"]\n", " feat.append(feat_chunk)\n", " \n", " features = torch.cat(feat, 1).permute(0, 2, 1)\n", "\n", " x = audio_model.encoder(features)\n", " z = audio_model.projector(x)\n", " _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n", " tokens = idx.cpu().data.numpy().tolist()[0]\n", " \n", "text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n", "input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n", "\n", "input_time = time.time()\n", "\n", "generations = model.generate(\n", " input_ids,\n", " pad_token_id = pad_token,\n", " eos_token_id = eot_token,\n", " max_new_tokens = context_length,\n", " use_cache=True\n", ")\n", "\n", "finish_time = time.time()\n", "\n", "tokenizer.batch_decode(generations, skip_special_tokens=True)\n", "print(\"First Token Latency: \", (input_time - start_time) * 1000, \"ms\")\n", "# print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n", "print(\"End to End Inference Time: \", (finish_time - start_time) * 1000, \"ms\")\n", "print(\"Refer Text: \", process(sample[\"text\"]))\n", "print(\"Transcript: \", tokenizer.batch_decode(generations, skip_special_tokens=True)[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "baa8d79b-7cf5-4435-838c-1f3d4e043d60", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "sample = dataset[\"train.clean.100\"][0]\n", "\n", "x = sample[\"audio\"][\"array\"]\n", "\n", "start_time = time.time()\n", "\n", "with torch.no_grad():\n", " x = torch.from_numpy(x).float().to(reader.device)\n", " if reader.task.cfg.normalize:\n", " x = F.layer_norm(x, x.shape)\n", " x = x.view(1, -1)\n", "\n", " feat = []\n", " for start in range(0, x.size(1), reader.max_chunk):\n", " x_chunk = x[:, start: start + reader.max_chunk]\n", " res = reader.model.extract_features(\n", " source=x_chunk,\n", " padding_mask=None,\n", " mask=False,\n", " layer=reader.layer,\n", " )\n", " feat_chunk = res[\"x\"]\n", " feat.append(feat_chunk)\n", " \n", " features = torch.cat(feat, 1).permute(0, 2, 1)\n", "\n", " x = audio_model.encoder(features)\n", " z = audio_model.projector(x)\n", " _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n", " tokens = idx.cpu().data.numpy().tolist()[0]\n", "\n", "from tqdm import tqdm\n", "from math import ceil\n", "import torch\n", "\n", "context_length = 1877\n", "eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n", "pad_token = tokenizer.encode(\"<|padding|>\")[0]\n", " \n", "text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n", "input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n", "\n", "max_new_tokens = context_length\n", "num_tokens = 0\n", "first_token = True\n", "\n", "while max_new_tokens > 0 and input_ids.shape[-1] < context_length:\n", "\n", " with torch.no_grad():\n", " outputs = model(input_ids = input_ids)\n", "\n", " logits = outputs[\"logits\"][:, -1]\n", "\n", " # Greedy Sampling\n", " probas = torch.softmax(logits, dim=-1)\n", " pred_idx = torch.argmax(probas, dim=-1, keepdim=True)\n", " next_idx = pred_idx.item()\n", "\n", " if first_token:\n", " first_token_latency = time.time() - start_time\n", " first_token = False\n", " start_time = time.time()\n", "\n", " if next_idx == eot_token:\n", " break\n", 
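"\n", " # append the greedy token and continue; past_key_values are not reused, so each step re-processes the full sequence\n",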
"\n", " input_ids = torch.cat((input_ids, pred_idx), dim=-1)\n", "\n", " max_new_tokens -= 1\n", " num_tokens += 1\n", "\n", "total_time = time.time() - start_time\n", "\n", "print(\"First Token Latency: \", first_token_latency * 1000, \"ms\")\n", "print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n", "print(\"End to End Inference Time: \", (total_time + first_token_latency) * 1000, \"ms\")\n", "print(tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0])\n", "print(process(sample[\"text\"]))" ] }, { "cell_type": "code", "execution_count": null, "id": "014ed999-3293-4d68-8f9c-017584adc642", "metadata": {}, "outputs": [], "source": [ "tokenizer.batch_decode([[1, 2, 3]])" ] }, { "cell_type": "markdown", "id": "ec11e43f-1eb8-4399-9a93-6f1427782661", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Accelerating GPT 2 Inference" ] }, { "cell_type": "code", "execution_count": null, "id": "5489cb4e-3213-4931-abe1-4c96d1a7ba56", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "- change tensorrt.tensorrt to tensorrt\n", "- remove cpu quantization lines\n", "- output_names [\"logits\"]\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7e7e6ea6-7319-4e57-af33-5d917d26abc6", "metadata": {}, "outputs": [], "source": [ "import logging\n", "import time\n", "from typing import Callable, Dict\n", "\n", "import numpy as np\n", "import tensorrt as trt\n", "import torch\n", "from tensorrt import ICudaEngine\n", "from tensorrt import Logger, Runtime\n", "from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, AutoModelForCausalLM\n", "from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions\n", "from transformer_deploy.utils.generative_model import GPTModelWrapper\n", "import inspect\n", "from transformers import TensorType\n", "\n", "from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx\n", "from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size\n", "from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine" ] }, { "cell_type": "code", "execution_count": null, "id": "21681412-7747-4824-894a-6006eb12a821", "metadata": {}, "outputs": [], "source": [ "model_name = \"gpt2\"\n", "\n", "model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)\n", "model.eval()\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model.config.pad_token_id = tokenizer.eos_token_id" ] }, { "cell_type": "code", "execution_count": null, "id": "46783acd-c404-44b4-904b-d8fb687afc34", "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer(\"Here is some text to encode Hello World\", return_tensors=\"pt\")\n", "print(\"input tensors\")\n", "print(inputs)\n", "print(\"input tensor shape\")\n", "print(inputs[\"input_ids\"].size())\n", "\n", "with torch.no_grad():\n", " outputs = model(**inputs)\n", "\n", "logits = outputs.logits\n", "print(\"output tensor\")\n", "print(logits)\n", "print(\"output shape\")\n", "print(logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "2f6cc7bd-5e2d-4d4e-a7e6-73a6b2ecd7af", "metadata": {}, "outputs": [], "source": [ "size = 0\n", "for i in range(8, 256, 1):\n", " # input sequence (input_ids) made of int-32 (4 bytes)\n", " size += np.prod([1, i]) * 4\n", " # output tensor made of float-32 (4 bytes)\n", " size += np.prod([1, i, 50257]) * 4\n", "print(f\"total size (input+output): {size / 1024**3:.2f} Gb\")\n", "\n", 
"# to manually check actual tensor size:\n", "# np.prod(logits.shape)*32/8/1024**2:.2f}\n", "# or\n", "# sys.getsizeof(logits.storage())/1024**2" ] }, { "cell_type": "code", "execution_count": null, "id": "7debb40e-9941-45e4-9db8-4bb021ce44ab", "metadata": {}, "outputs": [], "source": [ "input_ids: BatchEncoding = tokenizer(\n", " \"Here is some text to encode Hello World\", add_special_tokens=True, return_attention_mask=False, return_tensors=\"pt\"\n", ")\n", "# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type\n", "for k, v in input_ids.items(): # type: str, torch.Tensor\n", " input_ids[k] = v.type(dtype=torch.int32)\n", "\n", "convert_to_onnx(\n", " model_pytorch=model,\n", " output_path=\"test-gpt2.onnx\",\n", " inputs_pytorch=dict(input_ids),\n", " quantization=False,\n", " var_output_seq=True, # we inform ONNX export tool that the output shape will vary with the input shape\n", " output_names = [\"logits\"]\n", ")\n", "# model may switch to train mode for some unknown reasons, we force the eval mode.\n", "_ = model.eval()" ] }, { "cell_type": "code", "execution_count": null, "id": "956c3007-2c18-4d92-af4f-6cef474d86b5", "metadata": {}, "outputs": [], "source": [ "logging.basicConfig()\n", "logging.getLogger().setLevel(logging.INFO)\n", "num_attention_heads, hidden_size = get_model_size(path=model_name)\n", "optimize_onnx(\n", " onnx_path=\"test-gpt2.onnx\",\n", " onnx_optim_model_path=\"test-gpt2-opt.onnx\",\n", " fp16=False,\n", " use_cuda=True,\n", " num_attention_heads=num_attention_heads,\n", " hidden_size=hidden_size,\n", " architecture=\"gpt2\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "85f30ed9-2802-46c9-9201-a70e200b6860", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "trt_logger: Logger = trt.Logger(trt.Logger.ERROR)\n", "runtime: Runtime = trt.Runtime(trt_logger)\n", "trt_model_name = \"test-gpt2.plan\"\n", "\n", "# create only of does not exist because it's slow to run...\n", "\n", "engine: ICudaEngine = build_engine(\n", " runtime=runtime,\n", " onnx_file_path=\"test-gpt2.onnx\",\n", " logger=trt_logger,\n", " min_shape=(1, 1),\n", " optimal_shape=(1, 128), # num beam, batch size\n", " max_shape=(1, 384), # num beam, batch size\n", " workspace_size=10000 * 1024**2,\n", " fp16=True,\n", " int8=False,\n", ")\n", "save_engine(engine, trt_model_name)" ] }, { "cell_type": "code", "execution_count": null, "id": "908fe664-800e-4c5f-a1d5-adfd31fd1c64", "metadata": {}, "outputs": [], "source": [ "engine.num_bindings" ] }, { "cell_type": "code", "execution_count": null, "id": "4626926b-fa94-4633-95d5-0d515f8db5f6", "metadata": {}, "outputs": [], "source": [ "print(inspect.getsource(GPTModelWrapper))" ] }, { "cell_type": "code", "execution_count": null, "id": "d5bd1de1-a949-46a3-8d15-457d51db4e40", "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer(\n", " \"Here is some text to encode Hello World\", # Nvidia example prompt\n", " add_special_tokens=True,\n", " return_attention_mask=False, # Not used\n", " return_tensors=TensorType.PYTORCH,\n", ")\n", "inputs" ] }, { "cell_type": "code", "execution_count": null, "id": "815b548f-fa00-4183-b72c-10ecdd4b11c7", "metadata": {}, "outputs": [], "source": [ "from transformers.generation import GenerationConfig\n", "\n", "class GPTWrapper(GPTModelWrapper):\n", " def __init__(self, *args, **kwargs):\n", " super().__init__(*args, **kwargs)\n", "\n", " self.generation_config = GenerationConfig.from_model_config(self.config) 
if self.can_generate() else None\n", "\n", " @classmethod\n", " def can_generate(cls) -> bool:\n", " \"\"\"\n", " Returns whether this model can generate sequences with `.generate()`.\n", "\n", " Returns:\n", " `bool`: Whether this model can generate sequences with `.generate()`.\n", " \"\"\"\n", " # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.\n", " # Alternativelly, the model can also have a custom `generate` function.\n", " if \"GenerationMixin\" in str(cls.prepare_inputs_for_generation) and \"GenerationMixin\" in str(cls.generate):\n", " return False\n", " return True" ] }, { "cell_type": "code", "execution_count": null, "id": "ca57ed1e-0bbe-48dd-ae0f-f3d8ecd7fd04", "metadata": {}, "outputs": [], "source": [ "def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:\n", " transformer_outputs: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)\n", " return model.lm_head(transformer_outputs.last_hidden_state)\n", "\n", "\n", "model.cuda()\n", "model.eval()\n", "inputs.to(\"cuda\")\n", "with torch.inference_mode():\n", " gpt2_model = GPTWrapper(config=model.config, device=model.device, inference=inference_torch)\n", " sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", " print(tokenizer.decode(sample_output[0], skip_special_tokens=False))\n", " for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", " torch.cuda.synchronize()\n", " start = time.time()\n", " for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", " torch.cuda.synchronize()\n", " print(f\"----\\nPytorch: {(time.time() - start)/10:.2f}s/sequence\")\n", "_ = model.cpu()" ] }, { "cell_type": "code", "execution_count": null, "id": "f0849aae-876e-47bc-b045-14a594170947", "metadata": {}, "outputs": [], "source": [ "model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n", "\n", "\n", "def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:\n", " data = {\"input_ids\": input_ids.detach().cpu().numpy().astype(np.int32)}\n", " logit = model_onnx.run(None, data)\n", " np_logit = np.array(logit) # convert list of numpy arrays to a numpy array\n", " # we convert numpy tensor to Pytorch tensor as it's the type expected by HF decoding algorithm\n", " return torch.squeeze(torch.from_numpy(np_logit), dim=0)\n", "\n", "\n", "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cpu\"), inference=inference_onnx_naive)\n", "inputs.to(\"cpu\")\n", "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n", "for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "start = time.time()\n", "for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", "print(f\"----\\nONNX Runtime (standard API): {(time.time() - start)/10:.2f}s/sequence\")\n", "\n", "del model_onnx" ] }, { "cell_type": "code", "execution_count": null, "id": "96114897-894b-4997-bc61-8ac0682e0e55", "metadata": {}, "outputs": [], "source": [ "model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n", "\n", "\n", "def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:\n", " data = {\"input_ids\": input_ids}\n", " return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device=\"cuda\")[\"output\"]\n", 
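"\n", "# unlike inference_onnx_naive above, inference_onnx_binding uses ONNX Runtime IO binding with torch CUDA tensors, so logits stay on the GPU instead of round-tripping through numpy on the host\n",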
"\n", "\n", "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_onnx_optimized)\n", "inputs.to(\"cuda\")\n", "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n", "for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "start = time.time()\n", "for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", "print(f\"----\\nONNX Runtime (binding io API): {(time.time() - start)/10:.2f}/sequence\")\n", "del model_onnx" ] }, { "cell_type": "code", "execution_count": null, "id": "0b5b5427-fd6b-4f70-b307-9c579f0f842a", "metadata": {}, "outputs": [], "source": [ "tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(\n", " engine_file_path=\"test-gpt2.plan\", runtime=runtime\n", ")\n", "\n", "\n", "def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:\n", " data = {\"input_ids\": input_ids}\n", " return tensorrt_model(data)\n", "\n", "\n", "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_tensorrt)\n", "inputs.to(\"cuda\")\n", "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n", "for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "start = time.time()\n", "for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", "print(f\"----\\nTensorRT + CUDA tensors: {(time.time() - start)/10:.2f}/sequence\")\n", "\n", "del tensorrt_model" ] }, { "cell_type": "markdown", "id": "f547239d-4f7a-433b-8ef6-9e5110a61f4b", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Using CUDAExecution Provider" ] }, { "cell_type": "code", "execution_count": null, "id": "6e34c682-85fc-4e8d-b13c-7c1c9ea39ead", "metadata": {}, "outputs": [], "source": [ "from optimum.onnxruntime import ORTModelForCausalLM\n", "from optimum.pipelines import pipeline\n", "from transformers import AutoTokenizer\n", "\n", "model_id = \"openai-community/gpt2\"\n", "\n", "ort_model = ORTModelForCausalLM.from_pretrained(\n", " model_id,\n", " export=True,\n", " provider=\"CUDAExecutionProvider\",\n", " use_io_binding=True\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")" ] }, { "cell_type": "code", "execution_count": null, "id": "17d28184-26db-4dd3-b24b-0c5a12b10d6d", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "start_time = time.time()\n", "\n", "generations = pipe(\"Both the music and visual were astounding, not to mention the actors performance.\")\n", "generations[0][\"generated_text\"]\n", "\n", "finish_time = time.time()\n", "\n", "print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")" ] }, { "cell_type": "markdown", "id": "19c4230a-3244-4dce-b5ef-d9927dec5c45", "metadata": {}, "source": [ "## ASR LM with CUDAExcecution Provider" ] }, { "cell_type": "code", "execution_count": null, "id": "0f0f1cdc-bfcd-46c5-80a4-60bc76366cf5", "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n", "from datasets import DatasetDict\n", "import torch\n", "\n", "device = \"cuda:0\"\n", "dtype = torch.float16\n", "\n", "dataset = 
DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n", "\n", "from optimum.onnxruntime import ORTModelForCausalLM\n", "from optimum.pipelines import pipeline\n", "from transformers import AutoTokenizer\n", "\n", "model_id = \"./../out/checkpoint-10000\"\n", "\n", "ort_model = ORTModelForCausalLM.from_pretrained(\n", " model_id,\n", " export=True,\n", " provider=\"CUDAExecutionProvider\",\n", " use_io_binding=True\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n", "\n", "pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")" ] }, { "cell_type": "code", "execution_count": null, "id": "9d32098c-b0ec-4c36-95ac-775a3a865512", "metadata": {}, "outputs": [], "source": [ "ort_model.config.eos_token_id = tokenizer.encode(\"<|endoftranscript|>\")[0]\n", "ort_model.config.bos_token_id = tokenizer.encode(\"<|startoftranscript|>\")[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "1fd0a1fb-9349-4c7a-af03-21e29334f420", "metadata": {}, "outputs": [], "source": [ "dataset[split][idx].keys()" ] }, { "cell_type": "code", "execution_count": null, "id": "15d8b989-6460-4555-b6e2-2f9e219d7034", "metadata": {}, "outputs": [], "source": [ "split = \"train.clean.100\"\n", "idx = 0\n", "\n", "text = \"\".join([ f\"<|audio:{tkn}|>\"for tkn in dataset[split][idx][\"audio_tokens\"]]) + \"<|startoftranscript|>\"\n", "\n", "import time\n", "\n", "start_time = time.time()\n", "\n", "generations = pipe(text, max_new_tokens=10, skip_special_tokens=True)\n", "\n", "finish_time = time.time()\n", "\n", "print(generations[0][\"generated_text\"])\n", "\n", "print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }