{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "715a402a-44b9-4fa2-abf0-b0cfd2f3d80b",
   "metadata": {},
   "source": [
    "## Recording voice in Real Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbdf6bab-7418-4a6f-8b75-c31f98a6ada5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Sprints:\n",
    "- [ ] Do Inference optimization of ASR LM\n",
    "- [ ] Train on train.other.500\n",
    "- [ ] Generate dataset for prompting\n",
    "\n",
    "Evaluation Dates: 20th - 21st June, 2023, 3:30 - 5:30pm\n",
    "Sharpen PPT Skills: 20th June, 3:30pm - 4:45pm\n",
    "Flow of the PPT:\n",
    "Demo -> Datasets -> Techniques -> Evaluation -> Q&A\n",
    "- [ Done ] Update the one pager deck slide\n",
    "https://sprinklr-my.sharepoint.com/:p:/r/personal/sricharan_narayanam_sprinklr_com/_layouts/15/Doc.aspx?sourcedoc=%7B84811f56-5fc7-4eaa-87d2-db4a3588d18c%7D&action=edit&wdPreviousSession=948ccc35-dc05-f1f9-612d-9a22300e25ba\n",
    "My PPT:\n",
    "https://sprinklr-my.sharepoint.com/:p:/p/darshan_makwana/Ec4jCiyMWhxMproH625msc8BClFVceNQ8o4kS3EhZBO9MA?e=YCSDxm&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718703689001&web=1\n",
    "Intern Tracker:\n",
    "https://sprinklr.sharepoint.com/:x:/s/AIIntuition/EbRhHPIAIw9MlZ5PpXbztmABde1LFbaSoSHJAo9qU8ggDg?e=xiLkRt&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718692666812&web=1\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "150aca01-4098-4ab2-809a-25775ec52069",
   "metadata": {},
   "source": [
    "## ASR LM Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "804a58af-beb2-48c1-9530-98024e27c0d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from audio_tokenizer import Data2vecFeatureReader\n",
    "from repcodec.RepCodec import RepCodec\n",
    "import torch.nn.functional as F\n",
    "import torch\n",
    "import yaml\n",
    "\n",
    "# Feature reader used later to turn raw waveforms into frame-level features.\n",
    "# NOTE(review): the positional 18 is presumably the layer index (later cells read reader.layer) - confirm.\n",
    "reader = Data2vecFeatureReader(\n",
    "    \"./../prompting/models/vox_pretrained.pt\",\n",
    "    18,\n",
    "    device=\"cuda:0\",\n",
    "    max_chunk=1600000,\n",
    ")\n",
    "\n",
    "# RepCodec constructor arguments are read from the YAML config file.\n",
    "config = \"./repcodec/configs/repcodec_dim1024.yaml\"\n",
    "with open(config) as fp:\n",
    "    conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
    "\n",
    "audio_model = RepCodec(**conf)\n",
    "\n",
    "# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.\n",
    "checkpoint = torch.load(\"./../prompting/models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")\n",
    "repcodec_weights = checkpoint[\"model\"][\"repcodec\"]\n",
    "audio_model.load_state_dict(repcodec_weights)\n",
    "\n",
    "audio_model.quantizer.initial()\n",
    "audio_model.to(\"cuda:0\")\n",
    "audio_model.eval()\n",
    "\n",
    "print(\"Successfully Loaded Audio Tokenizer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d8da397-2030-4b36-9a42-97862488797b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Directory passed to load_dataset as its cache location.\n",
    "cache_dir = \"./../cache\"\n",
    "\n",
    "# trust_remote_code=True allows the dataset's own loading script to run locally.\n",
    "dataset = load_dataset(\n",
    "    \"openslr/librispeech_asr\",\n",
    "    cache_dir=cache_dir,\n",
    "    trust_remote_code=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bb8016b2-fc9d-4c23-9e85-b6e1c5ca164c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n",
    "import torch\n",
    "import string\n",
    "\n",
    "def process(text):\n",
    "    \"\"\"Normalise a reference transcript so it can be compared with model output.\n",
    "\n",
    "    Lower-cases the text, removes every ASCII punctuation character except the\n",
    "    apostrophe, and strips leading and trailing spaces.\n",
    "\n",
    "    Args:\n",
    "        text (str): Raw transcript.\n",
    "\n",
    "    Returns:\n",
    "        str: Normalised transcript; an empty string when nothing is left.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "\n",
    "    # Apostrophes are kept so contractions such as \"don't\" survive.\n",
    "    punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n",
    "    text = text.translate(str.maketrans('', '', punctuation_to_remove))\n",
    "\n",
    "    # strip(' ') removes spaces only (tabs/newlines are left alone) and, unlike\n",
    "    # indexing text[0] / text[-1], does not raise on empty or all-space input.\n",
    "    return text.strip(' ')\n",
    "\n",
    "device = \"cuda:0\"\n",
    "dtype = torch.float16\n",
    "context_length = 1877\n",
    "\n",
    "# Load the tokenizer and look up the ids of the two control tokens used for generation.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
    "eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
    "pad_token = tokenizer.encode(\"<|padding|>\")[0]\n",
    "\n",
    "# Half-precision ASR language model placed directly on the GPU, in eval mode.\n",
    "model = GPT2LMHeadModel.from_pretrained(\"./../out/checkpoint-19000\", attn_implementation=\"flash_attention_2\", device_map=device, torch_dtype=dtype).eval()\n",
    "model.config.pad_token_id = pad_token\n",
    "model.config.eos_token_id = eot_token\n",
    "# model = torch.compile(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "693db182-92ac-4e36-b848-989fafd10e73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GPT2Model(\n",
       "  (wte): Embedding(6027, 768)\n",
       "  (wpe): Embedding(1877, 768)\n",
       "  (drop): Dropout(p=0.1, inplace=False)\n",
       "  (h): ModuleList(\n",
       "    (0-11): 12 x GPT2Block(\n",
       "      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "      (attn): GPT2FlashAttention2(\n",
       "        (c_attn): Conv1D()\n",
       "        (c_proj): Conv1D()\n",
       "        (attn_dropout): Dropout(p=0.1, inplace=False)\n",
       "        (resid_dropout): Dropout(p=0.1, inplace=False)\n",
       "      )\n",
       "      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "      (mlp): GPT2MLP(\n",
       "        (c_fc): Conv1D()\n",
       "        (c_proj): Conv1D()\n",
       "        (act): NewGELUActivation()\n",
       "        (dropout): Dropout(p=0.1, inplace=False)\n",
       "      )\n",
       "    )\n",
       "  )\n",
       "  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       ")"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.transformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cabe9dc-bbbf-41b4-918f-3f60ee5582f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from math import ceil\n",
    "import torch\n",
    "import time\n",
    "\n",
    "# Transcribe one utterance end to end: waveform -> audio tokens -> generated text.\n",
    "sample = dataset[\"train.clean.100\"][5]\n",
    "\n",
    "x = sample[\"audio\"][\"array\"]\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "with torch.no_grad():\n",
    "    x = torch.from_numpy(x).float().to(reader.device)\n",
    "    if reader.task.cfg.normalize:\n",
    "        x = F.layer_norm(x, x.shape)\n",
    "    x = x.view(1, -1)\n",
    "\n",
    "    # The waveform is processed in slices of at most reader.max_chunk samples.\n",
    "    feat = []\n",
    "    for start in range(0, x.size(1), reader.max_chunk):\n",
    "        x_chunk = x[:, start: start + reader.max_chunk]\n",
    "        res = reader.model.extract_features(\n",
    "            source=x_chunk,\n",
    "            padding_mask=None,\n",
    "            mask=False,\n",
    "            layer=reader.layer,\n",
    "        )\n",
    "        feat_chunk = res[\"x\"]\n",
    "        feat.append(feat_chunk)\n",
    "\n",
    "    features = torch.cat(feat, 1).permute(0, 2, 1)\n",
    "\n",
    "    # Encoder -> projector -> codebook lookup gives one index per frame.\n",
    "    x = audio_model.encoder(features)\n",
    "    z = audio_model.projector(x)\n",
    "    _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
    "    tokens = idx.cpu().data.numpy().tolist()[0]\n",
    "\n",
    "# Prompt layout: one <|audio:N|> marker per audio token, then the transcript start marker.\n",
    "text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n",
    "input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n",
    "\n",
    "input_time = time.time()\n",
    "\n",
    "# Prompt and generated tokens share the same context_length positions, so only\n",
    "# the positions left after the prompt may be generated. Asking for more would\n",
    "# run past the model's position table when no end-of-transcript token appears.\n",
    "prompt_length = input_ids.shape[-1]\n",
    "max_new_tokens = context_length - prompt_length\n",
    "if max_new_tokens <= 0:\n",
    "    raise ValueError(\n",
    "        f\"Prompt has {prompt_length} tokens but the context window is {context_length}; \"\n",
    "        \"no room left to generate a transcript.\"\n",
    "    )\n",
    "\n",
    "generations = model.generate(\n",
    "    input_ids,\n",
    "    pad_token_id=pad_token,\n",
    "    eos_token_id=eot_token,\n",
    "    max_new_tokens=max_new_tokens,\n",
    "    use_cache=True,\n",
    ")\n",
    "\n",
    "finish_time = time.time()\n",
    "\n",
    "# Decode once and reuse the result for printing.\n",
    "transcript = tokenizer.batch_decode(generations, skip_special_tokens=True)[0]\n",
    "num_new_tokens = generations.shape[-1] - prompt_length\n",
    "\n",
    "# The interval before generate() is prompt preparation, not first-token latency.\n",
    "print(\"Prompt Preparation Time: \", (input_time - start_time) * 1000, \"ms\")\n",
    "print(\"Throughput: \", num_new_tokens / (finish_time - input_time), \"tokens/s\")\n",
    "print(\"End to End Inference Time: \", (finish_time - start_time) * 1000, \"ms\")\n",
    "print(\"Refer Text: \", process(sample[\"text\"]))\n",
    "print(\"Transcript: \", transcript)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baa8d79b-7cf5-4435-838c-1f3d4e043d60",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from math import ceil\n",
    "\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Same pipeline as the generate() cell, but with a hand-written greedy loop so\n",
    "# first-token latency and per-step throughput can be measured separately.\n",
    "sample = dataset[\"train.clean.100\"][0]\n",
    "\n",
    "x = sample[\"audio\"][\"array\"]\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "with torch.no_grad():\n",
    "    x = torch.from_numpy(x).float().to(reader.device)\n",
    "    if reader.task.cfg.normalize:\n",
    "        x = F.layer_norm(x, x.shape)\n",
    "    x = x.view(1, -1)\n",
    "\n",
    "    # The waveform is processed in slices of at most reader.max_chunk samples.\n",
    "    feat = []\n",
    "    for start in range(0, x.size(1), reader.max_chunk):\n",
    "        x_chunk = x[:, start: start + reader.max_chunk]\n",
    "        res = reader.model.extract_features(\n",
    "            source=x_chunk,\n",
    "            padding_mask=None,\n",
    "            mask=False,\n",
    "            layer=reader.layer,\n",
    "        )\n",
    "        feat_chunk = res[\"x\"]\n",
    "        feat.append(feat_chunk)\n",
    "\n",
    "    features = torch.cat(feat, 1).permute(0, 2, 1)\n",
    "\n",
    "    x = audio_model.encoder(features)\n",
    "    z = audio_model.projector(x)\n",
    "    _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
    "    tokens = idx.cpu().data.numpy().tolist()[0]\n",
    "\n",
    "context_length = 1877\n",
    "eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
    "pad_token = tokenizer.encode(\"<|padding|>\")[0]\n",
    "\n",
    "text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n",
    "input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n",
    "\n",
    "# Without room for at least one new token the loop below would never run and\n",
    "# the latency figures would be undefined.\n",
    "if input_ids.shape[-1] >= context_length:\n",
    "    raise ValueError(\n",
    "        f\"Prompt has {input_ids.shape[-1]} tokens but the context window is {context_length}; \"\n",
    "        \"no room left to generate a transcript.\"\n",
    "    )\n",
    "\n",
    "max_new_tokens = context_length\n",
    "num_tokens = 0  # tokens appended to input_ids (the end-of-transcript token is never appended)\n",
    "decode_steps = 0  # forward passes that fall inside total_time, i.e. every pass after the first\n",
    "first_token_latency = None\n",
    "\n",
    "while max_new_tokens > 0 and input_ids.shape[-1] < context_length:\n",
    "\n",
    "    # No KV cache here: each step runs the model on the whole sequence again.\n",
    "    with torch.no_grad():\n",
    "        outputs = model(input_ids = input_ids)\n",
    "\n",
    "    logits = outputs[\"logits\"][:, -1]\n",
    "\n",
    "    # Greedy decoding: take the most probable next token.\n",
    "    probas = torch.softmax(logits, dim=-1)\n",
    "    pred_idx = torch.argmax(probas, dim=-1, keepdim=True)\n",
    "    next_idx = pred_idx.item()\n",
    "\n",
    "    if first_token_latency is None:\n",
    "        # Covers audio tokenization plus the first forward pass.\n",
    "        first_token_latency = time.time() - start_time\n",
    "        # Restart the clock so total_time measures the remaining steps only.\n",
    "        start_time = time.time()\n",
    "    else:\n",
    "        decode_steps += 1\n",
    "\n",
    "    if next_idx == eot_token:\n",
    "        break\n",
    "\n",
    "    input_ids = torch.cat((input_ids, pred_idx), dim=-1)\n",
    "\n",
    "    max_new_tokens -= 1\n",
    "    num_tokens += 1\n",
    "\n",
    "total_time = time.time() - start_time\n",
    "\n",
    "# total_time excludes the first step, so only the later steps count towards throughput.\n",
    "throughput = decode_steps / total_time if decode_steps else float(\"nan\")\n",
    "\n",
    "print(\"First Token Latency: \", first_token_latency * 1000, \"ms\")\n",
    "print(\"Throughput: \", throughput, \"tokens/s\")\n",
    "print(\"End to End Inference Time: \", (total_time + first_token_latency) * 1000, \"ms\")\n",
    "print(tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0])\n",
    "print(process(sample[\"text\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "014ed999-3293-4d68-8f9c-017584adc642",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.batch_decode([[1, 2, 3]])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec11e43f-1eb8-4399-9a93-6f1427782661",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Accelerating GPT 2 Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5489cb4e-3213-4931-abe1-4c96d1a7ba56",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "- change tensorrt.tensorrt to tensorrt\n",
    "- remove cpu quantization lines\n",
    "- output_names [\"logits\"]\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e7e6ea6-7319-4e57-af33-5d917d26abc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import time\n",
    "from typing import Callable, Dict\n",
    "\n",
    "import numpy as np\n",
    "import tensorrt as trt\n",
    "import torch\n",
    "from tensorrt import ICudaEngine\n",
    "from tensorrt import Logger, Runtime\n",
    "from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, AutoModelForCausalLM\n",
    "from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions\n",
    "from transformer_deploy.utils.generative_model import GPTModelWrapper\n",
    "import inspect\n",
    "from transformers import TensorType\n",
    "\n",
    "from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx\n",
    "from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size\n",
    "from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21681412-7747-4824-894a-6006eb12a821",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"gpt2\"\n",
    "\n",
    "# Reference PyTorch model for the benchmarks below, put in inference mode.\n",
    "model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)\n",
    "model.eval()\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# The end-of-sequence id doubles as the padding id.\n",
    "model.config.pad_token_id = tokenizer.eos_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46783acd-c404-44b4-904b-d8fb687afc34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode a short prompt and show what goes into the model.\n",
    "inputs = tokenizer(\"Here is some text to encode Hello World\", return_tensors=\"pt\")\n",
    "print(\"input tensors\", inputs, sep=\"\\n\")\n",
    "print(\"input tensor shape\", inputs[\"input_ids\"].size(), sep=\"\\n\")\n",
    "\n",
    "# Single forward pass without gradient tracking.\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs)\n",
    "\n",
    "logits = outputs.logits\n",
    "print(\"output tensor\", logits, sep=\"\\n\")\n",
    "print(\"output shape\", logits.shape, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f6cc7bd-5e2d-4d4e-a7e6-73a6b2ecd7af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add up input + output tensor sizes for every sequence length from 8 to 255.\n",
    "size = 0\n",
    "for seq_len in range(8, 256):\n",
    "    # input sequence (input_ids): one 4-byte integer per position\n",
    "    input_bytes = np.prod([1, seq_len]) * 4\n",
    "    # output tensor: one 4-byte float per position and vocabulary entry\n",
    "    output_bytes = np.prod([1, seq_len, 50257]) * 4\n",
    "    size += input_bytes + output_bytes\n",
    "print(f\"total size (input+output): {size / 1024**3:.2f} Gb\")\n",
    "\n",
    "# to manually check the actual size of a tensor in MiB:\n",
    "# f\"{np.prod(logits.shape) * 32 / 8 / 1024**2:.2f}\"\n",
    "# or\n",
    "# sys.getsizeof(logits.storage()) / 1024**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7debb40e-9941-45e4-9db8-4bb021ce44ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example input used to trace the model during export (no attention mask).\n",
    "input_ids: BatchEncoding = tokenizer(\n",
    "    \"Here is some text to encode Hello World\",\n",
    "    add_special_tokens=True,\n",
    "    return_attention_mask=False,\n",
    "    return_tensors=\"pt\",\n",
    ")\n",
    "\n",
    "# Some inference engines do not accept int64 inputs, so every tensor is cast to int32.\n",
    "for k in list(input_ids.keys()):\n",
    "    input_ids[k] = input_ids[k].type(dtype=torch.int32)\n",
    "\n",
    "convert_to_onnx(\n",
    "    model_pytorch=model,\n",
    "    output_path=\"test-gpt2.onnx\",\n",
    "    inputs_pytorch=dict(input_ids),\n",
    "    quantization=False,\n",
    "    # tells the ONNX export tool that the output shape varies with the input shape\n",
    "    var_output_seq=True,\n",
    "    output_names=[\"logits\"],\n",
    ")\n",
    "\n",
    "# The export may leave the model in train mode, so eval mode is forced again.\n",
    "_ = model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "956c3007-2c18-4d92-af4f-6cef474d86b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show INFO-level log records from the optimisation step.\n",
    "logging.basicConfig()\n",
    "root_logger = logging.getLogger()\n",
    "root_logger.setLevel(logging.INFO)\n",
    "\n",
    "# Model dimensions are looked up from model_name.\n",
    "num_attention_heads, hidden_size = get_model_size(path=model_name)\n",
    "\n",
    "onnx_optim_options = {\n",
    "    \"onnx_path\": \"test-gpt2.onnx\",\n",
    "    \"onnx_optim_model_path\": \"test-gpt2-opt.onnx\",\n",
    "    \"fp16\": False,\n",
    "    \"use_cuda\": True,\n",
    "    \"num_attention_heads\": num_attention_heads,\n",
    "    \"hidden_size\": hidden_size,\n",
    "    \"architecture\": \"gpt2\",\n",
    "}\n",
    "optimize_onnx(**onnx_optim_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85f30ed9-2802-46c9-9201-a70e200b6860",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "trt_logger: Logger = trt.Logger(trt.Logger.ERROR)\n",
    "runtime: Runtime = trt.Runtime(trt_logger)\n",
    "trt_model_name = \"test-gpt2.plan\"\n",
    "\n",
    "# NOTE(review): building is slow and this cell rebuilds the engine on every run;\n",
    "# consider skipping the build when the plan file already exists.\n",
    "workspace_bytes = 10000 * 1024**2\n",
    "\n",
    "engine: ICudaEngine = build_engine(\n",
    "    runtime=runtime,\n",
    "    onnx_file_path=\"test-gpt2.onnx\",\n",
    "    logger=trt_logger,\n",
    "    min_shape=(1, 1),\n",
    "    optimal_shape=(1, 128),  # num beam, batch size\n",
    "    max_shape=(1, 384),  # num beam, batch size\n",
    "    workspace_size=workspace_bytes,\n",
    "    fp16=True,\n",
    "    int8=False,\n",
    ")\n",
    "\n",
    "# Persist the engine so a later cell can load it from the plan file.\n",
    "save_engine(engine, trt_model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "908fe664-800e-4c5f-a1d5-adfd31fd1c64",
   "metadata": {},
   "outputs": [],
   "source": [
    "engine.num_bindings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4626926b-fa94-4633-95d5-0d515f8db5f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(inspect.getsource(GPTModelWrapper))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5bd1de1-a949-46a3-8d15-457d51db4e40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt shared by all benchmark cells below; only input_ids are needed.\n",
    "inputs = tokenizer(\n",
    "    \"Here is some text to encode Hello World\",\n",
    "    add_special_tokens=True,\n",
    "    return_attention_mask=False,\n",
    "    return_tensors=TensorType.PYTORCH,\n",
    ")\n",
    "inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "815b548f-fa00-4183-b72c-10ecdd4b11c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers.generation import GenerationConfig\n",
    "\n",
    "class GPTWrapper(GPTModelWrapper):\n",
    "    \"\"\"GPTModelWrapper that also carries a `generation_config` attribute.\"\"\"\n",
    "\n",
    "    def __init__(self, *args, **kwargs):\n",
    "        \"\"\"Forward all arguments to GPTModelWrapper, then attach a generation config.\"\"\"\n",
    "        super().__init__(*args, **kwargs)\n",
    "\n",
    "        # Built from the model config when generation is supported, otherwise None.\n",
    "        if self.can_generate():\n",
    "            self.generation_config = GenerationConfig.from_model_config(self.config)\n",
    "        else:\n",
    "            self.generation_config = None\n",
    "\n",
    "    @classmethod\n",
    "    def can_generate(cls) -> bool:\n",
    "        \"\"\"\n",
    "        Tell whether this class can produce sequences with `.generate()`.\n",
    "\n",
    "        Returns:\n",
    "            `bool`: False only when both `prepare_inputs_for_generation` and `generate`\n",
    "            are still the `GenerationMixin` versions; True otherwise.\n",
    "        \"\"\"\n",
    "        # An overridden `prepare_inputs_for_generation` is enough on its own.\n",
    "        if \"GenerationMixin\" not in str(cls.prepare_inputs_for_generation):\n",
    "            return True\n",
    "        # Alternatively, a custom `generate` also qualifies.\n",
    "        return \"GenerationMixin\" not in str(cls.generate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca57ed1e-0bbe-48dd-ae0f-f3d8ecd7fd04",
   "metadata": {},
   "outputs": [],
   "source": [
    "def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:\n",
    "    \"\"\"Return LM-head outputs for `input_ids` by running the transformer body, then the LM head.\"\"\"\n",
    "    hidden_states = model.transformer(input_ids=input_ids).last_hidden_state\n",
    "    return model.lm_head(hidden_states)\n",
    "\n",
    "\n",
    "# PyTorch baseline: model and inputs live on the GPU for the duration of the benchmark.\n",
    "model.cuda()\n",
    "model.eval()\n",
    "inputs.to(\"cuda\")\n",
    "with torch.inference_mode():\n",
    "    gpt2_model = GPTWrapper(config=model.config, device=model.device, inference=inference_torch)\n",
    "\n",
    "    # One visible generation as a sanity check.\n",
    "    sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "    print(tokenizer.decode(sample_output[0], skip_special_tokens=False))\n",
    "\n",
    "    # Two warm-up runs before timing.\n",
    "    for _ in range(2):\n",
    "        _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "        torch.cuda.synchronize()\n",
    "\n",
    "    # Ten timed runs; the reported figure is the mean per sequence.\n",
    "    start = time.time()\n",
    "    for _ in range(10):\n",
    "        _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
    "        torch.cuda.synchronize()\n",
    "    elapsed = time.time() - start\n",
    "    print(f\"----\\nPytorch: {elapsed/10:.2f}s/sequence\")\n",
    "\n",
    "# Move the model back to the CPU once the benchmark is done.\n",
    "_ = model.cpu()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0849aae-876e-47bc-b045-14a594170947",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n",
    "\n",
    "\n",
    "def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:\n",
    "    \"\"\"Run the ONNX session through its standard API, going via numpy on the host.\"\"\"\n",
    "    feed = {\"input_ids\": input_ids.detach().cpu().numpy().astype(np.int32)}\n",
    "    # run() returns a list of numpy arrays; stack them into a single array\n",
    "    stacked = np.array(model_onnx.run(None, feed))\n",
    "    # back to a PyTorch tensor, the type expected by the HF decoding algorithm\n",
    "    return torch.from_numpy(stacked).squeeze(dim=0)\n",
    "\n",
    "\n",
    "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cpu\"), inference=inference_onnx_naive)\n",
    "inputs.to(\"cpu\")\n",
    "\n",
    "# One visible generation as a sanity check.\n",
    "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
    "\n",
    "# Two warm-up runs, then ten timed runs averaged per sequence.\n",
    "for _ in range(2):\n",
    "    _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "start = time.time()\n",
    "for _ in range(10):\n",
    "    _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
    "elapsed = time.time() - start\n",
    "print(f\"----\\nONNX Runtime (standard API): {elapsed/10:.2f}s/sequence\")\n",
    "\n",
    "del model_onnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96114897-894b-4997-bc61-8ac0682e0e55",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n",
    "\n",
    "\n",
    "def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:\n",
    "    \"\"\"Run the ONNX session through the IO-binding helper and return the output tensor.\n",
    "\n",
    "    Args:\n",
    "        input_ids: Token ids passed to the session under the name \"input_ids\".\n",
    "\n",
    "    Returns:\n",
    "        The tensor stored under \"logits\" in the helper's result, or under \"output\"\n",
    "        when no \"logits\" entry exists.\n",
    "    \"\"\"\n",
    "    data = {\"input_ids\": input_ids}\n",
    "    outputs = inference_onnx_binding(model_onnx=model_onnx, inputs=data, device=\"cuda\")\n",
    "    # This notebook exports the graph with output_names=[\"logits\"]; \"output\" is kept\n",
    "    # as a fallback for graphs exported under that name.\n",
    "    return outputs[\"logits\"] if \"logits\" in outputs else outputs[\"output\"]\n",
    "\n",
    "\n",
    "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_onnx_optimized)\n",
    "inputs.to(\"cuda\")\n",
    "\n",
    "# One visible generation as a sanity check.\n",
    "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
    "\n",
    "# Two warm-up runs, then ten timed runs averaged per sequence.\n",
    "for _ in range(2):\n",
    "    _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "start = time.time()\n",
    "for _ in range(10):\n",
    "    _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
    "# Wait for pending GPU work before stopping the clock, as the PyTorch baseline does.\n",
    "torch.cuda.synchronize()\n",
    "elapsed = time.time() - start\n",
    "print(f\"----\\nONNX Runtime (binding io API): {elapsed/10:.2f}s/sequence\")\n",
    "del model_onnx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b5b5427-fd6b-4f70-b307-9c579f0f842a",
   "metadata": {},
   "outputs": [],
   "source": [
    "tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(\n",
    "    engine_file_path=\"test-gpt2.plan\", runtime=runtime\n",
    ")\n",
    "\n",
    "\n",
    "def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:\n",
    "    \"\"\"Run the loaded TensorRT engine on `input_ids` and return whatever it produces.\n",
    "\n",
    "    NOTE(review): the result is returned unmodified; confirm that the installed\n",
    "    load_engine yields a tensor here rather than a mapping of output names.\n",
    "    \"\"\"\n",
    "    data = {\"input_ids\": input_ids}\n",
    "    return tensorrt_model(data)\n",
    "\n",
    "\n",
    "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_tensorrt)\n",
    "inputs.to(\"cuda\")\n",
    "\n",
    "# One visible generation as a sanity check.\n",
    "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
    "\n",
    "# Two warm-up runs, then ten timed runs averaged per sequence.\n",
    "for _ in range(2):\n",
    "    _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
    "start = time.time()\n",
    "for _ in range(10):\n",
    "    _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
    "# Wait for pending GPU work before stopping the clock, as the PyTorch baseline does.\n",
    "torch.cuda.synchronize()\n",
    "elapsed = time.time() - start\n",
    "print(f\"----\\nTensorRT + CUDA tensors: {elapsed/10:.2f}s/sequence\")\n",
    "\n",
    "del tensorrt_model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f547239d-4f7a-433b-8ef6-9e5110a61f4b",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Using CUDAExecutionProvider"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e34c682-85fc-4e8d-b13c-7c1c9ea39ead",
   "metadata": {},
   "outputs": [],
   "source": [
    "from optimum.onnxruntime import ORTModelForCausalLM\n",
    "from optimum.pipelines import pipeline\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_id = \"openai-community/gpt2\"\n",
    "\n",
    "# Tokenizer first; the end-of-sequence token is reused as the pad token.\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "# export=True converts the checkpoint to ONNX while loading it.\n",
    "ort_model = ORTModelForCausalLM.from_pretrained(\n",
    "    model_id,\n",
    "    export=True,\n",
    "    provider=\"CUDAExecutionProvider\",\n",
    "    use_io_binding=True,\n",
    ")\n",
    "\n",
    "pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17d28184-26db-4dd3-b24b-0c5a12b10d6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "prompt = \"Both the music and visual were astounding, not to mention the actors performance.\"\n",
    "\n",
    "# Only the pipeline call itself is timed.\n",
    "start_time = time.time()\n",
    "generations = pipe(prompt)\n",
    "finish_time = time.time()\n",
    "\n",
    "# A bare expression in the middle of a cell is never displayed, so print the text explicitly.\n",
    "print(generations[0][\"generated_text\"])\n",
    "\n",
    "print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19c4230a-3244-4dce-b5ef-d9927dec5c45",
   "metadata": {},
   "source": [
    "## ASR LM with CUDAExecutionProvider"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f0f1cdc-bfcd-46c5-80a4-60bc76366cf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n",
    "from datasets import DatasetDict\n",
    "from optimum.onnxruntime import ORTModelForCausalLM\n",
    "from optimum.pipelines import pipeline\n",
    "import torch\n",
    "\n",
    "device = \"cuda:0\"\n",
    "dtype = torch.float16\n",
    "\n",
    "# Dataset saved to disk; a later cell reads its \"audio_tokens\" field.\n",
    "dataset = DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n",
    "\n",
    "model_id = \"./../out/checkpoint-10000\"\n",
    "\n",
    "# export=True converts the checkpoint to ONNX while loading it.\n",
    "ort_model = ORTModelForCausalLM.from_pretrained(\n",
    "    model_id,\n",
    "    export=True,\n",
    "    provider=\"CUDAExecutionProvider\",\n",
    "    use_io_binding=True,\n",
    ")\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
    "\n",
    "pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d32098c-b0ec-4c36-95ac-775a3a865512",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generation stops at the end-of-transcript marker and starts from the start marker.\n",
    "# Each id is the first entry of the tokenizer's encoding of the marker text.\n",
    "transcript_markers = (\n",
    "    (\"eos_token_id\", \"<|endoftranscript|>\"),\n",
    "    (\"bos_token_id\", \"<|startoftranscript|>\"),\n",
    ")\n",
    "for config_field, marker in transcript_markers:\n",
    "    setattr(ort_model.config, config_field, tokenizer.encode(marker)[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fd0a1fb-9349-4c7a-af03-21e29334f420",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the example explicitly so this cell runs on a fresh kernel instead of\n",
    "# relying on names that are only assigned further down the notebook.\n",
    "split = \"train.clean.100\"\n",
    "idx = 0\n",
    "\n",
    "dataset[split][idx].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15d8b989-6460-4555-b6e2-2f9e219d7034",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "split = \"train.clean.100\"\n",
    "idx = 0\n",
    "\n",
    "# Prompt layout: one <|audio:N|> marker per stored audio token, then the transcript start marker.\n",
    "text = \"\".join([f\"<|audio:{tkn}|>\" for tkn in dataset[split][idx][\"audio_tokens\"]]) + \"<|startoftranscript|>\"\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "# skip_special_tokens is not passed: it is not a text-generation pipeline call option,\n",
    "# so it would be forwarded to generate() as an unknown argument.\n",
    "# NOTE(review): return_full_text=False returns only the continuation instead of echoing\n",
    "# the audio-token prompt; drop it if the full text is wanted.\n",
    "generations = pipe(text, max_new_tokens=10, return_full_text=False)\n",
    "\n",
    "finish_time = time.time()\n",
    "\n",
    "print(generations[0][\"generated_text\"])\n",
    "\n",
    "print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}