{ "cells": [ { "cell_type": "markdown", "id": "715a402a-44b9-4fa2-abf0-b0cfd2f3d80b", "metadata": {}, "source": [ "## Recording voice in Real Time" ] }, { "cell_type": "code", "execution_count": null, "id": "dbdf6bab-7418-4a6f-8b75-c31f98a6ada5", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Sprints:\n", "- [ ] Do Inference optimization of ASR LM\n", "- [ ] Train on train.other.500\n", "- [ ] Generate dataset for prompting\n", "\n", "Evaluation Dates: 20th - 21st June, 2023, 3:30 - 5:30pm\n", "Sharpen PPT Skills: 20th June, 3:30pm - 4:45pm\n", "Flow of the PPT:\n", "Demo -> Datasets -> Techniques -> Evaluation -> Q&A\n", "- [ Done ] Update the one pager deck slide\n", "https://sprinklr-my.sharepoint.com/:p:/r/personal/sricharan_narayanam_sprinklr_com/_layouts/15/Doc.aspx?sourcedoc=%7B84811f56-5fc7-4eaa-87d2-db4a3588d18c%7D&action=edit&wdPreviousSession=948ccc35-dc05-f1f9-612d-9a22300e25ba\n", "My PPT:\n", "https://sprinklr-my.sharepoint.com/:p:/p/darshan_makwana/Ec4jCiyMWhxMproH625msc8BClFVceNQ8o4kS3EhZBO9MA?e=YCSDxm&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718703689001&web=1\n", "Intern Tracker:\n", "https://sprinklr.sharepoint.com/:x:/s/AIIntuition/EbRhHPIAIw9MlZ5PpXbztmABde1LFbaSoSHJAo9qU8ggDg?e=xiLkRt&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718692666812&web=1\n", "\"\"\"" ] }, { "cell_type": "markdown", "id": "150aca01-4098-4ab2-809a-25775ec52069", "metadata": {}, "source": [ "## ASR LM Inference" ] }, { "cell_type": "code", "execution_count": null, "id": "804a58af-beb2-48c1-9530-98024e27c0d6", "metadata": {}, "outputs": [], "source": [ "from audio_tokenizer import Data2vecFeatureReader\n", "from repcodec.RepCodec import RepCodec\n", "import torch.nn.functional as F\n", "import torch\n", "import yaml\n", "\n", "reader = Data2vecFeatureReader(\"./../prompting/models/vox_pretrained.pt\", 18, device=\"cuda:0\", max_chunk=1600000)\n", "\n", "config = \"./repcodec/configs/repcodec_dim1024.yaml\"\n", "with open(config) as fp:\n", " conf = yaml.load(fp, Loader=yaml.FullLoader)\n", "\n", "audio_model = RepCodec(**conf)\n", "audio_model.load_state_dict(torch.load(\"./../prompting/models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n", "audio_model.quantizer.initial()\n", "audio_model.to(\"cuda:0\")\n", "audio_model.eval()\n", "\n", "print(\"Successfully Loaded Audio Tokenizer\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7d8da397-2030-4b36-9a42-97862488797b", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "cache_dir = \"./../cache\"\n", "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "bb8016b2-fc9d-4c23-9e85-b6e1c5ca164c", "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. 
Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[2], line 33\u001b[0m\n\u001b[1;32m 30\u001b[0m eot_token \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|endoftranscript|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 31\u001b[0m pad_token \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|padding|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m---> 33\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mGPT2LMHeadModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./../out/checkpoint-10000\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattn_implementation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mflash_attention_2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39meval()\n\u001b[1;32m 34\u001b[0m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mpad_token_id \u001b[38;5;241m=\u001b[39m pad_token\n\u001b[1;32m 35\u001b[0m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39meos_token_id \u001b[38;5;241m=\u001b[39m eot_token\n", "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:3620\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 3617\u001b[0m init_contexts\u001b[38;5;241m.\u001b[39mappend(init_empty_weights())\n\u001b[1;32m 3619\u001b[0m config \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(config) \u001b[38;5;66;03m# We do not want to modify the config inplace in from_pretrained.\u001b[39;00m\n\u001b[0;32m-> 3620\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_autoset_attn_implementation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3621\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\n\u001b[1;32m 3622\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3624\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[1;32m 
3625\u001b[0m \u001b[38;5;66;03m# Let's make sure we don't run the init function of buffer modules\u001b[39;00m\n\u001b[1;32m 3626\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(config, \u001b[38;5;241m*\u001b[39mmodel_args, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs)\n", "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1469\u001b[0m, in \u001b[0;36mPreTrainedModel._autoset_attn_implementation\u001b[0;34m(cls, config, use_flash_attention_2, torch_dtype, device_map, check_device_map)\u001b[0m\n\u001b[1;32m 1466\u001b[0m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m-> 1469\u001b[0m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_and_enable_flash_attn_2\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1470\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1471\u001b[0m \u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1472\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1473\u001b[0m \u001b[43m \u001b[49m\u001b[43mhard_check_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1474\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_device_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_device_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1475\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1476\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msdpa\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available():\n\u001b[1;32m 1477\u001b[0m \u001b[38;5;66;03m# use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif.\u001b[39;00m\n\u001b[1;32m 1478\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_sdpa(\n\u001b[1;32m 1479\u001b[0m config,\n\u001b[1;32m 1480\u001b[0m hard_check_only\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 1481\u001b[0m )\n", "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1571\u001b[0m, in \u001b[0;36mPreTrainedModel._check_and_enable_flash_attn_2\u001b[0;34m(cls, config, torch_dtype, device_map, check_device_map, hard_check_only)\u001b[0m\n\u001b[1;32m 1568\u001b[0m install_message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease refer to the documentation of 
https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1570\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mutil\u001b[38;5;241m.\u001b[39mfind_spec(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1571\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpreface\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m the package flash_attn seems to be not installed. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minstall_message\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1573\u001b[0m flash_attention_version \u001b[38;5;241m=\u001b[39m version\u001b[38;5;241m.\u001b[39mparse(importlib\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mversion(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 1574\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mversion\u001b[38;5;241m.\u001b[39mcuda:\n", "\u001b[0;31mImportError\u001b[0m: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2." ] } ], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n", "import torch\n", "import string\n", "\n", "def process(text):\n", "\n", " # Lower case every letter\n", " text = text.lower()\n", "\n", " # Remove punctuation\n", " punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n", " translation_table = str.maketrans('', '', punctuation_to_remove)\n", " text = text.translate(translation_table)\n", "\n", " # Remove whitespaces from front and behind\n", " while text[0] == ' ' or text[-1] == ' ':\n", " if text[0] == ' ':\n", " text = text[1:]\n", " if text[-1] == ' ':\n", " text = text[:-1]\n", " \n", " return text\n", "\n", "device = \"cuda:0\"\n", "dtype = torch.float16\n", "context_length = 1877\n", "\n", "# Load tokenizer and add audio tokens\n", "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n", "eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n", "pad_token = tokenizer.encode(\"<|padding|>\")[0]\n", "\n", "model = GPT2LMHeadModel.from_pretrained(\"./../out/checkpoint-10000\", attn_implementation=\"flash_attention_2\", device_map=device, torch_dtype=dtype).eval()\n", "model.config.pad_token_id = pad_token\n", "model.config.eos_token_id = eot_token\n", "# model = torch.compile(model)" ] }, { "cell_type": "code", "execution_count": null, "id": "7cabe9dc-bbbf-41b4-918f-3f60ee5582f2", "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm\n", "from math import ceil\n", "import torch\n", "import time\n", "\n", "sample = dataset[\"train.clean.100\"][5]\n", "\n", "x = sample[\"audio\"][\"array\"]\n", "\n", "start_time = time.time()\n", "\n", "with torch.no_grad():\n", " x = torch.from_numpy(x).float().to(reader.device)\n", " if reader.task.cfg.normalize:\n", " x = F.layer_norm(x, x.shape)\n", " x = x.view(1, -1)\n", "\n", " feat = []\n", " for start in range(0, x.size(1), reader.max_chunk):\n", " x_chunk = x[:, start: 
start + reader.max_chunk]\n", " res = reader.model.extract_features(\n", " source=x_chunk,\n", " padding_mask=None,\n", " mask=False,\n", " layer=reader.layer,\n", " )\n", " feat_chunk = res[\"x\"]\n", " feat.append(feat_chunk)\n", " \n", " features = torch.cat(feat, 1).permute(0, 2, 1)\n", "\n", " x = audio_model.encoder(features)\n", " z = audio_model.projector(x)\n", " _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n", " tokens = idx.cpu().data.numpy().tolist()[0]\n", " \n", "text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n", "input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n", "\n", "input_time = time.time()\n", "\n", "generations = model.generate(\n", " input_ids,\n", " pad_token_id = pad_token,\n", " eos_token_id = eot_token,\n", " max_new_tokens = context_length,\n", " use_cache=True\n", ")\n", "\n", "finish_time = time.time()\n", "\n", "tokenizer.batch_decode(generations, skip_special_tokens=True)\n", "print(\"First Token Latency: \", (input_time - start_time) * 1000, \"ms\")\n", "# print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n", "print(\"End to End Inference Time: \", (finish_time - start_time) * 1000, \"ms\")\n", "print(\"Refer Text: \", process(sample[\"text\"]))\n", "print(\"Transcript: \", tokenizer.batch_decode(generations, skip_special_tokens=True)[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "baa8d79b-7cf5-4435-838c-1f3d4e043d60", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "sample = dataset[\"train.clean.100\"][0]\n", "\n", "x = sample[\"audio\"][\"array\"]\n", "\n", "start_time = time.time()\n", "\n", "with torch.no_grad():\n", " x = torch.from_numpy(x).float().to(reader.device)\n", " if reader.task.cfg.normalize:\n", " x = F.layer_norm(x, x.shape)\n", " x = x.view(1, -1)\n", "\n", " feat = []\n", " for start in range(0, x.size(1), reader.max_chunk):\n", " x_chunk = x[:, start: start + reader.max_chunk]\n", " res = reader.model.extract_features(\n", " source=x_chunk,\n", " padding_mask=None,\n", " mask=False,\n", " layer=reader.layer,\n", " )\n", " feat_chunk = res[\"x\"]\n", " feat.append(feat_chunk)\n", " \n", " features = torch.cat(feat, 1).permute(0, 2, 1)\n", "\n", " x = audio_model.encoder(features)\n", " z = audio_model.projector(x)\n", " _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n", " tokens = idx.cpu().data.numpy().tolist()[0]\n", "\n", "from tqdm import tqdm\n", "from math import ceil\n", "import torch\n", "\n", "context_length = 1877\n", "eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n", "pad_token = tokenizer.encode(\"<|padding|>\")[0]\n", " \n", "text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n", "input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n", "\n", "max_new_tokens = context_length\n", "num_tokens = 0\n", "first_token = True\n", "\n", "while max_new_tokens > 0 and input_ids.shape[-1] < context_length:\n", "\n", " with torch.no_grad():\n", " outputs = model(input_ids = input_ids)\n", "\n", " logits = outputs[\"logits\"][:, -1]\n", "\n", " # Greedy Sampling\n", " probas = torch.softmax(logits, dim=-1)\n", " pred_idx = torch.argmax(probas, dim=-1, keepdim=True)\n", " next_idx = pred_idx.item()\n", "\n", " if first_token:\n", " first_token_latency = time.time() - start_time\n", " first_token = False\n", " start_time = time.time()\n", "\n", " if next_idx == eot_token:\n", " break\n", 
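"\n", " # append the greedy token and continue; past_key_values are not reused, so each step re-processes the full sequence\n",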
"\n", " input_ids = torch.cat((input_ids, pred_idx), dim=-1)\n", "\n", " max_new_tokens -= 1\n", " num_tokens += 1\n", "\n", "total_time = time.time() - start_time\n", "\n", "print(\"First Token Latency: \", first_token_latency * 1000, \"ms\")\n", "print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n", "print(\"End to End Inference Time: \", (total_time + first_token_latency) * 1000, \"ms\")\n", "print(tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0])\n", "print(process(sample[\"text\"]))" ] }, { "cell_type": "code", "execution_count": null, "id": "014ed999-3293-4d68-8f9c-017584adc642", "metadata": {}, "outputs": [], "source": [ "tokenizer.batch_decode([[1, 2, 3]])" ] }, { "cell_type": "markdown", "id": "ec11e43f-1eb8-4399-9a93-6f1427782661", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Accelerating GPT 2 Inference" ] }, { "cell_type": "code", "execution_count": null, "id": "5489cb4e-3213-4931-abe1-4c96d1a7ba56", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "- change tensorrt.tensorrt to tensorrt\n", "- remove cpu quantization lines\n", "- output_names [\"logits\"]\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7e7e6ea6-7319-4e57-af33-5d917d26abc6", "metadata": {}, "outputs": [], "source": [ "import logging\n", "import time\n", "from typing import Callable, Dict\n", "\n", "import numpy as np\n", "import tensorrt as trt\n", "import torch\n", "from tensorrt import ICudaEngine\n", "from tensorrt import Logger, Runtime\n", "from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, AutoModelForCausalLM\n", "from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions\n", "from transformer_deploy.utils.generative_model import GPTModelWrapper\n", "import inspect\n", "from transformers import TensorType\n", "\n", "from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx\n", "from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size\n", "from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine" ] }, { "cell_type": "code", "execution_count": null, "id": "21681412-7747-4824-894a-6006eb12a821", "metadata": {}, "outputs": [], "source": [ "model_name = \"gpt2\"\n", "\n", "model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)\n", "model.eval()\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model.config.pad_token_id = tokenizer.eos_token_id" ] }, { "cell_type": "code", "execution_count": null, "id": "46783acd-c404-44b4-904b-d8fb687afc34", "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer(\"Here is some text to encode Hello World\", return_tensors=\"pt\")\n", "print(\"input tensors\")\n", "print(inputs)\n", "print(\"input tensor shape\")\n", "print(inputs[\"input_ids\"].size())\n", "\n", "with torch.no_grad():\n", " outputs = model(**inputs)\n", "\n", "logits = outputs.logits\n", "print(\"output tensor\")\n", "print(logits)\n", "print(\"output shape\")\n", "print(logits.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "2f6cc7bd-5e2d-4d4e-a7e6-73a6b2ecd7af", "metadata": {}, "outputs": [], "source": [ "size = 0\n", "for i in range(8, 256, 1):\n", " # input sequence (input_ids) made of int-32 (4 bytes)\n", " size += np.prod([1, i]) * 4\n", " # output tensor made of float-32 (4 bytes)\n", " size += np.prod([1, i, 50257]) * 4\n", "print(f\"total size (input+output): {size / 1024**3:.2f} Gb\")\n", "\n", 
"# to manually check actual tensor size:\n", "# np.prod(logits.shape)*32/8/1024**2:.2f}\n", "# or\n", "# sys.getsizeof(logits.storage())/1024**2" ] }, { "cell_type": "code", "execution_count": null, "id": "7debb40e-9941-45e4-9db8-4bb021ce44ab", "metadata": {}, "outputs": [], "source": [ "input_ids: BatchEncoding = tokenizer(\n", " \"Here is some text to encode Hello World\", add_special_tokens=True, return_attention_mask=False, return_tensors=\"pt\"\n", ")\n", "# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type\n", "for k, v in input_ids.items(): # type: str, torch.Tensor\n", " input_ids[k] = v.type(dtype=torch.int32)\n", "\n", "convert_to_onnx(\n", " model_pytorch=model,\n", " output_path=\"test-gpt2.onnx\",\n", " inputs_pytorch=dict(input_ids),\n", " quantization=False,\n", " var_output_seq=True, # we inform ONNX export tool that the output shape will vary with the input shape\n", " output_names = [\"logits\"]\n", ")\n", "# model may switch to train mode for some unknown reasons, we force the eval mode.\n", "_ = model.eval()" ] }, { "cell_type": "code", "execution_count": null, "id": "956c3007-2c18-4d92-af4f-6cef474d86b5", "metadata": {}, "outputs": [], "source": [ "logging.basicConfig()\n", "logging.getLogger().setLevel(logging.INFO)\n", "num_attention_heads, hidden_size = get_model_size(path=model_name)\n", "optimize_onnx(\n", " onnx_path=\"test-gpt2.onnx\",\n", " onnx_optim_model_path=\"test-gpt2-opt.onnx\",\n", " fp16=False,\n", " use_cuda=True,\n", " num_attention_heads=num_attention_heads,\n", " hidden_size=hidden_size,\n", " architecture=\"gpt2\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "85f30ed9-2802-46c9-9201-a70e200b6860", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "trt_logger: Logger = trt.Logger(trt.Logger.ERROR)\n", "runtime: Runtime = trt.Runtime(trt_logger)\n", "trt_model_name = \"test-gpt2.plan\"\n", "\n", "# create only of does not exist because it's slow to run...\n", "\n", "engine: ICudaEngine = build_engine(\n", " runtime=runtime,\n", " onnx_file_path=\"test-gpt2.onnx\",\n", " logger=trt_logger,\n", " min_shape=(1, 1),\n", " optimal_shape=(1, 128), # num beam, batch size\n", " max_shape=(1, 384), # num beam, batch size\n", " workspace_size=10000 * 1024**2,\n", " fp16=True,\n", " int8=False,\n", ")\n", "save_engine(engine, trt_model_name)" ] }, { "cell_type": "code", "execution_count": null, "id": "908fe664-800e-4c5f-a1d5-adfd31fd1c64", "metadata": {}, "outputs": [], "source": [ "engine.num_bindings" ] }, { "cell_type": "code", "execution_count": null, "id": "4626926b-fa94-4633-95d5-0d515f8db5f6", "metadata": {}, "outputs": [], "source": [ "print(inspect.getsource(GPTModelWrapper))" ] }, { "cell_type": "code", "execution_count": null, "id": "d5bd1de1-a949-46a3-8d15-457d51db4e40", "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer(\n", " \"Here is some text to encode Hello World\", # Nvidia example prompt\n", " add_special_tokens=True,\n", " return_attention_mask=False, # Not used\n", " return_tensors=TensorType.PYTORCH,\n", ")\n", "inputs" ] }, { "cell_type": "code", "execution_count": null, "id": "815b548f-fa00-4183-b72c-10ecdd4b11c7", "metadata": {}, "outputs": [], "source": [ "from transformers.generation import GenerationConfig\n", "\n", "class GPTWrapper(GPTModelWrapper):\n", " def __init__(self, *args, **kwargs):\n", " super().__init__(*args, **kwargs)\n", "\n", " self.generation_config = GenerationConfig.from_model_config(self.config) 
if self.can_generate() else None\n", "\n", " @classmethod\n", " def can_generate(cls) -> bool:\n", " \"\"\"\n", " Returns whether this model can generate sequences with `.generate()`.\n", "\n", " Returns:\n", " `bool`: Whether this model can generate sequences with `.generate()`.\n", " \"\"\"\n", " # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.\n", " # Alternativelly, the model can also have a custom `generate` function.\n", " if \"GenerationMixin\" in str(cls.prepare_inputs_for_generation) and \"GenerationMixin\" in str(cls.generate):\n", " return False\n", " return True" ] }, { "cell_type": "code", "execution_count": null, "id": "ca57ed1e-0bbe-48dd-ae0f-f3d8ecd7fd04", "metadata": {}, "outputs": [], "source": [ "def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:\n", " transformer_outputs: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)\n", " return model.lm_head(transformer_outputs.last_hidden_state)\n", "\n", "\n", "model.cuda()\n", "model.eval()\n", "inputs.to(\"cuda\")\n", "with torch.inference_mode():\n", " gpt2_model = GPTWrapper(config=model.config, device=model.device, inference=inference_torch)\n", " sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", " print(tokenizer.decode(sample_output[0], skip_special_tokens=False))\n", " for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", " torch.cuda.synchronize()\n", " start = time.time()\n", " for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", " torch.cuda.synchronize()\n", " print(f\"----\\nPytorch: {(time.time() - start)/10:.2f}s/sequence\")\n", "_ = model.cpu()" ] }, { "cell_type": "code", "execution_count": null, "id": "f0849aae-876e-47bc-b045-14a594170947", "metadata": {}, "outputs": [], "source": [ "model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n", "\n", "\n", "def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:\n", " data = {\"input_ids\": input_ids.detach().cpu().numpy().astype(np.int32)}\n", " logit = model_onnx.run(None, data)\n", " np_logit = np.array(logit) # convert list of numpy arrays to a numpy array\n", " # we convert numpy tensor to Pytorch tensor as it's the type expected by HF decoding algorithm\n", " return torch.squeeze(torch.from_numpy(np_logit), dim=0)\n", "\n", "\n", "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cpu\"), inference=inference_onnx_naive)\n", "inputs.to(\"cpu\")\n", "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n", "for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "start = time.time()\n", "for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", "print(f\"----\\nONNX Runtime (standard API): {(time.time() - start)/10:.2f}s/sequence\")\n", "\n", "del model_onnx" ] }, { "cell_type": "code", "execution_count": null, "id": "96114897-894b-4997-bc61-8ac0682e0e55", "metadata": {}, "outputs": [], "source": [ "model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n", "\n", "\n", "def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:\n", " data = {\"input_ids\": input_ids}\n", " return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device=\"cuda\")[\"output\"]\n", 
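"\n", "# unlike inference_onnx_naive above, inference_onnx_binding uses ONNX Runtime IO binding with torch CUDA tensors, so logits stay on the GPU instead of round-tripping through numpy on the host\n",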
"\n", "\n", "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_onnx_optimized)\n", "inputs.to(\"cuda\")\n", "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n", "for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "start = time.time()\n", "for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", "print(f\"----\\nONNX Runtime (binding io API): {(time.time() - start)/10:.2f}/sequence\")\n", "del model_onnx" ] }, { "cell_type": "code", "execution_count": null, "id": "0b5b5427-fd6b-4f70-b307-9c579f0f842a", "metadata": {}, "outputs": [], "source": [ "tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(\n", " engine_file_path=\"test-gpt2.plan\", runtime=runtime\n", ")\n", "\n", "\n", "def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:\n", " data = {\"input_ids\": input_ids}\n", " return tensorrt_model(data)\n", "\n", "\n", "gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_tensorrt)\n", "inputs.to(\"cuda\")\n", "sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n", "for _ in range(2):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n", "start = time.time()\n", "for _ in range(10):\n", " _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n", "print(f\"----\\nTensorRT + CUDA tensors: {(time.time() - start)/10:.2f}/sequence\")\n", "\n", "del tensorrt_model" ] }, { "cell_type": "markdown", "id": "f547239d-4f7a-433b-8ef6-9e5110a61f4b", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Using CUDAExecution Provider" ] }, { "cell_type": "code", "execution_count": null, "id": "6e34c682-85fc-4e8d-b13c-7c1c9ea39ead", "metadata": {}, "outputs": [], "source": [ "from optimum.onnxruntime import ORTModelForCausalLM\n", "from optimum.pipelines import pipeline\n", "from transformers import AutoTokenizer\n", "\n", "model_id = \"openai-community/gpt2\"\n", "\n", "ort_model = ORTModelForCausalLM.from_pretrained(\n", " model_id,\n", " export=True,\n", " provider=\"CUDAExecutionProvider\",\n", " use_io_binding=True\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")" ] }, { "cell_type": "code", "execution_count": null, "id": "17d28184-26db-4dd3-b24b-0c5a12b10d6d", "metadata": {}, "outputs": [], "source": [ "import time\n", "\n", "start_time = time.time()\n", "\n", "generations = pipe(\"Both the music and visual were astounding, not to mention the actors performance.\")\n", "generations[0][\"generated_text\"]\n", "\n", "finish_time = time.time()\n", "\n", "print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")" ] }, { "cell_type": "markdown", "id": "19c4230a-3244-4dce-b5ef-d9927dec5c45", "metadata": {}, "source": [ "## ASR LM with CUDAExcecution Provider" ] }, { "cell_type": "code", "execution_count": null, "id": "0f0f1cdc-bfcd-46c5-80a4-60bc76366cf5", "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n", "from datasets import DatasetDict\n", "import torch\n", "\n", "device = \"cuda:0\"\n", "dtype = torch.float16\n", "\n", "dataset = 
DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n", "\n", "from optimum.onnxruntime import ORTModelForCausalLM\n", "from optimum.pipelines import pipeline\n", "from transformers import AutoTokenizer\n", "\n", "model_id = \"./../out/checkpoint-10000\"\n", "\n", "ort_model = ORTModelForCausalLM.from_pretrained(\n", " model_id,\n", " export=True,\n", " provider=\"CUDAExecutionProvider\",\n", " use_io_binding=True\n", ")\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n", "\n", "pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")" ] }, { "cell_type": "code", "execution_count": null, "id": "9d32098c-b0ec-4c36-95ac-775a3a865512", "metadata": {}, "outputs": [], "source": [ "ort_model.config.eos_token_id = tokenizer.encode(\"<|endoftranscript|>\")[0]\n", "ort_model.config.bos_token_id = tokenizer.encode(\"<|startoftranscript|>\")[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "1fd0a1fb-9349-4c7a-af03-21e29334f420", "metadata": {}, "outputs": [], "source": [ "dataset[split][idx].keys()" ] }, { "cell_type": "code", "execution_count": null, "id": "15d8b989-6460-4555-b6e2-2f9e219d7034", "metadata": {}, "outputs": [], "source": [ "split = \"train.clean.100\"\n", "idx = 0\n", "\n", "text = \"\".join([ f\"<|audio:{tkn}|>\"for tkn in dataset[split][idx][\"audio_tokens\"]]) + \"<|startoftranscript|>\"\n", "\n", "import time\n", "\n", "start_time = time.time()\n", "\n", "generations = pipe(text, max_new_tokens=10, skip_special_tokens=True)\n", "\n", "finish_time = time.time()\n", "\n", "print(generations[0][\"generated_text\"])\n", "\n", "print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }