## Recording voice in Real Time

In [None]:
"""
Sprints:
- [ ] Do Inference optimization of ASR LM
- [ ] Train on train.other.500
- [ ] Generate dataset for prompting

Evaluation Dates: 20th - 21st June, 2023, 3:30 - 5:30pm
Sharpen PPT Skills: 20th June, 3:30pm - 4:45pm
Flow of the PPT:
Demo -> Datasets -> Techniques -> Evaluation -> Q&A
- [ Done ] Update the one pager deck slide
https://sprinklr-my.sharepoint.com/:p:/r/personal/sricharan_narayanam_sprinklr_com/_layouts/15/Doc.aspx?sourcedoc=%7B84811f56-5fc7-4eaa-87d2-db4a3588d18c%7D&action=edit&wdPreviousSession=948ccc35-dc05-f1f9-612d-9a22300e25ba
My PPT:
https://sprinklr-my.sharepoint.com/:p:/p/darshan_makwana/Ec4jCiyMWhxMproH625msc8BClFVceNQ8o4kS3EhZBO9MA?e=YCSDxm&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718703689001&web=1
Intern Tracker:
https://sprinklr.sharepoint.com/:x:/s/AIIntuition/EbRhHPIAIw9MlZ5PpXbztmABde1LFbaSoSHJAo9qU8ggDg?e=xiLkRt&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718692666812&web=1
"""

## ASR LM Inference

In [None]:
from audio_tokenizer import Data2vecFeatureReader
from repcodec.RepCodec import RepCodec
import torch.nn.functional as F
import torch
import yaml

reader = Data2vecFeatureReader("./../prompting/models/vox_pretrained.pt", 18, device="cuda:0", max_chunk=1600000)

config = "./repcodec/configs/repcodec_dim1024.yaml"
with open(config) as fp:
 conf = yaml.load(fp, Loader=yaml.FullLoader)

audio_model = RepCodec(**conf)
audio_model.load_state_dict(torch.load("./../prompting/models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"])
audio_model.quantizer.initial()
audio_model.to("cuda:0")
audio_model.eval()

print("Successfully Loaded Audio Tokenizer")

In [None]:
from datasets import load_dataset

cache_dir = "./../cache"
dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer
import torch
import string

def process(text):

 # Lower case every letter
 text = text.lower()

 # Remove punctuation
 punctuation_to_remove = string.punctuation.replace("'", "")
 translation_table = str.maketrans('', '', punctuation_to_remove)
 text = text.translate(translation_table)

 # Remove whitespaces from front and behind
 while text[0] == ' ' or text[-1] == ' ':
 if text[0] == ' ':
 text = text[1:]
 if text[-1] == ' ':
 text = text[:-1]
 
 return text

device = "cuda:0"
dtype = torch.float16
context_length = 1877

# Load tokenizer and add audio tokens
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
eot_token = tokenizer.encode("<|endoftranscript|>")[0]
pad_token = tokenizer.encode("<|padding|>")[0]

model = GPT2LMHeadModel.from_pretrained("./../out/checkpoint-10000", attn_implementation="flash_attention_2", device_map=device, torch_dtype=dtype).eval()
model.config.pad_token_id = pad_token
model.config.eos_token_id = eot_token
# model = torch.compile(model)

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

In [None]:
from tqdm import tqdm
from math import ceil
import torch
import time

sample = dataset["train.clean.100"][5]

x = sample["audio"]["array"]

start_time = time.time()

with torch.no_grad():
 x = torch.from_numpy(x).float().to(reader.device)
 if reader.task.cfg.normalize:
 x = F.layer_norm(x, x.shape)
 x = x.view(1, -1)

 feat = []
 for start in range(0, x.size(1), reader.max_chunk):
 x_chunk = x[:, start: start + reader.max_chunk]
 res = reader.model.extract_features(
 source=x_chunk,
 padding_mask=None,
 mask=False,
 layer=reader.layer,
 )
 feat_chunk = res["x"]
 feat.append(feat_chunk)
 
 features = torch.cat(feat, 1).permute(0, 2, 1)

 x = audio_model.encoder(features)
 z = audio_model.projector(x)
 _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))
 tokens = idx.cpu().data.numpy().tolist()[0]
 
text = "".join([f"<|audio:{token}|>" for token in tokens]) + "<|startoftranscript|>"
input_ids = tokenizer(text, return_tensors="pt").to(device)["input_ids"]

input_time = time.time()

generations = model.generate(
 input_ids,
 pad_token_id = pad_token,
 eos_token_id = eot_token,
 max_new_tokens = context_length,
 use_cache=True
)

finish_time = time.time()

tokenizer.batch_decode(generations, skip_special_tokens=True)
print("First Token Latency: ", (input_time - start_time) * 1000, "ms")
# print("Throughput: ", (1 + num_tokens)/total_time, "tokens/s")
print("End to End Inference Time: ", (finish_time - start_time) * 1000, "ms")
print("Refer Text: ", process(sample["text"]))
print("Transcript: ", tokenizer.batch_decode(generations, skip_special_tokens=True)[0])

In [None]:
import time

sample = dataset["train.clean.100"][0]

x = sample["audio"]["array"]

start_time = time.time()

with torch.no_grad():
 x = torch.from_numpy(x).float().to(reader.device)
 if reader.task.cfg.normalize:
 x = F.layer_norm(x, x.shape)
 x = x.view(1, -1)

 feat = []
 for start in range(0, x.size(1), reader.max_chunk):
 x_chunk = x[:, start: start + reader.max_chunk]
 res = reader.model.extract_features(
 source=x_chunk,
 padding_mask=None,
 mask=False,
 layer=reader.layer,
 )
 feat_chunk = res["x"]
 feat.append(feat_chunk)
 
 features = torch.cat(feat, 1).permute(0, 2, 1)

 x = audio_model.encoder(features)
 z = audio_model.projector(x)
 _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))
 tokens = idx.cpu().data.numpy().tolist()[0]

from tqdm import tqdm
from math import ceil
import torch

context_length = 1877
eot_token = tokenizer.encode("<|endoftranscript|>")[0]
pad_token = tokenizer.encode("<|padding|>")[0]
 
text = "".join([f"<|audio:{token}|>" for token in tokens]) + "<|startoftranscript|>"
input_ids = tokenizer(text, return_tensors="pt").to(device)["input_ids"]

max_new_tokens = context_length
num_tokens = 0
first_token = True

while max_new_tokens > 0 and input_ids.shape[-1] < context_length:

 with torch.no_grad():
 outputs = model(input_ids = input_ids)

 logits = outputs["logits"][:, -1]

 # Greedy Sampling
 probas = torch.softmax(logits, dim=-1)
 pred_idx = torch.argmax(probas, dim=-1, keepdim=True)
 next_idx = pred_idx.item()

 if first_token:
 first_token_latency = time.time() - start_time
 first_token = False
 start_time = time.time()

 if next_idx == eot_token:
 break

 input_ids = torch.cat((input_ids, pred_idx), dim=-1)

 max_new_tokens -= 1
 num_tokens += 1

total_time = time.time() - start_time

print("First Token Latency: ", first_token_latency * 1000, "ms")
print("Throughput: ", (1 + num_tokens)/total_time, "tokens/s")
print("End to End Inference Time: ", (total_time + first_token_latency) * 1000, "ms")
print(tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0])
print(process(sample["text"]))

In [None]:
tokenizer.batch_decode([[1, 2, 3]])

## Accelerating GPT 2 Inference

In [None]:
"""
- change tensorrt.tensorrt to tensorrt
- remove cpu quantization lines
- output_names ["logits"]
"""

In [None]:
import logging
import time
from typing import Callable, Dict

import numpy as np
import tensorrt as trt
import torch
from tensorrt import ICudaEngine
from tensorrt import Logger, Runtime
from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, AutoModelForCausalLM
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from transformer_deploy.utils.generative_model import GPTModelWrapper
import inspect
from transformers import TensorType

from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx
from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size
from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine

In [None]:
model_name = "gpt2"

model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
inputs = tokenizer("Here is some text to encode Hello World", return_tensors="pt")
print("input tensors")
print(inputs)
print("input tensor shape")
print(inputs["input_ids"].size())

with torch.no_grad():
 outputs = model(**inputs)

logits = outputs.logits
print("output tensor")
print(logits)
print("output shape")
print(logits.shape)

In [None]:
size = 0
for i in range(8, 256, 1):
 # input sequence (input_ids) made of int-32 (4 bytes)
 size += np.prod([1, i]) * 4
 # output tensor made of float-32 (4 bytes)
 size += np.prod([1, i, 50257]) * 4
print(f"total size (input+output): {size / 1024**3:.2f} Gb")

# to manually check actual tensor size:
# np.prod(logits.shape)*32/8/1024**2:.2f}
# or
# sys.getsizeof(logits.storage())/1024**2

In [None]:
input_ids: BatchEncoding = tokenizer(
 "Here is some text to encode Hello World", add_special_tokens=True, return_attention_mask=False, return_tensors="pt"
)
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items(): # type: str, torch.Tensor
 input_ids[k] = v.type(dtype=torch.int32)

convert_to_onnx(
 model_pytorch=model,
 output_path="test-gpt2.onnx",
 inputs_pytorch=dict(input_ids),
 quantization=False,
 var_output_seq=True, # we inform ONNX export tool that the output shape will vary with the input shape
 output_names = ["logits"]
)
# model may switch to train mode for some unknown reasons, we force the eval mode.
_ = model.eval()

In [None]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
num_attention_heads, hidden_size = get_model_size(path=model_name)
optimize_onnx(
 onnx_path="test-gpt2.onnx",
 onnx_optim_model_path="test-gpt2-opt.onnx",
 fp16=False,
 use_cuda=True,
 num_attention_heads=num_attention_heads,
 hidden_size=hidden_size,
 architecture="gpt2",
)

In [None]:
from pathlib import Path

trt_logger: Logger = trt.Logger(trt.Logger.ERROR)
runtime: Runtime = trt.Runtime(trt_logger)
trt_model_name = "test-gpt2.plan"

# create only of does not exist because it's slow to run...

engine: ICudaEngine = build_engine(
 runtime=runtime,
 onnx_file_path="test-gpt2.onnx",
 logger=trt_logger,
 min_shape=(1, 1),
 optimal_shape=(1, 128), # num beam, batch size
 max_shape=(1, 384), # num beam, batch size
 workspace_size=10000 * 1024**2,
 fp16=True,
 int8=False,
)
save_engine(engine, trt_model_name)

In [None]:
engine.num_bindings

In [None]:
print(inspect.getsource(GPTModelWrapper))

In [None]:
inputs = tokenizer(
 "Here is some text to encode Hello World", # Nvidia example prompt
 add_special_tokens=True,
 return_attention_mask=False, # Not used
 return_tensors=TensorType.PYTORCH,
)
inputs

In [None]:
from transformers.generation import GenerationConfig

class GPTWrapper(GPTModelWrapper):
 def __init__(self, *args, **kwargs):
 super().__init__(*args, **kwargs)

 self.generation_config = GenerationConfig.from_model_config(self.config) if self.can_generate() else None

 @classmethod
 def can_generate(cls) -> bool:
 """
 Returns whether this model can generate sequences with `.generate()`.

 Returns:
 `bool`: Whether this model can generate sequences with `.generate()`.
 """
 # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.
 # Alternativelly, the model can also have a custom `generate` function.
 if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
 return False
 return True

In [None]:
def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:
 transformer_outputs: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)
 return model.lm_head(transformer_outputs.last_hidden_state)


model.cuda()
model.eval()
inputs.to("cuda")
with torch.inference_mode():
 gpt2_model = GPTWrapper(config=model.config, device=model.device, inference=inference_torch)
 sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
 print(tokenizer.decode(sample_output[0], skip_special_tokens=False))
 for _ in range(2):
 _ = gpt2_model.generate(inputs.input_ids, max_length=64)
 torch.cuda.synchronize()
 start = time.time()
 for _ in range(10):
 _ = gpt2_model.generate(inputs.input_ids, max_length=256)
 torch.cuda.synchronize()
 print(f"----\nPytorch: {(time.time() - start)/10:.2f}s/sequence")
_ = model.cpu()

In [None]:
model_onnx = create_model_for_provider(path="test-gpt2-opt.onnx", provider_to_use="CUDAExecutionProvider")


def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:
 data = {"input_ids": input_ids.detach().cpu().numpy().astype(np.int32)}
 logit = model_onnx.run(None, data)
 np_logit = np.array(logit) # convert list of numpy arrays to a numpy array
 # we convert numpy tensor to Pytorch tensor as it's the type expected by HF decoding algorithm
 return torch.squeeze(torch.from_numpy(np_logit), dim=0)


gpt2_model = GPTWrapper(config=model.config, device=torch.device("cpu"), inference=inference_onnx_naive)
inputs.to("cpu")
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
for _ in range(2):
 _ = gpt2_model.generate(inputs.input_ids, max_length=64)
start = time.time()
for _ in range(10):
 _ = gpt2_model.generate(inputs.input_ids, max_length=256)
print(f"----\nONNX Runtime (standard API): {(time.time() - start)/10:.2f}s/sequence")

del model_onnx

In [None]:
model_onnx = create_model_for_provider(path="test-gpt2-opt.onnx", provider_to_use="CUDAExecutionProvider")


def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:
 data = {"input_ids": input_ids}
 return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device="cuda")["output"]


gpt2_model = GPTWrapper(config=model.config, device=torch.device("cuda"), inference=inference_onnx_optimized)
inputs.to("cuda")
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
for _ in range(2):
 _ = gpt2_model.generate(inputs.input_ids, max_length=64)
start = time.time()
for _ in range(10):
 _ = gpt2_model.generate(inputs.input_ids, max_length=256)
print(f"----\nONNX Runtime (binding io API): {(time.time() - start)/10:.2f}/sequence")
del model_onnx

In [None]:
tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(
 engine_file_path="test-gpt2.plan", runtime=runtime
)


def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:
 data = {"input_ids": input_ids}
 return tensorrt_model(data)


gpt2_model = GPTWrapper(config=model.config, device=torch.device("cuda"), inference=inference_tensorrt)
inputs.to("cuda")
sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
for _ in range(2):
 _ = gpt2_model.generate(inputs.input_ids, max_length=64)
start = time.time()
for _ in range(10):
 _ = gpt2_model.generate(inputs.input_ids, max_length=256)
print(f"----\nTensorRT + CUDA tensors: {(time.time() - start)/10:.2f}/sequence")

del tensorrt_model

## Using CUDAExecution Provider

In [None]:
from optimum.onnxruntime import ORTModelForCausalLM
from optimum.pipelines import pipeline
from transformers import AutoTokenizer

model_id = "openai-community/gpt2"

ort_model = ORTModelForCausalLM.from_pretrained(
 model_id,
 export=True,
 provider="CUDAExecutionProvider",
 use_io_binding=True
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

pipe = pipeline(task="text-generation", model=ort_model, tokenizer=tokenizer, device="cuda:0")

In [None]:
import time

start_time = time.time()

generations = pipe("Both the music and visual were astounding, not to mention the actors performance.")
generations[0]["generated_text"]

finish_time = time.time()

print("End to End Latency: ", (finish_time - start_time) * 1000, "ms")

## ASR LM with CUDAExcecution Provider

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer
from datasets import DatasetDict
import torch

device = "cuda:0"
dtype = torch.float16

dataset = DatasetDict.load_from_disk("./../librispeech_tokenized.hf")

from optimum.onnxruntime import ORTModelForCausalLM
from optimum.pipelines import pipeline
from transformers import AutoTokenizer

model_id = "./../out/checkpoint-10000"

ort_model = ORTModelForCausalLM.from_pretrained(
 model_id,
 export=True,
 provider="CUDAExecutionProvider",
 use_io_binding=True
)

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

pipe = pipeline(task="text-generation", model=ort_model, tokenizer=tokenizer, device="cuda:0")

In [None]:
ort_model.config.eos_token_id = tokenizer.encode("<|endoftranscript|>")[0]
ort_model.config.bos_token_id = tokenizer.encode("<|startoftranscript|>")[0]

In [None]:
dataset[split][idx].keys()

In [None]:
split = "train.clean.100"
idx = 0

text = "".join([ f"<|audio:{tkn}|>"for tkn in dataset[split][idx]["audio_tokens"]]) + "<|startoftranscript|>"

import time

start_time = time.time()

generations = pipe(text, max_new_tokens=10, skip_special_tokens=True)

finish_time = time.time()

print(generations[0]["generated_text"])

print("End to End Latency: ", (finish_time - start_time) * 1000, "ms")