Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .gitattributes +4 -0
- ASR/.ipynb_checkpoints/audio_tokenizer-checkpoint.py +611 -0
- ASR/.ipynb_checkpoints/demo-checkpoint.ipynb +849 -0
- ASR/.ipynb_checkpoints/demo-checkpoint.py +24 -0
- ASR/.ipynb_checkpoints/tokenizer_training-checkpoint.ipynb +203 -0
- ASR/__pycache__/audio_tokenizer.cpython-38.pyc +0 -0
- ASR/__pycache__/tokenizer.cpython-38.pyc +0 -0
- ASR/audio_tokenizer.py +611 -0
- ASR/demo.ipynb +878 -0
- ASR/demo.py +24 -0
- ASR/repcodec/.ipynb_checkpoints/RepCodec-checkpoint.py +84 -0
- ASR/repcodec/RepCodec.py +84 -0
- ASR/repcodec/__pycache__/RepCodec.cpython-38.pyc +0 -0
- ASR/repcodec/configs/repcodec_dim1024.yaml +18 -0
- ASR/repcodec/configs/repcodec_dim1280.yaml +18 -0
- ASR/repcodec/configs/repcodec_dim768.yaml +18 -0
- ASR/repcodec/layers/__pycache__/conv_layer.cpython-38.pyc +0 -0
- ASR/repcodec/layers/__pycache__/vq_module.cpython-38.pyc +0 -0
- ASR/repcodec/layers/conv_layer.py +95 -0
- ASR/repcodec/layers/vq_module.py +155 -0
- ASR/repcodec/modules/__pycache__/decoder.cpython-38.pyc +0 -0
- ASR/repcodec/modules/__pycache__/encoder.cpython-38.pyc +0 -0
- ASR/repcodec/modules/__pycache__/projector.cpython-38.pyc +0 -0
- ASR/repcodec/modules/__pycache__/quantizer.cpython-38.pyc +0 -0
- ASR/repcodec/modules/__pycache__/residual_unit.cpython-38.pyc +0 -0
- ASR/repcodec/modules/decoder.py +109 -0
- ASR/repcodec/modules/encoder.py +89 -0
- ASR/repcodec/modules/projector.py +32 -0
- ASR/repcodec/modules/quantizer.py +46 -0
- ASR/repcodec/modules/residual_unit.py +39 -0
- ASR/repcodec/tokenize.py +212 -0
- ASR/test-gpt2-opt.onnx +3 -0
- ASR/test-gpt2.onnx +3 -0
- ASR/test-gpt2.plan +3 -0
- ASR/tokenized_librispeech/dataset_dict.json +1 -0
- ASR/tokenized_librispeech/test.clean/data-00000-of-00001.arrow +3 -0
- ASR/tokenized_librispeech/test.clean/dataset_info.json +29 -0
- ASR/tokenized_librispeech/test.clean/state.json +13 -0
- ASR/tokenized_librispeech/test.other/data-00000-of-00001.arrow +3 -0
- ASR/tokenized_librispeech/test.other/dataset_info.json +29 -0
- ASR/tokenized_librispeech/test.other/state.json +13 -0
- ASR/tokenized_librispeech/train.clean.100/data-00000-of-00001.arrow +3 -0
- ASR/tokenized_librispeech/train.clean.100/dataset_info.json +29 -0
- ASR/tokenized_librispeech/train.clean.100/state.json +13 -0
- ASR/tokenized_librispeech/train.clean.360/data-00000-of-00003.arrow +3 -0
- ASR/tokenized_librispeech/train.clean.360/data-00001-of-00003.arrow +3 -0
- ASR/tokenized_librispeech/train.clean.360/data-00002-of-00003.arrow +3 -0
- ASR/tokenized_librispeech/train.clean.360/dataset_info.json +29 -0
- ASR/tokenized_librispeech/train.clean.360/state.json +19 -0
- ASR/tokenized_librispeech/train.other.500/data-00000-of-00004.arrow +3 -0
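A commit like this one is normally produced with the huggingface_hub upload API. The sketch below is illustrative only; the local folder path and repo id are placeholders, not values taken from this commit:

from huggingface_hub import HfApi

api = HfApi()
# Upload every file under a local folder in a single commit; files matched by the
# repo's .gitattributes LFS rules (or known binary types) are stored via Git LFS.
api.upload_folder(
    folder_path="./ASR",                 # hypothetical local folder
    path_in_repo="ASR",                  # keep the same layout inside the repo
    repo_id="your-username/your-repo",   # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)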
.gitattributes
CHANGED
@@ -40,3 +40,7 @@ prompting/train_data/train.other.500.json filter=lfs diff=lfs merge=lfs -text
 prompting/transcripts/train.clean.360.txt filter=lfs diff=lfs merge=lfs -text
 prompting/transcripts/train.other.500.txt filter=lfs diff=lfs merge=lfs -text
 prompting/wandb/run-20240615_114519-wfpe2teb/run-wfpe2teb.wandb filter=lfs diff=lfs merge=lfs -text
+ASR/test-gpt2.plan filter=lfs diff=lfs merge=lfs -text
+ASR/transformer-deploy/docs/infinity/infinity.xcf filter=lfs diff=lfs merge=lfs -text
+ASR/transformer-deploy/resources/img/export_process.png filter=lfs diff=lfs merge=lfs -text
+ASR/transformer-deploy/resources/img/gpt2.png filter=lfs diff=lfs merge=lfs -text
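The four added rules mark the TensorRT engine and the transformer-deploy image assets as Git LFS objects; they are the lines that `git lfs track` writes. A minimal sketch of generating them locally (assumes git-lfs is installed and the repo is already cloned; purely illustrative):

import subprocess

for pattern in [
    "ASR/test-gpt2.plan",
    "ASR/transformer-deploy/docs/infinity/infinity.xcf",
    "ASR/transformer-deploy/resources/img/export_process.png",
    "ASR/transformer-deploy/resources/img/gpt2.png",
]:
    # appends "<pattern> filter=lfs diff=lfs merge=lfs -text" to .gitattributes
    subprocess.run(["git", "lfs", "track", pattern], check=True)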
ASR/.ipynb_checkpoints/audio_tokenizer-checkpoint.py
ADDED
@@ -0,0 +1,611 @@
import logging
import math
from dataclasses import dataclass, field
from typing import Optional

from omegaconf import II

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist

from fairseq.modules import EMAModule, EMAModuleConfig
from fairseq.data.data_utils import compute_mask_indices
from fairseq.models import BaseFairseqModel, register_model
from fairseq.models.wav2vec import (
    ConvFeatureExtractionModel,
    Wav2Vec2Config,
    TransformerEncoder,
)
from fairseq.modules import (
    GradMultiply,
    LayerNorm,
)
from fairseq.utils import index_put


logger = logging.getLogger(__name__)


@dataclass
class Data2VecAudioConfig(Wav2Vec2Config):

    loss_beta: float = field(
        default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}
    )
    loss_scale: Optional[float] = field(
        default=None,
        metadata={
            "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
        },
    )
    average_top_k_layers: int = field(
        default=8, metadata={"help": "how many layers to average"}
    )

    layer_norm_target_layer: bool = False
    instance_norm_target_layer: bool = False
    instance_norm_targets: bool = False
    layer_norm_targets: bool = False
    batch_norm_target_layer: bool = False
    group_norm_target_layer: bool = False

    ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"})
    ema_end_decay: float = field(
        default=0.9999, metadata={"help": "final ema decay rate"}
    )

    # when to finish annealing ema decay rate
    ema_anneal_end_step: int = II("optimization.max_update")

    ema_transformer_only: bool = field(
        default=True,
        metadata={"help": "whether to momentum update only the transformer"},
    )
    ema_layers_only: bool = field(
        default=True,
        metadata={"help": "whether to momentum update only the transformer layers"},
    )

    max_update: int = II("optimization.max_update")

    min_target_var: float = field(
        default=0.1, metadata={"help": "stop training if target var falls below this"}
    )
    min_pred_var: float = field(
        default=0.01,
        metadata={"help": "stop training if prediction var falls below this"},
    )


def get_annealed_rate(start, end, curr_step, total_steps):
    r = end - start
    pct_remaining = 1 - curr_step / total_steps
    return end - r * pct_remaining


@register_model("data2vec_audio", dataclass=Data2VecAudioConfig)
class Data2VecAudioModel(BaseFairseqModel):
    def __init__(self, cfg: Data2VecAudioConfig):
        super().__init__()
        self.cfg = cfg

        feature_enc_layers = eval(cfg.conv_feature_layers)
        self.extractor_embed = feature_enc_layers[-1][0]

        self.ema = None
        self.embed = cfg.encoder_embed_dim

        self.average_top_k_layers = cfg.average_top_k_layers
        self.loss_beta = cfg.loss_beta
        self.loss_scale = cfg.loss_scale

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=feature_enc_layers,
            dropout=0.0,
            mode=cfg.extractor_mode,
            conv_bias=cfg.conv_bias,
        )

        self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim)

        self.mask_prob = cfg.mask_prob
        self.mask_selection = cfg.mask_selection
        self.mask_other = cfg.mask_other
        self.mask_length = cfg.mask_length
        self.no_mask_overlap = cfg.no_mask_overlap
        self.mask_min_space = cfg.mask_min_space

        self.mask_channel_prob = cfg.mask_channel_prob
        self.mask_channel_before = cfg.mask_channel_before
        self.mask_channel_selection = cfg.mask_channel_selection
        self.mask_channel_other = cfg.mask_channel_other
        self.mask_channel_length = cfg.mask_channel_length
        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
        self.mask_channel_min_space = cfg.mask_channel_min_space

        self.dropout_input = nn.Dropout(cfg.dropout_input)
        self.dropout_features = nn.Dropout(cfg.dropout_features)

        self.feature_grad_mult = cfg.feature_grad_mult

        self.mask_emb = nn.Parameter(
            torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
        )

        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.extractor_embed)

        self.final_proj = nn.Linear(self.embed, self.embed)

        self.num_updates = 0

    def make_ema_teacher(self):
        ema_config = EMAModuleConfig(
            ema_decay=self.cfg.ema_decay,
            ema_fp32=True,
        )
        skip_keys = set()
        if self.cfg.ema_layers_only:
            self.cfg.ema_transformer_only = True
            for k, _ in self.encoder.pos_conv.named_parameters():
                skip_keys.add(f"pos_conv.{k}")

        self.ema = EMAModule(
            self.encoder if self.cfg.ema_transformer_only else self,
            ema_config,
            skip_keys=skip_keys,
        )

    def set_num_updates(self, num_updates):
        super().set_num_updates(num_updates)

        if self.ema is None and self.final_proj is not None:
            logger.info(f"making ema teacher")
            self.make_ema_teacher()
        elif self.training and self.ema is not None:
            if self.cfg.ema_decay != self.cfg.ema_end_decay:
                if num_updates >= self.cfg.ema_anneal_end_step:
                    decay = self.cfg.ema_end_decay
                else:
                    decay = get_annealed_rate(
                        self.cfg.ema_decay,
                        self.cfg.ema_end_decay,
                        num_updates,
                        self.cfg.ema_anneal_end_step,
                    )
                self.ema.set_decay(decay)
            if self.ema.get_decay() < 1:
                self.ema.step(self.encoder if self.cfg.ema_transformer_only else self)

        self.num_updates = num_updates

    def state_dict(self, destination=None, prefix="", keep_vars=False):
        state = super().state_dict(destination, prefix, keep_vars)

        if self.ema is not None:
            state[prefix + "_ema"] = self.ema.fp32_params

        return state

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        if self.ema is not None:
            k = prefix + "_ema"
            assert k in state_dict
            self.ema.restore(state_dict[k], True)
            del state_dict[k]
        return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    @classmethod
    def build_model(cls, cfg: Data2VecAudioConfig, task=None):
        """Build a new model instance."""

        return cls(cfg)

    def apply_mask(
        self,
        x,
        padding_mask,
        mask_indices=None,
        mask_channel_indices=None,
    ):
        B, T, C = x.shape

        if self.mask_channel_prob > 0 and self.mask_channel_before:
            mask_channel_indices = compute_mask_indices(
                (B, C),
                None,
                self.mask_channel_prob,
                self.mask_channel_length,
                self.mask_channel_selection,
                self.mask_channel_other,
                no_overlap=self.no_mask_channel_overlap,
                min_space=self.mask_channel_min_space,
            )
            mask_channel_indices = (
                torch.from_numpy(mask_channel_indices)
                .to(x.device)
                .unsqueeze(1)
                .expand(-1, T, -1)
            )
            x[mask_channel_indices] = 0

        if self.mask_prob > 0:
            if mask_indices is None:
                mask_indices = compute_mask_indices(
                    (B, T),
                    padding_mask,
                    self.mask_prob,
                    self.mask_length,
                    self.mask_selection,
                    self.mask_other,
                    min_masks=1,
                    no_overlap=self.no_mask_overlap,
                    min_space=self.mask_min_space,
                    require_same_masks=self.cfg.require_same_masks,
                    mask_dropout=self.cfg.mask_dropout,
                )
                mask_indices = torch.from_numpy(mask_indices).to(x.device)
            x = index_put(x, mask_indices, self.mask_emb)
        else:
            mask_indices = None

        if self.mask_channel_prob > 0 and not self.mask_channel_before:
            if mask_channel_indices is None:
                mask_channel_indices = compute_mask_indices(
                    (B, C),
                    None,
                    self.mask_channel_prob,
                    self.mask_channel_length,
                    self.mask_channel_selection,
                    self.mask_channel_other,
                    no_overlap=self.no_mask_channel_overlap,
                    min_space=self.mask_channel_min_space,
                )
                mask_channel_indices = (
                    torch.from_numpy(mask_channel_indices)
                    .to(x.device)
                    .unsqueeze(1)
                    .expand(-1, T, -1)
                )
            x = index_put(x, mask_channel_indices, 0)

        return x, mask_indices

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            return torch.floor((input_length - kernel_size) / stride + 1)

        conv_cfg_list = eval(self.cfg.conv_feature_layers)

        for i in range(len(conv_cfg_list)):
            input_lengths = _conv_out_length(
                input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]
            )

        return input_lengths.to(torch.long)

    def forward(
        self,
        source,
        padding_mask=None,
        mask=True,
        features_only=False,
        layer=None,
        mask_indices=None,
        mask_channel_indices=None,
        padding_count=None,
    ):
        features = source

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(features)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(features)

        features = features.transpose(1, 2)

        features = self.layer_norm(features)

        orig_padding_mask = padding_mask

        if padding_mask is not None and padding_mask.any():
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = self._get_feat_extract_output_lengths(input_lengths)

            padding_mask = torch.zeros(
                features.shape[:2], dtype=features.dtype, device=features.device
            )

            # these two operations makes sure that all values
            # before the output lengths indices are attended to
            padding_mask[
                (
                    torch.arange(padding_mask.shape[0], device=padding_mask.device),
                    output_lengths - 1,
                )
            ] = 1
            padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
        else:
            padding_mask = None

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        pre_encoder_features = None
        if self.cfg.ema_transformer_only:
            pre_encoder_features = features.clone()

        features = self.dropout_input(features)

        if mask:
            x, mask_indices = self.apply_mask(
                features,
                padding_mask,
                mask_indices=mask_indices,
                mask_channel_indices=mask_channel_indices,
            )
        else:
            x = features
            mask_indices = None

        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
            layer=layer,
        )

        if features_only:
            return {
                "x": x,
                "padding_mask": padding_mask,
                "layer_results": layer_results,
            }

        result = {
            "losses": {},
        }

        with torch.no_grad():
            self.ema.model.eval()

            if self.cfg.ema_transformer_only:
                y, layer_results = self.ema.model.extract_features(
                    pre_encoder_features,
                    padding_mask=padding_mask,
                    min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
                )
                y = {
                    "x": y,
                    "padding_mask": padding_mask,
                    "layer_results": layer_results,
                }
            else:
                y = self.ema.model.extract_features(
                    source=source,
                    padding_mask=orig_padding_mask,
                    mask=False,
                )

            target_layer_results = [l[2] for l in y["layer_results"]]

            permuted = False
            if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
                target_layer_results = [
                    tl.permute(1, 2, 0) for tl in target_layer_results  # TBC -> BCT
                ]
                permuted = True

            if self.cfg.batch_norm_target_layer:
                target_layer_results = [
                    F.batch_norm(
                        tl.float(), running_mean=None, running_var=None, training=True
                    )
                    for tl in target_layer_results
                ]

            if self.cfg.instance_norm_target_layer:
                target_layer_results = [
                    F.instance_norm(tl.float()) for tl in target_layer_results
                ]

            if permuted:
                target_layer_results = [
                    tl.transpose(1, 2) for tl in target_layer_results  # BCT -> BTC
                ]

            if self.cfg.group_norm_target_layer:
                target_layer_results = [
                    F.layer_norm(tl.float(), tl.shape[-2:])
                    for tl in target_layer_results
                ]

            if self.cfg.layer_norm_target_layer:
                target_layer_results = [
                    F.layer_norm(tl.float(), tl.shape[-1:])
                    for tl in target_layer_results
                ]

            y = sum(target_layer_results) / len(target_layer_results)

            if self.cfg.layer_norm_targets:
                y = F.layer_norm(y.float(), y.shape[-1:])

            if self.cfg.instance_norm_targets:
                y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)

            if not permuted:
                y = y.transpose(0, 1)

            y = y[mask_indices]

        x = x[mask_indices]
        x = self.final_proj(x)

        sz = x.size(-1)

        if self.loss_beta == 0:
            loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
        else:
            loss = F.smooth_l1_loss(
                x.float(), y.float(), reduction="none", beta=self.loss_beta
            ).sum(dim=-1)

        if self.loss_scale is not None:
            scale = self.loss_scale
        else:
            scale = 1 / math.sqrt(sz)

        result["losses"]["regression"] = loss.sum() * scale

        if "sample_size" not in result:
            result["sample_size"] = loss.numel()

        with torch.no_grad():
            result["target_var"] = self.compute_var(y)
            result["pred_var"] = self.compute_var(x.float())

        if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
            logger.error(
                f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
            )
            raise Exception(
                f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
            )
        if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
            logger.error(
                f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
            )
            raise Exception(
                f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
            )

        if self.ema is not None:
            result["ema_decay"] = self.ema.get_decay() * 1000

        return result

    @staticmethod
    def compute_var(y):
        y = y.view(-1, y.size(-1))
        if dist.is_initialized():
            zc = torch.tensor(y.size(0)).cuda()
            zs = y.sum(dim=0)
            zss = (y ** 2).sum(dim=0)

            dist.all_reduce(zc)
            dist.all_reduce(zs)
            dist.all_reduce(zss)

            var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
            return torch.sqrt(var + 1e-6).mean()
        else:
            return torch.sqrt(y.var(dim=0) + 1e-6).mean()

    def extract_features(
        self, source, padding_mask, mask=False, layer=None
    ):
        res = self.forward(
            source,
            padding_mask,
            mask=mask,
            features_only=True,
            layer=layer,
        )
        return res

    def remove_pretraining_modules(self, last_layer=None):
        self.final_proj = None
        self.ema = None
        if last_layer is not None:
            self.encoder.layers = nn.ModuleList(
                l for i, l in enumerate(self.encoder.layers) if i <= last_layer
            )


import logging

import torch
import torch.nn.functional as F
from fairseq import tasks
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from fairseq.data.audio.audio_utils import get_features_or_waveform
from omegaconf import OmegaConf

logger = logging.getLogger("dump_feature")


class Data2vecFeatureReader(object):
    def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
        state = load_checkpoint_to_cpu(ckpt_path)
        cfg = state["cfg"]
        # load task
        task = tasks.setup_task(cfg.task, from_checkpoint=True)
        task.load_state_dict(state["task_state"])
        # load model config
        if "layer_type" not in cfg.model:
            # fix a missing key
            model_config = {k: v for k, v in cfg.model.items()}
            model_config["layer_type"] = "transformer"
            model_config = OmegaConf.create(model_config)
        else:
            model_config = cfg.model

        # fix param name in the state
        state["model"]["final_proj.weight"] = state["model"].pop("final_proj.0.weight")
        state["model"]["final_proj.bias"] = state["model"].pop("final_proj.0.bias")
        del state["model"]["_ema"]

        # load model
        model = Data2VecAudioModel.build_model(model_config)
        model.load_state_dict(
            state["model"], strict=True, model_cfg=model_config
        )

        self.device = device
        logger.info(f"device = {self.device}")

        self.model = model.eval().to(self.device)
        self.task = task
        self.layer = layer - 1  # make it 1-based
        self.max_chunk = max_chunk
        logger.info(f"TASK CONFIG:\n{self.task.cfg}")
        logger.info(f" max_chunk = {self.max_chunk}")

    def read_audio(self, path, ref_len=None):
        wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        x = self.read_audio(path, ref_len=ref_len)
        with torch.no_grad():
            x = torch.from_numpy(x).float().to(self.device)
            if self.task.cfg.normalize:
                x = F.layer_norm(x, x.shape)
            x = x.view(1, -1)

            feat = []
            for start in range(0, x.size(1), self.max_chunk):
                x_chunk = x[:, start: start + self.max_chunk]
                res = self.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    layer=self.layer,
                )
                feat_chunk = res["x"]
                feat.append(feat_chunk)
        return torch.cat(feat, 1).squeeze(0)
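The file above bundles the fairseq data2vec_audio model with a Data2vecFeatureReader that extracts layer-18 features in fixed-size chunks; together with RepCodec these features are quantized into discrete audio tokens. A condensed usage sketch, mirroring the demo notebook that follows (the checkpoint and config paths are the ones referenced there; the input wav path is a hypothetical placeholder):

import torch
import yaml

from audio_tokenizer import Data2vecFeatureReader
from repcodec.RepCodec import RepCodec

# data2vec teacher checkpoint plus a RepCodec quantizer trained on its layer-18 features
reader = Data2vecFeatureReader("./../prompting/models/vox_pretrained.pt", 18, device="cuda:0")

with open("./repcodec/configs/repcodec_dim1024.yaml") as fp:
    conf = yaml.load(fp, Loader=yaml.FullLoader)

audio_model = RepCodec(**conf)
audio_model.load_state_dict(
    torch.load("./../prompting/models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"]
)
audio_model.quantizer.initial()
audio_model.to("cuda:0").eval()

with torch.no_grad():
    # (T, 1024) features -> (1, 1024, T), the layout expected by the RepCodec encoder
    feats = reader.get_feats("sample.wav").unsqueeze(0).permute(0, 2, 1)
    x = audio_model.encoder(feats)
    z = audio_model.projector(x)
    _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))
    tokens = idx.cpu().data.numpy().tolist()[0]  # discrete audio token ids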
ASR/.ipynb_checkpoints/demo-checkpoint.ipynb
ADDED
@@ -0,0 +1,849 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "715a402a-44b9-4fa2-abf0-b0cfd2f3d80b",
"metadata": {},
"source": [
"## Recording voice in Real Time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbdf6bab-7418-4a6f-8b75-c31f98a6ada5",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Sprints:\n",
"- [ ] Do Inference optimization of ASR LM\n",
"- [ ] Train on train.other.500\n",
"- [ ] Generate dataset for prompting\n",
"\n",
"Evaluation Dates: 20th - 21st June, 2023, 3:30 - 5:30pm\n",
"Sharpen PPT Skills: 20th June, 3:30pm - 4:45pm\n",
"Flow of the PPT:\n",
"Demo -> Datasets -> Techniques -> Evaluation -> Q&A\n",
"- [ Done ] Update the one pager deck slide\n",
"https://sprinklr-my.sharepoint.com/:p:/r/personal/sricharan_narayanam_sprinklr_com/_layouts/15/Doc.aspx?sourcedoc=%7B84811f56-5fc7-4eaa-87d2-db4a3588d18c%7D&action=edit&wdPreviousSession=948ccc35-dc05-f1f9-612d-9a22300e25ba\n",
"My PPT:\n",
"https://sprinklr-my.sharepoint.com/:p:/p/darshan_makwana/Ec4jCiyMWhxMproH625msc8BClFVceNQ8o4kS3EhZBO9MA?e=YCSDxm&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718703689001&web=1\n",
"Intern Tracker:\n",
"https://sprinklr.sharepoint.com/:x:/s/AIIntuition/EbRhHPIAIw9MlZ5PpXbztmABde1LFbaSoSHJAo9qU8ggDg?e=xiLkRt&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718692666812&web=1\n",
"\"\"\""
]
},
{
"cell_type": "markdown",
"id": "150aca01-4098-4ab2-809a-25775ec52069",
"metadata": {},
"source": [
"## ASR LM Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "804a58af-beb2-48c1-9530-98024e27c0d6",
"metadata": {},
"outputs": [],
"source": [
"from audio_tokenizer import Data2vecFeatureReader\n",
"from repcodec.RepCodec import RepCodec\n",
"import torch.nn.functional as F\n",
"import torch\n",
"import yaml\n",
"\n",
"reader = Data2vecFeatureReader(\"./../prompting/models/vox_pretrained.pt\", 18, device=\"cuda:0\", max_chunk=1600000)\n",
"\n",
"config = \"./repcodec/configs/repcodec_dim1024.yaml\"\n",
"with open(config) as fp:\n",
"    conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
"\n",
"audio_model = RepCodec(**conf)\n",
"audio_model.load_state_dict(torch.load(\"./../prompting/models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n",
"audio_model.quantizer.initial()\n",
"audio_model.to(\"cuda:0\")\n",
"audio_model.eval()\n",
"\n",
"print(\"Successfully Loaded Audio Tokenizer\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d8da397-2030-4b36-9a42-97862488797b",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"cache_dir = \"./../cache\"\n",
"dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bb8016b2-fc9d-4c23-9e85-b6e1c5ca164c",
"metadata": {},
"outputs": [
{
"ename": "ImportError",
"evalue": "FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 33\u001b[0m\n\u001b[1;32m 30\u001b[0m eot_token \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|endoftranscript|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 31\u001b[0m pad_token \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|padding|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m---> 33\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mGPT2LMHeadModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./../out/checkpoint-10000\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattn_implementation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mflash_attention_2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39meval()\n\u001b[1;32m 34\u001b[0m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mpad_token_id \u001b[38;5;241m=\u001b[39m pad_token\n\u001b[1;32m 35\u001b[0m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39meos_token_id \u001b[38;5;241m=\u001b[39m eot_token\n",
"File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:3620\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 3617\u001b[0m init_contexts\u001b[38;5;241m.\u001b[39mappend(init_empty_weights())\n\u001b[1;32m 3619\u001b[0m config \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(config) \u001b[38;5;66;03m# We do not want to modify the config inplace in from_pretrained.\u001b[39;00m\n\u001b[0;32m-> 3620\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_autoset_attn_implementation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3621\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\n\u001b[1;32m 3622\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3624\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[1;32m 3625\u001b[0m \u001b[38;5;66;03m# Let's make sure we don't run the init function of buffer modules\u001b[39;00m\n\u001b[1;32m 3626\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(config, \u001b[38;5;241m*\u001b[39mmodel_args, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs)\n",
"File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1469\u001b[0m, in \u001b[0;36mPreTrainedModel._autoset_attn_implementation\u001b[0;34m(cls, config, use_flash_attention_2, torch_dtype, device_map, check_device_map)\u001b[0m\n\u001b[1;32m 1466\u001b[0m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m-> 1469\u001b[0m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_and_enable_flash_attn_2\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1470\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1471\u001b[0m \u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1472\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1473\u001b[0m \u001b[43m \u001b[49m\u001b[43mhard_check_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1474\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_device_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_device_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1475\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1476\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msdpa\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available():\n\u001b[1;32m 1477\u001b[0m \u001b[38;5;66;03m# use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif.\u001b[39;00m\n\u001b[1;32m 1478\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_sdpa(\n\u001b[1;32m 1479\u001b[0m config,\n\u001b[1;32m 1480\u001b[0m hard_check_only\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 1481\u001b[0m )\n",
"File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1571\u001b[0m, in \u001b[0;36mPreTrainedModel._check_and_enable_flash_attn_2\u001b[0;34m(cls, config, torch_dtype, device_map, check_device_map, hard_check_only)\u001b[0m\n\u001b[1;32m 1568\u001b[0m install_message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1570\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mutil\u001b[38;5;241m.\u001b[39mfind_spec(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1571\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpreface\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m the package flash_attn seems to be not installed. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minstall_message\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1573\u001b[0m flash_attention_version \u001b[38;5;241m=\u001b[39m version\u001b[38;5;241m.\u001b[39mparse(importlib\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mversion(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 1574\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mversion\u001b[38;5;241m.\u001b[39mcuda:\n",
"\u001b[0;31mImportError\u001b[0m: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2."
|
104 |
+
]
|
105 |
+
}
|
106 |
+
],
|
107 |
+
"source": [
|
108 |
+
"from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n",
|
109 |
+
"import torch\n",
|
110 |
+
"import string\n",
|
111 |
+
"\n",
|
112 |
+
"def process(text):\n",
|
113 |
+
"\n",
|
114 |
+
" # Lower case every letter\n",
|
115 |
+
" text = text.lower()\n",
|
116 |
+
"\n",
|
117 |
+
" # Remove punctuation\n",
|
118 |
+
" punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n",
|
119 |
+
" translation_table = str.maketrans('', '', punctuation_to_remove)\n",
|
120 |
+
" text = text.translate(translation_table)\n",
|
121 |
+
"\n",
|
122 |
+
" # Remove whitespaces from front and behind\n",
|
123 |
+
" while text[0] == ' ' or text[-1] == ' ':\n",
|
124 |
+
" if text[0] == ' ':\n",
|
125 |
+
" text = text[1:]\n",
|
126 |
+
" if text[-1] == ' ':\n",
|
127 |
+
" text = text[:-1]\n",
|
128 |
+
" \n",
|
129 |
+
" return text\n",
|
130 |
+
"\n",
|
131 |
+
"device = \"cuda:0\"\n",
|
132 |
+
"dtype = torch.float16\n",
|
133 |
+
"context_length = 1877\n",
|
134 |
+
"\n",
|
135 |
+
"# Load tokenizer and add audio tokens\n",
|
136 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
|
137 |
+
"eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
|
138 |
+
"pad_token = tokenizer.encode(\"<|padding|>\")[0]\n",
|
139 |
+
"\n",
|
140 |
+
"model = GPT2LMHeadModel.from_pretrained(\"./../out/checkpoint-10000\", attn_implementation=\"flash_attention_2\", device_map=device, torch_dtype=dtype).eval()\n",
|
141 |
+
"model.config.pad_token_id = pad_token\n",
|
142 |
+
"model.config.eos_token_id = eot_token\n",
|
143 |
+
"# model = torch.compile(model)"
|
144 |
+
]
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"cell_type": "code",
|
148 |
+
"execution_count": null,
|
149 |
+
"id": "7cabe9dc-bbbf-41b4-918f-3f60ee5582f2",
|
150 |
+
"metadata": {},
|
151 |
+
"outputs": [],
|
152 |
+
"source": [
|
153 |
+
"from tqdm import tqdm\n",
|
154 |
+
"from math import ceil\n",
|
155 |
+
"import torch\n",
|
156 |
+
"import time\n",
|
157 |
+
"\n",
|
158 |
+
"sample = dataset[\"train.clean.100\"][5]\n",
|
159 |
+
"\n",
|
160 |
+
"x = sample[\"audio\"][\"array\"]\n",
|
161 |
+
"\n",
|
162 |
+
"start_time = time.time()\n",
|
163 |
+
"\n",
|
164 |
+
"with torch.no_grad():\n",
|
165 |
+
" x = torch.from_numpy(x).float().to(reader.device)\n",
|
166 |
+
" if reader.task.cfg.normalize:\n",
|
167 |
+
" x = F.layer_norm(x, x.shape)\n",
|
168 |
+
" x = x.view(1, -1)\n",
|
169 |
+
"\n",
|
170 |
+
" feat = []\n",
|
171 |
+
" for start in range(0, x.size(1), reader.max_chunk):\n",
|
172 |
+
" x_chunk = x[:, start: start + reader.max_chunk]\n",
|
173 |
+
" res = reader.model.extract_features(\n",
|
174 |
+
" source=x_chunk,\n",
|
175 |
+
" padding_mask=None,\n",
|
176 |
+
" mask=False,\n",
|
177 |
+
" layer=reader.layer,\n",
|
178 |
+
" )\n",
|
179 |
+
" feat_chunk = res[\"x\"]\n",
|
180 |
+
" feat.append(feat_chunk)\n",
|
181 |
+
" \n",
|
182 |
+
" features = torch.cat(feat, 1).permute(0, 2, 1)\n",
|
183 |
+
"\n",
|
184 |
+
" x = audio_model.encoder(features)\n",
|
185 |
+
" z = audio_model.projector(x)\n",
|
186 |
+
" _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
|
187 |
+
" tokens = idx.cpu().data.numpy().tolist()[0]\n",
|
188 |
+
" \n",
|
189 |
+
"text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n",
|
190 |
+
"input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n",
|
191 |
+
"\n",
|
192 |
+
"input_time = time.time()\n",
|
193 |
+
"\n",
|
194 |
+
"generations = model.generate(\n",
|
195 |
+
" input_ids,\n",
|
196 |
+
" pad_token_id = pad_token,\n",
|
197 |
+
" eos_token_id = eot_token,\n",
|
198 |
+
" max_new_tokens = context_length,\n",
|
199 |
+
" use_cache=True\n",
|
200 |
+
")\n",
|
201 |
+
"\n",
|
202 |
+
"finish_time = time.time()\n",
|
203 |
+
"\n",
|
204 |
+
"tokenizer.batch_decode(generations, skip_special_tokens=True)\n",
|
205 |
+
"print(\"First Token Latency: \", (input_time - start_time) * 1000, \"ms\")\n",
|
206 |
+
"# print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n",
|
207 |
+
"print(\"End to End Inference Time: \", (finish_time - start_time) * 1000, \"ms\")\n",
|
208 |
+
"print(\"Refer Text: \", process(sample[\"text\"]))\n",
|
209 |
+
"print(\"Transcript: \", tokenizer.batch_decode(generations, skip_special_tokens=True)[0])"
|
210 |
+
]
|
211 |
+
},
|
212 |
+
{
|
213 |
+
"cell_type": "code",
|
214 |
+
"execution_count": null,
|
215 |
+
"id": "baa8d79b-7cf5-4435-838c-1f3d4e043d60",
|
216 |
+
"metadata": {},
|
217 |
+
"outputs": [],
|
218 |
+
"source": [
|
219 |
+
"import time\n",
|
220 |
+
"\n",
|
221 |
+
"sample = dataset[\"train.clean.100\"][0]\n",
|
222 |
+
"\n",
|
223 |
+
"x = sample[\"audio\"][\"array\"]\n",
|
224 |
+
"\n",
|
225 |
+
"start_time = time.time()\n",
|
226 |
+
"\n",
|
227 |
+
"with torch.no_grad():\n",
|
228 |
+
" x = torch.from_numpy(x).float().to(reader.device)\n",
|
229 |
+
" if reader.task.cfg.normalize:\n",
|
230 |
+
" x = F.layer_norm(x, x.shape)\n",
|
231 |
+
" x = x.view(1, -1)\n",
|
232 |
+
"\n",
|
233 |
+
" feat = []\n",
|
234 |
+
" for start in range(0, x.size(1), reader.max_chunk):\n",
|
235 |
+
" x_chunk = x[:, start: start + reader.max_chunk]\n",
|
236 |
+
" res = reader.model.extract_features(\n",
|
237 |
+
" source=x_chunk,\n",
|
238 |
+
" padding_mask=None,\n",
|
239 |
+
" mask=False,\n",
|
240 |
+
" layer=reader.layer,\n",
|
241 |
+
" )\n",
|
242 |
+
" feat_chunk = res[\"x\"]\n",
|
243 |
+
" feat.append(feat_chunk)\n",
|
244 |
+
" \n",
|
245 |
+
" features = torch.cat(feat, 1).permute(0, 2, 1)\n",
|
246 |
+
"\n",
|
247 |
+
" x = audio_model.encoder(features)\n",
|
248 |
+
" z = audio_model.projector(x)\n",
|
249 |
+
" _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
|
250 |
+
" tokens = idx.cpu().data.numpy().tolist()[0]\n",
|
251 |
+
"\n",
|
252 |
+
"from tqdm import tqdm\n",
|
253 |
+
"from math import ceil\n",
|
254 |
+
"import torch\n",
|
255 |
+
"\n",
|
256 |
+
"context_length = 1877\n",
|
257 |
+
"eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
|
258 |
+
"pad_token = tokenizer.encode(\"<|padding|>\")[0]\n",
|
259 |
+
" \n",
|
260 |
+
"text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n",
|
261 |
+
"input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n",
|
262 |
+
"\n",
|
263 |
+
"max_new_tokens = context_length\n",
|
264 |
+
"num_tokens = 0\n",
|
265 |
+
"first_token = True\n",
|
266 |
+
"\n",
|
267 |
+
"while max_new_tokens > 0 and input_ids.shape[-1] < context_length:\n",
|
268 |
+
"\n",
|
269 |
+
" with torch.no_grad():\n",
|
270 |
+
" outputs = model(input_ids = input_ids)\n",
|
271 |
+
"\n",
|
272 |
+
" logits = outputs[\"logits\"][:, -1]\n",
|
273 |
+
"\n",
|
274 |
+
" # Greedy Sampling\n",
|
275 |
+
" probas = torch.softmax(logits, dim=-1)\n",
|
276 |
+
" pred_idx = torch.argmax(probas, dim=-1, keepdim=True)\n",
|
277 |
+
" next_idx = pred_idx.item()\n",
|
278 |
+
"\n",
|
279 |
+
" if first_token:\n",
|
280 |
+
" first_token_latency = time.time() - start_time\n",
|
281 |
+
" first_token = False\n",
|
282 |
+
" start_time = time.time()\n",
|
283 |
+
"\n",
|
284 |
+
" if next_idx == eot_token:\n",
|
285 |
+
" break\n",
|
286 |
+
"\n",
|
287 |
+
" input_ids = torch.cat((input_ids, pred_idx), dim=-1)\n",
|
288 |
+
"\n",
|
289 |
+
" max_new_tokens -= 1\n",
|
290 |
+
" num_tokens += 1\n",
|
291 |
+
"\n",
|
292 |
+
"total_time = time.time() - start_time\n",
|
293 |
+
"\n",
|
294 |
+
"print(\"First Token Latency: \", first_token_latency * 1000, \"ms\")\n",
|
295 |
+
"print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n",
|
296 |
+
"print(\"End to End Inference Time: \", (total_time + first_token_latency) * 1000, \"ms\")\n",
|
297 |
+
"print(tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0])\n",
|
298 |
+
"print(process(sample[\"text\"]))"
|
299 |
+
]
|
300 |
+
},
|
301 |
+
{
|
302 |
+
"cell_type": "code",
|
303 |
+
"execution_count": null,
|
304 |
+
"id": "014ed999-3293-4d68-8f9c-017584adc642",
|
305 |
+
"metadata": {},
|
306 |
+
"outputs": [],
|
307 |
+
"source": [
|
308 |
+
"tokenizer.batch_decode([[1, 2, 3]])"
|
309 |
+
]
|
310 |
+
},
|
311 |
+
{
|
312 |
+
"cell_type": "markdown",
|
313 |
+
"id": "ec11e43f-1eb8-4399-9a93-6f1427782661",
|
314 |
+
"metadata": {
|
315 |
+
"jp-MarkdownHeadingCollapsed": true
|
316 |
+
},
|
317 |
+
"source": [
|
318 |
+
"## Accelerating GPT 2 Inference"
|
319 |
+
]
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"cell_type": "code",
|
323 |
+
"execution_count": null,
|
324 |
+
"id": "5489cb4e-3213-4931-abe1-4c96d1a7ba56",
|
325 |
+
"metadata": {},
|
326 |
+
"outputs": [],
|
327 |
+
"source": [
|
328 |
+
"\"\"\"\n",
|
329 |
+
"- change tensorrt.tensorrt to tensorrt\n",
|
330 |
+
"- remove cpu quantization lines\n",
|
331 |
+
"- output_names [\"logits\"]\n",
|
332 |
+
"\"\"\""
|
333 |
+
]
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"cell_type": "code",
|
337 |
+
"execution_count": null,
|
338 |
+
"id": "7e7e6ea6-7319-4e57-af33-5d917d26abc6",
|
339 |
+
"metadata": {},
|
340 |
+
"outputs": [],
|
341 |
+
"source": [
|
342 |
+
"import logging\n",
|
343 |
+
"import time\n",
|
344 |
+
"from typing import Callable, Dict\n",
|
345 |
+
"\n",
|
346 |
+
"import numpy as np\n",
|
347 |
+
"import tensorrt as trt\n",
|
348 |
+
"import torch\n",
|
349 |
+
"from tensorrt import ICudaEngine\n",
|
350 |
+
"from tensorrt import Logger, Runtime\n",
|
351 |
+
"from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, AutoModelForCausalLM\n",
|
352 |
+
"from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions\n",
|
353 |
+
"from transformer_deploy.utils.generative_model import GPTModelWrapper\n",
|
354 |
+
"import inspect\n",
|
355 |
+
"from transformers import TensorType\n",
|
356 |
+
"\n",
|
357 |
+
"from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx\n",
|
358 |
+
"from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size\n",
|
359 |
+
"from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine"
|
360 |
+
]
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"cell_type": "code",
|
364 |
+
"execution_count": null,
|
365 |
+
"id": "21681412-7747-4824-894a-6006eb12a821",
|
366 |
+
"metadata": {},
|
367 |
+
"outputs": [],
|
368 |
+
"source": [
|
369 |
+
"model_name = \"gpt2\"\n",
|
370 |
+
"\n",
|
371 |
+
"model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)\n",
|
372 |
+
"model.eval()\n",
|
373 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
374 |
+
"model.config.pad_token_id = tokenizer.eos_token_id"
|
375 |
+
]
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"cell_type": "code",
|
379 |
+
"execution_count": null,
|
380 |
+
"id": "46783acd-c404-44b4-904b-d8fb687afc34",
|
381 |
+
"metadata": {},
|
382 |
+
"outputs": [],
|
383 |
+
"source": [
|
384 |
+
"inputs = tokenizer(\"Here is some text to encode Hello World\", return_tensors=\"pt\")\n",
|
385 |
+
"print(\"input tensors\")\n",
|
386 |
+
"print(inputs)\n",
|
387 |
+
"print(\"input tensor shape\")\n",
|
388 |
+
"print(inputs[\"input_ids\"].size())\n",
|
389 |
+
"\n",
|
390 |
+
"with torch.no_grad():\n",
|
391 |
+
" outputs = model(**inputs)\n",
|
392 |
+
"\n",
|
393 |
+
"logits = outputs.logits\n",
|
394 |
+
"print(\"output tensor\")\n",
|
395 |
+
"print(logits)\n",
|
396 |
+
"print(\"output shape\")\n",
|
397 |
+
"print(logits.shape)"
|
398 |
+
]
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"cell_type": "code",
|
402 |
+
"execution_count": null,
|
403 |
+
"id": "2f6cc7bd-5e2d-4d4e-a7e6-73a6b2ecd7af",
|
404 |
+
"metadata": {},
|
405 |
+
"outputs": [],
|
406 |
+
"source": [
|
407 |
+
"size = 0\n",
|
408 |
+
"for i in range(8, 256, 1):\n",
|
409 |
+
" # input sequence (input_ids) made of int-32 (4 bytes)\n",
|
410 |
+
" size += np.prod([1, i]) * 4\n",
|
411 |
+
" # output tensor made of float-32 (4 bytes)\n",
|
412 |
+
" size += np.prod([1, i, 50257]) * 4\n",
|
413 |
+
"print(f\"total size (input+output): {size / 1024**3:.2f} Gb\")\n",
|
414 |
+
"\n",
|
415 |
+
"# to manually check actual tensor size:\n",
|
416 |
+
"# np.prod(logits.shape)*32/8/1024**2:.2f}\n",
|
417 |
+
"# or\n",
|
418 |
+
"# sys.getsizeof(logits.storage())/1024**2"
|
419 |
+
]
|
420 |
+
},
|
421 |
+
{
|
422 |
+
"cell_type": "code",
|
423 |
+
"execution_count": null,
|
424 |
+
"id": "7debb40e-9941-45e4-9db8-4bb021ce44ab",
|
425 |
+
"metadata": {},
|
426 |
+
"outputs": [],
|
427 |
+
"source": [
|
428 |
+
"input_ids: BatchEncoding = tokenizer(\n",
|
429 |
+
" \"Here is some text to encode Hello World\", add_special_tokens=True, return_attention_mask=False, return_tensors=\"pt\"\n",
|
430 |
+
")\n",
|
431 |
+
"# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type\n",
|
432 |
+
"for k, v in input_ids.items(): # type: str, torch.Tensor\n",
|
433 |
+
" input_ids[k] = v.type(dtype=torch.int32)\n",
|
434 |
+
"\n",
|
435 |
+
"convert_to_onnx(\n",
|
436 |
+
" model_pytorch=model,\n",
|
437 |
+
" output_path=\"test-gpt2.onnx\",\n",
|
438 |
+
" inputs_pytorch=dict(input_ids),\n",
|
439 |
+
" quantization=False,\n",
|
440 |
+
" var_output_seq=True, # we inform ONNX export tool that the output shape will vary with the input shape\n",
|
441 |
+
" output_names = [\"logits\"]\n",
|
442 |
+
")\n",
|
443 |
+
"# model may switch to train mode for some unknown reasons, we force the eval mode.\n",
|
444 |
+
"_ = model.eval()"
|
445 |
+
]
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": null,
|
450 |
+
"id": "956c3007-2c18-4d92-af4f-6cef474d86b5",
|
451 |
+
"metadata": {},
|
452 |
+
"outputs": [],
|
453 |
+
"source": [
|
454 |
+
"logging.basicConfig()\n",
|
455 |
+
"logging.getLogger().setLevel(logging.INFO)\n",
|
456 |
+
"num_attention_heads, hidden_size = get_model_size(path=model_name)\n",
|
457 |
+
"optimize_onnx(\n",
|
458 |
+
" onnx_path=\"test-gpt2.onnx\",\n",
|
459 |
+
" onnx_optim_model_path=\"test-gpt2-opt.onnx\",\n",
|
460 |
+
" fp16=False,\n",
|
461 |
+
" use_cuda=True,\n",
|
462 |
+
" num_attention_heads=num_attention_heads,\n",
|
463 |
+
" hidden_size=hidden_size,\n",
|
464 |
+
" architecture=\"gpt2\",\n",
|
465 |
+
")"
|
466 |
+
]
|
467 |
+
},
|
468 |
+
{
|
469 |
+
"cell_type": "code",
|
470 |
+
"execution_count": null,
|
471 |
+
"id": "85f30ed9-2802-46c9-9201-a70e200b6860",
|
472 |
+
"metadata": {},
|
473 |
+
"outputs": [],
|
474 |
+
"source": [
|
475 |
+
"from pathlib import Path\n",
|
476 |
+
"\n",
|
477 |
+
"trt_logger: Logger = trt.Logger(trt.Logger.ERROR)\n",
|
478 |
+
"runtime: Runtime = trt.Runtime(trt_logger)\n",
|
479 |
+
"trt_model_name = \"test-gpt2.plan\"\n",
|
480 |
+
"\n",
|
481 |
+
"# create only of does not exist because it's slow to run...\n",
|
482 |
+
"\n",
|
483 |
+
"engine: ICudaEngine = build_engine(\n",
|
484 |
+
" runtime=runtime,\n",
|
485 |
+
" onnx_file_path=\"test-gpt2.onnx\",\n",
|
486 |
+
" logger=trt_logger,\n",
|
487 |
+
" min_shape=(1, 1),\n",
|
488 |
+
" optimal_shape=(1, 128), # num beam, batch size\n",
|
489 |
+
" max_shape=(1, 384), # num beam, batch size\n",
|
490 |
+
" workspace_size=10000 * 1024**2,\n",
|
491 |
+
" fp16=True,\n",
|
492 |
+
" int8=False,\n",
|
493 |
+
")\n",
|
494 |
+
"save_engine(engine, trt_model_name)"
|
495 |
+
]
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"cell_type": "code",
|
499 |
+
"execution_count": null,
|
500 |
+
"id": "908fe664-800e-4c5f-a1d5-adfd31fd1c64",
|
501 |
+
"metadata": {},
|
502 |
+
"outputs": [],
|
503 |
+
"source": [
|
504 |
+
"engine.num_bindings"
|
505 |
+
]
|
506 |
+
},
|
507 |
+
{
|
508 |
+
"cell_type": "code",
|
509 |
+
"execution_count": null,
|
510 |
+
"id": "4626926b-fa94-4633-95d5-0d515f8db5f6",
|
511 |
+
"metadata": {},
|
512 |
+
"outputs": [],
|
513 |
+
"source": [
|
514 |
+
"print(inspect.getsource(GPTModelWrapper))"
|
515 |
+
]
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"cell_type": "code",
|
519 |
+
"execution_count": null,
|
520 |
+
"id": "d5bd1de1-a949-46a3-8d15-457d51db4e40",
|
521 |
+
"metadata": {},
|
522 |
+
"outputs": [],
|
523 |
+
"source": [
|
524 |
+
"inputs = tokenizer(\n",
|
525 |
+
" \"Here is some text to encode Hello World\", # Nvidia example prompt\n",
|
526 |
+
" add_special_tokens=True,\n",
|
527 |
+
" return_attention_mask=False, # Not used\n",
|
528 |
+
" return_tensors=TensorType.PYTORCH,\n",
|
529 |
+
")\n",
|
530 |
+
"inputs"
|
531 |
+
]
|
532 |
+
},
|
533 |
+
{
|
534 |
+
"cell_type": "code",
|
535 |
+
"execution_count": null,
|
536 |
+
"id": "815b548f-fa00-4183-b72c-10ecdd4b11c7",
|
537 |
+
"metadata": {},
|
538 |
+
"outputs": [],
|
539 |
+
"source": [
|
540 |
+
"from transformers.generation import GenerationConfig\n",
|
541 |
+
"\n",
|
542 |
+
"class GPTWrapper(GPTModelWrapper):\n",
|
543 |
+
" def __init__(self, *args, **kwargs):\n",
|
544 |
+
" super().__init__(*args, **kwargs)\n",
|
545 |
+
"\n",
|
546 |
+
" self.generation_config = GenerationConfig.from_model_config(self.config) if self.can_generate() else None\n",
|
547 |
+
"\n",
|
548 |
+
" @classmethod\n",
|
549 |
+
" def can_generate(cls) -> bool:\n",
|
550 |
+
" \"\"\"\n",
|
551 |
+
" Returns whether this model can generate sequences with `.generate()`.\n",
|
552 |
+
"\n",
|
553 |
+
" Returns:\n",
|
554 |
+
" `bool`: Whether this model can generate sequences with `.generate()`.\n",
|
555 |
+
" \"\"\"\n",
|
556 |
+
" # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.\n",
|
557 |
+
" # Alternativelly, the model can also have a custom `generate` function.\n",
|
558 |
+
" if \"GenerationMixin\" in str(cls.prepare_inputs_for_generation) and \"GenerationMixin\" in str(cls.generate):\n",
|
559 |
+
" return False\n",
|
560 |
+
" return True"
|
561 |
+
]
|
562 |
+
},
|
563 |
+
{
|
564 |
+
"cell_type": "code",
|
565 |
+
"execution_count": null,
|
566 |
+
"id": "ca57ed1e-0bbe-48dd-ae0f-f3d8ecd7fd04",
|
567 |
+
"metadata": {},
|
568 |
+
"outputs": [],
|
569 |
+
"source": [
|
570 |
+
"def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
571 |
+
" transformer_outputs: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)\n",
|
572 |
+
" return model.lm_head(transformer_outputs.last_hidden_state)\n",
|
573 |
+
"\n",
|
574 |
+
"\n",
|
575 |
+
"model.cuda()\n",
|
576 |
+
"model.eval()\n",
|
577 |
+
"inputs.to(\"cuda\")\n",
|
578 |
+
"with torch.inference_mode():\n",
|
579 |
+
" gpt2_model = GPTWrapper(config=model.config, device=model.device, inference=inference_torch)\n",
|
580 |
+
" sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
581 |
+
" print(tokenizer.decode(sample_output[0], skip_special_tokens=False))\n",
|
582 |
+
" for _ in range(2):\n",
|
583 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
584 |
+
" torch.cuda.synchronize()\n",
|
585 |
+
" start = time.time()\n",
|
586 |
+
" for _ in range(10):\n",
|
587 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
588 |
+
" torch.cuda.synchronize()\n",
|
589 |
+
" print(f\"----\\nPytorch: {(time.time() - start)/10:.2f}s/sequence\")\n",
|
590 |
+
"_ = model.cpu()"
|
591 |
+
]
|
592 |
+
},
|
593 |
+
{
|
594 |
+
"cell_type": "code",
|
595 |
+
"execution_count": null,
|
596 |
+
"id": "f0849aae-876e-47bc-b045-14a594170947",
|
597 |
+
"metadata": {},
|
598 |
+
"outputs": [],
|
599 |
+
"source": [
|
600 |
+
"model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n",
|
601 |
+
"\n",
|
602 |
+
"\n",
|
603 |
+
"def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
604 |
+
" data = {\"input_ids\": input_ids.detach().cpu().numpy().astype(np.int32)}\n",
|
605 |
+
" logit = model_onnx.run(None, data)\n",
|
606 |
+
" np_logit = np.array(logit) # convert list of numpy arrays to a numpy array\n",
|
607 |
+
" # we convert numpy tensor to Pytorch tensor as it's the type expected by HF decoding algorithm\n",
|
608 |
+
" return torch.squeeze(torch.from_numpy(np_logit), dim=0)\n",
|
609 |
+
"\n",
|
610 |
+
"\n",
|
611 |
+
"gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cpu\"), inference=inference_onnx_naive)\n",
|
612 |
+
"inputs.to(\"cpu\")\n",
|
613 |
+
"sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
614 |
+
"print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
|
615 |
+
"for _ in range(2):\n",
|
616 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
617 |
+
"start = time.time()\n",
|
618 |
+
"for _ in range(10):\n",
|
619 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
620 |
+
"print(f\"----\\nONNX Runtime (standard API): {(time.time() - start)/10:.2f}s/sequence\")\n",
|
621 |
+
"\n",
|
622 |
+
"del model_onnx"
|
623 |
+
]
|
624 |
+
},
|
625 |
+
{
|
626 |
+
"cell_type": "code",
|
627 |
+
"execution_count": null,
|
628 |
+
"id": "96114897-894b-4997-bc61-8ac0682e0e55",
|
629 |
+
"metadata": {},
|
630 |
+
"outputs": [],
|
631 |
+
"source": [
|
632 |
+
"model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n",
|
633 |
+
"\n",
|
634 |
+
"\n",
|
635 |
+
"def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
636 |
+
" data = {\"input_ids\": input_ids}\n",
|
637 |
+
" return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device=\"cuda\")[\"output\"]\n",
|
638 |
+
"\n",
|
639 |
+
"\n",
|
640 |
+
"gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_onnx_optimized)\n",
|
641 |
+
"inputs.to(\"cuda\")\n",
|
642 |
+
"sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
643 |
+
"print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
|
644 |
+
"for _ in range(2):\n",
|
645 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
646 |
+
"start = time.time()\n",
|
647 |
+
"for _ in range(10):\n",
|
648 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
649 |
+
"print(f\"----\\nONNX Runtime (binding io API): {(time.time() - start)/10:.2f}/sequence\")\n",
|
650 |
+
"del model_onnx"
|
651 |
+
]
|
652 |
+
},
|
653 |
+
{
|
654 |
+
"cell_type": "code",
|
655 |
+
"execution_count": null,
|
656 |
+
"id": "0b5b5427-fd6b-4f70-b307-9c579f0f842a",
|
657 |
+
"metadata": {},
|
658 |
+
"outputs": [],
|
659 |
+
"source": [
|
660 |
+
"tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(\n",
|
661 |
+
" engine_file_path=\"test-gpt2.plan\", runtime=runtime\n",
|
662 |
+
")\n",
|
663 |
+
"\n",
|
664 |
+
"\n",
|
665 |
+
"def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
666 |
+
" data = {\"input_ids\": input_ids}\n",
|
667 |
+
" return tensorrt_model(data)\n",
|
668 |
+
"\n",
|
669 |
+
"\n",
|
670 |
+
"gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_tensorrt)\n",
|
671 |
+
"inputs.to(\"cuda\")\n",
|
672 |
+
"sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
673 |
+
"print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
|
674 |
+
"for _ in range(2):\n",
|
675 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
676 |
+
"start = time.time()\n",
|
677 |
+
"for _ in range(10):\n",
|
678 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
679 |
+
"print(f\"----\\nTensorRT + CUDA tensors: {(time.time() - start)/10:.2f}/sequence\")\n",
|
680 |
+
"\n",
|
681 |
+
"del tensorrt_model"
|
682 |
+
]
|
683 |
+
},
|
684 |
+
{
|
685 |
+
"cell_type": "markdown",
|
686 |
+
"id": "f547239d-4f7a-433b-8ef6-9e5110a61f4b",
|
687 |
+
"metadata": {
|
688 |
+
"jp-MarkdownHeadingCollapsed": true
|
689 |
+
},
|
690 |
+
"source": [
|
691 |
+
"## Using CUDAExecution Provider"
|
692 |
+
]
|
693 |
+
},
|
694 |
+
{
|
695 |
+
"cell_type": "code",
|
696 |
+
"execution_count": null,
|
697 |
+
"id": "6e34c682-85fc-4e8d-b13c-7c1c9ea39ead",
|
698 |
+
"metadata": {},
|
699 |
+
"outputs": [],
|
700 |
+
"source": [
|
701 |
+
"from optimum.onnxruntime import ORTModelForCausalLM\n",
|
702 |
+
"from optimum.pipelines import pipeline\n",
|
703 |
+
"from transformers import AutoTokenizer\n",
|
704 |
+
"\n",
|
705 |
+
"model_id = \"openai-community/gpt2\"\n",
|
706 |
+
"\n",
|
707 |
+
"ort_model = ORTModelForCausalLM.from_pretrained(\n",
|
708 |
+
" model_id,\n",
|
709 |
+
" export=True,\n",
|
710 |
+
" provider=\"CUDAExecutionProvider\",\n",
|
711 |
+
" use_io_binding=True\n",
|
712 |
+
")\n",
|
713 |
+
"\n",
|
714 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
715 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
716 |
+
"\n",
|
717 |
+
"pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")"
|
718 |
+
]
|
719 |
+
},
|
720 |
+
{
|
721 |
+
"cell_type": "code",
|
722 |
+
"execution_count": null,
|
723 |
+
"id": "17d28184-26db-4dd3-b24b-0c5a12b10d6d",
|
724 |
+
"metadata": {},
|
725 |
+
"outputs": [],
|
726 |
+
"source": [
|
727 |
+
"import time\n",
|
728 |
+
"\n",
|
729 |
+
"start_time = time.time()\n",
|
730 |
+
"\n",
|
731 |
+
"generations = pipe(\"Both the music and visual were astounding, not to mention the actors performance.\")\n",
|
732 |
+
"generations[0][\"generated_text\"]\n",
|
733 |
+
"\n",
|
734 |
+
"finish_time = time.time()\n",
|
735 |
+
"\n",
|
736 |
+
"print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")"
|
737 |
+
]
|
738 |
+
},
|
739 |
+
{
|
740 |
+
"cell_type": "markdown",
|
741 |
+
"id": "19c4230a-3244-4dce-b5ef-d9927dec5c45",
|
742 |
+
"metadata": {},
|
743 |
+
"source": [
|
744 |
+
"## ASR LM with CUDAExcecution Provider"
|
745 |
+
]
|
746 |
+
},
|
747 |
+
{
|
748 |
+
"cell_type": "code",
|
749 |
+
"execution_count": null,
|
750 |
+
"id": "0f0f1cdc-bfcd-46c5-80a4-60bc76366cf5",
|
751 |
+
"metadata": {},
|
752 |
+
"outputs": [],
|
753 |
+
"source": [
|
754 |
+
"from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n",
|
755 |
+
"from datasets import DatasetDict\n",
|
756 |
+
"import torch\n",
|
757 |
+
"\n",
|
758 |
+
"device = \"cuda:0\"\n",
|
759 |
+
"dtype = torch.float16\n",
|
760 |
+
"\n",
|
761 |
+
"dataset = DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n",
|
762 |
+
"\n",
|
763 |
+
"from optimum.onnxruntime import ORTModelForCausalLM\n",
|
764 |
+
"from optimum.pipelines import pipeline\n",
|
765 |
+
"from transformers import AutoTokenizer\n",
|
766 |
+
"\n",
|
767 |
+
"model_id = \"./../out/checkpoint-10000\"\n",
|
768 |
+
"\n",
|
769 |
+
"ort_model = ORTModelForCausalLM.from_pretrained(\n",
|
770 |
+
" model_id,\n",
|
771 |
+
" export=True,\n",
|
772 |
+
" provider=\"CUDAExecutionProvider\",\n",
|
773 |
+
" use_io_binding=True\n",
|
774 |
+
")\n",
|
775 |
+
"\n",
|
776 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
|
777 |
+
"\n",
|
778 |
+
"pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")"
|
779 |
+
]
|
780 |
+
},
|
781 |
+
{
|
782 |
+
"cell_type": "code",
|
783 |
+
"execution_count": null,
|
784 |
+
"id": "9d32098c-b0ec-4c36-95ac-775a3a865512",
|
785 |
+
"metadata": {},
|
786 |
+
"outputs": [],
|
787 |
+
"source": [
|
788 |
+
"ort_model.config.eos_token_id = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
|
789 |
+
"ort_model.config.bos_token_id = tokenizer.encode(\"<|startoftranscript|>\")[0]"
|
790 |
+
]
|
791 |
+
},
|
792 |
+
{
|
793 |
+
"cell_type": "code",
|
794 |
+
"execution_count": null,
|
795 |
+
"id": "1fd0a1fb-9349-4c7a-af03-21e29334f420",
|
796 |
+
"metadata": {},
|
797 |
+
"outputs": [],
|
798 |
+
"source": [
|
799 |
+
"dataset[split][idx].keys()"
|
800 |
+
]
|
801 |
+
},
|
802 |
+
{
|
803 |
+
"cell_type": "code",
|
804 |
+
"execution_count": null,
|
805 |
+
"id": "15d8b989-6460-4555-b6e2-2f9e219d7034",
|
806 |
+
"metadata": {},
|
807 |
+
"outputs": [],
|
808 |
+
"source": [
|
809 |
+
"split = \"train.clean.100\"\n",
|
810 |
+
"idx = 0\n",
|
811 |
+
"\n",
|
812 |
+
"text = \"\".join([ f\"<|audio:{tkn}|>\"for tkn in dataset[split][idx][\"audio_tokens\"]]) + \"<|startoftranscript|>\"\n",
|
813 |
+
"\n",
|
814 |
+
"import time\n",
|
815 |
+
"\n",
|
816 |
+
"start_time = time.time()\n",
|
817 |
+
"\n",
|
818 |
+
"generations = pipe(text, max_new_tokens=10, skip_special_tokens=True)\n",
|
819 |
+
"\n",
|
820 |
+
"finish_time = time.time()\n",
|
821 |
+
"\n",
|
822 |
+
"print(generations[0][\"generated_text\"])\n",
|
823 |
+
"\n",
|
824 |
+
"print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")"
|
825 |
+
]
|
826 |
+
}
|
827 |
+
],
|
828 |
+
"metadata": {
|
829 |
+
"kernelspec": {
|
830 |
+
"display_name": "Python 3 (ipykernel)",
|
831 |
+
"language": "python",
|
832 |
+
"name": "python3"
|
833 |
+
},
|
834 |
+
"language_info": {
|
835 |
+
"codemirror_mode": {
|
836 |
+
"name": "ipython",
|
837 |
+
"version": 3
|
838 |
+
},
|
839 |
+
"file_extension": ".py",
|
840 |
+
"mimetype": "text/x-python",
|
841 |
+
"name": "python",
|
842 |
+
"nbconvert_exporter": "python",
|
843 |
+
"pygments_lexer": "ipython3",
|
844 |
+
"version": "3.8.10"
|
845 |
+
}
|
846 |
+
},
|
847 |
+
"nbformat": 4,
|
848 |
+
"nbformat_minor": 5
|
849 |
+
}
|
ASR/.ipynb_checkpoints/demo-checkpoint.py
ADDED
@@ -0,0 +1,24 @@
1 |
+
from flask import Flask, request
|
2 |
+
# import speech_recognition as sr
|
3 |
+
|
4 |
+
app = Flask(__name__)
|
5 |
+
# recognizer = sr.Recognizer()
|
6 |
+
|
7 |
+
@app.route("/darshan/microphone", methods=['POST'])
|
8 |
+
def handle_audio():
|
9 |
+
audio_data = request.data
|
10 |
+
print(audio_data)
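# a possible next step (a sketch, not part of the original handler; assumes 16-bit PCM as in the
# commented-out sr.AudioData call below):
# import numpy as np
# pcm = np.frombuffer(audio_data, dtype=np.int16)
# waveform = pcm.astype(np.float32) / 32768.0  # scale samples to [-1, 1) for an audio tokenizer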
|
11 |
+
# audio = sr.AudioData(audio_data, sample_rate=44100, sample_width=2) # Adjust sample rate and sample width as needed
|
12 |
+
# try:
|
13 |
+
# text = recognizer.recognize_google(audio)
|
14 |
+
# print(f"Transcription: {text}")
|
15 |
+
# return {'transcription': text}, 200
|
16 |
+
# except sr.UnknownValueError:
|
17 |
+
# print("Could not understand audio")
|
18 |
+
# return '', 400
|
19 |
+
# except sr.RequestError as e:
|
20 |
+
# print(f"Error from Google Speech Recognition service; {e}")
|
21 |
+
# return '', 500
|
22 |
+
|
23 |
+
if __name__ == '__main__':
|
24 |
+
app.run(host='0.0.0.0', port=8723) # Replace with your desired host and port
|
ASR/.ipynb_checkpoints/tokenizer_training-checkpoint.ipynb
ADDED
@@ -0,0 +1,203 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "8f95f1d6-be90-4900-9116-c27b82bd7836",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"from tokenizers import SentencePieceBPETokenizer\n",
|
11 |
+
"import transformers\n",
|
12 |
+
"from transformers import GPT2Tokenizer, AutoModelForCausalLM\n",
|
13 |
+
"from datasets import Dataset, DatasetDict\n",
|
14 |
+
"\n",
|
15 |
+
"cache_dir = \"./cache\"\n",
|
16 |
+
"\n",
|
17 |
+
"dataset = DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n",
|
18 |
+
"\n",
|
19 |
+
"text = []\n",
|
20 |
+
"for split in dataset.keys():\n",
|
21 |
+
" text += list(dataset[split][\"text\"])\n",
|
22 |
+
"\n",
|
23 |
+
"model_max_length = 1877\n",
|
24 |
+
"special_tokens = [ f\"<|audio:{idx}|>\" for idx in range(1024)] + [\"<|startoftranscript|>\", \"<|endoftranscript|>\", \"<|padding|>\"]\n",
|
25 |
+
"\n",
|
26 |
+
"bpe_tokenizer = SentencePieceBPETokenizer()\n",
|
27 |
+
"bpe_tokenizer.train_from_iterator(\n",
|
28 |
+
" text,\n",
|
29 |
+
" vocab_size = 5000 + len(special_tokens),\n",
|
30 |
+
" min_frequency = 2,\n",
|
31 |
+
" show_progress = True,\n",
|
32 |
+
" special_tokens = special_tokens\n",
|
33 |
+
")\n",
|
34 |
+
"\n",
|
35 |
+
"tokenizer = transformers.PreTrainedTokenizerFast(\n",
|
36 |
+
" tokenizer_object = bpe_tokenizer,\n",
|
37 |
+
" model_max_length = model_max_length,\n",
|
38 |
+
" special_tokens = special_tokens\n",
|
39 |
+
")\n",
|
40 |
+
"\n",
|
41 |
+
"tokenizer.pad_token = \"<|padding|>\"\n",
|
42 |
+
"tokenizer.pad_token_id = bpe_tokenizer.token_to_id(\"<|padding|>\")\n",
|
43 |
+
"\n",
|
44 |
+
"tokenizer.save_pretrained(\"./tokenizer\")"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
+
"id": "d259b76d-1c8d-4c74-9d04-d711a4b3f395",
|
51 |
+
"metadata": {},
|
52 |
+
"outputs": [],
|
53 |
+
"source": [
|
54 |
+
"from transformers import GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer\n",
|
55 |
+
"from datasets import Dataset, DatasetDict\n",
|
56 |
+
"\n",
|
57 |
+
"max_length = 1877\n",
|
58 |
+
"\n",
|
59 |
+
"dataset = DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n",
|
60 |
+
"\n",
|
61 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
|
62 |
+
"\n",
|
63 |
+
"def tokenize(row):\n",
|
64 |
+
" text = \"\".join([f\"<|audio:{token}|>\" for token in row[\"audio_tokens\"]]) + \"<|startoftranscript|>\" + row[\"text\"] + \"<|endoftranscript|>\"\n",
|
65 |
+
" input_ids = tokenizer(\n",
|
66 |
+
" text,\n",
|
67 |
+
" padding=\"max_length\",\n",
|
68 |
+
" max_length=max_length,\n",
|
69 |
+
" )\n",
|
70 |
+
" return input_ids\n",
|
71 |
+
"\n",
|
72 |
+
"dataset = dataset.map(tokenize, remove_columns=[\"text\", \"audio_tokens\"])\n",
|
73 |
+
"\n",
|
74 |
+
"dataset.save_to_disk(\"tokenized_librispeech\")"
|
75 |
+
]
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"cell_type": "code",
|
79 |
+
"execution_count": 1,
|
80 |
+
"id": "9bc4d1db-1390-4ba1-b296-d52a8993c87f",
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [
|
83 |
+
{
|
84 |
+
"name": "stderr",
|
85 |
+
"output_type": "stream",
|
86 |
+
"text": [
|
87 |
+
"/usr/local/lib/python3.8/dist-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by mode='default'.\n",
|
88 |
+
" table = cls._concat_blocks(blocks, axis=0)\n"
|
89 |
+
]
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"source": [
|
93 |
+
"from transformers import GPT2Tokenizer, AutoModelForCausalLM, AutoTokenizer\n",
|
94 |
+
"from datasets import Dataset, DatasetDict\n",
|
95 |
+
"\n",
|
96 |
+
"max_length = 1877\n",
|
97 |
+
"\n",
|
98 |
+
"dataset = DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n",
|
99 |
+
"\n",
|
100 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"cell_type": "code",
|
105 |
+
"execution_count": 2,
|
106 |
+
"id": "a83d83c7-2e56-4b9e-9f63-11f7eea22d6d",
|
107 |
+
"metadata": {},
|
108 |
+
"outputs": [
|
109 |
+
{
|
110 |
+
"data": {
|
111 |
+
"text/plain": [
|
112 |
+
"[1024]"
|
113 |
+
]
|
114 |
+
},
|
115 |
+
"execution_count": 2,
|
116 |
+
"metadata": {},
|
117 |
+
"output_type": "execute_result"
|
118 |
+
}
|
119 |
+
],
|
120 |
+
"source": [
|
121 |
+
"tokenizer.encode(\"<|startoftranscript|>\")"
|
122 |
+
]
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"cell_type": "code",
|
126 |
+
"execution_count": null,
|
127 |
+
"id": "5b9f0fa9-384b-4453-bf4c-0d49f0c1e4a5",
|
128 |
+
"metadata": {},
|
129 |
+
"outputs": [],
|
130 |
+
"source": [
|
131 |
+
"tokenizer.pad_token_id"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": null,
|
137 |
+
"id": "1964689d-687d-4ab9-967d-80d9eb95b159",
|
138 |
+
"metadata": {},
|
139 |
+
"outputs": [],
|
140 |
+
"source": [
|
141 |
+
"from tqdm import tqdm\n",
|
142 |
+
"lens = []\n",
|
143 |
+
"\n",
|
144 |
+
"for split in dataset.keys():\n",
|
145 |
+
" for idx in tqdm(range(len(dataset[split]))):\n",
|
146 |
+
" sample = dataset[split][idx]\n",
|
147 |
+
" max_len = len(tokenizer.encode(sample[\"text\"])) + len(sample[\"audio_tokens\"])\n",
|
148 |
+
" lens.append(max_len)"
|
149 |
+
]
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"cell_type": "code",
|
153 |
+
"execution_count": null,
|
154 |
+
"id": "e49c1a27-c3e4-43eb-a4e0-68a0d739f27b",
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [],
|
157 |
+
"source": [
|
158 |
+
"max(lens)"
|
159 |
+
]
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"cell_type": "code",
|
163 |
+
"execution_count": null,
|
164 |
+
"id": "96dbd94d-5455-49bd-8893-4aae5dfd9b7f",
|
165 |
+
"metadata": {},
|
166 |
+
"outputs": [],
|
167 |
+
"source": [
|
168 |
+
"min(lens)"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"cell_type": "code",
|
173 |
+
"execution_count": null,
|
174 |
+
"id": "b9aaca12-286d-416c-a2f7-0cdf689eeb2e",
|
175 |
+
"metadata": {},
|
176 |
+
"outputs": [],
|
177 |
+
"source": [
|
178 |
+
"tokenizer.encode(\"<|audio:0|>\")"
|
179 |
+
]
|
180 |
+
}
|
181 |
+
],
|
182 |
+
"metadata": {
|
183 |
+
"kernelspec": {
|
184 |
+
"display_name": "Python 3 (ipykernel)",
|
185 |
+
"language": "python",
|
186 |
+
"name": "python3"
|
187 |
+
},
|
188 |
+
"language_info": {
|
189 |
+
"codemirror_mode": {
|
190 |
+
"name": "ipython",
|
191 |
+
"version": 3
|
192 |
+
},
|
193 |
+
"file_extension": ".py",
|
194 |
+
"mimetype": "text/x-python",
|
195 |
+
"name": "python",
|
196 |
+
"nbconvert_exporter": "python",
|
197 |
+
"pygments_lexer": "ipython3",
|
198 |
+
"version": "3.8.10"
|
199 |
+
}
|
200 |
+
},
|
201 |
+
"nbformat": 4,
|
202 |
+
"nbformat_minor": 5
|
203 |
+
}
|
ASR/__pycache__/audio_tokenizer.cpython-38.pyc
ADDED
Binary file (14.8 kB). View file
|
|
ASR/__pycache__/tokenizer.cpython-38.pyc
ADDED
Binary file (14.8 kB). View file
|
|
ASR/audio_tokenizer.py
ADDED
@@ -0,0 +1,611 @@
1 |
+
import logging
|
2 |
+
import math
|
3 |
+
from dataclasses import dataclass, field
|
4 |
+
from typing import Optional
|
5 |
+
|
6 |
+
from omegaconf import II
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
import torch.distributed as dist
|
12 |
+
|
13 |
+
from fairseq.modules import EMAModule, EMAModuleConfig
|
14 |
+
from fairseq.data.data_utils import compute_mask_indices
|
15 |
+
from fairseq.models import BaseFairseqModel, register_model
|
16 |
+
from fairseq.models.wav2vec import (
|
17 |
+
ConvFeatureExtractionModel,
|
18 |
+
Wav2Vec2Config,
|
19 |
+
TransformerEncoder,
|
20 |
+
)
|
21 |
+
from fairseq.modules import (
|
22 |
+
GradMultiply,
|
23 |
+
LayerNorm,
|
24 |
+
)
|
25 |
+
from fairseq.utils import index_put
|
26 |
+
|
27 |
+
|
28 |
+
logger = logging.getLogger(__name__)
|
29 |
+
|
30 |
+
|
31 |
+
@dataclass
|
32 |
+
class Data2VecAudioConfig(Wav2Vec2Config):
|
33 |
+
|
34 |
+
loss_beta: float = field(
|
35 |
+
default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}
|
36 |
+
)
|
37 |
+
loss_scale: Optional[float] = field(
|
38 |
+
default=None,
|
39 |
+
metadata={
|
40 |
+
"help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
|
41 |
+
},
|
42 |
+
)
|
43 |
+
average_top_k_layers: int = field(
|
44 |
+
default=8, metadata={"help": "how many layers to average"}
|
45 |
+
)
|
46 |
+
|
47 |
+
layer_norm_target_layer: bool = False
|
48 |
+
instance_norm_target_layer: bool = False
|
49 |
+
instance_norm_targets: bool = False
|
50 |
+
layer_norm_targets: bool = False
|
51 |
+
batch_norm_target_layer: bool = False
|
52 |
+
group_norm_target_layer: bool = False
|
53 |
+
|
54 |
+
ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"})
|
55 |
+
ema_end_decay: float = field(
|
56 |
+
default=0.9999, metadata={"help": "final ema decay rate"}
|
57 |
+
)
|
58 |
+
|
59 |
+
# when to finish annealing ema decay rate
|
60 |
+
ema_anneal_end_step: int = II("optimization.max_update")
|
61 |
+
|
62 |
+
ema_transformer_only: bool = field(
|
63 |
+
default=True,
|
64 |
+
metadata={"help": "whether to momentum update only the transformer"},
|
65 |
+
)
|
66 |
+
ema_layers_only: bool = field(
|
67 |
+
default=True,
|
68 |
+
metadata={"help": "whether to momentum update only the transformer layers"},
|
69 |
+
)
|
70 |
+
|
71 |
+
max_update: int = II("optimization.max_update")
|
72 |
+
|
73 |
+
min_target_var: float = field(
|
74 |
+
default=0.1, metadata={"help": "stop training if target var falls below this"}
|
75 |
+
)
|
76 |
+
min_pred_var: float = field(
|
77 |
+
default=0.01,
|
78 |
+
metadata={"help": "stop training if prediction var falls below this"},
|
79 |
+
)
|
80 |
+
|
81 |
+
|
82 |
+
def get_annealed_rate(start, end, curr_step, total_steps):
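# linearly interpolate the ema decay from `start` (at step 0) to `end` (at `total_steps`)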
|
83 |
+
r = end - start
|
84 |
+
pct_remaining = 1 - curr_step / total_steps
|
85 |
+
return end - r * pct_remaining
|
86 |
+
|
87 |
+
|
88 |
+
@register_model("data2vec_audio", dataclass=Data2VecAudioConfig)
|
89 |
+
class Data2VecAudioModel(BaseFairseqModel):
|
90 |
+
def __init__(self, cfg: Data2VecAudioConfig):
|
91 |
+
super().__init__()
|
92 |
+
self.cfg = cfg
|
93 |
+
|
94 |
+
feature_enc_layers = eval(cfg.conv_feature_layers)
|
95 |
+
self.extractor_embed = feature_enc_layers[-1][0]
|
96 |
+
|
97 |
+
self.ema = None
|
98 |
+
self.embed = cfg.encoder_embed_dim
|
99 |
+
|
100 |
+
self.average_top_k_layers = cfg.average_top_k_layers
|
101 |
+
self.loss_beta = cfg.loss_beta
|
102 |
+
self.loss_scale = cfg.loss_scale
|
103 |
+
|
104 |
+
self.feature_extractor = ConvFeatureExtractionModel(
|
105 |
+
conv_layers=feature_enc_layers,
|
106 |
+
dropout=0.0,
|
107 |
+
mode=cfg.extractor_mode,
|
108 |
+
conv_bias=cfg.conv_bias,
|
109 |
+
)
|
110 |
+
|
111 |
+
self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim)
|
112 |
+
|
113 |
+
self.mask_prob = cfg.mask_prob
|
114 |
+
self.mask_selection = cfg.mask_selection
|
115 |
+
self.mask_other = cfg.mask_other
|
116 |
+
self.mask_length = cfg.mask_length
|
117 |
+
self.no_mask_overlap = cfg.no_mask_overlap
|
118 |
+
self.mask_min_space = cfg.mask_min_space
|
119 |
+
|
120 |
+
self.mask_channel_prob = cfg.mask_channel_prob
|
121 |
+
self.mask_channel_before = cfg.mask_channel_before
|
122 |
+
self.mask_channel_selection = cfg.mask_channel_selection
|
123 |
+
self.mask_channel_other = cfg.mask_channel_other
|
124 |
+
self.mask_channel_length = cfg.mask_channel_length
|
125 |
+
self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
|
126 |
+
self.mask_channel_min_space = cfg.mask_channel_min_space
|
127 |
+
|
128 |
+
self.dropout_input = nn.Dropout(cfg.dropout_input)
|
129 |
+
self.dropout_features = nn.Dropout(cfg.dropout_features)
|
130 |
+
|
131 |
+
self.feature_grad_mult = cfg.feature_grad_mult
|
132 |
+
|
133 |
+
self.mask_emb = nn.Parameter(
|
134 |
+
torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
|
135 |
+
)
|
136 |
+
|
137 |
+
self.encoder = TransformerEncoder(cfg)
|
138 |
+
self.layer_norm = LayerNorm(self.extractor_embed)
|
139 |
+
|
140 |
+
self.final_proj = nn.Linear(self.embed, self.embed)
|
141 |
+
|
142 |
+
self.num_updates = 0
|
143 |
+
|
144 |
+
def make_ema_teacher(self):
|
145 |
+
ema_config = EMAModuleConfig(
|
146 |
+
ema_decay=self.cfg.ema_decay,
|
147 |
+
ema_fp32=True,
|
148 |
+
)
|
149 |
+
skip_keys = set()
|
150 |
+
if self.cfg.ema_layers_only:
|
151 |
+
self.cfg.ema_transformer_only = True
|
152 |
+
for k, _ in self.encoder.pos_conv.named_parameters():
|
153 |
+
skip_keys.add(f"pos_conv.{k}")
|
154 |
+
|
155 |
+
self.ema = EMAModule(
|
156 |
+
self.encoder if self.cfg.ema_transformer_only else self,
|
157 |
+
ema_config,
|
158 |
+
skip_keys=skip_keys,
|
159 |
+
)
|
160 |
+
|
161 |
+
def set_num_updates(self, num_updates):
|
162 |
+
super().set_num_updates(num_updates)
|
163 |
+
|
164 |
+
if self.ema is None and self.final_proj is not None:
|
165 |
+
logger.info(f"making ema teacher")
|
166 |
+
self.make_ema_teacher()
|
167 |
+
elif self.training and self.ema is not None:
|
168 |
+
if self.cfg.ema_decay != self.cfg.ema_end_decay:
|
169 |
+
if num_updates >= self.cfg.ema_anneal_end_step:
|
170 |
+
decay = self.cfg.ema_end_decay
|
171 |
+
else:
|
172 |
+
decay = get_annealed_rate(
|
173 |
+
self.cfg.ema_decay,
|
174 |
+
self.cfg.ema_end_decay,
|
175 |
+
num_updates,
|
176 |
+
self.cfg.ema_anneal_end_step,
|
177 |
+
)
|
178 |
+
self.ema.set_decay(decay)
|
179 |
+
if self.ema.get_decay() < 1:
|
180 |
+
self.ema.step(self.encoder if self.cfg.ema_transformer_only else self)
|
181 |
+
|
182 |
+
self.num_updates = num_updates
|
183 |
+
|
184 |
+
def state_dict(self, destination=None, prefix="", keep_vars=False):
|
185 |
+
state = super().state_dict(destination, prefix, keep_vars)
|
186 |
+
|
187 |
+
if self.ema is not None:
|
188 |
+
state[prefix + "_ema"] = self.ema.fp32_params
|
189 |
+
|
190 |
+
return state
|
191 |
+
|
192 |
+
def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
|
193 |
+
if self.ema is not None:
|
194 |
+
k = prefix + "_ema"
|
195 |
+
assert k in state_dict
|
196 |
+
self.ema.restore(state_dict[k], True)
|
197 |
+
del state_dict[k]
|
198 |
+
return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
199 |
+
|
200 |
+
@classmethod
|
201 |
+
def build_model(cls, cfg: Data2VecAudioConfig, task=None):
|
202 |
+
"""Build a new model instance."""
|
203 |
+
|
204 |
+
return cls(cfg)
|
205 |
+
|
206 |
+
def apply_mask(
|
207 |
+
self,
|
208 |
+
x,
|
209 |
+
padding_mask,
|
210 |
+
mask_indices=None,
|
211 |
+
mask_channel_indices=None,
|
212 |
+
):
|
213 |
+
B, T, C = x.shape
|
214 |
+
|
215 |
+
if self.mask_channel_prob > 0 and self.mask_channel_before:
|
216 |
+
mask_channel_indices = compute_mask_indices(
|
217 |
+
(B, C),
|
218 |
+
None,
|
219 |
+
self.mask_channel_prob,
|
220 |
+
self.mask_channel_length,
|
221 |
+
self.mask_channel_selection,
|
222 |
+
self.mask_channel_other,
|
223 |
+
no_overlap=self.no_mask_channel_overlap,
|
224 |
+
min_space=self.mask_channel_min_space,
|
225 |
+
)
|
226 |
+
mask_channel_indices = (
|
227 |
+
torch.from_numpy(mask_channel_indices)
|
228 |
+
.to(x.device)
|
229 |
+
.unsqueeze(1)
|
230 |
+
.expand(-1, T, -1)
|
231 |
+
)
|
232 |
+
x[mask_channel_indices] = 0
|
233 |
+
|
234 |
+
if self.mask_prob > 0:
|
235 |
+
if mask_indices is None:
|
236 |
+
mask_indices = compute_mask_indices(
|
237 |
+
(B, T),
|
238 |
+
padding_mask,
|
239 |
+
self.mask_prob,
|
240 |
+
self.mask_length,
|
241 |
+
self.mask_selection,
|
242 |
+
self.mask_other,
|
243 |
+
min_masks=1,
|
244 |
+
no_overlap=self.no_mask_overlap,
|
245 |
+
min_space=self.mask_min_space,
|
246 |
+
require_same_masks=self.cfg.require_same_masks,
|
247 |
+
mask_dropout=self.cfg.mask_dropout,
|
248 |
+
)
|
249 |
+
mask_indices = torch.from_numpy(mask_indices).to(x.device)
|
250 |
+
x = index_put(x, mask_indices, self.mask_emb)
|
251 |
+
else:
|
252 |
+
mask_indices = None
|
253 |
+
|
254 |
+
if self.mask_channel_prob > 0 and not self.mask_channel_before:
|
255 |
+
if mask_channel_indices is None:
|
256 |
+
mask_channel_indices = compute_mask_indices(
|
257 |
+
(B, C),
|
258 |
+
None,
|
259 |
+
self.mask_channel_prob,
|
260 |
+
self.mask_channel_length,
|
261 |
+
self.mask_channel_selection,
|
262 |
+
self.mask_channel_other,
|
263 |
+
no_overlap=self.no_mask_channel_overlap,
|
264 |
+
min_space=self.mask_channel_min_space,
|
265 |
+
)
|
266 |
+
mask_channel_indices = (
|
267 |
+
torch.from_numpy(mask_channel_indices)
|
268 |
+
.to(x.device)
|
269 |
+
.unsqueeze(1)
|
270 |
+
.expand(-1, T, -1)
|
271 |
+
)
|
272 |
+
x = index_put(x, mask_channel_indices, 0)
|
273 |
+
|
274 |
+
return x, mask_indices
|
275 |
+
|
276 |
+
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
|
277 |
+
"""
|
278 |
+
Computes the output length of the convolutional layers
|
279 |
+
"""
|
280 |
+
|
281 |
+
def _conv_out_length(input_length, kernel_size, stride):
|
282 |
+
return torch.floor((input_length - kernel_size) / stride + 1)
|
283 |
+
|
284 |
+
conv_cfg_list = eval(self.cfg.conv_feature_layers)
|
285 |
+
|
286 |
+
for i in range(len(conv_cfg_list)):
|
287 |
+
input_lengths = _conv_out_length(
|
288 |
+
input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]
|
289 |
+
)
|
290 |
+
|
291 |
+
return input_lengths.to(torch.long)
|
292 |
+
|
293 |
+
def forward(
|
294 |
+
self,
|
295 |
+
source,
|
296 |
+
padding_mask=None,
|
297 |
+
mask=True,
|
298 |
+
features_only=False,
|
299 |
+
layer=None,
|
300 |
+
mask_indices=None,
|
301 |
+
mask_channel_indices=None,
|
302 |
+
padding_count=None,
|
303 |
+
):
|
304 |
+
features = source
|
305 |
+
|
306 |
+
if self.feature_grad_mult > 0:
|
307 |
+
features = self.feature_extractor(features)
|
308 |
+
if self.feature_grad_mult != 1.0:
|
309 |
+
features = GradMultiply.apply(features, self.feature_grad_mult)
|
310 |
+
else:
|
311 |
+
with torch.no_grad():
|
312 |
+
features = self.feature_extractor(features)
|
313 |
+
|
314 |
+
features = features.transpose(1, 2)
|
315 |
+
|
316 |
+
features = self.layer_norm(features)
|
317 |
+
|
318 |
+
orig_padding_mask = padding_mask
|
319 |
+
|
320 |
+
if padding_mask is not None and padding_mask.any():
|
321 |
+
input_lengths = (1 - padding_mask.long()).sum(-1)
|
322 |
+
# apply conv formula to get real output_lengths
|
323 |
+
output_lengths = self._get_feat_extract_output_lengths(input_lengths)
|
324 |
+
|
325 |
+
padding_mask = torch.zeros(
|
326 |
+
features.shape[:2], dtype=features.dtype, device=features.device
|
327 |
+
)
|
328 |
+
|
329 |
+
# these two operations make sure that all values
|
330 |
+
# before the output lengths indices are attended to
|
331 |
+
padding_mask[
|
332 |
+
(
|
333 |
+
torch.arange(padding_mask.shape[0], device=padding_mask.device),
|
334 |
+
output_lengths - 1,
|
335 |
+
)
|
336 |
+
] = 1
|
337 |
+
padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
|
338 |
+
else:
|
339 |
+
padding_mask = None
|
340 |
+
|
341 |
+
if self.post_extract_proj is not None:
|
342 |
+
features = self.post_extract_proj(features)
|
343 |
+
|
344 |
+
pre_encoder_features = None
|
345 |
+
if self.cfg.ema_transformer_only:
|
346 |
+
pre_encoder_features = features.clone()
|
347 |
+
|
348 |
+
features = self.dropout_input(features)
|
349 |
+
|
350 |
+
if mask:
|
351 |
+
x, mask_indices = self.apply_mask(
|
352 |
+
features,
|
353 |
+
padding_mask,
|
354 |
+
mask_indices=mask_indices,
|
355 |
+
mask_channel_indices=mask_channel_indices,
|
356 |
+
)
|
357 |
+
else:
|
358 |
+
x = features
|
359 |
+
mask_indices = None
|
360 |
+
|
361 |
+
x, layer_results = self.encoder(
|
362 |
+
x,
|
363 |
+
padding_mask=padding_mask,
|
364 |
+
layer=layer,
|
365 |
+
)
|
366 |
+
|
367 |
+
if features_only:
|
368 |
+
return {
|
369 |
+
"x": x,
|
370 |
+
"padding_mask": padding_mask,
|
371 |
+
"layer_results": layer_results,
|
372 |
+
}
|
373 |
+
|
374 |
+
result = {
|
375 |
+
"losses": {},
|
376 |
+
}
|
377 |
+
|
378 |
+
with torch.no_grad():
|
379 |
+
self.ema.model.eval()
|
380 |
+
|
381 |
+
if self.cfg.ema_transformer_only:
|
382 |
+
y, layer_results = self.ema.model.extract_features(
|
383 |
+
pre_encoder_features,
|
384 |
+
padding_mask=padding_mask,
|
385 |
+
min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
|
386 |
+
)
|
387 |
+
y = {
|
388 |
+
"x": y,
|
389 |
+
"padding_mask": padding_mask,
|
390 |
+
"layer_results": layer_results,
|
391 |
+
}
|
392 |
+
else:
|
393 |
+
y = self.ema.model.extract_features(
|
394 |
+
source=source,
|
395 |
+
padding_mask=orig_padding_mask,
|
396 |
+
mask=False,
|
397 |
+
)
|
398 |
+
|
399 |
+
target_layer_results = [l[2] for l in y["layer_results"]]
|
400 |
+
|
401 |
+
permuted = False
|
402 |
+
if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
|
403 |
+
target_layer_results = [
|
404 |
+
tl.permute(1, 2, 0) for tl in target_layer_results # TBC -> BCT
|
405 |
+
]
|
406 |
+
permuted = True
|
407 |
+
|
408 |
+
if self.cfg.batch_norm_target_layer:
|
409 |
+
target_layer_results = [
|
410 |
+
F.batch_norm(
|
411 |
+
tl.float(), running_mean=None, running_var=None, training=True
|
412 |
+
)
|
413 |
+
for tl in target_layer_results
|
414 |
+
]
|
415 |
+
|
416 |
+
if self.cfg.instance_norm_target_layer:
|
417 |
+
target_layer_results = [
|
418 |
+
F.instance_norm(tl.float()) for tl in target_layer_results
|
419 |
+
]
|
420 |
+
|
421 |
+
if permuted:
|
422 |
+
target_layer_results = [
|
423 |
+
tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC
|
424 |
+
]
|
425 |
+
|
426 |
+
if self.cfg.group_norm_target_layer:
|
427 |
+
target_layer_results = [
|
428 |
+
F.layer_norm(tl.float(), tl.shape[-2:])
|
429 |
+
for tl in target_layer_results
|
430 |
+
]
|
431 |
+
|
432 |
+
if self.cfg.layer_norm_target_layer:
|
433 |
+
target_layer_results = [
|
434 |
+
F.layer_norm(tl.float(), tl.shape[-1:])
|
435 |
+
for tl in target_layer_results
|
436 |
+
]
|
437 |
+
|
438 |
+
y = sum(target_layer_results) / len(target_layer_results)
|
439 |
+
|
440 |
+
if self.cfg.layer_norm_targets:
|
441 |
+
y = F.layer_norm(y.float(), y.shape[-1:])
|
442 |
+
|
443 |
+
if self.cfg.instance_norm_targets:
|
444 |
+
y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)
|
445 |
+
|
446 |
+
if not permuted:
|
447 |
+
y = y.transpose(0, 1)
|
448 |
+
|
449 |
+
y = y[mask_indices]
|
450 |
+
|
451 |
+
x = x[mask_indices]
|
452 |
+
x = self.final_proj(x)
|
453 |
+
|
454 |
+
sz = x.size(-1)
|
455 |
+
|
456 |
+
if self.loss_beta == 0:
|
457 |
+
loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
|
458 |
+
else:
|
459 |
+
loss = F.smooth_l1_loss(
|
460 |
+
x.float(), y.float(), reduction="none", beta=self.loss_beta
|
461 |
+
).sum(dim=-1)
|
462 |
+
|
463 |
+
if self.loss_scale is not None:
|
464 |
+
scale = self.loss_scale
|
465 |
+
else:
|
466 |
+
scale = 1 / math.sqrt(sz)
|
467 |
+
|
468 |
+
result["losses"]["regression"] = loss.sum() * scale
|
469 |
+
|
470 |
+
if "sample_size" not in result:
|
471 |
+
result["sample_size"] = loss.numel()
|
472 |
+
|
473 |
+
with torch.no_grad():
|
474 |
+
result["target_var"] = self.compute_var(y)
|
475 |
+
result["pred_var"] = self.compute_var(x.float())
|
476 |
+
|
477 |
+
if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
|
478 |
+
logger.error(
|
479 |
+
f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
|
480 |
+
)
|
481 |
+
raise Exception(
|
482 |
+
f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
|
483 |
+
)
|
484 |
+
if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
|
485 |
+
logger.error(
|
486 |
+
f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
|
487 |
+
)
|
488 |
+
raise Exception(
|
489 |
+
f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
|
490 |
+
)
|
491 |
+
|
492 |
+
if self.ema is not None:
|
493 |
+
result["ema_decay"] = self.ema.get_decay() * 1000
|
494 |
+
|
495 |
+
return result
|
496 |
+
|
497 |
+
@staticmethod
|
498 |
+
def compute_var(y):
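# per-dimension standard deviation of the flattened activations, averaged over dimensions;
# when torch.distributed is initialized, the statistics are all-reduced across workers first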
|
499 |
+
y = y.view(-1, y.size(-1))
|
500 |
+
if dist.is_initialized():
|
501 |
+
zc = torch.tensor(y.size(0)).cuda()
|
502 |
+
zs = y.sum(dim=0)
|
503 |
+
zss = (y ** 2).sum(dim=0)
|
504 |
+
|
505 |
+
dist.all_reduce(zc)
|
506 |
+
dist.all_reduce(zs)
|
507 |
+
dist.all_reduce(zss)
|
508 |
+
|
509 |
+
var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
|
510 |
+
return torch.sqrt(var + 1e-6).mean()
|
511 |
+
else:
|
512 |
+
return torch.sqrt(y.var(dim=0) + 1e-6).mean()
|
513 |
+
|
514 |
+
def extract_features(
|
515 |
+
self, source, padding_mask, mask=False, layer=None
|
516 |
+
):
|
517 |
+
res = self.forward(
|
518 |
+
source,
|
519 |
+
padding_mask,
|
520 |
+
mask=mask,
|
521 |
+
features_only=True,
|
522 |
+
layer=layer,
|
523 |
+
)
|
524 |
+
return res
|
525 |
+
|
526 |
+
def remove_pretraining_modules(self, last_layer=None):
|
527 |
+
self.final_proj = None
|
528 |
+
self.ema = None
|
529 |
+
if last_layer is not None:
|
530 |
+
self.encoder.layers = nn.ModuleList(
|
531 |
+
l for i, l in enumerate(self.encoder.layers) if i <= last_layer
|
532 |
+
)
|
533 |
+
|
534 |
+
import logging
|
535 |
+
|
536 |
+
import torch
|
537 |
+
import torch.nn.functional as F
|
538 |
+
from fairseq import tasks
|
539 |
+
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
|
540 |
+
from fairseq.data.audio.audio_utils import get_features_or_waveform
|
541 |
+
from omegaconf import OmegaConf
|
542 |
+
|
543 |
+
logger = logging.getLogger("dump_feature")
|
544 |
+
|
545 |
+
|
546 |
+
class Data2vecFeatureReader(object):
|
547 |
+
def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
|
548 |
+
state = load_checkpoint_to_cpu(ckpt_path)
|
549 |
+
cfg = state["cfg"]
|
550 |
+
# load task
|
551 |
+
task = tasks.setup_task(cfg.task, from_checkpoint=True)
|
552 |
+
task.load_state_dict(state["task_state"])
|
553 |
+
# load model config
|
554 |
+
if "layer_type" not in cfg.model:
|
555 |
+
# fix a missing key
|
556 |
+
model_config = {k: v for k, v in cfg.model.items()}
|
557 |
+
model_config["layer_type"] = "transformer"
|
558 |
+
model_config = OmegaConf.create(model_config)
|
559 |
+
else:
|
560 |
+
model_config = cfg.model
|
561 |
+
|
562 |
+
# fix param name in the state
|
563 |
+
state["model"]["final_proj.weight"] = state["model"].pop("final_proj.0.weight")
|
564 |
+
state["model"]["final_proj.bias"] = state["model"].pop("final_proj.0.bias")
|
565 |
+
del state["model"]["_ema"]
|
566 |
+
|
567 |
+
# load model
|
568 |
+
model = Data2VecAudioModel.build_model(model_config)
|
569 |
+
model.load_state_dict(
|
570 |
+
state["model"], strict=True, model_cfg=model_config
|
571 |
+
)
|
572 |
+
|
573 |
+
self.device = device
|
574 |
+
logger.info(f"device = {self.device}")
|
575 |
+
|
576 |
+
self.model = model.eval().to(self.device)
|
577 |
+
self.task = task
|
578 |
+
self.layer = layer - 1  # convert the 1-based layer argument to a 0-based index
|
579 |
+
self.max_chunk = max_chunk
|
580 |
+
logger.info(f"TASK CONFIG:\n{self.task.cfg}")
|
581 |
+
logger.info(f" max_chunk = {self.max_chunk}")
|
582 |
+
|
583 |
+
def read_audio(self, path, ref_len=None):
|
584 |
+
wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
|
585 |
+
if wav.ndim == 2:
|
586 |
+
wav = wav.mean(-1)
|
587 |
+
assert wav.ndim == 1, wav.ndim
|
588 |
+
if ref_len is not None and abs(ref_len - len(wav)) > 160:
|
589 |
+
logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
|
590 |
+
return wav
|
591 |
+
|
592 |
+
def get_feats(self, path, ref_len=None):
|
593 |
+
x = self.read_audio(path, ref_len=ref_len)
|
594 |
+
with torch.no_grad():
|
595 |
+
x = torch.from_numpy(x).float().to(self.device)
|
596 |
+
if self.task.cfg.normalize:
|
597 |
+
x = F.layer_norm(x, x.shape)
|
598 |
+
x = x.view(1, -1)
|
599 |
+
|
600 |
+
feat = []
|
601 |
+
for start in range(0, x.size(1), self.max_chunk):
|
602 |
+
x_chunk = x[:, start: start + self.max_chunk]
|
603 |
+
res = self.model.extract_features(
|
604 |
+
source=x_chunk,
|
605 |
+
padding_mask=None,
|
606 |
+
mask=False,
|
607 |
+
layer=self.layer,
|
608 |
+
)
|
609 |
+
feat_chunk = res["x"]
|
610 |
+
feat.append(feat_chunk)
|
611 |
+
return torch.cat(feat, 1).squeeze(0)
|
ASR/demo.ipynb
ADDED
@@ -0,0 +1,878 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "715a402a-44b9-4fa2-abf0-b0cfd2f3d80b",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"## Recording voice in Real Time"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
+
"id": "dbdf6bab-7418-4a6f-8b75-c31f98a6ada5",
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"\"\"\"\n",
|
19 |
+
"Sprints:\n",
|
20 |
+
"- [ ] Do Inference optimization of ASR LM\n",
|
21 |
+
"- [ ] Train on train.other.500\n",
|
22 |
+
"- [ ] Generate dataset for prompting\n",
|
23 |
+
"\n",
|
24 |
+
"Evaluation Dates: 20th - 21st June, 2023, 3:30 - 5:30pm\n",
|
25 |
+
"Sharpen PPT Skills: 20th June, 3:30pm - 4:45pm\n",
|
26 |
+
"Flow of the PPT:\n",
|
27 |
+
"Demo -> Datasets -> Techniques -> Evaluation -> Q&A\n",
|
28 |
+
"- [ Done ] Update the one pager deck slide\n",
|
29 |
+
"https://sprinklr-my.sharepoint.com/:p:/r/personal/sricharan_narayanam_sprinklr_com/_layouts/15/Doc.aspx?sourcedoc=%7B84811f56-5fc7-4eaa-87d2-db4a3588d18c%7D&action=edit&wdPreviousSession=948ccc35-dc05-f1f9-612d-9a22300e25ba\n",
|
30 |
+
"My PPT:\n",
|
31 |
+
"https://sprinklr-my.sharepoint.com/:p:/p/darshan_makwana/Ec4jCiyMWhxMproH625msc8BClFVceNQ8o4kS3EhZBO9MA?e=YCSDxm&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718703689001&web=1\n",
|
32 |
+
"Intern Tracker:\n",
|
33 |
+
"https://sprinklr.sharepoint.com/:x:/s/AIIntuition/EbRhHPIAIw9MlZ5PpXbztmABde1LFbaSoSHJAo9qU8ggDg?e=xiLkRt&wdOrigin=TEAMS-MAGLEV.p2p_ns.rwc&wdExp=TEAMS-TREATMENT&wdhostclicktime=1718692666812&web=1\n",
|
34 |
+
"\"\"\""
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"cell_type": "markdown",
|
39 |
+
"id": "150aca01-4098-4ab2-809a-25775ec52069",
|
40 |
+
"metadata": {},
|
41 |
+
"source": [
|
42 |
+
"## ASR LM Inference"
|
43 |
+
]
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"cell_type": "code",
|
47 |
+
"execution_count": null,
|
48 |
+
"id": "804a58af-beb2-48c1-9530-98024e27c0d6",
|
49 |
+
"metadata": {},
|
50 |
+
"outputs": [],
|
51 |
+
"source": [
|
52 |
+
"from audio_tokenizer import Data2vecFeatureReader\n",
|
53 |
+
"from repcodec.RepCodec import RepCodec\n",
|
54 |
+
"import torch.nn.functional as F\n",
|
55 |
+
"import torch\n",
|
56 |
+
"import yaml\n",
|
57 |
+
"\n",
|
58 |
+
"reader = Data2vecFeatureReader(\"./../prompting/models/vox_pretrained.pt\", 18, device=\"cuda:0\", max_chunk=1600000)\n",
|
59 |
+
"\n",
|
60 |
+
"config = \"./repcodec/configs/repcodec_dim1024.yaml\"\n",
|
61 |
+
"with open(config) as fp:\n",
|
62 |
+
" conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
|
63 |
+
"\n",
|
64 |
+
"audio_model = RepCodec(**conf)\n",
|
65 |
+
"audio_model.load_state_dict(torch.load(\"./../prompting/models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n",
|
66 |
+
"audio_model.quantizer.initial()\n",
|
67 |
+
"audio_model.to(\"cuda:0\")\n",
|
68 |
+
"audio_model.eval()\n",
|
69 |
+
"\n",
|
70 |
+
"print(\"Successfully Loaded Audio Tokenizer\")"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": null,
|
76 |
+
"id": "7d8da397-2030-4b36-9a42-97862488797b",
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [],
|
79 |
+
"source": [
|
80 |
+
"from datasets import load_dataset\n",
|
81 |
+
"\n",
|
82 |
+
"cache_dir = \"./../cache\"\n",
|
83 |
+
"dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)"
|
84 |
+
]
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"cell_type": "code",
|
88 |
+
"execution_count": 1,
|
89 |
+
"id": "bb8016b2-fc9d-4c23-9e85-b6e1c5ca164c",
|
90 |
+
"metadata": {},
|
91 |
+
"outputs": [],
|
92 |
+
"source": [
|
93 |
+
"from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n",
|
94 |
+
"import torch\n",
|
95 |
+
"import string\n",
|
96 |
+
"\n",
|
97 |
+
"def process(text):\n",
|
98 |
+
"\n",
|
99 |
+
" # Lower case every letter\n",
|
100 |
+
" text = text.lower()\n",
|
101 |
+
"\n",
|
102 |
+
" # Remove punctuation\n",
|
103 |
+
" punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n",
|
104 |
+
" translation_table = str.maketrans('', '', punctuation_to_remove)\n",
|
105 |
+
" text = text.translate(translation_table)\n",
|
106 |
+
"\n",
|
107 |
+
" # Remove whitespaces from front and behind\n",
|
108 |
+
" while text[0] == ' ' or text[-1] == ' ':\n",
|
109 |
+
" if text[0] == ' ':\n",
|
110 |
+
" text = text[1:]\n",
|
111 |
+
" if text[-1] == ' ':\n",
|
112 |
+
" text = text[:-1]\n",
|
113 |
+
" \n",
|
114 |
+
" return text\n",
|
115 |
+
"\n",
|
116 |
+
"device = \"cuda:0\"\n",
|
117 |
+
"dtype = torch.float16\n",
|
118 |
+
"context_length = 1877\n",
|
119 |
+
"\n",
|
120 |
+
"# Load tokenizer and add audio tokens\n",
|
121 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
|
122 |
+
"eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
|
123 |
+
"pad_token = tokenizer.encode(\"<|padding|>\")[0]\n",
|
124 |
+
"\n",
|
125 |
+
"model = GPT2LMHeadModel.from_pretrained(\"./../out/checkpoint-19000\", attn_implementation=\"flash_attention_2\", device_map=device, torch_dtype=dtype).eval()\n",
|
126 |
+
"model.config.pad_token_id = pad_token\n",
|
127 |
+
"model.config.eos_token_id = eot_token\n",
|
128 |
+
"# model = torch.compile(model)"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"execution_count": 3,
|
134 |
+
"id": "693db182-92ac-4e36-b848-989fafd10e73",
|
135 |
+
"metadata": {},
|
136 |
+
"outputs": [
|
137 |
+
{
|
138 |
+
"data": {
|
139 |
+
"text/plain": [
|
140 |
+
"GPT2Model(\n",
|
141 |
+
" (wte): Embedding(6027, 768)\n",
|
142 |
+
" (wpe): Embedding(1877, 768)\n",
|
143 |
+
" (drop): Dropout(p=0.1, inplace=False)\n",
|
144 |
+
" (h): ModuleList(\n",
|
145 |
+
" (0-11): 12 x GPT2Block(\n",
|
146 |
+
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
147 |
+
" (attn): GPT2FlashAttention2(\n",
|
148 |
+
" (c_attn): Conv1D()\n",
|
149 |
+
" (c_proj): Conv1D()\n",
|
150 |
+
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
|
151 |
+
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
|
152 |
+
" )\n",
|
153 |
+
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
154 |
+
" (mlp): GPT2MLP(\n",
|
155 |
+
" (c_fc): Conv1D()\n",
|
156 |
+
" (c_proj): Conv1D()\n",
|
157 |
+
" (act): NewGELUActivation()\n",
|
158 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
159 |
+
" )\n",
|
160 |
+
" )\n",
|
161 |
+
" )\n",
|
162 |
+
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
163 |
+
")"
|
164 |
+
]
|
165 |
+
},
|
166 |
+
"execution_count": 3,
|
167 |
+
"metadata": {},
|
168 |
+
"output_type": "execute_result"
|
169 |
+
}
|
170 |
+
],
|
171 |
+
"source": [
|
172 |
+
"model.transformer"
|
173 |
+
]
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"cell_type": "code",
|
177 |
+
"execution_count": null,
|
178 |
+
"id": "7cabe9dc-bbbf-41b4-918f-3f60ee5582f2",
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [],
|
181 |
+
"source": [
|
182 |
+
"from tqdm import tqdm\n",
|
183 |
+
"from math import ceil\n",
|
184 |
+
"import torch\n",
|
185 |
+
"import time\n",
|
186 |
+
"\n",
|
187 |
+
"sample = dataset[\"train.clean.100\"][5]\n",
|
188 |
+
"\n",
|
189 |
+
"x = sample[\"audio\"][\"array\"]\n",
|
190 |
+
"\n",
|
191 |
+
"start_time = time.time()\n",
|
192 |
+
"\n",
|
193 |
+
"with torch.no_grad():\n",
|
194 |
+
" x = torch.from_numpy(x).float().to(reader.device)\n",
|
195 |
+
" if reader.task.cfg.normalize:\n",
|
196 |
+
" x = F.layer_norm(x, x.shape)\n",
|
197 |
+
" x = x.view(1, -1)\n",
|
198 |
+
"\n",
|
199 |
+
" feat = []\n",
|
200 |
+
" for start in range(0, x.size(1), reader.max_chunk):\n",
|
201 |
+
" x_chunk = x[:, start: start + reader.max_chunk]\n",
|
202 |
+
" res = reader.model.extract_features(\n",
|
203 |
+
" source=x_chunk,\n",
|
204 |
+
" padding_mask=None,\n",
|
205 |
+
" mask=False,\n",
|
206 |
+
" layer=reader.layer,\n",
|
207 |
+
" )\n",
|
208 |
+
" feat_chunk = res[\"x\"]\n",
|
209 |
+
" feat.append(feat_chunk)\n",
|
210 |
+
" \n",
|
211 |
+
" features = torch.cat(feat, 1).permute(0, 2, 1)\n",
|
212 |
+
"\n",
|
213 |
+
" x = audio_model.encoder(features)\n",
|
214 |
+
" z = audio_model.projector(x)\n",
|
215 |
+
" _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
|
216 |
+
" tokens = idx.cpu().data.numpy().tolist()[0]\n",
|
217 |
+
" \n",
|
218 |
+
"text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n",
|
219 |
+
"input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n",
|
220 |
+
"\n",
|
221 |
+
"input_time = time.time()\n",
|
222 |
+
"\n",
|
223 |
+
"generations = model.generate(\n",
|
224 |
+
" input_ids,\n",
|
225 |
+
" pad_token_id = pad_token,\n",
|
226 |
+
" eos_token_id = eot_token,\n",
|
227 |
+
" max_new_tokens = context_length,\n",
|
228 |
+
" use_cache=True\n",
|
229 |
+
")\n",
|
230 |
+
"\n",
|
231 |
+
"finish_time = time.time()\n",
|
232 |
+
"\n",
|
233 |
+
"tokenizer.batch_decode(generations, skip_special_tokens=True)\n",
|
234 |
+
"print(\"First Token Latency: \", (input_time - start_time) * 1000, \"ms\")\n",
|
235 |
+
"# print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n",
|
236 |
+
"print(\"End to End Inference Time: \", (finish_time - start_time) * 1000, \"ms\")\n",
|
237 |
+
"print(\"Refer Text: \", process(sample[\"text\"]))\n",
|
238 |
+
"print(\"Transcript: \", tokenizer.batch_decode(generations, skip_special_tokens=True)[0])"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": null,
|
244 |
+
"id": "baa8d79b-7cf5-4435-838c-1f3d4e043d60",
|
245 |
+
"metadata": {},
|
246 |
+
"outputs": [],
|
247 |
+
"source": [
|
248 |
+
"import time\n",
|
249 |
+
"\n",
|
250 |
+
"sample = dataset[\"train.clean.100\"][0]\n",
|
251 |
+
"\n",
|
252 |
+
"x = sample[\"audio\"][\"array\"]\n",
|
253 |
+
"\n",
|
254 |
+
"start_time = time.time()\n",
|
255 |
+
"\n",
|
256 |
+
"with torch.no_grad():\n",
|
257 |
+
" x = torch.from_numpy(x).float().to(reader.device)\n",
|
258 |
+
" if reader.task.cfg.normalize:\n",
|
259 |
+
" x = F.layer_norm(x, x.shape)\n",
|
260 |
+
" x = x.view(1, -1)\n",
|
261 |
+
"\n",
|
262 |
+
" feat = []\n",
|
263 |
+
" for start in range(0, x.size(1), reader.max_chunk):\n",
|
264 |
+
" x_chunk = x[:, start: start + reader.max_chunk]\n",
|
265 |
+
" res = reader.model.extract_features(\n",
|
266 |
+
" source=x_chunk,\n",
|
267 |
+
" padding_mask=None,\n",
|
268 |
+
" mask=False,\n",
|
269 |
+
" layer=reader.layer,\n",
|
270 |
+
" )\n",
|
271 |
+
" feat_chunk = res[\"x\"]\n",
|
272 |
+
" feat.append(feat_chunk)\n",
|
273 |
+
" \n",
|
274 |
+
" features = torch.cat(feat, 1).permute(0, 2, 1)\n",
|
275 |
+
"\n",
|
276 |
+
" x = audio_model.encoder(features)\n",
|
277 |
+
" z = audio_model.projector(x)\n",
|
278 |
+
" _, idx = audio_model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
|
279 |
+
" tokens = idx.cpu().data.numpy().tolist()[0]\n",
|
280 |
+
"\n",
|
281 |
+
"from tqdm import tqdm\n",
|
282 |
+
"from math import ceil\n",
|
283 |
+
"import torch\n",
|
284 |
+
"\n",
|
285 |
+
"context_length = 1877\n",
|
286 |
+
"eot_token = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
|
287 |
+
"pad_token = tokenizer.encode(\"<|padding|>\")[0]\n",
|
288 |
+
" \n",
|
289 |
+
"text = \"\".join([f\"<|audio:{token}|>\" for token in tokens]) + \"<|startoftranscript|>\"\n",
|
290 |
+
"input_ids = tokenizer(text, return_tensors=\"pt\").to(device)[\"input_ids\"]\n",
|
291 |
+
"\n",
|
292 |
+
"max_new_tokens = context_length\n",
|
293 |
+
"num_tokens = 0\n",
|
294 |
+
"first_token = True\n",
|
295 |
+
"\n",
|
296 |
+
"while max_new_tokens > 0 and input_ids.shape[-1] < context_length:\n",
|
297 |
+
"\n",
|
298 |
+
" with torch.no_grad():\n",
|
299 |
+
" outputs = model(input_ids = input_ids)\n",
|
300 |
+
"\n",
|
301 |
+
" logits = outputs[\"logits\"][:, -1]\n",
|
302 |
+
"\n",
|
303 |
+
" # Greedy Sampling\n",
|
304 |
+
" probas = torch.softmax(logits, dim=-1)\n",
|
305 |
+
" pred_idx = torch.argmax(probas, dim=-1, keepdim=True)\n",
|
306 |
+
" next_idx = pred_idx.item()\n",
|
307 |
+
"\n",
|
308 |
+
" if first_token:\n",
|
309 |
+
" first_token_latency = time.time() - start_time\n",
|
310 |
+
" first_token = False\n",
|
311 |
+
" start_time = time.time()\n",
|
312 |
+
"\n",
|
313 |
+
" if next_idx == eot_token:\n",
|
314 |
+
" break\n",
|
315 |
+
"\n",
|
316 |
+
" input_ids = torch.cat((input_ids, pred_idx), dim=-1)\n",
|
317 |
+
"\n",
|
318 |
+
" max_new_tokens -= 1\n",
|
319 |
+
" num_tokens += 1\n",
|
320 |
+
"\n",
|
321 |
+
"total_time = time.time() - start_time\n",
|
322 |
+
"\n",
|
323 |
+
"print(\"First Token Latency: \", first_token_latency * 1000, \"ms\")\n",
|
324 |
+
"print(\"Throughput: \", (1 + num_tokens)/total_time, \"tokens/s\")\n",
|
325 |
+
"print(\"End to End Inference Time: \", (total_time + first_token_latency) * 1000, \"ms\")\n",
|
326 |
+
"print(tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0])\n",
|
327 |
+
"print(process(sample[\"text\"]))"
|
328 |
+
]
|
329 |
+
},
|
330 |
+
{
|
331 |
+
"cell_type": "code",
|
332 |
+
"execution_count": null,
|
333 |
+
"id": "014ed999-3293-4d68-8f9c-017584adc642",
|
334 |
+
"metadata": {},
|
335 |
+
"outputs": [],
|
336 |
+
"source": [
|
337 |
+
"tokenizer.batch_decode([[1, 2, 3]])"
|
338 |
+
]
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"cell_type": "markdown",
|
342 |
+
"id": "ec11e43f-1eb8-4399-9a93-6f1427782661",
|
343 |
+
"metadata": {
|
344 |
+
"jp-MarkdownHeadingCollapsed": true
|
345 |
+
},
|
346 |
+
"source": [
|
347 |
+
"## Accelerating GPT 2 Inference"
|
348 |
+
]
|
349 |
+
},
|
350 |
+
{
|
351 |
+
"cell_type": "code",
|
352 |
+
"execution_count": null,
|
353 |
+
"id": "5489cb4e-3213-4931-abe1-4c96d1a7ba56",
|
354 |
+
"metadata": {},
|
355 |
+
"outputs": [],
|
356 |
+
"source": [
|
357 |
+
"\"\"\"\n",
|
358 |
+
"- change tensorrt.tensorrt to tensorrt\n",
|
359 |
+
"- remove cpu quantization lines\n",
|
360 |
+
"- output_names [\"logits\"]\n",
|
361 |
+
"\"\"\""
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": null,
|
367 |
+
"id": "7e7e6ea6-7319-4e57-af33-5d917d26abc6",
|
368 |
+
"metadata": {},
|
369 |
+
"outputs": [],
|
370 |
+
"source": [
|
371 |
+
"import logging\n",
|
372 |
+
"import time\n",
|
373 |
+
"from typing import Callable, Dict\n",
|
374 |
+
"\n",
|
375 |
+
"import numpy as np\n",
|
376 |
+
"import tensorrt as trt\n",
|
377 |
+
"import torch\n",
|
378 |
+
"from tensorrt import ICudaEngine\n",
|
379 |
+
"from tensorrt import Logger, Runtime\n",
|
380 |
+
"from transformers import AutoTokenizer, BatchEncoding, GPT2LMHeadModel, AutoModelForCausalLM\n",
|
381 |
+
"from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions\n",
|
382 |
+
"from transformer_deploy.utils.generative_model import GPTModelWrapper\n",
|
383 |
+
"import inspect\n",
|
384 |
+
"from transformers import TensorType\n",
|
385 |
+
"\n",
|
386 |
+
"from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx\n",
|
387 |
+
"from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size\n",
|
388 |
+
"from transformer_deploy.backends.trt_utils import build_engine, load_engine, save_engine"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
{
|
392 |
+
"cell_type": "code",
|
393 |
+
"execution_count": null,
|
394 |
+
"id": "21681412-7747-4824-894a-6006eb12a821",
|
395 |
+
"metadata": {},
|
396 |
+
"outputs": [],
|
397 |
+
"source": [
|
398 |
+
"model_name = \"gpt2\"\n",
|
399 |
+
"\n",
|
400 |
+
"model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)\n",
|
401 |
+
"model.eval()\n",
|
402 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
403 |
+
"model.config.pad_token_id = tokenizer.eos_token_id"
|
404 |
+
]
|
405 |
+
},
|
406 |
+
{
|
407 |
+
"cell_type": "code",
|
408 |
+
"execution_count": null,
|
409 |
+
"id": "46783acd-c404-44b4-904b-d8fb687afc34",
|
410 |
+
"metadata": {},
|
411 |
+
"outputs": [],
|
412 |
+
"source": [
|
413 |
+
"inputs = tokenizer(\"Here is some text to encode Hello World\", return_tensors=\"pt\")\n",
|
414 |
+
"print(\"input tensors\")\n",
|
415 |
+
"print(inputs)\n",
|
416 |
+
"print(\"input tensor shape\")\n",
|
417 |
+
"print(inputs[\"input_ids\"].size())\n",
|
418 |
+
"\n",
|
419 |
+
"with torch.no_grad():\n",
|
420 |
+
" outputs = model(**inputs)\n",
|
421 |
+
"\n",
|
422 |
+
"logits = outputs.logits\n",
|
423 |
+
"print(\"output tensor\")\n",
|
424 |
+
"print(logits)\n",
|
425 |
+
"print(\"output shape\")\n",
|
426 |
+
"print(logits.shape)"
|
427 |
+
]
|
428 |
+
},
|
429 |
+
{
|
430 |
+
"cell_type": "code",
|
431 |
+
"execution_count": null,
|
432 |
+
"id": "2f6cc7bd-5e2d-4d4e-a7e6-73a6b2ecd7af",
|
433 |
+
"metadata": {},
|
434 |
+
"outputs": [],
|
435 |
+
"source": [
|
436 |
+
"size = 0\n",
|
437 |
+
"for i in range(8, 256, 1):\n",
|
438 |
+
" # input sequence (input_ids) made of int-32 (4 bytes)\n",
|
439 |
+
" size += np.prod([1, i]) * 4\n",
|
440 |
+
" # output tensor made of float-32 (4 bytes)\n",
|
441 |
+
" size += np.prod([1, i, 50257]) * 4\n",
|
442 |
+
"print(f\"total size (input+output): {size / 1024**3:.2f} Gb\")\n",
|
443 |
+
"\n",
|
444 |
+
"# to manually check actual tensor size:\n",
|
445 |
+
"# np.prod(logits.shape)*32/8/1024**2:.2f}\n",
|
446 |
+
"# or\n",
|
447 |
+
"# sys.getsizeof(logits.storage())/1024**2"
|
448 |
+
]
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"cell_type": "code",
|
452 |
+
"execution_count": null,
|
453 |
+
"id": "7debb40e-9941-45e4-9db8-4bb021ce44ab",
|
454 |
+
"metadata": {},
|
455 |
+
"outputs": [],
|
456 |
+
"source": [
|
457 |
+
"input_ids: BatchEncoding = tokenizer(\n",
|
458 |
+
" \"Here is some text to encode Hello World\", add_special_tokens=True, return_attention_mask=False, return_tensors=\"pt\"\n",
|
459 |
+
")\n",
|
460 |
+
"# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type\n",
|
461 |
+
"for k, v in input_ids.items(): # type: str, torch.Tensor\n",
|
462 |
+
" input_ids[k] = v.type(dtype=torch.int32)\n",
|
463 |
+
"\n",
|
464 |
+
"convert_to_onnx(\n",
|
465 |
+
" model_pytorch=model,\n",
|
466 |
+
" output_path=\"test-gpt2.onnx\",\n",
|
467 |
+
" inputs_pytorch=dict(input_ids),\n",
|
468 |
+
" quantization=False,\n",
|
469 |
+
" var_output_seq=True, # we inform ONNX export tool that the output shape will vary with the input shape\n",
|
470 |
+
" output_names = [\"logits\"]\n",
|
471 |
+
")\n",
|
472 |
+
"# model may switch to train mode for some unknown reasons, we force the eval mode.\n",
|
473 |
+
"_ = model.eval()"
|
474 |
+
]
|
475 |
+
},
|
476 |
+
{
|
477 |
+
"cell_type": "code",
|
478 |
+
"execution_count": null,
|
479 |
+
"id": "956c3007-2c18-4d92-af4f-6cef474d86b5",
|
480 |
+
"metadata": {},
|
481 |
+
"outputs": [],
|
482 |
+
"source": [
|
483 |
+
"logging.basicConfig()\n",
|
484 |
+
"logging.getLogger().setLevel(logging.INFO)\n",
|
485 |
+
"num_attention_heads, hidden_size = get_model_size(path=model_name)\n",
|
486 |
+
"optimize_onnx(\n",
|
487 |
+
" onnx_path=\"test-gpt2.onnx\",\n",
|
488 |
+
" onnx_optim_model_path=\"test-gpt2-opt.onnx\",\n",
|
489 |
+
" fp16=False,\n",
|
490 |
+
" use_cuda=True,\n",
|
491 |
+
" num_attention_heads=num_attention_heads,\n",
|
492 |
+
" hidden_size=hidden_size,\n",
|
493 |
+
" architecture=\"gpt2\",\n",
|
494 |
+
")"
|
495 |
+
]
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"cell_type": "code",
|
499 |
+
"execution_count": null,
|
500 |
+
"id": "85f30ed9-2802-46c9-9201-a70e200b6860",
|
501 |
+
"metadata": {},
|
502 |
+
"outputs": [],
|
503 |
+
"source": [
|
504 |
+
"from pathlib import Path\n",
|
505 |
+
"\n",
|
506 |
+
"trt_logger: Logger = trt.Logger(trt.Logger.ERROR)\n",
|
507 |
+
"runtime: Runtime = trt.Runtime(trt_logger)\n",
|
508 |
+
"trt_model_name = \"test-gpt2.plan\"\n",
|
509 |
+
"\n",
|
510 |
+
"# create only of does not exist because it's slow to run...\n",
|
511 |
+
"\n",
|
512 |
+
"engine: ICudaEngine = build_engine(\n",
|
513 |
+
" runtime=runtime,\n",
|
514 |
+
" onnx_file_path=\"test-gpt2.onnx\",\n",
|
515 |
+
" logger=trt_logger,\n",
|
516 |
+
" min_shape=(1, 1),\n",
|
517 |
+
" optimal_shape=(1, 128), # num beam, batch size\n",
|
518 |
+
" max_shape=(1, 384), # num beam, batch size\n",
|
519 |
+
" workspace_size=10000 * 1024**2,\n",
|
520 |
+
" fp16=True,\n",
|
521 |
+
" int8=False,\n",
|
522 |
+
")\n",
|
523 |
+
"save_engine(engine, trt_model_name)"
|
524 |
+
]
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"cell_type": "code",
|
528 |
+
"execution_count": null,
|
529 |
+
"id": "908fe664-800e-4c5f-a1d5-adfd31fd1c64",
|
530 |
+
"metadata": {},
|
531 |
+
"outputs": [],
|
532 |
+
"source": [
|
533 |
+
"engine.num_bindings"
|
534 |
+
]
|
535 |
+
},
|
536 |
+
{
|
537 |
+
"cell_type": "code",
|
538 |
+
"execution_count": null,
|
539 |
+
"id": "4626926b-fa94-4633-95d5-0d515f8db5f6",
|
540 |
+
"metadata": {},
|
541 |
+
"outputs": [],
|
542 |
+
"source": [
|
543 |
+
"print(inspect.getsource(GPTModelWrapper))"
|
544 |
+
]
|
545 |
+
},
|
546 |
+
{
|
547 |
+
"cell_type": "code",
|
548 |
+
"execution_count": null,
|
549 |
+
"id": "d5bd1de1-a949-46a3-8d15-457d51db4e40",
|
550 |
+
"metadata": {},
|
551 |
+
"outputs": [],
|
552 |
+
"source": [
|
553 |
+
"inputs = tokenizer(\n",
|
554 |
+
" \"Here is some text to encode Hello World\", # Nvidia example prompt\n",
|
555 |
+
" add_special_tokens=True,\n",
|
556 |
+
" return_attention_mask=False, # Not used\n",
|
557 |
+
" return_tensors=TensorType.PYTORCH,\n",
|
558 |
+
")\n",
|
559 |
+
"inputs"
|
560 |
+
]
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"cell_type": "code",
|
564 |
+
"execution_count": null,
|
565 |
+
"id": "815b548f-fa00-4183-b72c-10ecdd4b11c7",
|
566 |
+
"metadata": {},
|
567 |
+
"outputs": [],
|
568 |
+
"source": [
|
569 |
+
"from transformers.generation import GenerationConfig\n",
|
570 |
+
"\n",
|
571 |
+
"class GPTWrapper(GPTModelWrapper):\n",
|
572 |
+
" def __init__(self, *args, **kwargs):\n",
|
573 |
+
" super().__init__(*args, **kwargs)\n",
|
574 |
+
"\n",
|
575 |
+
" self.generation_config = GenerationConfig.from_model_config(self.config) if self.can_generate() else None\n",
|
576 |
+
"\n",
|
577 |
+
" @classmethod\n",
|
578 |
+
" def can_generate(cls) -> bool:\n",
|
579 |
+
" \"\"\"\n",
|
580 |
+
" Returns whether this model can generate sequences with `.generate()`.\n",
|
581 |
+
"\n",
|
582 |
+
" Returns:\n",
|
583 |
+
" `bool`: Whether this model can generate sequences with `.generate()`.\n",
|
584 |
+
" \"\"\"\n",
|
585 |
+
" # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.\n",
|
586 |
+
" # Alternativelly, the model can also have a custom `generate` function.\n",
|
587 |
+
" if \"GenerationMixin\" in str(cls.prepare_inputs_for_generation) and \"GenerationMixin\" in str(cls.generate):\n",
|
588 |
+
" return False\n",
|
589 |
+
" return True"
|
590 |
+
]
|
591 |
+
},
|
592 |
+
{
|
593 |
+
"cell_type": "code",
|
594 |
+
"execution_count": null,
|
595 |
+
"id": "ca57ed1e-0bbe-48dd-ae0f-f3d8ecd7fd04",
|
596 |
+
"metadata": {},
|
597 |
+
"outputs": [],
|
598 |
+
"source": [
|
599 |
+
"def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
600 |
+
" transformer_outputs: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)\n",
|
601 |
+
" return model.lm_head(transformer_outputs.last_hidden_state)\n",
|
602 |
+
"\n",
|
603 |
+
"\n",
|
604 |
+
"model.cuda()\n",
|
605 |
+
"model.eval()\n",
|
606 |
+
"inputs.to(\"cuda\")\n",
|
607 |
+
"with torch.inference_mode():\n",
|
608 |
+
" gpt2_model = GPTWrapper(config=model.config, device=model.device, inference=inference_torch)\n",
|
609 |
+
" sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
610 |
+
" print(tokenizer.decode(sample_output[0], skip_special_tokens=False))\n",
|
611 |
+
" for _ in range(2):\n",
|
612 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
613 |
+
" torch.cuda.synchronize()\n",
|
614 |
+
" start = time.time()\n",
|
615 |
+
" for _ in range(10):\n",
|
616 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
617 |
+
" torch.cuda.synchronize()\n",
|
618 |
+
" print(f\"----\\nPytorch: {(time.time() - start)/10:.2f}s/sequence\")\n",
|
619 |
+
"_ = model.cpu()"
|
620 |
+
]
|
621 |
+
},
|
622 |
+
{
|
623 |
+
"cell_type": "code",
|
624 |
+
"execution_count": null,
|
625 |
+
"id": "f0849aae-876e-47bc-b045-14a594170947",
|
626 |
+
"metadata": {},
|
627 |
+
"outputs": [],
|
628 |
+
"source": [
|
629 |
+
"model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n",
|
630 |
+
"\n",
|
631 |
+
"\n",
|
632 |
+
"def inference_onnx_naive(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
633 |
+
" data = {\"input_ids\": input_ids.detach().cpu().numpy().astype(np.int32)}\n",
|
634 |
+
" logit = model_onnx.run(None, data)\n",
|
635 |
+
" np_logit = np.array(logit) # convert list of numpy arrays to a numpy array\n",
|
636 |
+
" # we convert numpy tensor to Pytorch tensor as it's the type expected by HF decoding algorithm\n",
|
637 |
+
" return torch.squeeze(torch.from_numpy(np_logit), dim=0)\n",
|
638 |
+
"\n",
|
639 |
+
"\n",
|
640 |
+
"gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cpu\"), inference=inference_onnx_naive)\n",
|
641 |
+
"inputs.to(\"cpu\")\n",
|
642 |
+
"sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
643 |
+
"print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
|
644 |
+
"for _ in range(2):\n",
|
645 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
646 |
+
"start = time.time()\n",
|
647 |
+
"for _ in range(10):\n",
|
648 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
649 |
+
"print(f\"----\\nONNX Runtime (standard API): {(time.time() - start)/10:.2f}s/sequence\")\n",
|
650 |
+
"\n",
|
651 |
+
"del model_onnx"
|
652 |
+
]
|
653 |
+
},
|
654 |
+
{
|
655 |
+
"cell_type": "code",
|
656 |
+
"execution_count": null,
|
657 |
+
"id": "96114897-894b-4997-bc61-8ac0682e0e55",
|
658 |
+
"metadata": {},
|
659 |
+
"outputs": [],
|
660 |
+
"source": [
|
661 |
+
"model_onnx = create_model_for_provider(path=\"test-gpt2-opt.onnx\", provider_to_use=\"CUDAExecutionProvider\")\n",
|
662 |
+
"\n",
|
663 |
+
"\n",
|
664 |
+
"def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
665 |
+
" data = {\"input_ids\": input_ids}\n",
|
666 |
+
" return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device=\"cuda\")[\"output\"]\n",
|
667 |
+
"\n",
|
668 |
+
"\n",
|
669 |
+
"gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_onnx_optimized)\n",
|
670 |
+
"inputs.to(\"cuda\")\n",
|
671 |
+
"sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
672 |
+
"print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
|
673 |
+
"for _ in range(2):\n",
|
674 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
675 |
+
"start = time.time()\n",
|
676 |
+
"for _ in range(10):\n",
|
677 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
678 |
+
"print(f\"----\\nONNX Runtime (binding io API): {(time.time() - start)/10:.2f}/sequence\")\n",
|
679 |
+
"del model_onnx"
|
680 |
+
]
|
681 |
+
},
|
682 |
+
{
|
683 |
+
"cell_type": "code",
|
684 |
+
"execution_count": null,
|
685 |
+
"id": "0b5b5427-fd6b-4f70-b307-9c579f0f842a",
|
686 |
+
"metadata": {},
|
687 |
+
"outputs": [],
|
688 |
+
"source": [
|
689 |
+
"tensorrt_model: Callable[[Dict[str, torch.Tensor]], torch.Tensor] = load_engine(\n",
|
690 |
+
" engine_file_path=\"test-gpt2.plan\", runtime=runtime\n",
|
691 |
+
")\n",
|
692 |
+
"\n",
|
693 |
+
"\n",
|
694 |
+
"def inference_tensorrt(input_ids: torch.Tensor) -> torch.Tensor:\n",
|
695 |
+
" data = {\"input_ids\": input_ids}\n",
|
696 |
+
" return tensorrt_model(data)\n",
|
697 |
+
"\n",
|
698 |
+
"\n",
|
699 |
+
"gpt2_model = GPTWrapper(config=model.config, device=torch.device(\"cuda\"), inference=inference_tensorrt)\n",
|
700 |
+
"inputs.to(\"cuda\")\n",
|
701 |
+
"sample_output = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
702 |
+
"print(tokenizer.decode(sample_output[0], skip_special_tokens=True))\n",
|
703 |
+
"for _ in range(2):\n",
|
704 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=64)\n",
|
705 |
+
"start = time.time()\n",
|
706 |
+
"for _ in range(10):\n",
|
707 |
+
" _ = gpt2_model.generate(inputs.input_ids, max_length=256)\n",
|
708 |
+
"print(f\"----\\nTensorRT + CUDA tensors: {(time.time() - start)/10:.2f}/sequence\")\n",
|
709 |
+
"\n",
|
710 |
+
"del tensorrt_model"
|
711 |
+
]
|
712 |
+
},
|
713 |
+
{
|
714 |
+
"cell_type": "markdown",
|
715 |
+
"id": "f547239d-4f7a-433b-8ef6-9e5110a61f4b",
|
716 |
+
"metadata": {
|
717 |
+
"jp-MarkdownHeadingCollapsed": true
|
718 |
+
},
|
719 |
+
"source": [
|
720 |
+
"## Using CUDAExecution Provider"
|
721 |
+
]
|
722 |
+
},
|
723 |
+
{
|
724 |
+
"cell_type": "code",
|
725 |
+
"execution_count": null,
|
726 |
+
"id": "6e34c682-85fc-4e8d-b13c-7c1c9ea39ead",
|
727 |
+
"metadata": {},
|
728 |
+
"outputs": [],
|
729 |
+
"source": [
|
730 |
+
"from optimum.onnxruntime import ORTModelForCausalLM\n",
|
731 |
+
"from optimum.pipelines import pipeline\n",
|
732 |
+
"from transformers import AutoTokenizer\n",
|
733 |
+
"\n",
|
734 |
+
"model_id = \"openai-community/gpt2\"\n",
|
735 |
+
"\n",
|
736 |
+
"ort_model = ORTModelForCausalLM.from_pretrained(\n",
|
737 |
+
" model_id,\n",
|
738 |
+
" export=True,\n",
|
739 |
+
" provider=\"CUDAExecutionProvider\",\n",
|
740 |
+
" use_io_binding=True\n",
|
741 |
+
")\n",
|
742 |
+
"\n",
|
743 |
+
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
|
744 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
745 |
+
"\n",
|
746 |
+
"pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")"
|
747 |
+
]
|
748 |
+
},
|
749 |
+
{
|
750 |
+
"cell_type": "code",
|
751 |
+
"execution_count": null,
|
752 |
+
"id": "17d28184-26db-4dd3-b24b-0c5a12b10d6d",
|
753 |
+
"metadata": {},
|
754 |
+
"outputs": [],
|
755 |
+
"source": [
|
756 |
+
"import time\n",
|
757 |
+
"\n",
|
758 |
+
"start_time = time.time()\n",
|
759 |
+
"\n",
|
760 |
+
"generations = pipe(\"Both the music and visual were astounding, not to mention the actors performance.\")\n",
|
761 |
+
"generations[0][\"generated_text\"]\n",
|
762 |
+
"\n",
|
763 |
+
"finish_time = time.time()\n",
|
764 |
+
"\n",
|
765 |
+
"print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")"
|
766 |
+
]
|
767 |
+
},
|
768 |
+
{
|
769 |
+
"cell_type": "markdown",
|
770 |
+
"id": "19c4230a-3244-4dce-b5ef-d9927dec5c45",
|
771 |
+
"metadata": {},
|
772 |
+
"source": [
|
773 |
+
"## ASR LM with CUDAExcecution Provider"
|
774 |
+
]
|
775 |
+
},
|
776 |
+
{
|
777 |
+
"cell_type": "code",
|
778 |
+
"execution_count": null,
|
779 |
+
"id": "0f0f1cdc-bfcd-46c5-80a4-60bc76366cf5",
|
780 |
+
"metadata": {},
|
781 |
+
"outputs": [],
|
782 |
+
"source": [
|
783 |
+
"from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer\n",
|
784 |
+
"from datasets import DatasetDict\n",
|
785 |
+
"import torch\n",
|
786 |
+
"\n",
|
787 |
+
"device = \"cuda:0\"\n",
|
788 |
+
"dtype = torch.float16\n",
|
789 |
+
"\n",
|
790 |
+
"dataset = DatasetDict.load_from_disk(\"./../librispeech_tokenized.hf\")\n",
|
791 |
+
"\n",
|
792 |
+
"from optimum.onnxruntime import ORTModelForCausalLM\n",
|
793 |
+
"from optimum.pipelines import pipeline\n",
|
794 |
+
"from transformers import AutoTokenizer\n",
|
795 |
+
"\n",
|
796 |
+
"model_id = \"./../out/checkpoint-10000\"\n",
|
797 |
+
"\n",
|
798 |
+
"ort_model = ORTModelForCausalLM.from_pretrained(\n",
|
799 |
+
" model_id,\n",
|
800 |
+
" export=True,\n",
|
801 |
+
" provider=\"CUDAExecutionProvider\",\n",
|
802 |
+
" use_io_binding=True\n",
|
803 |
+
")\n",
|
804 |
+
"\n",
|
805 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"./tokenizer\")\n",
|
806 |
+
"\n",
|
807 |
+
"pipe = pipeline(task=\"text-generation\", model=ort_model, tokenizer=tokenizer, device=\"cuda:0\")"
|
808 |
+
]
|
809 |
+
},
|
810 |
+
{
|
811 |
+
"cell_type": "code",
|
812 |
+
"execution_count": null,
|
813 |
+
"id": "9d32098c-b0ec-4c36-95ac-775a3a865512",
|
814 |
+
"metadata": {},
|
815 |
+
"outputs": [],
|
816 |
+
"source": [
|
817 |
+
"ort_model.config.eos_token_id = tokenizer.encode(\"<|endoftranscript|>\")[0]\n",
|
818 |
+
"ort_model.config.bos_token_id = tokenizer.encode(\"<|startoftranscript|>\")[0]"
|
819 |
+
]
|
820 |
+
},
|
821 |
+
{
|
822 |
+
"cell_type": "code",
|
823 |
+
"execution_count": null,
|
824 |
+
"id": "1fd0a1fb-9349-4c7a-af03-21e29334f420",
|
825 |
+
"metadata": {},
|
826 |
+
"outputs": [],
|
827 |
+
"source": [
|
828 |
+
"dataset[split][idx].keys()"
|
829 |
+
]
|
830 |
+
},
|
831 |
+
{
|
832 |
+
"cell_type": "code",
|
833 |
+
"execution_count": null,
|
834 |
+
"id": "15d8b989-6460-4555-b6e2-2f9e219d7034",
|
835 |
+
"metadata": {},
|
836 |
+
"outputs": [],
|
837 |
+
"source": [
|
838 |
+
"split = \"train.clean.100\"\n",
|
839 |
+
"idx = 0\n",
|
840 |
+
"\n",
|
841 |
+
"text = \"\".join([ f\"<|audio:{tkn}|>\"for tkn in dataset[split][idx][\"audio_tokens\"]]) + \"<|startoftranscript|>\"\n",
|
842 |
+
"\n",
|
843 |
+
"import time\n",
|
844 |
+
"\n",
|
845 |
+
"start_time = time.time()\n",
|
846 |
+
"\n",
|
847 |
+
"generations = pipe(text, max_new_tokens=10, skip_special_tokens=True)\n",
|
848 |
+
"\n",
|
849 |
+
"finish_time = time.time()\n",
|
850 |
+
"\n",
|
851 |
+
"print(generations[0][\"generated_text\"])\n",
|
852 |
+
"\n",
|
853 |
+
"print(\"End to End Latency: \", (finish_time - start_time) * 1000, \"ms\")"
|
854 |
+
]
|
855 |
+
}
|
856 |
+
],
|
857 |
+
"metadata": {
|
858 |
+
"kernelspec": {
|
859 |
+
"display_name": "Python 3 (ipykernel)",
|
860 |
+
"language": "python",
|
861 |
+
"name": "python3"
|
862 |
+
},
|
863 |
+
"language_info": {
|
864 |
+
"codemirror_mode": {
|
865 |
+
"name": "ipython",
|
866 |
+
"version": 3
|
867 |
+
},
|
868 |
+
"file_extension": ".py",
|
869 |
+
"mimetype": "text/x-python",
|
870 |
+
"name": "python",
|
871 |
+
"nbconvert_exporter": "python",
|
872 |
+
"pygments_lexer": "ipython3",
|
873 |
+
"version": "3.8.10"
|
874 |
+
}
|
875 |
+
},
|
876 |
+
"nbformat": 4,
|
877 |
+
"nbformat_minor": 5
|
878 |
+
}
|
ASR/demo.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, request
|
2 |
+
# import speech_recognition as sr
|
3 |
+
|
4 |
+
app = Flask(__name__)
|
5 |
+
# recognizer = sr.Recognizer()
|
6 |
+
|
7 |
+
@app.route("/darshan/microphone", methods=['POST'])
|
8 |
+
def handle_audio():
|
9 |
+
audio_data = request.data
|
10 |
+
print(audio_data)
|
11 |
+
# audio = sr.AudioData(audio_data, sample_rate=44100, sample_width=2) # Adjust sample rate and sample width as needed
|
12 |
+
# try:
|
13 |
+
# text = recognizer.recognize_google(audio)
|
14 |
+
# print(f"Transcription: {text}")
|
15 |
+
# return {'transcription': text}, 200
|
16 |
+
# except sr.UnknownValueError:
|
17 |
+
# print("Could not understand audio")
|
18 |
+
# return '', 400
|
19 |
+
# except sr.RequestError as e:
|
20 |
+
# print(f"Error from Google Speech Recognition service; {e}")
|
21 |
+
# return '', 500
|
22 |
+
|
23 |
+
if __name__ == '__main__':
|
24 |
+
app.run(host='0.0.0.0', port=8723) # Replace with your desired host and port
|
ASR/repcodec/.ipynb_checkpoints/RepCodec-checkpoint.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
2 |
+
# Copyright (c) Chutong Meng
|
3 |
+
#
|
4 |
+
# This source code is licensed under the CC BY-NC license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
|
7 |
+
|
8 |
+
import torch.nn as nn
|
9 |
+
|
10 |
+
from repcodec.modules.decoder import Decoder
|
11 |
+
from repcodec.modules.encoder import Encoder
|
12 |
+
from repcodec.modules.projector import Projector
|
13 |
+
from repcodec.modules.quantizer import Quantizer
|
14 |
+
|
15 |
+
|
16 |
+
class RepCodec(nn.Module):
|
17 |
+
def __init__(
|
18 |
+
self,
|
19 |
+
input_channels=768,
|
20 |
+
output_channels=768,
|
21 |
+
encode_channels=768,
|
22 |
+
decode_channels=768,
|
23 |
+
code_dim=768,
|
24 |
+
codebook_num=1,
|
25 |
+
codebook_size=1024,
|
26 |
+
bias=True,
|
27 |
+
enc_ratios=(1, 1),
|
28 |
+
dec_ratios=(1, 1),
|
29 |
+
enc_strides=(1, 1),
|
30 |
+
dec_strides=(1, 1),
|
31 |
+
enc_kernel_size=3,
|
32 |
+
dec_kernel_size=3,
|
33 |
+
enc_block_dilations=(1, 1),
|
34 |
+
enc_block_kernel_size=3,
|
35 |
+
dec_block_dilations=(1, 1),
|
36 |
+
dec_block_kernel_size=3
|
37 |
+
):
|
38 |
+
super().__init__()
|
39 |
+
|
40 |
+
self.input_channels = input_channels
|
41 |
+
|
42 |
+
self.encoder = Encoder(
|
43 |
+
input_channels=input_channels,
|
44 |
+
encode_channels=encode_channels,
|
45 |
+
channel_ratios=enc_ratios,
|
46 |
+
strides=enc_strides,
|
47 |
+
kernel_size=enc_kernel_size,
|
48 |
+
bias=bias,
|
49 |
+
block_dilations=enc_block_dilations,
|
50 |
+
unit_kernel_size=enc_block_kernel_size
|
51 |
+
)
|
52 |
+
|
53 |
+
self.decoder = Decoder(
|
54 |
+
code_dim=code_dim,
|
55 |
+
output_channels=output_channels,
|
56 |
+
decode_channels=decode_channels,
|
57 |
+
channel_ratios=dec_ratios,
|
58 |
+
strides=dec_strides,
|
59 |
+
kernel_size=dec_kernel_size,
|
60 |
+
bias=bias,
|
61 |
+
block_dilations=dec_block_dilations,
|
62 |
+
unit_kernel_size=dec_block_kernel_size
|
63 |
+
)
|
64 |
+
|
65 |
+
self.projector = Projector(
|
66 |
+
input_channels=self.encoder.out_channels,
|
67 |
+
code_dim=code_dim,
|
68 |
+
kernel_size=3,
|
69 |
+
stride=1,
|
70 |
+
bias=False
|
71 |
+
)
|
72 |
+
|
73 |
+
self.quantizer = Quantizer(
|
74 |
+
code_dim=code_dim,
|
75 |
+
codebook_num=codebook_num,
|
76 |
+
codebook_size=codebook_size
|
77 |
+
)
|
78 |
+
|
79 |
+
def forward(self, x):
|
80 |
+
x = self.encoder(x)
|
81 |
+
z = self.projector(x)
|
82 |
+
zq, vqloss, perplexity = self.quantizer(z)
|
83 |
+
y = self.decoder(zq)
|
84 |
+
return y, zq, z, vqloss, perplexity
|
ASR/repcodec/RepCodec.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
2 |
+
# Copyright (c) Chutong Meng
|
3 |
+
#
|
4 |
+
# This source code is licensed under the CC BY-NC license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
|
7 |
+
|
8 |
+
import torch.nn as nn
|
9 |
+
|
10 |
+
from repcodec.modules.decoder import Decoder
|
11 |
+
from repcodec.modules.encoder import Encoder
|
12 |
+
from repcodec.modules.projector import Projector
|
13 |
+
from repcodec.modules.quantizer import Quantizer
|
14 |
+
|
15 |
+
|
16 |
+
class RepCodec(nn.Module):
|
17 |
+
def __init__(
|
18 |
+
self,
|
19 |
+
input_channels=768,
|
20 |
+
output_channels=768,
|
21 |
+
encode_channels=768,
|
22 |
+
decode_channels=768,
|
23 |
+
code_dim=768,
|
24 |
+
codebook_num=1,
|
25 |
+
codebook_size=1024,
|
26 |
+
bias=True,
|
27 |
+
enc_ratios=(1, 1),
|
28 |
+
dec_ratios=(1, 1),
|
29 |
+
enc_strides=(1, 1),
|
30 |
+
dec_strides=(1, 1),
|
31 |
+
enc_kernel_size=3,
|
32 |
+
dec_kernel_size=3,
|
33 |
+
enc_block_dilations=(1, 1),
|
34 |
+
enc_block_kernel_size=3,
|
35 |
+
dec_block_dilations=(1, 1),
|
36 |
+
dec_block_kernel_size=3
|
37 |
+
):
|
38 |
+
super().__init__()
|
39 |
+
|
40 |
+
self.input_channels = input_channels
|
41 |
+
|
42 |
+
self.encoder = Encoder(
|
43 |
+
input_channels=input_channels,
|
44 |
+
encode_channels=encode_channels,
|
45 |
+
channel_ratios=enc_ratios,
|
46 |
+
strides=enc_strides,
|
47 |
+
kernel_size=enc_kernel_size,
|
48 |
+
bias=bias,
|
49 |
+
block_dilations=enc_block_dilations,
|
50 |
+
unit_kernel_size=enc_block_kernel_size
|
51 |
+
)
|
52 |
+
|
53 |
+
self.decoder = Decoder(
|
54 |
+
code_dim=code_dim,
|
55 |
+
output_channels=output_channels,
|
56 |
+
decode_channels=decode_channels,
|
57 |
+
channel_ratios=dec_ratios,
|
58 |
+
strides=dec_strides,
|
59 |
+
kernel_size=dec_kernel_size,
|
60 |
+
bias=bias,
|
61 |
+
block_dilations=dec_block_dilations,
|
62 |
+
unit_kernel_size=dec_block_kernel_size
|
63 |
+
)
|
64 |
+
|
65 |
+
self.projector = Projector(
|
66 |
+
input_channels=self.encoder.out_channels,
|
67 |
+
code_dim=code_dim,
|
68 |
+
kernel_size=3,
|
69 |
+
stride=1,
|
70 |
+
bias=False
|
71 |
+
)
|
72 |
+
|
73 |
+
self.quantizer = Quantizer(
|
74 |
+
code_dim=code_dim,
|
75 |
+
codebook_num=codebook_num,
|
76 |
+
codebook_size=codebook_size
|
77 |
+
)
|
78 |
+
|
79 |
+
def forward(self, x):
|
80 |
+
x = self.encoder(x)
|
81 |
+
z = self.projector(x)
|
82 |
+
zq, vqloss, perplexity = self.quantizer(z)
|
83 |
+
y = self.decoder(zq)
|
84 |
+
return y, zq, z, vqloss, perplexity
|
ASR/repcodec/__pycache__/RepCodec.cpython-38.pyc
ADDED
Binary file (1.87 kB). View file
|
|
ASR/repcodec/configs/repcodec_dim1024.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_channels: 1024
|
2 |
+
output_channels: 1024
|
3 |
+
encode_channels: 1024
|
4 |
+
decode_channels: 1024
|
5 |
+
code_dim: 1024
|
6 |
+
codebook_num: 1
|
7 |
+
codebook_size: 1024
|
8 |
+
bias: true
|
9 |
+
enc_ratios: [ 1, 1 ]
|
10 |
+
dec_ratios: [ 1, 1 ]
|
11 |
+
enc_strides: [ 1, 1 ] # no downsampling
|
12 |
+
dec_strides: [ 1, 1 ]
|
13 |
+
enc_kernel_size: 3
|
14 |
+
dec_kernel_size: 3
|
15 |
+
enc_block_dilations: [ 1, 1 ]
|
16 |
+
enc_block_kernel_size: 3
|
17 |
+
dec_block_dilations: [ 1, 1 ]
|
18 |
+
dec_block_kernel_size: 3
|
ASR/repcodec/configs/repcodec_dim1280.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_channels: 1280
|
2 |
+
output_channels: 1280
|
3 |
+
encode_channels: 1280
|
4 |
+
decode_channels: 1280
|
5 |
+
code_dim: 1280
|
6 |
+
codebook_num: 1
|
7 |
+
codebook_size: 1024
|
8 |
+
bias: true
|
9 |
+
enc_ratios: [ 1, 1 ]
|
10 |
+
dec_ratios: [ 1, 1 ]
|
11 |
+
enc_strides: [ 1, 1 ] # no downsampling
|
12 |
+
dec_strides: [ 1, 1 ]
|
13 |
+
enc_kernel_size: 3
|
14 |
+
dec_kernel_size: 3
|
15 |
+
enc_block_dilations: [ 1, 1 ]
|
16 |
+
enc_block_kernel_size: 3
|
17 |
+
dec_block_dilations: [ 1, 1 ]
|
18 |
+
dec_block_kernel_size: 3
|
ASR/repcodec/configs/repcodec_dim768.yaml
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
input_channels: 768
|
2 |
+
output_channels: 768
|
3 |
+
encode_channels: 768
|
4 |
+
decode_channels: 768
|
5 |
+
code_dim: 768
|
6 |
+
codebook_num: 1
|
7 |
+
codebook_size: 1024
|
8 |
+
bias: true
|
9 |
+
enc_ratios: [ 1, 1 ]
|
10 |
+
dec_ratios: [ 1, 1 ]
|
11 |
+
enc_strides: [ 1, 1 ] # no downsampling
|
12 |
+
dec_strides: [ 1, 1 ]
|
13 |
+
enc_kernel_size: 3
|
14 |
+
dec_kernel_size: 3
|
15 |
+
enc_block_dilations: [ 1, 1 ]
|
16 |
+
enc_block_kernel_size: 3
|
17 |
+
dec_block_dilations: [ 1, 1 ]
|
18 |
+
dec_block_kernel_size: 3
|
ASR/repcodec/layers/__pycache__/conv_layer.cpython-38.pyc
ADDED
Binary file (2.52 kB). View file
|
|
ASR/repcodec/layers/__pycache__/vq_module.cpython-38.pyc
ADDED
Binary file (5.14 kB). View file
|
|
ASR/repcodec/layers/conv_layer.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
2 |
+
# Copyright (c) Chutong Meng
|
3 |
+
#
|
4 |
+
# This source code is licensed under the CC BY-NC license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
|
7 |
+
|
8 |
+
import torch.nn as nn
|
9 |
+
|
10 |
+
|
11 |
+
class Conv1d1x1(nn.Conv1d):
|
12 |
+
"""1x1 Conv1d."""
|
13 |
+
|
14 |
+
def __init__(self, in_channels, out_channels, bias=True):
|
15 |
+
super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, bias=bias)
|
16 |
+
|
17 |
+
|
18 |
+
class Conv1d(nn.Module):
|
19 |
+
def __init__(
|
20 |
+
self,
|
21 |
+
in_channels: int,
|
22 |
+
out_channels: int,
|
23 |
+
kernel_size: int,
|
24 |
+
stride: int = 1,
|
25 |
+
padding: int = -1,
|
26 |
+
dilation: int = 1,
|
27 |
+
groups: int = 1,
|
28 |
+
bias: bool = True
|
29 |
+
):
|
30 |
+
super().__init__()
|
31 |
+
self.in_channels = in_channels
|
32 |
+
self.out_channels = out_channels
|
33 |
+
self.kernel_size = kernel_size
|
34 |
+
if padding < 0:
|
35 |
+
padding = (kernel_size - 1) // 2 * dilation
|
36 |
+
self.dilation = dilation
|
37 |
+
self.conv = nn.Conv1d(
|
38 |
+
in_channels=in_channels,
|
39 |
+
out_channels=out_channels,
|
40 |
+
kernel_size=kernel_size,
|
41 |
+
stride=stride,
|
42 |
+
padding=padding,
|
43 |
+
dilation=dilation,
|
44 |
+
groups=groups,
|
45 |
+
bias=bias,
|
46 |
+
)
|
47 |
+
|
48 |
+
def forward(self, x):
|
49 |
+
"""
|
50 |
+
Args:
|
51 |
+
x (Tensor): Float tensor variable with the shape (B, C, T).
|
52 |
+
Returns:
|
53 |
+
Tensor: Float tensor variable with the shape (B, C, T).
|
54 |
+
"""
|
55 |
+
x = self.conv(x)
|
56 |
+
return x
|
57 |
+
|
58 |
+
|
59 |
+
class ConvTranspose1d(nn.Module):
|
60 |
+
def __init__(
|
61 |
+
self,
|
62 |
+
in_channels: int,
|
63 |
+
out_channels: int,
|
64 |
+
kernel_size: int,
|
65 |
+
stride: int,
|
66 |
+
padding=-1,
|
67 |
+
output_padding=-1,
|
68 |
+
groups=1,
|
69 |
+
bias=True,
|
70 |
+
):
|
71 |
+
super().__init__()
|
72 |
+
if padding < 0:
|
73 |
+
padding = (stride + 1) // 2
|
74 |
+
if output_padding < 0:
|
75 |
+
output_padding = 1 if stride % 2 else 0
|
76 |
+
self.deconv = nn.ConvTranspose1d(
|
77 |
+
in_channels=in_channels,
|
78 |
+
out_channels=out_channels,
|
79 |
+
kernel_size=kernel_size,
|
80 |
+
stride=stride,
|
81 |
+
padding=padding,
|
82 |
+
output_padding=output_padding,
|
83 |
+
groups=groups,
|
84 |
+
bias=bias,
|
85 |
+
)
|
86 |
+
|
87 |
+
def forward(self, x):
|
88 |
+
"""
|
89 |
+
Args:
|
90 |
+
x (Tensor): Float tensor variable with the shape (B, C, T).
|
91 |
+
Returns:
|
92 |
+
Tensor: Float tensor variable with the shape (B, C', T').
|
93 |
+
"""
|
94 |
+
x = self.deconv(x)
|
95 |
+
return x
|
ASR/repcodec/layers/vq_module.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
2 |
+
# Copyright (c) Chutong Meng
|
3 |
+
#
|
4 |
+
# This source code is licensed under the CC BY-NC license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
|
12 |
+
|
13 |
+
class VectorQuantize(nn.Module):
|
14 |
+
"""Vector quantization w/ exponential moving averages (EMA)"""
|
15 |
+
|
16 |
+
def __init__(
|
17 |
+
self,
|
18 |
+
dim: int,
|
19 |
+
codebook_size: int,
|
20 |
+
decay=0.8,
|
21 |
+
commitment=1.,
|
22 |
+
eps=1e-5,
|
23 |
+
n_embed=None,
|
24 |
+
):
|
25 |
+
super().__init__()
|
26 |
+
n_embed = self.default(n_embed, codebook_size)
|
27 |
+
|
28 |
+
self.dim = dim
|
29 |
+
self.n_embed = n_embed
|
30 |
+
self.decay = decay
|
31 |
+
self.eps = eps
|
32 |
+
self.commitment = commitment
|
33 |
+
|
34 |
+
embed = torch.randn(dim, n_embed)
|
35 |
+
self.register_buffer('embed', embed)
|
36 |
+
self.register_buffer('cluster_size', torch.zeros(n_embed))
|
37 |
+
self.register_buffer('embed_avg', embed.clone())
|
38 |
+
|
39 |
+
@property
|
40 |
+
def codebook(self):
|
41 |
+
return self.embed.transpose(0, 1)
|
42 |
+
|
43 |
+
def exists(self, val):
|
44 |
+
return val is not None
|
45 |
+
|
46 |
+
def default(self, val, d):
|
47 |
+
return val if self.exists(val) else d
|
48 |
+
|
49 |
+
def ema_inplace(self, moving_avg, new, decay):
|
50 |
+
moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
|
51 |
+
|
52 |
+
def laplace_smoothing(self, x, n_categories, eps=1e-5):
|
53 |
+
return (x + eps) / (x.sum() + n_categories * eps)
|
54 |
+
|
55 |
+
def forward(self, input):
|
56 |
+
dtype = input.dtype
|
57 |
+
flatten = input.reshape(-1, self.dim)
|
58 |
+
dist = (
|
59 |
+
flatten.pow(2).sum(1, keepdim=True)
|
60 |
+
- 2 * flatten @ self.embed
|
61 |
+
+ self.embed.pow(2).sum(0, keepdim=True)
|
62 |
+
)
|
63 |
+
_, embed_ind = (-dist).max(1)
|
64 |
+
embed_onehot = F.one_hot(embed_ind, self.n_embed).type(dtype)
|
65 |
+
embed_ind = embed_ind.view(*input.shape[:-1])
|
66 |
+
quantize = F.embedding(embed_ind, self.embed.transpose(0, 1))
|
67 |
+
|
68 |
+
if self.training:
|
69 |
+
self.ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
|
70 |
+
embed_sum = flatten.transpose(0, 1) @ embed_onehot
|
71 |
+
self.ema_inplace(self.embed_avg, embed_sum, self.decay)
|
72 |
+
cluster_size = self.laplace_smoothing(self.cluster_size, self.n_embed, self.eps) * self.cluster_size.sum()
|
73 |
+
embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
|
74 |
+
self.embed.data.copy_(embed_normalized)
|
75 |
+
|
76 |
+
loss = F.mse_loss(quantize.detach(), input) * self.commitment
|
77 |
+
quantize = input + (quantize - input).detach()
|
78 |
+
|
79 |
+
avg_probs = torch.mean(embed_onehot, dim=0)
|
80 |
+
perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
|
81 |
+
|
82 |
+
return quantize, loss, perplexity
|
83 |
+
|
84 |
+
def forward_index(self, input):
|
85 |
+
dtype = input.dtype
|
86 |
+
flatten = input.reshape(-1, self.dim)
|
87 |
+
dist = (
|
88 |
+
flatten.pow(2).sum(1, keepdim=True)
|
89 |
+
- 2 * flatten @ self.embed
|
90 |
+
+ self.embed.pow(2).sum(0, keepdim=True)
|
91 |
+
)
|
92 |
+
_, embed_ind = (-dist).max(1)
|
93 |
+
embed_onehot = F.one_hot(embed_ind, self.n_embed).type(dtype)
|
94 |
+
embed_ind = embed_ind.view(*input.shape[:-1])
|
95 |
+
quantize = F.embedding(embed_ind, self.embed.transpose(0, 1))
|
96 |
+
quantize = input + (quantize - input).detach()
|
97 |
+
|
98 |
+
return quantize, embed_ind
|
99 |
+
|
100 |
+
|
101 |
+
class ResidualVQ(nn.Module):
|
102 |
+
""" Residual VQ following algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf """
|
103 |
+
|
104 |
+
def __init__(
|
105 |
+
self,
|
106 |
+
*,
|
107 |
+
num_quantizers,
|
108 |
+
**kwargs
|
109 |
+
):
|
110 |
+
super().__init__()
|
111 |
+
self.layers = nn.ModuleList([VectorQuantize(**kwargs) for _ in range(num_quantizers)])
|
112 |
+
|
113 |
+
def forward(self, x):
|
114 |
+
quantized_out = 0.
|
115 |
+
residual = x
|
116 |
+
all_losses = []
|
117 |
+
all_perplexities = []
|
118 |
+
for layer in self.layers:
|
119 |
+
quantized, loss, perplexity = layer(residual)
|
120 |
+
# Issue: https://github.com/lucidrains/vector-quantize-pytorch/issues/33
|
121 |
+
# We found considering only the 1st layer VQ's graident results in better performance
|
122 |
+
# residual = residual - quantized.detach() # considering all layers' graidents
|
123 |
+
residual = residual - quantized # considering only the first layer's graident
|
124 |
+
quantized_out = quantized_out + quantized
|
125 |
+
all_losses.append(loss)
|
126 |
+
all_perplexities.append(perplexity)
|
127 |
+
all_losses, all_perplexities = map(torch.stack, (all_losses, all_perplexities))
|
128 |
+
return quantized_out, all_losses, all_perplexities
|
129 |
+
|
130 |
+
def forward_index(self, x, flatten_idx=False):
|
131 |
+
quantized_out = 0.
|
132 |
+
residual = x
|
133 |
+
all_indices = []
|
134 |
+
for i, layer in enumerate(self.layers):
|
135 |
+
quantized, indices = layer.forward_index(residual)
|
136 |
+
# residual = residual - quantized.detach()
|
137 |
+
residual = residual - quantized
|
138 |
+
quantized_out = quantized_out + quantized
|
139 |
+
if flatten_idx:
|
140 |
+
indices += (self.codebook_size * i)
|
141 |
+
all_indices.append(indices)
|
142 |
+
all_indices = torch.stack(all_indices)
|
143 |
+
return quantized_out, all_indices.squeeze(1)
|
144 |
+
|
145 |
+
def initial(self):
|
146 |
+
self.codebook = []
|
147 |
+
for layer in self.layers:
|
148 |
+
self.codebook.append(layer.codebook)
|
149 |
+
self.codebook_size = self.codebook[0].size(0)
|
150 |
+
self.codebook = torch.stack(self.codebook)
|
151 |
+
self.codebook = self.codebook.reshape(-1, self.codebook.size(-1))
|
152 |
+
|
153 |
+
def lookup(self, indices):
|
154 |
+
quantized_out = F.embedding(indices, self.codebook) # Num x T x C
|
155 |
+
return torch.sum(quantized_out, dim=0, keepdim=True)
|
ASR/repcodec/modules/__pycache__/decoder.cpython-38.pyc
ADDED
Binary file (2.51 kB). View file
|
|
ASR/repcodec/modules/__pycache__/encoder.cpython-38.pyc
ADDED
Binary file (2.23 kB). View file
|
|
ASR/repcodec/modules/__pycache__/projector.cpython-38.pyc
ADDED
Binary file (903 Bytes). View file
|
|
ASR/repcodec/modules/__pycache__/quantizer.cpython-38.pyc
ADDED
Binary file (1.63 kB). View file
|
|
ASR/repcodec/modules/__pycache__/residual_unit.cpython-38.pyc
ADDED
Binary file (1.14 kB). View file
|
|
ASR/repcodec/modules/decoder.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
2 |
+
# Copyright (c) Chutong Meng
|
3 |
+
#
|
4 |
+
# This source code is licensed under the CC BY-NC license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
|
11 |
+
from repcodec.layers.conv_layer import Conv1d, ConvTranspose1d
|
12 |
+
from repcodec.modules.residual_unit import ResidualUnit
|
13 |
+
|
14 |
+
|
15 |
+
class DecoderBlock(nn.Module):
|
16 |
+
""" Decoder block (no up-sampling) """
|
17 |
+
|
18 |
+
def __init__(
|
19 |
+
self,
|
20 |
+
in_channels: int,
|
21 |
+
out_channels: int,
|
22 |
+
stride: int,
|
23 |
+
dilations=(1, 1),
|
24 |
+
unit_kernel_size=3,
|
25 |
+
bias=True
|
26 |
+
):
|
27 |
+
super().__init__()
|
28 |
+
|
29 |
+
if stride == 1:
|
30 |
+
self.conv = Conv1d(
|
31 |
+
in_channels=in_channels,
|
32 |
+
out_channels=out_channels,
|
33 |
+
kernel_size=3, # fix kernel=3 when stride=1 for unchanged shape
|
34 |
+
stride=stride,
|
35 |
+
bias=bias,
|
36 |
+
)
|
37 |
+
else:
|
38 |
+
self.conv = ConvTranspose1d(
|
39 |
+
in_channels=in_channels,
|
40 |
+
out_channels=out_channels,
|
41 |
+
kernel_size=(2 * stride),
|
42 |
+
stride=stride,
|
43 |
+
bias=bias,
|
44 |
+
)
|
45 |
+
|
46 |
+
self.res_units = torch.nn.ModuleList()
|
47 |
+
for idx, dilation in enumerate(dilations):
|
48 |
+
self.res_units += [
|
49 |
+
ResidualUnit(out_channels, out_channels,
|
50 |
+
kernel_size=unit_kernel_size,
|
51 |
+
dilation=dilation)
|
52 |
+
]
|
53 |
+
self.num_res = len(self.res_units)
|
54 |
+
|
55 |
+
def forward(self, x):
|
56 |
+
x = self.conv(x)
|
57 |
+
for idx in range(self.num_res):
|
58 |
+
x = self.res_units[idx](x)
|
59 |
+
return x
|
60 |
+
|
61 |
+
|
62 |
+
class Decoder(nn.Module):
|
63 |
+
def __init__(
|
64 |
+
self,
|
65 |
+
code_dim: int,
|
66 |
+
output_channels: int,
|
67 |
+
decode_channels: int,
|
68 |
+
channel_ratios=(1, 1),
|
69 |
+
strides=(1, 1),
|
70 |
+
kernel_size=3,
|
71 |
+
bias=True,
|
72 |
+
block_dilations=(1, 1),
|
73 |
+
unit_kernel_size=3,
|
74 |
+
):
|
75 |
+
super().__init__()
|
76 |
+
assert len(channel_ratios) == len(strides)
|
77 |
+
|
78 |
+
self.conv1 = Conv1d(
|
79 |
+
in_channels=code_dim,
|
80 |
+
out_channels=int(decode_channels * channel_ratios[0]),
|
81 |
+
kernel_size=kernel_size,
|
82 |
+
stride=1,
|
83 |
+
bias=False
|
84 |
+
)
|
85 |
+
|
86 |
+
self.conv_blocks = torch.nn.ModuleList()
|
87 |
+
for idx, stride in enumerate(strides):
|
88 |
+
in_channels = int(decode_channels * channel_ratios[idx])
|
89 |
+
if idx < (len(channel_ratios) - 1):
|
90 |
+
out_channels = int(decode_channels * channel_ratios[idx + 1])
|
91 |
+
else:
|
92 |
+
out_channels = decode_channels
|
93 |
+
self.conv_blocks += [
|
94 |
+
DecoderBlock(
|
95 |
+
in_channels, out_channels, stride,
|
96 |
+
dilations=block_dilations, unit_kernel_size=unit_kernel_size,
|
97 |
+
bias=bias
|
98 |
+
)
|
99 |
+
]
|
100 |
+
self.num_blocks = len(self.conv_blocks)
|
101 |
+
|
102 |
+
self.conv2 = Conv1d(out_channels, output_channels, kernel_size, 1, bias=False)
|
103 |
+
|
104 |
+
def forward(self, z):
|
105 |
+
x = self.conv1(z)
|
106 |
+
for i in range(self.num_blocks):
|
107 |
+
x = self.conv_blocks[i](x)
|
108 |
+
x = self.conv2(x)
|
109 |
+
return x
|
ASR/repcodec/modules/encoder.py
ADDED
@@ -0,0 +1,89 @@
+# Copyright (c) ByteDance, Inc. and its affiliates.
+# Copyright (c) Chutong Meng
+#
+# This source code is licensed under the CC BY-NC license found in the
+# LICENSE file in the root directory of this source tree.
+# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
+
+import torch
+import torch.nn as nn
+
+from repcodec.layers.conv_layer import Conv1d
+from repcodec.modules.residual_unit import ResidualUnit
+
+
+class EncoderBlock(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            stride: int,
+            dilations=(1, 1),
+            unit_kernel_size=3,
+            bias=True
+    ):
+        super().__init__()
+        self.res_units = torch.nn.ModuleList()
+        for dilation in dilations:
+            self.res_units += [
+                ResidualUnit(in_channels, in_channels,
+                             kernel_size=unit_kernel_size,
+                             dilation=dilation)
+            ]
+        self.num_res = len(self.res_units)
+
+        self.conv = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3 if stride == 1 else (2 * stride),  # special case: stride=1, do not use kernel=2
+            stride=stride,
+            bias=bias,
+        )
+
+    def forward(self, x):
+        for idx in range(self.num_res):
+            x = self.res_units[idx](x)
+        x = self.conv(x)
+        return x
+
+
+class Encoder(nn.Module):
+    def __init__(
+            self,
+            input_channels: int,
+            encode_channels: int,
+            channel_ratios=(1, 1),
+            strides=(1, 1),
+            kernel_size=3,
+            bias=True,
+            block_dilations=(1, 1),
+            unit_kernel_size=3
+    ):
+        super().__init__()
+        assert len(channel_ratios) == len(strides)
+
+        self.conv = Conv1d(
+            in_channels=input_channels,
+            out_channels=encode_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            bias=False
+        )
+        self.conv_blocks = torch.nn.ModuleList()
+        in_channels = encode_channels
+        for idx, stride in enumerate(strides):
+            out_channels = int(encode_channels * channel_ratios[idx])  # could be float
+            self.conv_blocks += [
+                EncoderBlock(in_channels, out_channels, stride,
+                             dilations=block_dilations, unit_kernel_size=unit_kernel_size,
+                             bias=bias)
+            ]
+            in_channels = out_channels
+        self.num_blocks = len(self.conv_blocks)
+        self.out_channels = out_channels
+
+    def forward(self, x):
+        x = self.conv(x)
+        for i in range(self.num_blocks):
+            x = self.conv_blocks[i](x)
+        return x
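For orientation, a minimal sketch of running this Encoder on a dummy feature sequence; the hyperparameter values below are illustrative assumptions, not the values from the repcodec_dim*.yaml configs shipped in this repo:

# Illustrative only: hyperparameters are assumptions for the sketch.
import torch
from repcodec.modules.encoder import Encoder

encoder = Encoder(
    input_channels=768,      # e.g. hubert_base_l9 features (assumed)
    encode_channels=512,
    channel_ratios=(1, 1),
    strides=(1, 1),
    kernel_size=3,
    bias=True,
    block_dilations=(1, 1),
    unit_kernel_size=3,
)

feats = torch.randn(2, 768, 100)   # (batch, hidden dim, seq len)
out = encoder(feats)
# With stride-1 blocks the length should be preserved (assuming the custom
# Conv1d pads for same-length output), giving roughly torch.Size([2, 512, 100]).
print(out.shape)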
ASR/repcodec/modules/projector.py
ADDED
@@ -0,0 +1,32 @@
+# Copyright (c) ByteDance, Inc. and its affiliates.
+# Copyright (c) Chutong Meng
+#
+# This source code is licensed under the CC BY-NC license found in the
+# LICENSE file in the root directory of this source tree.
+# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
+
+import torch.nn as nn
+
+from repcodec.layers.conv_layer import Conv1d
+
+
+class Projector(nn.Module):
+    def __init__(
+            self,
+            input_channels: int,
+            code_dim: int,
+            kernel_size=3,
+            stride=1,
+            bias=False
+    ):
+        super().__init__()
+        self.project = Conv1d(
+            input_channels,
+            code_dim,
+            kernel_size=kernel_size,
+            stride=stride,
+            bias=bias
+        )
+
+    def forward(self, x):
+        return self.project(x)
ASR/repcodec/modules/quantizer.py
ADDED
@@ -0,0 +1,46 @@
+# Copyright (c) ByteDance, Inc. and its affiliates.
+# Copyright (c) Chutong Meng
+#
+# This source code is licensed under the CC BY-NC license found in the
+# LICENSE file in the root directory of this source tree.
+# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
+
+import torch.nn as nn
+
+from repcodec.layers.vq_module import ResidualVQ
+
+
+class Quantizer(nn.Module):
+    def __init__(
+            self,
+            code_dim: int,
+            codebook_num: int,
+            codebook_size: int,
+    ):
+        super().__init__()
+        self.codebook = ResidualVQ(
+            dim=code_dim,
+            num_quantizers=codebook_num,
+            codebook_size=codebook_size
+        )
+
+    def initial(self):
+        self.codebook.initial()
+
+    def forward(self, z):
+        zq, vqloss, perplexity = self.codebook(z.transpose(2, 1))
+        zq = zq.transpose(2, 1)
+        return zq, vqloss, perplexity
+
+    def inference(self, z):
+        zq, indices = self.codebook.forward_index(z.transpose(2, 1))
+        zq = zq.transpose(2, 1)
+        return zq, indices
+
+    def encode(self, z):
+        zq, indices = self.codebook.forward_index(z.transpose(2, 1), flatten_idx=True)
+        return zq, indices
+
+    def decode(self, indices):
+        z = self.codebook.lookup(indices)
+        return z
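A minimal sketch of the shape convention this Quantizer expects: it takes channel-first features (batch, code dim, seq len), transposes them into ResidualVQ, and transposes the quantized output back. The dimensions below are assumptions for illustration:

# Illustrative only: code_dim and codebook sizes are assumed values.
import torch
from repcodec.modules.quantizer import Quantizer

quantizer = Quantizer(code_dim=256, codebook_num=1, codebook_size=1024)
quantizer.initial()  # mirrors load_model() in tokenize.py, which calls model.quantizer.initial()

z = torch.randn(2, 256, 100)              # (batch, code dim, seq len) -- the Projector output
zq, vqloss, perplexity = quantizer(z)     # zq has the same channel-first shape as z
zq_inf, indices = quantizer.inference(z)  # indices are the discrete token ids used downstream
print(zq.shape, indices.shape)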
ASR/repcodec/modules/residual_unit.py
ADDED
@@ -0,0 +1,39 @@
+# Copyright (c) ByteDance, Inc. and its affiliates.
+# Copyright (c) Chutong Meng
+#
+# This source code is licensed under the CC BY-NC license found in the
+# LICENSE file in the root directory of this source tree.
+# Based on AudioDec (https://github.com/facebookresearch/AudioDec)
+
+import torch.nn as nn
+
+from repcodec.layers.conv_layer import Conv1d, Conv1d1x1
+
+
+class ResidualUnit(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            kernel_size=3,
+            dilation=1,
+            bias=False,
+            nonlinear_activation="ELU",
+            nonlinear_activation_params={},
+    ):
+        super().__init__()
+        self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
+        self.conv1 = Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            dilation=dilation,
+            bias=bias,
+        )
+        self.conv2 = Conv1d1x1(out_channels, out_channels, bias)
+
+    def forward(self, x):
+        y = self.conv1(self.activation(x))
+        y = self.conv2(self.activation(y))
+        return x + y
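Because the skip connection is a plain x + y, the unit only works when input and output shapes match, which is how EncoderBlock uses it (in_channels == out_channels). A small sanity sketch, assuming the custom Conv1d pads to preserve sequence length:

# Sanity sketch; shape-preserving padding in repcodec's Conv1d is assumed.
import torch
from repcodec.modules.residual_unit import ResidualUnit

unit = ResidualUnit(in_channels=64, out_channels=64, kernel_size=3, dilation=2)
x = torch.randn(1, 64, 50)
y = unit(x)
print(y.shape)  # expected torch.Size([1, 64, 50]); the residual add requires matching shapes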
ASR/repcodec/tokenize.py
ADDED
@@ -0,0 +1,212 @@
+# Copyright (c) ByteDance, Inc. and its affiliates.
+# Copyright (c) Chutong Meng
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+from pathlib import Path
+from typing import Tuple, List, Optional
+
+import numpy as np
+import torch
+import yaml
+from tqdm import tqdm
+
+from repcodec.RepCodec import RepCodec
+
+ALL_MODELS = {
+    "data2vec_base_l6": 768,
+    "data2vec_large_l18": 1024,
+    "hubert_base_l9": 768,
+    "hubert_large_l18": 1024,
+    "whisper_medium_l24": 1024,
+    "whisper_large_l32": 1280
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "in_dir",
+        type=str,
+        help="directory of representations to be tokenized."
+    )
+    parser.add_argument(
+        "--model",
+        required=True,
+        type=str,
+        help="path of the RepCodec model."
+    )
+    parser.add_argument(
+        "--tsv_path",
+        required=True,
+        type=str,
+        help="path of the tsv file."
+    )
+    parser.add_argument(
+        "--model_config_path",
+        default=None,
+        type=str,
+        help="please provide this training config if you are using the model you trained yourself."
+    )
+    parser.add_argument(
+        "--n_shard",
+        required=False,
+        type=int,
+        default=1,
+        help="number of shards of representations."
+    )
+    parser.add_argument(
+        "--use_gpu",
+        default=False,
+        action="store_true",
+        help="whether use gpu for inference."
+    )
+    parser.add_argument(
+        "--batch_size",
+        default=1,
+        type=int,
+        help="number of utterances for each mini batch."
+    )
+    parser.add_argument(
+        "--out_dir",
+        type=str,
+        default=".",
+        help="the directory to save the output."
+    )
+    return parser.parse_args()
+
+
+def load_model(model_path: str, config_path: Optional[str] = None):
+    if config_path is None:
+        name = os.path.basename(model_path).strip(".pkl")
+        assert name in ALL_MODELS.keys(), f"Cannot find configs for {model_path}. " \
+                                          f"Please provide the config file you used for training."
+        config = os.path.join(os.path.dirname(__file__), "configs", f"repcodec_dim{ALL_MODELS[name]}.yaml")
+        with open(config) as fp:
+            conf = yaml.load(fp, Loader=yaml.FullLoader)
+    else:
+        with open(config_path) as fp:
+            conf = yaml.load(fp, Loader=yaml.FullLoader)["model_params"]
+
+    model = RepCodec(**conf)
+    model.load_state_dict(torch.load(model_path, map_location="cpu")["model"]["repcodec"])
+    model.quantizer.initial()
+    model.eval()
+    return model
+
+
+def load_shard(in_dir: Path, rank: int, n_shard: int) -> Tuple[np.ndarray, List[int]]:
+    feat_path = in_dir / f"{rank}_{n_shard}.npy"
+    len_path = in_dir / f"{rank}_{n_shard}.len"
+
+    with open(len_path) as fp:
+        lengths = [int(line.strip()) for line in fp]
+
+    return np.load(feat_path.as_posix(), mmap_mode="r"), lengths
+
+
+def pad_data(data: List[np.ndarray]) -> List[np.ndarray]:
+    max_len = max([d.shape[0] for d in data])
+    data = [
+        np.pad(d, [(0, max_len - d.shape[0]), (0, 0)], "constant", constant_values=0.0)
+        for d in data
+    ]
+    return data
+
+
+def make_batch_data(data: np.ndarray, shard_lengths: List[int], batch_size: int):
+    batch_data = []
+    batch_lens = []
+    offsets = np.cumsum([0] + shard_lengths)
+    assert len(data) == offsets[-1], f"{len(data)} {offsets[-1]}"
+
+    # from longest to shortest
+    for i in range(len(shard_lengths)):
+        if batch_size > len(batch_data):
+            batch_data.append(data[offsets[i]: offsets[i + 1]])
+            batch_lens.append(shard_lengths[i])
+        else:
+            yield {
+                "data": torch.tensor(np.stack(pad_data(batch_data)), dtype=torch.float),  # (bsz, seq len, hidden dim)
+                "lengths": batch_lens
+            }
+            batch_data = [data[offsets[i]: offsets[i + 1]]]
+            batch_lens = [shard_lengths[i]]
+    if len(batch_data) > 0:
+        yield {
+            "data": torch.tensor(np.stack(pad_data(batch_data)), dtype=torch.float),
+            "lengths": batch_lens
+        }
+
+
+def tokenize_batch(model: RepCodec, batch: dict, device: str) -> List[List[int]]:
+    with torch.no_grad():
+        data = batch["data"].transpose(1, 2).to(device)  # (bsz, hidden dim, seq len)
+        x = model.encoder(data)
+        z = model.projector(x)
+        _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
+
+    # when bsz=1: (1, seq len)
+    if idx.dim() == 2:
+        return idx.cpu().data.numpy().tolist()
+    # when bsz>1: (1, bsz, seq len)
+    tokens = idx.cpu().data.numpy().tolist()[0]
+    res = []
+    batch_lens = batch["lengths"]
+    for i in range(len(tokens)):
+        n_tokens = batch_lens[i]
+        res.append(tokens[i][:n_tokens])
+    return res
+
+
+def load_tsv(path: str):
+    with open(path) as fp:
+        root = fp.readline().strip()
+        names = []
+        for line in fp:
+            names.append(line.strip().split("\t")[0])
+    return root, names
+
+
+def cli():
+    args = parse_args()
+    device = "cuda" if args.use_gpu else "cpu"
+
+    model = load_model(model_path=args.model, config_path=args.model_config_path)
+    model.to(device)
+
+    in_dir = Path(args.in_dir)
+    n_shard = args.n_shard
+    batch_size = args.batch_size
+
+    root_dir, file_names = load_tsv(args.tsv_path)
+
+    output_dir = args.out_dir
+    os.makedirs(output_dir, exist_ok=True)
+
+    processed_cnt = 0
+    pbar = tqdm(total=len(file_names))
+    with open(os.path.join(output_dir, "tokens"), mode="w+") as fp:
+        fp.write(f"{root_dir}\n")
+
+        for rank in range(n_shard):
+            shard_data, shard_lengths = load_shard(in_dir, rank, n_shard)
+            for batch in make_batch_data(shard_data, shard_lengths, batch_size=batch_size):
+                batch_tokens = tokenize_batch(model, batch, device)
+
+                for tokens in batch_tokens:
+                    fp.write(f"{file_names[processed_cnt]}\t{' '.join(map(str, tokens))}\n")
+                    processed_cnt += 1
+
+                pbar.update(len(batch_tokens))
+    assert processed_cnt == len(file_names), f"# lines of tsv do not match # of representations!"
+
+    pbar.close()
+    print("Tokenize successfully!")
+
+
+if __name__ == '__main__':
+    cli()
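tokenize.py expects its inputs as {rank}_{n_shard}.npy / .len shard pairs (see load_shard) plus a tsv whose first line is the audio root directory and whose remaining lines start with a relative file name (see load_tsv). A small sketch that writes a fake single-shard layout; the paths, feature dimension, and checkpoint name are illustrative assumptions:

# Sketch of the on-disk layout consumed by ASR/repcodec/tokenize.py.
# Paths, dimensions, and the checkpoint file name are assumptions.
import os
import numpy as np

in_dir = "reps"                      # becomes the positional in_dir argument
n_shard = 1
lengths = [120, 80]                  # frames per utterance, concatenated below

os.makedirs(in_dir, exist_ok=True)
feats = np.random.randn(sum(lengths), 768).astype(np.float32)
np.save(f"{in_dir}/0_{n_shard}.npy", feats)          # load_shard reads {rank}_{n_shard}.npy
with open(f"{in_dir}/0_{n_shard}.len", "w") as fp:   # ... and {rank}_{n_shard}.len
    fp.write("\n".join(map(str, lengths)) + "\n")

with open("files.tsv", "w") as fp:                   # load_tsv: first line = root dir
    fp.write("/data/LibriSpeech\n")
    fp.write("utt1.flac\t120\n")
    fp.write("utt2.flac\t80\n")

# Then, assuming a RepCodec checkpoint named after one of the ALL_MODELS keys:
#   python tokenize.py reps --model hubert_base_l9.pkl --tsv_path files.tsv \
#       --n_shard 1 --batch_size 2 --out_dir out
# The token sequences land in out/tokens, one tab-separated line per utterance.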
ASR/test-gpt2-opt.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1717d8158b524053e9cc92fdbc9942bb4abc9f119576680f159f2f3177b378d
+size 653546067
ASR/test-gpt2.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3e0852bac1a63c63262029142c06158c40c81e90ff5f92ef45b0924ad80f6de
+size 653828879
ASR/test-gpt2.plan
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abdb8a96621f01394e8a783644c782815c25c507cafbc1cb86d6b4a2ccb5a6cc
+size 328704308
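The three entries above are git-lfs pointers to exported GPT-2 artifacts: a plain ONNX export, an optimized ONNX export, and a .plan file (conventionally a TensorRT engine). A minimal sketch of inspecting the ONNX export, assuming the LFS objects have been pulled and onnxruntime is installed:

# Assumes `git lfs pull` has materialised the .onnx file and onnxruntime is installed.
import onnxruntime as ort

sess = ort.InferenceSession("ASR/test-gpt2-opt.onnx", providers=["CPUExecutionProvider"])
for i in sess.get_inputs():
    print("input :", i.name, i.shape, i.type)
for o in sess.get_outputs():
    print("output:", o.name, o.shape, o.type)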
ASR/tokenized_librispeech/dataset_dict.json
ADDED
@@ -0,0 +1 @@
+{"splits": ["train.clean.100", "train.clean.360", "train.other.500", "validation.clean", "validation.other", "test.clean", "test.other"]}
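The tokenized_librispeech directory follows the layout written by the datasets library's save_to_disk: dataset_dict.json lists the splits, and each split folder below holds arrow shards plus dataset_info.json and state.json. A minimal sketch of reading it back, assuming the datasets package is installed and the LFS-tracked arrow files have been pulled:

# Assumes `pip install datasets` and that the arrow shards are available locally.
from datasets import load_from_disk

ds = load_from_disk("ASR/tokenized_librispeech")
print(ds)                        # DatasetDict with train.clean.100 ... test.other
sample = ds["test.clean"][0]
print(sample.keys())             # input_ids, token_type_ids, attention_mask
print(sample["input_ids"][:10])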
ASR/tokenized_librispeech/test.clean/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:267be16197f2a380f021dd5b1687ffd341825474ca00875098a5582252b9b901
+size 29539856
ASR/tokenized_librispeech/test.clean/dataset_info.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
ASR/tokenized_librispeech/test.clean/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "5d19fcebbf9d8932",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
ASR/tokenized_librispeech/test.other/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:663e0f0f58ac0ac562f319e51219064caaba5102b9bccd1e30322a780b1237ad
+size 33136248
ASR/tokenized_librispeech/test.other/dataset_info.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
ASR/tokenized_librispeech/test.other/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "6d0c91ac40d55d91",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
ASR/tokenized_librispeech/train.clean.100/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b797259a17fc4ddaca3c0f4a09622cdde099dbf46685d5096405d4916e3428b
+size 321761256
ASR/tokenized_librispeech/train.clean.100/dataset_info.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
ASR/tokenized_librispeech/train.clean.100/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "00eedcd713e3fb08",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
ASR/tokenized_librispeech/train.clean.360/data-00000-of-00003.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9da54e7eceac6fdf5c90656bca2655b28f079f52b3648978e228326ab84f334
+size 390907152
ASR/tokenized_librispeech/train.clean.360/data-00001-of-00003.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af64dfb82934022b279d8022102c321991f99d1e53ab102e9e84a1488565d1fd
+size 390895880
ASR/tokenized_librispeech/train.clean.360/data-00002-of-00003.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a40a4c69c058b01e7eff4068babf1655998beb5904bc32a620393f41018fa49a
+size 390895880
ASR/tokenized_librispeech/train.clean.360/dataset_info.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "token_type_ids": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}
ASR/tokenized_librispeech/train.clean.360/state.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00003.arrow"
+    },
+    {
+      "filename": "data-00001-of-00003.arrow"
+    },
+    {
+      "filename": "data-00002-of-00003.arrow"
+    }
+  ],
+  "_fingerprint": "44573b66f4895b44",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}
ASR/tokenized_librispeech/train.other.500/data-00000-of-00004.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8310b88f596d7f62ce3c8a9801cefdb016400702e48c316c145bf4abb8539447
+size 419093384