Spaces:

GroveStreet
/

GTA_SOVITS

Running

App Files Files Community

Katock committed on Jul 28, 2023

Commit

0382342

1 Parent(s): f85ad87

debug

Browse files

Files changed (34) hide show

models/carl/cover.png +0 -0
models/ryder/cover.png +0 -0
models/sweet/cover.png +0 -0
utils.py +0 -33
vencoder/CNHubertLarge.py +33 -0
vencoder/ContentVec256L12_Onnx.py +28 -0
vencoder/ContentVec256L9.py +35 -0
vencoder/ContentVec256L9_Onnx.py +28 -0
vencoder/ContentVec768L12.py +34 -0
vencoder/ContentVec768L12_Onnx.py +28 -0
vencoder/ContentVec768L9_Onnx.py +28 -0
vencoder/DPHubert.py +26 -0
vencoder/HubertSoft.py +24 -0
vencoder/HubertSoft_Onnx.py +28 -0
vencoder/WhisperPPG.py +30 -0
vencoder/WhisperPPGLarge.py +30 -0
vencoder/__init__.py +0 -0
vencoder/dphubert/__init__.py +0 -0
vencoder/dphubert/components.py +1410 -0
vencoder/dphubert/hardconcrete.py +122 -0
vencoder/dphubert/model.py +966 -0
vencoder/dphubert/pruning_utils.py +51 -0
vencoder/dphubert/utils/__init__.py +0 -0
vencoder/dphubert/utils/import_huggingface_wavlm.py +129 -0
vencoder/encoder.py +12 -0
vencoder/hubert/__init__.py +0 -0
vencoder/hubert/hubert_model.py +222 -0
vencoder/hubert/hubert_model_onnx.py +217 -0
vencoder/whisper/__init__.py +0 -0
vencoder/whisper/audio.py +125 -0
vencoder/whisper/decoding.py +712 -0
vencoder/whisper/model.py +269 -0
vencoder/whisper/tokenizer.py +331 -0
vencoder/whisper/utils.py +163 -0

models/carl/cover.png ADDED Viewed

models/ryder/cover.png ADDED Viewed

models/sweet/cover.png ADDED Viewed

utils.py CHANGED Viewed

@@ -15,7 +15,6 @@ from scipy.io.wavfile import read
 import torch
 from torch.nn import functional as F
 from modules.commons import sequence_mask
-import faiss
 import tqdm
 MATPLOTLIB_FLAG = False
@@ -428,38 +427,6 @@ def change_rms(data1, sr1, data2, sr2, rate):  # 1是输入音频，2是输出
     )
     return data2
-def train_index(spk_name,root_dir = "dataset/44k/"):  #from: RVC https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
-    print("The feature index is constructing.")
-    exp_dir = os.path.join(root_dir,spk_name)
-    listdir_res = []
-    for file in os.listdir(exp_dir):
-       if ".wav.soft.pt" in file:
-          listdir_res.append(os.path.join(exp_dir,file))
-    if len(listdir_res) == 0:
-        raise Exception("You need to run preprocess_hubert_f0.py!")
-    npys = []
-    for name in sorted(listdir_res):
-        phone = torch.load(name)[0].transpose(-1,-2).numpy()
-        npys.append(phone)
-    big_npy = np.concatenate(npys, 0)
-    big_npy_idx = np.arange(big_npy.shape[0])
-    np.random.shuffle(big_npy_idx)
-    big_npy = big_npy[big_npy_idx]
-    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
-    index = faiss.index_factory(big_npy.shape[1] , "IVF%s,Flat" % n_ivf)
-    index_ivf = faiss.extract_index_ivf(index)  #
-    index_ivf.nprobe = 1
-    index.train(big_npy)
-    batch_size_add = 8192
-    for i in range(0, big_npy.shape[0], batch_size_add):
-        index.add(big_npy[i : i + batch_size_add])
-    # faiss.write_index(
-    #     index,
-    #     f"added_{spk_name}.index"
-    # )
-    print("Successfully build index")
-    return index
 class HParams():
   def __init__(self, **kwargs):

 import torch
 from torch.nn import functional as F
 from modules.commons import sequence_mask
 import tqdm
 MATPLOTLIB_FLAG = False
     )
     return data2
 class HParams():
   def __init__(self, **kwargs):

vencoder/CNHubertLarge.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+class CNHubertLarge(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/chinese-hubert-large-fairseq-ckpt.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 1024
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+          [vec_path],
+          suffix="",
+        )
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.model = models[0].to(self.dev)
+        self.model.eval()
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+          "source": feats.to(wav.device),
+          "padding_mask": padding_mask.to(wav.device)
+        }
+        with torch.no_grad():
+          logits = self.model.extract_features(**inputs)
+        return logits[0].transpose(1, 2)

vencoder/ContentVec256L12_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec256L12_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-256-layer-12.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec256L9.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+class ContentVec256L9(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+          [vec_path],
+          suffix="",
+        )
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.model = models[0].to(self.dev)
+        self.model.eval()
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+          "source": feats.to(wav.device),
+          "padding_mask": padding_mask.to(wav.device),
+          "output_layer": 9,  # layer 9
+        }
+        with torch.no_grad():
+          logits = self.model.extract_features(**inputs)
+          feats = self.model.final_proj(logits[0])
+        return feats.transpose(1, 2)

vencoder/ContentVec256L9_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec256L9_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-256-layer-9.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec768L12.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+class ContentVec768L12(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 768
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+          [vec_path],
+          suffix="",
+        )
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.model = models[0].to(self.dev)
+        self.model.eval()
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+          "source": feats.to(wav.device),
+          "padding_mask": padding_mask.to(wav.device),
+          "output_layer": 12,  # layer 12
+        }
+        with torch.no_grad():
+          logits = self.model.extract_features(**inputs)
+        return logits[0].transpose(1, 2)

vencoder/ContentVec768L12_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec768L12_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-768-layer-12.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 768
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec768L9_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class ContentVec768L9_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/vec-768-layer-9.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 768
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/DPHubert.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.dphubert.model import wav2vec2_model
+class DPHubert(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/DPHuBERT-sp0.75.pth",device=None):
+        print("load model(s) from {}".format(vec_path))
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        ckpt = torch.load(vec_path)
+        self.hidden_dim = 768
+        self.model = wav2vec2_model(**ckpt["config"]).to(self.dev)
+        self.model.load_state_dict(ckpt["state_dict"], strict=False)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats[None,:]
+        with torch.no_grad():
+            with torch.inference_mode():
+              units = self.model(feats)[0]
+              return units.transpose(1,2)

vencoder/HubertSoft.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.hubert import hubert_model
+class HubertSoft(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/hubert-soft-0d54a1f4.pt",device=None):
+        print("load model(s) from {}".format(vec_path))
+        hubert_soft = hubert_model.hubert_soft(vec_path)
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.hidden_dim = 256
+        self.model = hubert_soft.to(self.dev)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats[None,None,:]
+        with torch.no_grad():
+            with torch.inference_mode():
+              units = self.model.units(feats)
+              return units.transpose(1,2)

vencoder/HubertSoft_Onnx.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from vencoder.encoder import SpeechEncoder
+import onnxruntime
+import torch
+class HubertSoft_Onnx(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/hubert-soft.onnx",device=None):
+        print("load model(s) from {}".format(vec_path))
+        self.hidden_dim = 256
+        if device is None:
+            self.dev = torch.device("cpu")
+        else:
+            self.dev = torch.device(device)
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+          feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        feats = feats.unsqueeze(0).cpu().detach().numpy()
+        onnx_input = {self.model.get_inputs()[0].name: feats}
+        logits = self.model.run(None, onnx_input)
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/WhisperPPG.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.whisper.model import Whisper, ModelDimensions
+from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
+class WhisperPPG(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/medium.pt",device=None):
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        checkpoint = torch.load(vec_path, map_location=device)
+        dims = ModelDimensions(**checkpoint["dims"])
+        model = Whisper(dims)
+        model.load_state_dict(checkpoint["model_state_dict"])
+        self.hidden_dim = dims
+        self.model = model.to(self.dev)
+    def encoder(self, wav):
+        audio = wav
+        audln = audio.shape[0]
+        ppgln = audln // 320
+        audio = pad_or_trim(audio)
+        mel = log_mel_spectrogram(audio).to(self.dev)
+        with torch.no_grad():
+            ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
+            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
+            return ppg[None,:,:].transpose(1, 2)

vencoder/WhisperPPGLarge.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.whisper.model import Whisper, ModelDimensions
+from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
+class WhisperPPGLarge(SpeechEncoder):
+    def __init__(self,vec_path = "pretrain/large-v2.pt",device=None):
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        checkpoint = torch.load(vec_path, map_location=device)
+        dims = ModelDimensions(**checkpoint["dims"])
+        model = Whisper(dims)
+        model.load_state_dict(checkpoint["model_state_dict"])
+        self.hidden_dim = dims
+        self.model = model.to(self.dev)
+    def encoder(self, wav):
+        audio = wav
+        audln = audio.shape[0]
+        ppgln = audln // 320
+        audio = pad_or_trim(audio)
+        mel = log_mel_spectrogram(audio).to(self.dev)
+        with torch.no_grad():
+            ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
+            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
+            return ppg[None,:,:].transpose(1, 2)

vencoder/__init__.py ADDED Viewed

File without changes

vencoder/dphubert/__init__.py ADDED Viewed

File without changes

vencoder/dphubert/components.py ADDED Viewed

	@@ -0,0 +1,1410 @@

+"""Building blocks for speech SSL models supporting pruning.
+Originally from:
+https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/components.py
+"""
+from collections import defaultdict
+from typing import List, Optional, Tuple
+import math
+import torch
+from torch import nn, Tensor
+from torch.nn import Module, Parameter
+from .hardconcrete import HardConcrete
+from .pruning_utils import (
+    prune_linear_layer,
+    prune_conv1d_layer,
+    prune_layer_norm,
+)
+def _init_transformer_params(module):
+    """
+    Initialize the weights of Transformer module in Wav2Vec2/HuBERT.
+    If the module is ``nn.Linear``, normalize the weight with mean 0 and standard deviation 0.02.
+    If ``bias`` is set to ``True`` in the module, set ``bias`` to 0.
+    If the module is ``nn.Embedding``, normalize the weight with mean 0 and standard deviation 0.02.
+    If ``padding_idx`` is not None, set the weight of padding to 0.
+    Note:
+        Ths method corresponds to
+        `init_bert_params
+        <https://github.com/facebookresearch/fairseq/blob/main/fairseq/modules/transformer_sentence_encoder.py#L21>`__
+        in the original ``fairseq`` implementation.
+    """
+    def normal_(data):
+        data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
+    if isinstance(module, nn.Linear):
+        normal_(module.weight.data)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    if isinstance(module, nn.Embedding):
+        normal_(module.weight.data)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+class LayerNorm(nn.LayerNorm):
+    """Layer norm with transpose"""
+    def forward(self, input: Tensor) -> Tensor:
+        x = input.transpose(-2, -1)
+        x = nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        x = x.transpose(-2, -1)
+        return x
+class ConvLayerBlock(Module):
+    """Convolution unit of FeatureExtractor"""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int,
+        bias: bool,
+        layer_norm: Optional[Module],
+        prune_conv_channels: bool = False,
+    ):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.layer_norm = layer_norm
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            bias=bias,
+        )
+        if prune_conv_channels:
+            self.hard_concrete = HardConcrete(n_in=out_channels, init_mean=0.01)
+        else:
+            self.hard_concrete = None
+    def forward(
+        self,
+        x: Tensor,
+        length: Optional[Tensor],
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """
+        Args:
+            x (Tensor): Shape: ``[batch, in_channels, in_frame]``.
+            length (Tensor or None, optional): Shape ``[batch, ]``.
+        Returns:
+            Tensor: Shape ``[batch, out_channels, out_frames]``.
+            Optional[Tensor]: Shape ``[batch, ]``.
+        """
+        x = self.conv(x)
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+        x = nn.functional.gelu(x)
+        if self.hard_concrete is not None:
+            channel_mask = self.hard_concrete()  # hard concrete mask, (out_channels,)
+            x = x * channel_mask.unsqueeze(-1)
+        if length is not None:
+            length = torch.div(length - self.kernel_size, self.stride, rounding_mode="floor") + 1
+            # When input length is 0, the resulting length can be negative. So fix it here.
+            length = torch.max(torch.zeros_like(length), length)
+        return x, length
+    def get_num_params_and_out_channels(self, in_channels):
+        if self.hard_concrete is not None:
+            out_channels = self.hard_concrete.l0_norm()
+        else:
+            out_channels = self.conv.out_channels
+        num_params = in_channels * out_channels * self.kernel_size
+        if self.conv.bias is not None:
+            num_params += out_channels
+        if self.layer_norm is not None:
+            num_params += out_channels * 2
+        return num_params, out_channels
+class FeatureExtractor(Module):
+    """Extract features from audio
+    Args:
+        conv_layers (nn.ModuleList):
+            convolution layers
+    """
+    def __init__(
+        self,
+        conv_layers: nn.ModuleList,
+    ):
+        super().__init__()
+        self.conv_layers = conv_layers
+        # NOTE: a dummy weight used to save the soft mask of the last conv layer
+        self.dummy_weight = nn.Parameter(
+            torch.ones(conv_layers[-1].conv.out_channels, dtype=torch.float32),
+            requires_grad=False
+        )
+    def forward(
+        self,
+        x: Tensor,
+        length: Optional[Tensor],
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """
+        Args:
+            x (Tensor):
+                Input Tensor representing a batch of audio,
+                shape: ``[batch, time]``.
+            length (Tensor or None, optional):
+                Valid length of each input sample. shape: ``[batch, ]``.
+        Returns:
+            Tensor:
+                The resulting feature, shape: ``[batch, frame, feature]``
+            Optional[Tensor]:
+                Valid length of each output sample. shape: ``[batch, ]``.
+        """
+        if x.ndim != 2:
+            raise ValueError("Expected the input Tensor to be 2D (batch, time), " "but received {list(x.shape)}")
+        x = x.unsqueeze(1)  # (batch, channel==1, frame)
+        for layer in self.conv_layers:
+            x, length = layer(x, length)  # (batch, feature, frame)
+        x = x.transpose(1, 2)  # (batch, frame, feature)
+        x = x * self.dummy_weight
+        return x, length
+    def get_num_params_and_final_out_channels(self):
+        in_channels = 1
+        num_params = 0
+        for layer in self.conv_layers:
+            layer_params, in_channels = layer.get_num_params_and_out_channels(in_channels)
+            num_params += layer_params
+        num_params += in_channels   # dummy weight
+        return num_params, in_channels
+    def prune(self):
+        """"Prune conv layers and dummy weight based on hardconcrete parameters.
+        This is an in-place operation.
+        """
+        new_config = []     # [(output_channel, kernel_size, stride), ...]
+        for idx, layer in enumerate(self.conv_layers):
+            if layer.hard_concrete is not None:
+                assert not layer.hard_concrete.training
+                mask = layer.hard_concrete()    # (out_features,)
+                index = mask.nonzero().squeeze(-1)    # 2D -> 1D
+                assert len(index) > 0, f"Conv channels pruned to zero at index {idx}"
+                new_config.append(
+                    (len(index), layer.kernel_size, layer.stride)
+                )
+                # prune the current layer
+                prune_conv1d_layer(layer.conv, index, "output")
+                if layer.layer_norm is not None:
+                    prune_layer_norm(layer.layer_norm, index)
+                # prune the next layer
+                if idx == len(self.conv_layers) - 1:
+                    self.dummy_weight.data *= mask
+                    self.dummy_weight = nn.Parameter(
+                        self.dummy_weight.index_select(0, index).clone().detach(), requires_grad=False
+                    )
+                else:
+                    self.conv_layers[idx+1].conv.weight.data *= mask.unsqueeze(-1)
+                    prune_conv1d_layer(self.conv_layers[idx+1].conv, index, dim="input")
+                layer.hard_concrete = None
+            else:
+                new_config.append(
+                    (layer.conv.out_channels, layer.kernel_size, layer.stride)
+                )
+                index = torch.arange(layer.conv.out_channels, dtype=torch.long)
+        return new_config, index
+class FeatureProjection(Module):
+    """Layer that connects FeatureExtractor and Encoder
+    Projects features to encoder dimension.
+    Args:
+        in_features (int): Input feature dim.
+        out_features (int): Output feature dim.
+        dropout (float): Dropout probability.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        dropout: float,
+    ):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(in_features)
+        self.projection = nn.Linear(
+            in_features,
+            out_features,
+        )
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor):
+                Feature Tensor. shape: ``[batch, frame, in_feature]``
+        Returns:
+            Tensor: Projected features. ``[batch, frame, out_feature]``.
+        """
+        x = self.layer_norm(x)
+        x = self.projection(x)
+        x = self.dropout(x)
+        return x
+    def get_num_params(self, in_features):
+        return in_features * 2 + (in_features + 1) * self.projection.out_features
+class ConvolutionalPositionalEmbedding(Module):
+    """Positional embedding which is placed at the beginning of Transformer.
+    Args:
+        embed_dim (int): Feature dimension of the input Tensor.
+        kernel_size (int): The number of frames to be used.
+        groups (int): The number of groups in feature dimensions.
+    """
+    def __init__(
+        self,
+        embed_dim: int,
+        kernel_size: int,
+        groups: int,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.kernel_size = kernel_size
+        self.conv = nn.Conv1d(
+            in_channels=embed_dim,
+            out_channels=embed_dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+        )
+        # Wrap the convolution with weight normalization on its "weight" parameter.
+        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+        # With an even kernel, padding of kernel_size // 2 on both sides yields one
+        # extra output frame, which forward() trims off.
+        self.num_remove: int = 1 if kernel_size % 2 == 0 else 0
+    def __prepare_scriptable__(self):
+        """Remove the weight-norm hook from ``self.conv`` if present and return ``self``."""
+        for hook in self.conv._forward_pre_hooks.values():
+            # The hook we want to remove is an instance of WeightNorm class, so
+            # normally we would do `if isinstance(...)` but this class is not accessible
+            # because of shadowing, so we check the module name directly.
+            # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
+            if hook.__module__ == "torch.nn.utils.weight_norm" and hook.__class__.__name__ == "WeightNorm":
+                torch.nn.utils.remove_weight_norm(self.conv)
+        return self
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): shape ``[batch, frame, feature]``.
+        Returns:
+            Tensor: The resulting feature. Shape ``[batch, frame, feature]``.
+        """
+        # Conv1d works on the last axis, so move frames last.
+        x = x.transpose(-2, -1)
+        x = self.conv(x)
+        if self.num_remove > 0:
+            # Drop the trailing extra frame(s) so the frame count matches the input.
+            x = x[..., : -self.num_remove]
+        x = torch.nn.functional.gelu(x)
+        # Restore the [batch, frame, feature] layout.
+        x = x.transpose(-2, -1)
+        return x
class SelfAttention(Module):
    """Multi-head self-attention with optional pruning gates.

    Args:
        embed_dim (int): Total dimension of the model (size of the last input axis).
        num_heads (int): The number of attention heads kept in this module.
        head_dim (int): Dimension of each head. The projections map
            ``embed_dim -> num_heads * head_dim`` so ``num_heads * head_dim`` does
            not have to equal ``embed_dim``.
        dropout (float, optional):
            Dropout probability on attn_output_weights. Default: ``0.0``
        prune_heads (bool, optional): If ``True``, create a ``HardConcrete`` gate with
            one entry per head; its output multiplies each head's output. Default: ``False``
        prune_layer (bool, optional): If ``True``, create a single-entry ``HardConcrete``
            gate that multiplies the output of the whole layer. Default: ``False``
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        head_dim: int,
        dropout: float = 0.0,
        prune_heads: bool = False,  # whether to prune attention heads
        prune_layer: bool = False,  # whether to prune entire attention layers
    ):
        super().__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.dropout = torch.nn.Dropout(dropout)

        self.scaling = self.head_dim**-0.5

        # NOTE: creation order (k, v, q, out) is kept so that random initialization
        # is reproducible with respect to earlier versions of this class.
        self.k_proj = nn.Linear(embed_dim, num_heads * head_dim, bias=True)
        self.v_proj = nn.Linear(embed_dim, num_heads * head_dim, bias=True)
        self.q_proj = nn.Linear(embed_dim, num_heads * head_dim, bias=True)
        self.out_proj = nn.Linear(num_heads * head_dim, embed_dim, bias=True)

        if prune_heads:
            self.hard_concrete_for_heads = HardConcrete(n_in=num_heads, init_mean=0.01)
        else:
            self.hard_concrete_for_heads = None

        if prune_layer:
            self.hard_concrete_for_layer = HardConcrete(n_in=1, init_mean=0.01)
        else:
            self.hard_concrete_for_layer = None

    def forward(
        self,
        x: Tensor,
        attention_mask: Optional[Tensor] = None,
        position_bias: Optional[Tensor] = None,
        key_padding_mask: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Args:
            x (Tensor): shape: ``[batch_size, sequence_length, embed_dim]``.
            attention_mask (Tensor or ``None``, optional): additive mask that must be
                broadcastable to ``[batch_size, num_heads, sequence_length, sequence_length]``,
                e.g. ``[batch_size, 1, sequence_length, sequence_length]``.
            position_bias: Not used. Only for the compatibility with :py:class:`WavLMSelfAttention`.
            key_padding_mask (Tensor or ``None``): Not used. Only for the compatibility with
                :py:class:`WavLMSelfAttention`.

        Returns:
            (Tensor, ``None``): The resulting attention output and ``None`` (necessary for compatibility
                with :py:class:`WavLMSelfAttention`).
                Attention output shape: ``[batch, sequence_length, embed_dim]``.

        Raises:
            ValueError: If ``x`` is not 3-dimensional or its last axis is not ``embed_dim``.
        """
        if x.ndim != 3 or x.shape[2] != self.embed_dim:
            raise ValueError(
                f"The expected input shape is (batch, sequence, embed_dim=={self.embed_dim}). " f"Found {x.shape}."
            )
        batch_size, length, embed_dim = x.size()

        shape = (batch_size, length, self.num_heads, self.head_dim)
        q = self.q_proj(x).view(*shape).transpose(2, 1)  # B, nH, L, Hd
        k = self.k_proj(x).view(*shape).permute(0, 2, 3, 1)  # B, nH, Hd, L
        v = self.v_proj(x).view(*shape).transpose(2, 1)  # B, nH, L, Hd

        # scale down q to avoid value overflow.
        weights = (self.scaling * q) @ k  # B, nH, L, L
        if attention_mask is not None:
            weights += attention_mask
        # subtracting a constant value from the tensor won't change the output of softmax.
        # apply the subtraction to avoid value overflow in torch.nn.functional.softmax.
        # for more details, please see Equation 7 in https://arxiv.org/abs/2112.08778
        weights = weights - weights.max(dim=-1, keepdim=True)[0]

        weights = torch.nn.functional.softmax(weights, dim=-1)
        weights = self.dropout(weights)

        output = weights @ v  # B, nH, L, Hd

        if self.hard_concrete_for_heads is not None:
            head_mask = self.hard_concrete_for_heads()  # (nH,)
            output = output * head_mask.unsqueeze(-1).unsqueeze(-1)

        output = output.transpose(2, 1).reshape(batch_size, length, self.num_heads * self.head_dim)

        output = self.out_proj(output)

        if self.hard_concrete_for_layer is not None:
            layer_mask = self.hard_concrete_for_layer()  # (1,)
            output = output * layer_mask

        return output, None  # Necessary for compatibility with WavLMSelfAttention

    def get_num_params(self):
        """Return the (expected) number of parameters of the four projections.

        When a head gate exists, its ``l0_norm()`` replaces the integer head count;
        when a layer gate exists, the total is further multiplied by its ``l0_norm()``.
        Without gates the result is a plain ``int``.
        """
        if self.hard_concrete_for_heads is not None:
            num_heads = self.hard_concrete_for_heads.l0_norm()
        else:
            num_heads = self.num_heads
        # q/k/v: (embed_dim weights + 1 bias) per output unit; out: (inner weights + 1 bias) per output unit
        num_params = (self.embed_dim + 1) * num_heads * self.head_dim * 3 \
            + (num_heads * self.head_dim + 1) * self.embed_dim

        if self.hard_concrete_for_layer is not None:
            num_params *= self.hard_concrete_for_layer.l0_norm()

        return num_params

    def prune(self):
        """Apply the gates to the weights in place and drop the pruned heads.

        Both gates must be in eval mode. After the call the gates are removed
        (set to ``None``) and, if some heads were removed, ``self.num_heads`` is
        updated so that :meth:`forward` and :meth:`get_num_params` stay consistent
        with the shrunken projections.

        Returns:
            dict: ``{"use_attention": bool, "num_heads": int}`` describing the pruned layer.
            ``use_attention`` is ``False`` when the layer gate is zero or no head survives;
            in the latter case the projections are left untouched.
        """
        new_config = {
            "use_attention": True,
            "num_heads": self.num_heads,
        }
        if self.hard_concrete_for_layer is not None:
            assert not self.hard_concrete_for_layer.training
            layer_mask = self.hard_concrete_for_layer()  # (1,)
            self.out_proj.weight.data *= layer_mask
            if self.out_proj.bias is not None:
                self.out_proj.bias.data *= layer_mask
            if layer_mask == 0:
                new_config["use_attention"] = False
            self.hard_concrete_for_layer = None

        if self.hard_concrete_for_heads is not None:
            assert not self.hard_concrete_for_heads.training
            head_mask = self.hard_concrete_for_heads()  # (num_heads,)
            new_config["num_heads"] = len(head_mask.nonzero())
            if new_config["num_heads"] == 0:
                new_config["use_attention"] = False
            else:
                # expand the per-head mask to one entry per projected feature
                full_mask = head_mask.repeat_interleave(self.head_dim)
                full_index = full_mask.nonzero().squeeze(-1)  # 1D

                prune_linear_layer(self.k_proj, full_index, "output")
                prune_linear_layer(self.v_proj, full_index, "output")
                prune_linear_layer(self.q_proj, full_index, "output")

                # fold the (possibly fractional) gate values into the output projection
                # before removing the corresponding input columns
                self.out_proj.weight.data *= full_mask
                prune_linear_layer(self.out_proj, full_index, "input")

                # keep the bookkeeping in sync with the pruned projections;
                # otherwise forward() would reshape with the stale head count.
                self.num_heads = new_config["num_heads"]
            self.hard_concrete_for_heads = None

        return new_config
class WavLMSelfAttention(SelfAttention):
    """Multi-headed self-attention for WavLM model :cite:`chen2022wavlm`.

    Args:
        embed_dim (int): Total dimension of the model.
        total_num_heads (int): The number of heads of the unpruned layer. The head
            dimension is ``embed_dim // total_num_heads``.
        remaining_heads (list of int or None, optional): Indices (into ``total_num_heads``)
            of the heads that are actually present in this module. ``None`` keeps all heads.
            (Default: ``None``)
        dropout (float, optional): Dropout probability on attn_output_weights. (Default: to ``0.0``)
        bias (bool, optional): If ``True``, add bias to input / output projection layers. (Default: ``True``)
        has_relative_attention_bias (bool, optional): If ``True``, apply relative position embedding.
            Necessary in the first encoder layer, but not in the subsequent ones. (Default: ``False``)
        num_buckets (int, optional): Number of buckets for relative position embedding. (Default: ``32``)
        max_distance (int, optional): Maximum distance for relative position embedding. (Default: ``128``)
        gru_rel_pos (bool, optional): If ``True``, apply gated relative position embedding. (Default: ``True``)
        prune_heads (bool, optional): If ``True``, create a per-head pruning gate. (Default: ``False``)
        prune_layer (bool, optional): If ``True``, create a whole-layer pruning gate. (Default: ``False``)
    """

    def __init__(
        self,
        embed_dim: int,
        total_num_heads: int,
        remaining_heads: Optional[List[int]] = None,
        dropout: float = 0.0,
        bias: bool = True,
        has_relative_attention_bias: bool = False,
        num_buckets: int = 32,
        max_distance: int = 128,
        gru_rel_pos: bool = True,
        prune_heads: bool = False,
        prune_layer: bool = False,
    ):
        if remaining_heads is None:
            kept_heads = list(range(total_num_heads))
        else:
            kept_heads = remaining_heads  # list of indices
        head_dim = embed_dim // total_num_heads

        super().__init__(embed_dim, len(kept_heads), head_dim, dropout, prune_heads, prune_layer)

        self.total_num_heads = total_num_heads
        self.remaining_heads = kept_heads

        self.has_relative_attention_bias = has_relative_attention_bias
        self.num_buckets = num_buckets
        self.max_distance = max_distance

        if has_relative_attention_bias:
            # one bias value per bucket and per *original* head; pruned heads are
            # selected away in forward() via ``remaining_heads``
            self.rel_attn_embed = nn.Embedding(num_buckets, total_num_heads)
        else:
            self.rel_attn_embed = None

        # override linear layers to customize bias
        self.k_proj = nn.Linear(embed_dim, len(self.remaining_heads) * self.head_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, len(self.remaining_heads) * self.head_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, len(self.remaining_heads) * self.head_dim, bias=bias)
        self.out_proj = nn.Linear(len(self.remaining_heads) * self.head_dim, embed_dim, bias=bias)

        self.gru_rel_pos = gru_rel_pos
        if self.gru_rel_pos:
            self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8)
            self.gru_rel_pos_const = nn.Parameter(torch.ones(1, total_num_heads, 1, 1))
        self.has_position_bias = True

    def compute_bias(self, query_length: int, key_length: int) -> Tensor:
        """Compute relative position embeddings for WavLM model.

        Requires ``has_relative_attention_bias=True`` (``rel_attn_embed`` must exist).

        Args:
            query_length (int): Query position can take values between 0 and ``query_length - 1``.
            key_length (int): Key position can take values between 0 and ``key_length - 1``.
        Returns:
            Tensor of shape `(total_num_heads, query_length, key_length)`, relative positions embeddings
        """
        # build the index grids directly on the embedding's device to avoid a host->device copy
        device = self.rel_attn_embed.weight.device
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # Shape (query_length, key_length)
        relative_position_bucket = self._relative_positions_bucket(relative_position, bidirectional=True)
        values = self.rel_attn_embed(relative_position_bucket)  # Shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1])
        return values

    def _relative_positions_bucket(self, relative_positions: Tensor, bidirectional: bool = True):
        """Compute relative position buckets for WavLM model. Computation similar to formula (5) in WavLM
           paper :cite:`chen2022wavlm`.

        Small distances get one bucket each; larger distances share logarithmically
        sized buckets, clamped to the last bucket.

        Args:
            relative_positions (Tensor): Relative offsets between query and key positions,
                of shape ``(query_length, key_length)``.
            bidirectional (bool): If ``True``, half of the buckets are reserved for positive offsets
                and the other half for non-positive ones. If ``False``, positive offsets are
                clamped to zero and only non-positive offsets are distinguished. (Default ``True``)
        Returns:
            Tensor of shape ``(query_length, key_length)`` filled bucketed values of with relative positions.
        """
        num_buckets = self.num_buckets
        max_distance = self.max_distance
        # Shape (query_length, key_length)
        relative_buckets = torch.zeros_like(relative_positions, dtype=torch.long)

        if bidirectional:
            num_buckets = num_buckets // 2
            relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
            relative_positions = torch.abs(relative_positions)
        else:
            relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions))

        max_exact = num_buckets // 2
        is_small = relative_positions < max_exact

        # For offset 0 the logarithm is -inf; that entry is never selected because
        # ``is_small`` is True there and torch.where picks the exact value instead.
        relative_position_if_large = max_exact + (
            torch.log(relative_positions.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
        return relative_buckets

    def forward(
        self,
        query: Tensor,
        attention_mask: Optional[Tensor] = None,
        position_bias: Optional[Tensor] = None,
        key_padding_mask: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Args:
            query (Tensor): Input of shape ``(batch_size, src_len, embed_dim)``.
            attention_mask (Tensor or None, optional): Additive attention mask that is summed with
                the (gated) relative position bias, e.g. of shape
                ``(batch_size, 1, src_len, src_len)``. (Default: ``None``)
            position_bias (Tensor or None, optional): Position bias of shape
                ``(batch_size * total_num_heads, src_len, src_len)``. When used inside WavLM model
                encoder, will be generated in the first layer and then passed from each encoder
                layer to the next one. (Default: ``None``)
            key_padding_mask (Tensor or None, optional): Needs to be ``None`` (enforced with an
                assertion). The argument exists for compatibility with ``EncoderLayer``.
                (Default: ``None``)
        Returns:
            attn_output (Tensor): Attention output of shape ``(batch_size, src_len, embed_dim)``.
            position_bias (Tensor or None): Position bias of shape
                ``(batch_size * total_num_heads, src_len, src_len)``; ``None`` if none was given
                and this layer has no relative attention embedding.
        """
        bsz, seq_len, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert key_padding_mask is None

        # only for the first layer
        if self.rel_attn_embed is not None and position_bias is None:
            position_bias = self.compute_bias(seq_len, seq_len)
            position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.total_num_heads, seq_len, seq_len)

        attn_mask: Optional[Tensor] = None
        if position_bias is not None:
            attn_mask_rel_pos = position_bias
            if self.gru_rel_pos:  # Apply gating on relative position bias
                query_layer = query.view(bsz, seq_len, self.total_num_heads, -1)
                query_layer = query_layer.permute(0, 2, 1, 3)

                gate_a, gate_b = torch.sigmoid(
                    self.gru_rel_pos_linear(query_layer).view(bsz, self.total_num_heads, seq_len, 2, 4).sum(-1, keepdim=False)
                ).chunk(2, dim=-1)
                gate_a_1 = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0
                attn_mask_rel_pos = gate_a_1.view(bsz * self.total_num_heads, -1, 1) * position_bias

            attn_mask_rel_pos = attn_mask_rel_pos.view((-1, seq_len, seq_len))
            # the bias is defined for all original heads; keep only the heads present in this module
            attn_mask = attn_mask_rel_pos.reshape(bsz, self.total_num_heads, seq_len, seq_len)[:, self.remaining_heads, :, :]

        if attention_mask is not None:
            # No position bias is available when an earlier layer that owns the relative
            # attention embedding has been pruned; fall back to the plain attention mask
            # instead of failing on ``None + Tensor``.
            attn_mask = attention_mask if attn_mask is None else attn_mask + attention_mask

        # Only reachable when assertions are disabled (``python -O``).
        if key_padding_mask is not None:
            if attn_mask is None:
                attn_mask = query.new_zeros(bsz, 1, 1, seq_len)
            attn_mask = attn_mask.masked_fill(
                key_padding_mask.reshape(bsz, 1, 1, seq_len),
                float("-inf")
            )
        attn_output, _ = super().forward(query, attention_mask=attn_mask)

        return attn_output, position_bias

    def prune(self):
        """Apply the gates to the weights in place and drop the pruned heads.

        Both gates must be in eval mode. After the call the gates are removed
        (set to ``None``). If some heads were removed, ``self.remaining_heads`` and
        ``self.num_heads`` are updated to match the shrunken projections.

        Returns:
            dict: ``{"use_attention": bool, "remaining_heads": list of int}``.
            ``remaining_heads`` holds indices into ``total_num_heads`` (the same
            convention as the constructor argument). ``use_attention`` is ``False``
            when the layer gate is zero or no head survives.
        """
        new_config = {
            "use_attention": True,
            "remaining_heads": self.remaining_heads,
        }
        if self.hard_concrete_for_layer is not None:
            assert not self.hard_concrete_for_layer.training
            layer_mask = self.hard_concrete_for_layer()  # (1,)
            self.out_proj.weight.data *= layer_mask
            # the projections may have been built with ``bias=False``
            if self.out_proj.bias is not None:
                self.out_proj.bias.data *= layer_mask
            if layer_mask == 0:
                new_config["use_attention"] = False
            self.hard_concrete_for_layer = None

        if self.hard_concrete_for_heads is not None:
            assert not self.hard_concrete_for_heads.training
            head_mask = self.hard_concrete_for_heads()  # (num_heads,)
            # The gate is indexed by position within this module; translate the
            # surviving positions back to indices of the original heads.
            kept_positions = head_mask.nonzero().squeeze(-1).tolist()
            new_config["remaining_heads"] = [self.remaining_heads[pos] for pos in kept_positions]
            if len(new_config["remaining_heads"]) == 0:
                new_config["use_attention"] = False
            else:
                full_mask = head_mask.repeat_interleave(self.head_dim)
                full_index = full_mask.nonzero().squeeze(-1)  # 1D

                prune_linear_layer(self.k_proj, full_index, "output")
                prune_linear_layer(self.v_proj, full_index, "output")
                prune_linear_layer(self.q_proj, full_index, "output")

                # fold the gate values into the output projection before removing columns
                self.out_proj.weight.data *= full_mask
                prune_linear_layer(self.out_proj, full_index, "input")

                # keep the bookkeeping in sync with the pruned projections
                self.remaining_heads = new_config["remaining_heads"]
                self.num_heads = len(self.remaining_heads)
            self.hard_concrete_for_heads = None

        return new_config
class FeedForward(Module):
    """Two-layer feed-forward block used after the attention block of an encoder layer.

    Computes ``output_dense(gelu(intermediate_dense(x)))`` with dropout after each
    dense layer. Optional ``HardConcrete`` gates can mask the intermediate features
    and/or the output of the whole block.
    """

    def __init__(
        self,
        io_features: int,
        intermediate_features: int,
        intermediate_dropout: float,
        output_dropout: float,
        prune_intermediate: bool = False,
        prune_layer: bool = False,
    ):
        super().__init__()
        self.intermediate_dense = nn.Linear(io_features, intermediate_features)
        self.intermediate_dropout = nn.Dropout(intermediate_dropout)
        self.output_dense = nn.Linear(intermediate_features, io_features)
        self.output_dropout = nn.Dropout(output_dropout)

        # one gate entry per hidden unit
        self.hard_concrete_for_intermediate = (
            HardConcrete(n_in=intermediate_features, init_mean=0.5) if prune_intermediate else None
        )
        # a single gate entry for the whole block
        self.hard_concrete_for_layer = HardConcrete(n_in=1, init_mean=0.01) if prune_layer else None

    def forward(self, x):
        """
        Args:
            x (Tensor): shape: `(batch, sequence_length, io_features)`
        Returns:
            x (Tensor): shape: `(batch, sequence_length, io_features)`
        """
        hidden = self.intermediate_dense(x)
        hidden = self.intermediate_dropout(torch.nn.functional.gelu(hidden))

        hidden_gate = self.hard_concrete_for_intermediate
        if hidden_gate is not None:
            hidden = hidden * hidden_gate()  # mask: (intermediate_features,)

        out = self.output_dropout(self.output_dense(hidden))

        block_gate = self.hard_concrete_for_layer
        if block_gate is not None:
            out = out * block_gate()  # mask: (1,)

        return out

    def get_num_params(self):
        """Return the (expected) parameter count of both dense layers.

        The hidden width is taken from the intermediate gate's ``l0_norm()`` when that
        gate exists; the total is scaled by the layer gate's ``l0_norm()`` when present.
        """
        io_features = self.intermediate_dense.in_features
        hidden_gate = self.hard_concrete_for_intermediate
        if hidden_gate is None:
            hidden_width = self.intermediate_dense.out_features
        else:
            hidden_width = hidden_gate.l0_norm()

        # weights + biases of the first layer, then of the second layer
        num_params = (io_features + 1) * hidden_width + (hidden_width + 1) * io_features

        if self.hard_concrete_for_layer is not None:
            num_params *= self.hard_concrete_for_layer.l0_norm()

        return num_params

    def prune(self):
        """Fold the gates into the weights in place and remove masked hidden units.

        Returns:
            dict: ``{"use_feed_forward": bool, "ff_interm_features": int}``. The flag is
            ``False`` when the layer gate is zero or no hidden unit survives.
        """
        new_config = {
            "use_feed_forward": True,
            "ff_interm_features": self.intermediate_dense.out_features
        }

        block_gate = self.hard_concrete_for_layer
        if block_gate is not None:
            assert not block_gate.training
            layer_mask = block_gate()
            self.output_dense.weight.data *= layer_mask
            self.output_dense.bias.data *= layer_mask
            if layer_mask == 0:
                new_config["use_feed_forward"] = False
            self.hard_concrete_for_layer = None

        hidden_gate = self.hard_concrete_for_intermediate
        if hidden_gate is not None:
            assert not hidden_gate.training
            interm_mask = hidden_gate()
            # squeeze only the last axis so a single surviving unit still yields a 1D index
            kept_units = interm_mask.nonzero().squeeze(-1)
            num_kept = len(kept_units)
            new_config["ff_interm_features"] = num_kept
            if num_kept != 0:
                prune_linear_layer(self.intermediate_dense, kept_units, "output")

                # scale the second layer's input columns by the gate before dropping columns
                self.output_dense.weight.data *= interm_mask
                prune_linear_layer(self.output_dense, kept_units, "input")
            else:
                new_config["use_feed_forward"] = False
            self.hard_concrete_for_intermediate = None

        return new_config
class EncoderLayer(Module):
    """One encoder layer: a self-attention block followed by a feed-forward block.

    Either block may be ``None`` (pruned away); the remaining computation is then
    carried out without it.
    """

    def __init__(
        self,
        attention: Optional[Module],    # can be None if the entire layer is pruned
        dropout: float,
        layer_norm_first: bool,
        feed_forward: Optional[Module], # can be None if the entire layer is pruned
        embed_dim: int,
    ):
        super().__init__()
        self.attention = attention
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.layer_norm_first = layer_norm_first
        self.feed_forward = feed_forward
        self.final_layer_norm = nn.LayerNorm(embed_dim)
        self.embed_dim = embed_dim

    def forward(
        self,
        x: Tensor,
        attention_mask: Optional[Tensor] = None,
        position_bias: Optional[Tensor] = None,
        key_padding_mask: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Args:
            x (Tensor): Input of shape ``(batch, sequence_length, embed_dim)``.
            attention_mask (Tensor or ``None``, optional): attention mask
                of shape ``(batch, 1, sequence_length, sequence_length)``. (Default: ``None``)
            position_bias (Tensor or ``None``, optional): position bias of shape
                ``(batch_size * num_heads, src_len, src_len)``.
                Only necessary for WavLM model, ``None`` otherwise. (Default: ``None``)
            key_padding_mask (Tensor or ``None``, optional): key padding mask of shape ``(batch_size, src_len)``.
                Only used for WavLM model, ignored otherwise. (Default: ``None``)
        Returns:
            (x, position_bias): Shapes are the same as in the input. Position bias is only relevant for WavLM
                model, ``None`` otherwise.
        """
        pre_norm = self.layer_norm_first
        has_ffn = self.feed_forward is not None

        if self.attention is not None:
            attn_in = self.layer_norm(x) if pre_norm else x
            attn_out, position_bias = self.attention(
                attn_in,
                attention_mask=attention_mask,
                position_bias=position_bias,
                key_padding_mask=key_padding_mask,
            )
            # residual connection around the attention block
            x = x + self.dropout(attn_out)

        if pre_norm:
            # pre-norm: normalization only happens on the input of an existing block
            if has_ffn:
                x = x + self.feed_forward(self.final_layer_norm(x))
            return x, position_bias

        # NOTE: for post norm, the layer norms should always be applied even if the layers are pruned.
        x = self.layer_norm(x)
        if has_ffn:
            x = x + self.feed_forward(x)
        return self.final_layer_norm(x), position_bias

    def get_num_params(self):
        """Return the parameter count: two layer norms plus whatever the sub-blocks report."""
        total = 4 * self.embed_dim     # weight and bias of each of the two layer norms
        for block in (self.attention, self.feed_forward):
            if block is not None:
                total += block.get_num_params()
        return total
class Transformer(Module):
    """Stack of encoder layers preceded by a convolutional positional embedding.

    Args:
        pos_conv_embed (Module): Positional embedding module. Its output is added to
            the input, and its ``embed_dim`` attribute sizes the layer norm.
        dropout (float): Dropout probability applied after the positional embedding.
        layers (Module): Iterable module of encoder layers (e.g. ``nn.ModuleList``).
            Each layer is called as ``layer(x, attention_mask, position_bias=...)``
            and must return ``(x, position_bias)``.
        layer_norm_first (bool): If ``True``, the layer norm is applied before the
            layers (in preprocessing); otherwise after the last layer in :meth:`forward`.
        layer_drop (float): Probability of skipping each layer during training.
    """

    def __init__(
        self,
        pos_conv_embed: Module,
        dropout: float,
        layers: Module,
        layer_norm_first: bool,
        layer_drop: float,
    ):
        super().__init__()
        self.pos_conv_embed = pos_conv_embed
        self.layer_norm = nn.LayerNorm(pos_conv_embed.embed_dim)
        self.layer_norm_first = layer_norm_first
        self.layer_drop = layer_drop
        self.dropout = nn.Dropout(dropout)
        self.layers = layers

    def _preprocess(self, x: Tensor):
        """Add the positional embedding, optionally normalize, then apply dropout."""
        x = x + self.pos_conv_embed(x)

        if self.layer_norm_first:
            x = self.layer_norm(x)

        x = self.dropout(x)
        return x

    def forward(
        self,
        x: Tensor,
        attention_mask: Optional[Tensor] = None,
        position_bias: Optional[Tensor] = None,
    ) -> Tensor:
        """Run all layers and return the final hidden states.

        Args:
            x (Tensor): Input features; the output has the same shape.
            attention_mask (Tensor or ``None``, optional): Forwarded to every layer.
            position_bias (Tensor or ``None``, optional): Initial position bias; each
                layer receives the bias returned by the previous one.

        Returns:
            Tensor: Output of the last layer (layer-normalized if ``layer_norm_first``
            is ``False``).
        """
        x = self._preprocess(x)
        for layer in self.layers:
            # torch.rand samples from [0, 1), so a strict comparison guarantees that
            # ``layer_drop == 0.0`` never skips a layer (``<=`` would skip on an exact 0.0 draw).
            skip_layer = self.training and torch.rand(1).item() < self.layer_drop
            if not skip_layer:
                x, position_bias = layer(x, attention_mask, position_bias=position_bias)

        if not self.layer_norm_first:
            x = self.layer_norm(x)
        return x

    def get_intermediate_outputs(
        self,
        x: Tensor,
        attention_mask: Optional[Tensor] = None,
        num_layers: Optional[int] = None,
        position_bias: Optional[Tensor] = None,
    ) -> List[Tensor]:
        """Return the output of each layer, without layer drop and without the final layer norm.

        Args:
            x (Tensor): Input features.
            attention_mask (Tensor or ``None``, optional): Forwarded to every layer.
            num_layers (int or ``None``, optional): If given, stop after this many layers.
            position_bias (Tensor or ``None``, optional): Initial position bias.

        Returns:
            List[Tensor]: One tensor per executed layer, in order.

        Raises:
            ValueError: If ``num_layers`` is not in ``[1, len(self.layers)]``.
        """
        if num_layers is not None:
            if not 0 < num_layers <= len(self.layers):
                raise ValueError(f"`num_layers` must be between [1, {len(self.layers)}]")

        ret: List[Tensor] = []
        x = self._preprocess(x)
        for layer in self.layers:
            x, position_bias = layer(x, attention_mask, position_bias=position_bias)
            ret.append(x)
            if num_layers is not None and len(ret) >= num_layers:
                return ret
        return ret

    def get_num_params(self):
        """Return the parameter count of the positional embedding, the layer norm and all layers."""
        # pos_conv_embed and layer_norm
        num_params = sum(p.numel() for p in self.pos_conv_embed.parameters()) + self.pos_conv_embed.embed_dim * 2
        for layer in self.layers:
            num_params += layer.get_num_params()
        return num_params

    def prune(self):
        """Prune every layer in place and collect the resulting per-layer configuration.

        Sub-blocks whose pruned config reports them unused are replaced by ``None``.
        NOTE: every layer is expected to still own both ``attention`` and
        ``feed_forward``; a layer that has already been pruned to ``None`` is not handled.

        Returns:
            defaultdict: Lists with one entry per layer under the keys ``use_attention``,
            ``use_feed_forward``, ``ff_interm_features`` and either ``remaining_heads``
            or ``num_heads`` (whichever the attention module reports).
        """
        new_config = defaultdict(list)
        for layer in self.layers:
            attention_config = layer.attention.prune()
            new_config["use_attention"].append(attention_config["use_attention"])
            if "remaining_heads" in attention_config:
                new_config["remaining_heads"].append(attention_config["remaining_heads"])
            else:
                new_config["num_heads"].append(attention_config["num_heads"])

            if not attention_config["use_attention"]:
                layer.attention = None

            ff_config = layer.feed_forward.prune()
            new_config["use_feed_forward"].append(ff_config["use_feed_forward"])
            new_config["ff_interm_features"].append(ff_config["ff_interm_features"])
            if not ff_config["use_feed_forward"]:
                layer.feed_forward = None

        return new_config
class Encoder(Module):
    """Feature projection followed by a transformer, with length-based padding masks."""

    def __init__(
        self,
        feature_projection: Module,
        transformer: Module,
    ):
        super().__init__()
        self.feature_projection = feature_projection
        self.transformer = transformer

    def _preprocess(
        self,
        features: Tensor,
        lengths: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Project the features and, if ``lengths`` is given, build the attention mask.

        Returns the projected features (padded frames zeroed in place) and an additive
        mask of shape ``(batch, 1, max_len, max_len)`` holding ``-10000.0`` at padded
        key positions, or ``None`` when no lengths are provided.
        """
        x = self.feature_projection(features)
        if lengths is None:
            return x, None

        batch_size, max_len, _ = x.shape
        # frame indices >= the valid length of the sample are padding
        frame_index = torch.arange(max_len, device=lengths.device).expand(batch_size, max_len)
        is_padding = frame_index >= lengths[:, None]
        x[is_padding] = 0.0

        # broadcast over heads and query positions, then scale to a large negative weight
        additive_mask = -10000.0 * is_padding[:, None, None, :].to(dtype=features.dtype)
        additive_mask = additive_mask.expand(batch_size, 1, max_len, max_len)
        return x, additive_mask

    def forward(
        self,
        features: Tensor,
        lengths: Optional[Tensor] = None,
    ) -> Tensor:
        """Return the transformer output for ``features`` (optionally masked by ``lengths``)."""
        projected, attn_mask = self._preprocess(features, lengths)
        return self.transformer(projected, attention_mask=attn_mask)

    def extract_features(
        self,
        features: Tensor,
        lengths: Optional[Tensor] = None,
        num_layers: Optional[int] = None,
    ) -> List[Tensor]:
        """Return the projected input followed by the transformer's intermediate outputs."""
        projected, attn_mask = self._preprocess(features, lengths)
        layer_outputs = self.transformer.get_intermediate_outputs(
            projected, attention_mask=attn_mask, num_layers=num_layers
        )
        return [projected] + layer_outputs

    def get_num_params(self, in_features):
        """Calculate the current model size."""
        projection_size = self.feature_projection.get_num_params(in_features)
        return projection_size + self.transformer.get_num_params()

    def prune(self, conv_out_index):
        """In-place pruning of submodules."""
        projection = self.feature_projection
        # the feature projection consumes the conv output, so only kept channels remain as inputs
        prune_layer_norm(projection.layer_norm, conv_out_index)
        prune_linear_layer(projection.projection, conv_out_index, "input")
        return self.transformer.prune()
+################################################################################
def _get_feature_extractor(
    norm_mode: str,
    shapes: List[Tuple[int, int, int]],
    bias: bool,
    prune_conv_channels: bool = False,
) -> FeatureExtractor:
    """Build the convolutional feature extractor from a list of layer shapes.

    Args:
        norm_mode (str):
            Either "group_norm" or "layer_norm".
            With "group_norm", only the first convolution block is normalized
            (a group norm with one group per channel). With "layer_norm", every
            convolution block gets a layer normalization.
            This option corresponds to "extractor_mode" from fairseq.
            Expected values are "group_norm" for Base arch, and
            "layer_norm" for Large arch.
        shapes (list of tuple of int):
            Configuration of convolution layers, one tuple per layer:
            ``[(output_channel, kernel_size, stride), ...]``
            This option corresponds to "conv_feature_layers" from fairseq.
            Expected values are
            ``[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2``
            for all the architectures.
        bias (bool):
            Whether to include bias term to each convolution operation.
            This option corresponds to "conv_bias" from fairseq.
            Expected values are False for Base arch, and True for Large arch.
        prune_conv_channels (bool):
            Forwarded unchanged to every ``ConvLayerBlock``. (Default: ``False``)

    Returns:
        FeatureExtractor: The extractor wrapping the convolution blocks. The first
        block takes a single input channel; each later block takes the previous
        block's output channels.

    Raises:
        ValueError: If ``norm_mode`` is neither "group_norm" nor "layer_norm".

    See Also:
        * Original implementation
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L666-L733
        * "extractor_mode"
          - Def and base:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L38-L45
          - Large:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L52
        * "conv_feature_layers"
          - Def, base and large:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L94-L100
        * "conv_bias"
          - Def and base:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L101-L103
          - Large:
            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L61
    """
    if norm_mode not in ["group_norm", "layer_norm"]:
        raise ValueError("Invalid norm mode")

    every_block_normalized = norm_mode == "layer_norm"
    conv_blocks = []
    prev_channels = 1  # the first block takes a single input channel
    for index, (out_channels, kernel_size, stride) in enumerate(shapes):
        if every_block_normalized:
            norm = LayerNorm(
                normalized_shape=out_channels,
                elementwise_affine=True,
            )
        elif index == 0:
            # group-norm mode: only the first block is normalized
            norm = nn.GroupNorm(
                num_groups=out_channels,
                num_channels=out_channels,
                affine=True,
            )
        else:
            norm = None

        conv_blocks.append(
            ConvLayerBlock(
                in_channels=prev_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                bias=bias,
                layer_norm=norm,
                prune_conv_channels=prune_conv_channels,
            )
        )
        prev_channels = out_channels
    return FeatureExtractor(nn.ModuleList(conv_blocks))
+def _get_encoder(
+    in_features: int,
+    embed_dim: int,
+    dropout_input: float,
+    pos_conv_kernel: int,
+    pos_conv_groups: int,
+    num_layers: int,
+    use_attention: List[bool],
+    use_feed_forward: List[bool],
+    num_heads: List[int],
+    head_dim: int,
+    attention_dropout: float,
+    ff_interm_features: List[int],
+    ff_interm_dropout: float,
+    dropout: float,
+    layer_norm_first: bool,
+    layer_drop: float,
+    prune_attention_heads: bool = False,
+    prune_attention_layer: bool = False,
+    prune_feed_forward_intermediate: bool = False,
+    prune_feed_forward_layer: bool = False,
+) -> Encoder:
+    """
+    Args:
+        in_features (int): The number of input features.
+        embed_dim (int):
+            The dimension of embedding.
+            This option corresponds to "encoder_embed_dim" from fairseq.
+            Expected values are 768 for Base arch, and 1024 for Large arch.
+        dropout_input (float):
+            The dropout probability applied after the input feature is projected
+            to ``embed_dim``.
+            This option corresponds to "dropout_input" from fairseq.
+            Expected values are 0.1 for both Base and Large arch.
+        pos_conv_kernel (int):
+            The kernel size of convolutional positional embeddings.
+            This option corresponds to "conv_pos" from fairseq.
+            Expected values are 128 for both Base and Large arch.
+        pos_conv_groups (int):
+            The number of groups of convolutional positional embeddings.
+            This option corresponds to "conv_pos_groups" from fairseq.
+            Expected values are 16 for both Base and Large arch.
+        num_layers (int):
+            The number of self attention layers in transformer block.
+            This option corresponds to "encoder_layers" from fairseq.
+            Expected values are 12 for Base and 24 for Large arch.
+        num_heads (int):
+            The number of heads in self attention layers.
+            This option corresponds to "encoder_attention_heads" from fairseq.
+            Expected values are 12 for Base and 16 for Large arch.
+        attention_dropout (float):
+            The dropout probability applied after softmax in self-attention layer.
+            This option corresponds to "attention_dropout" from fairseq.
+            Expected values are 0.1 for Base and 0.0 for Large arch.
+        ff_interm_features (int):
+            The dimension of hidden features in feed forward layer.
+            This option corresponds to "encoder_ffn_embed_dim" from fairseq.
+            Expected values are 3072 for Base and 4096 for Large arch.
+        ff_interm_dropout (float):
+            The dropout probability applied in feedforward layer.
+            This option correspinds to "activation_dropout" from fairseq.
+            Expected values are 0.1 for both Base and Large arch.
+        dropout (float):
+            The dropout probability applied at the end of feed forward layer.
+            This option corresponds to "dropout" from fairseq.
+            Expected values are 0.1 for Base and 0.0 for Large arch.
+        layer_norm_first (bool):
+            Control the order of layer norm in transformer layer and each encoder layer.
+            If True, in transformer layer, layer norm is applied before features are fed
+            to encoder layers. In encoder layer, two layer norms are applied before and after
+            self attention.
+            If False, in transformer layer, layer norm is applied after features are fed
+            to encoder layers. In encoder layer, two layer norms are applied after self
+            attention, before and after feed forward.
+            This option corresponds to "layer_norm_first" from fairseq.
+            Expected values are False for Base and True for Large arch.
+        layer_drop (float):
+            Probability to drop each encoder layer during training.
+            This option corresponds to "layerdrop" from fairseq.
+            Expected values are 0.1 for both Base and Large arch.
+    See Also:
+        * "encoder_embed_dim"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L49-L51
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L64
+        * "dropout_input"
+          - Def, base and large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L75-L78
+        * "conv_pos"
+          - Def, base and large
+            NOTE: The description is wrong.
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L204-L207
+          - Usage
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L756
+        * "conv_pos_groups"
+          - Def, base and large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L208-L211
+        * "encoder_layers"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L46-L48
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L63
+        * "encoder_attention_heads"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L55-L57
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L66
+        * "attention_dropout"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L66-L68
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L60
+        * "encoder_ffn_embed_dim"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L52-L54
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L65
+        * "activation_dropout"
+          - Def
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L69-L71
+          - Base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/base_960h.yaml#L55
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/vox_960h.yaml#L55
+        * "dropout"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L63-L65
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L59
+        * "layer_norm_first"
+          - Def and base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L91-L93
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml#L53
+        * "layerdrop"
+          - Def
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L72-L74
+          - Base
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/base_960h.yaml#L54
+          - Large
+            https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/examples/wav2vec/config/finetuning/vox_960h.yaml#L54
+    """
+    feature_projection = FeatureProjection(in_features, embed_dim, dropout_input)
+    pos_conv = ConvolutionalPositionalEmbedding(embed_dim, pos_conv_kernel, pos_conv_groups)
+    # Original impl
+    # https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L768-L782
+    encoder_layers = nn.ModuleList()
+    for idx in range(num_layers):
+        if use_attention[idx]:
+            attention = SelfAttention(
+                embed_dim=embed_dim,
+                num_heads=num_heads[idx],
+                head_dim=head_dim,
+                dropout=attention_dropout,
+                prune_heads=prune_attention_heads,
+                prune_layer=prune_attention_layer,
+            )
+        else:
+            attention = None
+        if use_feed_forward[idx]:
+            feed_forward = FeedForward(
+                io_features=embed_dim,
+                intermediate_features=ff_interm_features[idx],
+                intermediate_dropout=ff_interm_dropout,
+                output_dropout=dropout,
+                prune_intermediate=prune_feed_forward_intermediate,
+                prune_layer=prune_feed_forward_layer,
+            )
+        else:
+            feed_forward = None
+        encoder_layers.append(
+            EncoderLayer(
+                attention=attention,
+                dropout=dropout,
+                layer_norm_first=layer_norm_first,
+                feed_forward=feed_forward,
+                embed_dim=embed_dim,
+            )
+        )
+    transformer = Transformer(
+        pos_conv_embed=pos_conv,
+        dropout=dropout,
+        layers=encoder_layers,
+        layer_norm_first=not layer_norm_first,
+        layer_drop=layer_drop,
+    )
+    return Encoder(feature_projection, transformer)
+def _get_wavlm_encoder(
+    in_features: int,
+    embed_dim: int,
+    dropout_input: float,
+    pos_conv_kernel: int,
+    pos_conv_groups: int,
+    num_layers: int,
+    use_attention: List[bool],
+    use_feed_forward: List[bool],
+    total_num_heads: List[int],
+    remaining_heads: List[List[int]],
+    num_buckets: int,
+    max_distance: int,
+    attention_dropout: float,
+    ff_interm_features: List[int],
+    ff_interm_dropout: float,
+    dropout: float,
+    layer_norm_first: bool,
+    layer_drop: float,
+    prune_attention_heads: bool = False,
+    prune_attention_layer: bool = False,
+    prune_feed_forward_intermediate: bool = False,
+    prune_feed_forward_layer: bool = False,
+) -> Encoder:
+    """
+    Construct encoder for WavLM model :cite:`chen2022wavlm`. The structure of the encoder and most of the argments are
+    the same as in :py:func:`_get_encoder` so refer there for documentation. The only difference from Wav2Vec2 encoder
+    is usage of `WavLMSelfAttention` instead of `SelfAttention` and two additional parameters: `num_buckets` and
+    `max_distance`.
+    Args:
+        in_features (int): See :py:func:`_get_encoder`.
+        embed_dim (int): See :py:func:`_get_encoder`.
+        dropout_input (float): See :py:func:`_get_encoder`.
+        pos_conv_kernel (int): See :py:func:`_get_encoder`.
+        pos_conv_groups (int): See :py:func:`_get_encoder`.
+        num_layers (int): See :py:func:`_get_encoder`.
+        num_heads (int): See :py:func:`_get_encoder`.
+        num_buckets (int): Number of buckets for relative position embedding.
+        max_distance (int): Maximum distance for relative position embedding.
+        attention_dropout (float): See :py:func:`_get_encoder`.
+        ff_interm_features (int): See :py:func:`_get_encoder`.
+        ff_interm_dropout (float): See :py:func:`_get_encoder`.
+        dropout (float): See :py:func:`_get_encoder`.
+        layer_norm_first (bool): See :py:func:`_get_encoder`.
+        layer_drop (float): See :py:func:`_get_encoder`.
+    """
+    feature_projection = FeatureProjection(in_features, embed_dim, dropout_input)
+    pos_conv = ConvolutionalPositionalEmbedding(embed_dim, pos_conv_kernel, pos_conv_groups)
+    # Original impl
+    # https://github.com/pytorch/fairseq/blob/425c36eafff535fe7337f8bdd5ace22ebacc78cb/fairseq/models/wav2vec/wav2vec2.py#L768-L782
+    encoder_layers = nn.ModuleList()
+    for i in range(num_layers):
+        if use_attention[i]:
+            attention = WavLMSelfAttention(
+                embed_dim=embed_dim,
+                total_num_heads=total_num_heads[i],
+                remaining_heads=remaining_heads[i],
+                dropout=attention_dropout,
+                has_relative_attention_bias=(i == 0),  # Position embedding is only necessary in the first layer.
+                num_buckets=num_buckets,
+                max_distance=max_distance,
+                prune_heads=prune_attention_heads,
+                prune_layer=prune_attention_layer,
+            )
+        else:
+            attention = None
+        if use_feed_forward[i]:
+            feed_forward = FeedForward(
+                io_features=embed_dim,
+                intermediate_features=ff_interm_features[i],
+                intermediate_dropout=ff_interm_dropout,
+                output_dropout=dropout,
+                prune_intermediate=prune_feed_forward_intermediate,
+                prune_layer=prune_feed_forward_layer,
+            )
+        else:
+            feed_forward = None
+        encoder_layers.append(
+            EncoderLayer(
+                attention=attention,
+                dropout=dropout,
+                layer_norm_first=layer_norm_first,
+                feed_forward=feed_forward,
+                embed_dim=embed_dim,
+            )
+        )
+    transformer = Transformer(
+        pos_conv_embed=pos_conv,
+        dropout=dropout,
+        layers=encoder_layers,
+        layer_norm_first=not layer_norm_first,
+        layer_drop=layer_drop,
+    )
+    return Encoder(feature_projection, transformer)
+def _get_padding_mask(input: Tensor, lengths: Tensor) -> Tensor:
+    """Generate the padding mask given the padded input and the lengths Tensors.
+    Args:
+        input (Tensor): The padded Tensor of dimension `[batch, max_len, frequency]`.
+        lengths (Tensor): The lengths Tensor of dimension `[batch,]`.
+    Returns:
+        (Tensor): The padding mask.
+    """
+    batch_size, max_len, _ = input.shape
+    mask = torch.arange(max_len, device=lengths.device).expand(batch_size, max_len) >= lengths[:, None]
+    return mask
+class GradMultiply(torch.autograd.Function):
+    """Autograd function that rescales the gradient passing back through it.
+    ``forward`` returns a tensor built from ``x`` and remembers ``scale``;
+    ``backward`` multiplies the incoming gradient by that ``scale``.
+    """
+    @staticmethod
+    def forward(ctx, x, scale):
+        # Keep the factor on the context so backward() can read it.
+        ctx.scale = scale
+        # NOTE(review): ``x.new(x)`` is the legacy tensor constructor; presumably the
+        # result carries the same values as ``x`` -- confirm before modernising this line.
+        res = x.new(x)
+        return res
+    @staticmethod
+    def backward(ctx, grad):
+        # One return value per forward() input: the scaled gradient for ``x`` and
+        # ``None`` for ``scale``.
+        return grad * ctx.scale, None

vencoder/dphubert/hardconcrete.py ADDED Viewed

	@@ -0,0 +1,122 @@

+"""Implementation of the hard Concrete distribution.
+Originally from:
+https://github.com/asappresearch/flop/blob/master/flop/hardconcrete.py
+"""
+import math
+import torch
+import torch.nn as nn
+class HardConcrete(nn.Module):
+    """A HardConcrete module.
+    Use this module to create a mask of size N, which you can
+    then use to perform L0 regularization.
+    To obtain a mask, simply run a forward pass through the module
+    with no input data. The mask is sampled in training mode, and
+    fixed during evaluation mode, e.g.:
+    >>> module = HardConcrete(n_in=100)
+    >>> mask = module()
+    >>> norm = module.l0_norm()
+    """
+    def __init__(
+        self,
+        n_in: int,
+        init_mean: float = 0.5,
+        init_std: float = 0.01,
+        temperature: float = 2/3,     # from CoFi
+        stretch: float = 0.1,
+        eps: float = 1e-6
+    ) -> None:
+        """Initialize the HardConcrete module.
+        Parameters
+        ----------
+        n_in : int
+            The number of hard concrete variables in this mask.
+        init_mean : float, optional
+            Initial drop rate for hard concrete parameter,
+            by default 0.5.
+        init_std: float, optional
+            Standard deviation of the normal distribution used to
+            initialize the hard concrete parameters, by default 0.01.
+        temperature : float, optional
+            Temperature used to control the sharpness of the
+            distribution, by default 2/3.
+        stretch : float, optional
+            Stretch the sampled value from [0, 1] to the interval
+            [-stretch, 1 + stretch], by default 0.1.
+        eps : float, optional
+            Uniform noise is drawn from [eps, 1 - eps] when sampling,
+            by default 1e-6.
+        """
+        super().__init__()
+        self.n_in = n_in
+        # Bounds of the stretched interval [-stretch, 1 + stretch].
+        self.limit_l = -stretch
+        self.limit_r = 1.0 + stretch
+        # The only learnable parameter; given its real initial values by reset_parameters().
+        self.log_alpha = nn.Parameter(torch.zeros(n_in))
+        self.beta = temperature
+        self.init_mean = init_mean
+        self.init_std = init_std
+        # Constant offset added to log_alpha inside l0_norm().
+        self.bias = -self.beta * math.log(-self.limit_l / self.limit_r)
+        self.eps = eps
+        # Cache for the evaluation-mode mask; None means "not computed yet".
+        self.compiled_mask = None
+        self.reset_parameters()
+    def reset_parameters(self):
+        """Reset the parameters of this module.
+        Drops the cached evaluation mask and redraws ``log_alpha`` from a normal
+        distribution with mean ``log(1 - init_mean) - log(init_mean)`` and standard
+        deviation ``init_std``.
+        """
+        self.compiled_mask = None
+        mean = math.log(1 - self.init_mean) - math.log(self.init_mean)
+        self.log_alpha.data.normal_(mean, self.init_std)
+    def l0_norm(self) -> torch.Tensor:
+        """Compute the expected L0 norm of this mask.
+        Returns
+        -------
+        torch.Tensor
+            The expected L0 norm.
+        """
+        return (self.log_alpha + self.bias).sigmoid().sum()
+    def forward(self) -> torch.Tensor:
+        """Return a hard concrete mask.
+        In training mode a fresh mask is sampled on every call. In evaluation
+        mode the mask is computed once from ``log_alpha`` and then served from
+        the ``compiled_mask`` cache.
+        Returns
+        -------
+        torch.Tensor
+            A mask of ``n_in`` values in [0, 1].
+        """
+        if self.training:
+            # Reset the compiled mask
+            self.compiled_mask = None
+            # Sample mask dynamically
+            u = self.log_alpha.new(self.n_in).uniform_(self.eps, 1 - self.eps)
+            s = torch.sigmoid((torch.log(u / (1 - u)) + self.log_alpha) / self.beta)
+            # Stretch to [limit_l, limit_r], then clip back into [0, 1].
+            s = s * (self.limit_r - self.limit_l) + self.limit_l
+            mask = s.clamp(min=0., max=1.)
+        else:
+            # Compile new mask if not cached
+            if self.compiled_mask is None:
+                # Get expected sparsity
+                expected_num_zeros = self.n_in - self.l0_norm().item()
+                num_zeros = round(expected_num_zeros)
+                # Approximate expected value of each mask variable z;
+                # We use an empirically validated magic number 0.8
+                soft_mask = torch.sigmoid(self.log_alpha / self.beta * 0.8)
+                # Prune small values to set to 0
+                _, indices = torch.topk(soft_mask, k=num_zeros, largest=False)
+                soft_mask[indices] = 0.
+                # The cache is only cleared by a training-mode forward() or by
+                # reset_parameters().
+                self.compiled_mask = soft_mask
+            mask = self.compiled_mask
+        return mask
+    def extra_repr(self) -> str:
+        """Return ``n_in`` as a string."""
+        return str(self.n_in)
+    def __repr__(self) -> str:
+        """Return ``ClassName(n_in)``."""
+        return "{}({})".format(self.__class__.__name__, self.extra_repr())

vencoder/dphubert/model.py ADDED Viewed

	@@ -0,0 +1,966 @@

+"""Speech SSL models supporting pruning.
+Originally from:
+https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/model.py
+"""
+import math
+from typing import List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn import Module
+from . import components
+class Wav2Vec2Model(Module):
+    """Acoustic model used in *wav2vec 2.0* :cite:`baevski2020wav2vec`.
+    Note:
+        To build the model, please use one of the factory functions.
+        :py:func:`wav2vec2_model`, :py:func:`wav2vec2_base`, :py:func:`wav2vec2_large`,
+        :py:func:`wav2vec2_large_lv60k`, :py:func:`hubert_base`, :py:func:`hubert_large`,
+        and :py:func:`hubert_xlarge`.
+    See Also:
+        * :class:`torchaudio.pipelines.Wav2Vec2Bundle`: Pretrained models (without fine-tuning)
+        * :class:`torchaudio.pipelines.Wav2Vec2ASRBundle`: ASR pipelines with pretrained models.
+    Args:
+        feature_extractor (torch.nn.Module):
+            Feature extractor that extracts feature vectors from raw audio Tensor.
+        encoder (torch.nn.Module):
+            Encoder that converts the audio features into the sequence of probability
+            distribution (in negative log-likelihood) over labels.
+        aux (torch.nn.Module or None, optional):
+            Auxiliary module. If provided, the output from encoder is passed to this module.
+    """  # noqa: E501
+    def __init__(
+        self,
+        normalize_waveform: bool,
+        feature_extractor: Module,
+        encoder: Module,
+        aux: Optional[Module] = None,
+    ):
+        super().__init__()
+        self.normalize_waveform = normalize_waveform
+        self.feature_extractor = feature_extractor
+        self.encoder = encoder
+        self.aux = aux
+    @torch.jit.export
+    def extract_features(
+        self,
+        waveforms: Tensor,
+        lengths: Optional[Tensor] = None,
+        num_layers: Optional[int] = None,
+    ) -> Tuple[List[Tensor], Optional[Tensor]]:
+        """Extract feature vectors from raw waveforms
+        This returns the list of outputs from the intermediate layers of
+        transformer block in encoder.
+        Args:
+            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
+            lengths (Tensor or None, optional):
+                Indicates the valid length of each audio in the batch.
+                Shape: `(batch, )`.
+                When the ``waveforms`` contains audios with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths and apply proper mask in
+                transformer attention layer.
+                If ``None``, it is assumed that the entire audio waveform
+                length is valid.
+            num_layers (int or None, optional):
+                If given, limit the number of intermediate layers to go through.
+                Providing `1` will stop the computation after going through one
+                intermediate layers. If not given, the outputs from all the
+                intermediate layers are returned.
+        Returns:
+            (List[Tensor], Optional[Tensor]):
+            List of Tensors
+                Features from requested layers.
+                Each Tensor is of shape: `(batch, time frame, feature dimension)`
+            Tensor or None
+                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
+                is returned.
+                It indicates the valid length in time axis of each feature Tensor.
+        """
+        if self.normalize_waveform:
+            if lengths is not None:
+                waveforms = [
+                    F.layer_norm(wave[:length], (length,)) for wave, length in zip(waveforms, lengths)
+                ]
+                waveforms = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)
+            else:
+                waveforms = F.layer_norm(waveforms, waveforms.shape[-1:])
+        x, lengths = self.feature_extractor(waveforms, lengths)
+        x = self.encoder.extract_features(x, lengths, num_layers)   # (num_layers+1,), including the input
+        return x, lengths
+    def get_num_params(self):
+        """Calculate the current size."""
+        feature_extractor_size, encoder_in_features = self.feature_extractor.get_num_params_and_final_out_channels()
+        encoder_size = self.encoder.get_num_params(encoder_in_features)
+        return feature_extractor_size + encoder_size
+    def prune(self):
+        self.eval()     # must be in eval mode
+        conv_config, conv_out_index = self.feature_extractor.prune()    # [(output_channel, kernel_size, stride), ...]
+        transformer_config = self.encoder.prune(conv_out_index)     # NOTE: this is a defaultdict(list)
+        use_attention = transformer_config["use_attention"]
+        use_feed_forward = transformer_config["use_feed_forward"]
+        num_heads = transformer_config["num_heads"]     # can be []
+        remaining_heads = transformer_config["remaining_heads"]     # can be []
+        ff_interm_features = transformer_config["ff_interm_features"]
+        return conv_config, use_attention, use_feed_forward, num_heads, remaining_heads, ff_interm_features
+    def forward(
+        self,
+        waveforms: Tensor,
+        lengths: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Compute the sequence of probability distribution over labels.
+        Args:
+            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
+            lengths (Tensor or None, optional):
+                Indicates the valid length of each audio in the batch.
+                Shape: `(batch, )`.
+                When the ``waveforms`` contains audios with different durations,
+                by providing ``lengths`` argument, the model will compute
+                the corresponding valid output lengths and apply proper mask in
+                transformer attention layer.
+                If ``None``, it is assumed that all the audio in ``waveforms``
+                have valid length. Default: ``None``.
+        Returns:
+            (Tensor, Optional[Tensor]):
+            Tensor
+                The sequences of probability distribution (in logit) over labels.
+                Shape: `(batch, frames, num labels)`.
+            Tensor or None
+                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
+                is returned.
+                It indicates the valid length in time axis of the output Tensor.
+        """
+        if self.normalize_waveform:
+            if lengths is not None:
+                waveforms = [
+                    F.layer_norm(wave[:length], (length,)) for wave, length in zip(waveforms, lengths)
+                ]
+                waveforms = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)
+            else:
+                waveforms = F.layer_norm(waveforms, waveforms.shape[-1:])
+        x, lengths = self.feature_extractor(waveforms, lengths)
+        x = self.encoder(x, lengths)
+        if self.aux is not None:
+            x = self.aux(x)
+        return x, lengths
+def wav2vec2_model(**configs) -> Wav2Vec2Model:
+    """Wraps the original wav2vec2_model and wavlm_model."""
+    if "encoder_remaining_heads" in configs:
+        return wavlm_model(**configs)
+    return wav2vec2_model_original(**configs)
+def wav2vec2_model_original(
+    extractor_mode: str,
+    extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
+    extractor_conv_bias: bool,
+    encoder_embed_dim: int,
+    encoder_projection_dropout: float,
+    encoder_pos_conv_kernel: int,
+    encoder_pos_conv_groups: int,
+    encoder_num_layers: int,
+    encoder_use_attention: List[bool],
+    encoder_use_feed_forward: List[bool],
+    encoder_num_heads: List[int],
+    encoder_head_dim: int,
+    encoder_attention_dropout: float,
+    encoder_ff_interm_features: List[int],
+    encoder_ff_interm_dropout: float,
+    encoder_dropout: float,
+    encoder_layer_norm_first: bool,
+    encoder_layer_drop: float,
+    aux_num_out: Optional[int],
+    normalize_waveform: bool,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds custom :class:`~torchaudio.models.Wav2Vec2Model`.
+    Note:
+        The "feature extractor" below corresponds to
+        `ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__
+        in the original ``fairseq`` implementation.
+        This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0*
+        :cite:`baevski2020wav2vec` paper.
+        The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__,
+        and this is referred as "Transformer" in the paper.
+    Args:
+        extractor_mode (str): Operation mode of feature extractor.
+            Valid values are ``"group_norm"`` or ``"layer_norm"``.
+            If ``"group_norm"``, then a single normalization is applied
+            in the first convolution block. Otherwise, all the convolution
+            blocks will have layer normalization.
+            This option corresponds to ``extractor_mode`` from ``fairseq``.
+        extractor_conv_layer_config (list of integer tuples or None):
+            Configuration of convolution layers in feature extractor.
+            List of convolution configuration,
+            i.e. ``[(output_channel, kernel_size, stride), ...]``
+            If ``None`` is provided, then the following default value is used.
+            .. code-block:: python
+               [
+                 (512, 10, 5),
+                 (512, 3, 2),
+                 (512, 3, 2),
+                 (512, 3, 2),
+                 (512, 3, 2),
+                 (512, 2, 2),
+                 (512, 2, 2),
+               ]
+            This option corresponds to ``conv_feature_layers`` from ``fairseq``.
+        extractor_conv_bias (bool):
+            Whether to include bias term to each convolution operation.
+            This option corresponds to ``conv_bias`` from ``fairseq``.
+        encoder_embed_dim (int):
+            The dimension of embedding in encoder.
+            This option corresponds to ``encoder_embed_dim`` from ``fairseq``.
+        encoder_projection_dropout (float):
+            The dropout probability applied after the input feature is projected
+            to ``encoder_embed_dim``.
+            This option corresponds to ``dropout_input`` from ``fairseq``.
+        encoder_pos_conv_kernel (int):
+            The kernel size of convolutional positional embeddings.
+            This option corresponds to ``conv_pos`` from ``fairseq``.
+        encoder_pos_conv_groups (int):
+            The number of groups of convolutional positional embeddings.
+            This option corresponds to ``conv_pos_groups`` from ``fairseq``.
+        encoder_num_layers (int):
+            The number of self attention layers in transformer block.
+            This option corresponds to ``encoder_layers`` from ``fairseq``.
+        encoder_num_heads (int):
+            The number of heads in self attention layers.
+            This option corresponds to ``encoder_attention_heads`` from ``fairseq``.
+        encoder_attention_dropout (float):
+            The dropout probability applied after softmax in self-attention layer.
+            This option corresponds to ``attention_dropout`` from ``fairseq``.
+        encoder_ff_interm_features (int):
+            The dimension of hidden features in feed forward layer.
+            This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``.
+        encoder_ff_interm_dropout (float):
+            The dropout probability applied in feedforward layer.
+            This option correspinds to ``activation_dropout`` from ``fairseq``.
+        encoder_dropout (float):
+            The dropout probability applied at the end of feed forward layer.
+            This option corresponds to ``dropout`` from ``fairseq``.
+        encoder_layer_norm_first (bool):
+            Control the order of layer norm in transformer layer and each encoder layer.
+            If True, in transformer layer, layer norm is applied before features are fed
+            to encoder layers. In encoder layer, two layer norms are applied before and after
+            self attention.
+            If False, in transformer layer, layer norm is applied after features are fed
+            to encoder layers. In encoder layer, two layer norms are applied after self
+            attention, before and after feed forward.
+            This option corresponds to ``layer_norm_first`` from ``fairseq``.
+        encoder_layer_drop (float):
+            Probability to drop each encoder layer during training.
+            This option corresponds to ``layerdrop`` from ``fairseq``.
+        aux_num_out (int or None):
+            When provided, attach an extra linear layer on top of encoder, which can be
+            used for fine-tuning.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    if extractor_conv_layer_config is None:
+        extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
+    feature_extractor = components._get_feature_extractor(
+        extractor_mode, extractor_conv_layer_config, extractor_conv_bias,
+        prune_conv_channels=extractor_prune_conv_channels,
+    )
+    encoder = components._get_encoder(
+        in_features=extractor_conv_layer_config[-1][0],
+        embed_dim=encoder_embed_dim,
+        dropout_input=encoder_projection_dropout,
+        pos_conv_kernel=encoder_pos_conv_kernel,
+        pos_conv_groups=encoder_pos_conv_groups,
+        num_layers=encoder_num_layers,
+        use_attention=encoder_use_attention,
+        use_feed_forward=encoder_use_feed_forward,
+        num_heads=encoder_num_heads,
+        head_dim=encoder_head_dim,
+        attention_dropout=encoder_attention_dropout,
+        ff_interm_features=encoder_ff_interm_features,
+        ff_interm_dropout=encoder_ff_interm_dropout,
+        dropout=encoder_dropout,
+        layer_norm_first=encoder_layer_norm_first,
+        layer_drop=encoder_layer_drop,
+        prune_attention_heads=encoder_prune_attention_heads,
+        prune_attention_layer=encoder_prune_attention_layer,
+        prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+    aux = None
+    if aux_num_out is not None:
+        aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
+    return Wav2Vec2Model(normalize_waveform, feature_extractor, encoder, aux)
+def wav2vec2_base(
+    encoder_projection_dropout: float = 0.1,
+    encoder_attention_dropout: float = 0.1,
+    encoder_ff_interm_dropout: float = 0.1,
+    encoder_dropout: float = 0.1,
+    encoder_layer_drop: float = 0.1,
+    aux_num_out: Optional[int] = None,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds "base" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    return wav2vec2_model(
+        extractor_mode="group_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=768,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=12,
+        encoder_num_heads=12,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=3072,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=False,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+        extractor_prune_conv_channels=extractor_prune_conv_channels,
+        encoder_prune_attention_heads=encoder_prune_attention_heads,
+        encoder_prune_attention_layer=encoder_prune_attention_layer,
+        encoder_prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        encoder_prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+def wav2vec2_large(
+    encoder_projection_dropout: float = 0.1,
+    encoder_attention_dropout: float = 0.1,
+    encoder_ff_interm_dropout: float = 0.1,
+    encoder_dropout: float = 0.1,
+    encoder_layer_drop: float = 0.1,
+    aux_num_out: Optional[int] = None,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds "large" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    return wav2vec2_model(
+        extractor_mode="group_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=1024,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=24,
+        encoder_num_heads=16,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=4096,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=False,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+        extractor_prune_conv_channels=extractor_prune_conv_channels,
+        encoder_prune_attention_heads=encoder_prune_attention_heads,
+        encoder_prune_attention_layer=encoder_prune_attention_layer,
+        encoder_prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        encoder_prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+def wav2vec2_large_lv60k(
+    encoder_projection_dropout: float = 0.1,
+    encoder_attention_dropout: float = 0.0,
+    encoder_ff_interm_dropout: float = 0.1,
+    encoder_dropout: float = 0.0,
+    encoder_layer_drop: float = 0.1,
+    aux_num_out: Optional[int] = None,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds "large lv-60k" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    return wav2vec2_model(
+        extractor_mode="layer_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=True,
+        encoder_embed_dim=1024,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=24,
+        encoder_num_heads=16,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=4096,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=True,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+        extractor_prune_conv_channels=extractor_prune_conv_channels,
+        encoder_prune_attention_heads=encoder_prune_attention_heads,
+        encoder_prune_attention_layer=encoder_prune_attention_layer,
+        encoder_prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        encoder_prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+def hubert_base(
+    encoder_projection_dropout: float = 0.1,
+    encoder_attention_dropout: float = 0.1,
+    encoder_ff_interm_dropout: float = 0.0,
+    encoder_dropout: float = 0.1,
+    encoder_layer_drop: float = 0.05,
+    aux_num_out: Optional[int] = None,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds "base" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    return wav2vec2_model(
+        extractor_mode="group_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=768,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=12,
+        encoder_use_attention=[True] * 12,
+        encoder_use_feed_forward=[True] * 12,
+        encoder_num_heads=[12] * 12,
+        encoder_head_dim=64,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=[3072] * 12,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=False,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+        extractor_prune_conv_channels=extractor_prune_conv_channels,
+        encoder_prune_attention_heads=encoder_prune_attention_heads,
+        encoder_prune_attention_layer=encoder_prune_attention_layer,
+        encoder_prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        encoder_prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+def hubert_large(
+    encoder_projection_dropout: float = 0.0,
+    encoder_attention_dropout: float = 0.0,
+    encoder_ff_interm_dropout: float = 0.0,
+    encoder_dropout: float = 0.0,
+    encoder_layer_drop: float = 0.0,
+    aux_num_out: Optional[int] = None,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds "large" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    return wav2vec2_model(
+        extractor_mode="layer_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=1024,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=24,
+        encoder_num_heads=16,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=4096,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=True,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+        extractor_prune_conv_channels=extractor_prune_conv_channels,
+        encoder_prune_attention_heads=encoder_prune_attention_heads,
+        encoder_prune_attention_layer=encoder_prune_attention_layer,
+        encoder_prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        encoder_prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+def hubert_xlarge(
+    encoder_projection_dropout: float = 0.0,
+    encoder_attention_dropout: float = 0.0,
+    encoder_ff_interm_dropout: float = 0.0,
+    encoder_dropout: float = 0.0,
+    encoder_layer_drop: float = 0.0,
+    aux_num_out: Optional[int] = None,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds "extra large" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """  # noqa: E501
+    return wav2vec2_model(
+        extractor_mode="layer_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=1280,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=48,
+        encoder_num_heads=16,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=5120,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=True,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+        extractor_prune_conv_channels=extractor_prune_conv_channels,
+        encoder_prune_attention_heads=encoder_prune_attention_heads,
+        encoder_prune_attention_layer=encoder_prune_attention_layer,
+        encoder_prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        encoder_prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+def _init_hubert_pretrain_model(module):
+    if isinstance(module, components.LayerNorm):
+        torch.nn.init.kaiming_normal_(module.conv.weight)
+    elif isinstance(module, components.ConvolutionalPositionalEmbedding):
+        # normalize the weight to normal distribution.
+        std = math.sqrt(4.0 / (module.embed_dim * module.kernel_size))
+        torch.nn.init.normal_(module.conv.weight, mean=0.0, std=std)
+        torch.nn.init.constant_(module.conv.bias, 0.0)
+    elif isinstance(module, components.SelfAttention):
+        # normalize the query, key, value, and out_proj parameters in self attention module.
+        torch.nn.init.xavier_uniform_(module.k_proj.weight, gain=1 / math.sqrt(2))
+        torch.nn.init.xavier_uniform_(module.v_proj.weight, gain=1 / math.sqrt(2))
+        torch.nn.init.xavier_uniform_(module.q_proj.weight, gain=1 / math.sqrt(2))
+        torch.nn.init.xavier_uniform_(module.out_proj.weight)
+        torch.nn.init.constant_(module.out_proj.bias, 0.0)
+    elif isinstance(module, components.Transformer):
+        module.apply(components._init_transformer_params)
+    else:
+        pass
+def wavlm_model(
+    extractor_mode: str,
+    extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
+    extractor_conv_bias: bool,
+    encoder_embed_dim: int,
+    encoder_projection_dropout: float,
+    encoder_pos_conv_kernel: int,
+    encoder_pos_conv_groups: int,
+    encoder_num_layers: int,
+    encoder_use_attention: List[bool],
+    encoder_use_feed_forward: List[bool],
+    encoder_total_num_heads: List[int],
+    encoder_remaining_heads: List[List[int]],
+    encoder_num_buckets: int,
+    encoder_max_distance: int,
+    encoder_attention_dropout: float,
+    encoder_ff_interm_features: List[int],
+    encoder_ff_interm_dropout: float,
+    encoder_dropout: float,
+    encoder_layer_norm_first: bool,
+    encoder_layer_drop: float,
+    aux_num_out: Optional[int],
+    normalize_waveform: bool,
+    extractor_prune_conv_channels: bool = False,
+    encoder_prune_attention_heads: bool = False,
+    encoder_prune_attention_layer: bool = False,
+    encoder_prune_feed_forward_intermediate: bool = False,
+    encoder_prune_feed_forward_layer: bool = False,
+) -> Wav2Vec2Model:
+    """Builds custom WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
+    with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output object is
+    :class:`~torchaudio.models.Wav2Vec2Model`. Most of the arguments have the same meaning
+    as in :py:func:`wav2vec2_model` so please refer there for documentation.
+    Args:
+        extractor_mode (str): Operation mode of feature extractor.
+            See :py:func:`wav2vec2_model`.
+        extractor_conv_layer_config (list of integer tuples or None):
+            See :py:func:`wav2vec2_model`.
+        extractor_conv_bias (bool):
+            See :py:func:`wav2vec2_model`.
+        encoder_embed_dim (int):
+            See :py:func:`wav2vec2_model`.
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_pos_conv_kernel (int):
+            See :py:func:`wav2vec2_model`.
+        encoder_pos_conv_groups (int):
+            See :py:func:`wav2vec2_model`.
+        encoder_num_layers (int):
+            See :py:func:`wav2vec2_model`.
+        encoder_num_heads (int):
+            See :py:func:`wav2vec2_model`.
+        encoder_num_buckets (int):
+            Number of buckets for relative position embedding.
+        encoder_max_distance (int):
+            Maximum distance for relative position embedding.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_features (int):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_norm_first (bool):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int or None):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """
+    if extractor_conv_layer_config is None:
+        extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2
+    feature_extractor = components._get_feature_extractor(
+        extractor_mode, extractor_conv_layer_config, extractor_conv_bias,
+        prune_conv_channels=extractor_prune_conv_channels,
+    )
+    encoder = components._get_wavlm_encoder(
+        in_features=extractor_conv_layer_config[-1][0],
+        embed_dim=encoder_embed_dim,
+        dropout_input=encoder_projection_dropout,
+        pos_conv_kernel=encoder_pos_conv_kernel,
+        pos_conv_groups=encoder_pos_conv_groups,
+        num_layers=encoder_num_layers,
+        use_attention=encoder_use_attention,
+        use_feed_forward=encoder_use_feed_forward,
+        total_num_heads=encoder_total_num_heads,
+        remaining_heads=encoder_remaining_heads,
+        num_buckets=encoder_num_buckets,
+        max_distance=encoder_max_distance,
+        attention_dropout=encoder_attention_dropout,
+        ff_interm_features=encoder_ff_interm_features,
+        ff_interm_dropout=encoder_ff_interm_dropout,
+        dropout=encoder_dropout,
+        layer_norm_first=encoder_layer_norm_first,
+        layer_drop=encoder_layer_drop,
+        prune_attention_heads=encoder_prune_attention_heads,
+        prune_attention_layer=encoder_prune_attention_layer,
+        prune_feed_forward_intermediate=encoder_prune_feed_forward_intermediate,
+        prune_feed_forward_layer=encoder_prune_feed_forward_layer,
+    )
+    aux = None
+    if aux_num_out is not None:
+        aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
+    return Wav2Vec2Model(normalize_waveform, feature_extractor, encoder, aux)
+def wavlm_base(
+    encoder_projection_dropout: float = 0.1,
+    encoder_attention_dropout: float = 0.1,
+    encoder_ff_interm_dropout: float = 0.1,
+    encoder_dropout: float = 0.1,
+    encoder_layer_drop: float = 0.1,
+    aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+    """Builds "base" WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
+    with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+    :class:`~torchaudio.models.Wav2Vec2Model`.
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """
+    return wavlm_model(
+        extractor_mode="group_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=768,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=12,
+        encoder_num_heads=12,
+        encoder_num_buckets=320,
+        encoder_max_distance=800,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=3072,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=False,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+    )
+def wavlm_large(
+    encoder_projection_dropout: float = 0.1,
+    encoder_attention_dropout: float = 0.1,
+    encoder_ff_interm_dropout: float = 0.0,
+    encoder_dropout: float = 0.1,
+    encoder_layer_drop: float = 0.1,
+    aux_num_out: Optional[int] = None,
+) -> Wav2Vec2Model:
+    """Builds "large" WaveLM model :cite:`chen2022wavlm`. The architecture is compatible
+    with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output class is
+    :class:`~torchaudio.models.Wav2Vec2Model`.
+    Args:
+        encoder_projection_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_attention_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_ff_interm_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_dropout (float):
+            See :py:func:`wav2vec2_model`.
+        encoder_layer_drop (float):
+            See :py:func:`wav2vec2_model`.
+        aux_num_out (int, optional):
+            See :py:func:`wav2vec2_model`.
+    Returns:
+        Wav2Vec2Model:
+            The resulting model.
+    """
+    return wavlm_model(
+        extractor_mode="layer_norm",
+        extractor_conv_layer_config=None,
+        extractor_conv_bias=False,
+        encoder_embed_dim=1024,
+        encoder_projection_dropout=encoder_projection_dropout,
+        encoder_pos_conv_kernel=128,
+        encoder_pos_conv_groups=16,
+        encoder_num_layers=24,
+        encoder_num_heads=16,
+        encoder_num_buckets=320,
+        encoder_max_distance=800,
+        encoder_attention_dropout=encoder_attention_dropout,
+        encoder_ff_interm_features=4096,
+        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
+        encoder_dropout=encoder_dropout,
+        encoder_layer_norm_first=True,
+        encoder_layer_drop=encoder_layer_drop,
+        aux_num_out=aux_num_out,
+    )

vencoder/dphubert/pruning_utils.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""Utility functions for pruning."""
+from typing import Union
+import torch
+import torch.nn as nn
+def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: str):
+    "Prune linear layer in place."
+    # NOTE: weight: (out_features, in_features), bias: (out_features,)
+    if dim == "input":
+        dim = 1
+        layer.in_features = len(index)
+    elif dim == "output":
+        dim = 0
+        layer.out_features = len(index)
+    else:
+        raise ValueError
+    layer.weight = nn.Parameter(layer.weight.index_select(dim, index).clone().detach())
+    if layer.bias is not None and dim == 0:
+        layer.bias = nn.Parameter(layer.bias.index_select(0, index).clone().detach())
+def prune_conv1d_layer(layer: nn.Conv1d, index: torch.LongTensor, dim: str):
+    """Prune conv1d in place."""
+    # NOTE: weight: (out_channels, in_channels, kernel_size), bias: (out_channels,)
+    if dim == "input":
+        dim = 1
+        layer.in_channels = len(index)
+    elif dim == "output":
+        dim = 0
+        layer.out_channels = len(index)
+    else:
+        raise ValueError
+    layer.weight = nn.Parameter(layer.weight.index_select(dim, index).clone().detach())
+    if layer.bias is not None and dim == 0:
+        layer.bias = nn.Parameter(layer.bias.index_select(0, index).clone().detach())
+def prune_layer_norm(layernorm: Union[nn.LayerNorm, nn.GroupNorm], index: torch.LongTensor):
+    """Prune layer norm or group norm in place."""
+    layernorm.weight = nn.Parameter(layernorm.weight.index_select(0, index).clone().detach())
+    layernorm.bias = nn.Parameter(layernorm.bias.index_select(0, index).clone().detach())
+    if isinstance(layernorm, nn.LayerNorm):
+        layernorm.normalized_shape = (len(index),)
+    elif isinstance(layernorm, nn.GroupNorm):
+        layernorm.num_groups = len(index)
+        layernorm.num_channels = len(index)

vencoder/dphubert/utils/__init__.py ADDED Viewed

File without changes

vencoder/dphubert/utils/import_huggingface_wavlm.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""Import Hugging Face transformers's wav2vec2.0 pretrained weights to torchaudios's format.
+Originally from:
+https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/utils/import_huggingface.py
+"""
+import logging
+from typing import Any, Dict
+from torch.nn import Module
+from ..model import wav2vec2_model, Wav2Vec2Model, wavlm_model
+_LG = logging.getLogger(__name__)
+def _get_config(cfg):
+    config = {
+        "extractor_mode": f"{cfg.feat_extract_norm}_norm",
+        "extractor_conv_layer_config": list(zip(cfg.conv_dim, cfg.conv_kernel, cfg.conv_stride)),
+        "extractor_conv_bias": cfg.conv_bias,
+        "encoder_embed_dim": cfg.hidden_size,
+        "encoder_projection_dropout": cfg.feat_proj_dropout,
+        "encoder_pos_conv_kernel": cfg.num_conv_pos_embeddings,
+        "encoder_pos_conv_groups": cfg.num_conv_pos_embedding_groups,
+        "encoder_num_layers": cfg.num_hidden_layers,
+        "encoder_num_heads": cfg.num_attention_heads,
+        "encoder_attention_dropout": cfg.attention_dropout,
+        "encoder_ff_interm_features": cfg.intermediate_size,
+        "encoder_ff_interm_dropout": cfg.activation_dropout,
+        "encoder_dropout": cfg.hidden_dropout,
+        "encoder_layer_norm_first": cfg.do_stable_layer_norm,
+        "encoder_layer_drop": cfg.layerdrop,
+    }
+    return config
+def _get_config_wavlm(cfg):
+    config = {
+        "extractor_mode": f"{cfg.feat_extract_norm}_norm",
+        "extractor_conv_layer_config": list(zip(cfg.conv_dim, cfg.conv_kernel, cfg.conv_stride)),
+        "extractor_conv_bias": cfg.conv_bias,
+        "encoder_embed_dim": cfg.hidden_size,
+        "encoder_projection_dropout": cfg.feat_proj_dropout,
+        "encoder_pos_conv_kernel": cfg.num_conv_pos_embeddings,
+        "encoder_pos_conv_groups": cfg.num_conv_pos_embedding_groups,
+        "encoder_num_layers": cfg.num_hidden_layers,
+        "encoder_use_attention": [True] * cfg.num_hidden_layers,
+        "encoder_use_feed_forward": [True] * cfg.num_hidden_layers,
+        "encoder_total_num_heads": [cfg.num_attention_heads for _ in range(cfg.num_hidden_layers)],
+        "encoder_remaining_heads": [list(range(cfg.num_attention_heads)) for _ in range(cfg.num_hidden_layers)],
+        "encoder_num_buckets": cfg.num_buckets,
+        "encoder_max_distance": cfg.max_bucket_distance,
+        "encoder_attention_dropout": cfg.attention_dropout,
+        "encoder_ff_interm_features": [cfg.intermediate_size for _ in range(cfg.num_hidden_layers)],
+        "encoder_ff_interm_dropout": cfg.activation_dropout,
+        "encoder_dropout": cfg.hidden_dropout,
+        "encoder_layer_norm_first": cfg.do_stable_layer_norm,
+        "encoder_layer_drop": cfg.layerdrop,
+        "normalize_waveform": cfg.feat_extract_norm == "layer",
+    }
+    return config
+def _build(config, original):
+    is_for_ctc = original.__class__.__name__ in ["Wav2Vec2ForCTC", "WavLMForCTC"]
+    if is_for_ctc:
+        aux_num_out = original.config.vocab_size
+        wav2vec2 = original.wav2vec2
+    else:
+        _LG.warning(
+            "The model is not an instance of Wav2Vec2ForCTC or WavLMForCTC. " '"lm_head" module is not imported.'
+        )
+        aux_num_out = None
+        wav2vec2 = original
+    is_wavlm = original.__class__.__name__ in ["WavLMModel", "WavLMForCTC"]
+    if is_wavlm:
+        imported = wavlm_model(**config, aux_num_out=aux_num_out)
+    else:
+        imported = wav2vec2_model(**config, aux_num_out=aux_num_out)
+    print(imported.feature_extractor.load_state_dict(wav2vec2.feature_extractor.state_dict(), strict=False))
+    print(imported.encoder.feature_projection.load_state_dict(wav2vec2.feature_projection.state_dict(), strict=False))
+    encoder_state_dict = wav2vec2.encoder.state_dict()
+    if is_wavlm:  # Rename paramaters of linear transformations for compatibility with the HF model
+        transform_wavlm_encoder_state(encoder_state_dict, config["encoder_num_layers"])
+    print(imported.encoder.transformer.load_state_dict(encoder_state_dict, strict=False))
+    if is_for_ctc:
+        imported.aux.load_state_dict(original.lm_head.state_dict())
+    return imported
+def transform_wavlm_encoder_state(state: Dict[str, Any], encoder_num_layers: int):
+    """Converts WavLM encoder state from HuggingFace format. In particular, concatenates linear projection weights and
+    biases to align with the structure of ``torch.nn.MultiheadAttention``.
+    """
+    pass
+def import_huggingface_model(original: Module) -> Wav2Vec2Model:
+    """Builds :class:`Wav2Vec2Model` from the corresponding model object of
+    `Transformers <https://huggingface.co/transformers/>`_.
+    Args:
+        original (torch.nn.Module): An instance of ``Wav2Vec2ForCTC`` from ``transformers``.
+    Returns:
+        Wav2Vec2Model: Imported model.
+    Example
+        >>> from torchaudio.models.wav2vec2.utils import import_huggingface_model
+        >>>
+        >>> original = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+        >>> model = import_huggingface_model(original)
+        >>>
+        >>> waveforms, _ = torchaudio.load("audio.wav")
+        >>> logits, _ = model(waveforms)
+    """
+    _LG.info("Importing model.")
+    _LG.info("Loading model configuration.")
+    is_wavlm = original.__class__.__name__ in ["WavLMModel", "WavLMForCTC"]
+    if is_wavlm:
+        config = _get_config_wavlm(original.config)
+    else:
+        config = _get_config(original.config)
+    _LG.debug("  - config: %s", config)
+    _LG.info("Building model.")
+    imported = _build(config, original)
+    return imported

vencoder/encoder.py ADDED Viewed

	@@ -0,0 +1,12 @@

+class SpeechEncoder(object):
+    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+        self.model = None  #This is Model
+        self.hidden_dim = 768
+        pass
+    def encoder(self,wav):
+        '''
+        input: wav:[batchsize,signal_length]
+        output: embedding:[batchsize,hidden_dim,wav_frame]
+        '''
+        pass

vencoder/hubert/__init__.py ADDED Viewed

File without changes

vencoder/hubert/hubert_model.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import copy
+import random
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as t_func
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+class Hubert(nn.Module):
+    """HuBERT-style network: convolutional feature extractor, 768-wide transformer encoder
+    (12 layers, 12 heads) and a 256-wide projection that is compared against label embeddings.
+    Args:
+        num_label_embeddings: number of rows in the label embedding table.
+        mask: whether span masking is applied while the module is in training mode.
+    """
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+        super().__init__()
+        self._mask = mask
+        self.feature_extractor = FeatureExtractor()
+        self.feature_projection = FeatureProjection()
+        self.positional_embedding = PositionalConvEmbedding()
+        self.norm = nn.LayerNorm(768)
+        self.dropout = nn.Dropout(0.1)
+        self.encoder = TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                768, 12, 3072, activation="gelu", batch_first=True
+            ),
+            12,
+        )
+        self.proj = nn.Linear(768, 256)
+        # Learned vector written over masked positions (see `mask`).
+        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Overwrite random spans of ``x`` with ``masked_spec_embed`` when training with masking on.
+        ``x`` is modified in place. The returned mask is ``None`` when no masking is applied.
+        """
+        mask = None
+        if self.training and self._mask:
+            # mask_prob=0.8, span length 10, at least 2 spans per sequence
+            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+            x[mask] = self.masked_spec_embed.to(x.dtype)
+        return x, mask
+    def encode(
+            self, x: torch.Tensor, layer: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Run the waveform through the extractor, projection, masking, positional embedding and encoder.
+        ``layer`` is forwarded as ``output_layer`` and limits how many transformer layers are applied;
+        ``None`` applies all of them. Returns the hidden states and the mask from :meth:`mask`.
+        """
+        x = self.feature_extractor(x)
+        # extractor output is channel-first; the projection works on the last dimension
+        x = self.feature_projection(x.transpose(1, 2))
+        x, mask = self.mask(x)
+        x = x + self.positional_embedding(x)
+        x = self.dropout(self.norm(x))
+        x = self.encoder(x, output_layer=layer)
+        return x, mask
+    def logits(self, x: torch.Tensor) -> torch.Tensor:
+        """Cosine similarity between every frame of ``x`` and every label embedding, divided by 0.1."""
+        logits = torch.cosine_similarity(
+            x.unsqueeze(2),
+            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+            dim=-1,
+        )
+        return logits / 0.1
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Encode ``x``, project to 256 dimensions and return ``(logits, mask)``."""
+        x, mask = self.encode(x)
+        x = self.proj(x)
+        logits = self.logits(x)
+        return logits, mask
+class HubertSoft(Hubert):
+    def __init__(self):
+        super().__init__()
+    @torch.inference_mode()
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+class FeatureExtractor(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+        self.norm0 = nn.GroupNorm(512, 512)
+        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = t_func.gelu(self.norm0(self.conv0(x)))
+        x = t_func.gelu(self.conv1(x))
+        x = t_func.gelu(self.conv2(x))
+        x = t_func.gelu(self.conv3(x))
+        x = t_func.gelu(self.conv4(x))
+        x = t_func.gelu(self.conv5(x))
+        x = t_func.gelu(self.conv6(x))
+        return x
+class FeatureProjection(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.norm = nn.LayerNorm(512)
+        self.projection = nn.Linear(512, 768)
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x)
+        x = self.projection(x)
+        x = self.dropout(x)
+        return x
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            768,
+            768,
+            kernel_size=128,
+            padding=128 // 2,
+            groups=16,
+        )
+        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x.transpose(1, 2))
+        x = t_func.gelu(x[:, :, :-1])
+        return x.transpose(1, 2)
+class TransformerEncoder(nn.Module):
+    def __init__(
+            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+    ) -> None:
+        super(TransformerEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        )
+        self.num_layers = num_layers
+    def forward(
+            self,
+            src: torch.Tensor,
+            mask: torch.Tensor = None,
+            src_key_padding_mask: torch.Tensor = None,
+            output_layer: Optional[int] = None,
+    ) -> torch.Tensor:
+        output = src
+        for layer in self.layers[:output_layer]:
+            output = layer(
+                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+            )
+        return output
+def _compute_mask(
+        shape: Tuple[int, int],
+        mask_prob: float,
+        mask_length: int,
+        device: torch.device,
+        min_masks: int = 0,
+) -> torch.Tensor:
+    batch_size, sequence_length = shape
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
+        )
+    # compute number of masked spans in batch
+    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+    num_masked_spans = max(num_masked_spans, min_masks)
+    # make sure num masked indices <= sequence_length
+    if num_masked_spans * mask_length > sequence_length:
+        num_masked_spans = sequence_length // mask_length
+    # SpecAugment mask to fill
+    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+    # uniform distribution to sample from, make sure that offset samples are < sequence_length
+    uniform_dist = torch.ones(
+        (batch_size, sequence_length - (mask_length - 1)), device=device
+    )
+    # get random indices to mask
+    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+    # expand masked indices to masked spans
+    mask_indices = (
+        mask_indices.unsqueeze(dim=-1)
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    offsets = (
+        torch.arange(mask_length, device=device)[None, None, :]
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    mask_idxs = mask_indices + offsets
+    # scatter indices to mask
+    mask = mask.scatter(1, mask_idxs, True)
+    return mask
+def hubert_soft(
+        path: str,
+) -> HubertSoft:
+    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+    Args:
+        path (str): path of a pretrained model
+    """
+    hubert = HubertSoft()
+    checkpoint = torch.load(path)
+    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+    hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert

vencoder/hubert/hubert_model_onnx.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import copy
+import random
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as t_func
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+class Hubert(nn.Module):
+    """HuBERT-style network: convolutional feature extractor, 768-wide transformer encoder
+    (12 layers, 12 heads) and a 256-wide projection that is compared against label embeddings.
+    This variant defines no ``forward``; see the methods below for the available entry points.
+    Args:
+        num_label_embeddings: number of rows in the label embedding table.
+        mask: whether span masking is applied while the module is in training mode.
+    """
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+        super().__init__()
+        self._mask = mask
+        self.feature_extractor = FeatureExtractor()
+        self.feature_projection = FeatureProjection()
+        self.positional_embedding = PositionalConvEmbedding()
+        self.norm = nn.LayerNorm(768)
+        self.dropout = nn.Dropout(0.1)
+        self.encoder = TransformerEncoder(
+            nn.TransformerEncoderLayer(
+                768, 12, 3072, activation="gelu", batch_first=True
+            ),
+            12,
+        )
+        self.proj = nn.Linear(768, 256)
+        # Learned vector written over masked positions (see `mask`).
+        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+        self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Overwrite random spans of ``x`` with ``masked_spec_embed`` when training with masking on.
+        ``x`` is modified in place. The returned mask is ``None`` when no masking is applied.
+        """
+        mask = None
+        if self.training and self._mask:
+            # mask_prob=0.8, span length 10, at least 2 spans per sequence
+            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+            x[mask] = self.masked_spec_embed.to(x.dtype)
+        return x, mask
+    def encode(
+            self, x: torch.Tensor, layer: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Run the waveform through the extractor, projection, masking, positional embedding and encoder.
+        ``layer`` is forwarded as ``output_layer`` and limits how many transformer layers are applied;
+        ``None`` applies all of them. Returns the hidden states and the mask from :meth:`mask`.
+        """
+        x = self.feature_extractor(x)
+        # extractor output is channel-first; the projection works on the last dimension
+        x = self.feature_projection(x.transpose(1, 2))
+        x, mask = self.mask(x)
+        x = x + self.positional_embedding(x)
+        x = self.dropout(self.norm(x))
+        x = self.encoder(x, output_layer=layer)
+        return x, mask
+    def logits(self, x: torch.Tensor) -> torch.Tensor:
+        """Cosine similarity between every frame of ``x`` and every label embedding, divided by 0.1."""
+        logits = torch.cosine_similarity(
+            x.unsqueeze(2),
+            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+            dim=-1,
+        )
+        return logits / 0.1
+class HubertSoft(Hubert):
+    def __init__(self):
+        super().__init__()
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+    def forward(self, x):
+        return self.units(x)
+class FeatureExtractor(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+        self.norm0 = nn.GroupNorm(512, 512)
+        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = t_func.gelu(self.norm0(self.conv0(x)))
+        x = t_func.gelu(self.conv1(x))
+        x = t_func.gelu(self.conv2(x))
+        x = t_func.gelu(self.conv3(x))
+        x = t_func.gelu(self.conv4(x))
+        x = t_func.gelu(self.conv5(x))
+        x = t_func.gelu(self.conv6(x))
+        return x
+class FeatureProjection(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.norm = nn.LayerNorm(512)
+        self.projection = nn.Linear(512, 768)
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x)
+        x = self.projection(x)
+        x = self.dropout(x)
+        return x
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            768,
+            768,
+            kernel_size=128,
+            padding=128 // 2,
+            groups=16,
+        )
+        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x.transpose(1, 2))
+        x = t_func.gelu(x[:, :, :-1])
+        return x.transpose(1, 2)
+class TransformerEncoder(nn.Module):
+    def __init__(
+            self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+    ) -> None:
+        super(TransformerEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        )
+        self.num_layers = num_layers
+    def forward(
+            self,
+            src: torch.Tensor,
+            mask: torch.Tensor = None,
+            src_key_padding_mask: torch.Tensor = None,
+            output_layer: Optional[int] = None,
+    ) -> torch.Tensor:
+        output = src
+        for layer in self.layers[:output_layer]:
+            output = layer(
+                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+            )
+        return output
+def _compute_mask(
+        shape: Tuple[int, int],
+        mask_prob: float,
+        mask_length: int,
+        device: torch.device,
+        min_masks: int = 0,
+) -> torch.Tensor:
+    batch_size, sequence_length = shape
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
+        )
+    # compute number of masked spans in batch
+    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+    num_masked_spans = max(num_masked_spans, min_masks)
+    # make sure num masked indices <= sequence_length
+    if num_masked_spans * mask_length > sequence_length:
+        num_masked_spans = sequence_length // mask_length
+    # SpecAugment mask to fill
+    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+    # uniform distribution to sample from, make sure that offset samples are < sequence_length
+    uniform_dist = torch.ones(
+        (batch_size, sequence_length - (mask_length - 1)), device=device
+    )
+    # get random indices to mask
+    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+    # expand masked indices to masked spans
+    mask_indices = (
+        mask_indices.unsqueeze(dim=-1)
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    offsets = (
+        torch.arange(mask_length, device=device)[None, None, :]
+        .expand((batch_size, num_masked_spans, mask_length))
+        .reshape(batch_size, num_masked_spans * mask_length)
+    )
+    mask_idxs = mask_indices + offsets
+    # scatter indices to mask
+    mask = mask.scatter(1, mask_idxs, True)
+    return mask
+def hubert_soft(
+        path: str,
+) -> HubertSoft:
+    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+    Args:
+        path (str): path of a pretrained model
+    """
+    hubert = HubertSoft()
+    checkpoint = torch.load(path)
+    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+    hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert

vencoder/whisper/__init__.py ADDED Viewed

File without changes

vencoder/whisper/audio.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+from functools import lru_cache
+from typing import Union
+import ffmpeg
+import numpy as np
+import torch
+import torch.nn.functional as F
+from .utils import exact_div
+from librosa.filters import mel as librosa_mel_fn
+# hard-coded audio hyperparameters
+SAMPLE_RATE = 16000
+N_FFT = 400
+N_MELS = 80
+HOP_LENGTH = 160
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
+N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000: number of frames in a mel spectrogram input
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+    sr: int
+        The sample rate to resample the audio if necessary
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    try:
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input(file, threads=0)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length:
+            array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length:
+            array = array.take(indices=range(length), axis=axis)
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = np.pad(array, pad_widths)
+    return array
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
+    """
+    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+    Allows decoupling librosa dependency; saved using:
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
+        )
+    """
+    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
+    return torch.from_numpy(librosa_mel_fn(sr=SAMPLE_RATE,n_fft=N_FFT,n_mels=n_mels)).to(device)
+def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS):
+    """
+    Compute the log-Mel spectrogram of
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+    n_mels: int
+        The number of Mel-frequency filters, only 80 is supported
+    Returns
+    -------
+    torch.Tensor, shape = (80, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    if not torch.is_tensor(audio):
+        if isinstance(audio, str):
+            audio = load_audio(audio)
+        audio = torch.from_numpy(audio)
+    window = torch.hann_window(N_FFT).to(audio.device)
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+    filters = mel_filters(audio.device, n_mels)
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    return log_spec

vencoder/whisper/decoding.py ADDED Viewed

	@@ -0,0 +1,712 @@

+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.distributions import Categorical
+from .audio import CHUNK_LENGTH
+from .tokenizer import Tokenizer, get_tokenizer
+from .utils import compression_ratio
+if TYPE_CHECKING:
+    from .model import Whisper
+@torch.no_grad()
+def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None) -> Tuple[Tensor, List[dict]]:
+    """
+    Detect the spoken language in the audio, and return them as list of strings, along with the ids
+    of the most probable language tokens and the probability distribution over all language tokens.
+    This is performed outside the main decode loop in order to not interfere with kv-caching.
+    Returns
+    -------
+    language_tokens : Tensor, shape = (n_audio,)
+        ids of the most probable language tokens, which appears after the startoftranscript token.
+    language_probs : List[Dict[str, float]], length = n_audio
+        list of dictionaries containing the probability distribution over all languages.
+    """
+    if tokenizer is None:
+        tokenizer = get_tokenizer(model.is_multilingual)
+    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
+        raise ValueError(f"This model doesn't have language tokens so it can't perform lang id")
+    single = mel.ndim == 2
+    if single:
+        mel = mel.unsqueeze(0)
+    # skip encoder forward pass if already-encoded audio features were given
+    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
+        mel = model.encoder(mel)
+    # forward pass using a single token, startoftranscript
+    n_audio = mel.shape[0]
+    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
+    logits = model.logits(x, mel)[:, 0]
+    # collect detected languages; suppress all non-language tokens
+    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
+    mask[list(tokenizer.all_language_tokens)] = False
+    logits[:, mask] = -np.inf
+    language_tokens = logits.argmax(dim=-1)
+    language_token_probs = logits.softmax(dim=-1).cpu()
+    language_probs = [
+        {
+            c: language_token_probs[i, j].item()
+            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
+        }
+        for i in range(n_audio)
+    ]
+    if single:
+        language_tokens = language_tokens[0]
+        language_probs = language_probs[0]
+    return language_tokens, language_probs
+@dataclass(frozen=True)
+class DecodingOptions:
+    """Immutable (frozen) bundle of decoding settings; every field has a default."""
+    task: str = "transcribe"  # whether to perform X->X "transcribe" or X->English "translate"
+    language: Optional[str] = None  # language that the audio is in; uses detected language if None
+    # sampling-related options
+    temperature: float = 0.0
+    sample_len: Optional[int] = None  # maximum number of tokens to sample
+    best_of: Optional[int] = None     # number of independent samples to collect, when t > 0
+    beam_size: Optional[int] = None   # number of beams in beam search, when t == 0
+    patience: Optional[float] = None  # patience in beam search (https://arxiv.org/abs/2204.05424)
+    # options for ranking generations (either beams or best-of-N samples)
+    length_penalty: Optional[float] = None   # "alpha" in Google NMT, None defaults to length norm
+    # prompt, prefix, and token suppression
+    prompt: Optional[Union[str, List[int]]] = None   # text or tokens for the previous context
+    prefix: Optional[Union[str, List[int]]] = None   # text or tokens to prefix the current context
+    suppress_blank: bool = True                      # this will suppress blank outputs
+    # list of tokens ids (or comma-separated token ids) to suppress
+    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
+    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"
+    # timestamp sampling options
+    without_timestamps: bool = False              # use <|notimestamps|> to sample text tokens only
+    max_initial_timestamp: Optional[float] = 1.0  # the initial timestamp cannot be later than this
+    # implementation details
+    fp16: bool = True  # use fp16 for most of the calculation
+@dataclass(frozen=True)
+class DecodingResult:
+    """Immutable (frozen) record of one decoding outcome.
+    ``audio_features`` and ``language`` are required; the remaining fields default to
+    empty values (``tokens``, ``text``), ``None`` (``language_probs``) or NaN (numeric statistics).
+    """
+    audio_features: Tensor
+    language: str
+    language_probs: Optional[Dict[str, float]] = None
+    # default_factory gives every instance its own list
+    tokens: List[int] = field(default_factory=list)
+    text: str = ""
+    avg_logprob: float = np.nan
+    no_speech_prob: float = np.nan
+    temperature: float = np.nan
+    compression_ratio: float = np.nan
+class Inference:
+    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+        """Perform a forward pass on the decoder and return per-token logits"""
+        raise NotImplementedError
+    def rearrange_kv_cache(self, source_indices) -> None:
+        """Update the key-value cache according to the updated beams"""
+        raise NotImplementedError
+    def cleanup_caching(self) -> None:
+        """Clean up any resources or hooks after decoding is finished"""
+        pass
+class PyTorchInference(Inference):
+    def __init__(self, model: "Whisper", initial_token_length: int):
+        self.model: "Whisper" = model
+        self.initial_token_length = initial_token_length
+        self.kv_cache = {}
+        self.hooks = []
+    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+        if not self.kv_cache:
+            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
+        if tokens.shape[-1] > self.initial_token_length:
+            # only need to use the last token except in the first forward pass
+            tokens = tokens[:, -1:]
+        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
+    def cleanup_caching(self):
+        for hook in self.hooks:
+            hook.remove()
+        self.kv_cache = {}
+        self.hooks = []
+    def rearrange_kv_cache(self, source_indices):
+        for module, tensor in self.kv_cache.items():
+            # update the key/value cache to contain the selected sequences
+            self.kv_cache[module] = tensor[source_indices].detach()
+class SequenceRanker:
+    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]:
+        """
+        Given a list of groups of samples and their cumulative log probabilities,
+        return the indices of the samples in each group to select as the final result
+        """
+        raise NotImplementedError
+class MaximumLikelihoodRanker(SequenceRanker):
+    """
+    Select the sample with the highest log probabilities, penalized using either
+    a simple length normalization or Google NMT paper's length penalty
+    """
+    def __init__(self, length_penalty: Optional[float]):
+        self.length_penalty = length_penalty
+    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
+        def scores(logprobs, lengths):
+            result = []
+            for logprob, length in zip(logprobs, lengths):
+                if self.length_penalty is None:
+                    penalty = length
+                else:
+                    # from the Google NMT paper
+                    penalty = ((5 + length) / 6) ** self.length_penalty
+                result.append(logprob / penalty)
+            return result
+        # get the sequence with the highest score
+        lengths = [[len(t) for t in s] for s in tokens]
+        return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)]
+class TokenDecoder:
+    def reset(self):
+        """Initialize any stateful variables for decoding a new sequence"""
+    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
+        """Specify how to select the next token, based on the current trace and logits
+        Parameters
+        ----------
+        tokens : Tensor, shape = (n_batch, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence tokens
+        logits : Tensor, shape = (n_batch, vocab_size)
+            per-token logits of the probability distribution at the current step
+        sum_logprobs : Tensor, shape = (n_batch)
+            cumulative log probabilities for each sequence
+        Returns
+        -------
+        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
+            the tokens, appended with the selected next token
+        completed : bool
+            True if all sequences has reached the end of text
+        """
+        raise NotImplementedError
+    def finalize(
+        self, tokens: Tensor, sum_logprobs: Tensor
+    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
+        """Finalize search and return the final candidate sequences
+        Parameters
+        ----------
+        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence
+        sum_logprobs : Tensor, shape = (n_audio, n_group)
+            cumulative log probabilities for each sequence
+        Returns
+        -------
+        tokens : Sequence[Sequence[Tensor]], length = n_audio
+            sequence of Tensors containing candidate token sequences, for each audio input
+        sum_logprobs : List[List[float]], length = n_audio
+            sequence of cumulative log probabilities corresponding to the above
+        """
+        raise NotImplementedError
+class GreedyDecoder(TokenDecoder):
+    def __init__(self, temperature: float, eot: int):
+        self.temperature = temperature
+        self.eot = eot
+    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
+        temperature = self.temperature
+        if temperature == 0:
+            next_tokens = logits.argmax(dim=-1)
+        else:
+            next_tokens = Categorical(logits=logits / temperature).sample()
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
+        sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
+        next_tokens[tokens[:, -1] == self.eot] = self.eot
+        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
+        completed = (tokens[:, -1] == self.eot).all()
+        return tokens, completed
+    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
+        # make sure each sequence has at least one EOT token at the end
+        tokens = F.pad(tokens, (0, 1), value=self.eot)
+        return tokens, sum_logprobs.tolist()
+class BeamSearchDecoder(TokenDecoder):
+    """Token decoder that tracks `beam_size` candidate sequences per audio input.
+    Sequences that end in EOT are collected in `self.finished_sequences` (one dict per
+    audio, mapping token tuple -> cumulative log probability) until each audio has
+    `max_candidates = round(beam_size * patience)` of them.
+    """
+    def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None):
+        self.beam_size = beam_size
+        self.eot = eot
+        self.inference = inference
+        # a missing (None) or zero patience falls back to 1.0
+        self.patience = patience or 1.0
+        self.max_candidates: int = round(beam_size * self.patience)
+        self.finished_sequences = None
+        assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})"
+    def reset(self):
+        """Forget the finished sequences collected so far."""
+        self.finished_sequences = None
+    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
+        """Advance every beam by one token.
+        `tokens` holds `n_audio * beam_size` rows (beams of one audio are adjacent).
+        `sum_logprobs` is overwritten IN PLACE with the scores of the surviving beams,
+        and the inference object's kv cache is rearranged to follow the beams that
+        were kept. Returns the new token tensor and a flag that is True once every
+        audio has at least `max_candidates` finished sequences.
+        Raises ValueError when the number of rows is not a multiple of `beam_size`.
+        """
+        if tokens.shape[0] % self.beam_size != 0:
+            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")
+        n_audio = tokens.shape[0] // self.beam_size
+        if self.finished_sequences is None:  # for the first update
+            self.finished_sequences = [{} for _ in range(n_audio)]
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        next_tokens, source_indices, finished_sequences = [], [], []
+        for i in range(n_audio):
+            # scores: candidate sequence -> cumulative logprob; sources: candidate -> originating row
+            scores, sources, finished = {}, {}, {}
+            # STEP 1: calculate the cumulative log probabilities for possible candidates
+            for j in range(self.beam_size):
+                idx = i * self.beam_size + j
+                prefix = tokens[idx].tolist()
+                # beam_size + 1 continuations per beam, so that even if one of them is EOT
+                # there are still beam_size unfinished continuations left
+                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
+                    new_logprob = (sum_logprobs[idx] + logprob).item()
+                    sequence = tuple(prefix + [token.item()])
+                    scores[sequence] = new_logprob
+                    sources[sequence] = idx
+            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
+            saved = 0
+            for sequence in sorted(scores, key=scores.get, reverse=True):
+                if sequence[-1] == self.eot:
+                    finished[sequence] = scores[sequence]
+                else:
+                    # the output row index equals the number of sequences kept so far
+                    sum_logprobs[len(next_tokens)] = scores[sequence]
+                    next_tokens.append(sequence)
+                    source_indices.append(sources[sequence])
+                    saved += 1
+                    if saved == self.beam_size:
+                        break
+            finished_sequences.append(finished)
+        tokens = torch.tensor(next_tokens, device=tokens.device)
+        # keep the cached keys/values aligned with the rows the surviving beams came from
+        self.inference.rearrange_kv_cache(source_indices)
+        # add newly finished sequences to self.finished_sequences
+        assert len(self.finished_sequences) == len(finished_sequences)
+        for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences):
+            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
+                if len(previously_finished) >= self.max_candidates:
+                    break  # the candidate list is full
+                previously_finished[seq] = newly_finished[seq]
+        # mark as completed if all audio has enough number of samples
+        completed = all(
+            len(sequences) >= self.max_candidates for sequences in self.finished_sequences
+        )
+        return tokens, completed
+    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
+        """Return the finished sequences and their scores, one list per audio.
+        `preceding_tokens` and `sum_logprobs` are indexed as [audio, beam]. When an audio
+        has fewer than `beam_size` finished sequences, its best-scoring unfinished beams
+        are terminated with EOT and added until `beam_size` entries exist.
+        """
+        # collect all finished sequences, including patience, and add unfinished ones if not enough
+        sum_logprobs = sum_logprobs.cpu()
+        for i, sequences in enumerate(self.finished_sequences):
+            if len(sequences) < self.beam_size:  # when not enough sequences are finished
+                # visit the beams from highest to lowest cumulative log probability
+                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
+                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
+                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
+                    if len(sequences) >= self.beam_size:
+                        break
+        tokens: List[List[Tensor]] = [
+            [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences
+        ]
+        sum_logprobs: List[List[float]] = [
+            list(sequences.values()) for sequences in self.finished_sequences
+        ]
+        return tokens, sum_logprobs
+class LogitFilter:
+    """Base class for rules that mask or adjust next-token logits in place.
+    The concrete filters in this file (`SuppressBlank`, `SuppressTokens`,
+    `ApplyTimestampRules`) override `apply`.
+    """
+    def apply(self, logits: Tensor, tokens: Tensor) -> None:
+        """Apply any filtering or masking to logits in-place
+        Parameters
+        ----------
+        logits : Tensor, shape = (n_batch, vocab_size)
+            per-token logits of the probability distribution at the current step
+        tokens : Tensor, shape = (n_batch, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence tokens
+        Raises
+        ------
+        NotImplementedError
+            always; subclasses must override this method
+        """
+        raise NotImplementedError
+class SuppressBlank(LogitFilter):
+    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+    def apply(self, logits: Tensor, tokens: Tensor):
+        if tokens.shape[1] == self.sample_begin:
+            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
+class SuppressTokens(LogitFilter):
+    def __init__(self, suppress_tokens: Sequence[int]):
+        self.suppress_tokens = list(suppress_tokens)
+    def apply(self, logits: Tensor, tokens: Tensor):
+        logits[:, self.suppress_tokens] = -np.inf
+class ApplyTimestampRules(LogitFilter):
+    def __init__(
+        self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int]
+    ):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+        self.max_initial_timestamp_index = max_initial_timestamp_index
+    def apply(self, logits: Tensor, tokens: Tensor):
+        # suppress <|notimestamps|> which is handled by without_timestamps
+        if self.tokenizer.no_timestamps is not None:
+            logits[:, self.tokenizer.no_timestamps] = -np.inf
+        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
+        for k in range(tokens.shape[0]):
+            seq = [t for t in tokens[k, self.sample_begin :].tolist()]
+            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
+            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
+            if last_was_timestamp:
+                if penultimate_was_timestamp:  # has to be non-timestamp
+                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
+                else:  # cannot be normal text tokens
+                    logits[k, : self.tokenizer.eot] = -np.inf
+        if tokens.shape[1] == self.sample_begin:
+            # suppress generating non-timestamp tokens at the beginning
+            logits[:, : self.tokenizer.timestamp_begin] = -np.inf
+            # apply the `max_initial_timestamp` option
+            if self.max_initial_timestamp_index is not None:
+                last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
+                logits[:, last_allowed + 1 :] = -np.inf
+        # if sum of probability over timestamps is above any other token, sample timestamp
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        for k in range(tokens.shape[0]):
+            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1)
+            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
+            if timestamp_logprob > max_text_token_logprob:
+                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
+class DecodingTask:
+    inference: Inference
+    sequence_ranker: SequenceRanker
+    decoder: TokenDecoder
+    logit_filters: List[LogitFilter]
+    def __init__(self, model: "Whisper", options: DecodingOptions):
+        self.model = model
+        language = options.language or "en"
+        tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task)
+        self.tokenizer: Tokenizer = tokenizer
+        self.options: DecodingOptions = self._verify_options(options)
+        self.n_group: int = options.beam_size or options.best_of or 1
+        self.n_ctx: int = model.dims.n_text_ctx
+        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2
+        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
+        if self.options.without_timestamps:
+            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps
+        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
+        self.sample_begin: int = len(self.initial_tokens)
+        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)
+        # inference: implements the forward pass through the decoder, including kv caching
+        self.inference = PyTorchInference(model, len(self.initial_tokens))
+        # sequence ranker: implements how to rank a group of sampled sequences
+        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)
+        # decoder: implements how to select the next tokens, given the autoregressive distribution
+        if options.beam_size is not None:
+            self.decoder = BeamSearchDecoder(
+                options.beam_size, tokenizer.eot, self.inference, options.patience
+            )
+        else:
+            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)
+        # logit filters: applies various rules to suppress or penalize certain tokens
+        self.logit_filters = []
+        if self.options.suppress_blank:
+            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
+        if self.options.suppress_tokens:
+            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
+        if not options.without_timestamps:
+            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
+            max_initial_timestamp_index = None
+            if options.max_initial_timestamp:
+                max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision)
+            self.logit_filters.append(
+                ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index)
+            )
+    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
+        if options.beam_size is not None and options.best_of is not None:
+            raise ValueError("beam_size and best_of can't be given together")
+        if options.temperature == 0:
+            if options.best_of is not None:
+                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
+        if options.patience is not None and options.beam_size is None:
+            raise ValueError("patience requires beam_size to be given")
+        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
+            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")
+        return options
+    def _get_initial_tokens(self) -> Tuple[int]:
+        tokens = list(self.sot_sequence)
+        prefix = self.options.prefix
+        prompt = self.options.prompt
+        if prefix:
+            prefix_tokens = (
+                self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix
+            )
+            if self.sample_len is not None:
+                max_prefix_len = self.n_ctx // 2 - self.sample_len
+                prefix_tokens = prefix_tokens[-max_prefix_len:]
+            tokens = tokens + prefix_tokens
+        if prompt:
+            prompt_tokens = (
+                self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
+            )
+            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1) :] + tokens
+        return tuple(tokens)
+    def _get_suppress_tokens(self) -> Tuple[int]:
+        suppress_tokens = self.options.suppress_tokens
+        if isinstance(suppress_tokens, str):
+            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
+        if -1 in suppress_tokens:
+            suppress_tokens = [t for t in suppress_tokens if t >= 0]
+            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
+        elif suppress_tokens is None or len(suppress_tokens) == 0:
+            suppress_tokens = []  # interpret empty string as an empty list
+        else:
+            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
+        suppress_tokens.extend(
+            [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm]
+        )
+        if self.tokenizer.no_speech is not None:
+            # no-speech probability is collected separately
+            suppress_tokens.append(self.tokenizer.no_speech)
+        return tuple(sorted(set(suppress_tokens)))
+    def _get_audio_features(self, mel: Tensor):
+        if self.options.fp16:
+            mel = mel.half()
+        if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
+            # encoded audio features are given; skip audio encoding
+            print("encoded audio features are given; skip audio encoding")
+            audio_features = mel
+        else:
+            print(mel.shape)
+            print("===============================")
+            audio_features = self.model.encoder(mel)
+        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
+            return TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")
+        return audio_features
+    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens
+        return languages, lang_probs
+    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
+        assert audio_features.shape[0] == tokens.shape[0]
+        n_batch = tokens.shape[0]
+        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
+        no_speech_probs = [np.nan] * n_batch
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
+                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
+                # now we need to consider the logits at the last token only
+                logits = logits[:, -1]
+                # apply the logit filters, e.g. for suppressing or applying penalty to
+                for logit_filter in self.logit_filters:
+                    logit_filter.apply(logits, tokens)
+                # expand the tokens tensor with the selected next tokens
+                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
+                if completed or tokens.shape[-1] > self.n_ctx:
+                    break
+        finally:
+            self.inference.cleanup_caching()
+        return tokens, sum_logprobs, no_speech_probs
+    @torch.no_grad()
+    def run(self, mel: Tensor) -> List[DecodingResult]:
+        self.decoder.reset()
+        tokenizer: Tokenizer = self.tokenizer
+        n_audio: int = mel.shape[0]
+        audio_features: Tensor = self._get_audio_features(mel)  # encoder forward pass
+        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)
+        # detect language if requested, overwriting the language token
+        languages, language_probs = self._detect_language(audio_features, tokens)
+        if self.options.task == "lang_id":
+            return [
+                DecodingResult(audio_features=features, language=language, language_probs=probs)
+                for features, language, probs in zip(audio_features, languages, language_probs)
+            ]
+        # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
+        audio_features = audio_features.repeat_interleave(self.n_group, dim=0)
+        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
+        # call the main sampling loop
+        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)
+        # reshape the tensors to have (n_audio, n_group) as the first two dimensions
+        audio_features = audio_features[:: self.n_group]
+        no_speech_probs = no_speech_probs[:: self.n_group]
+        assert audio_features.shape[0] == len(no_speech_probs) == n_audio
+        tokens = tokens.reshape(n_audio, self.n_group, -1)
+        sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)
+        # get the final candidates for each group, and slice between the first sampled token and EOT
+        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
+        tokens: List[List[Tensor]] = [
+            [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens
+        ]
+        # select the top-ranked sample in each group
+        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
+        tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
+        texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]
+        sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
+        avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)]
+        fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs)
+        if len(set(map(len, fields))) != 1:
+            raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")
+        return [
+            DecodingResult(
+                audio_features=features,
+                language=language,
+                tokens=tokens,
+                text=text,
+                avg_logprob=avg_logprob,
+                no_speech_prob=no_speech_prob,
+                temperature=self.options.temperature,
+                compression_ratio=compression_ratio(text),
+            )
+            for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields)
+        ]
+@torch.no_grad()
+def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]:
+    """
+    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
+    Parameters
+    ----------
+    model: Whisper
+        the Whisper model instance
+    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
+        A tensor containing the Mel spectrogram(s)
+    options: DecodingOptions
+        A dataclass that contains all necessary options for decoding 30-second segments
+    Returns
+    -------
+    result: Union[DecodingResult, List[DecodingResult]]
+        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
+    """
+    single = mel.ndim == 2
+    if single:
+        mel = mel.unsqueeze(0)
+    result = DecodingTask(model, options).run(mel)
+    if single:
+        result = result[0]
+    return result

vencoder/whisper/model.py ADDED Viewed

	@@ -0,0 +1,269 @@

+from dataclasses import dataclass
+from typing import Dict
+from typing import Iterable, Optional
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch import nn
+from .decoding import detect_language as detect_language_function, decode as decode_function
+@dataclass
+class ModelDimensions:
+    """Size hyper-parameters from which `Whisper` builds its audio encoder and text decoder."""
+    n_mels: int  # input channels of the encoder's first convolution
+    n_audio_ctx: int  # number of positions in the encoder's positional embedding
+    n_audio_state: int  # hidden width of the encoder
+    n_audio_head: int  # attention heads per encoder block
+    n_audio_layer: int  # number of encoder blocks
+    n_vocab: int  # number of rows in the decoder's token embedding
+    n_text_ctx: int  # number of positions in the decoder's positional embedding and causal mask
+    n_text_state: int  # hidden width of the decoder
+    n_text_head: int  # attention heads per decoder block
+    n_text_layer: int  # number of decoder blocks
+class LayerNorm(nn.LayerNorm):
+    def forward(self, x: Tensor) -> Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class Linear(nn.Linear):
+    def forward(self, x: Tensor) -> Tensor:
+        return F.linear(
+            x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype)
+        )
+class Conv1d(nn.Conv1d):
+    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor:
+        return super()._conv_forward(
+            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+        )
+def sinusoids(length, channels, max_timescale=10000):
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+class MultiHeadAttention(nn.Module):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        q = self.query(x)
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+            # otherwise, perform key/value projections for self- or cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+        else:
+            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+        wv, qk = self.qkv_attention(q, k, v, mask)
+        return self.out(wv), qk
+    def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
+        n_batch, n_ctx, n_state = q.shape
+        scale = (n_state // self.n_head) ** -0.25
+        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
+        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
+        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+        qk = q @ k
+        if mask is not None:
+            qk = qk + mask[:n_ctx, :n_ctx]
+        qk = qk.float()
+        w = F.softmax(qk, dim=-1).to(q.dtype)
+        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__()
+        self.attn = MultiHeadAttention(n_state, n_head)
+        self.attn_ln = LayerNorm(n_state)
+        self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None
+        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+        n_mlp = n_state * 4
+        self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state))
+        self.mlp_ln = LayerNorm(n_state)
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
+        if self.cross_attn:
+            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0]
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+class AudioEncoder(nn.Module):
+    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__()
+        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+        )
+        self.ln_post = LayerNorm(n_state)
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+        x = x.permute(0, 2, 1)
+        len_x = x.shape[1]
+        len_e = self.positional_embedding.shape[0]
+        assert len_x <= len_e, "incorrect audio shape"
+        pos_e = self.positional_embedding[:len_x, :]
+        x = (x + pos_e).to(x.dtype)
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_post(x)
+        return x
+class TextDecoder(nn.Module):
+    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__()
+        self.token_embedding = nn.Embedding(n_vocab, n_state)
+        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
+        )
+        self.ln = LayerNorm(n_state)
+        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
+        self.register_buffer("mask", mask, persistent=False)
+    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+        """
+        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+            the text tokens
+        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
+            the encoded audio features to be attended on
+        """
+        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
+        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
+        x = x.to(xa.dtype)
+        for block in self.blocks:
+            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
+        x = self.ln(x)
+        logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float()
+        return logits
+class Whisper(nn.Module):
+    def __init__(self, dims: ModelDimensions):
+        super().__init__()
+        self.dims = dims
+        self.encoder = AudioEncoder(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer,
+        )
+        self.decoder = TextDecoder(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer,
+        )
+    def embed_audio(self, mel: torch.Tensor):
+        return self.encoder(mel)
+    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
+        return self.decoder(tokens, audio_features)
+    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+    @property
+    def device(self):
+        return next(self.parameters()).device
+    @property
+    def is_multilingual(self):
+        return self.dims.n_vocab == 51865
+    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+        """
+        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+        tensors calculated for the previous positions. This method returns a dictionary that stores
+        all caches, and the necessary hooks for the key and value projection modules that save the
+        intermediate tensors to be reused during later calculations.
+        Returns
+        -------
+        cache : Dict[nn.Module, torch.Tensor]
+            A dictionary object mapping the key/value projection modules to its cache
+        hooks : List[RemovableHandle]
+            List of PyTorch RemovableHandle objects to stop the hooks to be called
+        """
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]:
+                cache[module] = output  # save as-is, for the first token or cross attention
+            else:
+                cache[module] = torch.cat([cache[module], output], dim=1).detach()
+            return cache[module]
+        def install_hooks(layer: nn.Module):
+            if isinstance(layer, MultiHeadAttention):
+                hooks.append(layer.key.register_forward_hook(save_to_cache))
+                hooks.append(layer.value.register_forward_hook(save_to_cache))
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+    detect_language = detect_language_function
+    decode = decode_function

vencoder/whisper/tokenizer.py ADDED Viewed

	@@ -0,0 +1,331 @@

+import os
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from transformers import GPT2TokenizerFast
+# Maps each supported language code to its lowercase English language name.
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+# Reverse lookup (language name -> language code), seeded from LANGUAGES and then
+# extended with alternate names that map onto an existing code.
+TO_LANGUAGE_CODE = {name: code for code, name in LANGUAGES.items()}
+TO_LANGUAGE_CODE.update(
+    {
+        "burmese": "my",
+        "valencian": "ca",
+        "flemish": "nl",
+        "haitian": "ht",
+        "letzeburgesch": "lb",
+        "pushto": "ps",
+        "panjabi": "pa",
+        "moldavian": "ro",
+        "moldovan": "ro",
+        "sinhalese": "si",
+        "castilian": "es",
+    }
+)
+@dataclass(frozen=True)
+class Tokenizer:
+    """
+    Immutable wrapper around a `GPT2TokenizerFast` that exposes the ids of the special
+    tokens (start-of-transcript, language, timestamps, ...) as cached read-only properties.
+    """
+    # the wrapped tokenizer that performs the actual encoding/decoding
+    tokenizer: "GPT2TokenizerFast"
+    # language code this tokenizer is configured for, or None when no language is set
+    language: Optional[str]
+    # token ids that open a decoding prompt, as assembled by the factory function
+    sot_sequence: Tuple[int]
+    def encode(self, text, **kwargs):
+        """Encode `text` with the wrapped tokenizer, forwarding any keyword arguments."""
+        return self.tokenizer.encode(text, **kwargs)
+    def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Tensor], **kwargs):
+        """Decode `token_ids` with the wrapped tokenizer, forwarding any keyword arguments."""
+        return self.tokenizer.decode(token_ids, **kwargs)
+    def decode_with_timestamps(self, tokens) -> str:
+        """
+        Decode `tokens` while rendering timestamp tokens inline, e.g. "<|1.08|>".
+        Every id at or above `timestamp_begin` is treated as a timestamp and printed as its
+        offset from `timestamp_begin` multiplied by 0.02; the runs of ordinary tokens in
+        between are decoded by the wrapped tokenizer.
+        """
+        pieces = []
+        pending = []  # ordinary tokens collected since the last timestamp
+        for token in tokens:
+            if token >= self.timestamp_begin:
+                pieces.append(self.tokenizer.decode(pending))
+                pieces.append(f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>")
+                pending = []
+            else:
+                pending.append(token)
+        # flush whatever follows the final timestamp (possibly an empty run)
+        pieces.append(self.tokenizer.decode(pending))
+        return "".join(pieces)
+    @property
+    @lru_cache()
+    def eot(self) -> int:
+        """Id of the wrapped tokenizer's end-of-sequence token."""
+        return self.tokenizer.eos_token_id
+    @property
+    @lru_cache()
+    def sot(self) -> int:
+        """Id of the "<|startoftranscript|>" token."""
+        return self._get_single_token_id("<|startoftranscript|>")
+    @property
+    @lru_cache()
+    def sot_lm(self) -> int:
+        """Id of the "<|startoflm|>" token."""
+        return self._get_single_token_id("<|startoflm|>")
+    @property
+    @lru_cache()
+    def sot_prev(self) -> int:
+        """Id of the "<|startofprev|>" token."""
+        return self._get_single_token_id("<|startofprev|>")
+    @property
+    @lru_cache()
+    def no_speech(self) -> int:
+        """Id of the "<|nospeech|>" token."""
+        return self._get_single_token_id("<|nospeech|>")
+    @property
+    @lru_cache()
+    def no_timestamps(self) -> int:
+        """Id of the "<|notimestamps|>" token."""
+        return self._get_single_token_id("<|notimestamps|>")
+    @property
+    @lru_cache()
+    def timestamp_begin(self) -> int:
+        """First timestamp id: one past the last entry of the tokenizer's special ids."""
+        return self.tokenizer.all_special_ids[-1] + 1
+    @property
+    @lru_cache()
+    def language_token(self) -> int:
+        """
+        Id of the "<|xx|>" token matching the `language` field.
+        Raises ValueError when no language is configured and KeyError when the tokenizer
+        has no special token for the configured language.
+        """
+        if self.language is None:
+            raise ValueError(f"This tokenizer does not have language token configured")
+        candidate = f"<|{self.language}|>"
+        id_by_token = dict(
+            zip(
+                self.tokenizer.additional_special_tokens,
+                self.tokenizer.additional_special_tokens_ids,
+            )
+        )
+        if candidate not in id_by_token:
+            raise KeyError(f"Language {self.language} not found in tokenizer.")
+        return id_by_token[candidate]
+    @property
+    @lru_cache()
+    def all_language_tokens(self) -> Tuple[int]:
+        """Ids of every additional special token whose stripped name is a key of LANGUAGES."""
+        special_pairs = zip(
+            self.tokenizer.additional_special_tokens,
+            self.tokenizer.additional_special_tokens_ids,
+        )
+        return tuple(
+            token_id for token, token_id in special_pairs if token.strip("<|>") in LANGUAGES
+        )
+    @property
+    @lru_cache()
+    def all_language_codes(self) -> Tuple[str]:
+        """Language codes obtained by decoding each language token and stripping "<|>"."""
+        codes = [self.decode([token_id]).strip("<|>") for token_id in self.all_language_tokens]
+        return tuple(codes)
+    @property
+    @lru_cache()
+    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
+        """`sot_sequence` with the "<|notimestamps|>" id appended."""
+        return (*self.sot_sequence, self.no_timestamps)
+    @property
+    @lru_cache()
+    def non_speech_tokens(self) -> Tuple[int]:
+        """
+        Sorted ids of the tokens to suppress so that speaker tags and non-speech annotations
+        are not sampled as if they were spoken in the audio, e.g.
+        - ♪♪♪
+        - ( SPEAKING FOREIGN LANGUAGE )
+        - [DAVID] Hey there,
+        Basic punctuation such as commas, periods, question marks and exclamation points
+        is left untouched.
+        """
+        encode = self.tokenizer.encode
+        candidates = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
+        candidates += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+        # These symbols may encode to one token or to several, depending on the tokenizer.
+        # When they encode to several, the first token is suppressed. That is safe: they all
+        # lie between U+2640 and U+267F (miscellaneous symbols that are fine to suppress in
+        # generations) and share their first two bytes in the 3-byte UTF-8 representation.
+        musical = set("♩♪♫♬♭♮♯")
+        assert all(0x2640 <= ord(c) <= 0x267F for c in musical)
+        # hyphens "-" and single quotes "'" stay allowed between words, but are suppressed
+        # at the beginning of a word (i.e. when preceded by a space)
+        suppressed = {encode(" -")[0], encode(" '")[0]}
+        for symbol in [*candidates, *musical]:
+            always_suppress = symbol in musical
+            for variant in (symbol, " " + symbol):
+                ids = encode(variant)
+                if always_suppress or len(ids) == 1:
+                    suppressed.add(ids[0])
+        return tuple(sorted(suppressed))
+    def _get_single_token_id(self, text) -> int:
+        """Return the id `text` encodes to, asserting that it encodes to exactly one token."""
+        ids = self.tokenizer.encode(text)
+        assert len(ids) == 1, f"{text} is not encoded as a single token"
+        return ids[0]
+@lru_cache(maxsize=None)
+def build_tokenizer(name: str = "gpt2"):
+    """
+    Load the `GPT2TokenizerFast` stored under `assets/<name>` next to this module and
+    register the extra special tokens on it. The result is cached per `name`.
+    Also sets the TOKENIZERS_PARALLELISM environment variable to "false".
+    """
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
+    tokenizer = GPT2TokenizerFast.from_pretrained(os.path.join(assets_dir, name))
+    # one "<|code|>" token per language, in LANGUAGES order, placed right after the
+    # start-of-transcript token
+    language_tags = [f"<|{lang}|>" for lang in LANGUAGES]
+    trailing_tags = [
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+    ]
+    specials = ["<|startoftranscript|>"] + language_tags + trailing_tags
+    tokenizer.add_special_tokens({"additional_special_tokens": specials})
+    return tokenizer
+@lru_cache(maxsize=None)
+def get_tokenizer(
+    multilingual: bool,
+    *,
+    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
+    language: Optional[str] = None,
+) -> Tokenizer:
+    """
+    Build (and cache) a `Tokenizer` for the given configuration.
+    `language` may be a language code or a language name/alias (case-insensitive); anything
+    else raises ValueError. For a multilingual tokenizer, `task` defaults to "transcribe"
+    and `language` to "en"; otherwise both are reset to None and the "gpt2" tokenizer
+    is used.
+    """
+    if language is not None:
+        language = language.lower()
+        if language not in LANGUAGES:
+            # not a code: fall back to the name/alias table
+            if language not in TO_LANGUAGE_CODE:
+                raise ValueError(f"Unsupported language: {language}")
+            language = TO_LANGUAGE_CODE[language]
+    if not multilingual:
+        tokenizer_name, task, language = "gpt2", None, None
+    else:
+        tokenizer_name = "multilingual"
+        task = task or "transcribe"
+        language = language or "en"
+    tokenizer = build_tokenizer(name=tokenizer_name)
+    # special ids are looked up by position in the tokenizer's special-id list
+    special_ids: List[int] = tokenizer.all_special_ids
+    sot: int = special_ids[1]
+    translate: int = special_ids[-6]
+    transcribe: int = special_ids[-5]
+    language_codes = tuple(LANGUAGES.keys())
+    prompt = [sot]
+    if language is not None:
+        # the language id is computed as an offset from sot, by position in LANGUAGES
+        prompt.append(sot + 1 + language_codes.index(language))
+    if task is not None:
+        prompt.append(transcribe if task == "transcribe" else translate)
+    return Tokenizer(tokenizer=tokenizer, language=language, sot_sequence=tuple(prompt))

vencoder/whisper/utils.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import json
+import os
+import sys
+import zlib
+from typing import Callable, TextIO
+# Pick the implementation of `make_safe` once, based on the interpreter's default encoding.
+system_encoding = sys.getdefaultencoding()
+if system_encoding == "utf-8":
+    def make_safe(string):
+        """Return `string` unchanged: utf-8 can encode any Unicode code point."""
+        return string
+else:
+    def make_safe(string):
+        """
+        Return `string` with every character the system default encoding cannot represent
+        replaced by '?', avoiding UnicodeEncodeError
+        (https://github.com/openai/whisper/discussions/729).
+        """
+        encoded = string.encode(system_encoding, errors="replace")
+        return encoded.decode(system_encoding)
+def exact_div(x, y):
+    """Return `x // y`, asserting first that `y` divides `x` without remainder."""
+    remainder = x % y
+    assert remainder == 0
+    return x // y
+def str2bool(string):
+    """
+    Convert the exact strings "True" / "False" to the matching bool.
+    Any other value raises ValueError.
+    """
+    str2val = {"True": True, "False": False}
+    if string not in str2val:
+        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
+    return str2val[string]
+def optional_int(string):
+    """Return None for the literal string "None", otherwise `int(string)`."""
+    if string == "None":
+        return None
+    return int(string)
+def optional_float(string):
+    """Return None for the literal string "None", otherwise `float(string)`."""
+    if string == "None":
+        return None
+    return float(string)
+def compression_ratio(text) -> float:
+    """Return the UTF-8 byte length of `text` divided by its zlib-compressed byte length."""
+    raw = text.encode("utf-8")
+    raw_size = len(raw)
+    compressed_size = len(zlib.compress(raw))
+    return raw_size / compressed_size
+def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
+    """
+    Format a non-negative duration in seconds as "[HH:]MM:SS<marker>mmm".
+    The value is rounded to whole milliseconds first. The hours field is emitted only when
+    it is non-zero or `always_include_hours` is set; `decimal_marker` separates the seconds
+    from the milliseconds.
+    """
+    assert seconds >= 0, "non-negative timestamp expected"
+    total_ms = round(seconds * 1000.0)
+    hours, remaining_ms = divmod(total_ms, 3_600_000)
+    minutes, remaining_ms = divmod(remaining_ms, 60_000)
+    whole_seconds, millis = divmod(remaining_ms, 1_000)
+    if always_include_hours or hours > 0:
+        hours_marker = f"{hours:02d}:"
+    else:
+        hours_marker = ""
+    return f"{hours_marker}{minutes:02d}:{whole_seconds:02d}{decimal_marker}{millis:03d}"
+class ResultWriter:
+    """
+    Base class for transcript writers. Calling an instance writes one result to
+    `<output_dir>/<audio basename>.<extension>`; subclasses set `extension` and implement
+    `write_result`.
+    """
+    # file extension (without the dot) appended to the audio file's base name
+    extension: str
+    def __init__(self, output_dir: str):
+        """Remember the directory that output files are written to."""
+        self.output_dir = output_dir
+    def __call__(self, result: dict, audio_path: str):
+        """Open the UTF-8 output file for `audio_path` and hand it to `write_result`."""
+        file_name = os.path.basename(audio_path) + "." + self.extension
+        destination = os.path.join(self.output_dir, file_name)
+        with open(destination, "w", encoding="utf-8") as f:
+            self.write_result(result, file=f)
+    def write_result(self, result: dict, file: TextIO):
+        """Serialize `result` into the open text file; must be overridden by subclasses."""
+        raise NotImplementedError
+class WriteTXT(ResultWriter):
+    """Writes the stripped text of each segment on its own line."""
+    extension: str = "txt"
+    def write_result(self, result: dict, file: TextIO):
+        """Print every entry of `result["segments"]` as one plain-text line."""
+        for segment in result["segments"]:
+            line = segment['text'].strip()
+            print(line, file=file, flush=True)
+class WriteVTT(ResultWriter):
+    """Writes segments as WebVTT cues."""
+    extension: str = "vtt"
+    def write_result(self, result: dict, file: TextIO):
+        """Emit the "WEBVTT" header followed by one timed cue per segment."""
+        print("WEBVTT\n", file=file)
+        for segment in result["segments"]:
+            start = format_timestamp(segment['start'])
+            end = format_timestamp(segment['end'])
+            # "-->" is the cue timing separator, so it is rewritten inside the cue text
+            text = segment['text'].strip().replace('-->', '->')
+            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
+class WriteSRT(ResultWriter):
+    """Writes segments as numbered SubRip (SRT) entries."""
+    extension: str = "srt"
+    def write_result(self, result: dict, file: TextIO):
+        """Emit one entry per segment, numbered from 1, with "HH:MM:SS,mmm" timestamps."""
+        for index, segment in enumerate(result["segments"], start=1):
+            start = format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')
+            end = format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')
+            # "-->" separates the two timestamps, so it is rewritten inside the text
+            text = segment['text'].strip().replace('-->', '->')
+            print(f"{index}\n{start} --> {end}\n{text}\n", file=file, flush=True)
+class WriteTSV(ResultWriter):
+    """
+    Writes a transcript as TSV (tab-separated values), one line per segment:
+    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
+    Integer milliseconds avoid any interference from an environment whose language settings
+    render the decimal point of a floating point number as a comma; they are also faster
+    and more efficient to parse & store, e.g., in C++.
+    """
+    extension: str = "tsv"
+    def write_result(self, result: dict, file: TextIO):
+        """Emit the header row, then start/end (rounded to ms) and text for each segment."""
+        print("start", "end", "text", sep="\t", file=file)
+        for segment in result["segments"]:
+            start_ms = round(1000 * segment['start'])
+            end_ms = round(1000 * segment['end'])
+            # tabs inside the text would break the column layout
+            text = segment['text'].strip().replace("\t", " ")
+            print(start_ms, end_ms, text, sep="\t", file=file, flush=True)
+class WriteJSON(ResultWriter):
+    """Writes the whole result dictionary as a single JSON document."""
+    extension: str = "json"
+    def write_result(self, result: dict, file: TextIO):
+        """Serialize `result` to `file` with `json.dump` and its default settings."""
+        json.dump(result, fp=file)
+def get_writer(output_format: str, output_dir: str) -> Callable[[dict, str], None]:
+    """
+    Return a callable that writes a transcription result into `output_dir`.
+    `output_format` is one of "txt", "vtt", "srt", "tsv" or "json", or "all" to write every
+    format at once; any other value raises KeyError. The returned callable is invoked as
+    `(result, audio_path)`, the same signature as `ResultWriter.__call__`.
+    """
+    writers = {
+        "txt": WriteTXT,
+        "vtt": WriteVTT,
+        "srt": WriteSRT,
+        "tsv": WriteTSV,
+        "json": WriteJSON,
+    }
+    if output_format == "all":
+        all_writers = [writer(output_dir) for writer in writers.values()]
+        def write_all(result: dict, audio_path: str):
+            # The second argument is forwarded to each writer as the audio path (a str used
+            # to derive the output file name), not an open file object.
+            for writer in all_writers:
+                writer(result, audio_path)
+        return write_all
+    return writers[output_format](output_dir)