Spaces:
Running
Running
File size: 4,778 Bytes
67c46fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import copy
import logging
import os
from argparse import Namespace
from typing import Optional
from typing import Tuple
from typing import Union
import humanfriendly
import torch
import torch.nn as nn
from funasr_detach.frontends.utils.frontend import Frontend
from funasr_detach.models.transformer.utils.nets_utils import pad_list
def base_s3prl_setup(args):
    """Fill in defaults for the optional S3PRL settings on ``args``.

    Every option listed below is (re)assigned on ``args``: an attribute that
    already exists keeps its current value, a missing one receives the
    fallback shown here. ``args`` is modified in place and also returned.
    """
    fallback_values = (
        ("upstream_feature_selection", None),
        ("upstream_model_config", None),
        ("upstream_refresh", False),
        ("upstream_ckpt", None),
        ("init_ckpt", None),
        ("verbose", False),
        ("tile_factor", 1),
    )
    for option, fallback in fallback_values:
        setattr(args, option, getattr(args, option, fallback))
    return args
class S3prlFrontend(nn.Module):
    """Speech Pretrained Representation frontend structure for ASR.

    Wraps an S3PRL upstream model, loaded with ``torch.hub.load`` from a
    local S3PRL checkout found on ``PYTHONPATH``, together with an S3PRL
    ``Featurizer``. The upstream is put in eval mode and run under
    ``torch.no_grad()`` on every forward pass.

    Args:
        fs: Sampling rate, as an int or a human-readable string. A string
            is parsed with ``humanfriendly.parse_size``; the value is not
            stored or used afterwards.
        frontend_conf: Mapping of S3PRL options. It must provide
            ``upstream``; the optional entries are given defaults by
            ``base_s3prl_setup``.
        download_dir: If not None, passed to ``torch.hub.set_dir``.
        multilayer_feature: If truthy, the featurizer is built on
            ``"hidden_states"``; otherwise on ``"last_hidden_state"``.
    """

    def __init__(
        self,
        fs: Union[int, str] = 16000,
        frontend_conf: Optional[dict] = None,
        download_dir: Optional[str] = None,
        multilayer_feature: bool = False,
    ):
        super().__init__()
        if isinstance(fs, str):
            # Parsing only validates the string; the result is not used.
            fs = humanfriendly.parse_size(fs)
        if download_dir is not None:
            torch.hub.set_dir(download_dir)

        self.multilayer_feature = multilayer_feature
        self.upstream, self.featurizer = self._get_upstream(frontend_conf)
        # Snapshot of the freshly loaded weights, restored later by
        # reload_pretrained_parameters().
        self.pretrained_params = copy.deepcopy(self.upstream.state_dict())
        self.output_dim = self.featurizer.output_dim
        self.frontend_type = "s3prl"
        self.hop_length = self.upstream.get_downsample_rates("key")

    @staticmethod
    def _find_s3prl_path() -> Optional[str]:
        """Return the first PYTHONPATH entry ending with "s3prl", else None."""
        # os.pathsep rather than a literal ":" so that Windows entries such
        # as "C:\\tools\\s3prl" are not cut at the drive letter.
        for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep):
            if entry.endswith("s3prl"):
                return entry
        return None

    @staticmethod
    def _feature_selection_key(multilayer_feature) -> str:
        """Map the ``multilayer_feature`` flag to a Featurizer selection key."""
        return "hidden_states" if multilayer_feature else "last_hidden_state"

    def _get_upstream(self, frontend_conf):
        """Get S3PRL upstream model.

        Builds the S3PRL argument namespace (also stored as ``self.args``),
        loads the upstream on CPU from the local S3PRL checkout and wraps it
        in a ``Featurizer``.

        Returns:
            Tuple ``(upstream, featurizer)``.

        Raises:
            TypeError: If ``frontend_conf`` is None.
            AssertionError: If no PYTHONPATH entry ends with "s3prl".
        """
        if frontend_conf is None:
            raise TypeError(
                "frontend_conf must be a dict providing at least the "
                "'upstream' entry, got None"
            )
        s3prl_args = base_s3prl_setup(
            Namespace(**frontend_conf, device="cpu"),
        )
        self.args = s3prl_args

        s3prl_path = self._find_s3prl_path()
        assert s3prl_path is not None, (
            "No PYTHONPATH entry ending with 's3prl' was found; "
            "add the S3PRL checkout to PYTHONPATH."
        )

        s3prl_upstream = torch.hub.load(
            s3prl_path,
            s3prl_args.upstream,
            ckpt=s3prl_args.upstream_ckpt,
            model_config=s3prl_args.upstream_model_config,
            refresh=s3prl_args.upstream_refresh,
            source="local",
        ).to("cpu")

        # Force layerdrop to zero on the two wrapped model classes that
        # expose it through `model.encoder`.
        model = getattr(s3prl_upstream, "model", None)
        if model is not None and model.__class__.__name__ in (
            "Wav2Vec2Model",
            "HubertModel",
        ):
            model.encoder.layerdrop = 0.0

        # Imported here because S3PRL is only reachable through PYTHONPATH.
        from s3prl.upstream.interfaces import Featurizer

        # `multilayer_feature` is a bool (default False); the former
        # `is None` test never matched it, so the last-layer branch was
        # unreachable. A truthiness test makes both branches reachable.
        feature_selection = self._feature_selection_key(self.multilayer_feature)
        s3prl_featurizer = Featurizer(
            upstream=s3prl_upstream,
            feature_selection=feature_selection,
            upstream_device="cpu",
        )

        return s3prl_upstream, s3prl_featurizer

    def _tile_representations(self, feature):
        """Tile up the representations by `tile_factor`.

        Input - sequence of representations
            shape: (batch_size, seq_len, feature_dim)
        Output - sequence of tiled representations
            shape: (batch_size, seq_len * factor, feature_dim)

        Each frame ends up repeated `tile_factor` times in a row.
        """
        assert (
            len(feature.shape) == 3
        ), "Input argument `feature` has invalid shape: {}".format(feature.shape)
        factor = self.args.tile_factor
        # Repeating along the feature axis and then folding the copies back
        # into the time axis places the copies of a frame next to each other.
        tiled_feature = feature.repeat(1, 1, factor)
        tiled_feature = tiled_feature.reshape(
            feature.size(0), feature.size(1) * factor, feature.size(2)
        )
        return tiled_feature

    def output_size(self) -> int:
        """Return the featurizer's output dimension recorded at construction."""
        return self.output_dim

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Extract upstream features for a batch of waveforms.

        Args:
            input: Batch of waveforms; item ``i`` is cut to its first
                ``input_lengths[i]`` entries before being passed upstream.
                Presumably (batch, samples) - TODO confirm against callers.
            input_lengths: Per-item lengths used for the slicing above.

        Returns:
            Tuple of the features padded with 0.0 by ``pad_list`` and a
            ``torch.long`` tensor holding the first-dimension size of each
            item's features.
        """
        wavs = [wav[: input_lengths[i]] for i, wav in enumerate(input)]
        # The upstream is always run in eval mode and without gradients.
        self.upstream.eval()
        with torch.no_grad():
            feats = self.upstream(wavs)
        feats = self.featurizer(wavs, feats)

        if self.args.tile_factor != 1:
            # NOTE(review): _tile_representations requires a 3-D tensor;
            # confirm the featurizer output satisfies that on this path.
            feats = self._tile_representations(feats)

        input_feats = pad_list(feats, 0.0)
        feats_lens = torch.tensor([f.shape[0] for f in feats], dtype=torch.long)

        # Saving CUDA Memory
        del feats

        return input_feats, feats_lens

    def reload_pretrained_parameters(self):
        """Restore the upstream weights captured when the frontend was built."""
        self.upstream.load_state_dict(self.pretrained_params)
        logging.info("Pretrained S3PRL frontend model parameters reloaded!")
|