# wav2vec2-base-spkreg / modeling_wav2vec2_spkreg.py
import math
import warnings
from typing import Union, Tuple, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
from transformers.modeling_outputs import SequenceClassifierOutput, Wav2Vec2BaseModelOutput
from transformers.models.wav2vec2.modeling_wav2vec2 import (
Wav2Vec2ForPreTraining,
Wav2Vec2GumbelVectorQuantizer,
Wav2Vec2PositionalConvEmbedding,
Wav2Vec2FeatureProjection,
Wav2Vec2AttnAdapterLayer,
Wav2Vec2ForCTC,
Wav2Vec2FeatureEncoder,
Wav2Vec2EncoderStableLayerNorm,
Wav2Vec2Encoder,
Wav2Vec2Adapter,
safe_load_file,
_compute_mask_indices,
_HIDDEN_STATES_START_POSITION,
WAV2VEC2_ADAPTER_SAFE_FILE,
WAV2VEC2_ADAPTER_PT_FILE
)
from transformers.utils import (
cached_file,
is_safetensors_available,
logging,
)
from .configuration_wav2vec2_spkreg import Wav2Vec2SpkRegConfig
logger = logging.get_logger(__name__)
class Wav2Vec2SpkRegPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Wav2Vec2SpkRegConfig
base_model_prefix = "wav2vec2"
main_input_name = "input_values"
supports_gradient_checkpointing = True
_supports_flash_attn_2 = True
_supports_sdpa = True
def _init_weights(self, module):
"""Initialize the weights"""
# Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init.
if isinstance(module, Wav2Vec2ForPreTraining):
module.project_hid.reset_parameters()
module.project_q.reset_parameters()
module.project_hid._is_hf_initialized = True
module.project_q._is_hf_initialized = True
# gumbel softmax requires special init
elif isinstance(module, Wav2Vec2GumbelVectorQuantizer):
module.weight_proj.weight.data.normal_(mean=0.0, std=1)
module.weight_proj.bias.data.zero_()
nn.init.uniform_(module.codevectors)
elif isinstance(module, Wav2Vec2PositionalConvEmbedding):
nn.init.normal_(
module.conv.weight,
mean=0,
std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
)
nn.init.constant_(module.conv.bias, 0)
elif isinstance(module, Wav2Vec2FeatureProjection):
k = math.sqrt(1 / module.projection.in_features)
nn.init.uniform_(module.projection.weight, a=-k, b=k)
nn.init.uniform_(module.projection.bias, a=-k, b=k)
elif isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
elif isinstance(module, nn.Conv1d):
nn.init.kaiming_normal_(module.weight)
if module.bias is not None:
k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
nn.init.uniform_(module.bias, a=-k, b=k)
def _get_feat_extract_output_lengths(
self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
):
"""
Computes the output length of the convolutional layers
"""
add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
def _conv_out_length(input_length, kernel_size, stride):
# 1D convolutional layer output length formula taken
# from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
if add_adapter:
for _ in range(self.config.num_adapter_layers):
input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
return input_lengths
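    # A worked example of the formula above (illustrative only, not part of the original file): with the
    # default base configuration, conv_kernel = (10, 3, 3, 3, 3, 2, 2) and conv_stride = (5, 2, 2, 2, 2, 2, 2),
    # so one second of 16 kHz audio (16000 samples) is reduced to 49 feature frames:
    #
    #   lengths = 16000
    #   for k, s in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
    #       lengths = (lengths - k) // s + 1
    #   # lengths == 49, i.e. roughly one frame per 20 ms of audio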
def _get_feature_vector_attention_mask(
self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
):
        # Effectively attention_mask.sum(-1), but not in-place, so that it can also run
        # in inference mode.
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
output_lengths = output_lengths.to(torch.long)
batch_size = attention_mask.shape[0]
attention_mask = torch.zeros(
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
)
        # these two operations make sure that all values before the output length indices are attended to
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
return attention_mask
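    # Illustrative sketch of the flip/cumsum trick above (not part of the original file): for a single
    # example with output length 4 and feature_vector_length 6, the mask evolves as
    #
    #   [0, 0, 0, 1, 0, 0]   # 1 written at index output_length - 1
    #   [0, 0, 1, 0, 0, 0]   # flip
    #   [0, 0, 1, 1, 1, 1]   # cumsum
    #   [1, 1, 1, 1, 0, 0]   # flip back -> the first 4 frames are attended to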
def _get_adapters(self):
if self.config.adapter_attn_dim is None:
raise ValueError(f"{self.__class__} has no adapter layers. Make sure to define `config.adapter_attn_dim`.")
adapter_weights = {}
for name, module in self.named_modules():
if isinstance(module, Wav2Vec2AttnAdapterLayer):
for param_name, param in module.named_parameters():
adapter_weights[".".join([name, param_name])] = param
if isinstance(self, Wav2Vec2ForCTC):
for name, param in self.lm_head.named_parameters():
adapter_weights[".".join(["lm_head", name])] = param
return adapter_weights
def init_adapter_layers(self):
"""
(Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning
"""
# init attention adapters
for module in self.modules():
if isinstance(module, Wav2Vec2AttnAdapterLayer):
self._init_weights(module)
# init lm head
if isinstance(self, Wav2Vec2ForCTC):
self._init_weights(self.lm_head)
def load_adapter(self, target_lang: str, force_load=True, **kwargs):
r"""
Load a language adapter model from a pre-trained adapter model.
Parameters:
target_lang (`str`):
Has to be a language id of an existing adapter weight. Adapter weights are stored in the format
adapter.<lang>.safetensors or adapter.<lang>.bin
force_load (`bool`, defaults to `True`):
Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`.
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
token (`str` or `bool`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
<Tip>
To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.
</Tip>
mirror (`str`, *optional*):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
Please refer to the mirror site for more information.
<Tip>
Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
use this method in a firewalled environment.
</Tip>
Examples:
```python
>>> from transformers import Wav2Vec2ForCTC, AutoProcessor
>>> ckpt = "facebook/mms-1b-all"
>>> processor = AutoProcessor.from_pretrained(ckpt)
>>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng")
>>> # set specific language
>>> processor.tokenizer.set_target_lang("spa")
>>> model.load_adapter("spa")
```
"""
if self.config.adapter_attn_dim is None:
raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.")
if target_lang == self.target_lang and not force_load:
logger.warning(f"Adapter weights are already set to {target_lang}.")
return
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", None)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
token = kwargs.pop("token", None)
use_auth_token = kwargs.pop("use_auth_token", None)
revision = kwargs.pop("revision", None)
use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if token is not None:
raise ValueError(
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
)
token = use_auth_token
model_path_or_id = self.config._name_or_path
state_dict = None
# 1. Let's first try loading a safetensors adapter weight
if use_safetensors is not False:
filepath = WAV2VEC2_ADAPTER_SAFE_FILE.format(target_lang)
try:
weight_path = cached_file(
model_path_or_id,
filename=filepath,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
local_files_only=local_files_only,
token=token,
revision=revision,
cache_dir=cache_dir,
)
state_dict = safe_load_file(weight_path)
except EnvironmentError:
if use_safetensors:
                    # Raise any environment error raised by `cached_file`. It will have a helpful error message
                    # adapted to the original exception.
raise
except Exception:
# For any other exception, we throw a generic error.
if use_safetensors:
raise EnvironmentError(
f"Can't load the model for '{model_path_or_id}'. If you were trying to load it"
" from 'https://huggingface.co/models', make sure you don't have a local directory with the"
f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a"
f" directory containing a file named {filepath}."
)
# 2. If this didn't work let's try loading a PyTorch adapter weight
if state_dict is None:
filepath = WAV2VEC2_ADAPTER_PT_FILE.format(target_lang)
try:
weight_path = cached_file(
model_path_or_id,
filename=filepath,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
local_files_only=local_files_only,
token=token,
revision=revision,
cache_dir=cache_dir,
)
weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {}
state_dict = torch.load(
weight_path,
map_location="cpu",
**weights_only_kwarg,
)
except EnvironmentError:
                # Raise any environment error raised by `cached_file`. It will have a helpful error message
                # adapted to the original exception.
raise
except Exception:
# For any other exception, we throw a generic error.
raise EnvironmentError(
f"Can't load the model for '{model_path_or_id}'. If you were trying to load it"
" from 'https://huggingface.co/models', make sure you don't have a local directory with the"
f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a"
f" directory containing a file named {filepath}."
)
adapter_weights = self._get_adapters()
unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys())
missing_keys = set(adapter_weights.keys()) - set(state_dict.keys())
        if len(unexpected_keys) > 0:
            raise ValueError(f"The adapter weights at {weight_path} have unexpected keys: {', '.join(unexpected_keys)}.")
        elif len(missing_keys) > 0:
            raise ValueError(f"The adapter weights at {weight_path} are missing keys: {', '.join(missing_keys)}.")
        # make sure the new vocab size is correct
target_vocab_size = state_dict["lm_head.weight"].shape[0]
if target_vocab_size != self.config.vocab_size:
self.lm_head = nn.Linear(
self.config.output_hidden_size, target_vocab_size, device=self.device, dtype=self.dtype
)
self.config.vocab_size = target_vocab_size
        # make sure the loaded adapter weights have exactly the same precision and device placement as the adapter weights they overwrite
state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()}
self.load_state_dict(state_dict, strict=False)
        # set target language correctly
self.target_lang = target_lang
class Wav2Vec2SpkRegModel(Wav2Vec2SpkRegPreTrainedModel):
def __init__(self, config: Wav2Vec2SpkRegConfig):
super().__init__(config)
self.config = config
self.feature_extractor = Wav2Vec2FeatureEncoder(config)
self.feature_projection = Wav2Vec2FeatureProjection(config)
# model only needs masking vector if mask prob is > 0.0
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
if config.do_stable_layer_norm:
self.encoder = Wav2Vec2EncoderStableLayerNorm(config)
else:
self.encoder = Wav2Vec2Encoder(config)
self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
"""
self.feature_extractor._freeze_parameters()
def _mask_hidden_states(
self,
hidden_states: torch.FloatTensor,
mask_time_indices: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
"""
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://arxiv.org/abs/1904.08779).
"""
# `config.apply_spec_augment` can set masking to False
if not getattr(self.config, "apply_spec_augment", True):
return hidden_states
# generate indices & apply SpecAugment along time axis
batch_size, sequence_length, hidden_size = hidden_states.size()
if mask_time_indices is not None:
# apply SpecAugment along time axis with given mask_time_indices
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
elif self.config.mask_time_prob > 0 and self.training:
mask_time_indices = _compute_mask_indices(
(batch_size, sequence_length),
mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length,
attention_mask=attention_mask,
min_masks=self.config.mask_time_min_masks,
)
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
if self.config.mask_feature_prob > 0 and self.training:
# generate indices & apply SpecAugment along feature axis
mask_feature_indices = _compute_mask_indices(
(batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
)
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
hidden_states[mask_feature_indices] = 0
return hidden_states
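    # Rough intuition for the SpecAugment masking above (illustrative numbers, not from the original file):
    # with mask_time_prob=0.05 and mask_time_length=10, about 5% of frames are sampled as span starts and
    # each span of 10 consecutive frames is replaced by the learned `masked_spec_embed` vector; feature
    # masking works the same way along the hidden dimension, except that masked channels are zeroed out.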
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
mask_time_indices: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
extract_features = self.feature_extractor(input_values)
extract_features = extract_features.transpose(1, 2)
if attention_mask is not None:
# compute reduced attention_mask corresponding to feature vectors
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1], attention_mask, add_adapter=False
)
hidden_states, extract_features = self.feature_projection(extract_features)
hidden_states = self._mask_hidden_states(
hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
)
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if self.adapter is not None:
hidden_states = self.adapter(hidden_states)
if not return_dict:
return (hidden_states, extract_features) + encoder_outputs[1:]
return Wav2Vec2BaseModelOutput(
last_hidden_state=hidden_states,
extract_features=extract_features,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
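# A minimal shape sketch for the base model above (assumes the "base" configuration with hidden_size=768;
# illustrative only, not part of the original file):
#
#   model = Wav2Vec2SpkRegModel(Wav2Vec2SpkRegConfig())
#   waveform = torch.randn(1, 16000)                      # 1 second of 16 kHz audio
#   frames = model(waveform).last_hidden_state            # shape (1, 49, 768): one frame per ~20 ms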
class AngularLinear(nn.Module):
def __init__(self, in_features: int, out_features: int):
super(AngularLinear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.weight = torch.nn.Parameter(
torch.FloatTensor(out_features, in_features), requires_grad=True
)
nn.init.xavier_normal_(self.weight, gain=1)
def forward(
self,
inputs: torch.Tensor,
):
# Calculation of cos(theta)
cosine = F.linear(F.normalize(inputs), F.normalize(self.weight))
return cosine
def extra_repr(self) -> str:
return 'in_features={}, out_features={}'.format(
self.in_features, self.out_features
)
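# Because both the input embeddings and the weight rows are L2-normalized, AngularLinear returns cosine
# similarities in [-1, 1]; the margin losses below apply the margin and the scale on top of these values.
# A small sketch (illustrative, not from the original file):
#
#   head = AngularLinear(in_features=256, out_features=1000)
#   cosines = head(torch.randn(8, 256))   # shape (8, 1000), every entry in [-1, 1]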
class AMSoftmaxLoss(nn.Module):
"""Additive Margin Softmax (CosFace).
Paper: Wang, Feng, et al. "Additive margin softmax for face verification."
IEEE Signal Processing Letters 25.7 (2018): 926-930.
"""
def __init__(
self,
scale: float = 30.0,
margin: float = 0.35,
label_smoothing: float = 0.0,
reduction: str = "mean"
):
"""
        Args:
            scale: Scaling factor for the cosine logits (default: 30.0)
            margin: Additive cosine margin (default: 0.35)
            label_smoothing: Label smoothing factor (default: 0.0)
            reduction: Reduction method for the cross-entropy loss (default: "mean")
"""
super(AMSoftmaxLoss, self).__init__()
self.scale = scale
self.margin = margin
self.label_smoothing = label_smoothing
self.reduction = reduction
def forward(
self,
inputs: torch.Tensor,
targets: torch.Tensor,
):
"""
Args:
            inputs: Cosine similarities from `AngularLinear` of shape (batch_size, num_labels)
            targets: Ground truth labels of shape (batch_size,)
Returns:
Loss value
"""
_, num_labels = inputs.shape
# `inputs` are the outputs from AngularLinear()
cos_theta = torch.clamp(inputs, -1.0 + 1e-7, 1.0 - 1e-7)
psi = cos_theta - self.margin
one_hot = nn.functional.one_hot(targets, num_labels)
outputs = self.scale * torch.where(one_hot.bool(), psi, cos_theta)
loss = F.cross_entropy(
outputs, targets, label_smoothing=self.label_smoothing, reduction=self.reduction
)
return loss
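# Usage sketch for the CosFace loss above (illustrative, not part of the original file): the margin is
# subtracted only from the target-class cosine, so the target logit must exceed the competing cosines by
# at least `margin` (before scaling) for a confident prediction.
#
#   head = AngularLinear(256, 1000)
#   criterion = AMSoftmaxLoss(scale=30.0, margin=0.35)
#   loss = criterion(head(embeddings), labels)   # embeddings: (batch, 256), labels: (batch,)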
class AAMSoftmaxLoss(nn.Module):
"""Additive Angular Margin Softmax (ArcFace).
Paper: Deng, Jiankang, et al. "Arcface: Additive angular margin loss for deep face recognition."
Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2019.
"""
def __init__(
self,
scale: float = 30.0,
margin: float = 0.2,
easy_margin: bool = False,
label_smoothing: float = 0.0,
reduction: str = "mean"
):
"""
        Args:
            scale: Scaling factor for the cosine logits (default: 30.0)
            margin: Additive angular margin in radians (default: 0.2)
            easy_margin: Use the easy margin variant (default: False)
            label_smoothing: Label smoothing factor (default: 0.0)
            reduction: Reduction method for the cross-entropy loss (default: "mean")
"""
super(AAMSoftmaxLoss, self).__init__()
self.scale = scale
self.margin = margin
self.easy_margin = easy_margin
self.label_smoothing = label_smoothing
self.reduction = reduction
def forward(
self,
inputs: torch.Tensor,
targets: torch.Tensor,
):
"""
Args:
            inputs: Cosine similarities from `AngularLinear` of shape (batch_size, num_labels)
            targets: Ground truth labels of shape (batch_size,)
Returns:
Loss value
"""
_, num_labels = inputs.shape
# `inputs` are the outputs from AngularLinear()
epsilon = 1e-6
# theta = torch.acos(cos_theta)
# psi = torch.cos(theta + self.margin)
cos_theta = torch.clamp(inputs, -1.0 + epsilon, 1.0 - epsilon)
sin_theta = torch.sqrt(1.0 - torch.pow(cos_theta, 2))
sin_theta = torch.clamp(sin_theta, 0.0 + epsilon, 1.0 - epsilon)
cos_m = math.cos(self.margin)
sin_m = math.sin(self.margin)
psi = cos_theta * cos_m - sin_theta * sin_m # cos(theta + m)
if self.easy_margin:
psi = torch.where(cos_theta > 0, psi, cos_theta)
else:
            # Make cos(theta + m) monotonically decreasing for theta in [0°, 180°]
psi = torch.where((cos_theta - math.cos(math.pi - self.margin)) > 0, psi, cos_theta - self.margin)
one_hot = nn.functional.one_hot(targets, num_labels)
outputs = self.scale * torch.where(one_hot.bool(), psi, cos_theta)
loss = F.cross_entropy(
outputs, targets, label_smoothing=self.label_smoothing, reduction=self.reduction
)
return loss
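# Worked example of the additive angular margin above (illustrative, not from the original file): with
# margin m = 0.2 rad, a target-class cosine of 0.8 (theta ~= 0.64 rad) becomes cos(0.64 + 0.2) ~= 0.66,
# so the target logit is penalized unless the embedding sits well inside its class region; non-target
# cosines are left unchanged, and all logits are then multiplied by `scale`.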
class Wav2Vec2SpkRegForSequenceClassification(Wav2Vec2SpkRegPreTrainedModel):
def __init__(self, config):
super().__init__(config)
if hasattr(config, "add_adapter") and config.add_adapter:
raise ValueError(
"Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)"
)
self.wav2vec2 = Wav2Vec2SpkRegModel(config)
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
if config.use_weighted_layer_sum:
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
if self.config.loss_fct == 'cross_entropy':
self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
elif self.config.loss_fct == 'additive_margin':
self.classifier = AngularLinear(config.classifier_proj_size, config.num_labels)
elif self.config.loss_fct == 'additive_angular_margin':
self.classifier = AngularLinear(config.classifier_proj_size, config.num_labels)
else:
raise ValueError(f"Unsupported loss function: {self.config.loss_fct}")
# Initialize weights and apply final processing
self.post_init()
def freeze_feature_extractor(self):
"""
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
"""
warnings.warn(
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
"Please use the equivalent `freeze_feature_encoder` method instead.",
FutureWarning,
)
self.freeze_feature_encoder()
def freeze_feature_encoder(self):
"""
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
"""
self.wav2vec2.feature_extractor._freeze_parameters()
def freeze_base_model(self):
"""
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
"""
for param in self.wav2vec2.parameters():
param.requires_grad = False
def forward(
self,
input_values: Optional[torch.Tensor],
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if self.config.use_weighted_layer_sum:
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
hidden_states = torch.stack(hidden_states, dim=1)
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
else:
hidden_states = outputs[0]
hidden_states = self.projector(hidden_states)
if attention_mask is None:
pooled_output = hidden_states.mean(dim=1)
else:
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.loss_fct == 'cross_entropy':
loss_fct = nn.CrossEntropyLoss(
label_smoothing=self.config.label_smoothing,
reduction=self.config.reduction
)
elif self.config.loss_fct == 'additive_margin':
loss_fct = AMSoftmaxLoss(
scale=self.config.scale,
margin=self.config.margin,
label_smoothing=self.config.label_smoothing,
reduction=self.config.reduction
)
elif self.config.loss_fct == 'additive_angular_margin':
loss_fct = AAMSoftmaxLoss(
scale=self.config.scale,
margin=self.config.margin,
easy_margin=self.config.easy_margin,
label_smoothing=self.config.label_smoothing,
reduction=self.config.reduction
)
loss = loss_fct(
logits.view(-1, self.config.num_labels),
labels.view(-1),
)
if not return_dict:
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
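# A minimal end-to-end usage sketch (the checkpoint name, config field values, and the 16 kHz sampling
# rate are assumptions for illustration; this is not part of the original file):
#
#   import torch
#   from transformers import AutoFeatureExtractor
#
#   config = Wav2Vec2SpkRegConfig(num_labels=10, loss_fct="additive_angular_margin")
#   model = Wav2Vec2SpkRegForSequenceClassification(config)
#   feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
#
#   waveform = torch.randn(16000)  # 1 second of 16 kHz audio
#   inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
#   with torch.no_grad():
#       logits = model(**inputs).logits                        # (1, num_labels) cosine-based speaker logits
#   loss = model(**inputs, labels=torch.tensor([3])).loss      # AAM-softmax loss during training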