from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from transformers.modeling_outputs import (
    BaseModelOutput,
    ModelOutput,
)
#.............................................
@dataclass
class PosteriorDecoderModelOutput(ModelOutput):
    """
    Outputs of the posterior encoder / decoder path: the posterior latents with their
    distribution parameters, the sliced latent segment used for decoding, and the
    generated waveform.
    """

    labels_padding_mask: torch.FloatTensor = None
    posterior_latents: torch.FloatTensor = None
    posterior_means: torch.FloatTensor = None
    posterior_log_variances: torch.FloatTensor = None
    latents_slice: torch.FloatTensor = None
    ids_slice: torch.FloatTensor = None
    waveform: torch.FloatTensor = None
#.............................................................................................
@dataclass
class VitsModelOutput(ModelOutput):
    """
    Outputs of the full VITS model at inference time: the synthesized waveform, the
    output sequence lengths, the predicted spectrogram, and optional hidden states
    and attentions.
    """

    waveform: torch.FloatTensor = None
    sequence_lengths: torch.FloatTensor = None
    spectrogram: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
#.............................................................................................
@dataclass
class VitsTrainingOutput(ModelOutput):
    """
    Outputs collected during a VITS training forward pass: the generated waveform,
    predicted log-durations, the alignment attention, slice indices, padding masks,
    and the prior/posterior latent distribution parameters.
    """

    waveform: torch.FloatTensor = None
    log_duration: torch.FloatTensor = None
    attn: torch.FloatTensor = None
    ids_slice: torch.FloatTensor = None
    input_padding_mask: torch.FloatTensor = None
    labels_padding_mask: torch.FloatTensor = None
    latents: torch.FloatTensor = None
    prior_latents: torch.FloatTensor = None
    prior_means: torch.FloatTensor = None
    prior_log_variances: torch.FloatTensor = None
    posterior_means: torch.FloatTensor = None
    posterior_log_variances: torch.FloatTensor = None
#.............................................................................................
@dataclass
class VitsTextEncoderOutput(ModelOutput):
"""
Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The predicted mean values of the prior distribution for the latent text variables.
prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The predicted log-variance values of the prior distribution for the latent text variables.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor = None
prior_means: torch.FloatTensor = None
prior_log_variances: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
#.............................................................................................
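# Minimal usage sketch (illustrative only; the tensor sizes below are placeholders,
# not values taken from any model config). `ModelOutput` subclasses accept keyword
# construction and support attribute, key, and index access; fields left as `None`
# are omitted from the dict/tuple views.
if __name__ == "__main__":
    batch_size, sequence_length, hidden_size = 2, 16, 192  # hypothetical sizes

    encoder_outputs = VitsTextEncoderOutput(
        last_hidden_state=torch.zeros(batch_size, sequence_length, hidden_size),
        prior_means=torch.zeros(batch_size, sequence_length, hidden_size),
        prior_log_variances=torch.zeros(batch_size, sequence_length, hidden_size),
    )

    print(encoder_outputs.last_hidden_state.shape)  # attribute access
    print(encoder_outputs["prior_means"].shape)     # dict-style access
    print(len(encoder_outputs.to_tuple()))          # 3 -> the None fields are dropped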