from __future__ import annotations

from typing import Any

import torch
from torch import nn

from .heads import ISTFTHead
from .models import VocosBackbone


class Vocos(nn.Module):
    """
    The Vocos class implements a Fourier-based neural vocoder for audio synthesis.
    It is primarily designed for inference and decodes waveforms from precomputed
    acoustic features. The model consists of two main components: a backbone and a head.
    """

    def __init__(self, args):
        super().__init__()
        # Backbone network that maps input features to a hidden representation.
        self.backbone = VocosBackbone(
            input_channels=args.vocos.backbone.input_channels,
            dim=args.vocos.backbone.dim,
            intermediate_dim=args.vocos.backbone.intermediate_dim,
            num_layers=args.vocos.backbone.num_layers,
        )
        # Head that predicts STFT coefficients and inverts them to a waveform via ISTFT.
        self.head = ISTFTHead(
            dim=args.vocos.head.dim,
            n_fft=args.vocos.head.n_fft,
            hop_length=args.vocos.head.hop_length,
            padding=args.vocos.head.padding,
        )

    def forward(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Decode an audio waveform from precomputed features. The input is passed
        through the backbone and the head to reconstruct the audio signal.

        Args:
            features_input (Tensor): Input feature tensor of shape (B, C, L), where B is
                the batch size, C is the feature dimension, and L is the sequence length.

        Returns:
            Tensor: Reconstructed audio waveform of shape (B, T).
        """
        x = self.backbone(features_input, **kwargs)
        audio_output = self.head(x)
        return audio_output
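

# A minimal usage sketch, not part of the module API: the config values below
# are purely illustrative (they mirror a typical mel-based Vocos setup), and
# SimpleNamespace merely mimics the attribute-style `args` object (e.g. an
# OmegaConf config) that the constructor expects.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(
        vocos=SimpleNamespace(
            backbone=SimpleNamespace(
                input_channels=100, dim=512, intermediate_dim=1536, num_layers=8
            ),
            head=SimpleNamespace(dim=512, n_fft=1024, hop_length=256, padding="same"),
        )
    )
    model = Vocos(args).eval()
    features = torch.randn(1, 100, 200)  # (B, C, L) precomputed features
    with torch.no_grad():
        audio = model(features)  # (B, T), with T roughly L * hop_length
    print(audio.shape)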