File size: 6,280 Bytes
ad16788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
# Copyright 2020 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""F0 extractor using DIO + Stonemask algorithm."""
import logging
from typing import Any
from typing import Dict
from typing import Tuple
from typing import Union
import humanfriendly
import numpy as np
import pyworld
import torch
import torch.nn.functional as F
from scipy.interpolate import interp1d
from typeguard import check_argument_types
from espnet.nets.pytorch_backend.nets_utils import pad_list
from espnet2.tts.feats_extract.abs_feats_extract import AbsFeatsExtract
class Dio(AbsFeatsExtract):
"""F0 estimation with dio + stonemask algortihm.
This is f0 extractor based on dio + stonmask algorithm introduced in `WORLD:
a vocoder-based high-quality speech synthesis system for real-time applications`_.
.. _`WORLD: a vocoder-based high-quality speech synthesis system for real-time
applications`: https://doi.org/10.1587/transinf.2015EDP7457
Note:
This module is based on NumPy implementation. Therefore, the computational graph
is not connected.
Todo:
Replace this module with PyTorch-based implementation.
"""
def __init__(
self,
fs: Union[int, str] = 22050,
n_fft: int = 1024,
hop_length: int = 256,
f0min: int = 80,
f0max: int = 400,
use_token_averaged_f0: bool = True,
use_continuous_f0: bool = True,
use_log_f0: bool = True,
reduction_factor: int = None,
):
assert check_argument_types()
super().__init__()
if isinstance(fs, str):
fs = humanfriendly.parse_size(fs)
self.fs = fs
self.n_fft = n_fft
self.hop_length = hop_length
self.frame_period = 1000 * hop_length / fs
self.f0min = f0min
self.f0max = f0max
self.use_token_averaged_f0 = use_token_averaged_f0
self.use_continuous_f0 = use_continuous_f0
self.use_log_f0 = use_log_f0
if use_token_averaged_f0:
assert reduction_factor >= 1
self.reduction_factor = reduction_factor
def output_size(self) -> int:
return 1
def get_parameters(self) -> Dict[str, Any]:
return dict(
fs=self.fs,
n_fft=self.n_fft,
hop_length=self.hop_length,
f0min=self.f0min,
f0max=self.f0max,
use_token_averaged_f0=self.use_token_averaged_f0,
use_continuous_f0=self.use_continuous_f0,
use_log_f0=self.use_log_f0,
reduction_factor=self.reduction_factor,
)
def forward(
self,
input: torch.Tensor,
input_lengths: torch.Tensor = None,
feats_lengths: torch.Tensor = None,
durations: torch.Tensor = None,
durations_lengths: torch.Tensor = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
# If not provide, we assume that the inputs have the same length
if input_lengths is None:
input_lengths = (
input.new_ones(input.shape[0], dtype=torch.long) * input.shape[1]
)
# F0 extraction
pitch = [self._calculate_f0(x[:xl]) for x, xl in zip(input, input_lengths)]
# (Optional): Adjust length to match with the mel-spectrogram
if feats_lengths is not None:
pitch = [
self._adjust_num_frames(p, fl).view(-1)
for p, fl in zip(pitch, feats_lengths)
]
# (Optional): Average by duration to calculate token-wise f0
if self.use_token_averaged_f0:
durations = durations * self.reduction_factor
pitch = [
self._average_by_duration(p, d).view(-1)
for p, d in zip(pitch, durations)
]
pitch_lengths = durations_lengths
else:
pitch_lengths = input.new_tensor([len(p) for p in pitch], dtype=torch.long)
# Padding
pitch = pad_list(pitch, 0.0)
# Return with the shape (B, T, 1)
return pitch.unsqueeze(-1), pitch_lengths
def _calculate_f0(self, input: torch.Tensor) -> torch.Tensor:
x = input.cpu().numpy().astype(np.double)
f0, timeaxis = pyworld.dio(
x,
self.fs,
f0_floor=self.f0min,
f0_ceil=self.f0max,
frame_period=self.frame_period,
)
f0 = pyworld.stonemask(x, f0, timeaxis, self.fs)
if self.use_continuous_f0:
f0 = self._convert_to_continuous_f0(f0)
if self.use_log_f0:
nonzero_idxs = np.where(f0 != 0)[0]
f0[nonzero_idxs] = np.log(f0[nonzero_idxs])
return input.new_tensor(f0.reshape(-1), dtype=torch.float)
@staticmethod
def _adjust_num_frames(x: torch.Tensor, num_frames: torch.Tensor) -> torch.Tensor:
if num_frames > len(x):
x = F.pad(x, (0, num_frames - len(x)))
elif num_frames < len(x):
x = x[:num_frames]
return x
@staticmethod
def _convert_to_continuous_f0(f0: np.array) -> np.array:
if (f0 == 0).all():
logging.warn("All frames seems to be unvoiced.")
return f0
# padding start and end of f0 sequence
start_f0 = f0[f0 != 0][0]
end_f0 = f0[f0 != 0][-1]
start_idx = np.where(f0 == start_f0)[0][0]
end_idx = np.where(f0 == end_f0)[0][-1]
f0[:start_idx] = start_f0
f0[end_idx:] = end_f0
# get non-zero frame index
nonzero_idxs = np.where(f0 != 0)[0]
# perform linear interpolation
interp_fn = interp1d(nonzero_idxs, f0[nonzero_idxs])
f0 = interp_fn(np.arange(0, f0.shape[0]))
return f0
def _average_by_duration(self, x: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
assert 0 <= len(x) - d.sum() < self.reduction_factor
d_cumsum = F.pad(d.cumsum(dim=0), (1, 0))
x_avg = [
x[start:end].masked_select(x[start:end].gt(0.0)).mean(dim=0)
if len(x[start:end].masked_select(x[start:end].gt(0.0))) != 0
else x.new_tensor(0.0)
for start, end in zip(d_cumsum[:-1], d_cumsum[1:])
]
return torch.stack(x_avg)
|