|
|
|
|
|
|
|
|
|
"""Sliding Window for raw audio input data.""" |
|
|
|
from typing import Optional
from typing import Tuple

import torch
from typeguard import check_argument_types

from espnet2.asr.frontend.abs_frontend import AbsFrontend
|
|
|
|
|
class SlidingWindow(AbsFrontend):
    """Sliding Window.

    Provides a sliding window over a batched continuous raw audio tensor.
    Optionally, provides padding (Currently not implemented).
    Combine this module with a pre-encoder compatible with raw audio data,
    for example Sinc convolutions.

    Known issues:
        The (padded) time axis of the batch must hold at least win_length
        samples, otherwise the unfold operation in forward() fails.
        Single utterances shorter than win_length get an output length of 0.
        WARNING: trailing values are discarded - padding not implemented yet.
        There is currently no additional window function applied to input values.
    """

    def __init__(
        self,
        win_length: int = 400,
        hop_length: int = 160,
        channels: int = 1,
        padding: Optional[int] = None,
        fs=None,
    ):
        """Initialize.

        Args:
            win_length: Length of frame.
            hop_length: Relative starting point of next frame.
            channels: Number of input channels.
            padding: Padding (placeholder, currently not implemented).
            fs: Sampling rate (placeholder for compatibility, not used).
        """
        assert check_argument_types()
        super().__init__()
        self.fs = fs
        self.win_length = win_length
        self.hop_length = hop_length
        self.channels = channels
        # Stored only; forward() does not apply any padding yet.
        self.padding = padding

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply a sliding window on the input.

        Args:
            input: Input (B, T, C*D) or (B, T*C*D), with D=C=1.
            input_lengths: Input lengths within batch.

        Returns:
            Tensor: Output with dimensions (B, T, C, D), with D=win_length.
            Tensor: Output lengths within batch. Entries whose input length
                is shorter than win_length are reported as 0.

        Raises:
            RuntimeError: If the time axis of `input` is shorter than
                win_length (raised by the unfold operation).
        """
        batch_size, num_samples = input.size(0), input.size(1)

        # reshape() instead of view() so non-contiguous inputs are accepted too.
        continuous = input.reshape(batch_size, num_samples, self.channels)

        # Unfolding the time axis appends the window axis as the last dimension:
        # (B, T, C) -> (B, T', C, D). Frames start every hop_length samples and
        # trailing samples that do not fill a whole window are dropped.
        output = continuous.unfold(1, self.win_length, self.hop_length).contiguous()

        output_lengths = (input_lengths - self.win_length) // self.hop_length + 1
        # A sequence shorter than one window contains no complete frame. Mask
        # those entries explicitly instead of relying on how `//` rounds a
        # negative numerator, which could yield a negative or non-zero length.
        too_short = input_lengths < self.win_length
        output_lengths = output_lengths.masked_fill(too_short, 0)
        return output, output_lengths

    def output_size(self) -> int:
        """Return output length of feature dimension D, i.e. the window length."""
        return self.win_length
|
|