File size: 2,815 Bytes
ad16788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python3
#  2020, Technische Universität München;  Ludwig Kürzinger
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Sliding Window for raw audio input data."""

from typing import Optional
from typing import Tuple

import torch
from typeguard import check_argument_types

from espnet2.asr.frontend.abs_frontend import AbsFrontend


class SlidingWindow(AbsFrontend):
    """Sliding Window.

    Provides a sliding window over a batched continuous raw audio tensor.
    Optionally, provides padding (Currently not implemented).
    Combine this module with a pre-encoder compatible with raw audio data,
    for example Sinc convolutions.

    Known issues:
    WARNING: trailing values are discarded - padding not implemented yet.
    There is currently no additional window function applied to input values.
    Sequences shorter than win_length are reported with an output length of 0;
    the (padded) time axis of the batch itself must still hold at least
    win_length samples, otherwise the underlying unfold() call fails.
    """

    def __init__(
        self,
        win_length: int = 400,
        hop_length: int = 160,
        channels: int = 1,
        padding: Optional[int] = None,
        fs=None,
    ):
        """Initialize.

        Args:
            win_length: Length of frame.
            hop_length: Relative starting point of next frame.
            channels: Number of input channels.
            padding: Padding (placeholder, currently not implemented).
            fs:  Sampling rate (placeholder for compatibility, not used).
        """
        assert check_argument_types()
        super().__init__()
        self.fs = fs
        self.win_length = win_length
        self.hop_length = hop_length
        self.channels = channels
        self.padding = padding

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply a sliding window on the input.

        Args:
            input: Input (B, T, C*D) or (B, T*C*D), with D=C=1.
            input_lengths: Input lengths within batch.

        Returns:
            Tensor: Output with dimensions (B, T, C, D), with D=win_length.
            Tensor: Output lengths within batch. Entries whose input length
                is shorter than win_length are 0 (no complete window).
        """
        input_size = input.size()
        B = input_size[0]
        T = input_size[1]
        C = self.channels
        D = self.win_length
        # (B, T, C) --> (T, B, C)
        # reshape() behaves like view() for contiguous input but also accepts
        # non-contiguous tensors (e.g. slices or transposed views).
        continuous = input.reshape(B, T, C).permute(1, 0, 2)
        windowed = continuous.unfold(0, D, self.hop_length)
        # (T, B, C, D) --> (B, T, C, D)
        output = windowed.permute(1, 0, 2, 3).contiguous()
        # After unfold(), windowed lengths change. The number of complete
        # windows is (len - win_length) // hop_length + 1 for
        # len >= win_length and 0 otherwise. Adding hop_length to the
        # numerator folds the "+ 1" into the division; clamping at 0 before
        # dividing keeps short sequences at 0 instead of a negative length
        # and keeps the dividend non-negative, so the result does not depend
        # on whether integer division floors or truncates.
        shifted_lengths = input_lengths - self.win_length + self.hop_length
        output_lengths = torch.clamp(shifted_lengths, min=0) // self.hop_length
        return output, output_lengths

    def output_size(self) -> int:
        """Return output length of feature dimension D, i.e. the window length."""
        return self.win_length