# Third Party Imports
import torch
import onnxruntime as ort

# Local Imports
from vocal_isolation.models.mdx_net import Conv_TDF_net_trimm

# Global Variables
from vocal_isolation.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE
from vocal_isolation.constants import ONNX_MODEL_PATH, PRETRAINED_MODELS_DIRECTORY
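
# NOTE: The constants imported above are assumed to look roughly like the
# following (a sketch, not the actual vocal_isolation.constants module):
#   EXECUTION_PROVIDER_LIST = ["CUDAExecutionProvider", "CPUExecutionProvider"]
#   COMPUTATION_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   ONNX_MODEL_PATH = "<path to the vocals .onnx file>"  # hypothetical
#   PRETRAINED_MODELS_DIRECTORY = "<directory holding the .onnx files>"  # hypothetical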


class KimVocal:
    def __init__(self):
        self.models = [
            Conv_TDF_net_trimm(
                ONNX_MODEL_PATH, use_onnx=True, target_name="vocals",
                L=11, l=3, g=48, bn=8, bias=False,
                dim_f=11, dim_t=8,
            )
        ]

    def demix_both(self, music_tensor, sample_rate):
        """
        Isolate vocals AND instrumental using an ONNX model.
        Assumes the audio is loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix as a (channels, samples) tensor.
            sample_rate (int): Sample rate of the input audio.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The instrumental tensor and the
                vocals tensor, in that order.
        """
        number_of_samples = music_tensor.shape[1]

        # * Extracting vocals
        overlap = self.models[0].overlap
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size
        # Pad the mix on both ends along the sample dimension (the code assumes
        # a 2-channel/stereo mix): `pad_size` zeros make the length an exact
        # multiple of gen_size, and an extra `overlap` margin on each side gives
        # every chunk, including the first and last, context to trim.
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
            1,
        )
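
        # A worked example of the padding math above (illustrative numbers only,
        # not the actual model configuration):
        #   chunk_size = 261120, overlap = 3072
        #   gen_size   = 261120 - 2 * 3072 = 254976 new samples per chunk
        #   for a 1,000,000-sample mix:
        #   pad_size   = 254976 - 1000000 % 254976 = 254976 - 235072 = 19904
        #   padded len = 3072 + 1000000 + 19904 + 3072 = 1026048, i.e. exactly
        #   four gen_size strides plus the two overlap margins.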

        ort_session = ort.InferenceSession(
            f"{PRETRAINED_MODELS_DIRECTORY}/{self.models[0].target_name}.onnx",
            providers=EXECUTION_PROVIDER_LIST,
        )

        # process one chunk at a time (batch_size=1)
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size

        vocals_tensor = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()

        # Subtract the vocals from the input mix to obtain the instrumental.
        music_minus_vocals_tensor = music_tensor - vocals_tensor

        return music_minus_vocals_tensor, vocals_tensor

    def demix_vocals(self, music_tensor, sample_rate):
        """
        Isolate vocals using an ONNX model.
        Assumes the audio is loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix as a (channels, samples) tensor.
            sample_rate (int): Sample rate of the input audio.

        Returns:
            torch.Tensor: Vocals tensor after passing through the network.
        """
        number_of_samples = music_tensor.shape[1]
        overlap = self.models[0].overlap

        # Chunk sizes come from the model configuration; gen_size is the number
        # of new samples contributed by each chunk after trimming the overlap.
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
            1,
        )

        # Start running the session for the model
        ort_session = ort.InferenceSession(
            ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST
        )

        # process one chunk at a time (batch_size=1)
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            # Computation
            chunk = mix_padded[:, i : i + chunk_size]
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size

        vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()

        return vocals_output
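

if __name__ == "__main__":
    # Minimal usage sketch, assuming torchaudio is installed and a stereo
    # 44.1 kHz file exists at the hypothetical path "song.wav".
    import torchaudio

    music_tensor, sample_rate = torchaudio.load("song.wav")
    kim_vocal = KimVocal()
    instrumental, vocals = kim_vocal.demix_both(music_tensor, sample_rate)
    torchaudio.save("vocals.wav", vocals, sample_rate)
    torchaudio.save("instrumental.wav", instrumental, sample_rate)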