import os.path

import torch


def _split(sr, audio):
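    """Separate a waveform into vocal (foreground) and accompaniment (background)
    components using nearest-neighbor filtering plus soft masking, adapted from
    the librosa vocal-separation example (code source: Brian McFee, ISC license).

    `audio` is a torch tensor at sample rate `sr`; it is written to disk,
    reloaded at 16 kHz, and processed in the STFT domain.

    Returns (foreground_audio, background_audio, sr) where the first two are
    NumPy waveforms and sr is 16000.
    """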
    import scipy.io.wavfile
    import librosa

    scipy.io.wavfile.write('speakeraudio.wav', sr, audio.detach().cpu().numpy())
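    # Round-trip through a WAV file so librosa can reload the signal,
    # resampled to 16 kHz mono.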

    audio, sr = librosa.load('speakeraudio.wav', sr=16000)

    # Code source: Brian McFee
    # License: ISC

    import numpy as np

    S_full, phase = librosa.magphase(librosa.stft(audio))
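    # S_full is the magnitude spectrogram; the phase is kept so the masked
    # magnitudes can be inverted back to waveforms with istft at the end.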

    # We'll compare frames using cosine similarity, and aggregate similar frames
    # by taking their (per-frequency) median value.
    #
    # To avoid being biased by local continuity, we constrain similar frames to be
    # separated by at least 2 seconds.
    #
    # This suppresses sparse/non-repetitive deviations from the average spectrum,
    # and works well to discard vocal elements.

    S_filter = librosa.decompose.nn_filter(S_full,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=int(librosa.time_to_frames(2, sr=sr)))

    # The output of the filter shouldn't be greater than the input
    # if we assume signals are additive.  Taking the pointwise minimum
    # with the input spectrum forces this.
    S_filter = np.minimum(S_full, S_filter)

    # We can also use a margin to reduce bleed between the vocals and instrumentation masks.
    # Note: the margins need not be equal for foreground and background separation
    margin_i, margin_v = 2, 10
    power = 2

    mask_i = librosa.util.softmask(S_filter,
                                   margin_i * (S_full - S_filter),
                                   power=power)

    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)
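
    # softmask(X, X_ref, power=p) computes X**p / (X**p + X_ref**p) elementwise,
    # so scaling X_ref by a larger margin makes the corresponding mask more
    # conservative.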

    # Once we have the masks, simply multiply them with the input spectrum
    # to separate the components

    S_foreground = mask_v * S_full
    S_background = mask_i * S_full

    # S_full_audio = librosa.istft(S_full*phase)
    S_foreground_audio = librosa.istft(S_foreground*phase)
    S_background_audio = librosa.istft(S_background*phase)

    return S_foreground_audio, S_background_audio, sr


def split(sr, audio):
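    """Separate a waveform into vocals and accompaniment with Demucs.

    The input tensor is written to 'speakeraudio.wav', the Demucs CLI is run
    with the 'htdemucs_6s' model in two-stem (vocals / no_vocals) mode, and the
    resulting stems are loaded back with torchaudio.

    Returns (vocals, additional, sr) where the first two are torch tensors of
    shape (channels, samples).
    """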
    import scipy.io.wavfile
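    # Demucs' command-line entry point operates on audio files, so write the
    # tensor to disk first.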
    scipy.io.wavfile.write('speakeraudio.wav', sr, audio.detach().cpu().numpy())
    # import torchaudio
    # torchaudio.save('speakeraudio.wav', audio.abs().unsqueeze(0), sr)

    import demucs.separate
    import shlex
    # model_name = 'htdemucs'
    model_name = 'htdemucs_6s'
    # model_name = 'mdx_extra_q'
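    # --two-stems vocals keeps only a vocals / no_vocals split; the
    # '{stem}.{ext}' filename pattern drops the per-track subdirectory so the
    # stems land directly in separated/<model_name>/ (matching the paths
    # below), and --float32 writes 32-bit float WAVs.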
    args = shlex.split(f'speakeraudio.wav -n {model_name} --two-stems vocals --filename {{stem}}.{{ext}} --float32')
    demucs.separate.main(args)

    # audio_other_files = [os.path.join('separated', model_name, f+'.wav') for f in ['bass', 'drums', 'other', 'piano', 'guitar'] if os.path.isfile(os.path.join('separated', model_name, f+'.wav'))]
    # audio_other_files = [os.path.join('separated', model_name, f+'.wav') for f in ['bass', 'other', 'piano', 'guitar'] if os.path.isfile(os.path.join('separated', model_name, f+'.wav'))]
    audio_vocals_file = os.path.join('separated', model_name, 'vocals.wav')
    other_file = os.path.join('separated', model_name, 'no_vocals.wav')

    import torchaudio

    vocals, sr = torchaudio.load(audio_vocals_file)
    additional, _ = torchaudio.load(other_file)

    return vocals, additional, sr


# def split(sr, audio):
#     return audio, audio, sr
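

# Minimal usage sketch. The input path, output filenames, and the choice of the
# first channel are illustrative assumptions, not part of the original module.
if __name__ == '__main__':
    import torchaudio

    # torchaudio.load returns a (channels, samples) float tensor plus the
    # sample rate; pass a single 1-D channel so split() writes a plain mono WAV.
    waveform, sample_rate = torchaudio.load('input.wav')

    vocals, accompaniment, out_sr = split(sample_rate, waveform[0])

    # Both stems come back as (channels, samples) tensors from torchaudio.load.
    torchaudio.save('vocals_out.wav', vocals, out_sr)
    torchaudio.save('accompaniment_out.wav', accompaniment, out_sr)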