kevinwang676 committed
Commit 169904c
1 Parent(s): e045432
Upload folder using huggingface_hub
- uvr5/__pycache__/vr.cpython-310.pyc +0 -0
- uvr5/lib/__pycache__/utils.cpython-310.pyc +0 -0
- uvr5/lib/lib_v5/__pycache__/layers_123821KB.cpython-310.pyc +0 -0
- uvr5/lib/lib_v5/__pycache__/model_param_init.cpython-310.pyc +0 -0
- uvr5/lib/lib_v5/__pycache__/nets_61968KB.cpython-310.pyc +0 -0
- uvr5/lib/lib_v5/__pycache__/spec_utils.cpython-310.pyc +0 -0
- uvr5/lib/lib_v5/layers_123821KB.py +118 -0
- uvr5/lib/lib_v5/model_param_init.py +69 -0
- uvr5/lib/lib_v5/modelparams/4band_v2.json +54 -0
- uvr5/lib/lib_v5/nets_61968KB.py +122 -0
- uvr5/lib/lib_v5/spec_utils.py +672 -0
- uvr5/lib/name_params.json +263 -0
- uvr5/lib/utils.py +121 -0
- uvr5/uvr_model/UVR-HP2.pth +3 -0
- uvr5/uvr_model/UVR-HP5.pth +3 -0
- uvr5/uvr_model/__init__.py +1 -0
- uvr5/vr.py +196 -0
uvr5/__pycache__/vr.cpython-310.pyc
ADDED
Binary file (4.2 kB).
uvr5/lib/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.98 kB).
uvr5/lib/lib_v5/__pycache__/layers_123821KB.cpython-310.pyc
ADDED
Binary file (4.05 kB).
uvr5/lib/lib_v5/__pycache__/model_param_init.cpython-310.pyc
ADDED
Binary file (1.63 kB).
uvr5/lib/lib_v5/__pycache__/nets_61968KB.cpython-310.pyc
ADDED
Binary file (3.43 kB).
uvr5/lib/lib_v5/__pycache__/spec_utils.cpython-310.pyc
ADDED
Binary file (13.6 kB).
uvr5/lib/lib_v5/layers_123821KB.py
ADDED
@@ -0,0 +1,118 @@
import torch
import torch.nn.functional as F
from torch import nn

from . import spec_utils


class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)


class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)


class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip


class Decoder(nn.Module):
    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h


class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        bottle = self.bottleneck(out)
        return bottle
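A minimal sketch (not part of the commit) of how these blocks compose, assuming PyTorch is installed and the repository root is on PYTHONPATH so uvr5.lib.lib_v5 is importable; the shapes in the comments follow from the kernel size 3, stride 2, padding 1 settings used by the network files below.

# Sketch only: exercise Encoder and ASPPModule on a dummy spectrogram tensor.
import torch

from uvr5.lib.lib_v5 import layers_123821KB as layers

x = torch.randn(1, 2, 256, 64)        # (batch, L/R channels, freq bins, frames)
enc = layers.Encoder(2, 32, 3, 2, 1)  # conv1 keeps size, conv2 halves both dims
h, skip = enc(x)                      # h: (1, 32, 128, 32), skip: (1, 32, 256, 64)
aspp = layers.ASPPModule(32, 64)      # pooled branch + dilated separable convs
print(aspp(h).shape)                  # torch.Size([1, 64, 128, 32])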
uvr5/lib/lib_v5/model_param_init.py
ADDED
@@ -0,0 +1,69 @@
import json
import os
import pathlib

default_param = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9  # training only
default_param["reduction_bins"] = 762  # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = 757
default_param["pre_filter_stop"] = 768
default_param["band"] = {}


default_param["band"][1] = {
    "sr": 11025,
    "hl": 128,
    "n_fft": 960,
    "crop_start": 0,
    "crop_stop": 245,
    "lpf_start": 61,  # inference only
    "res_type": "polyphase",
}

default_param["band"][2] = {
    "sr": 44100,
    "hl": 512,
    "n_fft": 1536,
    "crop_start": 24,
    "crop_stop": 547,
    "hpf_start": 81,  # inference only
    "res_type": "sinc_best",
}


def int_keys(d):
    r = {}
    for k, v in d:
        if k.isdigit():
            k = int(k)
        r[k] = v
    return r


class ModelParameters(object):
    def __init__(self, config_path=""):
        if ".pth" == pathlib.Path(config_path).suffix:
            import zipfile

            with zipfile.ZipFile(config_path, "r") as zip:
                self.param = json.loads(
                    zip.read("param.json"), object_pairs_hook=int_keys
                )
        elif ".json" == pathlib.Path(config_path).suffix:
            with open(config_path, "r") as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            self.param = default_param

        for k in [
            "mid_side",
            "mid_side_b",
            "mid_side_b2",
            "stereo_w",
            "stereo_n",
            "reverse",
        ]:
            if not k in self.param:
                self.param[k] = False
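A minimal sketch (not part of the commit) of how ModelParameters consumes the 4band_v2.json added below, assuming the working directory is the repository root: object_pairs_hook=int_keys turns the digit band keys into integers, and the channel flags (mid_side, reverse, ...) default to False when absent.

# Sketch only: load the band configuration and walk the bands in order.
from uvr5.lib.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("uvr5/lib/lib_v5/modelparams/4band_v2.json")
for d in sorted(mp.param["band"]):              # keys are ints thanks to int_keys
    bp = mp.param["band"][d]
    print(d, bp["sr"], bp["n_fft"], bp["crop_start"], bp["crop_stop"])
print(mp.param["mid_side"], mp.param["reverse"])  # False, False (defaults)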
uvr5/lib/lib_v5/modelparams/4band_v2.json
ADDED
@@ -0,0 +1,54 @@
{
    "bins": 672,
    "unstable_bins": 8,
    "reduction_bins": 637,
    "band": {
        "1": {
            "sr": 7350,
            "hl": 80,
            "n_fft": 640,
            "crop_start": 0,
            "crop_stop": 85,
            "lpf_start": 25,
            "lpf_stop": 53,
            "res_type": "polyphase"
        },
        "2": {
            "sr": 7350,
            "hl": 80,
            "n_fft": 320,
            "crop_start": 4,
            "crop_stop": 87,
            "hpf_start": 25,
            "hpf_stop": 12,
            "lpf_start": 31,
            "lpf_stop": 62,
            "res_type": "polyphase"
        },
        "3": {
            "sr": 14700,
            "hl": 160,
            "n_fft": 512,
            "crop_start": 17,
            "crop_stop": 216,
            "hpf_start": 48,
            "hpf_stop": 24,
            "lpf_start": 139,
            "lpf_stop": 210,
            "res_type": "polyphase"
        },
        "4": {
            "sr": 44100,
            "hl": 480,
            "n_fft": 960,
            "crop_start": 78,
            "crop_stop": 383,
            "hpf_start": 130,
            "hpf_stop": 86,
            "res_type": "kaiser_fast"
        }
    },
    "sr": 44100,
    "pre_filter_start": 668,
    "pre_filter_stop": 672
}
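For orientation (not part of the commit): crop_start/crop_stop are STFT bin indices, so the frequency span a band contributes is roughly bin * sr / n_fft, with the lpf/hpf settings cross-fading the overlapping band edges. A short sketch, assuming the repository root as working directory:

# Sketch only: print the approximate frequency range covered by each band.
import json

with open("uvr5/lib/lib_v5/modelparams/4band_v2.json") as f:
    cfg = json.load(f)

for name, band in cfg["band"].items():
    lo = band["crop_start"] * band["sr"] / band["n_fft"]
    hi = band["crop_stop"] * band["sr"] / band["n_fft"]
    print(f"band {name}: {lo:.0f}-{hi:.0f} Hz at sr={band['sr']}")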
uvr5/lib/lib_v5/nets_61968KB.py
ADDED
@@ -0,0 +1,122 @@
import torch
import torch.nn.functional as F
from torch import nn

from . import layers_123821KB as layers


class BaseASPPNet(nn.Module):
    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h


class CascadedASPPNet(nn.Module):
    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        self.offset = 128

    def forward(self, x, aggressiveness=None):
        mix = x.detach()
        x = x.clone()

        x = x[:, :, : self.max_bin]

        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
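A minimal sketch (not part of the commit) of a dummy forward pass, mirroring how vr.py builds the network (n_fft = mp.param["bins"] * 2, i.e. 1344 for 4band_v2); the random input only serves to check tensor shapes.

# Sketch only: shape check on an untrained CascadedASPPNet.
import torch

from uvr5.lib.lib_v5 import nets_61968KB as Nets

model = Nets.CascadedASPPNet(n_fft=1344).eval()
x = torch.randn(1, 2, 673, 512)       # (batch, L/R, freq bins = n_fft//2 + 1, frames)
with torch.no_grad():
    masked = model.predict(x, aggressiveness={"value": 0.1, "split_bin": 85})
print(masked.shape)                   # torch.Size([1, 2, 673, 256]); 128 frames trimmed per side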
uvr5/lib/lib_v5/spec_utils.py
ADDED
@@ -0,0 +1,672 @@
import hashlib
import json
import math
import os

import librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm


def crop_center(h1, h2):
    h1_shape = h1.size()
    h2_shape = h2.size()

    if h1_shape[3] == h2_shape[3]:
        return h1
    elif h1_shape[3] < h2_shape[3]:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    # s_freq = (h2_shape[2] - h1_shape[2]) // 2
    # e_freq = s_freq + h1_shape[2]
    s_time = (h1_shape[3] - h2_shape[3]) // 2
    e_time = s_time + h2_shape[3]
    h1 = h1[:, :, :, s_time:e_time]

    return h1


def wave_to_spectrogram(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)

    spec = np.asfortranarray([spec_left, spec_right])

    return spec


def wave_to_spectrogram_mt(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    import threading

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    def run_thread(**kwargs):
        global spec_left
        spec_left = librosa.stft(**kwargs)

    thread = threading.Thread(
        target=run_thread,
        kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
    )
    thread.start()
    spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
    thread.join()

    spec = np.asfortranarray([spec_left, spec_right])

    return spec


def combine_spectrograms(specs, mp):
    l = min([specs[i].shape[2] for i in specs])
    spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
    offset = 0
    bands_n = len(mp.param["band"])

    for d in range(1, bands_n + 1):
        h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
        spec_c[:, offset : offset + h, :l] = specs[d][
            :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
        ]
        offset += h

    if offset > mp.param["bins"]:
        raise ValueError("Too much bins")

    # lowpass fiter
    if (
        mp.param["pre_filter_start"] > 0
    ):  # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
        if bands_n == 1:
            spec_c = fft_lp_filter(
                spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
            )
        else:
            gp = 1
            for b in range(
                mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
            ):
                g = math.pow(
                    10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
                )
                gp = g
                spec_c[:, b, :] *= g

    return np.asfortranarray(spec_c)


def spectrogram_to_image(spec, mode="magnitude"):
    if mode == "magnitude":
        if np.iscomplexobj(spec):
            y = np.abs(spec)
        else:
            y = spec
        y = np.log10(y**2 + 1e-8)
    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            y = spec

    y -= y.min()
    y *= 255 / y.max()
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img


def reduce_vocal_aggressively(X, y, softmask):
    v = X - y
    y_mag_tmp = np.abs(y)
    v_mag_tmp = np.abs(v)

    v_mask = v_mag_tmp > y_mag_tmp
    y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)

    return y_mag * np.exp(1.0j * np.angle(y))


def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    if min_range < fade_size * 2:
        raise ValueError("min_range must be >= fade_area * 2")

    mag = mag.copy()

    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
    ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
            else:
                s -= fade_size

            if e != mag.shape[2]:
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
            else:
                e += fade_size

            mag[:, :, s + fade_size : e - fade_size] += ref[
                :, :, s + fade_size : e - fade_size
            ]
            old_e = e

    return mag


def align_wave_head_and_tail(a, b):
    l = min([a[0].size, b[0].size])

    return a[:l, :l], b[:l, :l]


def cache_or_load(mix_path, inst_path, mp):
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    cache_dir = "mph{}".format(
        hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
    )
    mix_cache_dir = os.path.join("cache", cache_dir)
    inst_cache_dir = os.path.join("cache", cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        X_spec_m = np.load(mix_cache_path)
        y_spec_m = np.load(inst_cache_path)
    else:
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):  # high-end band
                X_wave[d], _ = librosa.load(
                    mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
                )
                y_wave[d], _ = librosa.load(
                    inst_path,
                    bp["sr"],
                    False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
            else:  # lower bands
                X_wave[d] = librosa.resample(
                    X_wave[d + 1],
                    mp.param["band"][d + 1]["sr"],
                    bp["sr"],
                    res_type=bp["res_type"],
                )
                y_wave[d] = librosa.resample(
                    y_wave[d + 1],
                    mp.param["band"][d + 1]["sr"],
                    bp["sr"],
                    res_type=bp["res_type"],
                )

            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

            X_spec_s[d] = wave_to_spectrogram(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )
            y_spec_s[d] = wave_to_spectrogram(
                y_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        del X_wave, y_wave

        X_spec_m = combine_spectrograms(X_spec_s, mp)
        y_spec_m = combine_spectrograms(y_spec_s, mp)

        if X_spec_m.shape != y_spec_m.shape:
            raise ValueError("The combined spectrograms are different: " + mix_path)

        _, ext = os.path.splitext(mix_path)

        np.save(mix_cache_path, X_spec_m)
        np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m


def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hop_length)
    wave_right = librosa.istft(spec_right, hop_length=hop_length)

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        return np.asfortranarray(
            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
        )
    elif mid_side_b2:
        return np.asfortranarray(
            [
                np.add(wave_right / 1.25, 0.4 * wave_left),
                np.subtract(wave_left / 1.25, 0.4 * wave_right),
            ]
        )
    else:
        return np.asfortranarray([wave_left, wave_right])


def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    import threading

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    def run_thread(**kwargs):
        global wave_left
        wave_left = librosa.istft(**kwargs)

    thread = threading.Thread(
        target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
    )
    thread.start()
    wave_right = librosa.istft(spec_right, hop_length=hop_length)
    thread.join()

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        return np.asfortranarray(
            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
        )
    elif mid_side_b2:
        return np.asfortranarray(
            [
                np.add(wave_right / 1.25, 0.4 * wave_left),
                np.subtract(wave_left / 1.25, 0.4 * wave_right),
            ]
        )
    else:
        return np.asfortranarray([wave_left, wave_right])


def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    wave_band = {}
    bands_n = len(mp.param["band"])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = mp.param["band"][d]
        spec_s = np.ndarray(
            shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
        )
        h = bp["crop_stop"] - bp["crop_start"]
        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
            :, offset : offset + h, :
        ]

        offset += h
        if d == bands_n:  # higher
            if extra_bins_h:  # if --high_end_process bypass
                max_bin = bp["n_fft"] // 2
                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
                    :, :extra_bins_h, :
                ]
            if bp["hpf_start"] > 0:
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            if bands_n == 1:
                wave = spectrogram_to_wave(
                    spec_s,
                    bp["hl"],
                    mp.param["mid_side"],
                    mp.param["mid_side_b2"],
                    mp.param["reverse"],
                )
            else:
                wave = np.add(
                    wave,
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                )
        else:
            sr = mp.param["band"][d + 1]["sr"]
            if d == 1:  # lower
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave = librosa.resample(
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                    bp["sr"],
                    sr,
                    res_type="sinc_fastest",
                )
            else:  # mid
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave2 = np.add(
                    wave,
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                )
                # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
                wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")

    return wave.T


def fft_lp_filter(spec, bin_start, bin_stop):
    g = 1.0
    for b in range(bin_start, bin_stop):
        g -= 1 / (bin_stop - bin_start)
        spec[:, b, :] = g * spec[:, b, :]

    spec[:, bin_stop:, :] *= 0

    return spec


def fft_hp_filter(spec, bin_start, bin_stop):
    g = 1.0
    for b in range(bin_start, bin_stop, -1):
        g -= 1 / (bin_start - bin_stop)
        spec[:, b, :] = g * spec[:, b, :]

    spec[:, 0 : bin_stop + 1, :] *= 0

    return spec


def mirroring(a, spec_m, input_high_end, mp):
    if "mirroring" == a:
        mirror = np.flip(
            np.abs(
                spec_m[
                    :,
                    mp.param["pre_filter_start"]
                    - 10
                    - input_high_end.shape[1] : mp.param["pre_filter_start"]
                    - 10,
                    :,
                ]
            ),
            1,
        )
        mirror = mirror * np.exp(1.0j * np.angle(input_high_end))

        return np.where(
            np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
        )

    if "mirroring2" == a:
        mirror = np.flip(
            np.abs(
                spec_m[
                    :,
                    mp.param["pre_filter_start"]
                    - 10
                    - input_high_end.shape[1] : mp.param["pre_filter_start"]
                    - 10,
                    :,
                ]
            ),
            1,
        )
        mi = np.multiply(mirror, input_high_end * 1.7)

        return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)


def ensembling(a, specs):
    for i in range(1, len(specs)):
        if i == 1:
            spec = specs[0]

        ln = min([spec.shape[2], specs[i].shape[2]])
        spec = spec[:, :, :ln]
        specs[i] = specs[i][:, :, :ln]

        if "min_mag" == a:
            spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
        if "max_mag" == a:
            spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)

    return spec


def stft(wave, nfft, hl):
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
    spec_left = librosa.stft(wave_left, nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, nfft, hop_length=hl)
    spec = np.asfortranarray([spec_left, spec_right])

    return spec


def istft(spec, hl):
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])


if __name__ == "__main__":
    import argparse
    import sys
    import time

    import cv2
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
    p.add_argument(
        "--algorithm",
        "-a",
        type=str,
        choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
        default="min_mag",
    )
    p.add_argument(
        "--model_params",
        "-m",
        type=str,
        default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
    )
    p.add_argument("--output_name", "-o", type=str, default="output")
    p.add_argument("--vocals_only", "-v", action="store_true")
    p.add_argument("input", nargs="+")
    args = p.parse_args()

    start_time = time.time()

    if args.algorithm.startswith("invert") and len(args.input) != 2:
        raise ValueError("There should be two input files.")

    if not args.algorithm.startswith("invert") and len(args.input) < 2:
        raise ValueError("There must be at least two input files.")

    wave, specs = {}, {}
    mp = ModelParameters(args.model_params)

    for i in range(len(args.input)):
        spec = {}

        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):  # high-end band
                wave[d], _ = librosa.load(
                    args.input[i],
                    bp["sr"],
                    False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )

                if len(wave[d].shape) == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands
                wave[d] = librosa.resample(
                    wave[d + 1],
                    mp.param["band"][d + 1]["sr"],
                    bp["sr"],
                    res_type=bp["res_type"],
                )

            spec[d] = wave_to_spectrogram(
                wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        specs[i] = combine_spectrograms(spec, mp)

    del wave

    if args.algorithm == "deep":
        d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
        v_spec = d_spec - specs[1]
        sf.write(
            os.path.join("{}.wav".format(args.output_name)),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )

    if args.algorithm.startswith("invert"):
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

        if "invert_p" == args.algorithm:
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
            v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]

            if not args.vocals_only:
                X_mag = np.abs(specs[0])
                y_mag = np.abs(specs[1])
                v_mag = np.abs(v_spec)

                X_image = spectrogram_to_image(X_mag)
                y_image = spectrogram_to_image(y_mag)
                v_image = spectrogram_to_image(v_mag)

                cv2.imwrite("{}_X.png".format(args.output_name), X_image)
                cv2.imwrite("{}_y.png".format(args.output_name), y_image)
                cv2.imwrite("{}_v.png".format(args.output_name), v_image)

                sf.write(
                    "{}_X.wav".format(args.output_name),
                    cmb_spectrogram_to_wave(specs[0], mp),
                    mp.param["sr"],
                )
                sf.write(
                    "{}_y.wav".format(args.output_name),
                    cmb_spectrogram_to_wave(specs[1], mp),
                    mp.param["sr"],
                )

        sf.write(
            "{}_v.wav".format(args.output_name),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )
    else:
        if not args.algorithm == "deep":
            sf.write(
                os.path.join("ensembled", "{}.wav".format(args.output_name)),
                cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
                mp.param["sr"],
            )

    if args.algorithm == "align":
        trackalignment = [
            {
                "file1": '"{}"'.format(args.input[0]),
                "file2": '"{}"'.format(args.input[1]),
            }
        ]

        for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
            os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")

    # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
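A minimal sketch (not part of the commit) of the multiband analysis/synthesis round trip these helpers implement, following the same steps as the __main__ block above and vr.py below. It assumes an older librosa whose positional load/stft/resample signatures match the calls in this module; "song.wav" is a placeholder input path.

# Sketch only: per-band STFT, combine, and resynthesize.
import librosa
import numpy as np

from uvr5.lib.lib_v5 import spec_utils
from uvr5.lib.lib_v5.model_param_init import ModelParameters

mp = ModelParameters("uvr5/lib/lib_v5/modelparams/4band_v2.json")
bands_n = len(mp.param["band"])

wave, spec = {}, {}
for d in range(bands_n, 0, -1):  # highest band first, then resample downward
    bp = mp.param["band"][d]
    if d == bands_n:
        wave[d], _ = librosa.load(
            "song.wav", bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
        )
        if wave[d].ndim == 1:  # mono to stereo
            wave[d] = np.asfortranarray([wave[d], wave[d]])
    else:
        wave[d] = librosa.resample(
            wave[d + 1], mp.param["band"][d + 1]["sr"], bp["sr"], res_type=bp["res_type"]
        )
    spec[d] = spec_utils.wave_to_spectrogram(
        wave[d], bp["hl"], bp["n_fft"],
        mp.param["mid_side"], mp.param["mid_side_b2"], mp.param["reverse"],
    )

X_spec = spec_utils.combine_spectrograms(spec, mp)        # (2, bins + 1, frames), complex
wav_out = spec_utils.cmb_spectrogram_to_wave(X_spec, mp)  # (samples, 2) at mp.param["sr"]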
uvr5/lib/name_params.json
ADDED
@@ -0,0 +1,263 @@
{
    "equivalent" : [
        {
            "model_hash_name" : [
                {
                    "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100"
                },
                {
                    "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
                    "model_params": "lib/lib_v5/modelparams/4band_v2.json",
                    "param_name": "4band_v2"
                },
                {
                    "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
                    "model_params": "lib/lib_v5/modelparams/4band_v2.json",
                    "param_name": "4band_v2"
                },
                {
                    "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100"
                },
                {
                    "hash_name": "a82f14e75892e55e994376edbf0c8435",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100"
                },
                {
                    "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
                    "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
                    "param_name": "4band_v2_sn"
                },
                {
                    "hash_name": "08611fb99bd59eaa79ad27c58d137727",
                    "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
                    "param_name": "4band_v2_sn"
                },
                {
                    "hash_name": "5c7bbca45a187e81abbbd351606164e5",
                    "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
                    "param_name": "3band_44100_msb2"
                },
                {
                    "hash_name": "d6b2cb685a058a091e5e7098192d3233",
                    "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
                    "param_name": "3band_44100_msb2"
                },
                {
                    "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100"
                },
                {
                    "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100"
                },
                {
                    "hash_name": "68aa2c8093d0080704b200d140f59e54",
                    "model_params": "lib/lib_v5/modelparams/3band_44100.json",
                    "param_name": "3band_44100"
                },
                {
                    "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
                    "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
                    "param_name": "3band_44100_mid.json"
                },
                {
                    "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
                    "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
                    "param_name": "3band_44100_mid.json"
                },
                {
                    "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100.json"
                },
                {
                    "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100.json"
                },
                {
                    "hash_name": "89e83b511ad474592689e562d5b1f80e",
                    "model_params": "lib/lib_v5/modelparams/2band_32000.json",
                    "param_name": "2band_32000.json"
                },
                {
                    "hash_name": "0b954da81d453b716b114d6d7c95177f",
                    "model_params": "lib/lib_v5/modelparams/2band_32000.json",
                    "param_name": "2band_32000.json"
                }

            ],
            "v4 Models": [
                {
                    "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
                    "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
                    "param_name": "1band_sr16000_hl512"
                },
                {
                    "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
                    "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
                    "param_name": "1band_sr32000_hl512"
                },
                {
                    "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
                    "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
                    "param_name": "1band_sr32000_hl512"
                },
                {
                    "hash_name": "80ab74d65e515caa3622728d2de07d23",
                    "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
                    "param_name": "1band_sr32000_hl512"
                },
                {
                    "hash_name": "edc115e7fc523245062200c00caa847f",
                    "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
                    "param_name": "1band_sr33075_hl384"
                },
                {
                    "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
                    "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
                    "param_name": "1band_sr33075_hl384"
                },
                {
                    "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
                    "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
                    "param_name": "1band_sr44100_hl512"
                },
                {
                    "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
                    "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
                    "param_name": "1band_sr44100_hl512"
                },
                {
                    "hash_name": "ae702fed0238afb5346db8356fe25f13",
                    "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
                    "param_name": "1band_sr44100_hl1024"
                }
            ]
        }
    ],
    "User Models" : [
        {
            "1 Band": [
                {
                    "hash_name": "1band_sr16000_hl512",
                    "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
                    "param_name": "1band_sr16000_hl512"
                },
                {
                    "hash_name": "1band_sr32000_hl512",
                    "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
                    "param_name": "1band_sr16000_hl512"
                },
                {
                    "hash_name": "1band_sr33075_hl384",
                    "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
                    "param_name": "1band_sr33075_hl384"
                },
                {
                    "hash_name": "1band_sr44100_hl256",
                    "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl256.json",
                    "param_name": "1band_sr44100_hl256"
                },
                {
                    "hash_name": "1band_sr44100_hl512",
                    "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
                    "param_name": "1band_sr44100_hl512"
                },
                {
                    "hash_name": "1band_sr44100_hl1024",
                    "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
                    "param_name": "1band_sr44100_hl1024"
                }
            ],
            "2 Band": [
                {
                    "hash_name": "2band_44100_lofi",
                    "model_params": "lib/lib_v5/modelparams/2band_44100_lofi.json",
                    "param_name": "2band_44100_lofi"
                },
                {
                    "hash_name": "2band_32000",
                    "model_params": "lib/lib_v5/modelparams/2band_32000.json",
                    "param_name": "2band_32000"
                },
                {
                    "hash_name": "2band_48000",
                    "model_params": "lib/lib_v5/modelparams/2band_48000.json",
                    "param_name": "2band_48000"
                }
            ],
            "3 Band": [
                {
                    "hash_name": "3band_44100",
                    "model_params": "lib/lib_v5/modelparams/3band_44100.json",
                    "param_name": "3band_44100"
                },
                {
                    "hash_name": "3band_44100_mid",
                    "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
                    "param_name": "3band_44100_mid"
                },
                {
                    "hash_name": "3band_44100_msb2",
                    "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
                    "param_name": "3band_44100_msb2"
                }
            ],
            "4 Band": [
                {
                    "hash_name": "4band_44100",
                    "model_params": "lib/lib_v5/modelparams/4band_44100.json",
                    "param_name": "4band_44100"
                },
                {
                    "hash_name": "4band_44100_mid",
                    "model_params": "lib/lib_v5/modelparams/4band_44100_mid.json",
                    "param_name": "4band_44100_mid"
                },
                {
                    "hash_name": "4band_44100_msb",
                    "model_params": "lib/lib_v5/modelparams/4band_44100_msb.json",
                    "param_name": "4band_44100_msb"
                },
                {
                    "hash_name": "4band_44100_msb2",
                    "model_params": "lib/lib_v5/modelparams/4band_44100_msb2.json",
                    "param_name": "4band_44100_msb2"
                },
                {
                    "hash_name": "4band_44100_reverse",
                    "model_params": "lib/lib_v5/modelparams/4band_44100_reverse.json",
                    "param_name": "4band_44100_reverse"
                },
                {
                    "hash_name": "4band_44100_sw",
                    "model_params": "lib/lib_v5/modelparams/4band_44100_sw.json",
                    "param_name": "4band_44100_sw"
                },
                {
                    "hash_name": "4band_v2",
                    "model_params": "lib/lib_v5/modelparams/4band_v2.json",
                    "param_name": "4band_v2"
                },
                {
                    "hash_name": "4band_v2_sn",
                    "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
                    "param_name": "4band_v2_sn"
                },
                {
                    "hash_name": "tmodelparam",
                    "model_params": "lib/lib_v5/modelparams/tmodelparam.json",
                    "param_name": "User Model Param Set"
                }
            ]
        }
    ]
}
uvr5/lib/utils.py
ADDED
@@ -0,0 +1,121 @@
import json

import numpy as np
import torch
from tqdm import tqdm


def load_data(file_name: str = "./lib/name_params.json") -> dict:
    with open(file_name, "r") as f:
        data = json.load(f)

    return data


def make_padding(width, cropsize, offset):
    left = offset
    roi_size = cropsize - left * 2
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left

    return left, right, roi_size


def inference(X_spec, device, model, aggressiveness, data):
    """
    data : dic configs
    """

    def _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
    ):
        model.eval()
        with torch.no_grad():
            preds = []

            iterations = [n_window]

            total_iterations = sum(iterations)
            for i in tqdm(range(n_window)):
                start = i * roi_size
                X_mag_window = X_mag_pad[
                    None, :, :, start : start + data["window_size"]
                ]
                X_mag_window = torch.from_numpy(X_mag_window)
                if is_half:
                    X_mag_window = X_mag_window.half()
                X_mag_window = X_mag_window.to(device)

                pred = model.predict(X_mag_window, aggressiveness)

                pred = pred.detach().cpu().numpy()
                preds.append(pred[0])

            pred = np.concatenate(preds, axis=2)
        return pred

    def preprocess(X_spec):
        X_mag = np.abs(X_spec)
        X_phase = np.angle(X_spec)

        return X_mag, X_phase

    X_mag, X_phase = preprocess(X_spec)

    coef = X_mag.max()
    X_mag_pre = X_mag / coef

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    if list(model.state_dict().values())[0].dtype == torch.float16:
        is_half = True
    else:
        is_half = False
    pred = _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
    )
    pred = pred[:, :, :n_frame]

    if data["tta"]:
        pad_l += roi_size // 2
        pad_r += roi_size // 2
        n_window += 1

        X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

        pred_tta = _execute(
            X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
        )
        pred_tta = pred_tta[:, :, roi_size // 2 :]
        pred_tta = pred_tta[:, :, :n_frame]

        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
    else:
        return pred * coef, X_mag, np.exp(1.0j * X_phase)


def _get_name_params(model_path, model_hash):
    data = load_data()
    flag = False
    ModelName = model_path
    for type in list(data):
        for model in list(data[type][0]):
            for i in range(len(data[type][0][model])):
                if str(data[type][0][model][i]["hash_name"]) == model_hash:
                    flag = True
                elif str(data[type][0][model][i]["hash_name"]) in ModelName:
                    flag = True

                if flag:
                    model_params_auto = data[type][0][model][i]["model_params"]
                    param_name_auto = data[type][0][model][i]["param_name"]
                    if type == "equivalent":
                        return param_name_auto, model_params_auto
                    else:
                        flag = False
    return param_name_auto, model_params_auto
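A minimal sketch (not part of the commit) of how inference() is driven, in the same way vr.py calls it, with an untrained CascadedASPPNet and a random complex spectrogram as stand-ins; only the "window_size" and "tta" keys of data are read here.

# Sketch only: window the magnitude spectrogram through the model and rebuild the mask.
import numpy as np
import torch

from uvr5.lib.lib_v5 import nets_61968KB as Nets
from uvr5.lib.utils import inference

model = Nets.CascadedASPPNet(1344).eval()                 # untrained stand-in
X_spec_m = (np.random.randn(2, 673, 1024)
            + 1j * np.random.randn(2, 673, 1024)).astype(np.complex64)

data = {"window_size": 512, "tta": False}
aggressiveness = {"value": 0.1, "split_bin": 85}

with torch.no_grad():
    pred, X_mag, X_phase = inference(X_spec_m, "cpu", model, aggressiveness, data)

y_spec_m = pred * X_phase        # masked (instrument) spectrogram estimate
v_spec_m = X_spec_m - y_spec_m   # residual (vocal) estimate, as in vr.py
print(pred.shape)                # (2, 673, 1024)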
uvr5/uvr_model/UVR-HP2.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05
size 63454827
uvr5/uvr_model/UVR-HP5.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee
size 63454827
uvr5/uvr_model/__init__.py
ADDED
@@ -0,0 +1 @@
uvr5/vr.py
ADDED
@@ -0,0 +1,196 @@
import os, sys

parent_directory = os.path.dirname(os.path.abspath(__file__))
import logging, pdb

logger = logging.getLogger(__name__)

import librosa
import numpy as np
import soundfile as sf
import torch
from uvr5.lib.lib_v5 import nets_61968KB as Nets
from uvr5.lib.lib_v5 import spec_utils
from uvr5.lib.lib_v5.model_param_init import ModelParameters
from uvr5.lib.utils import inference


class AudioPre:
    def __init__(self, agg, model_path, device, is_half, tta=False):
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            "postprocess": False,
            "tta": tta,
            # Constants
            "window_size": 512,
            "agg": agg,
            "high_end_process": "mirroring",
        }
        mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json" % parent_directory)
        model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    def _path_audio_(
        self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
    ):
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
        bands_n = len(self.mp.param["band"])
        # print(bands_n)
        for d in range(bands_n, 0, -1):
            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
                (
                    X_wave[d],
                    _,
                ) = librosa.core.load(  # in theory librosa may misread some audio; ffmpeg would be more robust, but that was too much hassle and was dropped
                    music_file,
                    bp["sr"],
                    False,
                    dtype=np.float32,
                    res_type=bp["res_type"],
                )
                if X_wave[d].ndim == 1:
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.core.resample(
                    X_wave[d + 1],
                    self.mp.param["band"][d + 1]["sr"],
                    bp["sr"],
                    res_type=bp["res_type"],
                )
            # Stft of wave source
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                self.mp.param["mid_side"],
                self.mp.param["mid_side_b2"],
                self.mp.param["reverse"],
            )
            # pdb.set_trace()
            if d == bands_n and self.data["high_end_process"] != "none":
                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
                )
                input_high_end = X_spec_s[d][
                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
                ]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggresive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggresive_set,
            "split_bin": self.mp.param["band"][1]["crop_stop"],
        }
        with torch.no_grad():
            pred, X_mag, X_phase = inference(
                X_spec_m, self.device, self.model, aggressiveness, self.data
            )
        # Postprocess
        if self.data["postprocess"]:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        if is_hp3 == True:
            ins_root, vocal_root = vocal_root, ins_root

        if ins_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
                )
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
                    y_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
            logger.info("%s instruments done" % name)
            if is_hp3 == True:
                head = "vocal_"
            else:
                head = "instrument_"
            if format in ["wav", "flac"]:
                sf.write(
                    os.path.join(
                        ins_root,
                        head + "{}_{}.{}".format(name, self.data["agg"], format),
                    ),
                    (np.array(wav_instrument)).astype("float32"),
                    self.mp.param["sr"],
                )  #
            else:
                path = os.path.join(
                    ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
                )
                sf.write(
                    path,
                    (np.array(wav_instrument)).astype("float32"),
                    self.mp.param["sr"],
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
                    if os.path.exists(opt_format_path):
                        try:
                            os.remove(path)
                        except:
                            pass
        if vocal_root is not None:
            if is_hp3 == True:
                head = "instrument_"
            else:
                head = "vocal_"
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
                )
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
                    v_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
            logger.info("%s vocals done" % name)
            if format in ["wav", "flac"]:
                sf.write(
                    os.path.join(
                        vocal_root,
                        head + "{}_{}.{}".format(name, self.data["agg"], format),
                    ),
                    (np.array(wav_vocals)).astype("float32"),
                    self.mp.param["sr"],
                )
            else:
                path = os.path.join(
                    vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
                )
                sf.write(
                    path,
                    (np.array(wav_vocals)).astype("float32"),
                    self.mp.param["sr"],
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
                    if os.path.exists(opt_format_path):
                        try:
                            os.remove(path)
                        except:
                            pass
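A minimal usage sketch (not part of the commit) for AudioPre with the UVR-HP2.pth weights uploaded above; the input path and output directories are illustrative, and half precision is only enabled when CUDA is available.

# Sketch only: separate one file into instrumental and vocal stems.
import torch

from uvr5.vr import AudioPre

device = "cuda" if torch.cuda.is_available() else "cpu"
pre = AudioPre(
    agg=10,                                   # aggressiveness passed to the mask (0-100)
    model_path="uvr5/uvr_model/UVR-HP2.pth",
    device=device,
    is_half=torch.cuda.is_available(),
)
# Writes instrument_*.flac and vocal_*.flac into the given directories.
pre._path_audio_(
    "input/song.wav",                         # placeholder input path
    ins_root="output/instrument",
    vocal_root="output/vocal",
    format="flac",
)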