first commit
(This view is limited to 50 files because the commit contains too many changes.)
- .gitattributes +3 -1
- .gitignore +3 -0
- LICENSE +21 -0
- README.md +1 -0
- Utils/JDC/__init__.py +1 -0
- Utils/JDC/bst.t7.txt +1 -0
- Utils/JDC/model.py +190 -0
- Utils/__init__.py +1 -0
- app.py +142 -0
- config_v1_16k.json +42 -0
- dataset/audio/p225/p225_220.wav +0 -0
- dataset/audio/p226/p226_341.wav +0 -0
- dataset/audio/p227/p227_021.wav +0 -0
- dataset/audio/p228/p228_242.wav +0 -0
- dataset/audio/p229/p229_021.wav +0 -0
- dataset/audio/p230/p230_361.wav +0 -0
- dataset/audio/p231/p231_197.wav +0 -0
- dataset/audio/p232/p232_023.wav +0 -0
- dataset/audio/p233/p233_323.wav +0 -0
- dataset/audio/p234/p234_229.wav +0 -0
- dataset/audio/p236/p236_068.wav +0 -0
- dataset/audio/p237/p237_023.wav +0 -0
- dataset/audio/p238/p238_023.wav +0 -0
- dataset/audio/p239/p239_023.wav +0 -0
- dataset/audio/p240/p240_004.wav +0 -0
- dataset/audio/p241/p241_050.wav +0 -0
- dataset/audio/p243/p243_087.wav +0 -0
- dataset/audio/p244/p244_008.wav +0 -0
- dataset/audio/p245/p245_014.wav +0 -0
- dataset/audio/p246/p246_022.wav +0 -0
- dataset/audio/p247/p247_380.wav +0 -0
- dataset/audio/p248/p248_023.wav +0 -0
- dataset/audio/p249/p249_223.wav +0 -0
- dataset/audio/p250/p250_021.wav +0 -0
- dataset/audio/p251/p251_364.wav +0 -0
- dataset/audio/p252/p252_023.wav +0 -0
- dataset/audio/p253/p253_207.wav +0 -0
- dataset/audio/p254/p254_023.wav +0 -0
- dataset/audio/p255/p255_038.wav +0 -0
- dataset/audio/p256/p256_079.wav +0 -0
- dataset/audio/p257/p257_023.wav +0 -0
- dataset/audio/p258/p258_228.wav +0 -0
- dataset/audio/p259/p259_011.wav +0 -0
- dataset/audio/p260/p260_103.wav +0 -0
- dataset/audio/p261/p261_023.wav +0 -0
- dataset/audio/p262/p262_210.wav +0 -0
- dataset/audio/p263/p263_218.wav +0 -0
- dataset/audio/p264/p264_438.wav +0 -0
- dataset/audio/p265/p265_273.wav +0 -0
- dataset/audio/p266/p266_417.wav +0 -0
.gitattributes
CHANGED
@@ -11,7 +11,7 @@
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
+# *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+exp/default/g_00700000 filter=lfs diff=lfs merge=lfs -text
+Utils/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__
+flagged
+out.wav
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Jingyi Li
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -7,6 +7,7 @@ sdk: gradio
 sdk_version: 4.22.0
 app_file: app.py
 pinned: false
+license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Utils/JDC/__init__.py
ADDED
@@ -0,0 +1 @@
+
Utils/JDC/bst.t7.txt
ADDED
@@ -0,0 +1 @@
+https://github.com/yl4579/HiFTNet/blob/main/Utils/JDC/bst.t7
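The JDC checkpoint itself is not inlined here; bst.t7.txt only records its upstream location (and the .gitattributes change above marks Utils/JDC/bst.t7 for LFS). If the checkpoint is missing locally, a minimal fetch sketch, assuming GitHub's standard raw-download variant of the blob URL recorded in the file:

# Sketch (assumption): fetch bst.t7 via GitHub's raw endpoint,
# placing it next to the placeholder file.
import urllib.request

url = "https://github.com/yl4579/HiFTNet/raw/main/Utils/JDC/bst.t7"
urllib.request.urlretrieve(url, "Utils/JDC/bst.t7")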
Utils/JDC/model.py
ADDED
@@ -0,0 +1,190 @@
+"""
+Implementation of model from:
+Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
+Convolutional Recurrent Neural Networks" (2019)
+Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
+"""
+import torch
+from torch import nn
+
+class JDCNet(nn.Module):
+    """
+    Joint Detection and Classification Network model for singing voice melody.
+    """
+    def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
+        super().__init__()
+        self.num_class = num_class
+
+        # input = (b, 1, 31, 513), b = batch size
+        self.conv_block = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False),  # out: (b, 64, 31, 513)
+            nn.BatchNorm2d(num_features=64),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Conv2d(64, 64, 3, padding=1, bias=False),  # (b, 64, 31, 513)
+        )
+
+        # res blocks
+        self.res_block1 = ResBlock(in_channels=64, out_channels=128)  # (b, 128, 31, 128)
+        self.res_block2 = ResBlock(in_channels=128, out_channels=192)  # (b, 192, 31, 32)
+        self.res_block3 = ResBlock(in_channels=192, out_channels=256)  # (b, 256, 31, 8)
+
+        # pool block
+        self.pool_block = nn.Sequential(
+            nn.BatchNorm2d(num_features=256),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.MaxPool2d(kernel_size=(1, 4)),  # (b, 256, 31, 2)
+            nn.Dropout(p=0.2),
+        )
+
+        # maxpool layers (for auxiliary network inputs)
+        # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2)
+        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
+        # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
+        self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
+        # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2)
+        self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))
+
+        # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
+        self.detector_conv = nn.Sequential(
+            nn.Conv2d(640, 256, 1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Dropout(p=0.2),
+        )
+
+        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+        self.bilstm_classifier = nn.LSTM(
+            input_size=512, hidden_size=256,
+            batch_first=True, bidirectional=True)  # (b, 31, 512)
+
+        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+        self.bilstm_detector = nn.LSTM(
+            input_size=512, hidden_size=256,
+            batch_first=True, bidirectional=True)  # (b, 31, 512)
+
+        # input: (b * 31, 512)
+        self.classifier = nn.Linear(in_features=512, out_features=self.num_class)  # (b * 31, num_class)
+
+        # input: (b * 31, 512)
+        self.detector = nn.Linear(in_features=512, out_features=2)  # (b * 31, 2) - binary classifier
+
+        # initialize weights
+        self.apply(self.init_weights)
+
+    def get_feature_GAN(self, x):
+        seq_len = x.shape[-2]
+        x = x.float().transpose(-1, -2)
+
+        convblock_out = self.conv_block(x)
+
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+
+        return poolblock_out.transpose(-1, -2)
+
+    def get_feature(self, x):
+        seq_len = x.shape[-2]
+        x = x.float().transpose(-1, -2)
+
+        convblock_out = self.conv_block(x)
+
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+
+        return self.pool_block[2](poolblock_out)
+
+    def forward(self, x):
+        """
+        Returns:
+            classification_prediction, detection_prediction
+            sizes: (b, 31, 722), (b, 31, 2)
+        """
+        ###############################
+        # forward pass for classifier #
+        ###############################
+        seq_len = x.shape[-1]
+        x = x.float().transpose(-1, -2)
+
+        convblock_out = self.conv_block(x)
+
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+
+
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+        GAN_feature = poolblock_out.transpose(-1, -2)
+        poolblock_out = self.pool_block[2](poolblock_out)
+
+        # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
+        classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
+        classifier_out, _ = self.bilstm_classifier(classifier_out)  # ignore the hidden states
+
+        classifier_out = classifier_out.contiguous().view((-1, 512))  # (b * 31, 512)
+        classifier_out = self.classifier(classifier_out)
+        classifier_out = classifier_out.view((-1, seq_len, self.num_class))  # (b, 31, num_class)
+
+        # sizes: (b, 31, 722), (b, 31, 2)
+        # classifier output consists of predicted pitch classes per frame
+        # detector output consists of: (isvoice, notvoice) estimates per frame
+        return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
+
+    @staticmethod
+    def init_weights(m):
+        if isinstance(m, nn.Linear):
+            nn.init.kaiming_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.xavier_normal_(m.weight)
+        elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
+            for p in m.parameters():
+                if p.data is None:
+                    continue
+
+                if len(p.shape) >= 2:
+                    nn.init.orthogonal_(p.data)
+                else:
+                    nn.init.normal_(p.data)
+
+
+class ResBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
+        super().__init__()
+        self.downsample = in_channels != out_channels
+
+        # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
+        self.pre_conv = nn.Sequential(
+            nn.BatchNorm2d(num_features=in_channels),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.MaxPool2d(kernel_size=(1, 2)),  # apply downsampling on the y axis only
+        )
+
+        # conv layers
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
+        )
+
+        # 1 x 1 convolution layer to match the feature dimensions
+        self.conv1by1 = None
+        if self.downsample:
+            self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)
+
+    def forward(self, x):
+        x = self.pre_conv(x)
+        if self.downsample:
+            x = self.conv(x) + self.conv1by1(x)
+        else:
+            x = self.conv(x) + x
+        return x
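For orientation, a hypothetical smoke test of the JDCNet above (not part of the commit). The 80-bin mel input shape is an assumption taken from "num_mels": 80 in config_v1_16k.json: 80 bins halve through the three res blocks to 10, and the (1, 4) max-pool leaves 2, yielding the 256 * 2 = 512-wide LSTM input.

# Hypothetical smoke test; assumes mel input of shape (batch, 1, mel_bins, frames).
import torch
from Utils.JDC.model import JDCNet

model = JDCNet(num_class=1, seq_len=192).eval()  # same arguments as app.py uses
mel = torch.randn(2, 1, 80, 192)                 # 2 clips, 80 mel bins, 192 frames
with torch.no_grad():
    f0_out, gan_feature, pool_out = model(mel)
print(f0_out.shape)    # torch.Size([2, 192]) - one value per frame after squeeze()
print(pool_out.shape)  # torch.Size([2, 256, 192, 2])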
Utils/__init__.py
ADDED
@@ -0,0 +1 @@
+
app.py
ADDED
@@ -0,0 +1,142 @@
+import os
+import json
+import math
+
+import torch
+import torch.nn.functional as F
+import librosa
+import numpy as np
+import soundfile as sf
+import gradio as gr
+from transformers import WavLMModel
+
+from env import AttrDict
+from meldataset import mel_spectrogram, MAX_WAV_VALUE
+from models import Generator
+from stft import TorchSTFT
+from Utils.JDC.model import JDCNet
+
+
+# files
+hpfile = "config_v1_16k.json"
+ptfile = "exp/default/g_00700000"
+spk2id_path = "filelists/spk2id.json"
+f0_stats_path = "filelists/f0_stats.json"
+spk_stats_path = "filelists/spk_stats.json"
+spk_emb_dir = "dataset/spk"
+spk_wav_dir = "dataset/audio"
+
+# device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# load config
+with open(hpfile) as f:
+    data = f.read()
+json_config = json.loads(data)
+h = AttrDict(json_config)
+
+# load models
+F0_model = JDCNet(num_class=1, seq_len=192)
+generator = Generator(h, F0_model).to(device)
+stft = TorchSTFT(filter_length=h.gen_istft_n_fft, hop_length=h.gen_istft_hop_size, win_length=h.gen_istft_n_fft).to(device)
+
+state_dict_g = torch.load(ptfile, map_location=device)
+generator.load_state_dict(state_dict_g['generator'], strict=True)
+generator.remove_weight_norm()
+_ = generator.eval()
+
+wavlm = WavLMModel.from_pretrained("microsoft/wavlm-base-plus")
+wavlm.eval()
+wavlm.to(device)
+
+# load stats
+with open(spk2id_path) as f:
+    spk2id = json.load(f)
+with open(f0_stats_path) as f:
+    f0_stats = json.load(f)
+with open(spk_stats_path) as f:
+    spk_stats = json.load(f)
+
+# tune f0
+threshold = 10
+step = (math.log(1100) - math.log(50)) / 256
+def tune_f0(initial_f0, i):
+    if i == 0:
+        return initial_f0
+    voiced = initial_f0 > threshold
+    initial_lf0 = torch.log(initial_f0)
+    lf0 = initial_lf0 + step * i
+    f0 = torch.exp(lf0)
+    f0 = torch.where(voiced, f0, initial_f0)
+    return f0
+
+# convert function
+def convert(tgt_spk, src_wav, f0_shift=0):
+    tgt_ref = spk_stats[tgt_spk]["best_spk_emb"]
+    tgt_emb = f"{spk_emb_dir}/{tgt_spk}/{tgt_ref}.npy"
+
+    with torch.no_grad():
+        # tgt
+        spk_id = spk2id[tgt_spk]
+        spk_id = torch.LongTensor([spk_id]).unsqueeze(0).to(device)
+
+        spk_emb = np.load(tgt_emb)
+        spk_emb = torch.from_numpy(spk_emb).unsqueeze(0).to(device)
+
+        f0_mean_tgt = f0_stats[tgt_spk]["mean"]
+
+        # src
+        wav, sr = librosa.load(src_wav, sr=16000)
+        wav = torch.FloatTensor(wav).to(device)
+        mel = mel_spectrogram(wav.unsqueeze(0), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
+
+        x = wavlm(wav.unsqueeze(0)).last_hidden_state
+        x = x.transpose(1, 2)  # (B, C, T)
+        x = F.pad(x, (0, mel.size(2) - x.size(2)), 'constant')
+
+        # cvt
+        f0 = generator.get_f0(mel, f0_mean_tgt)
+        f0 = tune_f0(f0, f0_shift)
+        x = generator.get_x(x, spk_emb, spk_id)
+        y = generator.infer(x, f0, stft)
+
+    audio = y.squeeze()
+    audio = audio / torch.max(torch.abs(audio)) * 0.95
+    audio = audio * MAX_WAV_VALUE
+    audio = audio.cpu().numpy().astype('int16')
+
+    sf.write("out.wav", audio, h.sampling_rate, "PCM_16")
+
+    out_wav = "out.wav"
+    return out_wav
+
+# change spk
+def change_spk(tgt_spk):
+    tgt_ref = spk_stats[tgt_spk]["best_spk_emb"]
+    tgt_wav = f"{spk_wav_dir}/{tgt_spk}/{tgt_ref}.wav"
+    return tgt_wav
+
+# interface
+with gr.Blocks() as demo:
+    gr.Markdown("# PitchVC")
+    gr.Markdown("Gradio Demo for PitchVC. ([Github Repo](https://github.com/OlaWod/PitchVC))")
+
+    with gr.Row():
+        with gr.Column():
+            tgt_spk = gr.Dropdown(choices=spk2id.keys(), type="value", label="Target Speaker")
+            ref_audio = gr.Audio(label="Reference Audio", type='filepath')
+            src_audio = gr.Audio(label="Source Audio", type='filepath')
+            f0_shift = gr.Slider(minimum=-30, maximum=30, value=0, step=1, label="F0 Shift")
+        with gr.Column():
+            out_audio = gr.Audio(label="Output Audio", type='filepath')
+            submit = gr.Button(value="Submit")
+
+    tgt_spk.change(fn=change_spk, inputs=[tgt_spk], outputs=[ref_audio])
+    submit.click(convert, [tgt_spk, src_audio, f0_shift], [out_audio])
+
+    examples = gr.Examples(
+        examples=[["p225", 'dataset/audio/p226/p226_341.wav', 0],
+                  ["p226", 'dataset/audio/p225/p225_220.wav', -5]],
+        inputs=[tgt_spk, src_audio, f0_shift])
+
+demo.launch()
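A note on the F0 Shift slider above: tune_f0 works in log-frequency space, with step dividing the log range 50-1100 Hz into 256 bins, so each slider increment multiplies the voiced f0 by a constant factor exp(step). A standalone check of that arithmetic (illustration only, not part of the app):

import math

step = (math.log(1100) - math.log(50)) / 256  # one bin in log-f0 space
factor = math.exp(12 * step)                  # effect of a +12 slider shift
print(round(factor, 3))                       # 1.156 -> voiced pitch rises ~15.6%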
config_v1_16k.json
ADDED
@@ -0,0 +1,42 @@
+{
+    "F0_path": "Utils/JDC/bst.t7",
+
+    "use_aug": true,
+
+    "resblock": "1",
+    "num_gpus": 1,
+    "batch_size": 16,
+    "learning_rate": 0.0002,
+    "adam_b1": 0.8,
+    "adam_b2": 0.99,
+    "lr_decay": 0.999,
+    "seed": 1234,
+
+    "upsample_rates": [10,8],
+    "upsample_kernel_sizes": [20,16],
+    "upsample_initial_channel": 512,
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "gen_istft_n_fft": 16,
+    "gen_istft_hop_size": 4,
+
+    "segment_size": 16000,
+    "num_mels": 80,
+    "n_fft": 1024,
+    "hop_size": 320,
+    "win_size": 1024,
+
+    "sampling_rate": 16000,
+
+    "fmin": 0,
+    "fmax": 8000,
+    "fmax_for_loss": null,
+
+    "num_workers": 8,
+
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321",
+        "world_size": 1
+    }
+}
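One internal relationship in this config worth noting: the generator's cumulative upsampling (10 * 8) times gen_istft_hop_size (4) equals hop_size (320), which appears to follow the usual iSTFTNet-style constraint that total upsampling times the iSTFT hop reproduces the mel hop; 320 samples at 16 kHz is a 20 ms frame shift. A small sanity-check sketch under that assumption:

import json
import math

with open("config_v1_16k.json") as f:
    h = json.load(f)

upsampling = math.prod(h["upsample_rates"])                   # 10 * 8 = 80
assert upsampling * h["gen_istft_hop_size"] == h["hop_size"]  # 80 * 4 == 320
print(h["hop_size"] / h["sampling_rate"] * 1000, "ms")        # 20.0 ms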
dataset/audio/p225/p225_220.wav    ADDED    Binary file (101 kB)
dataset/audio/p226/p226_341.wav    ADDED    Binary file (93.3 kB)
dataset/audio/p227/p227_021.wav    ADDED    Binary file (294 kB)
dataset/audio/p228/p228_242.wav    ADDED    Binary file (87.1 kB)
dataset/audio/p229/p229_021.wav    ADDED    Binary file (239 kB)
dataset/audio/p230/p230_361.wav    ADDED    Binary file (132 kB)
dataset/audio/p231/p231_197.wav    ADDED    Binary file (36.9 kB)
dataset/audio/p232/p232_023.wav    ADDED    Binary file (285 kB)
dataset/audio/p233/p233_323.wav    ADDED    Binary file (133 kB)
dataset/audio/p234/p234_229.wav    ADDED    Binary file (73.8 kB)
dataset/audio/p236/p236_068.wav    ADDED    Binary file (89.2 kB)
dataset/audio/p237/p237_023.wav    ADDED    Binary file (272 kB)
dataset/audio/p238/p238_023.wav    ADDED    Binary file (372 kB)
dataset/audio/p239/p239_023.wav    ADDED    Binary file (265 kB)
dataset/audio/p240/p240_004.wav    ADDED    Binary file (119 kB)
dataset/audio/p241/p241_050.wav    ADDED    Binary file (64.6 kB)
dataset/audio/p243/p243_087.wav    ADDED    Binary file (109 kB)
dataset/audio/p244/p244_008.wav    ADDED    Binary file (225 kB)
dataset/audio/p245/p245_014.wav    ADDED    Binary file (154 kB)
dataset/audio/p246/p246_022.wav    ADDED    Binary file (196 kB)
dataset/audio/p247/p247_380.wav    ADDED    Binary file (92.2 kB)
dataset/audio/p248/p248_023.wav    ADDED    Binary file (396 kB)
dataset/audio/p249/p249_223.wav    ADDED    Binary file (116 kB)
dataset/audio/p250/p250_021.wav    ADDED    Binary file (225 kB)
dataset/audio/p251/p251_364.wav    ADDED    Binary file (128 kB)
dataset/audio/p252/p252_023.wav    ADDED    Binary file (324 kB)
dataset/audio/p253/p253_207.wav    ADDED    Binary file (101 kB)
dataset/audio/p254/p254_023.wav    ADDED    Binary file (286 kB)
dataset/audio/p255/p255_038.wav    ADDED    Binary file (114 kB)
dataset/audio/p256/p256_079.wav    ADDED    Binary file (119 kB)
dataset/audio/p257/p257_023.wav    ADDED    Binary file (242 kB)
dataset/audio/p258/p258_228.wav    ADDED    Binary file (89.2 kB)
dataset/audio/p259/p259_011.wav    ADDED    Binary file (191 kB)
dataset/audio/p260/p260_103.wav    ADDED    Binary file (121 kB)
dataset/audio/p261/p261_023.wav    ADDED    Binary file (286 kB)
dataset/audio/p262/p262_210.wav    ADDED    Binary file (118 kB)
dataset/audio/p263/p263_218.wav    ADDED    Binary file (101 kB)
dataset/audio/p264/p264_438.wav    ADDED    Binary file (125 kB)
dataset/audio/p265/p265_273.wav    ADDED    Binary file (119 kB)
dataset/audio/p266/p266_417.wav    ADDED    Binary file (89.2 kB)