Spaces:
Running
Running
Create rvc_infer.py
Browse files- rvc_infer.py +64 -0
rvc_infer.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torchaudio
|
6 |
+
import librosa
|
7 |
+
import pyworld as pw
|
8 |
+
from scipy.io import wavfile
|
9 |
+
|
10 |
+
class RVCModel:
    """Voice-conversion helper built on pyworld analysis/synthesis.

    The pipeline in :meth:`infer` is: analyse the waveform with pyworld
    (``harvest`` / ``cheaptrick`` / ``d4c``), run the ``sp`` array through a
    small 1-D convolutional network, then resynthesise audio with
    ``pw.synthesize`` using the original ``f0`` and ``ap``.

    Attributes:
        device: Torch device string the network and tensors are moved to.
        model: The ``nn.Sequential`` network returned by :meth:`load_model`.
        index: Array loaded by :meth:`load_index`, or ``None`` when absent.
        sr: Sample rate in Hz handed to every pyworld call (16000).
        hop_length: Set to 160; not referenced by any method of this class.
    """

    def __init__(self, model_path, index_path, device='cpu'):
        """Load the network weights and the optional index file.

        Args:
            model_path: Path to a state dict saved with ``torch.save``.
            index_path: Path to a NumPy file; may be ``None`` or missing,
                in which case ``self.index`` is ``None``.
            device: Torch device used for the model and for inference.
        """
        self.device = device
        self.model = self.load_model(model_path)
        self.index = self.load_index(index_path)
        self.sr = 16000
        self.hop_length = 160

    @staticmethod
    def _as_world_array(values):
        """Return *values* as a C-contiguous float64 NumPy array.

        Accepts torch tensors (moved to CPU and detached first) as well as
        anything ``np.ascontiguousarray`` understands. pyworld is called
        with the result, so layout and dtype are normalised in one place.
        """
        if isinstance(values, torch.Tensor):
            values = values.detach().cpu().numpy()
        return np.ascontiguousarray(values, dtype=np.float64)

    def load_model(self, path):
        """Build the fixed conv stack and load its weights from *path*.

        Args:
            path: File containing a state dict compatible with the
                three-layer ``Conv1d`` stack defined below.

        Returns:
            The network in eval mode, moved to ``self.device``.

        Raises:
            RuntimeError: From ``load_state_dict`` when the keys or shapes
                in the file do not match the architecture.
        """
        # NOTE(security): torch.load unpickles the file. Only load
        # checkpoints from trusted sources, or switch to weights_only=True
        # on torch versions that support it.
        state_dict = torch.load(path, map_location=self.device)
        model = nn.Sequential(
            nn.Conv1d(128, 512, 3, padding=1),
            nn.ReLU(),
            nn.Conv1d(512, 512, 3, padding=1),
            nn.ReLU(),
            nn.Conv1d(512, 128, 3, padding=1),
        )
        model.load_state_dict(state_dict)
        model.eval().to(self.device)
        return model

    def load_index(self, path):
        """Load the index array from *path* if it exists.

        Args:
            path: Location of a file readable by ``np.load``. ``None`` or
                an empty path is treated as "no index".

        Returns:
            The loaded array, or ``None`` when no file is available.
        """
        # Guard against None first: os.path.exists(None) raises TypeError.
        if not path or not os.path.exists(path):
            return None
        return np.load(path)

    def extract_features(self, audio):
        """Analyse *audio* and return ``(f0, sp, ap)``.

        Returns:
            ``f0`` as a float32 torch tensor on ``self.device``; ``sp`` and
            ``ap`` exactly as returned by :meth:`compute_pyworld`.
        """
        f0, sp, ap = self.compute_pyworld(audio)
        f0_tensor = torch.from_numpy(f0).float().to(self.device)
        return f0_tensor, sp, ap

    def compute_pyworld(self, audio):
        """Run pyworld analysis on a mono waveform at ``self.sr``.

        Args:
            audio: 1-D waveform; converted to C-contiguous float64 before
                being handed to pyworld.

        Returns:
            Tuple ``(f0, sp, ap)`` from ``pw.harvest``, ``pw.cheaptrick``
            and ``pw.d4c`` respectively.
        """
        audio = self._as_world_array(audio)
        f0, t = pw.harvest(audio, self.sr)
        sp = pw.cheaptrick(audio, f0, t, self.sr)
        ap = pw.d4c(audio, f0, t, self.sr)
        return f0, sp, ap

    def infer(self, audio):
        """Convert *audio* and return the resynthesised waveform.

        Args:
            audio: 1-D waveform sampled at ``self.sr``.

        Returns:
            float32 NumPy waveform produced by :meth:`reconstruct_audio`.
        """
        f0, sp, ap = self.extract_features(audio)

        # (frames, bins) -> (1, bins, frames), the layout Conv1d expects.
        # NOTE(review): the first Conv1d takes 128 input channels, so sp must
        # have 128 bins per frame. pyworld's default cheaptrick output at
        # 16 kHz is presumably wider (513 bins) -- confirm whether a
        # dimensionality-reduction step is missing here.
        sp = torch.from_numpy(sp).float().permute(1, 0).unsqueeze(0).to(self.device)

        with torch.no_grad():
            converted = self.model(sp)

        # Back to (frames, bins); layout/dtype are fixed up at synthesis.
        converted = converted.squeeze(0).permute(1, 0).cpu().numpy()
        return self.reconstruct_audio(f0, converted, ap)

    def reconstruct_audio(self, f0, sp, ap):
        """Synthesise a waveform from ``f0``, ``sp`` and ``ap``.

        Each argument may be a NumPy array or a torch tensor. All three are
        converted to C-contiguous float64 arrays first, because :meth:`infer`
        passes ``f0`` as a tensor and ``sp`` as a transposed (non-contiguous)
        view.

        Returns:
            float32 NumPy waveform with NaN/inf replaced by finite values.
        """
        y = pw.synthesize(
            self._as_world_array(f0).reshape(-1),
            self._as_world_array(sp),
            self._as_world_array(ap),
            self.sr,
        )
        return np.nan_to_num(y).astype(np.float32)
|