Spaces:
Runtime error
Runtime error
Delete preprocess_hubert_f0.py
Browse files- preprocess_hubert_f0.py +0 -172
preprocess_hubert_f0.py
DELETED
@@ -1,172 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import logging
|
3 |
-
import os
|
4 |
-
import random
|
5 |
-
from concurrent.futures import ProcessPoolExecutor
|
6 |
-
from glob import glob
|
7 |
-
from random import shuffle
|
8 |
-
|
9 |
-
import librosa
|
10 |
-
import numpy as np
|
11 |
-
import torch
|
12 |
-
import torch.multiprocessing as mp
|
13 |
-
from loguru import logger
|
14 |
-
from tqdm import tqdm
|
15 |
-
|
16 |
-
import diffusion.logger.utils as du
|
17 |
-
import utils
|
18 |
-
from diffusion.vocoder import Vocoder
|
19 |
-
from modules.mel_processing import spectrogram_torch
|
20 |
-
|
21 |
-
# Raise the log threshold of two chatty third-party libraries to WARNING.
for _noisy_logger in ("numba", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Configuration is loaded once at import time; the values below are read as
# module-level globals by the functions in this file.
hps = utils.get_hparams_from_file("configs/config.json")
dconfig = du.load_config("configs/diffusion.yaml")

sampling_rate, hop_length = hps.data.sampling_rate, hps.data.hop_length
speech_encoder = hps["model"]["speech_encoder"]
|
29 |
-
|
30 |
-
|
31 |
-
def process_one(filename, hmodel, f0p, device, diff=False, mel_extractor=None):
    """Extract and cache the training features for a single audio file.

    Every feature is written next to the source file and is skipped when its
    output file already exists, so an interrupted run can be resumed.

    Files written (when missing):
      * ``<filename>.soft.pt``  - output of ``hmodel.encoder`` on 16 kHz audio
      * ``<filename>.f0.npy``   - ``(f0, uv)`` pair from the selected predictor
      * ``*.spec.pt``           - spectrogram (``.wav`` replaced in the path)
      * ``<filename>.vol.npy``  - volume, if ``diff`` or ``hps.model.vol_embedding``
      * ``<filename>.mel.npy``, ``<filename>.aug_mel.npy``,
        ``<filename>.aug_vol.npy`` - only if ``diff``

    Args:
        filename: Path of the audio file to process.
        hmodel: Speech encoder object exposing ``encoder(wav16k)``.
        f0p: F0 predictor name, forwarded to ``utils.get_f0_predictor``.
        device: Torch device used for the encoder input and mel extraction.
        diff: When true, also produce the mel and augmented features.
        mel_extractor: Object exposing ``extract(audio, sr, keyshift=...)``.
            May be ``None``; mel-based outputs are then skipped.

    Raises:
        ValueError: If the loaded sample rate differs from the configured one.
    """
    wav, sr = librosa.load(filename, sr=sampling_rate)
    audio_norm = torch.FloatTensor(wav).unsqueeze(0)

    # Speech-encoder content features, computed from a 16 kHz resample.
    soft_path = filename + ".soft.pt"
    if not os.path.exists(soft_path):
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)

    # F0 curve and the accompanying uv output of the predictor.
    f0_path = filename + ".f0.npy"
    if not os.path.exists(f0_path):
        f0_predictor = utils.get_f0_predictor(
            f0p,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            device=None,
            threshold=0.05,
        )
        f0, uv = f0_predictor.compute_f0_uv(wav)
        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))

    # Spectrogram.
    spec_path = filename.replace(".wav", ".spec.pt")
    if not os.path.exists(spec_path):
        # Defensive check: the audio was loaded with sr=sampling_rate above.
        if sr != hps.data.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sr, hps.data.sampling_rate
                )
            )

        spec = spectrogram_torch(
            audio_norm,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_path)

    # Volume; the extractor is reused by the augmentation step below, which
    # only runs when ``diff`` is true, so it is always defined there.
    if diff or hps.model.vol_embedding:
        volume_path = filename + ".vol.npy"
        volume_extractor = utils.Volume_Extractor(hop_length)
        if not os.path.exists(volume_path):
            volume = volume_extractor.extract(audio_norm)
            np.save(volume_path, volume.to('cpu').numpy())

    if diff:
        mel_path = filename + ".mel.npy"
        if not os.path.exists(mel_path) and mel_extractor is not None:
            mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
            mel = mel_t.squeeze().to('cpu').numpy()
            np.save(mel_path, mel)

        aug_mel_path = filename + ".aug_mel.npy"
        aug_vol_path = filename + ".aug_vol.npy"
        # The augmented mel needs an extractor. Previously a missing extractor
        # led to an unbound ``aug_mel_t`` and a crash at this point.
        need_aug_mel = mel_extractor is not None and not os.path.exists(aug_mel_path)
        need_aug_vol = not os.path.exists(aug_vol_path)

        # Skip the augmentation entirely when nothing would be written.
        if need_aug_mel or need_aug_vol:
            # Random gain (log10 scale); the upper bound is capped so that the
            # scaled peak amplitude stays at or below 1.
            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
            max_shift = min(1, np.log10(1 / max_amp))
            log10_vol_shift = random.uniform(-1, max_shift)
            keyshift = random.uniform(-5, 5)
            aug_audio = audio_norm * (10 ** log10_vol_shift)

            if need_aug_mel:
                aug_mel_t = mel_extractor.extract(
                    aug_audio, sampling_rate, keyshift=keyshift
                )
                aug_mel = aug_mel_t.squeeze().to('cpu').numpy()
                np.save(
                    aug_mel_path,
                    np.asanyarray((aug_mel, keyshift), dtype=object),
                )
            if need_aug_vol:
                aug_vol = volume_extractor.extract(aug_audio)
                np.save(aug_vol_path, aug_vol.to('cpu').numpy())
|
104 |
-
|
105 |
-
|
106 |
-
def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
    """Run ``process_one`` over a chunk of files in the current process.

    A rank is taken from the current process identity (0 when the identity
    tuple is empty). When CUDA is available the passed ``device`` is replaced
    by ``cuda:<rank % device_count>``. The speech encoder is loaded once per
    call and shared by every file in the chunk.

    Args:
        file_chunk: Iterable of audio file paths to process.
        f0p: F0 predictor name, forwarded to ``process_one``.
        diff: Forwarded to ``process_one``.
        mel_extractor: Forwarded to ``process_one``.
        device: Device to use when CUDA is not available.
    """
    logger.info("Loading speech encoder for content...")

    identity = mp.current_process()._identity
    worker_rank = identity[0] if identity else 0

    if torch.cuda.is_available():
        device = torch.device(
            "cuda:{}".format(worker_rank % torch.cuda.device_count())
        )
    logger.info(f"Rank {worker_rank} uses device {device}")

    encoder_model = utils.get_speech_encoder(speech_encoder, device=device)
    logger.info(f"Loaded speech encoder for rank {worker_rank}")

    # The progress bar row is the worker rank.
    for path in tqdm(file_chunk, position=worker_rank):
        process_one(path, encoder_model, f0p, device, diff, mel_extractor)
|
118 |
-
|
119 |
-
def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
    """Split ``filenames`` into ``num_processes`` contiguous chunks and process them in a pool.

    One ``process_batch`` task is submitted per chunk; the function then waits
    for every task, so an exception raised in a worker is re-raised here by
    ``result()``.

    Args:
        filenames: Sequence of audio file paths.
        num_processes: Number of pool workers and of chunks.
        f0p: F0 predictor name, forwarded to ``process_batch``.
        diff: Forwarded to ``process_batch``.
        mel_extractor: Forwarded to ``process_batch``.
        device: Forwarded to ``process_batch`` as the ``device`` keyword.
    """
    with ProcessPoolExecutor(max_workers=num_processes) as pool:
        total = len(filenames)
        # Chunk boundaries: bounds[k] .. bounds[k + 1] is the k-th slice.
        bounds = [int(k * total / num_processes) for k in range(num_processes + 1)]
        futures = [
            pool.submit(
                process_batch,
                filenames[lo:hi],
                f0p,
                diff,
                mel_extractor,
                device=device,
            )
            for lo, hi in zip(bounds, bounds[1:])
        ]
        for future in tqdm(futures, position=0):
            future.result()
|
129 |
-
|
130 |
-
if __name__ == "__main__":
    # Command-line entry point: parse the options, optionally load the mel
    # extractor, collect the wav files and hand them to the process pool.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--device', type=str, default=None)
    parser.add_argument(
        "--in_dir", type=str, default="dataset/44k", help="path to input dir"
    )
    parser.add_argument(
        '--use_diff', action='store_true', help='Whether to use the diffusion model'
    )
    # The help text previously advertised "pm" as the default while the actual
    # default is "rmvpe".
    parser.add_argument(
        '--f0_predictor', type=str, default="rmvpe", help='Select F0 predictor, can select crepe,pm,dio,harvest,rmvpe,fcpe|default: rmvpe(note: crepe is original F0 using mean filter)'
    )
    parser.add_argument(
        '--num_processes', type=int, default=1, help='You are advised to set the number of processes to the same as the number of CPU cores'
    )
    args = parser.parse_args()
    f0p = args.f0_predictor
    device = args.device
    if device is None:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print(speech_encoder)
    logger.info("Using device: " + str(device))
    logger.info("Using SpeechEncoder: " + speech_encoder)
    logger.info("Using extractor: " + f0p)
    logger.info("Using diff Mode: " + str(args.use_diff))

    if args.use_diff:
        print("use_diff")
        print("Loading Mel Extractor...")
        mel_extractor = Vocoder(dconfig.vocoder.type, dconfig.vocoder.ckpt, device=device)
        print("Loaded Mel Extractor.")
    else:
        mel_extractor = None

    # Matches wav files exactly one directory below in_dir; the pattern has no
    # "**" component, so recursive=True does not widen the search.
    filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)  # [:10]
    shuffle(filenames)
    mp.set_start_method("spawn", force=True)

    num_processes = args.num_processes
    if num_processes == 0:
        # 0 means "use every core"; os.cpu_count() may return None, in which
        # case fall back to a single process.
        num_processes = os.cpu_count() or 1

    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor, device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|