File size: 7,775 Bytes
e34aada |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import os
import subprocess
import tempfile
import traceback
import uuid
import torch
from scipy.signal import medfilt
from utils.audio import librosa_wav2spec
from utils.audio.io import save_wav
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import librosa
import numpy as np
from scipy.interpolate import interp1d
from utils.audio.pitch.crepe_utils import crepe_with_corrector, crepe_predict, frequency_to_bins, \
crepe_predict_torch
from utils.audio.pitch.extractor_utils import find_nearest_stft_bin, find_best_f0_using_har_energy
# Global registry: extractor name -> registered object (see register_pitch_extractor).
PITCH_EXTRACTOR = {}


def register_pitch_extractor(name):
    """Build a decorator that records its target in ``PITCH_EXTRACTOR``.

    Args:
        name: Key under which the decorated object is stored. Registering
            the same name twice overwrites the earlier entry.

    Returns:
        A decorator that stores the decorated object under ``name`` and
        hands it back unchanged.
    """

    def _remember(extractor):
        PITCH_EXTRACTOR.update({name: extractor})
        return extractor

    return _remember
def get_pitch_extractor(name):
    """Return the object registered in ``PITCH_EXTRACTOR`` under ``name``.

    Raises:
        KeyError: if nothing has been registered under ``name``.
    """
    extractor = PITCH_EXTRACTOR[name]
    return extractor
def extract_pitch_simple(wav):
    """Extract pitch from ``wav`` using the settings stored in the global ``hparams``.

    Reads ``hop_size``, ``win_size``, ``pitch_extractor``, ``audio_sample_rate``,
    ``f0_min`` and ``f0_max`` from ``hparams`` and forwards them to
    ``extract_pitch``. An ``n_mel_frames`` keyword is also forwarded, computed as
    ``(len(wav) + 1) // hop_size - win_size // hop_size``.

    Args:
        wav: Sequence of audio samples (only ``len(wav)`` is used here).

    Returns:
        Whatever the selected extractor returns.
    """
    from utils.commons.hparams import hparams

    hop = hparams['hop_size']
    frames_from_length = (len(wav) + 1) // hop
    frames_per_window = hparams['win_size'] // hop
    return extract_pitch(
        hparams['pitch_extractor'],
        wav,
        hop,
        hparams['audio_sample_rate'],
        f0_min=hparams['f0_min'],
        f0_max=hparams['f0_max'],
        n_mel_frames=frames_from_length - frames_per_window,
    )
def extract_pitch(extractor_name, wav_data, hop_size, audio_sample_rate, f0_min=75, f0_max=800, **kwargs):
    """Run the pitch extractor registered under ``extractor_name``.

    Args:
        extractor_name: Registry key, resolved through ``get_pitch_extractor``.
        wav_data: Audio samples handed to the extractor.
        hop_size: Hop size handed to the extractor.
        audio_sample_rate: Sample rate handed to the extractor.
        f0_min: Lower pitch bound, passed positionally to the extractor.
        f0_max: Upper pitch bound, passed positionally to the extractor.
        **kwargs: Forwarded to the extractor untouched.

    Returns:
        The extractor's return value.
    """
    extractor = get_pitch_extractor(extractor_name)
    return extractor(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, **kwargs)
@register_pitch_extractor('harvest')
def harvest(wav_data, hop_size, audio_sample_rate, *args, **kwargs):
    """Estimate F0 with ``pyworld.harvest`` and resample it to one value per hop.

    The output has ``int(len(wav_data) // hop_size)`` entries. Source and target
    frames are both placed on a normalised [0, 1] axis and matched with
    nearest-neighbour interpolation.

    Extra positional and keyword arguments are accepted and ignored; when this
    is called through ``extract_pitch`` that includes ``f0_min`` and ``f0_max``.

    Args:
        wav_data: NumPy array of samples (``.astype`` is called on it).
        hop_size: Samples per output frame.
        audio_sample_rate: Sample rate passed to pyworld.

    Returns:
        NumPy array of F0 values, one per output frame.
    """
    import pyworld as pw

    target_len = int(len(wav_data) // hop_size)
    f0_src, _ = pw.harvest(wav_data.astype(np.double), audio_sample_rate)
    src_len = len(f0_src)

    # Source frame positions; the last one is pinned to 1.0 so every target
    # position (all < 1) falls inside the interpolation range.
    src_pos = np.arange(0, 1, 1 / src_len)[:src_len]
    src_pos[-1] = 1.0
    dst_pos = np.arange(0, 1, 1 / target_len)[:target_len]

    pick_nearest = interp1d(src_pos, f0_src, 'nearest')
    return pick_nearest(dst_pos)
@register_pitch_extractor('dio')
def dio(wav_data, hop_size, audio_sample_rate, *args, **kwargs):
    """Estimate F0 with ``pyworld.dio`` + ``pyworld.stonemask``, one value per hop.

    The refined contour is resampled to ``int(len(wav_data) // hop_size)``
    frames: source and target frames are placed on a normalised [0, 1] axis and
    matched with nearest-neighbour interpolation.

    Extra positional and keyword arguments are accepted and ignored; when this
    is called through ``extract_pitch`` that includes ``f0_min`` and ``f0_max``.

    Args:
        wav_data: NumPy array of samples (``.astype`` is called on it).
        hop_size: Samples per output frame.
        audio_sample_rate: Sample rate passed to pyworld.

    Returns:
        NumPy array of F0 values, one per output frame.
    """
    import pyworld as pw

    target_len = int(len(wav_data) // hop_size)
    samples_f64 = wav_data.astype(np.double)
    coarse_f0, frame_times = pw.dio(samples_f64, audio_sample_rate)
    f0_src = pw.stonemask(samples_f64, coarse_f0, frame_times, audio_sample_rate)
    src_len = len(f0_src)

    # Source frame positions; the last one is pinned to 1.0 so every target
    # position (all < 1) falls inside the interpolation range.
    src_pos = np.arange(0, 1, 1 / src_len)[:src_len]
    src_pos[-1] = 1.0
    dst_pos = np.arange(0, 1, 1 / target_len)[:target_len]

    pick_nearest = interp1d(src_pos, f0_src, 'nearest')
    return pick_nearest(dst_pos)
@register_pitch_extractor('parselmouth')
def parselmouth_pitch(wav_data, hop_size, audio_sample_rate, f0_min, f0_max,
                      voicing_threshold=0.45, *args, **kwargs):
    """Estimate F0 with Praat's autocorrelation method via ``parselmouth``.

    The contour returned by ``to_pitch_ac`` is zero-padded on both sides to
    ``int(len(wav_data) // hop_size)`` frames; when the padding is odd the
    extra frame goes at the front.

    Args:
        wav_data: Audio samples handed to ``parselmouth.Sound``.
        hop_size: Samples per output frame; also sets the analysis time step.
        audio_sample_rate: Sample rate of ``wav_data``.
        f0_min: Passed as ``pitch_floor``.
        f0_max: Passed as ``pitch_ceiling``.
        voicing_threshold: Passed through to ``to_pitch_ac``.

    Returns:
        NumPy array of F0 values, one per output frame.

    NOTE(review): assumes the analysis never yields more frames than the
    target count; a negative pad width would make ``np.pad`` raise.
    """
    import parselmouth

    step_ms = hop_size / audio_sample_rate * 1000
    target_len = int(len(wav_data) // hop_size)

    sound = parselmouth.Sound(wav_data, audio_sample_rate)
    pitch = sound.to_pitch_ac(
        time_step=step_ms / 1000,
        voicing_threshold=voicing_threshold,
        pitch_floor=f0_min,
        pitch_ceiling=f0_max,
    )
    contour = pitch.selected_array['frequency']

    missing = target_len - len(contour)
    lead = (missing + 1) // 2
    trail = missing - lead
    return np.pad(contour, [[lead, trail]], mode='constant')
# Sample rate written to disk when hop_size == 256, keyed by the true sample
# rate. The extracted F0 is later rescaled by true_rate / written_rate.
# NOTE(review): presumably chosen so the tools' 5 ms frame shift divides the
# 256-sample hop evenly (128 / 256 samples) -- confirm against the binaries.
_REAPER_HOP256_WRITE_RATE = {24000: 25600, 48000: 51200}


def _save_wav_for_reaper(wav, path, hop_size, audio_sample_rate):
    """Write ``wav`` to ``path`` at the sample rate the F0 tools should see."""
    if hop_size == 256:
        if audio_sample_rate not in _REAPER_HOP256_WRITE_RATE:
            # Previously nothing was written in this case and the failure only
            # surfaced later, after every retry had failed.
            raise ValueError(
                f'reaper extractor with hop_size=256 supports sample rates '
                f'{sorted(_REAPER_HOP256_WRITE_RATE)}, got {audio_sample_rate}')
        save_wav(wav, path, _REAPER_HOP256_WRITE_RATE[audio_sample_rate], norm=False)
    else:
        assert hop_size == 240
        save_wav(wav, path, audio_sample_rate, norm=False)


def _denoise_with_sox(wav_data, audio_sample_rate, dirname):
    """Noise-reduce ``wav_data`` with sox, profiling the samples outside ``audio_mask``."""
    from utils.audio import trim_long_silences

    _, audio_mask, _ = trim_long_silences(wav_data, audio_sample_rate, vad_max_silence_length=20)
    # presumably audio_mask is True on speech, so ~audio_mask selects the
    # background used as the noise profile -- verify against trim_long_silences.
    wav_noise = wav_data[~audio_mask]

    new_fn = f'{dirname}/0'
    save_wav(wav_noise, f'{new_fn}-noise.wav', sr=audio_sample_rate)
    save_wav(wav_data, f'{new_fn}.wav', sr=audio_sample_rate)
    # Two separate calls so that a failing noiseprof is reported on its own
    # instead of being masked by the exit status of the second command.
    subprocess.check_call(
        ['sox', f'{new_fn}-noise.wav', '-n', 'noiseprof', f'{new_fn}-noise.prof'])
    subprocess.check_call(
        ['sox', f'{new_fn}.wav', f'{new_fn}.denoised.wav',
         'noisered', f'{new_fn}-noise.prof', '0.21'])

    denoised, _ = librosa.load(f'{new_fn}.denoised.wav', sr=audio_sample_rate)
    # The last 1024 original samples are appended after the denoised signal.
    return np.concatenate([denoised, wav_data[-1024:]], 0)


@register_pitch_extractor('reaper')
def reaper_extract_f0(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, denoise=True,
                      return_denoised_wav=False,
                      *args, **kwargs):
    """Extract F0 by running the external tools wrapped by ``reaper_extract_f0_``.

    Steps:
      1. Save the input as ``1.wav`` in a per-call scratch directory under
         ``/tmp/reaper_tmp``.
      2. If ``denoise`` is set, noise-reduce the signal with sox.
      3. Save the (possibly denoised) signal as ``2.wav``.
      4. Run ``reaper_extract_f0_`` on both files, retrying up to 10 times, and
         drop the last 8 values of its output.
      5. Undo the sample-rate substitution used for ``hop_size == 256`` and, for
         24 kHz input, average consecutive frame pairs (a pair containing an
         unvoiced ``0`` frame becomes ``0``).

    The scratch directory is removed before returning, on success or failure.

    Args:
        wav_data: NumPy array of audio samples.
        hop_size: 256 (only with 24000 / 48000 Hz input) or 240.
        audio_sample_rate: Sample rate of ``wav_data`` in Hz.
        f0_min: Lower pitch bound passed to the tools.
        f0_max: Upper pitch bound passed to the tools.
        denoise: Run the sox noise-reduction step before extraction.
        return_denoised_wav: Also return the waveform that was written to
            ``2.wav``.

    Returns:
        ``f0`` or, when ``return_denoised_wav`` is true, ``(f0, wav_data)``.

    Raises:
        ValueError: ``hop_size`` is 256 and the sample rate is unsupported.
        RuntimeError: extraction failed on every retry.
        subprocess.CalledProcessError: a sox command failed.
    """
    import glob
    import shutil

    dirname = f'/tmp/reaper_tmp/{len(wav_data)}_{str(uuid.uuid1())}'
    os.makedirs(dirname, exist_ok=True)
    try:
        _save_wav_for_reaper(wav_data, f'{dirname}/1.wav', hop_size, audio_sample_rate)
        if denoise:
            wav_data = _denoise_with_sox(wav_data, audio_sample_rate, dirname)
        _save_wav_for_reaper(wav_data, f'{dirname}/2.wav', hop_size, audio_sample_rate)

        f0 = None
        last_error = None
        for _ in range(10):
            # reaper_extract_f0_ reuses existing *.sf0 / *.rf0 / *.tf0 files,
            # so leftovers from a failed attempt must go before retrying.
            for stale in glob.glob(f'{dirname}/*f0'):
                os.remove(stale)
            try:
                f0 = reaper_extract_f0_(
                    f'{dirname}/2.wav', f'{dirname}/1.wav', dirname, f0_min, f0_max)[:-8]
                break
            except Exception as err:  # KeyboardInterrupt / SystemExit propagate
                traceback.print_exc()
                last_error = err
        if f0 is None:
            raise RuntimeError('reaper F0 extraction failed after 10 attempts') from last_error

        if hop_size == 256:
            # Undo the substituted sample rate used when writing the wav files.
            f0 = f0 * audio_sample_rate / _REAPER_HOP256_WRITE_RATE[audio_sample_rate]
        if audio_sample_rate == 24000:
            if len(f0) % 2:
                # An unpaired trailing frame would make the reshape fail.
                f0 = f0[:-1]
            # A large negative sentinel makes any pair with an unvoiced frame
            # average below zero, so the pair is reported as unvoiced.
            f0[f0 == 0] = -100000
            f0 = f0.reshape(-1, 2).mean(-1)
            f0[f0 < 0] = 0

        if return_denoised_wav:
            return f0, wav_data
        return f0
    finally:
        shutil.rmtree(dirname, ignore_errors=True)
def reaper_extract_f0_(fwav1, fwav2, temp_dir, pitch_lower, pitch_upper):
    """Run the bundled F0 binaries and load the resulting contour.

    Runs ``ExtractF0ByStraight`` on ``fwav1``, ``ReaperF0`` on ``fwav2`` and
    then ``InterpF0`` on the two results. Each step is skipped when its output
    file already exists in ``temp_dir``, so callers must delete stale outputs
    to force a re-run. The binaries are addressed relative to the current
    working directory.

    Commands are passed as argument lists (no shell), so paths containing
    spaces or shell metacharacters reach the tools verbatim.

    Args:
        fwav1: Wav file analysed by ``ExtractF0ByStraight``.
        fwav2: Wav file analysed by ``ReaperF0``.
        temp_dir: Directory for ``1.sf0``, ``1.rf0`` and ``1.tf0``.
        pitch_lower: Lower F0 bound; formatted with ``%d`` (floats truncate).
        pitch_upper: Upper F0 bound; formatted with ``%d`` (floats truncate).

    Returns:
        ``np.float32`` array loaded from the final F0 text file.

    Raises:
        subprocess.CalledProcessError: a tool exited with non-zero status.
        subprocess.TimeoutExpired: a tool ran longer than 20 seconds.
    """
    frame_shift = 5
    use_reaper = True
    bin_dir = 'utils/audio/pitch/bin'

    straight_f0_file = f'{temp_dir}/1.sf0'
    if not os.path.exists(straight_f0_file):
        subprocess.check_call(
            [f'{bin_dir}/ExtractF0ByStraight',
             'frame_shift=%d' % frame_shift,
             'min_f0=%d' % pitch_lower,
             'max_f0=%d' % pitch_upper,
             f'wave={fwav1}',
             f'output={straight_f0_file}'],
            timeout=20)

    if use_reaper:
        reaper_f0_file = f'{temp_dir}/1.rf0'
        if not os.path.exists(reaper_f0_file):
            subprocess.check_call(
                [f'{bin_dir}/ReaperF0',
                 f'wave={fwav2}',
                 f'output={reaper_f0_file}',
                 'f0_min=%d' % pitch_lower,
                 'f0_max=%d' % pitch_upper],
                timeout=20)

        interp_f0_file = f'{temp_dir}/1.tf0'
        if not os.path.exists(interp_f0_file):
            subprocess.check_call(
                [f'{bin_dir}/InterpF0',
                 f'straight={straight_f0_file}',
                 f'reaper={reaper_f0_file}',
                 f'output={interp_f0_file}'],
                timeout=20)
        # The interpolated contour replaces the STRAIGHT-only one as the result.
        straight_f0_file = interp_f0_file

    return np.loadtxt(straight_f0_file, dtype=np.float32)
|