File size: 7,775 Bytes
e34aada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import os
import subprocess
import tempfile
import traceback
import uuid

import torch
from scipy.signal import medfilt

from utils.audio import librosa_wav2spec
from utils.audio.io import save_wav

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import librosa
import numpy as np
from scipy.interpolate import interp1d
from utils.audio.pitch.crepe_utils import crepe_with_corrector, crepe_predict, frequency_to_bins, \
    crepe_predict_torch
from utils.audio.pitch.extractor_utils import find_nearest_stft_bin, find_best_f0_using_har_energy

# Registry of pitch-extraction backends, keyed by the name passed to
# ``register_pitch_extractor``.
PITCH_EXTRACTOR = {}


def register_pitch_extractor(name):
    """Build a decorator that records its target in ``PITCH_EXTRACTOR``.

    The decorated object is stored under ``name`` (an existing entry with the
    same key is replaced) and returned unchanged, so the decorated name keeps
    referring to the original object.
    """

    def _add_to_registry(extractor):
        PITCH_EXTRACTOR[name] = extractor
        return extractor

    return _add_to_registry


def get_pitch_extractor(name):
    """Return the object registered in ``PITCH_EXTRACTOR`` under ``name``.

    Raises ``KeyError`` when nothing has been registered under that name.
    """
    extractor = PITCH_EXTRACTOR[name]
    return extractor


def extract_pitch_simple(wav):
    """Extract f0 from ``wav`` using the settings stored in the global ``hparams``.

    The extractor name, hop size, sample rate and f0 range are all read from
    ``hparams``.  A frame count derived from the waveform length, ``hop_size``
    and ``win_size`` is forwarded to the extractor as the ``n_mel_frames``
    keyword argument.
    """
    # Imported at call time rather than at module import time.
    from utils.commons.hparams import hparams

    hop_size = hparams['hop_size']
    frames_total = (len(wav) + 1) // hop_size
    frames_per_window = hparams['win_size'] // hop_size
    return extract_pitch(
        hparams['pitch_extractor'], wav, hop_size, hparams['audio_sample_rate'],
        f0_min=hparams['f0_min'], f0_max=hparams['f0_max'],
        n_mel_frames=frames_total - frames_per_window)


def extract_pitch(extractor_name, wav_data, hop_size, audio_sample_rate, f0_min=75, f0_max=800, **kwargs):
    """Run the pitch extractor registered under ``extractor_name``.

    The extractor is called as
    ``extractor(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, **kwargs)``
    and its result is returned as-is.  Raises ``KeyError`` (from the registry
    lookup) when ``extractor_name`` is not registered.
    """
    extractor = get_pitch_extractor(extractor_name)
    return extractor(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, **kwargs)


@register_pitch_extractor('harvest')
def harvest(wav_data, hop_size, audio_sample_rate, *args, **kwargs):
    """Estimate f0 with PyWORLD's Harvest and resample it to one value per hop.

    The contour returned by ``pw.harvest`` is mapped onto
    ``len(wav_data) // hop_size`` frames by nearest-neighbour interpolation
    over a normalised time axis.  Extra positional and keyword arguments
    (including ``f0_min`` / ``f0_max`` passed by ``extract_pitch``) are
    accepted but not used.
    """
    import pyworld as pw

    frame_count = int(len(wav_data) // hop_size)
    contour, _ = pw.harvest(wav_data.astype(np.double), audio_sample_rate)

    n_src = len(contour)
    src_pos = np.arange(0, 1, 1 / n_src)[:n_src]
    # Pin the last source position to 1.0 so every target position in [0, 1)
    # lies inside the interpolation range.
    src_pos[-1] = 1.0
    dst_pos = np.arange(0, 1, 1 / frame_count)[:frame_count]

    resample = interp1d(src_pos, contour, 'nearest')
    return resample(dst_pos)


@register_pitch_extractor('dio')
def dio(wav_data, hop_size, audio_sample_rate, *args, **kwargs):
    """Estimate f0 with PyWORLD's DIO + StoneMask and resample it to one value per hop.

    ``pw.dio`` produces a raw contour which ``pw.stonemask`` then refines; the
    refined contour is mapped onto ``len(wav_data) // hop_size`` frames by
    nearest-neighbour interpolation over a normalised time axis.  Extra
    positional and keyword arguments (including ``f0_min`` / ``f0_max`` passed
    by ``extract_pitch``) are accepted but not used.
    """
    import pyworld as pw

    frame_count = int(len(wav_data) // hop_size)
    samples = wav_data.astype(np.double)
    raw_f0, times = pw.dio(samples, audio_sample_rate)
    contour = pw.stonemask(samples, raw_f0, times, audio_sample_rate)

    n_src = len(contour)
    src_pos = np.arange(0, 1, 1 / n_src)[:n_src]
    # Pin the last source position to 1.0 so every target position in [0, 1)
    # lies inside the interpolation range.
    src_pos[-1] = 1.0
    dst_pos = np.arange(0, 1, 1 / frame_count)[:frame_count]

    resample = interp1d(src_pos, contour, 'nearest')
    return resample(dst_pos)


@register_pitch_extractor('parselmouth')
def parselmouth_pitch(wav_data, hop_size, audio_sample_rate, f0_min, f0_max,
                      voicing_threshold=0.45, *args, **kwargs):
    """Estimate f0 with Praat's autocorrelation method (via parselmouth).

    The analysis step equals one hop (``hop_size / audio_sample_rate``
    seconds).  The resulting contour is zero-padded on both sides to
    ``len(wav_data) // hop_size`` frames; when the number of missing frames is
    odd, the extra frame goes in front.  ``np.pad`` raises ``ValueError`` if
    the contour is already longer than that target.
    """
    import parselmouth

    hop_ms = hop_size / audio_sample_rate * 1000
    target_len = int(len(wav_data) // hop_size)

    sound = parselmouth.Sound(wav_data, audio_sample_rate)
    pitch = sound.to_pitch_ac(
        time_step=hop_ms / 1000, voicing_threshold=voicing_threshold,
        pitch_floor=f0_min, pitch_ceiling=f0_max)
    raw_f0 = pitch.selected_array['frequency']

    missing = target_len - len(raw_f0)
    lead = (missing + 1) // 2
    return np.pad(raw_f0, [[lead, missing - lead]], mode='constant')


def _save_reaper_input(wav, path, hop_size, audio_sample_rate):
    """Write ``wav`` to ``path`` with the sample rate the f0 binaries should see."""
    if hop_size == 256:
        # 256-sample hops are stored with a nominal rate of 25600 / 51200 Hz, which
        # makes one hop span exactly 10 ms / 5 ms; the caller rescales the
        # resulting f0 back to the true sample rate.
        if audio_sample_rate == 24000:
            save_wav(wav, path, 25600, norm=False)
        if audio_sample_rate == 48000:
            save_wav(wav, path, 51200, norm=False)
    else:
        assert hop_size == 240
        save_wav(wav, path, audio_sample_rate, norm=False)


@register_pitch_extractor('reaper')
def reaper_extract_f0(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, denoise=True,
                      return_denoised_wav=False,
                      *args, **kwargs):
    """Extract f0 with the external STRAIGHT / REAPER binaries.

    The waveform is written to a scratch directory (optionally after sox
    noise reduction), ``reaper_extract_f0_`` is run on it with up to 10
    attempts, and the resulting contour is rescaled / down-sampled according
    to ``audio_sample_rate`` and ``hop_size``.  The scratch directory and all
    files in it are removed before returning.

    Args:
        wav_data: 1-D waveform array.
        hop_size: 256 (only written for 24000 / 48000 Hz audio) or 240.
        audio_sample_rate: sample rate of ``wav_data`` in Hz.
        f0_min, f0_max: pitch search range passed to the binaries.
        denoise: when true, build a sox noise profile from the samples that
            ``trim_long_silences`` marks as non-speech and apply ``noisered``.
        return_denoised_wav: when true, return ``(f0, wav)`` where ``wav`` is
            the waveform that was analysed (denoised if ``denoise`` is set).

    Returns:
        The f0 array, or ``(f0, wav_data)`` if ``return_denoised_wav`` is set.

    Raises:
        RuntimeError: if every extraction attempt failed; the last underlying
            exception is attached as ``__cause__``.
        subprocess.CalledProcessError: if the sox denoising command fails.
    """
    import glob

    tmp_root = '/tmp/reaper_tmp'
    os.makedirs(tmp_root, exist_ok=True)
    # The context manager owns the working directory itself, so every wav / f0
    # file written below is deleted on exit, including on errors.
    with tempfile.TemporaryDirectory(prefix=f'{len(wav_data)}_', dir=tmp_root) as dirname:
        _save_reaper_input(wav_data, f'{dirname}/1.wav', hop_size, audio_sample_rate)

        if denoise:
            from utils.audio import trim_long_silences
            original_wav = wav_data
            _, audio_mask, _ = trim_long_silences(wav_data, audio_sample_rate, vad_max_silence_length=20)
            sr = audio_sample_rate
            # Samples outside the detected speech mask serve as the noise sample.
            wav_noise = wav_data[~audio_mask]

            new_fn = f'{dirname}/0'
            save_wav(wav_noise, f'{new_fn}-noise.wav', sr=sr)
            save_wav(wav_data, f'{new_fn}.wav', sr=sr)
            subprocess.check_call(
                f'sox {new_fn}-noise.wav -n noiseprof {new_fn}-noise.prof; '
                f'sox {new_fn}.wav {new_fn}.denoised.wav noisered {new_fn}-noise.prof 0.21; ', shell=True)
            wav_data, _ = librosa.load(f'{new_fn}.denoised.wav', sr=sr)
            # Append the last 1024 original samples to the denoised signal
            # (presumably to compensate for samples lost by sox -- TODO confirm).
            wav_data = np.concatenate([wav_data, original_wav[-1024:]], 0)

        _save_reaper_input(wav_data, f'{dirname}/2.wav', hop_size, audio_sample_rate)

        max_attempts = 10
        last_error = None
        for _ in range(max_attempts):
            # reaper_extract_f0_ skips any stage whose output file already exists,
            # so stale (possibly partial) results must be removed before each attempt.
            for stale in glob.glob(f'{dirname}/*f0'):
                os.remove(stale)
            try:
                f0 = reaper_extract_f0_(
                    f'{dirname}/2.wav', f'{dirname}/1.wav', dirname, f0_min, f0_max)[:-8]
                break
            except Exception as err:  # KeyboardInterrupt / SystemExit propagate untouched
                traceback.print_exc()
                last_error = err
        else:
            raise RuntimeError(
                f'reaper f0 extraction failed after {max_attempts} attempts') from last_error

    if audio_sample_rate == 24000:
        if hop_size == 256:
            # Undo the nominal 25600 Hz rate the wav was stored with.
            f0 = f0 * audio_sample_rate / 25600
        # Average consecutive frame pairs; a pair containing an unvoiced (0) frame
        # is pushed negative by the sentinel and then reset to unvoiced.
        # NOTE(review): reshape(-1, 2) requires an even number of frames.
        f0[f0 == 0] = -100000
        f0 = f0.reshape(-1, 2).mean(-1)
        f0[f0 < 0] = 0
    if audio_sample_rate == 48000:
        if hop_size == 256:
            # Undo the nominal 51200 Hz rate the wav was stored with.
            f0 = f0 * audio_sample_rate / 51200
    if return_denoised_wav:
        return f0, wav_data
    else:
        return f0


def reaper_extract_f0_(fwav1, fwav2, temp_dir, pitch_lower, pitch_upper):
    """Run the external f0 binaries and load the resulting contour.

    Three command-line tools are invoked in order, each writing into
    ``temp_dir``: ``ExtractF0ByStraight`` on ``fwav1`` (-> ``1.sf0``),
    ``ReaperF0`` on ``fwav2`` (-> ``1.rf0``) and ``InterpF0`` combining both
    (-> ``1.tf0``).  A stage is skipped when its output file already exists.
    The binaries are addressed by paths relative to the current working
    directory, and each call is limited to 20 seconds.

    Returns:
        The contents of ``1.tf0`` loaded as a float32 array.

    Raises:
        subprocess.CalledProcessError / subprocess.TimeoutExpired: when a
            binary fails or exceeds the timeout.
    """
    frame_shift = 5
    use_reaper = True

    def _run_stage(output_file, template, values):
        # The command is only formatted and executed when the output is missing.
        if os.path.exists(output_file):
            return
        subprocess.check_call(template % values, shell=True, timeout=20)  # ignore_security_alert

    straight_f0_file = f'{temp_dir}/1.sf0'
    _run_stage(
        straight_f0_file,
        'utils/audio/pitch/bin/ExtractF0ByStraight frame_shift=%d '
        'min_f0=%d max_f0=%d wave="%s" output="%s"',
        (frame_shift, pitch_lower, pitch_upper, fwav1, straight_f0_file))

    result_file = straight_f0_file
    if use_reaper:
        reaper_f0_file = f'{temp_dir}/1.rf0'
        _run_stage(
            reaper_f0_file,
            'utils/audio/pitch/bin/ReaperF0 wave="%s" output="%s" '
            'f0_min=%d f0_max=%d',
            (fwav2, reaper_f0_file, pitch_lower, pitch_upper))

        interp_f0_file = f'{temp_dir}/1.tf0'
        _run_stage(
            interp_f0_file,
            'utils/audio/pitch/bin/InterpF0 straight="%s" '
            'reaper="%s" output="%s"',
            (straight_f0_file, reaper_f0_file, interp_f0_file))
        result_file = interp_f0_file

    return np.loadtxt(result_file, dtype=np.float32)