Spaces:

koajoel
/

PolyFormer

Running

File size: 8,334 Bytes

650c5f6

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Signal processing-based evaluation using waveforms
"""
import numpy as np
import os.path as op

import torchaudio
import tqdm
from tabulate import tabulate

from examples.speech_synthesis.utils import (
    gross_pitch_error, voicing_decision_error, f0_frame_error
)
from examples.speech_synthesis.evaluation.eval_sp import load_eval_spec


def difference_function(x, n, tau_max):
    """
    Compute difference function of data x. This solution is implemented directly
    with Numpy fft.


    :param x: audio data
    :param n: length of data
    :param tau_max: integration window size
    :return: difference function
    :rtype: list
    """

    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    x_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum()))
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    conv = np.fft.irfft(fc * fc.conjugate())[:tau_max]
    return x_cumsum[w:w - tau_max:-1] + x_cumsum[w] - x_cumsum[:tau_max] - \
        2 * conv


def cumulative_mean_normalized_difference_function(df, n):
    """
    Compute cumulative mean normalized difference function (CMND).

    :param df: Difference function
    :param n: length of data
    :return: cumulative mean normalized difference function
    :rtype: list
    """

    # scipy method
    cmn_df = df[1:] * range(1, n) / np.cumsum(df[1:]).astype(float)
    return np.insert(cmn_df, 0, 1)


def get_pitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return fundamental period of a frame based on CMND function.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech
    :param harmo_th: harmonicity threshold to determine if it is necessary to
    compute pitch frequency
    :return: fundamental period if there is values under threshold, 0 otherwise
    :rtype: float
    """
    tau = tau_min
    while tau < tau_max:
        if cmdf[tau] < harmo_th:
            while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]:
                tau += 1
            return tau
        tau += 1

    return 0    # if unvoiced


def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500,
                harmo_thresh=0.1):
    """

    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    https://github.com/NVIDIA/mellotron adaption of
    https://github.com/patriceguyot/Yin

    :param sig: Audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutives windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The yalgorithmù return the
    first minimum of the CMND function below this threshold.

    :returns:

        * pitches: list of fundamental frequencies,
        * harmonic_rates: list of harmonic rate values for each fundamental
        frequency value (= confidence value)
        * argmins: minimums of the Cumulative Mean Normalized DifferenceFunction
        * times: list of time of each estimation
    :rtype: tuple
    """

    tau_min = int(sr / f0_max)
    tau_max = int(sr / f0_min)

    # time values for each analysis window
    time_scale = range(0, len(sig) - w_len, w_step)
    times = [t/float(sr) for t in time_scale]
    frames = [sig[t:t + w_len] for t in time_scale]

    pitches = [0.0] * len(time_scale)
    harmonic_rates = [0.0] * len(time_scale)
    argmins = [0.0] * len(time_scale)

    for i, frame in enumerate(frames):
        # Compute YIN
        df = difference_function(frame, w_len, tau_max)
        cm_df = cumulative_mean_normalized_difference_function(df, tau_max)
        p = get_pitch(cm_df, tau_min, tau_max, harmo_thresh)

        # Get results
        if np.argmin(cm_df) > tau_min:
            argmins[i] = float(sr / np.argmin(cm_df))
        if p != 0:  # A pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cm_df[p]
        else:  # No pitch, but we compute a value of the harmonic rate
            harmonic_rates[i] = min(cm_df)

    return pitches, harmonic_rates, argmins, times


def extract_f0(samples):
    f0_samples = []
    for sample in tqdm.tqdm(samples):
        if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]):
            f0_samples.append(None)
            continue

        # assume single channel
        yref, sr = torchaudio.load(sample["ref"])
        ysyn, _sr = torchaudio.load(sample["syn"])
        yref, ysyn = yref[0], ysyn[0]
        assert sr == _sr, f"{sr} != {_sr}"

        yref_f0 = compute_yin(yref, sr)
        ysyn_f0 = compute_yin(ysyn, sr)

        f0_samples += [
            {
                "ref": yref_f0,
                "syn": ysyn_f0
            }
        ]

    return f0_samples


def eval_f0_error(samples, distortion_fn):
    results = []
    for sample in tqdm.tqdm(samples):
        if sample is None:
            results.append(None)
            continue
        # assume single channel
        yref_f, _, _, yref_t = sample["ref"]
        ysyn_f, _, _, ysyn_t = sample["syn"]

        yref_f = np.array(yref_f)
        yref_t = np.array(yref_t)
        ysyn_f = np.array(ysyn_f)
        ysyn_t = np.array(ysyn_t)

        distortion = distortion_fn(yref_t, yref_f, ysyn_t, ysyn_f)
        results.append((distortion.item(),
                        len(yref_f),
                        len(ysyn_f)
                        ))
    return results


def eval_gross_pitch_error(samples):
    return eval_f0_error(samples, gross_pitch_error)


def eval_voicing_decision_error(samples):
    return eval_f0_error(samples, voicing_decision_error)


def eval_f0_frame_error(samples):
    return eval_f0_error(samples, f0_frame_error)


def print_results(results, show_bin):
    results = np.array(list(filter(lambda x: x is not None, results)))

    np.set_printoptions(precision=3)

    def _print_result(results):
        res = {
            "nutt": len(results),
            "error": results[:, 0].mean(),
            "std": results[:, 0].std(),
            "dur_ref": int(results[:, 1].sum()),
            "dur_syn": int(results[:, 2].sum()),
        }
        print(tabulate([res.values()], res.keys(), floatfmt=".4f"))

    print(">>>> ALL")
    _print_result(results)

    if show_bin:
        edges = [0, 200, 400, 600, 800, 1000, 2000, 4000]
        for i in range(1, len(edges)):
            mask = np.logical_and(results[:, 1] >= edges[i-1],
                                  results[:, 1] < edges[i])
            if not mask.any():
                continue
            bin_results = results[mask]
            print(f">>>> ({edges[i-1]}, {edges[i]})")
            _print_result(bin_results)


def main(eval_f0, gpe, vde, ffe, show_bin):
    samples = load_eval_spec(eval_f0)
    if gpe or vde or ffe:
        f0_samples = extract_f0(samples)

    if gpe:
        print("===== Evaluate Gross Pitch Error =====")
        results = eval_gross_pitch_error(f0_samples)
        print_results(results, show_bin)
    if vde:
        print("===== Evaluate Voicing Decision Error =====")
        results = eval_voicing_decision_error(f0_samples)
        print_results(results, show_bin)
    if ffe:
        print("===== Evaluate F0 Frame Error =====")
        results = eval_f0_frame_error(f0_samples)
        print_results(results, show_bin)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("eval_f0")
    parser.add_argument("--gpe", action="store_true")
    parser.add_argument("--vde", action="store_true")
    parser.add_argument("--ffe", action="store_true")
    parser.add_argument("--show-bin", action="store_true")
    args = parser.parse_args()

    main(args.eval_f0, args.gpe, args.vde, args.ffe, args.show_bin)