from torch import nn
import torch
import torchaudio
from typing import List, Optional, Tuple
import pathlib
from scipy.signal import medfilt
import numpy as np
import librosa
from librosa.sequence import viterbi_discriminative
from scipy.ndimage import gaussian_filter1d
from musc.postprocessing import spotify_create_notes


class PitchEstimator(nn.Module):
    """
    This is the base class that everything else inherits from. The hierarchy is:
    PitchEstimator -> Transcriber -> Synchronizer -> AutonomousAgent -> The n-Head Music Performance Analysis Models
    PitchEstimator can handle reading the audio, predicting all the features,
    estimating a single frame level f0 using viterbi, or
    MIDI pitch bend creation for the predicted note events when used inside a Transcriber, or
    score-informed f0 estimation when used inside a Synchronizer.
    """
    def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
        super().__init__()
        self.labeling = labeling
        self.sr = sr
        self.window_size = window_size
        self.hop_length = hop_length
        self.instrument = instrument
        self.f0_bins_per_semitone = int(np.round(100/self.labeling.f0_granularity_c))


    def read_audio(self, audio):
        """
        Read and resample an audio file, convert it to mono, and unfold it into representation frames.
        The time array marks the center of each small frame (5.8 ms hop length). These small frames are different
        from the chunk-level frames: a chunk-level frame is the entire sequence the model sees at once, whereas
        predictions are produced at the small-frame interval (5.8 ms).
        :param  audio: str, pathlib.Path, np.ndarray, or torch.Tensor
        :return: frames: (n_big_frames, frame_length), times: (n_small_frames,)
        """
        if isinstance(audio, (str, pathlib.Path)):
            audio, sample_rate = torchaudio.load(audio, normalize=True)
            audio = audio.mean(axis=0)  # convert to mono
            if sample_rate != self.sr:
                audio = torchaudio.functional.resample(audio, sample_rate, self.sr)
        elif isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio)
        else:
            assert isinstance(audio, torch.Tensor)
        len_audio = audio.shape[-1]
        n_frames = int(np.ceil((len_audio + sum(self.frame_overlap)) / (self.hop_length * self.chunk_size)))
        audio = nn.functional.pad(audio, (self.frame_overlap[0],
                                          self.frame_overlap[1] + (n_frames * self.hop_length * self.chunk_size) - len_audio))
        frames = audio.unfold(0, self.max_window_size, self.hop_length*self.chunk_size)
        times = np.arange(0, len_audio, self.hop_length) / self.sr    # not a tensor; we don't compute anything with it
        return frames, times

    def predict(self, audio, batch_size):
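        """
        Run batched inference on the framed audio and collect the frame-level predictions.
        :param audio: str, pathlib.Path, np.ndarray, or torch.Tensor (forwarded to read_audio)
        :param batch_size: number of chunk-level frames per forward pass
        :return: dict with keys 'f0', 'note', 'onset', 'offset' (sigmoid activations per small frame)
                 and 'time' (frame times in seconds)
        """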
        frames, times = self.read_audio(audio)
        performance = {'f0': [], 'note': [], 'onset': [], 'offset': []}
        self.eval()
        device = self.main.conv0.conv2d.weight.device
        with torch.no_grad():
            for i in range(0, len(frames), batch_size):
                f = frames[i:min(i + batch_size, len(frames))].to(device)
                f -= (torch.mean(f, axis=1).unsqueeze(-1))
                f /= (torch.std(f, axis=1).unsqueeze(-1))
                out = self.forward(f)
                for key, value in out.items():
                    value = torch.sigmoid(value)
                    value = torch.nan_to_num(value) # the model outputs nan when the frame is silent (this is an expected behavior due to normalization)
                    value = value.view(-1, value.shape[-1])
                    value = value.detach().cpu().numpy()
                    performance[key].append(value)
        performance = {key: np.concatenate(value, axis=0)[:len(times)] for key, value in performance.items()}
        performance['time'] = times
        return performance

    def estimate_pitch(self, audio, batch_size, viterbi=False):
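        """
        Predict the frame-level features and decode a monophonic f0 track from them.
        :param viterbi: False (framewise argmax), True (Viterbi decoding), or 'constrained' (note-informed); see out2f0
        :return: (times in seconds, f0 in Hz)
        """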
        out = self.predict(audio, batch_size)
        f0_hz = self.out2f0(out, viterbi)
        return out['time'], f0_hz

    def out2f0(self, out, viterbi=False):
        """
        Monophonic f0 estimation from the model output. The viterbi postprocessing is specialized for the violin family.
        """
        salience = out['f0']
        if viterbi == 'constrained':
            assert hasattr(self, 'out2note')
            notes = spotify_create_notes(out["note"], out["onset"], note_low=self.labeling.midi_centers[0],
                                         note_high=self.labeling.midi_centers[-1], onset_thresh=0.5, frame_thresh=0.3,
                                         infer_onsets=True, melodia_trick=True,
                                         min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))))
            note_cents = self.get_pitch_bends(salience, notes, to_midi=False, timing_refinement_range=0)
            cents = np.zeros_like(out['time'])
            cents[note_cents[:,0].astype(int)] = note_cents[:,1]
        elif viterbi:
            # transition probabilities that encourage a continuous pitch track;
            # big jumps are penalized by roughly one order of magnitude
            transition = gaussian_filter1d(np.eye(self.labeling.f0_n_bins), 30) + 99 * gaussian_filter1d(
                np.eye(self.labeling.f0_n_bins), 2)
            transition = transition / np.sum(transition, axis=1)[:, None]

            p = salience / salience.sum(axis=1)[:, None]
            p[np.isnan(p.sum(axis=1)), :] = np.ones(self.labeling.f0_n_bins) * 1 / self.labeling.f0_n_bins
            path = viterbi_discriminative(p.T, transition)
            cents = np.array([self.labeling.f0_label2c(salience[i, :], path[i]) for i in range(len(path))])
        else:
            cents = self.labeling.f0_label2c(salience, center=None)  # use argmax for center

        f0_hz = self.labeling.f0_c2hz(cents)
        f0_hz[np.isnan(f0_hz)] = 0
        return f0_hz

    def get_pitch_bends(
            self,
            contours: np.ndarray, note_events: List[Tuple[int, int, int, float]],
            timing_refinement_range: int = 0, to_midi: bool = True,
    ) -> List[Tuple[int, int, int, float, Optional[List[int]]]]:
        """Modified version of an excellent script from Spotify/basic_pitch!! Thank you!!!!
        Given note events and contours, estimate pitch bends per note.
        Pitch bends are represented as a sequence of evenly spaced midi pitch bend control units.
        The time stamps of each pitch bend can be inferred by computing an evenly spaced grid between
        the start and end times of each note event.
        Args:
            contours: Matrix of estimated pitch contours
            note_events: note event tuple
            timing_refinement_range: if > 0, refine onset/offset boundaries with f0 confidence
            to_midi: whether to convert pitch bends to MIDI pitch bends. If False, return pitch estimates in the
                format [time (index), pitch (Hz), confidence in range [0, 1]].
        Returns:
            note events with pitch bends
        """

        f0_matrix = []  # [time (index), pitch (Hz), confidence in range [0, 1]]
        note_events_with_pitch_bends = []
        for start_idx, end_idx, pitch_midi, amplitude in note_events:
            if timing_refinement_range:
                start_idx = np.max([0, start_idx - timing_refinement_range])
                end_idx = np.min([contours.shape[0], end_idx + timing_refinement_range])
            freq_idx = int(np.round(self.midi_pitch_to_contour_bin(pitch_midi)))
            freq_start_idx = np.max([freq_idx - self.labeling.f0_tolerance_bins, 0])
            freq_end_idx = np.min([self.labeling.f0_n_bins, freq_idx + self.labeling.f0_tolerance_bins + 1])

            trans_start_idx = np.max([0, self.labeling.f0_tolerance_bins - freq_idx])
            trans_end_idx = (2 * self.labeling.f0_tolerance_bins + 1) - \
                            np.max([0, freq_idx - (self.labeling.f0_n_bins - self.labeling.f0_tolerance_bins - 1)])

            # apply a regional Viterbi decoding to estimate the intonation;
            # observation probabilities come from the f0 contour (salience) matrix
            observation = contours[start_idx:end_idx, freq_start_idx:freq_end_idx]
            observation = observation / observation.sum(axis=1)[:, None]
            observation[np.isnan(observation.sum(axis=1)), :] = np.ones(freq_end_idx - freq_start_idx) * 1 / (
                        freq_end_idx - freq_start_idx)

            # transition probabilities ensure continuity
            transition = self.labeling.f0_transition_matrix[trans_start_idx:trans_end_idx,
                         trans_start_idx:trans_end_idx] + 1e-6
            transition = transition / np.sum(transition, axis=1)[:, None]

            path = viterbi_discriminative(observation.T / observation.sum(axis=1), transition) + freq_start_idx

            cents = np.array([self.labeling.f0_label2c(contours[i + start_idx, :], path[i]) for i in range(len(path))])
            bends = cents - self.labeling.midi_centers_c[pitch_midi - self.labeling.midi_centers[0]]
            if to_midi:
                bends = (bends * 4096 / 100).astype(int)
                bends[bends > 8191] = 8191
                bends[bends < -8192] = -8192

                if timing_refinement_range:
                    confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
                    threshold = np.median(confidences)
                    threshold = (np.median(confidences > threshold) + threshold) / 2  # nudge the threshold by whether most frames exceed it
                    median_kernel = 2 * (timing_refinement_range // 2) + 1  # medfilt requires an odd kernel size
                    confidences = medfilt(confidences, kernel_size=median_kernel)
                    conf_bool = confidences > threshold
                    onset_idx = np.argmax(conf_bool)
                    offset_idx = len(confidences) - np.argmax(conf_bool[::-1])
                    bends = bends[onset_idx:offset_idx]
                    end_idx = start_idx + offset_idx  # use the original start index before it is shifted below
                    start_idx = start_idx + onset_idx

                note_events_with_pitch_bends.append((start_idx, end_idx, pitch_midi, amplitude, bends))
            else:
                confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
                time_idx = np.arange(len(path)) + start_idx
                # f0_hz = self.labeling.f0_c2hz(cents)
                possible_f0s = np.array([time_idx, cents, confidences]).T
                f0_matrix.append(possible_f0s[np.abs(bends) < 100])  # keep only bends smaller than a semitone (100 cents)
        if not to_midi:
            return np.vstack(f0_matrix)
        else:
            return note_events_with_pitch_bends


    def midi_pitch_to_contour_bin(self, pitch_midi: int) -> int:
        """Convert a MIDI pitch to the corresponding index in the contour matrix.
        Args:
            pitch_midi: pitch in MIDI
        Returns:
            index in contour matrix
        """
        pitch_hz = librosa.midi_to_hz(pitch_midi)
        return np.argmin(np.abs(self.labeling.f0_centers_hz - pitch_hz))
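

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). PitchEstimator is an abstract base: read_audio
# and predict rely on attributes provided by a concrete subclass (frame_overlap,
# chunk_size, max_window_size, and the `main` backbone). The names `MyTranscriber`,
# `my_labeling`, 'checkpoint.pt', and 'violin.wav' below are hypothetical placeholders.
#
#     model = MyTranscriber(my_labeling, instrument='Violin')
#     model.load_state_dict(torch.load('checkpoint.pt', map_location='cpu'))
#     times, f0_hz = model.estimate_pitch('violin.wav', batch_size=16, viterbi=True)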