from typing import List, Tuple

import numpy as np
import scipy.signal


# SPOTIFY
def get_inferred_onsets(onset_roll: np.ndarray, note_roll: np.ndarray, n_diff: int = 2) -> np.ndarray:
    """
    Infer onsets from large changes in the note roll matrix amplitudes.
    Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py

    :param onset_roll: Onset activation matrix (n_times, n_freqs).
    :param note_roll: Frame-level note activation matrix (n_times, n_freqs).
    :param n_diff: Number of frame lags used to detect onsets.
    :return: The element-wise maximum of the predicted onsets and the frame differences.
    """
    diffs = []
    for n in range(1, n_diff + 1):
        # pad with n leading zero frames so each lagged difference stays aligned with the input
        frames_appended = np.concatenate([np.zeros((n, note_roll.shape[1])), note_roll])
        diffs.append(frames_appended[n:, :] - frames_appended[:-n, :])
    frame_diff = np.min(diffs, axis=0)
    frame_diff[frame_diff < 0] = 0
    frame_diff[:n_diff, :] = 0
    frame_diff = np.max(onset_roll) * frame_diff / np.max(frame_diff)  # rescale to have the same max as onsets

    # use the max of the predicted onsets and the differences
    max_onsets_diff = np.max([onset_roll, frame_diff], axis=0)
    return max_onsets_diff
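
# Illustrative sketch (toy values, not from the original repo): an onset the model
# missed shows up as a jump in the note roll at frame 2 and is recovered here.
#   onset_roll = np.array([[1.0], [0.0], [0.0], [0.0]])
#   note_roll  = np.array([[0.0], [0.0], [0.8], [0.8]])
#   get_inferred_onsets(onset_roll, note_roll)  # -> [[1.], [0.], [1.], [0.]]
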
def spotify_create_notes(
        note_roll: np.ndarray,
        onset_roll: np.ndarray,
        onset_thresh: float,
        frame_thresh: float,
        min_note_len: int,
        infer_onsets: bool,
        note_low: int,   # self.labeling.midi_centers[0]
        note_high: int,  # self.labeling.midi_centers[-1]
        melodia_trick: bool = True,
        energy_tol: int = 11,
) -> List[Tuple[int, int, int, float]]:
    """Decode raw model output to polyphonic note events.
    Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py

    Args:
        note_roll: Frame activation matrix (n_times, n_freqs).
        onset_roll: Onset activation matrix (n_times, n_freqs).
        onset_thresh: Minimum amplitude of an onset activation to be considered an onset.
        frame_thresh: Minimum amplitude of a frame activation for a note to remain "on".
        min_note_len: Minimum allowed note length in frames.
        infer_onsets: If True, add additional onsets when there are large differences in frame amplitudes.
        note_low: MIDI pitch of the lowest frequency bin.
        note_high: MIDI pitch of the highest frequency bin.
        melodia_trick: Whether to use the melodia trick to better detect notes.
        energy_tol: Number of consecutive frames allowed below frame_thresh before a note is ended.

    Returns:
        list of tuples [(start_time_frames, end_time_frames, pitch_midi, amplitude)]
        representing the note events, where amplitude is a number between 0 and 1
    """
    n_frames = note_roll.shape[0]

    # use onsets inferred from frames in addition to the predicted onsets
    if infer_onsets:
        onset_roll = get_inferred_onsets(onset_roll, note_roll)

    peak_thresh_mat = np.zeros(onset_roll.shape)
    peaks = scipy.signal.argrelmax(onset_roll, axis=0)
    peak_thresh_mat[peaks] = onset_roll[peaks]

    onset_idx = np.where(peak_thresh_mat >= onset_thresh)
    onset_time_idx = onset_idx[0][::-1]  # sort to go backwards in time
    onset_freq_idx = onset_idx[1][::-1]  # sort to go backwards in time

    remaining_energy = np.zeros(note_roll.shape)
    remaining_energy[:, :] = note_roll[:, :]

    # loop over onsets
    note_events = []
    for note_start_idx, freq_idx in zip(onset_time_idx, onset_freq_idx):
        # if we're too close to the end of the audio, continue
        if note_start_idx >= n_frames - 1:
            continue

        # find time index at this frequency band where the frames drop below an energy threshold
        i = note_start_idx + 1
        k = 0  # number of frames since energy dropped below threshold
        while i < n_frames - 1 and k < energy_tol:
            if remaining_energy[i, freq_idx] < frame_thresh:
                k += 1
            else:
                k = 0
            i += 1

        i -= k  # go back to frame above threshold

        # if the note is too short, skip it
        if i - note_start_idx <= min_note_len:
            continue

        remaining_energy[note_start_idx:i, freq_idx] = 0
        # freq_idx is a bin index, so bound the neighbour suppression by the roll's width
        # (note_high / note_low are MIDI pitches and would let freq_idx + 1 run out of range)
        if freq_idx < note_roll.shape[1] - 1:
            remaining_energy[note_start_idx:i, freq_idx + 1] = 0
        if freq_idx > 0:
            remaining_energy[note_start_idx:i, freq_idx - 1] = 0
        # add the note
        amplitude = np.mean(note_roll[note_start_idx:i, freq_idx])
        note_events.append(
            (
                note_start_idx,
                i,
                freq_idx + note_low,
                amplitude,
            )
        )

    if melodia_trick:
        energy_shape = remaining_energy.shape

        while np.max(remaining_energy) > frame_thresh:
            i_mid, freq_idx = np.unravel_index(np.argmax(remaining_energy), energy_shape)
            remaining_energy[i_mid, freq_idx] = 0

            # forward pass
            i = i_mid + 1
            k = 0
            while i < n_frames - 1 and k < energy_tol:
                if remaining_energy[i, freq_idx] < frame_thresh:
                    k += 1
                else:
                    k = 0

                remaining_energy[i, freq_idx] = 0
                if freq_idx < note_roll.shape[1] - 1:  # bin-index bounds check, as in the onset loop
                    remaining_energy[i, freq_idx + 1] = 0
                if freq_idx > 0:
                    remaining_energy[i, freq_idx - 1] = 0

                i += 1

            i_end = i - 1 - k  # go back to frame above threshold

            # backward pass
            i = i_mid - 1
            k = 0
            while i > 0 and k < energy_tol:
                if remaining_energy[i, freq_idx] < frame_thresh:
                    k += 1
                else:
                    k = 0

                remaining_energy[i, freq_idx] = 0
                if freq_idx < note_roll.shape[1] - 1:  # same bin-index bounds check as the forward pass
                    remaining_energy[i, freq_idx + 1] = 0
                if freq_idx > 0:
                    remaining_energy[i, freq_idx - 1] = 0

                i -= 1

            i_start = i + 1 + k  # go back to frame above threshold
            assert i_start >= 0, "{}".format(i_start)
            assert i_end < n_frames

            if i_end - i_start <= min_note_len:
                # note is too short, skip it
                continue

            # add the note
            amplitude = np.mean(note_roll[i_start:i_end, freq_idx])
            note_events.append(
                (
                    i_start,
                    i_end,
                    freq_idx + note_low,
                    amplitude,
                )
            )

    return note_events
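
# Hedged usage sketch (thresholds, hop size and MIDI range below are assumptions,
# not values from this repo):
#   note_events = spotify_create_notes(note_roll, onset_roll, onset_thresh=0.5,
#                                      frame_thresh=0.3, min_note_len=5, infer_onsets=True,
#                                      note_low=21, note_high=108)
#   for start_frame, end_frame, pitch_midi, amplitude in note_events:
#       onset_sec = start_frame * hop_size / sample_rate  # hop_size / sample_rate assumed known
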

# TIKTOK
def note_detection_with_onset_offset_regress(frame_output, onset_output,
                                             onset_shift_output, offset_output, offset_shift_output, velocity_output,
                                             frame_threshold):
    """Process prediction matrices to note event information.
    First, detect onsets with the onset outputs. Then, detect offsets
    with the frame and offset outputs.

    Args:
        frame_output: (frames_num,)
        onset_output: (frames_num,)
        onset_shift_output: (frames_num,)
        offset_output: (frames_num,)
        offset_shift_output: (frames_num,)
        velocity_output: (frames_num,)
        frame_threshold: float

    Returns:
        output_tuples: list of [bgn, fin, onset_shift, offset_shift, normalized_velocity],
        e.g., [
            [1821, 1909, 0.47498, 0.3048533, 0.72119445],
            [1909, 1947, 0.30730522, -0.45764327, 0.64200014],
            ...]
    """
    output_tuples = []
    bgn = None
    frame_disappear = None
    offset_occur = None

    for i in range(onset_output.shape[0]):
        if onset_output[i] == 1:
            """Onset detected"""
            if bgn:
                """Consecutive onsets. E.g., pedal is not released, but two
                consecutive notes being played."""
                fin = max(i - 1, 0)
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                                      0, velocity_output[bgn]])
                frame_disappear, offset_occur = None, None
            bgn = i

        if bgn and i > bgn:
            """If onset found, then search offset"""
            if frame_output[i] <= frame_threshold and not frame_disappear:
                """Frame disappear detected"""
                frame_disappear = i

            if offset_output[i] == 1 and not offset_occur:
                """Offset detected"""
                offset_occur = i

            if frame_disappear:
                if offset_occur and offset_occur - bgn > frame_disappear - offset_occur:
                    """bgn --------- offset_occur --- frame_disappear"""
                    fin = offset_occur
                else:
                    """bgn --- offset_occur --------- frame_disappear"""
                    fin = frame_disappear
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                                      offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

            if bgn and (i - bgn >= 600 or i == onset_output.shape[0] - 1):
                """Offset not detected"""
                fin = i
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                                      offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

    # Sort pairs by onsets
    output_tuples.sort(key=lambda pair: pair[0])

    return output_tuples
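
# Worked micro-example (assumed toy values, not repo data), frame_threshold=0.5:
#   frame_output  = np.array([0.1, 0.9, 0.9, 0.9, 0.2, 0.1, 0.1])
#   onset_output  = np.array([0, 1, 0, 0, 0, 0, 0])
#   offset_output = np.zeros(7); shifts = np.zeros(7); velocity = np.full(7, 0.8)
#   note_detection_with_onset_offset_regress(frame_output, onset_output, shifts,
#                                            offset_output, shifts, velocity, 0.5)
#   # -> [[1, 4, 0.0, 0.0, 0.8]]: the note starts at the onset peak (frame 1) and
#   #    ends at frame 4, where the frame output first drops below the threshold.
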

class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
                 offset_threshold, frame_threshold, pedal_offset_threshold,
                 begin_note):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
            frames_per_second: float
            classes_num: int
            onset_threshold: float
            offset_threshold: float
            frame_threshold: float
            pedal_offset_threshold: float
            begin_note: int, MIDI note of the lowest class
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = begin_note
        self.velocity_scale = 128

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
            output_dict: {
                'reg_onset_output': (segment_frames, classes_num),
                'reg_offset_output': (segment_frames, classes_num),
                'frame_output': (segment_frames, classes_num),
                'velocity_output': (segment_frames, classes_num),
                'reg_pedal_onset_output': (segment_frames, 1),
                'reg_pedal_offset_output': (segment_frames, 1),
                'pedal_frame_output': (segment_frames, 1)}

        Outputs:
            est_note_events: list of dict, e.g. [
                {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
                {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

            est_pedal_events: list of dict, e.g. [
                {'onset_time': 0.17, 'offset_time': 0.96},
                {'onset_time': 1.17, 'offset_time': 2.65}]
        """
        # Map the model's raw outputs onto the keys this post-processor expects
        output_dict['frame_output'] = output_dict['note']
        output_dict['velocity_output'] = output_dict['note']
        output_dict['reg_onset_output'] = output_dict['onset']
        output_dict['reg_offset_output'] = output_dict['offset']

        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events

    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
            output_dict: dict, {
                'reg_onset_output': (frames_num, classes_num),
                'reg_offset_output': (frames_num, classes_num),
                'frame_output': (frames_num, classes_num),
                'velocity_output': (frames_num, classes_num),
                ...}

        Returns:
            est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
                offset_time, piano_note and velocity. E.g. [
                [39.74, 39.87, 27, 0.65],
                [11.98, 12.11, 33, 0.69],
                ...]

            est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
                and offset_time. E.g. [
                [0.17, 0.96],
                [1.17, 2.65],
                ...]
        """
        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this is
            more accurate to detect pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrices results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)
        est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs

    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        Args:
            reg_output: (frames_num, classes_num)
            threshold: float
            neighbour: int

        Returns:
            binary_output: (frames_num, classes_num)
            shift_output: (frames_num, classes_num)
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    """See Section III-D in [1] for deduction.
                    [1] Q. Kong, et al., High-resolution Piano Transcription
                    with Pedals by Regressing Onsets and Offsets Times, 2020."""
                    if x[n - 1] > x[n + 1]:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
                    else:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output
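
    # Worked example (illustrative values, assuming threshold=0.3 and neighbour=2):
    # for x = [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, ...] the local maximum x[4] = 0.40
    # exceeds the threshold and is monotonic on both sides, so binary_output[4, k] = 1.
    # Since x[3] = 0.30 < x[5] = 0.35, the second branch gives
    #   shift = (0.35 - 0.30) / (0.40 - 0.30) / 2 = 0.25,
    # i.e. the refined onset/offset sits a quarter of a frame after frame 4.
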

    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect if values are monotonic on both sides of x[n].

        Args:
            x: (frames_num,)
            n: int
            neighbour: int

        Returns:
            monotonic: bool
        """
        monotonic = True
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1]:
                monotonic = False
            if x[n + i] < x[n + i + 1]:
                monotonic = False

        return monotonic

    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
            output_dict: dict, e.g. {
                'onset_output': (frames_num, classes_num),
                'onset_shift_output': (frames_num, classes_num),
                'offset_output': (frames_num, classes_num),
                'offset_shift_output': (frames_num, classes_num),
                'frame_output': (frames_num, classes_num),
                'velocity_output': (frames_num, classes_num),
                ...}

        Returns:
            est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
                MIDI notes and velocities. E.g.,
                [[39.7375, 39.7500, 27., 0.6638],
                 [11.9824, 12.5000, 33., 0.6892],
                 ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            """Detect piano notes"""
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        est_tuples = np.array(est_tuples)  # (notes, 5)
        """(notes, 5), the five columns are onset, offset, onset_shift,
        offset_shift and normalized_velocity"""

        est_midi_notes = np.array(est_midi_notes)  # (notes,)

        onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
        offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
        velocities = est_tuples[:, 4]

        est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
        """(notes, 4), the four columns are onset_times, offset_times, midi_notes and velocities."""

        est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

        return est_on_off_note_vels

    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to MIDI events.

        Args:
            est_on_off_note_vels: (notes, 4), the four columns are onset_times,
                offset_times, midi_notes and velocities. E.g.
                [[39.7375, 39.7500, 27., 0.6638],
                 [11.9824, 12.5000, 33., 0.6892],
                 ...]

        Returns:
            midi_events: list, e.g.,
                [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
                 {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
                 ...]
        """
        midi_events = []
        for i in range(est_on_off_note_vels.shape[0]):
            midi_events.append({
                'onset_time': est_on_off_note_vels[i][0],
                'offset_time': est_on_off_note_vels[i][1],
                'midi_note': int(est_on_off_note_vels[i][2]),
                'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})

        return midi_events
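

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the original pipelines: it pushes random
    # activations through the TikTok-style post-processor just to exercise the code
    # path end to end. Shapes, thresholds, frame rate and the MIDI range (a piano
    # keyboard starting at note 21) are assumptions for this example only.
    rng = np.random.default_rng(0)
    n_frames, n_classes = 200, 88
    output_dict = {
        'note': rng.random((n_frames, n_classes)).astype(np.float32),
        'onset': rng.random((n_frames, n_classes)).astype(np.float32),
        'offset': rng.random((n_frames, n_classes)).astype(np.float32),
    }

    post_processor = RegressionPostProcessor(
        frames_per_second=100,
        classes_num=n_classes,
        onset_threshold=0.3,
        offset_threshold=0.3,
        frame_threshold=0.1,
        pedal_offset_threshold=0.2,
        begin_note=21,
    )
    note_events, pedal_events = post_processor.output_dict_to_midi_events(output_dict)
    print(f"{len(note_events)} note events, e.g. {note_events[0] if note_events else None}")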