Spaces:
Build error
Build error
import librosa | |
import matplotlib | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import torch | |
from matplotlib.colors import LogNorm | |
matplotlib.use("Agg") | |
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False, plot_log=False):
    """Render an alignment matrix as a heat map.

    Args:
        alignment (torch.Tensor or np.ndarray): Alignment values. Tensors are detached,
            moved to CPU, converted to numpy and squeezed; any other input is used as
            given. The matrix is transposed for display, so its first axis ends up on
            the x-axis ("Decoder timestep") and its second on the y-axis
            ("Encoder timestep").
        info (str, optional): Extra text appended below the x-axis label.
        fig_size (tuple): Figure size forwarded to ``plt.subplots``.
        title (str, optional): Title placed above the plot.
        output_fig (bool): When False the figure is closed before it is returned.
        plot_log (bool): When True the colour scale uses ``LogNorm``.

    Returns:
        matplotlib.figure.Figure: The figure holding the heat map and its colorbar.
    """
    if isinstance(alignment, torch.Tensor):
        alignment_ = alignment.detach().cpu().numpy().squeeze()
    else:
        alignment_ = alignment
    # Half-precision data is upcast before plotting
    # (presumably because imshow does not handle float16 reliably -- TODO confirm).
    if alignment_.dtype == np.float16:
        alignment_ = alignment_.astype(np.float32)

    fig, ax = plt.subplots(figsize=fig_size)
    im = ax.imshow(
        alignment_.T,
        aspect="auto",
        origin="lower",
        interpolation="none",
        norm=LogNorm() if plot_log else None,
    )
    fig.colorbar(im, ax=ax)

    xlabel = "Decoder timestep"
    if info is not None:
        xlabel += "\n\n" + info
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Encoder timestep")
    if title is not None:
        ax.set_title(title)
    # Lay out only after every label and the title exist, so the title is given room too.
    fig.tight_layout()

    if not output_fig:
        # Close this specific figure instead of whichever one happens to be current.
        plt.close(fig)
    return fig
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
    """Draw a spectrogram as an image with a colorbar.

    Args:
        spectrogram (torch.Tensor or np.ndarray): Spectrogram values. Tensors are detached,
            moved to CPU, converted to numpy and squeezed. The data is transposed before
            it is displayed.
        ap (optional): Object whose ``denormalize`` method is applied to the transposed
            spectrogram when given.
        fig_size (tuple): Figure size.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: The figure holding the spectrogram image.
    """
    if isinstance(spectrogram, torch.Tensor):
        spec = spectrogram.detach().cpu().numpy().squeeze()
    else:
        spec = spectrogram
    spec = spec.T

    # Half-precision input is upcast to float32 before any further processing.
    if spec.dtype == np.float16:
        spec = spec.astype(np.float32)
    if ap is not None:
        spec = ap.denormalize(spec)

    fig, ax = plt.subplots(figsize=fig_size)
    image = ax.imshow(spec, aspect="auto", origin="lower")
    fig.colorbar(image, ax=ax)
    fig.tight_layout()

    if not output_fig:
        plt.close()
    return fig
def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
    """Plot a pitch curve on top of a spectrogram.

    Args:
        pitch (np.array): Pitch values, drawn as a red line on a secondary y-axis ("F0").
        spectrogram (torch.Tensor or np.ndarray): Spectrogram values. Tensors are detached,
            moved to CPU, converted to numpy and squeezed. The data is transposed before
            it is displayed.
        ap (optional): Object whose ``denormalize`` method is applied to the transposed
            spectrogram when given.
        fig_size (tuple, optional): Figure size. ``None`` falls back to matplotlib's
            configured default size.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: The figure holding both plots.

    Shapes:
        pitch: :math:`(T,)`
        spectrogram: transposed before display and shown with "time" on the x-axis, which
            it shares with the pitch curve -- so presumably :math:`(T, C)`.
            NOTE(review): the previous docstring stated :math:`(C, T)`; confirm against callers.
    """
    if isinstance(spectrogram, torch.Tensor):
        spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
    else:
        spectrogram_ = spectrogram.T
    # Half-precision input is upcast to float32 before any further processing.
    if spectrogram_.dtype == np.float16:
        spectrogram_ = spectrogram_.astype(np.float32)
    if ap is not None:
        spectrogram_ = ap.denormalize(spectrogram_)

    # Size the figure through the figsize argument instead of temporarily overwriting the
    # global rcParams entry: an exception while plotting can then no longer leave the
    # process-wide default modified. figsize=None still means "use the rcParams default".
    fig, ax = plt.subplots(figsize=fig_size)

    ax.imshow(spectrogram_, aspect="auto", origin="lower")
    ax.set_xlabel("time")
    ax.set_ylabel("spec_freq")

    # Second y-axis sharing the same x-axis, used for the pitch curve.
    ax2 = ax.twinx()
    ax2.plot(pitch, linewidth=5.0, color="red")
    ax2.set_ylabel("F0")

    if not output_fig:
        # Close this specific figure instead of whichever one happens to be current.
        plt.close(fig)
    return fig
def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
    """Plot a pitch curve over the input characters.

    Args:
        pitch (np.array): Pitch values, drawn as a red line on a secondary y-axis ("F0").
        chars (str): Characters used as x-axis tick labels, one tick per character at
            positions ``0 .. len(chars) - 1``.
        fig_size (tuple, optional): Figure size. ``None`` falls back to matplotlib's
            configured default size.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: The figure holding the plot.

    Shapes:
        pitch: :math:`(T,)`
    """
    # Size the figure through the figsize argument instead of temporarily overwriting the
    # global rcParams entry: an exception while plotting can then no longer leave the
    # process-wide default modified. figsize=None still means "use the rcParams default".
    fig, ax = plt.subplots(figsize=fig_size)

    # One tick per character, labelled with that character.
    ax.set_xticks(np.arange(len(chars)))
    ax.set_xticklabels(list(chars))
    ax.set_xlabel("characters")
    ax.set_ylabel("freq")

    # Second y-axis sharing the same x-axis, used for the pitch curve.
    ax2 = ax.twinx()
    ax2.plot(pitch, linewidth=5.0, color="red")
    ax2.set_ylabel("F0")

    if not output_fig:
        # Close this specific figure instead of whichever one happens to be current.
        plt.close(fig)
    return fig
def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
    """Plot an energy curve over the input characters.

    Args:
        energy (np.array): Energy values, drawn as a red line on a secondary y-axis
            ("energy").
        chars (str): Characters used as x-axis tick labels, one tick per character at
            positions ``0 .. len(chars) - 1``.
        fig_size (tuple, optional): Figure size. ``None`` falls back to matplotlib's
            configured default size.
        output_fig (bool): When False the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: The figure holding the plot.

    Shapes:
        energy: :math:`(T,)`
    """
    # Size the figure through the figsize argument instead of temporarily overwriting the
    # global rcParams entry: an exception while plotting can then no longer leave the
    # process-wide default modified. figsize=None still means "use the rcParams default".
    fig, ax = plt.subplots(figsize=fig_size)

    # One tick per character, labelled with that character.
    ax.set_xticks(np.arange(len(chars)))
    ax.set_xticklabels(list(chars))
    ax.set_xlabel("characters")
    # NOTE(review): the primary axis keeps the "freq" label of the pitch variant even
    # though nothing is drawn on it here -- confirm whether that is intended.
    ax.set_ylabel("freq")

    # Second y-axis sharing the same x-axis, used for the energy curve.
    ax2 = ax.twinx()
    ax2.plot(energy, linewidth=5.0, color="red")
    ax2.set_ylabel("energy")

    if not output_fig:
        # Close this specific figure instead of whichever one happens to be current.
        plt.close(fig)
    return fig
def visualize(
    alignment,
    postnet_output,
    text,
    hop_length,
    CONFIG,
    tokenizer,
    stop_tokens=None,
    decoder_output=None,
    output_path=None,
    figsize=(8, 24),
    output_fig=False,
):
    """Draw alignment, stop-token and spectrogram panels in one figure.

    Intended to be used in Notebooks.

    Panels are stacked vertically: (1) the transposed alignment with the characters of
    ``text`` as y-ticks, (2) the stop-token curve when ``stop_tokens`` is given, (3) the
    postnet spectrogram and, when ``decoder_output`` is given, (4) the decoder spectrogram.

    Args:
        alignment: Alignment matrix; its transpose is displayed.
        postnet_output: Postnet spectrogram; its transpose is passed to ``specshow``.
        text (str): Input text. When ``CONFIG.use_phonemes`` is set it is round-tripped
            through ``tokenizer.text_to_ids`` / ``tokenizer.ids_to_text`` and printed.
        hop_length (int): Hop length forwarded to ``librosa.display.specshow``.
        CONFIG: Configuration object providing ``use_phonemes`` and an ``audio`` mapping
            with the keys ``sample_rate``, ``mel_fmin`` and ``mel_fmax``.
        tokenizer: Object providing ``text_to_ids`` and ``ids_to_text``.
        stop_tokens (optional): Stop-token values to plot in the second panel.
        decoder_output (optional): Decoder spectrogram; adds a fourth panel when given.
        output_path (str, optional): When set, the path is printed, the figure is saved
            there and then closed.
        figsize (tuple): Figure size.
        output_fig (bool): When False the figure is closed.

    Returns:
        None. NOTE(review): unlike the ``plot_*`` helpers in this module the figure is not
        returned; callers that pass ``output_fig=True`` have to fetch it through pyplot.
    """
    # Older librosa releases do not expose the display submodule through a bare
    # `import librosa`, so load it explicitly before use.
    import librosa.display  # pylint: disable=import-outside-toplevel

    num_plot = 4 if decoder_output is not None else 3
    label_fontsize = 16

    def _specshow_panel(index, spec):
        """Draw one spectrogram panel at position ``index`` of the subplot grid."""
        plt.subplot(num_plot, 1, index)
        librosa.display.specshow(
            spec.T,
            sr=CONFIG.audio["sample_rate"],
            hop_length=hop_length,
            x_axis="time",
            y_axis="linear",
            fmin=CONFIG.audio["mel_fmin"],
            fmax=CONFIG.audio["mel_fmax"],
        )
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    fig = plt.figure(figsize=figsize)

    # Panel 1: alignment.
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    # compute phoneme representation and back
    if CONFIG.use_phonemes:
        seq = tokenizer.text_to_ids(text)
        text = tokenizer.ids_to_text(seq)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # Panel 2: stopnet predictions (the slot stays empty when none are given).
    if stop_tokens is not None:
        plt.subplot(num_plot, 1, 2)
        plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # Panel 3: postnet spectrogram; panel 4: decoder spectrogram when available.
    _specshow_panel(3, postnet_output)
    if decoder_output is not None:
        _specshow_panel(4, decoder_output)

    if output_path:
        print(output_path)
        fig.savefig(output_path)

    # Close exactly once, and only this figure. The previous back-to-back bare
    # plt.close() calls could close an unrelated figure that became current after
    # the first call.
    if output_path or not output_fig:
        plt.close(fig)