clonar-voz / TTS /tts /utils /visual.py
Shadhil's picture
voice-clone with single audio sample input
9b2107c
raw
history blame
6.67 kB
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.colors import LogNorm
matplotlib.use("Agg")
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False, plot_log=False):
if isinstance(alignment, torch.Tensor):
alignment_ = alignment.detach().cpu().numpy().squeeze()
else:
alignment_ = alignment
alignment_ = alignment_.astype(np.float32) if alignment_.dtype == np.float16 else alignment_
fig, ax = plt.subplots(figsize=fig_size)
im = ax.imshow(
alignment_.T, aspect="auto", origin="lower", interpolation="none", norm=LogNorm() if plot_log else None
)
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
# plt.yticks(range(len(text)), list(text))
plt.tight_layout()
if title is not None:
plt.title(title)
if not output_fig:
plt.close()
return fig
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
spectrogram_ = spectrogram_.astype(np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
if ap is not None:
spectrogram_ = ap.denormalize(spectrogram_) # pylint: disable=protected-access
fig = plt.figure(figsize=fig_size)
plt.imshow(spectrogram_, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
if not output_fig:
plt.close()
return fig
def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
"""Plot pitch curves on top of the spectrogram.
Args:
pitch (np.array): Pitch values.
spectrogram (np.array): Spectrogram values.
Shapes:
pitch: :math:`(T,)`
spec: :math:`(C, T)`
"""
if isinstance(spectrogram, torch.Tensor):
spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
else:
spectrogram_ = spectrogram.T
spectrogram_ = spectrogram_.astype(np.float32) if spectrogram_.dtype == np.float16 else spectrogram_
if ap is not None:
spectrogram_ = ap.denormalize(spectrogram_) # pylint: disable=protected-access
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
ax.imshow(spectrogram_, aspect="auto", origin="lower")
ax.set_xlabel("time")
ax.set_ylabel("spec_freq")
ax2 = ax.twinx()
ax2.plot(pitch, linewidth=5.0, color="red")
ax2.set_ylabel("F0")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
"""Plot pitch curves on top of the input characters.
Args:
pitch (np.array): Pitch values.
chars (str): Characters to place to the x-axis.
Shapes:
pitch: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(pitch, linewidth=5.0, color="red")
ax2.set_ylabel("F0")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
"""Plot energy curves on top of the input characters.
Args:
energy (np.array): energy values.
chars (str): Characters to place to the x-axis.
Shapes:
energy: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(energy, linewidth=5.0, color="red")
ax2.set_ylabel("energy")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
def visualize(
alignment,
postnet_output,
text,
hop_length,
CONFIG,
tokenizer,
stop_tokens=None,
decoder_output=None,
output_path=None,
figsize=(8, 24),
output_fig=False,
):
"""Intended to be used in Notebooks."""
if decoder_output is not None:
num_plot = 4
else:
num_plot = 3
label_fontsize = 16
fig = plt.figure(figsize=figsize)
plt.subplot(num_plot, 1, 1)
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
# compute phoneme representation and back
if CONFIG.use_phonemes:
seq = tokenizer.text_to_ids(text)
text = tokenizer.ids_to_text(seq)
print(text)
plt.yticks(range(len(text)), list(text))
plt.colorbar()
if stop_tokens is not None:
# plot stopnet predictions
plt.subplot(num_plot, 1, 2)
plt.plot(range(len(stop_tokens)), list(stop_tokens))
# plot postnet spectrogram
plt.subplot(num_plot, 1, 3)
librosa.display.specshow(
postnet_output.T,
sr=CONFIG.audio["sample_rate"],
hop_length=hop_length,
x_axis="time",
y_axis="linear",
fmin=CONFIG.audio["mel_fmin"],
fmax=CONFIG.audio["mel_fmax"],
)
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if decoder_output is not None:
plt.subplot(num_plot, 1, 4)
librosa.display.specshow(
decoder_output.T,
sr=CONFIG.audio["sample_rate"],
hop_length=hop_length,
x_axis="time",
y_axis="linear",
fmin=CONFIG.audio["mel_fmin"],
fmax=CONFIG.audio["mel_fmax"],
)
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()
if output_path:
print(output_path)
fig.savefig(output_path)
plt.close()
if not output_fig:
plt.close()