# Hugging Face Space (status: Running)
import gradio as gr | |
import io | |
import typing as T | |
import numpy as np | |
from PIL import Image | |
import pydub | |
from scipy.io import wavfile | |
import torch | |
import torchaudio | |
def convert(audio):
    """
    Turn an uploaded WAV file into a mel-spectrogram image.

    The file is read with ``scipy.io.wavfile``, mixed down to one channel,
    cast to float32 and cut to the window between seconds 7 and 14. That
    window is passed through ``spectrogram_from_waveform`` and then
    ``image_from_spectrogram``. No resampling is done; the sample rate stored
    in the file is forwarded as-is.

    Args:
        audio: Path of the WAV file to read.

    Returns:
        The image produced by ``image_from_spectrogram``.

    Raises:
        ValueError: If the file contains no samples.
    """
    rate, data = wavfile.read(audio)

    # wavfile.read yields (n_samples,) for mono and (n_samples, n_channels)
    # for multi-channel files, so the channels live on the last axis.
    # Averaging over axis 0 would collapse time instead of the channels.
    if data.ndim > 1:
        data = data.mean(axis=-1)
    data = data.astype(np.float32)

    if data.size == 0:
        raise ValueError("audio file contains no samples")

    # Fixed window from second 7 to second 14. Clips of 7 seconds or less
    # have nothing inside that window, so they are used in full instead.
    window = data[rate * 7 : rate * 14]
    if window.size == 0:
        window = data

    spectrogram = spectrogram_from_waveform(
        waveform=window,
        sample_rate=rate,
        n_fft=8192,
        hop_length=512,
        win_length=8192,
    )
    return image_from_spectrogram(spectrogram)
def spectrogram_from_waveform(
    waveform: np.ndarray,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    win_length: int,
    mel_scale: bool = True,
    n_mels: int = 512,
) -> np.ndarray:
    """
    Return the magnitude spectrogram of a waveform, optionally mel-scaled.

    The waveform is cast to float32 and reshaped to a single row before a
    complex STFT (``power=None``) is taken with torchaudio; the magnitude is
    the absolute value of that result. When ``mel_scale`` is true the
    magnitudes are mapped onto ``n_mels`` HTK mel bins spanning 0-10000 Hz
    with no filter normalization.
    """
    stft = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=None,
    )

    # Single-row tensor in, so index 0 of the result is the only spectrogram.
    samples = torch.from_numpy(waveform.astype(np.float32)).reshape(1, -1)
    magnitude = np.abs(stft(samples).numpy()[0])

    if not mel_scale:
        return magnitude

    to_mel = torchaudio.transforms.MelScale(
        n_mels=n_mels,
        sample_rate=sample_rate,
        f_min=0,
        f_max=10000,
        n_stft=n_fft // 2 + 1,
        norm=None,
        mel_scale="htk",
    )
    return to_mel(torch.from_numpy(magnitude)).numpy()
def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.

    Magnitudes are compressed with a power curve, scaled so that
    ``max_volume`` maps to 255, inverted (larger magnitudes become darker),
    flipped vertically and returned as an RGB image.

    Args:
        spectrogram: Array of magnitudes (expected to be 2-D and
            non-negative; not validated here).
        max_volume: Power-compressed magnitude that corresponds to full
            scale. Anything above it saturates.
        power_for_image: Exponent applied to the magnitudes before scaling.

    Returns:
        The rendered image in RGB mode.
    """
    # Apply the power curve
    data = np.power(spectrogram, power_for_image)

    # Rescale to 0-255
    data = data * 255 / max_volume

    # Invert
    data = 255 - data

    # Magnitudes above max_volume go negative after the inversion; without
    # clipping, the uint8 cast below would wrap them around instead of
    # saturating.
    data = np.clip(data, 0, 255)

    # Convert to a PIL image
    image = Image.fromarray(data.astype(np.uint8))

    # Flip Y
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    # Convert to RGB
    image = image.convert("RGB")

    return image
# Build the UI (uploaded audio file in, spectrogram image out) and serve it.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.Audio(source="upload", type="filepath")],
    outputs=[gr.Image()],
)
demo.launch()