In [None]:
import os
import sys
# Put the parent of the current working directory at the front of the module
# search path (presumably so the local `audiodiffusion` package imports
# without being installed -- confirm against the repo layout).
parent_dir = os.path.dirname(os.path.abspath(""))
sys.path.insert(0, parent_dir)

In [None]:
import torch
import random
import numpy as np
from PIL import Image
from datasets import load_dataset
from IPython.display import Audio
from diffusers import AutoencoderKL
from audiodiffusion.mel import Mel

In [None]:
# Load the KL autoencoder weights from the local checkpoint directory.
vae = AutoencoderKL.from_pretrained("../models/autoencoder-kl")
# Mel helper with its default settings; the cells below use it to turn
# spectrogram images back into audio.
mel = Mel()

In [None]:
# Display the configuration the autoencoder was loaded with (this bare
# expression is the cell's output).
vae.config

In [None]:
# Load the 'teticio/audio-diffusion-256' dataset; the cells below read its
# 'train' split and each example's 'image' field.
ds = load_dataset('teticio/audio-diffusion-256')

In [None]:
# Pick one random training example, show its image, and play the audio that
# `mel` reconstructs from that image.
example = random.choice(ds['train'])
image = example['image']
display(image)
waveform = mel.image_to_audio(image)
Audio(data=waveform, rate=mel.get_sample_rate())

In [None]:
# encode
# Convert the PIL image to an RGB uint8 array of shape (height, width, 3).
input_image = np.asarray(image.convert('RGB'), dtype="uint8")
# Rescale 0..255 -> -1..1 and move channels first: (3, height, width).
input_image = ((input_image / 255) * 2 - 1).transpose(2, 0, 1)
# Add the batch axis in NumPy before handing the data to torch: wrapping the
# array in a Python list makes torch.tensor take its slow list-of-ndarrays
# path and emit a UserWarning.
input_batch = torch.tensor(input_image[np.newaxis], dtype=torch.float32)
# Inference only, so skip autograd bookkeeping for the encoder pass.
with torch.no_grad():
    # `posterior` is the latent distribution returned by the VAE encoder;
    # `latents` is one random draw from it.
    posterior = vae.encode(input_batch).latent_dist
    latents = posterior.sample()

In [None]:
# reconstruct
# Decode the sampled latents back to image space. This is inference only, so
# run it without building an autograd graph.
with torch.no_grad():
    decoded = vae.decode(latents)['sample']
# Clamp to [-1, 1], then map to [0, 1]; layout is (batch, c, h, w).
decoded = (torch.clamp(decoded, -1., 1.) + 1.0) / 2.0
# Scale to 0..255 uint8, move channels last, and keep the first batch item.
pixels = (decoded.cpu().numpy() * 255).round().astype("uint8")
pixels = pixels.transpose(0, 2, 3, 1)[0]
# Collapse to a single-channel ('L') image before handing it to `mel`.
output_image = Image.fromarray(pixels).convert('L')
display(output_image)
Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())

In [None]:
# sample
# Draw standard-normal noise with the same shape, dtype and device as the
# posterior's mean. This avoids drawing a full posterior sample just to use
# it as a shape template.
noise = torch.randn_like(posterior.mean)
# Decode the noise. This is inference only, so run it without building an
# autograd graph.
with torch.no_grad():
    decoded = vae.decode(noise)['sample']
# Clamp to [-1, 1], then map to [0, 1]; layout is (batch, c, h, w).
decoded = (torch.clamp(decoded, -1., 1.) + 1.0) / 2.0
# Scale to 0..255 uint8, move channels last, and keep the first batch item.
pixels = (decoded.cpu().numpy() * 255).round().astype("uint8")
pixels = pixels.transpose(0, 2, 3, 1)[0]
# Collapse to a single-channel ('L') image before handing it to `mel`.
output_image = Image.fromarray(pixels).convert('L')
display(output_image)
Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())