{ "cells": [ { "cell_type": "markdown", "id": "62c5865f", "metadata": {}, "source": [ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion)" ] },
{ "cell_type": "code", "execution_count": 1, "id": "6c7800a6", "metadata": {}, "outputs": [], "source": [ "try:\n", "    # are we running on Google Colab?\n", "    import google.colab\n", "    !git clone -q https://github.com/teticio/audio-diffusion.git\n", "    %cd audio-diffusion\n", "    %pip install -q -r requirements.txt\n", "except ImportError:\n", "    # google.colab is not importable, so we are not on Colab: nothing to set up\n", "    pass" ] },
{ "cell_type": "code", "execution_count": 2, "id": "b447e2c4", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))" ] },
{ "cell_type": "code", "execution_count": 3, "id": "c2fc0e7a", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import random\n", "import librosa\n", "import numpy as np\n", "from datasets import load_dataset\n", "from IPython.display import Audio\n", "from audiodiffusion import AudioDiffusion" ] },
{ "cell_type": "code", "execution_count": 4, "id": "b294a94a", "metadata": {}, "outputs": [], "source": [ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "generator = torch.Generator(device=device)" ] },
{ "cell_type": "markdown", "id": "f3feb265", "metadata": {}, "source": [ "## DDPM (De-noising Diffusion Probabilistic Models)" ] },
{ "cell_type": "markdown", "id": "7fd945bb", "metadata": {}, "source": [ "### Select model" ] },
{ "cell_type": "code", "execution_count": 5, "id": "97f24046", "metadata": {}, "outputs": [], "source": [ "#@markdown teticio/audio-diffusion-256 - trained on my Spotify \"liked\" playlist\n", "\n", "#@markdown teticio/audio-diffusion-breaks-256 - trained on samples used in music\n", "\n", "#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop\n", "\n", "model_id = \"teticio/audio-diffusion-256\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"teticio/audio-diffusion-instrumental-hiphop-256\", \"teticio/audio-diffusion-ddim-256\"]" ] },
{ "cell_type": "code", "execution_count": null, "id": "88bebba3", "metadata": {}, "outputs": [], "source": [ "# Load the selected model; the cells below use both `audio_diffusion` and `mel`\n", "audio_diffusion = AudioDiffusion(model_id=model_id)\n", "mel = audio_diffusion.pipe.mel" ] },
{ "cell_type": "code", "execution_count": null, "id": "a3d45c36", "metadata": {}, "outputs": [], "source": [ "for _ in range(10):\n", "    seed = generator.seed()\n", "    # print the seed so that a sample you like can be regenerated in the next section\n", "    print(f'Seed = {seed}')\n", "    generator.manual_seed(seed)\n", "    image, (sample_rate,\n", "            audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "                generator=generator)\n", "    display(image)\n", "    display(Audio(audio, rate=sample_rate))\n", "    loop = AudioDiffusion.loop_it(audio, sample_rate)\n", "    if loop is not None:\n", "        display(Audio(loop, rate=sample_rate))\n", "    else:\n", "        print(\"Unable to determine loop points\")" ] },
{ "cell_type": "markdown", "id": "0bb03e33", "metadata": {}, "source": [ "### Generate variations of audios" ] },
{ "cell_type": "markdown", "id": "80e5b5fa", "metadata": {}, "source": [ "Try playing around with `start_step`. Values closer to zero will produce new samples, while values closer to 1,000 will produce samples more faithful to the original." 
] },
{ "cell_type": "code", "execution_count": null, "id": "5074ec11", "metadata": {}, "outputs": [], "source": [ "seed = 2391504374279719 #@param {type:\"integer\"}\n", "generator.manual_seed(seed)\n", "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "    generator=generator)\n", "display(image)\n", "display(Audio(audio, rate=sample_rate))" ] },
{ "cell_type": "code", "execution_count": null, "id": "a0fefe28", "metadata": { "scrolled": false }, "outputs": [], "source": [ "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n", "\n", "\n", "def loop_or_clip(clip):\n", "    \"\"\"Return one loop of `clip`, or `clip` itself when no loop points are found.\"\"\"\n", "    looped = AudioDiffusion.loop_it(clip, sample_rate, loops=1)\n", "    # loop_it can return None (see the generation loop above); np.concatenate cannot take None\n", "    return clip if looped is None else looped\n", "\n", "\n", "track = loop_or_clip(audio)\n", "for variation in range(12):\n", "    image2, (\n", "        sample_rate,\n", "        audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", "            raw_audio=audio, start_step=start_step)\n", "    display(image2)\n", "    display(Audio(audio2, rate=sample_rate))\n", "    track = np.concatenate([track, loop_or_clip(audio2)])\n", "display(Audio(track, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "58a876c1", "metadata": {}, "source": [ "### Generate continuations (\"out-painting\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "b95d5780", "metadata": {}, "outputs": [], "source": [ "overlap_secs = 2 #@param {type:\"integer\"}\n", "start_step = 0 #@param {type:\"slider\", min:0, max:1000, step:10}\n", "overlap_samples = overlap_secs * sample_rate\n", "track = audio\n", "# iterate on a separate name so that `audio` is untouched and re-running this cell starts from the same clip\n", "previous = audio\n", "for variation in range(12):\n", "    image2, (\n", "        sample_rate,\n", "        audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", "            raw_audio=previous[-overlap_samples:],\n", "            start_step=start_step,\n", "            mask_start_secs=overlap_secs)\n", "    display(image2)\n", "    display(Audio(audio2, rate=sample_rate))\n", "    # the first overlap_samples of audio2 repeat the end of the previous clip, so skip them\n", "    track = np.concatenate([track, audio2[overlap_samples:]])\n", "    previous = audio2\n", "display(Audio(track, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "b6434d3f", "metadata": {}, "source": [ "### Remix (style transfer)" ] },
{ "cell_type": "markdown", "id": "0da030b2", "metadata": {}, "source": [ "Alternatively, you can start from another audio altogether, resulting in a kind of style transfer. Maintaining the same seed during generation fixes the style, while masking helps stitch consecutive segments together more smoothly." ] },
{ "cell_type": "code", "execution_count": null, "id": "fc620a80", "metadata": {}, "outputs": [], "source": [ "try:\n", "    # are we running on Google Colab?\n", "    from google.colab import files\n", "    audio_file = list(files.upload().keys())[0]\n", "except ImportError:\n", "    # not on Colab: point this at an audio file on your own machine\n", "    audio_file = \"/home/teticio/Music/liked/El Michels Affair - Glaciers Of Ice.mp3\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "5a257e69", "metadata": { "scrolled": false }, "outputs": [], "source": [ "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n", "overlap_secs = 2 #@param {type:\"integer\"}\n", "# use the rate the track is loaded at, rather than whatever an earlier cell left in `sample_rate`\n", "sample_rate = mel.get_sample_rate()\n", "track_audio, _ = librosa.load(audio_file, mono=True, sr=sample_rate)\n", "overlap_samples = overlap_secs * sample_rate\n", "slice_size = mel.x_res * mel.hop_length\n", "stride = slice_size - overlap_samples\n", "generator = torch.Generator(device=device)\n", "seed = generator.seed()\n", "print(f'Seed = {seed}')\n", "track = np.array([])\n", "not_first = 0\n", "for sample in range(len(track_audio) // stride):\n", "    # same seed for every slice (see the note above about fixing the style)\n", "    generator.manual_seed(seed)\n", "    audio = np.array(track_audio[sample * stride:sample * stride + slice_size])\n", "    if not_first:\n", "        # Normalize and re-insert generated audio\n", "        audio[:overlap_samples] = audio2[-overlap_samples:] * np.max(\n", "            audio[:overlap_samples]) / np.max(audio2[-overlap_samples:])\n", "    _, (sample_rate,\n", "        audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", "            raw_audio=audio,\n", "            start_step=start_step,\n", "            generator=generator,\n", "            mask_start_secs=overlap_secs * not_first)\n", "    track = np.concatenate([track, audio2[overlap_samples * not_first:]])\n", "    not_first = 1\n", "    display(Audio(track, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "924ff9d5", "metadata": {}, "source": [ "### Fill the gap (\"in-painting\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "0200264c", "metadata": {}, "outputs": [], "source": [ "slice = 3 #@param {type:\"integer\"}\n", "# index with the selected slice, not the loop variable left over from the remix cell\n", "raw_audio = track_audio[slice * stride:slice * stride + slice_size]\n", "_, (sample_rate,\n", "    audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n", "        raw_audio=raw_audio,\n", "        mask_start_secs=1,\n", "        mask_end_secs=1,\n", "        step_generator=torch.Generator(device=device))\n", "# original slice first, then the in-painted version\n", "display(Audio(raw_audio, rate=sample_rate))\n", "display(Audio(audio2, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "efc32dae", "metadata": {}, "source": [ "## DDIM (De-noising Diffusion Implicit Models)" ] },
{ "cell_type": "code", "execution_count": null, "id": "a021f78a", "metadata": {}, "outputs": [], "source": [ "audio_diffusion = AudioDiffusion(model_id='teticio/audio-diffusion-ddim-256')\n", "mel = audio_diffusion.pipe.mel" ] },
{ "cell_type": "markdown", "id": "deb23339", "metadata": {}, "source": [ "### Generation can be done in many fewer steps with DDIMs" ] },
{ "cell_type": "code", "execution_count": null, "id": "c105a497", "metadata": {}, "outputs": [], "source": [ "for _ in range(10):\n", "    seed = generator.seed()\n", "    print(f'Seed = {seed}')\n", "    generator.manual_seed(seed)\n", "    image, (sample_rate,\n", "            audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "                generator=generator)\n", "    display(image)\n", "    display(Audio(audio, rate=sample_rate))\n", "    loop = AudioDiffusion.loop_it(audio, sample_rate)\n", "    if loop is not None:\n", "        display(Audio(loop, rate=sample_rate))\n", "    else:\n", "        print(\"Unable to determine loop points\")" ] },
{ "cell_type": "markdown", "id": "cab4692c", "metadata": {}, "source": [ "The parameter eta controls the variance:\n", "* 0 - DDIM (deterministic)\n", "* 1 - DDPM (De-noising 
Diffusion Probabilistic Model)" ] },
{ "cell_type": "code", "execution_count": null, "id": "72bdd207", "metadata": {}, "outputs": [], "source": [ "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "    steps=1000, generator=generator, eta=1)\n", "display(image)\n", "display(Audio(audio, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "b8d5442c", "metadata": {}, "source": [ "### DDIMs can be used as encoders..." ] },
{ "cell_type": "code", "execution_count": null, "id": "269ee816", "metadata": {}, "outputs": [], "source": [ "# Doesn't have to be an audio from the train dataset, this is just for convenience\n", "ds = load_dataset('teticio/audio-diffusion-256')" ] },
{ "cell_type": "code", "execution_count": null, "id": "278d1d80", "metadata": {}, "outputs": [], "source": [ "image = ds['train'][264]['image']\n", "display(Audio(mel.image_to_audio(image), rate=sample_rate))" ] },
{ "cell_type": "code", "execution_count": null, "id": "912b54e4", "metadata": {}, "outputs": [], "source": [ "noise = audio_diffusion.pipe.encode([image])" ] },
{ "cell_type": "code", "execution_count": null, "id": "c7b31f97", "metadata": {}, "outputs": [], "source": [ "# Reconstruct original audio from noise\n", "_, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "    noise=noise, generator=generator)\n", "display(Audio(audio, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "998c776b", "metadata": {}, "source": [ "### ...or to interpolate between audios" ] },
{ "cell_type": "code", "execution_count": null, "id": "33f82367", "metadata": {}, "outputs": [], "source": [ "image2 = ds['train'][15978]['image']\n", "display(Audio(mel.image_to_audio(image2), rate=sample_rate))" ] },
{ "cell_type": "code", "execution_count": null, "id": "f93fb6c0", "metadata": {}, "outputs": [], "source": [ "noise2 = audio_diffusion.pipe.encode([image2])" ] },
{ "cell_type": "code", "execution_count": null, "id": "a4190563", "metadata": {}, "outputs": [], "source": [ "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n", "_, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "    noise=audio_diffusion.pipe.slerp(noise, noise2, alpha),\n", "    generator=generator)\n", "# the two source clips first, then the interpolation between them\n", "display(Audio(mel.image_to_audio(image), rate=sample_rate))\n", "display(Audio(mel.image_to_audio(image2), rate=sample_rate))\n", "display(Audio(audio, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "9b244547", "metadata": {}, "source": [ "## Latent Audio Diffusion\n", "Instead of de-noising images directly in the pixel space, we can work in the latent space of a pre-trained VAE (Variational AutoEncoder). This is much faster to train and run inference on, although the quality suffers as there are now three stages involved in encoding / decoding: mel spectrogram, VAE and de-noising." ] },
{ "cell_type": "code", "execution_count": null, "id": "a88b3fbb", "metadata": {}, "outputs": [], "source": [ "model_id = \"teticio/latent-audio-diffusion-ddim-256\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]" ] },
{ "cell_type": "code", "execution_count": null, "id": "15e353ee", "metadata": {}, "outputs": [], "source": [ "audio_diffusion = AudioDiffusion(model_id=model_id)\n", "mel = audio_diffusion.pipe.mel" ] },
{ "cell_type": "code", "execution_count": null, "id": "fa0f0c8c", "metadata": {}, "outputs": [], "source": [ "seed = 3412253600050855 #@param {type:\"integer\"}\n", "generator.manual_seed(seed)\n", "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n", "    generator=generator)\n", "display(image)\n", "display(Audio(audio, rate=sample_rate))" ] },
{ "cell_type": "code", "execution_count": null, "id": "73dc575d", "metadata": {}, "outputs": [], "source": [ "seed2 = 7016114633369557 #@param {type:\"integer\"}\n", "generator.manual_seed(seed2)\n", "image2, (sample_rate, audio2) = audio_diffusion.generate_spectrogram_and_audio(\n", "    generator=generator)\n", "display(image2)\n", "display(Audio(audio2, rate=sample_rate))" ] },
{ "cell_type": "markdown", "id": "428d2d67", "metadata": {}, "source": [ "### Interpolation in latent space\n", "As the VAE forces a more compact, lower dimensional representation for the spectrograms, interpolation in latent space can lead to meaningful combinations of audios. In combination with the (deterministic) DDIM from the previous section, the model can be used as an encoder / decoder to a lower dimensional space." ] },
{ "cell_type": "code", "execution_count": null, "id": "72211c2b", "metadata": {}, "outputs": [], "source": [ "def sample_latents(seed):\n", "    \"\"\"Re-seed `generator` with `seed` and draw noise shaped like the UNet input.\"\"\"\n", "    generator.manual_seed(seed)\n", "    unet = audio_diffusion.pipe.unet\n", "    return torch.randn(\n", "        (1, unet.in_channels, unet.sample_size[0], unet.sample_size[1]),\n", "        device=device,\n", "        generator=generator)\n", "\n", "\n", "# presumably the same noise the pipeline started from for `audio` above (same seed) - TODO confirm\n", "latents = sample_latents(seed)\n", "latents.shape" ] },
{ "cell_type": "code", "execution_count": null, "id": "6c732dbe", "metadata": {}, "outputs": [], "source": [ "latents2 = sample_latents(seed2)\n", "latents2.shape" ] },
{ "cell_type": "code", "execution_count": null, "id": "159bcfc4", "metadata": {}, "outputs": [], "source": [ "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n", "_, (sample_rate, audio3) = audio_diffusion.generate_spectrogram_and_audio(\n", "    noise=audio_diffusion.pipe.slerp(latents, latents2, alpha),\n", "    generator=generator)\n", "# the two endpoints first, then the clip generated from the interpolated noise\n", "display(Audio(audio, rate=sample_rate))\n", "display(Audio(audio2, rate=sample_rate))\n", "display(Audio(audio3, rate=sample_rate))" ] } ],
"metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "gpuClass": "standard", "kernelspec": { 
"display_name": "huggingface", "language": "python", "name": "huggingface" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }