{ "cells": [ { "cell_type": "markdown", "id": "0fd939b0", "metadata": {}, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": 1, "id": "6c7800a6", "metadata": {}, "outputs": [], "source": [ "try:\n", " # are we running on Google Colab?\n", " import google.colab\n", " !git clone -q https://github.com/teticio/audio-diffusion.git\n", " %cd audio-diffusion\n", " !pip install -q -r requirements.txt\n", "except:\n", " pass" ] }, { "cell_type": "code", "execution_count": 2, "id": "b447e2c4", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))" ] }, { "cell_type": "code", "execution_count": 14, "id": "c2fc0e7a", "metadata": {}, "outputs": [], "source": [ "import random\n", "from datasets import load_dataset\n", "from IPython.display import Audio\n", "from audiodiffusion.mel import Mel\n", "from audiodiffusion import AudioDiffusion" ] }, { "cell_type": "markdown", "id": "011fb5a1", "metadata": {}, "source": [ "### Run model inference to generate mel spectrogram and audios" ] }, { "cell_type": "code", "execution_count": 4, "id": "a3d45c36", "metadata": {}, "outputs": [], "source": [ "audio_diffusion = AudioDiffusion(model_id=\"teticio/audio-diffusion-256\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "b809fed5", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "71abbf1ea35d4180bf97bdc94dd66ef1", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1000 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ad5c8b3eaad240a6869ee788740d75e9", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1000 [00:00\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m----> 2\u001b[0m image, (sample_rate, audio) \u001b[38;5;241m=\u001b[39m \u001b[43maudio_diffusion\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_spectrogram_and_audio\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m display(image)\n\u001b[1;32m 4\u001b[0m display(Audio(audio, rate\u001b[38;5;241m=\u001b[39msample_rate))\n", "File \u001b[0;32m~/ML/huggingface/audio-diffusion/audiodiffusion/__init__.py:36\u001b[0m, in \u001b[0;36mAudioDiffusion.generate_spectrogram_and_audio\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgenerate_spectrogram_and_audio\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 30\u001b[0m \u001b[38;5;124;03m\"\"\"Generate random mel spectrogram and convert to audio.\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \n\u001b[1;32m 32\u001b[0m \u001b[38;5;124;03m Returns:\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;124;03m PIL Image: mel spectrogram\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \u001b[38;5;124;03m (Float, Array): sample rate and raw audio\u001b[39;00m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 36\u001b[0m images \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mddpm\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnumpy\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msample\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 37\u001b[0m images \u001b[38;5;241m=\u001b[39m (images \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m255\u001b[39m)\u001b[38;5;241m.\u001b[39mround()\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muint8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 38\u001b[0m image \u001b[38;5;241m=\u001b[39m Image\u001b[38;5;241m.\u001b[39mfromarray(images[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m])\n", "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27\u001b[0m, in \u001b[0;36m_DecoratorContextManager.__call__..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclone():\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/diffusers/pipelines/ddpm/pipeline_ddpm.py:58\u001b[0m, in \u001b[0;36mDDPMPipeline.__call__\u001b[0;34m(self, batch_size, generator, output_type, **kwargs)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mset_timesteps(\u001b[38;5;241m1000\u001b[39m)\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m tqdm(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mtimesteps):\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# 1. predict noise model_output\u001b[39;00m\n\u001b[0;32m---> 58\u001b[0m model_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43munet\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msample\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 60\u001b[0m \u001b[38;5;66;03m# 2. compute previous image: x_t -> t_t-1\u001b[39;00m\n\u001b[1;32m 61\u001b[0m image \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mstep(model_output, t, image)[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprev_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/diffusers/models/unet_2d.py:147\u001b[0m, in \u001b[0;36mUNet2DModel.forward\u001b[0;34m(self, sample, timestep)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;66;03m# 3. down\u001b[39;00m\n\u001b[1;32m 146\u001b[0m down_block_res_samples \u001b[38;5;241m=\u001b[39m (sample,)\n\u001b[0;32m--> 147\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m downsample_block \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdown_blocks:\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(downsample_block, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip_conv\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 149\u001b[0m sample, res_samples, skip_sample \u001b[38;5;241m=\u001b[39m downsample_block(\n\u001b[1;32m 150\u001b[0m hidden_states\u001b[38;5;241m=\u001b[39msample, temb\u001b[38;5;241m=\u001b[39memb, skip_sample\u001b[38;5;241m=\u001b[39mskip_sample\n\u001b[1;32m 151\u001b[0m )\n", "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/container.py:219\u001b[0m, in \u001b[0;36mModuleList.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;129m@_copy_to_script_wrapper\u001b[39m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Module]:\n\u001b[0;32m--> 219\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43miter\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_modules\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "while True:\n", " image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()\n", " display(image)\n", " display(Audio(audio, rate=sample_rate))" ] }, { "cell_type": "markdown", "id": "ef54cef3", "metadata": {}, "source": [ "### Compare results with random sample from training set" ] }, { "cell_type": "code", "execution_count": 12, "id": "f028a3c8", "metadata": {}, "outputs": [], "source": [ "mel = Mel(x_res=256, y_res=256)" ] }, { "cell_type": "code", "execution_count": 15, "id": "269ee816", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration teticio--audio-diffusion-256-90642b08dc2c6e33\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None (download: 910.09 MiB, generated: 911.42 MiB, post-processed: Unknown size, total: 1.78 GiB) to /home/teticio/.cache/huggingface/datasets/teticio___parquet/teticio--audio-diffusion-256-90642b08dc2c6e33/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c91ed8a7a45f45ffbc1452a7219311fc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "image = random.choice(ds['train'])['image']\n", "image" ] }, { "cell_type": "code", "execution_count": 17, "id": "492e2334", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audio = mel.image_to_audio(image)\n", "Audio(data=audio, rate=mel.get_sample_rate())" ] }, { "cell_type": "markdown", "id": "946fdb4d", "metadata": {}, "source": [ "### Push model to hub" ] }, { "cell_type": "code", "execution_count": null, "id": "37c0564e", "metadata": {}, "outputs": [], "source": [ "from diffusers.hub_utils import init_git_repo, push_to_hub\n", "\n", "\n", "class AttributeDict(dict):\n", "\n", " def __getattr__(self, attr):\n", " return self[attr]\n", "\n", " def __setattr__(self, attr, value):\n", " self[attr] = value\n", "\n", "\n", "args = AttributeDict({\n", " \"hub_model_id\":\n", " \"teticio/audio-diffusion-256\",\n", " \"output_dir\":\n", " \"../ddpm-ema-audio-256-repo\",\n", " \"local_rank\":\n", " -1,\n", " \"hub_token\":\n", " open(os.path.join(os.environ['HOME'], '.huggingface/token'), 'rt').read(),\n", " \"hub_private_repo\":\n", " False,\n", " \"overwrite_output_dir\":\n", " False\n", "})\n", "\n", "repo = init_git_repo(args, at_init=True)\n", "ddpm = DDPMPipeline.from_pretrained('../ddpm-ema-audio-256')\n", "push_to_hub(args, ddpm, repo)" ] }, { "cell_type": "code", "execution_count": null, "id": "8c8261a0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "huggingface", "language": "python", "name": "huggingface" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }