{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Auffusion: text-to-audio generation demo\n",
    "\n",
    "Generate audio from a text prompt with the\n",
    "[auffusion/auffusion-full-no-adapter](https://huggingface.co/auffusion/auffusion-full-no-adapter) checkpoint:\n",
    "a Stable Diffusion pipeline produces a mel-spectrogram image, which a vocoder\n",
    "(`Generator` from the local `converter` module) turns into a waveform.\n",
    "\n",
    "Requires the local `converter` and `utils` modules one directory up.\n",
    "A CUDA GPU is strongly recommended; the notebook falls back to CPU (float32) if none is available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../\")  # make the local `converter` and `utils` modules importable\n",
    "\n",
    "import os\n",
    "\n",
    "import torch\n",
    "import numpy as np\n",
    "from PIL import Image\n",
    "from diffusers import StableDiffusionPipeline\n",
    "from huggingface_hub import snapshot_download\n",
    "from IPython.display import display, Audio\n",
    "\n",
    "from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio\n",
    "from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration\n",
    "pretrained_model_name_or_path = \"auffusion/auffusion-full-no-adapter\"\n",
    "\n",
    "# Fall back to CPU when no GPU is available; float16 is only used on CUDA\n",
    "# (half precision on CPU is poorly supported by many ops).\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "dtype = torch.float16 if device == \"cuda\" else torch.float32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the checkpoint: if the path is not a local directory, download the\n",
    "# full snapshot from the Hugging Face Hub (cached after the first run).\n",
    "if not os.path.isdir(pretrained_model_name_or_path):\n",
    "    pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the vocoder (spectrogram -> waveform) from the checkpoint's `vocoder` subfolder.\n",
    "vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder=\"vocoder\")\n",
    "vocoder = vocoder.to(device=device, dtype=dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the diffusion pipeline (text -> spectrogram image).\n",
    "pipe = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)\n",
    "pipe = pipe.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generation settings\n",
    "prompt = \"A kitten mewing for attention\"\n",
    "seed = 42  # fixed seed for reproducible generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generation\n",
    "generator = torch.Generator(device=device).manual_seed(seed)\n",
    "\n",
    "# height/width select the spectrogram image size expected by the vocoder\n",
    "# (256 mel bins x 1024 time frames -- TODO confirm against the checkpoint config).\n",
    "with torch.autocast(device):\n",
    "    output_spec = pipe(\n",
    "        prompt=prompt, num_inference_steps=100, generator=generator, height=256, width=1024, output_type=\"pt\"\n",
    "    ).images[0]\n",
    "\n",
    "# Undo the [0, 1] normalization applied for the image domain, then vocode to audio.\n",
    "denorm_spec = denormalize_spectrogram(output_spec)\n",
    "denorm_spec_audio = vocoder.inference(denorm_spec)\n",
    "\n",
    "# 16 kHz is the vocoder's output sample rate (presumed from the model card; verify if audio sounds pitch-shifted).\n",
    "display(Audio(denorm_spec_audio, rate=16000))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "TTA",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}