{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd262b00",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "# Put the parent of the current working directory on the import path so the\n",
    "# `src` package imported in the next cell can be found (assumes the notebook\n",
    "# is run from a subdirectory of the project root - TODO confirm).\n",
    "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2253762",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from src.mel import Mel\n",
    "from PIL import ImageOps, Image\n",
    "from IPython.display import Audio\n",
    "from datasets import load_from_disk\n",
    "from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "293dd2c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Converter used below to turn images back into audio and to report the\n",
    "# sample rate for playback.\n",
    "mel = Mel(x_res=64, y_res=64, hop_length=1024)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5bdb2648",
   "metadata": {},
   "source": [
    "### Run model inference to generate Mel spectrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aac92f90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `image` is not assigned in any cell of this notebook, so this\n",
    "# cell raises NameError on a fresh kernel. The inference cell that should\n",
    "# produce it (presumably via one of the imported diffusers pipelines) is\n",
    "# missing - TODO restore it above this cell.\n",
    "audio = mel.image_to_audio(ImageOps.grayscale(image))\n",
    "Audio(data=audio, rate=mel.get_sample_rate())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10805113",
   "metadata": {},
   "source": [
    "### Compare results with random sample from training set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a366813",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = load_from_disk('../data-64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55a29505",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick one training example at random (unseeded, so every run differs) and\n",
    "# convert its stored image to audio for comparison with the cell above.\n",
    "audio = mel.image_to_audio(random.choice(ds['train'])['image'])\n",
    "Audio(data=audio, rate=mel.get_sample_rate())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huggingface",
   "language": "python",
   "name": "huggingface"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}