{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fd262b00",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "d2253762",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"from src.mel import Mel\n",
"from PIL import ImageOps, Image\n",
"from IPython.display import Audio\n",
"from datasets import load_from_disk\n",
"from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "293dd2c7",
"metadata": {},
"outputs": [],
"source": [
"mel = Mel(x_res=64, y_res=64, hop_length=1024)"
]
},
{
"cell_type": "markdown",
"id": "5bdb2648",
"metadata": {},
"source": [
"### Run model inference to generate Mel spectrogram"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aac92f90",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9fa5515ab1984c45bf459e9dfa12c3b9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1000 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model_id = \"../ddpm-ema-audio-64\"\n",
"ddpm = DDPMPipeline.from_pretrained(model_id) # you can replace DDPMPipeline with DDIMPipeline or PNDMPipeline for faster inference\n",
"image = ddpm()[\"sample\"][0]"
]
},
{
"cell_type": "markdown",
"id": "df6c533b",
"metadata": {},
"source": [
"### Transform Mel spectrogram to audio"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "37c24f43",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio = mel.image_to_audio(ImageOps.grayscale(image))\n",
"Audio(data=audio, rate=mel.get_sample_rate())"
]
},
{
"cell_type": "markdown",
"id": "10805113",
"metadata": {},
"source": [
"### Compare results with random sample from training set"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "7a366813",
"metadata": {},
"outputs": [],
"source": [
"ds = load_from_disk('../data-64')"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "55a29505",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"audio = mel.image_to_audio(random.choice(ds['train'])['image'])\n",
"Audio(data=audio, rate=mel.get_sample_rate())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afb1f699",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "huggingface",
"language": "python",
"name": "huggingface"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}