v1
- __pycache__/model.cpython-310.pyc +0 -0
- app.py +30 -2
- cvae/__init__.py +7 -0
- cvae/__pycache__/__init__.cpython-310.pyc +0 -0
- cvae/__pycache__/__init__.cpython-311.pyc +0 -0
- cvae/__pycache__/blocks.cpython-310.pyc +0 -0
- cvae/__pycache__/blocks.cpython-311.pyc +0 -0
- cvae/__pycache__/models.cpython-310.pyc +0 -0
- cvae/__pycache__/models.cpython-311.pyc +0 -0
- cvae/blocks.py +59 -0
- cvae/models.py +167 -0
- epoch=17-step=650718.ckpt +3 -0
- model.py +28 -0
__pycache__/model.cpython-310.pyc
ADDED
Binary file (1.96 kB)

app.py
CHANGED
@@ -1,4 +1,32 @@
 import streamlit as st
+from model import generate
+import numpy as np
+
+if "result" not in st.session_state:
+    st.session_state["result"] = np.empty(16000*4)
+
+st.title("Sound Exploration")
+
+col1, col2 = st.columns(2)
+
+with col1:
+    instrument = st.selectbox(
+        'Which instrument do you want?',
+        ('🎸 Bass', '🎺 Brass', '🪈 Flute', '🪕 Guitar', '🎹 Keyboard', '🔨 Mallet', 'Organ', 'Reed', '🎻 String', 'Synth lead', '🎙️ Vocal')
+    )
+
+with col2:
+    instrument_t = st.selectbox(
+        'Which type of instrument do you want?',
+        ('📯 Acoustic', '🎙️ Electronic', '🎛️ Synthetic')
+    )
+
+with st.expander("Magical parameters 🪄"):
+    p1 = st.slider('p1', 0., 1., step=0.001)
+
+if st.button("Generate ✨", type="primary"):
+    st.session_state["result"] = generate([instrument, instrument_t])
+
+if st.session_state["result"].any():
+    st.audio(st.session_state["result"], sample_rate=16000)
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

cvae/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .models import (
+    Encoder, Decoder, VAE, CVAE
+)
+
+from .blocks import (
+    UpResConvBlock, DownResConvBlock
+)

cvae/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (316 Bytes)

cvae/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (430 Bytes)

cvae/__pycache__/blocks.cpython-310.pyc
ADDED
Binary file (2.01 kB)

cvae/__pycache__/blocks.cpython-311.pyc
ADDED
Binary file (4.32 kB)

cvae/__pycache__/models.cpython-310.pyc
ADDED
Binary file (6.09 kB)

cvae/__pycache__/models.cpython-311.pyc
ADDED
Binary file (12 kB)

cvae/blocks.py
ADDED
@@ -0,0 +1,59 @@
+from torch import nn
+
+class UpResConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(UpResConvBlock, self).__init__()
+
+        self.residual = nn.Sequential(
+            nn.Upsample(scale_factor=2),
+            nn.Conv1d(in_channels, out_channels, 1, 1, bias=False),
+        )
+
+        self.main = nn.Sequential(
+            nn.Upsample(scale_factor=2),
+            nn.Conv1d(in_channels, out_channels, kernel_size, 1),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU(),
+            nn.Conv1d(out_channels, out_channels, kernel_size, 1),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU()
+        )
+
+    def forward(self, x):
+        return self.main(x) + self.residual(x)
+
+class DownResConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(DownResConvBlock, self).__init__()
+
+        self.residual = nn.Conv1d(in_channels, out_channels, 1, 2, bias=False)
+
+        self.main = nn.Sequential(
+            nn.Conv1d(in_channels, out_channels, kernel_size, 2),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU(),
+            nn.Conv1d(out_channels, out_channels, kernel_size, 1),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU()
+        )
+
+    def forward(self, x):
+        return self.main(x) + self.residual(x)
+
+class ResConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super(ResConvBlock, self).__init__()
+
+        self.residual = nn.Identity() if in_channels == out_channels else nn.Conv1d(in_channels, out_channels, 1, bias=False)
+
+        self.main = nn.Sequential(
+            nn.Conv1d(in_channels, out_channels, kernel_size),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU(),
+            nn.Conv1d(out_channels, out_channels, kernel_size),
+            nn.GroupNorm(1, out_channels),
+            nn.GELU()
+        )
+
+    def forward(self, x):
+        return self.main(x) + self.residual(x)
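
A minimal shape check (illustrative sketch, not part of the commit) shows why cvae/models.py below instantiates these blocks with kernel_size=1: the convolutions are unpadded, so only a kernel of 1 keeps the main and residual paths the same length, while the stride-2 convolution halves and Upsample(scale_factor=2) doubles the temporal dimension.

import torch
from cvae.blocks import DownResConvBlock, UpResConvBlock

x = torch.randn(1, 32, 64000)       # (batch, channels, samples)
down = DownResConvBlock(32, 64, 1)  # kernel_size=1, as used by the models
up = UpResConvBlock(64, 32, 1)

print(down(x).shape)                # torch.Size([1, 64, 32000]) -- halved
print(up(down(x)).shape)            # torch.Size([1, 32, 64000]) -- restored
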
cvae/models.py
ADDED
@@ -0,0 +1,167 @@
+import torch
+from torch import nn, Tensor
+from torch.optim import Optimizer
+from .blocks import UpResConvBlock, DownResConvBlock
+import lightning as L
+from auraloss.freq import MultiResolutionSTFTLoss
+
+class Encoder(nn.Module):
+    def __init__(self,
+                 in_channels: int,
+                 in_features: int,
+                 out_features: int,
+                 channels: list = None,
+                 ) -> None:
+        super(Encoder, self).__init__()
+
+        assert in_features % 2**len(channels) == 0, f"in_features ({in_features}) must be a multiple of downscale factor ({2**len(channels)})"
+
+        modules = [
+            nn.Conv1d(in_channels, channels[0], 1),
+            nn.GELU()
+        ]
+
+        for in_channel, out_channel in zip(channels, channels[1:]+[channels[-1]]):
+            modules += [
+                DownResConvBlock(in_channel, out_channel, 1),
+            ]
+
+        n_features = int(in_features*.5**len(channels))
+
+        modules += [
+            nn.Flatten(),
+            nn.Linear(n_features*channels[-1], 2*out_features)
+        ]
+
+        self.net = nn.Sequential(*modules)
+
+    def forward(self, x):
+        mean, logvar = self.net(x).chunk(2, dim=1)
+        return mean, logvar
+
+class Decoder(nn.Module):
+    def __init__(self,
+                 out_channels: int,
+                 in_features: int,
+                 out_features: int,
+                 channels: list = None,
+                 ) -> None:
+        super(Decoder, self).__init__()
+
+        n_features = int(out_features/2**len(channels))
+
+        modules = [
+            nn.Linear(in_features, n_features*channels[0]),
+            nn.Unflatten(-1, (channels[0], n_features))
+        ]
+
+        for in_channel, out_channel in zip(channels, channels[1:]+[channels[-1]]):
+            modules += [
+                UpResConvBlock(in_channel, out_channel, 1),
+            ]
+
+        modules += [
+            nn.Conv1d(channels[-1], out_channels, 1),
+            nn.GELU()
+        ]
+
+        self.net = nn.Sequential(*modules)
+
+    def forward(self, x):
+        x = torch.tanh(self.net(x))
+        return x
+
+
+class VAE(L.LightningModule):
+    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, learning_rate: float):
+        super().__init__()
+        self.encoder = Encoder(io_channels, io_features, latent_features, channels)
+        channels.reverse()
+        self.decoder = Decoder(io_channels, latent_features, io_features, channels)
+        self.latent_features = latent_features
+        self.audio_loss_func = MultiResolutionSTFTLoss()
+        self.learning_rate = learning_rate
+
+    @torch.no_grad()
+    def sample(self, eps=None):
+        if eps is None:
+            eps = torch.rand((1, self.latent_features))
+        return self.decoder(eps)
+
+    def loss_function(self, x, x_hat, mean, logvar):
+        audio_loss = self.audio_loss_func(x, x_hat)
+        kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
+        return audio_loss + kld_loss
+
+    def reparameterize(self, mean, logvar):
+        std = torch.exp(0.5 * logvar)
+        eps = torch.randn_like(std)
+        return eps * std + mean
+
+    def forward(self, x):
+        mean, logvar = self.encoder(x)
+        z = self.reparameterize(mean, logvar)
+        return self.decoder(z), mean, logvar
+
+    def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
+        x_hat, mean, logvar = self.forward(batch)
+        loss = self.loss_function(batch, x_hat, mean, logvar)
+        if log: self.log("train_loss", loss, prog_bar=True)
+        return loss
+
+    def configure_optimizers(self) -> Optimizer:
+        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
+        return optimizer
+
+
+class CVAE(L.LightningModule):
+    def __init__(self, io_channels: int, io_features: int, latent_features: int, channels: list, num_classes: int, learning_rate: float):
+        super().__init__()
+        self.class_embedder = nn.Linear(num_classes, io_features)
+        self.data_embedder = nn.Conv1d(io_channels, io_channels, kernel_size=1)
+        self.encoder = Encoder(io_channels+1, io_features, latent_features, channels)
+        channels.reverse()
+        self.decoder = Decoder(io_channels, latent_features+num_classes, io_features, channels)
+        self.num_classes = num_classes
+        self.latent_features = latent_features
+        self.audio_loss_func = MultiResolutionSTFTLoss()
+        self.learning_rate = learning_rate
+
+    @torch.no_grad()
+    def sample(self, c, eps=None):
+        c = nn.functional.one_hot(c, num_classes=self.num_classes).float().unsqueeze(0)
+        if eps is None:
+            eps = torch.rand((1, self.latent_features))
+        z = torch.cat([eps, c], dim=1)
+        return self.decoder(z)
+
+    def loss_function(self, x, x_hat, mean, logvar):
+        audio_loss = self.audio_loss_func(x, x_hat)
+        kld_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
+        return audio_loss + kld_loss
+
+    def reparameterize(self, mean, logvar):
+        std = torch.exp(0.5 * logvar)
+        eps = torch.randn_like(std)
+        return eps * std + mean
+
+    def forward(self, x, c):
+        c = nn.functional.one_hot(c, num_classes=self.num_classes).float()
+        c_embedding = self.class_embedder(c).unsqueeze(1)
+        x_embedding = self.data_embedder(x)
+        x = torch.cat([x_embedding, c_embedding], dim=1)
+        mean, logvar = self.encoder(x)
+        z = self.reparameterize(mean, logvar)
+        z = torch.cat([z, c], dim=1)
+        return self.decoder(z), mean, logvar
+
+    def training_step(self, batch: Tensor, batch_idx: int, log: bool = True) -> Tensor:
+        x, c = batch
+        x_hat, mean, logvar = self.forward(x, c)
+        loss = self.loss_function(x, x_hat, mean, logvar)
+        if log: self.log("train_loss", loss, prog_bar=True)
+        return loss
+
+    def configure_optimizers(self) -> Optimizer:
+        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
+        return optimizer
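
A minimal usage sketch for CVAE (illustrative only: the sizes here are made up and much smaller than the deployed checkpoint's, and it assumes torch, lightning, and auraloss are installed). sample() one-hot-encodes the class index, concatenates it to the latent noise, and decodes.

import torch
from cvae import CVAE

model = CVAE(io_channels=1, io_features=1024, latent_features=5,
             channels=[32, 64], num_classes=33, learning_rate=1e-5)

c = torch.tensor(4)      # class index; one-hot encoded inside sample()
audio = model.sample(c)  # eps defaults to torch.rand((1, latent_features))
print(audio.shape)       # torch.Size([1, 1, 1024]), tanh-squashed to [-1, 1]
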
epoch=17-step=650718.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cbccc3cf4b4a124831ab6fc7f23b4270ed90fcb41e1e87277ec4155787362c8
+size 651547328

model.py
ADDED
@@ -0,0 +1,28 @@
+from cvae import CVAE
+import torch
+from typing import Sequence
+import re
+
+instruments = ['bass_acoustic', 'brass_acoustic', 'flute_acoustic', 'guitar_acoustic', 'keyboard_acoustic', 'mallet_acoustic', 'organ_acoustic', 'reed_acoustic', 'string_acoustic', 'synth_lead_acoustic', 'vocal_acoustic', 'bass_synthetic', 'brass_synthetic', 'flute_synthetic', 'guitar_synthetic', 'keyboard_synthetic', 'mallet_synthetic', 'organ_synthetic', 'reed_synthetic', 'string_synthetic', 'synth_lead_synthetic', 'vocal_synthetic', 'bass_electronic', 'brass_electronic', 'flute_electronic', 'guitar_electronic', 'keyboard_electronic', 'mallet_electronic', 'organ_electronic', 'reed_electronic', 'string_electronic', 'synth_lead_electronic', 'vocal_electronic']
+
+model = CVAE.load_from_checkpoint(
+    'epoch=17-step=650718.ckpt',
+    io_channels=1,
+    io_features=16000*4,
+    latent_features=5,
+    channels=[32, 64, 128, 256, 512],
+    num_classes=len(instruments),
+    learning_rate=1e-5
+)
+
+def format(text):
+    text = text.split(' ')[-1]
+    return text.replace(" ", "").lower()
+
+def choice_to_tensor(choice: Sequence[str]) -> torch.Tensor:
+    choice = '_'.join([format(i) for i in choice])
+    return torch.tensor(instruments.index(choice))
+
+def generate(choice: Sequence[str], params: Sequence[int]=None):
+    noise = torch.tensor(params).unsqueeze(0).to('cuda') if params else torch.randn(1, 5).to('cuda')
+    return model.sample(eps=noise, c=choice_to_tensor(choice).to('cuda')).cpu().numpy()[0]
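
For reference, a minimal sketch of the inference path that app.py drives (illustrative; the committed generate() moves tensors to 'cuda', so this assumes a CUDA-capable host, and importing model loads the checkpoint):

from model import generate

# Mirrors the two selectbox values chosen in app.py; format() keeps the last
# word of each label, so this pair resolves to the 'bass_acoustic' class index.
audio = generate(['🎸 Bass', '📯 Acoustic'])  # numpy array of shape (1, 16000*4)
# app.py then plays it via st.audio(audio, sample_rate=16000)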