teticio committed
Commit 001a426
1 Parent(s): d55f1e0
.gitignore CHANGED
@@ -1,13 +1,11 @@
 .vscode
 __pycache__
 .ipynb_checkpoints
-data*
-ddpm-ema-audio-*
+data
+models
 flagged
 build
 audiodiffusion.egg-info
 lightning_logs
 taming
 checkpoints
-vae_model
-latent-audio-diffusion-*
 
 
README.md CHANGED
@@ -56,7 +56,7 @@ python scripts/audio_to_images.py \
   --resolution 64 \
   --hop_length 1024 \
   --input_dir path-to-audio-files \
-  --output_dir data-test
+  --output_dir path-to-output-data
 ```
 #### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
 
@@ -64,7 +64,7 @@ python scripts/audio_to_images.py \
 python scripts/audio_to_images.py \
   --resolution 256 \
   --input_dir path-to-audio-files \
-  --output_dir data-256 \
+  --output_dir data/audio-diffusion-256 \
   --push_to_hub teticio/audio-diffusion-256
 ```
 ## Train model
@@ -72,10 +72,10 @@ python scripts/audio_to_images.py \
 ```bash
 accelerate launch --config_file config/accelerate_local.yaml \
   scripts/train_unconditional.py \
-  --dataset_name data-64 \
+  --dataset_name data/audio-diffusion-64 \
   --resolution 64 \
   --hop_length 1024 \
-  --output_dir ddpm-ema-audio-64 \
+  --output_dir models/ddpm-ema-audio-64 \
   --train_batch_size 16 \
   --num_epochs 100 \
   --gradient_accumulation_steps 1 \
@@ -89,7 +89,7 @@ accelerate launch --config_file config/accelerate_local.yaml \
   scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
-  --output_dir audio-diffusion-256 \
+  --output_dir models/audio-diffusion-256 \
   --num_epochs 100 \
   --train_batch_size 2 \
   --eval_batch_size 2 \
@@ -107,7 +107,7 @@ accelerate launch --config_file config/accelerate_sagemaker.yaml \
   scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
-  --output_dir ddpm-ema-audio-256 \
+  --output_dir models/ddpm-ema-audio-256 \
   --train_batch_size 16 \
   --num_epochs 100 \
   --gradient_accumulation_steps 1 \
notebooks/test_vae.ipynb CHANGED
@@ -1,5 +1,17 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c8663ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.dirname(os.path.abspath(\"\")))"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -12,7 +24,9 @@
     "import numpy as np\n",
     "from PIL import Image\n",
     "from datasets import load_dataset\n",
-    "from diffusers import AutoencoderKL"
+    "from IPython.display import Audio\n",
+    "from diffusers import AutoencoderKL\n",
+    "from audiodiffusion.mel import Mel"
    ]
   },
   {
@@ -22,7 +36,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "vae = AutoencoderKL.from_pretrained('../vae_model')"
+    "mel = Mel()\n",
+    "vae = AutoencoderKL.from_pretrained('../models/autoencoder-kl')"
    ]
   },
   {
@@ -42,7 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ds = load_dataset('teticio/audio-diffusion-256')"
+    "ds = load_dataset('teticio/audio-diffusion-breaks-256')"
    ]
   },
   {
@@ -53,7 +68,8 @@
    "outputs": [],
    "source": [
     "image = random.choice(ds['train'])['image']\n",
-    "image"
+    "display(image)\n",
+    "Audio(data=mel.image_to_audio(image), rate=mel.get_sample_rate())"
    ]
   },
   {
@@ -84,7 +100,9 @@
    "output_image = (output_image + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w\n",
    "output_image = (output_image.detach().cpu().numpy() *\n",
    " 255).round().astype(\"uint8\").transpose(0, 2, 3, 1)[0]\n",
-    "Image.fromarray(output_image)"
+    "output_image = Image.fromarray(output_image).convert('L')\n",
+    "display(output_image)\n",
+    "Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())"
    ]
   },
   {
@@ -100,7 +118,9 @@
    "output_image = (output_image + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w\n",
    "output_image = (output_image.detach().cpu().numpy() *\n",
    " 255).round().astype(\"uint8\").transpose(0, 2, 3, 1)[0]\n",
-    "Image.fromarray(output_image)"
+    "output_image = Image.fromarray(output_image).convert('L')\n",
+    "display(output_image)\n",
+    "Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())"
    ]
   },
   {
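Consolidated, the notebook cells touched above amount to the following round trip: load the exported VAE, pick a random Mel-spectrogram image from the dataset, and turn it back into audio. A minimal sketch, assuming the notebook is run from the `notebooks/` directory (so `../models/autoencoder-kl` exists once `train_vae.py` has exported a checkpoint) and that the `audiodiffusion` package is importable; every call below appears in the diff itself.

```python
# Minimal sketch of the notebook's data flow (assumptions: run from notebooks/,
# ../models/autoencoder-kl present, audiodiffusion on sys.path).
import random

from datasets import load_dataset
from diffusers import AutoencoderKL
from IPython.display import Audio, display

from audiodiffusion.mel import Mel

mel = Mel()  # converts between Mel-spectrogram images and audio
vae = AutoencoderKL.from_pretrained('../models/autoencoder-kl')

# Pick a random spectrogram image and listen to the corresponding audio.
ds = load_dataset('teticio/audio-diffusion-breaks-256')
image = random.choice(ds['train'])['image']
display(image)
Audio(data=mel.image_to_audio(image), rate=mel.get_sample_rate())
```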
scripts/train_unconditional.py CHANGED
@@ -73,11 +73,12 @@ def main(args):
     )
 
     if args.scheduler == "ddpm":
-        noise_scheduler = DDPMScheduler(num_train_timesteps=1000,
-                                        tensor_format="pt")
+        noise_scheduler = DDPMScheduler(
+            num_train_timesteps=args.num_train_steps, tensor_format="pt")
     else:
-        noise_scheduler = DDIMScheduler(num_train_timesteps=1000,
-                                        tensor_format="pt")
+        noise_scheduler = DDIMScheduler(
+            num_train_timesteps=args.num_train_steps, tensor_format="pt")
+
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=args.learning_rate,
@@ -305,7 +306,6 @@ if __name__ == "__main__":
     parser.add_argument("--overwrite_output_dir", type=bool, default=False)
     parser.add_argument("--cache_dir", type=str, default=None)
     parser.add_argument("--resolution", type=int, default=256)
-    parser.add_argument("--latent_resolution", type=int, default=64)
     parser.add_argument("--train_batch_size", type=int, default=16)
     parser.add_argument("--eval_batch_size", type=int, default=16)
     parser.add_argument("--num_epochs", type=int, default=100)
@@ -342,6 +342,8 @@
     parser.add_argument("--hop_length", type=int, default=512)
     parser.add_argument("--from_pretrained", type=str, default=None)
     parser.add_argument("--start_epoch", type=int, default=0)
+    parser.add_argument("--num_train_steps", type=int, default=1000)
+    parser.add_argument("--latent_resolution", type=int, default=64)
     parser.add_argument("--scheduler",
                         type=str,
                         default="ddpm",
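Pulling the two hunks together: the new `--num_train_steps` flag replaces the hard-coded 1000 timesteps when the noise scheduler is built. A minimal sketch of that wiring, assuming the older diffusers version this script targets (where `tensor_format` was still a scheduler argument); argument names are taken from the diff, and `parse_args([])` is used here only so the snippet runs standalone.

```python
# Minimal sketch: --num_train_steps feeds the scheduler's num_train_timesteps.
import argparse

from diffusers import DDIMScheduler, DDPMScheduler

parser = argparse.ArgumentParser()
parser.add_argument("--scheduler", type=str, default="ddpm")
parser.add_argument("--num_train_steps", type=int, default=1000)
args = parser.parse_args([])  # defaults, for illustration only

if args.scheduler == "ddpm":
    noise_scheduler = DDPMScheduler(
        num_train_timesteps=args.num_train_steps, tensor_format="pt")
else:
    noise_scheduler = DDIMScheduler(
        num_train_timesteps=args.num_train_steps, tensor_format="pt")
```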
scripts/train_vae.py CHANGED
@@ -107,7 +107,7 @@ class ImageLogger(Callback):
 
 class HFModelCheckpoint(ModelCheckpoint):
 
-    def __init__(self, ldm_config, hf_checkpoint='vae_model', *args, **kwargs):
+    def __init__(self, ldm_config, hf_checkpoint='models/autoencoder-kl', *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.ldm_config = ldm_config
         self.hf_checkpoint = hf_checkpoint
@@ -130,7 +130,7 @@ if __name__ == "__main__":
                         default="config/ldm_autoencoder_kl.yaml")
     parser.add_argument("--ldm_checkpoint_dir",
                         type=str,
-                        default="checkpoints")
+                        default="models/ldm-autoencoder-kl")
     parser.add_argument("--hf_checkpoint_dir", type=str, default="vae_model")
     parser.add_argument("-r",
                         "--resume_from_checkpoint",