teticio committed
Commit 21c77d0
Parent: 02a2d67
README.md CHANGED
@@ -71,7 +71,7 @@ python scripts/audio_to_images.py \
     --output_dir data/audio-diffusion-256 \
     --push_to_hub teticio/audio-diffusion-256
 ```
-
+
 ## Train model
 #### Run training on local machine.
 ```bash
@@ -123,7 +123,7 @@ accelerate launch --config_file config/accelerate_sagemaker.yaml \
     --mixed_precision no
 ```
 ## Latent Audio Diffusion
-Rather than denoising images directly, it is interesting to work in the "latent space" after first encoding images using an autoencoder. This has a number of advantages. Firstly, the information in the images is compressed into a latent space of a much lower dimension, so it is much faster to train denoising diffusion models and run inference with them. Secondly, as the latent space is really a array (tensor) of guassian variables with a particular mean, the decoder is invariant to guassian noise. And thirdly, similar images tend to be clustered together and interpolating between two images in latent space can produce meaningful combinations.
+Rather than denoising images directly, it is interesting to work in the "latent space" after first encoding images using an autoencoder. This has a number of advantages. Firstly, the information in the images is compressed into a latent space of a much lower dimension, so it is much faster to train denoising diffusion models and run inference with them. Secondly, similar images tend to be clustered together and interpolating between two images in latent space can produce meaningful combinations.
 
 At the time of writing, the Hugging Face `diffusers` library is geared towards inference and lacking in training functionality, rather like its cousin `transformers` in the early days of development. In order to train a VAE (Variational Autoencoder), I use the [stable-diffusion](https://github.com/CompVis/stable-diffusion) repo from CompVis and convert the checkpoints to `diffusers` format. Note that it uses a perceptual loss function for images; it would be nice to try a perceptual *audio* loss function.
 
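The latent workflow described in this README change boils down to an encode, scale, denoise, decode round trip. Below is a minimal sketch of that round trip using the `diffusers` `AutoencoderKL` class; the checkpoint path and tensor shape are placeholders, and the 0.18215 scaling factor is the one used elsewhere in this commit.

```python
import torch
from diffusers import AutoencoderKL

# Load a VAE that has been converted to diffusers format (placeholder path).
vae = AutoencoderKL.from_pretrained("path/to/converted-vae")

# Stand-in for a mel spectrogram image scaled to [-1, 1].
image = torch.randn(1, 3, 256, 256)

with torch.no_grad():
    # Encode to the lower-dimensional latent space and apply the scaling
    # factor used during training so the latents have roughly unit variance.
    latents = vae.encode(image).latent_dist.sample() * 0.18215

    # ...denoising diffusion would run here on `latents` rather than `image`...

    # Undo the scaling and decode back to pixel space.
    reconstruction = vae.decode(latents / 0.18215).sample
```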
 
audiodiffusion/__init__.py CHANGED
@@ -100,10 +100,10 @@ class AudioDiffusion:
         """
 
         # It would be better to derive a class from DiffusionPipeline
-        # but currently the return type ImagePipelineOutput cannot be imported.
+        # but currently the return type ImagePipelineOutput cannot be imported
        if steps is None:
            steps = self.pipe.scheduler.num_train_timesteps
-        # Unfortunately, the schedule is set up in the constructor.
+        # Unfortunately, the schedule is set up in the constructor
        scheduler = self.pipe.scheduler.__class__(num_train_timesteps=steps)
        scheduler.set_timesteps(steps)
        mask = None
@@ -121,15 +121,21 @@ class AudioDiffusion:
                                             input_image.width))
             input_image = ((input_image / 255) * 2 - 1)
 
+            if hasattr(self.pipe, 'vqvae'):
+                input_image = self.pipe.vqvae.encode(
+                    input_image).latent_dist.sample(generator=generator)
+                input_image = 0.18215 * input_image
+
             if start_step > 0:
                 images[0, 0] = scheduler.add_noise(
                     torch.tensor(input_image[np.newaxis, np.newaxis, :]),
                     noise, torch.tensor(steps - start_step))
 
-            mask_start = int(mask_start_secs * self.mel.get_sample_rate() /
-                             self.mel.hop_length)
-            mask_end = int(mask_end_secs * self.mel.get_sample_rate() /
-                           self.mel.hop_length)
+            pixels_per_second = (self.mel.get_sample_rate() *
+                                 self.pipe.unet.sample_size /
+                                 self.mel.hop_length / self.mel.x_res)
+            mask_start = int(mask_start_secs * pixels_per_second)
+            mask_end = int(mask_end_secs * pixels_per_second)
             mask = scheduler.add_noise(
                 torch.tensor(input_image[np.newaxis, np.newaxis, :]), noise,
                 torch.tensor(scheduler.timesteps[start_step:]))
@@ -150,11 +156,21 @@ class AudioDiffusion:
                 if mask_end > 0:
                     images[0, 0, :, -mask_end:] = mask[step, 0, :, -mask_end:]
 
+        if hasattr(self.pipe, 'vqvae'):
+            # 0.18215 was scaling factor used in training to ensure unit variance
+            # This is also currently hardcoded in diffusers pipeline
+            images = 1 / 0.18215 * images
+            images = self.pipe.vqvae.decode(images)['sample']
+
         images = (images / 2 + 0.5).clamp(0, 1)
         images = images.cpu().permute(0, 2, 3, 1).numpy()
-
         images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
         image = Image.fromarray(images[0][0])
+
+        if hasattr(self.pipe,
+                   'vqvae') and self.pipe.vqvae.config['out_channels'] == 3:
+            image = image.convert('L')
+
         audio = self.mel.image_to_audio(image)
         return image, (self.mel.get_sample_rate(), audio)
 
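A note on the new mask arithmetic above: in latent mode the UNet denoises an image of width `self.pipe.unet.sample_size` rather than the full spectrogram width `self.mel.x_res`, so a duration in seconds has to be converted to columns of whichever image is actually being denoised. A small worked sketch of that conversion follows; the parameter values are illustrative assumptions, not the repo's defaults.

```python
# Worked example of the pixels_per_second formula introduced above.
# All values below are illustrative assumptions.
sample_rate = 22050   # audio sample rate in Hz
hop_length = 512      # STFT hop length used to build the spectrogram
x_res = 256           # spectrogram width in pixels (one column per hop)
sample_size = 64      # width of the image the UNet denoises (latent resolution)

# In pixel space one second covers sample_rate / hop_length columns (~43 here);
# in latent space the image is narrower by a factor of x_res / sample_size,
# so one second covers proportionally fewer columns.
pixels_per_second = sample_rate * sample_size / hop_length / x_res  # ~10.8

mask_start_secs = 2.0
mask_start = int(mask_start_secs * pixels_per_second)  # ~21 latent columns kept at the start
```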
 
scripts/train_unconditional.py CHANGED
@@ -50,8 +50,10 @@ def main(args):
     model = UNet2DModel(
         sample_size=args.resolution
         if args.vae is None else args.latent_resolution,
-        in_channels=1 if args.vae is None else 3,
-        out_channels=1 if args.vae is None else 3,
+        in_channels=1
+        if args.vae is None else vqvae.config['latent_channels'],
+        out_channels=1
+        if args.vae is None else vqvae.config['latent_channels'],
         layers_per_block=2,
         block_out_channels=(128, 128, 256, 256, 512, 512),
         down_block_types=(
@@ -115,9 +117,9 @@ def main(args):
     )
 
     def transforms(examples):
-        if args.vae is not None:
+        if args.vae is not None and vqvae.config['in_channels'] == 3:
             images = [
-                augmentations(image.convert("RGB"))
+                augmentations(image.convert('RGB'))
                 for image in examples["image"]
             ]
         else:
@@ -182,6 +184,8 @@ def main(args):
                 with torch.no_grad():
                     clean_images = vqvae.encode(
                         clean_images).latent_dist.sample()
+                    # Scale latent images to ensure approximately unit variance
+                    clean_images = clean_images * 0.18215
 
             # Sample noise that we'll add to the images
             noise = torch.randn(clean_images.shape).to(clean_images.device)
@@ -231,9 +235,7 @@ def main(args):
 
         # Generate sample images for visual inspection
         if accelerator.is_main_process:
-            if (
-                    epoch + 1
-            ) % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
+            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 if args.vae is not None:
                     pipeline = LDMPipeline(unet=accelerator.unwrap_model(
                         ema_model.averaged_model if args.use_ema else model),
@@ -262,14 +264,16 @@ def main(args):
                 else:
                     pipeline.save_pretrained(output_dir)
 
-                generator = torch.manual_seed(0)
+            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
+                generator = torch.manual_seed(42)
                 # run pipeline in inference (sample random noise and denoise)
-                images = pipeline(
-                    generator=generator,
-                    batch_size=args.eval_batch_size,
-                    output_type="numpy",
-                    num_inference_steps=args.num_train_steps,
-                )["sample"]
+                with torch.no_grad():
+                    images = pipeline(
+                        generator=generator,
+                        batch_size=args.eval_batch_size,
+                        output_type="numpy",
+                        num_inference_steps=args.num_train_steps,
+                    )["sample"]
 
                 # denormalize the images and save to tensorboard
                 images_processed = ((images *
@@ -278,7 +282,13 @@ def main(args):
                 accelerator.trackers[0].writer.add_images(
                     "test_samples", images_processed, epoch)
                 for _, image in enumerate(images_processed):
-                    audio = mel.image_to_audio(Image.fromarray(image[0]))
+                    image = Image.fromarray(image[0])
+
+                    if args.vae is not None and vqvae.config[
+                            'out_channels'] == 3:
+                        image = image.convert('L')
+
+                    audio = mel.image_to_audio(image)
                     accelerator.trackers[0].writer.add_audio(
                         f"test_audio_{_}",
                         normalize(audio),
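The training-script change above stops hardcoding 3 channels for the latent UNet and instead reads the channel count from the VAE's config. A minimal sketch of that wiring is below, assuming an `AutoencoderKL` checkpoint already converted to diffusers format; the path and sample size are placeholders.

```python
from diffusers import AutoencoderKL, UNet2DModel

# Placeholder path to a VAE in diffusers format.
vqvae = AutoencoderKL.from_pretrained("path/to/converted-vae")

# Stable-diffusion-style VAEs typically use 4 latent channels, but reading the
# value from the config keeps the UNet consistent with whatever VAE is loaded.
latent_channels = vqvae.config['latent_channels']

model = UNet2DModel(
    sample_size=64,               # latent resolution, not the spectrogram resolution
    in_channels=latent_channels,
    out_channels=latent_channels,
)
```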
scripts/train_vae.py CHANGED
@@ -4,6 +4,7 @@
 
 # TODO
 # grayscale
+# update generate from audio to include vae step
 
 import os
 import argparse