from abc import ABC, abstractmethod

import torch


class SchedulerInterface(ABC):
    """
    Base class for diffusion noise schedules.
    """
    alphas_cumprod: torch.Tensor  # [T], cumulative product of alphas defining the noise schedule

    @abstractmethod
    def add_noise(
        self, clean_latent: torch.Tensor,
        noise: torch.Tensor, timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Diffusion forward corruption process.
        Input:
            - clean_latent: the clean latent with shape [B, C, H, W]
            - noise: the noise with shape [B, C, H, W]
            - timestep: the timestep with shape [B]
        Output: the corrupted latent with shape [B, C, H, W]
        """
        pass

    def convert_x0_to_noise(
        self, x0: torch.Tensor, xt: torch.Tensor,
        timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Convert the diffusion network's x0 prediction to a noise prediction.
        x0: the predicted clean data with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]

        noise = (xt - sqrt(alpha_t) * x0) / sqrt(beta_t)  (eq 11 in https://arxiv.org/abs/2311.18828)
        """
        # use higher precision for calculations
        original_dtype = x0.dtype
        x0, xt, alphas_cumprod = map(
            lambda x: x.double().to(x0.device), [x0, xt, self.alphas_cumprod]
        )
        alpha_prod_t = alphas_cumprod[timestep].reshape(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t
        noise_pred = (xt - alpha_prod_t ** 0.5 * x0) / beta_prod_t ** 0.5
        return noise_pred.to(original_dtype)

    def convert_noise_to_x0(
        self, noise: torch.Tensor, xt: torch.Tensor,
        timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Convert the diffusion network's noise prediction to an x0 prediction.
        noise: the predicted noise with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]

        x0 = (xt - sqrt(beta_t) * noise) / sqrt(alpha_t)  (eq 11 in https://arxiv.org/abs/2311.18828)
        """
        # use higher precision for calculations
        original_dtype = noise.dtype
        noise, xt, alphas_cumprod = map(
            lambda x: x.double().to(noise.device), [noise, xt, self.alphas_cumprod]
        )
        alpha_prod_t = alphas_cumprod[timestep].reshape(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t
        x0_pred = (xt - beta_prod_t ** 0.5 * noise) / alpha_prod_t ** 0.5
        return x0_pred.to(original_dtype)

    def convert_velocity_to_x0(
        self, velocity: torch.Tensor, xt: torch.Tensor,
        timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Convert the diffusion network's velocity prediction to an x0 prediction.
        velocity: the predicted velocity with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]

        v = sqrt(alpha_t) * noise - sqrt(beta_t) * x0
        noise = (xt - sqrt(alpha_t) * x0) / sqrt(beta_t)
        Given v and x_t, we have
        x0 = sqrt(alpha_t) * x_t - sqrt(beta_t) * v
        See derivation: https://chatgpt.com/share/679fb6c8-3a30-8008-9b0e-d1ae892dac56
        """
        # use higher precision for calculations
        original_dtype = velocity.dtype
        velocity, xt, alphas_cumprod = map(
            lambda x: x.double().to(velocity.device), [velocity, xt, self.alphas_cumprod]
        )
        alpha_prod_t = alphas_cumprod[timestep].reshape(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t
        x0_pred = (alpha_prod_t ** 0.5) * xt - (beta_prod_t ** 0.5) * velocity
        return x0_pred.to(original_dtype)
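

# Illustrative sketch: a minimal concrete subclass of SchedulerInterface,
# assuming a standard linear-beta DDPM schedule. The class name and the
# default hyperparameters below are hypothetical; this only shows how a
# concrete scheduler supplies `alphas_cumprod` and implements `add_noise`
# so that the x0/noise/velocity conversion helpers above can be reused.
class ExampleDDPMScheduler(SchedulerInterface):
    def __init__(self, num_train_timesteps: int = 1000,
                 beta_start: float = 1e-4, beta_end: float = 2e-2):
        betas = torch.linspace(beta_start, beta_end, num_train_timesteps)
        self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)  # [T]

    def add_noise(self, clean_latent, noise, timestep):
        # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
        alpha_prod_t = self.alphas_cumprod.to(clean_latent.device)[timestep]
        alpha_prod_t = alpha_prod_t.reshape(-1, 1, 1, 1)
        return alpha_prod_t ** 0.5 * clean_latent + (1 - alpha_prod_t) ** 0.5 * noise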


class FlowMatchScheduler:

    def __init__(self, num_inference_steps=100, num_train_timesteps=1000, shift=3.0,
                 sigma_max=1.0, sigma_min=0.003 / 1.002, inverse_timesteps=False,
                 extra_one_step=False, reverse_sigmas=False):
        self.num_train_timesteps = num_train_timesteps
        self.shift = shift
        self.sigma_max = sigma_max
        self.sigma_min = sigma_min
        self.inverse_timesteps = inverse_timesteps
        self.extra_one_step = extra_one_step
        self.reverse_sigmas = reverse_sigmas
        self.set_timesteps(num_inference_steps)

    def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False):
        sigma_start = self.sigma_min + (self.sigma_max - self.sigma_min) * denoising_strength
        if self.extra_one_step:
            self.sigmas = torch.linspace(
                sigma_start, self.sigma_min, num_inference_steps + 1)[:-1]
        else:
            self.sigmas = torch.linspace(
                sigma_start, self.sigma_min, num_inference_steps)
        if self.inverse_timesteps:
            self.sigmas = torch.flip(self.sigmas, dims=[0])
        self.sigmas = self.shift * self.sigmas / (1 + (self.shift - 1) * self.sigmas)
        if self.reverse_sigmas:
            self.sigmas = 1 - self.sigmas
        self.timesteps = self.sigmas * self.num_train_timesteps
        if training:
            # Bell-shaped weighting over the training timesteps: a Gaussian bump,
            # shifted to a zero minimum and normalized to sum to num_inference_steps.
            x = self.timesteps
            y = torch.exp(-2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2)
            y_shifted = y - y.min()
            bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
            self.linear_timesteps_weights = bsmntw_weighing

    def step(self, model_output, timestep, sample, to_final=False):
        if timestep.ndim == 2:
            timestep = timestep.flatten(0, 1)
        self.sigmas = self.sigmas.to(model_output.device)
        self.timesteps = self.timesteps.to(model_output.device)
        # Find the closest discretized timestep for each element in the batch.
        timestep_id = torch.argmin(
            (self.timesteps.unsqueeze(0) - timestep.unsqueeze(1)).abs(), dim=1)
        sigma = self.sigmas[timestep_id].reshape(-1, 1, 1, 1)
        if to_final or (timestep_id + 1 >= len(self.timesteps)).any():
            sigma_ = 1 if (self.inverse_timesteps or self.reverse_sigmas) else 0
        else:
            sigma_ = self.sigmas[timestep_id + 1].reshape(-1, 1, 1, 1)
        # Euler step along the predicted velocity, from sigma to the next sigma.
        prev_sample = sample + model_output * (sigma_ - sigma)
        return prev_sample

    def add_noise(self, original_samples, noise, timestep):
        """
        Diffusion forward corruption process.
        Input:
            - original_samples: the clean latent with shape [B*T, C, H, W]
            - noise: the noise with shape [B*T, C, H, W]
            - timestep: the timestep with shape [B*T]
        Output: the corrupted latent with shape [B*T, C, H, W]
        """
        if timestep.ndim == 2:
            timestep = timestep.flatten(0, 1)
        self.sigmas = self.sigmas.to(noise.device)
        self.timesteps = self.timesteps.to(noise.device)
        timestep_id = torch.argmin(
            (self.timesteps.unsqueeze(0) - timestep.unsqueeze(1)).abs(), dim=1)
        sigma = self.sigmas[timestep_id].reshape(-1, 1, 1, 1)
        # Linear interpolation between data and noise (rectified-flow corruption).
        sample = (1 - sigma) * original_samples + sigma * noise
        return sample.type_as(noise)

    def training_target(self, sample, noise, timestep):
        # Flow-matching target: the constant velocity pointing from data to noise.
        target = noise - sample
        return target

    def training_weight(self, timestep):
        """
        Input:
            - timestep: the timestep with shape [B*T]
        Output: the corresponding weighting [B*T]
        """
        if timestep.ndim == 2:
            timestep = timestep.flatten(0, 1)
        # Move buffers to the same device as the incoming timesteps.
        self.timesteps = self.timesteps.to(timestep.device)
        self.linear_timesteps_weights = self.linear_timesteps_weights.to(timestep.device)
        timestep_id = torch.argmin(
            (self.timesteps.unsqueeze(1) - timestep.unsqueeze(0)).abs(), dim=0)
        weights = self.linear_timesteps_weights[timestep_id]
        return weights
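

# Usage sketch: a minimal end-to-end example assuming random latents and a
# stand-in network prediction. The shapes and the dummy tensors below are
# hypothetical; only the FlowMatchScheduler API defined above is exercised.
if __name__ == "__main__":
    scheduler = FlowMatchScheduler(num_inference_steps=100, shift=3.0, extra_one_step=True)
    scheduler.set_timesteps(num_inference_steps=100, training=True)

    latents = torch.randn(4, 16, 32, 32)   # clean latents [B, C, H, W]
    noise = torch.randn_like(latents)       # Gaussian noise
    timestep = scheduler.timesteps[torch.randint(0, 100, (4,))]  # sampled timesteps [B]

    # Training side: corrupt the latents, build the velocity target, weight the loss.
    noisy = scheduler.add_noise(latents, noise, timestep)
    target = scheduler.training_target(latents, noise, timestep)
    weight = scheduler.training_weight(timestep)
    pred_velocity = torch.randn_like(noisy)  # stand-in for a network prediction
    loss = (weight * (pred_velocity - target).pow(2).mean(dim=(1, 2, 3))).mean()

    # Inference side: one Euler step using the (dummy) predicted velocity.
    prev_sample = scheduler.step(pred_velocity, timestep, noisy)
    print(noisy.shape, weight.shape, prev_sample.shape, float(loss))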