import math
from typing import Callable, Optional, Iterable

import numpy as np
import jax
import jax.numpy as jnp
import flax.linen as nn
from jaxtyping import Array


def trunc_normal(mean=0., std=1., a=-2., b=2., dtype=jnp.float32) -> Callable:
    """Truncated normal initialization function."""

    def init(key, shape, dtype=dtype) -> Array:

        def norm_cdf(x):
            return (1. + math.erf(x / math.sqrt(2.))) / 2.

        # Sample uniformly between the CDF values of the truncation bounds, then map the
        # samples back through the inverse error function to obtain a truncated normal.
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)
        out = jax.random.uniform(key, shape, dtype=dtype, minval=2 * l - 1, maxval=2 * u - 1)
        out = jax.scipy.special.erfinv(out) * std * math.sqrt(2.) + mean
        return jnp.clip(out, a, b)

    return init

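# Illustrative usage (not part of the original module): `kernel_init`-style
# initializers are called with an RNG key and a shape, e.g.
#
#     init_fn = trunc_normal(std=.02)
#     sample = init_fn(jax.random.PRNGKey(0), (3, 4))
#
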
def Dense(features, use_bias=True, kernel_init=trunc_normal(std=.02), bias_init=nn.initializers.zeros):
    return nn.Dense(features, use_bias=use_bias, kernel_init=kernel_init, bias_init=bias_init)


def LayerNorm():
    """PyTorch LayerNorm uses a larger epsilon (1e-5) than the Flax default, so match it here."""
    return nn.LayerNorm(epsilon=1e-05)


class Mlp(nn.Module):
    in_features: int
    hidden_features: Optional[int] = None
    out_features: Optional[int] = None
    act_layer: Callable = nn.gelu
    drop: float = 0.0

    @nn.compact
    def __call__(self, x, training: bool):
        x = nn.Dense(self.hidden_features or self.in_features)(x)
        x = self.act_layer(x)
        x = nn.Dropout(self.drop, deterministic=not training)(x)
        x = nn.Dense(self.out_features or self.in_features)(x)
        x = nn.Dropout(self.drop, deterministic=not training)(x)
        return x


def window_partition(x, window_size: int):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.reshape((B, H // window_size, window_size, W // window_size, window_size, C))
    windows = x.transpose((0, 1, 3, 2, 4, 5)).reshape((-1, window_size, window_size, C))
    return windows


def window_reverse(windows, window_size: int, H: int, W: int):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.reshape((B, H // window_size, W // window_size, window_size, window_size, -1))
    x = x.transpose((0, 1, 3, 2, 4, 5)).reshape((B, H, W, -1))
    return x

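# A small shape sketch (illustrative, not part of the original module): the two
# helpers above are inverses of each other for matching arguments.
#
#     x = jnp.zeros((2, 8, 8, 3))
#     w = window_partition(x, window_size=4)           # (2 * 4, 4, 4, 3) windows
#     y = window_reverse(w, window_size=4, H=8, W=8)   # back to (2, 8, 8, 3)
#
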
class DropPath(nn.Module):
    """Stochastic depth per sample.

    Implementation adapted from
    https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
    """

    dropout_prob: float = 0.1
    deterministic: Optional[bool] = None

    @nn.compact
    def __call__(self, input, training):
        if not training or self.dropout_prob == 0.:
            return input
        keep_prob = 1 - self.dropout_prob
        # One Bernoulli(keep_prob) draw per sample, broadcast over all remaining axes.
        shape = (input.shape[0],) + (1,) * (input.ndim - 1)
        rng = self.make_rng("dropout")
        random_tensor = keep_prob + jax.random.uniform(rng, shape)
        random_tensor = jnp.floor(random_tensor)
        return jnp.divide(input, keep_prob) * random_tensor


class WindowAttention(nn.Module):
    dim: int
    window_size: Iterable[int]
    num_heads: int
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    att_drop: float = 0.0
    proj_drop: float = 0.0

    def make_rel_pos_index(self):
        """Pairwise relative-position index for every token pair inside one window."""
        h_indices = np.arange(0, self.window_size[0])
        w_indices = np.arange(0, self.window_size[1])
        indices = np.stack(np.meshgrid(w_indices, h_indices, indexing="ij"))
        flatten_indices = np.reshape(indices, (2, -1))
        relative_indices = flatten_indices[:, :, None] - flatten_indices[:, None, :]
        relative_indices = np.transpose(relative_indices, (1, 2, 0))
        # Shift offsets to start from 0 and flatten the 2D offset into a single table index.
        relative_indices[:, :, 0] += self.window_size[0] - 1
        relative_indices[:, :, 1] += self.window_size[1] - 1
        relative_indices[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_pos_index = np.sum(relative_indices, -1)
        return relative_pos_index

    @nn.compact
    def __call__(self, inputs, mask, training):
        # Learnable relative position bias: one entry per relative offset and per head.
        rpbt = self.param(
            "relative_position_bias_table",
            trunc_normal(std=.02),
            (
                (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1),
                self.num_heads,
            ),
        )

        batch, n, channels = inputs.shape
        qkv = nn.Dense(self.dim * 3, use_bias=self.qkv_bias, name="qkv")(inputs)
        qkv = qkv.reshape(batch, n, 3, self.num_heads, channels // self.num_heads)
        qkv = jnp.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        scale = self.qk_scale or (self.dim // self.num_heads) ** -0.5
        q = q * scale
        att = q @ jnp.swapaxes(k, -2, -1)

        rel_pos_bias = jnp.reshape(
            rpbt[np.reshape(self.make_rel_pos_index(), (-1))],
            (
                self.window_size[0] * self.window_size[1],
                self.window_size[0] * self.window_size[1],
                -1,
            ),
        )
        rel_pos_bias = jnp.transpose(rel_pos_bias, (2, 0, 1))
        att += jnp.expand_dims(rel_pos_bias, 0)

        if mask is not None:
            att = jnp.reshape(
                att, (batch // mask.shape[0], mask.shape[0], self.num_heads, n, n)
            )
            att = att + jnp.expand_dims(jnp.expand_dims(mask, 1), 0)
            att = jnp.reshape(att, (-1, self.num_heads, n, n))
            att = jax.nn.softmax(att)
        else:
            att = jax.nn.softmax(att)

        att = nn.Dropout(self.att_drop)(att, deterministic=not training)

        x = jnp.reshape(jnp.swapaxes(att @ v, 1, 2), (batch, n, channels))
        x = nn.Dense(self.dim, name="proj")(x)
        x = nn.Dropout(self.proj_drop)(x, deterministic=not training)
        return x


class SwinTransformerBlock(nn.Module):
    dim: int
    input_resolution: tuple[int, int]
    num_heads: int
    window_size: int = 7
    shift_size: int = 0
    mlp_ratio: float = 4.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop: float = 0.
    attn_drop: float = 0.
    drop_path: float = 0.
    act_layer: Callable = nn.activation.gelu
    norm_layer: Callable = LayerNorm

    @staticmethod
    def make_att_mask(shift_size, window_size, height, width):
        if shift_size > 0:
            # Label each spatial region produced by the cyclic shift, then mask out
            # attention between tokens that come from different regions.
            mask = jnp.zeros([1, height, width, 1])
            h_slices = (
                slice(0, -window_size),
                slice(-window_size, -shift_size),
                slice(-shift_size, None),
            )
            w_slices = (
                slice(0, -window_size),
                slice(-window_size, -shift_size),
                slice(-shift_size, None),
            )

            count = 0
            for h in h_slices:
                for w in w_slices:
                    mask = mask.at[:, h, w, :].set(count)
                    count += 1

            mask_windows = window_partition(mask, window_size)
            mask_windows = jnp.reshape(mask_windows, (-1, window_size * window_size))
            att_mask = jnp.expand_dims(mask_windows, 1) - jnp.expand_dims(mask_windows, 2)
            att_mask = jnp.where(att_mask != 0.0, float(-100.0), att_mask)
            att_mask = jnp.where(att_mask == 0.0, float(0.0), att_mask)
        else:
            att_mask = None

        return att_mask

    @nn.compact
    def __call__(self, x, x_size, training):
        H, W = x_size
        B, L, C = x.shape

        # Flax modules are frozen dataclasses, so the effective window/shift sizes are
        # resolved into local variables instead of being assigned to `self`.
        window_size, shift_size = self.window_size, self.shift_size
        if min(self.input_resolution) <= window_size:
            # The window does not fit the input: use one window and no shift.
            shift_size = 0
            window_size = min(self.input_resolution)
        assert 0 <= shift_size < window_size, "shift_size must be in [0, window_size)"

        shortcut = x
        x = self.norm_layer()(x)
        x = x.reshape((B, H, W, C))

        # Cyclic shift
        if shift_size > 0:
            shifted_x = jnp.roll(x, (-shift_size, -shift_size), axis=(1, 2))
        else:
            shifted_x = x

        # Partition windows
        x_windows = window_partition(shifted_x, window_size)
        x_windows = x_windows.reshape((-1, window_size * window_size, C))

        attn_mask = self.make_att_mask(shift_size, window_size, *self.input_resolution)

        attn = WindowAttention(self.dim, (window_size, window_size), self.num_heads,
                               self.qkv_bias, self.qk_scale, self.attn_drop, self.drop)
        if self.input_resolution == x_size:
            attn_windows = attn(x_windows, attn_mask, training)
        else:
            # The actual resolution differs from the training resolution, so the
            # attention mask is rebuilt for the current input size (test time only).
            assert not training
            test_mask = self.make_att_mask(shift_size, window_size, *x_size)
            attn_windows = attn(x_windows, test_mask, training=False)

        # Merge windows and reverse the cyclic shift
        attn_windows = attn_windows.reshape((-1, window_size, window_size, C))
        shifted_x = window_reverse(attn_windows, window_size, H, W)

        if shift_size > 0:
            x = jnp.roll(shifted_x, (shift_size, shift_size), axis=(1, 2))
        else:
            x = shifted_x

        x = x.reshape((B, H * W, C))

        # Residual connections with stochastic depth, then the MLP block
        x = shortcut + DropPath(self.drop_path)(x, training)

        norm = self.norm_layer()(x)
        mlp = Mlp(in_features=self.dim, hidden_features=int(self.dim * self.mlp_ratio),
                  act_layer=self.act_layer, drop=self.drop)(norm, training)
        x = x + DropPath(self.drop_path)(mlp, training)

        return x


class PatchMerging(nn.Module):
    inp_res: Iterable[int]
    dim: int
    norm_layer: Callable = LayerNorm

    @nn.compact
    def __call__(self, inputs):
        batch, n, channels = inputs.shape
        height, width = self.inp_res[0], self.inp_res[1]
        x = jnp.reshape(inputs, (batch, height, width, channels))

        x0 = x[:, 0::2, 0::2, :]
        x1 = x[:, 1::2, 0::2, :]
        x2 = x[:, 0::2, 1::2, :]
        x3 = x[:, 1::2, 1::2, :]

        x = jnp.concatenate([x0, x1, x2, x3], axis=-1)
        x = jnp.reshape(x, (batch, -1, 4 * channels))
        x = self.norm_layer()(x)
        x = nn.Dense(2 * self.dim, use_bias=False)(x)
        return x


class BasicLayer(nn.Module):
    dim: int
    input_resolution: Iterable[int]
    depth: int
    num_heads: int
    window_size: int
    mlp_ratio: float = 4.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop: float = 0.
    attn_drop: float = 0.
    drop_path: float = 0.
    norm_layer: Callable = LayerNorm
    downsample: Optional[Callable] = None

    @nn.compact
    def __call__(self, x, x_size, training):
        for i in range(self.depth):
            # Alternate between regular (shift 0) and shifted windows.
            x = SwinTransformerBlock(
                self.dim,
                self.input_resolution,
                self.num_heads,
                self.window_size,
                0 if (i % 2 == 0) else self.window_size // 2,
                self.mlp_ratio,
                self.qkv_bias,
                self.qk_scale,
                self.drop,
                self.attn_drop,
                self.drop_path[i] if isinstance(self.drop_path, (list, tuple)) else self.drop_path,
                norm_layer=self.norm_layer
            )(x, x_size, training)

        if self.downsample is not None:
            x = self.downsample(self.input_resolution, dim=self.dim, norm_layer=self.norm_layer)(x)

        return x


class RSTB(nn.Module):
    dim: int
    input_resolution: Iterable[int]
    depth: int
    num_heads: int
    window_size: int
    mlp_ratio: float = 4.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop: float = 0.
    attn_drop: float = 0.
    drop_path: float = 0.
    norm_layer: Callable = LayerNorm
    downsample: Optional[Callable] = None
    img_size: int = 224
    patch_size: int = 4
    resi_connection: str = '1conv'

    @nn.compact
    def __call__(self, x, x_size, training):
        res = x
        x = BasicLayer(dim=self.dim,
                       input_resolution=self.input_resolution,
                       depth=self.depth,
                       num_heads=self.num_heads,
                       window_size=self.window_size,
                       mlp_ratio=self.mlp_ratio,
                       qkv_bias=self.qkv_bias, qk_scale=self.qk_scale,
                       drop=self.drop, attn_drop=self.attn_drop,
                       drop_path=self.drop_path,
                       norm_layer=self.norm_layer,
                       downsample=self.downsample)(x, x_size, training)

        # Back to the spatial layout for the 3x3 convolution, then re-flatten to tokens.
        x = PatchUnEmbed(embed_dim=self.dim)(x, x_size)
        x = nn.Conv(self.dim, (3, 3))(x)
        x = PatchEmbed()(x)

        # Residual connection around the whole residual Swin Transformer block
        return x + res


class PatchEmbed(nn.Module):
    norm_layer: Optional[Callable] = None

    @nn.compact
    def __call__(self, x):
        x = x.reshape((x.shape[0], -1, x.shape[-1]))
        if self.norm_layer is not None:
            x = self.norm_layer()(x)
        return x


class PatchUnEmbed(nn.Module):
    embed_dim: int = 96

    @nn.compact
    def __call__(self, x, x_size):
        B, HW, C = x.shape
        x = x.reshape((B, x_size[0], x_size[1], self.embed_dim))
        return x


class SwinIR(nn.Module):
    r""" SwinIR JAX implementation
    Args:
        img_size (int | tuple(int)): Input image size. Default: 48
        patch_size (int | tuple(int)): Patch size. Default: 1
        in_chans (int): Number of input image channels. Default: 3
        embed_dim (int): Patch embedding dimension. Default: 180
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 8
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 2
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compression artifact reduction
        img_range: Image range. 1. or 255.
    """

    img_size: int = 48
    patch_size: int = 1
    in_chans: int = 3
    embed_dim: int = 180
    depths: tuple = (6, 6, 6, 6, 6, 6)
    num_heads: tuple = (6, 6, 6, 6, 6, 6)
    window_size: int = 8
    mlp_ratio: float = 2.
    qkv_bias: bool = True
    qk_scale: Optional[float] = None
    drop_rate: float = 0.
    attn_drop_rate: float = 0.
    drop_path_rate: float = 0.1
    norm_layer: Callable = LayerNorm
    ape: bool = False
    patch_norm: bool = True
    upscale: int = 2
    img_range: float = 1.
    num_feat: int = 64

    def pad(self, x):
        # Pad H and W up to multiples of the window size with reflect padding.
        _, h, w, _ = x.shape
        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
        x = jnp.pad(x, ((0, 0), (0, mod_pad_h), (0, mod_pad_w), (0, 0)), 'reflect')
        return x

    @nn.compact
    def __call__(self, x, training):
        _, h_before, w_before, _ = x.shape
        x = self.pad(x)
        _, h, w, _ = x.shape
        patches_resolution = [self.img_size // self.patch_size] * 2
        num_patches = patches_resolution[0] * patches_resolution[1]

        # Shallow feature extraction
        x = nn.Conv(self.embed_dim, (3, 3))(x)
        res = x

        # Deep feature extraction: flatten to patch tokens and run the RSTB stack
        x_size = (h, w)
        x = PatchEmbed(self.norm_layer if self.patch_norm else None)(x)

        if self.ape:
            absolute_pos_embed = \
                self.param('ape', trunc_normal(std=.02), (1, num_patches, self.embed_dim))
            x = x + absolute_pos_embed

        x = nn.Dropout(self.drop_rate, deterministic=not training)(x)

        # Stochastic depth rate grows linearly with block depth
        dpr = [rate.item() for rate in np.linspace(0, self.drop_path_rate, sum(self.depths))]
        for i_layer in range(len(self.depths)):
            x = RSTB(
                dim=self.embed_dim,
                input_resolution=(patches_resolution[0], patches_resolution[1]),
                depth=self.depths[i_layer],
                num_heads=self.num_heads[i_layer],
                window_size=self.window_size,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=self.qkv_bias, qk_scale=self.qk_scale,
                drop=self.drop_rate, attn_drop=self.attn_drop_rate,
                drop_path=dpr[sum(self.depths[:i_layer]):sum(self.depths[:i_layer + 1])],
                norm_layer=self.norm_layer,
                downsample=None,
                img_size=self.img_size,
                patch_size=self.patch_size)(x, x_size, training)

        x = self.norm_layer()(x)
        x = PatchUnEmbed(self.embed_dim)(x, x_size)

        # Convolution after the transformer body, plus the long residual connection
        x = nn.Conv(self.embed_dim, (3, 3))(x)
        x = x + res

        x = nn.activation.leaky_relu(nn.Conv(self.num_feat, (3, 3))(x))

        # Crop away the reflect padding added before the forward pass
        x = x[:, :-(h - h_before) or None, :-(w - w_before) or None]
        return x

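# A minimal usage sketch (not part of the original module; the reduced depths,
# heads, and embedding dimension below are illustrative): initialize the model and
# run one forward pass on a dummy batch. The single PRNGKey passed to `init` is the
# standard Flax "params" RNG; no "dropout" RNG is needed because training=False.
if __name__ == "__main__":
    model = SwinIR(depths=(2, 2), num_heads=(2, 2), embed_dim=60)
    dummy = jnp.zeros((1, 48, 48, 3), dtype=jnp.float32)
    variables = model.init(jax.random.PRNGKey(0), dummy, training=False)
    out = model.apply(variables, dummy, training=False)
    print(out.shape)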