Spaces:

introvoyz041
/

medical-segment-anything-adapter

Configuration error

App Files Files Community

medical-segment-anything-adapter / models /ImageEncoder /tinyvit /tiny_vit.py

introvoyz041

Upload folder using huggingface_hub

3f31c34 verified 12 months ago

raw

history blame contribute delete

16.8 kB

	# --------------------------------------------------------
	# TinyViT Model Architecture
	# Copyright (c) 2022 Microsoft
	# Adapted from LeViT and Swin Transformer
	# LeViT: (https://github.com/facebookresearch/levit)
	# Swin: (https://github.com/microsoft/swin-transformer)
	# Build the TinyViT Model
	# --------------------------------------------------------

	import itertools
	from typing import Tuple

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.utils.checkpoint as checkpoint
	from timm.models.layers import DropPath as TimmDropPath
	from timm.models.layers import to_2tuple, trunc_normal_
	from timm.models.registry import register_model

	from ...common import LayerNorm2d
	from .adalora_block import TinyViTAdaloraBlock
	from .adapter_block import TinyViTAdapterBlock
	from .block import TinyViTBlock
	from .lora_block import TinyViTLoraBlock
	from .utils import Conv2d_BN, DropPath, Mlp


	class PatchEmbed(nn.Module):
	    """Convolutional stem applied to the raw image.

	    The stem is ``Conv2d_BN -> activation -> Conv2d_BN``; the first layer
	    widens to ``embed_dim // 2`` channels and the second to ``embed_dim``.

	    Args:
	        in_chans: Number of input image channels.
	        embed_dim: Number of output channels of the stem.
	        resolution: Input size, an int or an (H, W) pair (normalised with
	            ``to_2tuple``).
	        activation: Zero-argument callable returning the activation module.
	    """

	    def __init__(self, in_chans, embed_dim, resolution, activation):
	        super().__init__()
	        self.in_chans = in_chans
	        self.embed_dim = embed_dim

	        # Bookkeeping only: the recorded grid is a quarter of the input size.
	        # NOTE(review): this presumes each Conv2d_BN below halves the map
	        # (3, 2, 1 presumably being kernel/stride/padding) -- confirm.
	        size = to_2tuple(resolution)
	        grid_h, grid_w = size[0] // 4, size[1] // 4
	        self.patches_resolution = (grid_h, grid_w)
	        self.num_patches = grid_h * grid_w

	        half_dim = embed_dim // 2
	        self.seq = nn.Sequential(
	            Conv2d_BN(in_chans, half_dim, 3, 2, 1),
	            activation(),
	            Conv2d_BN(half_dim, embed_dim, 3, 2, 1),
	        )

	    def forward(self, x):
	        """Run the stem on ``x`` and return its output."""
	        return self.seq(x)


	class MBConv(nn.Module):
	    """Residual block: 1x1 expand conv, 3x3 grouped conv, 1x1 project conv.

	    The block input is added to the projected output (after drop-path) and a
	    final activation is applied, so the projection must produce a tensor of
	    the same shape as the input.

	    Args:
	        in_chans: Number of input channels.
	        out_chans: Number of output channels of the projection.
	        expand_ratio: Hidden width is ``int(in_chans * expand_ratio)``.
	        activation: Zero-argument callable returning the activation module.
	        drop_path: Drop-path rate; ``DropPath`` is used only when it is > 0.
	    """

	    def __init__(self, in_chans, out_chans, expand_ratio,
	                 activation, drop_path):
	        super().__init__()
	        hidden = int(in_chans * expand_ratio)
	        self.in_chans = in_chans
	        self.hidden_chans = hidden
	        self.out_chans = out_chans

	        # Expansion.
	        self.conv1 = Conv2d_BN(in_chans, hidden, ks=1)
	        self.act1 = activation()

	        # One group per hidden channel.
	        self.conv2 = Conv2d_BN(hidden, hidden,
	                               ks=3, stride=1, pad=1, groups=hidden)
	        self.act2 = activation()

	        # Projection. NOTE(review): bn_weight_init=0.0 presumably zeroes the
	        # norm scale so the residual branch starts silent -- confirm in
	        # Conv2d_BN.
	        self.conv3 = Conv2d_BN(hidden, out_chans, ks=1, bn_weight_init=0.0)
	        self.act3 = activation()

	        if drop_path > 0.:
	            self.drop_path = DropPath(drop_path)
	        else:
	            self.drop_path = nn.Identity()

	    def forward(self, x):
	        """Apply the three convolutions and add the input back."""
	        residual = x

	        out = self.act1(self.conv1(x))
	        out = self.act2(self.conv2(out))
	        out = self.drop_path(self.conv3(out))

	        # In-place add on the branch output, as in the original code.
	        out += residual
	        return self.act3(out)


	class PatchMerging(nn.Module):
	    """Transition between stages: 1x1 conv, 3x3 grouped conv, 1x1 conv.

	    Accepts either a 4-D feature map or a 3-D token tensor (which is first
	    reshaped using ``input_resolution``) and always returns tokens of shape
	    ``(B, H*W, out_dim)``.

	    Args:
	        input_resolution: (H, W) used to un-flatten 3-D inputs.
	        dim: Number of input channels.
	        out_dim: Number of output channels.
	        activation: Zero-argument callable returning the activation module.
	    """

	    def __init__(self, input_resolution, dim, out_dim, activation):
	        super().__init__()

	        self.input_resolution = input_resolution
	        self.dim = dim
	        self.out_dim = out_dim
	        self.act = activation()

	        # Widths 320/448/576 (the last-stage widths of the tiny_vit_5m/11m/21m
	        # factories in this file) use stride 1 in the middle conv; every
	        # other width uses stride 2.
	        middle_stride = 1 if out_dim in (320, 448, 576) else 2

	        self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
	        self.conv2 = Conv2d_BN(out_dim, out_dim, 3, middle_stride, 1,
	                               groups=out_dim)
	        self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)

	    def forward(self, x):
	        """Merge patches and return a ``(B, tokens, channels)`` tensor."""
	        if x.ndim == 3:
	            # Tokens -> (B, C, H, W) so the convolutions can run.
	            height, width = self.input_resolution
	            batch = len(x)
	            x = x.view(batch, height, width, -1).permute(0, 3, 1, 2)

	        x = self.act(self.conv1(x))
	        x = self.act(self.conv2(x))
	        x = self.conv3(x)

	        # Back to tokens: flatten the spatial dims and move channels last.
	        return x.flatten(2).transpose(1, 2)


	class ConvLayer(nn.Module):
	    """Convolutional stage: ``depth`` MBConv blocks plus optional downsample.

	    Args:
	        dim: Channel count used for both input and output of every MBConv.
	        input_resolution: Stored on the layer and passed to ``downsample``.
	        depth: Number of MBConv blocks.
	        activation: Zero-argument callable returning the activation module.
	        drop_path: Drop-path rate. A ``list`` is indexed per block; any
	            other value is given to every block unchanged.
	        downsample: Optional factory called as
	            ``downsample(input_resolution, dim=, out_dim=, activation=)``.
	        use_checkpoint: Run each block through ``checkpoint.checkpoint``.
	        out_dim: Forwarded to ``downsample``.
	        conv_expand_ratio: Expansion ratio handed to every MBConv.
	    """

	    def __init__(self, dim, input_resolution, depth,
	                 activation,
	                 drop_path=0., downsample=None, use_checkpoint=False,
	                 out_dim=None,
	                 conv_expand_ratio=4.,
	                 ):
	        super().__init__()
	        self.dim = dim
	        self.input_resolution = input_resolution
	        self.depth = depth
	        self.use_checkpoint = use_checkpoint

	        # One rate per block: reuse the caller's list, or repeat the scalar.
	        if isinstance(drop_path, list):
	            rates = drop_path
	        else:
	            rates = [drop_path] * depth

	        self.blocks = nn.ModuleList([
	            MBConv(dim, dim, conv_expand_ratio, activation, rates[idx])
	            for idx in range(depth)
	        ])

	        # Optional transition module appended after the blocks.
	        self.downsample = (
	            downsample(input_resolution, dim=dim, out_dim=out_dim,
	                       activation=activation)
	            if downsample is not None else None
	        )

	    def forward(self, x):
	        """Apply every block in order, then the downsample module if any."""
	        for block in self.blocks:
	            x = checkpoint.checkpoint(block, x) if self.use_checkpoint else block(x)
	        if self.downsample is None:
	            return x
	        return self.downsample(x)


	class BasicLayer(nn.Module):
	    """One attention stage of TinyViT: ``depth`` blocks plus optional downsample.

	    Args:
	        args: Configuration object. ``args.mod`` selects the block class
	            ('sam_adpt' -> TinyViTAdapterBlock, 'sam_lora' ->
	            TinyViTLoraBlock, 'sam_adalora' -> TinyViTAdaloraBlock, anything
	            else -> TinyViTBlock); ``args`` is also passed to every block.
	        dim (int): Number of input channels.
	        input_resolution (tuple[int]): Input resolution.
	        depth (int): Number of blocks.
	        num_heads (int): Number of attention heads.
	        window_size (int): Local window size.
	        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
	        drop (float, optional): Dropout rate. Default: 0.0
	        drop_path (float | list[float], optional): Stochastic depth rate. A
	            ``list`` is indexed per block; any other value is passed to every
	            block unchanged. Default: 0.0
	        downsample (callable | None, optional): Factory for the module run
	            after the blocks, called as
	            ``downsample(input_resolution, dim=, out_dim=, activation=)``.
	            Default: None
	        use_checkpoint (bool): Whether to use checkpointing to save memory.
	            Default: False.
	        local_conv_size: Forwarded to every block. Default: 3
	        activation: The activation function. Default: nn.GELU
	        out_dim: Forwarded to ``downsample``. Default: None
	    """

	    def __init__(self, args, dim, input_resolution, depth, num_heads, window_size,
	                 mlp_ratio=4., drop=0.,
	                 drop_path=0., downsample=None, use_checkpoint=False,
	                 local_conv_size=3,
	                 activation=nn.GELU,
	                 out_dim=None,
	                 ):
	        super().__init__()
	        self.dim = dim
	        self.input_resolution = input_resolution
	        self.depth = depth
	        self.use_checkpoint = use_checkpoint

	        # Fine-tuning mode -> block implementation; unknown modes fall back
	        # to the plain block.
	        block_by_mode = {
	            'sam_adpt': TinyViTAdapterBlock,
	            'sam_lora': TinyViTLoraBlock,
	            'sam_adalora': TinyViTAdaloraBlock,
	        }
	        block_class = block_by_mode.get(args.mod, TinyViTBlock)

	        # One rate per block: reuse the caller's list, or repeat the scalar.
	        if isinstance(drop_path, list):
	            rates = drop_path
	        else:
	            rates = [drop_path] * depth

	        self.blocks = nn.ModuleList([
	            block_class(
	                dim=dim,
	                args=args,
	                input_resolution=input_resolution,
	                num_heads=num_heads,
	                window_size=window_size,
	                mlp_ratio=mlp_ratio,
	                drop=drop,
	                drop_path=rates[idx],
	                local_conv_size=local_conv_size,
	                activation=activation,
	            )
	            for idx in range(depth)
	        ])

	        # Optional transition module appended after the blocks.
	        self.downsample = (
	            downsample(input_resolution, dim=dim, out_dim=out_dim,
	                       activation=activation)
	            if downsample is not None else None
	        )

	    def forward(self, x):
	        """Apply every block in order, then the downsample module if any."""
	        for block in self.blocks:
	            x = checkpoint.checkpoint(block, x) if self.use_checkpoint else block(x)
	        if self.downsample is None:
	            return x
	        return self.downsample(x)

	    def extra_repr(self) -> str:
	        """Return the summary shown for this layer in ``repr(model)``."""
	        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

	class TinyViT(nn.Module):
	    """TinyViT backbone that returns a 256-channel feature map.

	    The model is a patch-embedding stem, one ``ConvLayer`` stage, the
	    remaining stages as ``BasicLayer``, and a two-convolution ``neck``.
	    ``norm_head`` and ``head`` are constructed (and tagged with learning-rate
	    scales) but ``forward`` does not apply them; it returns the neck output.

	    Args:
	        args: Configuration object forwarded to every ``BasicLayer``.
	        img_size: Input size, an int or an (H, W) pair.
	        in_chans: Number of input image channels.
	        num_classes: Output size of ``head``; ``<= 0`` makes it an Identity.
	        embed_dims: Channel width of each stage.
	        depths: Number of blocks in each stage.
	        num_heads: Attention heads per stage (index 0 is unused because the
	            first stage is convolutional).
	        window_sizes: Attention window per stage (index 0 unused likewise).
	        mlp_ratio: Forwarded to every ``BasicLayer``.
	        drop_rate: Forwarded to every ``BasicLayer`` as ``drop``.
	        drop_path_rate: Largest drop-path rate; rates grow linearly from 0
	            across all blocks.
	        use_checkpoint: Forwarded to every stage.
	        mbconv_expand_ratio: Expansion ratio of the first (conv) stage.
	        local_conv_size: Forwarded to every ``BasicLayer``.
	        layer_lr_decay: Base of the per-block ``lr_scale`` attached to
	            parameters by ``set_layer_lr_decay``.

	    The list defaults are only read (indexed, sliced, summed) by this class,
	    never mutated.
	    """

	    def __init__(
	        self,
	        args,
	        img_size=224,
	        in_chans=3,
	        num_classes=1000,
	        embed_dims=[96, 192, 384, 768],
	        depths=[2, 2, 6, 2],
	        num_heads=[3, 6, 12, 24],
	        window_sizes=[7, 7, 14, 7],
	        mlp_ratio=4.,
	        drop_rate=0.,
	        drop_path_rate=0.1,
	        use_checkpoint=False,
	        mbconv_expand_ratio=4.0,
	        local_conv_size=3,
	        layer_lr_decay=1.0,
	    ):
	        super().__init__()
	        self.img_size = img_size
	        self.num_classes = num_classes
	        self.depths = depths
	        self.num_layers = len(depths)
	        self.mlp_ratio = mlp_ratio

	        activation = nn.GELU

	        self.patch_embed = PatchEmbed(in_chans=in_chans,
	                                      embed_dim=embed_dims[0],
	                                      resolution=img_size,
	                                      activation=activation)

	        patches_resolution = self.patch_embed.patches_resolution
	        self.patches_resolution = patches_resolution

	        # Stochastic depth decay rule: one rate per block, rising linearly
	        # from 0 to drop_path_rate over the whole network.
	        dpr = [x.item() for x in torch.linspace(0, drop_path_rate,
	                                                sum(depths))]

	        # Build the stages.
	        self.layers = nn.ModuleList()
	        for i_layer in range(self.num_layers):
	            # Stage index 3 is built at the same resolution as stage 2, which
	            # matches PatchMerging keeping the resolution for out_dim
	            # 320/448/576.
	            # NOTE(review): the default embed_dims end in 768, which is not
	            # in that set, so the defaults look inconsistent with this
	            # resolution schedule -- confirm before relying on them.
	            reduction = 2 ** (i_layer - 1 if i_layer == 3 else i_layer)
	            kwargs = dict(
	                dim=embed_dims[i_layer],
	                input_resolution=(patches_resolution[0] // reduction,
	                                  patches_resolution[1] // reduction),
	                depth=depths[i_layer],
	                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
	                downsample=PatchMerging if (
	                    i_layer < self.num_layers - 1) else None,
	                use_checkpoint=use_checkpoint,
	                out_dim=embed_dims[min(
	                    i_layer + 1, len(embed_dims) - 1)],
	                activation=activation,
	            )
	            if i_layer == 0:
	                layer = ConvLayer(
	                    conv_expand_ratio=mbconv_expand_ratio,
	                    **kwargs,
	                )
	            else:
	                layer = BasicLayer(
	                    args=args,
	                    num_heads=num_heads[i_layer],
	                    window_size=window_sizes[i_layer],
	                    mlp_ratio=self.mlp_ratio,
	                    drop=drop_rate,
	                    local_conv_size=local_conv_size,
	                    **kwargs)
	            self.layers.append(layer)

	        # Classifier head (built but not used by forward).
	        self.norm_head = nn.LayerNorm(embed_dims[-1])
	        self.head = nn.Linear(
	            embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()

	        # Init weights and tag parameters with their lr scale.
	        self.apply(self._init_weights)
	        self.set_layer_lr_decay(layer_lr_decay)

	        # The neck is created after the two calls above, so it keeps the
	        # default initialisation of its layers and its parameters carry no
	        # lr_scale / param_name attributes.
	        self.neck = nn.Sequential(
	            nn.Conv2d(
	                embed_dims[-1],
	                256,
	                kernel_size=1,
	                bias=False,
	            ),
	            LayerNorm2d(256),
	            nn.Conv2d(
	                256,
	                256,
	                kernel_size=3,
	                padding=1,
	                bias=False,
	            ),
	            LayerNorm2d(256),
	        )

	    def set_layer_lr_decay(self, layer_lr_decay):
	        """Attach ``lr_scale`` and ``param_name`` attributes to parameters.

	        Block ``i`` (counted across all stages) gets
	        ``layer_lr_decay ** (depth - i - 1)``, so the last block has scale 1.
	        The stem shares the first block's scale, a stage's downsample module
	        shares the scale of that stage's last block, and ``norm_head`` /
	        ``head`` share the last block's scale.

	        Raises:
	            AssertionError: If the number of blocks found differs from
	                ``sum(self.depths)`` or a parameter was left untagged.
	        """
	        depth = sum(self.depths)
	        lr_scales = [layer_lr_decay ** (depth - i - 1) for i in range(depth)]

	        def _set_lr_scale(module, scale):
	            # parameters() already recurses into submodules, so tagging the
	            # module once covers everything Module.apply would have visited.
	            for p in module.parameters():
	                p.lr_scale = scale

	        _set_lr_scale(self.patch_embed, lr_scales[0])
	        i = 0
	        for layer in self.layers:
	            for block in layer.blocks:
	                _set_lr_scale(block, lr_scales[i])
	                i += 1
	            if layer.downsample is not None:
	                _set_lr_scale(layer.downsample, lr_scales[i - 1])
	        assert i == depth
	        for module in (self.norm_head, self.head):
	            _set_lr_scale(module, lr_scales[-1])

	        for name, p in self.named_parameters():
	            p.param_name = name

	        # Every parameter registered so far must have received a scale.
	        for p in self.parameters():
	            assert hasattr(p, 'lr_scale'), p.param_name

	    def _init_weights(self, m):
	        """Initialise Linear and LayerNorm modules; leave others untouched."""
	        if isinstance(m, nn.Linear):
	            trunc_normal_(m.weight, std=.02)
	            if m.bias is not None:
	                nn.init.constant_(m.bias, 0)
	        elif isinstance(m, nn.LayerNorm):
	            nn.init.constant_(m.bias, 0)
	            nn.init.constant_(m.weight, 1.0)

	    @torch.jit.ignore
	    def no_weight_decay_keywords(self):
	        """Return the parameter-name keywords exempt from weight decay."""
	        return {'attention_biases'}

	    def forward_features(self, x):
	        """Encode images ``x`` of shape (N, C, H, W) into the neck output.

	        The tokens leaving the last stage are reshaped to a grid of
	        ``img_size // 16`` per side before the neck; ``img_size`` may be an
	        int (square input) or an (H, W) pair.
	        """
	        x = self.patch_embed(x)
	        for layer in self.layers:
	            x = layer(x)

	        batch, _, channels = x.size()
	        if isinstance(self.img_size, (tuple, list)):
	            grid_h = self.img_size[0] // 16
	            grid_w = self.img_size[1] // 16
	        else:
	            grid_h = grid_w = self.img_size // 16

	        # (B, tokens, C) -> (B, C, grid_h, grid_w) for the convolutional neck.
	        x = x.view(batch, grid_h, grid_w, channels)
	        x = x.permute(0, 3, 1, 2)
	        return self.neck(x)

	    def forward(self, x):
	        """Return the neck feature map; the classifier head is not applied."""
	        return self.forward_features(x)


	# URL template for released checkpoints; `{}` is filled with a checkpoint name.
	_checkpoint_url_format = (
	    'https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/{}.pth'
	)

	# Factory function name -> checkpoint file name (without the `.pth` suffix).
	_provided_checkpoints = dict(
	    tiny_vit_5m_224='tiny_vit_5m_22kto1k_distill',
	    tiny_vit_11m_224='tiny_vit_11m_22kto1k_distill',
	    tiny_vit_21m_224='tiny_vit_21m_22kto1k_distill',
	    tiny_vit_21m_384='tiny_vit_21m_22kto1k_384_distill',
	    tiny_vit_21m_512='tiny_vit_21m_22kto1k_512_distill',
	)


	def register_tiny_vit_model(fn):
	    """Register a TinyViT factory through ``register_model``.

	    The factory is wrapped so that ``pretrained=True`` downloads the
	    checkpoint listed for the factory's name in ``_provided_checkpoints``
	    and loads its ``'model'`` entry into the freshly built model.

	    Args:
	        fn: Zero-argument-callable model factory; its ``__name__`` is the
	            registered model name and the checkpoint lookup key.

	    Returns:
	        Whatever ``register_model`` returns for the wrapper.
	    """
	    def fn_wrapper(pretrained=False, **kwargs):
	        # NOTE(review): the factory is called with no arguments, so neither
	        # `pretrained` nor `kwargs` reach it -- confirm this is intended.
	        model = fn()
	        if not pretrained:
	            return model

	        model_name = fn.__name__
	        assert model_name in _provided_checkpoints, \
	            f'Sorry that the checkpoint `{model_name}` is not provided yet.'

	        checkpoint_name = _provided_checkpoints[model_name]
	        url = _checkpoint_url_format.format(checkpoint_name)
	        state = torch.hub.load_state_dict_from_url(
	            url=url,
	            map_location='cpu', check_hash=False,
	        )
	        model.load_state_dict(state['model'])
	        return model

	    # The wrapper takes over the factory's name before it is registered.
	    fn_wrapper.__name__ = fn.__name__
	    return register_model(fn_wrapper)


	# @register_tiny_vit_model
	def tiny_vit_5m_224(pretrained=False, num_classes=1000, drop_path_rate=0.0,
	                    args=None):
	    """Build the TinyViT variant with stage widths [64, 128, 160, 320].

	    Args:
	        pretrained: Accepted for signature compatibility; not used here.
	        num_classes: Output size of the classifier head.
	        drop_path_rate: Largest stochastic-depth rate.
	        args: Configuration object required by ``TinyViT`` (its attention
	            stages read ``args.mod``).

	    Returns:
	        The constructed ``TinyViT`` (default ``img_size``).

	    Raises:
	        TypeError: If ``args`` is not supplied. ``TinyViT`` takes ``args`` as
	            a required argument, so the call could never succeed without it.
	    """
	    if args is None:
	        raise TypeError(
	            "tiny_vit_5m_224() missing required argument 'args' "
	            "(the configuration object passed to TinyViT)")
	    return TinyViT(
	        args,
	        num_classes=num_classes,
	        embed_dims=[64, 128, 160, 320],
	        depths=[2, 2, 6, 2],
	        num_heads=[2, 4, 5, 10],
	        window_sizes=[7, 7, 14, 7],
	        drop_path_rate=drop_path_rate,
	    )


	# @register_tiny_vit_model
	def tiny_vit_11m_224(pretrained=False, num_classes=1000, drop_path_rate=0.1,
	                     args=None):
	    """Build the TinyViT variant with stage widths [64, 128, 256, 448].

	    Args:
	        pretrained: Accepted for signature compatibility; not used here.
	        num_classes: Output size of the classifier head.
	        drop_path_rate: Largest stochastic-depth rate.
	        args: Configuration object required by ``TinyViT`` (its attention
	            stages read ``args.mod``).

	    Returns:
	        The constructed ``TinyViT`` (default ``img_size``).

	    Raises:
	        TypeError: If ``args`` is not supplied. ``TinyViT`` takes ``args`` as
	            a required argument, so the call could never succeed without it.
	    """
	    if args is None:
	        raise TypeError(
	            "tiny_vit_11m_224() missing required argument 'args' "
	            "(the configuration object passed to TinyViT)")
	    return TinyViT(
	        args,
	        num_classes=num_classes,
	        embed_dims=[64, 128, 256, 448],
	        depths=[2, 2, 6, 2],
	        num_heads=[2, 4, 8, 14],
	        window_sizes=[7, 7, 14, 7],
	        drop_path_rate=drop_path_rate,
	    )


	# @register_tiny_vit_model
	def tiny_vit_21m_224(pretrained=False, num_classes=1000, drop_path_rate=0.2,
	                     args=None):
	    """Build the TinyViT variant with stage widths [96, 192, 384, 576].

	    Args:
	        pretrained: Accepted for signature compatibility; not used here.
	        num_classes: Output size of the classifier head.
	        drop_path_rate: Largest stochastic-depth rate.
	        args: Configuration object required by ``TinyViT`` (its attention
	            stages read ``args.mod``).

	    Returns:
	        The constructed ``TinyViT`` (default ``img_size``).

	    Raises:
	        TypeError: If ``args`` is not supplied. ``TinyViT`` takes ``args`` as
	            a required argument, so the call could never succeed without it.
	    """
	    if args is None:
	        raise TypeError(
	            "tiny_vit_21m_224() missing required argument 'args' "
	            "(the configuration object passed to TinyViT)")
	    return TinyViT(
	        args,
	        num_classes=num_classes,
	        embed_dims=[96, 192, 384, 576],
	        depths=[2, 2, 6, 2],
	        num_heads=[3, 6, 12, 18],
	        window_sizes=[7, 7, 14, 7],
	        drop_path_rate=drop_path_rate,
	    )


	# @register_tiny_vit_model
	def tiny_vit_21m_384(pretrained=False, num_classes=1000, drop_path_rate=0.1,
	                     args=None):
	    """Build the [96, 192, 384, 576]-wide TinyViT for 384-pixel inputs.

	    Args:
	        pretrained: Accepted for signature compatibility; not used here.
	        num_classes: Output size of the classifier head.
	        drop_path_rate: Largest stochastic-depth rate.
	        args: Configuration object required by ``TinyViT`` (its attention
	            stages read ``args.mod``).

	    Returns:
	        The constructed ``TinyViT`` with ``img_size=384`` and window sizes
	        [12, 12, 24, 12].

	    Raises:
	        TypeError: If ``args`` is not supplied. ``TinyViT`` takes ``args`` as
	            a required argument, so the call could never succeed without it.
	    """
	    if args is None:
	        raise TypeError(
	            "tiny_vit_21m_384() missing required argument 'args' "
	            "(the configuration object passed to TinyViT)")
	    return TinyViT(
	        args,
	        img_size=384,
	        num_classes=num_classes,
	        embed_dims=[96, 192, 384, 576],
	        depths=[2, 2, 6, 2],
	        num_heads=[3, 6, 12, 18],
	        window_sizes=[12, 12, 24, 12],
	        drop_path_rate=drop_path_rate,
	    )


	# @register_tiny_vit_model
	def tiny_vit_21m_512(pretrained=False, num_classes=1000, drop_path_rate=0.1,
	                     args=None):
	    """Build the [96, 192, 384, 576]-wide TinyViT for 512-pixel inputs.

	    Args:
	        pretrained: Accepted for signature compatibility; not used here.
	        num_classes: Output size of the classifier head.
	        drop_path_rate: Largest stochastic-depth rate.
	        args: Configuration object required by ``TinyViT`` (its attention
	            stages read ``args.mod``).

	    Returns:
	        The constructed ``TinyViT`` with ``img_size=512`` and window sizes
	        [16, 16, 32, 16].

	    Raises:
	        TypeError: If ``args`` is not supplied. ``TinyViT`` takes ``args`` as
	            a required argument, so the call could never succeed without it.
	    """
	    if args is None:
	        raise TypeError(
	            "tiny_vit_21m_512() missing required argument 'args' "
	            "(the configuration object passed to TinyViT)")
	    return TinyViT(
	        args,
	        img_size=512,
	        num_classes=num_classes,
	        embed_dims=[96, 192, 384, 576],
	        depths=[2, 2, 6, 2],
	        num_heads=[3, 6, 12, 18],
	        window_sizes=[16, 16, 32, 16],
	        drop_path_rate=drop_path_rate,
	    )