# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.
"""Vision Transformer (ViT) in PyTorch.

A PyTorch implementation of Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

The official jax code is released and available at https://github.com/google-research/vision_transformer

DeiT model defs and weights from https://github.com/facebookresearch/deit,
paper `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877

Acknowledgments:
* The paper authors for releasing code and weights, thanks!
* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... check it out
  for some einops/einsum fun
* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
* Bert reference code checks against Huggingface Transformers and Tensorflow Bert

Hacked together by / Copyright 2021 Ross Wightman
"""

import logging
import math
from functools import partial
from itertools import repeat
from typing import Callable, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.runner import BaseModule, load_checkpoint
from mmseg.ops import resize
from mmseg.utils import get_root_logger
from torch import Tensor

from .drop_path import DropPath


def to_2tuple(x):
    return tuple(repeat(x, 2))


class Mlp(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
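
# Hedged usage sketch for Mlp (illustrative sizes, not from the original
# module): dropout is applied twice, after the activation and after fc2.
#   mlp = Mlp(in_features=768, hidden_features=3072, drop=0.1)
#   y = mlp(torch.randn(2, 197, 768))  # -> (2, 197, 768)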


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Optional[Callable[..., nn.Module]] = None,
        drop: float = 0.0,
    ) -> None:
        super().__init__()
        # act_layer and drop are accepted only to keep the constructor
        # interchangeable with Mlp; the activation here is always SiLU and
        # no dropout is applied.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        swiglu_hidden_features = int(2 * hidden_features / 3)
        align_as = 8
        swiglu_hidden_features = (swiglu_hidden_features + align_as - 1) // align_as * align_as
        self.w1 = nn.Linear(in_features, swiglu_hidden_features)
        self.w2 = nn.Linear(in_features, swiglu_hidden_features)
        self.w3 = nn.Linear(swiglu_hidden_features, out_features)

    def forward(self, x: Tensor) -> Tensor:
        x1 = self.w1(x)
        x2 = self.w2(x)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)
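
# Worked example of the SwiGLU hidden sizing above (illustrative numbers):
# for hidden_features=3072, int(2 * 3072 / 3) = 2048, already a multiple of
# 8, so swiglu_hidden_features = 2048; for hidden_features=1000,
# int(2000 / 3) = 666 rounds up to 672 (= 84 * 8). The 2/3 factor keeps the
# parameter count of the three projections close to that of a two-layer MLP
# with the same nominal hidden width.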


class PatchEmbed(nn.Module):
    """2D Image to Patch Embedding."""

    def __init__(
        self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, bias=True
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x, H, W
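
# Hedged shape sketch (illustrative sizes): a 224x224 RGB image with
# patch_size=16 produces a 14x14 token grid.
#   pe = PatchEmbed(img_size=224, patch_size=16, embed_dim=768)
#   tokens, H, W = pe(torch.randn(1, 3, 224, 224))  # tokens: (1, 196, 768), H == W == 14
# H and W come from the conv output, so the returned triple stays consistent
# even when the input resolution differs from img_size.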


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, H, W):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
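
# Shape walk-through for Attention (comment sketch): for x of shape (B, N, C)
# with h heads,
#   qkv:  (B, N, 3C) -> (3, B, h, N, C // h) after reshape/permute
#   attn: (B, h, N, N) after q @ k^T * scale and softmax
#   out:  (B, h, N, C // h) -> (B, N, C) after transpose/reshape
# H and W are accepted but unused; they keep the signature interchangeable
# with WindowedAttention.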


class MemEffAttention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        # NOTE: attn_drop is kept for interface parity with Attention but is
        # not applied inside the fused xformers kernel below.
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, H, W) -> Tensor:
        from xformers.ops import memory_efficient_attention, unbind

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v)
        x = x.reshape([B, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
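
# xformers is imported lazily inside forward, so this module can be built
# without xformers installed as long as the memeff path is never executed.
# A hedged fallback sketch, assuming PyTorch >= 2.0 (not part of the
# original code): F.scaled_dot_product_attention expects (B, heads, N, d)
# instead of xformers' (B, N, heads, d), i.e.
#   q, k, v = (t.transpose(1, 2) for t in (q, k, v))
#   x = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
# computes the same softmax(q @ k^T / sqrt(d)) @ v product.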


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
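
# Round-trip sanity sketch (illustrative sizes): window_reverse inverts
# window_partition whenever H and W are multiples of window_size.
#   x = torch.randn(2, 28, 28, 96)
#   w = window_partition(x, window_size=14)  # -> (8, 14, 14, 96)
#   assert torch.equal(window_reverse(w, 14, 28, 28), x)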


class WindowedAttention(nn.Module):
    def __init__(
        self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0, window_size=14, pad_mode="constant"
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.window_size = window_size
        self.pad_mode = pad_mode

    def forward(self, x, H, W):
        B, N, C = x.shape
        N_ = self.window_size * self.window_size
        H_ = math.ceil(H / self.window_size) * self.window_size
        W_ = math.ceil(W / self.window_size) * self.window_size

        qkv = self.qkv(x)  # [B, N, C * 3]
        qkv = qkv.transpose(1, 2).reshape(B, C * 3, H, W)  # [B, C * 3, H, W]
        qkv = F.pad(qkv, [0, W_ - W, 0, H_ - H], mode=self.pad_mode)

        qkv = F.unfold(
            qkv, kernel_size=(self.window_size, self.window_size), stride=(self.window_size, self.window_size)
        )
        B, C_kw_kw, L = qkv.shape  # L is the number of windows
        qkv = qkv.reshape(B, C * 3, N_, L).permute(0, 3, 2, 1)  # [B, L, N_, C * 3]
        qkv = qkv.reshape(B, L, N_, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        # q, k, v: [B, L, num_head, N_, C / num_head]
        attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, L, num_head, N_, N_]
        # if self.mask:
        #     attn = attn * mask
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)  # [B, L, num_head, N_, N_]

        # attn @ v -> [B, L, num_head, N_, C / num_head]
        x = (attn @ v).permute(0, 2, 4, 3, 1).reshape(B, C_kw_kw // 3, L)
        x = F.fold(
            x,
            output_size=(H_, W_),
            kernel_size=(self.window_size, self.window_size),
            stride=(self.window_size, self.window_size),
        )  # [B, C, H_, W_]
        x = x[:, :, :H, :W].reshape(B, C, N).transpose(-1, -2)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
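
# The unfold/fold pair above makes windowed attention padding-friendly: qkv
# is laid out as an image, zero-padded up to a multiple of window_size, cut
# into non-overlapping windows by F.unfold, attended within each window, and
# stitched back by F.fold before the padding is cropped. Hedged numeric
# sketch (illustrative sizes): H = W = 30 with window_size = 14 pads to
# H_ = W_ = 42, giving L = (42 // 14) ** 2 = 9 windows of N_ = 196 tokens.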


# class WindowedAttention(nn.Module):
#     def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., window_size=14, pad_mode="constant"):
#         super().__init__()
#         self.num_heads = num_heads
#         head_dim = dim // num_heads
#         self.scale = head_dim ** -0.5
#
#         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
#         self.attn_drop = nn.Dropout(attn_drop)
#         self.proj = nn.Linear(dim, dim)
#         self.proj_drop = nn.Dropout(proj_drop)
#         self.window_size = window_size
#         self.pad_mode = pad_mode
#
#     def forward(self, x, H, W):
#         B, N, C = x.shape
#
#         N_ = self.window_size * self.window_size
#         H_ = math.ceil(H / self.window_size) * self.window_size
#         W_ = math.ceil(W / self.window_size) * self.window_size
#         x = x.view(B, H, W, C)
#         x = F.pad(x, [0, 0, 0, W_ - W, 0, H_ - H], mode=self.pad_mode)
#
#         x = window_partition(x, window_size=self.window_size)  # nW*B, window_size, window_size, C
#         x = x.view(-1, N_, C)
#
#         qkv = self.qkv(x).view(-1, N_, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
#         q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
#         attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, L, num_head, N_, N_]
#         attn = attn.softmax(dim=-1)
#         attn = self.attn_drop(attn)  # [B, L, num_head, N_, N_]
#         x = (attn @ v).transpose(1, 2).reshape(-1, self.window_size, self.window_size, C)
#
#         x = window_reverse(x, self.window_size, H_, W_)
#         x = x[:, :H, :W, :].reshape(B, N, C).contiguous()
#         x = self.proj(x)
#         x = self.proj_drop(x)
#         return x


class Block(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        windowed=False,
        window_size=14,
        pad_mode="constant",
        layer_scale=False,
        with_cp=False,
        ffn_layer=Mlp,
        memeff=False,
    ):
        super().__init__()
        self.with_cp = with_cp
        self.norm1 = norm_layer(dim)
        if windowed:
            self.attn = WindowedAttention(
                dim,
                num_heads=num_heads,
                qkv_bias=qkv_bias,
                attn_drop=attn_drop,
                proj_drop=drop,
                window_size=window_size,
                pad_mode=pad_mode,
            )
        elif memeff:
            self.attn = MemEffAttention(
                dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop
            )
        else:
            self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        self.layer_scale = layer_scale
        if layer_scale:
            self.gamma1 = nn.Parameter(torch.ones(dim), requires_grad=True)
            self.gamma2 = nn.Parameter(torch.ones(dim), requires_grad=True)

    def forward(self, x, H, W):
        def _inner_forward(x):
            if self.layer_scale:
                x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x), H, W))
                x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
            else:
                x = x + self.drop_path(self.attn(self.norm1(x), H, W))
                x = x + self.drop_path(self.mlp(self.norm2(x)))
            return x

        if self.with_cp and x.requires_grad:
            x = cp.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)
        return x
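
# Residual structure of Block, as a comment sketch: with layer_scale enabled
# each branch update is x + drop_path(gamma * f(norm(x))), where gamma is a
# learnable per-channel vector (initialized to ones here, unlike the small
# values used in the CaiT LayerScale paper). With with_cp set, activations
# are recomputed during backward via torch.utils.checkpoint to save memory.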


class TIMMVisionTransformer(BaseModule):
    """Vision Transformer.

    A PyTorch implementation of `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929

    Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
        - https://arxiv.org/abs/2012.12877
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        layer_scale=True,
        embed_layer=PatchEmbed,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        act_layer=nn.GELU,
        window_attn=False,
        window_size=14,
        pretrained=None,
        with_cp=False,
        pre_norm=False,
        ffn_type="mlp",
        memeff=False,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            embed_layer (nn.Module): patch embedding layer
            norm_layer (nn.Module): normalization layer
            pretrained (str): path to pretrained weights
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU
        self.norm_layer = norm_layer
        self.act_layer = act_layer
        self.pretrain_size = img_size
        self.drop_path_rate = drop_path_rate
        self.drop_rate = drop_rate
        self.patch_size = patch_size

        window_attn = [window_attn] * depth if not isinstance(window_attn, list) else window_attn
        window_size = [window_size] * depth if not isinstance(window_size, list) else window_size
        logging.info("window attention: %s", window_attn)
        logging.info("window size: %s", window_size)
        logging.info("layer scale: %s", layer_scale)

        self.patch_embed = embed_layer(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, bias=not pre_norm
        )
        num_patches = self.patch_embed.num_patches
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        ffn_types = {"mlp": Mlp, "swiglu": SwiGLUFFN}

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.Sequential(
            *[
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    act_layer=act_layer,
                    windowed=window_attn[i],
                    window_size=window_size[i],
                    layer_scale=layer_scale,
                    with_cp=with_cp,
                    ffn_layer=ffn_types[ffn_type],
                    memeff=memeff,
                )
                for i in range(depth)
            ]
        )
        # Final norm; forward_features applies self.norm, so it must be defined.
        self.norm = norm_layer(embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # For CLIP
        if pre_norm:
            norm_pre = norm_layer(embed_dim)
            self.norm_pre = norm_pre
        else:
            self.norm_pre = nn.Identity()
        self.init_weights(pretrained)

    def init_weights(self, pretrained=None):
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, map_location="cpu", strict=False, logger=logger)

    def forward_features(self, x):
        x, H, W = self.patch_embed(x)
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_token, x), dim=1)
        x = self.pos_drop(x + self.pos_embed)

        # For CLIP
        x = self.norm_pre(x)
        for blk in self.blocks:
            x = blk(x, H, W)
        x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        return x


def resize_pos_embed(pos_embed, input_shape, pos_shape, mode):
    """Resize pos_embed weights.

    Resize pos_embed using bicubic interpolate method.

    Args:
        pos_embed (torch.Tensor): Position embedding weights.
        input_shape (tuple): Tuple for (downsampled input image height,
            downsampled input image width).
        pos_shape (tuple): The resolution of downsampled origin training
            image.
        mode (str): Algorithm used for upsampling:
            ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
            ``'trilinear'``. Default: ``'nearest'``
    Return:
        torch.Tensor: The resized pos_embed of shape [B, L_new, C]
    """
    assert pos_embed.ndim == 3, "shape of pos_embed must be [B, L, C]"
    pos_h, pos_w = pos_shape
    # keep dim for easy deployment
    cls_token_weight = pos_embed[:, 0:1]
    pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
    pos_embed_weight = pos_embed_weight.reshape(1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
    pos_embed_weight = resize(pos_embed_weight, size=input_shape, align_corners=False, mode=mode)
    pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
    pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
    return pos_embed
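

# ---------------------------------------------------------------------------
# Hedged smoke test (not part of the original module); sizes are illustrative
# and it only runs when the file is executed directly.
if __name__ == "__main__":
    model = TIMMVisionTransformer(img_size=224, patch_size=16, embed_dim=192, depth=2, num_heads=3)
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    # 1 cls token + (224 // 16) ** 2 = 197 tokens of width embed_dim.
    assert out.shape == (1, 197, 192)

    # Grow a 14x14 position grid (plus cls token) to 16x16.
    pos = torch.randn(1, 197, 192)
    resized = resize_pos_embed(pos, input_shape=(16, 16), pos_shape=(14, 14), mode="bicubic")
    assert resized.shape == (1, 1 + 16 * 16, 192)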