# Copyright (c) Facebook, Inc. and its affiliates.
from copy import deepcopy

import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from .batch_norm import get_norm
from .blocks import DepthwiseSeparableConv2d
from .wrappers import Conv2d
class ASPP(nn.Module):
    """
    Atrous Spatial Pyramid Pooling (ASPP).

    Five parallel branches — one 1x1 conv, three 3x3 atrous convs, and an
    image-pooling branch — are concatenated and projected back to
    ``out_channels`` with a final 1x1 conv.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        dilations,
        *,
        norm,
        activation,
        pool_kernel_size=None,
        dropout: float = 0.0,
        use_depthwise_separable_conv=False,
    ):
        """
        Args:
            in_channels (int): number of channels of the input feature map.
            out_channels (int): number of channels produced by each branch and
                by the final projection.
            dilations (list): the 3 dilation rates of the atrous branches.
            norm (str or callable): normalization for all conv layers except
                the conv that follows the pooling branch; see
                :func:`layers.get_norm` for the supported formats.
            activation (callable): activation function applied after each conv.
            pool_kernel_size (tuple, list): (kh, kw) window of the pooling
                branch. ``None`` means global average pooling; otherwise the
                spatial size of the inputs seen in forward() must be divisible
                by it. It is recommended to train with a fixed input size and
                set this option to that size, so the pooling is global during
                training while the window stays consistent at inference.
            dropout (float): dropout probability applied to the ASPP output.
                The official DeepLab implementation uses 0.1:
                https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532  # noqa
            use_depthwise_separable_conv (bool): build the 3x3 branches with
                :class:`DepthwiseSeparableConv2d`, as proposed in
                :paper:`DeepLabV3+`.
        """
        super().__init__()
        assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations))
        self.pool_kernel_size = pool_kernel_size
        self.dropout = dropout
        use_bias = norm == ""

        self.convs = nn.ModuleList()

        # Branch 0: plain 1x1 conv.
        conv1x1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            bias=use_bias,
            norm=get_norm(norm, out_channels),
            activation=deepcopy(activation),
        )
        weight_init.c2_xavier_fill(conv1x1)
        self.convs.append(conv1x1)

        # Branches 1-3: 3x3 convs with the requested dilation rates.
        for rate in dilations:
            if use_depthwise_separable_conv:
                # DepthwiseSeparableConv2d handles its own norm/activation/init.
                self.convs.append(
                    DepthwiseSeparableConv2d(
                        in_channels,
                        out_channels,
                        kernel_size=3,
                        padding=rate,
                        dilation=rate,
                        norm1=norm,
                        activation1=deepcopy(activation),
                        norm2=norm,
                        activation2=deepcopy(activation),
                    )
                )
            else:
                atrous = Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=3,
                    padding=rate,
                    dilation=rate,
                    bias=use_bias,
                    norm=get_norm(norm, out_channels),
                    activation=deepcopy(activation),
                )
                weight_init.c2_xavier_fill(atrous)
                self.convs.append(atrous)

        # Branch 4: image pooling.
        # We do not add BatchNorm because the spatial resolution is 1x1,
        # the original TF implementation has BatchNorm.
        if pool_kernel_size is None:
            pool = nn.AdaptiveAvgPool2d(1)
        else:
            pool = nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1)
        pool_conv = Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation))
        weight_init.c2_xavier_fill(pool_conv)
        self.convs.append(nn.Sequential(pool, pool_conv))

        # Fuse the 5 concatenated branch outputs back to `out_channels`.
        self.project = Conv2d(
            5 * out_channels,
            out_channels,
            kernel_size=1,
            bias=use_bias,
            norm=get_norm(norm, out_channels),
            activation=deepcopy(activation),
        )
        weight_init.c2_xavier_fill(self.project)

    def forward(self, x):
        size = x.shape[-2:]
        if self.pool_kernel_size is not None:
            if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]:
                raise ValueError(
                    "`pool_kernel_size` must be divisible by the shape of inputs. "
                    "Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size)
                )
        branch_outputs = [conv(x) for conv in self.convs]
        # The pooling branch reduced the spatial dims; resize it back to match.
        branch_outputs[-1] = F.interpolate(
            branch_outputs[-1], size=size, mode="bilinear", align_corners=False
        )
        out = self.project(torch.cat(branch_outputs, dim=1))
        if self.dropout > 0:
            out = F.dropout(out, self.dropout, training=self.training)
        return out