# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.models import layers
from paddleseg.models.layers import tensor_fusion_helper as helper


class UAFM(nn.Layer):
    """
    The base of Unified Attention Fusion Module.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__()

        self.conv_x = layers.ConvBNReLU(
            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
        self.conv_out = layers.ConvBNReLU(
            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)
        self.resize_mode = resize_mode

    def check(self, x, y):
        # Both inputs must be NCHW, and x (low level) must be at least as
        # large spatially as y (high level), since y is upsampled to x's size.
        assert x.ndim == 4 and y.ndim == 4
        x_h, x_w = x.shape[2:]
        y_h, y_w = y.shape[2:]
        assert x_h >= y_h and x_w >= y_w

    def prepare(self, x, y):
        x = self.prepare_x(x, y)
        y = self.prepare_y(x, y)
        return x, y

    def prepare_x(self, x, y):
        x = self.conv_x(x)
        return x

    def prepare_y(self, x, y):
        y_up = F.interpolate(y, paddle.shape(x)[2:], mode=self.resize_mode)
        return y_up

    def fuse(self, x, y):
        out = x + y
        out = self.conv_out(out)
        return out

    def forward(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        self.check(x, y)
        x, y = self.prepare(x, y)
        out = self.fuse(x, y)
        return out
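
# A minimal usage sketch of the base module (the channel counts and shapes
# below are illustrative assumptions, not values taken from this file): x is
# the larger low level feature, y is the smaller high level feature, and y is
# upsampled to x's spatial size before the additive fusion.
#
#     m = UAFM(x_ch=32, y_ch=64, out_ch=64)
#     x = paddle.rand([1, 32, 64, 64])
#     y = paddle.rand([1, 64, 32, 32])
#     out = m(x, y)  # shape: [1, 64, 64, 64]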


class UAFM_ChAtten(UAFM):
    """
    The UAFM with channel attention, which uses mean and max values.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        self.conv_xy_atten = nn.Sequential(
            layers.ConvBNAct(
                4 * y_ch,
                y_ch // 2,
                kernel_size=1,
                bias_attr=False,
                act_type="leakyrelu"),
            layers.ConvBN(
                y_ch // 2, y_ch, kernel_size=1, bias_attr=False))

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        atten = helper.avg_max_reduce_hw([x, y], self.training)
        atten = F.sigmoid(self.conv_xy_atten(atten))

        out = x * atten + y * (1 - atten)
        out = self.conv_out(out)
        return out


class UAFM_ChAtten_S(UAFM):
    """
    The UAFM with channel attention, which uses mean values.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        self.conv_xy_atten = nn.Sequential(
            layers.ConvBNAct(
                2 * y_ch,
                y_ch // 2,
                kernel_size=1,
                bias_attr=False,
                act_type="leakyrelu"),
            layers.ConvBN(
                y_ch // 2, y_ch, kernel_size=1, bias_attr=False))

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        atten = helper.avg_reduce_hw([x, y])
        atten = F.sigmoid(self.conv_xy_atten(atten))

        out = x * atten + y * (1 - atten)
        out = self.conv_out(out)
        return out


class UAFM_SpAtten(UAFM):
    """
    The UAFM with spatial attention, which uses mean and max values.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        self.conv_xy_atten = nn.Sequential(
            layers.ConvBNReLU(
                4, 2, kernel_size=3, padding=1, bias_attr=False),
            layers.ConvBN(
                2, 1, kernel_size=3, padding=1, bias_attr=False))

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        atten = helper.avg_max_reduce_channel([x, y])
        atten = F.sigmoid(self.conv_xy_atten(atten))

        out = x * atten + y * (1 - atten)
        out = self.conv_out(out)
        return out


class UAFM_SpAtten_S(UAFM):
    """
    The UAFM with spatial attention, which uses mean values.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        self.conv_xy_atten = nn.Sequential(
            layers.ConvBNReLU(
                2, 2, kernel_size=3, padding=1, bias_attr=False),
            layers.ConvBN(
                2, 1, kernel_size=3, padding=1, bias_attr=False))

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        atten = helper.avg_reduce_channel([x, y])
        atten = F.sigmoid(self.conv_xy_atten(atten))

        out = x * atten + y * (1 - atten)
        out = self.conv_out(out)
        return out


class UAFMMobile(UAFM):
    """
    Unified Attention Fusion Module for mobile.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        # Swap the standard convs from UAFM for depthwise separable convs;
        # the additive fuse is inherited unchanged from the base class.
        self.conv_x = layers.SeparableConvBNReLU(
            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
        self.conv_out = layers.SeparableConvBNReLU(
            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)


class UAFMMobile_SpAtten(UAFM):
    """
    Unified Attention Fusion Module with spatial attention for mobile.

    Args:
        x_ch (int): The channel of x tensor, which is the low level feature.
        y_ch (int): The channel of y tensor, which is the high level feature.
        out_ch (int): The channel of output tensor.
        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
        resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear.
    """

    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)

        self.conv_x = layers.SeparableConvBNReLU(
            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
        self.conv_out = layers.SeparableConvBNReLU(
            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)
        self.conv_xy_atten = nn.Sequential(
            layers.ConvBNReLU(
                4, 2, kernel_size=3, padding=1, bias_attr=False),
            layers.ConvBN(
                2, 1, kernel_size=3, padding=1, bias_attr=False))

    def fuse(self, x, y):
        """
        Args:
            x (Tensor): The low level feature.
            y (Tensor): The high level feature.
        """
        atten = helper.avg_max_reduce_channel([x, y])
        atten = F.sigmoid(self.conv_xy_atten(atten))

        out = x * atten + y * (1 - atten)
        out = self.conv_out(out)
        return out
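

# A minimal smoke test sketch, assuming a working PaddlePaddle + PaddleSeg
# install. The channel counts and shapes below are illustrative assumptions
# chosen only to exercise each fusion variant once, not values from any
# config in this repo.
if __name__ == '__main__':
    x = paddle.rand([1, 32, 64, 64])  # low level feature, larger spatial size
    y = paddle.rand([1, 64, 32, 32])  # high level feature, smaller spatial size
    for cls in (UAFM, UAFM_ChAtten, UAFM_ChAtten_S, UAFM_SpAtten,
                UAFM_SpAtten_S, UAFMMobile, UAFMMobile_SpAtten):
        m = cls(x_ch=32, y_ch=64, out_ch=64)
        m.eval()
        out = m(x, y)
        print(cls.__name__, out.shape)  # expected: [1, 64, 64, 64]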