# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.cvlibs import manager
from paddleseg.models import layers
from paddleseg.utils import utils


@manager.MODELS.add_component
class BiseNetV1(nn.Layer):
    """
    The BiSeNetV1 implementation based on PaddlePaddle.

    The original article refers to
    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)

    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently support Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
        conv_channel (int, optional): The number of channels used by the attention refinement and refine branches. Default: 128.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """
    def __init__(self, num_classes, backbone, conv_channel=128,
                 pretrained=None):
        super().__init__()
        self.backbone = backbone
        self.spatial_path = SpatialPath(3, 128)
        self.global_context = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(
                512, conv_channel, 1, bias_attr=False), )
        self.arms = nn.LayerList([
            AttentionRefinement(512, conv_channel),
            AttentionRefinement(256, conv_channel),
        ])
        self.refines = nn.LayerList([
            layers.ConvBNReLU(
                conv_channel,
                conv_channel,
                3,
                stride=1,
                padding=1,
                bias_attr=False),
            layers.ConvBNReLU(
                conv_channel,
                conv_channel,
                3,
                stride=1,
                padding=1,
                bias_attr=False),
        ])
        self.heads = nn.LayerList([
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel, num_classes, 8, True),
            BiSeNetHead(conv_channel * 2, num_classes, 8, False),
        ])
        self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1)
        self.pretrained = pretrained
        self.init_weight()

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)

    def forward(self, x):
        spatial_out = self.spatial_path(x)
        context_blocks = self.backbone(x)
        context_blocks.reverse()
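        # After reversing, context_blocks runs deepest-first, e.g.
        # [1/32, 1/16, 1/8, 1/4] resolution feature maps from the backbone.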
        global_context = self.global_context(context_blocks[0])
        global_context = F.interpolate(
            global_context,
            size=paddle.shape(context_blocks[0])[2:],
            mode='bilinear',
            align_corners=True)
        last_fm = global_context
        pred_out = []
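        # Walk down the two deepest stages: gate each feature map with an
        # attention refinement module, add the running context, then upsample
        # to the next (shallower) stage's resolution and refine with a 3x3 conv.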
        for i, (fm, arm, refine) in enumerate(
                zip(context_blocks[:2], self.arms, self.refines)):
            fm = arm(fm)
            fm += last_fm
            last_fm = F.interpolate(
                fm,
                size=paddle.shape(context_blocks[i + 1])[2:],
                mode='bilinear',
                align_corners=True)
            last_fm = refine(last_fm)
            pred_out.append(last_fm)
        context_out = last_fm
        concate_fm = self.ffm(spatial_out, context_out)
        pred_out.append(concate_fm)
        output = []
        if self.training:
            # Training returns all three predictions so auxiliary losses can
            # supervise the intermediate context features.
            for i, head in enumerate(self.heads):
                out = head(pred_out[i])
                output.append(out)
        else:
            # Inference only needs the fused prediction from the final head.
            out = self.heads[-1](pred_out[-1])
            output.append(out)
        return output


class SpatialPath(nn.Layer):
    """
    SpatialPath module of BiseNetV1 model.

    Args:
        in_channels (int): The number of input channels in spatial path module.
        out_channels (int): The number of output channels in spatial path module.
        inner_channel (int, optional): The number of channels of the intermediate convolutions. Default: 64.
    """
    def __init__(self, in_channels, out_channels, inner_channel=64):
        super().__init__()
        self.conv_7x7 = layers.ConvBNReLU(
            in_channels, inner_channel, 7, stride=2, padding=3, bias_attr=False)
        self.conv_3x3_1 = layers.ConvBNReLU(
            inner_channel,
            inner_channel,
            3,
            stride=2,
            padding=1,
            bias_attr=False)
        self.conv_3x3_2 = layers.ConvBNReLU(
            inner_channel,
            inner_channel,
            3,
            stride=2,
            padding=1,
            bias_attr=False)
        self.conv_1x1 = layers.ConvBNReLU(
            inner_channel, out_channels, 1, bias_attr=False)

    def forward(self, x):
        x = self.conv_7x7(x)
        x = self.conv_3x3_1(x)
        x = self.conv_3x3_2(x)
        x = self.conv_1x1(x)
        return x
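
# Shape sketch (assuming a 1024x2048 input): each of the three stride-2 convs
# halves the spatial size, so SpatialPath(3, 128) maps [1, 3, 1024, 2048] to
# [1, 128, 128, 256], i.e. detail-preserving features at 1/8 resolution.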


class BiSeNetHead(nn.Layer):
    """
    BiSeNet head of BiseNetV1 model.

    Args:
        in_channels (int): The number of input channels in the head.
        out_channels (int): The number of output channels in the head.
        scale (int|float): The scale factor of interpolation.
        is_aux (bool, optional): Whether this is an auxiliary head. Default: False.
    """
    def __init__(self, in_channels, out_channels, scale, is_aux=False):
        super().__init__()
        # Auxiliary heads use a wider 3x3 conv than the main head.
        inner_channel = 128 if is_aux else 64
        self.conv_3x3 = layers.ConvBNReLU(
            in_channels, inner_channel, 3, stride=1, padding=1, bias_attr=False)
        self.conv_1x1 = nn.Conv2D(inner_channel, out_channels, 1)
        self.scale = scale

    def forward(self, x):
        x = self.conv_3x3(x)
        x = self.conv_1x1(x)
        if self.scale > 1:
            x = F.interpolate(
                x, scale_factor=self.scale, mode='bilinear', align_corners=True)
        return x
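
# With scale=8, a head fed 1/8-resolution features upsamples its logits back
# to the input size, e.g. [1, C, 128, 256] -> [1, num_classes, 1024, 2048].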


class AttentionRefinement(nn.Layer):
    """
    AttentionRefinement module of BiseNetV1 model.

    Args:
        in_channels (int): The number of input channels in the attention refinement module.
        out_channels (int): The number of output channels in the attention refinement module.
    """
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_3x3 = layers.ConvBNReLU(
            in_channels, out_channels, 3, stride=1, padding=1, bias_attr=False)
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(
                out_channels, out_channels, 1, bias_attr=False),
            nn.Sigmoid(), )

    def forward(self, x):
        x = self.conv_3x3(x)
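        # Squeeze-and-excitation style gating: global average pooling followed
        # by a 1x1 conv and sigmoid yields per-channel weights in (0, 1).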
        se = self.channel_attention(x)
        x = x * se
        return x


class FeatureFusion(nn.Layer):
    """
    FeatureFusion module of BiseNetV1 model.

    Args:
        in_channels (int): The number of input channels in the feature fusion module.
        out_channels (int): The number of output channels in the feature fusion module.
        reduction (int, optional): A factor that shrinks the convolutional channels. Default: 1.
    """
    def __init__(self, in_channels, out_channels, reduction=1):
        super().__init__()
        self.conv_1x1 = layers.ConvBNReLU(
            in_channels, out_channels, 1, bias_attr=False)
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2D(1),
            layers.ConvBNReLU(
                out_channels, out_channels // reduction, 1, bias_attr=False),
            layers.ConvBNReLU(
                out_channels // reduction, out_channels, 1, bias_attr=False),
            nn.Sigmoid(), )

    def forward(self, x1, x2):
        fm = paddle.concat([x1, x2], axis=1)
        fm = self.conv_1x1(fm)
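        # Reweight the fused features channel-wise and add them back as a
        # residual: output = fm * (1 + attention).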
        fm_se = self.channel_attention(fm)
        output = fm + fm * fm_se
        return output
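

if __name__ == '__main__':
    # Usage sketch, not part of the model definition. It assumes the
    # ResNet18_vd backbone from paddleseg.models.backbones, which returns
    # the multi-scale feature list BiseNetV1 expects and ends in 512
    # channels; any of the supported ResNet*_vd variants should also work.
    from paddleseg.models.backbones import ResNet18_vd

    model = BiseNetV1(num_classes=19, backbone=ResNet18_vd())
    model.eval()
    logits = model(paddle.rand([1, 3, 512, 1024]))[0]
    print(logits.shape)  # expected: [1, 19, 512, 1024]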