# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from paddleseg.models import layers
from paddleseg.cvlibs import manager
from paddleseg.utils import utils


@manager.MODELS.add_component
class ESPNetV1(nn.Layer):
    """
    The ESPNetV1 implementation based on PaddlePaddle.

    The original article refers to
    Sachin Mehta, Mohammad Rastegari, Anat Caspi, Linda Shapiro, and
    Hannaneh Hajishirzi. "ESPNet: Efficient Spatial Pyramid of Dilated
    Convolutions for Semantic Segmentation"
    (https://arxiv.org/abs/1803.06815).

    Args:
        num_classes (int): The number of target classes.
        in_channels (int, optional): Number of input channels. Default: 3.
        level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 2.
        level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

    def __init__(self,
                 num_classes,
                 in_channels=3,
                 level2_depth=2,
                 level3_depth=3,
                 pretrained=None):
        super().__init__()
        self.encoder = ESPNetEncoder(num_classes, in_channels, level2_depth,
                                     level3_depth)
        self.level3_up = nn.Conv2DTranspose(
            num_classes,
            num_classes,
            2,
            stride=2,
            padding=0,
            output_padding=0,
            bias_attr=False)
        self.br3 = layers.SyncBatchNorm(num_classes)
        self.level2_proj = nn.Conv2D(
            in_channels + 128, num_classes, 1, bias_attr=False)
        self.combine_l2_l3 = nn.Sequential(
            BNPReLU(2 * num_classes),
            DilatedResidualBlock(
                2 * num_classes, num_classes, residual=False), )
        self.level2_up = nn.Sequential(
            nn.Conv2DTranspose(
                num_classes,
                num_classes,
                2,
                stride=2,
                padding=0,
                output_padding=0,
                bias_attr=False),
            BNPReLU(num_classes), )
        self.out_proj = layers.ConvBNPReLU(
            16 + in_channels + num_classes,
            num_classes,
            3,
            padding='same',
            stride=1)
        self.out_up = nn.Conv2DTranspose(
            num_classes,
            num_classes,
            2,
            stride=2,
            padding=0,
            output_padding=0,
            bias_attr=False)
        self.pretrained = pretrained
        self.init_weight()

    def init_weight(self):
        if self.pretrained is not None:
            utils.load_entire_model(self, self.pretrained)

    def forward(self, x):
        p1, p2, p3 = self.encoder(x)
        # Decode coarse-to-fine: upsample the level-3 logits, fuse with the
        # level-2 map, upsample again, fuse with the level-1 map, and restore
        # the input resolution with a final deconvolution.
        up_p3 = self.level3_up(p3)
        combine = self.combine_l2_l3(paddle.concat([up_p3, p2], axis=1))
        up_p2 = self.level2_up(combine)
        combine = self.out_proj(paddle.concat([up_p2, p1], axis=1))
        out = self.out_up(combine)
        return [out]
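

# Decoder shape walk-through (illustrative note, not part of the original
# file): for a 3-channel input, p3 arrives at 1/8 resolution, level3_up
# brings it to 1/4 to match p2, level2_up brings the fused map to 1/2 to
# match p1, and out_up returns logits at the full input resolution.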


class BNPReLU(nn.Layer):
    def __init__(self, channels):
        super().__init__()
        self.bn = layers.SyncBatchNorm(channels)
        self.act = nn.PReLU(channels)

    def forward(self, x):
        x = self.bn(x)
        x = self.act(x)
        return x


class DownSampler(nn.Layer):
    """
    Down sampler.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        branch_channels = out_channels // 5
        remain_channels = out_channels - branch_channels * 4
        self.conv1 = nn.Conv2D(
            in_channels,
            branch_channels,
            3,
            stride=2,
            padding=1,
            bias_attr=False)
        self.d_conv1 = nn.Conv2D(
            branch_channels, remain_channels, 3, padding=1, bias_attr=False)
        self.d_conv2 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=2,
            dilation=2,
            bias_attr=False)
        self.d_conv4 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=4,
            dilation=4,
            bias_attr=False)
        self.d_conv8 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=8,
            dilation=8,
            bias_attr=False)
        self.d_conv16 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=16,
            dilation=16,
            bias_attr=False)
        self.bn = layers.SyncBatchNorm(out_channels)
        self.act = nn.PReLU(out_channels)

    def forward(self, x):
        x = self.conv1(x)
        d1 = self.d_conv1(x)
        d2 = self.d_conv2(x)
        d4 = self.d_conv4(x)
        d8 = self.d_conv8(x)
        d16 = self.d_conv16(x)
        # Hierarchical feature fusion: each dilated branch is summed with the
        # previous ones before concatenation to suppress gridding artifacts.
        feat1 = d2
        feat2 = feat1 + d4
        feat3 = feat2 + d8
        feat4 = feat3 + d16
        feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1)
        out = self.bn(feat)
        out = self.act(out)
        return out
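

# Channel bookkeeping for the split above (worked example, not part of the
# original file): DownSampler(19, 64), as built by ESPNetEncoder when
# in_channels=3, gives branch_channels = 64 // 5 = 12 and
# remain_channels = 64 - 4 * 12 = 16, so concatenating
# [d1, feat1, feat2, feat3, feat4] yields 16 + 4 * 12 = 64 = out_channels.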


class DilatedResidualBlock(nn.Layer):
    '''
    ESP block, principle: reduce -> split -> transform -> merge.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        residual (bool, optional): Add a residual connection through identity operation. Default: True.
    '''

    def __init__(self, in_channels, out_channels, residual=True):
        super().__init__()
        branch_channels = out_channels // 5
        remain_channels = out_channels - branch_channels * 4
        # Reduce: a 1x1 convolution projects the input down to the branch width.
        self.conv1 = nn.Conv2D(in_channels, branch_channels, 1, bias_attr=False)
        self.d_conv1 = nn.Conv2D(
            branch_channels, remain_channels, 3, padding=1, bias_attr=False)
        self.d_conv2 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=2,
            dilation=2,
            bias_attr=False)
        self.d_conv4 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=4,
            dilation=4,
            bias_attr=False)
        self.d_conv8 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=8,
            dilation=8,
            bias_attr=False)
        self.d_conv16 = nn.Conv2D(
            branch_channels,
            branch_channels,
            3,
            padding=16,
            dilation=16,
            bias_attr=False)
        self.bn = BNPReLU(out_channels)
        self.residual = residual

    def forward(self, x):
        x_proj = self.conv1(x)
        # Split & transform: parallel 3x3 convolutions with dilations 1..16.
        d1 = self.d_conv1(x_proj)
        d2 = self.d_conv2(x_proj)
        d4 = self.d_conv4(x_proj)
        d8 = self.d_conv8(x_proj)
        d16 = self.d_conv16(x_proj)
        # Merge with hierarchical feature fusion, as in DownSampler.
        feat1 = d2
        feat2 = feat1 + d4
        feat3 = feat2 + d8
        feat4 = feat3 + d16
        feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1)
        if self.residual:
            feat = feat + x
        out = self.bn(feat)
        return out
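

# Note on the residual flag (observation on the block above): `feat + x`
# requires in_channels == out_channels, which holds for the encoder stacks
# (64 -> 64 and 128 -> 128); the decoder's 2*num_classes -> num_classes
# instance therefore passes residual=False.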


class ESPNetEncoder(nn.Layer):
    '''
    The ESPNet-C implementation based on PaddlePaddle.

    Args:
        num_classes (int): The number of target classes.
        in_channels (int, optional): Number of input channels. Default: 3.
        level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 5.
        level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3.
    '''

    def __init__(self,
                 num_classes,
                 in_channels=3,
                 level2_depth=5,
                 level3_depth=3):
        super().__init__()
        self.level1 = layers.ConvBNPReLU(
            in_channels, 16, 3, padding='same', stride=2)
        self.br1 = BNPReLU(in_channels + 16)
        self.proj1 = layers.ConvBNPReLU(in_channels + 16, num_classes, 1)
        self.level2_0 = DownSampler(in_channels + 16, 64)
        self.level2 = nn.Sequential(
            *[DilatedResidualBlock(64, 64) for _ in range(level2_depth)])
        self.br2 = BNPReLU(in_channels + 128)
        self.proj2 = layers.ConvBNPReLU(in_channels + 128, num_classes, 1)
        self.level3_0 = DownSampler(in_channels + 128, 128)
        self.level3 = nn.Sequential(
            *[DilatedResidualBlock(128, 128) for _ in range(level3_depth)])
        self.br3 = BNPReLU(256)
        self.proj3 = layers.ConvBNPReLU(256, num_classes, 1)

    def forward(self, x):
        f1 = self.level1(x)
        # The raw input is average-pooled to each stage's resolution and
        # concatenated back in, so every stage sees the original image.
        down2 = F.adaptive_avg_pool2d(x, output_size=f1.shape[2:])
        feat1 = paddle.concat([f1, down2], axis=1)
        feat1 = self.br1(feat1)
        p1 = self.proj1(feat1)

        f2_res = self.level2_0(feat1)
        f2 = self.level2(f2_res)
        down4 = F.adaptive_avg_pool2d(x, output_size=f2.shape[2:])
        feat2 = paddle.concat([f2, f2_res, down4], axis=1)
        feat2 = self.br2(feat2)
        p2 = self.proj2(feat2)

        f3_res = self.level3_0(feat2)
        f3 = self.level3(f3_res)
        feat3 = paddle.concat([f3, f3_res], axis=1)
        feat3 = self.br3(feat3)
        p3 = self.proj3(feat3)
        return p1, p2, p3
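

# A minimal smoke test (illustrative sketch, not part of the original file;
# 19 classes and the 256x512 input size are arbitrary assumptions).
if __name__ == '__main__':
    model = ESPNetV1(num_classes=19)
    model.eval()
    x = paddle.rand([1, 3, 256, 512])
    with paddle.no_grad():
        logits = model(x)[0]
    # The decoder restores the input resolution: [1, 19, 256, 512].
    print(logits.shape)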