diff --git a/.gitattributes b/.gitattributes index bf07816c74bac9b682df196e02c6482e474e9b52..e1bb436f28db5d4d9c4fe6ca35e6cfdabe33c239 100644 --- a/.gitattributes +++ b/.gitattributes @@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zstandard filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.pdparams filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6bc3df0d35955f1d1df00845c89d4f339ef21232 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Anagha S Menon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 2b2734a47ac8ea85de80650ebd6cea9681d851e5..99b6798ec27ce31c900c101b95b32312a34a30a9 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,20 @@ ---- -title: Pipeline Paddle -emoji: 😻 -colorFrom: pink -colorTo: pink -sdk: gradio -sdk_version: 3.1.1 -app_file: app.py -pinned: false ---- +# pipeline_paddle_viton -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +To run + +
Step 1: git clone https://github.com/ANAGHA-20/pipeline-paddle pipeline_paddle_viton +

Step 2: %cd ./pipeline_paddle_viton +
Step 3: pip install -r requirements.txt +
pip install paddlepaddle-gpu +
pip install pymatting +
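Before Step 4, it is worth verifying that the paddlepaddle-gpu wheel matches the local CUDA toolkit; `run_check` is part of Paddle's public API:

```python
import paddle

# Prints the installed Paddle version and runs a small GPU/CPU self-test.
paddle.utils.run_check()
```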
Step 4: export CUDA_VISIBLE_DEVICES=0 (in a notebook, set this via os.environ instead of export; see the sketch below) +

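In a Colab or Jupyter cell, where a shell `export` does not persist, the equivalent is:

```python
import os

# Restrict PaddlePaddle to GPU 0, the same effect as the shell export above.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
```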
Step 5: wget "https://paddleseg.bj.bcebos.com/matting/models/ppmatting-hrnet_w18-human_1024.pdparams" -O "/content/pipeline_paddle_viton/models/ppmatting-hrnet_w18-human_1024.pdparams" +
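If `wget` is unavailable, the same checkpoint (about 98 MB, per the LFS pointer under `models/` below) can be fetched from Python; a sketch reusing the URL and destination from Step 5:

```python
import os
import urllib.request

url = ("https://paddleseg.bj.bcebos.com/matting/models/"
       "ppmatting-hrnet_w18-human_1024.pdparams")
dst = "models/ppmatting-hrnet_w18-human_1024.pdparams"

os.makedirs(os.path.dirname(dst), exist_ok=True)
urllib.request.urlretrieve(url, dst)
```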
Step 6: run inference: +

+!python bg_replace.py \ + --config configs/ppmatting/ppmatting-hrnet_w18-human_1024.yml \ + --model_path models/ppmatting-hrnet_w18-human_1024.pdparams \ + --image_path ./image/person.jpg \ + --save_dir ./output \ + --fg_estimate True diff --git a/bg_replace.py b/bg_replace.py new file mode 100644 index 0000000000000000000000000000000000000000..1743fc2d0a6eeb3a8f7ceb9fb4e7ae6fadf95dfd --- /dev/null +++ b/bg_replace.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import sys + +import cv2 +import numpy as np +import paddle +from paddleseg.cvlibs import manager, Config +from paddleseg.utils import get_sys_env, logger + +LOCAL_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(LOCAL_PATH, '..')) + +manager.BACKBONES._components_dict.clear() +manager.TRANSFORMS._components_dict.clear() + +import ppmatting +from ppmatting.core import predict +from ppmatting.utils import get_image_list, estimate_foreground_ml + + +def parse_args(): + parser = argparse.ArgumentParser( + description='PP-HumanSeg inference for video') + parser.add_argument( + "--config", + dest="cfg", + help="The config file.", + default=None, + type=str, + required=True) + parser.add_argument( + '--model_path', + dest='model_path', + help='The path of model for prediction', + type=str, + default=None) + parser.add_argument( + '--image_path', + dest='image_path', + help='Image including human', + type=str, + default=None) + parser.add_argument( + '--trimap_path', + dest='trimap_path', + help='The path of trimap', + type=str, + default=None) + parser.add_argument( + '--background', + dest='background', + help='Background for replacing. It is a string which specifies the background color (r,g,b,w) or a path to background image. 
If not specified, a green background is used.', + type=str, + default=None) + parser.add_argument( + '--save_dir', + dest='save_dir', + help='The directory for saving the inference results', + type=str, + default='./output') + parser.add_argument( + '--fg_estimate', + default=True, + type=eval, + choices=[True, False], + help='Whether to estimate foreground when predicting.') + + return parser.parse_args() + + +def main(args): + env_info = get_sys_env() + place = 'gpu' if env_info['Paddle compiled with cuda'] and env_info[ + 'GPUs used'] else 'cpu' + paddle.set_device(place) + if not args.cfg: + raise RuntimeError('No configuration file specified.') + + cfg = Config(args.cfg) + + msg = '\n---------------Config Information---------------\n' + msg += str(cfg) + msg += '------------------------------------------------' + logger.info(msg) + + model = cfg.model + transforms = ppmatting.transforms.Compose(cfg.val_transforms) + + alpha, fg = predict( + model, + model_path=args.model_path, + transforms=transforms, + image_list=[args.image_path], + trimap_list=[args.trimap_path], + save_dir=args.save_dir, + fg_estimate=args.fg_estimate) + + img_ori = cv2.imread(args.image_path) + bg = get_bg(args.background, img_ori.shape) + alpha = alpha / 255.0 + alpha = alpha[:, :, np.newaxis] + com = alpha * fg + (1 - alpha) * bg + com = com.astype('uint8') + com_save_path = os.path.join(args.save_dir, + os.path.basename(args.image_path)) + cv2.imwrite(com_save_path, com) + + +def get_bg(background, img_shape): + bg = np.zeros(img_shape) + if background == 'r': + bg[:, :, 2] = 255 + elif background is None or background == 'g': + bg[:, :, 1] = 255 + elif background == 'b': + bg[:, :, 0] = 255 + elif background == 'w': + bg[:, :, :] = 255 + + elif not os.path.exists(background): + raise Exception('The --background is not existed: {}'.format( + background)) + else: + bg = cv2.imread(background) + bg = cv2.resize(bg, (img_shape[1], img_shape[0])) + return bg + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/configs/ppmatting/README.md b/configs/ppmatting/README.md new file mode 100644 index 0000000000000000000000000000000000000000..66b18759d715bf55badcb2a60dd08e92ec0421ea --- /dev/null +++ b/configs/ppmatting/README.md @@ -0,0 +1,20 @@ +# PP-Matting: High-Accuracy Natural Image Matting + +## Reference + +> Chen G, Liu Y, Wang J, et al. PP-Matting: High-Accuracy Natural Image Matting[J]. arXiv preprint arXiv:2204.09433, 2022. 
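PP-Matting predicts a per-pixel alpha matte; `bg_replace.py` above then blends in a new background using the standard compositing equation `I = alpha * F + (1 - alpha) * B`. A minimal NumPy sketch of that step (shapes and dtypes are assumptions; the script itself first rescales alpha from the 0-255 range):

```python
import numpy as np

def composite(alpha, fg, bg):
    # alpha: (H, W) float array in [0, 1]; fg, bg: (H, W, 3) uint8 images.
    a = alpha[:, :, np.newaxis]  # broadcast alpha over the colour channels
    out = a * fg.astype(np.float64) + (1.0 - a) * bg.astype(np.float64)
    return out.astype(np.uint8)  # same dtype that bg_replace.py writes out
```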
+ +## Performance + +### Composition-1k + +| Model | Backbone | Resolution | Training Iters | SAD $\downarrow$ | MSE $\downarrow$ | Grad $\downarrow$ | Conn $\downarrow$ | Links | +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +|PP-Matting|HRNet_W48|512x512|300000|46.22|0.005|22.69|45.40|[model](https://paddleseg.bj.bcebos.com/matting/models/ppmatting-hrnet_w48-composition.pdparams)| + + +### Distinctions-646 + +| Model | Backbone | Resolution | Training Iters | SAD $\downarrow$ | MSE $\downarrow$ | Grad $\downarrow$ | Conn $\downarrow$ | Links | +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +|PP-Matting|HRNet_W48|512x512|300000|40.69|0.009|43.91|40.56|[model](https://paddleseg.bj.bcebos.com/matting/models/ppmatting-hrnet_w48-distinctions.pdparams)| diff --git a/configs/ppmatting/ppmatting-hrnet_w18-human_1024.yml b/configs/ppmatting/ppmatting-hrnet_w18-human_1024.yml new file mode 100644 index 0000000000000000000000000000000000000000..a7ece05efda852ecee950fe37541751d5a15dc90 --- /dev/null +++ b/configs/ppmatting/ppmatting-hrnet_w18-human_1024.yml @@ -0,0 +1,29 @@ +_base_: 'ppmatting-hrnet_w18-human_512.yml' + + +train_dataset: + transforms: + - type: LoadImages + - type: LimitShort + max_short: 1024 + - type: RandomCrop + crop_size: [1024, 1024] + - type: RandomDistort + - type: RandomBlur + prob: 0.1 + - type: RandomNoise + prob: 0.5 + - type: RandomReJpeg + prob: 0.2 + - type: RandomHorizontalFlip + - type: Normalize + +val_dataset: + transforms: + - type: LoadImages + - type: LimitShort + max_short: 1024 + - type: ResizeToIntMult + mult_int: 32 + - type: Normalize + diff --git a/configs/ppmatting/ppmatting-hrnet_w18-human_512.yml b/configs/ppmatting/ppmatting-hrnet_w18-human_512.yml new file mode 100644 index 0000000000000000000000000000000000000000..90b8d7f3f9bd5384766b840210a422952c558f4a --- /dev/null +++ b/configs/ppmatting/ppmatting-hrnet_w18-human_512.yml @@ -0,0 +1,44 @@ +_base_: 'ppmatting-hrnet_w48-distinctions.yml' + +batch_size: 4 +iters: 200000 + +train_dataset: + type: MattingDataset + dataset_root: data/PPM-100 + train_file: train.txt + transforms: + - type: LoadImages + - type: LimitShort + max_short: 512 + - type: RandomCrop + crop_size: [512, 512] + - type: RandomDistort + - type: RandomBlur + prob: 0.1 + - type: RandomNoise + prob: 0.5 + - type: RandomReJpeg + prob: 0.2 + - type: RandomHorizontalFlip + - type: Normalize + mode: train + +val_dataset: + type: MattingDataset + dataset_root: data/PPM-100 + val_file: val.txt + transforms: + - type: LoadImages + - type: LimitShort + max_short: 512 + - type: ResizeToIntMult + mult_int: 32 + - type: Normalize + mode: val + get_trimap: False + +model: + backbone: + type: HRNet_W18 + pretrained: https://bj.bcebos.com/paddleseg/dygraph/hrnet_w18_ssld.tar.gz diff --git a/configs/ppmatting/ppmatting-hrnet_w48-composition.yml b/configs/ppmatting/ppmatting-hrnet_w48-composition.yml new file mode 100644 index 0000000000000000000000000000000000000000..64f011587223fa635d6368199d9ab237a3266e08 --- /dev/null +++ b/configs/ppmatting/ppmatting-hrnet_w48-composition.yml @@ -0,0 +1,7 @@ +_base_: 'ppmatting-hrnet_w48-distinctions.yml' + +train_dataset: + dataset_root: data/matting/Composition-1k + +val_dataset: + dataset_root: data/matting/Composition-1k \ No newline at end of file diff --git a/configs/ppmatting/ppmatting-hrnet_w48-distinctions.yml b/configs/ppmatting/ppmatting-hrnet_w48-distinctions.yml new file mode 100644 index 0000000000000000000000000000000000000000..991b0f466d9ab6462af5bd57882c17cba281b3ff --- /dev/null +++ 
b/configs/ppmatting/ppmatting-hrnet_w48-distinctions.yml @@ -0,0 +1,55 @@ +batch_size: 4 +iters: 300000 + +train_dataset: + type: MattingDataset + dataset_root: data/matting/Distinctions-646 + train_file: train.txt + transforms: + - type: LoadImages + - type: Padding + target_size: [512, 512] + - type: RandomCrop + crop_size: [[512, 512],[640, 640], [800, 800]] + - type: Resize + target_size: [512, 512] + - type: RandomDistort + - type: RandomBlur + prob: 0.1 + - type: RandomHorizontalFlip + - type: Normalize + mode: train + separator: '|' + +val_dataset: + type: MattingDataset + dataset_root: data/matting/Distinctions-646 + val_file: val.txt + transforms: + - type: LoadImages + - type: LimitShort + max_short: 1536 + - type: ResizeToIntMult + mult_int: 32 + - type: Normalize + mode: val + get_trimap: False + separator: '|' + +model: + type: PPMatting + backbone: + type: HRNet_W48 + pretrained: https://bj.bcebos.com/paddleseg/dygraph/hrnet_w48_ssld.tar.gz + pretrained: Null + +optimizer: + type: sgd + momentum: 0.9 + weight_decay: 4.0e-5 + +lr_scheduler: + type: PolynomialDecay + learning_rate: 0.01 + end_lr: 0 + power: 0.9 \ No newline at end of file diff --git a/image/readme.md b/image/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..3e0b94c6df84cac026c441540b4e27b27b71757c --- /dev/null +++ b/image/readme.md @@ -0,0 +1 @@ +Upload image of person as person.jpg diff --git a/models/ppmatting-hrnet_w18-human_1024.pdparams b/models/ppmatting-hrnet_w18-human_1024.pdparams new file mode 100644 index 0000000000000000000000000000000000000000..0fa75c592be4b0ea331c51b90b9858a595348369 --- /dev/null +++ b/models/ppmatting-hrnet_w18-human_1024.pdparams @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65315fb68255266cb9adedc6879ba1e28ed7b84e5d02c0dc7ad8caace8370011 +size 98439023 diff --git a/models/readme.md b/models/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..e770c6b565947d30c355265a376ccd3cc1685e7e --- /dev/null +++ b/models/readme.md @@ -0,0 +1 @@ +models required diff --git a/output/readme.md b/output/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..6d5548f75e42c8bbaf349defbd1082756440881f --- /dev/null +++ b/output/readme.md @@ -0,0 +1 @@ +Output of paddle using ppmatting will be available here diff --git a/paddleseg/__init__.py b/paddleseg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82121bb166560ec8dfdbe1f9cab9b644efcf19f7 --- /dev/null +++ b/paddleseg/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import models, datasets, transforms + +__version__ = '2.6.0' diff --git a/paddleseg/core/__init__.py b/paddleseg/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35189064a6996aaf9acbb3e1d70ac8da16912bf5 --- /dev/null +++ b/paddleseg/core/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .train import train +from .val import evaluate +from .predict import predict +from . import infer + +__all__ = ['train', 'evaluate', 'predict'] diff --git a/paddleseg/core/infer.py b/paddleseg/core/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..aa6eb425764502898286c792859af057ad84090c --- /dev/null +++ b/paddleseg/core/infer.py @@ -0,0 +1,232 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections.abc +from itertools import combinations + +import numpy as np +import cv2 +import paddle +import paddle.nn.functional as F + + +def reverse_transform(pred, trans_info, mode='nearest'): + """recover pred to origin shape""" + intTypeList = [paddle.int8, paddle.int16, paddle.int32, paddle.int64] + dtype = pred.dtype + for item in trans_info[::-1]: + if isinstance(item[0], list): + trans_mode = item[0][0] + else: + trans_mode = item[0] + if trans_mode == 'resize': + h, w = item[1][0], item[1][1] + if paddle.get_device() == 'cpu' and dtype in intTypeList: + pred = paddle.cast(pred, 'float32') + pred = F.interpolate(pred, (h, w), mode=mode) + pred = paddle.cast(pred, dtype) + else: + pred = F.interpolate(pred, (h, w), mode=mode) + elif trans_mode == 'padding': + h, w = item[1][0], item[1][1] + pred = pred[:, :, 0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format(item[0])) + return pred + + +def flip_combination(flip_horizontal=False, flip_vertical=False): + """ + Get flip combination. + + Args: + flip_horizontal (bool): Whether to flip horizontally. Default: False. + flip_vertical (bool): Whether to flip vertically. Default: False. + + Returns: + list: List of tuple. The first element of tuple is whether to flip horizontally, + and the second is whether to flip vertically. + """ + + flip_comb = [(False, False)] + if flip_horizontal: + flip_comb.append((True, False)) + if flip_vertical: + flip_comb.append((False, True)) + if flip_horizontal: + flip_comb.append((True, True)) + return flip_comb + + +def tensor_flip(x, flip): + """Flip tensor according directions""" + if flip[0]: + x = x[:, :, :, ::-1] + if flip[1]: + x = x[:, :, ::-1, :] + return x + + +def slide_inference(model, im, crop_size, stride): + """ + Infer by sliding window. + + Args: + model (paddle.nn.Layer): model to get logits of image. + im (Tensor): the input image. + crop_size (tuple|list). The size of sliding window, (w, h). 
+ stride (tuple|list). The size of stride, (w, h). + + Return: + Tensor: The logit of input image. + """ + h_im, w_im = im.shape[-2:] + w_crop, h_crop = crop_size + w_stride, h_stride = stride + # calculate the crop nums + rows = np.int(np.ceil(1.0 * (h_im - h_crop) / h_stride)) + 1 + cols = np.int(np.ceil(1.0 * (w_im - w_crop) / w_stride)) + 1 + # prevent negative sliding rounds when imgs after scaling << crop_size + rows = 1 if h_im <= h_crop else rows + cols = 1 if w_im <= w_crop else cols + # TODO 'Tensor' object does not support item assignment. If support, use tensor to calculation. + final_logit = None + count = np.zeros([1, 1, h_im, w_im]) + for r in range(rows): + for c in range(cols): + h1 = r * h_stride + w1 = c * w_stride + h2 = min(h1 + h_crop, h_im) + w2 = min(w1 + w_crop, w_im) + h1 = max(h2 - h_crop, 0) + w1 = max(w2 - w_crop, 0) + im_crop = im[:, :, h1:h2, w1:w2] + logits = model(im_crop) + if not isinstance(logits, collections.abc.Sequence): + raise TypeError( + "The type of logits must be one of collections.abc.Sequence, e.g. list, tuple. But received {}" + .format(type(logits))) + logit = logits[0].numpy() + if final_logit is None: + final_logit = np.zeros([1, logit.shape[1], h_im, w_im]) + final_logit[:, :, h1:h2, w1:w2] += logit[:, :, :h2 - h1, :w2 - w1] + count[:, :, h1:h2, w1:w2] += 1 + if np.sum(count == 0) != 0: + raise RuntimeError( + 'There are pixel not predicted. It is possible that stride is greater than crop_size' + ) + final_logit = final_logit / count + final_logit = paddle.to_tensor(final_logit) + return final_logit + + +def inference(model, + im, + trans_info=None, + is_slide=False, + stride=None, + crop_size=None): + """ + Inference for image. + + Args: + model (paddle.nn.Layer): model to get logits of image. + im (Tensor): the input image. + trans_info (list): Image shape informating changed process. Default: None. + is_slide (bool): Whether to infer by sliding window. Default: False. + crop_size (tuple|list). The size of sliding window, (w, h). It should be probided if is_slide is True. + stride (tuple|list). The size of stride, (w, h). It should be probided if is_slide is True. + + Returns: + Tensor: If ori_shape is not None, a prediction with shape (1, 1, h, w) is returned. + If ori_shape is None, a logit with shape (1, num_classes, h, w) is returned. + """ + if hasattr(model, 'data_format') and model.data_format == 'NHWC': + im = im.transpose((0, 2, 3, 1)) + if not is_slide: + logits = model(im) + if not isinstance(logits, collections.abc.Sequence): + raise TypeError( + "The type of logits must be one of collections.abc.Sequence, e.g. list, tuple. But received {}" + .format(type(logits))) + logit = logits[0] + else: + logit = slide_inference(model, im, crop_size=crop_size, stride=stride) + if hasattr(model, 'data_format') and model.data_format == 'NHWC': + logit = logit.transpose((0, 3, 1, 2)) + if trans_info is not None: + logit = reverse_transform(logit, trans_info, mode='bilinear') + pred = paddle.argmax(logit, axis=1, keepdim=True, dtype='int32') + return pred, logit + else: + return logit + + +def aug_inference(model, + im, + trans_info, + scales=1.0, + flip_horizontal=False, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None): + """ + Infer with augmentation. + + Args: + model (paddle.nn.Layer): model to get logits of image. + im (Tensor): the input image. + trans_info (list): Transforms for image. + scales (float|tuple|list): Scales for resize. Default: 1. + flip_horizontal (bool): Whether to flip horizontally. 
Default: False. + flip_vertical (bool): Whether to flip vertically. Default: False. + is_slide (bool): Whether to infer by sliding wimdow. Default: False. + crop_size (tuple|list). The size of sliding window, (w, h). It should be probided if is_slide is True. + stride (tuple|list). The size of stride, (w, h). It should be probided if is_slide is True. + + Returns: + Tensor: Prediction of image with shape (1, 1, h, w) is returned. + """ + if isinstance(scales, float): + scales = [scales] + elif not isinstance(scales, (tuple, list)): + raise TypeError( + '`scales` expects float/tuple/list type, but received {}'.format( + type(scales))) + final_logit = 0 + h_input, w_input = im.shape[-2], im.shape[-1] + flip_comb = flip_combination(flip_horizontal, flip_vertical) + for scale in scales: + h = int(h_input * scale + 0.5) + w = int(w_input * scale + 0.5) + im = F.interpolate(im, (h, w), mode='bilinear') + for flip in flip_comb: + im_flip = tensor_flip(im, flip) + logit = inference( + model, + im_flip, + is_slide=is_slide, + crop_size=crop_size, + stride=stride) + logit = tensor_flip(logit, flip) + logit = F.interpolate(logit, (h_input, w_input), mode='bilinear') + + logit = F.softmax(logit, axis=1) + final_logit = final_logit + logit + + final_logit = reverse_transform(final_logit, trans_info, mode='bilinear') + pred = paddle.argmax(final_logit, axis=1, keepdim=True, dtype='int32') + + return pred, final_logit diff --git a/paddleseg/core/predict.py b/paddleseg/core/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..98097c7f5a8a5500a15f5e7c8a5d99a33ebc79c6 --- /dev/null +++ b/paddleseg/core/predict.py @@ -0,0 +1,147 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math + +import cv2 +import numpy as np +import paddle + +from paddleseg import utils +from paddleseg.core import infer +from paddleseg.utils import logger, progbar, visualize + + +def mkdir(path): + sub_dir = os.path.dirname(path) + if not os.path.exists(sub_dir): + os.makedirs(sub_dir) + + +def partition_list(arr, m): + """split the list 'arr' into m pieces""" + n = int(math.ceil(len(arr) / float(m))) + return [arr[i:i + n] for i in range(0, len(arr), n)] + + +def preprocess(im_path, transforms): + data = {} + data['img'] = im_path + data = transforms(data) + data['img'] = data['img'][np.newaxis, ...] + data['img'] = paddle.to_tensor(data['img']) + return data + + +def predict(model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir='output', + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None): + """ + predict and visualize the image_list. + + Args: + model (nn.Layer): Used to predict for input image. + model_path (str): The path of pretrained model. + transforms (transform.Compose): Preprocess for input image. + image_list (list): A list of image path to be predicted. 
+ image_dir (str, optional): The root directory of the images predicted. Default: None. + save_dir (str, optional): The directory to save the visualized results. Default: 'output'. + aug_pred (bool, optional): Whether to use mulit-scales and flip augment for predition. Default: False. + scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0. + flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True. + flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False. + is_slide (bool, optional): Whether to predict by sliding window. Default: False. + stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. + It should be provided when `is_slide` is True. + crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. + It should be provided when `is_slide` is True. + custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. + + """ + utils.utils.load_entire_model(model, model_path) + model.eval() + nranks = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + if nranks > 1: + img_lists = partition_list(image_list, nranks) + else: + img_lists = [image_list] + + added_saved_dir = os.path.join(save_dir, 'added_prediction') + pred_saved_dir = os.path.join(save_dir, 'pseudo_color_prediction') + + logger.info("Start to predict...") + progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1) + color_map = visualize.get_color_map_list(256, custom_color=custom_color) + with paddle.no_grad(): + for i, im_path in enumerate(img_lists[local_rank]): + data = preprocess(im_path, transforms) + + if aug_pred: + pred, _ = infer.aug_inference( + model, + data['img'], + trans_info=data['trans_info'], + scales=scales, + flip_horizontal=flip_horizontal, + flip_vertical=flip_vertical, + is_slide=is_slide, + stride=stride, + crop_size=crop_size) + else: + pred, _ = infer.inference( + model, + data['img'], + trans_info=data['trans_info'], + is_slide=is_slide, + stride=stride, + crop_size=crop_size) + pred = paddle.squeeze(pred) + pred = pred.numpy().astype('uint8') + + # get the saved name + if image_dir is not None: + im_file = im_path.replace(image_dir, '') + else: + im_file = os.path.basename(im_path) + if im_file[0] == '/' or im_file[0] == '\\': + im_file = im_file[1:] + + # save added image + added_image = utils.visualize.visualize( + im_path, pred, color_map, weight=0.6) + added_image_path = os.path.join(added_saved_dir, im_file) + mkdir(added_image_path) + cv2.imwrite(added_image_path, added_image) + + # save pseudo color prediction + pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) + pred_saved_path = os.path.join( + pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + mkdir(pred_saved_path) + pred_mask.save(pred_saved_path) + + progbar_pred.update(i + 1) diff --git a/paddleseg/core/train.py b/paddleseg/core/train.py new file mode 100644 index 0000000000000000000000000000000000000000..fae72d23112c67ec9e834993afee165d39665880 --- /dev/null +++ b/paddleseg/core/train.py @@ -0,0 +1,334 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +from collections import deque +import shutil + +import paddle +import paddle.nn.functional as F + +from paddleseg.utils import (TimeAverager, calculate_eta, resume, logger, + worker_init_fn, train_profiler, op_flops_funs) +from paddleseg.core.val import evaluate + + +def check_logits_losses(logits_list, losses): + len_logits = len(logits_list) + len_losses = len(losses['types']) + if len_logits != len_losses: + raise RuntimeError( + 'The length of logits_list should equal to the types of loss config: {} != {}.' + .format(len_logits, len_losses)) + + +def loss_computation(logits_list, labels, edges, losses): + check_logits_losses(logits_list, losses) + loss_list = [] + for i in range(len(logits_list)): + logits = logits_list[i] + loss_i = losses['types'][i] + coef_i = losses['coef'][i] + if loss_i.__class__.__name__ in ('BCELoss', ) and loss_i.edge_label: + # Use edges as labels According to loss type. + loss_list.append(coef_i * loss_i(logits, edges)) + elif loss_i.__class__.__name__ == 'MixedLoss': + mixed_loss_list = loss_i(logits, labels) + for mixed_loss in mixed_loss_list: + loss_list.append(coef_i * mixed_loss) + elif loss_i.__class__.__name__ in ("KLLoss", ): + loss_list.append(coef_i * + loss_i(logits_list[0], logits_list[1].detach())) + else: + loss_list.append(coef_i * loss_i(logits, labels)) + return loss_list + + +def train(model, + train_dataset, + val_dataset=None, + optimizer=None, + save_dir='output', + iters=10000, + batch_size=2, + resume_model=None, + save_interval=1000, + log_iters=10, + num_workers=0, + use_vdl=False, + losses=None, + keep_checkpoint_max=5, + test_config=None, + precision='fp32', + amp_level='O1', + profiler_options=None, + to_static_training=False): + """ + Launch training. + + Args: + model(nn.Layer): A semantic segmentation model. + train_dataset (paddle.io.Dataset): Used to read and process training datasets. + val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets. + optimizer (paddle.optimizer.Optimizer): The optimizer. + save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'. + iters (int, optional): How may iters to train the model. Defualt: 10000. + batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2. + resume_model (str, optional): The path of resume model. + save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000. + log_iters (int, optional): Display logging information at every log_iters. Default: 10. + num_workers (int, optional): Num workers for data loader. Default: 0. + use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False. + losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']). + The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient. + keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5. + test_config(dict, optional): Evaluation config. 
+ precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the training is normal. + amp_level (str, optional): Auto mixed precision level. Accepted values are “O1” and “O2”: O1 represent mixed precision, + the input data type of each operator will be casted by white_list and black_list; O2 represent Pure fp16, all operators + parameters and input data will be casted to fp16, except operators in black_list, don’t support fp16 kernel and batchnorm. Default is O1(amp) + profiler_options (str, optional): The option of train profiler. + to_static_training (bool, optional): Whether to use @to_static for training. + """ + model.train() + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + + start_iter = 0 + if resume_model is not None: + start_iter = resume(model, optimizer, resume_model) + + if not os.path.isdir(save_dir): + if os.path.exists(save_dir): + os.remove(save_dir) + os.makedirs(save_dir, exist_ok=True) + + # use amp + if precision == 'fp16': + logger.info('use AMP to train. AMP level = {}'.format(amp_level)) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + if amp_level == 'O2': + model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + if nranks > 1: + paddle.distributed.fleet.init(is_collective=True) + optimizer = paddle.distributed.fleet.distributed_optimizer( + optimizer) # The return is Fleet object + ddp_model = paddle.distributed.fleet.distributed_model(model) + + batch_sampler = paddle.io.DistributedBatchSampler( + train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) + + loader = paddle.io.DataLoader( + train_dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + return_list=True, + worker_init_fn=worker_init_fn, ) + + if use_vdl: + from visualdl import LogWriter + log_writer = LogWriter(save_dir) + + if to_static_training: + model = paddle.jit.to_static(model) + logger.info("Successfully applied @to_static") + + avg_loss = 0.0 + avg_loss_list = [] + iters_per_epoch = len(batch_sampler) + best_mean_iou = -1.0 + best_model_iter = -1 + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + save_models = deque() + batch_start = time.time() + + iter = start_iter + while iter < iters: + for data in loader: + iter += 1 + if iter > iters: + version = paddle.__version__ + if version == '2.1.2': + continue + else: + break + reader_cost_averager.record(time.time() - batch_start) + images = data['img'] + labels = data['label'].astype('int64') + edges = None + if 'edge' in data.keys(): + edges = data['edge'].astype('int64') + if hasattr(model, 'data_format') and model.data_format == 'NHWC': + images = images.transpose((0, 2, 3, 1)) + + if precision == 'fp16': + with paddle.amp.auto_cast( + level=amp_level, + enable=True, + custom_white_list={ + "elementwise_add", "batch_norm", "sync_batch_norm" + }, + custom_black_list={'bilinear_interp_v2'}): + logits_list = ddp_model(images) if nranks > 1 else model( + images) + loss_list = loss_computation( + logits_list=logits_list, + labels=labels, + edges=edges, + losses=losses) + loss = sum(loss_list) + + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + if isinstance(optimizer, paddle.distributed.fleet.Fleet): + scaler.minimize(optimizer.user_defined_optimizer, scaled) + else: + scaler.minimize(optimizer, scaled) # update parameters + else: + logits_list = ddp_model(images) if nranks > 1 else model(images) + 
loss_list = loss_computation( + logits_list=logits_list, + labels=labels, + edges=edges, + losses=losses) + loss = sum(loss_list) + loss.backward() + # if the optimizer is ReduceOnPlateau, the loss is the one which has been pass into step. + if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau): + optimizer.step(loss) + else: + optimizer.step() + + lr = optimizer.get_lr() + + # update lr + if isinstance(optimizer, paddle.distributed.fleet.Fleet): + lr_sche = optimizer.user_defined_optimizer._learning_rate + else: + lr_sche = optimizer._learning_rate + if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler): + lr_sche.step() + + train_profiler.add_profiler_step(profiler_options) + + model.clear_gradients() + avg_loss += loss.numpy()[0] + if not avg_loss_list: + avg_loss_list = [l.numpy() for l in loss_list] + else: + for i in range(len(loss_list)): + avg_loss_list[i] += loss_list[i].numpy() + batch_cost_averager.record( + time.time() - batch_start, num_samples=batch_size) + + if (iter) % log_iters == 0 and local_rank == 0: + avg_loss /= log_iters + avg_loss_list = [l[0] / log_iters for l in avg_loss_list] + remain_iters = iters - iter + avg_train_batch_cost = batch_cost_averager.get_average() + avg_train_reader_cost = reader_cost_averager.get_average() + eta = calculate_eta(remain_iters, avg_train_batch_cost) + logger.info( + "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}" + .format((iter - 1 + ) // iters_per_epoch + 1, iter, iters, avg_loss, + lr, avg_train_batch_cost, avg_train_reader_cost, + batch_cost_averager.get_ips_average(), eta)) + if use_vdl: + log_writer.add_scalar('Train/loss', avg_loss, iter) + # Record all losses if there are more than 2 losses. 
+ if len(avg_loss_list) > 1: + avg_loss_dict = {} + for i, value in enumerate(avg_loss_list): + avg_loss_dict['loss_' + str(i)] = value + for key, value in avg_loss_dict.items(): + log_tag = 'Train/' + key + log_writer.add_scalar(log_tag, value, iter) + + log_writer.add_scalar('Train/lr', lr, iter) + log_writer.add_scalar('Train/batch_cost', + avg_train_batch_cost, iter) + log_writer.add_scalar('Train/reader_cost', + avg_train_reader_cost, iter) + avg_loss = 0.0 + avg_loss_list = [] + reader_cost_averager.reset() + batch_cost_averager.reset() + + if (iter % save_interval == 0 or + iter == iters) and (val_dataset is not None): + num_workers = 1 if num_workers > 0 else 0 + + if test_config is None: + test_config = {} + + mean_iou, acc, _, _, _ = evaluate( + model, + val_dataset, + num_workers=num_workers, + precision=precision, + amp_level=amp_level, + **test_config) + + model.train() + + if (iter % save_interval == 0 or iter == iters) and local_rank == 0: + current_save_dir = os.path.join(save_dir, + "iter_{}".format(iter)) + if not os.path.isdir(current_save_dir): + os.makedirs(current_save_dir) + paddle.save(model.state_dict(), + os.path.join(current_save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(current_save_dir, 'model.pdopt')) + save_models.append(current_save_dir) + if len(save_models) > keep_checkpoint_max > 0: + model_to_remove = save_models.popleft() + shutil.rmtree(model_to_remove) + + if val_dataset is not None: + if mean_iou > best_mean_iou: + best_mean_iou = mean_iou + best_model_iter = iter + best_model_dir = os.path.join(save_dir, "best_model") + paddle.save( + model.state_dict(), + os.path.join(best_model_dir, 'model.pdparams')) + logger.info( + '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.' + .format(best_mean_iou, best_model_iter)) + + if use_vdl: + log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter) + log_writer.add_scalar('Evaluate/Acc', acc, iter) + batch_start = time.time() + + # Calculate flops. + if local_rank == 0 and not (precision == 'fp16' and amp_level == 'O2'): + _, c, h, w = images.shape + _ = paddle.flops( + model, [1, c, h, w], + custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn}) + + # Sleep for half a second to let dataloader release resources. + time.sleep(0.5) + if use_vdl: + log_writer.close() diff --git a/paddleseg/core/val.py b/paddleseg/core/val.py new file mode 100644 index 0000000000000000000000000000000000000000..80a820b6bc1c8e8fa6015c534c4d51f5ca59663f --- /dev/null +++ b/paddleseg/core/val.py @@ -0,0 +1,237 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import numpy as np +import time +import paddle +import paddle.nn.functional as F + +from paddleseg.utils import metrics, TimeAverager, calculate_eta, logger, progbar +from paddleseg.core import infer + +np.set_printoptions(suppress=True) + + +def evaluate(model, + eval_dataset, + aug_eval=False, + scales=1.0, + flip_horizontal=False, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + precision='fp32', + amp_level='O1', + num_workers=0, + print_detail=True, + auc_roc=False): + """ + Launch evalution. + + Args: + model(nn.Layer): A semantic segmentation model. + eval_dataset (paddle.io.Dataset): Used to read and process validation datasets. + aug_eval (bool, optional): Whether to use mulit-scales and flip augment for evaluation. Default: False. + scales (list|float, optional): Scales for augment. It is valid when `aug_eval` is True. Default: 1.0. + flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_eval` is True. Default: True. + flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_eval` is True. Default: False. + is_slide (bool, optional): Whether to evaluate by sliding window. Default: False. + stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. + It should be provided when `is_slide` is True. + crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. + It should be provided when `is_slide` is True. + precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the evaluation is normal. + amp_level (str, optional): Auto mixed precision level. Accepted values are “O1” and “O2”: O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don’t support fp16 kernel and batchnorm. Default is O1(amp) + num_workers (int, optional): Num workers for data loader. Default: 0. + print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True. + auc_roc(bool, optional): whether add auc_roc metric + + Returns: + float: The mIoU of validation datasets. + float: The accuracy of validation datasets. + """ + model.eval() + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + if nranks > 1: + # Initialize parallel environment if not done. + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized( + ): + paddle.distributed.init_parallel_env() + batch_sampler = paddle.io.DistributedBatchSampler( + eval_dataset, batch_size=1, shuffle=False, drop_last=False) + loader = paddle.io.DataLoader( + eval_dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + return_list=True, ) + + total_iters = len(loader) + intersect_area_all = paddle.zeros([1], dtype='int64') + pred_area_all = paddle.zeros([1], dtype='int64') + label_area_all = paddle.zeros([1], dtype='int64') + logits_all = None + label_all = None + + if print_detail: + logger.info("Start evaluating (total_samples: {}, total_iters: {})...". 
+ format(len(eval_dataset), total_iters)) + #TODO(chenguowei): fix log print error with multi-gpus + progbar_val = progbar.Progbar( + target=total_iters, verbose=1 if nranks < 2 else 2) + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + batch_start = time.time() + with paddle.no_grad(): + for iter, data in enumerate(loader): + reader_cost_averager.record(time.time() - batch_start) + label = data['label'].astype('int64') + + if aug_eval: + if precision == 'fp16': + with paddle.amp.auto_cast( + level=amp_level, + enable=True, + custom_white_list={ + "elementwise_add", "batch_norm", + "sync_batch_norm" + }, + custom_black_list={'bilinear_interp_v2'}): + pred, logits = infer.aug_inference( + model, + data['img'], + trans_info=data['trans_info'], + scales=scales, + flip_horizontal=flip_horizontal, + flip_vertical=flip_vertical, + is_slide=is_slide, + stride=stride, + crop_size=crop_size) + else: + pred, logits = infer.aug_inference( + model, + data['img'], + trans_info=data['trans_info'], + scales=scales, + flip_horizontal=flip_horizontal, + flip_vertical=flip_vertical, + is_slide=is_slide, + stride=stride, + crop_size=crop_size) + else: + if precision == 'fp16': + with paddle.amp.auto_cast( + level=amp_level, + enable=True, + custom_white_list={ + "elementwise_add", "batch_norm", + "sync_batch_norm" + }, + custom_black_list={'bilinear_interp_v2'}): + pred, logits = infer.inference( + model, + data['img'], + trans_info=data['trans_info'], + is_slide=is_slide, + stride=stride, + crop_size=crop_size) + else: + pred, logits = infer.inference( + model, + data['img'], + trans_info=data['trans_info'], + is_slide=is_slide, + stride=stride, + crop_size=crop_size) + + intersect_area, pred_area, label_area = metrics.calculate_area( + pred, + label, + eval_dataset.num_classes, + ignore_index=eval_dataset.ignore_index) + + # Gather from all ranks + if nranks > 1: + intersect_area_list = [] + pred_area_list = [] + label_area_list = [] + paddle.distributed.all_gather(intersect_area_list, + intersect_area) + paddle.distributed.all_gather(pred_area_list, pred_area) + paddle.distributed.all_gather(label_area_list, label_area) + + # Some image has been evaluated and should be eliminated in last iter + if (iter + 1) * nranks > len(eval_dataset): + valid = len(eval_dataset) - iter * nranks + intersect_area_list = intersect_area_list[:valid] + pred_area_list = pred_area_list[:valid] + label_area_list = label_area_list[:valid] + + for i in range(len(intersect_area_list)): + intersect_area_all = intersect_area_all + intersect_area_list[ + i] + pred_area_all = pred_area_all + pred_area_list[i] + label_area_all = label_area_all + label_area_list[i] + else: + intersect_area_all = intersect_area_all + intersect_area + pred_area_all = pred_area_all + pred_area + label_area_all = label_area_all + label_area + + if auc_roc: + logits = F.softmax(logits, axis=1) + if logits_all is None: + logits_all = logits.numpy() + label_all = label.numpy() + else: + logits_all = np.concatenate( + [logits_all, logits.numpy()]) # (KN, C, H, W) + label_all = np.concatenate([label_all, label.numpy()]) + + batch_cost_averager.record( + time.time() - batch_start, num_samples=len(label)) + batch_cost = batch_cost_averager.get_average() + reader_cost = reader_cost_averager.get_average() + + if local_rank == 0 and print_detail: + progbar_val.update(iter + 1, [('batch_cost', batch_cost), + ('reader cost', reader_cost)]) + reader_cost_averager.reset() + batch_cost_averager.reset() + batch_start = time.time() + + 
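# Reduce the intersect/pred/label pixel areas accumulated above into the final metrics. +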
metrics_input = (intersect_area_all, pred_area_all, label_area_all) + class_iou, miou = metrics.mean_iou(*metrics_input) + acc, class_precision, class_recall = metrics.class_measurement( + *metrics_input) + kappa = metrics.kappa(*metrics_input) + class_dice, mdice = metrics.dice(*metrics_input) + + if auc_roc: + auc_roc = metrics.auc_roc( + logits_all, label_all, num_classes=eval_dataset.num_classes) + auc_infor = ' Auc_roc: {:.4f}'.format(auc_roc) + + if print_detail: + infor = "[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} Dice: {:.4f}".format( + len(eval_dataset), miou, acc, kappa, mdice) + infor = infor + auc_infor if auc_roc else infor + logger.info(infor) + logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4))) + logger.info("[EVAL] Class Precision: \n" + str( + np.round(class_precision, 4))) + logger.info("[EVAL] Class Recall: \n" + str(np.round(class_recall, 4))) + return miou, acc, class_iou, class_precision, kappa diff --git a/paddleseg/cvlibs/__init__.py b/paddleseg/cvlibs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5fcb1d6c10201026c0ba01a059c19e3f92bd3d86 --- /dev/null +++ b/paddleseg/cvlibs/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import manager +from . import param_init +from .config import Config diff --git a/paddleseg/cvlibs/callbacks.py b/paddleseg/cvlibs/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..1188b2cdac8cee982248c196672cb2ac9c59b49c --- /dev/null +++ b/paddleseg/cvlibs/callbacks.py @@ -0,0 +1,279 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import numpy as np +import paddle +from paddle.distributed.parallel import ParallelEnv +from visualdl import LogWriter +from paddleseg.utils.progbar import Progbar +import paddleseg.utils.logger as logger + + +class CallbackList(object): + """ + Container abstracting a list of callbacks. + + Args: + callbacks (list[Callback]): List of `Callback` instances. 
+ """ + + def __init__(self, callbacks=None): + callbacks = callbacks or [] + self.callbacks = [c for c in callbacks] + + def append(self, callback): + self.callbacks.append(callback) + + def set_params(self, params): + for callback in self.callbacks: + callback.set_params(params) + + def set_model(self, model): + for callback in self.callbacks: + callback.set_model(model) + + def set_optimizer(self, optimizer): + for callback in self.callbacks: + callback.set_optimizer(optimizer) + + def on_iter_begin(self, iter, logs=None): + """Called right before processing a batch. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_iter_begin(iter, logs) + self._t_enter_iter = time.time() + + def on_iter_end(self, iter, logs=None): + """Called at the end of a batch. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_iter_end(iter, logs) + self._t_exit_iter = time.time() + + def on_train_begin(self, logs=None): + """Called at the beginning of training. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_train_begin(logs) + + def on_train_end(self, logs=None): + """Called at the end of training. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_train_end(logs) + + def __iter__(self): + return iter(self.callbacks) + + +class Callback(object): + """Abstract base class used to build new callbacks. + """ + + def __init__(self): + self.validation_data = None + + def set_params(self, params): + self.params = params + + def set_model(self, model): + self.model = model + + def set_optimizer(self, optimizer): + self.optimizer = optimizer + + def on_iter_begin(self, iter, logs=None): + pass + + def on_iter_end(self, iter, logs=None): + pass + + def on_train_begin(self, logs=None): + pass + + def on_train_end(self, logs=None): + pass + + +class BaseLogger(Callback): + def __init__(self, period=10): + super(BaseLogger, self).__init__() + self.period = period + + def _reset(self): + self.totals = {} + + def on_train_begin(self, logs=None): + self.totals = {} + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + #(iter - 1) // iters_per_epoch + 1 + for k, v in logs.items(): + if k in self.totals.keys(): + self.totals[k] += v + else: + self.totals[k] = v + + if iter % self.period == 0 and ParallelEnv().local_rank == 0: + + for k in self.totals: + logs[k] = self.totals[k] / self.period + self._reset() + + +class TrainLogger(Callback): + def __init__(self, log_freq=10): + self.log_freq = log_freq + + def _calculate_eta(self, remaining_iters, speed): + if remaining_iters < 0: + remaining_iters = 0 + remaining_time = int(remaining_iters * speed) + result = "{:0>2}:{:0>2}:{:0>2}" + arr = [] + for i in range(2, -1, -1): + arr.append(int(remaining_time / 60**i)) + remaining_time %= 60**i + return result.format(*arr) + + def on_iter_end(self, iter, logs=None): + + if iter % self.log_freq == 0 and ParallelEnv().local_rank == 0: + total_iters = self.params["total_iters"] + iters_per_epoch = self.params["iters_per_epoch"] + remaining_iters = total_iters - iter + eta = self._calculate_eta(remaining_iters, logs["batch_cost"]) + current_epoch = (iter - 1) // self.params["iters_per_epoch"] + 1 + loss = logs["loss"] + lr = self.optimizer.get_lr() + batch_cost = logs["batch_cost"] + reader_cost = logs["reader_cost"] + + logger.info( + "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}" + .format(current_epoch, iter, total_iters, loss, lr, batch_cost, + reader_cost, eta)) + + +class 
ProgbarLogger(Callback): + def __init__(self): + super(ProgbarLogger, self).__init__() + + def on_train_begin(self, logs=None): + self.verbose = self.params["verbose"] + self.total_iters = self.params["total_iters"] + self.target = self.params["total_iters"] + self.progbar = Progbar(target=self.target, verbose=self.verbose) + self.seen = 0 + self.log_values = [] + + def on_iter_begin(self, iter, logs=None): + #self.seen = 0 + if self.seen < self.target: + self.log_values = [] + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + self.seen += 1 + for k in self.params['metrics']: + if k in logs: + self.log_values.append((k, logs[k])) + + #if self.verbose and self.seen < self.target and ParallelEnv.local_rank == 0: + #print(self.log_values) + if self.seen < self.target: + self.progbar.update(self.seen, self.log_values) + + +class ModelCheckpoint(Callback): + def __init__(self, + save_dir, + monitor="miou", + save_best_only=False, + save_params_only=True, + mode="max", + period=1): + + super(ModelCheckpoint, self).__init__() + self.monitor = monitor + self.save_dir = save_dir + self.save_best_only = save_best_only + self.save_params_only = save_params_only + self.period = period + self.iters_since_last_save = 0 + + if mode == "min": + self.monitor_op = np.less + self.best = np.Inf + elif mode == "max": + self.monitor_op = np.greater + self.best = -np.Inf + else: + raise RuntimeError("`mode` is neither \"min\" nor \"max\"!") + + def on_train_begin(self, logs=None): + self.verbose = self.params["verbose"] + save_dir = self.save_dir + if not os.path.isdir(save_dir): + if os.path.exists(save_dir): + os.remove(save_dir) + os.makedirs(save_dir) + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + self.iters_since_last_save += 1 + current_save_dir = os.path.join(self.save_dir, "iter_{}".format(iter)) + current_save_dir = os.path.abspath(current_save_dir) + #if self.iters_since_last_save % self.period and ParallelEnv().local_rank == 0: + #self.iters_since_last_save = 0 + if iter % self.period == 0 and ParallelEnv().local_rank == 0: + if self.verbose > 0: + print("iter {iter_num}: saving model to {path}".format( + iter_num=iter, path=current_save_dir)) + + paddle.save(self.model.state_dict(), + os.path.join(current_save_dir, 'model.pdparams')) + + if not self.save_params_only: + paddle.save(self.optimizer.state_dict(), + os.path.join(current_save_dir, 'model.pdopt')) + + +class VisualDL(Callback): + def __init__(self, log_dir="./log", freq=1): + super(VisualDL, self).__init__() + self.log_dir = log_dir + self.freq = freq + + def on_train_begin(self, logs=None): + self.writer = LogWriter(self.log_dir) + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + if iter % self.freq == 0 and ParallelEnv().local_rank == 0: + for k, v in logs.items(): + self.writer.add_scalar("Train/{}".format(k), v, iter) + + self.writer.flush() + + def on_train_end(self, logs=None): + self.writer.close() diff --git a/paddleseg/cvlibs/config.py b/paddleseg/cvlibs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..0fbaa032de14cd7c3f38b3834d1ddbef5107e7d2 --- /dev/null +++ b/paddleseg/cvlibs/config.py @@ -0,0 +1,445 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import codecs +import os +from typing import Any, Dict, Generic +import warnings + +import paddle +import yaml + +from paddleseg.cvlibs import manager +from paddleseg.utils import logger + + +class Config(object): + ''' + Training configuration parsing. The only yaml/yml file is supported. + + The following hyper-parameters are available in the config file: + batch_size: The number of samples per gpu. + iters: The total training steps. + train_dataset: A training data config including type/data_root/transforms/mode. + For data type, please refer to paddleseg.datasets. + For specific transforms, please refer to paddleseg.transforms.transforms. + val_dataset: A validation data config including type/data_root/transforms/mode. + optimizer: A optimizer config, but currently PaddleSeg only supports sgd with momentum in config file. + In addition, weight_decay could be set as a regularization. + learning_rate: A learning rate config. If decay is configured, learning _rate value is the starting learning rate, + where only poly decay is supported using the config file. In addition, decay power and end_lr are tuned experimentally. + loss: A loss config. Multi-loss config is available. The loss type order is consistent with the seg model outputs, + where the coef term indicates the weight of corresponding loss. Note that the number of coef must be the same as the number of + model outputs, and there could be only one loss type if using the same loss type among the outputs, otherwise the number of + loss type must be consistent with coef. + model: A model config including type/backbone and model-dependent arguments. + For model type, please refer to paddleseg.models. + For backbone, please refer to paddleseg.models.backbones. + + Args: + path (str) : The path of config file, supports yaml format only. + + Examples: + + from paddleseg.cvlibs.config import Config + + # Create a cfg object with yaml file path. + cfg = Config(yaml_cfg_path) + + # Parsing the argument when its property is used. + train_dataset = cfg.train_dataset + + # the argument of model should be parsed after dataset, + # since the model builder uses some properties in dataset. + model = cfg.model + ... 
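+
+            # Further properties on this class can be read the same way
+            # (a sketch; which sections exist depends on your yaml file):
+            optimizer = cfg.optimizer   # built from the 'optimizer' section
+            losses = cfg.loss           # dict holding loss 'types' and 'coef'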
+ ''' + + def __init__(self, + path: str, + learning_rate: float=None, + batch_size: int=None, + iters: int=None): + if not path: + raise ValueError('Please specify the configuration file path.') + + if not os.path.exists(path): + raise FileNotFoundError('File {} does not exist'.format(path)) + + self._model = None + self._losses = None + if path.endswith('yml') or path.endswith('yaml'): + self.dic = self._parse_from_yaml(path) + else: + raise RuntimeError('Config file should in yaml format!') + + self.update( + learning_rate=learning_rate, batch_size=batch_size, iters=iters) + + def _update_dic(self, dic, base_dic): + """ + Update config from dic based base_dic + """ + base_dic = base_dic.copy() + dic = dic.copy() + + if dic.get('_inherited_', True) == False: + dic.pop('_inherited_') + return dic + + for key, val in dic.items(): + if isinstance(val, dict) and key in base_dic: + base_dic[key] = self._update_dic(val, base_dic[key]) + else: + base_dic[key] = val + dic = base_dic + return dic + + def _parse_from_yaml(self, path: str): + '''Parse a yaml file and build config''' + with codecs.open(path, 'r', 'utf-8') as file: + dic = yaml.load(file, Loader=yaml.FullLoader) + + if '_base_' in dic: + cfg_dir = os.path.dirname(path) + base_path = dic.pop('_base_') + base_path = os.path.join(cfg_dir, base_path) + base_dic = self._parse_from_yaml(base_path) + dic = self._update_dic(dic, base_dic) + return dic + + def update(self, + learning_rate: float=None, + batch_size: int=None, + iters: int=None): + '''Update config''' + if learning_rate: + if 'lr_scheduler' in self.dic: + self.dic['lr_scheduler']['learning_rate'] = learning_rate + else: + self.dic['learning_rate']['value'] = learning_rate + + if batch_size: + self.dic['batch_size'] = batch_size + + if iters: + self.dic['iters'] = iters + + @property + def batch_size(self) -> int: + return self.dic.get('batch_size', 1) + + @property + def iters(self) -> int: + iters = self.dic.get('iters') + if not iters: + raise RuntimeError('No iters specified in the configuration file.') + return iters + + @property + def lr_scheduler(self) -> paddle.optimizer.lr.LRScheduler: + if 'lr_scheduler' not in self.dic: + raise RuntimeError( + 'No `lr_scheduler` specified in the configuration file.') + params = self.dic.get('lr_scheduler') + + use_warmup = False + if 'warmup_iters' in params: + use_warmup = True + warmup_iters = params.pop('warmup_iters') + assert 'warmup_start_lr' in params, \ + "When use warmup, please set warmup_start_lr and warmup_iters in lr_scheduler" + warmup_start_lr = params.pop('warmup_start_lr') + end_lr = params['learning_rate'] + + lr_type = params.pop('type') + if lr_type == 'PolynomialDecay': + iters = self.iters - warmup_iters if use_warmup else self.iters + iters = max(iters, 1) + params.setdefault('decay_steps', iters) + params.setdefault('end_lr', 0) + params.setdefault('power', 0.9) + lr_sche = getattr(paddle.optimizer.lr, lr_type)(**params) + + if use_warmup: + lr_sche = paddle.optimizer.lr.LinearWarmup( + learning_rate=lr_sche, + warmup_steps=warmup_iters, + start_lr=warmup_start_lr, + end_lr=end_lr) + + return lr_sche + + @property + def learning_rate(self) -> paddle.optimizer.lr.LRScheduler: + logger.warning( + '''`learning_rate` in configuration file will be deprecated, please use `lr_scheduler` instead. 
E.g + lr_scheduler: + type: PolynomialDecay + learning_rate: 0.01''') + + _learning_rate = self.dic.get('learning_rate', {}) + if isinstance(_learning_rate, float): + return _learning_rate + + _learning_rate = self.dic.get('learning_rate', {}).get('value') + if not _learning_rate: + raise RuntimeError( + 'No learning rate specified in the configuration file.') + + args = self.decay_args + decay_type = args.pop('type') + + if decay_type == 'poly': + lr = _learning_rate + return paddle.optimizer.lr.PolynomialDecay(lr, **args) + elif decay_type == 'piecewise': + values = _learning_rate + return paddle.optimizer.lr.PiecewiseDecay(values=values, **args) + elif decay_type == 'stepdecay': + lr = _learning_rate + return paddle.optimizer.lr.StepDecay(lr, **args) + else: + raise RuntimeError('Only poly and piecewise decay support.') + + @property + def optimizer(self) -> paddle.optimizer.Optimizer: + if 'lr_scheduler' in self.dic: + lr = self.lr_scheduler + else: + lr = self.learning_rate + args = self.optimizer_args + optimizer_type = args.pop('type') + + if optimizer_type == 'sgd': + return paddle.optimizer.Momentum( + lr, parameters=self.model.parameters(), **args) + elif optimizer_type == 'adam': + return paddle.optimizer.Adam( + lr, parameters=self.model.parameters(), **args) + elif optimizer_type in paddle.optimizer.__all__: + return getattr(paddle.optimizer, + optimizer_type)(lr, + parameters=self.model.parameters(), + **args) + + raise RuntimeError('Unknown optimizer type {}.'.format(optimizer_type)) + + @property + def optimizer_args(self) -> dict: + args = self.dic.get('optimizer', {}).copy() + if args['type'] == 'sgd': + args.setdefault('momentum', 0.9) + + return args + + @property + def decay_args(self) -> dict: + args = self.dic.get('learning_rate', {}).get( + 'decay', {'type': 'poly', + 'power': 0.9}).copy() + + if args['type'] == 'poly': + args.setdefault('decay_steps', self.iters) + args.setdefault('end_lr', 0) + + return args + + @property + def loss(self) -> dict: + if self._losses is None: + self._losses = self._prepare_loss('loss') + return self._losses + + @property + def distill_loss(self) -> dict: + if not hasattr(self, '_distill_losses'): + self._distill_losses = self._prepare_loss('distill_loss') + return self._distill_losses + + def _prepare_loss(self, loss_name): + """ + Parse the loss parameters and load the loss layers. + + Args: + loss_name (str): The root name of loss in the yaml file. + Returns: + dict: A dict including the loss parameters and layers. + """ + args = self.dic.get(loss_name, {}).copy() + if 'types' in args and 'coef' in args: + len_types = len(args['types']) + len_coef = len(args['coef']) + if len_types != len_coef: + if len_types == 1: + args['types'] = args['types'] * len_coef + else: + raise ValueError( + 'The length of types should equal to coef or equal to 1 in loss config, but they are {} and {}.' + .format(len_types, len_coef)) + else: + raise ValueError( + 'Loss config should contain keys of "types" and "coef"') + + losses = dict() + for key, val in args.items(): + if key == 'types': + losses['types'] = [] + for item in args['types']: + if item['type'] != 'MixedLoss': + if 'ignore_index' in item: + assert item['ignore_index'] == self.train_dataset.ignore_index, 'If ignore_index of loss is set, '\ + 'the ignore_index of loss and train_dataset must be the same. \nCurrently, loss ignore_index = {}, '\ + 'train_dataset ignore_index = {}. 
\nIt is recommended not to set loss ignore_index, so it is consistent with '\ + 'train_dataset by default.'.format(item['ignore_index'], self.train_dataset.ignore_index) + item['ignore_index'] = \ + self.train_dataset.ignore_index + losses['types'].append(self._load_object(item)) + else: + losses[key] = val + if len(losses['coef']) != len(losses['types']): + raise RuntimeError( + 'The length of coef should equal to types in loss config: {} != {}.' + .format(len(losses['coef']), len(losses['types']))) + return losses + + @property + def model(self) -> paddle.nn.Layer: + model_cfg = self.dic.get('model').copy() + if not model_cfg: + raise RuntimeError('No model specified in the configuration file.') + + if not 'num_classes' in model_cfg: + num_classes = None + try: + if self.train_dataset_config: + if hasattr(self.train_dataset_class, 'NUM_CLASSES'): + num_classes = self.train_dataset_class.NUM_CLASSES + elif 'num_classes' in self.train_dataset_config: + num_classes = self.train_dataset_config['num_classes'] + elif hasattr(self.train_dataset, 'num_classes'): + num_classes = self.train_dataset.num_classes + elif self.val_dataset_config: + if hasattr(self.val_dataset_class, 'NUM_CLASSES'): + num_classes = self.val_dataset_class.NUM_CLASSES + elif 'num_classes' in self.val_dataset_config: + num_classes = self.val_dataset_config['num_classes'] + elif hasattr(self.val_dataset, 'num_classes'): + num_classes = self.val_dataset.num_classes + except FileNotFoundError: + warnings.warn("`dataset_root` is not found. Is it correct?") + + if num_classes is not None: + model_cfg['num_classes'] = num_classes + + if not self._model: + self._model = self._load_object(model_cfg) + return self._model + + @property + def train_dataset_config(self) -> Dict: + return self.dic.get('train_dataset', {}).copy() + + @property + def val_dataset_config(self) -> Dict: + return self.dic.get('val_dataset', {}).copy() + + @property + def train_dataset_class(self) -> Generic: + dataset_type = self.train_dataset_config['type'] + return self._load_component(dataset_type) + + @property + def val_dataset_class(self) -> Generic: + dataset_type = self.val_dataset_config['type'] + return self._load_component(dataset_type) + + @property + def train_dataset(self) -> paddle.io.Dataset: + _train_dataset = self.train_dataset_config + if not _train_dataset: + return None + return self._load_object(_train_dataset) + + @property + def val_dataset(self) -> paddle.io.Dataset: + _val_dataset = self.val_dataset_config + if not _val_dataset: + return None + return self._load_object(_val_dataset) + + def _load_component(self, com_name: str) -> Any: + com_list = [ + manager.MODELS, manager.BACKBONES, manager.DATASETS, + manager.TRANSFORMS, manager.LOSSES + ] + + for com in com_list: + if com_name in com.components_dict: + return com[com_name] + else: + raise RuntimeError( + 'The specified component was not found {}.'.format(com_name)) + + def _load_object(self, cfg: dict) -> Any: + cfg = cfg.copy() + if 'type' not in cfg: + raise RuntimeError('No object information in {}.'.format(cfg)) + + component = self._load_component(cfg.pop('type')) + + params = {} + for key, val in cfg.items(): + if self._is_meta_type(val): + params[key] = self._load_object(val) + elif isinstance(val, list): + params[key] = [ + self._load_object(item) + if self._is_meta_type(item) else item for item in val + ] + else: + params[key] = val + + return component(**params) + + @property + def test_config(self) -> Dict: + return self.dic.get('test_config', {}) + + @property + def 
export_config(self) -> Dict:
+        return self.dic.get('export', {})
+
+    @property
+    def to_static_training(self) -> bool:
+        '''Whether to use @to_static for training'''
+        return self.dic.get('to_static_training', False)
+
+    def _is_meta_type(self, item: Any) -> bool:
+        return isinstance(item, dict) and 'type' in item
+
+    def __str__(self) -> str:
+        return yaml.dump(self.dic)
+
+    @property
+    def val_transforms(self) -> list:
+        """Get val_transforms from val_dataset"""
+        _val_dataset = self.val_dataset_config
+        if not _val_dataset:
+            return []
+        _transforms = _val_dataset.get('transforms', [])
+        transforms = []
+        for i in _transforms:
+            transforms.append(self._load_object(i))
+        return transforms
diff --git a/paddleseg/cvlibs/manager.py b/paddleseg/cvlibs/manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..8437445ac08229ec443c8d907c29b07025831d8b
--- /dev/null
+++ b/paddleseg/cvlibs/manager.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from collections.abc import Sequence
+
+import warnings
+
+
+class ComponentManager:
+    """
+    Implement a manager class to add new components properly.
+    A component can be added as either a class or a function.
+
+    Args:
+        name (str): The name of the component manager.
+
+    Returns:
+        A callable object of ComponentManager.
+
+    Examples 1:
+
+        from paddleseg.cvlibs.manager import ComponentManager
+
+        model_manager = ComponentManager()
+
+        class AlexNet: ...
+        class ResNet: ...
+
+        model_manager.add_component(AlexNet)
+        model_manager.add_component(ResNet)
+
+        # Or pass them as a sequence all at once:
+        model_manager.add_component([AlexNet, ResNet])
+        print(model_manager.components_dict)
+        # {'AlexNet': <class 'AlexNet'>, 'ResNet': <class 'ResNet'>}
+
+    Examples 2:
+
+        # Or, more conveniently, use it as a Python decorator by adding it
+        # above the class declaration.
+        from paddleseg.cvlibs.manager import ComponentManager
+
+        model_manager = ComponentManager()
+
+        @model_manager.add_component
+        class AlexNet: ...
+
+        @model_manager.add_component
+        class ResNet: ...
+
+        print(model_manager.components_dict)
+        # {'AlexNet': <class 'AlexNet'>, 'ResNet': <class 'ResNet'>}
+    """
+
+    def __init__(self, name=None):
+        self._components_dict = dict()
+        self._name = name
+
+    def __len__(self):
+        return len(self._components_dict)
+
+    def __repr__(self):
+        name_str = self._name if self._name else self.__class__.__name__
+        return "{}:{}".format(name_str, list(self._components_dict.keys()))
+
+    def __getitem__(self, item):
+        if item not in self._components_dict.keys():
+            raise KeyError("{} does not exist in available {}".format(item,
+                                                                      self))
+        return self._components_dict[item]
+
+    @property
+    def components_dict(self):
+        return self._components_dict
+
+    @property
+    def name(self):
+        return self._name
+
+    def _add_single_component(self, component):
+        """
+        Add a single component into the corresponding manager.
+
+        Args:
+            component (function|class): A new component.
+ + Raises: + TypeError: When `component` is neither class nor function. + KeyError: When `component` was added already. + """ + + # Currently only support class or function type + if not (inspect.isclass(component) or inspect.isfunction(component)): + raise TypeError("Expect class/function type, but received {}". + format(type(component))) + + # Obtain the internal name of the component + component_name = component.__name__ + + # Check whether the component was added already + if component_name in self._components_dict.keys(): + warnings.warn("{} exists already! It is now updated to {} !!!". + format(component_name, component)) + self._components_dict[component_name] = component + + else: + # Take the internal name of the component as its key + self._components_dict[component_name] = component + + def add_component(self, components): + """ + Add component(s) into the corresponding manager. + + Args: + components (function|class|list|tuple): Support four types of components. + + Returns: + components (function|class|list|tuple): Same with input components. + """ + + # Check whether the type is a sequence + if isinstance(components, Sequence): + for component in components: + self._add_single_component(component) + else: + component = components + self._add_single_component(component) + + return components + + +MODELS = ComponentManager("models") +BACKBONES = ComponentManager("backbones") +DATASETS = ComponentManager("datasets") +TRANSFORMS = ComponentManager("transforms") +LOSSES = ComponentManager("losses") diff --git a/paddleseg/cvlibs/param_init.py b/paddleseg/cvlibs/param_init.py new file mode 100644 index 0000000000000000000000000000000000000000..bf6195e75d5d8270537398613f73a0a38d18da71 --- /dev/null +++ b/paddleseg/cvlibs/param_init.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn + + +def constant_init(param, **kwargs): + """ + Initialize the `param` with constants. + + Args: + param (Tensor): Tensor that needs to be initialized. + + Examples: + + from paddleseg.cvlibs import param_init + import paddle.nn as nn + + linear = nn.Linear(2, 4) + param_init.constant_init(linear.weight, value=2.0) + print(linear.weight.numpy()) + # result is [[2. 2. 2. 2.], [2. 2. 2. 2.]] + + """ + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + + +def normal_init(param, **kwargs): + """ + Initialize the `param` with a Normal distribution. + + Args: + param (Tensor): Tensor that needs to be initialized. + + Examples: + + from paddleseg.cvlibs import param_init + import paddle.nn as nn + + linear = nn.Linear(2, 4) + param_init.normal_init(linear.weight, loc=0.0, scale=1.0) + + """ + initializer = nn.initializer.Normal(**kwargs) + initializer(param, param.block) + + +def kaiming_normal_init(param, **kwargs): + r""" + Initialize the input tensor with Kaiming Normal initialization. 
+ + This function implements the `param` initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification ` + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. In case of Uniform distribution, the range is [-x, x], where + .. math:: + x = \sqrt{\\frac{6.0}{fan\_in}} + In case of Normal distribution, the mean is 0 and the standard deviation + is + .. math:: + \sqrt{\\frac{2.0}{fan\_in}} + + Args: + param (Tensor): Tensor that needs to be initialized. + + Examples: + + from paddleseg.cvlibs import param_init + import paddle.nn as nn + + linear = nn.Linear(2, 4) + # uniform is used to decide whether to use uniform or normal distribution + param_init.kaiming_normal_init(linear.weight) + + """ + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + + +def kaiming_uniform(param, **kwargs): + r"""Implements the Kaiming Uniform initializer + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. + + In case of Uniform distribution, the range is [-x, x], where + .. math:: + x = \sqrt{\\frac{6.0}{fan\_in}} + + Args: + param (Tensor): Tensor that needs to be initialized. + + Examples: + + from paddleseg.cvlibs import param_init + import paddle.nn as nn + + linear = nn.Linear(2, 4) + param_init.kaiming_uniform(linear.weight) + """ + + initializer = nn.initializer.KaimingUniform(**kwargs) + initializer(param, param.block) + + +def xavier_uniform(param, **kwargs): + r""" + This implements the Xavier weight initializer from the paper + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio. + This initializer is designed to keep the scale of the gradients + approximately same in all the layers. In case of Uniform distribution, + the range is [-x, x], where + .. math:: + x = \sqrt{\frac{6.0}{fan\_in + fan\_out}} + Args: + param (Tensor): Tensor that needs to be initialized. + + Examples: + + from paddleseg.cvlibs import param_init + import paddle.nn as nn + + linear = nn.Linear(2, 4) + param_init.xavier_uniform(linear.weight) + """ + initializer = nn.initializer.XavierUniform(**kwargs) + initializer(param, param.block) \ No newline at end of file diff --git a/paddleseg/datasets/__init__.py b/paddleseg/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad526850c46a7410abb89021ca3a7abd68ce1b48 --- /dev/null +++ b/paddleseg/datasets/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
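+
+# Importing each dataset module below triggers its @manager.DATASETS.add_component
+# decorator, which registers the class under its own name (e.g. 'Cityscapes') so
+# yaml configs can reference it through the 'type' field.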
+ +from .dataset import Dataset +from .cityscapes import Cityscapes +from .voc import PascalVOC +from .ade import ADE20K +from .optic_disc_seg import OpticDiscSeg +from .pascal_context import PascalContext +from .mini_deep_globe_road_extraction import MiniDeepGlobeRoadExtraction +from .eg1800 import EG1800 +from .supervisely import SUPERVISELY +from .cocostuff import CocoStuff +from .stare import STARE +from .drive import DRIVE +from .hrf import HRF +from .chase_db1 import CHASEDB1 +from .pp_humanseg14k import PPHumanSeg14K +from .pssl import PSSLDataset diff --git a/paddleseg/datasets/ade.py b/paddleseg/datasets/ade.py new file mode 100644 index 0000000000000000000000000000000000000000..8a9a2e9324546807fbba69c3760d095076b16eab --- /dev/null +++ b/paddleseg/datasets/ade.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +from PIL import Image + +from paddleseg.datasets import Dataset +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose +import paddleseg.transforms.functional as F + +URL = "http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip" + + +@manager.DATASETS.add_component +class ADE20K(Dataset): + """ + ADE20K dataset `http://sceneparsing.csail.mit.edu/`. + + Args: + transforms (list): A list of image transformations. + dataset_root (str, optional): The ADK20K dataset directory. Default: None. + mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False + """ + NUM_CLASSES = 150 + + def __init__(self, transforms, dataset_root=None, mode='train', edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val']: + raise ValueError( + "`mode` should be one of ('train', 'val') in ADE20K dataset, but got {}." 
+ .format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME, + extraname='ADEChallengeData2016') + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + img_dir = os.path.join(self.dataset_root, 'images/training') + label_dir = os.path.join(self.dataset_root, 'annotations/training') + elif mode == 'val': + img_dir = os.path.join(self.dataset_root, 'images/validation') + label_dir = os.path.join(self.dataset_root, + 'annotations/validation') + img_files = os.listdir(img_dir) + label_files = [i.replace('.jpg', '.png') for i in img_files] + for i in range(len(img_files)): + img_path = os.path.join(img_dir, img_files[i]) + label_path = os.path.join(label_dir, label_files[i]) + self.file_list.append([img_path, label_path]) + + def __getitem__(self, idx): + data = {} + data['trans_info'] = [] + image_path, label_path = self.file_list[idx] + data['img'] = image_path + data['gt_fields'] = [ + ] # If key in gt_fields, the data[key] have transforms synchronous. + + if self.mode == 'val': + data = self.transforms(data) + label = np.asarray(Image.open(label_path)) + # The class 0 is ignored. And it will equal to 255 after + # subtracted 1, because the dtype of label is uint8. + label = label - 1 + label = label[np.newaxis, :, :] + data['label'] = label + return data + else: + data['label'] = label_path + data['gt_fields'].append('label') + data = self.transforms(data) + data['label'] = data['label'] - 1 + # Recover the ignore pixels adding by transform + data['label'][data['label'] == 254] = 255 + if self.edge: + edge_mask = F.mask_to_binary_edge( + label, radius=2, num_classes=self.num_classes) + data['edge'] = edge_mask + return data diff --git a/paddleseg/datasets/chase_db1.py b/paddleseg/datasets/chase_db1.py new file mode 100644 index 0000000000000000000000000000000000000000..1b2538025b79cad71eefa278d4b854bc1afd70c4 --- /dev/null +++ b/paddleseg/datasets/chase_db1.py @@ -0,0 +1,98 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose +from paddleseg.datasets import Dataset + +URL = 'https://bj.bcebos.com/paddleseg/dataset/chase_db1/chase_db1.zip' + + +@manager.DATASETS.add_component +class CHASEDB1(Dataset): + """ + CHASE_DB1 dataset is a dataset for retinal vessel segmentation + which contains 28 color retina images with the size of 999×960 pixels. 
+ It is collected from both left and right eyes of 14 school children. + Each image is annotated by two independent human experts, and we choose the labels from 1st expert. + (https://blogs.kingston.ac.uk/retinal/chasedb1/) + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + edge (bool): whether extract edge infor in the output + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + edge=False, + mode='train'): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.edge = edge + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 # labels only have 1/0, thus ignore_index is not necessary + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddleseg/datasets/cityscapes.py b/paddleseg/datasets/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..564926e3841af182516dbbd24ad3e44920fe5768 --- /dev/null +++ b/paddleseg/datasets/cityscapes.py @@ -0,0 +1,88 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import glob + +from paddleseg.datasets import Dataset +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose + + +@manager.DATASETS.add_component +class Cityscapes(Dataset): + """ + Cityscapes dataset `https://www.cityscapes-dataset.com/`. 
+    The folder structure is as follows:
+
+        cityscapes
+        |
+        |--leftImg8bit
+        |  |--train
+        |  |--val
+        |  |--test
+        |
+        |--gtFine
+        |  |--train
+        |  |--val
+        |  |--test
+
+    Make sure there are *_labelTrainIds.png files in the gtFine directory.
+    If not, please run tools/convert_cityscapes.py first.
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str): Cityscapes dataset directory.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+    """
+    NUM_CLASSES = 19
+
+    def __init__(self, transforms, dataset_root, mode='train', edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.file_list = list()
+        mode = mode.lower()
+        self.mode = mode
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "mode should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        img_dir = os.path.join(self.dataset_root, 'leftImg8bit')
+        label_dir = os.path.join(self.dataset_root, 'gtFine')
+        if self.dataset_root is None or not os.path.isdir(
+                self.dataset_root) or not os.path.isdir(
+                    img_dir) or not os.path.isdir(label_dir):
+            raise ValueError(
+                "The dataset is not found or the folder structure is nonconformant."
+            )
+
+        label_files = sorted(
+            glob.glob(
+                os.path.join(label_dir, mode, '*',
+                             '*_gtFine_labelTrainIds.png')))
+        img_files = sorted(
+            glob.glob(os.path.join(img_dir, mode, '*', '*_leftImg8bit.png')))
+
+        self.file_list = [
+            [img_path, label_path]
+            for img_path, label_path in zip(img_files, label_files)
+        ]
diff --git a/paddleseg/datasets/cocostuff.py b/paddleseg/datasets/cocostuff.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae66461d5efbb44c4bb84390537792e1800204cf
--- /dev/null
+++ b/paddleseg/datasets/cocostuff.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import glob
+
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class CocoStuff(Dataset):
+    """
+    COCO-Stuff dataset `https://github.com/nightrome/cocostuff`.
+    The folder structure is as follows:
+
+        cocostuff
+        |
+        |--images
+        |  |--train2017
+        |  |--val2017
+        |
+        |--annotations
+        |  |--train2017
+        |  |--val2017
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str): COCO-Stuff dataset directory.
+        mode (str): Which part of dataset to use. It is one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training.
Default: False + """ + NUM_CLASSES = 171 + + def __init__(self, transforms, dataset_root, mode='train', edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val']: + raise ValueError( + "mode should be 'train', 'val', but got {}.".format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + img_dir = os.path.join(self.dataset_root, 'images') + label_dir = os.path.join(self.dataset_root, 'annotations') + if self.dataset_root is None or not os.path.isdir( + self.dataset_root) or not os.path.isdir( + img_dir) or not os.path.isdir(label_dir): + raise ValueError( + "The dataset is not Found or the folder structure is nonconfoumance." + ) + + label_files = sorted( + glob.glob(os.path.join(label_dir, mode + '2017', '*.png'))) + + img_files = sorted( + glob.glob(os.path.join(img_dir, mode + '2017', '*.jpg'))) + + self.file_list = [ + [img_path, label_path] + for img_path, label_path in zip(img_files, label_files) + ] diff --git a/paddleseg/datasets/dataset.py b/paddleseg/datasets/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ca797f2fabb5e4f32e11e245600582c690d34eee --- /dev/null +++ b/paddleseg/datasets/dataset.py @@ -0,0 +1,163 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import numpy as np +from PIL import Image + +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose +import paddleseg.transforms.functional as F + + +@manager.DATASETS.add_component +class Dataset(paddle.io.Dataset): + """ + Pass in a custom dataset that conforms to the format. + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. + num_classes (int): Number of classes. + mode (str, optional): which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + train_path (str, optional): The train dataset file. When mode is 'train', train_path is necessary. + The contents of train_path file are as follow: + image1.jpg ground_truth1.png + image2.jpg ground_truth2.png + val_path (str. optional): The evaluation dataset file. When mode is 'val', val_path is necessary. + The contents is the same as train_path + test_path (str, optional): The test dataset file. When mode is 'test', test_path is necessary. + The annotation file is not necessary in test_path file. + separator (str, optional): The separator of dataset list. Default: ' '. + edge (bool, optional): Whether to compute edge while training. 
Default: False + + Examples: + + import paddleseg.transforms as T + from paddleseg.datasets import Dataset + + transforms = [T.RandomPaddingCrop(crop_size=(512,512)), T.Normalize()] + dataset_root = 'dataset_root_path' + train_path = 'train_path' + num_classes = 2 + dataset = Dataset(transforms = transforms, + dataset_root = dataset_root, + num_classes = 2, + train_path = train_path, + mode = 'train') + + """ + + def __init__(self, + transforms, + dataset_root, + num_classes, + mode='train', + train_path=None, + val_path=None, + test_path=None, + separator=' ', + ignore_index=255, + edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + self.mode = mode.lower() + self.num_classes = num_classes + self.ignore_index = ignore_index + self.edge = edge + + if self.mode not in ['train', 'val', 'test']: + raise ValueError( + "mode should be 'train', 'val' or 'test', but got {}.".format( + self.mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if not os.path.exists(self.dataset_root): + raise FileNotFoundError('there is not `dataset_root`: {}.'.format( + self.dataset_root)) + + if self.mode == 'train': + if train_path is None: + raise ValueError( + 'When `mode` is "train", `train_path` is necessary, but it is None.' + ) + elif not os.path.exists(train_path): + raise FileNotFoundError('`train_path` is not found: {}'.format( + train_path)) + else: + file_path = train_path + elif self.mode == 'val': + if val_path is None: + raise ValueError( + 'When `mode` is "val", `val_path` is necessary, but it is None.' + ) + elif not os.path.exists(val_path): + raise FileNotFoundError('`val_path` is not found: {}'.format( + val_path)) + else: + file_path = val_path + else: + if test_path is None: + raise ValueError( + 'When `mode` is "test", `test_path` is necessary, but it is None.' + ) + elif not os.path.exists(test_path): + raise FileNotFoundError('`test_path` is not found: {}'.format( + test_path)) + else: + file_path = test_path + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split(separator) + if len(items) != 2: + if self.mode == 'train' or self.mode == 'val': + raise ValueError( + "File list format incorrect! In training or evaluation task it should be" + " image_name{}label_name\\n".format(separator)) + image_path = os.path.join(self.dataset_root, items[0]) + label_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + label_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, label_path]) + + def __getitem__(self, idx): + data = {} + data['trans_info'] = [] + image_path, label_path = self.file_list[idx] + data['img'] = image_path + data['label'] = label_path + # If key in gt_fields, the data[key] have transforms synchronous. 
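+        # For example, once 'label' is appended to gt_fields (train mode below),
+        # spatial transforms are applied to the label in sync with the image.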
+        data['gt_fields'] = []
+        if self.mode == 'val':
+            data = self.transforms(data)
+            data['label'] = data['label'][np.newaxis, :, :]
+
+        else:
+            data['gt_fields'].append('label')
+            data = self.transforms(data)
+            if self.edge:
+                edge_mask = F.mask_to_binary_edge(
+                    data['label'], radius=2, num_classes=self.num_classes)
+                data['edge'] = edge_mask
+        return data
+
+    def __len__(self):
+        return len(self.file_list)
diff --git a/paddleseg/datasets/drive.py b/paddleseg/datasets/drive.py
new file mode 100644
index 0000000000000000000000000000000000000000..8984aa005b6aea4981dd536edbfc5bfd2d1e0ef6
--- /dev/null
+++ b/paddleseg/datasets/drive.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset
+
+URL = 'https://bj.bcebos.com/paddleseg/dataset/drive/drive.zip'
+
+
+@manager.DATASETS.add_component
+class DRIVE(Dataset):
+    """
+    The Digital Retinal Images for Vessel Extraction (DRIVE) dataset is a dataset for retinal vessel segmentation.
+    It consists of 40 color fundus JPEG images of size (584, 565), including 7 cases with abnormal pathology.
+    (http://www.isi.uu.nl/Research/Databases/DRIVE/)
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str): The dataset directory. Default: None
+        edge (bool): Whether to extract edge information in the output.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
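+
+        Examples:
+            # A minimal usage sketch; the transform list here is an
+            # illustrative assumption, not a recommendation.
+            import paddleseg.transforms as T
+            from paddleseg.datasets import DRIVE
+
+            # Downloads and unpacks the dataset when dataset_root is None.
+            train_dataset = DRIVE(transforms=[T.Normalize()], mode='train')
+            sample = train_dataset[0]   # dict with 'img' and 'label' entries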
+ """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + edge=False, + mode='train'): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.edge = edge + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 # labels only have 1/0, thus ignore_index is not necessary + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddleseg/datasets/eg1800.py b/paddleseg/datasets/eg1800.py new file mode 100644 index 0000000000000000000000000000000000000000..9005083a92050a325cc946edfbd422e841efaefa --- /dev/null +++ b/paddleseg/datasets/eg1800.py @@ -0,0 +1,137 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy + +import cv2 +import numpy as np + +from paddleseg.datasets import Dataset +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +import paddleseg.transforms.functional as F + +URL = "https://paddleseg.bj.bcebos.com/dataset/EG1800.zip" + + +@manager.DATASETS.add_component +class EG1800(Dataset): + """ + EG1800 dataset `http://xiaoyongshen.me/webpage_portrait/index.html`. + + Args: + common_transforms (list): A list of common image transformations for two inputs of portrait net. + transforms1 (list): A list of image transformations for the first input of portrait net. + transforms2 (list): A list of image transformations for the second input of portrait net. + dataset_root (str, optional): The EG1800 dataset directory. Default: None. 
+ mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False + """ + NUM_CLASSES = 2 + + def __init__(self, + common_transforms, + transforms1, + transforms2, + dataset_root=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.common_transforms = Compose(common_transforms) + self.transforms = self.common_transforms + if transforms1 is not None: + self.transforms1 = Compose(transforms1, to_rgb=False) + if transforms2 is not None: + self.transforms2 = Compose(transforms2, to_rgb=False) + mode = mode.lower() + self.ignore_index = 255 + self.mode = mode + self.num_classes = self.NUM_CLASSES + self.input_width = 224 + self.input_height = 224 + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + path = os.path.join(dataset_root, 'eg1800_train.txt') + else: + path = os.path.join(dataset_root, 'eg1800_test.txt') + with open(path, 'r') as f: + files = f.readlines() + img_files = [ + os.path.join(dataset_root, 'Images', file).strip() for file in files + ] + label_files = [ + os.path.join(dataset_root, 'Labels', file).strip() for file in files + ] + + self.file_list = [ + [img_path, label_path] + for img_path, label_path in zip(img_files, label_files) + ] + pass + + def __getitem__(self, item): + image_path, label_path = self.file_list[item] + im = cv2.imread(image_path) + label = cv2.imread(label_path, 0) + label[label > 1] = 0 + + if self.mode == "val": + common_im, label = self.common_transforms(im=im, label=label) + im = np.float32(common_im[::-1, :, :]) # RGB => BGR + im_aug = copy.deepcopy(im) + else: + common_im, label = self.common_transforms(im=im, label=label) + common_im = np.transpose(common_im, [1, 2, 0]) + # add augmentation + im, _ = self.transforms1(common_im) + im_aug, _ = self.transforms2(common_im) + + im = np.float32(im[::-1, :, :]) # RGB => BGR + im_aug = np.float32(im_aug[::-1, :, :]) # RGB => BGR + + label = cv2.resize( + np.uint8(label), (self.input_width, self.input_height), + interpolation=cv2.INTER_NEAREST) + + # add mask blur + label = np.uint8(cv2.blur(label, (5, 5))) + label[label >= 0.5] = 1 + label[label < 0.5] = 0 + + edge_mask = F.mask_to_binary_edge( + label, radius=4, num_classes=self.num_classes) + edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1) + im = np.concatenate([im_aug, im]) + if self.mode == "train": + return im, label, edge_mask + else: + return im, label diff --git a/paddleseg/datasets/hrf.py b/paddleseg/datasets/hrf.py new file mode 100644 index 0000000000000000000000000000000000000000..fb378a72c351b60d5c2a2e338e9fc54a3b87b28d --- /dev/null +++ b/paddleseg/datasets/hrf.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose +from paddleseg.datasets import Dataset + +URL = 'https://bj.bcebos.com/paddleseg/dataset/hrf/hrf.zip' + + +@manager.DATASETS.add_component +class HRF(Dataset): + """ + The HRF dataset is a dataset for retinal vessel segmentation which comprises 45 images and is organized as 15 subsets. Each subset contains one healthy fundus image, one image of patient with diabetic retinopathy and one glaucoma image. The image sizes are 3,304 x 2,336, with a training/testing image split of 21/24. + (https://doi.org/10.1155/2013/154860) + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + edge (bool): whether extract edge infor in the output + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + edge=False, + mode='train'): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.edge = edge + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddleseg/datasets/mini_deep_globe_road_extraction.py b/paddleseg/datasets/mini_deep_globe_road_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..7180a9dbe50f35a34fcab99a29411f2103a635cd --- /dev/null +++ b/paddleseg/datasets/mini_deep_globe_road_extraction.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .dataset import Dataset +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose + +URL = "https://paddleseg.bj.bcebos.com/dataset/MiniDeepGlobeRoadExtraction.zip" + + +@manager.DATASETS.add_component +class MiniDeepGlobeRoadExtraction(Dataset): + """ + MiniDeepGlobeRoadExtraction dataset is extraced from DeepGlobe CVPR2018 challenge (http://deepglobe.org/) + + There are 800 images in the training set and 200 images in the validation set. + + Args: + dataset_root (str, optional): The dataset directory. Default: None. + transforms (list, optional): Transforms for image. Default: None. + mode (str, optional): Which part of dataset to use. It is one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val']: + raise ValueError( + "`mode` should be 'train' or 'val', but got {}.".format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train.txt') + else: + file_path = os.path.join(self.dataset_root, 'val.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split('|') + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name|label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddleseg/datasets/optic_disc_seg.py b/paddleseg/datasets/optic_disc_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..36332b085e15432f9885087b2a2d9bee5bc2a29f --- /dev/null +++ b/paddleseg/datasets/optic_disc_seg.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .dataset import Dataset +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose + +URL = "https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip" + + +@manager.DATASETS.add_component +class OpticDiscSeg(Dataset): + """ + OpticDiscSeg dataset is extraced from iChallenge-AMD + (https://ai.baidu.com/broad/subordinate?dataset=amd). + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + else: + file_path = os.path.join(self.dataset_root, 'test_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddleseg/datasets/pascal_context.py b/paddleseg/datasets/pascal_context.py new file mode 100644 index 0000000000000000000000000000000000000000..d76ce310d16b8abd9e009e4a91ee9b4c2f819935 --- /dev/null +++ b/paddleseg/datasets/pascal_context.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
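Editor's note: all of these loaders share the same download fallback, shown
here in isolation for OpticDiscSeg. The URL and paths are the ones used in the
code above; the behaviour description is a reading of that code, not new API.

from paddleseg.utils import seg_env
from paddleseg.utils.download import download_file_and_uncompress

# dataset_root=None: fetch and unpack the archive into the paddleseg data home.
root = download_file_and_uncompress(
    url="https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip",
    savepath=seg_env.DATA_HOME,
    extrapath=seg_env.DATA_HOME)
# dataset_root set but missing on disk: the loaders instead download next to
# the requested path, keeping its final folder name via extraname.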
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from PIL import Image
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PascalContext(Dataset):
+    """
+    PascalVOC2010 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`.
+    To use the Pascal Context dataset, first run tools/convert_voc2010.py.
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str, optional): The dataset directory. Default: None.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'trainval', 'val').
+            Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    NUM_CLASSES = 60
+
+    def __init__(self,
+                 transforms=None,
+                 dataset_root=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'trainval', 'val']:
+            raise ValueError(
+                "`mode` should be one of ('train', 'trainval', 'val') in PascalContext dataset, but got {}."
+                .format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+        if self.dataset_root is None:
+            raise ValueError(
+                "The dataset was not found or the folder structure is nonconformant."
+            )
+
+        image_set_dir = os.path.join(self.dataset_root, 'ImageSets',
+                                     'Segmentation')
+
+        if mode == 'train':
+            file_path = os.path.join(image_set_dir, 'train_context.txt')
+        elif mode == 'val':
+            file_path = os.path.join(image_set_dir, 'val_context.txt')
+        elif mode == 'trainval':
+            file_path = os.path.join(image_set_dir, 'trainval_context.txt')
+        if not os.path.exists(file_path):
+            raise RuntimeError(
+                "PASCAL-Context annotations are not ready. "
+                "Please make sure convert_voc2010.py has been properly run.")
+
+        img_dir = os.path.join(self.dataset_root, 'JPEGImages')
+        label_dir = os.path.join(self.dataset_root, 'Context')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                line = line.strip()
+                image_path = os.path.join(img_dir, ''.join([line, '.jpg']))
+                label_path = os.path.join(label_dir, ''.join([line, '.png']))
+                self.file_list.append([image_path, label_path])
diff --git a/paddleseg/datasets/pp_humanseg14k.py b/paddleseg/datasets/pp_humanseg14k.py
new file mode 100644
index 0000000000000000000000000000000000000000..e809611975792ca7d5246df76dbceaebb42c61d4
--- /dev/null
+++ b/paddleseg/datasets/pp_humanseg14k.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
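Editor's note: a usage sketch for PascalContext above. It assumes
tools/convert_voc2010.py has already produced the Context/ annotations; the
dataset_root value is a placeholder, not a path from the diff.

from paddleseg.datasets import PascalContext
from paddleseg.transforms import Resize, Normalize

dataset = PascalContext(
    transforms=[Resize(target_size=(520, 520)), Normalize()],
    dataset_root='data/VOC2010',  # hypothetical location of the converted data
    mode='val')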
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .dataset import Dataset +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose + + +@manager.DATASETS.add_component +class PPHumanSeg14K(Dataset): + """ + This is the PP-HumanSeg14K Dataset. + + This dataset was introduced in the work: + Chu, Lutao, et al. "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset." Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2022. + + This dataset is divided into training set, validation set and test set. The training set includes 8770 pictures, the validation set includes 2431 pictures, and the test set includes 2482 pictures. + + Args: + dataset_root (str, optional): The dataset directory. Default: None. + transforms (list, optional): Transforms for image. Default: None. + mode (str, optional): Which part of dataset to use. It is one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val.txt') + else: + file_path = os.path.join(self.dataset_root, 'test.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split(' ') + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddleseg/datasets/pssl.py b/paddleseg/datasets/pssl.py new file mode 100644 index 0000000000000000000000000000000000000000..6ebe7fb312c2078be910e31ab0a340379ee2f460 --- /dev/null +++ b/paddleseg/datasets/pssl.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
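Editor's note: unlike the loaders above, PPHumanSeg14K has no download
fallback, so dataset_root must point at an already-prepared folder containing
train.txt/val.txt/test.txt. A hedged sketch; the path is a placeholder.

from paddleseg.datasets import PPHumanSeg14K
from paddleseg.transforms import Resize, Normalize

dataset = PPHumanSeg14K(
    dataset_root='data/PP-HumanSeg14K',  # hypothetical location
    transforms=[Resize(target_size=(398, 224)), Normalize()],
    mode='train')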
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PSSLDataset(Dataset):
+    """
+    The PSSL dataset for segmentation. PSSL is short for Pseudo Semantic Segmentation Labels, where the pseudo label
+    is computed by the Consensus explanation algorithm.
+
+    The PSSL refers to "Distilling Ensemble of Explanations for Weakly-Supervised Pre-Training of Image Segmentation
+    Models" (https://arxiv.org/abs/2207.03335).
+
+    The Consensus explanation refers to "Cross-Model Consensus of Explanations and Beyond for Image Classification
+    Models: An Empirical Study" (https://arxiv.org/abs/2109.00707).
+
+    To use this dataset, we additionally need to prepare the original ImageNet dataset, which has the folder structure
+    as follows:
+
+        imagenet_root
+        |
+        |--train
+        |  |--n01440764
+        |  |  |--n01440764_10026.JPEG
+        |  |  |--...
+        |  |--nxxxxxxxx
+        |  |--...
+
+    where only the "train" set is needed.
+
+    The PSSL dataset has the folder structure as follows:
+
+        pssl_root
+        |
+        |--train
+        |  |--n01440764
+        |  |  |--n01440764_10026.JPEG_eiseg.npz
+        |  |  |--...
+        |  |--nxxxxxxxx
+        |  |--...
+        |
+        |--imagenet_lsvrc_2015_synsets.txt
+        |--train.txt
+
+    where "train.txt" and "imagenet_lsvrc_2015_synsets.txt" are included in the PSSL dataset.
+
+    Args:
+        transforms (list): Transforms for image.
+        imagenet_root (str): The path to the original ImageNet dataset.
+        pssl_root (str): The path to the PSSL dataset.
+        mode (str, optional): Which part of dataset to use. Only 'train' is supported. Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    ignore_index = 1001  # 0-999 are target classes, 1000 is background
+    NUM_CLASSES = 1001  # target classes plus background
+
+    def __init__(self,
+                 transforms,
+                 imagenet_root,
+                 pssl_root,
+                 mode='train',
+                 edge=False):
+        mode = mode.lower()
+        if mode not in ['train']:
+            raise ValueError("mode should be 'train', but got {}.".format(mode))
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.transforms = Compose(transforms)
+        self.mode = mode
+        self.edge = edge
+
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = self.num_classes  # 1001
+        self.file_list = []
+        self.class_id_dict = {}
+
+        if imagenet_root is None or not os.path.isdir(pssl_root):
+            raise ValueError(
+                "The dataset was not found or the folder structure is nonconformant."
+            )
+
+        train_list_file = os.path.join(pssl_root, "train.txt")
+        if not os.path.exists(train_list_file):
+            raise ValueError("Train list file does not exist.")
+        for idx, line in enumerate(open(train_list_file)):
+            # line: train/n04118776/n04118776_45912.JPEG_eiseg.npz
+            label_path = line.strip()
+            img_path = label_path.split('.JPEG')[0] + '.JPEG'
+            label_path = os.path.join(pssl_root, label_path)
+            img_path = os.path.join(imagenet_root, img_path)
+            self.file_list.append([img_path, label_path])
+
+        # Map class name to class id.
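+        # Each line of imagenet_lsvrc_2015_synsets.txt is one WordNet synset
+        # id (e.g. n01440764); its zero-based line index becomes the class id,
+        # so labels follow the order defined by that file.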
+        class_id_file = os.path.join(pssl_root,
+                                     "imagenet_lsvrc_2015_synsets.txt")
+        if not os.path.exists(class_id_file):
+            raise ValueError("Class id file does not exist.")
+        for idx, line in enumerate(open(class_id_file)):
+            class_name = line.strip()
+            self.class_id_dict[class_name] = idx
+
+    def __getitem__(self, idx):
+        image_path, label_path = self.file_list[idx]
+
+        # Recover the class id from the image file name prefix
+        # (e.g. n01440764_10026.JPEG -> n01440764).
+        class_name = (image_path.split('/')[-1]).split('_')[0]
+        class_id = self.class_id_dict[class_name]
+
+        pssl_seg = np.load(label_path)['arr_0']
+        gt_semantic_seg = np.zeros_like(pssl_seg, dtype=np.int64) + 1000
+        # [0, 999] for imagenet classes, 1000 for background, others (-1) will be ignored during training.
+        gt_semantic_seg[pssl_seg == 1] = class_id
+
+        im, label = self.transforms(im=image_path, label=gt_semantic_seg)
+
+        return im, label
diff --git a/paddleseg/datasets/stare.py b/paddleseg/datasets/stare.py
new file mode 100644
index 0000000000000000000000000000000000000000..5de8be58c3a9a634172d57b736b91097ba51ea4e
--- /dev/null
+++ b/paddleseg/datasets/stare.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset
+
+URL = 'https://bj.bcebos.com/paddleseg/dataset/stare/stare.zip'
+
+
+@manager.DATASETS.add_component
+class STARE(Dataset):
+    """
+    STARE dataset is processed from the STARE (STructured Analysis of the Retina) project.
+    (https://cecas.clemson.edu/~ahoover/stare/)
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str, optional): The dataset directory. Default: None.
+        edge (bool, optional): Whether to extract edge information in the output. Default: False.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
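+
+    Example (an editor's sketch, not part of the diff; the transform settings
+    are illustrative assumptions):
+
+        from paddleseg.datasets import STARE
+        from paddleseg.transforms import Resize, Normalize
+        dataset = STARE(
+            transforms=[Resize(target_size=(512, 512)), Normalize()],
+            mode='train')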
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 edge=False,
+                 mode='train'):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.edge = edge
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)  # e.g. ('data', 'STARE')
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            # Editor's fix: 'test' passed the mode check above but previously
+            # left file_path unbound; test_list.txt is assumed to ship with
+            # the archive, mirroring the train/val list names.
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    if mode == 'train' or mode == 'val':
+                        raise Exception(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
diff --git a/paddleseg/datasets/supervisely.py b/paddleseg/datasets/supervisely.py
new file mode 100644
index 0000000000000000000000000000000000000000..accfa465afbd11fb5033aac4d22bf1c2ce28ce77
--- /dev/null
+++ b/paddleseg/datasets/supervisely.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import copy
+
+import cv2
+import numpy as np
+
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+import paddleseg.transforms.functional as F
+
+URL = "https://paddleseg.bj.bcebos.com/dataset/Supervisely_face.zip"
+
+
+@manager.DATASETS.add_component
+class SUPERVISELY(Dataset):
+    """
+    Supervise.ly dataset `https://supervise.ly/`.
+
+    Args:
+        common_transforms (list): A list of common image transformations for the two inputs of PortraitNet.
+        transforms1 (list): A list of image transformations for the first input of PortraitNet.
+        transforms2 (list): A list of image transformations for the second input of PortraitNet.
+        dataset_root (str, optional): The Supervise.ly dataset directory. Default: None.
+        mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 common_transforms,
+                 transforms1,
+                 transforms2,
+                 dataset_root=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.common_transforms = Compose(common_transforms)
+        self.transforms = self.common_transforms
+        if transforms1 is not None:
+            self.transforms1 = Compose(transforms1, to_rgb=False)
+        if transforms2 is not None:
+            self.transforms2 = Compose(transforms2, to_rgb=False)
+        mode = mode.lower()
+        self.ignore_index = 255
+        self.mode = mode
+        self.edge = edge
+        self.num_classes = self.NUM_CLASSES
+        self.input_width = 224
+        self.input_height = 224
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        # Editor's fix: the code below used the raw `dataset_root` argument,
+        # which is None when the archive has just been downloaded; use the
+        # resolved self.dataset_root instead.
+        if mode == 'train':
+            path = os.path.join(self.dataset_root,
+                                'supervisely_face_train_easy.txt')
+        else:
+            path = os.path.join(self.dataset_root,
+                                'supervisely_face_test_easy.txt')
+        with open(path, 'r') as f:
+            files = f.readlines()
+        files = ["/".join(file.split('/')[1:]) for file in files]
+        img_files = [
+            os.path.join(self.dataset_root, file).strip() for file in files
+        ]
+        label_files = [
+            os.path.join(self.dataset_root,
+                         file.replace('/img/', '/ann/')).strip()
+            for file in files
+        ]
+
+        self.file_list = [
+            [img_path, label_path]
+            for img_path, label_path in zip(img_files, label_files)
+        ]
+
+    def __getitem__(self, item):
+        image_path, label_path = self.file_list[item]
+        im = cv2.imread(image_path)
+        label = cv2.imread(label_path, 0)
+        label[label > 0] = 1
+
+        if self.mode == "val":
+            common_im, label = self.common_transforms(im=im, label=label)
+            im = np.float32(common_im[::-1, :, :])  # RGB => BGR
+            im_aug = copy.deepcopy(im)
+        else:
+            common_im, label = self.common_transforms(im=im, label=label)
+            common_im = np.transpose(common_im, [1, 2, 0])
+            # add augmentation
+            im, _ = self.transforms1(common_im)
+            im_aug, _ = self.transforms2(common_im)
+
+            im = np.float32(im[::-1, :, :])  # RGB => BGR
+            im_aug = np.float32(im_aug[::-1, :, :])  # RGB => BGR
+
+        label = cv2.resize(
+            np.uint8(label), (self.input_width, self.input_height),
+            interpolation=cv2.INTER_NEAREST)
+
+        # add mask blur
+        label = np.uint8(cv2.blur(label, (5, 5)))
+        label[label >= 0.5] = 1
+        label[label < 0.5] = 0
+
+        edge_mask = F.mask_to_binary_edge(
+            label, radius=4, num_classes=self.num_classes)
+        edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1)
+        im = np.concatenate([im_aug, im])
+        if self.mode == "train":
+            return im, label, edge_mask
+        else:
+            return im, label
diff --git a/paddleseg/datasets/voc.py b/paddleseg/datasets/voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f48ad50418203b87d99ccd1754ef0ec48e20a545
--- /dev/null
+++ b/paddleseg/datasets/voc.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
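Editor's note: in 'train' mode, SUPERVISELY.__getitem__ above returns a
6-channel array (the output of transforms2 stacked on top of the output of
transforms1 along the channel axis), plus the resized label and an edge mask.
A minimal helper for consumers, assuming NumPy arrays in CHW layout; the
function name is mine, not part of the diff.

import numpy as np

def split_portrait_views(im: np.ndarray):
    """Split the stacked (6, H, W) array back into its two 3-channel views."""
    assert im.shape[0] == 6
    return im[:3], im[3:]  # (im_aug, im), both in BGR channel order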
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddleseg.datasets import Dataset +from paddleseg.utils.download import download_file_and_uncompress +from paddleseg.utils import seg_env +from paddleseg.cvlibs import manager +from paddleseg.transforms import Compose + +URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar" + + +@manager.DATASETS.add_component +class PascalVOC(Dataset): + """ + PascalVOC2012 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`. + If you want to augment the dataset, please run the voc_augment.py in tools. + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + mode (str, optional): Which part of dataset to use. it is one of ('train', 'trainval', 'trainaug', 'val'). + If you want to set mode to 'trainaug', please make sure the dataset have been augmented. Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False + """ + NUM_CLASSES = 21 + + def __init__(self, transforms, dataset_root=None, mode='train', edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'trainval', 'trainaug', 'val']: + raise ValueError( + "`mode` should be one of ('train', 'trainval', 'trainaug', 'val') in PascalVOC dataset, but got {}." + .format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME, + extraname='VOCdevkit') + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + image_set_dir = os.path.join(self.dataset_root, 'VOC2012', 'ImageSets', + 'Segmentation') + if mode == 'train': + file_path = os.path.join(image_set_dir, 'train.txt') + elif mode == 'val': + file_path = os.path.join(image_set_dir, 'val.txt') + elif mode == 'trainval': + file_path = os.path.join(image_set_dir, 'trainval.txt') + elif mode == 'trainaug': + file_path = os.path.join(image_set_dir, 'train.txt') + file_path_aug = os.path.join(image_set_dir, 'aug.txt') + + if not os.path.exists(file_path_aug): + raise RuntimeError( + "When `mode` is 'trainaug', Pascal Voc dataset should be augmented, " + "Please make sure voc_augment.py has been properly run when using this mode." 
+ ) + + img_dir = os.path.join(self.dataset_root, 'VOC2012', 'JPEGImages') + label_dir = os.path.join(self.dataset_root, 'VOC2012', + 'SegmentationClass') + label_dir_aug = os.path.join(self.dataset_root, 'VOC2012', + 'SegmentationClassAug') + + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + image_path = os.path.join(img_dir, ''.join([line, '.jpg'])) + label_path = os.path.join(label_dir, ''.join([line, '.png'])) + self.file_list.append([image_path, label_path]) + if mode == 'trainaug': + with open(file_path_aug, 'r') as f: + for line in f: + line = line.strip() + image_path = os.path.join(img_dir, ''.join([line, '.jpg'])) + label_path = os.path.join(label_dir_aug, + ''.join([line, '.png'])) + self.file_list.append([image_path, label_path]) diff --git a/paddleseg/models/ann.py b/paddleseg/models/ann.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2af1e2a7971733ac266d5f5ab0404a7c6847cb --- /dev/null +++ b/paddleseg/models/ann.py @@ -0,0 +1,434 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class ANN(nn.Layer): + """ + The ANN implementation based on PaddlePaddle. + + The original article refers to + Zhen, Zhu, et al. "Asymmetric Non-local Neural Networks for Semantic Segmentation" + (https://arxiv.org/pdf/1908.07678.pdf). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. + key_value_channels (int, optional): The key and value channels of self-attention map in both AFNB and APNB modules. + Default: 256. + inter_channels (int, optional): Both input and output channels of APNB modules. Default: 512. + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
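+
+    Example (an editor's sketch, not part of the diff; the backbone choice
+    and num_classes are illustrative assumptions):
+
+        from paddleseg.models import ANN
+        from paddleseg.models.backbones import ResNet50_vd
+        model = ANN(num_classes=19, backbone=ResNet50_vd())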
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + key_value_channels=256, + inter_channels=512, + psp_size=(1, 3, 6, 8), + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = ANNHead(num_classes, backbone_indices, backbone_channels, + key_value_channels, inter_channels, psp_size, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class ANNHead(nn.Layer): + """ + The ANNHead implementation. + + It mainly consists of AFNB and APNB modules. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + The first index will be taken as low-level features; the second one will be + taken as high-level features in AFNB module. Usually backbone consists of four + downsampling stage, such as ResNet, and return an output of each stage. If it is (2, 3), + it means taking feature map of the third stage and the fourth stage in backbone. + backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index. + key_value_channels (int): The key and value channels of self-attention map in both AFNB and APNB modules. + inter_channels (int): Both input and output channels of APNB modules. + psp_size (tuple): The out size of pooled feature maps. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
+ """ + + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + key_value_channels, + inter_channels, + psp_size, + enable_auxiliary_loss=True): + super().__init__() + + low_in_channels = backbone_channels[0] + high_in_channels = backbone_channels[1] + + self.fusion = AFNB( + low_in_channels=low_in_channels, + high_in_channels=high_in_channels, + out_channels=high_in_channels, + key_channels=key_value_channels, + value_channels=key_value_channels, + dropout_prob=0.05, + repeat_sizes=([1]), + psp_size=psp_size) + + self.context = nn.Sequential( + layers.ConvBNReLU( + in_channels=high_in_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), + APNB( + in_channels=inter_channels, + out_channels=inter_channels, + key_channels=key_value_channels, + value_channels=key_value_channels, + dropout_prob=0.05, + repeat_sizes=([1]), + psp_size=psp_size)) + + self.cls = nn.Conv2D( + in_channels=inter_channels, out_channels=num_classes, kernel_size=1) + self.auxlayer = layers.AuxLayer( + in_channels=low_in_channels, + inter_channels=low_in_channels // 2, + out_channels=num_classes, + dropout_prob=0.05) + + self.backbone_indices = backbone_indices + self.enable_auxiliary_loss = enable_auxiliary_loss + + def forward(self, feat_list): + logit_list = [] + low_level_x = feat_list[self.backbone_indices[0]] + high_level_x = feat_list[self.backbone_indices[1]] + x = self.fusion(low_level_x, high_level_x) + x = self.context(x) + logit = self.cls(x) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + auxiliary_logit = self.auxlayer(low_level_x) + logit_list.append(auxiliary_logit) + + return logit_list + + +class AFNB(nn.Layer): + """ + Asymmetric Fusion Non-local Block. + + Args: + low_in_channels (int): Low-level-feature channels. + high_in_channels (int): High-level-feature channels. + out_channels (int): Out channels of AFNB module. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + dropout_prob (float): The dropout rate of output. + repeat_sizes (tuple, optional): The number of AFNB modules. Default: ([1]). + psp_size (tuple. optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + low_in_channels, + high_in_channels, + out_channels, + key_channels, + value_channels, + dropout_prob, + repeat_sizes=([1]), + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.psp_size = psp_size + self.stages = nn.LayerList([ + SelfAttentionBlock_AFNB(low_in_channels, high_in_channels, + key_channels, value_channels, out_channels, + size) for size in repeat_sizes + ]) + self.conv_bn = layers.ConvBN( + in_channels=out_channels + high_in_channels, + out_channels=out_channels, + kernel_size=1) + self.dropout = nn.Dropout(p=dropout_prob) + + def forward(self, low_feats, high_feats): + priors = [stage(low_feats, high_feats) for stage in self.stages] + context = priors[0] + for i in range(1, len(priors)): + context += priors[i] + + output = self.conv_bn(paddle.concat([context, high_feats], axis=1)) + output = self.dropout(output) + + return output + + +class APNB(nn.Layer): + """ + Asymmetric Pyramid Non-local Block. + + Args: + in_channels (int): The input channels of APNB module. + out_channels (int): Out channels of APNB module. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + dropout_prob (float): The dropout rate of output. 
+ repeat_sizes (tuple, optional): The number of AFNB modules. Default: ([1]). + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + in_channels, + out_channels, + key_channels, + value_channels, + dropout_prob, + repeat_sizes=([1]), + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.psp_size = psp_size + self.stages = nn.LayerList([ + SelfAttentionBlock_APNB(in_channels, out_channels, + key_channels, value_channels, size) + for size in repeat_sizes + ]) + self.conv_bn = layers.ConvBNReLU( + in_channels=in_channels * 2, + out_channels=out_channels, + kernel_size=1) + self.dropout = nn.Dropout(p=dropout_prob) + + def forward(self, x): + priors = [stage(x) for stage in self.stages] + context = priors[0] + for i in range(1, len(priors)): + context += priors[i] + + output = self.conv_bn(paddle.concat([context, x], axis=1)) + output = self.dropout(output) + + return output + + +def _pp_module(x, psp_size): + n, c, h, w = x.shape + priors = [] + for size in psp_size: + feat = F.adaptive_avg_pool2d(x, size) + feat = paddle.reshape(feat, shape=(0, c, -1)) + priors.append(feat) + center = paddle.concat(priors, axis=-1) + return center + + +class SelfAttentionBlock_AFNB(nn.Layer): + """ + Self-Attention Block for AFNB module. + + Args: + low_in_channels (int): Low-level-feature channels. + high_in_channels (int): High-level-feature channels. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + out_channels (int, optional): Out channels of AFNB module. Default: None. + scale (int, optional): Pooling size. Default: 1. + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + low_in_channels, + high_in_channels, + key_channels, + value_channels, + out_channels=None, + scale=1, + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.scale = scale + self.in_channels = low_in_channels + self.out_channels = out_channels + self.key_channels = key_channels + self.value_channels = value_channels + if out_channels == None: + self.out_channels = high_in_channels + self.pool = nn.MaxPool2D(scale) + self.f_key = layers.ConvBNReLU( + in_channels=low_in_channels, + out_channels=key_channels, + kernel_size=1) + self.f_query = layers.ConvBNReLU( + in_channels=high_in_channels, + out_channels=key_channels, + kernel_size=1) + self.f_value = nn.Conv2D( + in_channels=low_in_channels, + out_channels=value_channels, + kernel_size=1) + + self.W = nn.Conv2D( + in_channels=value_channels, + out_channels=out_channels, + kernel_size=1) + + self.psp_size = psp_size + + def forward(self, low_feats, high_feats): + batch_size, _, h, w = high_feats.shape + + value = self.f_value(low_feats) + value = _pp_module(value, self.psp_size) + value = paddle.transpose(value, (0, 2, 1)) + + query = self.f_query(high_feats) + query = paddle.reshape(query, shape=(0, self.key_channels, -1)) + query = paddle.transpose(query, perm=(0, 2, 1)) + + key = self.f_key(low_feats) + key = _pp_module(key, self.psp_size) + + sim_map = paddle.matmul(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + context = paddle.matmul(sim_map, value) + context = paddle.transpose(context, perm=(0, 2, 1)) + hf_shape = paddle.shape(high_feats) + context = paddle.reshape( + context, shape=[0, self.value_channels, hf_shape[2], hf_shape[3]]) + + context = self.W(context) + + return context + + +class 
SelfAttentionBlock_APNB(nn.Layer): + """ + Self-Attention Block for APNB module. + + Args: + in_channels (int): The input channels of APNB module. + out_channels (int): The out channels of APNB module. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + scale (int, optional): Pooling size. Default: 1. + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + in_channels, + out_channels, + key_channels, + value_channels, + scale=1, + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.scale = scale + self.in_channels = in_channels + self.out_channels = out_channels + self.key_channels = key_channels + self.value_channels = value_channels + self.pool = nn.MaxPool2D(scale) + self.f_key = layers.ConvBNReLU( + in_channels=self.in_channels, + out_channels=self.key_channels, + kernel_size=1) + self.f_query = self.f_key + self.f_value = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.value_channels, + kernel_size=1) + self.W = nn.Conv2D( + in_channels=self.value_channels, + out_channels=self.out_channels, + kernel_size=1) + + self.psp_size = psp_size + + def forward(self, x): + batch_size, _, h, w = x.shape + if self.scale > 1: + x = self.pool(x) + + value = self.f_value(x) + value = _pp_module(value, self.psp_size) + value = paddle.transpose(value, perm=(0, 2, 1)) + + query = self.f_query(x) + query = paddle.reshape(query, shape=(0, self.key_channels, -1)) + query = paddle.transpose(query, perm=(0, 2, 1)) + + key = self.f_key(x) + key = _pp_module(key, self.psp_size) + + sim_map = paddle.matmul(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + context = paddle.matmul(sim_map, value) + context = paddle.transpose(context, perm=(0, 2, 1)) + + x_shape = paddle.shape(x) + context = paddle.reshape( + context, shape=[0, self.value_channels, x_shape[2], x_shape[3]]) + context = self.W(context) + + return context diff --git a/paddleseg/models/attention_unet.py b/paddleseg/models/attention_unet.py new file mode 100644 index 0000000000000000000000000000000000000000..32bebe023fa70e0bf45b10d54fcd686d97c6c6d6 --- /dev/null +++ b/paddleseg/models/attention_unet.py @@ -0,0 +1,189 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg import utils +import numpy as np + + +@manager.MODELS.add_component +class AttentionUNet(nn.Layer): + """ + The Attention-UNet implementation based on PaddlePaddle. + As mentioned in the original paper, author proposes a novel attention gate (AG) + that automatically learns to focus on target structures of varying shapes and sizes. + Models trained with AGs implicitly learn to suppress irrelevant regions in an input image while + highlighting salient features useful for a specific task. 
+ + The original article refers to + Oktay, O, et, al. "Attention u-net: Learning where to look for the pancreas." + (https://arxiv.org/pdf/1804.03999.pdf). + + Args: + num_classes (int): The unique number of target classes. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, num_classes, pretrained=None): + super().__init__() + n_channels = 3 + self.encoder = Encoder(n_channels, [64, 128, 256, 512]) + filters = np.array([64, 128, 256, 512, 1024]) + self.up5 = UpConv(ch_in=filters[4], ch_out=filters[3]) + self.att5 = AttentionBlock( + F_g=filters[3], F_l=filters[3], F_out=filters[2]) + self.up_conv5 = ConvBlock(ch_in=filters[4], ch_out=filters[3]) + + self.up4 = UpConv(ch_in=filters[3], ch_out=filters[2]) + self.att4 = AttentionBlock( + F_g=filters[2], F_l=filters[2], F_out=filters[1]) + self.up_conv4 = ConvBlock(ch_in=filters[3], ch_out=filters[2]) + + self.up3 = UpConv(ch_in=filters[2], ch_out=filters[1]) + self.att3 = AttentionBlock( + F_g=filters[1], F_l=filters[1], F_out=filters[0]) + self.up_conv3 = ConvBlock(ch_in=filters[2], ch_out=filters[1]) + + self.up2 = UpConv(ch_in=filters[1], ch_out=filters[0]) + self.att2 = AttentionBlock( + F_g=filters[0], F_l=filters[0], F_out=filters[0] // 2) + self.up_conv2 = ConvBlock(ch_in=filters[1], ch_out=filters[0]) + + self.conv_1x1 = nn.Conv2D( + filters[0], num_classes, kernel_size=1, stride=1, padding=0) + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + x5, (x1, x2, x3, x4) = self.encoder(x) + d5 = self.up5(x5) + x4 = self.att5(g=d5, x=x4) + d5 = paddle.concat([x4, d5], axis=1) + d5 = self.up_conv5(d5) + + d4 = self.up4(d5) + x3 = self.att4(g=d4, x=x3) + d4 = paddle.concat((x3, d4), axis=1) + d4 = self.up_conv4(d4) + + d3 = self.up3(d4) + x2 = self.att3(g=d3, x=x2) + d3 = paddle.concat((x2, d3), axis=1) + d3 = self.up_conv3(d3) + + d2 = self.up2(d3) + x1 = self.att2(g=d2, x=x1) + d2 = paddle.concat((x1, d2), axis=1) + d2 = self.up_conv2(d2) + + logit = self.conv_1x1(d2) + logit_list = [logit] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class AttentionBlock(nn.Layer): + def __init__(self, F_g, F_l, F_out): + super().__init__() + self.W_g = nn.Sequential( + nn.Conv2D( + F_g, F_out, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(F_out)) + + self.W_x = nn.Sequential( + nn.Conv2D( + F_l, F_out, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(F_out)) + + self.psi = nn.Sequential( + nn.Conv2D( + F_out, 1, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(1), + nn.Sigmoid()) + + self.relu = nn.ReLU() + + def forward(self, g, x): + g1 = self.W_g(g) + x1 = self.W_x(x) + psi = self.relu(g1 + x1) + psi = self.psi(psi) + res = x * psi + return res + + +class UpConv(nn.Layer): + def __init__(self, ch_in, ch_out): + super().__init__() + self.up = nn.Sequential( + nn.Upsample( + scale_factor=2, mode="bilinear"), + nn.Conv2D( + ch_in, ch_out, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(ch_out), + nn.ReLU()) + + def forward(self, x): + return self.up(x) + + +class Encoder(nn.Layer): + def __init__(self, input_channels, filters): + super().__init__() + self.double_conv = nn.Sequential( + layers.ConvBNReLU(input_channels, 64, 3), + layers.ConvBNReLU(64, 64, 3)) + down_channels = filters + self.down_sample_list = nn.LayerList([ + self.down_sampling(channel, channel * 2) + for channel in down_channels + ]) + + def down_sampling(self, in_channels, out_channels): 
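+        # Editor's note: each encoder stage halves the spatial resolution with
+        # a stride-2 MaxPool2D, then doubles the channel count with two 3x3
+        # ConvBNReLU blocks, mirroring the classic U-Net contracting path.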
+ modules = [] + modules.append(nn.MaxPool2D(kernel_size=2, stride=2)) + modules.append(layers.ConvBNReLU(in_channels, out_channels, 3)) + modules.append(layers.ConvBNReLU(out_channels, out_channels, 3)) + return nn.Sequential(*modules) + + def forward(self, x): + short_cuts = [] + x = self.double_conv(x) + for down_sample in self.down_sample_list: + short_cuts.append(x) + x = down_sample(x) + return x, short_cuts + + +class ConvBlock(nn.Layer): + def __init__(self, ch_in, ch_out): + super(ConvBlock, self).__init__() + self.conv = nn.Sequential( + nn.Conv2D( + ch_in, ch_out, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(ch_out), + nn.ReLU(), + nn.Conv2D( + ch_out, ch_out, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(ch_out), + nn.ReLU()) + + def forward(self, x): + return self.conv(x) diff --git a/paddleseg/models/backbones/__init__.py b/paddleseg/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..77860d06267817790572a676d70e4659beae1d41 --- /dev/null +++ b/paddleseg/models/backbones/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .hrnet import * +from .resnet_vd import * +from .xception_deeplab import * +from .mobilenetv3 import * +from .vision_transformer import * +from .swin_transformer import * +from .mobilenetv2 import * +from .mix_transformer import * +from .stdcnet import * +from .lite_hrnet import * +from .shufflenetv2 import * +from .ghostnet import * \ No newline at end of file diff --git a/paddleseg/models/backbones/ghostnet.py b/paddleseg/models/backbones/ghostnet.py new file mode 100644 index 0000000000000000000000000000000000000000..eaa47f2880126d4939566dbe1e16cef7c6a7bc8e --- /dev/null +++ b/paddleseg/models/backbones/ghostnet.py @@ -0,0 +1,318 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
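Editor's note: the GhostModule defined below splits its output budget between
a dense "primary" convolution and a cheap depthwise convolution. A small
sketch of that channel arithmetic (the function name is mine, not part of the
diff), matching the init_channels/new_channels computation in the code:

import math

def ghost_channels(output_channels: int, ratio: int = 2):
    init_channels = math.ceil(output_channels / ratio)  # primary conv outputs
    new_channels = init_channels * (ratio - 1)          # cheap depthwise outputs
    return init_channels, new_channels

assert ghost_channels(64) == (32, 32)  # concatenated back to 64 channels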
+ +# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/ghostnet_pytorch + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils, logger + +__all__ = ["GhostNet_x0_5", "GhostNet_x1_0", "GhostNet_x1_3"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + + self._batch_norm = BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr( + name=bn_name + "_scale", regularizer=L2Decay(0.0)), + bias_attr=ParamAttr( + name=bn_name + "_offset", regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SEBlock(nn.Layer): + def __init__(self, num_channels, reduction_ratio=4, name=None): + super(SEBlock, self).__init__() + self.pool2d_gap = AdaptiveAvgPool2D(1) + self._num_channels = num_channels + stdv = 1.0 / math.sqrt(num_channels * 1.0) + med_ch = num_channels // reduction_ratio + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_1_weights"), + bias_attr=ParamAttr(name=name + "_1_offset")) + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_channels, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_2_weights"), + bias_attr=ParamAttr(name=name + "_2_offset")) + + def forward(self, inputs): + pool = self.pool2d_gap(inputs) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = paddle.clip(x=excitation, min=0, max=1) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(inputs, excitation) + return out + + +class GhostModule(nn.Layer): + def __init__(self, + in_channels, + output_channels, + kernel_size=1, + ratio=2, + dw_size=3, + stride=1, + relu=True, + name=None): + super(GhostModule, self).__init__() + init_channels = int(math.ceil(output_channels / ratio)) + new_channels = int(init_channels * (ratio - 1)) + self.primary_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=init_channels, + kernel_size=kernel_size, + stride=stride, + groups=1, + act="relu" if relu else None, + name=name + "_primary_conv") + self.cheap_operation = ConvBNLayer( + in_channels=init_channels, + out_channels=new_channels, + kernel_size=dw_size, + stride=1, + groups=init_channels, + act="relu" if relu else None, + name=name + "_cheap_operation") + + def forward(self, inputs): + x = self.primary_conv(inputs) + y = self.cheap_operation(x) + out = paddle.concat([x, y], axis=1) + return out + + +class GhostBottleneck(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + output_channels, + kernel_size, + stride, + use_se, + 
name=None): + super(GhostBottleneck, self).__init__() + self._stride = stride + self._use_se = use_se + self._num_channels = in_channels + self._output_channels = output_channels + self.ghost_module_1 = GhostModule( + in_channels=in_channels, + output_channels=hidden_dim, + kernel_size=1, + stride=1, + relu=True, + name=name + "_ghost_module_1") + if stride == 2: + self.depthwise_conv = ConvBNLayer( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=kernel_size, + stride=stride, + groups=hidden_dim, + act=None, + name=name + + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + if use_se: + self.se_block = SEBlock(num_channels=hidden_dim, name=name + "_se") + self.ghost_module_2 = GhostModule( + in_channels=hidden_dim, + output_channels=output_channels, + kernel_size=1, + relu=False, + name=name + "_ghost_module_2") + if stride != 1 or in_channels != output_channels: + self.shortcut_depthwise = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + act=None, + name=name + + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + self.shortcut_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act=None, + name=name + "_shortcut_conv") + + def forward(self, inputs): + x = self.ghost_module_1(inputs) + if self._stride == 2: + x = self.depthwise_conv(x) + if self._use_se: + x = self.se_block(x) + x = self.ghost_module_2(x) + if self._stride == 1 and self._num_channels == self._output_channels: + shortcut = inputs + else: + shortcut = self.shortcut_depthwise(inputs) + shortcut = self.shortcut_conv(shortcut) + return paddle.add(x=x, y=shortcut) + + +class GhostNet(nn.Layer): + def __init__(self, scale, pretrained=None): + super(GhostNet, self).__init__() + self.cfgs = [ + # k, t, c, SE, s + [3, 16, 16, 0, 1], + [3, 48, 24, 0, 2], + [3, 72, 24, 0, 1], # x4 + [5, 72, 40, 1, 2], + [5, 120, 40, 1, 1], # x8 + [3, 240, 80, 0, 2], + [3, 200, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 480, 112, 1, 1], + [3, 672, 112, 1, 1], # x16 + [5, 672, 160, 1, 2], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1] # x32 + ] + self.scale = scale + self.pretrained = pretrained + + output_channels = int(self._make_divisible(16 * self.scale, 4)) + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=output_channels, + kernel_size=3, + stride=2, + groups=1, + act="relu", + name="conv1") + + # build inverted residual blocks + self.out_index = [2, 4, 10, 15] + self.feat_channels = [] + self.ghost_bottleneck_list = [] + for idx, (k, exp_size, c, use_se, s) in enumerate(self.cfgs): + in_channels = output_channels + output_channels = int(self._make_divisible(c * self.scale, 4)) + hidden_dim = int(self._make_divisible(exp_size * self.scale, 4)) + ghost_bottleneck = self.add_sublayer( + name="_ghostbottleneck_" + str(idx), + sublayer=GhostBottleneck( + in_channels=in_channels, + hidden_dim=hidden_dim, + output_channels=output_channels, + kernel_size=k, + stride=s, + use_se=use_se, + name="_ghostbottleneck_" + str(idx))) + self.ghost_bottleneck_list.append(ghost_bottleneck) + if idx in self.out_index: + self.feat_channels.append(output_channels) + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, inputs): + feat_list = [] + x = 
self.conv1(inputs) + for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): + x = ghost_bottleneck(x) + if idx in self.out_index: + feat_list.append(x) + return feat_list + + def _make_divisible(self, v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@manager.BACKBONES.add_component +def GhostNet_x0_5(**kwargs): + model = GhostNet(scale=0.5, **kwargs) + return model + + +@manager.BACKBONES.add_component +def GhostNet_x1_0(**kwargs): + model = GhostNet(scale=1.0, **kwargs) + return model + + +@manager.BACKBONES.add_component +def GhostNet_x1_3(**kwargs): + model = GhostNet(scale=1.3, **kwargs) + return model diff --git a/paddleseg/models/backbones/hrnet.py b/paddleseg/models/backbones/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..d71daab74dd5e5058e832789c660cbab3777eec6 --- /dev/null +++ b/paddleseg/models/backbones/hrnet.py @@ -0,0 +1,837 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager, param_init +from paddleseg.models import layers +from paddleseg.utils import utils + +__all__ = [ + "HRNet_W18_Small_V1", "HRNet_W18_Small_V2", "HRNet_W18", "HRNet_W30", + "HRNet_W32", "HRNet_W40", "HRNet_W44", "HRNet_W48", "HRNet_W60", "HRNet_W64" +] + + +class HRNet(nn.Layer): + """ + The HRNet implementation based on PaddlePaddle. + + The original article refers to + Jingdong Wang, et, al. "HRNet:Deep High-Resolution Representation Learning for Visual Recognition" + (https://arxiv.org/pdf/1908.07919.pdf). + + Args: + pretrained (str, optional): The path of pretrained model. + stage1_num_modules (int, optional): Number of modules for stage1. Default 1. + stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default (4). + stage1_num_channels (list, optional): Number of channels per branch for stage1. Default (64). + stage2_num_modules (int, optional): Number of modules for stage2. Default 1. + stage2_num_blocks (list, optional): Number of blocks per module for stage2. Default (4, 4). + stage2_num_channels (list, optional): Number of channels per branch for stage2. Default (18, 36). + stage3_num_modules (int, optional): Number of modules for stage3. Default 4. + stage3_num_blocks (list, optional): Number of blocks per module for stage3. Default (4, 4, 4). + stage3_num_channels (list, optional): Number of channels per branch for stage3. Default [18, 36, 72). 
+        stage4_num_modules (int, optional): Number of modules for stage4. Default 3.
+        stage4_num_blocks (list, optional): Number of blocks per module for stage4. Default (4, 4, 4, 4).
+        stage4_num_channels (list, optional): Number of channels per branch for stage4. Default (18, 36, 72, 144).
+        has_se (bool, optional): Whether to use Squeeze-and-Excitation module. Default False.
+        align_corners (bool, optional): An argument of F.interpolate. Set it to False when the feature
+            size is even, e.g. 1024x512, and to True when it is odd, e.g. 769x769. Default: False.
+    """
+
+    def __init__(self,
+                 pretrained=None,
+                 stage1_num_modules=1,
+                 stage1_num_blocks=(4, ),
+                 stage1_num_channels=(64, ),
+                 stage2_num_modules=1,
+                 stage2_num_blocks=(4, 4),
+                 stage2_num_channels=(18, 36),
+                 stage3_num_modules=4,
+                 stage3_num_blocks=(4, 4, 4),
+                 stage3_num_channels=(18, 36, 72),
+                 stage4_num_modules=3,
+                 stage4_num_blocks=(4, 4, 4, 4),
+                 stage4_num_channels=(18, 36, 72, 144),
+                 has_se=False,
+                 align_corners=False,
+                 padding_same=True):
+        super(HRNet, self).__init__()
+        self.pretrained = pretrained
+        self.stage1_num_modules = stage1_num_modules
+        self.stage1_num_blocks = stage1_num_blocks
+        self.stage1_num_channels = stage1_num_channels
+        self.stage2_num_modules = stage2_num_modules
+        self.stage2_num_blocks = stage2_num_blocks
+        self.stage2_num_channels = stage2_num_channels
+        self.stage3_num_modules = stage3_num_modules
+        self.stage3_num_blocks = stage3_num_blocks
+        self.stage3_num_channels = stage3_num_channels
+        self.stage4_num_modules = stage4_num_modules
+        self.stage4_num_blocks = stage4_num_blocks
+        self.stage4_num_channels = stage4_num_channels
+        self.has_se = has_se
+        self.align_corners = align_corners
+        self.feat_channels = [sum(stage4_num_channels)]
+
+        self.conv_layer1_1 = layers.ConvBNReLU(
+            in_channels=3,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.conv_layer1_2 = layers.ConvBNReLU(
+            in_channels=64,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.la1 = Layer1(
+            num_channels=64,
+            num_blocks=self.stage1_num_blocks[0],
+            num_filters=self.stage1_num_channels[0],
+            has_se=has_se,
+            name="layer2",
+            padding_same=padding_same)
+
+        self.tr1 = TransitionLayer(
+            in_channels=[self.stage1_num_channels[0] * 4],
+            out_channels=self.stage2_num_channels,
+            name="tr1",
+            padding_same=padding_same)
+
+        self.st2 = Stage(
+            num_channels=self.stage2_num_channels,
+            num_modules=self.stage2_num_modules,
+            num_blocks=self.stage2_num_blocks,
+            num_filters=self.stage2_num_channels,
+            has_se=self.has_se,
+            name="st2",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.tr2 = TransitionLayer(
+            in_channels=self.stage2_num_channels,
+            out_channels=self.stage3_num_channels,
+            name="tr2",
+            padding_same=padding_same)
+        self.st3 = Stage(
+            num_channels=self.stage3_num_channels,
+            num_modules=self.stage3_num_modules,
+            num_blocks=self.stage3_num_blocks,
+            num_filters=self.stage3_num_channels,
+            has_se=self.has_se,
+            name="st3",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.tr3 = TransitionLayer(
+            in_channels=self.stage3_num_channels,
+            out_channels=self.stage4_num_channels,
+            name="tr3",
+            padding_same=padding_same)
+        self.st4 = Stage(
+            num_channels=self.stage4_num_channels,
+            num_modules=self.stage4_num_modules,
+            num_blocks=self.stage4_num_blocks,
+            num_filters=self.stage4_num_channels,
+            has_se=self.has_se,
+            name="st4",
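+            # Stage 4 keeps all four resolution branches. In forward(), the
+            # three lower-resolution outputs are upsampled to the first
+            # branch's resolution and concatenated, which is why
+            # feat_channels = [sum(stage4_num_channels)] above.
+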
align_corners=align_corners, + padding_same=padding_same) + + self.init_weight() + + def forward(self, x): + conv1 = self.conv_layer1_1(x) + conv2 = self.conv_layer1_2(conv1) + + la1 = self.la1(conv2) + + tr1 = self.tr1([la1]) + st2 = self.st2(tr1) + + tr2 = self.tr2(st2) + st3 = self.st3(tr2) + + tr3 = self.tr3(st3) + st4 = self.st4(tr3) + + size = paddle.shape(st4[0])[2:] + x1 = F.interpolate( + st4[1], size, mode='bilinear', align_corners=self.align_corners) + x2 = F.interpolate( + st4[2], size, mode='bilinear', align_corners=self.align_corners) + x3 = F.interpolate( + st4[3], size, mode='bilinear', align_corners=self.align_corners) + x = paddle.concat([st4[0], x1, x2, x3], axis=1) + + return [x] + + def init_weight(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + param_init.normal_init(layer.weight, std=0.001) + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class Layer1(nn.Layer): + def __init__(self, + num_channels, + num_filters, + num_blocks, + has_se=False, + name=None, + padding_same=True): + super(Layer1, self).__init__() + + self.bottleneck_block_list = [] + + for i in range(num_blocks): + bottleneck_block = self.add_sublayer( + "bb_{}_{}".format(name, i + 1), + BottleneckBlock( + num_channels=num_channels if i == 0 else num_filters * 4, + num_filters=num_filters, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False, + name=name + '_' + str(i + 1), + padding_same=padding_same)) + self.bottleneck_block_list.append(bottleneck_block) + + def forward(self, x): + conv = x + for block_func in self.bottleneck_block_list: + conv = block_func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, in_channels, out_channels, name=None, padding_same=True): + super(TransitionLayer, self).__init__() + + num_in = len(in_channels) + num_out = len(out_channels) + self.conv_bn_func_list = [] + for i in range(num_out): + residual = None + if i < num_in: + if in_channels[i] != out_channels[i]: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + layers.ConvBNReLU( + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=3, + padding=1 if not padding_same else 'same', + bias_attr=False)) + else: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + layers.ConvBNReLU( + in_channels=in_channels[-1], + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + self.conv_bn_func_list.append(residual) + + def forward(self, x): + outs = [] + for idx, conv_bn_func in enumerate(self.conv_bn_func_list): + if conv_bn_func is None: + outs.append(x[idx]) + else: + if idx < len(x): + outs.append(conv_bn_func(x[idx])) + else: + outs.append(conv_bn_func(x[-1])) + return outs + + +class Branches(nn.Layer): + def __init__(self, + num_blocks, + in_channels, + out_channels, + has_se=False, + name=None, + padding_same=True): + super(Branches, self).__init__() + + self.basic_block_list = [] + + for i in range(len(out_channels)): + self.basic_block_list.append([]) + for j in range(num_blocks[i]): + in_ch = in_channels[i] if j == 0 else out_channels[i] + basic_block_func = self.add_sublayer( + "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), + BasicBlock( + num_channels=in_ch, + num_filters=out_channels[i], 
+ has_se=has_se, + name=name + '_branch_layer_' + str(i + 1) + '_' + + str(j + 1), + padding_same=padding_same)) + self.basic_block_list[i].append(basic_block_func) + + def forward(self, x): + outs = [] + for idx, input in enumerate(x): + conv = input + for basic_block_func in self.basic_block_list[idx]: + conv = basic_block_func(conv) + outs.append(conv) + return outs + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False, + name=None, + padding_same=True): + super(BottleneckBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + bias_attr=False) + + self.conv2 = layers.ConvBNReLU( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding=1 if not padding_same else 'same', + bias_attr=False) + + self.conv3 = layers.ConvBN( + in_channels=num_filters, + out_channels=num_filters * 4, + kernel_size=1, + bias_attr=False) + + if self.downsample: + self.conv_down = layers.ConvBN( + in_channels=num_channels, + out_channels=num_filters * 4, + kernel_size=1, + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16, + name=name + '_fc') + + self.add = layers.Add() + self.relu = layers.Activation("relu") + + def forward(self, x): + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv3 = self.se(conv3) + + y = self.add(conv3, residual) + y = self.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride=1, + has_se=False, + downsample=False, + name=None, + padding_same=True): + super(BasicBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding=1 if not padding_same else 'same', + bias_attr=False) + self.conv2 = layers.ConvBN( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + padding=1 if not padding_same else 'same', + bias_attr=False) + + if self.downsample: + self.conv_down = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16, + name=name + '_fc') + + self.add = layers.Add() + self.relu = layers.Activation("relu") + + def forward(self, x): + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv2 = self.se(conv2) + + y = self.add(conv2, residual) + y = self.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = nn.Linear( + med_ch, + num_filters, + 
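# The excitation layer expands the squeezed med_ch vector back to one
+            # value per channel; forward() applies a sigmoid to produce
+            # per-channel gates in (0, 1) that rescale the input feature map.
+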
weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + def forward(self, x): + pool = self.pool2d_gap(x) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = x * excitation + return out + + +class Stage(nn.Layer): + def __init__(self, + num_channels, + num_modules, + num_blocks, + num_filters, + has_se=False, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(Stage, self).__init__() + + self._num_modules = num_modules + + self.stage_func_list = [] + for i in range(num_modules): + if i == num_modules - 1 and not multi_scale_output: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + multi_scale_output=False, + name=name + '_' + str(i + 1), + align_corners=align_corners, + padding_same=padding_same)) + else: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + name=name + '_' + str(i + 1), + align_corners=align_corners, + padding_same=padding_same)) + + self.stage_func_list.append(stage_func) + + def forward(self, x): + out = x + for idx in range(self._num_modules): + out = self.stage_func_list[idx](out) + return out + + +class HighResolutionModule(nn.Layer): + def __init__(self, + num_channels, + num_blocks, + num_filters, + has_se=False, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(HighResolutionModule, self).__init__() + + self.branches_func = Branches( + num_blocks=num_blocks, + in_channels=num_channels, + out_channels=num_filters, + has_se=has_se, + name=name, + padding_same=padding_same) + + self.fuse_func = FuseLayers( + in_channels=num_filters, + out_channels=num_filters, + multi_scale_output=multi_scale_output, + name=name, + align_corners=align_corners, + padding_same=padding_same) + + def forward(self, x): + out = self.branches_func(x) + out = self.fuse_func(out) + return out + + +class FuseLayers(nn.Layer): + def __init__(self, + in_channels, + out_channels, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(FuseLayers, self).__init__() + + self._actual_ch = len(in_channels) if multi_scale_output else 1 + self._in_channels = in_channels + self.align_corners = align_corners + + self.residual_func_list = [] + for i in range(self._actual_ch): + for j in range(len(in_channels)): + if j > i: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), + layers.ConvBN( + in_channels=in_channels[j], + out_channels=out_channels[i], + kernel_size=1, + bias_attr=False)) + self.residual_func_list.append(residual_func) + elif j < i: + pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + layers.ConvBN( + in_channels=pre_num_filters, + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + pre_num_filters = out_channels[i] + else: + residual_func = self.add_sublayer( + 
"residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + layers.ConvBNReLU( + in_channels=pre_num_filters, + out_channels=out_channels[j], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + pre_num_filters = out_channels[j] + self.residual_func_list.append(residual_func) + + def forward(self, x): + outs = [] + residual_func_idx = 0 + for i in range(self._actual_ch): + residual = x[i] + residual_shape = paddle.shape(residual)[-2:] + for j in range(len(self._in_channels)): + if j > i: + y = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + y = F.interpolate( + y, + residual_shape, + mode='bilinear', + align_corners=self.align_corners) + residual = residual + y + elif j < i: + y = x[j] + for k in range(i - j): + y = self.residual_func_list[residual_func_idx](y) + residual_func_idx += 1 + + residual = residual + y + + residual = F.relu(residual) + outs.append(residual) + + return outs + + +@manager.BACKBONES.add_component +def HRNet_W18_Small_V1(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[1], + stage1_num_channels=[32], + stage2_num_modules=1, + stage2_num_blocks=[2, 2], + stage2_num_channels=[16, 32], + stage3_num_modules=1, + stage3_num_blocks=[2, 2, 2], + stage3_num_channels=[16, 32, 64], + stage4_num_modules=1, + stage4_num_blocks=[2, 2, 2, 2], + stage4_num_channels=[16, 32, 64, 128], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W18_Small_V2(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[2], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[2, 2], + stage2_num_channels=[18, 36], + stage3_num_modules=3, + stage3_num_blocks=[2, 2, 2], + stage3_num_channels=[18, 36, 72], + stage4_num_modules=2, + stage4_num_blocks=[2, 2, 2, 2], + stage4_num_channels=[18, 36, 72, 144], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W18(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[18, 36], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[18, 36, 72], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[18, 36, 72, 144], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W30(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[30, 60], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[30, 60, 120], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[30, 60, 120, 240], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W32(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[32, 64], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[32, 64, 128], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[32, 64, 128, 256], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W40(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[40, 80], + 
stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[40, 80, 160], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[40, 80, 160, 320], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W44(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[44, 88], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[44, 88, 176], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[44, 88, 176, 352], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W48(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[48, 96], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[48, 96, 192], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[48, 96, 192, 384], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W60(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[60, 120], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[60, 120, 240], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[60, 120, 240, 480], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W64(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[64, 128], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[64, 128, 256], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[64, 128, 256, 512], + **kwargs) + return model diff --git a/paddleseg/models/backbones/lite_hrnet.py b/paddleseg/models/backbones/lite_hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..9b55e68ca59e859e655f3ba0042f8425aab38003 --- /dev/null +++ b/paddleseg/models/backbones/lite_hrnet.py @@ -0,0 +1,972 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
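+
+# A minimal usage sketch for the Lite-HRNet backbones defined below (an
+# illustration, not part of the original file; it assumes paddlepaddle and
+# paddleseg are importable):
+#
+#     import paddle
+#     backbone = Lite_HRNet_18(pretrained=None)
+#     feats = backbone(paddle.rand([1, 3, 512, 512]))
+#     # feats: one feature map per entry in return_idx (default [0, 1, 2, 3])
+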
+""" +This code is based on +https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from numbers import Integral +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Normal, Constant + +from paddleseg.cvlibs import manager +from paddleseg import utils + +__all__ = [ + "Lite_HRNet_18", "Lite_HRNet_30", "Lite_HRNet_naive", + "Lite_HRNet_wider_naive", "LiteHRNet" +] + + +def Conv2d(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + weight_init=Normal(std=0.001), + bias_init=Constant(0.)): + weight_attr = paddle.framework.ParamAttr(initializer=weight_init) + if bias: + bias_attr = paddle.framework.ParamAttr(initializer=bias_init) + else: + bias_attr = False + conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + weight_attr=weight_attr, + bias_attr=bias_attr) + return conv + + +def channel_shuffle(x, groups): + x_shape = paddle.shape(x) + batch_size, height, width = x_shape[0], x_shape[2], x_shape[3] + num_channels = x.shape[1] + channels_per_group = num_channels // groups + + x = paddle.reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) + + return x + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride=1, + groups=1, + norm_type=None, + norm_groups=32, + norm_decay=0., + freeze_norm=False, + act=None): + super(ConvNormLayer, self).__init__() + self.act = act + norm_lr = 0. if freeze_norm else 1. 
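+        # freeze_norm=True pins the normalization parameters: their learning
+        # rate is zeroed through norm_lr, stop_gradient is set below, and
+        # BatchNorm switches to its global (running) statistics.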
+ if norm_type is not None: + assert norm_type in ['bn', 'sync_bn', 'gn'], \ + "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type) + param_attr = ParamAttr( + initializer=Constant(1.0), + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), ) + bias_attr = ParamAttr( + learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) + global_stats = True if freeze_norm else None + if norm_type in ['bn', 'sync_bn']: + self.norm = nn.BatchNorm2D( + ch_out, + weight_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats, ) + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr) + norm_params = self.norm.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + conv_bias_attr = False + else: + conv_bias_attr = True + self.norm = None + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.001)), + bias_attr=conv_bias_attr) + + def forward(self, inputs): + out = self.conv(inputs) + if self.norm is not None: + out = self.norm(out) + + if self.act == 'relu': + out = F.relu(out) + elif self.act == 'sigmoid': + out = F.sigmoid(out) + return out + + +class DepthWiseSeparableConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride=1, + dw_norm_type=None, + pw_norm_type=None, + norm_decay=0., + freeze_norm=False, + dw_act=None, + pw_act=None): + super(DepthWiseSeparableConvNormLayer, self).__init__() + self.depthwise_conv = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_in, + filter_size=filter_size, + stride=stride, + groups=ch_in, + norm_type=dw_norm_type, + act=dw_act, + norm_decay=norm_decay, + freeze_norm=freeze_norm, ) + self.pointwise_conv = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=1, + norm_type=pw_norm_type, + act=pw_act, + norm_decay=norm_decay, + freeze_norm=freeze_norm, ) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x + + +class CrossResolutionWeightingModule(nn.Layer): + def __init__(self, + channels, + ratio=16, + norm_type='bn', + freeze_norm=False, + norm_decay=0.): + super(CrossResolutionWeightingModule, self).__init__() + self.channels = channels + total_channel = sum(channels) + self.conv1 = ConvNormLayer( + ch_in=total_channel, + ch_out=total_channel // ratio, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + self.conv2 = ConvNormLayer( + ch_in=total_channel // ratio, + ch_out=total_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='sigmoid', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + + def forward(self, x): + out = [] + for idx, xi in enumerate(x[:-1]): + kernel_size = stride = pow(2, len(x) - idx - 1) + xi = F.avg_pool2d(xi, kernel_size=kernel_size, stride=stride) + out.append(xi) + out.append(x[-1]) + + out = paddle.concat(out, 1) + out = self.conv1(out) + out = self.conv2(out) + out = paddle.split(out, self.channels, 1) + out = [ + s * F.interpolate( + a, paddle.shape(s)[-2:], mode='nearest') for s, a in zip(x, out) + ] + return out + + +class SpatialWeightingModule(nn.Layer): + def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.): + super(SpatialWeightingModule, self).__init__() + self.global_avgpooling = 
nn.AdaptiveAvgPool2D(1) + self.conv1 = ConvNormLayer( + ch_in=in_channel, + ch_out=in_channel // ratio, + filter_size=1, + stride=1, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + self.conv2 = ConvNormLayer( + ch_in=in_channel // ratio, + ch_out=in_channel, + filter_size=1, + stride=1, + act='sigmoid', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + + def forward(self, x): + out = self.global_avgpooling(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class ConditionalChannelWeightingBlock(nn.Layer): + def __init__(self, + in_channels, + stride, + reduce_ratio, + norm_type='bn', + freeze_norm=False, + norm_decay=0.): + super(ConditionalChannelWeightingBlock, self).__init__() + assert stride in [1, 2] + branch_channels = [channel // 2 for channel in in_channels] + + self.cross_resolution_weighting = CrossResolutionWeightingModule( + branch_channels, + ratio=reduce_ratio, + norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay) + self.depthwise_convs = nn.LayerList([ + ConvNormLayer( + channel, + channel, + filter_size=3, + stride=stride, + groups=channel, + norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay) for channel in branch_channels + ]) + + self.spatial_weighting = nn.LayerList([ + SpatialWeightingModule( + channel, + ratio=4, + freeze_norm=freeze_norm, + norm_decay=norm_decay) for channel in branch_channels + ]) + + def forward(self, x): + x = [s.chunk(2, axis=1) for s in x] + x1 = [s[0] for s in x] + x2 = [s[1] for s in x] + + x2 = self.cross_resolution_weighting(x2) + x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] + x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] + + out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)] + out = [channel_shuffle(s, groups=2) for s in out] + return out + + +class ShuffleUnit(nn.Layer): + def __init__(self, + in_channel, + out_channel, + stride, + norm_type='bn', + freeze_norm=False, + norm_decay=0.): + super(ShuffleUnit, self).__init__() + branch_channel = out_channel // 2 + self.stride = stride + if self.stride == 1: + assert in_channel == branch_channel * 2, \ + "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2) + if stride > 1: + self.branch1 = nn.Sequential( + ConvNormLayer( + ch_in=in_channel, + ch_out=in_channel, + filter_size=3, + stride=self.stride, + groups=in_channel, + norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay), + ConvNormLayer( + ch_in=in_channel, + ch_out=branch_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay), ) + self.branch2 = nn.Sequential( + ConvNormLayer( + ch_in=branch_channel if stride == 1 else in_channel, + ch_out=branch_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay), + ConvNormLayer( + ch_in=branch_channel, + ch_out=branch_channel, + filter_size=3, + stride=self.stride, + groups=branch_channel, + norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay), + ConvNormLayer( + ch_in=branch_channel, + ch_out=branch_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay), ) + + def forward(self, x): + if self.stride > 1: + x1 = self.branch1(x) + x2 = self.branch2(x) + else: + x1, x2 = x.chunk(2, axis=1) + x2 = self.branch2(x2) + out = paddle.concat([x1, x2], axis=1) + out = 
channel_shuffle(out, groups=2) + return out + + +class IterativeHead(nn.Layer): + def __init__(self, + in_channels, + norm_type='bn', + freeze_norm=False, + norm_decay=0.): + super(IterativeHead, self).__init__() + num_branches = len(in_channels) + self.in_channels = in_channels[::-1] + + projects = [] + for i in range(num_branches): + if i != num_branches - 1: + projects.append( + DepthWiseSeparableConvNormLayer( + ch_in=self.in_channels[i], + ch_out=self.in_channels[i + 1], + filter_size=3, + stride=1, + dw_act=None, + pw_act='relu', + dw_norm_type=norm_type, + pw_norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay)) + else: + projects.append( + DepthWiseSeparableConvNormLayer( + ch_in=self.in_channels[i], + ch_out=self.in_channels[i], + filter_size=3, + stride=1, + dw_act=None, + pw_act='relu', + dw_norm_type=norm_type, + pw_norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay)) + self.projects = nn.LayerList(projects) + + def forward(self, x): + x = x[::-1] + y = [] + last_x = None + for i, s in enumerate(x): + if last_x is not None: + last_x = F.interpolate( + last_x, + size=paddle.shape(s)[-2:], + mode='bilinear', + align_corners=True) + s = s + last_x + s = self.projects[i](s) + y.append(s) + last_x = s + + return y[::-1] + + +class Stem(nn.Layer): + def __init__(self, + in_channel, + stem_channel, + out_channel, + expand_ratio, + norm_type='bn', + freeze_norm=False, + norm_decay=0.): + super(Stem, self).__init__() + self.conv1 = ConvNormLayer( + in_channel, + stem_channel, + filter_size=3, + stride=2, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + mid_channel = int(round(stem_channel * expand_ratio)) + branch_channel = stem_channel // 2 + if stem_channel == out_channel: + inc_channel = out_channel - branch_channel + else: + inc_channel = out_channel - stem_channel + self.branch1 = nn.Sequential( + ConvNormLayer( + ch_in=branch_channel, + ch_out=branch_channel, + filter_size=3, + stride=2, + groups=branch_channel, + norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay), + ConvNormLayer( + ch_in=branch_channel, + ch_out=inc_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay), ) + self.expand_conv = ConvNormLayer( + ch_in=branch_channel, + ch_out=mid_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + self.depthwise_conv = ConvNormLayer( + ch_in=mid_channel, + ch_out=mid_channel, + filter_size=3, + stride=2, + groups=mid_channel, + norm_type=norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay) + self.linear_conv = ConvNormLayer( + ch_in=mid_channel, + ch_out=branch_channel + if stem_channel == out_channel else stem_channel, + filter_size=1, + stride=1, + norm_type=norm_type, + act='relu', + freeze_norm=freeze_norm, + norm_decay=norm_decay) + + def forward(self, x): + x = self.conv1(x) + x1, x2 = x.chunk(2, axis=1) + x1 = self.branch1(x1) + x2 = self.expand_conv(x2) + x2 = self.depthwise_conv(x2) + x2 = self.linear_conv(x2) + out = paddle.concat([x1, x2], axis=1) + out = channel_shuffle(out, groups=2) + + return out + + +class LiteHRNetModule(nn.Layer): + def __init__(self, + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=False, + with_fuse=True, + norm_type='bn', + freeze_norm=False, + norm_decay=0.): + super(LiteHRNetModule, self).__init__() + assert num_branches == 
len(in_channels),\ + "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels)) + assert module_type in [ + 'LITE', 'NAIVE' + ], "module_type should be one of ['LITE', 'NAIVE']" + self.num_branches = num_branches + self.in_channels = in_channels + self.multiscale_output = multiscale_output + self.with_fuse = with_fuse + self.norm_type = 'bn' + self.module_type = module_type + + if self.module_type == 'LITE': + self.layers = self._make_weighting_blocks( + num_blocks, + reduce_ratio, + freeze_norm=freeze_norm, + norm_decay=norm_decay) + elif self.module_type == 'NAIVE': + self.layers = self._make_naive_branches( + num_branches, + num_blocks, + freeze_norm=freeze_norm, + norm_decay=norm_decay) + + if self.with_fuse: + self.fuse_layers = self._make_fuse_layers( + freeze_norm=freeze_norm, norm_decay=norm_decay) + self.relu = nn.ReLU() + + def _make_weighting_blocks(self, + num_blocks, + reduce_ratio, + stride=1, + freeze_norm=False, + norm_decay=0.): + layers = [] + for i in range(num_blocks): + layers.append( + ConditionalChannelWeightingBlock( + self.in_channels, + stride=stride, + reduce_ratio=reduce_ratio, + norm_type=self.norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay)) + return nn.Sequential(*layers) + + def _make_naive_branches(self, + num_branches, + num_blocks, + freeze_norm=False, + norm_decay=0.): + branches = [] + for branch_idx in range(num_branches): + layers = [] + for i in range(num_blocks): + layers.append( + ShuffleUnit( + self.in_channels[branch_idx], + self.in_channels[branch_idx], + stride=1, + norm_type=self.norm_type, + freeze_norm=freeze_norm, + norm_decay=norm_decay)) + branches.append(nn.Sequential(*layers)) + return nn.LayerList(branches) + + def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.): + if self.num_branches == 1: + return None + fuse_layers = [] + num_out_branches = self.num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(self.num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + Conv2d( + self.in_channels[j], + self.in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False, ), + nn.BatchNorm2D(self.in_channels[i]), + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + Conv2d( + self.in_channels[j], + self.in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=self.in_channels[j], + bias=False, ), + nn.BatchNorm2D(self.in_channels[j]), + Conv2d( + self.in_channels[j], + self.in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False, ), + nn.BatchNorm2D(self.in_channels[i]))) + else: + conv_downsamples.append( + nn.Sequential( + Conv2d( + self.in_channels[j], + self.in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=self.in_channels[j], + bias=False, ), + nn.BatchNorm2D(self.in_channels[j]), + Conv2d( + self.in_channels[j], + self.in_channels[j], + kernel_size=1, + stride=1, + padding=0, + bias=False, ), + nn.BatchNorm2D(self.in_channels[j]), + nn.ReLU())) + + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.LayerList(fuse_layer)) + + return nn.LayerList(fuse_layers) + + def forward(self, x): + if self.num_branches == 1: + return [self.layers[0](x[0])] + if self.module_type == 'LITE': + out = self.layers(x) + elif self.module_type == 'NAIVE': + for i in range(self.num_branches): 
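+                # NAIVE mode: each branch's ShuffleUnit stack runs
+                # independently here; the fuse loop below (kept as in the
+                # reference port, including `y += y`, which doubles the first
+                # branch's contribution) then mixes information across branches.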
+ x[i] = self.layers[i](x[i]) + out = x + if self.with_fuse: + out_fuse = [] + for i in range(len(self.fuse_layers)): + y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) + for j in range(self.num_branches): + if j == 0: + y += y + elif i == j: + y += out[j] + else: + y += self.fuse_layers[i][j](out[j]) + if i == 0: + out[i] = y + out_fuse.append(self.relu(y)) + out = out_fuse + elif not self.multiscale_output: + out = [out[0]] + return out + + +class LiteHRNet(nn.Layer): + """ + @inproceedings{Yulitehrnet21, + title={Lite-HRNet: A Lightweight High-Resolution Network}, + author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, + booktitle={CVPR},year={2021} + } + + Args: + network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"], + "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet. + "wider_naive": Naive network with wider channels in each block. + "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting. + "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18. + freeze_at (int): the stage to freeze + freeze_norm (bool): whether to freeze norm in HRNet + norm_decay (float): weight decay for normalization layer weights + return_idx (List): the stage to return + """ + + def __init__(self, + network_type, + freeze_at=0, + freeze_norm=True, + norm_decay=0., + return_idx=[0, 1, 2, 3], + use_head=False, + pretrained=None): + super(LiteHRNet, self).__init__() + if isinstance(return_idx, Integral): + return_idx = [return_idx] + assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \ + "the network_type should be one of [lite_18, lite_30, naive, wider_naive]" + assert len(return_idx) > 0, "need one or more return index" + self.freeze_at = freeze_at + self.freeze_norm = freeze_norm + self.norm_decay = norm_decay + self.return_idx = return_idx + self.norm_type = 'bn' + self.use_head = use_head + self.pretrained = pretrained + + self.module_configs = { + "lite_18": { + "num_modules": [2, 4, 2], + "num_branches": [2, 3, 4], + "num_blocks": [2, 2, 2], + "module_type": ["LITE", "LITE", "LITE"], + "reduce_ratios": [8, 8, 8], + "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], + }, + "lite_30": { + "num_modules": [3, 8, 3], + "num_branches": [2, 3, 4], + "num_blocks": [2, 2, 2], + "module_type": ["LITE", "LITE", "LITE"], + "reduce_ratios": [8, 8, 8], + "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], + }, + "naive": { + "num_modules": [2, 4, 2], + "num_branches": [2, 3, 4], + "num_blocks": [2, 2, 2], + "module_type": ["NAIVE", "NAIVE", "NAIVE"], + "reduce_ratios": [1, 1, 1], + "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]], + }, + "wider_naive": { + "num_modules": [2, 4, 2], + "num_branches": [2, 3, 4], + "num_blocks": [2, 2, 2], + "module_type": ["NAIVE", "NAIVE", "NAIVE"], + "reduce_ratios": [1, 1, 1], + "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], + }, + } + + self.stages_config = self.module_configs[network_type] + + self.stem = Stem(3, 32, 32, 1) + num_channels_pre_layer = [32] + for stage_idx in range(3): + num_channels = self.stages_config["num_channels"][stage_idx] + setattr(self, 'transition{}'.format(stage_idx), + self._make_transition_layer(num_channels_pre_layer, + num_channels, self.freeze_norm, + self.norm_decay)) + stage, num_channels_pre_layer = self._make_stage( + 
self.stages_config, stage_idx, num_channels, True, + self.freeze_norm, self.norm_decay) + setattr(self, 'stage{}'.format(stage_idx), stage) + + num_channels = self.stages_config["num_channels"][-1] + self.feat_channels = num_channels + + if self.use_head: + self.head_layer = IterativeHead(num_channels_pre_layer, 'bn', + self.freeze_norm, self.norm_decay) + + self.feat_channels = [num_channels[0]] + for i in range(1, len(num_channels)): + self.feat_channels.append(num_channels[i] // 2) + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def _make_transition_layer(self, + num_channels_pre_layer, + num_channels_cur_layer, + freeze_norm=False, + norm_decay=0.): + num_branches_pre = len(num_channels_pre_layer) + num_branches_cur = len(num_channels_cur_layer) + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + Conv2d( + num_channels_pre_layer[i], + num_channels_pre_layer[i], + kernel_size=3, + stride=1, + padding=1, + groups=num_channels_pre_layer[i], + bias=False), + nn.BatchNorm2D(num_channels_pre_layer[i]), + Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=1, + stride=1, + padding=0, + bias=False, ), + nn.BatchNorm2D(num_channels_cur_layer[i]), + nn.ReLU())) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + conv_downsamples.append( + nn.Sequential( + Conv2d( + num_channels_pre_layer[-1], + num_channels_pre_layer[-1], + groups=num_channels_pre_layer[-1], + kernel_size=3, + stride=2, + padding=1, + bias=False, ), + nn.BatchNorm2D(num_channels_pre_layer[-1]), + Conv2d( + num_channels_pre_layer[-1], + num_channels_cur_layer[i] + if j == i - num_branches_pre else + num_channels_pre_layer[-1], + kernel_size=1, + stride=1, + padding=0, + bias=False, ), + nn.BatchNorm2D(num_channels_cur_layer[i] + if j == i - num_branches_pre else + num_channels_pre_layer[-1]), + nn.ReLU())) + transition_layers.append(nn.Sequential(*conv_downsamples)) + return nn.LayerList(transition_layers) + + def _make_stage(self, + stages_config, + stage_idx, + in_channels, + multiscale_output, + freeze_norm=False, + norm_decay=0.): + num_modules = stages_config["num_modules"][stage_idx] + num_branches = stages_config["num_branches"][stage_idx] + num_blocks = stages_config["num_blocks"][stage_idx] + reduce_ratio = stages_config['reduce_ratios'][stage_idx] + module_type = stages_config['module_type'][stage_idx] + + modules = [] + for i in range(num_modules): + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + modules.append( + LiteHRNetModule( + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=reset_multiscale_output, + with_fuse=True, + freeze_norm=freeze_norm, + norm_decay=norm_decay)) + in_channels = modules[-1].in_channels + return nn.Sequential(*modules), in_channels + + def forward(self, x): + x = self.stem(x) + + y_list = [x] + for stage_idx in range(3): + x_list = [] + transition = getattr(self, 'transition{}'.format(stage_idx)) + for j in range(self.stages_config["num_branches"][stage_idx]): + if transition[j] is not None: + if j >= len(y_list): + x_list.append(transition[j](y_list[-1])) + else: + x_list.append(transition[j](y_list[j])) + else: + x_list.append(y_list[j]) + 
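# Run this stage's LiteHRNetModules on the transitioned branch list;
+            # y_list then holds one tensor per active resolution branch.
+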
y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list) + + if self.use_head: + y_list = self.head_layer(y_list) + + res = [] + for i, layer in enumerate(y_list): + if i == self.freeze_at: + layer.stop_gradient = True + if i in self.return_idx: + res.append(layer) + return res + + +@manager.BACKBONES.add_component +def Lite_HRNet_18(**kwargs): + model = LiteHRNet(network_type="lite_18", **kwargs) + return model + + +@manager.BACKBONES.add_component +def Lite_HRNet_30(**kwargs): + model = LiteHRNet(network_type="lite_30", **kwargs) + return model + + +@manager.BACKBONES.add_component +def Lite_HRNet_naive(**kwargs): + model = LiteHRNet(network_type="naive", **kwargs) + return model + + +@manager.BACKBONES.add_component +def Lite_HRNet_wider_naive(**kwargs): + model = LiteHRNet(network_type="wider_naive", **kwargs) + return model diff --git a/paddleseg/models/backbones/mix_transformer.py b/paddleseg/models/backbones/mix_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..589e7e3c7f5cfd7642d800dad6ecc33ff0cf0f81 --- /dev/null +++ b/paddleseg/models/backbones/mix_transformer.py @@ -0,0 +1,593 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.nn.initializer as paddle_init + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils +from paddleseg.models.backbones.transformer_utils import * + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
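+        # When sr_ratio > 1, keys and values are computed from a spatially
+        # reduced copy of the sequence (the sr conv defined below), i.e. the
+        # spatial-reduction attention used by SegFormer-style encoders.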
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.dim = dim + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2D(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x, H, W): + x_shape = paddle.shape(x) + B, N = x_shape[0], x_shape[1] + C = self.dim + + q = self.q(x).reshape([B, N, self.num_heads, + C // self.num_heads]).transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_).reshape( + [B, -1, 2, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + else: + kv = self.kv(x).reshape( + [B, -1, 2, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = (q @k.transpose([0, 1, 3, 2])) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x): + x = self.proj(x) + x_shape = paddle.shape(x) + H, W = x_shape[2], x_shape[3] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + + return x, H, W + + +class MixVisionTransformer(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + pretrained=None): + super().__init__() + self.num_classes = num_classes + self.depths = depths + self.feat_channels = embed_dims[:] + + # patch_embed + self.patch_embed1 = OverlapPatchEmbed( + img_size=img_size, + patch_size=7, + stride=4, + in_chans=in_chans, + embed_dim=embed_dims[0]) + self.patch_embed2 = OverlapPatchEmbed( + img_size=img_size // 4, + patch_size=3, + stride=2, + in_chans=embed_dims[0], + embed_dim=embed_dims[1]) + self.patch_embed3 = OverlapPatchEmbed( + img_size=img_size // 8, + patch_size=3, + stride=2, + in_chans=embed_dims[1], + embed_dim=embed_dims[2]) + self.patch_embed4 = OverlapPatchEmbed( + img_size=img_size // 16, + patch_size=3, + stride=2, + in_chans=embed_dims[2], + embed_dim=embed_dims[3]) + + # transformer encoder + dpr = [ + x.numpy() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + self.block1 = nn.LayerList([ + Block( + dim=embed_dims[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratios[0], + qkv_bias=qkv_bias, + 
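# drop_path=dpr[cur + i] below: the stochastic-depth rate grows
+                # linearly with block index across all four stages.
+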
qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) for i in range(depths[0]) + ]) + self.norm1 = norm_layer(embed_dims[0]) + + cur += depths[0] + self.block2 = nn.LayerList([ + Block( + dim=embed_dims[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratios[1], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) for i in range(depths[1]) + ]) + self.norm2 = norm_layer(embed_dims[1]) + + cur += depths[1] + self.block3 = nn.LayerList([ + Block( + dim=embed_dims[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratios[2], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) for i in range(depths[2]) + ]) + self.norm3 = norm_layer(embed_dims[2]) + + cur += depths[2] + self.block4 = nn.LayerList([ + Block( + dim=embed_dims[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratios[3], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[3]) for i in range(depths[3]) + ]) + self.norm4 = norm_layer(embed_dims[3]) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + else: + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def reset_drop_path(self, drop_path_rate): + dpr = [ + x.item() + for x in paddle.linspace(0, drop_path_rate, sum(self.depths)) + ] + cur = 0 + for i in range(self.depths[0]): + self.block1[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[0] + for i in range(self.depths[1]): + self.block2[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[1] + for i in range(self.depths[2]): + self.block3[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[2] + for i in range(self.depths[3]): + self.block4[i].drop_path.drop_prob = dpr[cur + i] + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, + num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = paddle.shape(x)[0] + outs = [] + + # stage 1 + x, H, W = self.patch_embed1(x) + for i, blk in enumerate(self.block1): + x = blk(x, H, W) + + x = self.norm1(x) + x = x.reshape([B, H, W, self.feat_channels[0]]).transpose([0, 3, 1, 2]) + outs.append(x) + + # stage 2 + x, H, W = self.patch_embed2(x) + for i, blk in enumerate(self.block2): + x = blk(x, H, W) + x = self.norm2(x) + x = x.reshape([B, H, W, self.feat_channels[1]]).transpose([0, 3, 1, 2]) + outs.append(x) + + # stage 3 + x, H, W = self.patch_embed3(x) + for i, blk in enumerate(self.block3): + x = blk(x, H, W) + x = self.norm3(x) + x = x.reshape([B, H, W, 
self.feat_channels[2]]).transpose([0, 3, 1, 2]) + outs.append(x) + + # stage 4 + x, H, W = self.patch_embed4(x) + for i, blk in enumerate(self.block4): + x = blk(x, H, W) + x = self.norm4(x) + x = x.reshape([B, H, W, self.feat_channels[3]]).transpose([0, 3, 1, 2]) + outs.append(x) + + return outs + + def forward(self, x): + x = self.forward_features(x) + # x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dim = dim + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + x_shape = paddle.shape(x) + B, N = x_shape[0], x_shape[1] + x = x.transpose([0, 2, 1]).reshape([B, self.dim, H, W]) + x = self.dwconv(x) + x = x.flatten(2).transpose([0, 2, 1]) + + return x + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B0(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B1(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B2(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B3(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B4(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B5(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 6, 40, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) diff --git a/paddleseg/models/backbones/mobilenetv2.py b/paddleseg/models/backbones/mobilenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..0405cee5a0176b583672f9ac6e7cf08cd9d043d6 --- /dev/null +++ b/paddleseg/models/backbones/mobilenetv2.py @@ -0,0 +1,264 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from paddleseg.cvlibs import manager +from paddleseg import utils + +__all__ = [ + "MobileNetV2_x0_25", + "MobileNetV2_x0_5", + "MobileNetV2_x0_75", + "MobileNetV2_x1_0", + "MobileNetV2_x1_5", + "MobileNetV2_x2_0", +] + + +class MobileNetV2(nn.Layer): + """ + The MobileNetV2 implementation based on PaddlePaddle. + + The original article refers to + Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen + "MobileNetV2: Inverted Residuals and Linear Bottlenecks" + (https://arxiv.org/abs/1801.04381). + + Args: + scale (float, optional): The scale of channel. Default: 1.0 + pretrained (str, optional): The path or url of pretrained model. Default: None + """ + + def __init__(self, scale=1.0, pretrained=None): + super().__init__() + self.scale = scale + self.pretrained = pretrained + prefix_name = "" + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), # x4 + (6, 32, 3, 2), # x8 + (6, 64, 4, 2), + (6, 96, 3, 1), # x16 + (6, 160, 3, 2), + (6, 320, 1, 1), # x32 + ] + self.out_index = [1, 2, 4, 6] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1") + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer( + prefix_name + "conv" + str(i), + sublayer=InvresiBlocks( + in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + "conv" + str(i))) + self.block_list.append(block) + in_c = int(c * scale) + + out_channels = [ + bottleneck_params_list[idx][1] for idx in self.out_index + ] + self.feat_channels = [int(c * scale) for c in out_channels] + + self.init_weight() + + def forward(self, inputs): + feat_list = [] + + y = self.conv1(inputs, if_act=True) + for idx, block in enumerate(self.block_list): + y = block(y) + if idx in self.out_index: + feat_list.append(y) + + return feat_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def 
forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, num_channels, num_in_filter, num_filters, stride, + filter_size, padding, expansion_factor, name): + super(InvertedResidualUnit, self).__init__() + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand") + + self._bottleneck_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise") + + self._linear_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear") + + def forward(self, inputs, ifshortcut): + y = self._expand_conv(inputs, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit( + num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1") + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer( + name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1))) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +@manager.BACKBONES.add_component +def MobileNetV2_x0_25(**kwargs): + model = MobileNetV2(scale=0.25, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV2_x0_5(**kwargs): + model = MobileNetV2(scale=0.5, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV2_x0_75(**kwargs): + model = MobileNetV2(scale=0.75, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV2_x1_0(**kwargs): + model = MobileNetV2(scale=1.0, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV2_x1_5(**kwargs): + model = MobileNetV2(scale=1.5, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV2_x2_0(**kwargs): + model = MobileNetV2(scale=2.0, **kwargs) + return model diff --git a/paddleseg/models/backbones/mobilenetv3.py b/paddleseg/models/backbones/mobilenetv3.py new file mode 100644 index 0000000000000000000000000000000000000000..39e92f173a79146b9dc82e46d5dd3260186dc046 --- /dev/null +++ b/paddleseg/models/backbones/mobilenetv3.py @@ -0,0 +1,496 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils, logger +from paddleseg.models import layers + +__all__ = [ + "MobileNetV3_small_x0_35", "MobileNetV3_small_x0_5", + "MobileNetV3_small_x0_75", "MobileNetV3_small_x1_0", + "MobileNetV3_small_x1_25", "MobileNetV3_large_x0_35", + "MobileNetV3_large_x0_5", "MobileNetV3_large_x0_75", + "MobileNetV3_large_x1_0", "MobileNetV3_large_x1_25" +] + +MODEL_STAGES_PATTERN = { + "MobileNetV3_small": ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"], + "MobileNetV3_large": + ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"] +} + +# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively. +# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s. +# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +# d: dilation rate in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], # x4 + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], # x8 + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], # x16 + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], # x32 + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ], + "large_os8": [ + # k, exp, c, se, act, s, {d} + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], # x4 + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], # x8 + [3, 240, 80, False, "hardswish", 1], + [3, 200, 80, False, "hardswish", 1, 2], + [3, 184, 80, False, "hardswish", 1, 2], + [3, 184, 80, False, "hardswish", 1, 2], + [3, 480, 112, True, "hardswish", 1, 2], + [3, 672, 112, True, "hardswish", 1, 2], + [5, 672, 160, True, "hardswish", 1, 2], + [5, 960, 160, True, "hardswish", 1, 4], + [5, 960, 160, True, "hardswish", 1, 4], + ], + "small_os8": [ + # k, exp, c, se, act, s, {d} + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 
96, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1, 2], + [5, 240, 40, True, "hardswish", 1, 2], + [5, 120, 48, True, "hardswish", 1, 2], + [5, 144, 48, True, "hardswish", 1, 2], + [5, 288, 96, True, "hardswish", 1, 2], + [5, 576, 96, True, "hardswish", 1, 4], + [5, 576, 96, True, "hardswish", 1, 4], + ] +} + +OUT_INDEX = {"large": [2, 5, 11, 14], "small": [0, 2, 7, 10]} + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(nn.Layer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + out_index, + scale=1.0, + pretrained=None): + super().__init__() + + self.cfg = config + self.out_index = out_index + self.scale = scale + self.pretrained = pretrained + inplanes = 16 + + self.conv = ConvBNLayer( + in_c=3, + out_c=_make_divisible(inplanes * self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + self.blocks = nn.Sequential(*[ + ResidualUnit( + in_c=_make_divisible(inplanes * self.scale if i == 0 else + self.cfg[i - 1][2] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + act=act, + dilation=td[0] if td else 1) + for i, (k, exp, c, se, act, s, *td) in enumerate(self.cfg) + ]) + + out_channels = [config[idx][2] for idx in self.out_index] + self.feat_channels = [ + _make_divisible(self.scale * c) for c in out_channels + ] + + self.init_res(stages_pattern) + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def init_res(self, stages_pattern, return_patterns=None, + return_stages=None): + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." 
+ logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + def forward(self, x): + x = self.conv(x) + + feat_list = [] + for idx, block in enumerate(self.blocks): + x = block(x) + if idx in self.out_index: + feat_list.append(x) + + return feat_list + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None, + dilation=1): + super().__init__() + + self.conv = Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False, + dilation=dilation) + self.bn = BatchNorm( + num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + act=None, + dilation=1): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2) * dilation, + num_groups=mid_c, + if_act=True, + act=act, + dilation=dilation) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x + x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(identity, x) + return x + + +# nn.Hardsigmoid can't transfer "slope" and "offset" in nn.functional.hardsigmoid +class Hardsigmoid(nn.Layer): + def __init__(self, slope=0.2, offset=0.5): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return nn.functional.hardsigmoid( + x, slope=self.slope, offset=self.offset) + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + return paddle.multiply(x=identity, y=x) + + +@manager.BACKBONES.add_component +def MobileNetV3_small_x0_35(**kwargs): + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.35, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + out_index=OUT_INDEX["small"], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV3_small_x0_5(**kwargs): + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + 
out_index=OUT_INDEX["small"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_small_x0_75(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.75,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_small_x1_0(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_small_x1_25(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=1.25,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x0_35(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.35,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x0_5(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.5,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x0_75(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.75,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x1_0(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x1_25(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=1.25,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x1_0_os8(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large_os8"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_small_x1_0_os8(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["small_os8"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
+    return model
diff --git a/paddleseg/models/backbones/resnet_vd.py b/paddleseg/models/backbones/resnet_vd.py
new file mode 100644
index 0000000000000000000000000000000000000000..2810ea44ba15f2e4d30c4dc535160d41cd6bce1e
--- /dev/null
+++ b/paddleseg/models/backbones/resnet_vd.py
@@ -0,0 +1,398 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + +__all__ = [ + "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd" +] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + groups=1, + is_vd_mode=False, + act=None, + data_format='NCHW'): + super(ConvBNLayer, self).__init__() + if dilation != 1 and kernel_size != 3: + raise RuntimeError("When the dilation isn't 1," \ + "the kernel_size should be 3.") + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, + stride=2, + padding=0, + ceil_mode=True, + data_format=data_format) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 \ + if dilation == 1 else dilation, + dilation=dilation, + groups=groups, + bias_attr=False, + data_format=data_format) + + self._batch_norm = layers.SyncBatchNorm( + out_channels, data_format=data_format) + self._act_op = layers.Activation(act=act) + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + y = self._act_op(y) + + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + dilation=1, + data_format='NCHW'): + super(BottleneckBlock, self).__init__() + + self.data_format = data_format + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + data_format=data_format) + + self.dilation = dilation + + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + dilation=dilation, + data_format=data_format) + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first or stride == 1 else True, + data_format=data_format) + + self.shortcut = shortcut + # NOTE: Use the wrap layer for quantization training + self.add = layers.Add() + self.relu = layers.Activation(act="relu") + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = self.add(short, conv2) + y = self.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + dilation=1, + shortcut=True, + if_first=False, + data_format='NCHW'): + super(BasicBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + dilation=dilation, + act='relu', + data_format=data_format) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + dilation=dilation, + act=None, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False 
if if_first or stride == 1 else True,
+                data_format=data_format)
+
+        self.shortcut = shortcut
+        self.dilation = dilation
+        self.data_format = data_format
+        self.add = layers.Add()
+        self.relu = layers.Activation(act="relu")
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+        y = self.add(short, conv1)
+        y = self.relu(y)
+
+        return y
+
+
+class ResNet_vd(nn.Layer):
+    """
+    The ResNet_vd implementation based on PaddlePaddle.
+
+    The original article refers to
+    Tong He, et al. "Bag of Tricks for Image Classification with Convolutional Neural Networks"
+    (https://arxiv.org/pdf/1812.01187.pdf).
+
+    Args:
+        layers (int, optional): The depth of ResNet_vd. The supported depths are 18, 34, 50, 101, 152 and 200. Default: 50.
+        output_stride (int, optional): The stride of output features compared to input images. It should be 8 or 16. Default: 8.
+        multi_grid (tuple|list, optional): The dilation grid of stage 4. Default: (1, 1, 1).
+        pretrained (str, optional): The path of the pretrained model.
+
+    """
+
+    def __init__(self,
+                 layers=50,
+                 output_stride=8,
+                 multi_grid=(1, 1, 1),
+                 pretrained=None,
+                 data_format='NCHW'):
+        super(ResNet_vd, self).__init__()
+
+        self.data_format = data_format
+        self.conv1_logit = None  # for gscnn shape stream
+        self.layers = layers
+        supported_layers = [18, 34, 50, 101, 152, 200]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+
+        if layers == 18:
+            depth = [2, 2, 2, 2]
+        elif layers == 34 or layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        elif layers == 200:
+            depth = [3, 12, 48, 3]
+        num_channels = [64, 256, 512,
+                        1024] if layers >= 50 else [64, 64, 128, 256]
+        num_filters = [64, 128, 256, 512]
+
+        # for channels of four returned stages
+        self.feat_channels = [c * 4 for c in num_filters
+                              ] if layers >= 50 else num_filters
+
+        dilation_dict = None
+        if output_stride == 8:
+            dilation_dict = {2: 2, 3: 4}
+        elif output_stride == 16:
+            dilation_dict = {3: 2}
+
+        self.conv1_1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=32,
+            kernel_size=3,
+            stride=2,
+            act='relu',
+            data_format=data_format)
+        self.conv1_2 = ConvBNLayer(
+            in_channels=32,
+            out_channels=32,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            data_format=data_format)
+        self.conv1_3 = ConvBNLayer(
+            in_channels=32,
+            out_channels=64,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            data_format=data_format)
+        self.pool2d_max = nn.MaxPool2D(
+            kernel_size=3, stride=2, padding=1, data_format=data_format)
+
+        # self.block_list = []
+        self.stage_list = []
+        if layers >= 50:
+            for block in range(len(depth)):
+                shortcut = False
+                block_list = []
+                for i in range(depth[block]):
+                    if layers in [101, 152] and block == 2:
+                        if i == 0:
+                            conv_name = "res" + str(block + 2) + "a"
+                        else:
+                            conv_name = "res" + str(block + 2) + "b" + str(i)
+                    else:
+                        conv_name = "res" + str(block + 2) + chr(97 + i)
+
+                    ###############################################################################
+                    # Add dilation rate for some segmentation tasks, if dilation_dict is not None.
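+                    # A concrete example of that mapping (values follow the
+                    # dilation_dict built above from output_stride; shown here
+                    # only as an illustration):
+                    #   output_stride=8  -> stages 2 and 3 use dilation 2 and 4
+                    #                       with stride 1, so features stay at
+                    #                       1/8 of the input resolution;
+                    #   output_stride=16 -> only stage 3 is dilated (rate 2);
+                    #   multi_grid=(1, 2, 4) with output_stride=8 gives the
+                    #   three blocks of stage 3 dilations 4, 8 and 16.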
+ dilation_rate = dilation_dict[ + block] if dilation_dict and block in dilation_dict else 1 + + # Actually block here is 'stage', and i is 'block' in 'stage' + # At the stage 4, expand the the dilation_rate if given multi_grid + if block == 3: + dilation_rate = dilation_rate * multi_grid[i] + ############################################################################### + + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 and + dilation_rate == 1 else 1, + shortcut=shortcut, + if_first=block == i == 0, + dilation=dilation_rate, + data_format=data_format)) + + block_list.append(bottleneck_block) + shortcut = True + self.stage_list.append(block_list) + else: + for block in range(len(depth)): + shortcut = False + block_list = [] + for i in range(depth[block]): + dilation_rate = dilation_dict[block] \ + if dilation_dict and block in dilation_dict else 1 + if block == 3: + dilation_rate = dilation_rate * multi_grid[i] + + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 \ + and dilation_rate == 1 else 1, + dilation=dilation_rate, + shortcut=shortcut, + if_first=block == i == 0, + data_format=data_format)) + block_list.append(basic_block) + shortcut = True + self.stage_list.append(block_list) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + self.conv1_logit = y.clone() + y = self.pool2d_max(y) + + # A feature list saves the output feature map of each stage. + feat_list = [] + for stage in self.stage_list: + for block in stage: + y = block(y) + feat_list.append(y) + + return feat_list + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + +@manager.BACKBONES.add_component +def ResNet18_vd(**args): + model = ResNet_vd(layers=18, **args) + return model + + +def ResNet34_vd(**args): + model = ResNet_vd(layers=34, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet50_vd(**args): + model = ResNet_vd(layers=50, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet101_vd(**args): + model = ResNet_vd(layers=101, **args) + return model + + +def ResNet152_vd(**args): + model = ResNet_vd(layers=152, **args) + return model + + +def ResNet200_vd(**args): + model = ResNet_vd(layers=200, **args) + return model diff --git a/paddleseg/models/backbones/shufflenetv2.py b/paddleseg/models/backbones/shufflenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..a4c4ae6b18468a8cfb15b8b04594d3dcc52f47fc --- /dev/null +++ b/paddleseg/models/backbones/shufflenetv2.py @@ -0,0 +1,315 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +from paddle.nn import Layer, Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm, Linear +from paddle.nn.initializer import KaimingNormal +from paddle.nn.functional import swish + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils, logger + +__all__ = [ + 'ShuffleNetV2_x0_25', 'ShuffleNetV2_x0_33', 'ShuffleNetV2_x0_5', + 'ShuffleNetV2_x1_0', 'ShuffleNetV2_x1_5', 'ShuffleNetV2_x2_0', + 'ShuffleNetV2_swish' +] + + +def channel_shuffle(x, groups): + x_shape = paddle.shape(x) + batch_size, height, width = x_shape[0], x_shape[2], x_shape[3] + num_channels = x.shape[1] + channels_per_group = num_channels // groups + + # reshape + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + + # transpose + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + + # flatten + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + + return x + + +class ConvBNLayer(Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + out_channels, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + act=act, + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InvertedResidual(Layer): + def __init__(self, in_channels, out_channels, stride, act="relu", + name=None): + super(InvertedResidual, self).__init__() + self._conv_pw = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1, x2 = split( + inputs, + num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], + axis=1) + x2 = self._conv_pw(x2) + x2 = self._conv_dw(x2) + x2 = self._conv_linear(x2) + out = concat([x1, x2], axis=1) + return channel_shuffle(out, 2) + + +class InvertedResidualDS(Layer): + def __init__(self, in_channels, out_channels, stride, act="relu", + name=None): + super(InvertedResidualDS, self).__init__() + + # branch1 + self._conv_dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=in_channels, + act=None, + name='stage_' + name + '_conv4') + self._conv_linear_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv5') + # branch2 + self._conv_pw_2 = ConvBNLayer( 
+ in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1 = self._conv_dw_1(inputs) + x1 = self._conv_linear_1(x1) + x2 = self._conv_pw_2(inputs) + x2 = self._conv_dw_2(x2) + x2 = self._conv_linear_2(x2) + out = concat([x1, x2], axis=1) + + return channel_shuffle(out, 2) + + +class ShuffleNet(Layer): + def __init__(self, scale=1.0, act="relu", pretrained=None): + super(ShuffleNet, self).__init__() + self.scale = scale + self.pretrained = pretrained + stage_repeats = [4, 8, 4] + + if scale == 0.25: + stage_out_channels = [-1, 24, 24, 48, 96, 512] + elif scale == 0.33: + stage_out_channels = [-1, 24, 32, 64, 128, 512] + elif scale == 0.5: + stage_out_channels = [-1, 24, 48, 96, 192, 1024] + elif scale == 1.0: + stage_out_channels = [-1, 24, 116, 232, 464, 1024] + elif scale == 1.5: + stage_out_channels = [-1, 24, 176, 352, 704, 1024] + elif scale == 2.0: + stage_out_channels = [-1, 24, 224, 488, 976, 2048] + else: + raise NotImplementedError("This scale size:[" + str(scale) + + "] is not implemented!") + + self.out_index = [3, 11, 15] + self.feat_channels = stage_out_channels[1:5] + + # 1. conv1 + self._conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2, + padding=1, + act=act, + name='stage1_conv') + self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + # 2. 
bottleneck sequences + self._block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidualDS( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2], + stride=2, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + else: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidual( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2], + stride=1, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + self._block_list.append(block) + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, inputs): + feat_list = [] + + y = self._conv1(inputs) + y = self._max_pool(y) + feat_list.append(y) + + for idx, inv in enumerate(self._block_list): + y = inv(y) + if idx in self.out_index: + feat_list.append(y) + return feat_list + + +@manager.BACKBONES.add_component +def ShuffleNetV2_x0_25(**kwargs): + model = ShuffleNet(scale=0.25, **kwargs) + return model + + +@manager.BACKBONES.add_component +def ShuffleNetV2_x0_33(**kwargs): + model = ShuffleNet(scale=0.33, **kwargs) + return model + + +@manager.BACKBONES.add_component +def ShuffleNetV2_x0_5(**kwargs): + model = ShuffleNet(scale=0.5, **kwargs) + return model + + +@manager.BACKBONES.add_component +def ShuffleNetV2_x1_0(**kwargs): + model = ShuffleNet(scale=1.0, **kwargs) + return model + + +@manager.BACKBONES.add_component +def ShuffleNetV2_x1_5(**kwargs): + model = ShuffleNet(scale=1.5, **kwargs) + return model + + +@manager.BACKBONES.add_component +def ShuffleNetV2_x2_0(**kwargs): + model = ShuffleNet(scale=2.0, **kwargs) + return model + + +@manager.BACKBONES.add_component +def ShuffleNetV2_swish(**kwargs): + model = ShuffleNet(scale=1.0, act="swish", **kwargs) + return model diff --git a/paddleseg/models/backbones/stdcnet.py b/paddleseg/models/backbones/stdcnet.py new file mode 100644 index 0000000000000000000000000000000000000000..289f88667b3045b8bc6f86761ba5ce816350f9e9 --- /dev/null +++ b/paddleseg/models/backbones/stdcnet.py @@ -0,0 +1,288 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn + +from paddleseg.utils import utils +from paddleseg.cvlibs import manager, param_init +from paddleseg.models.layers.layer_libs import SyncBatchNorm + +__all__ = ["STDC1", "STDC2"] + + +class STDCNet(nn.Layer): + """ + The STDCNet implementation based on PaddlePaddle. + + The original article refers to Meituan + Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation." + (https://arxiv.org/abs/2104.13188) + + Args: + base(int, optional): base channels. Default: 64. + layers(list, optional): layers numbers list. 
It sets the number of STDC blocks in stages 3, 4 and 5. Default: [4, 5, 3].
+        block_num(int, optional): the number of ConvBNRelu units inside each STDC block. Default: 4.
+        type(str, optional): feature fusion method, "cat" or "add". Default: "cat".
+        relative_lr(float, optional): parameters here receive a different learning rate when updating. The effective
+            learning rate is the product of relative_lr and the global learning rate. Default: 1.0.
+        pretrained(str, optional): the path of the pretrained model.
+    """
+
+    def __init__(self,
+                 base=64,
+                 layers=[4, 5, 3],
+                 block_num=4,
+                 type="cat",
+                 relative_lr=1.0,
+                 pretrained=None):
+        super(STDCNet, self).__init__()
+        if type == "cat":
+            block = CatBottleneck
+        elif type == "add":
+            block = AddBottleneck
+        self.layers = layers
+        self.feat_channels = [base // 2, base, base * 4, base * 8, base * 16]
+        self.features = self._make_layers(base, layers, block_num, block, relative_lr)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """
+        Forward function for feature extraction.
+        """
+        out_feats = []
+
+        x = self.features[0](x)
+        out_feats.append(x)
+        x = self.features[1](x)
+        out_feats.append(x)
+
+        idx = [[2, 2 + self.layers[0]],
+               [2 + self.layers[0], 2 + sum(self.layers[0:2])],
+               [2 + sum(self.layers[0:2]), 2 + sum(self.layers)]]
+        for start_idx, end_idx in idx:
+            for i in range(start_idx, end_idx):
+                x = self.features[i](x)
+            out_feats.append(x)
+
+        return out_feats
+
+    def _make_layers(self, base, layers, block_num, block, relative_lr):
+        features = []
+        features += [ConvBNRelu(3, base // 2, 3, 2, relative_lr)]
+        features += [ConvBNRelu(base // 2, base, 3, 2, relative_lr)]
+
+        for i, layer in enumerate(layers):
+            for j in range(layer):
+                if i == 0 and j == 0:
+                    features.append(block(base, base * 4, block_num, 2, relative_lr))
+                elif j == 0:
+                    features.append(
+                        block(base * int(math.pow(2, i + 1)), base * int(
+                            math.pow(2, i + 2)), block_num, 2, relative_lr))
+                else:
+                    features.append(
+                        block(base * int(math.pow(2, i + 2)), base * int(
+                            math.pow(2, i + 2)), block_num, 1, relative_lr))
+
+        return nn.Sequential(*features)
+
+    def init_weight(self):
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                param_init.normal_init(layer.weight, std=0.001)
+            elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(layer.weight, value=1.0)
+                param_init.constant_init(layer.bias, value=0.0)
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+class ConvBNRelu(nn.Layer):
+    def __init__(self, in_planes, out_planes, kernel=3, stride=1, relative_lr=1.0):
+        super(ConvBNRelu, self).__init__()
+        param_attr = paddle.ParamAttr(learning_rate=relative_lr)
+        self.conv = nn.Conv2D(
+            in_planes,
+            out_planes,
+            kernel_size=kernel,
+            stride=stride,
+            padding=kernel // 2,
+            weight_attr=param_attr,
+            bias_attr=False)
+        self.bn = nn.BatchNorm2D(
+            out_planes,
+            weight_attr=param_attr,
+            bias_attr=param_attr)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = self.relu(self.bn(self.conv(x)))
+        return out
+
+
+class AddBottleneck(nn.Layer):
+    def __init__(self, in_planes, out_planes, block_num=3, stride=1, relative_lr=1.0):
+        super(AddBottleneck, self).__init__()
+        assert block_num > 1, "block number should be larger than 1."
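+        # Worked example (an illustration, not extra configuration): with
+        # block_num=4 and out_planes=256, the convs built below output
+        # 128, 64, 32 and 32 channels; forward() concatenates them back to
+        # the full 256 channels and sums the result with the shortcut, which
+        # is depthwise-downsampled when stride == 2.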
+        self.conv_list = nn.LayerList()
+        self.stride = stride
+        param_attr = paddle.ParamAttr(learning_rate=relative_lr)
+        if stride == 2:
+            self.avd_layer = nn.Sequential(
+                nn.Conv2D(
+                    out_planes // 2,
+                    out_planes // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    groups=out_planes // 2,
+                    weight_attr=param_attr,
+                    bias_attr=False),
+                nn.BatchNorm2D(out_planes // 2, weight_attr=param_attr, bias_attr=param_attr))
+            self.skip = nn.Sequential(
+                nn.Conv2D(
+                    in_planes,
+                    in_planes,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    groups=in_planes,
+                    weight_attr=param_attr,
+                    bias_attr=False),
+                nn.BatchNorm2D(in_planes, weight_attr=param_attr, bias_attr=param_attr),
+                nn.Conv2D(
+                    in_planes, out_planes, kernel_size=1, bias_attr=False,
+                    weight_attr=param_attr),
+                nn.BatchNorm2D(out_planes, weight_attr=param_attr, bias_attr=param_attr))
+            stride = 1
+
+        for idx in range(block_num):
+            if idx == 0:
+                self.conv_list.append(
+                    ConvBNRelu(
+                        in_planes, out_planes // 2, kernel=1, relative_lr=relative_lr))
+            elif idx == 1 and block_num == 2:
+                self.conv_list.append(
+                    ConvBNRelu(
+                        out_planes // 2, out_planes // 2, stride=stride, relative_lr=relative_lr))
+            elif idx == 1 and block_num > 2:
+                self.conv_list.append(
+                    ConvBNRelu(
+                        out_planes // 2, out_planes // 4, stride=stride, relative_lr=relative_lr))
+            elif idx < block_num - 1:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes
+                               // int(math.pow(2, idx + 1)), relative_lr=relative_lr))
+            else:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes
+                               // int(math.pow(2, idx)), relative_lr=relative_lr))
+
+    def forward(self, x):
+        out_list = []
+        out = x
+        for idx, conv in enumerate(self.conv_list):
+            if idx == 0 and self.stride == 2:
+                out = self.avd_layer(conv(out))
+            else:
+                out = conv(out)
+            out_list.append(out)
+        if self.stride == 2:
+            x = self.skip(x)
+        return paddle.concat(out_list, axis=1) + x
+
+
+class CatBottleneck(nn.Layer):
+    def __init__(self, in_planes, out_planes, block_num=3, stride=1, relative_lr=1.0):
+        super(CatBottleneck, self).__init__()
+        assert block_num > 1, "block number should be larger than 1."
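+        # Same channel plan as AddBottleneck (e.g. 128 + 64 + 32 + 32 = 256
+        # for block_num=4 and out_planes=256), but fused by concatenation:
+        # the first 1x1 output is kept (average-pooled when stride == 2) and
+        # concatenated with the outputs of the remaining convs in forward().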
+ self.conv_list = nn.LayerList() + self.stride = stride + param_attr = paddle.ParamAttr(learning_rate=relative_lr) + if stride == 2: + self.avd_layer = nn.Sequential( + nn.Conv2D( + out_planes // 2, + out_planes // 2, + kernel_size=3, + stride=2, + padding=1, + groups=out_planes // 2, + weight_attr=param_attr, + bias_attr=False), + nn.BatchNorm2D(out_planes // 2, weight_attr=param_attr, bias_attr=param_attr), ) + self.skip = nn.AvgPool2D(kernel_size=3, stride=2, padding=1) + stride = 1 + + for idx in range(block_num): + if idx == 0: + self.conv_list.append( + ConvBNRelu( + in_planes, out_planes // 2, kernel=1, relative_lr=relative_lr)) + elif idx == 1 and block_num == 2: + self.conv_list.append( + ConvBNRelu( + out_planes // 2, out_planes // 2, stride=stride, relative_lr=relative_lr)) + elif idx == 1 and block_num > 2: + self.conv_list.append( + ConvBNRelu( + out_planes // 2, out_planes // 4, stride=stride, relative_lr=relative_lr)) + elif idx < block_num - 1: + self.conv_list.append( + ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes + // int(math.pow(2, idx + 1)), relative_lr=relative_lr)) + else: + self.conv_list.append( + ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes + // int(math.pow(2, idx)), relative_lr=relative_lr)) + + def forward(self, x): + out_list = [] + out1 = self.conv_list[0](x) + for idx, conv in enumerate(self.conv_list[1:]): + if idx == 0: + if self.stride == 2: + out = conv(self.avd_layer(out1)) + else: + out = conv(out1) + else: + out = conv(out) + out_list.append(out) + + if self.stride == 2: + out1 = self.skip(out1) + out_list.insert(0, out1) + out = paddle.concat(out_list, axis=1) + return out + + +@manager.BACKBONES.add_component +def STDC2(**kwargs): + model = STDCNet(base=64, layers=[4, 5, 3], **kwargs) + return model + + +@manager.BACKBONES.add_component +def STDC1(**kwargs): + model = STDCNet(base=64, layers=[2, 2, 2], **kwargs) + return model diff --git a/paddleseg/models/backbones/swin_transformer.py b/paddleseg/models/backbones/swin_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..12dc71a76dbf80d2f502af3da12a961c7d346400 --- /dev/null +++ b/paddleseg/models/backbones/swin_transformer.py @@ -0,0 +1,791 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
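+
+# A minimal usage sketch (illustrative only; the exact constructor arguments
+# are documented on the SwinTransformer class below, whose defaults follow a
+# Swin-T style configuration):
+#
+#   import paddle
+#   from paddleseg.models.backbones.swin_transformer import SwinTransformer
+#
+#   backbone = SwinTransformer()
+#   feats = backbone(paddle.randn([1, 3, 512, 512]))
+#   # feats holds one feature map per selected stage, downsampled by
+#   # 4x, 8x, 16x and 32x relative to the input.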
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils +from paddleseg.models.backbones.transformer_utils import * + + +class Mlp(nn.Layer): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, + 5]).reshape([-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape( + [B, H // window_size, W // window_size, window_size, window_size, -1]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) + return x + + +class WindowAttention(nn.Layer): + """ + Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads), + default_initializer=zeros_) + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + + relative_coords = relative_coords.transpose([1, 2, 0]) + + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + + index = self.relative_position_index.reshape([-1]) + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Layer): + """ + Swin Transformer Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. 
Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + + x = x.transpose([0, 3, 1, 2]) + x = F.pad(x, [pad_l, pad_r, pad_t, pad_b]) + x = x.transpose([0, 2, 3, 1]) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, + C]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape( + [-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, + Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :] + + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Layer): + """ + Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.reshape([B, H, W, C]) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = x.transpose([0, 3, 1, 2]) + x = F.pad(x, [0, W % 2, 0, H % 2]) + x = x.transpose([0, 2, 3, 1]) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Layer): + """ + A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels. + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
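+
+        Note:
+            The SW-MSA attention mask is rebuilt here from H and W: grid
+            cells are labelled by the (h, w) region they occupy after the
+            cyclic shift, and window positions whose labels differ receive
+            a -100.0 additive bias, which drives their softmax attention
+            weights to approximately zero.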
+ """ + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = paddle.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape( + [-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + + for blk in self.blocks: + blk.H, blk.W = H, W + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Layer): + """ + Image to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.shape + if W % self.patch_size[1] != 0: + x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) + if H % self.patch_size[0] != 0: + x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + _, _, Wh, Ww = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) + + return x + + +@manager.BACKBONES.add_component +class SwinTransformer(nn.Layer): + """ + The SwinTransformer implementation based on PaddlePaddle. + + The original article refers to + Liu, Ze, et al. "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows" + (https://arxiv.org/abs/2103.14030) + + Args: + pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default: 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. Default: -1. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + pretrained=None): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1] + ] + + self.absolute_pos_embed = self.create_parameter( + shape=(1, embed_dim, patches_resolution[0], + patches_resolution[1]), + default_initializer=zeros_) + self.add_parameter("absolute_pos_embed", self.absolute_pos_embed) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = np.linspace(0, drop_path_rate, sum(depths)).tolist() + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if (i_layer < self.num_layers - 1) else None) + self.layers.append(layer) + + feat_channels = [int(embed_dim * 2**i) for i in range(self.num_layers)] + self.feat_channels = feat_channels + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(feat_channels[i_layer]) + layer_name = f'norm{i_layer}' + self.add_sublayer(layer_name, layer) + + self._freeze_stages() + + self.pretrained = pretrained + self.init_weights(self.pretrained) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + layer = self.layers[i] + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, 
pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + else: + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Linear): + trunc_normal_(sublayer.weight) + if isinstance(sublayer, + nn.Linear) and sublayer.bias is not None: + zeros_(sublayer.bias) + elif isinstance(sublayer, nn.LayerNorm): + zeros_(sublayer.bias) + ones_(sublayer.weight) + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + _, _, Wh, Ww = x.shape + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose([0, 2, 1]) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x_out) + + out = x_out.reshape( + [-1, H, W, self.feat_channels[i]]).transpose([0, 3, 1, 2]) + outs.append(out) + + return tuple(outs) + + def train(self): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train() + self._freeze_stages() + + +@manager.BACKBONES.add_component +def SwinTransformer_tiny_patch4_window7_224(**kwargs): + model = SwinTransformer( + pretrain_img_size=224, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + **kwargs) + + return model + + +@manager.BACKBONES.add_component +def SwinTransformer_small_patch4_window7_224(**kwargs): + model = SwinTransformer( + pretrain_img_size=224, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + **kwargs) + + return model + + +@manager.BACKBONES.add_component +def SwinTransformer_base_patch4_window7_224(**kwargs): + model = SwinTransformer( + pretrain_img_size=224, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + **kwargs) + + return model + + +@manager.BACKBONES.add_component +def SwinTransformer_base_patch4_window12_384(**kwargs): + model = SwinTransformer( + pretrain_img_size=384, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + **kwargs) + + return model + + +@manager.BACKBONES.add_component +def SwinTransformer_large_patch4_window7_224(**kwargs): + model = SwinTransformer( + pretrain_img_size=224, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + **kwargs) + + return model + + +@manager.BACKBONES.add_component +def SwinTransformer_large_patch4_window12_384(**kwargs): + model = SwinTransformer( + pretrain_img_size=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + **kwargs) + + return model diff --git a/paddleseg/models/backbones/transformer_utils.py b/paddleseg/models/backbones/transformer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..db3e5361b56f14219f57f04d17339c4579e98a45 --- /dev/null +++ b/paddleseg/models/backbones/transformer_utils.py @@ -0,0 +1,83 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.initializer as paddle_init + +__all__ = [ + 'to_2tuple', 'DropPath', 'Identity', 'trunc_normal_', 'zeros_', 'ones_', + 'init_weights' +] + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +trunc_normal_ = paddle_init.TruncatedNormal(std=.02) +zeros_ = paddle_init.Constant(value=0.) +ones_ = paddle_init.Constant(value=1.) + + +def init_weights(layer): + """ + Init the weights of transformer. + Args: + layer(nn.Layer): The layer to init weights. + Returns: + None + """ + if isinstance(layer, nn.Linear): + trunc_normal_(layer.weight) + if layer.bias is not None: + zeros_(layer.bias) + elif isinstance(layer, nn.LayerNorm): + zeros_(layer.bias) + ones_(layer.weight) diff --git a/paddleseg/models/backbones/vision_transformer.py b/paddleseg/models/backbones/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6b21601d8788bb190c98f443f3ec10e47b557ca1 --- /dev/null +++ b/paddleseg/models/backbones/vision_transformer.py @@ -0,0 +1,407 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
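+
+# A rough usage sketch for the ViT backbones defined below (illustrative
+# only; 'path/to/vit.pdparams' is a placeholder, and a valid checkpoint is
+# assumed since init_weight() unconditionally loads and resizes pos_embed
+# from `pretrained`):
+#
+#     import paddle
+#     from paddleseg.models.backbones import ViT_base_patch16_224
+#
+#     model = ViT_base_patch16_224(pretrained='path/to/vit.pdparams')
+#     feats, x_shape = model(paddle.randn([1, 3, 224, 224]))
+#     # feats holds one [1, 196, 768] token tensor per block (class token
+#     # stripped); x_shape is the [N, C, H, W] shape of the patch grid.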
+ +import os +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils, logger +from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath, Identity + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + x_shape = paddle.shape(x) + N, C = x_shape[1], x_shape[2] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + self.img_size = to_2tuple(img_size) + self.patch_size = to_2tuple(patch_size) + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + @property + def num_patches_in_h(self): + return self.img_size[1] // self.patch_size[1] + + @property + def num_patches_in_w(self): + return self.img_size[0] // self.patch_size[0] + + def forward(self, x): + x = self.proj(x) + return x + + +@manager.BACKBONES.add_component +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + final_norm=False, + pretrained=None, + **args): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + self.pos_w = self.patch_embed.num_patches_in_w + self.pos_h = self.patch_embed.num_patches_in_h + + self.pos_embed = self.create_parameter( + shape=(1, self.pos_w * self.pos_h + 1, embed_dim), + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon) for i in range(depth) + ]) + + self.final_norm = final_norm + if self.final_norm: + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + # load and resize pos_embed + model_path = self.pretrained + if not os.path.exists(model_path): + model_path = utils.download_pretrained_model(model_path) + + load_state_dict = paddle.load(model_path) + model_state_dict = self.state_dict() + pos_embed_name = "pos_embed" + if pos_embed_name in load_state_dict.keys(): + load_pos_embed = paddle.to_tensor( + load_state_dict[pos_embed_name], dtype="float32") + if self.pos_embed.shape != load_pos_embed.shape: + pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) + model_state_dict[pos_embed_name] = self.resize_pos_embed( + load_pos_embed, (pos_size, pos_size), + (self.pos_h, self.pos_w)) + self.set_dict(model_state_dict) + logger.info("Load pos_embed and resize it from {} to {} .". + format(load_pos_embed.shape, self.pos_embed.shape)) + + def resize_pos_embed(self, pos_embed, old_hw, new_hw): + """ + Resize pos_embed weight. 
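+        The class token embedding is kept unchanged; the remaining grid
+        embeddings are reshaped to a 2D map and bicubically interpolated to
+        the new size (e.g. the 14x14 grid of a 224/16 checkpoint becomes a
+        24x24 grid when fine-tuning at 384x384).
+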
+ Args: + pos_embed (Tensor): the pos_embed weight + old_hw (list[int]): the height and width of old pos_embed + new_hw (list[int]): the height and width of new pos_embed + Returns: + Tensor: the resized pos_embed weight + """ + cls_pos_embed = pos_embed[:, :1, :] + pos_embed = pos_embed[:, 1:, :] + + pos_embed = pos_embed.transpose([0, 2, 1]) + pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) + pos_embed = F.interpolate( + pos_embed, new_hw, mode='bicubic', align_corners=False) + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) + pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) + + return pos_embed + + def forward(self, x): + x = self.patch_embed(x) + x_shape = paddle.shape(x) # b * c * h * w + + cls_tokens = self.cls_token.expand((x_shape[0], -1, -1)) + x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c + x = paddle.concat([cls_tokens, x], axis=1) + + if paddle.shape(x)[1] == self.pos_embed.shape[1]: + x = x + self.pos_embed + else: + x = x + self.resize_pos_embed(self.pos_embed, + (self.pos_h, self.pos_w), x_shape[2:]) + x = self.pos_drop(x) + + res = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + if self.final_norm and idx == len(self.blocks) - 1: + x = self.norm(x) + res.append(x[:, 1:, :]) + + return res, x_shape + + +@manager.BACKBONES.add_component +def ViT_small_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=8, + num_heads=8, + mlp_ratio=3, + qk_scale=768**-0.5, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_base_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_base_patch16_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_base_patch32_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_large_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_large_patch16_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_large_patch32_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_huge_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1280, + depth=32, + num_heads=16, + mlp_ratio=4, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_huge_patch32_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1280, + depth=32, + num_heads=16, + mlp_ratio=4, + **kwargs) + return model diff --git a/paddleseg/models/backbones/xception_deeplab.py 
b/paddleseg/models/backbones/xception_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..62beedcbd65a61ffebc4c48e765237180cfa8f40 --- /dev/null +++ b/paddleseg/models/backbones/xception_deeplab.py @@ -0,0 +1,414 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.utils import utils +from paddleseg.models import layers + +__all__ = ["Xception41_deeplab", "Xception65_deeplab", "Xception71_deeplab"] + + +def check_data(data, number): + if type(data) == int: + return [data] * number + assert len(data) == number + return data + + +def check_stride(s, os): + if s <= os: + return True + else: + return False + + +def check_points(count, points): + if points is None: + return False + else: + if isinstance(points, list): + return (True if count in points else False) + else: + return (True if count == points else False) + + +def gen_bottleneck_params(backbone='xception_65'): + if backbone == 'xception_65': + bottleneck_params = { + "entry_flow": (3, [2, 2, 2], [128, 256, 728]), + "middle_flow": (16, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + elif backbone == 'xception_41': + bottleneck_params = { + "entry_flow": (3, [2, 2, 2], [128, 256, 728]), + "middle_flow": (8, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + elif backbone == 'xception_71': + bottleneck_params = { + "entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]), + "middle_flow": (16, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + else: + raise ValueError( + "Xception backbont only support xception_41/xception_65/xception_71") + return bottleneck_params + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + padding=0, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + bias_attr=False) + self._bn = layers.SyncBatchNorm( + num_features=output_channels, epsilon=1e-3, momentum=0.99) + + self._act_op = layers.Activation(act=act) + + def forward(self, inputs): + return self._act_op(self._bn(self._conv(inputs))) + + +class Seperate_Conv(nn.Layer): + def __init__(self, + input_channels, + output_channels, + stride, + filter, + dilation=1, + act=None, + name=None): + super(Seperate_Conv, self).__init__() + + self._conv1 = nn.Conv2D( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=filter, + stride=stride, + groups=input_channels, + padding=(filter) // 2 * dilation, + dilation=dilation, + bias_attr=False) + self._bn1 = layers.SyncBatchNorm( + input_channels, epsilon=1e-3, momentum=0.99) + + self._act_op1 = layers.Activation(act=act) + + self._conv2 = nn.Conv2D( + input_channels, 
+ output_channels, + 1, + stride=1, + groups=1, + padding=0, + bias_attr=False) + self._bn2 = layers.SyncBatchNorm( + output_channels, epsilon=1e-3, momentum=0.99) + + self._act_op2 = layers.Activation(act=act) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._bn1(x) + x = self._act_op1(x) + x = self._conv2(x) + x = self._bn2(x) + x = self._act_op2(x) + return x + + +class Xception_Block(nn.Layer): + def __init__(self, + input_channels, + output_channels, + strides=1, + filter_size=3, + dilation=1, + skip_conv=True, + has_skip=True, + activation_fn_in_separable_conv=False, + name=None): + super(Xception_Block, self).__init__() + + repeat_number = 3 + output_channels = check_data(output_channels, repeat_number) + filter_size = check_data(filter_size, repeat_number) + strides = check_data(strides, repeat_number) + + self.has_skip = has_skip + self.skip_conv = skip_conv + self.activation_fn_in_separable_conv = activation_fn_in_separable_conv + if not activation_fn_in_separable_conv: + self._conv1 = Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + dilation=dilation, + name=name + "/separable_conv3") + else: + self._conv1 = Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + act="relu", + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + act="relu", + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + act="relu", + dilation=dilation, + name=name + "/separable_conv3") + + if has_skip and skip_conv: + self._short = ConvBNLayer( + input_channels, + output_channels[-1], + 1, + stride=strides[-1], + padding=0, + name=name + "/shortcut") + + def forward(self, inputs): + if not self.activation_fn_in_separable_conv: + x = F.relu(inputs) + x = self._conv1(x) + x = F.relu(x) + x = self._conv2(x) + x = F.relu(x) + x = self._conv3(x) + else: + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + if self.has_skip is False: + return x + if self.skip_conv: + skip = self._short(inputs) + else: + skip = inputs + return x + skip + + +class XceptionDeeplab(nn.Layer): + """ + The Xception backobne of DeepLabv3+ implementation based on PaddlePaddle. + + The original article refers to + Liang-Chieh Chen, et, al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation" + (https://arxiv.org/abs/1802.02611) + + Args: + backbone (str): Which type of Xception_DeepLab to select. It should be one of ('xception_41', 'xception_65', 'xception_71'). + pretrained (str, optional): The path of pretrained model. + output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 16. 
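+
+        Examples:
+            A minimal sketch (illustrative, not part of the source):
+
+                import paddle
+                from paddleseg.models.backbones import Xception65_deeplab
+
+                backbone = Xception65_deeplab(output_stride=16)
+                feats = backbone(paddle.randn([1, 3, 512, 512]))
+                # feats[0]: low-level features with 128 channels;
+                # feats[1]: high-level features with 2048 channels,
+                # matching backbone.feat_channels == [128, 2048].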
+ + """ + + def __init__(self, backbone, pretrained=None, output_stride=16): + + super(XceptionDeeplab, self).__init__() + + bottleneck_params = gen_bottleneck_params(backbone) + self.backbone = backbone + self.feat_channels = [128, 2048] + + self._conv1 = ConvBNLayer( + 3, + 32, + 3, + stride=2, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv1") + self._conv2 = ConvBNLayer( + 32, + 64, + 3, + stride=1, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv2") + """ + bottleneck_params = { + "entry_flow": (3, [2, 2, 2], [128, 256, 728]), + "middle_flow": (16, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + + if output_stride == 16: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block_dilations = (1, 2) + elif output_stride == 8: + entry_block3_stride = 1 + middle_block_dilation = 2 + exit_block_dilations = (2, 4) + + """ + self.block_num = bottleneck_params["entry_flow"][0] + self.strides = bottleneck_params["entry_flow"][1] + self.chns = bottleneck_params["entry_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + + self.entry_flow = [] + self.middle_flow = [] + + self.stride = 2 + self.output_stride = output_stride + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/entry_flow/block" + str(i + 1), + Xception_Block( + input_channels=64 if i == 0 else self.chns[i - 1], + output_channels=self.chns[i], + strides=[1, 1, self.stride], + name=self.backbone + "/entry_flow/block" + str(i + 1))) + self.entry_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["middle_flow"][0] + self.strides = bottleneck_params["middle_flow"][1] + self.chns = bottleneck_params["middle_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/middle_flow/block" + str(i + 1), + Xception_Block( + input_channels=728, + output_channels=728, + strides=[1, 1, self.strides[i]], + skip_conv=False, + name=self.backbone + "/middle_flow/block" + str(i + 1))) + self.middle_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["exit_flow"][0] + self.strides = bottleneck_params["exit_flow"][1] + self.chns = bottleneck_params["exit_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + stride = self.strides[0] if check_stride(s * self.strides[0], + self.output_stride) else 1 + self._exit_flow_1 = Xception_Block( + 728, + self.chns[0], [1, 1, stride], + name=self.backbone + "/exit_flow/block1") + s = s * stride + stride = self.strides[1] if check_stride(s * self.strides[1], + self.output_stride) else 1 + self._exit_flow_2 = Xception_Block( + self.chns[0][-1], + self.chns[1], [1, 1, stride], + dilation=2, + has_skip=False, + activation_fn_in_separable_conv=True, + name=self.backbone + "/exit_flow/block2") + + self.pretrained = pretrained + self.init_weight() + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + feat_list = [] + for i, ef in 
enumerate(self.entry_flow): + x = ef(x) + if i == 0: + feat_list.append(x) + for mf in self.middle_flow: + x = mf(x) + x = self._exit_flow_1(x) + x = self._exit_flow_2(x) + feat_list.append(x) + return feat_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +@manager.BACKBONES.add_component +def Xception41_deeplab(**args): + model = XceptionDeeplab('xception_41', **args) + return model + + +@manager.BACKBONES.add_component +def Xception65_deeplab(**args): + model = XceptionDeeplab("xception_65", **args) + return model + + +@manager.BACKBONES.add_component +def Xception71_deeplab(**args): + model = XceptionDeeplab("xception_71", **args) + return model diff --git a/paddleseg/models/bisenet.py b/paddleseg/models/bisenet.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a573bef4123d8452566cf0e49937334f79aebe --- /dev/null +++ b/paddleseg/models/bisenet.py @@ -0,0 +1,315 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg import utils +from paddleseg.cvlibs import manager, param_init +from paddleseg.models import layers + + +@manager.MODELS.add_component +class BiSeNetV2(nn.Layer): + """ + The BiSeNet V2 implementation based on PaddlePaddle. + + The original article refers to + Yu, Changqian, et al. "BiSeNet V2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation" + (https://arxiv.org/abs/2004.02147) + + Args: + num_classes (int): The unique number of target classes. + lambd (float, optional): A factor for controlling the size of semantic branch channels. Default: 0.25. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
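+        align_corners (bool, optional): An argument of F.interpolate, used when
+            upsampling the logits back to the input size. Default: False.
+
+        A minimal usage sketch (illustrative, not part of the source):
+
+            import paddle
+            from paddleseg.models import BiSeNetV2
+
+            model = BiSeNetV2(num_classes=19)
+            logit_list = model(paddle.randn([1, 3, 512, 1024]))
+            # logit_list[0] is the main prediction, upsampled to the input
+            # size; in training mode four auxiliary logits follow it.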
+ """ + + def __init__(self, + num_classes, + lambd=0.25, + align_corners=False, + pretrained=None): + super().__init__() + + C1, C2, C3 = 64, 64, 128 + db_channels = (C1, C2, C3) + C1, C3, C4, C5 = int(C1 * lambd), int(C3 * lambd), 64, 128 + sb_channels = (C1, C3, C4, C5) + mid_channels = 128 + + self.db = DetailBranch(db_channels) + self.sb = SemanticBranch(sb_channels) + + self.bga = BGA(mid_channels, align_corners) + self.aux_head1 = SegHead(C1, C1, num_classes) + self.aux_head2 = SegHead(C3, C3, num_classes) + self.aux_head3 = SegHead(C4, C4, num_classes) + self.aux_head4 = SegHead(C5, C5, num_classes) + self.head = SegHead(mid_channels, mid_channels, num_classes) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + dfm = self.db(x) + feat1, feat2, feat3, feat4, sfm = self.sb(x) + logit = self.head(self.bga(dfm, sfm)) + + if not self.training: + logit_list = [logit] + else: + logit1 = self.aux_head1(feat1) + logit2 = self.aux_head2(feat2) + logit3 = self.aux_head3(feat3) + logit4 = self.aux_head4(feat4) + logit_list = [logit, logit1, logit2, logit3, logit4] + + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + else: + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + param_init.kaiming_normal_init(sublayer.weight) + elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(sublayer.weight, value=1.0) + param_init.constant_init(sublayer.bias, value=0.0) + + +class StemBlock(nn.Layer): + def __init__(self, in_dim, out_dim): + super(StemBlock, self).__init__() + + self.conv = layers.ConvBNReLU(in_dim, out_dim, 3, stride=2) + + self.left = nn.Sequential( + layers.ConvBNReLU(out_dim, out_dim // 2, 1), + layers.ConvBNReLU( + out_dim // 2, out_dim, 3, stride=2)) + + self.right = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.fuse = layers.ConvBNReLU(out_dim * 2, out_dim, 3) + + def forward(self, x): + x = self.conv(x) + left = self.left(x) + right = self.right(x) + concat = paddle.concat([left, right], axis=1) + return self.fuse(concat) + + +class ContextEmbeddingBlock(nn.Layer): + def __init__(self, in_dim, out_dim): + super(ContextEmbeddingBlock, self).__init__() + + self.gap = nn.AdaptiveAvgPool2D(1) + self.bn = layers.SyncBatchNorm(in_dim) + + self.conv_1x1 = layers.ConvBNReLU(in_dim, out_dim, 1) + self.add = layers.Add() + self.conv_3x3 = nn.Conv2D(out_dim, out_dim, 3, 1, 1) + + def forward(self, x): + gap = self.gap(x) + bn = self.bn(gap) + conv1 = self.add(self.conv_1x1(bn), x) + return self.conv_3x3(conv1) + + +class GatherAndExpansionLayer1(nn.Layer): + """Gather And Expansion Layer with stride 1""" + + def __init__(self, in_dim, out_dim, expand): + super().__init__() + + expand_dim = expand * in_dim + + self.conv = nn.Sequential( + layers.ConvBNReLU(in_dim, in_dim, 3), + layers.DepthwiseConvBN(in_dim, expand_dim, 3), + layers.ConvBN(expand_dim, out_dim, 1)) + self.relu = layers.Activation("relu") + + def forward(self, x): + return self.relu(self.conv(x) + x) + + +class GatherAndExpansionLayer2(nn.Layer): + """Gather And Expansion Layer with stride 2""" + + def __init__(self, in_dim, out_dim, expand): + super().__init__() + + expand_dim = expand * in_dim + + self.branch_1 = nn.Sequential( + layers.ConvBNReLU(in_dim, in_dim, 3), 
+ layers.DepthwiseConvBN( + in_dim, expand_dim, 3, stride=2), + layers.DepthwiseConvBN(expand_dim, expand_dim, 3), + layers.ConvBN(expand_dim, out_dim, 1)) + + self.branch_2 = nn.Sequential( + layers.DepthwiseConvBN( + in_dim, in_dim, 3, stride=2), + layers.ConvBN(in_dim, out_dim, 1)) + + self.relu = layers.Activation("relu") + + def forward(self, x): + return self.relu(self.branch_1(x) + self.branch_2(x)) + + +class DetailBranch(nn.Layer): + """The detail branch of BiSeNet, which has wide channels but shallow layers.""" + + def __init__(self, in_channels): + super().__init__() + + C1, C2, C3 = in_channels + + self.convs = nn.Sequential( + # stage 1 + layers.ConvBNReLU( + 3, C1, 3, stride=2), + layers.ConvBNReLU(C1, C1, 3), + # stage 2 + layers.ConvBNReLU( + C1, C2, 3, stride=2), + layers.ConvBNReLU(C2, C2, 3), + layers.ConvBNReLU(C2, C2, 3), + # stage 3 + layers.ConvBNReLU( + C2, C3, 3, stride=2), + layers.ConvBNReLU(C3, C3, 3), + layers.ConvBNReLU(C3, C3, 3), ) + + def forward(self, x): + return self.convs(x) + + +class SemanticBranch(nn.Layer): + """The semantic branch of BiSeNet, which has narrow channels but deep layers.""" + + def __init__(self, in_channels): + super().__init__() + C1, C3, C4, C5 = in_channels + + self.stem = StemBlock(3, C1) + + self.stage3 = nn.Sequential( + GatherAndExpansionLayer2(C1, C3, 6), + GatherAndExpansionLayer1(C3, C3, 6)) + + self.stage4 = nn.Sequential( + GatherAndExpansionLayer2(C3, C4, 6), + GatherAndExpansionLayer1(C4, C4, 6)) + + self.stage5_4 = nn.Sequential( + GatherAndExpansionLayer2(C4, C5, 6), + GatherAndExpansionLayer1(C5, C5, 6), + GatherAndExpansionLayer1(C5, C5, 6), + GatherAndExpansionLayer1(C5, C5, 6)) + + self.ce = ContextEmbeddingBlock(C5, C5) + + def forward(self, x): + stage2 = self.stem(x) + stage3 = self.stage3(stage2) + stage4 = self.stage4(stage3) + stage5_4 = self.stage5_4(stage4) + fm = self.ce(stage5_4) + return stage2, stage3, stage4, stage5_4, fm + + +class BGA(nn.Layer): + """The Bilateral Guided Aggregation Layer, used to fuse the semantic features and spatial features.""" + + def __init__(self, out_dim, align_corners): + super().__init__() + + self.align_corners = align_corners + + self.db_branch_keep = nn.Sequential( + layers.DepthwiseConvBN(out_dim, out_dim, 3), + nn.Conv2D(out_dim, out_dim, 1)) + + self.db_branch_down = nn.Sequential( + layers.ConvBN( + out_dim, out_dim, 3, stride=2), + nn.AvgPool2D( + kernel_size=3, stride=2, padding=1)) + + self.sb_branch_keep = nn.Sequential( + layers.DepthwiseConvBN(out_dim, out_dim, 3), + nn.Conv2D(out_dim, out_dim, 1), + layers.Activation(act='sigmoid')) + + self.sb_branch_up = layers.ConvBN(out_dim, out_dim, 3) + + self.conv = layers.ConvBN(out_dim, out_dim, 3) + + def forward(self, dfm, sfm): + db_feat_keep = self.db_branch_keep(dfm) + db_feat_down = self.db_branch_down(dfm) + sb_feat_keep = self.sb_branch_keep(sfm) + + sb_feat_up = self.sb_branch_up(sfm) + sb_feat_up = F.interpolate( + sb_feat_up, + paddle.shape(db_feat_keep)[2:], + mode='bilinear', + align_corners=self.align_corners) + + sb_feat_up = F.sigmoid(sb_feat_up) + db_feat = db_feat_keep * sb_feat_up + + sb_feat = db_feat_down * sb_feat_keep + sb_feat = F.interpolate( + sb_feat, + paddle.shape(db_feat)[2:], + mode='bilinear', + align_corners=self.align_corners) + + return self.conv(db_feat + sb_feat) + + +class SegHead(nn.Layer): + def __init__(self, in_dim, mid_dim, num_classes): + super().__init__() + + self.conv_3x3 = nn.Sequential( + layers.ConvBNReLU(in_dim, mid_dim, 3), nn.Dropout(0.1)) + + self.conv_1x1 = 
nn.Conv2D(mid_dim, num_classes, 1, 1)
+
+    def forward(self, x):
+        conv1 = self.conv_3x3(x)
+        conv2 = self.conv_1x1(conv1)
+        return conv2
diff --git a/paddleseg/models/bisenetv1.py b/paddleseg/models/bisenetv1.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e7b89762c4711f7c3ca7f65095eddee79bb69aa
--- /dev/null
+++ b/paddleseg/models/bisenetv1.py
@@ -0,0 +1,244 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class BiseNetV1(nn.Layer):
+    """
+    The BiSeNetV1 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
+    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
+        conv_channel (int, optional): The channel width of the context-path convolutions. Default: 128.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
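+
+        A minimal usage sketch (illustrative; the ResNet18_vd backbone is an
+        assumption, chosen to match the 512- and 256-channel context features
+        this model expects):
+
+            import paddle
+            from paddleseg.models import BiseNetV1
+            from paddleseg.models.backbones import ResNet18_vd
+
+            model = BiseNetV1(num_classes=19, backbone=ResNet18_vd())
+            outputs = model(paddle.randn([1, 3, 512, 512]))
+            # training mode: three logit maps (two context heads plus the
+            # fused head); eval mode: only the fused head's logits.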
+ """ + + def __init__(self, num_classes, backbone, conv_channel=128, + pretrained=None): + super().__init__() + self.backbone = backbone + self.spatial_path = SpatialPath(3, 128) + self.global_context = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + layers.ConvBNReLU( + 512, conv_channel, 1, bias_attr=False), ) + + self.arms = nn.LayerList([ + AttentionRefinement(512, conv_channel), + AttentionRefinement(256, conv_channel), + ]) + self.refines = nn.LayerList([ + layers.ConvBNReLU( + conv_channel, + conv_channel, + 3, + stride=1, + padding=1, + bias_attr=False), + layers.ConvBNReLU( + conv_channel, + conv_channel, + 3, + stride=1, + padding=1, + bias_attr=False), + ]) + + self.heads = nn.LayerList([ + BiSeNetHead(conv_channel, num_classes, 8, True), + BiSeNetHead(conv_channel, num_classes, 8, True), + BiSeNetHead(conv_channel * 2, num_classes, 8, False), + ]) + + self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1) + + self.pretrained = pretrained + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + spatial_out = self.spatial_path(x) + context_blocks = self.backbone(x) + context_blocks.reverse() + + global_context = self.global_context(context_blocks[0]) + global_context = F.interpolate( + global_context, + size=paddle.shape(context_blocks[0])[2:], + mode='bilinear', + align_corners=True) + last_fm = global_context + pred_out = [] + + for i, ( + fm, arm, refine + ) in enumerate(zip(context_blocks[:2], self.arms, self.refines)): + fm = arm(fm) + fm += last_fm + last_fm = F.interpolate( + fm, + size=paddle.shape(context_blocks[i + 1])[2:], + mode='bilinear', + align_corners=True) + last_fm = refine(last_fm) + pred_out.append(last_fm) + context_out = last_fm + + concate_fm = self.ffm(spatial_out, context_out) + pred_out.append(concate_fm) + + output = [] + if self.training: + for i, head in enumerate(self.heads): + out = head(pred_out[i]) + output.append(out) + else: + out = self.heads[-1](pred_out[-1]) + output.append(out) + return output + + +class SpatialPath(nn.Layer): + """ + SpatialPath module of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + """ + + def __init__(self, in_channels, out_channels, inner_channel=64): + super().__init__() + self.conv_7x7 = layers.ConvBNReLU( + in_channels, inner_channel, 7, stride=2, padding=3, bias_attr=False) + self.conv_3x3_1 = layers.ConvBNReLU( + inner_channel, + inner_channel, + 3, + stride=2, + padding=1, + bias_attr=False) + self.conv_3x3_2 = layers.ConvBNReLU( + inner_channel, + inner_channel, + 3, + stride=2, + padding=1, + bias_attr=False) + self.conv_1x1 = layers.ConvBNReLU( + inner_channel, out_channels, 1, bias_attr=False) + + def forward(self, x): + x = self.conv_7x7(x) + x = self.conv_3x3_1(x) + x = self.conv_3x3_2(x) + x = self.conv_1x1(x) + return x + + +class BiSeNetHead(nn.Layer): + """ + BiSeNet head of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + scale (int, float): The scale factor of interpolation. 
+ """ + + def __init__(self, in_channels, out_channels, scale, is_aux=False): + super().__init__() + inner_channel = 128 if is_aux else 64 + self.conv_3x3 = layers.ConvBNReLU( + in_channels, inner_channel, 3, stride=1, padding=1, bias_attr=False) + self.conv_1x1 = nn.Conv2D(inner_channel, out_channels, 1) + self.scale = scale + + def forward(self, x): + x = self.conv_3x3(x) + x = self.conv_1x1(x) + if self.scale > 1: + x = F.interpolate( + x, scale_factor=self.scale, mode='bilinear', align_corners=True) + return x + + +class AttentionRefinement(nn.Layer): + """ + AttentionRefinement module of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + """ + + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv_3x3 = layers.ConvBNReLU( + in_channels, out_channels, 3, stride=1, padding=1, bias_attr=False) + self.channel_attention = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + layers.ConvBNReLU( + out_channels, out_channels, 1, bias_attr=False), + nn.Sigmoid(), ) + + def forward(self, x): + x = self.conv_3x3(x) + se = self.channel_attention(x) + x = x * se + return x + + +class FeatureFusion(nn.Layer): + """ + AttentionRefinement module of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + reduction (int): A factor shrinks convolutional channels. Default: 1. + """ + + def __init__(self, in_channels, out_channels, reduction=1): + super().__init__() + self.conv_1x1 = layers.ConvBNReLU( + in_channels, out_channels, 1, bias_attr=False) + self.channel_attention = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + layers.ConvBNReLU( + out_channels, out_channels // reduction, 1, bias_attr=False), + layers.ConvBNReLU( + out_channels // reduction, out_channels, 1, bias_attr=False), + nn.Sigmoid(), ) + + def forward(self, x1, x2): + fm = paddle.concat([x1, x2], axis=1) + fm = self.conv_1x1(fm) + fm_se = self.channel_attention(fm) + output = fm + fm * fm_se + return output diff --git a/paddleseg/models/ccnet.py b/paddleseg/models/ccnet.py new file mode 100644 index 0000000000000000000000000000000000000000..e42154c7ece2368219eea8b8effc204447d64108 --- /dev/null +++ b/paddleseg/models/ccnet.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class CCNet(nn.Layer): + """ + The CCNet implementation based on PaddlePaddle. + + The original article refers to + Zilong Huang, et al. "CCNet: Criss-Cross Attention for Semantic Segmentation" + (https://arxiv.org/abs/1811.11721) + + Args: + num_classes (int): The unique number of target classes. 
+        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, list, optional): Two values in the tuple indicate the indices of the backbone outputs. Default: (2, 3).
+        enable_auxiliary_loss (bool, optional): Whether to add an auxiliary loss. Default: True.
+        dropout_prob (float, optional): The probability of dropout. Default: 0.0.
+        recurrence (int, optional): The number of recurrent operations. Default: 1.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 enable_auxiliary_loss=True,
+                 dropout_prob=0.0,
+                 recurrence=1,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        self.recurrence = recurrence
+        self.align_corners = align_corners
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        if enable_auxiliary_loss:
+            self.aux_head = layers.AuxLayer(
+                backbone_channels[0],
+                512,
+                num_classes,
+                dropout_prob=dropout_prob)
+        self.head = RCCAModule(
+            backbone_channels[1],
+            512,
+            num_classes,
+            dropout_prob=dropout_prob,
+            recurrence=recurrence)
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = []
+        output = self.head(feat_list[self.backbone_indices[-1]])
+        logit_list.append(output)
+        if self.training and self.enable_auxiliary_loss:
+            aux_out = self.aux_head(feat_list[self.backbone_indices[-2]])
+            logit_list.append(aux_out)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+
+class RCCAModule(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_classes,
+                 dropout_prob=0.1,
+                 recurrence=1):
+        super().__init__()
+        inter_channels = in_channels // 4
+        self.recurrence = recurrence
+        self.conva = layers.ConvBNLeakyReLU(
+            in_channels, inter_channels, 3, padding=1, bias_attr=False)
+        self.cca = CrissCrossAttention(inter_channels)
+        self.convb = layers.ConvBNLeakyReLU(
+            inter_channels, inter_channels, 3, padding=1, bias_attr=False)
+        self.out = layers.AuxLayer(
+            in_channels + inter_channels,
+            out_channels,
+            num_classes,
+            dropout_prob=dropout_prob)
+
+    def forward(self, x):
+        feat = self.conva(x)
+        for i in range(self.recurrence):
+            feat = self.cca(feat)
+        feat = self.convb(feat)
+        output = self.out(paddle.concat([x, feat], axis=1))
+        return output
+
+
+class CrissCrossAttention(nn.Layer):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.q_conv = nn.Conv2D(in_channels, in_channels // 8, kernel_size=1)
+        self.k_conv = nn.Conv2D(in_channels, in_channels // 8, kernel_size=1)
+        self.v_conv = nn.Conv2D(in_channels, in_channels, kernel_size=1)
+        self.softmax = nn.Softmax(axis=3)
+        self.gamma = self.create_parameter(
+            shape=(1, ), default_initializer=nn.initializer.Constant(0))
+        self.inf_tensor = paddle.full(shape=(1, ), fill_value=float('inf'))
+
+    def forward(self, x):
+        b, c, h, w = paddle.shape(x)
+        proj_q = self.q_conv(x)
+        proj_q_h =
proj_q.transpose([0, 3, 1, 2]).reshape( + [b * w, -1, h]).transpose([0, 2, 1]) + proj_q_w = proj_q.transpose([0, 2, 1, 3]).reshape( + [b * h, -1, w]).transpose([0, 2, 1]) + + proj_k = self.k_conv(x) + proj_k_h = proj_k.transpose([0, 3, 1, 2]).reshape([b * w, -1, h]) + proj_k_w = proj_k.transpose([0, 2, 1, 3]).reshape([b * h, -1, w]) + + proj_v = self.v_conv(x) + proj_v_h = proj_v.transpose([0, 3, 1, 2]).reshape([b * w, -1, h]) + proj_v_w = proj_v.transpose([0, 2, 1, 3]).reshape([b * h, -1, w]) + + energy_h = (paddle.bmm(proj_q_h, proj_k_h) + self.Inf(b, h, w)).reshape( + [b, w, h, h]).transpose([0, 2, 1, 3]) + energy_w = paddle.bmm(proj_q_w, proj_k_w).reshape([b, h, w, w]) + concate = self.softmax(paddle.concat([energy_h, energy_w], axis=3)) + + attn_h = concate[:, :, :, 0:h].transpose([0, 2, 1, 3]).reshape( + [b * w, h, h]) + attn_w = concate[:, :, :, h:h + w].reshape([b * h, w, w]) + out_h = paddle.bmm(proj_v_h, attn_h.transpose([0, 2, 1])).reshape( + [b, w, -1, h]).transpose([0, 2, 3, 1]) + out_w = paddle.bmm(proj_v_w, attn_w.transpose([0, 2, 1])).reshape( + [b, h, -1, w]).transpose([0, 2, 1, 3]) + return self.gamma * (out_h + out_w) + x + + def Inf(self, B, H, W): + return -paddle.tile( + paddle.diag(paddle.tile(self.inf_tensor, [H]), 0).unsqueeze(0), + [B * W, 1, 1]) diff --git a/paddleseg/models/danet.py b/paddleseg/models/danet.py new file mode 100644 index 0000000000000000000000000000000000000000..c4191311f0708d2b371abab5f5ac2645ab30e421 --- /dev/null +++ b/paddleseg/models/danet.py @@ -0,0 +1,218 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class DANet(nn.Layer): + """ + The DANet implementation based on PaddlePaddle. + + The original article refers to + Fu, jun, et al. "Dual Attention Network for Scene Segmentation" + (https://arxiv.org/pdf/1809.02983.pdf) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of + output of backbone. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
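The criss-cross attention above lets every pixel attend only to its own row and column (H + W - 1 positions); with recurrence=2 a second pass propagates context to the whole image. A minimal sketch, assuming the import path from the ccnet.py hunk above:

    import paddle
    from paddleseg.models.ccnet import CrissCrossAttention

    cca = CrissCrossAttention(in_channels=64)
    x = paddle.rand([2, 64, 17, 17])
    y = cca(x)          # row/column attention plus a learned residual scale (gamma)
    print(y.shape)      # [2, 64, 17, 17]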
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + + self.head = DAHead(num_classes=num_classes, in_channels=in_channels) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + if not self.training: + logit_list = [logit_list[0]] + + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners, + align_mode=1) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DAHead(nn.Layer): + """ + The Dual attention head. + + Args: + num_classes (int): The unique number of target classes. + in_channels (tuple): The number of input channels. + """ + + def __init__(self, num_classes, in_channels): + super().__init__() + in_channels = in_channels[-1] + inter_channels = in_channels // 4 + + self.channel_conv = layers.ConvBNReLU(in_channels, inter_channels, 3) + self.position_conv = layers.ConvBNReLU(in_channels, inter_channels, 3) + self.pam = PAM(inter_channels) + self.cam = CAM(inter_channels) + self.conv1 = layers.ConvBNReLU(inter_channels, inter_channels, 3) + self.conv2 = layers.ConvBNReLU(inter_channels, inter_channels, 3) + + self.aux_head = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(in_channels, num_classes, 1)) + + self.aux_head_pam = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1)) + + self.aux_head_cam = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1)) + + self.cls_head = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1)) + + def forward(self, feat_list): + feats = feat_list[-1] + channel_feats = self.channel_conv(feats) + channel_feats = self.cam(channel_feats) + channel_feats = self.conv1(channel_feats) + + position_feats = self.position_conv(feats) + position_feats = self.pam(position_feats) + position_feats = self.conv2(position_feats) + + feats_sum = position_feats + channel_feats + logit = self.cls_head(feats_sum) + + if not self.training: + return [logit] + + cam_logit = self.aux_head_cam(channel_feats) + pam_logit = self.aux_head_cam(position_feats) + aux_logit = self.aux_head(feats) + return [logit, cam_logit, pam_logit, aux_logit] + + +class PAM(nn.Layer): + """Position attention module.""" + + def __init__(self, in_channels): + super().__init__() + mid_channels = in_channels // 8 + self.mid_channels = mid_channels + self.in_channels = in_channels + + self.query_conv = nn.Conv2D(in_channels, mid_channels, 1, 1) + self.key_conv = nn.Conv2D(in_channels, mid_channels, 1, 1) + self.value_conv = nn.Conv2D(in_channels, in_channels, 1, 1) + + self.gamma = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x_shape = paddle.shape(x) + + # query: n, h * w, c1 + query = self.query_conv(x) + query = paddle.reshape(query, (0, self.mid_channels, -1)) + query = paddle.transpose(query, (0, 2, 1)) + + # key: n, c1, h * w + key = self.key_conv(x) + key = paddle.reshape(key, (0, self.mid_channels, -1)) + + # sim: n, h * w, h * w + sim = paddle.bmm(query, 
key) + sim = F.softmax(sim, axis=-1) + + value = self.value_conv(x) + value = paddle.reshape(value, (0, self.in_channels, -1)) + sim = paddle.transpose(sim, (0, 2, 1)) + + # feat: from (n, c2, h * w) -> (n, c2, h, w) + feat = paddle.bmm(value, sim) + feat = paddle.reshape(feat, + (0, self.in_channels, x_shape[2], x_shape[3])) + + out = self.gamma * feat + x + return out + + +class CAM(nn.Layer): + """Channel attention module.""" + + def __init__(self, channels): + super().__init__() + + self.channels = channels + self.gamma = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x_shape = paddle.shape(x) + # query: n, c, h * w + query = paddle.reshape(x, (0, self.channels, -1)) + # key: n, h * w, c + key = paddle.reshape(x, (0, self.channels, -1)) + key = paddle.transpose(key, (0, 2, 1)) + + # sim: n, c, c + sim = paddle.bmm(query, key) + # The danet author claims that this can avoid gradient divergence + sim = paddle.max(sim, axis=-1, keepdim=True).tile( + [1, 1, self.channels]) - sim + sim = F.softmax(sim, axis=-1) + + # feat: from (n, c, h * w) to (n, c, h, w) + value = paddle.reshape(x, (0, self.channels, -1)) + feat = paddle.bmm(sim, value) + feat = paddle.reshape(feat, (0, self.channels, x_shape[2], x_shape[3])) + + out = self.gamma * feat + x + return out diff --git a/paddleseg/models/ddrnet.py b/paddleseg/models/ddrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b2a69922294fe0da46e1903c24377c5f4749e5e7 --- /dev/null +++ b/paddleseg/models/ddrnet.py @@ -0,0 +1,403 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager, param_init +from paddleseg.models import layers +from paddleseg.utils import utils + + +class DualResNet(nn.Layer): + """ + The DDRNet implementation based on PaddlePaddle. + + The original article refers to + Yuanduo Hong, Huihui Pan, Weichao Sun, et al. "Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes" + (https://arxiv.org/abs/2101.06085) + + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. + block_layers (list, tuple): The numbers of layers in different blocks. Default: [2, 2, 2, 2]. + planes (int): Base channels in network. Default: 64. + spp_planes (int): Branch channels for DAPPM. Default: 128. + head_planes (int): Mid channels of segmentation head. Default: 128. + enable_auxiliary_loss (bool): Whether use auxiliary head for stage3. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
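PAM builds an (h*w) x (h*w) spatial affinity while CAM builds a c x c channel affinity; both add their re-weighted features back through a zero-initialized gamma, so each module starts out as an identity mapping. A minimal sketch with random inputs, assuming the import path from the danet.py hunk above:

    import paddle
    from paddleseg.models.danet import PAM, CAM

    x = paddle.rand([2, 64, 16, 16])
    pam = PAM(in_channels=64)   # spatial attention; queries/keys reduced to 64 // 8 channels
    cam = CAM(channels=64)      # channel attention; max-subtraction keeps gradients stable
    print(pam(x).shape, cam(x).shape)   # both [2, 64, 16, 16]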
+ """ + + def __init__(self, + num_classes, + in_channels=3, + block_layers=[2, 2, 2, 2], + planes=64, + spp_planes=128, + head_planes=128, + enable_auxiliary_loss=False, + pretrained=None): + super().__init__() + highres_planes = planes * 2 + self.enable_auxiliary_loss = enable_auxiliary_loss + self.conv1 = nn.Sequential( + layers.ConvBNReLU( + in_channels, planes, kernel_size=3, stride=2, padding=1), + layers.ConvBNReLU( + planes, planes, kernel_size=3, stride=2, padding=1), ) + self.relu = nn.ReLU() + self.layer1 = self._make_layers(BasicBlock, planes, planes, + block_layers[0]) + self.layer2 = self._make_layers( + BasicBlock, planes, planes * 2, block_layers[1], stride=2) + self.layer3 = self._make_layers( + BasicBlock, planes * 2, planes * 4, block_layers[2], stride=2) + self.layer4 = self._make_layers( + BasicBlock, planes * 4, planes * 8, block_layers[3], stride=2) + + self.compression3 = layers.ConvBN( + planes * 4, highres_planes, kernel_size=1, bias_attr=False) + + self.compression4 = layers.ConvBN( + planes * 8, highres_planes, kernel_size=1, bias_attr=False) + + self.down3 = layers.ConvBN( + highres_planes, + planes * 4, + kernel_size=3, + stride=2, + bias_attr=False) + + self.down4 = nn.Sequential( + layers.ConvBNReLU( + highres_planes, + planes * 4, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False), + layers.ConvBN( + planes * 4, + planes * 8, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False)) + + self.layer3_ = self._make_layers(BasicBlock, planes * 2, highres_planes, + 2) + self.layer4_ = self._make_layers(BasicBlock, highres_planes, + highres_planes, 2) + self.layer5_ = self._make_layers(Bottleneck, highres_planes, + highres_planes, 1) + self.layer5 = self._make_layers( + Bottleneck, planes * 8, planes * 8, 1, stride=2) + + self.spp = DAPPM(planes * 16, spp_planes, planes * 4) + if self.enable_auxiliary_loss: + self.aux_head = DDRNetHead(highres_planes, head_planes, num_classes) + self.head = DDRNetHead(planes * 4, head_planes, num_classes) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + else: + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + param_init.kaiming_normal_init(m.weight) + elif isinstance(m, nn.BatchNorm2D): + param_init.constant_init(m.weight, value=1) + param_init.constant_init(m.bias, value=0) + + def _make_layers(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.BatchNorm2D(planes * block.expansion), ) + layers = [] + layers.append(block(inplanes, planes, stride, downsample)) + inplanes = planes * block.expansion + for i in range(1, blocks): + if i == (blocks - 1): + layers.append(block(inplanes, planes, stride=1, no_relu=True)) + else: + layers.append(block(inplanes, planes, stride=1, no_relu=False)) + return nn.Sequential(*layers) + + def forward(self, x): + n, c, h, w = paddle.shape(x) + width_output = w // 8 + height_output = h // 8 + + x = self.conv1(x) + stage1_out = self.layer1(x) + stage2_out = self.layer2(self.relu(stage1_out)) + stage3_out = self.layer3(self.relu(stage2_out)) + stage3_out_dual = self.layer3_(self.relu(stage2_out)) + x = stage3_out + self.down3(self.relu(stage3_out_dual)) + stage3_merge = stage3_out_dual + F.interpolate( + self.compression3(self.relu(stage3_out)), + 
size=[height_output, width_output], + mode='bilinear') + + stage4_out = self.layer4(self.relu(x)) + stage4_out_dual = self.layer4_(self.relu(stage3_merge)) + + x = stage4_out + self.down4(self.relu(stage4_out_dual)) + stage4_merge = stage4_out_dual + F.interpolate( + self.compression4(self.relu(stage4_out)), + size=[height_output, width_output], + mode='bilinear') + + stage5_out_dual = self.layer5_(self.relu(stage4_merge)) + x = F.interpolate( + self.spp(self.layer5(self.relu(x))), + size=[height_output, width_output], + mode='bilinear') + + output = self.head(x + stage5_out_dual) + logit_list = [] + logit_list.append(output) + + if self.enable_auxiliary_loss: + aux_out = self.aux_head(stage3_merge) + logit_list.append(aux_out) + return [ + F.interpolate( + logit, [h, w], mode='bilinear') for logit in logit_list + ] + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + no_relu=False): + super().__init__() + self.conv_bn_relu = layers.ConvBNReLU( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + self.relu = nn.ReLU() + self.conv_bn = layers.ConvBN( + planes, planes, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.downsample = downsample + self.stride = stride + self.no_relu = no_relu + + def forward(self, x): + residual = x + out = self.conv_bn_relu(x) + out = self.conv_bn(out) + if self.downsample is not None: + residual = self.downsample(x) + out += residual + if self.no_relu: + return out + else: + return self.relu(out) + + +class Bottleneck(nn.Layer): + expansion = 2 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + no_relu=True): + super().__init__() + self.conv_bn_relu1 = layers.ConvBNReLU( + inplanes, planes, kernel_size=1, bias_attr=False) + self.conv_bn_relu2 = layers.ConvBNReLU( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + self.conv_bn = layers.ConvBN( + planes, planes * self.expansion, kernel_size=1, bias_attr=False) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + self.no_relu = no_relu + + def forward(self, x): + residual = x + out = self.conv_bn_relu1(x) + out = self.conv_bn_relu2(out) + out = self.conv_bn(out) + if self.downsample is not None: + residual = self.downsample(x) + out += residual + if self.no_relu: + return out + else: + return self.relu(out) + + +class DAPPM(nn.Layer): + def __init__(self, inplanes, branch_planes, outplanes): + super().__init__() + self.scale1 = nn.Sequential( + nn.AvgPool2D( + kernel_size=5, stride=2, padding=2), + layers.SyncBatchNorm(inplanes), + nn.ReLU(), + nn.Conv2D( + inplanes, branch_planes, kernel_size=1, bias_attr=False), ) + self.scale2 = nn.Sequential( + nn.AvgPool2D( + kernel_size=9, stride=4, padding=4), + layers.SyncBatchNorm(inplanes), + nn.ReLU(), + nn.Conv2D( + inplanes, branch_planes, kernel_size=1, bias_attr=False), ) + self.scale3 = nn.Sequential( + nn.AvgPool2D( + kernel_size=17, stride=8, padding=8), + layers.SyncBatchNorm(inplanes), + nn.ReLU(), + nn.Conv2D( + inplanes, branch_planes, kernel_size=1, bias_attr=False), ) + self.scale4 = nn.Sequential( + nn.AdaptiveAvgPool2D((1, 1)), + layers.SyncBatchNorm(inplanes), + nn.ReLU(), + nn.Conv2D( + inplanes, branch_planes, kernel_size=1, bias_attr=False), ) + self.scale0 = nn.Sequential( + layers.SyncBatchNorm(inplanes), + nn.ReLU(), + nn.Conv2D( + inplanes, branch_planes, kernel_size=1, bias_attr=False), ) + self.process1 = nn.Sequential( + 
layers.SyncBatchNorm(branch_planes), + nn.ReLU(), + nn.Conv2D( + branch_planes, + branch_planes, + kernel_size=3, + padding=1, + bias_attr=False), ) + self.process2 = nn.Sequential( + layers.SyncBatchNorm(branch_planes), + nn.ReLU(), + nn.Conv2D( + branch_planes, + branch_planes, + kernel_size=3, + padding=1, + bias_attr=False), ) + self.process3 = nn.Sequential( + layers.SyncBatchNorm(branch_planes), + nn.ReLU(), + nn.Conv2D( + branch_planes, + branch_planes, + kernel_size=3, + padding=1, + bias_attr=False), ) + self.process4 = nn.Sequential( + layers.SyncBatchNorm(branch_planes), + nn.ReLU(), + nn.Conv2D( + branch_planes, + branch_planes, + kernel_size=3, + padding=1, + bias_attr=False), ) + self.compression = nn.Sequential( + layers.SyncBatchNorm(branch_planes * 5), + nn.ReLU(), + nn.Conv2D( + branch_planes * 5, outplanes, kernel_size=1, bias_attr=False)) + self.shortcut = nn.Sequential( + layers.SyncBatchNorm(inplanes), + nn.ReLU(), + nn.Conv2D( + inplanes, outplanes, kernel_size=1, bias_attr=False)) + + def forward(self, x): + n, c, h, w = paddle.shape(x) + x0 = self.scale0(x) + x1 = self.process1( + F.interpolate( + self.scale1(x), size=[h, w], mode='bilinear') + x0) + x2 = self.process2( + F.interpolate( + self.scale2(x), size=[h, w], mode='bilinear') + x1) + x3 = self.process3( + F.interpolate( + self.scale3(x), size=[h, w], mode='bilinear') + x2) + x4 = self.process4( + F.interpolate( + self.scale4(x), size=[h, w], mode='bilinear') + x3) + + out = self.compression(paddle.concat([x0, x1, x2, x3, x4], + 1)) + self.shortcut(x) + return out + + +class DDRNetHead(nn.Layer): + def __init__(self, inplanes, interplanes, outplanes, scale_factor=None): + super().__init__() + self.bn1 = nn.BatchNorm2D(inplanes) + self.relu = nn.ReLU() + self.conv_bn_relu = layers.ConvBNReLU( + inplanes, interplanes, kernel_size=3, padding=1, bias_attr=False) + self.conv = nn.Conv2D( + interplanes, outplanes, kernel_size=1, padding=0, bias_attr=True) + + self.scale_factor = scale_factor + + def forward(self, x): + x = self.bn1(x) + x = self.relu(x) + x = self.conv_bn_relu(x) + out = self.conv(x) + + if self.scale_factor is not None: + out = F.interpolate( + out, scale_factor=self.scale_factor, mode='bilinear') + return out + + +@manager.MODELS.add_component +def DDRNet_23(**kwargs): + return DualResNet( + block_layers=[2, 2, 2, 2], + planes=64, + spp_planes=128, + head_planes=128, + **kwargs) diff --git a/paddleseg/models/decoupled_segnet.py b/paddleseg/models/decoupled_segnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8386d6b74d50dbf55f5667667427dc817840deea --- /dev/null +++ b/paddleseg/models/decoupled_segnet.py @@ -0,0 +1,232 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
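Since DDRNet_23 is registered as a component above, it can also be constructed directly; a minimal inference sketch with randomly initialized weights (no pretrained checkpoint):

    import paddle
    from paddleseg.models.ddrnet import DDRNet_23

    model = DDRNet_23(num_classes=19)
    model.eval()
    x = paddle.rand([1, 3, 512, 1024])   # H and W divisible by 8, matching the /8 fusion points
    with paddle.no_grad():
        logits = model(x)[0]
    print(logits.shape)                  # [1, 19, 512, 1024]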
+ +import cv2 +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.models.backbones import resnet_vd +from paddleseg.models import deeplab +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class DecoupledSegNet(nn.Layer): + """ + The DecoupledSegNet implementation based on PaddlePaddle. + + The original article refers to + Xiangtai Li, et, al. "Improving Semantic Segmentation via Decoupled Body and Edge Supervision" + (https://arxiv.org/pdf/2007.10035.pdf) + + Args: + num_classes (int): The unique number of target classes. + backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd. + backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. + Default: (0, 3). + aspp_ratios (tuple, optional): The dilation rate using in ASSP module. + If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18). + If output_stride=8, aspp_ratios is (1, 12, 24, 36). + Default: (1, 6, 12, 18). + aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(0, 3), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + self.backbone = backbone + backbone_channels = self.backbone.feat_channels + self.head = DecoupledSegNetHead(num_classes, backbone_indices, + backbone_channels, aspp_ratios, + aspp_out_channels, align_corners) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + + seg_logit, body_logit, edge_logit = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + if self.training: + return [seg_logit, body_logit, edge_logit, (seg_logit, edge_logit)] + return [seg_logit] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DecoupledSegNetHead(nn.Layer): + """ + The DecoupledSegNetHead implementation based on PaddlePaddle. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + the first index will be taken as a low-level feature in Edge presevation component; + the second one will be taken as input of ASPP component. + backbone_channels (tuple): The channels of output of backbone. + aspp_ratios (tuple): The dilation rates using in ASSP module. + aspp_out_channels (int): The output channels of ASPP module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners): + super().__init__() + self.backbone_indices = backbone_indices + self.align_corners = align_corners + self.aspp = layers.ASPPModule( + aspp_ratios=aspp_ratios, + in_channels=backbone_channels[backbone_indices[1]], + out_channels=aspp_out_channels, + align_corners=align_corners, + image_pooling=True) + + self.bot_fine = nn.Conv2D( + backbone_channels[backbone_indices[0]], 48, 1, bias_attr=False) + # decoupled + self.squeeze_body_edge = SqueezeBodyEdge( + 256, align_corners=self.align_corners) + self.edge_fusion = nn.Conv2D(256 + 48, 256, 1, bias_attr=False) + self.sigmoid_edge = nn.Sigmoid() + self.edge_out = nn.Sequential( + layers.ConvBNReLU( + in_channels=256, + out_channels=48, + kernel_size=3, + bias_attr=False), + nn.Conv2D( + 48, 1, 1, bias_attr=False)) + self.dsn_seg_body = nn.Sequential( + layers.ConvBNReLU( + in_channels=256, + out_channels=256, + kernel_size=3, + bias_attr=False), + nn.Conv2D( + 256, num_classes, 1, bias_attr=False)) + + self.final_seg = nn.Sequential( + layers.ConvBNReLU( + in_channels=512, + out_channels=256, + kernel_size=3, + bias_attr=False), + layers.ConvBNReLU( + in_channels=256, + out_channels=256, + kernel_size=3, + bias_attr=False), + nn.Conv2D( + 256, num_classes, kernel_size=1, bias_attr=False)) + + def forward(self, feat_list): + fine_fea = feat_list[self.backbone_indices[0]] + fine_size = paddle.shape(fine_fea) + x = feat_list[self.backbone_indices[1]] + aspp = self.aspp(x) + + # decoupled + seg_body, seg_edge = self.squeeze_body_edge(aspp) + # Edge presevation and edge out + fine_fea = self.bot_fine(fine_fea) + seg_edge = F.interpolate( + seg_edge, + fine_size[2:], + mode='bilinear', + align_corners=self.align_corners) + seg_edge = self.edge_fusion(paddle.concat([seg_edge, fine_fea], axis=1)) + seg_edge_out = self.edge_out(seg_edge) + seg_edge_out = self.sigmoid_edge(seg_edge_out) # seg_edge output + seg_body_out = self.dsn_seg_body(seg_body) # body out + + # seg_final out + seg_out = seg_edge + F.interpolate( + seg_body, + fine_size[2:], + mode='bilinear', + align_corners=self.align_corners) + aspp = F.interpolate( + aspp, + fine_size[2:], + mode='bilinear', + align_corners=self.align_corners) + seg_out = paddle.concat([aspp, seg_out], axis=1) + seg_final_out = self.final_seg(seg_out) + + return [seg_final_out, seg_body_out, seg_edge_out] + + +class SqueezeBodyEdge(nn.Layer): + def __init__(self, inplane, align_corners=False): + super().__init__() + self.align_corners = align_corners + self.down = nn.Sequential( + layers.ConvBNReLU( + inplane, inplane, kernel_size=3, groups=inplane, stride=2), + layers.ConvBNReLU( + inplane, inplane, kernel_size=3, groups=inplane, stride=2)) + self.flow_make = nn.Conv2D( + inplane * 2, 2, kernel_size=3, padding='same', bias_attr=False) + + def forward(self, x): + size = paddle.shape(x)[2:] + seg_down = self.down(x) + seg_down = F.interpolate( + seg_down, + size=size, + mode='bilinear', + align_corners=self.align_corners) + flow = self.flow_make(paddle.concat([x, seg_down], axis=1)) + seg_flow_warp = self.flow_warp(x, flow, size) + seg_edge = x - seg_flow_warp + return seg_flow_warp, seg_edge + + def flow_warp(self, input, flow, size): + input_shape = paddle.shape(input) + norm = size[::-1].reshape([1, 1, 1, -1]) + norm.stop_gradient = True + h_grid = paddle.linspace(-1.0, 1.0, size[0]).reshape([-1, 1]) + h_grid = h_grid.tile([size[1]]) + w_grid = paddle.linspace(-1.0, 1.0, 
size[1]).reshape([-1, 1])
+        w_grid = w_grid.tile([size[0]]).transpose([1, 0])
+        grid = paddle.concat([w_grid.unsqueeze(2), h_grid.unsqueeze(2)], axis=2)
+        grid = grid.unsqueeze(0).tile([input_shape[0], 1, 1, 1])
+        grid = grid + paddle.transpose(flow, (0, 2, 3, 1)) / norm
+
+        output = F.grid_sample(input, grid)
+        return output
diff --git a/paddleseg/models/deeplab.py b/paddleseg/models/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..07863d5b04d21d22393ae3601f9be24a44ea83ef
--- /dev/null
+++ b/paddleseg/models/deeplab.py
@@ -0,0 +1,308 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+__all__ = ['DeepLabV3P', 'DeepLabV3']
+
+
+@manager.MODELS.add_component
+class DeepLabV3P(nn.Layer):
+    """
+    The DeepLabV3Plus implementation based on PaddlePaddle.
+
+    The original article refers to
+    Liang-Chieh Chen, et al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
+    (https://arxiv.org/abs/1802.02611)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet50_vd/Resnet101_vd/Xception65.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+            Default: (0, 3).
+        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW".
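The flow_warp step above can be re-derived standalone: build a base sampling grid in normalized [-1, 1] coordinates, offset it by the predicted pixel flow scaled by (W, H), and resample with grid_sample. A self-contained sketch of that step, not the module itself:

    import paddle
    import paddle.nn.functional as F

    n, c, h, w = 1, 8, 16, 16
    feat = paddle.rand([n, c, h, w])
    flow = paddle.full([n, 2, h, w], 2.0)          # shift every pixel by two pixels
    norm = paddle.to_tensor([w, h], dtype='float32').reshape([1, 1, 1, 2])
    ys = paddle.linspace(-1.0, 1.0, h)
    xs = paddle.linspace(-1.0, 1.0, w)
    grid_y, grid_x = paddle.meshgrid(ys, xs)
    grid = paddle.stack([grid_x, grid_y], axis=2).unsqueeze(0)   # [1, h, w, (x, y)]
    grid = grid + flow.transpose([0, 2, 3, 1]) / norm            # pixel flow -> normalized offset
    warped = F.grid_sample(feat, grid)
    print(warped.shape)   # [1, 8, 16, 16]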
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(0, 3), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None, + data_format="NCHW"): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = DeepLabV3PHead( + num_classes, + backbone_indices, + backbone_channels, + aspp_ratios, + aspp_out_channels, + align_corners, + data_format=data_format) + + self.align_corners = align_corners + self.pretrained = pretrained + self.data_format = data_format + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + if self.data_format == 'NCHW': + ori_shape = paddle.shape(x)[2:] + else: + ori_shape = paddle.shape(x)[1:3] + return [ + F.interpolate( + logit, + ori_shape, + mode='bilinear', + align_corners=self.align_corners, + data_format=self.data_format) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DeepLabV3PHead(nn.Layer): + """ + The DeepLabV3PHead implementation based on PaddlePaddle. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + the first index will be taken as a low-level feature in Decoder component; + the second one will be taken as input of ASPP component. + Usually backbone consists of four downsampling stage, and return an output of + each stage. If we set it as (0, 3), it means taking feature map of the first + stage in backbone as low-level feature used in Decoder, and feature map of the fourth + stage as input of ASPP. + backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index. + aspp_ratios (tuple): The dilation rates using in ASSP module. + aspp_out_channels (int): The output channels of ASPP module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". + """ + + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + aspp_ratios, + aspp_out_channels, + align_corners, + data_format='NCHW'): + super().__init__() + + self.aspp = layers.ASPPModule( + aspp_ratios, + backbone_channels[1], + aspp_out_channels, + align_corners, + use_sep_conv=True, + image_pooling=True, + data_format=data_format) + self.decoder = Decoder( + num_classes, + backbone_channels[0], + align_corners, + data_format=data_format) + self.backbone_indices = backbone_indices + + def forward(self, feat_list): + logit_list = [] + low_level_feat = feat_list[self.backbone_indices[0]] + x = feat_list[self.backbone_indices[1]] + x = self.aspp(x) + logit = self.decoder(x, low_level_feat) + logit_list.append(logit) + + return logit_list + + +@manager.MODELS.add_component +class DeepLabV3(nn.Layer): + """ + The DeepLabV3 implementation based on PaddlePaddle. + + The original article refers to + Liang-Chieh Chen, et, al. "Rethinking Atrous Convolution for Semantic Image Segmentation" + (https://arxiv.org/pdf/1706.05587.pdf). + + Args: + Please Refer to DeepLabV3P above. 
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(3, ), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = DeepLabV3Head(num_classes, backbone_indices, + backbone_channels, aspp_ratios, + aspp_out_channels, align_corners) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DeepLabV3Head(nn.Layer): + """ + The DeepLabV3Head implementation based on PaddlePaddle. + + Args: + Please Refer to DeepLabV3PHead above. + """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners): + super().__init__() + + self.aspp = layers.ASPPModule( + aspp_ratios, + backbone_channels[0], + aspp_out_channels, + align_corners, + use_sep_conv=False, + image_pooling=True) + + self.cls = nn.Conv2D( + in_channels=aspp_out_channels, + out_channels=num_classes, + kernel_size=1) + + self.backbone_indices = backbone_indices + + def forward(self, feat_list): + logit_list = [] + x = feat_list[self.backbone_indices[0]] + x = self.aspp(x) + logit = self.cls(x) + logit_list.append(logit) + + return logit_list + + +class Decoder(nn.Layer): + """ + Decoder module of DeepLabV3P model + + Args: + num_classes (int): The number of classes. + in_channels (int): The number of input channels in decoder module. + """ + + def __init__(self, + num_classes, + in_channels, + align_corners, + data_format='NCHW'): + super(Decoder, self).__init__() + + self.data_format = data_format + self.conv_bn_relu1 = layers.ConvBNReLU( + in_channels=in_channels, + out_channels=48, + kernel_size=1, + data_format=data_format) + + self.conv_bn_relu2 = layers.SeparableConvBNReLU( + in_channels=304, + out_channels=256, + kernel_size=3, + padding=1, + data_format=data_format) + self.conv_bn_relu3 = layers.SeparableConvBNReLU( + in_channels=256, + out_channels=256, + kernel_size=3, + padding=1, + data_format=data_format) + self.conv = nn.Conv2D( + in_channels=256, + out_channels=num_classes, + kernel_size=1, + data_format=data_format) + + self.align_corners = align_corners + + def forward(self, x, low_level_feat): + low_level_feat = self.conv_bn_relu1(low_level_feat) + if self.data_format == 'NCHW': + low_level_shape = paddle.shape(low_level_feat)[-2:] + axis = 1 + else: + low_level_shape = paddle.shape(low_level_feat)[1:3] + axis = -1 + x = F.interpolate( + x, + low_level_shape, + mode='bilinear', + align_corners=self.align_corners, + data_format=self.data_format) + x = paddle.concat([x, low_level_feat], axis=axis) + x = self.conv_bn_relu2(x) + x = self.conv_bn_relu3(x) + x = self.conv(x) + return x diff --git a/paddleseg/models/dmnet.py b/paddleseg/models/dmnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c150ac60ccb80d61d3ac22a0c0ef12703170db71 --- /dev/null +++ b/paddleseg/models/dmnet.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class DMNet(nn.Layer): + """ + The DMNet implementation based on PaddlePaddle. + + The original article refers to + Junjun He, Zhongying Deng, Yu Qiao. "Dynamic Multi-scale Filters for Semantic Segmentation" + + Args: + num_classes (int): The unique number of target classes. + backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd. + mid_channels (int): The middle channels of convolution layer. Default: 512. + filter_sizes (list, tuple): The filter size of generated convolution kernel used in Dynamic Convolutional Module. Default: [1, 3, 5, 7]. + fusion (bool): Add one conv to fuse DCM output feature. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + mid_channels=512, + filter_sizes=[1, 3, 5, 7], + fusion=False, + pretrained=None): + super().__init__() + self.backbone = backbone + self.dcm_modules = nn.LayerList() + for filter_size in filter_sizes: + self.dcm_modules.append( + DCM(filter_size, fusion, self.backbone.feat_channels[-1], + mid_channels), ) + self.bottleneck = layers.ConvBNReLU( + self.backbone.feat_channels[-1] + len(filter_sizes) * mid_channels, + mid_channels, + 3, + padding=1, ) + self.cls = nn.Conv2D(mid_channels, num_classes, 1) + + self.fcn_head = nn.Sequential( + layers.ConvBNReLU( + self.backbone.feat_channels[2], mid_channels, 3, padding=1), + nn.Conv2D(mid_channels, num_classes, 1), ) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + feats = self.backbone(x) + x = feats[-1] + dcm_outs = [x] + for dcm_module in self.dcm_modules: + dcm_outs.append(dcm_module(x)) + dcm_outs = paddle.concat(dcm_outs, axis=1) + x = self.bottleneck(dcm_outs) + x = self.cls(x) + x = F.interpolate( + x, scale_factor=8, mode='bilinear', align_corners=True) + output = [x] + if self.training: + fcn_out = self.fcn_head(feats[2]) + fcn_out = F.interpolate( + fcn_out, scale_factor=8, mode='bilinear', align_corners=True) + output.append(fcn_out) + return output + return output + + +class DCM(nn.Layer): + """ + Dynamic Convolutional Module used in DMNet. + + Args: + filter_size (int): The filter size of generated convolution kernel used in Dynamic Convolutional Module. + fusion (bool): Add one conv to fuse DCM output feature. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. 
+ """ + + def __init__(self, filter_size, fusion, in_channels, channels): + super().__init__() + self.filter_size = filter_size + self.fusion = fusion + self.channels = channels + + pad = (self.filter_size - 1) // 2 + if (self.filter_size - 1) % 2 == 0: + self.pad = (pad, pad, pad, pad) + else: + self.pad = (pad + 1, pad, pad + 1, pad) + + self.avg_pool = nn.AdaptiveAvgPool2D(filter_size) + self.filter_gen_conv = nn.Conv2D(in_channels, channels, 1) + self.input_redu_conv = layers.ConvBNReLU(in_channels, channels, 1) + + self.norm = layers.SyncBatchNorm(channels) + self.act = nn.ReLU() + + if self.fusion: + self.fusion_conv = layers.ConvBNReLU(channels, channels, 1) + + def forward(self, x): + generated_filter = self.filter_gen_conv(self.avg_pool(x)) + x = self.input_redu_conv(x) + b, c, h, w = x.shape + x = x.reshape([1, b * c, h, w]) + generated_filter = generated_filter.reshape( + [b * c, 1, self.filter_size, self.filter_size]) + + x = F.pad(x, self.pad, mode='constant', value=0) + output = F.conv2d(x, weight=generated_filter, groups=b * c) + output = output.reshape([b, self.channels, h, w]) + output = self.norm(output) + output = self.act(output) + if self.fusion: + output = self.fusion_conv(output) + return output diff --git a/paddleseg/models/dnlnet.py b/paddleseg/models/dnlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..527eab09cb1b397d9f96979414e285336282ef7b --- /dev/null +++ b/paddleseg/models/dnlnet.py @@ -0,0 +1,227 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers +from paddleseg.cvlibs import manager +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class DNLNet(nn.Layer): + """Disentangled Non-Local Neural Networks. + + The original article refers to + Minghao Yin, et al. "Disentangled Non-Local Neural Networks" + (https://arxiv.org/abs/2006.06668) + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone. + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: False. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian'. + temperature (float): Temperature to adjust attention. Default: 0.05. + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
+ pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + reduction=2, + use_scale=True, + mode='embedded_gaussian', + temperature=0.05, + concat_input=True, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = DNLHead(num_classes, in_channels, reduction, use_scale, + mode, temperature, concat_input, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners, + align_mode=1) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DNLHead(nn.Layer): + """ + The DNLNet head. + + Args: + num_classes (int): The unique number of target classes. + in_channels (tuple): The number of input channels. + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: False. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian.'. + temperature (float): Temperature to adjust attention. Default: 0.05 + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
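What "disentangled" means here: the pairwise term is computed on whitened (mean-subtracted) queries and keys, and a separate unary branch adds per-pixel saliency. A sketch of the attention arithmetic only, with random stand-ins for the projected features:

    import paddle
    import paddle.nn.functional as F

    n, c, hw = 1, 16, 64
    theta = paddle.rand([n, hw, c])                        # queries
    phi = paddle.rand([n, c, hw])                          # keys
    theta = theta - theta.mean(axis=-2, keepdim=True)      # whiten across positions
    phi = phi - phi.mean(axis=-1, keepdim=True)
    pairwise = F.softmax(
        paddle.matmul(theta, phi) / (c**0.5) / 0.05, axis=-1)  # scale, then temperature
    unary = F.softmax(paddle.rand([n, 1, hw]), axis=-1)    # stand-in for conv_mask(x)
    g = paddle.rand([n, hw, c])                            # values
    y = paddle.matmul(pairwise, g) + paddle.matmul(unary, g)   # broadcasts to [n, hw, c]
    print(y.shape)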
+ """ + + def __init__(self, + num_classes, + in_channels, + reduction, + use_scale, + mode, + temperature, + concat_input=True, + enable_auxiliary_loss=True, + **kwargs): + super(DNLHead, self).__init__() + self.in_channels = in_channels[-1] + self.concat_input = concat_input + self.enable_auxiliary_loss = enable_auxiliary_loss + inter_channels = self.in_channels // 4 + + self.dnl_block = DisentangledNonLocal2D( + in_channels=inter_channels, + reduction=reduction, + use_scale=use_scale, + temperature=temperature, + mode=mode) + self.conv0 = layers.ConvBNReLU( + in_channels=self.in_channels, + out_channels=inter_channels, + kernel_size=3, + bias_attr=False) + self.conv1 = layers.ConvBNReLU( + in_channels=inter_channels, + out_channels=inter_channels, + kernel_size=3, + bias_attr=False) + self.cls = nn.Sequential( + nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1)) + self.aux = nn.Sequential( + layers.ConvBNReLU( + in_channels=1024, + out_channels=256, + kernel_size=3, + bias_attr=False), + nn.Dropout2D(p=0.1), + nn.Conv2D(256, num_classes, 1)) + if self.concat_input: + self.conv_cat = layers.ConvBNReLU( + self.in_channels + inter_channels, + inter_channels, + kernel_size=3, + bias_attr=False) + + def forward(self, feat_list): + C3, C4 = feat_list + output = self.conv0(C4) + output = self.dnl_block(output) + output = self.conv1(output) + if self.concat_input: + output = self.conv_cat(paddle.concat([C4, output], axis=1)) + output = self.cls(output) + if self.enable_auxiliary_loss: + auxout = self.aux(C3) + return [output, auxout] + else: + return [output] + + +class DisentangledNonLocal2D(layers.NonLocal2D): + """Disentangled Non-Local Blocks. + + Args: + temperature (float): Temperature to adjust attention. + """ + + def __init__(self, temperature, *arg, **kwargs): + super().__init__(*arg, **kwargs) + self.temperature = temperature + self.conv_mask = nn.Conv2D(self.in_channels, 1, kernel_size=1) + + def embedded_gaussian(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + if self.use_scale: + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight /= self.temperature + pairwise_weight = F.softmax(pairwise_weight, -1) + return pairwise_weight + + def forward(self, x): + x_shape = paddle.shape(x) + g_x = self.g(x).reshape([0, self.inter_channels, + -1]).transpose([0, 2, 1]) + + if self.mode == "gaussian": + theta_x = paddle.transpose( + x.reshape([0, self.in_channels, -1]), [0, 2, 1]) + if self.sub_sample: + phi_x = paddle.transpose(self.phi(x), [0, self.in_channels, -1]) + else: + phi_x = paddle.transpose(x, [0, self.in_channels, -1]) + + elif self.mode == "concatenation": + theta_x = paddle.reshape( + self.theta(x), [0, self.inter_channels, -1, 1]) + phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, 1, -1]) + + else: + theta_x = self.theta(x).reshape([0, self.inter_channels, + -1]).transpose([0, 2, 1]) + phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, -1]) + + theta_x -= paddle.mean(theta_x, axis=-2, keepdim=True) + phi_x -= paddle.mean(phi_x, axis=-1, keepdim=True) + + pairwise_func = getattr(self, self.mode) + pairwise_weight = pairwise_func(theta_x, phi_x) + + y = paddle.matmul(pairwise_weight, g_x).transpose([0, 2, 1]).reshape( + [0, self.inter_channels, x_shape[2], x_shape[3]]) + unary_mask = F.softmax( + paddle.reshape(self.conv_mask(x), [0, 1, -1]), -1) + unary_x = paddle.matmul(unary_mask, g_x).transpose([0, 2, 1]).reshape( + [0, self.inter_channels, 1, 1]) + output = x + self.conv_out(y + unary_x) + return output diff 
--git a/paddleseg/models/emanet.py b/paddleseg/models/emanet.py new file mode 100644 index 0000000000000000000000000000000000000000..33eedcda8990a086837a9e24683ae4fe970ef637 --- /dev/null +++ b/paddleseg/models/emanet.py @@ -0,0 +1,218 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers +from paddleseg.cvlibs import manager +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class EMANet(nn.Layer): + """ + Expectation Maximization Attention Networks for Semantic Segmentation based on PaddlePaddle. + + The original article refers to + Xia Li, et al. "Expectation-Maximization Attention Networks for Semantic Segmentation" + (https://arxiv.org/abs/1907.13426) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone. + ema_channels (int): EMA module channels. + gc_channels (int): The input channels to Global Context Block. + num_bases (int): Number of bases. + stage_num (int): The iteration number for EM. + momentum (float): The parameter for updating bases. + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + ema_channels=512, + gc_channels=256, + num_bases=64, + stage_num=3, + momentum=0.1, + concat_input=True, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = EMAHead(num_classes, in_channels, ema_channels, gc_channels, + num_bases, stage_num, momentum, concat_input, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class EMAHead(nn.Layer): + """ + The EMANet head. + + Args: + num_classes (int): The unique number of target classes. 
+ in_channels (tuple): The number of input channels. + ema_channels (int): EMA module channels. + gc_channels (int): The input channels to Global Context Block. + num_bases (int): Number of bases. + stage_num (int): The iteration number for EM. + momentum (float): The parameter for updating bases. + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + """ + + def __init__(self, + num_classes, + in_channels, + ema_channels, + gc_channels, + num_bases, + stage_num, + momentum, + concat_input=True, + enable_auxiliary_loss=True): + super(EMAHead, self).__init__() + + self.in_channels = in_channels[-1] + self.concat_input = concat_input + self.enable_auxiliary_loss = enable_auxiliary_loss + + self.emau = EMAU(ema_channels, num_bases, stage_num, momentum=momentum) + self.ema_in_conv = layers.ConvBNReLU( + in_channels=self.in_channels, + out_channels=ema_channels, + kernel_size=3) + self.ema_mid_conv = nn.Conv2D(ema_channels, ema_channels, kernel_size=1) + self.ema_out_conv = layers.ConvBNReLU( + in_channels=ema_channels, out_channels=ema_channels, kernel_size=1) + self.bottleneck = layers.ConvBNReLU( + in_channels=ema_channels, out_channels=gc_channels, kernel_size=3) + self.cls = nn.Sequential( + nn.Dropout2D(p=0.1), nn.Conv2D(gc_channels, num_classes, 1)) + self.aux = nn.Sequential( + layers.ConvBNReLU( + in_channels=1024, out_channels=256, kernel_size=3), + nn.Dropout2D(p=0.1), + nn.Conv2D(256, num_classes, 1)) + if self.concat_input: + self.conv_cat = layers.ConvBNReLU( + self.in_channels + gc_channels, gc_channels, kernel_size=3) + + def forward(self, feat_list): + C3, C4 = feat_list + feats = self.ema_in_conv(C4) + identity = feats + feats = self.ema_mid_conv(feats) + recon = self.emau(feats) + recon = F.relu(recon) + recon = self.ema_out_conv(recon) + output = F.relu(identity + recon) + output = self.bottleneck(output) + if self.concat_input: + output = self.conv_cat(paddle.concat([C4, output], axis=1)) + output = self.cls(output) + if self.enable_auxiliary_loss: + auxout = self.aux(C3) + return [output, auxout] + else: + return [output] + + +class EMAU(nn.Layer): + '''The Expectation-Maximization Attention Unit (EMAU). + + Arguments: + c (int): The input and output channel number. + k (int): The number of the bases. + stage_num (int): The iteration number for EM. + momentum (float): The parameter for updating bases. 
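One EM round as iterated inside EMAU: the E-step softly assigns pixels to bases, the M-step re-estimates the bases as weighted means. A sketch with random stand-ins throughout:

    import paddle
    import paddle.nn.functional as F

    n, c, hw, k = 1, 8, 64, 4
    x = paddle.rand([n, c, hw])                      # flattened features
    mu = F.normalize(paddle.rand([n, c, k]), axis=1, p=2)
    for _ in range(3):                               # stage_num iterations
        z = F.softmax(paddle.bmm(x.transpose([0, 2, 1]), mu), axis=2)   # E-step: [n, hw, k]
        z = F.normalize(z, axis=1, p=1)              # normalize assignments over pixels
        mu = F.normalize(paddle.bmm(x, z), axis=1, p=2)                 # M-step: [n, c, k]
    recon = paddle.matmul(mu, z.transpose([0, 2, 1]))                   # low-rank reconstruction
    print(recon.shape)   # [1, 8, 64]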
+ ''' + + def __init__(self, c, k, stage_num=3, momentum=0.1): + super(EMAU, self).__init__() + assert stage_num >= 1 + self.stage_num = stage_num + self.momentum = momentum + self.c = c + + tmp_mu = self.create_parameter( + shape=[1, c, k], + default_initializer=paddle.nn.initializer.KaimingNormal(k)) + mu = F.normalize(paddle.to_tensor(tmp_mu), axis=1, p=2) + self.register_buffer('mu', mu) + + def forward(self, x): + x_shape = paddle.shape(x) + x = x.flatten(2) + mu = paddle.tile(self.mu, [x_shape[0], 1, 1]) + + with paddle.no_grad(): + for i in range(self.stage_num): + x_t = paddle.transpose(x, [0, 2, 1]) + z = paddle.bmm(x_t, mu) + z = F.softmax(z, axis=2) + z_ = F.normalize(z, axis=1, p=1) + mu = paddle.bmm(x, z_) + mu = F.normalize(mu, axis=1, p=2) + + z_t = paddle.transpose(z, [0, 2, 1]) + x = paddle.matmul(mu, z_t) + x = paddle.reshape(x, [0, self.c, x_shape[2], x_shape[3]]) + + if self.training: + mu = paddle.mean(mu, 0, keepdim=True) + mu = F.normalize(mu, axis=1, p=2) + mu = self.mu * (1 - self.momentum) + mu * self.momentum + if paddle.distributed.get_world_size() > 1: + out = paddle.distributed.all_reduce(mu) + if out is not None: + mu = out + mu /= paddle.distributed.get_world_size() + self.mu = mu + + return x diff --git a/paddleseg/models/encnet.py b/paddleseg/models/encnet.py new file mode 100644 index 0000000000000000000000000000000000000000..81bb9ef48c794193f1072a9f859d3868b89d0429 --- /dev/null +++ b/paddleseg/models/encnet.py @@ -0,0 +1,213 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class ENCNet(nn.Layer): + """ + The ENCNet implementation based on PaddlePaddle. + + The original article refers to + Hang Zhang, Kristin Dana, et, al. "Context Encoding for Semantic Segmentation". + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of + output of backbone. + num_codes (int): The number of encoded words. Default: 32. + mid_channels (int): The channels of middle layers. Default: 512. + use_se_loss (int): Whether use semantic encoding loss. Default: True. + add_lateral (int): Whether use lateral convolution layers. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=[1, 2, 3], + num_codes=32, + mid_channels=512, + use_se_loss=True, + add_lateral=False, + pretrained=None): + super().__init__() + self.add_lateral = add_lateral + self.num_codes = num_codes + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [ + self.backbone.feat_channels[index] for index in backbone_indices + ] + + self.bottleneck = layers.ConvBNReLU( + in_channels[-1], + mid_channels, + 3, + padding=1, ) + if self.add_lateral: + self.lateral_convs = nn.LayerList() + for in_ch in in_channels[:-1]: + self.lateral_convs.append( + layers.ConvBNReLU( + in_ch, + mid_channels, + 1, )) + self.fusion = layers.ConvBNReLU( + len(in_channels) * mid_channels, + mid_channels, + 3, + padding=1, ) + + self.enc_module = EncModule(mid_channels, num_codes) + self.head = nn.Conv2D(mid_channels, num_classes, 1) + + self.fcn_head = layers.AuxLayer(self.backbone.feat_channels[2], + mid_channels, num_classes) + + self.use_se_loss = use_se_loss + if use_se_loss: + self.se_layer = nn.Linear(mid_channels, num_classes) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, inputs): + N, C, H, W = paddle.shape(inputs) + feats = self.backbone(inputs) + fcn_feat = feats[2] + + feats = [feats[i] for i in self.backbone_indices] + feat = self.bottleneck(feats[-1]) + + if self.add_lateral: + laterals = [] + for j, lateral_conv in enumerate(self.lateral_convs): + laterals.append( + F.interpolate( + lateral_conv(feats[j]), + size=paddle.shape(feat)[2:], + mode='bilinear', + align_corners=False)) + feat = self.fusion(paddle.concat([feat, *laterals], 1)) + encode_feat, feat = self.enc_module(feat) + out = self.head(feat) + out = F.interpolate( + out, size=[H, W], mode='bilinear', align_corners=False) + output = [out] + if self.training: + fcn_out = self.fcn_head(fcn_feat) + fcn_out = F.interpolate( + fcn_out, size=[H, W], mode='bilinear', align_corners=False) + output.append(fcn_out) + if self.use_se_loss: + se_out = self.se_layer(encode_feat) + output.append(se_out) + return output + return output + + +class Encoding(nn.Layer): + def __init__(self, channels, num_codes): + super().__init__() + self.channels, self.num_codes = channels, num_codes + + std = 1 / ((channels * num_codes)**0.5) + self.codewords = self.create_parameter( + shape=(num_codes, channels), + default_initializer=nn.initializer.Uniform(-std, std), ) + self.scale = self.create_parameter( + shape=(num_codes, ), + default_initializer=nn.initializer.Uniform(-1, 0), ) + self.channels = channels + + def scaled_l2(self, x, codewords, scale): + num_codes, channels = paddle.shape(codewords) + reshaped_scale = scale.reshape([1, 1, num_codes]) + expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1]) + reshaped_codewords = codewords.reshape([1, 1, num_codes, channels]) + + scaled_l2_norm = paddle.multiply( + reshaped_scale, + (expanded_x - reshaped_codewords).pow(2).sum(axis=3)) + return scaled_l2_norm + + def aggregate(self, assignment_weights, x, codewords): + num_codes, channels = paddle.shape(codewords) + reshaped_codewords = codewords.reshape([1, 1, num_codes, channels]) + expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1]) + + encoded_feat = paddle.multiply( + assignment_weights.unsqueeze(3), + (expanded_x - reshaped_codewords)).sum(axis=1) + encoded_feat = paddle.reshape(encoded_feat, + [-1, self.num_codes, 
self.channels]) + return encoded_feat + + def forward(self, x): + x_dims = x.ndim + assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format( + x_dims) + assert paddle.shape( + x + )[1] == self.channels, "Encoding channels error, excepted {} but got {}.".format( + self.channels, paddle.shape(x)[1]) + batch_size = paddle.shape(x)[0] + x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1]) + assignment_weights = F.softmax( + self.scaled_l2(x, self.codewords, self.scale), axis=2) + encoded_feat = self.aggregate(assignment_weights, x, self.codewords) + return encoded_feat + + +class EncModule(nn.Layer): + def __init__(self, in_channels, num_codes): + super().__init__() + self.encoding_project = layers.ConvBNReLU( + in_channels, + in_channels, + 1, ) + self.encoding = nn.Sequential( + Encoding( + channels=in_channels, num_codes=num_codes), + nn.BatchNorm1D(num_codes), + nn.ReLU(), ) + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels), + nn.Sigmoid(), ) + self.in_channels = in_channels + + def forward(self, x): + encoding_projection = self.encoding_project(x) + encoding_feat = self.encoding(encoding_projection) + + encoding_feat = encoding_feat.mean(axis=1) + batch_size, _, _, _ = paddle.shape(x) + + gamma = self.fc(encoding_feat) + y = gamma.reshape([batch_size, self.in_channels, 1, 1]) + output = F.relu(x + x * y) + return encoding_feat, output diff --git a/paddleseg/models/enet.py b/paddleseg/models/enet.py new file mode 100644 index 0000000000000000000000000000000000000000..a03ace8d9c14f994e26abdda0486071cdf03fc19 --- /dev/null +++ b/paddleseg/models/enet.py @@ -0,0 +1,588 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg import utils +from paddleseg.models import layers +from paddleseg.cvlibs import manager, param_init + +__all__ = ['ENet'] + + +@manager.MODELS.add_component +class ENet(nn.Layer): + """ + The ENet implementation based on PaddlePaddle. + + The original article refers to + Adam Paszke, Abhishek Chaurasia, Sangpil Kim, Eugenio Culurciello, et al."ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation" + (https://arxiv.org/abs/1606.02147). + + Args: + num_classes (int): The unique number of target classes. + pretrained (str, optional): The path or url of pretrained model. Default: None. + encoder_relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: False. + decoder_relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. 
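+
+        A minimal usage sketch (illustrative only; the class count and input shape
+        are arbitrary, and the unpooling path assumes a Paddle version that
+        provides ``F.max_unpool2d``):
+
+            import paddle
+            model = ENet(num_classes=19)
+            model.eval()
+            logit = model(paddle.rand([1, 3, 512, 1024]))[0]  # [1, 19, 512, 1024]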
+ """ + + def __init__(self, + num_classes, + pretrained=None, + encoder_relu=False, + decoder_relu=True): + super(ENet, self).__init__() + + self.numclasses = num_classes + self.initial_block = InitialBlock(3, 16, relu=encoder_relu) + + self.downsample1_0 = DownsamplingBottleneck( + 16, 64, return_indices=True, dropout_prob=0.01, relu=encoder_relu) + self.regular1_1 = RegularBottleneck( + 64, padding=1, dropout_prob=0.01, relu=encoder_relu) + self.regular1_2 = RegularBottleneck( + 64, padding=1, dropout_prob=0.01, relu=encoder_relu) + self.regular1_3 = RegularBottleneck( + 64, padding=1, dropout_prob=0.01, relu=encoder_relu) + self.regular1_4 = RegularBottleneck( + 64, padding=1, dropout_prob=0.01, relu=encoder_relu) + + self.downsample2_0 = DownsamplingBottleneck( + 64, 128, return_indices=True, dropout_prob=0.1, relu=encoder_relu) + self.regular2_1 = RegularBottleneck( + 128, padding=1, dropout_prob=0.1, relu=encoder_relu) + self.dilated2_2 = RegularBottleneck( + 128, dilation=2, padding=2, dropout_prob=0.1, relu=encoder_relu) + self.asymmetric2_3 = RegularBottleneck( + 128, + kernel_size=5, + padding=2, + asymmetric=True, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_4 = RegularBottleneck( + 128, dilation=4, padding=4, dropout_prob=0.1, relu=encoder_relu) + self.regular2_5 = RegularBottleneck( + 128, padding=1, dropout_prob=0.1, relu=encoder_relu) + self.dilated2_6 = RegularBottleneck( + 128, dilation=8, padding=8, dropout_prob=0.1, relu=encoder_relu) + self.asymmetric2_7 = RegularBottleneck( + 128, + kernel_size=5, + asymmetric=True, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_8 = RegularBottleneck( + 128, dilation=16, padding=16, dropout_prob=0.1, relu=encoder_relu) + + self.regular3_0 = RegularBottleneck( + 128, padding=1, dropout_prob=0.1, relu=encoder_relu) + self.dilated3_1 = RegularBottleneck( + 128, dilation=2, padding=2, dropout_prob=0.1, relu=encoder_relu) + self.asymmetric3_2 = RegularBottleneck( + 128, + kernel_size=5, + padding=2, + asymmetric=True, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_3 = RegularBottleneck( + 128, dilation=4, padding=4, dropout_prob=0.1, relu=encoder_relu) + self.regular3_4 = RegularBottleneck( + 128, padding=1, dropout_prob=0.1, relu=encoder_relu) + self.dilated3_5 = RegularBottleneck( + 128, dilation=8, padding=8, dropout_prob=0.1, relu=encoder_relu) + self.asymmetric3_6 = RegularBottleneck( + 128, + kernel_size=5, + asymmetric=True, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_7 = RegularBottleneck( + 128, dilation=16, padding=16, dropout_prob=0.1, relu=encoder_relu) + + self.upsample4_0 = UpsamplingBottleneck( + 128, 64, dropout_prob=0.1, relu=decoder_relu) + self.regular4_1 = RegularBottleneck( + 64, padding=1, dropout_prob=0.1, relu=decoder_relu) + self.regular4_2 = RegularBottleneck( + 64, padding=1, dropout_prob=0.1, relu=decoder_relu) + + self.upsample5_0 = UpsamplingBottleneck( + 64, 16, dropout_prob=0.1, relu=decoder_relu) + self.regular5_1 = RegularBottleneck( + 16, padding=1, dropout_prob=0.1, relu=decoder_relu) + self.transposed_conv = nn.Conv2DTranspose( + 16, + num_classes, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + + input_size = x.shape + x = self.initial_block(x) + + stage1_input_size = x.shape + x, max_indices1_0 = self.downsample1_0(x) + x = self.regular1_1(x) + x = self.regular1_2(x) + x = self.regular1_3(x) + x = self.regular1_4(x) + + stage2_input_size 
= x.shape + x, max_indices2_0 = self.downsample2_0(x) + x = self.regular2_1(x) + x = self.dilated2_2(x) + x = self.asymmetric2_3(x) + x = self.dilated2_4(x) + x = self.regular2_5(x) + x = self.dilated2_6(x) + x = self.asymmetric2_7(x) + x = self.dilated2_8(x) + + x = self.regular3_0(x) + x = self.dilated3_1(x) + x = self.asymmetric3_2(x) + x = self.dilated3_3(x) + x = self.regular3_4(x) + x = self.dilated3_5(x) + x = self.asymmetric3_6(x) + x = self.dilated3_7(x) + + x = self.upsample4_0(x, max_indices2_0, output_size=stage2_input_size) + x = self.regular4_1(x) + x = self.regular4_2(x) + + x = self.upsample5_0(x, max_indices1_0, output_size=stage1_input_size) + x = self.regular5_1(x) + x = self.transposed_conv(x, output_size=input_size[2:]) + return [x] + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class InitialBlock(nn.Layer): + """ + The initial block is composed of two branches: + 1. a main branch which performs a regular convolution with stride 2; + 2. an extension branch which performs max-pooling. + Doing both operations in parallel and concatenating their results + allows for efficient downsampling and expansion. The main branch + outputs 13 feature maps while the extension branch outputs 3, for a + total of 16 feature maps after concatenation. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number output channels. + kernel_size (int, optional): the kernel size of the filters used in + the convolution layer. Default: 3. + padding (int, optional): zero-padding added to both sides of the + input. Default: 0. + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + + def __init__(self, in_channels, out_channels, bias=False, relu=True): + super(InitialBlock, self).__init__() + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_branch = nn.Conv2D( + in_channels, + out_channels - 3, + kernel_size=3, + stride=2, + padding=1, + bias_attr=bias) + + self.ext_branch = nn.MaxPool2D(3, stride=2, padding=1) + + self.batch_norm = layers.SyncBatchNorm(out_channels) + + self.out_activation = activation() + + def forward(self, x): + main = self.main_branch(x) + ext = self.ext_branch(x) + + out = paddle.concat((main, ext), 1) + + out = self.batch_norm(out) + + return self.out_activation(out) + + +class RegularBottleneck(nn.Layer): + """ + Regular bottlenecks are the main building block of ENet. + Main branch: + 1. Shortcut connection. + Extension branch: + 1. 1x1 convolution which decreases the number of channels by + ``internal_ratio``, also called a projection; + 2. regular, dilated or asymmetric convolution; + 3. 1x1 convolution which increases the number of channels back to + ``channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + channels (int): the number of input and output channels. + internal_ratio (int, optional): a scale factor applied to + ``channels`` used to compute the number of + channels after the projection. eg. given ``channels`` equal to 128 and + internal_ratio equal to 2 the number of channels after the projection + is 64. Default: 4. + kernel_size (int, optional): the kernel size of the filters used in + the convolution layer described above in item 2 of the extension + branch. Default: 3. 
+ padding (int, optional): zero-padding added to both sides of the + input. Default: 0. + dilation (int, optional): spacing between kernel elements for the + convolution described in item 2 of the extension branch. Default: 1. + asymmetric (bool, optional): flags if the convolution described in + item 2 of the extension branch is asymmetric or not. Default: False. + dropout_prob (float, optional): probability of an element to be + zeroed. Default: 0 (no dropout). + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + + def __init__(self, + channels, + internal_ratio=4, + kernel_size=3, + padding=0, + dilation=1, + asymmetric=False, + dropout_prob=0, + bias=False, + relu=True): + super(RegularBottleneck, self).__init__() + + if internal_ratio <= 1 or internal_ratio > channels: + raise RuntimeError("Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}.". + format(channels, internal_ratio)) + + internal_channels = channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.ext_conv1 = nn.Sequential( + nn.Conv2D( + channels, + internal_channels, + kernel_size=1, + stride=1, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), + activation()) + + if asymmetric: + self.ext_conv2 = nn.Sequential( + nn.Conv2D( + internal_channels, + internal_channels, + kernel_size=(kernel_size, 1), + stride=1, + padding=(padding, 0), + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), + activation(), + nn.Conv2D( + internal_channels, + internal_channels, + kernel_size=(1, kernel_size), + stride=1, + padding=(0, padding), + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), + activation()) + else: + self.ext_conv2 = nn.Sequential( + nn.Conv2D( + internal_channels, + internal_channels, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), + activation()) + + self.ext_conv3 = nn.Sequential( + nn.Conv2D( + internal_channels, + channels, + kernel_size=1, + stride=1, + bias_attr=bias), + layers.SyncBatchNorm(channels), + activation()) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x): + main = x + + ext = self.ext_conv1(x) + ext = self.ext_conv2(ext) + ext = self.ext_conv3(ext) + ext = self.ext_regul(ext) + + out = main + ext + + return self.out_activation(out) + + +class DownsamplingBottleneck(nn.Layer): + """ + Downsampling bottlenecks further downsample the feature map size. + Main branch: + 1. max pooling with stride 2; indices are saved to be used for + unpooling later. + Extension branch: + 1. 2x2 convolution with stride 2 that decreases the number of channels + by ``internal_ratio``, also called a projection; + 2. regular convolution (by default, 3x3); + 3. 1x1 convolution which increases the number of channels to + ``out_channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number of output channels. + internal_ratio (int, optional): a scale factor applied to ``channels`` + used to compute the number of channels after the projection. eg. 
given
+        ``channels`` equal to 128 and internal_ratio equal to 2, the number of
+        channels after the projection is 64. Default: 4.
+        return_indices (bool, optional): if ``True``, will return the max
+            indices along with the outputs. Useful when unpooling later.
+        dropout_prob (float, optional): probability of an element to be
+            zeroed. Default: 0 (no dropout).
+        bias (bool, optional): Adds a learnable bias to the output if
+            ``True``. Default: False.
+        relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 internal_ratio=4,
+                 return_indices=False,
+                 dropout_prob=0,
+                 bias=False,
+                 relu=True):
+        super(DownsamplingBottleneck, self).__init__()
+
+        self.return_indices = return_indices
+
+        if internal_ratio <= 1 or internal_ratio > in_channels:
+            raise RuntimeError("Value out of range. Expected value in the "
+                               "interval [1, {0}], got internal_scale={1}. ".
+                               format(in_channels, internal_ratio))
+
+        internal_channels = in_channels // internal_ratio
+
+        if relu:
+            activation = nn.ReLU
+        else:
+            activation = nn.PReLU
+
+        self.main_max1 = nn.MaxPool2D(2, stride=2, return_mask=return_indices)
+
+        self.ext_conv1 = nn.Sequential(
+            nn.Conv2D(
+                in_channels,
+                internal_channels,
+                kernel_size=2,
+                stride=2,
+                bias_attr=bias),
+            layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_conv2 = nn.Sequential(
+            nn.Conv2D(
+                internal_channels,
+                internal_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias_attr=bias),
+            layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_conv3 = nn.Sequential(
+            nn.Conv2D(
+                internal_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                bias_attr=bias),
+            layers.SyncBatchNorm(out_channels),
+            activation())
+
+        self.ext_regul = nn.Dropout2D(p=dropout_prob)
+
+        self.out_activation = activation()
+
+    def forward(self, x):
+        if self.return_indices:
+            main, max_indices = self.main_max1(x)
+        else:
+            main = self.main_max1(x)
+            # No pooling mask was requested; return None in its place so the
+            # second element of the returned tuple is always defined.
+            max_indices = None
+
+        ext = self.ext_conv1(x)
+        ext = self.ext_conv2(ext)
+        ext = self.ext_conv3(ext)
+        ext = self.ext_regul(ext)
+
+        # Zero-pad the main branch so its channel count matches the extension
+        # branch before the element-wise addition.
+        n, ch_ext, h, w = ext.shape
+        ch_main = main.shape[1]
+        padding = paddle.zeros((n, ch_ext - ch_main, h, w))
+
+        main = paddle.concat((main, padding), 1)
+
+        out = main + ext
+
+        return self.out_activation(out), max_indices
+
+
+class UpsamplingBottleneck(nn.Layer):
+    """
+    The upsampling bottlenecks upsample the feature map resolution using max
+    pooling indices stored from the corresponding downsampling bottleneck.
+    Main branch:
+    1. 1x1 convolution with stride 1 that decreases the number of channels by
+    ``internal_ratio``, also called a projection;
+    2. max unpool layer using the max pool indices from the corresponding
+    downsampling max pool layer.
+    Extension branch:
+    1. 1x1 convolution with stride 1 that decreases the number of channels by
+    ``internal_ratio``, also called a projection;
+    2. transposed convolution (by default, 3x3);
+    3. 1x1 convolution which increases the number of channels to
+    ``out_channels``, also called an expansion;
+    4. dropout as a regularizer.
+
+    Args:
+        in_channels (int): the number of input channels.
+        out_channels (int): the number of output channels.
+        internal_ratio (int, optional): a scale factor applied to ``in_channels``
+        used to compute the number of channels after the projection. e.g. given
+        ``in_channels`` equal to 128 and ``internal_ratio`` equal to 2, the number
+        of channels after the projection is 64. Default: 4.
+ dropout_prob (float, optional): probability of an element to be zeroed. + Default: 0 (no dropout). + bias (bool, optional): Adds a learnable bias to the output if ``True``. + Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + + def __init__(self, + in_channels, + out_channels, + internal_ratio=4, + dropout_prob=0, + bias=False, + relu=True): + super(UpsamplingBottleneck, self).__init__() + + if internal_ratio <= 1 or internal_ratio > in_channels: + raise RuntimeError("Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}. ". + format(in_channels, internal_ratio)) + + internal_channels = in_channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_conv1 = nn.Sequential( + nn.Conv2D( + in_channels, out_channels, kernel_size=1, bias_attr=bias), + layers.SyncBatchNorm(out_channels)) + + self.ext_conv1 = nn.Sequential( + nn.Conv2D( + in_channels, internal_channels, kernel_size=1, bias_attr=bias), + layers.SyncBatchNorm(internal_channels), + activation()) + + self.ext_tconv1 = nn.Conv2DTranspose( + internal_channels, + internal_channels, + kernel_size=2, + stride=2, + bias_attr=bias) + self.ext_tconv1_bnorm = layers.SyncBatchNorm(internal_channels) + self.ext_tconv1_activation = activation() + + self.ext_conv2 = nn.Sequential( + nn.Conv2D( + internal_channels, out_channels, kernel_size=1, bias_attr=bias), + layers.SyncBatchNorm(out_channels)) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x, max_indices, output_size): + main = self.main_conv1(x) + main = F.max_unpool2d( + main, max_indices, kernel_size=2, output_size=output_size) + + ext = self.ext_conv1(x) + ext = self.ext_tconv1(ext, output_size=output_size[2:]) + ext = self.ext_tconv1_bnorm(ext) + ext = self.ext_tconv1_activation(ext) + ext = self.ext_conv2(ext) + ext = self.ext_regul(ext) + + out = main + ext + + return self.out_activation(out) diff --git a/paddleseg/models/espnet.py b/paddleseg/models/espnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1751f0ec0789a3a1367f4465cb41099bbd739013 --- /dev/null +++ b/paddleseg/models/espnet.py @@ -0,0 +1,481 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg import utils +from paddleseg.cvlibs import manager, param_init +from paddleseg.models import layers + + +@manager.MODELS.add_component +class ESPNetV2(nn.Layer): + """ + The ESPNetV2 implementation based on PaddlePaddle. + + The original article refers to + Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network" + (https://arxiv.org/abs/1811.11431). 
+ + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. + scale (float, optional): The scale of channels, only support scale <= 1.5 and scale == 2. Default: 1.0. + drop_prob (floa, optional): The probability of dropout. Default: 0.1. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + in_channels=3, + scale=1.0, + drop_prob=0.1, + pretrained=None): + super().__init__() + self.backbone = EESPNetBackbone(in_channels, drop_prob, scale) + self.in_channels = self.backbone.out_channels + self.proj_l4_c = layers.ConvBNPReLU( + self.in_channels[3], + self.in_channels[2], + 1, + stride=1, + bias_attr=False) + psp_size = 2 * self.in_channels[2] + self.eesp_psp = nn.Sequential( + EESP( + psp_size, + psp_size // 2, + stride=1, + branches=4, + kernel_size_maximum=7), + PSPModule(psp_size // 2, psp_size // 2), ) + + self.project_l3 = nn.Sequential( + nn.Dropout2D(p=drop_prob), + nn.Conv2D( + psp_size // 2, num_classes, 1, 1, bias_attr=False), ) + self.act_l3 = BNPReLU(num_classes) + self.project_l2 = layers.ConvBNPReLU( + self.in_channels[1] + num_classes, + num_classes, + 1, + stride=1, + bias_attr=False) + self.project_l1 = nn.Sequential( + nn.Dropout2D(p=drop_prob), + nn.Conv2D( + self.in_channels[0] + num_classes, + num_classes, + 1, + 1, + bias_attr=False), ) + + self.pretrained = pretrained + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def hierarchical_upsample(self, x, factor=3): + for i in range(factor): + x = F.interpolate( + x, scale_factor=2, mode='bilinear', align_corners=True) + return x + + def forward(self, x): + out_l1, out_l2, out_l3, out_l4 = self.backbone(x) + + out_l4_proj = self.proj_l4_c(out_l4) + l4_to_l3 = F.interpolate( + out_l4_proj, scale_factor=2, mode='bilinear', align_corners=True) + merged_l3 = self.eesp_psp(paddle.concat([out_l3, l4_to_l3], axis=1)) + proj_merge_l3 = self.project_l3(merged_l3) + proj_merge_l3 = self.act_l3(proj_merge_l3) + + l3_to_l2 = F.interpolate( + proj_merge_l3, scale_factor=2, mode='bilinear', align_corners=True) + merged_l2 = self.project_l2(paddle.concat([out_l2, l3_to_l2], axis=1)) + + l2_to_l1 = F.interpolate( + merged_l2, scale_factor=2, mode='bilinear', align_corners=True) + merged_l1 = self.project_l1(paddle.concat([out_l1, l2_to_l1], axis=1)) + + if self.training: + return [ + F.interpolate( + merged_l1, + scale_factor=2, + mode='bilinear', + align_corners=True), + self.hierarchical_upsample(proj_merge_l3), + ] + else: + return [ + F.interpolate( + merged_l1, + scale_factor=2, + mode='bilinear', + align_corners=True) + ] + + +class BNPReLU(nn.Layer): + def __init__(self, out_channels, **kwargs): + super().__init__() + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = layers.SyncBatchNorm( + out_channels, data_format=data_format) + self._prelu = layers.Activation("prelu") + + def forward(self, x): + x = self._batch_norm(x) + x = self._prelu(x) + return x + + +class EESP(nn.Layer): + """ + EESP block, principle: reduce -> split -> transform -> merge + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int, optional): Factor by which we should skip (useful for down-sampling). If 2, then down-samples the feature map by 2. Default: 1. 
+ branches (int, optional): Number of branches. Default: 4. + kernel_size_maximum (int, optional): A maximum value of receptive field allowed for EESP block. Default: 7. + down_method (str, optional): Down sample or not, only support 'avg' and 'esp'(equivalent to stride is 2 or not). Default: 'esp'. + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + branches=4, + kernel_size_maximum=7, + down_method='esp'): + super(EESP, self).__init__() + if out_channels % branches != 0: + raise RuntimeError( + "The out_channes for EESP should be factorized by branches, but out_channels={} cann't be factorized by branches={}" + .format(out_channels, branches)) + assert down_method in [ + 'avg', 'esp' + ], "The down_method for EESP only support 'avg' or 'esp', but got down_method={}".format( + down_method) + self.in_channels = in_channels + self.stride = stride + + in_branch_channels = int(out_channels / branches) + self.group_conv_in = layers.ConvBNPReLU( + in_channels, + in_branch_channels, + 1, + stride=1, + groups=branches, + bias_attr=False) + + map_ksize_dilation = { + 3: 1, + 5: 2, + 7: 3, + 9: 4, + 11: 5, + 13: 6, + 15: 7, + 17: 8 + } + self.kernel_sizes = [] + for i in range(branches): + kernel_size = 3 + 2 * i + kernel_size = kernel_size if kernel_size <= kernel_size_maximum else 3 + self.kernel_sizes.append(kernel_size) + self.kernel_sizes.sort() + + self.spp_modules = nn.LayerList() + for i in range(branches): + dilation = map_ksize_dilation[self.kernel_sizes[i]] + self.spp_modules.append( + nn.Conv2D( + in_branch_channels, + in_branch_channels, + kernel_size=3, + padding='same', + stride=stride, + dilation=dilation, + groups=in_branch_channels, + bias_attr=False)) + self.group_conv_out = layers.ConvBN( + out_channels, + out_channels, + kernel_size=1, + stride=1, + groups=branches, + bias_attr=False) + self.bn_act = BNPReLU(out_channels) + self._act = nn.PReLU() + self.down_method = True if down_method == 'avg' else False + + @paddle.jit.not_to_static + def convert_group_x(self, group_merge, x): + if x.shape == group_merge.shape: + group_merge += x + + return group_merge + + def forward(self, x): + group_out = self.group_conv_in(x) + output = [self.spp_modules[0](group_out)] + + for k in range(1, len(self.spp_modules)): + output_k = self.spp_modules[k](group_out) + output_k = output_k + output[k - 1] + output.append(output_k) + + group_merge = self.group_conv_out( + self.bn_act(paddle.concat( + output, axis=1))) + + if self.stride == 2 and self.down_method: + return group_merge + + group_merge = self.convert_group_x(group_merge, x) + out = self._act(group_merge) + return out + + +class PSPModule(nn.Layer): + def __init__(self, in_channels, out_channels, sizes=4): + super().__init__() + self.stages = nn.LayerList([ + nn.Conv2D( + in_channels, + in_channels, + kernel_size=3, + stride=1, + groups=in_channels, + padding='same', + bias_attr=False) for _ in range(sizes) + ]) + self.project = layers.ConvBNPReLU( + in_channels * (sizes + 1), + out_channels, + 1, + stride=1, + bias_attr=False) + + def forward(self, feats): + h, w = paddle.shape(feats)[2:4] + out = [feats] + for stage in self.stages: + feats = F.avg_pool2d(feats, kernel_size=3, stride=2, padding='same') + upsampled = F.interpolate( + stage(feats), size=[h, w], mode='bilinear', align_corners=True) + out.append(upsampled) + return self.project(paddle.concat(out, axis=1)) + + +class DownSampler(nn.Layer): + """ + Down sampler. + Args: + in_channels (int): Number of input channels. 
+        out_channels (int): Number of output channels.
+        branches (int, optional): Number of branches. Default: 4.
+        kernel_size_maximum (int, optional): A maximum value of kernel_size for EESP block. Default: 9.
+        shortcut (bool, optional): Use shortcut or not. Default: True.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 branches=4,
+                 kernel_size_maximum=9,
+                 shortcut=True):
+        super().__init__()
+        if out_channels < in_channels:
+            raise RuntimeError(
+                "The out_channels for DownSampler should be no smaller than in_channels, but got in_channels={}, out_channels={}"
+                .format(in_channels, out_channels))
+        self.eesp = EESP(
+            in_channels,
+            out_channels - in_channels,
+            stride=2,
+            branches=branches,
+            kernel_size_maximum=kernel_size_maximum,
+            down_method='avg')
+        self.avg = nn.AvgPool2D(kernel_size=3, padding=1, stride=2)
+        if shortcut:
+            self.shortcut_layer = nn.Sequential(
+                layers.ConvBNPReLU(
+                    3, 3, 3, stride=1, bias_attr=False),
+                layers.ConvBN(
+                    3, out_channels, 1, stride=1, bias_attr=False), )
+        self._act = nn.PReLU()
+
+    def forward(self, x, inputs=None):
+        avg_out = self.avg(x)
+        eesp_out = self.eesp(x)
+        output = paddle.concat([avg_out, eesp_out], axis=1)
+
+        if inputs is not None:
+            w1 = paddle.shape(avg_out)[2]
+            w2 = paddle.shape(inputs)[2]
+
+            # Average-pool the raw input until it matches the spatial size of
+            # the downsampled feature, then add the projected shortcut.
+            while w2 != w1:
+                inputs = F.avg_pool2d(
+                    inputs, kernel_size=3, padding=1, stride=2)
+                w2 = paddle.shape(inputs)[2]
+            output = output + self.shortcut_layer(inputs)
+        return self._act(output)
+
+
+class EESPNetBackbone(nn.Layer):
+    """
+    The EESPNetBackbone implementation based on PaddlePaddle.
+
+    The original article refers to
+    Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network"
+    (https://arxiv.org/abs/1811.11431).
+
+    Args:
+        in_channels (int, optional): Number of input channels. Default: 3.
+        drop_prob (float, optional): The probability of dropout. Default: 0.1.
+        scale (float, optional): The scale of channels; only scale <= 1.5 or scale == 2 is supported. Default: 1.0.
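+
+        A minimal usage sketch (illustrative only; the input shape is an arbitrary
+        assumption, and with ``scale=1.0`` the four stages work out to
+        32/64/128/256 channels at strides 2/4/8/16):
+
+            import paddle
+            backbone = EESPNetBackbone(in_channels=3, scale=1.0)
+            f1, f2, f3, f4 = backbone(paddle.rand([1, 3, 256, 512]))
+            # f1: [1, 32, 128, 256] ... f4: [1, 256, 16, 32]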
+ """ + + def __init__(self, in_channels=3, drop_prob=0.1, scale=1.0): + super().__init__() + reps = [0, 3, 7, 3] + + num_level = 4 # 1/2, 1/4, 1/8, 1/16 + kernel_size_limitations = [13, 11, 9, 7] # kernel size limitation + branch_list = [4] * len( + kernel_size_limitations) # branches at different levels + + base_channels = 32 # first conv output channels + channels_config = [base_channels] * num_level + + for i in range(num_level): + if i == 0: + channels = int(base_channels * scale) + channels = math.ceil(channels / branch_list[0]) * branch_list[0] + channels_config[ + i] = base_channels if channels > base_channels else channels + else: + channels_config[i] = channels * pow(2, i) + + self.level1 = layers.ConvBNPReLU( + in_channels, channels_config[0], 3, stride=2, bias_attr=False) + + self.level2 = DownSampler( + channels_config[0], + channels_config[1], + branches=branch_list[0], + kernel_size_maximum=kernel_size_limitations[0], + shortcut=True) + + self.level3_0 = DownSampler( + channels_config[1], + channels_config[2], + branches=branch_list[1], + kernel_size_maximum=kernel_size_limitations[1], + shortcut=True) + self.level3 = nn.LayerList() + for i in range(reps[1]): + self.level3.append( + EESP( + channels_config[2], + channels_config[2], + stride=1, + branches=branch_list[2], + kernel_size_maximum=kernel_size_limitations[2])) + + self.level4_0 = DownSampler( + channels_config[2], + channels_config[3], + branches=branch_list[2], + kernel_size_maximum=kernel_size_limitations[2], + shortcut=True) + self.level4 = nn.LayerList() + for i in range(reps[2]): + self.level4.append( + EESP( + channels_config[3], + channels_config[3], + stride=1, + branches=branch_list[3], + kernel_size_maximum=kernel_size_limitations[3])) + + self.out_channels = channels_config + + self.init_params() + + def init_params(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + param_init.kaiming_normal_init(m.weight) + if m.bias is not None: + param_init.constant_init(m.bias, value=0.0) + elif isinstance(m, nn.BatchNorm2D): + param_init.constant_init(m.weight, value=1.0) + param_init.constant_init(m.bias, value=0.0) + elif isinstance(m, nn.Linear): + param_init.normal_init(m.weight, std=0.001) + if m.bias is not None: + param_init.constant_init(m.bias, value=0.0) + + def forward(self, x): + out_l1 = self.level1(x) + out_l2 = self.level2(out_l1, x) + out_l3 = self.level3_0(out_l2, x) + for i, layer in enumerate(self.level3): + out_l3 = layer(out_l3) + out_l4 = self.level4_0(out_l3, x) + for i, layer in enumerate(self.level4): + out_l4 = layer(out_l4) + return out_l1, out_l2, out_l3, out_l4 + + +if __name__ == '__main__': + import paddle + import numpy as np + + paddle.enable_static() + + startup_prog = paddle.static.default_startup_program() + + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(startup_prog) + path_prefix = "./output/model" + + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(path_prefix, exe)) + print('inference_program:', inference_program) + + tensor_img = np.array( + np.random.random((1, 3, 1024, 2048)), dtype=np.float32) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) diff --git a/paddleseg/models/espnetv1.py b/paddleseg/models/espnetv1.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1142e48d940780ba3705dd365c033ac5127b88 --- /dev/null +++ b/paddleseg/models/espnetv1.py @@ -0,0 +1,313 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers +from paddleseg.cvlibs import manager +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class ESPNetV1(nn.Layer): + """ + The ESPNetV1 implementation based on PaddlePaddle. + + The original article refers to + Sachin Mehta1, Mohammad Rastegari, Anat Caspi, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNet: Efficient Spatial Pyramid of Dilated Convolutions for Semantic Segmentation" + (https://arxiv.org/abs/1803.06815). + + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. + level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 2. + level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + in_channels=3, + level2_depth=2, + level3_depth=3, + pretrained=None): + super().__init__() + self.encoder = ESPNetEncoder(num_classes, in_channels, level2_depth, + level3_depth) + + self.level3_up = nn.Conv2DTranspose( + num_classes, + num_classes, + 2, + stride=2, + padding=0, + output_padding=0, + bias_attr=False) + self.br3 = layers.SyncBatchNorm(num_classes) + self.level2_proj = nn.Conv2D( + in_channels + 128, num_classes, 1, bias_attr=False) + self.combine_l2_l3 = nn.Sequential( + BNPReLU(2 * num_classes), + DilatedResidualBlock( + 2 * num_classes, num_classes, residual=False), ) + self.level2_up = nn.Sequential( + nn.Conv2DTranspose( + num_classes, + num_classes, + 2, + stride=2, + padding=0, + output_padding=0, + bias_attr=False), + BNPReLU(num_classes), ) + self.out_proj = layers.ConvBNPReLU( + 16 + in_channels + num_classes, + num_classes, + 3, + padding='same', + stride=1) + self.out_up = nn.Conv2DTranspose( + num_classes, + num_classes, + 2, + stride=2, + padding=0, + output_padding=0, + bias_attr=False) + self.pretrained = pretrained + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + p1, p2, p3 = self.encoder(x) + up_p3 = self.level3_up(p3) + + combine = self.combine_l2_l3(paddle.concat([up_p3, p2], axis=1)) + up_p2 = self.level2_up(combine) + + combine = self.out_proj(paddle.concat([up_p2, p1], axis=1)) + out = self.out_up(combine) + return [out] + + +class BNPReLU(nn.Layer): + def __init__(self, channels): + super().__init__() + self.bn = layers.SyncBatchNorm(channels) + self.act = nn.PReLU(channels) + + def forward(self, x): + x = self.bn(x) + x = self.act(x) + return x + + +class DownSampler(nn.Layer): + """ + Down sampler. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. 
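+
+        A minimal usage sketch (illustrative only; the channel counts mirror the
+        encoder's first use of this block, and the input shape is arbitrary):
+
+            import paddle
+            down = DownSampler(19, 64)
+            y = down(paddle.rand([1, 19, 128, 128]))  # spatial size halved: [1, 64, 64, 64]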
+ """ + + def __init__(self, in_channels, out_channels): + super().__init__() + branch_channels = out_channels // 5 + remain_channels = out_channels - branch_channels * 4 + self.conv1 = nn.Conv2D( + in_channels, + branch_channels, + 3, + stride=2, + padding=1, + bias_attr=False) + self.d_conv1 = nn.Conv2D( + branch_channels, remain_channels, 3, padding=1, bias_attr=False) + self.d_conv2 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=2, + dilation=2, + bias_attr=False) + self.d_conv4 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=4, + dilation=4, + bias_attr=False) + self.d_conv8 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=8, + dilation=8, + bias_attr=False) + self.d_conv16 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=16, + dilation=16, + bias_attr=False) + self.bn = layers.SyncBatchNorm(out_channels) + self.act = nn.PReLU(out_channels) + + def forward(self, x): + x = self.conv1(x) + d1 = self.d_conv1(x) + d2 = self.d_conv2(x) + d4 = self.d_conv4(x) + d8 = self.d_conv8(x) + d16 = self.d_conv16(x) + + feat1 = d2 + feat2 = feat1 + d4 + feat3 = feat2 + d8 + feat4 = feat3 + d16 + + feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1) + out = self.bn(feat) + out = self.act(out) + return out + + +class DilatedResidualBlock(nn.Layer): + ''' + ESP block, principle: reduce -> split -> transform -> merge + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + residual (bool, optional): Add a residual connection through identity operation. Default: True. + ''' + + def __init__(self, in_channels, out_channels, residual=True): + super().__init__() + branch_channels = out_channels // 5 + remain_channels = out_channels - branch_channels * 4 + self.conv1 = nn.Conv2D(in_channels, branch_channels, 1, bias_attr=False) + self.d_conv1 = nn.Conv2D( + branch_channels, remain_channels, 3, padding=1, bias_attr=False) + self.d_conv2 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=2, + dilation=2, + bias_attr=False) + self.d_conv4 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=4, + dilation=4, + bias_attr=False) + self.d_conv8 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=8, + dilation=8, + bias_attr=False) + self.d_conv16 = nn.Conv2D( + branch_channels, + branch_channels, + 3, + padding=16, + dilation=16, + bias_attr=False) + + self.bn = BNPReLU(out_channels) + self.residual = residual + + def forward(self, x): + x_proj = self.conv1(x) + d1 = self.d_conv1(x_proj) + d2 = self.d_conv2(x_proj) + d4 = self.d_conv4(x_proj) + d8 = self.d_conv8(x_proj) + d16 = self.d_conv16(x_proj) + + feat1 = d2 + feat2 = feat1 + d4 + feat3 = feat2 + d8 + feat4 = feat3 + d16 + + feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1) + + if self.residual: + feat = feat + x + out = self.bn(feat) + return out + + +class ESPNetEncoder(nn.Layer): + ''' + The ESPNet-C implementation based on PaddlePaddle. + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. + level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 5. + level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3. 
+ ''' + + def __init__(self, + num_classes, + in_channels=3, + level2_depth=5, + level3_depth=3): + super().__init__() + self.level1 = layers.ConvBNPReLU( + in_channels, 16, 3, padding='same', stride=2) + self.br1 = BNPReLU(in_channels + 16) + self.proj1 = layers.ConvBNPReLU(in_channels + 16, num_classes, 1) + + self.level2_0 = DownSampler(in_channels + 16, 64) + self.level2 = nn.Sequential( + *[DilatedResidualBlock(64, 64) for i in range(level2_depth)]) + self.br2 = BNPReLU(in_channels + 128) + self.proj2 = layers.ConvBNPReLU(in_channels + 128, num_classes, 1) + + self.level3_0 = DownSampler(in_channels + 128, 128) + self.level3 = nn.Sequential( + *[DilatedResidualBlock(128, 128) for i in range(level3_depth)]) + self.br3 = BNPReLU(256) + self.proj3 = layers.ConvBNPReLU(256, num_classes, 1) + + def forward(self, x): + f1 = self.level1(x) + down2 = F.adaptive_avg_pool2d(x, output_size=f1.shape[2:]) + feat1 = paddle.concat([f1, down2], axis=1) + feat1 = self.br1(feat1) + p1 = self.proj1(feat1) + + f2_res = self.level2_0(feat1) + f2 = self.level2(f2_res) + down4 = F.adaptive_avg_pool2d(x, output_size=f2.shape[2:]) + feat2 = paddle.concat([f2, f2_res, down4], axis=1) + feat2 = self.br2(feat2) + p2 = self.proj2(feat2) + + f3_res = self.level3_0(feat2) + f3 = self.level3(f3_res) + feat3 = paddle.concat([f3, f3_res], axis=1) + feat3 = self.br3(feat3) + p3 = self.proj3(feat3) + + return p1, p2, p3 diff --git a/paddleseg/models/fast_scnn.py b/paddleseg/models/fast_scnn.py new file mode 100644 index 0000000000000000000000000000000000000000..0f1422e704c3745c07574fe4b7d98d90b1add11c --- /dev/null +++ b/paddleseg/models/fast_scnn.py @@ -0,0 +1,316 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F + +import paddle +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + +__all__ = ['FastSCNN'] + + +@manager.MODELS.add_component +class FastSCNN(nn.Layer): + """ + The FastSCNN implementation based on PaddlePaddle. + As mentioned in the original paper, FastSCNN is a real-time segmentation algorithm (123.5fps) + even for high resolution images (1024x2048). + The original article refers to + Poudel, Rudra PK, et al. "Fast-scnn: Fast semantic segmentation network" + (https://arxiv.org/pdf/1902.04502.pdf). + Args: + num_classes (int): The unique number of target classes. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. + If true, auxiliary loss will be added after LearningToDownsample module. Default: False. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
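+
+        A minimal usage sketch (illustrative only; the class count and input shape
+        are arbitrary, and note the auxiliary logit is appended to the returned
+        list whenever ``enable_auxiliary_loss`` is set, so the main logit is
+        element [0]):
+
+            import paddle
+            model = FastSCNN(num_classes=19)
+            model.eval()
+            logit = model(paddle.rand([1, 3, 512, 1024]))[0]  # [1, 19, 512, 1024]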
+ """ + + def __init__(self, + num_classes, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + + super().__init__() + + self.learning_to_downsample = LearningToDownsample(32, 48, 64) + self.global_feature_extractor = GlobalFeatureExtractor( + in_channels=64, + block_channels=[64, 96, 128], + out_channels=128, + expansion=6, + num_blocks=[3, 3, 3], + align_corners=True) + self.feature_fusion = FeatureFusionModule(64, 128, 128, align_corners) + self.classifier = Classifier(128, num_classes) + + if enable_auxiliary_loss: + self.auxlayer = layers.AuxLayer(64, 32, num_classes) + + self.enable_auxiliary_loss = enable_auxiliary_loss + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + logit_list = [] + input_size = paddle.shape(x)[2:] + higher_res_features = self.learning_to_downsample(x) + x = self.global_feature_extractor(higher_res_features) + x = self.feature_fusion(higher_res_features, x) + logit = self.classifier(x) + logit = F.interpolate( + logit, + input_size, + mode='bilinear', + align_corners=self.align_corners) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + auxiliary_logit = self.auxlayer(higher_res_features) + auxiliary_logit = F.interpolate( + auxiliary_logit, + input_size, + mode='bilinear', + align_corners=self.align_corners) + logit_list.append(auxiliary_logit) + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class LearningToDownsample(nn.Layer): + """ + Learning to downsample module. + This module consists of three downsampling blocks (one conv and two separable conv) + Args: + dw_channels1 (int, optional): The input channels of the first sep conv. Default: 32. + dw_channels2 (int, optional): The input channels of the second sep conv. Default: 48. + out_channels (int, optional): The output channels of LearningToDownsample module. Default: 64. + """ + + def __init__(self, dw_channels1=32, dw_channels2=48, out_channels=64): + super(LearningToDownsample, self).__init__() + + self.conv_bn_relu = layers.ConvBNReLU( + in_channels=3, out_channels=dw_channels1, kernel_size=3, stride=2) + self.dsconv_bn_relu1 = layers.SeparableConvBNReLU( + in_channels=dw_channels1, + out_channels=dw_channels2, + kernel_size=3, + stride=2, + padding=1) + self.dsconv_bn_relu2 = layers.SeparableConvBNReLU( + in_channels=dw_channels2, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1) + + def forward(self, x): + x = self.conv_bn_relu(x) + x = self.dsconv_bn_relu1(x) + x = self.dsconv_bn_relu2(x) + return x + + +class GlobalFeatureExtractor(nn.Layer): + """ + Global feature extractor module. + This module consists of three InvertedBottleneck blocks (like inverted residual introduced by MobileNetV2) and + a PPModule (introduced by PSPNet). + Args: + in_channels (int): The number of input channels to the module. + block_channels (tuple): A tuple represents output channels of each bottleneck block. + out_channels (int): The number of output channels of the module. Default: + expansion (int): The expansion factor in bottleneck. + num_blocks (tuple): It indicates the repeat time of each bottleneck. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, in_channels, block_channels, out_channels, expansion, + num_blocks, align_corners): + super(GlobalFeatureExtractor, self).__init__() + + self.bottleneck1 = self._make_layer(InvertedBottleneck, in_channels, + block_channels[0], num_blocks[0], + expansion, 2) + self.bottleneck2 = self._make_layer( + InvertedBottleneck, block_channels[0], block_channels[1], + num_blocks[1], expansion, 2) + self.bottleneck3 = self._make_layer( + InvertedBottleneck, block_channels[1], block_channels[2], + num_blocks[2], expansion, 1) + + self.ppm = layers.PPModule( + block_channels[2], + out_channels, + bin_sizes=(1, 2, 3, 6), + dim_reduction=True, + align_corners=align_corners) + + def _make_layer(self, + block, + in_channels, + out_channels, + blocks, + expansion=6, + stride=1): + layers = [] + layers.append(block(in_channels, out_channels, expansion, stride)) + for _ in range(1, blocks): + layers.append(block(out_channels, out_channels, expansion, 1)) + return nn.Sequential(*layers) + + def forward(self, x): + x = self.bottleneck1(x) + x = self.bottleneck2(x) + x = self.bottleneck3(x) + x = self.ppm(x) + return x + + +class InvertedBottleneck(nn.Layer): + """ + Single Inverted bottleneck implementation. + Args: + in_channels (int): The number of input channels to bottleneck block. + out_channels (int): The number of output channels of bottleneck block. + expansion (int, optional). The expansion factor in bottleneck. Default: 6. + stride (int, optional). The stride used in depth-wise conv. Defalt: 2. + """ + + def __init__(self, in_channels, out_channels, expansion=6, stride=2): + super().__init__() + + self.use_shortcut = stride == 1 and in_channels == out_channels + + expand_channels = in_channels * expansion + self.block = nn.Sequential( + # pw + layers.ConvBNReLU( + in_channels=in_channels, + out_channels=expand_channels, + kernel_size=1, + bias_attr=False), + # dw + layers.ConvBNReLU( + in_channels=expand_channels, + out_channels=expand_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=expand_channels, + bias_attr=False), + # pw-linear + layers.ConvBN( + in_channels=expand_channels, + out_channels=out_channels, + kernel_size=1, + bias_attr=False)) + + def forward(self, x): + out = self.block(x) + if self.use_shortcut: + out = x + out + return out + + +class FeatureFusionModule(nn.Layer): + """ + Feature Fusion Module Implementation. + This module fuses high-resolution feature and low-resolution feature. + Args: + high_in_channels (int): The channels of high-resolution feature (output of LearningToDownsample). + low_in_channels (int): The channels of low-resolution feature (output of GlobalFeatureExtractor). + out_channels (int): The output channels of this module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, high_in_channels, low_in_channels, out_channels, + align_corners): + super().__init__() + + # Only depth-wise conv + self.dwconv = layers.ConvBNReLU( + in_channels=low_in_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + groups=128, + bias_attr=False) + + self.conv_low_res = layers.ConvBN(out_channels, out_channels, 1) + self.conv_high_res = layers.ConvBN(high_in_channels, out_channels, 1) + self.align_corners = align_corners + + def forward(self, high_res_input, low_res_input): + low_res_input = F.interpolate( + low_res_input, + paddle.shape(high_res_input)[2:], + mode='bilinear', + align_corners=self.align_corners) + low_res_input = self.dwconv(low_res_input) + low_res_input = self.conv_low_res(low_res_input) + high_res_input = self.conv_high_res(high_res_input) + x = high_res_input + low_res_input + + return F.relu(x) + + +class Classifier(nn.Layer): + """ + The Classifier module implementation. + This module consists of two depth-wise conv and one conv. + Args: + input_channels (int): The input channels to this module. + num_classes (int): The unique number of target classes. + """ + + def __init__(self, input_channels, num_classes): + super().__init__() + + self.dsconv1 = layers.SeparableConvBNReLU( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=3, + padding=1) + + self.dsconv2 = layers.SeparableConvBNReLU( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=3, + padding=1) + + self.conv = nn.Conv2D( + in_channels=input_channels, out_channels=num_classes, kernel_size=1) + + self.dropout = nn.Dropout(p=0.1) # dropout_prob + + def forward(self, x): + x = self.dsconv1(x) + x = self.dsconv2(x) + x = self.dropout(x) + x = self.conv(x) + return x diff --git a/paddleseg/models/fastfcn.py b/paddleseg/models/fastfcn.py new file mode 100644 index 0000000000000000000000000000000000000000..87c86eb7af4a6bd88d056763f51b1bfa2064e113 --- /dev/null +++ b/paddleseg/models/fastfcn.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class FastFCN(nn.Layer): + """ + The FastFCN implementation based on PaddlePaddle. + + The original article refers to + Huikai Wu, Junge Zhang, Kaiqi Huang. "FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation". + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of + output of backbone. + num_codes (int): The number of encoded words. Default: 32. + mid_channels (int): The channels of middle layers. Default: 512. + use_jpu (bool): Whether use jpu module. Default: True. 
+        aux_loss (bool): Whether to use the auxiliary head loss. Default: True.
+        use_se_loss (bool): Whether to use the semantic encoding loss. Default: True.
+        add_lateral (bool): Whether to use lateral convolution layers. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 num_codes=32,
+                 mid_channels=512,
+                 use_jpu=True,
+                 aux_loss=True,
+                 use_se_loss=True,
+                 add_lateral=False,
+                 pretrained=None):
+        super().__init__()
+        self.add_lateral = add_lateral
+        self.num_codes = num_codes
+        self.backbone = backbone
+        self.use_jpu = use_jpu
+        in_channels = self.backbone.feat_channels
+
+        if use_jpu:
+            self.jpu_layer = layers.JPU(in_channels, mid_channels)
+            in_channels[-1] = mid_channels * 4
+            self.bottleneck = layers.ConvBNReLU(
+                in_channels[-1],
+                mid_channels,
+                1,
+                padding=0,
+                bias_attr=False, )
+        else:
+            self.bottleneck = layers.ConvBNReLU(
+                in_channels[-1],
+                mid_channels,
+                3,
+                padding=1,
+                bias_attr=False, )
+        if self.add_lateral:
+            self.lateral_convs = nn.LayerList([
+                layers.ConvBNReLU(
+                    in_channels[0], mid_channels, 1, bias_attr=False),
+                layers.ConvBNReLU(
+                    in_channels[1], mid_channels, 1, bias_attr=False),
+            ])
+
+            self.fusion = layers.ConvBNReLU(
+                3 * mid_channels,
+                mid_channels,
+                3,
+                padding=1,
+                bias_attr=False, )
+
+        self.enc_module = EncModule(mid_channels, num_codes)
+        self.cls_seg = nn.Conv2D(mid_channels, num_classes, 1)
+
+        self.aux_loss = aux_loss
+        if self.aux_loss:
+            self.fcn_head = layers.AuxLayer(in_channels[-2], mid_channels,
+                                            num_classes)
+
+        self.use_se_loss = use_se_loss
+        if use_se_loss:
+            self.se_layer = nn.Linear(mid_channels, num_classes)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, inputs):
+        imsize = paddle.shape(inputs)[2:]
+        feats = self.backbone(inputs)
+        if self.use_jpu:
+            feats = self.jpu_layer(*feats)
+
+        fcn_feat = feats[2]
+
+        feat = self.bottleneck(feats[-1])
+        if self.add_lateral:
+            laterals = []
+            for i, lateral_conv in enumerate(self.lateral_convs):
+                laterals.append(
+                    F.interpolate(
+                        lateral_conv(feats[i]),
+                        size=paddle.shape(feat)[2:],
+                        mode='bilinear',
+                        align_corners=False))
+            feat = self.fusion(paddle.concat([feat, *laterals], 1))
+        encode_feat, feat = self.enc_module(feat)
+        out = self.cls_seg(feat)
+        out = F.interpolate(
+            out, size=imsize, mode='bilinear', align_corners=False)
+        output = [out]
+
+        if self.training:
+            fcn_out = self.fcn_head(fcn_feat)
+            fcn_out = F.interpolate(
+                fcn_out, size=imsize, mode='bilinear', align_corners=False)
+            output.append(fcn_out)
+            if self.use_se_loss:
+                se_out = self.se_layer(encode_feat)
+                output.append(se_out)
+            return output
+        return output
+
+
+class Encoding(nn.Layer):
+    def __init__(self, channels, num_codes):
+        super().__init__()
+        self.channels, self.num_codes = channels, num_codes
+
+        std = 1 / ((channels * num_codes)**0.5)
+        self.codewords = self.create_parameter(
+            shape=(num_codes, channels),
+            default_initializer=nn.initializer.Uniform(-std, std), )
+        self.scale = self.create_parameter(
+            shape=(num_codes, ),
+            default_initializer=nn.initializer.Uniform(-1, 0), )
+
+    def scaled_l2(self, x, codewords, scale):
+        num_codes, channels = paddle.shape(codewords)
+        reshaped_scale = scale.reshape([1, 1, num_codes])
+        expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
+        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
+
+        scaled_l2_norm = reshaped_scale * (
+            expanded_x - reshaped_codewords).pow(2).sum(axis=3)
+        return scaled_l2_norm
+
+    def aggregate(self, assignment_weights, x, codewords):
+        num_codes, channels = paddle.shape(codewords)
+        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
+        expanded_x = paddle.tile(
+            x.unsqueeze(2),
+            [1, 1, num_codes, 1], )
+        encoded_feat = (assignment_weights.unsqueeze(3) *
+                        (expanded_x - reshaped_codewords)).sum(axis=1)
+        return encoded_feat
+
+    def forward(self, x):
+        x_dims = x.ndim
+        assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format(
+            x_dims)
+        assert paddle.shape(
+            x
+        )[1] == self.channels, "Encoding channels error, expected {} but got {}.".format(
+            self.channels, paddle.shape(x)[1])
+        batch_size = paddle.shape(x)[0]
+        x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1])
+        assignment_weights = F.softmax(
+            self.scaled_l2(x, self.codewords, self.scale), axis=2)
+
+        encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
+        encoded_feat = encoded_feat.reshape([batch_size, self.num_codes, -1])
+        return encoded_feat
+
+
+class EncModule(nn.Layer):
+    def __init__(self, in_channels, num_codes):
+        super().__init__()
+        self.encoding_project = layers.ConvBNReLU(
+            in_channels,
+            in_channels,
+            1, )
+        self.encoding = nn.Sequential(
+            Encoding(
+                channels=in_channels, num_codes=num_codes),
+            nn.BatchNorm1D(num_codes),
+            nn.ReLU(), )
+        self.fc = nn.Sequential(
+            nn.Linear(in_channels, in_channels),
+            nn.Sigmoid(), )
+
+    def forward(self, x):
+        encoding_projection = self.encoding_project(x)
+        encoding_feat = self.encoding(encoding_projection).mean(axis=1)
+        batch_size, channels, _, _ = paddle.shape(x)
+        gamma = self.fc(encoding_feat)
+        y = gamma.reshape([batch_size, channels, 1, 1])
+        output = F.relu(x + x * y)
+        return encoding_feat, output
diff --git a/paddleseg/models/fcn.py b/paddleseg/models/fcn.py
new file mode 100644
index 0000000000000000000000000000000000000000..e12aacd7f3cb2481c392b979300f1d686eb875bb
--- /dev/null
+++ b/paddleseg/models/fcn.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+import paddle
+from paddleseg import utils
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models import layers
+
+
+@manager.MODELS.add_component
+class FCN(nn.Layer):
+    """
+    A simple implementation for FCN based on PaddlePaddle.
+
+    The original article refers to
+    Evan Shelhamer, et, al. "Fully Convolutional Networks for Semantic Segmentation"
+    (https://arxiv.org/abs/1411.4038).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone networks.
+        backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
+            Default: (-1, ).
+        channels (int, optional): The channels between conv layer and the last layer of FCNHead.
+            If None, it will be the number of channels of input features. Default: None.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        bias (bool, optional): Whether the conv layers have bias. Default: True.
+        data_format (str, optional): The data layout; only 'NCHW' is supported. Default: 'NCHW'.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(-1, ),
+                 channels=None,
+                 align_corners=False,
+                 pretrained=None,
+                 bias=True,
+                 data_format="NCHW"):
+        super(FCN, self).__init__()
+
+        if data_format != 'NCHW':
+            raise ValueError("fcn only supports NCHW data format")
+        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.head = FCNHead(
+            num_classes,
+            backbone_indices,
+            backbone_channels,
+            channels,
+            bias=bias)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.data_format = data_format
+        self.init_weight()
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = self.head(feat_list)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class FCNHead(nn.Layer):
+    """
+    A simple implementation for FCNHead based on PaddlePaddle.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
+            Default: (-1, ).
+        backbone_channels (tuple, optional): The channels of output of backbone. Default: (270, ).
+        channels (int, optional): The channels between conv layer and the last layer of FCNHead.
+            If None, it will be the number of channels of input features. Default: None.
+        bias (bool, optional): Whether the conv layers have bias. Default: True.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone_indices=(-1, ),
+                 backbone_channels=(270, ),
+                 channels=None,
+                 bias=True):
+        super(FCNHead, self).__init__()
+
+        self.num_classes = num_classes
+        self.backbone_indices = backbone_indices
+        if channels is None:
+            channels = backbone_channels[0]
+
+        self.conv_1 = layers.ConvBNReLU(
+            in_channels=backbone_channels[0],
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            bias_attr=bias)
+        self.cls = nn.Conv2D(
+            in_channels=channels,
+            out_channels=self.num_classes,
+            kernel_size=1,
+            stride=1,
+            bias_attr=bias)
+        self.init_weight()
+
+    def forward(self, feat_list):
+        logit_list = []
+        x = feat_list[self.backbone_indices[0]]
+        x = self.conv_1(x)
+        logit = self.cls(x)
+        logit_list.append(logit)
+        return logit_list
+
+    def init_weight(self):
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                param_init.normal_init(layer.weight, std=0.001)
+            elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(layer.weight, value=1.0)
+                param_init.constant_init(layer.bias, value=0.0)
diff --git a/paddleseg/models/gcnet.py b/paddleseg/models/gcnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5d8e3e919f31ce8fd31470a1c02ea5eab56a29
--- /dev/null
+++ b/paddleseg/models/gcnet.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class GCNet(nn.Layer): + """ + The GCNet implementation based on PaddlePaddle. + + The original article refers to + Cao, Yue, et al. "GCnet: Non-local networks meet squeeze-excitation networks and beyond" + (https://arxiv.org/pdf/1904.11492.pdf). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. + gc_channels (int, optional): The input channels to Global Context Block. Default: 512. + ratio (float, optional): It indicates the ratio of attention channels and gc_channels. Default: 0.25. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + gc_channels=512, + ratio=0.25, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = GCNetHead(num_classes, backbone_indices, backbone_channels, + gc_channels, ratio, enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GCNetHead(nn.Layer): + """ + The GCNetHead implementation. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + The first index will be taken as a deep-supervision feature in auxiliary layer; + the second one will be taken as input of GlobalContextBlock. + backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index. + gc_channels (int): The input channels to Global Context Block. + ratio (float): It indicates the ratio of attention channels and gc_channels. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
+ """ + + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + gc_channels, + ratio, + enable_auxiliary_loss=True): + + super().__init__() + + in_channels = backbone_channels[1] + self.conv_bn_relu1 = layers.ConvBNReLU( + in_channels=in_channels, + out_channels=gc_channels, + kernel_size=3, + padding=1) + + self.gc_block = GlobalContextBlock( + gc_channels=gc_channels, in_channels=gc_channels, ratio=ratio) + + self.conv_bn_relu2 = layers.ConvBNReLU( + in_channels=gc_channels, + out_channels=gc_channels, + kernel_size=3, + padding=1) + + self.conv_bn_relu3 = layers.ConvBNReLU( + in_channels=in_channels + gc_channels, + out_channels=gc_channels, + kernel_size=3, + padding=1) + + self.dropout = nn.Dropout(p=0.1) + + self.conv = nn.Conv2D( + in_channels=gc_channels, out_channels=num_classes, kernel_size=1) + + if enable_auxiliary_loss: + self.auxlayer = layers.AuxLayer( + in_channels=backbone_channels[0], + inter_channels=backbone_channels[0] // 4, + out_channels=num_classes) + + self.backbone_indices = backbone_indices + self.enable_auxiliary_loss = enable_auxiliary_loss + + def forward(self, feat_list): + logit_list = [] + x = feat_list[self.backbone_indices[1]] + + output = self.conv_bn_relu1(x) + output = self.gc_block(output) + output = self.conv_bn_relu2(output) + + output = paddle.concat([x, output], axis=1) + output = self.conv_bn_relu3(output) + + output = self.dropout(output) + logit = self.conv(output) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + low_level_feat = feat_list[self.backbone_indices[0]] + auxiliary_logit = self.auxlayer(low_level_feat) + logit_list.append(auxiliary_logit) + + return logit_list + + +class GlobalContextBlock(nn.Layer): + """ + Global Context Block implementation. + + Args: + in_channels (int): The input channels of Global Context Block. + ratio (float): The channels of attention map. 
+ """ + + def __init__(self, gc_channels, in_channels, ratio): + super().__init__() + self.gc_channels = gc_channels + + self.conv_mask = nn.Conv2D( + in_channels=in_channels, out_channels=1, kernel_size=1) + + self.softmax = nn.Softmax(axis=2) + + inter_channels = int(in_channels * ratio) + self.channel_add_conv = nn.Sequential( + nn.Conv2D( + in_channels=in_channels, + out_channels=inter_channels, + kernel_size=1), + nn.LayerNorm(normalized_shape=[inter_channels, 1, 1]), + nn.ReLU(), + nn.Conv2D( + in_channels=inter_channels, + out_channels=in_channels, + kernel_size=1)) + + def global_context_block(self, x): + x_shape = paddle.shape(x) + + # [N, C, H * W] + input_x = paddle.reshape(x, shape=[0, self.gc_channels, -1]) + # [N, 1, C, H * W] + input_x = paddle.unsqueeze(input_x, axis=1) + # [N, 1, H, W] + context_mask = self.conv_mask(x) + # [N, 1, H * W] + context_mask = paddle.reshape(context_mask, shape=[0, 1, -1]) + context_mask = self.softmax(context_mask) + # [N, 1, H * W, 1] + context_mask = paddle.unsqueeze(context_mask, axis=-1) + # [N, 1, C, 1] + context = paddle.matmul(input_x, context_mask) + # [N, C, 1, 1] + context = paddle.reshape(context, shape=[0, self.gc_channels, 1, 1]) + + return context + + def forward(self, x): + context = self.global_context_block(x) + channel_add_term = self.channel_add_conv(context) + out = x + channel_add_term + return out diff --git a/paddleseg/models/ginet.py b/paddleseg/models/ginet.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4b9ae5a3c3f49a046037e67a090d13b92918e2 --- /dev/null +++ b/paddleseg/models/ginet.py @@ -0,0 +1,290 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle.nn import functional as F + +from paddleseg.utils import utils +from paddleseg.models import layers +from paddleseg.cvlibs import manager + + +@manager.MODELS.add_component +class GINet(nn.Layer): + """ + The GINet implementation based on PaddlePaddle. + The original article refers to + Wu, Tianyi, Yu Lu, Yu Zhu, Chuang Zhang, Ming Wu, Zhanyu Ma, and Guodong Guo. "GINet: Graph interaction network for scene parsing." In European Conference on Computer Vision, pp. 34-51. Springer, Cham, 2020. + (https://arxiv.org/pdf/2009.06160). + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network. + backbone_indices (tuple, optional): Values in the tuple indicate the indices of output of backbone. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. + If true, auxiliary loss will be added after LearningToDownsample module. Default: False. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.. Default: False. + jpu (bool, optional)): whether to use jpu unit in the base forward. Default:True. 
+ pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=[0, 1, 2, 3], + enable_auxiliary_loss=True, + align_corners=True, + jpu=True, + pretrained=None): + super().__init__() + self.nclass = num_classes + self.aux = enable_auxiliary_loss + self.jpu = jpu + + self.backbone = backbone + self.backbone_indices = backbone_indices + self.align_corners = align_corners + + self.jpu = layers.JPU([512, 1024, 2048], width=512) if jpu else None + self.head = GIHead(in_channels=2048, nclass=num_classes) + + if self.aux: + self.auxlayer = layers.AuxLayer( + 1024, 1024 // 4, num_classes, bias_attr=False) + + self.pretrained = pretrained + self.init_weight() + + def base_forward(self, x): + feat_list = self.backbone(x) + + c1, c2, c3, c4 = [feat_list[i] for i in self.backbone_indices] + + if self.jpu: + return self.jpu(c1, c2, c3, c4) + else: + return c1, c2, c3, c4 + + def forward(self, x): + _, _, h, w = paddle.shape(x) + _, _, c3, c4 = self.base_forward(x) + + logit_list = [] + x, _ = self.head(c4) + logit_list.append(x) + + if self.aux: + auxout = self.auxlayer(c3) + + logit_list.append(auxout) + + return [ + F.interpolate( + logit, [h, w], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GIHead(nn.Layer): + """The Graph Interaction Network head.""" + + def __init__(self, in_channels, nclass): + super().__init__() + self.nclass = nclass + inter_channels = in_channels // 4 + self.inp = paddle.zeros(shape=(nclass, 300), dtype='float32') + self.inp = paddle.create_parameter( + shape=self.inp.shape, + dtype=str(self.inp.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.inp)) + self.inp.stop_gradient = True + + self.fc1 = nn.Sequential( + nn.Linear(300, 128), nn.BatchNorm1D(128), nn.ReLU()) + self.fc2 = nn.Sequential( + nn.Linear(128, 256), nn.BatchNorm1D(256), nn.ReLU()) + self.conv5 = layers.ConvBNReLU( + in_channels, + inter_channels, + 3, + padding=1, + bias_attr=False, + stride=1) + + self.gloru = GlobalReasonUnit( + in_channels=inter_channels, + num_state=256, + num_node=84, + nclass=nclass) + self.conv6 = nn.Sequential( + nn.Dropout(0.1), nn.Conv2D(inter_channels, nclass, 1)) + + def forward(self, x): + + B, C, H, W = paddle.shape(x) + inp = self.inp + + inp = self.fc1(inp) + inp = self.fc2(inp).unsqueeze(axis=0).transpose((0, 2, 1))\ + .expand((B, 256, self.nclass)) + + out = self.conv5(x) + + out, se_out = self.gloru(out, inp) + out = self.conv6(out) + return out, se_out + + +class GlobalReasonUnit(nn.Layer): + """ + The original paper refers to: + Chen, Yunpeng, et al. 
"Graph-Based Global Reasoning Networks" (https://arxiv.org/abs/1811.12814) + """ + + def __init__(self, in_channels, num_state=256, num_node=84, nclass=59): + super().__init__() + self.num_state = num_state + self.conv_theta = nn.Conv2D( + in_channels, num_node, kernel_size=1, stride=1, padding=0) + self.conv_phi = nn.Conv2D( + in_channels, num_state, kernel_size=1, stride=1, padding=0) + self.graph = GraphLayer(num_state, num_node, nclass) + self.extend_dim = nn.Conv2D( + num_state, in_channels, kernel_size=1, bias_attr=False) + + self.bn = layers.SyncBatchNorm(in_channels) + + def forward(self, x, inp): + B = self.conv_theta(x) + sizeB = paddle.shape(B) + B = paddle.flatten(B, 2, 3) + + sizex = paddle.shape(x) + x_reduce = self.conv_phi(x) + + x_reduce = paddle.flatten(x_reduce, 2, 3).transpose((0, 2, 1)) + + V = paddle.bmm(B, x_reduce).transpose((0, 2, 1)) + V = paddle.divide(V, (sizex[2] * sizex[3]).astype('float32')) + + class_node, new_V = self.graph(inp, V) + D = B.transpose((0, 2, 1)) + Y = paddle.bmm(D, new_V.transpose((0, 2, 1))) + Y = Y.transpose((0, 2, 1)).reshape((sizex[0], self.num_state, \ + sizex[2], -1)) + Y = self.extend_dim(Y) + Y = self.bn(Y) + out = Y + x + + return out, class_node + + +class GraphLayer(nn.Layer): + def __init__(self, num_state, num_node, num_class): + super().__init__() + self.vis_gcn = GCN(num_state, num_node) + self.word_gcn = GCN(num_state, num_class) + self.transfer = GraphTransfer(num_state) + self.gamma_vis = paddle.zeros([num_node]) + self.gamma_word = paddle.zeros([num_class]) + self.gamma_vis = paddle.create_parameter( + shape=paddle.shape(self.gamma_vis), + dtype=str(self.gamma_vis.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.gamma_vis)) + self.gamma_word = paddle.create_parameter( + shape=paddle.shape(self.gamma_word), + dtype=str(self.gamma_word.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.gamma_word)) + + def forward(self, inp, vis_node): + inp = self.word_gcn(inp) + new_V = self.vis_gcn(vis_node) + class_node, vis_node = self.transfer(inp, new_V) + + class_node = self.gamma_word * inp + class_node + new_V = self.gamma_vis * vis_node + new_V + return class_node, new_V + + +class GCN(nn.Layer): + def __init__(self, num_state=128, num_node=64, bias=False): + super().__init__() + self.conv1 = nn.Conv1D( + num_node, + num_node, + kernel_size=1, + padding=0, + stride=1, + groups=1, ) + self.relu = nn.ReLU() + self.conv2 = nn.Conv1D( + num_state, + num_state, + kernel_size=1, + padding=0, + stride=1, + groups=1, + bias_attr=bias) + + def forward(self, x): + h = self.conv1(x.transpose((0, 2, 1))).transpose((0, 2, 1)) + h = h + x + h = self.relu(h) + h = self.conv2(h) + return h + + +class GraphTransfer(nn.Layer): + """Transfer vis graph to class node, transfer class node to vis feature""" + + def __init__(self, in_dim): + super().__init__() + self.channle_in = in_dim + self.query_conv = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1) + self.key_conv = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1) + self.value_conv_vis = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.value_conv_word = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.softmax_vis = nn.Softmax(axis=-1) + self.softmax_word = nn.Softmax(axis=-2) + + def forward(self, word, vis_node): + m_batchsize, C, Nc = paddle.shape(word) + m_batchsize, C, Nn = paddle.shape(vis_node) + + proj_query = 
self.query_conv(word).reshape((m_batchsize, -1, Nc))\
+            .transpose((0, 2, 1))
+        proj_key = self.key_conv(vis_node).reshape((m_batchsize, -1, Nn))
+
+        energy = paddle.bmm(proj_query, proj_key)
+        attention_vis = self.softmax_vis(energy).transpose((0, 2, 1))
+        attention_word = self.softmax_word(energy)
+
+        proj_value_vis = self.value_conv_vis(vis_node).reshape((m_batchsize, -1,
+                                                                Nn))
+        proj_value_word = self.value_conv_word(word).reshape((m_batchsize, -1,
+                                                              Nc))
+
+        class_out = paddle.bmm(proj_value_vis, attention_vis)
+        node_out = paddle.bmm(proj_value_word, attention_word)
+        return class_out, node_out
diff --git a/paddleseg/models/glore.py b/paddleseg/models/glore.py
new file mode 100644
index 0000000000000000000000000000000000000000..12a26c18bf04e87f16764779948c13cb2fb0efd3
--- /dev/null
+++ b/paddleseg/models/glore.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class GloRe(nn.Layer):
+    """
+    The GloRe implementation based on PaddlePaddle.
+
+    The original article refers to:
+    Chen, Yunpeng, et al. "Graph-Based Global Reasoning Networks"
+    (https://arxiv.org/pdf/1811.12814.pdf)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+        gru_channels (int, optional): The number of input channels in GloRe Unit. Default: 512.
+        gru_num_state (int, optional): The number of states in GloRe Unit. Default: 128.
+        gru_num_node (int, optional): The number of nodes in GloRe Unit. Default: 64.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
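+
+        A minimal construction sketch (illustrative; assumes the ResNet50_vd
+        backbone shipped with PaddleSeg, but any backbone exposing
+        feat_channels works):
+
+            backbone = paddleseg.models.backbones.ResNet50_vd(output_stride=8)
+            model = GloRe(num_classes=19, backbone=backbone)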
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + gru_channels=512, + gru_num_state=128, + gru_num_node=64, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = GloReHead(num_classes, backbone_indices, backbone_channels, + gru_channels, gru_num_state, gru_num_node, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GloReHead(nn.Layer): + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + gru_channels=512, + gru_num_state=128, + gru_num_node=64, + enable_auxiliary_loss=True): + super().__init__() + + in_channels = backbone_channels[1] + self.conv_bn_relu = layers.ConvBNReLU( + in_channels, gru_channels, 1, bias_attr=False) + self.gru_module = GruModule( + num_input=gru_channels, + num_state=gru_num_state, + num_node=gru_num_node) + + self.dropout = nn.Dropout(0.1) + self.classifier = nn.Conv2D(512, num_classes, kernel_size=1) + self.auxlayer = layers.AuxLayer( + in_channels=backbone_channels[0], + inter_channels=backbone_channels[0] // 4, + out_channels=num_classes) + + self.backbone_indices = backbone_indices + self.enable_auxiliary_loss = enable_auxiliary_loss + + def forward(self, feat_list): + + logit_list = [] + x = feat_list[self.backbone_indices[1]] + + feature = self.conv_bn_relu(x) + gru_output = self.gru_module(feature) + output = self.dropout(gru_output) + logit = self.classifier(output) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + low_level_feat = feat_list[self.backbone_indices[0]] + auxiliary_logit = self.auxlayer(low_level_feat) + logit_list.append(auxiliary_logit) + + return logit_list + + +class GCN(nn.Layer): + def __init__(self, num_state, num_node, bias=False): + super(GCN, self).__init__() + self.conv1 = nn.Conv1D(num_node, num_node, kernel_size=1) + self.relu = nn.ReLU() + self.conv2 = nn.Conv1D( + num_state, num_state, kernel_size=1, bias_attr=bias) + + def forward(self, x): + h = self.conv1(paddle.transpose(x, perm=(0, 2, 1))) + h = paddle.transpose(h, perm=(0, 2, 1)) + h = h + x + h = self.relu(self.conv2(h)) + return h + + +class GruModule(nn.Layer): + def __init__(self, + num_input=512, + num_state=128, + num_node=64, + normalize=False): + super(GruModule, self).__init__() + self.normalize = normalize + self.num_state = num_state + self.num_node = num_node + self.reduction_dim = nn.Conv2D(num_input, num_state, kernel_size=1) + self.projection_mat = nn.Conv2D(num_input, num_node, kernel_size=1) + self.gcn = GCN(num_state=self.num_state, num_node=self.num_node) + self.extend_dim = nn.Conv2D( + self.num_state, num_input, kernel_size=1, bias_attr=False) + self.extend_bn = layers.SyncBatchNorm(num_input, epsilon=1e-4) + + def forward(self, input): + n, c, h, w = input.shape + # B, C, H, W + reduction_dim = self.reduction_dim(input) + # B, N, H, W + mat_B = self.projection_mat(input) + # B, C, H*W + reshaped_reduction = paddle.reshape( + reduction_dim, shape=[n, self.num_state, h * w]) + # B, N, H*W + reshaped_B = 
paddle.reshape(mat_B, shape=[n, self.num_node, h * w])
+        # B, N, H*W
+        reproject = reshaped_B
+        # B, C, N
+        node_state_V = paddle.matmul(
+            reshaped_reduction, paddle.transpose(
+                reshaped_B, perm=[0, 2, 1]))
+
+        if self.normalize:
+            node_state_V = node_state_V * (1. / reshaped_reduction.shape[2])
+
+        # B, C, N
+        gcn_out = self.gcn(node_state_V)
+        # B, C, H*W
+        Y = paddle.matmul(gcn_out, reproject)
+        # B, C, H, W
+        Y = paddle.reshape(Y, shape=[n, self.num_state, h, w])
+        Y_extend = self.extend_dim(Y)
+        Y_extend = self.extend_bn(Y_extend)
+
+        out = input + Y_extend
+        return out
diff --git a/paddleseg/models/gscnn.py b/paddleseg/models/gscnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..93527511eed05ac9eb2b919faf4c17d9dfae7d69
--- /dev/null
+++ b/paddleseg/models/gscnn.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.models.backbones import resnet_vd
+from paddleseg.models import deeplab
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class GSCNN(nn.Layer):
+    """
+    The GSCNN implementation based on PaddlePaddle.
+    The original article refers to
+    Towaki Takikawa, et, al. "Gated-SCNN: Gated Shape CNNs for Semantic Segmentation"
+    (https://arxiv.org/pdf/1907.05740.pdf)
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone.
+            Default: (0, 1, 2, 3).
+        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
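+
+        A minimal construction sketch (illustrative; GSCNN expects one of the
+        *_vd ResNet backbones, which record the conv1_logit feature it uses):
+
+            backbone = paddleseg.models.backbones.ResNet50_vd(output_stride=8)
+            model = GSCNN(num_classes=19, backbone=backbone,
+                          aspp_ratios=(1, 12, 24, 36))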
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(0, 1, 2, 3), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + self.backbone = backbone + backbone_channels = self.backbone.feat_channels + self.head = GSCNNHead(num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(x, feat_list, self.backbone.conv1_logit) + seg_logit, edge_logit = [ + F.interpolate( + logit, + x.shape[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + return [seg_logit, (seg_logit, edge_logit), edge_logit, seg_logit] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GSCNNHead(nn.Layer): + """ + The GSCNNHead implementation based on PaddlePaddle. + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + the first index will be taken as a low-level feature in Decoder component; + the last one will be taken as input of ASPP component; the second to fourth + will be taken as input for GCL component. + Usually backbone consists of four downsampling stage, and return an output of + each stage. If we set it as (0, 1, 2, 3), it means taking feature map of the first + stage in backbone as low-level feature used in Decoder, feature map of the fourth + stage as input of ASPP, and the feature map of the second to fourth stage as input of GCL. + backbone_channels (tuple): The channels of output of backbone. + aspp_ratios (tuple): The dilation rates using in ASSP module. + aspp_out_channels (int): The output channels of ASPP module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners): + super().__init__() + self.backbone_indices = backbone_indices + self.align_corners = align_corners + + self.dsn1 = nn.Conv2D( + backbone_channels[backbone_indices[1]], 1, kernel_size=1) + self.dsn2 = nn.Conv2D( + backbone_channels[backbone_indices[2]], 1, kernel_size=1) + self.dsn3 = nn.Conv2D( + backbone_channels[backbone_indices[3]], 1, kernel_size=1) + + self.res1 = resnet_vd.BasicBlock(64, 64, stride=1) + self.d1 = nn.Conv2D(64, 32, kernel_size=1) + self.gate1 = GatedSpatailConv2d(32, 32) + self.res2 = resnet_vd.BasicBlock(32, 32, stride=1) + self.d2 = nn.Conv2D(32, 16, kernel_size=1) + self.gate2 = GatedSpatailConv2d(16, 16) + self.res3 = resnet_vd.BasicBlock(16, 16, stride=1) + self.d3 = nn.Conv2D(16, 8, kernel_size=1) + self.gate3 = GatedSpatailConv2d(8, 8) + self.fuse = nn.Conv2D(8, 1, kernel_size=1, bias_attr=False) + + self.cw = nn.Conv2D(2, 1, kernel_size=1, bias_attr=False) + + self.aspp = ASPPModule( + aspp_ratios=aspp_ratios, + in_channels=backbone_channels[-1], + out_channels=aspp_out_channels, + align_corners=self.align_corners, + image_pooling=True) + + self.decoder = deeplab.Decoder( + num_classes=num_classes, + in_channels=backbone_channels[0], + align_corners=self.align_corners) + + def forward(self, x, feat_list, s_input): + input_shape = paddle.shape(x) + m1f = F.interpolate( + s_input, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + l1, l2, l3 = [ + feat_list[self.backbone_indices[i]] + for i in range(1, len(self.backbone_indices)) + ] + s1 = F.interpolate( + self.dsn1(l1), + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + s2 = F.interpolate( + self.dsn2(l2), + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + s3 = F.interpolate( + self.dsn3(l3), + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + # Get image gradient + im_arr = x.numpy().transpose((0, 2, 3, 1)) + im_arr = ((im_arr * 0.5 + 0.5) * 255).astype(np.uint8) + canny = np.zeros((input_shape[0], 1, input_shape[2], input_shape[3])) + for i in range(input_shape[0]): + canny[i] = cv2.Canny(im_arr[i], 10, 100) + canny = canny / 255 + canny = paddle.to_tensor(canny).astype('float32') + canny.stop_gradient = True + + cs = self.res1(m1f) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + cs = self.d1(cs) + cs = self.gate1(cs, s1) + + cs = self.res2(cs) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + cs = self.d2(cs) + cs = self.gate2(cs, s2) + + cs = self.res3(cs) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + cs = self.d3(cs) + cs = self.gate3(cs, s3) + + cs = self.fuse(cs) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + edge_out = F.sigmoid(cs) # Ouput of shape stream + + cat = paddle.concat([edge_out, canny], axis=1) + acts = self.cw(cat) + acts = F.sigmoid(acts) # Input of fusion module + + x = self.aspp(l3, acts) + + low_level_feat = feat_list[self.backbone_indices[0]] + logit = self.decoder(x, low_level_feat) + logit_list = [logit, edge_out] + return logit_list + + +class GatedSpatailConv2d(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + bias_attr=False): + super().__init__() 
+        self._gate_conv = nn.Sequential(
+            layers.SyncBatchNorm(in_channels + 1),
+            nn.Conv2D(
+                in_channels + 1, in_channels + 1, kernel_size=1),
+            nn.ReLU(),
+            nn.Conv2D(
+                in_channels + 1, 1, kernel_size=1),
+            layers.SyncBatchNorm(1),
+            nn.Sigmoid())
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias_attr=bias_attr)
+
+    def forward(self, input_features, gating_features):
+        cat = paddle.concat([input_features, gating_features], axis=1)
+        alphas = self._gate_conv(cat)
+        x = input_features * (alphas + 1)
+        x = self.conv(x)
+        return x
+
+
+class ASPPModule(nn.Layer):
+    """
+    Atrous Spatial Pyramid Pooling.
+    Args:
+        aspp_ratios (tuple): The dilation rates used in the ASPP module.
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+        use_sep_conv (bool, optional): Whether to use separable conv in the ASPP module. Default: False.
+        image_pooling (bool, optional): Whether to augment with image-level features. Default: False.
+    """
+
+    def __init__(self,
+                 aspp_ratios,
+                 in_channels,
+                 out_channels,
+                 align_corners,
+                 use_sep_conv=False,
+                 image_pooling=False):
+        super().__init__()
+
+        self.align_corners = align_corners
+        self.aspp_blocks = nn.LayerList()
+
+        for ratio in aspp_ratios:
+            if use_sep_conv and ratio > 1:
+                conv_func = layers.SeparableConvBNReLU
+            else:
+                conv_func = layers.ConvBNReLU
+
+            block = conv_func(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1 if ratio == 1 else 3,
+                dilation=ratio,
+                padding=0 if ratio == 1 else ratio)
+            self.aspp_blocks.append(block)
+
+        out_size = len(self.aspp_blocks)
+
+        if image_pooling:
+            self.global_avg_pool = nn.Sequential(
+                nn.AdaptiveAvgPool2D(output_size=(1, 1)),
+                layers.ConvBNReLU(
+                    in_channels, out_channels, kernel_size=1, bias_attr=False))
+            out_size += 1
+        self.image_pooling = image_pooling
+
+        self.edge_conv = layers.ConvBNReLU(
+            1, out_channels, kernel_size=1, bias_attr=False)
+        out_size += 1
+
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels=out_channels * out_size,
+            out_channels=out_channels,
+            kernel_size=1)
+
+        self.dropout = nn.Dropout(p=0.1)  # drop rate
+
+    def forward(self, x, edge):
+        outputs = []
+        x_shape = paddle.shape(x)
+        for block in self.aspp_blocks:
+            y = block(x)
+            y = F.interpolate(
+                y,
+                x_shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            outputs.append(y)
+
+        if self.image_pooling:
+            img_avg = self.global_avg_pool(x)
+            img_avg = F.interpolate(
+                img_avg,
+                x_shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            outputs.append(img_avg)
+
+        edge_features = F.interpolate(
+            edge,
+            size=x_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        edge_features = self.edge_conv(edge_features)
+        outputs.append(edge_features)
+
+        x = paddle.concat(outputs, axis=1)
+        x = self.conv_bn_relu(x)
+        x = self.dropout(x)
+        return x
diff --git a/paddleseg/models/hardnet.py b/paddleseg/models/hardnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd765805bf2ebee3154dd6638f5753efff75d3e
--- /dev/null
+++ b/paddleseg/models/hardnet.py
@@ -0,0 +1,309 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class HarDNet(nn.Layer):
+    """
+    [Real Time] The FC-HarDNet 70 implementation based on PaddlePaddle.
+    The original article refers to
+    Chao, Ping, et al. "HarDNet: A Low Memory Traffic Network"
+    (https://arxiv.org/pdf/1909.00948.pdf)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        stem_channels (tuple|list, optional): The number of channels before the encoder. Default: (16, 24, 32, 48).
+        ch_list (tuple|list, optional): The number of channels at each block in the encoder. Default: (64, 96, 160, 224, 320).
+        grmul (float, optional): The channel multiplying factor in HarDBlock, which is m in the paper. Default: 1.7.
+        gr (tuple|list, optional): The growth rate in each HarDBlock, which is k in the paper. Default: (10, 16, 18, 24, 32).
+        n_layers (tuple|list, optional): The number of layers in each HarDBlock. Default: (4, 4, 8, 8, 8).
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
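+
+        A minimal usage sketch (illustrative):
+
+            model = HarDNet(num_classes=19)
+            logit = model(paddle.rand([1, 3, 512, 1024]))[0]  # [1, 19, 512, 1024]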
+ """ + + def __init__(self, + num_classes, + stem_channels=(16, 24, 32, 48), + ch_list=(64, 96, 160, 224, 320), + grmul=1.7, + gr=(10, 16, 18, 24, 32), + n_layers=(4, 4, 8, 8, 8), + align_corners=False, + pretrained=None): + + super().__init__() + self.align_corners = align_corners + self.pretrained = pretrained + encoder_blks_num = len(n_layers) + decoder_blks_num = encoder_blks_num - 1 + encoder_in_channels = stem_channels[3] + + self.stem = nn.Sequential( + layers.ConvBNReLU( + 3, stem_channels[0], kernel_size=3, bias_attr=False), + layers.ConvBNReLU( + stem_channels[0], + stem_channels[1], + kernel_size=3, + bias_attr=False), + layers.ConvBNReLU( + stem_channels[1], + stem_channels[2], + kernel_size=3, + stride=2, + bias_attr=False), + layers.ConvBNReLU( + stem_channels[2], + stem_channels[3], + kernel_size=3, + bias_attr=False)) + + self.encoder = Encoder(encoder_blks_num, encoder_in_channels, ch_list, + gr, grmul, n_layers) + + skip_connection_channels = self.encoder.get_skip_channels() + decoder_in_channels = self.encoder.get_out_channels() + + self.decoder = Decoder(decoder_blks_num, decoder_in_channels, + skip_connection_channels, gr, grmul, n_layers, + align_corners) + + self.cls_head = nn.Conv2D( + in_channels=self.decoder.get_out_channels(), + out_channels=num_classes, + kernel_size=1) + + self.init_weight() + + def forward(self, x): + input_shape = paddle.shape(x)[2:] + x = self.stem(x) + x, skip_connections = self.encoder(x) + x = self.decoder(x, skip_connections) + logit = self.cls_head(x) + logit = F.interpolate( + logit, + size=input_shape, + mode="bilinear", + align_corners=self.align_corners) + return [logit] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class Encoder(nn.Layer): + """The Encoder implementation of FC-HardDNet 70. + + Args: + n_blocks (int): The number of blocks in the Encoder module. + in_channels (int): The number of input channels. + ch_list (tuple|list): The number of channels at each block in the encoder. + grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper. + gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper. + n_layers (tuple|list): The number of layers in each HarDBlock. + """ + + def __init__(self, n_blocks, in_channels, ch_list, gr, grmul, n_layers): + super().__init__() + self.skip_connection_channels = [] + self.shortcut_layers = [] + self.blks = nn.LayerList() + ch = in_channels + for i in range(n_blocks): + blk = HarDBlock(ch, gr[i], grmul, n_layers[i]) + ch = blk.get_out_ch() + self.skip_connection_channels.append(ch) + self.blks.append(blk) + if i < n_blocks - 1: + self.shortcut_layers.append(len(self.blks) - 1) + self.blks.append( + layers.ConvBNReLU( + ch, ch_list[i], kernel_size=1, bias_attr=False)) + + ch = ch_list[i] + if i < n_blocks - 1: + self.blks.append(nn.AvgPool2D(kernel_size=2, stride=2)) + self.out_channels = ch + + def forward(self, x): + skip_connections = [] + for i in range(len(self.blks)): + x = self.blks[i](x) + if i in self.shortcut_layers: + skip_connections.append(x) + return x, skip_connections + + def get_skip_channels(self): + return self.skip_connection_channels + + def get_out_channels(self): + return self.out_channels + + +class Decoder(nn.Layer): + """The Decoder implementation of FC-HardDNet 70. + + Args: + n_blocks (int): The number of blocks in the Encoder module. + in_channels (int): The number of input channels. 
+        skip_connection_channels (tuple|list): The channels of shortcut layers in encoder.
+        gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper.
+        grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper.
+        n_layers (tuple|list): The number of layers in each HarDBlock.
+        align_corners (bool, optional): An argument of F.interpolate. Default: False.
+    """
+
+    def __init__(self,
+                 n_blocks,
+                 in_channels,
+                 skip_connection_channels,
+                 gr,
+                 grmul,
+                 n_layers,
+                 align_corners=False):
+        super().__init__()
+        prev_block_channels = in_channels
+        self.n_blocks = n_blocks
+        self.dense_blocks_up = nn.LayerList()
+        self.conv1x1_up = nn.LayerList()
+
+        for i in range(n_blocks - 1, -1, -1):
+            cur_channels_count = prev_block_channels + skip_connection_channels[
+                i]
+            conv1x1 = layers.ConvBNReLU(
+                cur_channels_count,
+                cur_channels_count // 2,
+                kernel_size=1,
+                bias_attr=False)
+            blk = HarDBlock(
+                base_channels=cur_channels_count // 2,
+                growth_rate=gr[i],
+                grmul=grmul,
+                n_layers=n_layers[i])
+
+            self.conv1x1_up.append(conv1x1)
+            self.dense_blocks_up.append(blk)
+
+            prev_block_channels = blk.get_out_ch()
+
+        self.out_channels = prev_block_channels
+        self.align_corners = align_corners
+
+    def forward(self, x, skip_connections):
+        for i in range(self.n_blocks):
+            skip = skip_connections.pop()
+            x = F.interpolate(
+                x,
+                size=paddle.shape(skip)[2:],
+                mode="bilinear",
+                align_corners=self.align_corners)
+            x = paddle.concat([x, skip], axis=1)
+            x = self.conv1x1_up[i](x)
+            x = self.dense_blocks_up[i](x)
+        return x
+
+    def get_out_channels(self):
+        return self.out_channels
+
+
+class HarDBlock(nn.Layer):
+    """The HarDBlock implementation
+
+    Args:
+        base_channels (int): The base channels.
+        growth_rate (int): The growth rate, i.e. k in the paper.
+        grmul (float): The channel multiplying factor, i.e. m in the paper.
+        n_layers (int): The number of layers in this block.
+        keepBase (bool, optional): Whether to concatenate the input (first layer) into the block output. Default: False.
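+
+        A shape sketch (illustrative); the output channel count is
+        data-independent and exposed via get_out_ch():
+
+            blk = HarDBlock(base_channels=64, growth_rate=16, grmul=1.7, n_layers=4)
+            out = blk(paddle.rand([1, 64, 56, 56]))
+            assert out.shape[1] == blk.get_out_ch()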
+ """ + + def __init__(self, + base_channels, + growth_rate, + grmul, + n_layers, + keepBase=False): + super().__init__() + self.keepBase = keepBase + self.links = [] + layers_ = [] + self.out_channels = 0 + for i in range(n_layers): + outch, inch, link = get_link(i + 1, base_channels, growth_rate, + grmul) + + self.links.append(link) + layers_.append( + layers.ConvBNReLU( + inch, outch, kernel_size=3, bias_attr=False)) + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += outch + self.layers = nn.LayerList(layers_) + + def forward(self, x): + layers_ = [x] + for layer in range(len(self.layers)): + link = self.links[layer] + tin = [] + for i in link: + tin.append(layers_[i]) + if len(tin) > 1: + x = paddle.concat(tin, axis=1) + else: + x = tin[0] + out = self.layers[layer](x) + layers_.append(out) + + t = len(layers_) + out_ = [] + for i in range(t): + if (i == 0 and self.keepBase) or \ + (i == t - 1) or (i % 2 == 1): + out_.append(layers_[i]) + out = paddle.concat(out_, 1) + + return out + + def get_out_ch(self): + return self.out_channels + + +def get_link(layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + link = [] + for i in range(10): + dv = 2**i + if layer % dv == 0: + k = layer - dv + link.insert(0, k) + if i > 0: + out_channels *= grmul + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + for i in link: + ch, _, _ = get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + return out_channels, in_channels, link diff --git a/paddleseg/models/hrnet_contrast.py b/paddleseg/models/hrnet_contrast.py new file mode 100644 index 0000000000000000000000000000000000000000..dd5a20640e31e2cb49e488a9acb4f7bbcc9f6c45 --- /dev/null +++ b/paddleseg/models/hrnet_contrast.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class HRNetW48Contrast(nn.Layer): + """ + The HRNetW48Contrast implementation based on PaddlePaddle. + + The original article refers to + Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation" + (https://arxiv.org/abs/2101.11939). + + Args: + in_channels (int): The output dimensions of backbone. + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support HRNet_W48. + drop_prob (float): The probability of dropout. + proj_dim (int): The projection dimensions. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
+ """ + + def __init__(self, + in_channels, + num_classes, + backbone, + drop_prob, + proj_dim, + align_corners=False, + pretrained=None): + super().__init__() + self.in_channels = in_channels + self.backbone = backbone + self.num_classes = num_classes + self.proj_dim = proj_dim + self.align_corners = align_corners + + self.cls_head = nn.Sequential( + layers.ConvBNReLU( + in_channels, in_channels, kernel_size=3, stride=1, padding=1), + nn.Dropout2D(drop_prob), + nn.Conv2D( + in_channels, + num_classes, + kernel_size=1, + stride=1, + bias_attr=False), ) + self.proj_head = ProjectionHead( + dim_in=in_channels, proj_dim=self.proj_dim) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + feats = self.backbone(x)[0] + out = self.cls_head(feats) + logit_list = [] + if self.training: + emb = self.proj_head(feats) + logit_list.append( + F.interpolate( + out, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners)) + logit_list.append({'seg': out, 'embed': emb}) + else: + logit_list.append( + F.interpolate( + out, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners)) + return logit_list + + +class ProjectionHead(nn.Layer): + """ + The projection head used by contrast learning. + Args: + dim_in (int): The dimensions of input features. + proj_dim (int, optional): The output dimensions of projection head. Default: 256. + proj (str, optional): The type of projection head, only support 'linear' and 'convmlp'. Default: 'convmlp'. + """ + + def __init__(self, dim_in, proj_dim=256, proj='convmlp'): + super(ProjectionHead, self).__init__() + if proj == 'linear': + self.proj = nn.Conv2D(dim_in, proj_dim, kernel_size=1) + elif proj == 'convmlp': + self.proj = nn.Sequential( + layers.ConvBNReLU( + dim_in, dim_in, kernel_size=1), + nn.Conv2D( + dim_in, proj_dim, kernel_size=1), ) + else: + raise ValueError( + "The type of project head only support 'linear' and 'convmlp', but got {}." + .format(proj)) + + def forward(self, x): + return F.normalize(self.proj(x), p=2, axis=1) diff --git a/paddleseg/models/isanet.py b/paddleseg/models/isanet.py new file mode 100644 index 0000000000000000000000000000000000000000..e8bb3df88b34cd5aca22842f12956dbc1b2b2444 --- /dev/null +++ b/paddleseg/models/isanet.py @@ -0,0 +1,200 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers +from paddleseg.cvlibs import manager +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class ISANet(nn.Layer): + """Interlaced Sparse Self-Attention for Semantic Segmentation. + + The original article refers to Lang Huang, et al. "Interlaced Sparse Self-Attention for Semantic Segmentation" + (https://arxiv.org/abs/1907.12273). 
+ + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone. + isa_channels (int): The channels of ISA Module. + down_factor (tuple): Divide the height and width dimension to (Ph, PW) groups. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + isa_channels=256, + down_factor=(8, 8), + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = ISAHead(num_classes, in_channels, isa_channels, down_factor, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners, + align_mode=1) for logit in logit_list + ] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class ISAHead(nn.Layer): + """ + The ISAHead. + + Args: + num_classes (int): The unique number of target classes. + in_channels (tuple): The number of input channels. + isa_channels (int): The channels of ISA Module. + down_factor (tuple): Divide the height and width dimension to (Ph, PW) groups. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
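+
+    Examples:
+        # An illustrative sketch; the feature shapes mimic a ResNet-style
+        # backbone, and the auxiliary branch assumes C3 has 1024 channels.
+        import paddle
+
+        head = ISAHead(
+            num_classes=19,
+            in_channels=(1024, 2048),
+            isa_channels=256,
+            down_factor=(8, 8),
+            enable_auxiliary_loss=True)
+        C3 = paddle.rand([1, 1024, 64, 64])
+        C4 = paddle.rand([1, 2048, 32, 32])
+        output, auxout = head([C3, C4])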
+ """ + + def __init__(self, num_classes, in_channels, isa_channels, down_factor, + enable_auxiliary_loss): + super(ISAHead, self).__init__() + self.in_channels = in_channels[-1] + inter_channels = self.in_channels // 4 + self.inter_channels = inter_channels + self.down_factor = down_factor + self.enable_auxiliary_loss = enable_auxiliary_loss + self.in_conv = layers.ConvBNReLU( + self.in_channels, inter_channels, 3, bias_attr=False) + self.global_relation = SelfAttentionBlock(inter_channels, isa_channels) + self.local_relation = SelfAttentionBlock(inter_channels, isa_channels) + self.out_conv = layers.ConvBNReLU( + inter_channels * 2, inter_channels, 1, bias_attr=False) + self.cls = nn.Sequential( + nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1)) + self.aux = nn.Sequential( + layers.ConvBNReLU( + in_channels=1024, + out_channels=256, + kernel_size=3, + bias_attr=False), + nn.Dropout2D(p=0.1), + nn.Conv2D(256, num_classes, 1)) + + def forward(self, feat_list): + C3, C4 = feat_list + x = self.in_conv(C4) + x_shape = paddle.shape(x) + P_h, P_w = self.down_factor + Q_h, Q_w = paddle.ceil(x_shape[2] / P_h).astype('int32'), paddle.ceil( + x_shape[3] / P_w).astype('int32') + pad_h, pad_w = (Q_h * P_h - x_shape[2]).astype('int32'), ( + Q_w * P_w - x_shape[3]).astype('int32') + if pad_h > 0 or pad_w > 0: + padding = paddle.concat( + [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ], + axis=0) + feat = F.pad(x, padding) + else: + feat = x + + feat = feat.reshape([0, x_shape[1], Q_h, P_h, Q_w, P_w]) + feat = feat.transpose([0, 3, 5, 1, 2, + 4]).reshape([-1, self.inter_channels, Q_h, Q_w]) + feat = self.global_relation(feat) + + feat = feat.reshape([x_shape[0], P_h, P_w, x_shape[1], Q_h, Q_w]) + feat = feat.transpose([0, 4, 5, 3, 1, + 2]).reshape([-1, self.inter_channels, P_h, P_w]) + feat = self.local_relation(feat) + + feat = feat.reshape([x_shape[0], Q_h, Q_w, x_shape[1], P_h, P_w]) + feat = feat.transpose([0, 3, 1, 4, 2, 5]).reshape( + [0, self.inter_channels, P_h * Q_h, P_w * Q_w]) + if pad_h > 0 or pad_w > 0: + feat = paddle.slice( + feat, + axes=[2, 3], + starts=[pad_h // 2, pad_w // 2], + ends=[pad_h // 2 + x_shape[2], pad_w // 2 + x_shape[3]]) + + feat = self.out_conv(paddle.concat([feat, x], axis=1)) + output = self.cls(feat) + + if self.enable_auxiliary_loss: + auxout = self.aux(C3) + return [output, auxout] + else: + return [output] + + +class SelfAttentionBlock(layers.AttentionBlock): + """General self-attention block/non-local block. + + Args: + in_channels (int): Input channels of key/query feature. + channels (int): Output channels of key/query transform. 
+ """ + + def __init__(self, in_channels, channels): + super(SelfAttentionBlock, self).__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=2, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=False, + matmul_norm=True, + with_out=False) + + self.output_project = self.build_project( + in_channels, in_channels, num_convs=1, use_conv_module=True) + + def forward(self, x): + context = super(SelfAttentionBlock, self).forward(x, x) + return self.output_project(context) diff --git a/paddleseg/models/layers/__init__.py b/paddleseg/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..509641cfc60d9d3fa728ea0a341d9a64abfb1617 --- /dev/null +++ b/paddleseg/models/layers/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .layer_libs import ConvBNReLU, ConvBN, SeparableConvBNReLU, DepthwiseConvBN, AuxLayer, SyncBatchNorm, JPU, ConvBNPReLU, ConvBNAct, ConvBNLeakyReLU +from .activation import Activation +from .pyramid_pool import ASPPModule, PPModule +from .attention import AttentionBlock +from .nonlocal2d import NonLocal2D +from .wrap_functions import * +from .tensor_fusion import UAFM_SpAtten, UAFM_SpAtten_S, UAFM_ChAtten, UAFM_ChAtten_S, UAFM, UAFMMobile, UAFMMobile_SpAtten diff --git a/paddleseg/models/layers/activation.py b/paddleseg/models/layers/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..046ba87ef23b4af8925b7122d0d94767363c9b6e --- /dev/null +++ b/paddleseg/models/layers/activation.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn + + +class Activation(nn.Layer): + """ + The wrapper of activations. + + Args: + act (str, optional): The activation name in lowercase. It must be one of ['elu', 'gelu', + 'hardshrink', 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', + 'softmax', 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', + 'hsigmoid']. Default: None, means identical transformation. + + Returns: + A callable object of Activation. + + Raises: + KeyError: When parameter `act` is not in the optional range. 
+
+    Examples:
+
+        from paddleseg.models.common.activation import Activation
+
+        relu = Activation("relu")
+        print(relu)
+        # <class 'paddle.nn.layer.activation.ReLU'>
+
+        sigmoid = Activation("sigmoid")
+        print(sigmoid)
+        # <class 'paddle.nn.layer.activation.Sigmoid'>
+
+        not_exist_one = Activation("not_exist_one")
+        # KeyError: "not_exist_one does not exist in the current dict_keys(['elu', 'gelu', 'hardshrink',
+        # 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', 'softmax',
+        # 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', 'hsigmoid'])"
+    """
+
+    def __init__(self, act=None):
+        super(Activation, self).__init__()
+
+        self._act = act
+        upper_act_names = nn.layer.activation.__dict__.keys()
+        lower_act_names = [act.lower() for act in upper_act_names]
+        act_dict = dict(zip(lower_act_names, upper_act_names))
+
+        if act is not None:
+            if act in act_dict.keys():
+                act_name = act_dict[act]
+                self.act_func = eval("nn.layer.activation.{}()".format(
+                    act_name))
+            else:
+                raise KeyError("{} does not exist in the current {}".format(
+                    act, act_dict.keys()))
+
+    def forward(self, x):
+        if self._act is not None:
+            return self.act_func(x)
+        else:
+            return x
diff --git a/paddleseg/models/layers/attention.py b/paddleseg/models/layers/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2db9ea49b6eb3732f6364aeaf08d76c132f631e
--- /dev/null
+++ b/paddleseg/models/layers/attention.py
@@ -0,0 +1,272 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.models import layers
+
+
+class AttentionBlock(nn.Layer):
+    """General self-attention block/non-local block.
+
+    The original article refers to https://arxiv.org/abs/1706.03762.
+    Args:
+        key_in_channels (int): Input channels of key feature.
+        query_in_channels (int): Input channels of query feature.
+        channels (int): Output channels of key/query transform.
+        out_channels (int): Output channels.
+        share_key_query (bool): Whether to share projection weights between the
+            key and query projections.
+        query_downsample (nn.Layer): Query downsample module.
+        key_downsample (nn.Layer): Key downsample module.
+        key_query_num_convs (int): Number of convs for key/query projection.
+        value_out_num_convs (int): Number of convs for value projection.
+        key_query_norm (bool): Whether to use BN for key/query projection.
+        value_out_norm (bool): Whether to use BN for value projection.
+        matmul_norm (bool): Whether to normalize the attention map by the
+            square root of channels.
+        with_out (bool): Whether to use an output projection.
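+
+    Examples:
+        # A plain self-attention configuration; all sizes are illustrative.
+        import paddle
+
+        block = AttentionBlock(
+            key_in_channels=256,
+            query_in_channels=256,
+            channels=64,
+            out_channels=256,
+            share_key_query=False,
+            query_downsample=None,
+            key_downsample=None,
+            key_query_num_convs=2,
+            key_query_norm=True,
+            value_out_num_convs=1,
+            value_out_norm=False,
+            matmul_norm=True,
+            with_out=False)
+        x = paddle.rand([1, 256, 32, 32])
+        out = block(x, x)  # [1, 256, 32, 32]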
+ """ + + def __init__(self, key_in_channels, query_in_channels, channels, + out_channels, share_key_query, query_downsample, + key_downsample, key_query_num_convs, value_out_num_convs, + key_query_norm, value_out_norm, matmul_norm, with_out): + super(AttentionBlock, self).__init__() + if share_key_query: + assert key_in_channels == query_in_channels + self.with_out = with_out + self.key_in_channels = key_in_channels + self.query_in_channels = query_in_channels + self.out_channels = out_channels + self.channels = channels + self.share_key_query = share_key_query + self.key_project = self.build_project( + key_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm) + if share_key_query: + self.query_project = self.key_project + else: + self.query_project = self.build_project( + query_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm) + + self.value_project = self.build_project( + key_in_channels, + channels if self.with_out else out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm) + + if self.with_out: + self.out_project = self.build_project( + channels, + out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm) + else: + self.out_project = None + + self.query_downsample = query_downsample + self.key_downsample = key_downsample + self.matmul_norm = matmul_norm + + def build_project(self, in_channels, channels, num_convs, use_conv_module): + if use_conv_module: + convs = [ + layers.ConvBNReLU( + in_channels=in_channels, + out_channels=channels, + kernel_size=1, + bias_attr=False) + ] + for _ in range(num_convs - 1): + convs.append( + layers.ConvBNReLU( + in_channels=channels, + out_channels=channels, + kernel_size=1, + bias_attr=False)) + else: + convs = [nn.Conv2D(in_channels, channels, 1)] + for _ in range(num_convs - 1): + convs.append(nn.Conv2D(channels, channels, 1)) + + if len(convs) > 1: + convs = nn.Sequential(*convs) + else: + convs = convs[0] + return convs + + def forward(self, query_feats, key_feats): + query_shape = paddle.shape(query_feats) + query = self.query_project(query_feats) + if self.query_downsample is not None: + query = self.query_downsample(query) + query = query.flatten(2).transpose([0, 2, 1]) + + key = self.key_project(key_feats) + value = self.value_project(key_feats) + + if self.key_downsample is not None: + key = self.key_downsample(key) + value = self.key_downsample(value) + + key = key.flatten(2) + value = value.flatten(2).transpose([0, 2, 1]) + sim_map = paddle.matmul(query, key) + if self.matmul_norm: + sim_map = (self.channels**-0.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + context = paddle.matmul(sim_map, value) + context = paddle.transpose(context, [0, 2, 1]) + + context = paddle.reshape( + context, [0, self.out_channels, query_shape[2], query_shape[3]]) + + if self.out_project is not None: + context = self.out_project(context) + return context + + +class DualAttentionModule(nn.Layer): + """ + Dual attention module. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. 
+ """ + + def __init__(self, in_channels, out_channels): + super().__init__() + inter_channels = in_channels // 4 + + self.channel_conv = layers.ConvBNReLU(in_channels, inter_channels, 1) + self.position_conv = layers.ConvBNReLU(in_channels, inter_channels, 1) + self.pam = PAM(inter_channels) + self.cam = CAM(inter_channels) + self.conv1 = layers.ConvBNReLU(inter_channels, inter_channels, 3) + self.conv2 = layers.ConvBNReLU(inter_channels, inter_channels, 3) + self.conv3 = layers.ConvBNReLU(inter_channels, out_channels, 3) + + def forward(self, feats): + channel_feats = self.channel_conv(feats) + channel_feats = self.cam(channel_feats) + channel_feats = self.conv1(channel_feats) + + position_feats = self.position_conv(feats) + position_feats = self.pam(position_feats) + position_feats = self.conv2(position_feats) + + feats_sum = position_feats + channel_feats + out = self.conv3(feats_sum) + return out + + +class PAM(nn.Layer): + """ + Position attention module. + Args: + in_channels (int): The number of input channels. + """ + + def __init__(self, in_channels): + super().__init__() + mid_channels = in_channels // 8 + self.mid_channels = mid_channels + self.in_channels = in_channels + + self.query_conv = nn.Conv2D(in_channels, mid_channels, 1, 1) + self.key_conv = nn.Conv2D(in_channels, mid_channels, 1, 1) + self.value_conv = nn.Conv2D(in_channels, in_channels, 1, 1) + + self.gamma = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x_shape = paddle.shape(x) + + # query: n, h * w, c1 + query = self.query_conv(x) + query = paddle.reshape(query, (0, self.mid_channels, -1)) + query = paddle.transpose(query, (0, 2, 1)) + + # key: n, c1, h * w + key = self.key_conv(x) + key = paddle.reshape(key, (0, self.mid_channels, -1)) + + # sim: n, h * w, h * w + sim = paddle.bmm(query, key) + sim = F.softmax(sim, axis=-1) + + value = self.value_conv(x) + value = paddle.reshape(value, (0, self.in_channels, -1)) + sim = paddle.transpose(sim, (0, 2, 1)) + + # feat: from (n, c2, h * w) -> (n, c2, h, w) + feat = paddle.bmm(value, sim) + feat = paddle.reshape(feat, + (0, self.in_channels, x_shape[2], x_shape[3])) + + out = self.gamma * feat + x + return out + + +class CAM(nn.Layer): + """ + Channel attention module. + Args: + in_channels (int): The number of input channels. 
+ """ + + def __init__(self, channels): + super().__init__() + + self.channels = channels + self.gamma = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x_shape = paddle.shape(x) + # query: n, c, h * w + query = paddle.reshape(x, (0, self.channels, -1)) + # key: n, h * w, c + key = paddle.reshape(x, (0, self.channels, -1)) + key = paddle.transpose(key, (0, 2, 1)) + + # sim: n, c, c + sim = paddle.bmm(query, key) + # The danet author claims that this can avoid gradient divergence + sim = paddle.max(sim, axis=-1, keepdim=True).tile( + [1, 1, self.channels]) - sim + sim = F.softmax(sim, axis=-1) + + # feat: from (n, c, h * w) to (n, c, h, w) + value = paddle.reshape(x, (0, self.channels, -1)) + feat = paddle.bmm(sim, value) + feat = paddle.reshape(feat, (0, self.channels, x_shape[2], x_shape[3])) + + out = self.gamma * feat + x + return out diff --git a/paddleseg/models/layers/layer_libs.py b/paddleseg/models/layers/layer_libs.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9b3329d2a222f1098fd20ed0722e853a52c445 --- /dev/null +++ b/paddleseg/models/layers/layer_libs.py @@ -0,0 +1,352 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddleseg.models import layers + + +def SyncBatchNorm(*args, **kwargs): + """In cpu environment nn.SyncBatchNorm does not have kernel so use nn.BatchNorm2D instead""" + if paddle.get_device() == 'cpu' or os.environ.get('PADDLESEG_EXPORT_STAGE'): + return nn.BatchNorm2D(*args, **kwargs) + elif paddle.distributed.ParallelEnv().nranks == 1: + return nn.BatchNorm2D(*args, **kwargs) + else: + return nn.SyncBatchNorm(*args, **kwargs) + + +class ConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + + self._conv = nn.Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + self._relu = layers.Activation("relu") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + x = self._relu(x) + return x + + +class ConvBNAct(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + act_type=None, + **kwargs): + super().__init__() + + self._conv = nn.Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + + self._act_type = act_type + if act_type is not None: + self._act = layers.Activation(act_type) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + if self._act_type is not None: + x = self._act(x) + return x + + +class ConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + self._conv = nn.Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class ConvReLUPool(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + dilation=1) + self._relu = layers.Activation("relu") + self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2) + + def forward(self, x): + x = self.conv(x) + x = self._relu(x) + x = self._max_pool(x) + return x + + +class SeparableConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + pointwise_bias=None, + **kwargs): + super().__init__() + self.depthwise_conv = ConvBN( + in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + padding=padding, + groups=in_channels, + **kwargs) + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self.piontwise_conv = ConvBNReLU( + in_channels, + out_channels, + kernel_size=1, + groups=1, + data_format=data_format, + bias_attr=pointwise_bias) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.piontwise_conv(x) + return x + + +class DepthwiseConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + self.depthwise_conv = ConvBN( 
+ in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + groups=in_channels, + **kwargs) + + def forward(self, x): + x = self.depthwise_conv(x) + return x + + +class AuxLayer(nn.Layer): + """ + The auxiliary layer implementation for auxiliary loss. + + Args: + in_channels (int): The number of input channels. + inter_channels (int): The intermediate channels. + out_channels (int): The number of output channels, and usually it is num_classes. + dropout_prob (float, optional): The drop rate. Default: 0.1. + """ + + def __init__(self, + in_channels, + inter_channels, + out_channels, + dropout_prob=0.1, + **kwargs): + super().__init__() + + self.conv_bn_relu = ConvBNReLU( + in_channels=in_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1, + **kwargs) + + self.dropout = nn.Dropout(p=dropout_prob) + + self.conv = nn.Conv2D( + in_channels=inter_channels, + out_channels=out_channels, + kernel_size=1) + + def forward(self, x): + x = self.conv_bn_relu(x) + x = self.dropout(x) + x = self.conv(x) + return x + + +class JPU(nn.Layer): + """ + Joint Pyramid Upsampling of FCN. + The original paper refers to + Wu, Huikai, et al. "Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation." arXiv preprint arXiv:1903.11816 (2019). + """ + + def __init__(self, in_channels, width=512): + super().__init__() + + self.conv5 = ConvBNReLU( + in_channels[-1], width, 3, padding=1, bias_attr=False) + self.conv4 = ConvBNReLU( + in_channels[-2], width, 3, padding=1, bias_attr=False) + self.conv3 = ConvBNReLU( + in_channels[-3], width, 3, padding=1, bias_attr=False) + + self.dilation1 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=1, + pointwise_bias=False, + dilation=1, + bias_attr=False, + stride=1, ) + self.dilation2 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=2, + pointwise_bias=False, + dilation=2, + bias_attr=False, + stride=1) + self.dilation3 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=4, + pointwise_bias=False, + dilation=4, + bias_attr=False, + stride=1) + self.dilation4 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=8, + pointwise_bias=False, + dilation=8, + bias_attr=False, + stride=1) + + def forward(self, *inputs): + feats = [ + self.conv5(inputs[-1]), self.conv4(inputs[-2]), + self.conv3(inputs[-3]) + ] + size = paddle.shape(feats[-1])[2:] + feats[-2] = F.interpolate( + feats[-2], size, mode='bilinear', align_corners=True) + feats[-3] = F.interpolate( + feats[-3], size, mode='bilinear', align_corners=True) + + feat = paddle.concat(feats, axis=1) + feat = paddle.concat( + [ + self.dilation1(feat), self.dilation2(feat), + self.dilation3(feat), self.dilation4(feat) + ], + axis=1) + + return inputs[0], inputs[1], inputs[2], feat + + +class ConvBNPReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + + self._conv = nn.Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + self._prelu = layers.Activation("prelu") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + x = self._prelu(x) + return x + + +class ConvBNLeakyReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + + self._conv = nn.Conv2D( + 
in_channels, out_channels, kernel_size, padding=padding, **kwargs) + + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + self._relu = layers.Activation("leakyrelu") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + x = self._relu(x) + return x diff --git a/paddleseg/models/layers/nonlocal2d.py b/paddleseg/models/layers/nonlocal2d.py new file mode 100644 index 0000000000000000000000000000000000000000..7552ff4242831babf0fa629e78882a3e17cbd709 --- /dev/null +++ b/paddleseg/models/layers/nonlocal2d.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers + + +class NonLocal2D(nn.Layer): + """Basic Non-local module. + This model is the implementation of "Non-local Neural Networks" + (https://arxiv.org/abs/1711.07971) + + Args: + in_channels (int): Channels of the input feature map. + reduction (int): Channel reduction ratio. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. Default: True. + sub_sample (bool): Whether to utilize max pooling after pairwise function. Default: False. + mode (str): Options are `gaussian`, `concatenation`, `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. + """ + + def __init__(self, + in_channels, + reduction=2, + use_scale=True, + sub_sample=False, + mode='embedded_gaussian'): + super(NonLocal2D, self).__init__() + self.in_channels = in_channels + self.reduction = reduction + self.use_scale = use_scale + self.sub_sample = sub_sample + self.mode = mode + if mode not in [ + 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' + ]: + raise ValueError( + "Mode should be in 'gaussian', 'concatenation','embedded_gaussian' or 'dot_product'." 
+ ) + + self.inter_channels = max(in_channels // reduction, 1) + + self.g = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1) + self.conv_out = layers.ConvBNReLU( + in_channels=self.inter_channels, + out_channels=self.in_channels, + kernel_size=1, + bias_attr=False) + + if self.mode != "gaussian": + self.theta = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1) + self.phi = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1) + + if self.mode == "concatenation": + self.concat_project = layers.ConvBNReLU( + in_channels=self.inter_channels * 2, + out_channels=1, + kernel_size=1, + bias_attr=False) + + if self.sub_sample: + max_pool_layer = nn.MaxPool2D(kernel_size=(2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + def gaussian(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + pairwise_weight = F.softmax(pairwise_weight, axis=-1) + return pairwise_weight + + def embedded_gaussian(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + if self.use_scale: + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight = F.softmax(pairwise_weight, -1) + return pairwise_weight + + def dot_product(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def concatenation(self, theta_x, phi_x): + h = theta_x.shape[2] + w = phi_x.shape[3] + theta_x = paddle.tile(theta_x, [1, 1, 1, w]) + phi_x = paddle.tile(phi_x, [1, 1, h, 1]) + + concat_feature = paddle.concat([theta_x, phi_x], axis=1) + pairwise_weight = self.concat_project(concat_feature) + n, _, h, w = pairwise_weight.shape + pairwise_weight = paddle.reshape(pairwise_weight, [n, h, w]) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def forward(self, x): + n, c, h, w = x.shape + g_x = paddle.reshape(self.g(x), [n, self.inter_channels, -1]) + g_x = paddle.transpose(g_x, [0, 2, 1]) + + if self.mode == 'gaussian': + theta_x = paddle.reshape(x, [n, self.inter_channels, -1]) + theta_x = paddle.transpose(theta_x, [0, 2, 1]) + if self.sub_sample: + phi_x = paddle.reshape( + self.phi(x), [n, self.inter_channels, -1]) + else: + phi_x = paddle.reshape(x, [n, self.in_channels, -1]) + + elif self.mode == 'concatenation': + theta_x = paddle.reshape( + self.theta(x), [n, self.inter_channels, -1, 1]) + phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, 1, -1]) + + else: + theta_x = paddle.reshape( + self.theta(x), [n, self.inter_channels, -1]) + theta_x = paddle.transpose(theta_x, [0, 2, 1]) + phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, -1]) + + pairwise_func = getattr(self, self.mode) + pairwise_weight = pairwise_func(theta_x, phi_x) + y = paddle.matmul(pairwise_weight, g_x) + y = paddle.transpose(y, [0, 2, 1]) + y = paddle.reshape(y, [n, self.inter_channels, h, w]) + + output = x + self.conv_out(y) + + return output diff --git a/paddleseg/models/layers/pyramid_pool.py b/paddleseg/models/layers/pyramid_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..9e26912535090405ab62d0c1e1c5f23470c5a074 --- /dev/null +++ b/paddleseg/models/layers/pyramid_pool.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddleseg.models import layers
+
+
+class ASPPModule(nn.Layer):
+    """
+    Atrous Spatial Pyramid Pooling.
+
+    Args:
+        aspp_ratios (tuple): The dilation rates used in the ASPP module.
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+        use_sep_conv (bool, optional): Whether to use separable convolutions in the ASPP module. Default: False.
+        image_pooling (bool, optional): Whether to augment with image-level features. Default: False.
+    """
+
+    def __init__(self,
+                 aspp_ratios,
+                 in_channels,
+                 out_channels,
+                 align_corners,
+                 use_sep_conv=False,
+                 image_pooling=False,
+                 data_format='NCHW'):
+        super().__init__()
+
+        self.align_corners = align_corners
+        self.data_format = data_format
+        self.aspp_blocks = nn.LayerList()
+
+        for ratio in aspp_ratios:
+            if use_sep_conv and ratio > 1:
+                conv_func = layers.SeparableConvBNReLU
+            else:
+                conv_func = layers.ConvBNReLU
+
+            block = conv_func(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1 if ratio == 1 else 3,
+                dilation=ratio,
+                padding=0 if ratio == 1 else ratio,
+                data_format=data_format)
+            self.aspp_blocks.append(block)
+
+        out_size = len(self.aspp_blocks)
+
+        if image_pooling:
+            self.global_avg_pool = nn.Sequential(
+                nn.AdaptiveAvgPool2D(
+                    output_size=(1, 1), data_format=data_format),
+                layers.ConvBNReLU(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    bias_attr=False,
+                    data_format=data_format))
+            out_size += 1
+        self.image_pooling = image_pooling
+
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels=out_channels * out_size,
+            out_channels=out_channels,
+            kernel_size=1,
+            data_format=data_format)
+
+        self.dropout = nn.Dropout(p=0.1)  # drop rate
+
+    def forward(self, x):
+        outputs = []
+        if self.data_format == 'NCHW':
+            interpolate_shape = paddle.shape(x)[2:]
+            axis = 1
+        else:
+            interpolate_shape = paddle.shape(x)[1:3]
+            axis = -1
+        for block in self.aspp_blocks:
+            y = block(x)
+            outputs.append(y)
+
+        if self.image_pooling:
+            img_avg = self.global_avg_pool(x)
+            img_avg = F.interpolate(
+                img_avg,
+                interpolate_shape,
+                mode='bilinear',
+                align_corners=self.align_corners,
+                data_format=self.data_format)
+            outputs.append(img_avg)
+
+        x = paddle.concat(outputs, axis=axis)
+        x = self.conv_bn_relu(x)
+        x = self.dropout(x)
+
+        return x
+
+
+class PPModule(nn.Layer):
+    """
+    Pyramid pooling module originally used in PSPNet.
+
+    Args:
+        in_channels (int): The number of input channels to the pyramid pooling module.
+        out_channels (int): The number of output channels after the pyramid pooling module.
+        bin_sizes (tuple, optional): The output sizes of the pooled feature maps. Default: (1, 2, 3, 6).
+        dim_reduction (bool, optional): Whether to reduce the channel dimension after pooling. Default: True.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, in_channels, out_channels, bin_sizes, dim_reduction,
+                 align_corners):
+        super().__init__()
+
+        self.bin_sizes = bin_sizes
+
+        inter_channels = in_channels
+        if dim_reduction:
+            inter_channels = in_channels // len(bin_sizes)
+
+        # we use dimension reduction after pooling mentioned in original implementation.
+        self.stages = nn.LayerList([
+            self._make_stage(in_channels, inter_channels, size)
+            for size in bin_sizes
+        ])
+
+        self.conv_bn_relu2 = layers.ConvBNReLU(
+            in_channels=in_channels + inter_channels * len(bin_sizes),
+            out_channels=out_channels,
+            kernel_size=3,
+            padding=1)
+
+        self.align_corners = align_corners
+
+    def _make_stage(self, in_channels, out_channels, size):
+        """
+        Create one pooling layer.
+
+        In our implementation, we adopt the same dimension reduction as the original paper, which might be
+        slightly different from other implementations.
+
+        After pooling, the channels are reduced to 1/len(bin_sizes) immediately, while some other implementations
+        keep the channel count the same.
+
+        Args:
+            in_channels (int): The number of input channels to the pooling layer.
+            size (int): The output size of the pooled layer.
+
+        Returns:
+            nn.Layer: A sequential layer of adaptive average pooling followed by a 1x1 ConvBNReLU.
+        """
+
+        prior = nn.AdaptiveAvgPool2D(output_size=(size, size))
+        conv = layers.ConvBNReLU(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1)
+
+        return nn.Sequential(prior, conv)
+
+    def forward(self, input):
+        cat_layers = []
+        for stage in self.stages:
+            x = stage(input)
+            x = F.interpolate(
+                x,
+                paddle.shape(input)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            cat_layers.append(x)
+        cat_layers = [input] + cat_layers[::-1]
+        cat = paddle.concat(cat_layers, axis=1)
+        out = self.conv_bn_relu2(cat)
+
+        return out
diff --git a/paddleseg/models/layers/tensor_fusion.py b/paddleseg/models/layers/tensor_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..8548f7b780f1028cc7d3e0957be197d88883183a
--- /dev/null
+++ b/paddleseg/models/layers/tensor_fusion.py
@@ -0,0 +1,279 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.models import layers
+from paddleseg.models.layers import tensor_fusion_helper as helper
+
+
+class UAFM(nn.Layer):
+    """
+    The base of Unified Attention Fusion Module.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize mode used when upsampling the y tensor. Default: bilinear.
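+
+    Examples:
+        # An illustrative sketch; channel widths and spatial sizes are
+        # assumptions. x must not be spatially smaller than y.
+        import paddle
+
+        fuse = UAFM(x_ch=64, y_ch=128, out_ch=128)
+        x = paddle.rand([1, 64, 64, 64])   # low level, larger resolution
+        y = paddle.rand([1, 128, 32, 32])  # high level, smaller resolution
+        out = fuse(x, y)  # [1, 128, 64, 64]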
+ """ + + def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): + super().__init__() + + self.conv_x = layers.ConvBNReLU( + x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False) + self.conv_out = layers.ConvBNReLU( + y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False) + self.resize_mode = resize_mode + + def check(self, x, y): + assert x.ndim == 4 and y.ndim == 4 + x_h, x_w = x.shape[2:] + y_h, y_w = y.shape[2:] + assert x_h >= y_h and x_w >= y_w + + def prepare(self, x, y): + x = self.prepare_x(x, y) + y = self.prepare_y(x, y) + return x, y + + def prepare_x(self, x, y): + x = self.conv_x(x) + return x + + def prepare_y(self, x, y): + y_up = F.interpolate(y, paddle.shape(x)[2:], mode=self.resize_mode) + return y_up + + def fuse(self, x, y): + out = x + y + out = self.conv_out(out) + return out + + def forward(self, x, y): + """ + Args: + x (Tensor): The low level feature. + y (Tensor): The high level feature. + """ + self.check(x, y) + x, y = self.prepare(x, y) + out = self.fuse(x, y) + return out + + +class UAFM_ChAtten(UAFM): + """ + The UAFM with channel attention, which uses mean and max values. + Args: + x_ch (int): The channel of x tensor, which is the low level feature. + y_ch (int): The channel of y tensor, which is the high level feature. + out_ch (int): The channel of output tensor. + ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. + resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear. + """ + + def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): + super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) + + self.conv_xy_atten = nn.Sequential( + layers.ConvBNAct( + 4 * y_ch, + y_ch // 2, + kernel_size=1, + bias_attr=False, + act_type="leakyrelu"), + layers.ConvBN( + y_ch // 2, y_ch, kernel_size=1, bias_attr=False)) + + def fuse(self, x, y): + """ + Args: + x (Tensor): The low level feature. + y (Tensor): The high level feature. + """ + atten = helper.avg_max_reduce_hw([x, y], self.training) + atten = F.sigmoid(self.conv_xy_atten(atten)) + + out = x * atten + y * (1 - atten) + out = self.conv_out(out) + return out + + +class UAFM_ChAtten_S(UAFM): + """ + The UAFM with channel attention, which uses mean values. + Args: + x_ch (int): The channel of x tensor, which is the low level feature. + y_ch (int): The channel of y tensor, which is the high level feature. + out_ch (int): The channel of output tensor. + ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. + resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear. + """ + + def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): + super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) + + self.conv_xy_atten = nn.Sequential( + layers.ConvBNAct( + 2 * y_ch, + y_ch // 2, + kernel_size=1, + bias_attr=False, + act_type="leakyrelu"), + layers.ConvBN( + y_ch // 2, y_ch, kernel_size=1, bias_attr=False)) + + def fuse(self, x, y): + """ + Args: + x (Tensor): The low level feature. + y (Tensor): The high level feature. + """ + atten = helper.avg_reduce_hw([x, y]) + atten = F.sigmoid(self.conv_xy_atten(atten)) + + out = x * atten + y * (1 - atten) + out = self.conv_out(out) + return out + + +class UAFM_SpAtten(UAFM): + """ + The UAFM with spatial attention, which uses mean and max values. + Args: + x_ch (int): The channel of x tensor, which is the low level feature. + y_ch (int): The channel of y tensor, which is the high level feature. 
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize mode used when upsampling the y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNReLU(
+                4, 2, kernel_size=3, padding=1, bias_attr=False),
+            layers.ConvBN(
+                2, 1, kernel_size=3, padding=1, bias_attr=False))
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_max_reduce_channel([x, y])
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (1 - atten)
+        out = self.conv_out(out)
+        return out
+
+
+class UAFM_SpAtten_S(UAFM):
+    """
+    The UAFM with spatial attention, which uses mean values.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize mode used when upsampling the y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNReLU(
+                2, 2, kernel_size=3, padding=1, bias_attr=False),
+            layers.ConvBN(
+                2, 1, kernel_size=3, padding=1, bias_attr=False))
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_reduce_channel([x, y])
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (1 - atten)
+        out = self.conv_out(out)
+        return out
+
+
+class UAFMMobile(UAFM):
+    """
+    Unified Attention Fusion Module for mobile.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize mode used when upsampling the y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_x = layers.SeparableConvBNReLU(
+            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
+        self.conv_out = layers.SeparableConvBNReLU(
+            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)
+
+
+class UAFMMobile_SpAtten(UAFM):
+    """
+    Unified Attention Fusion Module with spatial attention for mobile.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize mode used when upsampling the y tensor. Default: bilinear.
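+
+    Examples:
+        # Same call pattern as UAFM; all sizes are illustrative.
+        import paddle
+
+        fuse = UAFMMobile_SpAtten(x_ch=32, y_ch=64, out_ch=64)
+        x = paddle.rand([1, 32, 64, 64])
+        y = paddle.rand([1, 64, 32, 32])
+        out = fuse(x, y)  # [1, 64, 64, 64]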
+ """ + + def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): + super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) + + self.conv_x = layers.SeparableConvBNReLU( + x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False) + self.conv_out = layers.SeparableConvBNReLU( + y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False) + + self.conv_xy_atten = nn.Sequential( + layers.ConvBNReLU( + 4, 2, kernel_size=3, padding=1, bias_attr=False), + layers.ConvBN( + 2, 1, kernel_size=3, padding=1, bias_attr=False)) + + def fuse(self, x, y): + """ + Args: + x (Tensor): The low level feature. + y (Tensor): The high level feature. + """ + atten = helper.avg_max_reduce_channel([x, y]) + atten = F.sigmoid(self.conv_xy_atten(atten)) + + out = x * atten + y * (1 - atten) + out = self.conv_out(out) + return out diff --git a/paddleseg/models/layers/tensor_fusion_helper.py b/paddleseg/models/layers/tensor_fusion_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..f47d14bfa23908b91e74db59f273b46e8ae0f6dc --- /dev/null +++ b/paddleseg/models/layers/tensor_fusion_helper.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def avg_reduce_hw(x): + # Reduce hw by avg + # Return cat([avg_pool_0, avg_pool_1, ...]) + if not isinstance(x, (list, tuple)): + return F.adaptive_avg_pool2d(x, 1) + elif len(x) == 1: + return F.adaptive_avg_pool2d(x[0], 1) + else: + res = [] + for xi in x: + res.append(F.adaptive_avg_pool2d(xi, 1)) + return paddle.concat(res, axis=1) + + +def avg_max_reduce_hw_helper(x, is_training, use_concat=True): + assert not isinstance(x, (list, tuple)) + avg_pool = F.adaptive_avg_pool2d(x, 1) + # TODO(pjc): when axis=[2, 3], the paddle.max api has bug for training. 
+    if is_training:
+        max_pool = F.adaptive_max_pool2d(x, 1)
+    else:
+        max_pool = paddle.max(x, axis=[2, 3], keepdim=True)
+
+    if use_concat:
+        res = paddle.concat([avg_pool, max_pool], axis=1)
+    else:
+        res = [avg_pool, max_pool]
+    return res
+
+
+def avg_max_reduce_hw(x, is_training):
+    # Reduce hw by avg and max
+    # Return cat([avg_pool_0, avg_pool_1, ..., max_pool_0, max_pool_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return avg_max_reduce_hw_helper(x, is_training)
+    elif len(x) == 1:
+        return avg_max_reduce_hw_helper(x[0], is_training)
+    else:
+        res_avg = []
+        res_max = []
+        for xi in x:
+            avg_pool, max_pool = avg_max_reduce_hw_helper(xi, is_training,
+                                                          False)
+            res_avg.append(avg_pool)
+            res_max.append(max_pool)
+        res = res_avg + res_max
+        return paddle.concat(res, axis=1)
+
+
+def avg_reduce_channel(x):
+    # Reduce channel by avg
+    # Return cat([avg_ch_0, avg_ch_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return paddle.mean(x, axis=1, keepdim=True)
+    elif len(x) == 1:
+        return paddle.mean(x[0], axis=1, keepdim=True)
+    else:
+        res = []
+        for xi in x:
+            res.append(paddle.mean(xi, axis=1, keepdim=True))
+        return paddle.concat(res, axis=1)
+
+
+def max_reduce_channel(x):
+    # Reduce channel by max
+    # Return cat([max_ch_0, max_ch_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return paddle.max(x, axis=1, keepdim=True)
+    elif len(x) == 1:
+        return paddle.max(x[0], axis=1, keepdim=True)
+    else:
+        res = []
+        for xi in x:
+            res.append(paddle.max(xi, axis=1, keepdim=True))
+        return paddle.concat(res, axis=1)
+
+
+def avg_max_reduce_channel_helper(x, use_concat=True):
+    # Reduce channel by avg and max; only supports a single input
+    assert not isinstance(x, (list, tuple))
+    mean_value = paddle.mean(x, axis=1, keepdim=True)
+    max_value = paddle.max(x, axis=1, keepdim=True)
+
+    if use_concat:
+        res = paddle.concat([mean_value, max_value], axis=1)
+    else:
+        res = [mean_value, max_value]
+    return res
+
+
+def avg_max_reduce_channel(x):
+    # Reduce channel by avg and max
+    # Return cat([avg_ch_0, max_ch_0, avg_ch_1, max_ch_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return avg_max_reduce_channel_helper(x)
+    elif len(x) == 1:
+        return avg_max_reduce_channel_helper(x[0])
+    else:
+        res = []
+        for xi in x:
+            res.extend(avg_max_reduce_channel_helper(xi, False))
+        return paddle.concat(res, axis=1)
+
+
+def cat_avg_max_reduce_channel(x):
+    # Reduce channel by cat + avg + max
+    assert isinstance(x, (list, tuple)) and len(x) > 1
+
+    x = paddle.concat(x, axis=1)
+
+    mean_value = paddle.mean(x, axis=1, keepdim=True)
+    max_value = paddle.max(x, axis=1, keepdim=True)
+    res = paddle.concat([mean_value, max_value], axis=1)
+
+    return res
\ No newline at end of file
diff --git a/paddleseg/models/layers/wrap_functions.py b/paddleseg/models/layers/wrap_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..c86dd24baea4e7fb029311e323bc97cbf7ac258d
--- /dev/null
+++ b/paddleseg/models/layers/wrap_functions.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+"""
+Wrap the function API, so normal and quantization training can use the same network.
+"""
+
+
+class Add(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y, name=None):
+        return paddle.add(x, y, name)
+
+
+class Subtract(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y, name=None):
+        return paddle.subtract(x, y, name)
+
+
+class Multiply(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y, name=None):
+        return paddle.multiply(x, y, name)
+
+
+class Divide(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y, name=None):
+        return paddle.divide(x, y, name)
+
+
+class Reshape(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, shape, name=None):
+        return paddle.reshape(x, shape, name)
+
+
+class Transpose(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, perm, name=None):
+        return paddle.transpose(x, perm, name)
+
+
+class Concat(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, axis=0, name=None):
+        return paddle.concat(x, axis, name)
+
+
+class Flatten(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, start_axis=0, stop_axis=-1, name=None):
+        return paddle.flatten(x, start_axis, stop_axis, name)
diff --git a/paddleseg/models/losses/__init__.py b/paddleseg/models/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d59bbb00d2f592fa29590b6bd5981e57e35df475
--- /dev/null
+++ b/paddleseg/models/losses/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
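+
+# The losses below are registered in manager.LOSSES and are usually picked by
+# name from a training config; an illustrative snippet in the usual PaddleSeg
+# config convention:
+#
+#   loss:
+#     types:
+#       - type: CrossEntropyLoss
+#     coef: [1]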
+
+from .mixed_loss import MixedLoss
+from .cross_entropy_loss import CrossEntropyLoss
+from .cross_entropy_loss import DistillCrossEntropyLoss
+from .binary_cross_entropy_loss import BCELoss
+from .lovasz_loss import LovaszSoftmaxLoss, LovaszHingeLoss
+from .gscnn_dual_task_loss import DualTaskLoss
+from .edge_attention_loss import EdgeAttentionLoss
+from .bootstrapped_cross_entropy import BootstrappedCrossEntropyLoss
+from .dice_loss import DiceLoss
+from .ohem_cross_entropy_loss import OhemCrossEntropyLoss
+from .decoupledsegnet_relax_boundary_loss import RelaxBoundaryLoss
+from .ohem_edge_attention_loss import OhemEdgeAttentionLoss
+from .l1_loss import L1Loss
+from .mean_square_error_loss import MSELoss
+from .focal_loss import FocalLoss
+from .kl_loss import KLLoss
+from .rmi_loss import RMILoss
+from .detail_aggregate_loss import DetailAggregateLoss
+from .point_cross_entropy_loss import PointCrossEntropyLoss
+from .pixel_contrast_cross_entropy_loss import PixelContrastCrossEntropyLoss
+from .semantic_encode_cross_entropy_loss import SECrossEntropyLoss
+from .semantic_connectivity_loss import SemanticConnectivityLoss
diff --git a/paddleseg/models/losses/binary_cross_entropy_loss.py b/paddleseg/models/losses/binary_cross_entropy_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf7bc7bb76fb56feed4289700da29b50c25bcfe
--- /dev/null
+++ b/paddleseg/models/losses/binary_cross_entropy_loss.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class BCELoss(nn.Layer):
+    r"""
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    It can also be seen as the combination of the ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    It can be thought of as predicting labels for a data point, where labels
+    are not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time or none of these.
+    First, this operator calculates the loss as follows:
+    .. math::
+           Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit))
+    We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get:
+    .. math::
+           Out = Logit - Logit * Labels + \log(1 + e^{-Logit})
+    For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+    .. math::
+           Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-|Logit|})
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiplies the
+    loss `Out` by the weight tensor. The ``weight`` tensor attaches a different
+    weight to every item in the batch. The ``pos_weight`` attaches a different
+    weight to the positive label of each class.
+    Finally, this operator applies a reduce operation to the loss.
+    If :attr:`reduction` is set to ``'none'``, the operator returns the original loss `Out`.
+    If :attr:`reduction` is set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` is set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+    Args:
+        weight (Tensor | str, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
+            and the data type is float32 or float64. If type is str, it should equal 'dynamic',
+            in which case the weight is computed dynamically in every step.
+            Default is ``'None'``.
+        pos_weight (float|str, optional): A weight of positive examples. If type is str,
+            it should equal 'dynamic', in which case the weight is computed dynamically in
+            every step. Default is ``'None'``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        edge_label (bool, optional): Whether to use edge labels. Default: False.
+    Shapes:
+        logit (Tensor): The input prediction tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of a Linear layer. Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels whose values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+    Returns:
+        A callable object of BCEWithLogitsLoss.
+    Examples:
+        .. code-block:: python
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
+            output = bce_logit_loss(logit, label)
+            print(output.numpy())  # [0.45618808]
+    """
+
+    def __init__(self,
+                 weight=None,
+                 pos_weight=None,
+                 ignore_index=255,
+                 edge_label=False):
+        super().__init__()
+        self.weight = weight
+        self.pos_weight = pos_weight
+        self.ignore_index = ignore_index
+        self.edge_label = edge_label
+        self.EPS = 1e-10
+
+        if self.weight is not None:
+            if isinstance(self.weight, str):
+                if self.weight != 'dynamic':
+                    raise ValueError(
+                        "if type of `weight` is str, it should equal to 'dynamic', but it is {}"
+                        .format(self.weight))
+            elif not isinstance(self.weight, paddle.Tensor):
+                raise TypeError(
+                    'The type of `weight` is wrong, it should be Tensor or str, but it is {}'
+                    .format(type(self.weight)))
+
+        if self.pos_weight is not None:
+            if isinstance(self.pos_weight, str):
+                if self.pos_weight != 'dynamic':
+                    raise ValueError(
+                        "if type of `pos_weight` is str, it should equal to 'dynamic', but it is {}"
+                        .format(self.pos_weight))
+            elif isinstance(self.pos_weight, float):
+                self.pos_weight = paddle.to_tensor(
+                    self.pos_weight, dtype='float32')
+            else:
+                raise TypeError(
+                    'The type of `pos_weight` is wrong, it should be float or str, but it is {}'
+                    .format(type(self.pos_weight)))
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64.
Shape is (N, C), where each + value is 0 or 1, and if shape is more than 2D, this is + (N, C, D1, D2,..., Dk), k >= 1. + """ + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + mask = (label != self.ignore_index) + mask = paddle.cast(mask, 'float32') + # label.shape should equal to the logit.shape + if label.shape[1] != logit.shape[1]: + label = label.squeeze(1) + label = F.one_hot(label, logit.shape[1]) + label = label.transpose((0, 3, 1, 2)) + if isinstance(self.weight, str): + pos_index = (label == 1) + neg_index = (label == 0) + pos_num = paddle.sum(pos_index.astype('float32')) + neg_num = paddle.sum(neg_index.astype('float32')) + sum_num = pos_num + neg_num + weight_pos = 2 * neg_num / (sum_num + self.EPS) + weight_neg = 2 * pos_num / (sum_num + self.EPS) + weight = weight_pos * label + weight_neg * (1 - label) + else: + weight = self.weight + if isinstance(self.pos_weight, str): + pos_index = (label == 1) + neg_index = (label == 0) + pos_num = paddle.sum(pos_index.astype('float32')) + neg_num = paddle.sum(neg_index.astype('float32')) + sum_num = pos_num + neg_num + pos_weight = 2 * neg_num / (sum_num + self.EPS) + else: + pos_weight = self.pos_weight + label = label.astype('float32') + loss = paddle.nn.functional.binary_cross_entropy_with_logits( + logit, + label, + weight=weight, + reduction='none', + pos_weight=pos_weight) + loss = loss * mask + loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS) + label.stop_gradient = True + mask.stop_gradient = True + + return loss diff --git a/paddleseg/models/losses/bootstrapped_cross_entropy.py b/paddleseg/models/losses/bootstrapped_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d09efe97d1d9658edcb2b540738d8e6ea25d5e --- /dev/null +++ b/paddleseg/models/losses/bootstrapped_cross_entropy.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class BootstrappedCrossEntropyLoss(nn.Layer): + """ + Implements the cross entropy loss function. + + Args: + min_K (int): the minimum number of pixels to be counted in loss computation. + loss_th (float): the loss threshold. Only loss that is larger than the threshold + would be calculated. + weight (tuple|list, optional): The weight for different classes. Default: None. + ignore_index (int, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default: 255. 
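+
+    Examples:
+        (A hedged usage sketch; shapes and values are illustrative assumptions.)
+
+        .. code-block:: python
+
+            import paddle
+            from paddleseg.models.losses import BootstrappedCrossEntropyLoss
+
+            # keep at least min_K pixels per image, or any pixel whose loss
+            # exceeds loss_th
+            loss_fn = BootstrappedCrossEntropyLoss(min_K=1000, loss_th=0.3)
+            logit = paddle.rand([2, 19, 64, 64])        # N, C, H, W
+            label = paddle.randint(0, 19, [2, 64, 64])  # N, H, W
+            loss = loss_fn(logit, label)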
+ """ + + def __init__(self, min_K, loss_th, weight=None, ignore_index=255): + super().__init__() + self.ignore_index = ignore_index + self.K = min_K + self.threshold = loss_th + if weight is not None: + weight = paddle.to_tensor(weight, dtype='float32') + self.weight = weight + + def forward(self, logit, label): + + n, c, h, w = logit.shape + total_loss = 0.0 + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + + for i in range(n): + x = paddle.unsqueeze(logit[i], 0) + y = paddle.unsqueeze(label[i], 0) + x = paddle.transpose(x, (0, 2, 3, 1)) + y = paddle.transpose(y, (0, 2, 3, 1)) + x = paddle.reshape(x, shape=(-1, c)) + y = paddle.reshape(y, shape=(-1, )) + loss = F.cross_entropy( + x, + y, + weight=self.weight, + ignore_index=self.ignore_index, + reduction="none") + sorted_loss = paddle.sort(loss, descending=True) + if sorted_loss[self.K] > self.threshold: + new_indices = paddle.nonzero(sorted_loss > self.threshold) + loss = paddle.gather(sorted_loss, new_indices) + else: + loss = sorted_loss[:self.K] + + total_loss += paddle.mean(loss) + return total_loss / float(n) diff --git a/paddleseg/models/losses/cross_entropy_loss.py b/paddleseg/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c934a0a5b484dcfb1dc2125b7d163a7fba7eb5c3 --- /dev/null +++ b/paddleseg/models/losses/cross_entropy_loss.py @@ -0,0 +1,216 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class CrossEntropyLoss(nn.Layer): + """ + Implements the cross entropy loss function. + + Args: + weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight + given to each class. Its length must be equal to the number of classes. + Default ``None``. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. + When its value < 1.0, only compute the loss for the top k percent pixels + (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``. + data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``. + """ + + def __init__(self, + weight=None, + ignore_index=255, + top_k_percent_pixels=1.0, + data_format='NCHW'): + super(CrossEntropyLoss, self).__init__() + self.ignore_index = ignore_index + self.top_k_percent_pixels = top_k_percent_pixels + self.EPS = 1e-8 + self.data_format = data_format + if weight is not None: + self.weight = paddle.to_tensor(weight, dtype='float32') + else: + self.weight = None + + def forward(self, logit, label, semantic_weights=None): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. 
Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + semantic_weights (Tensor, optional): Weights about loss for each pixels, + shape is the same as label. Default: None. + Returns: + (Tensor): The average loss. + """ + channel_axis = 1 if self.data_format == 'NCHW' else -1 + if self.weight is not None and logit.shape[channel_axis] != len( + self.weight): + raise ValueError( + 'The number of weights = {} must be the same as the number of classes = {}.' + .format(len(self.weight), logit.shape[channel_axis])) + + if channel_axis == 1: + logit = paddle.transpose(logit, [0, 2, 3, 1]) + label = label.astype('int64') + + loss = F.cross_entropy( + logit, + label, + ignore_index=self.ignore_index, + reduction='none', + weight=self.weight) + + return self._post_process_loss(logit, label, semantic_weights, loss) + + def _post_process_loss(self, logit, label, semantic_weights, loss): + """ + Consider mask and top_k to calculate the final loss. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + semantic_weights (Tensor, optional): Weights about loss for each pixels, + shape is the same as label. + loss (Tensor): Loss tensor which is the output of cross_entropy. If soft_label + is False in cross_entropy, the shape of loss should be the same as the label. + If soft_label is True in cross_entropy, the shape of loss should be + (N, D1, D2,..., Dk, 1). + Returns: + (Tensor): The average loss. + """ + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + label.stop_gradient = True + mask.stop_gradient = True + + if loss.ndim > mask.ndim: + loss = paddle.squeeze(loss, axis=-1) + loss = loss * mask + if semantic_weights is not None: + loss = loss * semantic_weights + + if self.weight is not None: + _one_hot = F.one_hot(label * mask, logit.shape[-1]) + coef = paddle.sum(_one_hot * self.weight, axis=-1) + else: + coef = paddle.ones_like(label) + + if self.top_k_percent_pixels == 1.0: + avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS) + else: + loss = loss.reshape((-1, )) + top_k_pixels = int(self.top_k_percent_pixels * loss.numel()) + loss, indices = paddle.topk(loss, top_k_pixels) + coef = coef.reshape((-1, )) + coef = paddle.gather(coef, indices) + coef.stop_gradient = True + coef = coef.astype('float32') + avg_loss = loss.mean() / (paddle.mean(coef) + self.EPS) + + return avg_loss + + +@manager.LOSSES.add_component +class DistillCrossEntropyLoss(CrossEntropyLoss): + """ + The implementation of distill cross entropy loss. + + Args: + weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight + given to each class. Its length must be equal to the number of classes. + Default ``None``. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. 
+            When its value < 1.0, only compute the loss for the top k percent pixels
+            (e.g., the top 20% pixels). This is useful for hard pixel mining.
+            Default ``1.0``.
+        data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'.
+            Default ``'NCHW'``.
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=255,
+                 top_k_percent_pixels=1.0,
+                 data_format='NCHW'):
+        super().__init__(weight, ignore_index, top_k_percent_pixels,
+                         data_format)
+
+    def forward(self,
+                student_logit,
+                teacher_logit,
+                label,
+                semantic_weights=None):
+        """
+        Forward computation.
+
+        Args:
+            student_logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            teacher_logit (Tensor): Logit tensor, the data type is float32, float64. The shape
+                is the same as the student_logit.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+            semantic_weights (Tensor, optional): Weights about loss for each pixel,
+                shape is the same as label. Default: None.
+        """
+
+        if student_logit.shape != teacher_logit.shape:
+            raise ValueError(
+                'The shape of student_logit = {} must be the same as the shape of teacher_logit = {}.'
+                .format(student_logit.shape, teacher_logit.shape))
+
+        channel_axis = 1 if self.data_format == 'NCHW' else -1
+        if self.weight is not None and student_logit.shape[channel_axis] != len(
+                self.weight):
+            raise ValueError(
+                'The number of weights = {} must be the same as the number of classes = {}.'
+                .format(len(self.weight), student_logit.shape[channel_axis]))
+
+        if channel_axis == 1:
+            student_logit = paddle.transpose(student_logit, [0, 2, 3, 1])
+            teacher_logit = paddle.transpose(teacher_logit, [0, 2, 3, 1])
+
+        teacher_logit = F.softmax(teacher_logit)
+
+        loss = F.cross_entropy(
+            student_logit,
+            teacher_logit,
+            weight=self.weight,
+            reduction='none',
+            soft_label=True)
+
+        return self._post_process_loss(student_logit, label, semantic_weights,
+                                       loss)
diff --git a/paddleseg/models/losses/decoupledsegnet_relax_boundary_loss.py b/paddleseg/models/losses/decoupledsegnet_relax_boundary_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb02389c4b7fa461d9bb262c2f424dbca37e99af
--- /dev/null
+++ b/paddleseg/models/losses/decoupledsegnet_relax_boundary_loss.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from scipy.ndimage import shift
+
+from paddleseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class RelaxBoundaryLoss(nn.Layer):
+    """
+    Implements the relax boundary loss function of DecoupledSegNet.
+
+    Args:
+        border (int, optional): The value of border to relax. Default: 1.
+        calculate_weights (bool, optional): Whether to calculate weights for every class. Default: False.
+        upper_bound (float, optional): The upper bound of weights if calculating weights for every class. Default: 1.0.
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default: 255.
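+
+    Examples:
+        (A hedged usage sketch; shapes and values below are illustrative
+        assumptions, not part of this repository.)
+
+        .. code-block:: python
+
+            import paddle
+            from paddleseg.models.losses import RelaxBoundaryLoss
+
+            loss_fn = RelaxBoundaryLoss(border=1)
+            logit = paddle.rand([2, 4, 32, 32])        # N, C, H, W
+            label = paddle.randint(0, 4, [2, 32, 32])  # N, H, W
+            loss = loss_fn(logit, label)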
+    """
+
+    def __init__(self,
+                 border=1,
+                 calculate_weights=False,
+                 upper_bound=1.0,
+                 ignore_index=255):
+        super(RelaxBoundaryLoss, self).__init__()
+        self.border = border
+        self.calculate_weights = calculate_weights
+        self.upper_bound = upper_bound
+        self.ignore_index = ignore_index
+        self.EPS = 1e-5
+
+    def relax_onehot(self, label, num_classes):
+        # pad label, and let ignore_index as num_classes
+        if len(label.shape) == 3:
+            label = label.unsqueeze(1)
+        h, w = label.shape[-2], label.shape[-1]
+        label = F.pad(label, [self.border] * 4, value=num_classes)
+        label = label.squeeze(1)
+        ignore_mask = (label == self.ignore_index).astype('int64')
+        label = label * (1 - ignore_mask) + num_classes * ignore_mask
+
+        onehot = 0
+        for i in range(-self.border, self.border + 1):
+            for j in range(-self.border, self.border + 1):
+                h_start, h_end = 1 + i, h + 1 + i
+                w_start, w_end = 1 + j, w + 1 + j
+                label_ = label[:, h_start:h_end, w_start:w_end]
+                onehot_ = F.one_hot(label_, num_classes + 1)
+                onehot += onehot_
+        onehot = (onehot > 0).astype('int64')
+        onehot = paddle.transpose(onehot, (0, 3, 1, 2))
+
+        return onehot
+
+    def _calculate_weights(self, label):
+        # Renamed with a leading underscore: the public name collided with the
+        # boolean attribute self.calculate_weights set in __init__, and the
+        # computed histogram weights were previously never returned.
+        hist = paddle.sum(label, axis=(1, 2)) * 1.0 / label.sum()
+        hist = ((hist != 0) * self.upper_bound * (1 - hist)) + 1
+        return hist
+
+    def custom_nll(self,
+                   logit,
+                   label,
+                   class_weights=None,
+                   border_weights=None,
+                   ignore_mask=None):
+        soft = F.softmax(logit, axis=1)
+        # calculate the valid soft where label is 1.
+        soft_label = ((soft * label[:, :-1, :, :]).sum(
+            1, keepdim=True)) * (label[:, :-1, :, :].astype('float32'))
+        soft = soft * (1 - label[:, :-1, :, :]) + soft_label
+        logsoft = paddle.log(soft)
+        if class_weights is not None:
+            logsoft = class_weights.unsqueeze((0, 2, 3))
+        logsoft = label[:, :-1, :, :] * logsoft
+        logsoft = logsoft.sum(1)
+        # border loss is divided equally
+        logsoft = -1 / border_weights * logsoft * (1. - ignore_mask)
+        n, _, h, w = label.shape
+        logsoft = logsoft.sum() / (n * h * w - ignore_mask.sum() + 1)
+        return logsoft
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
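+
+        Example (hedged; a toy illustration of the label relaxation, with
+        assumed values):
+
+            .. code-block:: python
+
+                loss_fn = RelaxBoundaryLoss(border=1)
+                label = paddle.to_tensor([[[0, 0, 1, 1]]])  # N=1, H=1, W=4
+                onehot = loss_fn.relax_onehot(label, num_classes=2)
+                # pixels next to the 0/1 boundary are positive for both classes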
+ """ + n, c, h, w = logit.shape + label.stop_gradient = True + label = self.relax_onehot(label, c) + weights = label[:, :-1, :, :].sum(1).astype('float32') + ignore_mask = (weights == 0).astype('float32') + # border is greater than 1, other is 1 + border_weights = weights + ignore_mask + + loss = 0 + class_weights = None + for i in range(n): + if self.calculate_weights: + class_weights = self.calculate_weights(label[i]) + loss = loss + self.custom_nll( + logit[i].unsqueeze(0), + label[i].unsqueeze(0), + class_weights=class_weights, + border_weights=border_weights, + ignore_mask=ignore_mask[i]) + return loss diff --git a/paddleseg/models/losses/detail_aggregate_loss.py b/paddleseg/models/losses/detail_aggregate_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d6b49c6e8b6cb9ee31772a7ce6f618c76b628160 --- /dev/null +++ b/paddleseg/models/losses/detail_aggregate_loss.py @@ -0,0 +1,145 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class DetailAggregateLoss(nn.Layer): + """ + DetailAggregateLoss's implementation based on PaddlePaddle. + + The original article refers to Meituan + Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation." + (https://arxiv.org/abs/2104.13188) + + Args: + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + + """ + + def __init__(self, ignore_index=255): + super(DetailAggregateLoss, self).__init__() + self.ignore_index = ignore_index + self.laplacian_kernel = paddle.to_tensor( + [-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype='float32').reshape( + (1, 1, 3, 3)) + self.fuse_kernel = paddle.create_parameter( + [1, 3, 1, 1], dtype='float32') + + def forward(self, logits, label): + """ + Args: + logits (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. 
+ Returns: loss + """ + boundary_targets = F.conv2d( + paddle.unsqueeze( + label, axis=1).astype('float32'), + self.laplacian_kernel, + padding=1) + boundary_targets = paddle.clip(boundary_targets, min=0) + boundary_targets = boundary_targets > 0.1 + boundary_targets = boundary_targets.astype('float32') + + boundary_targets_x2 = F.conv2d( + paddle.unsqueeze( + label, axis=1).astype('float32'), + self.laplacian_kernel, + stride=2, + padding=1) + boundary_targets_x2 = paddle.clip(boundary_targets_x2, min=0) + boundary_targets_x4 = F.conv2d( + paddle.unsqueeze( + label, axis=1).astype('float32'), + self.laplacian_kernel, + stride=4, + padding=1) + boundary_targets_x4 = paddle.clip(boundary_targets_x4, min=0) + + boundary_targets_x8 = F.conv2d( + paddle.unsqueeze( + label, axis=1).astype('float32'), + self.laplacian_kernel, + stride=8, + padding=1) + boundary_targets_x8 = paddle.clip(boundary_targets_x8, min=0) + + boundary_targets_x8_up = F.interpolate( + boundary_targets_x8, boundary_targets.shape[2:], mode='nearest') + boundary_targets_x4_up = F.interpolate( + boundary_targets_x4, boundary_targets.shape[2:], mode='nearest') + boundary_targets_x2_up = F.interpolate( + boundary_targets_x2, boundary_targets.shape[2:], mode='nearest') + + boundary_targets_x2_up = boundary_targets_x2_up > 0.1 + boundary_targets_x2_up = boundary_targets_x2_up.astype('float32') + + boundary_targets_x4_up = boundary_targets_x4_up > 0.1 + boundary_targets_x4_up = boundary_targets_x4_up.astype('float32') + + boundary_targets_x8_up = boundary_targets_x8_up > 0.1 + boundary_targets_x8_up = boundary_targets_x8_up.astype('float32') + + boudary_targets_pyramids = paddle.stack( + (boundary_targets, boundary_targets_x2_up, boundary_targets_x4_up), + axis=1) + + boudary_targets_pyramids = paddle.squeeze( + boudary_targets_pyramids, axis=2) + boudary_targets_pyramid = F.conv2d(boudary_targets_pyramids, + self.fuse_kernel) + + boudary_targets_pyramid = boudary_targets_pyramid > 0.1 + boudary_targets_pyramid = boudary_targets_pyramid.astype('float32') + + if logits.shape[-1] != boundary_targets.shape[-1]: + logits = F.interpolate( + logits, + boundary_targets.shape[2:], + mode='bilinear', + align_corners=True) + + bce_loss = F.binary_cross_entropy_with_logits(logits, + boudary_targets_pyramid) + dice_loss = self.fixed_dice_loss_func( + F.sigmoid(logits), boudary_targets_pyramid) + detail_loss = bce_loss + dice_loss + + label.stop_gradient = True + return detail_loss + + def fixed_dice_loss_func(self, input, target): + """ + simplified diceloss for DetailAggregateLoss. + """ + smooth = 1. + n = input.shape[0] + iflat = paddle.reshape(input, [n, -1]) + tflat = paddle.reshape(target, [n, -1]) + intersection = paddle.sum((iflat * tflat), axis=1) + loss = 1 - ( + (2. * intersection + smooth) / + (paddle.sum(iflat, axis=1) + paddle.sum(tflat, axis=1) + smooth)) + return paddle.mean(loss) diff --git a/paddleseg/models/losses/dice_loss.py b/paddleseg/models/losses/dice_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e7b8cef3049015613a6a1b8357134265c7a8cf19 --- /dev/null +++ b/paddleseg/models/losses/dice_loss.py @@ -0,0 +1,77 @@ +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class DiceLoss(nn.Layer):
+    """
+    The implementation of the dice loss.
+
+    Args:
+        weight (list[float], optional): The weight for each class. Default: None.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        smooth (float32): Laplace smoothing to smooth dice loss and accelerate convergence.
+            Default: 1.0
+    """
+
+    def __init__(self, weight=None, ignore_index=255, smooth=1.0):
+        super().__init__()
+        self.weight = weight
+        self.ignore_index = ignore_index
+        self.smooth = smooth
+        self.eps = 1e-8
+
+    def forward(self, logits, labels):
+        num_class = logits.shape[1]
+        if self.weight is not None:
+            assert num_class == len(self.weight), \
+                "The length of weight should be equal to the number of classes"
+
+        mask = labels != self.ignore_index
+        mask = paddle.cast(paddle.unsqueeze(mask, 1), 'float32')
+
+        labels[labels == self.ignore_index] = 0
+        labels_one_hot = F.one_hot(labels, num_class)
+        labels_one_hot = paddle.transpose(labels_one_hot, [0, 3, 1, 2])
+        logits = F.softmax(logits, axis=1)
+
+        dice_loss = 0.0
+        for i in range(num_class):
+            dice_loss_i = dice_loss_helper(logits[:, i], labels_one_hot[:, i],
+                                           mask, self.smooth, self.eps)
+            if self.weight is not None:
+                dice_loss_i *= self.weight[i]
+            dice_loss += dice_loss_i
+        dice_loss = dice_loss / num_class
+
+        return dice_loss
+
+
+def dice_loss_helper(logit, label, mask, smooth, eps):
+    assert logit.shape == label.shape, \
+        "The shape of logit and label should be the same"
+    logit = paddle.reshape(logit, [0, -1])
+    label = paddle.reshape(label, [0, -1])
+    mask = paddle.reshape(mask, [0, -1])
+    logit *= mask
+    label *= mask
+    intersection = paddle.sum(logit * label, axis=1)
+    cardinality = paddle.sum(logit + label, axis=1)
+    dice_loss = 1 - (2 * intersection + smooth) / (cardinality + smooth + eps)
+    dice_loss = dice_loss.mean()
+    return dice_loss
diff --git a/paddleseg/models/losses/edge_attention_loss.py b/paddleseg/models/losses/edge_attention_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b000b75b25415a97a9d3bfdfbd95ed6ef219fa5f
--- /dev/null
+++ b/paddleseg/models/losses/edge_attention_loss.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import losses
+
+
+@manager.LOSSES.add_component
+class EdgeAttentionLoss(nn.Layer):
+    """
+    Implements the cross entropy loss function. It only computes the edge part.
+
+    Args:
+        edge_threshold (float): Pixels with edge probability greater than
+            edge_threshold are treated as edges.
+ ignore_index (int64): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, edge_threshold=0.8, ignore_index=255): + super().__init__() + self.edge_threshold = edge_threshold + self.ignore_index = ignore_index + self.EPS = 1e-10 + self.mean_mask = 1 + + def forward(self, logits, label): + """ + Forward computation. + + Args: + logits (tuple|list): (seg_logit, edge_logit) Tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. C =1 of edge_logit . + label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, C, D1, D2,..., Dk), k >= 1. + """ + seg_logit, edge_logit = logits[0], logits[1] + if len(label.shape) != len(seg_logit.shape): + label = paddle.unsqueeze(label, 1) + if edge_logit.shape != label.shape: + raise ValueError( + 'The shape of edge_logit should equal to the label, but they are {} != {}' + .format(edge_logit.shape, label.shape)) + + filler = paddle.ones_like(label) * self.ignore_index + label = paddle.where(edge_logit > self.edge_threshold, label, filler) + + seg_logit = paddle.transpose(seg_logit, [0, 2, 3, 1]) + label = paddle.transpose(label, [0, 2, 3, 1]) + loss = F.softmax_with_cross_entropy( + seg_logit, label, ignore_index=self.ignore_index, axis=-1) + + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + loss = loss * mask + avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS) + if paddle.mean(mask) < self.mean_mask: + self.mean_mask = paddle.mean(mask) + + label.stop_gradient = True + mask.stop_gradient = True + return avg_loss diff --git a/paddleseg/models/losses/focal_loss.py b/paddleseg/models/losses/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4b5edd00257ac072631c63302ef4e1177c36a544 --- /dev/null +++ b/paddleseg/models/losses/focal_loss.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class FocalLoss(nn.Layer): + """ + The implement of focal loss. + + The focal loss requires the label is 0 or 1 for now. + + Args: + alpha (float, list, optional): The alpha of focal loss. alpha is the weight + of class 1, 1-alpha is the weight of class 0. Default: 0.25 + gamma (float, optional): The gamma of Focal Loss. Default: 2.0 + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. 
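+
+    Examples:
+        (A hedged sketch; shapes and values are illustrative assumptions.
+        This loss requires the logit to have exactly two channels.)
+
+        .. code-block:: python
+
+            import paddle
+            from paddleseg.models.losses import FocalLoss
+
+            loss_fn = FocalLoss(alpha=0.25, gamma=2.0)
+            logit = paddle.rand([2, 2, 32, 32])        # N, 2, H, W
+            label = paddle.randint(0, 2, [2, 32, 32])  # N, H, W, values in {0, 1}
+            loss = loss_fn(logit, label)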
+ """ + + def __init__(self, alpha=0.25, gamma=2.0, ignore_index=255): + super().__init__() + self.alpha = alpha + self.gamma = gamma + self.ignore_index = ignore_index + self.EPS = 1e-10 + + def forward(self, logit, label): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C, H, W), where C is number of classes. + label (Tensor): Label tensor, the data type is int64. Shape is (N, W, W), + where each value is 0 <= label[i] <= C-1. + Returns: + (Tensor): The average loss. + """ + assert logit.ndim == 4, "The ndim of logit should be 4." + assert logit.shape[1] == 2, "The channel of logit should be 2." + assert label.ndim == 3, "The ndim of label should be 3." + + class_num = logit.shape[1] # class num is 2 + logit = paddle.transpose(logit, [0, 2, 3, 1]) # N,C,H,W => N,H,W,C + + mask = label != self.ignore_index # N,H,W + mask = paddle.unsqueeze(mask, 3) + mask = paddle.cast(mask, 'float32') + mask.stop_gradient = True + + label = F.one_hot(label, class_num) # N,H,W,C + label = paddle.cast(label, logit.dtype) + label.stop_gradient = True + + loss = F.sigmoid_focal_loss( + logit=logit, + label=label, + alpha=self.alpha, + gamma=self.gamma, + reduction='none') + loss = loss * mask + avg_loss = paddle.sum(loss) / ( + paddle.sum(paddle.cast(mask != 0., 'int32')) * class_num + self.EPS) + return avg_loss + + +@manager.LOSSES.add_component +class MultiClassFocalLoss(nn.Layer): + """ + The implement of focal loss for multi class. + + Args: + alpha (float, list, optional): The alpha of focal loss. alpha is the weight + of class 1, 1-alpha is the weight of class 0. Default: 0.25 + gamma (float, optional): The gamma of Focal Loss. Default: 2.0 + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, num_class, alpha=1.0, gamma=2.0, ignore_index=255): + super().__init__() + self.num_class = num_class + self.alpha = alpha + self.gamma = gamma + self.ignore_index = ignore_index + self.EPS = 1e-10 + + def forward(self, logit, label): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C, H, W), where C is number of classes. + label (Tensor): Label tensor, the data type is int64. Shape is (N, W, W), + where each value is 0 <= label[i] <= C-1. + Returns: + (Tensor): The average loss. + """ + assert logit.ndim == 4, "The ndim of logit should be 4." + assert label.ndim == 3, "The ndim of label should be 3." + + logit = paddle.transpose(logit, [0, 2, 3, 1]) + label = label.astype('int64') + ce_loss = F.cross_entropy( + logit, label, ignore_index=self.ignore_index, reduction='none') + + pt = paddle.exp(-ce_loss) + focal_loss = self.alpha * ((1 - pt)**self.gamma) * ce_loss + + mask = paddle.cast(label != self.ignore_index, 'float32') + focal_loss *= mask + avg_loss = paddle.mean(focal_loss) / (paddle.mean(mask) + self.EPS) + return avg_loss diff --git a/paddleseg/models/losses/gscnn_dual_task_loss.py b/paddleseg/models/losses/gscnn_dual_task_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6a3d81b7c1038a89b8843d8755cfd1079f1ca4a0 --- /dev/null +++ b/paddleseg/models/losses/gscnn_dual_task_loss.py @@ -0,0 +1,141 @@ +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class DualTaskLoss(nn.Layer): + """ + The dual task loss implement of GSCNN + + Args: + ignore_index (int64): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + tau (float): the tau of gumbel softmax sample. + """ + + def __init__(self, ignore_index=255, tau=0.5): + super().__init__() + self.ignore_index = ignore_index + self.tau = tau + + def _gumbel_softmax_sample(self, logit, tau=1, eps=1e-10): + """ + Draw a sample from the Gumbel-Softmax distribution + + based on + https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb + (MIT license) + """ + gumbel_noise = paddle.rand(logit.shape) + gumbel_noise = -paddle.log(eps - paddle.log(gumbel_noise + eps)) + logit = logit + gumbel_noise + return F.softmax(logit / tau, axis=1) + + def compute_grad_mag(self, x): + eps = 1e-6 + n, c, h, w = x.shape + if h <= 1 or w <= 1: + raise ValueError( + 'The width and height of tensor to compute grad must be greater than 1, but the shape is {}.' + .format(x.shape)) + + x = self.conv_tri(x, r=4) + kernel = [[-1, 0, 1]] + kernel = paddle.to_tensor(kernel).astype('float32') + kernel = 0.5 * kernel + + kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0) + grad_x = F.conv2d(x, kernel_x, padding='same', groups=c) + kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0) + grad_y = F.conv2d(x, kernel_y, padding='same', groups=c) + mag = paddle.sqrt(grad_x * grad_x + grad_y * grad_y + eps) + + return mag / mag.max() + + def conv_tri(self, input, r): + """ + Convolves an image by a 2D triangle filter (the 1D triangle filter f is + [1:r r+1 r:-1:1]/(r+1)^2, the 2D version is simply conv2(f,f')) + """ + if r <= 1: + raise ValueError( + '`r` should be greater than 1, but it is {}.'.format(r)) + + kernel = [ + list(range(1, r + 1)) + [r + 1] + list(reversed(range(1, r + 1))) + ] + kernel = paddle.to_tensor(kernel).astype('float32') + kernel = kernel / (r + 1)**2 + input_ = F.pad(input, [1, 1, 0, 0], mode='replicate') + input_ = F.pad(input_, [r, r, 0, 0], mode='reflect') + input_ = [input_[:, :, :, :r], input, input_[:, :, :, -r:]] + input_ = paddle.concat(input_, axis=3) + tem = input_.clone() + + input_ = F.pad(input_, [0, 0, 1, 1], mode='replicate') + input_ = F.pad(input_, [0, 0, r, r], mode='reflect') + input_ = [input_[:, :, :r, :], tem, input_[:, :, -r:, :]] + input_ = paddle.concat(input_, axis=2) + + c = input.shape[1] + kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0) + output = F.conv2d(input_, kernel_x, padding=0, groups=c) + kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0) + output = F.conv2d(output, kernel_y, padding=0, groups=c) + return output + + def forward(self, logit, labels): + # import pdb; pdb.set_trace() + n, c, h, w = logit.shape + th = 1e-8 + eps = 1e-10 + if len(labels.shape) == 3: + labels = labels.unsqueeze(1) + mask = (labels != self.ignore_index) + 
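+        # Keep only pixels whose label is not ignore_index; the logits and
+        # labels are zeroed outside this mask before the edge-consistency terms.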
mask.stop_gradient = True + logit = logit * mask + + labels = labels * mask + if len(labels.shape) == 4: + labels = labels.squeeze(1) + labels.stop_gradient = True + labels = F.one_hot(labels, logit.shape[1]).transpose((0, 3, 1, 2)) + labels.stop_gradient = True + + g = self._gumbel_softmax_sample(logit, tau=self.tau) + g = self.compute_grad_mag(g) + g_hat = self.compute_grad_mag(labels) + loss = F.l1_loss(g, g_hat, reduction='none') + loss = loss * mask + + g_mask = (g > th).astype('float32') + g_mask.stop_gradient = True + g_mask_sum = paddle.sum(g_mask) + loss_g = paddle.sum(loss * g_mask) + if g_mask_sum > eps: + loss_g = loss_g / g_mask_sum + + g_hat_mask = (g_hat > th).astype('float32') + g_hat_mask.stop_gradient = True + g_hat_mask_sum = paddle.sum(g_hat_mask) + loss_g_hat = paddle.sum(loss * g_hat_mask) + if g_hat_mask_sum > eps: + loss_g_hat = loss_g_hat / g_hat_mask_sum + + total_loss = 0.5 * loss_g + 0.5 * loss_g_hat + + return total_loss diff --git a/paddleseg/models/losses/kl_loss.py b/paddleseg/models/losses/kl_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..30cb925516165c5ad7fced527535c88e3542eaea --- /dev/null +++ b/paddleseg/models/losses/kl_loss.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class KLLoss(nn.Layer): + """ + The implementation of Kullback-Leibler divergence Loss. + Refer to https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence. + + Args: + ignore_index (int64): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + temperature (float): the coefficient of kl_loss. + """ + + def __init__(self, ignore_index=255, temperature=1): + super().__init__() + self.ignore_index = ignore_index + self.temperature = temperature + + self.kl_loss = nn.KLDivLoss(reduction="none") + self.EPS = 1e-8 + + def forward(self, logit_1, logit_2, label=None): + """ + Calculate the KL loss. If the label is not None, it considers the + ignore_index in label and calculates the masked loss. + + Args: + logit_1 (Tensor): Logit tensor, the data type is float32 or float64. + The shape is (N, C), where C is number of classes, and if shape is + more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. + logit_2 (Tensor): Logit tensor, the data type is float32 or float64. + The shape of logit_2 and logit_1 are the same. + label (Tensor, optional): Label tensor, the data type is int64. + The shape is (N), where each value is 0 <= label[i] <= C-1, and + if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1. + Returns: + (Tensor): The average loss. + """ + if logit_1.shape != logit_2.shape: + raise ValueError( + 'The shape of logit_1 = {} must be the same as the shape of logit_2 = {}.' 
+ .format(logit_1.shape, logit_2.shape)) + + logit_1 = F.log_softmax(logit_1 / self.temperature, axis=1) + logit_2 = F.softmax(logit_2 / self.temperature, axis=1) + loss = self.kl_loss(logit_1, logit_2) + loss = loss * self.temperature * self.temperature + + if label is None: + avg_loss = paddle.mean(loss) + else: + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + mask = paddle.unsqueeze(mask, axis=1) + label.stop_gradient = True + mask.stop_gradient = True + + loss = loss * mask + avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS) + return avg_loss diff --git a/paddleseg/models/losses/l1_loss.py b/paddleseg/models/losses/l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..125c55d406c17108634183b5005b459ad94fb7ca --- /dev/null +++ b/paddleseg/models/losses/l1_loss.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class L1Loss(nn.L1Loss): + r""" + This interface is used to construct a callable object of the ``L1Loss`` class. + The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. + If `reduction` set to ``'none'``, the loss is: + .. math:: + Out = \lvert input - label\rvert + If `reduction` set to ``'mean'``, the loss is: + .. math:: + Out = MEAN(\lvert input - label\rvert) + If `reduction` set to ``'sum'``, the loss is: + .. math:: + Out = SUM(\lvert input - label\rvert) + + Args: + reduction (str, optional): Indicate the reduction to apply to the loss, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If `reduction` is ``'none'``, the unreduced loss is returned; + If `reduction` is ``'mean'``, the reduced mean loss is returned. + If `reduction` is ``'sum'``, the reduced sum loss is returned. + Default is ``'mean'``. + ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Default: 255. + Shape: + input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. + label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64. + output (Tensor): The L1 Loss of ``input`` and ``label``. + If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") + label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + l1_loss = paddle.nn.L1Loss() + output = l1_loss(input, label) + print(output.numpy()) + # [0.35] + l1_loss = paddle.nn.L1Loss(reduction='sum') + output = l1_loss(input, label) + print(output.numpy()) + # [1.4] + l1_loss = paddle.nn.L1Loss(reduction='none') + output = l1_loss(input, label) + print(output) + # [[0.20000005 0.19999999] + # [0.2 0.79999995]] + """ + + def __init__(self, reduction='mean', ignore_index=255): + super().__init__(reduction=reduction) diff --git a/paddleseg/models/losses/lovasz_loss.py b/paddleseg/models/losses/lovasz_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4385c979fee787406b91c096677383ad3ecc0000 --- /dev/null +++ b/paddleseg/models/losses/lovasz_loss.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Lovasz-Softmax and Jaccard hinge loss in PaddlePaddle""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class LovaszSoftmaxLoss(nn.Layer): + """ + Multi-class Lovasz-Softmax loss. + + Args: + ignore_index (int64): Specifies a target value that is ignored and does not contribute to the input gradient. Default ``255``. + classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + + def __init__(self, ignore_index=255, classes='present'): + super(LovaszSoftmaxLoss, self).__init__() + self.ignore_index = ignore_index + self.classes = classes + + def forward(self, logits, labels): + r""" + Forward computation. + + Args: + logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty). + labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], ground truth labels (between 0 and C - 1). + """ + probas = F.softmax(logits, axis=1) + vprobas, vlabels = flatten_probas(probas, labels, self.ignore_index) + loss = lovasz_softmax_flat(vprobas, vlabels, classes=self.classes) + return loss + + +@manager.LOSSES.add_component +class LovaszHingeLoss(nn.Layer): + """ + Binary Lovasz hinge loss. + + Args: + ignore_index (int64): Specifies a target value that is ignored and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, ignore_index=255): + super(LovaszHingeLoss, self).__init__() + self.ignore_index = ignore_index + + def forward(self, logits, labels): + r""" + Forward computation. + + Args: + logits (Tensor): Shape is [N, 1, H, W] or [N, 2, H, W], logits at each pixel (between -\infty and +\infty). 
+ labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], binary ground truth masks (0 or 1). + """ + if logits.shape[1] == 2: + logits = binary_channel_to_unary(logits) + loss = lovasz_hinge_flat( + *flatten_binary_scores(logits, labels, self.ignore_index)) + return loss + + +def lovasz_grad(gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors. + See Alg. 1 in paper. + """ + gts = paddle.sum(gt_sorted) + p = len(gt_sorted) + + intersection = gts - paddle.cumsum(gt_sorted, axis=0) + union = gts + paddle.cumsum(1 - gt_sorted, axis=0) + jaccard = 1.0 - intersection.cast('float32') / union.cast('float32') + + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def binary_channel_to_unary(logits, eps=1e-9): + """ + Converts binary channel logits to unary channel logits for lovasz hinge loss. + """ + probas = F.softmax(logits, axis=1) + probas = probas[:, 1, :, :] + logits = paddle.log(probas + eps / (1 - probas + eps)) + logits = logits.unsqueeze(1) + return logits + + +def lovasz_hinge_flat(logits, labels): + r""" + Binary Lovasz hinge loss. + + Args: + logits (Tensor): Shape is [P], logits at each prediction (between -\infty and +\infty). + labels (Tensor): Shape is [P], binary ground truth labels (0 or 1). + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels - 1. + signs.stop_gradient = True + errors = 1. - logits * signs + errors_sorted, perm = paddle._C_ops.argsort(errors, 'axis', 0, 'descending', + True) + errors_sorted.stop_gradient = False + gt_sorted = paddle.gather(labels, perm) + grad = lovasz_grad(gt_sorted) + grad.stop_gradient = True + loss = paddle.sum(F.relu(errors_sorted) * grad) + return loss + + +def flatten_binary_scores(scores, labels, ignore=None): + """ + Flattens predictions in the batch (binary case). + Remove labels according to 'ignore'. + """ + scores = paddle.reshape(scores, [-1]) + labels = paddle.reshape(labels, [-1]) + labels.stop_gradient = True + if ignore is None: + return scores, labels + valid = labels != ignore + valid_mask = paddle.reshape(valid, (-1, 1)) + indexs = paddle.nonzero(valid_mask) + indexs.stop_gradient = True + vscores = paddle.gather(scores, indexs[:, 0]) + vlabels = paddle.gather(labels, indexs[:, 0]) + return vscores, vlabels + + +def lovasz_softmax_flat(probas, labels, classes='present'): + """ + Multi-class Lovasz-Softmax loss. + + Args: + probas (Tensor): Shape is [P, C], class probabilities at each prediction (between 0 and 1). + labels (Tensor): Shape is [P], ground truth labels (between 0 and C - 1). + classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0. 
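+    # Lovasz-Softmax core: for each class, sort the absolute errors |fg - p_c|
+    # in decreasing order and take their inner product with the gradient of
+    # the Lovasz extension of the Jaccard index (lovasz_grad above).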
+ C = probas.shape[1] + losses = [] + classes_to_sum = list(range(C)) if classes in ['all', 'present' + ] else classes + for c in classes_to_sum: + fg = paddle.cast(labels == c, probas.dtype) # foreground for class c + if classes == 'present' and fg.sum() == 0: + continue + fg.stop_gradient = True + if C == 1: + if len(classes_to_sum) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = paddle.abs(fg - class_pred) + errors_sorted, perm = paddle._C_ops.argsort(errors, 'axis', 0, + 'descending', True) + errors_sorted.stop_gradient = False + + fg_sorted = paddle.gather(fg, perm) + fg_sorted.stop_gradient = True + + grad = lovasz_grad(fg_sorted) + grad.stop_gradient = True + loss = paddle.sum(errors_sorted * grad) + losses.append(loss) + + if len(classes_to_sum) == 1: + return losses[0] + + losses_tensor = paddle.stack(losses) + mean_loss = paddle.mean(losses_tensor) + return mean_loss + + +def flatten_probas(probas, labels, ignore=None): + """ + Flattens predictions in the batch. + """ + if len(probas.shape) == 3: + probas = paddle.unsqueeze(probas, axis=1) + C = probas.shape[1] + probas = paddle.transpose(probas, [0, 2, 3, 1]) + probas = paddle.reshape(probas, [-1, C]) + labels = paddle.reshape(labels, [-1]) + if ignore is None: + return probas, labels + valid = labels != ignore + valid_mask = paddle.reshape(valid, [-1, 1]) + indexs = paddle.nonzero(valid_mask) + indexs.stop_gradient = True + vprobas = paddle.gather(probas, indexs[:, 0]) + vlabels = paddle.gather(labels, indexs[:, 0]) + return vprobas, vlabels diff --git a/paddleseg/models/losses/mean_square_error_loss.py b/paddleseg/models/losses/mean_square_error_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e6fc8918c2ca1bb770ec2d434f672556e797f957 --- /dev/null +++ b/paddleseg/models/losses/mean_square_error_loss.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class MSELoss(nn.MSELoss): + r""" + **Mean Square Error Loss** + Computes the mean square error (squared L2 norm) of given input and label. + If :attr:`reduction` is set to ``'none'``, loss is calculated as: + .. math:: + Out = (input - label)^2 + If :attr:`reduction` is set to ``'mean'``, loss is calculated as: + .. math:: + Out = \operatorname{mean}((input - label)^2) + If :attr:`reduction` is set to ``'sum'``, loss is calculated as: + .. math:: + Out = \operatorname{sum}((input - label)^2) + where `input` and `label` are `float32` tensors of same shape. + + Args: + reduction (string, optional): The reduction method for the output, + could be 'none' | 'mean' | 'sum'. + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. + If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. 
+ If :attr:`reduction` is ``'none'``, the unreduced loss is returned. + Default is ``'mean'``. + ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Default: 255. + Shape: + input (Tensor): Input tensor, the data type is float32 or float64 + label (Tensor): Label tensor, the data type is float32 or float64 + output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input. + Examples: + .. code-block:: python + import numpy as np + import paddle + input_data = np.array([1.5]).astype("float32") + label_data = np.array([1.7]).astype("float32") + mse_loss = paddle.nn.loss.MSELoss() + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) + output = mse_loss(input, label) + print(output) + # [0.04000002] + """ + + def __init__(self, reduction='mean', ignore_index=255): + super().__init__(reduction=reduction) diff --git a/paddleseg/models/losses/mixed_loss.py b/paddleseg/models/losses/mixed_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..563b5c209fa12cfaad6e93cab67fe1e4a9b6c2c9 --- /dev/null +++ b/paddleseg/models/losses/mixed_loss.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class MixedLoss(nn.Layer): + """ + Weighted computations for multiple Loss. + The advantage is that mixed loss training can be achieved without changing the networking code. + + Args: + losses (list[nn.Layer]): A list consisting of multiple loss classes + coef (list[float|int]): Weighting coefficient of multiple loss + + Returns: + A callable object of MixedLoss. + """ + + def __init__(self, losses, coef): + super(MixedLoss, self).__init__() + if not isinstance(losses, list): + raise TypeError('`losses` must be a list!') + if not isinstance(coef, list): + raise TypeError('`coef` must be a list!') + len_losses = len(losses) + len_coef = len(coef) + if len_losses != len_coef: + raise ValueError( + 'The length of `losses` should equal to `coef`, but they are {} and {}.' + .format(len_losses, len_coef)) + + self.losses = losses + self.coef = coef + + def forward(self, logits, labels): + loss_list = [] + for i, loss in enumerate(self.losses): + output = loss(logits, labels) + loss_list.append(output * self.coef[i]) + return loss_list diff --git a/paddleseg/models/losses/ohem_cross_entropy_loss.py b/paddleseg/models/losses/ohem_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c69d81ef11c2504dfc896477d5f3260295d40be6 --- /dev/null +++ b/paddleseg/models/losses/ohem_cross_entropy_loss.py @@ -0,0 +1,99 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class OhemCrossEntropyLoss(nn.Layer): + """ + Implements the ohem cross entropy loss function. + + Args: + thresh (float, optional): The threshold of ohem. Default: 0.7. + min_kept (int, optional): The min number to keep in loss computation. Default: 10000. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, thresh=0.7, min_kept=10000, ignore_index=255): + super(OhemCrossEntropyLoss, self).__init__() + self.thresh = thresh + self.min_kept = min_kept + self.ignore_index = ignore_index + self.EPS = 1e-5 + + def forward(self, logit, label): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + """ + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + + # get the label after ohem + n, c, h, w = logit.shape + label = label.reshape((-1, )).astype('int64') + valid_mask = (label != self.ignore_index).astype('int64') + num_valid = valid_mask.sum() + label = label * valid_mask + + prob = F.softmax(logit, axis=1) + prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1)) + + if self.min_kept < num_valid and num_valid > 0: + # let the value which ignored greater than 1 + prob = prob + (1 - valid_mask) + + # get the prob of relevant label + label_onehot = F.one_hot(label, c) + label_onehot = label_onehot.transpose((1, 0)) + prob = prob * label_onehot + prob = paddle.sum(prob, axis=0) + + threshold = self.thresh + if self.min_kept > 0: + index = prob.argsort() + threshold_index = index[min(len(index), self.min_kept) - 1] + threshold_index = int(threshold_index.numpy()[0]) + if prob[threshold_index] > self.thresh: + threshold = prob[threshold_index] + kept_mask = (prob < threshold).astype('int64') + label = label * kept_mask + valid_mask = valid_mask * kept_mask + + # make the invalid region as ignore + label = label + (1 - valid_mask) * self.ignore_index + + label = label.reshape((n, 1, h, w)) + valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32') + loss = F.softmax_with_cross_entropy( + logit, label, ignore_index=self.ignore_index, axis=1) + loss = loss * valid_mask + avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS) + + label.stop_gradient = True + valid_mask.stop_gradient = True + return avg_loss diff --git a/paddleseg/models/losses/ohem_edge_attention_loss.py b/paddleseg/models/losses/ohem_edge_attention_loss.py new file mode 100644 index 
0000000000000000000000000000000000000000..f37fe07af6d724d3076e36ce4b11570d4b93d419
--- /dev/null
+++ b/paddleseg/models/losses/ohem_edge_attention_loss.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import losses
+
+
+@manager.LOSSES.add_component
+class OhemEdgeAttentionLoss(nn.Layer):
+    """
+    Implements the OHEM cross entropy loss function, computed only on the edge
+    part of the prediction.
+
+    Args:
+        edge_threshold (float, optional): Pixels whose edge probability is greater
+            than edge_threshold are treated as edges. Default: 0.8.
+        thresh (float, optional): The threshold of ohem. Default: 0.7.
+        min_kept (int, optional): The min number to keep in loss computation. Default: 5000.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self,
+                 edge_threshold=0.8,
+                 thresh=0.7,
+                 min_kept=5000,
+                 ignore_index=255):
+        super().__init__()
+        self.edge_threshold = edge_threshold
+        self.thresh = thresh
+        self.min_kept = min_kept
+        self.ignore_index = ignore_index
+        self.EPS = 1e-10
+
+    def forward(self, logits, label):
+        """
+        Forward computation.
+
+        Args:
+            logits (tuple|list): (seg_logit, edge_logit) Tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1. C is 1 for edge_logit.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, C, D1, D2,..., Dk), k >= 1.
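+
+        Returns:
+            Tensor: The average OHEM cross entropy loss over the kept edge pixels,
+                normalized by the fraction of valid (non-ignored) pixels.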
+ """ + seg_logit, edge_logit = logits[0], logits[1] + if len(label.shape) != len(seg_logit.shape): + label = paddle.unsqueeze(label, 1) + if edge_logit.shape != label.shape: + raise ValueError( + 'The shape of edge_logit should equal to the label, but they are {} != {}' + .format(edge_logit.shape, label.shape)) + + # Filter out edge + filler = paddle.ones_like(label) * self.ignore_index + label = paddle.where(edge_logit > self.edge_threshold, label, filler) + + # ohem + n, c, h, w = seg_logit.shape + label = label.reshape((-1, )) + valid_mask = (label != self.ignore_index).astype('int64') + num_valid = valid_mask.sum() + label = label * valid_mask + + prob = F.softmax(seg_logit, axis=1) + prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1)) + + if self.min_kept < num_valid and num_valid > 0: + # let the value which ignored greater than 1 + prob = prob + (1 - valid_mask) + + # get the prob of relevant label + label_onehot = F.one_hot(label, c) + label_onehot = label_onehot.transpose((1, 0)) + prob = prob * label_onehot + prob = paddle.sum(prob, axis=0) + + threshold = self.thresh + if self.min_kept > 0: + index = prob.argsort() + threshold_index = index[min(len(index), self.min_kept) - 1] + threshold_index = int(threshold_index.numpy()[0]) + if prob[threshold_index] > self.thresh: + threshold = prob[threshold_index] + kept_mask = (prob < threshold).astype('int64') + label = label * kept_mask + valid_mask = valid_mask * kept_mask + # make the invalid region as ignore + label = label + (1 - valid_mask) * self.ignore_index + label = label.reshape((n, 1, h, w)) + valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32') + + loss = F.softmax_with_cross_entropy( + seg_logit, label, ignore_index=self.ignore_index, axis=1) + loss = loss * valid_mask + avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS) + + label.stop_gradient = True + valid_mask.stop_gradient = True + return avg_loss diff --git a/paddleseg/models/losses/pixel_contrast_cross_entropy_loss.py b/paddleseg/models/losses/pixel_contrast_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7abe865cc4a743a95dc86a3749bf5a1b410315fd --- /dev/null +++ b/paddleseg/models/losses/pixel_contrast_cross_entropy_loss.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class PixelContrastCrossEntropyLoss(nn.Layer): + """ + The PixelContrastCrossEntropyLoss implementation based on PaddlePaddle. + + The original article refers to + Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation" + (https://arxiv.org/abs/2101.11939). + + Args: + temperature (float, optional): Controling the numerical similarity of features. Default: 0.1. + base_temperature (float, optional): Controling the numerical range of contrast loss. 
Default: 0.07. + ignore_index (int, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default 255. + max_samples (int, optional): Max sampling anchors. Default: 1024. + max_views (int): Sampled samplers of a class. Default: 100. + """ + + def __init__(self, + temperature=0.1, + base_temperature=0.07, + ignore_index=255, + max_samples=1024, + max_views=100): + super().__init__() + self.temperature = temperature + self.base_temperature = base_temperature + self.ignore_index = ignore_index + self.max_samples = max_samples + self.max_views = max_views + + def _hard_anchor_sampling(self, X, y_hat, y): + """ + Args: + X (Tensor): reshaped feats, shape = [N, H * W, feat_channels] + y_hat (Tensor): reshaped label, shape = [N, H * W] + y (Tensor): reshaped predict, shape = [N, H * W] + """ + batch_size, feat_dim = paddle.shape(X)[0], paddle.shape(X)[-1] + classes = [] + total_classes = 0 + for i in range(batch_size): + current_y = y_hat[i] + current_classes = paddle.unique(current_y) + current_classes = [ + x for x in current_classes if x != self.ignore_index + ] + current_classes = [ + x for x in current_classes + if (current_y == x).nonzero().shape[0] > self.max_views + ] + + classes.append(current_classes) + total_classes += len(current_classes) + + n_view = self.max_samples // total_classes + n_view = min(n_view, self.max_views) + + X_ = [] + y_ = paddle.zeros([total_classes], dtype='float32') + + X_ptr = 0 + for i in range(batch_size): + this_y_hat = y_hat[i] + current_y = y[i] + current_classes = classes[i] + + for cls_id in current_classes: + hard_indices = paddle.logical_and( + (this_y_hat == cls_id), (current_y != cls_id)).nonzero() + easy_indices = paddle.logical_and( + (this_y_hat == cls_id), (current_y == cls_id)).nonzero() + + num_hard = hard_indices.shape[0] + num_easy = easy_indices.shape[0] + + if num_hard >= n_view / 2 and num_easy >= n_view / 2: + num_hard_keep = n_view // 2 + num_easy_keep = n_view - num_hard_keep + elif num_hard >= n_view / 2: + num_easy_keep = num_easy + num_hard_keep = n_view - num_easy_keep + elif num_easy >= n_view / 2: + num_hard_keep = num_hard + num_easy_keep = n_view - num_hard_keep + else: + num_hard_keep = num_hard + num_easy_keep = num_easy + + indices = None + if num_hard > 0: + perm = paddle.randperm(num_hard) + hard_indices = hard_indices[perm[:num_hard_keep]].reshape( + (-1, hard_indices.shape[-1])) + indices = hard_indices + if num_easy > 0: + perm = paddle.randperm(num_easy) + easy_indices = easy_indices[perm[:num_easy_keep]].reshape( + (-1, easy_indices.shape[-1])) + if indices is None: + indices = easy_indices + else: + indices = paddle.concat((indices, easy_indices), axis=0) + if indices is None: + raise UserWarning('hard sampling indice error') + + X_.append(paddle.index_select(X[i, :, :], indices.squeeze(1))) + y_[X_ptr] = float(cls_id) + X_ptr += 1 + X_ = paddle.stack(X_, axis=0) + return X_, y_ + + def _contrastive(self, feats_, labels_): + """ + Args: + feats_ (Tensor): sampled pixel, shape = [total_classes, n_view, feat_dim], total_classes = batch_size * single image classes + labels_ (Tensor): label, shape = [total_classes] + """ + anchor_num, n_view = feats_.shape[0], feats_.shape[1] + + labels_ = labels_.reshape((-1, 1)) + mask = paddle.equal(labels_, paddle.transpose(labels_, + [1, 0])).astype('float32') + + contrast_count = n_view + contrast_feature = paddle.concat(paddle.unbind(feats_, axis=1), axis=0) + + anchor_feature = contrast_feature + anchor_count = contrast_count + + 
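+        # Pairwise similarity logits between every pair of sampled pixel
+        # features, scaled by the temperature; the per-row maximum is
+        # subtracted below purely for numerical stability of the exponentials.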
anchor_dot_contrast = paddle.matmul( + anchor_feature, paddle.transpose(contrast_feature, + [1, 0])) / self.temperature + logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max + + mask = paddle.tile(mask, [anchor_count, contrast_count]) + neg_mask = 1 - mask + + logits_mask = 1 - paddle.eye(mask.shape[0]).astype('float32') + mask = mask * logits_mask + + neg_logits = paddle.exp(logits) * neg_mask + neg_logits = neg_logits.sum(1, keepdim=True) + + exp_logits = paddle.exp(logits) + + log_prob = logits - paddle.log(exp_logits + neg_logits) + + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.mean() + + return loss + + def contrast_criterion(self, feats, labels=None, predict=None): + labels = labels.unsqueeze(1) + labels = F.interpolate(labels, feats.shape[2:], mode='nearest') + labels = labels.squeeze(1) + + batch_size = feats.shape[0] + labels = labels.reshape((batch_size, -1)) + predict = predict.reshape((batch_size, -1)) + feats = paddle.transpose(feats, [0, 2, 3, 1]) + feats = feats.reshape((feats.shape[0], -1, feats.shape[-1])) + + feats_, labels_ = self._hard_anchor_sampling(feats, labels, predict) + + loss = self._contrastive(feats_, labels_) + return loss + + def forward(self, preds, label): + assert "seg" in preds, "The input of PixelContrastCrossEntropyLoss should include 'seg' output, but not found." + assert "embed" in preds, "The input of PixelContrastCrossEntropyLoss should include 'embed' output, but not found." + + seg = preds['seg'] + embedding = preds['embed'] + + predict = paddle.argmax(seg, axis=1) + loss = self.contrast_criterion(embedding, label, predict) + return loss diff --git a/paddleseg/models/losses/point_cross_entropy_loss.py b/paddleseg/models/losses/point_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d43f6b2a2f6319782dcc9fa222e3ccb15faa8cdb --- /dev/null +++ b/paddleseg/models/losses/point_cross_entropy_loss.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class PointCrossEntropyLoss(nn.Layer): + """ + Implements the point cross entropy loss function. + + The original article refers to + Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering." + (https://arxiv.org/abs/1912.08193). + + Args: + weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight + given to each class. Its length must be equal to the number of classes. + Default ``None``. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. 
When its value < 1.0, only compute the loss for + the top k percent pixels (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``. + data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``. + """ + + def __init__(self, + weight=None, + ignore_index=255, + top_k_percent_pixels=1.0, + data_format='NCHW', + align_corners=False): + super(PointCrossEntropyLoss, self).__init__() + if weight is not None: + weight = paddle.to_tensor(weight, dtype='float32') + self.weight = weight + self.ignore_index = ignore_index + self.top_k_percent_pixels = top_k_percent_pixels + self.EPS = 1e-8 + self.data_format = data_format + self.align_corners = align_corners + + def forward(self, logits, label, semantic_weights=None): + """ + Forward computation. + + Args: + logits (Tensor): Logit tensor, the data type is float32, float64. Shape is + (logit,points). logit'shape: [N, C, point_num]. logit'shape:[N, point_num, 2], where C is number of classes. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + semantic_weights (Tensor, optional): Weights about loss for each pixels, shape is the same as label. Default: None. + """ + # for loss + logit, points = logits # [N, C, point_num],[N, point_num, 2] + label = label.unsqueeze(1) # [N,1,H,W] + label = point_sample( + label.astype('float32'), + points, + mode='nearest', + align_corners=self.align_corners) # [N, 1, point_num] + label = paddle.squeeze(label, axis=1).astype('int64') # [N, xx] + + channel_axis = 1 if self.data_format == 'NCHW' else -1 + if self.weight is not None and logit.shape[channel_axis] != len( + self.weight): + raise ValueError( + 'The number of weights = {} must be the same as the number of classes = {}.' + .format(len(self.weight), logit.shape[1])) + + logit = paddle.transpose(logit, [0, 2, 1]) + no_ignore_label = label + #no_ignore_label[label==self.ignore_index] = 0 + loss = F.cross_entropy( + logit, + no_ignore_label, + ignore_index=self.ignore_index, + reduction='none') + + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + + loss = loss * mask + if semantic_weights is not None: + loss = loss * semantic_weights + + if self.weight is not None: + _one_hot = F.one_hot(label, logit.shape[-1]) + _one_hot_weight = _one_hot * self.weight + loss = loss * _one_hot_weight.argmax(-1) + coef = paddle.sum(_one_hot_weight, axis=-1) + #coef = paddle.ones_like(label) + else: + coef = paddle.ones_like(label) + + label.stop_gradient = True + mask.stop_gradient = True + if self.top_k_percent_pixels == 1.0: + avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS) + return avg_loss + + loss = loss.reshape((-1, )) + top_k_pixels = int(self.top_k_percent_pixels * loss.numel()) + loss, indices = paddle.topk(loss, top_k_pixels) + coef = coef.reshape((-1, )) + coef = paddle.gather(coef, indices) + coef.stop_gradient = True + + return loss.mean() / (paddle.mean(coef) + self.EPS) + + +def point_sample(input, points, align_corners=False, **kwargs): + """A wrapper around :func:`grid_sample` to support 3D point_coords tensors + Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to + lie inside ``[0, 1] x [0, 1]`` square. + Args: + input (Tensor): Feature map, shape (N, C, H, W). + points (Tensor): Image based absolute point coordinates (normalized), + range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). 
+        align_corners (bool): Whether to align corners in grid_sample. Default: False.
+    Returns:
+        Tensor: Features of `point` on `input`, shape (N, C, P) or
+            (N, C, Hgrid, Wgrid).
+    """
+
+    def denormalize(grid):
+        """Denormalize input grid from range [0, 1] to [-1, 1]
+        Args:
+            grid (Tensor): The grid to be denormalized, range [0, 1].
+        Returns:
+            Tensor: Denormalized grid, range [-1, 1].
+        """
+
+        return grid * 2.0 - 1.0
+
+    add_dim = False
+    if points.dim() == 3:
+        add_dim = True
+        points = paddle.unsqueeze(points, axis=2)  # [2, 2048, 1, 2]
+    output = F.grid_sample(
+        input, denormalize(points), align_corners=align_corners, **kwargs)
+    if add_dim:
+        output = paddle.squeeze(output, axis=3)
+    return output
diff --git a/paddleseg/models/losses/rmi_loss.py b/paddleseg/models/losses/rmi_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..271f738ba57f6d57cf5335294b6841d65105a11f
--- /dev/null
+++ b/paddleseg/models/losses/rmi_loss.py
@@ -0,0 +1,258 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""rmi loss in PaddlePaddle"""
+import numpy
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+
+_euler_num = 2.718281828
+_pi = 3.14159265
+_ln_2_pi = 1.837877
+_CLIP_MIN = 1e-6
+_CLIP_MAX = 1.0
+_POS_ALPHA = 5e-4
+_IS_SUM = 1
+
+
+@manager.LOSSES.add_component
+class RMILoss(nn.Layer):
+    """
+    Implements the Region Mutual Information (RMI) loss (https://arxiv.org/abs/1910.12037)
+    for semantic segmentation.
+    Unlike the vanilla RMI loss, which bundles a cross entropy term, this implementation
+    is decoupled and keeps only the RMI-related part.
+    The motivation is to allow for a more flexible combination of losses during training.
+    For example, by employing MixedLoss to merge RMI loss with Bootstrapped Cross Entropy Loss,
+    we can achieve online mining of hard examples together with attention to region information.
+
+    Args:
+        weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
+            given to each class. Its length must be equal to the number of classes.
+            Default ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
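+
+    Examples:
+        .. code-block:: python
+
+            # A minimal usage sketch; the shapes and class count here are
+            # illustrative assumptions, not taken from the source.
+            import paddle
+            loss_fn = RMILoss(num_classes=19)
+            logits = paddle.rand([2, 19, 96, 96])
+            labels = paddle.randint(0, 19, [2, 96, 96])
+            loss = loss_fn(logits, labels)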
+ """ + + def __init__(self, + num_classes=19, + rmi_radius=3, + rmi_pool_way=0, + rmi_pool_size=3, + rmi_pool_stride=3, + loss_weight_lambda=0.5, + ignore_index=255): + super(RMILoss, self).__init__() + + self.num_classes = num_classes + assert rmi_radius in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + self.rmi_radius = rmi_radius + assert rmi_pool_way in [0, 1, 2, 3] + self.rmi_pool_way = rmi_pool_way + assert rmi_pool_size == rmi_pool_stride + self.rmi_pool_size = rmi_pool_size + self.rmi_pool_stride = rmi_pool_stride + self.weight_lambda = loss_weight_lambda + self.half_d = self.rmi_radius * self.rmi_radius + self.d = 2 * self.half_d + self.kernel_padding = self.rmi_pool_size // 2 + self.ignore_index = ignore_index + + def forward(self, logits_4D, labels_4D, do_rmi=True): + """ + Forward computation. + Args: + logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty). + labels (Tensor): Shape is [N, H, W], ground truth labels (between 0 and C - 1). + """ + logits_4D = paddle.cast(logits_4D, dtype='float32') + labels_4D = paddle.cast(labels_4D, dtype='float32') + + loss = self.forward_sigmoid(logits_4D, labels_4D, do_rmi=do_rmi) + return loss + + def forward_sigmoid(self, logits_4D, labels_4D, do_rmi=False): + """ + Using the sigmiod operation both. + Args: + logits_4D : [N, C, H, W], dtype=float32 + labels_4D : [N, H, W], dtype=long + do_rmi : bool + """ + label_mask_3D = labels_4D != self.ignore_index + valid_onehot_labels_4D = paddle.cast( + F.one_hot( + paddle.cast( + labels_4D, dtype='int64') * paddle.cast( + label_mask_3D, dtype='int64'), + num_classes=self.num_classes), + dtype='float32') + # label_mask_flat = paddle.cast( + # paddle.reshape(label_mask_3D, [-1]), dtype='float32') + + valid_onehot_labels_4D = valid_onehot_labels_4D * paddle.unsqueeze( + label_mask_3D, axis=3) + valid_onehot_labels_4D.stop_gradient = True + probs_4D = F.sigmoid(logits_4D) * paddle.unsqueeze( + label_mask_3D, axis=1) + _CLIP_MIN + + valid_onehot_labels_4D = paddle.transpose(valid_onehot_labels_4D, + [0, 3, 1, 2]) + valid_onehot_labels_4D.stop_gradient = True + rmi_loss = self.rmi_lower_bound(valid_onehot_labels_4D, probs_4D) + + return rmi_loss + + def inverse(self, x): + return paddle.inverse(x) + + def rmi_lower_bound(self, labels_4D, probs_4D): + """ + calculate the lower bound of the region mutual information. 
+        Args:
+            labels_4D : [N, C, H, W], dtype=float32
+            probs_4D : [N, C, H, W], dtype=float32
+        """
+        assert labels_4D.shape == probs_4D.shape, \
+            'label and prob shapes must match, got {} and {}'.format(
+                labels_4D.shape, probs_4D.shape)
+
+        p, s = self.rmi_pool_size, self.rmi_pool_stride
+        if self.rmi_pool_stride > 1:
+            if self.rmi_pool_way == 0:
+                labels_4D = F.max_pool2d(
+                    labels_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+                probs_4D = F.max_pool2d(
+                    probs_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+            elif self.rmi_pool_way == 1:
+                labels_4D = F.avg_pool2d(
+                    labels_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+                probs_4D = F.avg_pool2d(
+                    probs_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+            elif self.rmi_pool_way == 2:
+                shape = labels_4D.shape
+                new_h, new_w = shape[2] // s, shape[3] // s
+                labels_4D = F.interpolate(
+                    labels_4D, size=(new_h, new_w), mode='nearest')
+                probs_4D = F.interpolate(
+                    probs_4D,
+                    size=(new_h, new_w),
+                    mode='bilinear',
+                    align_corners=True)
+            else:
+                raise NotImplementedError("Pool way of RMI is not defined!")
+
+        label_shape = labels_4D.shape
+        n, c = label_shape[0], label_shape[1]
+
+        la_vectors, pr_vectors = self.map_get_pairs(
+            labels_4D, probs_4D, radius=self.rmi_radius, is_combine=0)
+
+        la_vectors = paddle.reshape(la_vectors, [n, c, self.half_d, -1])
+        la_vectors = paddle.cast(la_vectors, dtype='float64')
+        la_vectors.stop_gradient = True
+
+        pr_vectors = paddle.reshape(pr_vectors, [n, c, self.half_d, -1])
+        pr_vectors = paddle.cast(pr_vectors, dtype='float64')
+
+        diag_matrix = paddle.unsqueeze(
+            paddle.unsqueeze(
+                paddle.eye(self.half_d), axis=0), axis=0)
+        la_vectors = la_vectors - paddle.mean(la_vectors, axis=3, keepdim=True)
+
+        la_cov = paddle.matmul(la_vectors,
+                               paddle.transpose(la_vectors, [0, 1, 3, 2]))
+        pr_vectors = pr_vectors - paddle.mean(pr_vectors, axis=3, keepdim=True)
+        pr_cov = paddle.matmul(pr_vectors,
+                               paddle.transpose(pr_vectors, [0, 1, 3, 2]))
+
+        pr_cov_inv = self.inverse(pr_cov + paddle.cast(
+            diag_matrix, dtype='float64') * _POS_ALPHA)
+
+        la_pr_cov = paddle.matmul(la_vectors,
+                                  paddle.transpose(pr_vectors, [0, 1, 3, 2]))
+
+        appro_var = la_cov - paddle.matmul(
+            paddle.matmul(la_pr_cov, pr_cov_inv),
+            paddle.transpose(la_pr_cov, [0, 1, 3, 2]))
+
+        rmi_now = 0.5 * self.log_det_by_cholesky(appro_var + paddle.cast(
+            diag_matrix, dtype='float64') * _POS_ALPHA)
+
+        rmi_per_class = paddle.cast(
+            paddle.mean(
+                paddle.reshape(rmi_now, [-1, self.num_classes]), axis=0),
+            dtype='float32')
+        rmi_per_class = paddle.divide(rmi_per_class,
+                                      paddle.to_tensor(float(self.half_d)))
+
+        rmi_loss = paddle.sum(rmi_per_class) if _IS_SUM else paddle.mean(
+            rmi_per_class)
+
+        return rmi_loss
+
+    def log_det_by_cholesky(self, matrix):
+        """
+        Args:
+            matrix: matrix must be a positive definite matrix.
+                shape [N, C, D, D].
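+        Returns:
+            Tensor of shape [N, C]: log(det(matrix)). With the Cholesky
+            factorization matrix = L * L^T, log det(matrix) = 2 * sum(log(diag(L))),
+            which is what the code below computes (with a small epsilon for stability).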
+ """ + + chol = paddle.cholesky(matrix) + diag = paddle.diagonal(chol, offset=0, axis1=-2, axis2=-1) + chol = paddle.log(diag + 1e-8) + + return 2.0 * paddle.sum(chol, axis=-1) + + def map_get_pairs(self, labels_4D, probs_4D, radius=3, is_combine=True): + """ + Args: + labels_4D : labels, shape [N, C, H, W] + probs_4D : probabilities, shape [N, C, H, W] + radius : the square radius + Return: + tensor with shape [N, C, radius * radius, H - (radius - 1), W - (radius - 1)] + """ + + label_shape = labels_4D.shape + h, w = label_shape[2], label_shape[3] + new_h, new_w = h - (radius - 1), w - (radius - 1) + la_ns = [] + pr_ns = [] + for y in range(0, radius, 1): + for x in range(0, radius, 1): + la_now = labels_4D[:, :, y:y + new_h, x:x + new_w] + pr_now = probs_4D[:, :, y:y + new_h, x:x + new_w] + la_ns.append(la_now) + pr_ns.append(pr_now) + + if is_combine: + pair_ns = la_ns + pr_ns + p_vectors = paddle.stack(pair_ns, axis=2) + return p_vectors + else: + la_vectors = paddle.stack(la_ns, axis=2) + pr_vectors = paddle.stack(pr_ns, axis=2) + return la_vectors, pr_vectors diff --git a/paddleseg/models/losses/semantic_connectivity_loss.py b/paddleseg/models/losses/semantic_connectivity_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b54d545d09a0debc5a8a1912d5572c44a082289b --- /dev/null +++ b/paddleseg/models/losses/semantic_connectivity_loss.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class SemanticConnectivityLoss(nn.Layer): + ''' + SCL (Semantic Connectivity-aware Learning) framework, which introduces a SC Loss (Semantic Connectivity-aware Loss) + to improve the quality of segmentation results from the perspective of connectivity. Support multi-class segmentation. + + The original article refers to + Lutao Chu, Yi Liu, Zewu Wu, Shiyu Tang, Guowei Chen, Yuying Hao, Juncai Peng, Zhiliang Yu, Zeyu Chen, Baohua Lai, Haoyi Xiong. + "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset" + In WACV 2022 workshop + https://arxiv.org/abs/2112.07146 + + Running process: + Step 1. Connected Components Calculation + Step 2. Connected Components Matching and SC Loss Calculation + ''' + + def __init__(self, ignore_index=255, max_pred_num_conn=10, use_argmax=True): + ''' + Args: + ignore_index (int): Specify a pixel value to be ignored in the annotated image and does not contribute to + the input gradient.When there are pixels that cannot be marked (or difficult to be marked) in the marked + image, they can be marked as a specific gray value. When calculating the loss value, the pixel corresponding + to the original image will not be used as the independent variable of the loss function. 
*Default:``255``* + max_pred_num_conn (int): Maximum number of predicted connected components. At the beginning of training, + there will be a large number of connected components, and the calculation is very time-consuming. + Therefore, it is necessary to limit the maximum number of predicted connected components, + and the rest will not participate in the calculation. + use_argmax (bool): Whether to use argmax for logits. + ''' + super().__init__() + self.ignore_index = ignore_index + self.max_pred_num_conn = max_pred_num_conn + self.use_argmax = use_argmax + + def forward(self, logits, labels): + ''' + Args: + logits (Tensor): [N, C, H, W] + lables (Tensor): [N, H, W] + ''' + preds = paddle.argmax(logits, axis=1) if self.use_argmax else logits + preds_np = preds.astype('uint8').numpy() + labels_np = labels.astype('uint8').numpy() + preds = paddle.to_tensor(preds, 'float32', stop_gradient=False) + multi_class_sc_loss = paddle.zeros([preds.shape[0]]) + zero = paddle.to_tensor([0.]) # for accelerating + + # Traverse each image + for i in range(preds.shape[0]): + sc_loss = 0 + class_num = 0 + + pred_i = preds[i] + preds_np_i = preds_np[i] + labels_np_i = labels_np[i] + + # Traverse each class + for class_ in np.unique(labels_np_i): + if class_ == self.ignore_index: + continue + class_num += 1 + + # Connected Components Calculation + preds_np_class = preds_np_i == class_ + labels_np_class = labels_np_i == class_ + pred_num_conn, pred_conn = cv2.connectedComponents( + preds_np_class.astype(np.uint8)) # pred_conn.shape = [H,W] + label_num_conn, label_conn = cv2.connectedComponents( + labels_np_class.astype(np.uint8)) + + origin_pred_num_conn = pred_num_conn + if pred_num_conn > 2 * label_num_conn: + pred_num_conn = min(pred_num_conn, self.max_pred_num_conn) + real_pred_num = pred_num_conn - 1 + real_label_num = label_num_conn - 1 + + # Connected Components Matching and SC Loss Calculation + if real_label_num > 0 and real_pred_num > 0: + img_connectivity = compute_class_connectiveity( + pred_conn, label_conn, pred_num_conn, + origin_pred_num_conn, label_num_conn, pred_i, + real_label_num, real_pred_num, zero) + sc_loss += 1 - img_connectivity + elif real_label_num == 0 and real_pred_num == 0: + # if no connected component, SC Loss = 0, so pass + pass + else: + preds_class = pred_i == int(class_) + not_preds_class = paddle.bitwise_not(preds_class) + labels_class = paddle.to_tensor(labels_np_class) + missed_detect = labels_class * not_preds_class + missed_detect_area = paddle.sum(missed_detect).astype( + 'float32') + sc_loss += missed_detect_area / missed_detect.numel() + 1 + + multi_class_sc_loss[ + i] = sc_loss / class_num if class_num != 0 else 0 + multi_class_sc_loss = paddle.mean(multi_class_sc_loss) + return multi_class_sc_loss + + +def compute_class_connectiveity(pred_conn, label_conn, pred_num_conn, + origin_pred_num_conn, label_num_conn, pred, + real_label_num, real_pred_num, zero): + + pred_conn = paddle.to_tensor(pred_conn) + label_conn = paddle.to_tensor(label_conn) + pred_conn = F.one_hot(pred_conn, origin_pred_num_conn) + label_conn = F.one_hot(label_conn, label_num_conn) + + ious = paddle.zeros((real_label_num, real_pred_num)) + pair_conn_sum = paddle.to_tensor([0.], stop_gradient=False) + + for i in range(1, label_num_conn): + label_i = label_conn[:, :, i] + + pair_conn = paddle.to_tensor([0.], stop_gradient=False) + pair_conn_num = 0 + + for j in range(1, pred_num_conn): + pred_j_mask = pred_conn[:, :, j] + pred_j = pred_j_mask * pred + + iou = compute_iou(pred_j, label_i, zero) 
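+            # IoU between ground-truth component i and predicted component j;
+            # only pairs that actually overlap (non-zero IoU) are counted
+            # when averaging the per-component connectivity below.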
+ ious[i - 1, j - 1] = iou + if iou != 0: + pair_conn += iou + pair_conn_num += 1 + + if pair_conn_num != 0: + pair_conn_sum += pair_conn / pair_conn_num + lone_pred_num = 0 + + pred_sum = paddle.sum(ious, axis=0) + for m in range(0, real_pred_num): + if pred_sum[m] == 0: + lone_pred_num += 1 + img_connectivity = pair_conn_sum / (real_label_num + lone_pred_num) + return img_connectivity + + +def compute_iou(pred_i, label_i, zero): + intersect_area_i = paddle.sum(pred_i * label_i) + if paddle.equal(intersect_area_i, zero): + return 0 + + pred_area_i = paddle.sum(pred_i) + label_area_i = paddle.sum(label_i) + union_area_i = pred_area_i + label_area_i - intersect_area_i + if paddle.equal(union_area_i, zero): + return 1 + else: + return intersect_area_i / union_area_i diff --git a/paddleseg/models/losses/semantic_encode_cross_entropy_loss.py b/paddleseg/models/losses/semantic_encode_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..648ed35862a02cb85842843b2ac446dfd987569e --- /dev/null +++ b/paddleseg/models/losses/semantic_encode_cross_entropy_loss.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager + + +@manager.LOSSES.add_component +class SECrossEntropyLoss(nn.Layer): + """ + The Semantic Encoding Loss implementation based on PaddlePaddle. + + """ + + def __init__(self, *args, **kwargs): + super(SECrossEntropyLoss, self).__init__() + + def forward(self, logit, label): + if logit.ndim == 4: + logit = logit.squeeze(2).squeeze(3) + assert logit.ndim == 2, "The shape of logit should be [N, C, 1, 1] or [N, C], but the logit dim is {}.".format( + logit.ndim) + + batch_size, num_classes = paddle.shape(logit) + se_label = paddle.zeros([batch_size, num_classes]) + for i in range(batch_size): + hist = paddle.histogram( + label[i], bins=num_classes, min=0, max=num_classes - 1) + hist = hist.astype('float32') / hist.sum().astype('float32') + se_label[i] = (hist > 0).astype('float32') + loss = F.binary_cross_entropy_with_logits(logit, se_label) + return loss diff --git a/paddleseg/models/lraspp.py b/paddleseg/models/lraspp.py new file mode 100644 index 0000000000000000000000000000000000000000..6f7db422a70d903d6a98ed45445a5920253ad287 --- /dev/null +++ b/paddleseg/models/lraspp.py @@ -0,0 +1,162 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+
+
+@manager.MODELS.add_component
+class LRASPP(nn.Layer):
+    """
+    Semantic segmentation model with a light R-ASPP head.
+
+    The original article refers to
+    Howard, Andrew, et al. "Searching for mobilenetv3."
+    (https://arxiv.org/abs/1905.02244)
+
+    Args:
+        num_classes (int): The number of target classes.
+        backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must
+            have feat_channels, of which the length is 5.
+        backbone_indices (List(int), optional): The values indicate the indices of backbone output
+            used as the input of the LR-ASPP head.
+            Default: [0, 1, 3].
+        lraspp_head_inter_chs (List(int), optional): The intermediate channels of LR-ASPP head.
+            Default: [32, 64].
+        lraspp_head_out_ch (int, optional): The output channels of each ASPP branch in the LR-ASPP head.
+            Default: 128.
+        resize_mode (str, optional): The resize mode for the upsampling operation in the LR-ASPP head.
+            Default: bilinear.
+        use_gap (bool, optional): If True, use global average pooling in the LR-ASPP head; otherwise, use
+            a 49x49 kernel for average pooling.
+            Default: True.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=[0, 1, 3],
+                 lraspp_head_inter_chs=[32, 64],
+                 lraspp_head_out_ch=128,
+                 resize_mode='bilinear',
+                 use_gap=True,
+                 pretrained=None):
+        super().__init__()
+
+        # backbone
+        assert hasattr(backbone, 'feat_channels'), \
+            "The backbone should have feat_channels."
+        assert len(backbone.feat_channels) >= len(backbone_indices), \
+            f"The length of input backbone_indices ({len(backbone_indices)}) should not be " \
+            f"greater than the length of feat_channels ({len(backbone.feat_channels)})."
+        assert len(backbone.feat_channels) > max(backbone_indices), \
+            f"The max value ({max(backbone_indices)}) of backbone_indices should be " \
+            f"less than the length of feat_channels ({len(backbone.feat_channels)})."
+        self.backbone = backbone
+
+        assert len(backbone_indices) >= 1, "The length of backbone_indices " \
+            "should not be less than 1"
+
+        # head
+        assert len(backbone_indices) == len(
+            lraspp_head_inter_chs
+        ) + 1, "The length of backbone_indices should be 1 greater than lraspp_head_inter_chs."
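+        # Note: the deepest selected feature feeds the global-context (squeeze)
+        # branch of the LR-ASPP head below; the shallower features are fused in
+        # on the way up to the final prediction.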
+ self.backbone_indices = backbone_indices + + self.lraspp_head = LRASPPHead(backbone_indices, backbone.feat_channels, + lraspp_head_inter_chs, lraspp_head_out_ch, + num_classes, resize_mode, use_gap) + + # pretrained + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + x_hw = paddle.shape(x)[2:] + + feats_backbone = self.backbone(x) + assert len(feats_backbone) >= len(self.backbone_indices), \ + f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \ + f"equal than the nums of backbone_indices ({len(self.backbone_indices)})" + + y = self.lraspp_head(feats_backbone) + y = F.interpolate(y, x_hw, mode='bilinear', align_corners=False) + logit_list = [y] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class LRASPPHead(nn.Layer): + def __init__(self, + indices, + in_chs, + mid_chs, + out_ch, + n_classes, + resize_mode, + use_gap, + align_corners=False): + super().__init__() + + self.indices = indices[-2::-1] + self.in_chs = [in_chs[i] for i in indices[::-1]] + self.mid_chs = mid_chs[::-1] + self.convs = nn.LayerList() + self.conv_ups = nn.LayerList() + for in_ch, mid_ch in zip(self.in_chs[1:], self.mid_chs): + self.convs.append( + nn.Conv2D( + in_ch, mid_ch, kernel_size=1, bias_attr=False)) + self.conv_ups.append(layers.ConvBNReLU(out_ch + mid_ch, out_ch, 1)) + self.conv_w = nn.Sequential( + nn.AvgPool2D( + kernel_size=(49, 49), stride=(16, 20)) + if not use_gap else nn.AdaptiveAvgPool2D(1), + nn.Conv2D( + self.in_chs[0], out_ch, 1, bias_attr=False), + nn.Sigmoid()) + self.conv_v = layers.ConvBNReLU(self.in_chs[0], out_ch, 1) + self.conv_t = nn.Conv2D(out_ch, out_ch, kernel_size=1, bias_attr=False) + self.conv_out = nn.Conv2D( + out_ch, n_classes, kernel_size=1, bias_attr=False) + + self.interp = partial( + F.interpolate, mode=resize_mode, align_corners=align_corners) + + def forward(self, in_feat_list): + x = in_feat_list[-1] + + x = self.conv_v(x) * self.interp(self.conv_w(x), paddle.shape(x)[2:]) + y = self.conv_t(x) + + for idx, conv, conv_up in zip(self.indices, self.convs, self.conv_ups): + feat = in_feat_list[idx] + y = self.interp(y, paddle.shape(feat)[2:]) + y = paddle.concat([y, conv(feat)], axis=1) + y = conv_up(y) + + y = self.conv_out(y) + return y diff --git a/paddleseg/models/mla_transformer.py b/paddleseg/models/mla_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d5647e7564bd7a77bc92fb29aacf37c6f8f98796 --- /dev/null +++ b/paddleseg/models/mla_transformer.py @@ -0,0 +1,240 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
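+
+# MLA (multi-level feature aggregation) decoder for transformer backbones,
+# in the spirit of the SETR-MLA head (cf. https://arxiv.org/abs/2012.15840).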
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers +from paddleseg.cvlibs import manager +from paddleseg.utils import utils + + +class MLAHeads(nn.Layer): + def __init__(self, mlahead_channels=128): + super(MLAHeads, self).__init__() + self.head2 = nn.Sequential( + layers.ConvBNReLU( + mlahead_channels * 2, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + layers.ConvBNReLU( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False)) + self.head3 = nn.Sequential( + layers.ConvBNReLU( + mlahead_channels * 2, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + layers.ConvBNReLU( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False)) + self.head4 = nn.Sequential( + layers.ConvBNReLU( + mlahead_channels * 2, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + layers.ConvBNReLU( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False)) + self.head5 = nn.Sequential( + layers.ConvBNReLU( + mlahead_channels * 2, + mlahead_channels, + 3, + padding=1, + bias_attr=False), + layers.ConvBNReLU( + mlahead_channels, + mlahead_channels, + 3, + padding=1, + bias_attr=False)) + + def forward(self, mla_p2, mla_p3, mla_p4, mla_p5): + head2 = F.interpolate( + self.head2(mla_p2), + size=(4 * mla_p2.shape[3], 4 * mla_p2.shape[3]), + mode='bilinear', + align_corners=True) + head3 = F.interpolate( + self.head3(mla_p3), + size=(4 * mla_p3.shape[3], 4 * mla_p3.shape[3]), + mode='bilinear', + align_corners=True) + head4 = F.interpolate( + self.head4(mla_p4), + size=(4 * mla_p4.shape[3], 4 * mla_p4.shape[3]), + mode='bilinear', + align_corners=True) + head5 = F.interpolate( + self.head5(mla_p5), + size=(4 * mla_p5.shape[3], 4 * mla_p5.shape[3]), + mode='bilinear', + align_corners=True) + + return paddle.concat([head2, head3, head4, head5], axis=1) + + +@manager.MODELS.add_component +class MLATransformer(nn.Layer): + def __init__(self, + num_classes, + in_channels, + backbone, + mlahead_channels=128, + aux_channels=256, + norm_layer=nn.BatchNorm2D, + pretrained=None, + **kwargs): + super(MLATransformer, self).__init__() + + self.BatchNorm = norm_layer + self.mlahead_channels = mlahead_channels + self.num_classes = num_classes + self.in_channels = in_channels + self.backbone = backbone + + self.mlahead = MLAHeads(mlahead_channels=self.mlahead_channels) + self.cls = nn.Conv2D( + 4 * self.mlahead_channels, self.num_classes, 3, padding=1) + + self.conv0 = layers.ConvBNReLU( + self.in_channels[0], + self.in_channels[0] * 2, + 3, + padding=1, + bias_attr=False) + self.conv1 = layers.ConvBNReLU( + self.in_channels[1], + self.in_channels[1], + 3, + padding=1, + bias_attr=False) + self.conv21 = layers.ConvBNReLU( + self.in_channels[2], + self.in_channels[2], + 3, + padding=1, + bias_attr=False) + self.conv22 = layers.ConvBNReLU( + self.in_channels[2], + self.in_channels[2] // 2, + 3, + padding=1, + bias_attr=False) + self.conv31 = layers.ConvBNReLU( + self.in_channels[3], + self.in_channels[3], + 3, + padding=1, + bias_attr=False) + self.conv32 = layers.ConvBNReLU( + self.in_channels[3], + self.in_channels[3] // 2, + 3, + padding=1, + bias_attr=False) + self.conv33 = layers.ConvBNReLU( + self.in_channels[3] // 2, + self.in_channels[3] // 4, + 3, + padding=1, + bias_attr=False) + + self.aux_head = nn.Sequential( + layers.ConvBN( + in_channels=self.in_channels[2], + out_channels=aux_channels, + kernel_size=3, + padding=1, + bias_attr=False), + nn.Conv2D( + in_channels=aux_channels, 
+ out_channels=self.num_classes, + kernel_size=1, )) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + inputs = self.backbone(x) + + inputs0 = self.conv0(inputs[0]) + inputs1 = F.interpolate( + self.conv1(inputs[1]), + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=True) + inputs2 = F.interpolate( + self.conv21(inputs[2]), + scale_factor=2, + mode='bilinear', + align_corners=True) + inputs2 = F.interpolate( + self.conv22(inputs2), + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=True) + inputs3 = F.interpolate( + self.conv31(inputs[3]), + scale_factor=2, + mode='bilinear', + align_corners=True) + inputs3 = F.interpolate( + self.conv32(inputs3), + scale_factor=2, + mode='bilinear', + align_corners=True) + inputs3 = F.interpolate( + self.conv33(inputs3), + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=True) + inputs2 = inputs2 + inputs3 + inputs1 = inputs1 + inputs2 + inputs0 = inputs0 + inputs1 + + feats = self.mlahead(inputs0, inputs1, inputs2, inputs3) + logit = self.cls(feats) + logit_list = [logit] + + if self.training: + logit_list.append(self.aux_head(inputs[2])) + + logit_list = [ + F.interpolate( + logit, paddle.shape(x)[2:], mode='bilinear', align_corners=True) + for logit in logit_list + ] + return logit_list diff --git a/paddleseg/models/mobileseg.py b/paddleseg/models/mobileseg.py new file mode 100644 index 0000000000000000000000000000000000000000..8d4ea5bca043947234ec90191f87f23c36026307 --- /dev/null +++ b/paddleseg/models/mobileseg.py @@ -0,0 +1,289 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg import utils +from paddleseg.models import layers +from paddleseg.cvlibs import manager + + +@manager.MODELS.add_component +class MobileSeg(nn.Layer): + """ + The semantic segmentation models for mobile devices. + + Args: + num_classes (int): The number of target classes. + backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must + has feat_channels, of which the length is 5. + backbone_indices (List(int), optional): The values indicate the indices of output of backbone. + Default: [2, 3, 4]. + cm_bin_sizes (List(int), optional): The bin size of context module. Default: [1,2,4]. + cm_out_ch (int, optional): The output channel of the last context module. Default: 128. + arm_type (str, optional): The type of attention refinement module. Default: ARM_Add_SpAttenAdd3. + arm_out_chs (List(int), optional): The out channels of each arm module. Default: [64, 96, 128]. + seg_head_inter_chs (List(int), optional): The intermediate channels of segmentation head. + Default: [64, 64, 64]. + resize_mode (str, optional): The resize mode for the upsampling operation in decoder. + Default: bilinear. 
+ use_last_fuse (bool, optional): Whether use fusion in the last. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=[1, 2, 3], + cm_bin_sizes=[1, 2], + cm_out_ch=64, + arm_type='UAFMMobile', + arm_out_chs=[32, 48, 64], + seg_head_inter_chs=[32, 32, 32], + resize_mode='bilinear', + use_last_fuse=False, + pretrained=None): + super().__init__() + + # backbone + assert hasattr(backbone, 'feat_channels'), \ + "The backbone should has feat_channels." + assert len(backbone.feat_channels) >= len(backbone_indices), \ + f"The length of input backbone_indices ({len(backbone_indices)}) should not be" \ + f"greater than the length of feat_channels ({len(backbone.feat_channels)})." + assert len(backbone.feat_channels) > max(backbone_indices), \ + f"The max value ({max(backbone_indices)}) of backbone_indices should be " \ + f"less than the length of feat_channels ({len(backbone.feat_channels)})." + self.backbone = backbone + + assert len(backbone_indices) >= 1, "The lenght of backbone_indices " \ + "should not be lesser than 1" + self.backbone_indices = backbone_indices # [..., x16_id, x32_id] + backbone_out_chs = [backbone.feat_channels[i] for i in backbone_indices] + + # head + if len(arm_out_chs) == 1: + arm_out_chs = arm_out_chs * len(backbone_indices) + assert len(arm_out_chs) == len(backbone_indices), "The length of " \ + "arm_out_chs and backbone_indices should be equal" + + self.ppseg_head = MobileSegHead(backbone_out_chs, arm_out_chs, + cm_bin_sizes, cm_out_ch, arm_type, + resize_mode, use_last_fuse) + + if len(seg_head_inter_chs) == 1: + seg_head_inter_chs = seg_head_inter_chs * len(backbone_indices) + assert len(seg_head_inter_chs) == len(backbone_indices), "The length of " \ + "seg_head_inter_chs and backbone_indices should be equal" + self.seg_heads = nn.LayerList() # [..., head_16, head32] + for in_ch, mid_ch in zip(arm_out_chs, seg_head_inter_chs): + self.seg_heads.append(SegHead(in_ch, mid_ch, num_classes)) + + # pretrained + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + x_hw = paddle.shape(x)[2:] + + feats_backbone = self.backbone(x) # [x4, x8, x16, x32] + assert len(feats_backbone) >= len(self.backbone_indices), \ + f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \ + f"equal than the nums of backbone_indices ({len(self.backbone_indices)})" + + feats_selected = [feats_backbone[i] for i in self.backbone_indices] + feats_head = self.ppseg_head(feats_selected) # [..., x8, x16, x32] + + if self.training: + logit_list = [] + for x, seg_head in zip(feats_head, self.seg_heads): + x = seg_head(x) + logit_list.append(x) + logit_list = [ + F.interpolate( + x, x_hw, mode='bilinear', align_corners=False) + for x in logit_list + ] + else: + x = self.seg_heads[0](feats_head[0]) + x = F.interpolate(x, x_hw, mode='bilinear', align_corners=False) + logit_list = [x] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class MobileSegHead(nn.Layer): + """ + The head of MobileSeg. + + Args: + backbone_out_chs (List(Tensor)): The channels of output tensors in the backbone. + arm_out_chs (List(int)): The out channels of each arm module. + cm_bin_sizes (List(int)): The bin size of context module. + cm_out_ch (int): The output channel of the last context module. + arm_type (str): The type of attention refinement module. 
+ resize_mode (str): The resize mode for the upsampling operation in decoder. + """ + + def __init__(self, backbone_out_chs, arm_out_chs, cm_bin_sizes, cm_out_ch, + arm_type, resize_mode, use_last_fuse): + super().__init__() + + self.cm = MobileContextModule(backbone_out_chs[-1], cm_out_ch, + cm_out_ch, cm_bin_sizes) + + assert hasattr(layers,arm_type), \ + "Not support arm_type ({})".format(arm_type) + arm_class = eval("layers." + arm_type) + + self.arm_list = nn.LayerList() # [..., arm8, arm16, arm32] + for i in range(len(backbone_out_chs)): + low_chs = backbone_out_chs[i] + high_ch = cm_out_ch if i == len( + backbone_out_chs) - 1 else arm_out_chs[i + 1] + out_ch = arm_out_chs[i] + arm = arm_class( + low_chs, high_ch, out_ch, ksize=3, resize_mode=resize_mode) + self.arm_list.append(arm) + + self.use_last_fuse = use_last_fuse + if self.use_last_fuse: + self.fuse_convs = nn.LayerList() + for i in range(1, len(arm_out_chs)): + conv = layers.SeparableConvBNReLU( + arm_out_chs[i], + arm_out_chs[0], + kernel_size=3, + bias_attr=False) + self.fuse_convs.append(conv) + self.last_conv = layers.SeparableConvBNReLU( + len(arm_out_chs) * arm_out_chs[0], + arm_out_chs[0], + kernel_size=3, + bias_attr=False) + + def forward(self, in_feat_list): + """ + Args: + in_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32]. + x2, x4 and x8 are optional. + Returns: + out_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32]. + x2, x4 and x8 are optional. + The length of in_feat_list and out_feat_list are the same. + """ + + high_feat = self.cm(in_feat_list[-1]) + out_feat_list = [] + + for i in reversed(range(len(in_feat_list))): + low_feat = in_feat_list[i] + arm = self.arm_list[i] + high_feat = arm(low_feat, high_feat) + out_feat_list.insert(0, high_feat) + + if self.use_last_fuse: + x_list = [out_feat_list[0]] + size = paddle.shape(out_feat_list[0])[2:] + for i, (x, conv + ) in enumerate(zip(out_feat_list[1:], self.fuse_convs)): + x = conv(x) + x = F.interpolate( + x, size=size, mode='bilinear', align_corners=False) + x_list.append(x) + x = paddle.concat(x_list, axis=1) + x = self.last_conv(x) + out_feat_list[0] = x + + return out_feat_list + + +class MobileContextModule(nn.Layer): + """ + Context Module for Mobile Model. + + Args: + in_channels (int): The number of input channels to pyramid pooling module. + inter_channels (int): The number of inter channels to pyramid pooling module. + out_channels (int): The number of output channels after pyramid pooling module. + bin_sizes (tuple, optional): The out size of pooled feature maps. Default: (1, 3). + align_corners (bool): An argument of F.interpolate. It should be set to False + when the output size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
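+
+        Each pooled branch is upsampled back to the input resolution and the
+        branches are summed rather than concatenated before the final separable
+        convolution, which keeps the module cheap for mobile inference.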
+ """ + + def __init__(self, + in_channels, + inter_channels, + out_channels, + bin_sizes, + align_corners=False): + super().__init__() + + self.stages = nn.LayerList([ + self._make_stage(in_channels, inter_channels, size) + for size in bin_sizes + ]) + + self.conv_out = layers.SeparableConvBNReLU( + in_channels=inter_channels, + out_channels=out_channels, + kernel_size=3, + bias_attr=False) + + self.align_corners = align_corners + + def _make_stage(self, in_channels, out_channels, size): + prior = nn.AdaptiveAvgPool2D(output_size=size) + conv = layers.ConvBNReLU( + in_channels=in_channels, out_channels=out_channels, kernel_size=1) + return nn.Sequential(prior, conv) + + def forward(self, input): + out = None + input_shape = paddle.shape(input)[2:] + + for stage in self.stages: + x = stage(input) + x = F.interpolate( + x, + input_shape, + mode='bilinear', + align_corners=self.align_corners) + if out is None: + out = x + else: + out += x + + out = self.conv_out(out) + return out + + +class SegHead(nn.Layer): + def __init__(self, in_chan, mid_chan, n_classes): + super().__init__() + self.conv = layers.SeparableConvBNReLU( + in_chan, mid_chan, kernel_size=3, bias_attr=False) + self.conv_out = nn.Conv2D( + mid_chan, n_classes, kernel_size=1, bias_attr=False) + + def forward(self, x): + x = self.conv(x) + x = self.conv_out(x) + return x diff --git a/paddleseg/models/ocrnet.py b/paddleseg/models/ocrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b1eb73fde189562306c0059667667004229f0e47 --- /dev/null +++ b/paddleseg/models/ocrnet.py @@ -0,0 +1,246 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg import utils +from paddleseg.cvlibs import manager, param_init +from paddleseg.models import layers + + +@manager.MODELS.add_component +class OCRNet(nn.Layer): + """ + The OCRNet implementation based on PaddlePaddle. + The original article refers to + Yuan, Yuhui, et al. "Object-Contextual Representations for Semantic Segmentation" + (https://arxiv.org/pdf/1909.11065.pdf) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network. + backbone_indices (tuple): A tuple indicates the indices of output of backbone. + It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + ocr_mid_channels (int, optional): The number of middle channels in OCRHead. Default: 512. + ocr_key_channels (int, optional): The number of key channels in ObjectAttentionBlock. Default: 256. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
+ pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices, + ocr_mid_channels=512, + ocr_key_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + + self.head = OCRHead( + num_classes=num_classes, + in_channels=in_channels, + ocr_mid_channels=ocr_mid_channels, + ocr_key_channels=ocr_key_channels) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + if not self.training: + logit_list = [logit_list[0]] + + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class OCRHead(nn.Layer): + """ + The Object contextual representation head. + + Args: + num_classes(int): The unique number of target classes. + in_channels(tuple): The number of input channels. + ocr_mid_channels(int, optional): The number of middle channels in OCRHead. Default: 512. + ocr_key_channels(int, optional): The number of key channels in ObjectAttentionBlock. Default: 256. + """ + + def __init__(self, + num_classes, + in_channels, + ocr_mid_channels=512, + ocr_key_channels=256): + super().__init__() + + self.num_classes = num_classes + self.spatial_gather = SpatialGatherBlock(ocr_mid_channels, num_classes) + self.spatial_ocr = SpatialOCRModule(ocr_mid_channels, ocr_key_channels, + ocr_mid_channels) + + self.indices = [-2, -1] if len(in_channels) > 1 else [-1, -1] + + self.conv3x3_ocr = layers.ConvBNReLU( + in_channels[self.indices[1]], ocr_mid_channels, 3, padding=1) + self.cls_head = nn.Conv2D(ocr_mid_channels, self.num_classes, 1) + self.aux_head = nn.Sequential( + layers.ConvBNReLU(in_channels[self.indices[0]], + in_channels[self.indices[0]], 1), + nn.Conv2D(in_channels[self.indices[0]], self.num_classes, 1)) + + self.init_weight() + + def forward(self, feat_list): + feat_shallow, feat_deep = feat_list[self.indices[0]], feat_list[ + self.indices[1]] + + soft_regions = self.aux_head(feat_shallow) + pixels = self.conv3x3_ocr(feat_deep) + + object_regions = self.spatial_gather(pixels, soft_regions) + ocr = self.spatial_ocr(pixels, object_regions) + + logit = self.cls_head(ocr) + return [logit, soft_regions] + + def init_weight(self): + """Initialize the parameters of model parts.""" + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + param_init.normal_init(sublayer.weight, std=0.001) + elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(sublayer.weight, value=1.0) + param_init.constant_init(sublayer.bias, value=0.0) + + +class SpatialGatherBlock(nn.Layer): + """Aggregation layer to compute the pixel-region representation.""" + + def __init__(self, pixels_channels, regions_channels): + super().__init__() + self.pixels_channels = pixels_channels + self.regions_channels = regions_channels + + def forward(self, pixels, regions): + # pixels: from (n, c, h, w) to (n, h*w, c) + pixels = paddle.reshape(pixels, (0, self.pixels_channels, -1)) + pixels = paddle.transpose(pixels, (0, 2, 1)) + + # regions: from (n, 
k, h, w) to (n, k, h*w) + regions = paddle.reshape(regions, (0, self.regions_channels, -1)) + regions = F.softmax(regions, axis=2) + + # feats: from (n, k, c) to (n, c, k, 1) + feats = paddle.bmm(regions, pixels) + feats = paddle.transpose(feats, (0, 2, 1)) + feats = paddle.unsqueeze(feats, axis=-1) + + return feats + + +class SpatialOCRModule(nn.Layer): + """Aggregate the global object representation to update the representation for each pixel.""" + + def __init__(self, + in_channels, + key_channels, + out_channels, + dropout_rate=0.1): + super().__init__() + + self.attention_block = ObjectAttentionBlock(in_channels, key_channels) + self.conv1x1 = nn.Sequential( + layers.ConvBNReLU(2 * in_channels, out_channels, 1), + nn.Dropout2D(dropout_rate)) + + def forward(self, pixels, regions): + context = self.attention_block(pixels, regions) + feats = paddle.concat([context, pixels], axis=1) + feats = self.conv1x1(feats) + + return feats + + +class ObjectAttentionBlock(nn.Layer): + """A self-attention module.""" + + def __init__(self, in_channels, key_channels): + super().__init__() + + self.in_channels = in_channels + self.key_channels = key_channels + + self.f_pixel = nn.Sequential( + layers.ConvBNReLU(in_channels, key_channels, 1), + layers.ConvBNReLU(key_channels, key_channels, 1)) + + self.f_object = nn.Sequential( + layers.ConvBNReLU(in_channels, key_channels, 1), + layers.ConvBNReLU(key_channels, key_channels, 1)) + + self.f_down = layers.ConvBNReLU(in_channels, key_channels, 1) + + self.f_up = layers.ConvBNReLU(key_channels, in_channels, 1) + + def forward(self, x, proxy): + x_shape = paddle.shape(x) + # query : from (n, c1, h1, w1) to (n, h1*w1, key_channels) + query = self.f_pixel(x) + query = paddle.reshape(query, (0, self.key_channels, -1)) + query = paddle.transpose(query, (0, 2, 1)) + + # key : from (n, c2, h2, w2) to (n, key_channels, h2*w2) + key = self.f_object(proxy) + key = paddle.reshape(key, (0, self.key_channels, -1)) + + # value : from (n, c2, h2, w2) to (n, h2*w2, key_channels) + value = self.f_down(proxy) + value = paddle.reshape(value, (0, self.key_channels, -1)) + value = paddle.transpose(value, (0, 2, 1)) + + # sim_map (n, h1*w1, h2*w2) + sim_map = paddle.bmm(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + # context from (n, h1*w1, key_channels) to (n , out_channels, h1, w1) + context = paddle.bmm(sim_map, value) + context = paddle.transpose(context, (0, 2, 1)) + context = paddle.reshape(context, + (0, self.key_channels, x_shape[2], x_shape[3])) + context = self.f_up(context) + + return context diff --git a/paddleseg/models/pfpnnet.py b/paddleseg/models/pfpnnet.py new file mode 100644 index 0000000000000000000000000000000000000000..25143acc8f2e40b27904f7fa1ba2174fbe3ea767 --- /dev/null +++ b/paddleseg/models/pfpnnet.py @@ -0,0 +1,205 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
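Stepping back to the OCR head above: SpatialGatherBlock turns K soft region maps into K per-class region vectors with a single batched matmul. An illustrative shape walkthrough (toy sizes only, not the module's real configuration):

```python
import paddle
import paddle.nn.functional as F

n, c, k, hw = 2, 512, 19, 32 * 64
pixels = paddle.transpose(paddle.rand([n, c, hw]), (0, 2, 1))  # (n, h*w, c)
regions = F.softmax(paddle.rand([n, k, hw]), axis=2)           # (n, k, h*w)
obj_repr = paddle.bmm(regions, pixels)  # (n, k, c): one feature vector per region
print(obj_repr.shape)  # [2, 19, 512]
```

ObjectAttentionBlock then lets every pixel query these K vectors, which is much cheaper than full pixel-to-pixel self-attention.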
+ +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.models import layers +from paddleseg.cvlibs import manager +from paddleseg.utils import utils + + +@manager.MODELS.add_component +class PFPNNet(nn.Layer): + """ + The Panoptic Feature Pyramid Networks implementation based on PaddlePaddle. + + The original article refers to + Alexander Kirillov, Ross Girshick, Kaiming He, Piotr Dollár, et al. "Panoptic Feature Pyramid Networks" + (https://arxiv.org/abs/1901.02446) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices, + channels, + enable_auxiliary_loss=False, + align_corners=False, + dropout_ratio=0.1, + fpn_inplanes=[256, 512, 1024, 2048], + pretrained=None): + super(PFPNNet, self).__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + self.in_channels = [ + self.backbone.feat_channels[i] for i in backbone_indices + ] + self.align_corners = align_corners + self.pretrained = pretrained + self.enable_auxiliary_loss = enable_auxiliary_loss + + self.head = PFPNHead( + num_class=num_classes, + fpn_inplanes=fpn_inplanes, + dropout_ratio=dropout_ratio, + channels=channels, + fpn_dim=channels, + enable_auxiliary_loss=self.enable_auxiliary_loss) + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class PFPNHead(nn.Layer): + """ + The PFPNHead implementation. + + Args: + inplane (int): Input channels of PPM module. + num_class (int): The unique number of target classes. + fpn_inplanes (list): The feature channels from backbone. + fpn_dim (int, optional): The input channels of FPN module. Default: 512. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False. 
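One non-obvious detail in the PFPNHead constructor below: the depth of each scale head is derived from log2 ratios of the `fpn_inplanes` channel counts, and with the default [256, 512, 1024, 2048] this yields the usual 1/1/2/3 conv(+upsample) stages of Semantic FPN. A quick check of that arithmetic:

```python
import numpy as np

fpn_inplanes = [256, 512, 1024, 2048]
for ch in fpn_inplanes:
    head_length = max(1, int(np.log2(ch) - np.log2(fpn_inplanes[0])))
    print(ch, head_length)  # 256 -> 1, 512 -> 1, 1024 -> 2, 2048 -> 3
```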
+ """ + + def __init__(self, + num_class, + fpn_inplanes, + channels, + dropout_ratio=0.1, + fpn_dim=256, + enable_auxiliary_loss=False, + align_corners=False): + super(PFPNHead, self).__init__() + self.enable_auxiliary_loss = enable_auxiliary_loss + self.align_corners = align_corners + self.lateral_convs = nn.LayerList() + self.fpn_out = nn.LayerList() + + for fpn_inplane in fpn_inplanes: + self.lateral_convs.append( + nn.Sequential( + nn.Conv2D(fpn_inplane, fpn_dim, 1), + layers.SyncBatchNorm(fpn_dim), nn.ReLU())) + self.fpn_out.append( + nn.Sequential( + layers.ConvBNReLU( + fpn_dim, fpn_dim, 3, bias_attr=False))) + + self.scale_heads = nn.LayerList() + for index in range(len(fpn_inplanes)): + head_length = max( + 1, + int(np.log2(fpn_inplanes[index]) - np.log2(fpn_inplanes[0]))) + scale_head = nn.LayerList() + for head_index in range(head_length): + scale_head.append( + layers.ConvBNReLU( + fpn_dim, + channels, + 3, + padding=1, )) + if fpn_inplanes[index] != fpn_inplanes[0]: + scale_head.append( + nn.Upsample( + scale_factor=2, + mode='bilinear', + align_corners=align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + if dropout_ratio: + self.dropout = nn.Dropout2D(dropout_ratio) + if self.enable_auxiliary_loss: + self.dsn = nn.Sequential( + layers.ConvBNReLU( + fpn_inplanes[2], fpn_inplanes[2], 3, padding=1), + nn.Dropout2D(dropout_ratio), + nn.Conv2D( + fpn_inplanes[2], num_class, kernel_size=1)) + else: + self.dropout = None + if self.enable_auxiliary_loss: + self.dsn = nn.Sequential( + layers.ConvBNReLU( + fpn_inplanes[2], fpn_inplanes[2], 3, padding=1), + nn.Conv2D( + fpn_inplanes[2], num_class, kernel_size=1)) + + self.conv_last = nn.Sequential( + layers.ConvBNReLU( + len(fpn_inplanes) * fpn_dim, fpn_dim, 3, bias_attr=False), + nn.Conv2D( + fpn_dim, num_class, kernel_size=1)) + self.conv_seg = nn.Conv2D(channels, num_class, kernel_size=1) + + def cls_seg(self, feat): + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + def forward(self, conv_out): + last_out = self.lateral_convs[-1](conv_out[-1]) + f = last_out + fpn_feature_list = [last_out] + for i in reversed(range(len(conv_out) - 1)): + conv_x = conv_out[i] + conv_x = self.lateral_convs[i](conv_x) + prev_shape = paddle.shape(conv_x)[2:] + f = conv_x + F.interpolate( + f, prev_shape, mode='bilinear', align_corners=True) + fpn_feature_list.append(self.fpn_out[i](f)) + + output_size = paddle.shape(fpn_feature_list[-1])[2:] + + x = self.scale_heads[0](fpn_feature_list[-1]) + for index in range(len(self.scale_heads) - 2, 0, -1): + x = x + F.interpolate( + self.scale_heads[index](fpn_feature_list[index]), + size=output_size, + mode='bilinear', + align_corners=self.align_corners) + x = self.cls_seg(x) + if self.enable_auxiliary_loss: + dsn = self.dsn(conv_out[2]) + return [x, dsn] + else: + return [x] diff --git a/paddleseg/models/pointrend.py b/paddleseg/models/pointrend.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f9b01aaff8d4f2bec0f18a45207b15515a329b --- /dev/null +++ b/paddleseg/models/pointrend.py @@ -0,0 +1,835 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class PointRend(nn.Layer):
+    """
+    The SemanticFPN-PointRend implementation based on PaddlePaddle.
+
+    The original article refers to
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently supports ResNet50/101.
+        backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone.
+        fpn_inplanes (list, optional): Input channels list (the feature channels from backbone) for lateral_conv construction in FPN. Default: [256, 512, 1024, 2048].
+        fpn_outplanes (int, optional): The output channels in FPN. Default: 256.
+        point_num_fcs (int, optional): Number of fc layers in the head in PointHead. Default: 3.
+        point_in_channels (list, optional): Input channels of fc block in PointHead. Default: [256].
+        point_out_channels (int, optional): Fc block's output channels in PointHead. Default: 256.
+        point_in_index (list, optional): The indices of input features to use in PointHead. Default: [0].
+        point_num_points (int, optional): The number of points sampled in training mode in PointHead. Default: 2048.
+        point_oversample_ratio (int, optional): The sample ratio of points when in training mode in PointHead.
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        point_importance_sample_ratio (float, optional): The importance sample ratio for computing num_uncertain_points in PointHead. Default: 0.75.
+        point_scale_factor (int, optional): The scale factor of F.interpolate in the refine-seg-logits stage when in inference in PointHead. Default: 2.
+        point_subdivision_steps (int, optional): The number of refinement steps in the refine-seg-logits stage when in inference in PointHead. Default: 2.
+        point_subdivision_num_points (int, optional): The number of points used to refine seg logits when in inference in PointHead. Default: 8196.
+        point_dropout_ratio (float, optional): If dropout_ratio > 0, use Dropout before the output, with dropout probability dropout_ratio, in PointHead. Default: 0.1.
+        point_coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
+            the output of each fc layer in PointHead. Default: True.
+        point_conv_cfg (str): The config of Conv in PointHead. Default: 'Conv1D'.
+        point_input_transform (str): The features transform method of inputs in PointHead.
+            It can be found in function '_transform_inputs'. Default: 'multiple_select'.
+        PFN_feature_strides (list): The strides for input feature maps in FPNHead; all strides are supposed to be powers of 2. The first
+            one is of largest resolution. Default: [4, 8, 16, 32].
+        PFN_in_channels (list): The input feature's channels list in FPNHead. Default: [256, 256, 256, 256].
+        PFN_channels (int, optional): The output channels of scale_head's Conv before the Upsample block in FPNHead. Default: 128.
+ PFN_in_index(list): The indexs of input features to use. it's shape should keep with in_channels in FPNHead. Default: [0, 1, 2, 3]. + PFN_dropout_ratio(float,optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio in FPNHead. Default: 0.1. + PFN_conv_cfg(str): The config of Conv. Default: 'Conv2D'. + PFN_input_transform(str): The features transform method of inputs. it can be found in function '_transform_inputs' in FPNHead. Defalut: 'multiple_select'. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__( + self, + num_classes, + backbone, + backbone_indices, + fpn_inplanes=[256, 512, 1024, 2048], + fpn_outplanes=256, + point_in_channels=[256], + point_out_channels=256, + point_in_index=[0], + point_num_fcs=3, + point_num_points=2048, + point_oversample_ratio=3, + point_importance_sample_ratio=0.75, + point_scale_factor=2, + point_subdivision_steps=2, + point_subdivision_num_points=8196, + point_dropout_ratio=0, + point_coarse_pred_each_layer=True, + point_input_transform='multiple_select', # resize_concat + point_conv_cfg='Conv1D', + PFN_feature_strides=[4, 8, 16, 32], + PFN_in_channels=[256, 256, 256, 256], + PFN_channels=128, + PFN_in_index=[0, 1, 2, 3], + PFN_dropout_ratio=0, + PFN_conv_cfg='Conv2D', + PFN_input_transform='multiple_select', + align_corners=False, + pretrained=None): + super(PointRend, self).__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + self.in_channels = [ + self.backbone.feat_channels[i] for i in backbone_indices + ] + + self.neck = FPNNeck( + fpn_inplanes=fpn_inplanes, fpn_outplanes=fpn_outplanes) + self.pointhead = PointHead( + in_channels=point_in_channels, + out_channels=point_out_channels, + num_classes=num_classes, + in_index=point_in_index, + num_fcs=point_num_fcs, + num_points=point_num_points, + oversample_ratio=point_oversample_ratio, + importance_sample_ratio=point_importance_sample_ratio, + scale_factor=point_scale_factor, + subdivision_steps=point_subdivision_steps, + subdivision_num_points=point_subdivision_num_points, + dropout_ratio=point_dropout_ratio, + align_corners=align_corners, + coarse_pred_each_layer=point_coarse_pred_each_layer, + input_transform=point_input_transform, # resize_concat + conv_cfg=point_conv_cfg) + self.fpnhead = FPNHead( + feature_strides=PFN_feature_strides, + in_channels=PFN_in_channels, + channels=PFN_channels, + num_class=num_classes, + in_index=PFN_in_index, + dropout_ratio=PFN_dropout_ratio, + conv_cfg=PFN_conv_cfg, + input_transform=PFN_input_transform, + align_corners=align_corners) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + fpn_feats = self.neck(feats) # [n,256,64,128]*3 & [n,256,128,256] + pfn_logits = self.fpnhead( + fpn_feats) # segmainoutput decode_head[0] 512*1024->[n, 19, 64, 128] + point_logits = self.pointhead( + fpn_feats, pfn_logits) # segpointoutput decode_head[1] + + if self.training: + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in pfn_logits + ] + logit_list.append(point_logits) + else: + logit_list = [ + F.interpolate( + logit, + 
paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for logit in point_logits
+            ]
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class PointHead(nn.Layer):
+    """
+    The PointHead implementation based on PaddlePaddle.
+
+    PointHead uses a shared multi-layer perceptron (equivalent to
+    nn.Conv1D) to predict the logit of input points. The fine-grained feature
+    and coarse feature are concatenated together for prediction.
+
+    The original article refers to:
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193)
+
+    Args:
+        num_classes (int): Number of classes for logits. Default: 19.
+        num_fcs (int, optional): Number of fc layers in the head. Default: 3.
+        in_channels (list): Input channels of fc block. Default: [256].
+        out_channels (int, optional): Fc block's output channels. Default: 256.
+        in_index (list): The indices of input features to use. Default: [0].
+        num_points (int, optional): The number of points sampled in training mode. Default: 2048.
+        oversample_ratio (int, optional): The sample ratio of points when in training mode.
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        importance_sample_ratio (float, optional): The importance sample ratio for computing num_uncertain_points. Default: 0.75.
+        scale_factor (int, optional): The scale factor of F.interpolate in the refine-seg-logits stage when in inference. Default: 2.
+        subdivision_steps (int, optional): The number of refinement steps in the refine-seg-logits stage when in inference. Default: 2.
+        subdivision_num_points (int, optional): The number of points used to refine seg logits when in inference. Default: 8196.
+        dropout_ratio (float, optional): If dropout_ratio > 0, use Dropout before the output, with dropout probability dropout_ratio. Default: 0.1.
+        coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
+            the output of each fc layer. Default: True.
+        conv_cfg (str): The config of Conv. Default: 'Conv1D'.
+        input_transform (str): The features transform method of inputs.
+            It can be found in function '_transform_inputs'. Default: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
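With the defaults documented above, the training-time point budget used by get_points_train works out as follows (plain arithmetic, no framework needed):

```python
num_points = 2048
oversample_ratio = 3
importance_sample_ratio = 0.75

num_sampled = int(num_points * oversample_ratio)           # 6144 candidate points
num_uncertain = int(importance_sample_ratio * num_points)  # 1536 most uncertain kept
num_random = num_points - num_uncertain                    # 512 sampled uniformly
print(num_sampled, num_uncertain, num_random)
```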
+ """ + + def __init__( + self, + num_classes=19, + num_fcs=3, + in_channels=[256], + out_channels=256, + in_index=[0], + num_points=2048, + oversample_ratio=3, + importance_sample_ratio=0.75, + scale_factor=2, + subdivision_steps=2, + subdivision_num_points=8196, + dropout_ratio=0.1, + coarse_pred_each_layer=True, + conv_cfg='Conv1D', + input_transform='multiple_select', # resize_concat + align_corners=False): + super(PointHead, self).__init__() + + self.in_channels = in_channels + self.channels = out_channels + self.in_index = in_index + self.num_classes = num_classes + self.num_fcs = num_fcs + self.num_points = num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + self.scale_factor = scale_factor + self.subdivision_steps = subdivision_steps + self.subdivision_num_points = paddle.to_tensor( + subdivision_num_points, dtype="int32") + self.dropout_ratio = dropout_ratio + self.coarse_pred_each_layer = coarse_pred_each_layer + self.align_corners = align_corners + self.input_transform = input_transform + + fc_in_channels = sum(self.in_channels) + self.num_classes + fc_channels = self.channels + self.fcs = nn.LayerList() + for k in range(num_fcs): + fc = ConvModule( + fc_in_channels, + fc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, ) + self.fcs.append(fc) + fc_in_channels = fc_channels + fc_in_channels += self.num_classes if self.coarse_pred_each_layer else 0 + self.fc_seg = nn.Conv1D( + fc_in_channels, + self.num_classes, + kernel_size=1, + stride=1, + padding=0) + + if self.dropout_ratio > 0: + self.dropout = nn.Dropout(self.dropout_ratio) + else: + self.dropout = None + + def cls_seg(self, feat): + """Classify each pixel with fc.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.fc_seg(feat) + return output + + def _get_fine_grained_point_feats(self, x, points): + """ + Sample from fine grained features. + + Args: + x (list[Tensor]): Feature pyramid from by neck or backbone. + points (Tensor): Point coordinates, shape (batch_size, + num_points, 2). + Returns: + fine_grained_feats (Tensor): Sampled fine grained feature, + shape (batch_size, sum(channels of x), num_points). + """ + + fine_grained_feats_list = [ + point_sample( + _, points, align_corners=self.align_corners) for _ in x + ] + if len(fine_grained_feats_list) > 1: + fine_grained_feats = paddle.concat(fine_grained_feats_list, axis=1) + else: + fine_grained_feats = fine_grained_feats_list[0] + return fine_grained_feats + + def _get_coarse_point_feats(self, prev_output, points): + """ + Sample from fine grained features. + + Args: + prev_output (list[Tensor]): Prediction of previous decode head. + points (Tensor): Point coordinates, shape (batch_size, + num_points, 2). + Returns: + coarse_feats (Tensor): Sampled coarse feature, shape (batch_size, + num_classes, num_points). + """ + + coarse_feats = point_sample( + prev_output, points, align_corners=self.align_corners) + return coarse_feats + + def _transform_inputs(self, inputs): + """ + Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + F.interpolate( + x, + size=paddle.shape(inputs[0])[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = paddle.concat(upsampled_inputs, axis=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index[0]] + return inputs + + def get_points_train(self, seg_logits, uncertainty_func): # finish + """ + Sample points for training. + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + 'uncertainty_func' function that takes point's logit prediction as + input. + + Args: + seg_logits (Tensor): Semantic segmentation logits, shape ( + batch_size, num_classes, height, width). + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Training config of point head. + Returns: + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains the coordinates of ``num_points`` sampled + points. + """ + + num_points = self.num_points + oversample_ratio = self.oversample_ratio + importance_sample_ratio = self.importance_sample_ratio + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = paddle.shape(seg_logits)[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = paddle.rand([batch_size, num_sampled, 2]) + point_logits = point_sample(seg_logits, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = uncertainty_func(point_logits) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = paddle.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, axis=1)[1] + shift = num_sampled * paddle.arange(batch_size, dtype='int64') + idx += shift.unsqueeze([-1]) + idx = idx.reshape([-1]) + point_coords = paddle.index_select( + point_coords.reshape([-1, 2]), idx, axis=0) + point_coords = point_coords.reshape( + [batch_size, num_uncertain_points, 2]) + if num_random_points > 0: + rand_point_coords = paddle.rand([batch_size, num_random_points, 2]) + point_coords = paddle.concat( + (point_coords, rand_point_coords), axis=1) + return point_coords + + def get_points_test(self, seg_logits, uncertainty_func): # finish + """ + Sample points for testing. + Find ``num_points`` most uncertain points from ``uncertainty_map``. + + Args: + seg_logits (Tensor): A tensor of shape (batch_size, num_classes, + height, width) for class-specific or class-agnostic prediction. + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Testing config of point head. + Returns: + point_indices (Tensor): A tensor of shape (batch_size, num_points) + that contains indices from [0, height x width) of the most + uncertain points. 
+ point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the ``height x width`` grid . + """ + + num_points = self.subdivision_num_points + uncertainty_map = uncertainty_func(seg_logits) + batch_size = paddle.shape(uncertainty_map)[0] + height = paddle.shape(uncertainty_map)[2] + width = paddle.shape(uncertainty_map)[3] + h_step = 1.0 / height + w_step = 1.0 / width + + uncertainty_map = uncertainty_map.reshape([batch_size, height * width]) + num_points = paddle.min(paddle.concat([height * width, num_points])) + point_indices = paddle.topk(uncertainty_map, num_points, axis=1)[1] + point_coords = paddle.zeros( + [batch_size, num_points, 2], dtype='float32') + point_coords[:, :, 0] = w_step / 2.0 + (point_indices % width + ).astype('float32') * w_step + point_coords[:, :, 1] = h_step / 2.0 + (point_indices // width + ).astype('float32') * h_step + return point_indices, point_coords + + def scatter_paddle(self, refined_seg_logits, point_indices, point_logits): + """ + paddle version scatter : equal to pytorch version scatter(-1,point_indices,point_logits). + + Args: + refined_seg_logits(Tensor): shape=[batch_size, channels, height * width] + point_indices(Tensor): shape=[batch_size, channels, height * width] + point_logits(Tensor): shape[batch_size, channels, height * width] + Returns: + scattered refined_seg_logits(Tensor). + """ + + original_shape = paddle.shape( + refined_seg_logits) # [batch_size, channels, height * width] + new_refined_seg_logits = refined_seg_logits.flatten(0, 1) # [N*C,H*W] + offsets = ( + paddle.arange(paddle.shape(new_refined_seg_logits)[0]) * + paddle.shape(new_refined_seg_logits)[1]).unsqueeze(-1) # [N*C,1] + point_indices = point_indices.flatten(0, 1) # [N*C,H*W] + new_point_indices = (point_indices + offsets).flatten() + point_logits = point_logits.flatten() # [N*C*H*W] + refined_seg_logits = paddle.scatter( + refined_seg_logits.flatten(), + new_point_indices, + point_logits, + overwrite=True) + return refined_seg_logits.reshape(shape=original_shape) + + def forward_train(self, x, prev_output): + with paddle.no_grad(): + points = self.get_points_train(prev_output, calculate_uncertainty) + + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) # [2, 256, 2048] + coarse_point_feats = self._get_coarse_point_feats( + prev_output, points) # [2, 19, 2048] + # forward for train + fusion_point_feats = paddle.concat( + [fine_grained_point_feats, coarse_point_feats], axis=1) + for fc in self.fcs: + fusion_point_feats = fc(fusion_point_feats) + if self.coarse_pred_each_layer: + fusion_point_feats = paddle.concat( + (fusion_point_feats, coarse_point_feats), axis=1) + point_logits = self.cls_seg(fusion_point_feats) + return [point_logits, points] # for points loss + + def forward(self, inputs, prev_output): + """ + Forward function. + + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + Returns: + [point_logits,points]: For points loss when in training. + [refined_seg_logits]: Output refined seg logits when in inference. 
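The flatten-plus-offset update in scatter_paddle above can be hard to picture. Here is a toy illustration of the core paddle.scatter(..., overwrite=True) call on one flattened logit map, ignoring the per-row offset bookkeeping (values are hypothetical):

```python
import paddle

logits = paddle.zeros([6])  # one flattened (H*W,) logit map
indices = paddle.to_tensor([1, 4], dtype='int64')
updates = paddle.to_tensor([9.0, 7.0])
refined = paddle.scatter(logits, indices, updates, overwrite=True)
print(refined.numpy())  # [0. 9. 0. 0. 7. 0.]
```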
+ """ + + prev_output = prev_output[0] + x = self._transform_inputs(inputs) + if self.training: + return self.forward_train(x, prev_output) + else: + refined_seg_logits = prev_output.clone() + for _ in range(self.subdivision_steps): + refined_seg_logits = F.interpolate( + refined_seg_logits, + scale_factor=self.scale_factor, + mode='bilinear', + align_corners=self.align_corners) + + save_shape = paddle.shape(refined_seg_logits) + point_indices, points = self.get_points_test( + refined_seg_logits, calculate_uncertainty) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats(prev_output, + points) + # forward for inference + fusion_point_feats = paddle.concat( + [fine_grained_point_feats, coarse_point_feats], axis=1) + for fc in self.fcs: + fusion_point_feats = fc(fusion_point_feats) + if self.coarse_pred_each_layer: + fusion_point_feats = paddle.concat( + (fusion_point_feats, coarse_point_feats), axis=1) + point_logits = self.cls_seg(fusion_point_feats) + point_indices = paddle.unsqueeze(point_indices, axis=1) + point_indices = paddle.expand(point_indices, + [-1, save_shape[1], -1]) + + refined_seg_logits = paddle.flatten(refined_seg_logits, 2) + refined_seg_logits = self.scatter_paddle( + refined_seg_logits, point_indices, + point_logits) # 2->height * width dim + refined_seg_logits = refined_seg_logits.reshape(save_shape) + return [refined_seg_logits] + + +class FPNHead(nn.Layer): + """ + This head is the implementation of Semantic FPN in paddle. + + The original article refers to: + Kirillov, A. , et al. "Panoptic Feature Pyramid Networks." + (https://arxiv.org/abs/1901.02446) + + Args: + num_classes(int): The unique number of target classes. Default: 19. + feature_strides(list): The strides for input feature maps and all strides suppose to be power of 2. The first + one is of largest resolution. Default: [4, 8, 16, 32]. + in_channels(list): The input feature's channels list. Default: [256, 256, 256, 256]. + channels(int, optional): The output channels of scale_head's Conv before Upsample block. Default: 128. + in_index(list): The indexs of input features to use. it's shape should keep with in_channels. Default: [0, 1, 2, 3]. + dropout_ratio(float, optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio. Default: 0.1. + conv_cfg(str): The config of Conv. Default: 'Conv2D'. + input_transform(str): The features transform method of inputs. it can be found in function '_transform_inputs'. Defalut: 'multiple_select'. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
+ """ + + def __init__( + self, + num_class=19, + feature_strides=[4, 8, 16, 32], + in_channels=[256, 256, 256, 256], + channels=128, + in_index=[0, 1, 2, 3], + dropout_ratio=0.1, + conv_cfg='Conv2D', + input_transform='multiple_select', + align_corners=False, ): + super(FPNHead, self).__init__() + assert len(feature_strides) == len(in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + self.in_channels = in_channels + self.channels = channels + self.in_index = in_index + self.num_class = num_class + self.conv_cfg = conv_cfg + self.dropout_ratio = dropout_ratio + self.input_transform = input_transform + self.align_corners = align_corners + self.scale_heads = nn.LayerList() + + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.in_channels[i] if k == 0 else self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + self.conv_seg = nn.Conv2D(self.channels, self.num_class, kernel_size=1) + + if self.dropout_ratio is not None: + self.dropout = nn.Dropout2D(self.dropout_ratio) + else: + self.dropout = None + + def cls_seg(self, feat): + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + def _transform_inputs(self, inputs): + """ + Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. + Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + F.interpolate( + x, + size=paddle.shape(inputs[0])[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = paddle.concat(upsampled_inputs, axis=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index[0]] + + return inputs + + def forward(self, inputs): + x = self._transform_inputs(inputs) + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + output = output + F.interpolate( + self.scale_heads[i](x[i]), + size=paddle.shape(output)[2:], + mode='bilinear', + align_corners=self.align_corners) + output = self.cls_seg(output) + return [output] + + +class FPNNeck(nn.Layer): + """ + The FPN Neck implementation in paddle. + + Args: + fpn_inplanes (list, optional): Input channels list(the feature channels from backbone) for lateral_conv constraction. Default: [256, 512, 1024, 2048]. + fpn_outplanes (int, optional): The output channels. Default: 256. 
+ """ + + def __init__( + self, + fpn_inplanes=[256, 512, 1024, 2048], + fpn_outplanes=256, ): + super(FPNNeck, self).__init__() + self.lateral_convs = [] + self.fpn_out = [] + + # FPN head + for fpn_inplane in fpn_inplanes: + self.lateral_convs.append( + nn.Sequential( + nn.Conv2D(fpn_inplane, fpn_outplanes, 1), + layers.SyncBatchNorm(fpn_outplanes), nn.ReLU())) + self.fpn_out.append( + nn.Sequential( + layers.ConvBNReLU( + fpn_outplanes, fpn_outplanes, 3, bias_attr=False))) + + self.lateral_convs = nn.LayerList(self.lateral_convs) + self.fpn_out = nn.LayerList(self.fpn_out) + + def forward(self, conv_out): + last_out = self.lateral_convs[-1](conv_out[-1]) + f = last_out + fpn_feature_list = [last_out] + for i in reversed(range(len(conv_out) - 1)): + conv_x = conv_out[i] + conv_x = self.lateral_convs[i](conv_x) + prev_shape = paddle.shape(conv_x)[2:] + f = conv_x + F.interpolate( + f, prev_shape, mode='bilinear', align_corners=True) + fpn_feature_list.append(self.fpn_out[i](f)) + return fpn_feature_list + + +class ConvModule(nn.Layer): + """ + ConvModule includes Conv1/Conv2D. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding=0, + stride=1, + conv_cfg='Conv1D', + norm_cfg='None', + **kwargs): + super().__init__() + if (conv_cfg == 'Conv1D'): + self._conv = nn.Conv1D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + **kwargs) + if (conv_cfg == 'Conv2D'): + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + **kwargs) + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + if (norm_cfg != 'None'): + self._batch_norm = layers.SyncBatchNorm( + out_channels, data_format=data_format) + else: + self._batch_norm = None + + def forward(self, x): + x = self._conv(x) + if (self._batch_norm != None): + x = self._batch_norm(x) + x = F.relu(x) + return x + + +class Upsample(nn.Layer): + """ + Upsample Module. + """ + + def __init__(self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + super(Upsample, self).__init__() + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + if not self.size: + return F.interpolate(x, None, self.scale_factor, self.mode, + self.align_corners) + else: + return F.interpolate(x, self.size, None, self.mode, + self.align_corners) + + +def point_sample(input, points, align_corners=False, **kwargs): + """ + A wrapper around :func:`grid_sample` to support 3D point_coords tensors + Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to + lie inside ``[0, 1] x [0, 1]`` square. + + Args: + input (Tensor): Feature map, shape (N, C, H, W). + points (Tensor): Image based absolute point coordinates (normalized), + range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). + align_corners (bool): Whether align_corners. Default: False + Returns: + Tensor: Features of `point` on `input`, shape (N, C, P) or + (N, C, Hgrid, Wgrid). + """ + + def denormalize(grid): + """Denormalize input grid from range [0, 1] to [-1, 1] + Args: + grid (Tensor): The grid to be denormalize, range [0, 1]. + Returns: + Tensor: Denormalized grid, range [-1, 1]. 
+ """ + return grid * 2.0 - 1.0 + + add_dim = False + if points.dim() == 3: + add_dim = True + points = paddle.unsqueeze(points, axis=2) + output = F.grid_sample( + input, denormalize(points), align_corners=align_corners, **kwargs) + if add_dim: + output = paddle.squeeze(output, axis=3) + return output + + +def calculate_uncertainty(seg_logits): + """ + Estimate uncertainty based on seg logits. + For each location of the prediction ``seg_logits`` we estimate + uncertainty as the difference between top first and top second + predicted logits. + + Args: + seg_logits (Tensor): Semantic segmentation logits, + shape (batch_size, num_classes, height, width). + Returns: + scores (Tensor): T uncertainty scores with the most uncertain + locations having the highest uncertainty score, shape ( + batch_size, 1, height, width) + """ + + top2_scores = paddle.topk(seg_logits, k=2, axis=1)[0] + return paddle.unsqueeze(top2_scores[:, 1] - top2_scores[:, 0], axis=1) diff --git a/paddleseg/models/portraitnet.py b/paddleseg/models/portraitnet.py new file mode 100644 index 0000000000000000000000000000000000000000..255eacaa2457a97bd7ca700959c448d7e2457e77 --- /dev/null +++ b/paddleseg/models/portraitnet.py @@ -0,0 +1,220 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn + +from paddleseg import utils +from paddleseg.cvlibs import manager + + +@manager.MODELS.add_component +class PortraitNet(nn.Layer): + """ + The PortraitNet implementation based on PaddlePaddle. + + The original article refers to + Song-Hai Zhanga, Xin Donga, Jia Lib, Ruilong Lia, Yong-Liang Yangc + "PortraitNet: Real-time Portrait Segmentation Network for Mobile Device" + (https://www.yongliangyang.net/docs/mobilePotrait_c&g19.pdf). + + Args: + num_classes (int, optional): The unique number of target classes. Default: 2. + backbone (Paddle.nn.Layer): Backbone network, currently support MobileNetV2. + add_edge (bool, optional): Whether output to edge. Default: False + pretrained (str, optional): The path or url of pretrained model. 
Default: None + """ + + def __init__(self, + num_classes, + backbone, + min_channel=16, + channel_ratio=1.0, + add_edge=False, + pretrained=None): + super(PortraitNet, self).__init__() + self.backbone = backbone + self.head = PortraitNetHead(num_classes, min_channel, channel_ratio, + add_edge) + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logits_list = self.head(feat_list) + return [logits_list] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class PortraitNetHead(nn.Layer): + def __init__(self, + num_classes, + min_channel=16, + channel_ratio=1.0, + add_edge=False): + super().__init__() + self.min_channel = min_channel + self.channel_ratio = channel_ratio + self.add_edge = add_edge + self.deconv1 = nn.Conv2DTranspose( + self.depth(96), + self.depth(96), + groups=1, + kernel_size=4, + stride=2, + padding=1, + bias_attr=False) + self.deconv2 = nn.Conv2DTranspose( + self.depth(32), + self.depth(32), + groups=1, + kernel_size=4, + stride=2, + padding=1, + bias_attr=False) + self.deconv3 = nn.Conv2DTranspose( + self.depth(24), + self.depth(24), + groups=1, + kernel_size=4, + stride=2, + padding=1, + bias_attr=False) + self.deconv4 = nn.Conv2DTranspose( + self.depth(16), + self.depth(16), + groups=1, + kernel_size=4, + stride=2, + padding=1, + bias_attr=False) + self.deconv5 = nn.Conv2DTranspose( + self.depth(8), + self.depth(8), + groups=1, + kernel_size=4, + stride=2, + padding=1, + bias_attr=False) + + self.transit1 = ResidualBlock(self.depth(320), self.depth(96)) + self.transit2 = ResidualBlock(self.depth(96), self.depth(32)) + self.transit3 = ResidualBlock(self.depth(32), self.depth(24)) + self.transit4 = ResidualBlock(self.depth(24), self.depth(16)) + self.transit5 = ResidualBlock(self.depth(16), self.depth(8)) + + self.pred = nn.Conv2D( + self.depth(8), num_classes, 3, 1, 1, bias_attr=False) + if self.add_edge: + self.edge = nn.Conv2D( + self.depth(8), num_classes, 3, 1, 1, bias_attr=False) + + def depth(self, channels): + min_channel = min(channels, self.min_channel) + return max(min_channel, int(channels * self.channel_ratio)) + + def forward(self, feat_list): + feature_1_4, feature_1_8, feature_1_16, feature_1_32 = feat_list + up_1_16 = self.deconv1(self.transit1(feature_1_32)) + up_1_8 = self.deconv2(self.transit2(feature_1_16 + up_1_16)) + up_1_4 = self.deconv3(self.transit3(feature_1_8 + up_1_8)) + up_1_2 = self.deconv4(self.transit4(feature_1_4 + up_1_4)) + up_1_1 = self.deconv5(self.transit5(up_1_2)) + + pred = self.pred(up_1_1) + if self.add_edge: + edge = self.edge(up_1_1) + return pred, edge + else: + return pred + + +class ConvDw(nn.Layer): + def __init__(self, inp, oup, kernel, stride): + super(ConvDw, self).__init__() + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp, + kernel, + stride, (kernel - 1) // 2, + groups=inp, + bias_attr=False), + nn.BatchNorm2D( + num_features=inp, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + nn.Conv2D( + inp, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D( + num_features=oup, epsilon=1e-05, momentum=0.1), + nn.ReLU(), ) + + def forward(self, x): + return self.conv(x) + + +class ResidualBlock(nn.Layer): + def __init__(self, inp, oup, stride=1): + super(ResidualBlock, self).__init__() + + self.block = nn.Sequential( + ConvDw( + inp, oup, 3, stride=stride), + nn.Conv2D( + in_channels=oup, + out_channels=oup, + kernel_size=3, + stride=1, + padding=1, + groups=oup, + bias_attr=False), + nn.BatchNorm2D( + 
num_features=oup, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + nn.Conv2D( + in_channels=oup, + out_channels=oup, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False), + nn.BatchNorm2D( + num_features=oup, epsilon=1e-05, momentum=0.1), ) + if inp == oup: + self.residual = None + else: + self.residual = nn.Sequential( + nn.Conv2D( + in_channels=inp, + out_channels=oup, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False), + nn.BatchNorm2D( + num_features=oup, epsilon=1e-05, momentum=0.1), ) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + + out = self.block(x) + if self.residual is not None: + residual = self.residual(x) + + out += residual + out = self.relu(out) + return out diff --git a/paddleseg/transforms/__init__.py b/paddleseg/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f1d5ae80aeb1eb77ac672b1cbcfedcbfbd643c4 --- /dev/null +++ b/paddleseg/transforms/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .transforms import * +from . import functional diff --git a/paddleseg/transforms/functional.py b/paddleseg/transforms/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..dc1ac572ae5413a85e6fe2b5cee9663305b382f3 --- /dev/null +++ b/paddleseg/transforms/functional.py @@ -0,0 +1,178 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
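Looking back at PortraitNetHead.depth above: it scales the decoder widths by channel_ratio but never lets them fall below min(channels, min_channel). A standalone re-implementation (a hypothetical helper mirroring that method, for illustration only) shows the effect:

```python
def depth(channels, min_channel=16, channel_ratio=1.0):
    # mirrors PortraitNetHead.depth: scaled width, clamped from below
    return max(min(channels, min_channel), int(channels * channel_ratio))

for ch in (8, 16, 24, 96, 320):
    print(ch, depth(ch, channel_ratio=0.5))  # 8->8, 16->16, 24->16, 96->48, 320->160
```

Narrow layers keep their full width while wide layers are thinned, which is how a single channel_ratio trades accuracy for mobile speed.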
+ +import cv2 +import numpy as np +from PIL import Image, ImageEnhance +from scipy.ndimage import distance_transform_edt + + +def normalize(im, mean, std): + im = im.astype(np.float32, copy=False) / 255.0 + im -= mean + im /= std + return im + + +def resize(im, target_size=608, interp=cv2.INTER_LINEAR): + if isinstance(target_size, list) or isinstance(target_size, tuple): + w = target_size[0] + h = target_size[1] + else: + w = target_size + h = target_size + im = cv2.resize(im, (w, h), interpolation=interp) + return im + + +def resize_long(im, long_size=224, interpolation=cv2.INTER_LINEAR): + value = max(im.shape[0], im.shape[1]) + scale = float(long_size) / float(value) + resized_width = int(round(im.shape[1] * scale)) + resized_height = int(round(im.shape[0] * scale)) + + im = cv2.resize( + im, (resized_width, resized_height), interpolation=interpolation) + return im + + +def resize_short(im, short_size=224, interpolation=cv2.INTER_LINEAR): + value = min(im.shape[0], im.shape[1]) + scale = float(short_size) / float(value) + resized_width = int(round(im.shape[1] * scale)) + resized_height = int(round(im.shape[0] * scale)) + + im = cv2.resize( + im, (resized_width, resized_height), interpolation=interpolation) + return im + + +def horizontal_flip(im): + if len(im.shape) == 3: + im = im[:, ::-1, :] + elif len(im.shape) == 2: + im = im[:, ::-1] + return im + + +def vertical_flip(im): + if len(im.shape) == 3: + im = im[::-1, :, :] + elif len(im.shape) == 2: + im = im[::-1, :] + return im + + +def brightness(im, brightness_lower, brightness_upper): + brightness_delta = np.random.uniform(brightness_lower, brightness_upper) + im = ImageEnhance.Brightness(im).enhance(brightness_delta) + return im + + +def contrast(im, contrast_lower, contrast_upper): + contrast_delta = np.random.uniform(contrast_lower, contrast_upper) + im = ImageEnhance.Contrast(im).enhance(contrast_delta) + return im + + +def saturation(im, saturation_lower, saturation_upper): + saturation_delta = np.random.uniform(saturation_lower, saturation_upper) + im = ImageEnhance.Color(im).enhance(saturation_delta) + return im + + +def hue(im, hue_lower, hue_upper): + hue_delta = np.random.uniform(hue_lower, hue_upper) + im = np.array(im.convert('HSV')) + im[:, :, 0] = im[:, :, 0] + hue_delta + im = Image.fromarray(im, mode='HSV').convert('RGB') + return im + + +def sharpness(im, sharpness_lower, sharpness_upper): + sharpness_delta = np.random.uniform(sharpness_lower, sharpness_upper) + im = ImageEnhance.Sharpness(im).enhance(sharpness_delta) + return im + + +def rotate(im, rotate_lower, rotate_upper): + rotate_delta = np.random.uniform(rotate_lower, rotate_upper) + im = im.rotate(int(rotate_delta)) + return im + + +def mask_to_onehot(mask, num_classes): + """ + Convert a mask (H, W) to onehot (K, H, W). + + Args: + mask (np.ndarray): Label mask with shape (H, W) + num_classes (int): Number of classes. + + Returns: + np.ndarray: Onehot mask with shape(K, H, W). + """ + _mask = [mask == i for i in range(num_classes)] + _mask = np.array(_mask).astype(np.uint8) + return _mask + + +def onehot_to_binary_edge(mask, radius): + """ + Convert a onehot mask (K, H, W) to a edge mask. + + Args: + mask (np.ndarray): Onehot mask with shape (K, H, W) + radius (int|float): Radius of edge. + + Returns: + np.ndarray: Edge mask with shape(H, W). 
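As a concrete example of the conversion pipeline documented above (mask_to_onehot feeding onehot_to_binary_edge), here is a 3x3 toy label map with three classes; this assumes the functions above are in scope, e.g. imported from paddleseg.transforms.functional:

```python
import numpy as np

mask = np.array([[0, 0, 1],
                 [0, 1, 1],
                 [2, 2, 1]])
onehot = mask_to_onehot(mask, num_classes=3)  # shape (3, 3, 3), one binary channel per class
print(onehot.shape)
print(onehot[1])  # binary map of class 1
```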
+ """ + if radius < 1: + raise ValueError('`radius` should be greater than or equal to 1') + num_classes = mask.shape[0] + + edge = np.zeros(mask.shape[1:]) + # pad borders + mask = np.pad(mask, ((0, 0), (1, 1), (1, 1)), + mode='constant', + constant_values=0) + for i in range(num_classes): + dist = distance_transform_edt(mask[i, :]) + distance_transform_edt( + 1.0 - mask[i, :]) + dist = dist[1:-1, 1:-1] + dist[dist > radius] = 0 + edge += dist + + edge = np.expand_dims(edge, axis=0) + edge = (edge > 0).astype(np.uint8) + return edge + + +def mask_to_binary_edge(mask, radius, num_classes): + """ + Convert a segmentic segmentation mask (H, W) to a binary edge mask(H, W). + + Args: + mask (np.ndarray): Label mask with shape (H, W) + radius (int|float): Radius of edge. + num_classes (int): Number of classes. + + Returns: + np.ndarray: Edge mask with shape(H, W). + """ + mask = mask.squeeze() + onehot = mask_to_onehot(mask, num_classes) + edge = onehot_to_binary_edge(onehot, radius) + return edge diff --git a/paddleseg/transforms/transforms.py b/paddleseg/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..c754d9a008183eb40581c050e8bc603a6633d278 --- /dev/null +++ b/paddleseg/transforms/transforms.py @@ -0,0 +1,1072 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import math + +import cv2 +import numpy as np +from PIL import Image + +from paddleseg.cvlibs import manager +from paddleseg.transforms import functional + + +@manager.TRANSFORMS.add_component +class Compose: + """ + Do transformation on input data with corresponding pre-processing and augmentation operations. + The shape of input data to all operations is [height, width, channels]. + + Args: + transforms (list): A list contains data pre-processing or augmentation. Empty list means only reading images, no transformation. + to_rgb (bool, optional): If converting image to RGB color space. Default: True. + + Raises: + TypeError: When 'transforms' is not a list. + ValueError: when the length of 'transforms' is less than 1. + """ + + def __init__(self, transforms, to_rgb=True): + if not isinstance(transforms, list): + raise TypeError('The transforms must be a list!') + self.transforms = transforms + self.to_rgb = to_rgb + + def __call__(self, data): + """ + Args: + data: A dict to deal with. It may include keys: 'img', 'label', 'trans_info' and 'gt_fields'. + 'trans_info' reserve the image shape informating. 
And the 'gt_fields' save the key need to transforms + together with 'img' + + Returns: A dict after process。 + """ + if 'img' not in data.keys(): + raise ValueError("`data` must include `img` key.") + if isinstance(data['img'], str): + data['img'] = cv2.imread(data['img']).astype('float32') + if data['img'] is None: + raise ValueError('Can\'t read The image file {}!'.format(data[ + 'img'])) + if not isinstance(data['img'], np.ndarray): + raise TypeError("Image type is not numpy.") + if len(data['img'].shape) != 3: + raise ValueError('Image is not 3-dimensional.') + if 'label' in data.keys() and isinstance(data['label'], str): + data['label'] = np.asarray(Image.open(data['label'])) + + if self.to_rgb: + data['img'] = cv2.cvtColor(data['img'], cv2.COLOR_BGR2RGB) + + # the `trans_info` will save the process of image shape, and will be used in evaluation and prediction. + if 'trans_info' not in data.keys(): + data['trans_info'] = [] + + for op in self.transforms: + data = op(data) + data['img'] = np.transpose(data['img'], (2, 0, 1)) + return data + + +@manager.TRANSFORMS.add_component +class RandomHorizontalFlip: + """ + Flip an image horizontally with a certain probability. + + Args: + prob (float, optional): A probability of horizontally flipping. Default: 0.5. + """ + + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, data): + if random.random() < self.prob: + data['img'] = functional.horizontal_flip(data['img']) + for key in data.get('gt_fields', []): + data[key] = functional.horizontal_flip(data[key]) + return data + + +@manager.TRANSFORMS.add_component +class RandomVerticalFlip: + """ + Flip an image vertically with a certain probability. + + Args: + prob (float, optional): A probability of vertical flipping. Default: 0.1. + """ + + def __init__(self, prob=0.1): + self.prob = prob + + def __call__(self, data): + if random.random() < self.prob: + data['img'] = functional.vertical_flip(data['img']) + for key in data.get('gt_fields', []): + data[key] = functional.vertical_flip(data[key]) + return data + + +@manager.TRANSFORMS.add_component +class Resize: + """ + Resize an image. + + Args: + target_size (list|tuple, optional): The target size of image. Default: (512, 512). + interp (str, optional): The interpolation mode of resize is consistent with opencv. + ['NEAREST', 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM']. Note that when it is + 'RANDOM', a random interpolation mode would be specified. Default: "LINEAR". + + Raises: + TypeError: When 'target_size' type is neither list nor tuple. + ValueError: When "interp" is out of pre-defined methods ('NEAREST', 'LINEAR', 'CUBIC', + 'AREA', 'LANCZOS4', 'RANDOM'). + """ + + # The interpolation mode + interp_dict = { + 'NEAREST': cv2.INTER_NEAREST, + 'LINEAR': cv2.INTER_LINEAR, + 'CUBIC': cv2.INTER_CUBIC, + 'AREA': cv2.INTER_AREA, + 'LANCZOS4': cv2.INTER_LANCZOS4 + } + + def __init__(self, target_size=(512, 512), interp='LINEAR'): + self.interp = interp + if not (interp == "RANDOM" or interp in self.interp_dict): + raise ValueError("`interp` should be one of {}".format( + self.interp_dict.keys())) + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + '`target_size` should include 2 elements, but it is {}'. + format(target_size)) + else: + raise TypeError( + "Type of `target_size` is invalid. 
It should be list or tuple, but it is {}" + .format(type(target_size))) + + self.target_size = target_size + + def __call__(self, data): + data['trans_info'].append(('resize', data['img'].shape[0:2])) + if self.interp == "RANDOM": + interp = random.choice(list(self.interp_dict.keys())) + else: + interp = self.interp + data['img'] = functional.resize(data['img'], self.target_size, + self.interp_dict[interp]) + for key in data.get('gt_fields', []): + data[key] = functional.resize(data[key], self.target_size, + cv2.INTER_NEAREST) + + return data + + +@manager.TRANSFORMS.add_component +class ResizeByLong: + """ + Resize the long side of an image to given size, and then scale the other side proportionally. + + Args: + long_size (int): The target size of long side. + """ + + def __init__(self, long_size): + self.long_size = long_size + + def __call__(self, data): + data['trans_info'].append(('resize', data['img'].shape[0:2])) + data['img'] = functional.resize_long(data['img'], self.long_size) + for key in data.get('gt_fields', []): + data[key] = functional.resize_long(data[key], self.long_size, + cv2.INTER_NEAREST) + + return data + + +@manager.TRANSFORMS.add_component +class ResizeByShort: + """ + Resize the short side of an image to given size, and then scale the other side proportionally. + + Args: + short_size (int): The target size of short side. + """ + + def __init__(self, short_size): + self.short_size = short_size + + def __call__(self, data): + data['trans_info'].append(('resize', data['img'].shape[0:2])) + data['img'] = functional.resize_short(data['img'], self.short_size) + for key in data.get('gt_fields', []): + data[key] = functional.resize_short(data[key], self.short_size, + cv2.INTER_NEAREST) + + return data + + +@manager.TRANSFORMS.add_component +class LimitLong: + """ + Limit the long edge of image. + + If the long edge is larger than max_long, resize the long edge + to max_long, while scale the short edge proportionally. + + If the long edge is smaller than min_long, resize the long edge + to min_long, while scale the short edge proportionally. + + Args: + max_long (int, optional): If the long edge of image is larger than max_long, + it will be resize to max_long. Default: None. + min_long (int, optional): If the long edge of image is smaller than min_long, + it will be resize to min_long. Default: None. + """ + + def __init__(self, max_long=None, min_long=None): + if max_long is not None: + if not isinstance(max_long, int): + raise TypeError( + "Type of `max_long` is invalid. It should be int, but it is {}" + .format(type(max_long))) + if min_long is not None: + if not isinstance(min_long, int): + raise TypeError( + "Type of `min_long` is invalid. 
It should be int, but it is {}"
+                    .format(type(min_long)))
+        if (max_long is not None) and (min_long is not None):
+            if min_long > max_long:
+                raise ValueError(
+                    '`max_long` should not be smaller than `min_long`, but they are {} and {}'
+                    .format(max_long, min_long))
+        self.max_long = max_long
+        self.min_long = min_long
+
+    def __call__(self, data):
+        data['trans_info'].append(('resize', data['img'].shape[0:2]))
+
+        h, w = data['img'].shape[0], data['img'].shape[1]
+        long_edge = max(h, w)
+        target = long_edge
+        if (self.max_long is not None) and (long_edge > self.max_long):
+            target = self.max_long
+        elif (self.min_long is not None) and (long_edge < self.min_long):
+            target = self.min_long
+
+        if target != long_edge:
+            data['img'] = functional.resize_long(data['img'], target)
+            for key in data.get('gt_fields', []):
+                data[key] = functional.resize_long(data[key], target,
+                                                   cv2.INTER_NEAREST)
+
+        return data
+
+
+@manager.TRANSFORMS.add_component
+class ResizeRangeScaling:
+    """
+    Resize the long side of an image into a range, and then scale the other side proportionally.
+
+    Args:
+        min_value (int, optional): The minimum value of the long side after resize. Default: 400.
+        max_value (int, optional): The maximum value of the long side after resize. Default: 600.
+    """
+
+    def __init__(self, min_value=400, max_value=600):
+        if min_value > max_value:
+            raise ValueError('min_value must be less than max_value, '
+                             'but they are {} and {}.'.format(min_value,
+                                                              max_value))
+        self.min_value = min_value
+        self.max_value = max_value
+
+    def __call__(self, data):
+
+        if self.min_value == self.max_value:
+            random_size = self.max_value
+        else:
+            random_size = int(
+                np.random.uniform(self.min_value, self.max_value) + 0.5)
+        data['img'] = functional.resize_long(data['img'], random_size,
+                                             cv2.INTER_LINEAR)
+        for key in data.get('gt_fields', []):
+            data[key] = functional.resize_long(data[key], random_size,
+                                               cv2.INTER_NEAREST)
+
+        return data
+
+
+@manager.TRANSFORMS.add_component
+class ResizeStepScaling:
+    """
+    Scale an image proportionally within a range.
+
+    Args:
+        min_scale_factor (float, optional): The minimum scale. Default: 0.75.
+        max_scale_factor (float, optional): The maximum scale. Default: 1.25.
+        scale_step_size (float, optional): The scale interval. Default: 0.25.
+
+    Raises:
+        ValueError: When min_scale_factor is greater than max_scale_factor.
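+
+    For example, with the defaults the candidate factors are
+    np.linspace(0.75, 1.25, 3) = [0.75, 1.0, 1.25], and one of them is drawn
+    at random on every call.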
+ """ + + def __init__(self, + min_scale_factor=0.75, + max_scale_factor=1.25, + scale_step_size=0.25): + if min_scale_factor > max_scale_factor: + raise ValueError( + 'min_scale_factor must be less than max_scale_factor, ' + 'but they are {} and {}.'.format(min_scale_factor, + max_scale_factor)) + self.min_scale_factor = min_scale_factor + self.max_scale_factor = max_scale_factor + self.scale_step_size = scale_step_size + + def __call__(self, data): + + if self.min_scale_factor == self.max_scale_factor: + scale_factor = self.min_scale_factor + + elif self.scale_step_size == 0: + scale_factor = np.random.uniform(self.min_scale_factor, + self.max_scale_factor) + + else: + num_steps = int((self.max_scale_factor - self.min_scale_factor) / + self.scale_step_size + 1) + scale_factors = np.linspace(self.min_scale_factor, + self.max_scale_factor, + num_steps).tolist() + np.random.shuffle(scale_factors) + scale_factor = scale_factors[0] + w = int(round(scale_factor * data['img'].shape[1])) + h = int(round(scale_factor * data['img'].shape[0])) + + data['img'] = functional.resize(data['img'], (w, h), cv2.INTER_LINEAR) + for key in data.get('gt_fields', []): + data[key] = functional.resize(data[key], (w, h), cv2.INTER_NEAREST) + + return data + + +@manager.TRANSFORMS.add_component +class Normalize: + """ + Normalize an image. + + Args: + mean (list, optional): The mean value of a data set. Default: [0.5, 0.5, 0.5]. + std (list, optional): The standard deviation of a data set. Default: [0.5, 0.5, 0.5]. + + Raises: + ValueError: When mean/std is not list or any value in std is 0. + """ + + def __init__(self, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)): + self.mean = mean + self.std = std + if not (isinstance(self.mean, + (list, tuple)) and isinstance(self.std, + (list, tuple))): + raise ValueError( + "{}: input type is invalid. It should be list or tuple".format( + self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def __call__(self, data): + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + data['img'] = functional.normalize(data['img'], mean, std) + + return data + + +@manager.TRANSFORMS.add_component +class Padding: + """ + Add bottom-right padding to a raw image or annotation image. + + Args: + target_size (list|tuple): The target size after padding. + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + + Raises: + TypeError: When target_size is neither list nor tuple. + ValueError: When the length of target_size is not 2. + """ + + def __init__(self, + target_size, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + '`target_size` should include 2 elements, but it is {}'. + format(target_size)) + else: + raise TypeError( + "Type of target_size is invalid. 
It should be list or tuple, now is {}" + .format(type(target_size))) + self.target_size = target_size + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, data): + data['trans_info'].append(('padding', data['img'].shape[0:2])) + im_height, im_width = data['img'].shape[0], data['img'].shape[1] + if isinstance(self.target_size, int): + target_height = self.target_size + target_width = self.target_size + else: + target_height = self.target_size[1] + target_width = self.target_size[0] + pad_height = target_height - im_height + pad_width = target_width - im_width + if pad_height < 0 or pad_width < 0: + raise ValueError( + 'The size of image should be less than `target_size`, but the size of image ({}, {}) is larger than `target_size` ({}, {})' + .format(im_width, im_height, target_width, target_height)) + else: + data['img'] = cv2.copyMakeBorder( + data['img'], + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.im_padding_value) + for key in data.get('gt_fields', []): + data[key] = cv2.copyMakeBorder( + data[key], + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.label_padding_value) + return data + + +@manager.TRANSFORMS.add_component +class PaddingByAspectRatio: + """ + + Args: + aspect_ratio (int|float, optional): The aspect ratio = width / height. Default: 1. + """ + + def __init__(self, + aspect_ratio=1, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + self.aspect_ratio = aspect_ratio + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, data): + + img_height = data['img'].shape[0] + img_width = data['img'].shape[1] + ratio = img_width / img_height + if ratio == self.aspect_ratio: + return data + elif ratio > self.aspect_ratio: + img_height = int(img_width / self.aspect_ratio) + else: + img_width = int(img_height * self.aspect_ratio) + padding = Padding( + (img_width, img_height), + im_padding_value=self.im_padding_value, + label_padding_value=self.label_padding_value) + return padding(data) + + +@manager.TRANSFORMS.add_component +class RandomPaddingCrop: + """ + Crop a sub-image from a raw image and annotation image randomly. If the target cropping size + is larger than original image, then the bottom-right padding will be added. + + Args: + crop_size (tuple, optional): The target cropping size. Default: (512, 512). + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + + Raises: + TypeError: When crop_size is neither list nor tuple. + ValueError: When the length of crop_size is not 2. + """ + + def __init__(self, + crop_size=(512, 512), + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + if isinstance(crop_size, list) or isinstance(crop_size, tuple): + if len(crop_size) != 2: + raise ValueError( + 'Type of `crop_size` is list or tuple. It should include 2 elements, but it is {}' + .format(crop_size)) + else: + raise TypeError( + "The type of `crop_size` is invalid. 
It should be list or tuple, but it is {}" + .format(type(crop_size))) + self.crop_size = crop_size + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, data): + + if isinstance(self.crop_size, int): + crop_width = self.crop_size + crop_height = self.crop_size + else: + crop_width = self.crop_size[0] + crop_height = self.crop_size[1] + + img_height = data['img'].shape[0] + img_width = data['img'].shape[1] + + if img_height == crop_height and img_width == crop_width: + return data + else: + pad_height = max(crop_height - img_height, 0) + pad_width = max(crop_width - img_width, 0) + if (pad_height > 0 or pad_width > 0): + data['img'] = cv2.copyMakeBorder( + data['img'], + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.im_padding_value) + for key in data.get('gt_fields', []): + data[key] = cv2.copyMakeBorder( + data[key], + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.label_padding_value) + img_height = data['img'].shape[0] + img_width = data['img'].shape[1] + + if crop_height > 0 and crop_width > 0: + h_off = np.random.randint(img_height - crop_height + 1) + w_off = np.random.randint(img_width - crop_width + 1) + + data['img'] = data['img'][h_off:(crop_height + h_off), w_off:( + w_off + crop_width), :] + for key in data.get('gt_fields', []): + data[key] = data[key][h_off:(crop_height + h_off), w_off:( + w_off + crop_width)] + return data + + +@manager.TRANSFORMS.add_component +class RandomCenterCrop: + """ + Crops the given the input data at the center. + Args: + retain_ratio (tuple or list, optional): The length of the input list or tuple must be 2. Default: (0.5, 0.5). + the first value is used for width and the second is for height. + In addition, the minimum size of the cropped image is [width * retain_ratio[0], height * retain_ratio[1]]. + Raises: + TypeError: When retain_ratio is neither list nor tuple. Default: None. + ValueError: When the value of retain_ratio is not in [0-1]. + """ + + def __init__(self, retain_ratio=(0.5, 0.5)): + if isinstance(retain_ratio, list) or isinstance(retain_ratio, tuple): + if len(retain_ratio) != 2: + raise ValueError( + 'When type of `retain_ratio` is list or tuple, it shoule include 2 elements, but it is {}' + .format(retain_ratio)) + if retain_ratio[0] > 1 or retain_ratio[1] > 1 or retain_ratio[ + 0] < 0 or retain_ratio[1] < 0: + raise ValueError( + 'Value of `retain_ratio` should be in [0, 1], but it is {}'. + format(retain_ratio)) + else: + raise TypeError( + "The type of `retain_ratio` is invalid. It should be list or tuple, but it is {}" + .format(type(retain_ratio))) + self.retain_ratio = retain_ratio + + def __call__(self, data): + retain_width = self.retain_ratio[0] + retain_height = self.retain_ratio[1] + + img_height = data['img'].shape[0] + img_width = data['img'].shape[1] + + if retain_width == 1. 
and retain_height == 1.: + return data + else: + randw = np.random.randint(img_width * (1 - retain_width)) + randh = np.random.randint(img_height * (1 - retain_height)) + offsetw = 0 if randw == 0 else np.random.randint(randw) + offseth = 0 if randh == 0 else np.random.randint(randh) + p0, p1, p2, p3 = offseth, img_height + offseth - randh, offsetw, img_width + offsetw - randw + data['img'] = data['img'][p0:p1, p2:p3, :] + for key in data.get('gt_fields', []): + data[key] = data[key][p0:p1, p2:p3] + + return data + + +@manager.TRANSFORMS.add_component +class ScalePadding: + """ + Add center padding to a raw image or annotation image,then scale the + image to target size. + + Args: + target_size (list|tuple, optional): The target size of image. Default: (512, 512). + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + + Raises: + TypeError: When target_size is neither list nor tuple. + ValueError: When the length of target_size is not 2. + """ + + def __init__(self, + target_size=(512, 512), + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + '`target_size` should include 2 elements, but it is {}'. + format(target_size)) + else: + raise TypeError( + "Type of `target_size` is invalid. It should be list or tuple, but it is {}" + .format(type(target_size))) + + self.target_size = target_size + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, data): + height = data['img'].shape[0] + width = data['img'].shape[1] + + new_im = np.zeros( + (max(height, width), max(height, width), 3)) + self.im_padding_value + if 'label' in data['gt_fields']: + new_label = np.zeros((max(height, width), max(height, width) + )) + self.label_padding_value + + if height > width: + padding = int((height - width) / 2) + new_im[:, padding:padding + width, :] = data['img'] + if 'label' in data['gt_fields']: + new_label[:, padding:padding + width] = data['label'] + else: + padding = int((width - height) / 2) + new_im[padding:padding + height, :, :] = data['img'] + if 'label' in data['gt_fields']: + new_label[padding:padding + height, :] = data['label'] + + data['img'] = np.uint8(new_im) + data['img'] = functional.resize( + data['img'], self.target_size, interp=cv2.INTER_CUBIC) + if 'label' in data['gt_fields']: + data['label'] = np.uint8(new_label) + data['label'] = functional.resize( + data['label'], self.target_size, interp=cv2.INTER_CUBIC) + return data + + +@manager.TRANSFORMS.add_component +class RandomNoise: + """ + Superimposing noise on an image with a certain probability. + + Args: + prob (float, optional): A probability of blurring an image. Default: 0.5. + max_sigma(float, optional): The maximum value of standard deviation of the distribution. + Default: 10.0. 
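+
+    Example (illustrative): RandomNoise(prob=0.5, max_sigma=10.0) adds
+    zero-mean Gaussian noise with sigma drawn from U(0, 10) to roughly half
+    of the images and clips the result back to [0, 255].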
+ """ + + def __init__(self, prob=0.5, max_sigma=10.0): + self.prob = prob + self.max_sigma = max_sigma + + def __call__(self, data): + if random.random() < self.prob: + mu = 0 + sigma = random.random() * self.max_sigma + data['img'] = np.array(data['img'], dtype=np.float32) + data['img'] += np.random.normal(mu, sigma, data['img'].shape) + data['img'][data['img'] > 255] = 255 + data['img'][data['img'] < 0] = 0 + + return data + + +@manager.TRANSFORMS.add_component +class RandomBlur: + """ + Blurring an image by a Gaussian function with a certain probability. + + Args: + prob (float, optional): A probability of blurring an image. Default: 0.1. + blur_type(str, optional): A type of blurring an image, + gaussian stands for cv2.GaussianBlur, + median stands for cv2.medianBlur, + blur stands for cv2.blur, + random represents randomly selected from above. + Default: gaussian. + """ + + def __init__(self, prob=0.1, blur_type="gaussian"): + self.prob = prob + self.blur_type = blur_type + + def __call__(self, data): + + if self.prob <= 0: + n = 0 + elif self.prob >= 1: + n = 1 + else: + n = int(1.0 / self.prob) + if n > 0: + if np.random.randint(0, n) == 0: + radius = np.random.randint(3, 10) + if radius % 2 != 1: + radius = radius + 1 + if radius > 9: + radius = 9 + data['img'] = np.array(data['img'], dtype='uint8') + if self.blur_type == "gaussian": + data['img'] = cv2.GaussianBlur(data['img'], + (radius, radius), 0, 0) + elif self.blur_type == "median": + data['img'] = cv2.medianBlur(data['img'], radius) + elif self.blur_type == "blur": + data['img'] = cv2.blur(data['img'], (radius, radius)) + elif self.blur_type == "random": + select = random.random() + if select < 0.3: + data['img'] = cv2.GaussianBlur(data['img'], + (radius, radius), 0) + elif select < 0.6: + data['img'] = cv2.medianBlur(data['img'], radius) + else: + data['img'] = cv2.blur(data['img'], (radius, radius)) + else: + data['img'] = cv2.GaussianBlur(data['img'], + (radius, radius), 0, 0) + data['img'] = np.array(data['img'], dtype='float32') + return data + + +@manager.TRANSFORMS.add_component +class RandomRotation: + """ + Rotate an image randomly with padding. + + Args: + max_rotation (float, optional): The maximum rotation degree. Default: 15. + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. 
+ """ + + def __init__(self, + max_rotation=15, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + self.max_rotation = max_rotation + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, data): + + if self.max_rotation > 0: + (h, w) = data['img'].shape[:2] + do_rotation = np.random.uniform(-self.max_rotation, + self.max_rotation) + pc = (w // 2, h // 2) + r = cv2.getRotationMatrix2D(pc, do_rotation, 1.0) + cos = np.abs(r[0, 0]) + sin = np.abs(r[0, 1]) + + nw = int((h * sin) + (w * cos)) + nh = int((h * cos) + (w * sin)) + + (cx, cy) = pc + r[0, 2] += (nw / 2) - cx + r[1, 2] += (nh / 2) - cy + dsize = (nw, nh) + data['img'] = cv2.warpAffine( + data['img'], + r, + dsize=dsize, + flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.im_padding_value) + for key in data.get('gt_fields', []): + data[key] = cv2.warpAffine( + data[key], + r, + dsize=dsize, + flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.label_padding_value) + + return data + + +@manager.TRANSFORMS.add_component +class RandomScaleAspect: + """ + Crop a sub-image from an original image with a range of area ratio and aspect and + then scale the sub-image back to the size of the original image. + + Args: + min_scale (float, optional): The minimum area ratio of cropped image to the original image. Default: 0.5. + aspect_ratio (float, optional): The minimum aspect ratio. Default: 0.33. + """ + + def __init__(self, min_scale=0.5, aspect_ratio=0.33): + self.min_scale = min_scale + self.aspect_ratio = aspect_ratio + + def __call__(self, data): + + if self.min_scale != 0 and self.aspect_ratio != 0: + img_height = data['img'].shape[0] + img_width = data['img'].shape[1] + for i in range(0, 10): + area = img_height * img_width + target_area = area * np.random.uniform(self.min_scale, 1.0) + aspectRatio = np.random.uniform(self.aspect_ratio, + 1.0 / self.aspect_ratio) + + dw = int(np.sqrt(target_area * 1.0 * aspectRatio)) + dh = int(np.sqrt(target_area * 1.0 / aspectRatio)) + if (np.random.randint(10) < 5): + tmp = dw + dw = dh + dh = tmp + + if (dh < img_height and dw < img_width): + h1 = np.random.randint(0, img_height - dh) + w1 = np.random.randint(0, img_width - dw) + + data['img'] = data['img'][h1:(h1 + dh), w1:(w1 + dw), :] + data['img'] = cv2.resize( + data['img'], (img_width, img_height), + interpolation=cv2.INTER_LINEAR) + for key in data.get('gt_fields', []): + data[key] = data[key][h1:(h1 + dh), w1:(w1 + dw)] + data[key] = cv2.resize( + data[key], (img_width, img_height), + interpolation=cv2.INTER_NEAREST) + break + return data + + +@manager.TRANSFORMS.add_component +class RandomDistort: + """ + Distort an image with random configurations. + + Args: + brightness_range (float, optional): A range of brightness. Default: 0.5. + brightness_prob (float, optional): A probability of adjusting brightness. Default: 0.5. + contrast_range (float, optional): A range of contrast. Default: 0.5. + contrast_prob (float, optional): A probability of adjusting contrast. Default: 0.5. + saturation_range (float, optional): A range of saturation. Default: 0.5. + saturation_prob (float, optional): A probability of adjusting saturation. Default: 0.5. + hue_range (int, optional): A range of hue. Default: 18. + hue_prob (float, optional): A probability of adjusting hue. Default: 0.5. + sharpness_range (float, optional): A range of sharpness. Default: 0.5. 
+        sharpness_prob (float, optional): A probability of adjusting sharpness. Default: 0.
+    """
+
+    def __init__(self,
+                 brightness_range=0.5,
+                 brightness_prob=0.5,
+                 contrast_range=0.5,
+                 contrast_prob=0.5,
+                 saturation_range=0.5,
+                 saturation_prob=0.5,
+                 hue_range=18,
+                 hue_prob=0.5,
+                 sharpness_range=0.5,
+                 sharpness_prob=0):
+        self.brightness_range = brightness_range
+        self.brightness_prob = brightness_prob
+        self.contrast_range = contrast_range
+        self.contrast_prob = contrast_prob
+        self.saturation_range = saturation_range
+        self.saturation_prob = saturation_prob
+        self.hue_range = hue_range
+        self.hue_prob = hue_prob
+        self.sharpness_range = sharpness_range
+        self.sharpness_prob = sharpness_prob
+
+    def __call__(self, data):
+
+        brightness_lower = 1 - self.brightness_range
+        brightness_upper = 1 + self.brightness_range
+        contrast_lower = 1 - self.contrast_range
+        contrast_upper = 1 + self.contrast_range
+        saturation_lower = 1 - self.saturation_range
+        saturation_upper = 1 + self.saturation_range
+        hue_lower = -self.hue_range
+        hue_upper = self.hue_range
+        sharpness_lower = 1 - self.sharpness_range
+        sharpness_upper = 1 + self.sharpness_range
+        ops = [
+            functional.brightness, functional.contrast, functional.saturation,
+            functional.hue, functional.sharpness
+        ]
+        random.shuffle(ops)
+        params_dict = {
+            'brightness': {
+                'brightness_lower': brightness_lower,
+                'brightness_upper': brightness_upper
+            },
+            'contrast': {
+                'contrast_lower': contrast_lower,
+                'contrast_upper': contrast_upper
+            },
+            'saturation': {
+                'saturation_lower': saturation_lower,
+                'saturation_upper': saturation_upper
+            },
+            'hue': {
+                'hue_lower': hue_lower,
+                'hue_upper': hue_upper
+            },
+            'sharpness': {
+                'sharpness_lower': sharpness_lower,
+                'sharpness_upper': sharpness_upper,
+            }
+        }
+        prob_dict = {
+            'brightness': self.brightness_prob,
+            'contrast': self.contrast_prob,
+            'saturation': self.saturation_prob,
+            'hue': self.hue_prob,
+            'sharpness': self.sharpness_prob
+        }
+        data['img'] = data['img'].astype('uint8')
+        data['img'] = Image.fromarray(data['img'])
+        for id in range(len(ops)):
+            params = params_dict[ops[id].__name__]
+            prob = prob_dict[ops[id].__name__]
+            params['im'] = data['img']
+            if np.random.uniform(0, 1) < prob:
+                data['img'] = ops[id](**params)
+        data['img'] = np.asarray(data['img']).astype('float32')
+        return data
+
+
+@manager.TRANSFORMS.add_component
+class RandomAffine:
+    """
+    Affine transform an image with random configurations.
+
+    Args:
+        size (tuple, optional): The target size after affine transformation. Default: (224, 224).
+        translation_offset (float, optional): The maximum translation offset. Default: 0.
+        max_rotation (float, optional): The maximum rotation degree. Default: 15.
+        min_scale_factor (float, optional): The minimum scale. Default: 0.75.
+        max_scale_factor (float, optional): The maximum scale. Default: 1.25.
+        im_padding_value (list|tuple, optional): The padding value of raw image. Default: (128, 128, 128).
+        label_padding_value (int, optional): The padding value of annotation image. Default: 255.
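+
+    Example (illustrative): RandomAffine(size=(224, 224), max_rotation=15)
+    composes a random rotation, scale and translation into a single 2x3
+    matrix and warps the image (bilinear) and gt_fields (nearest) with it.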
+ """ + + def __init__(self, + size=(224, 224), + translation_offset=0, + max_rotation=15, + min_scale_factor=0.75, + max_scale_factor=1.25, + im_padding_value=(128, 128, 128), + label_padding_value=255): + self.size = size + self.translation_offset = translation_offset + self.max_rotation = max_rotation + self.min_scale_factor = min_scale_factor + self.max_scale_factor = max_scale_factor + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, data): + + w, h = self.size + bbox = [0, 0, data['img'].shape[1] - 1, data['img'].shape[0] - 1] + x_offset = (random.random() - 0.5) * 2 * self.translation_offset + y_offset = (random.random() - 0.5) * 2 * self.translation_offset + dx = (w - (bbox[2] + bbox[0])) / 2.0 + dy = (h - (bbox[3] + bbox[1])) / 2.0 + + matrix_trans = np.array([[1.0, 0, dx], [0, 1.0, dy], [0, 0, 1.0]]) + + angle = random.random() * 2 * self.max_rotation - self.max_rotation + scale = random.random() * (self.max_scale_factor - self.min_scale_factor + ) + self.min_scale_factor + scale *= np.mean( + [float(w) / (bbox[2] - bbox[0]), float(h) / (bbox[3] - bbox[1])]) + alpha = scale * math.cos(angle / 180.0 * math.pi) + beta = scale * math.sin(angle / 180.0 * math.pi) + + centerx = w / 2.0 + x_offset + centery = h / 2.0 + y_offset + matrix = np.array( + [[alpha, beta, (1 - alpha) * centerx - beta * centery], + [-beta, alpha, beta * centerx + (1 - alpha) * centery], + [0, 0, 1.0]]) + + matrix = matrix.dot(matrix_trans)[0:2, :] + data['img'] = cv2.warpAffine( + np.uint8(data['img']), + matrix, + tuple(self.size), + flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.im_padding_value) + for key in data.get('gt_fields', []): + data[key] = cv2.warpAffine( + np.uint8(data[key]), + matrix, + tuple(self.size), + flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.label_padding_value) + return data diff --git a/paddleseg/utils/__init__.py b/paddleseg/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..63c78949dd70a5ddec45a39392934d030d1618ba --- /dev/null +++ b/paddleseg/utils/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import download +from . import metrics +from .env import seg_env, get_sys_env +from .utils import * +from .timer import TimeAverager, calculate_eta +from . import visualize +from .config_check import config_check +from .ema import EMA diff --git a/paddleseg/utils/config_check.py b/paddleseg/utils/config_check.py new file mode 100644 index 0000000000000000000000000000000000000000..47a7049823afa20193147c82184a1ca0b4a511f7 --- /dev/null +++ b/paddleseg/utils/config_check.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def config_check(cfg, train_dataset=None, val_dataset=None): + """ + To check config。 + + Args: + cfg (paddleseg.cvlibs.Config): An object of paddleseg.cvlibs.Config. + train_dataset (paddle.io.Dataset): Used to read and process training datasets. + val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets. + """ + + num_classes_check(cfg, train_dataset, val_dataset) + + +def num_classes_check(cfg, train_dataset, val_dataset): + """" + Check that the num_classes in model, train_dataset and val_dataset is consistent. + """ + num_classes_set = set() + if train_dataset and hasattr(train_dataset, 'num_classes'): + num_classes_set.add(train_dataset.num_classes) + if val_dataset and hasattr(val_dataset, 'num_classes'): + num_classes_set.add(val_dataset.num_classes) + if cfg.dic.get('model', None) and cfg.dic['model'].get('num_classes', None): + num_classes_set.add(cfg.dic['model'].get('num_classes')) + if (not cfg.train_dataset) and (not cfg.val_dataset): + raise ValueError( + 'One of `train_dataset` or `val_dataset should be given, but there are none.' + ) + if len(num_classes_set) == 0: + raise ValueError( + '`num_classes` is not found. Please set it in model, train_dataset or val_dataset' + ) + elif len(num_classes_set) > 1: + raise ValueError( + '`num_classes` is not consistent: {}. Please set it consistently in model or train_dataset or val_dataset' + .format(num_classes_set)) + else: + num_classes = num_classes_set.pop() + if train_dataset: + train_dataset.num_classes = num_classes + if val_dataset: + val_dataset.num_classes = num_classes diff --git a/paddleseg/utils/download.py b/paddleseg/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..623280102abd4bf2b8012783436c956dbbce8ba9 --- /dev/null +++ b/paddleseg/utils/download.py @@ -0,0 +1,163 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import os +import shutil +import sys +import tarfile +import time +import zipfile + +import requests + +lasttime = time.time() +FLUSH_INTERVAL = 0.1 + + +def progress(str, end=False): + global lasttime + if end: + str += "\n" + lasttime = 0 + if time.time() - lasttime >= FLUSH_INTERVAL: + sys.stdout.write("\r%s" % str) + lasttime = time.time() + sys.stdout.flush() + + +def _download_file(url, savepath, print_progress): + if print_progress: + print("Connecting to {}".format(url)) + r = requests.get(url, stream=True, timeout=15) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(savepath, 'wb') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(savepath, 'wb') as f: + dl = 0 + total_length = int(total_length) + starttime = time.time() + if print_progress: + print("Downloading %s" % os.path.basename(savepath)) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + if print_progress: + done = int(50 * dl / total_length) + progress("[%-50s] %.2f%%" % + ('=' * done, float(100 * dl) / total_length)) + if print_progress: + progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True) + + +def _uncompress_file_zip(filepath, extrapath): + files = zipfile.ZipFile(filepath, 'r') + filelist = files.namelist() + rootpath = filelist[0] + total_num = len(filelist) + for index, file in enumerate(filelist): + files.extract(file, extrapath) + yield total_num, index, rootpath + files.close() + yield total_num, index, rootpath + + +def _uncompress_file_tar(filepath, extrapath, mode="r:gz"): + files = tarfile.open(filepath, mode) + filelist = files.getnames() + total_num = len(filelist) + rootpath = filelist[0] + for index, file in enumerate(filelist): + files.extract(file, extrapath) + yield total_num, index, rootpath + files.close() + yield total_num, index, rootpath + + +def _uncompress_file(filepath, extrapath, delete_file, print_progress): + if print_progress: + print("Uncompress %s" % os.path.basename(filepath)) + + if filepath.endswith("zip"): + handler = _uncompress_file_zip + elif filepath.endswith("tgz"): + handler = functools.partial(_uncompress_file_tar, mode="r:*") + else: + handler = functools.partial(_uncompress_file_tar, mode="r") + + for total_num, index, rootpath in handler(filepath, extrapath): + if print_progress: + done = int(50 * float(index) / total_num) + progress("[%-50s] %.2f%%" % + ('=' * done, float(100 * index) / total_num)) + if print_progress: + progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True) + + if delete_file: + os.remove(filepath) + + return rootpath + + +def download_file_and_uncompress(url, + savepath=None, + extrapath=None, + extraname=None, + print_progress=True, + cover=False, + delete_file=True): + if savepath is None: + savepath = "." + + if extrapath is None: + extrapath = "." 
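+
+    # Both the download directory and the extraction directory fall back to
+    # the current working directory when the caller does not provide them.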
+ + savename = url.split("/")[-1] + if not os.path.exists(savepath): + os.makedirs(savepath) + + savepath = os.path.join(savepath, savename) + savename = ".".join(savename.split(".")[:-1]) + savename = os.path.join(extrapath, savename) + extraname = savename if extraname is None else os.path.join(extrapath, + extraname) + + if cover: + if os.path.exists(savepath): + shutil.rmtree(savepath) + if os.path.exists(savename): + shutil.rmtree(savename) + if os.path.exists(extraname): + shutil.rmtree(extraname) + + if not os.path.exists(extraname): + if not os.path.exists(savename): + if not os.path.exists(savepath): + _download_file(url, savepath, print_progress) + + if (not tarfile.is_tarfile(savepath)) and ( + not zipfile.is_zipfile(savepath)): + if not os.path.exists(extraname): + os.makedirs(extraname) + shutil.move(savepath, extraname) + return extraname + + savename = _uncompress_file(savepath, extrapath, delete_file, + print_progress) + savename = os.path.join(extrapath, savename) + shutil.move(savename, extraname) + return extraname diff --git a/paddleseg/utils/ema.py b/paddleseg/utils/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..861200c3f098b22907052ab824c7001852badb5a --- /dev/null +++ b/paddleseg/utils/ema.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle + + +class EMA(object): + """ + The implementation of Exponential Moving Average for the trainable parameters. + + Args: + model (nn.Layer): The model for applying EMA. + decay (float, optional): Decay is used to calculate ema_variable by + `ema_variable = decay * ema_variable + (1 - decay) * new_variable`. + Default: 0.99. + + Returns: + None + + Examples: + .. code-block:: python + + # 1. Define model and dataset + + # 2. Create EMA + ema = EMA(model, decay=0.99) + + # 3. Train stage + for data in dataloader(): + ... + optimizer.step() + ema.step() + + # 4. Evaluate stage + ema.apply() # Use the EMA data to replace the origin data + + for data in dataloader(): + ... + + ema.restore() # Restore the origin data to the model + + """ + + def __init__(self, model, decay=0.99): + super().__init__() + + assert isinstance(model, paddle.nn.Layer), \ + "The model should be the instance of paddle.nn.Layer." + assert decay >= 0 and decay <= 1.0, \ + "The decay = {} should in [0.0, 1.0]".format(decay) + + self._model = model + self._decay = decay + self._ema_data = {} + self._backup_data = {} + + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + self._ema_data[name] = param.numpy() + + def step(self): + """ + Calculate the EMA data for all trainable parameters. 
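+
+        Each trainable parameter is updated as
+        `ema[name] = decay * ema[name] + (1 - decay) * param.numpy()`.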
+ """ + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + assert name in self._ema_data, \ + "The param ({}) isn't in the model".format(name) + self._ema_data[name] = self._decay * self._ema_data[name] \ + + (1.0 - self._decay) * param.numpy() + + def apply(self): + """ + Save the origin data and use the EMA data to replace the origin data. + """ + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + assert name in self._ema_data, \ + "The param ({}) isn't in the model".format(name) + self._backup_data[name] = param.numpy() + param.set_value(self._ema_data[name]) + + def restore(self): + """ + Restore the origin data to the model. + """ + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + assert name in self._backup_data, \ + "The param ({}) isn't in the model".format(name) + param.set_value(self._backup_data[name]) + self._backup_data = {} diff --git a/paddleseg/utils/env/__init__.py b/paddleseg/utils/env/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7af6e064491348ea9a72f57dcf4bb608c32bab63 --- /dev/null +++ b/paddleseg/utils/env/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import seg_env +from .sys_env import get_sys_env diff --git a/paddleseg/utils/env/seg_env.py b/paddleseg/utils/env/seg_env.py new file mode 100644 index 0000000000000000000000000000000000000000..cf11dbdffb8570608f97d45122ba1de28a6e2e66 --- /dev/null +++ b/paddleseg/utils/env/seg_env.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module is used to store environmental parameters in PaddleSeg. + +SEG_HOME : Root directory for storing PaddleSeg related data. Default to ~/.paddleseg. + Users can change the default value through the SEG_HOME environment variable. +DATA_HOME : The directory to store the automatically downloaded dataset, e.g ADE20K. +PRETRAINED_MODEL_HOME : The directory to store the automatically downloaded pretrained model. 
+""" + +import os + +from paddleseg.utils import logger + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_seg_home(): + if 'SEG_HOME' in os.environ: + home_path = os.environ['SEG_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + logger.warning('SEG_HOME {} is a file!'.format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddleseg') + + +def _get_sub_home(directory): + home = os.path.join(_get_seg_home(), directory) + if not os.path.exists(home): + os.makedirs(home, exist_ok=True) + return home + + +USER_HOME = _get_user_home() +SEG_HOME = _get_seg_home() +DATA_HOME = _get_sub_home('dataset') +TMP_HOME = _get_sub_home('tmp') +PRETRAINED_MODEL_HOME = _get_sub_home('pretrained_model') diff --git a/paddleseg/utils/env/sys_env.py b/paddleseg/utils/env/sys_env.py new file mode 100644 index 0000000000000000000000000000000000000000..868150979cd1ddfcf9e74080bba59440ef09b574 --- /dev/null +++ b/paddleseg/utils/env/sys_env.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os +import platform +import subprocess +import sys + +import cv2 +import paddle +import paddleseg + +IS_WINDOWS = sys.platform == 'win32' + + +def _find_cuda_home(): + '''Finds the CUDA install path. It refers to the implementation of + pytorch . + ''' + # Guess #1 + cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') + if cuda_home is None: + # Guess #2 + try: + which = 'where' if IS_WINDOWS else 'which' + nvcc = subprocess.check_output([which, + 'nvcc']).decode().rstrip('\r\n') + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + except Exception: + # Guess #3 + if IS_WINDOWS: + cuda_homes = glob.glob( + 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*') + if len(cuda_homes) == 0: + cuda_home = '' + else: + cuda_home = cuda_homes[0] + else: + cuda_home = '/usr/local/cuda' + if not os.path.exists(cuda_home): + cuda_home = None + return cuda_home + + +def _get_nvcc_info(cuda_home): + if cuda_home is not None and os.path.isdir(cuda_home): + try: + nvcc = os.path.join(cuda_home, 'bin/nvcc') + if not IS_WINDOWS: + nvcc = subprocess.check_output( + "{} -V".format(nvcc), shell=True).decode() + else: + nvcc = subprocess.check_output( + "\"{}\" -V".format(nvcc), shell=True).decode() + nvcc = nvcc.strip().split('\n')[-1] + except subprocess.SubprocessError: + nvcc = "Not Available" + else: + nvcc = "Not Available" + return nvcc + + +def _get_gpu_info(): + try: + gpu_info = subprocess.check_output(['nvidia-smi', + '-L']).decode().strip() + gpu_info = gpu_info.split('\n') + for i in range(len(gpu_info)): + gpu_info[i] = ' '.join(gpu_info[i].split(' ')[:4]) + except: + gpu_info = ' Can not get GPU information. Please make sure CUDA have been installed successfully.' 
+ return gpu_info + + +def get_sys_env(): + """collect environment information""" + env_info = {} + env_info['platform'] = platform.platform() + + env_info['Python'] = sys.version.replace('\n', '') + + # TODO is_compiled_with_cuda() has not been moved + compiled_with_cuda = paddle.is_compiled_with_cuda() + env_info['Paddle compiled with cuda'] = compiled_with_cuda + + if compiled_with_cuda: + cuda_home = _find_cuda_home() + env_info['NVCC'] = _get_nvcc_info(cuda_home) + # refer to https://github.com/PaddlePaddle/Paddle/blob/release/2.0-rc/paddle/fluid/platform/device_context.cc#L327 + v = paddle.get_cudnn_version() + v = str(v // 1000) + '.' + str(v % 1000 // 100) + env_info['cudnn'] = v + if 'gpu' in paddle.get_device(): + gpu_nums = paddle.distributed.ParallelEnv().nranks + else: + gpu_nums = 0 + env_info['GPUs used'] = gpu_nums + + env_info['CUDA_VISIBLE_DEVICES'] = os.environ.get( + 'CUDA_VISIBLE_DEVICES') + if gpu_nums == 0: + os.environ['CUDA_VISIBLE_DEVICES'] = '' + env_info['GPU'] = _get_gpu_info() + + try: + gcc = subprocess.check_output(['gcc', '--version']).decode() + gcc = gcc.strip().split('\n')[0] + env_info['GCC'] = gcc + except: + pass + + env_info['PaddleSeg'] = paddleseg.__version__ + env_info['PaddlePaddle'] = paddle.__version__ + env_info['OpenCV'] = cv2.__version__ + + return env_info diff --git a/paddleseg/utils/logger.py b/paddleseg/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..7c75b940317571bc12033592830492d5a31d99ce --- /dev/null +++ b/paddleseg/utils/logger.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time + +import paddle + +levels = {0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG'} +log_level = 2 + + +def log(level=2, message=""): + if paddle.distributed.ParallelEnv().local_rank == 0: + current_time = time.time() + time_array = time.localtime(current_time) + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) + if log_level >= level: + print("{} [{}]\t{}".format(current_time, levels[level], message) + .encode("utf-8").decode("latin1")) + sys.stdout.flush() + + +def debug(message=""): + log(level=3, message=message) + + +def info(message=""): + log(level=2, message=message) + + +def warning(message=""): + log(level=1, message=message) + + +def error(message=""): + log(level=0, message=message) diff --git a/paddleseg/utils/metrics.py b/paddleseg/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..ca765e51cfe9f0a6849e718b87c52776b9a9d761 --- /dev/null +++ b/paddleseg/utils/metrics.py @@ -0,0 +1,243 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F +import sklearn.metrics as skmetrics + + +def calculate_area(pred, label, num_classes, ignore_index=255): + """ + Calculate intersect, prediction and label area + + Args: + pred (Tensor): The prediction by model. + label (Tensor): The ground truth of image. + num_classes (int): The unique number of target classes. + ignore_index (int): Specifies a target value that is ignored. Default: 255. + + Returns: + Tensor: The intersection area of prediction and the ground on all class. + Tensor: The prediction area on all class. + Tensor: The ground truth area on all class + """ + if len(pred.shape) == 4: + pred = paddle.squeeze(pred, axis=1) + if len(label.shape) == 4: + label = paddle.squeeze(label, axis=1) + if not pred.shape == label.shape: + raise ValueError('Shape of `pred` and `label should be equal, ' + 'but there are {} and {}.'.format(pred.shape, + label.shape)) + pred_area = [] + label_area = [] + intersect_area = [] + mask = label != ignore_index + + for i in range(num_classes): + pred_i = paddle.logical_and(pred == i, mask) + label_i = label == i + intersect_i = paddle.logical_and(pred_i, label_i) + pred_area.append(paddle.sum(paddle.cast(pred_i, "int32"))) + label_area.append(paddle.sum(paddle.cast(label_i, "int32"))) + intersect_area.append(paddle.sum(paddle.cast(intersect_i, "int32"))) + + pred_area = paddle.concat(pred_area) + label_area = paddle.concat(label_area) + intersect_area = paddle.concat(intersect_area) + + return intersect_area, pred_area, label_area + + +def auc_roc(logits, label, num_classes, ignore_index=None): + """ + Calculate area under the roc curve + + Args: + logits (Tensor): The prediction by model on testset, of shape (N,C,H,W) . + label (Tensor): The ground truth of image. (N,1,H,W) + num_classes (int): The unique number of target classes. + ignore_index (int): Specifies a target value that is ignored. Default: 255. + + Returns: + auc_roc(float): The area under roc curve + """ + if ignore_index or len(np.unique(label)) > num_classes: + raise RuntimeError('labels with ignore_index is not supported yet.') + + if len(label.shape) != 4: + raise ValueError( + 'The shape of label is not 4 dimension as (N, C, H, W), it is {}'. + format(label.shape)) + + if len(logits.shape) != 4: + raise ValueError( + 'The shape of logits is not 4 dimension as (N, C, H, W), it is {}'. + format(logits.shape)) + + N, C, H, W = logits.shape + logits = np.transpose(logits, (1, 0, 2, 3)) + logits = logits.reshape([C, N * H * W]).transpose([1, 0]) + + label = np.transpose(label, (1, 0, 2, 3)) + label = label.reshape([1, N * H * W]).squeeze() + + if not logits.shape[0] == label.shape[0]: + raise ValueError('length of `logit` and `label` should be equal, ' + 'but they are {} and {}.'.format(logits.shape[0], + label.shape[0])) + + if num_classes == 2: + auc = skmetrics.roc_auc_score(label, logits[:, 1]) + else: + auc = skmetrics.roc_auc_score(label, logits, multi_class='ovr') + + return auc + + +def mean_iou(intersect_area, pred_area, label_area): + """ + Calculate iou. 
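+
+    For each class i, iou_i = intersect_area[i] / (pred_area[i] +
+    label_area[i] - intersect_area[i]), defined as 0 when the union is
+    empty; miou is the unweighted mean over all classes.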
+
+    Args:
+        intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.
+        pred_area (Tensor): The prediction area on all classes.
+        label_area (Tensor): The ground truth area on all classes.
+
+    Returns:
+        np.ndarray: iou on all classes.
+        float: mean iou of all classes.
+    """
+    intersect_area = intersect_area.numpy()
+    pred_area = pred_area.numpy()
+    label_area = label_area.numpy()
+    union = pred_area + label_area - intersect_area
+    class_iou = []
+    for i in range(len(intersect_area)):
+        if union[i] == 0:
+            iou = 0
+        else:
+            iou = intersect_area[i] / union[i]
+        class_iou.append(iou)
+    miou = np.mean(class_iou)
+    return np.array(class_iou), miou
+
+
+def dice(intersect_area, pred_area, label_area):
+    """
+    Calculate DICE.
+
+    Args:
+        intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.
+        pred_area (Tensor): The prediction area on all classes.
+        label_area (Tensor): The ground truth area on all classes.
+
+    Returns:
+        np.ndarray: DICE on all classes.
+        float: mean DICE of all classes.
+    """
+    intersect_area = intersect_area.numpy()
+    pred_area = pred_area.numpy()
+    label_area = label_area.numpy()
+    union = pred_area + label_area
+    class_dice = []
+    for i in range(len(intersect_area)):
+        if union[i] == 0:
+            dice = 0
+        else:
+            dice = (2 * intersect_area[i]) / union[i]
+        class_dice.append(dice)
+    mdice = np.mean(class_dice)
+    return np.array(class_dice), mdice
+
+
+# Deprecated: please use the class_measurement function instead.
+def accuracy(intersect_area, pred_area):
+    """
+    Calculate accuracy.
+
+    Args:
+        intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.
+        pred_area (Tensor): The prediction area on all classes.
+
+    Returns:
+        np.ndarray: accuracy on all classes.
+        float: mean accuracy.
+    """
+    intersect_area = intersect_area.numpy()
+    pred_area = pred_area.numpy()
+    class_acc = []
+    for i in range(len(intersect_area)):
+        if pred_area[i] == 0:
+            acc = 0
+        else:
+            acc = intersect_area[i] / pred_area[i]
+        class_acc.append(acc)
+    macc = np.sum(intersect_area) / np.sum(pred_area)
+    return np.array(class_acc), macc
+
+
+def class_measurement(intersect_area, pred_area, label_area):
+    """
+    Calculate accuracy, class precision and class recall.
+
+    Args:
+        intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.
+        pred_area (Tensor): The prediction area on all classes.
+        label_area (Tensor): The ground truth area on all classes.
+
+    Returns:
+        float: The mean accuracy.
+        np.ndarray: The precision of all classes.
+        np.ndarray: The recall of all classes.
+    """
+    intersect_area = intersect_area.numpy()
+    pred_area = pred_area.numpy()
+    label_area = label_area.numpy()
+
+    mean_acc = np.sum(intersect_area) / np.sum(pred_area)
+    class_precision = []
+    class_recall = []
+    for i in range(len(intersect_area)):
+        precision = 0 if pred_area[i] == 0 \
+            else intersect_area[i] / pred_area[i]
+        recall = 0 if label_area[i] == 0 \
+            else intersect_area[i] / label_area[i]
+        class_precision.append(precision)
+        class_recall.append(recall)
+
+    return mean_acc, np.array(class_precision), np.array(class_recall)
+
+
+def kappa(intersect_area, pred_area, label_area):
+    """
+    Calculate the kappa coefficient.
+
+    Args:
+        intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.
+        pred_area (Tensor): The prediction area on all classes.
+        label_area (Tensor): The ground truth area on all classes.
+ + Returns: + float: kappa coefficient. + """ + intersect_area = intersect_area.numpy().astype(np.float64) + pred_area = pred_area.numpy().astype(np.float64) + label_area = label_area.numpy().astype(np.float64) + total_area = np.sum(label_area) + po = np.sum(intersect_area) / total_area + pe = np.sum(pred_area * label_area) / (total_area * total_area) + kappa = (po - pe) / (1 - pe) + return kappa diff --git a/paddleseg/utils/op_flops_funs.py b/paddleseg/utils/op_flops_funs.py new file mode 100644 index 0000000000000000000000000000000000000000..28353d84f69b2184e8e81ce27dd5d66d1b90d76b --- /dev/null +++ b/paddleseg/utils/op_flops_funs.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Implement the counting flops functions for some ops. +""" + + +def count_syncbn(m, x, y): + x = x[0] + nelements = x.numel() + m.total_ops += int(2 * nelements) diff --git a/paddleseg/utils/progbar.py b/paddleseg/utils/progbar.py new file mode 100644 index 0000000000000000000000000000000000000000..98c3389131886377d0121c529dd169ba464fe7a4 --- /dev/null +++ b/paddleseg/utils/progbar.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time + +import numpy as np + + +class Progbar(object): + """ + Displays a progress bar. + It refers to https://github.com/keras-team/keras/blob/keras-2/keras/utils/generic_utils.py + + Args: + target (int): Total number of steps expected, None if unknown. + width (int): Progress bar width on screen. + verbose (int): Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + stateful_metrics (list|tuple): Iterable of string names of metrics that should *not* be + averaged over time. Metrics in this list will be displayed as-is. All + others will be averaged by the progbar before display. + interval (float): Minimum visual progress update interval (in seconds). + unit_name (str): Display name for step counts (usually "step" or "sample"). 
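+
+    A typical loop (illustrative values):
+
+        progbar = Progbar(target=100)
+        for step in range(100):
+            progbar.update(step + 1, values=[('loss', 0.3)])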
+ """ + + def __init__(self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name='step'): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + + self._dynamic_display = ( + (hasattr(sys.stderr, 'isatty') and + sys.stderr.isatty()) or 'ipykernel' in sys.modules or + 'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + + def update(self, current, values=None, finalize=None): + """ + Updates the progress bar. + + Args: + current (int): Index of current step. + values (list): List of tuples: `(name, value_for_last_step)`. If `name` is in + `stateful_metrics`, `value_for_last_step` will be displayed as-is. + Else, an average of the metric over time will be displayed. + finalize (bool): Whether this is the last update for the progress bar. If + `None`, defaults to `current >= self.target`. + """ + + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in the first + # epoch, both on_batch_end and on_epoch_end will be called, which will + # cause 'current' and 'self._seen_so_far' to have the same value. Force + # the minimal value to 1 here, otherwise stateful_metric will be 0s. + value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + now = time.time() + info = ' - %.0fs' % (now - self._start) + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + sys.stderr.write('\b' * prev_total_width) + sys.stderr.write('\r') + else: + sys.stderr.write('\n') + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += ('=' * (prog_width - 1)) + if current < self.target: + bar += '>' + else: + bar += '=' + bar += ('.' 
* (self.width - prog_width)) + bar += ']' + else: + bar = '%7d/Unknown' % current + + self._total_width = len(bar) + sys.stderr.write(bar) + + if current: + time_per_unit = (now - self._start) / current + else: + time_per_unit = 0 + + if self.target is None or finalize: + if time_per_unit >= 1 or time_per_unit == 0: + info += ' %.0fs/%s' % (time_per_unit, self.unit_name) + elif time_per_unit >= 1e-3: + info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) + else: + info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = '%d:%02d:%02d' % (eta // 3600, + (eta % 3600) // 60, eta % 60) + elif eta > 60: + eta_format = '%d:%02d' % (eta // 60, eta % 60) + else: + eta_format = '%ds' % eta + + info = ' - ETA: %s' % eta_format + + for k in self._values_order: + info += ' - %s:' % k + if isinstance(self._values[k], list): + avg = np.mean(self._values[k][0] / + max(1, self._values[k][1])) + if abs(avg) > 1e-3: + info += ' %.4f' % avg + else: + info += ' %.4e' % avg + else: + info += ' %s' % self._values[k] + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += (' ' * (prev_total_width - self._total_width)) + + if finalize: + info += '\n' + + sys.stderr.write(info) + sys.stderr.flush() + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) + info = count + info + for k in self._values_order: + info += ' - %s:' % k + avg = np.mean(self._values[k][0] / + max(1, self._values[k][1])) + if avg > 1e-3: + info += ' %.4f' % avg + else: + info += ' %.4e' % avg + info += '\n' + + sys.stderr.write(info) + sys.stderr.flush() + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) diff --git a/paddleseg/utils/timer.py b/paddleseg/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d74670d1cf1ae914d1db4de807e21889dcabb6 --- /dev/null +++ b/paddleseg/utils/timer.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
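
For orientation before the implementation: a minimal, self-contained usage sketch of the two helpers this module defines. The loop body and the numbers are stand-ins; the import path matches how train.py in this repo imports them.

```python
import time

from paddleseg.utils import TimeAverager, calculate_eta

averager = TimeAverager()
start = time.time()
for step in range(10):
    time.sleep(0.01)  # stand-in for one training step
    # record() accumulates wall time; num_samples feeds get_ips_average()
    averager.record(time.time() - start, num_samples=8)
    start = time.time()

speed = averager.get_average()    # seconds per step
ips = averager.get_ips_average()  # samples per second
eta = calculate_eta(990, speed)   # "HH:MM:SS" for the remaining steps
print(speed, ips, eta)
```
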
+ +import time + + +class TimeAverager(object): + def __init__(self): + self.reset() + + def reset(self): + self._cnt = 0 + self._total_time = 0 + self._total_samples = 0 + + def record(self, usetime, num_samples=None): + self._cnt += 1 + self._total_time += usetime + if num_samples: + self._total_samples += num_samples + + def get_average(self): + if self._cnt == 0: + return 0 + return self._total_time / float(self._cnt) + + def get_ips_average(self): + if not self._total_samples or self._cnt == 0: + return 0 + return float(self._total_samples) / self._total_time + + +def calculate_eta(remaining_step, speed): + if remaining_step < 0: + remaining_step = 0 + remaining_time = int(remaining_step * speed) + result = "{:0>2}:{:0>2}:{:0>2}" + arr = [] + for i in range(2, -1, -1): + arr.append(int(remaining_time / 60**i)) + remaining_time %= 60**i + return result.format(*arr) diff --git a/paddleseg/utils/train_profiler.py b/paddleseg/utils/train_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..4b4d53b84901ba86cef32b29f719d73b36fcce88 --- /dev/null +++ b/paddleseg/utils/train_profiler.py @@ -0,0 +1,112 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. 
+ ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + + if options_str != "": + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/paddleseg/utils/utils.py b/paddleseg/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..526d04e8380b4dd523781855a257bc778904273f --- /dev/null +++ b/paddleseg/utils/utils.py @@ -0,0 +1,176 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
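
A short sketch of the shape-checked partial load that `load_pretrained_model` below performs; the tiny model and the checkpoint path are stand-ins, and the module is assumed importable as `paddleseg.utils.utils`:

```python
import os

import paddle
from paddleseg.utils.utils import load_pretrained_model

# Stand-in model and checkpoint; any nn.Layer and .pdparams file work.
model = paddle.nn.Sequential(paddle.nn.Linear(4, 2))
os.makedirs('/tmp/demo', exist_ok=True)
paddle.save(model.state_dict(), '/tmp/demo/model.pdparams')

# Parameters are copied one by one: a key must exist in the checkpoint and
# its shape must match, otherwise it is skipped with a warning.
load_pretrained_model(model, '/tmp/demo/model.pdparams')
```
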
+ +import contextlib +import filelock +import os +import tempfile +import numpy as np +import random +from urllib.parse import urlparse, unquote + +import paddle + +from paddleseg.utils import logger, seg_env +from paddleseg.utils.download import download_file_and_uncompress + + +@contextlib.contextmanager +def generate_tempdir(directory: str=None, **kwargs): + '''Generate a temporary directory''' + directory = seg_env.TMP_HOME if not directory else directory + with tempfile.TemporaryDirectory(dir=directory, **kwargs) as _dir: + yield _dir + + +def load_entire_model(model, pretrained): + if pretrained is not None: + load_pretrained_model(model, pretrained) + else: + logger.warning('Not all pretrained params of {} are loaded, ' \ + 'training from scratch or a pretrained backbone.'.format(model.__class__.__name__)) + + +def download_pretrained_model(pretrained_model): + """ + Download pretrained model from url. + Args: + pretrained_model (str): the url of pretrained weight + Returns: + str: the path of pretrained weight + """ + assert urlparse(pretrained_model).netloc, "The url is not valid." + + pretrained_model = unquote(pretrained_model) + savename = pretrained_model.split('/')[-1] + if not savename.endswith(('tgz', 'tar.gz', 'tar', 'zip')): + savename = pretrained_model.split('/')[-2] + else: + savename = savename.split('.')[0] + + with generate_tempdir() as _dir: + with filelock.FileLock(os.path.join(seg_env.TMP_HOME, savename)): + pretrained_model = download_file_and_uncompress( + pretrained_model, + savepath=_dir, + extrapath=seg_env.PRETRAINED_MODEL_HOME, + extraname=savename) + pretrained_model = os.path.join(pretrained_model, 'model.pdparams') + return pretrained_model + + +def load_pretrained_model(model, pretrained_model): + if pretrained_model is not None: + logger.info('Loading pretrained model from {}'.format(pretrained_model)) + + if urlparse(pretrained_model).netloc: + pretrained_model = download_pretrained_model(pretrained_model) + + if os.path.exists(pretrained_model): + para_state_dict = paddle.load(pretrained_model) + + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + logger.warning("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + logger.warning( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + model.set_dict(model_state_dict) + logger.info("There are {}/{} variables loaded into {}.".format( + num_params_loaded, + len(model_state_dict), model.__class__.__name__)) + + else: + raise ValueError('The pretrained model directory is not Found: {}'. + format(pretrained_model)) + else: + logger.info( + 'No pretrained model to load, {} will be trained from scratch.'. 
+ format(model.__class__.__name__)) + + +def resume(model, optimizer, resume_model): + if resume_model is not None: + logger.info('Resume model from {}'.format(resume_model)) + if os.path.exists(resume_model): + resume_model = os.path.normpath(resume_model) + ckpt_path = os.path.join(resume_model, 'model.pdparams') + para_state_dict = paddle.load(ckpt_path) + ckpt_path = os.path.join(resume_model, 'model.pdopt') + opti_state_dict = paddle.load(ckpt_path) + model.set_state_dict(para_state_dict) + optimizer.set_state_dict(opti_state_dict) + + iter = resume_model.split('_')[-1] + iter = int(iter) + return iter + else: + raise ValueError( + 'Directory of the model needed to resume is not Found: {}'. + format(resume_model)) + else: + logger.info('No model needed to resume.') + + +def worker_init_fn(worker_id): + np.random.seed(random.randint(0, 100000)) + + +def get_image_list(image_path): + """Get image list""" + valid_suffix = [ + '.JPEG', '.jpeg', '.JPG', '.jpg', '.BMP', '.bmp', '.PNG', '.png' + ] + image_list = [] + image_dir = None + if os.path.isfile(image_path): + if os.path.splitext(image_path)[-1] in valid_suffix: + image_list.append(image_path) + else: + image_dir = os.path.dirname(image_path) + with open(image_path, 'r') as f: + for line in f: + line = line.strip() + if len(line.split()) > 1: + line = line.split()[0] + image_list.append(os.path.join(image_dir, line)) + elif os.path.isdir(image_path): + image_dir = image_path + for root, dirs, files in os.walk(image_path): + for f in files: + if '.ipynb_checkpoints' in root: + continue + if f.startswith('.'): + continue + if os.path.splitext(f)[-1] in valid_suffix: + image_list.append(os.path.join(root, f)) + else: + raise FileNotFoundError( + '`--image_path` is not found. it should be a path of image, or a file list containing image paths, or a directory including images.' + ) + + if len(image_list) == 0: + raise RuntimeError( + 'There are not image file in `--image_path`={}'.format(image_path)) + + return image_list, image_dir diff --git a/paddleseg/utils/visualize.py b/paddleseg/utils/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..27211c41133e4049a4247b9e3ec45e608b5e46a3 --- /dev/null +++ b/paddleseg/utils/visualize.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import cv2 +import numpy as np +from PIL import Image as PILImage + + +def visualize(image, result, color_map, save_dir=None, weight=0.6): + """ + Convert predict result to color image, and save added image. + + Args: + image (str): The path of origin image. + result (np.ndarray): The predict result of image. + color_map (list): The color used to save the prediction results. + save_dir (str): The directory for saving visual image. Default: None. + weight (float): The image weight of visual image, and the result weight is (1 - weight). Default: 0.6 + + Returns: + vis_result (np.ndarray): If `save_dir` is None, return the visualized result. 
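+
+    A typical call (paths and arrays are stand-ins; `result` must be a uint8
+    label map with the same height and width as the image):
+
+        color_map = get_color_map_list(256)
+        overlay = visualize('person.jpg', pred_mask, color_map, weight=0.6)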
+ """ + + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + color_map = np.array(color_map).astype("uint8") + # Use OpenCV LUT for color mapping + c1 = cv2.LUT(result, color_map[:, 0]) + c2 = cv2.LUT(result, color_map[:, 1]) + c3 = cv2.LUT(result, color_map[:, 2]) + pseudo_img = np.dstack((c3, c2, c1)) + + im = cv2.imread(image) + vis_result = cv2.addWeighted(im, weight, pseudo_img, 1 - weight, 0) + + if save_dir is not None: + if not os.path.exists(save_dir): + os.makedirs(save_dir) + image_name = os.path.split(image)[-1] + out_path = os.path.join(save_dir, image_name) + cv2.imwrite(out_path, vis_result) + else: + return vis_result + + +def get_pseudo_color_map(pred, color_map=None): + """ + Get the pseudo color image. + + Args: + pred (numpy.ndarray): the origin predicted image. + color_map (list, optional): the palette color map. Default: None, + use paddleseg's default color map. + + Returns: + (numpy.ndarray): the pseduo image. + """ + pred_mask = PILImage.fromarray(pred.astype(np.uint8), mode='P') + if color_map is None: + color_map = get_color_map_list(256) + pred_mask.putpalette(color_map) + return pred_mask + + +def get_color_map_list(num_classes, custom_color=None): + """ + Returns the color map for visualizing the segmentation mask, + which can support arbitrary number of classes. + + Args: + num_classes (int): Number of classes. + custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. + + Returns: + (list). The color map. + """ + + num_classes += 1 + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = color_map[3:] + + if custom_color: + color_map[:len(custom_color)] = custom_color + return color_map + + +def paste_images(image_list): + """ + Paste all image to a image. + Args: + image_list (List or Tuple): The images to be pasted and their size are the same. + Returns: + result_img (PIL.Image): The pasted image. + """ + assert isinstance(image_list, + (list, tuple)), "image_list should be a list or tuple" + assert len( + image_list) > 1, "The length of image_list should be greater than 1" + + pil_img_list = [] + for img in image_list: + if isinstance(img, str): + assert os.path.exists(img), "The image is not existed: {}".format( + img) + img = PILImage.open(img) + img = np.array(img) + elif isinstance(img, np.ndarray): + img = PILImage.fromarray(img) + pil_img_list.append(img) + + sample_img = pil_img_list[0] + size = sample_img.size + for img in pil_img_list: + assert size == img.size, "The image size in image_list should be the same" + + width, height = sample_img.size + result_img = PILImage.new(sample_img.mode, + (width * len(pil_img_list), height)) + for i, img in enumerate(pil_img_list): + result_img.paste(img, box=(width * i, 0)) + + return result_img diff --git a/ppmatting/__init__.py b/ppmatting/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c1094808e27aa683fc3b5766e9968712b3021532 --- /dev/null +++ b/ppmatting/__init__.py @@ -0,0 +1 @@ +from . 
import ml, metrics, transforms, datasets, models diff --git a/ppmatting/core/__init__.py b/ppmatting/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78060ba48aac1fd7d8cb32eccc7ccddadd74017f --- /dev/null +++ b/ppmatting/core/__init__.py @@ -0,0 +1,4 @@ +from .val import evaluate +from .val_ml import evaluate_ml +from .train import train +from .predict import predict \ No newline at end of file diff --git a/ppmatting/core/predict.py b/ppmatting/core/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..e4a7457caf304f222031cc2757cdf32b6207fe3f --- /dev/null +++ b/ppmatting/core/predict.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import time + +import cv2 +import numpy as np +import paddle +import paddle.nn.functional as F +from paddleseg import utils +from paddleseg.core import infer +from paddleseg.utils import logger, progbar, TimeAverager + +from ppmatting.utils import mkdir, estimate_foreground_ml + + +def partition_list(arr, m): + """split the list 'arr' into m pieces""" + n = int(math.ceil(len(arr) / float(m))) + return [arr[i:i + n] for i in range(0, len(arr), n)] + + +def save_result(alpha, path, im_path, trimap=None, fg_estimate=True): + """ + The value of alpha is range [0, 1], shape should be [h,w] + """ + dirname = os.path.dirname(path) + if not os.path.exists(dirname): + os.makedirs(dirname) + basename = os.path.basename(path) + name = os.path.splitext(basename)[0] + alpha_save_path = os.path.join(dirname, name + '_alpha.png') + rgba_save_path = os.path.join(dirname, name + '_rgba.png') + + # save alpha matte + if trimap is not None: + trimap = cv2.imread(trimap, 0) + alpha[trimap == 0] = 0 + alpha[trimap == 255] = 255 + alpha = (alpha).astype('uint8') + cv2.imwrite(alpha_save_path, alpha) + + # save rgba + im = cv2.imread(im_path) + if fg_estimate: + fg = estimate_foreground_ml(im / 255.0, alpha / 255.0) * 255 + else: + fg = im + fg = fg.astype('uint8') + alpha = alpha[:, :, np.newaxis] + rgba = np.concatenate((fg, alpha), axis=-1) + cv2.imwrite(rgba_save_path, rgba) + + return fg + + +def reverse_transform(alpha, trans_info): + """recover pred to origin shape""" + for item in trans_info[::-1]: + if item[0] == 'resize': + h, w = item[1][0], item[1][1] + alpha = F.interpolate(alpha, [h, w], mode='bilinear') + elif item[0] == 'padding': + h, w = item[1][0], item[1][1] + alpha = alpha[:, :, 0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format(item[0])) + return alpha + + +def preprocess(img, transforms, trimap=None): + data = {} + data['img'] = img + if trimap is not None: + data['trimap'] = trimap + data['gt_fields'] = ['trimap'] + data['trans_info'] = [] + data = transforms(data) + data['img'] = paddle.to_tensor(data['img']) + data['img'] = data['img'].unsqueeze(0) + if trimap is not None: + data['trimap'] = paddle.to_tensor(data['trimap']) + data['trimap'] = 
data['trimap'].unsqueeze((0, 1)) + + return data + + +def predict(model, + model_path, + transforms, + image_list, + image_dir=None, + trimap_list=None, + save_dir='output', + fg_estimate=True): + """ + predict and visualize the image_list. + + Args: + model (nn.Layer): Used to predict for input image. + model_path (str): The path of pretrained model. + transforms (transforms.Compose): Preprocess for input image. + image_list (list): A list of image path to be predicted. + image_dir (str, optional): The root directory of the images predicted. Default: None. + trimap_list (list, optional): A list of trimap of image_list. Default: None. + save_dir (str, optional): The directory to save the visualized results. Default: 'output'. + """ + utils.utils.load_entire_model(model, model_path) + model.eval() + nranks = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + if nranks > 1: + img_lists = partition_list(image_list, nranks) + trimap_lists = partition_list( + trimap_list, nranks) if trimap_list is not None else None + else: + img_lists = [image_list] + trimap_lists = [trimap_list] if trimap_list is not None else None + + logger.info("Start to predict...") + progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1) + preprocess_cost_averager = TimeAverager() + infer_cost_averager = TimeAverager() + postprocess_cost_averager = TimeAverager() + batch_start = time.time() + with paddle.no_grad(): + for i, im_path in enumerate(img_lists[local_rank]): + preprocess_start = time.time() + trimap = trimap_lists[local_rank][ + i] if trimap_list is not None else None + data = preprocess(img=im_path, transforms=transforms, trimap=trimap) + preprocess_cost_averager.record(time.time() - preprocess_start) + + infer_start = time.time() + alpha_pred = model(data) + infer_cost_averager.record(time.time() - infer_start) + + postprocess_start = time.time() + alpha_pred = reverse_transform(alpha_pred, data['trans_info']) + alpha_pred = (alpha_pred.numpy()).squeeze() + alpha_pred = (alpha_pred * 255).astype('uint8') + + # get the saved name + if image_dir is not None: + im_file = im_path.replace(image_dir, '') + else: + im_file = os.path.basename(im_path) + if im_file[0] == '/' or im_file[0] == '\\': + im_file = im_file[1:] + + save_path = os.path.join(save_dir, im_file) + mkdir(save_path) + fg = save_result( + alpha_pred, + save_path, + im_path=im_path, + trimap=trimap, + fg_estimate=fg_estimate) + + postprocess_cost_averager.record(time.time() - postprocess_start) + + preprocess_cost = preprocess_cost_averager.get_average() + infer_cost = infer_cost_averager.get_average() + postprocess_cost = postprocess_cost_averager.get_average() + if local_rank == 0: + progbar_pred.update(i + 1, + [('preprocess_cost', preprocess_cost), + ('infer_cost cost', infer_cost), + ('postprocess_cost', postprocess_cost)]) + + preprocess_cost_averager.reset() + infer_cost_averager.reset() + postprocess_cost_averager.reset() + return alpha_pred, fg diff --git a/ppmatting/core/train.py b/ppmatting/core/train.py new file mode 100644 index 0000000000000000000000000000000000000000..11a536f8c40d93e5e0bcd03eea9007ae0a5054e2 --- /dev/null +++ b/ppmatting/core/train.py @@ -0,0 +1,313 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+from collections import deque, defaultdict
+import pickle
+import shutil
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddleseg.utils import TimeAverager, calculate_eta, resume, logger
+
+from .val import evaluate
+
+
+def visual_in_traning(log_writer, vis_dict, step):
+    """
+    Visualize in VisualDL.
+
+    Args:
+        log_writer (LogWriter): The log writer of VisualDL.
+        vis_dict (dict): Dict of tensors. The shape of each tensor is (C, H, W).
+    """
+    for key, value in vis_dict.items():
+        value_shape = value.shape
+        if value_shape[0] not in [1, 3]:
+            value = value[0]
+            value = value.unsqueeze(0)
+        value = paddle.transpose(value, (1, 2, 0))
+        min_v = paddle.min(value)
+        max_v = paddle.max(value)
+        if (min_v > 0) and (max_v < 1):
+            value = value * 255
+        elif (min_v < 0 and min_v >= -1) and (max_v <= 1):
+            value = (1 + value) / 2 * 255
+        else:
+            value = (value - min_v) / (max_v - min_v) * 255
+
+        value = value.astype('uint8')
+        value = value.numpy()
+        log_writer.add_image(tag=key, img=value, step=step)
+
+
+def save_best(best_model_dir, metrics_data, iter):
+    with open(os.path.join(best_model_dir, 'best_metrics.txt'), 'w') as f:
+        for key, value in metrics_data.items():
+            line = key + ' ' + str(value) + '\n'
+            f.write(line)
+        f.write('iter' + ' ' + str(iter) + '\n')
+
+
+def get_best(best_file, metrics, resume_model=None):
+    '''Get best metrics and iter from file'''
+    best_metrics_data = {}
+    if os.path.exists(best_file) and (resume_model is not None):
+        values = []
+        with open(best_file, 'r') as f:
+            lines = f.readlines()
+            for line in lines:
+                line = line.strip()
+                key, value = line.split(' ')
+                best_metrics_data[key] = eval(value)
+                if key == 'iter':
+                    best_iter = eval(value)
+    else:
+        for key in metrics:
+            best_metrics_data[key] = np.inf
+        best_iter = -1
+    return best_metrics_data, best_iter
+
+
+def train(model,
+          train_dataset,
+          val_dataset=None,
+          optimizer=None,
+          save_dir='output',
+          iters=10000,
+          batch_size=2,
+          resume_model=None,
+          save_interval=1000,
+          log_iters=10,
+          log_image_iters=1000,
+          num_workers=0,
+          use_vdl=False,
+          losses=None,
+          keep_checkpoint_max=5,
+          eval_begin_iters=None,
+          metrics='sad'):
+    """
+    Launch training.
+    Args:
+        model(nn.Layer): A matting model.
+        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
+        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
+        optimizer (paddle.optimizer.Optimizer): The optimizer.
+        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
+        iters (int, optional): How many iters to train the model. Default: 10000.
+        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
+        resume_model (str, optional): The path of the model to resume from.
+        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
+        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
+        log_image_iters (int, optional): Log image to vdl. Default: 1000.
+        num_workers (int, optional): Num workers for data loader. Default: 0.
+ use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False. + losses (dict, optional): A dict of loss, refer to the loss function of the model for details. Default: None. + keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5. + eval_begin_iters (int): The iters begin evaluation. It will evaluate at iters/2 if it is None. Defalust: None. + metrics(str|list, optional): The metrics to evaluate, it may be the combination of ("sad", "mse", "grad", "conn"). + """ + model.train() + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + + start_iter = 0 + if resume_model is not None: + start_iter = resume(model, optimizer, resume_model) + + if not os.path.isdir(save_dir): + if os.path.exists(save_dir): + os.remove(save_dir) + os.makedirs(save_dir) + + if nranks > 1: + # Initialize parallel environment if not done. + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized( + ): + paddle.distributed.init_parallel_env() + ddp_model = paddle.DataParallel(model) + else: + ddp_model = paddle.DataParallel(model) + + batch_sampler = paddle.io.DistributedBatchSampler( + train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) + + loader = paddle.io.DataLoader( + train_dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + return_list=True, ) + + if use_vdl: + from visualdl import LogWriter + log_writer = LogWriter(save_dir) + + if isinstance(metrics, str): + metrics = [metrics] + elif not isinstance(metrics, list): + metrics = ['sad'] + best_metrics_data, best_iter = get_best( + os.path.join(save_dir, 'best_model', 'best_metrics.txt'), + metrics, + resume_model=resume_model) + avg_loss = defaultdict(float) + iters_per_epoch = len(batch_sampler) + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + save_models = deque() + batch_start = time.time() + + iter = start_iter + while iter < iters: + for data in loader: + iter += 1 + if iter > iters: + break + reader_cost_averager.record(time.time() - batch_start) + + logit_dict, loss_dict = ddp_model(data) if nranks > 1 else model( + data) + + loss_dict['all'].backward() + + optimizer.step() + lr = optimizer.get_lr() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + model.clear_gradients() + + for key, value in loss_dict.items(): + avg_loss[key] += value.numpy()[0] + batch_cost_averager.record( + time.time() - batch_start, num_samples=batch_size) + + if (iter) % log_iters == 0 and local_rank == 0: + for key, value in avg_loss.items(): + avg_loss[key] = value / log_iters + remain_iters = iters - iter + avg_train_batch_cost = batch_cost_averager.get_average() + avg_train_reader_cost = reader_cost_averager.get_average() + eta = calculate_eta(remain_iters, avg_train_batch_cost) + # loss info + loss_str = ' ' * 26 + '\t[LOSSES]' + loss_str = loss_str + for key, value in avg_loss.items(): + if key != 'all': + loss_str = loss_str + ' ' + key + '={:.4f}'.format( + value) + logger.info( + "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f}, ips={:.4f} samples/sec | ETA {}\n{}\n" + .format((iter - 1) // iters_per_epoch + 1, iter, iters, + avg_loss['all'], lr, avg_train_batch_cost, + avg_train_reader_cost, + batch_cost_averager.get_ips_average( + ), eta, loss_str)) + if use_vdl: + for key, value in avg_loss.items(): + log_tag = 'Train/' + key + 
log_writer.add_scalar(log_tag, value, iter) + + log_writer.add_scalar('Train/lr', lr, iter) + log_writer.add_scalar('Train/batch_cost', + avg_train_batch_cost, iter) + log_writer.add_scalar('Train/reader_cost', + avg_train_reader_cost, iter) + if iter % log_image_iters == 0: + vis_dict = {} + # ground truth + vis_dict['ground truth/img'] = data['img'][0] + for key in data['gt_fields']: + key = key[0] + vis_dict['/'.join(['ground truth', key])] = data[ + key][0] + # predict + for key, value in logit_dict.items(): + vis_dict['/'.join(['predict', key])] = logit_dict[ + key][0] + visual_in_traning( + log_writer=log_writer, vis_dict=vis_dict, step=iter) + + for key in avg_loss.keys(): + avg_loss[key] = 0. + reader_cost_averager.reset() + batch_cost_averager.reset() + + # save model + if (iter % save_interval == 0 or iter == iters) and local_rank == 0: + current_save_dir = os.path.join(save_dir, + "iter_{}".format(iter)) + if not os.path.isdir(current_save_dir): + os.makedirs(current_save_dir) + paddle.save(model.state_dict(), + os.path.join(current_save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(current_save_dir, 'model.pdopt')) + save_models.append(current_save_dir) + if len(save_models) > keep_checkpoint_max > 0: + model_to_remove = save_models.popleft() + shutil.rmtree(model_to_remove) + + # eval model + if eval_begin_iters is None: + eval_begin_iters = iters // 2 + if (iter % save_interval == 0 or iter == iters) and ( + val_dataset is not None + ) and local_rank == 0 and iter >= eval_begin_iters: + num_workers = 1 if num_workers > 0 else 0 + metrics_data = evaluate( + model, + val_dataset, + num_workers=1, + print_detail=True, + save_results=False, + metrics=metrics) + model.train() + + # save best model and add evaluation results to vdl + if (iter % save_interval == 0 or iter == iters) and local_rank == 0: + if val_dataset is not None and iter >= eval_begin_iters: + if metrics_data[metrics[0]] < best_metrics_data[metrics[0]]: + best_iter = iter + best_metrics_data = metrics_data.copy() + best_model_dir = os.path.join(save_dir, "best_model") + paddle.save( + model.state_dict(), + os.path.join(best_model_dir, 'model.pdparams')) + save_best(best_model_dir, best_metrics_data, iter) + + show_list = [] + for key, value in best_metrics_data.items(): + show_list.append((key, value)) + log_str = '[EVAL] The model with the best validation {} ({:.4f}) was saved at iter {}. while'.format( + show_list[0][0], show_list[0][1], best_iter) + for i in range(1, len(show_list)): + log_str = log_str + ' {}: {:.4f},'.format( + show_list[i][0], show_list[i][1]) + log_str = log_str[:-1] + logger.info(log_str) + + if use_vdl: + for key, value in metrics_data.items(): + log_writer.add_scalar('Evaluate/' + key, value, + iter) + + batch_start = time.time() + + # Sleep for half a second to let dataloader release resources. + time.sleep(0.5) + if use_vdl: + log_writer.close() diff --git a/ppmatting/core/val.py b/ppmatting/core/val.py new file mode 100644 index 0000000000000000000000000000000000000000..3e3117725ab3792fc7a2344082ad45f26cb2cd28 --- /dev/null +++ b/ppmatting/core/val.py @@ -0,0 +1,162 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import cv2 +import numpy as np +import time +import paddle +import paddle.nn.functional as F +from paddleseg.utils import TimeAverager, calculate_eta, logger, progbar + +from ppmatting.metrics import metrics_class_dict + +np.set_printoptions(suppress=True) + + +def save_alpha_pred(alpha, path): + """ + The value of alpha is range [0, 1], shape should be [h,w] + """ + dirname = os.path.dirname(path) + if not os.path.exists(dirname): + os.makedirs(dirname) + + alpha = (alpha).astype('uint8') + cv2.imwrite(path, alpha) + + +def reverse_transform(alpha, trans_info): + """recover pred to origin shape""" + for item in trans_info[::-1]: + if item[0][0] == 'resize': + h, w = item[1][0], item[1][1] + alpha = F.interpolate(alpha, [h, w], mode='bilinear') + elif item[0][0] == 'padding': + h, w = item[1][0], item[1][1] + alpha = alpha[:, :, 0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format(item[0])) + return alpha + + +def evaluate(model, + eval_dataset, + num_workers=0, + print_detail=True, + save_dir='output/results', + save_results=True, + metrics='sad'): + model.eval() + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + if nranks > 1: + # Initialize parallel environment if not done. + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized( + ): + paddle.distributed.init_parallel_env() + + loader = paddle.io.DataLoader( + eval_dataset, + batch_size=1, + drop_last=False, + num_workers=num_workers, + return_list=True, ) + + total_iters = len(loader) + # Get metric instances and data saving + metrics_ins = {} + metrics_data = {} + if isinstance(metrics, str): + metrics = [metrics] + elif not isinstance(metrics, list): + metrics = ['sad'] + for key in metrics: + key = key.lower() + metrics_ins[key] = metrics_class_dict[key]() + metrics_data[key] = None + + if print_detail: + logger.info("Start evaluating (total_samples: {}, total_iters: {})...". 
+ format(len(eval_dataset), total_iters)) + progbar_val = progbar.Progbar( + target=total_iters, verbose=1 if nranks < 2 else 2) + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + batch_start = time.time() + + img_name = '' + i = 0 + with paddle.no_grad(): + for iter, data in enumerate(loader): + reader_cost_averager.record(time.time() - batch_start) + alpha_pred = model(data) + + alpha_pred = reverse_transform(alpha_pred, data['trans_info']) + alpha_pred = alpha_pred.numpy() + + alpha_gt = data['alpha'].numpy() * 255 + trimap = data.get('ori_trimap') + if trimap is not None: + trimap = trimap.numpy().astype('uint8') + alpha_pred = np.round(alpha_pred * 255) + for key in metrics_ins.keys(): + metrics_data[key] = metrics_ins[key].update(alpha_pred, + alpha_gt, trimap) + + if save_results: + alpha_pred_one = alpha_pred[0].squeeze() + if trimap is not None: + trimap = trimap.squeeze().astype('uint8') + alpha_pred_one[trimap == 255] = 255 + alpha_pred_one[trimap == 0] = 0 + + save_name = data['img_name'][0] + name, ext = os.path.splitext(save_name) + if save_name == img_name: + save_name = name + '_' + str(i) + ext + i += 1 + else: + img_name = save_name + save_name = name + '_' + str(i) + ext + i = 1 + + save_alpha_pred(alpha_pred_one, + os.path.join(save_dir, save_name)) + + batch_cost_averager.record( + time.time() - batch_start, num_samples=len(alpha_gt)) + batch_cost = batch_cost_averager.get_average() + reader_cost = reader_cost_averager.get_average() + + if local_rank == 0 and print_detail: + show_list = [(k, v) for k, v in metrics_data.items()] + show_list = show_list + [('batch_cost', batch_cost), + ('reader cost', reader_cost)] + progbar_val.update(iter + 1, show_list) + + reader_cost_averager.reset() + batch_cost_averager.reset() + batch_start = time.time() + + for key in metrics_ins.keys(): + metrics_data[key] = metrics_ins[key].evaluate() + log_str = '[EVAL] ' + for key, value in metrics_data.items(): + log_str = log_str + key + ': {:.4f}, '.format(value) + log_str = log_str[:-2] + + logger.info(log_str) + return metrics_data diff --git a/ppmatting/core/val_ml.py b/ppmatting/core/val_ml.py new file mode 100644 index 0000000000000000000000000000000000000000..77628925bec1fa08a4a24de685355cc71157db92 --- /dev/null +++ b/ppmatting/core/val_ml.py @@ -0,0 +1,162 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
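
One step in `evaluate_ml` below is easy to misread: the loader yields CHW tensors normalized with mean = std = 0.5 (per the inline comment in the loop), while the pymatting estimator expects HWC floats in [0, 1]. A self-contained sketch of that inversion:

```python
import numpy as np

# Normalized input: x_norm = (x/255 - 0.5) / 0.5, laid out channel-first.
chw = np.random.uniform(-1.0, 1.0, (3, 4, 4)).astype('float32')

hwc = np.transpose(chw, (1, 2, 0))  # CHW -> HWC
image = hwc * 0.5 + 0.5             # invert the normalization back to [0, 1]

assert image.shape == (4, 4, 3)
assert 0.0 <= image.min() and image.max() <= 1.0
```
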
+ +import os + +import cv2 +import numpy as np +import time +import paddle +import paddle.nn.functional as F +from paddleseg.utils import TimeAverager, calculate_eta, logger, progbar + +from ppmatting.metrics import metric +from pymatting.util.util import load_image, save_image, stack_images +from pymatting.foreground.estimate_foreground_ml import estimate_foreground_ml + +np.set_printoptions(suppress=True) + + +def save_alpha_pred(alpha, path): + """ + The value of alpha is range [0, 1], shape should be [h,w] + """ + dirname = os.path.dirname(path) + if not os.path.exists(dirname): + os.makedirs(dirname) + + alpha = (alpha).astype('uint8') + cv2.imwrite(path, alpha) + + +def reverse_transform(alpha, trans_info): + """recover pred to origin shape""" + for item in trans_info[::-1]: + if item[0][0] == 'resize': + h, w = item[1][0].numpy()[0], item[1][1].numpy()[0] + alpha = cv2.resize(alpha, dsize=(w, h)) + elif item[0][0] == 'padding': + h, w = item[1][0].numpy()[0], item[1][1].numpy()[0] + alpha = alpha[0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format(item[0])) + return alpha + + +def evaluate_ml(model, + eval_dataset, + num_workers=0, + print_detail=True, + save_dir='output/results', + save_results=True): + + loader = paddle.io.DataLoader( + eval_dataset, + batch_size=1, + drop_last=False, + num_workers=num_workers, + return_list=True, ) + + total_iters = len(loader) + mse_metric = metric.MSE() + sad_metric = metric.SAD() + grad_metric = metric.Grad() + conn_metric = metric.Conn() + + if print_detail: + logger.info("Start evaluating (total_samples: {}, total_iters: {})...". + format(len(eval_dataset), total_iters)) + progbar_val = progbar.Progbar(target=total_iters, verbose=1) + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + batch_start = time.time() + + img_name = '' + i = 0 + ignore_cnt = 0 + for iter, data in enumerate(loader): + + reader_cost_averager.record(time.time() - batch_start) + + image_rgb_chw = data['img'].numpy()[0] + image_rgb_hwc = np.transpose(image_rgb_chw, (1, 2, 0)) + trimap = data['trimap'].numpy().squeeze() / 255.0 + image = image_rgb_hwc * 0.5 + 0.5 # reverse normalize (x/255 - mean) / std + + is_fg = trimap >= 0.9 + is_bg = trimap <= 0.1 + + if is_fg.sum() == 0 or is_bg.sum() == 0: + ignore_cnt += 1 + logger.info(str(iter)) + continue + + alpha_pred = model(image, trimap) + + alpha_pred = reverse_transform(alpha_pred, data['trans_info']) + + alpha_gt = data['alpha'].numpy().squeeze() * 255 + + trimap = data['ori_trimap'].numpy().squeeze() + + alpha_pred = np.round(alpha_pred * 255) + mse = mse_metric.update(alpha_pred, alpha_gt, trimap) + sad = sad_metric.update(alpha_pred, alpha_gt, trimap) + grad = grad_metric.update(alpha_pred, alpha_gt, trimap) + conn = conn_metric.update(alpha_pred, alpha_gt, trimap) + + if sad > 1000: + print(data['img_name'][0]) + + if save_results: + alpha_pred_one = alpha_pred + alpha_pred_one[trimap == 255] = 255 + alpha_pred_one[trimap == 0] = 0 + + save_name = data['img_name'][0] + name, ext = os.path.splitext(save_name) + if save_name == img_name: + save_name = name + '_' + str(i) + ext + i += 1 + else: + img_name = save_name + save_name = name + '_' + str(0) + ext + i = 1 + save_alpha_pred(alpha_pred_one, os.path.join(save_dir, save_name)) + + batch_cost_averager.record( + time.time() - batch_start, num_samples=len(alpha_gt)) + batch_cost = batch_cost_averager.get_average() + reader_cost = reader_cost_averager.get_average() + + if print_detail: + progbar_val.update(iter + 1, 
+ [('SAD', sad), ('MSE', mse), ('Grad', grad), + ('Conn', conn), ('batch_cost', batch_cost), + ('reader cost', reader_cost)]) + + reader_cost_averager.reset() + batch_cost_averager.reset() + batch_start = time.time() + + mse = mse_metric.evaluate() + sad = sad_metric.evaluate() + grad = grad_metric.evaluate() + conn = conn_metric.evaluate() + + logger.info('[EVAL] SAD: {:.4f}, MSE: {:.4f}, Grad: {:.4f}, Conn: {:.4f}'. + format(sad, mse, grad, conn)) + logger.info('{}'.format(ignore_cnt)) + + return sad, mse, grad, conn diff --git a/ppmatting/datasets/__init__.py b/ppmatting/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..55febcaefed2e14676cbb0864f8d4cc4c1ef7459 --- /dev/null +++ b/ppmatting/datasets/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .matting_dataset import MattingDataset +from .composition_1k import Composition1K +from .distinctions_646 import Distinctions646 diff --git a/ppmatting/datasets/composition_1k.py b/ppmatting/datasets/composition_1k.py new file mode 100644 index 0000000000000000000000000000000000000000..854b29bed6d91f20616060c3cee50fc21dc5b8f2 --- /dev/null +++ b/ppmatting/datasets/composition_1k.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math + +import cv2 +import numpy as np +import random +import paddle +from paddleseg.cvlibs import manager + +import ppmatting.transforms as T +from ppmatting.datasets.matting_dataset import MattingDataset + + +@manager.DATASETS.add_component +class Composition1K(MattingDataset): + def __init__(self, **kwargs): + super().__init__(**kwargs) diff --git a/ppmatting/datasets/distinctions_646.py b/ppmatting/datasets/distinctions_646.py new file mode 100644 index 0000000000000000000000000000000000000000..d20b08f2e6b2583ef03bfdc2c30e84fcefd02607 --- /dev/null +++ b/ppmatting/datasets/distinctions_646.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+
+import cv2
+import numpy as np
+import random
+import paddle
+from paddleseg.cvlibs import manager
+
+import ppmatting.transforms as T
+from ppmatting.datasets.matting_dataset import MattingDataset
+
+
+@manager.DATASETS.add_component
+class Distinctions646(MattingDataset):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
diff --git a/ppmatting/datasets/matting_dataset.py b/ppmatting/datasets/matting_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d782d6c35acbb583f8fbdc61685e222ff0437996
--- /dev/null
+++ b/ppmatting/datasets/matting_dataset.py
@@ -0,0 +1,251 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+
+import cv2
+import numpy as np
+import random
+import paddle
+from paddleseg.cvlibs import manager
+
+import ppmatting.transforms as T
+
+
+@manager.DATASETS.add_component
+class MattingDataset(paddle.io.Dataset):
+    """
+    Pass in a dataset that conforms to the following format.
+        matting_dataset/
+        |--bg/
+        |
+        |--train/
+        |  |--fg/
+        |  |--alpha/
+        |
+        |--val/
+        |  |--fg/
+        |  |--alpha/
+        |  |--trimap/ (if existing)
+        |
+        |--train.txt
+        |
+        |--val.txt
+    See README.md for more information about the dataset.
+
+    Args:
+        dataset_root(str): The root path of the dataset.
+        transforms(list): Transforms for the image.
+        mode (str, optional): Which part of the dataset to use. It is one of
+            ('train', 'val', 'trainval'). Default: 'train'.
+        train_file (str|list, optional): The file list used for training. Each line should be
+            `foreground_image.png background_image.png` or `foreground_image.png`.
+            It should be provided if `mode` is 'train'. Default: None.
+        val_file (str|list, optional): The file list used for evaluation. Each line should be
+            `foreground_image.png background_image.png`, `foreground_image.png`
+            or `foreground_image.png background_image.png trimap_image.png`.
+            It should be provided if `mode` is 'val'. Default: None.
+        get_trimap (bool, optional): Whether to generate a trimap. Default: True.
+        separator (str, optional): The separator of train_file or val_file. If file names
+            contain spaces, '|' may be a better choice. Default: ' '.
+        key_del (tuple|list, optional): Keys that are not needed and will be deleted to
+            accelerate the data reader. Default: None.
+        if_rssn (bool, optional): Whether to use RSSN (denoising and blurring) when
+            compositing images. Default: False.
+    """
+
+    def __init__(self,
+                 dataset_root,
+                 transforms,
+                 mode='train',
+                 train_file=None,
+                 val_file=None,
+                 get_trimap=True,
+                 separator=' ',
+                 key_del=None,
+                 if_rssn=False):
+        super().__init__()
+        self.dataset_root = dataset_root
+        self.transforms = T.Compose(transforms)
+        self.mode = mode
+        self.get_trimap = get_trimap
+        self.separator = separator
+        self.key_del = key_del
+        self.if_rssn = if_rssn
+
+        # check file
+        if mode == 'train' or mode == 'trainval':
+            if train_file is None:
+                raise ValueError(
+                    "When `mode` is 'train' or 'trainval', `train_file` must be provided!"
+                )
+            if isinstance(train_file, str):
+                train_file = [train_file]
+            file_list = train_file
+
+        if mode == 'val' or mode == 'trainval':
+            if val_file is None:
+                raise ValueError(
+                    "When `mode` is 'val' or 'trainval', `val_file` must be provided!"
+                )
+            if isinstance(val_file, str):
+                val_file = [val_file]
+            file_list = val_file
+
+        if mode == 'trainval':
+            file_list = train_file + val_file
+
+        # read file
+        self.fg_bg_list = []
+        for file in file_list:
+            file = os.path.join(dataset_root, file)
+            with open(file, 'r') as f:
+                lines = f.readlines()
+                for line in lines:
+                    line = line.strip()
+                    self.fg_bg_list.append(line)
+        if mode != 'val':
+            random.shuffle(self.fg_bg_list)
+
+    def __getitem__(self, idx):
+        data = {}
+        fg_bg_file = self.fg_bg_list[idx]
+        fg_bg_file = fg_bg_file.split(self.separator)
+        data['img_name'] = fg_bg_file[0]  # used when saving prediction results
+        fg_file = os.path.join(self.dataset_root, fg_bg_file[0])
+        alpha_file = fg_file.replace('/fg', '/alpha')
+        fg = cv2.imread(fg_file)
+        alpha = cv2.imread(alpha_file, 0)
+        data['alpha'] = alpha
+        data['gt_fields'] = []
+
+        # each line is: fg [bg] [trimap]
+        if len(fg_bg_file) >= 2:
+            bg_file = os.path.join(self.dataset_root, fg_bg_file[1])
+            bg = cv2.imread(bg_file)
+            data['img'], data['fg'], data['bg'] = self.composite(fg, alpha, bg)
+            if self.mode in ['train', 'trainval']:
+                data['gt_fields'].append('fg')
+                data['gt_fields'].append('bg')
+                data['gt_fields'].append('alpha')
+            if len(fg_bg_file) == 3 and self.get_trimap:
+                if self.mode == 'val':
+                    trimap_path = os.path.join(self.dataset_root, fg_bg_file[2])
+                    if os.path.exists(trimap_path):
+                        data['trimap'] = trimap_path
+                        data['gt_fields'].append('trimap')
+                        data['ori_trimap'] = cv2.imread(trimap_path, 0)
+                    else:
+                        raise FileNotFoundError(
+                            'trimap not found: {}'.format(fg_bg_file[2]))
+        else:
+            data['img'] = fg
+            if self.mode in ['train', 'trainval']:
+                data['fg'] = fg.copy()
+                data['bg'] = fg.copy()
+                data['gt_fields'].append('fg')
+                data['gt_fields'].append('bg')
+                data['gt_fields'].append('alpha')
+
+        data['trans_info'] = []  # Record shape change information
+
+        # Generate trimap from alpha if no trimap file provided
+        if self.get_trimap:
+            if 'trimap' not in data:
+                data['trimap'] = self.gen_trimap(
+                    data['alpha'], mode=self.mode).astype('float32')
+                data['gt_fields'].append('trimap')
+                if self.mode == 'val':
+                    data['ori_trimap'] = data['trimap'].copy()
+
+        # Delete keys that are not needed
+        if self.key_del is not None:
+            for key in self.key_del:
+                if key in data.keys():
+                    data.pop(key)
+                if key in data['gt_fields']:
+                    data['gt_fields'].remove(key)
+        data = self.transforms(data)
+
+        # During evaluation, the ground truth should not be transformed.
+ if self.mode == 'val': + data['gt_fields'].append('alpha') + + data['img'] = data['img'].astype('float32') + for key in data.get('gt_fields', []): + data[key] = data[key].astype('float32') + + if 'trimap' in data: + data['trimap'] = data['trimap'][np.newaxis, :, :] + if 'ori_trimap' in data: + data['ori_trimap'] = data['ori_trimap'][np.newaxis, :, :] + + data['alpha'] = data['alpha'][np.newaxis, :, :] / 255. + + return data + + def __len__(self): + return len(self.fg_bg_list) + + def composite(self, fg, alpha, ori_bg): + if self.if_rssn: + if np.random.rand() < 0.5: + fg = cv2.fastNlMeansDenoisingColored(fg, None, 3, 3, 7, 21) + ori_bg = cv2.fastNlMeansDenoisingColored(ori_bg, None, 3, 3, 7, + 21) + if np.random.rand() < 0.5: + radius = np.random.choice([19, 29, 39, 49, 59]) + ori_bg = cv2.GaussianBlur(ori_bg, (radius, radius), 0, 0) + fg_h, fg_w = fg.shape[:2] + ori_bg_h, ori_bg_w = ori_bg.shape[:2] + + wratio = fg_w / ori_bg_w + hratio = fg_h / ori_bg_h + ratio = wratio if wratio > hratio else hratio + + # Resize ori_bg if it is smaller than fg. + if ratio > 1: + resize_h = math.ceil(ori_bg_h * ratio) + resize_w = math.ceil(ori_bg_w * ratio) + bg = cv2.resize( + ori_bg, (resize_w, resize_h), interpolation=cv2.INTER_LINEAR) + else: + bg = ori_bg + + bg = bg[0:fg_h, 0:fg_w, :] + alpha = alpha / 255 + alpha = np.expand_dims(alpha, axis=2) + image = alpha * fg + (1 - alpha) * bg + image = image.astype(np.uint8) + return image, fg, bg + + @staticmethod + def gen_trimap(alpha, mode='train', eval_kernel=7): + if mode == 'train': + k_size = random.choice(range(2, 5)) + iterations = np.random.randint(5, 15) + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (k_size, k_size)) + dilated = cv2.dilate(alpha, kernel, iterations=iterations) + eroded = cv2.erode(alpha, kernel, iterations=iterations) + trimap = np.zeros(alpha.shape) + trimap.fill(128) + trimap[eroded > 254.5] = 255 + trimap[dilated < 0.5] = 0 + else: + k_size = eval_kernel + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (k_size, k_size)) + dilated = cv2.dilate(alpha, kernel) + trimap = np.zeros(alpha.shape) + trimap.fill(128) + trimap[alpha >= 250] = 255 + trimap[dilated <= 5] = 0 + + return trimap diff --git a/ppmatting/metrics/__init__.py b/ppmatting/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..836f0a973bf4331d36982252d47f7279e7c24752 --- /dev/null +++ b/ppmatting/metrics/__init__.py @@ -0,0 +1,3 @@ +from .metric import MSE, SAD, Grad, Conn + +metrics_class_dict = {'sad': SAD, 'mse': MSE, 'grad': Grad, 'conn': Conn} diff --git a/ppmatting/metrics/metric.py b/ppmatting/metrics/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..4ac237233b610dbb91af5e6f957656f62cbae7a9 --- /dev/null +++ b/ppmatting/metrics/metric.py @@ -0,0 +1,278 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
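The `gen_trimap` helper above is the crux of the weak supervision: whatever survives erosion of the alpha matte becomes certain foreground (255), everything outside the dilation becomes certain background (0), and the band in between stays unknown (128). Below is a minimal standalone sketch of the deterministic evaluation branch, assuming an 8-bit alpha matte on disk; the file names are illustrative only:

```python
import cv2
import numpy as np

# Hypothetical 8-bit alpha matte with values in [0, 255], shape [h, w].
alpha = cv2.imread("alpha.png", 0)

k_size = 7  # matches the eval_kernel default above
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k_size, k_size))
dilated = cv2.dilate(alpha, kernel)

trimap = np.full(alpha.shape, 128, dtype=np.uint8)  # unknown by default
trimap[alpha >= 250] = 255  # confident foreground
trimap[dilated <= 5] = 0    # confident background
cv2.imwrite("trimap.png", trimap)
```

The training branch differs only by randomizing the kernel size and the erode/dilate iteration count, so the width of the unknown band varies from sample to sample.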
+
+# Grad and Conn refer to https://github.com/yucornetto/MGMatting/blob/main/code-base/utils/evaluate.py
+# Output of `Grad` is slightly different from the MATLAB version provided by Adobe (less than 0.1%)
+# Output of `Conn` is smaller than the MATLAB version (~5%, maybe MATLAB has a different algorithm)
+# So do not report results calculated by these functions in your paper.
+# Evaluate your inference with the MATLAB file `DIM_evaluation_code/evaluate.m`.
+
+import cv2
+import numpy as np
+from scipy.ndimage.filters import convolve
+from scipy.special import gamma
+from skimage.measure import label
+
+
+class MSE:
+    """
+    Only the unknown region is calculated if a trimap is provided.
+    """
+
+    def __init__(self):
+        self.mse_diffs = 0
+        self.count = 0
+
+    def update(self, pred, gt, trimap=None):
+        """
+        Update the metric.
+        Args:
+            pred (np.ndarray): The value range is [0., 255.].
+            gt (np.ndarray): The value range is [0, 255].
+            trimap (np.ndarray, optional): The value is in {0, 128, 255}. Default: None.
+        """
+        if trimap is None:
+            trimap = np.ones_like(gt) * 128
+        if not (pred.shape == gt.shape == trimap.shape):
+            raise ValueError(
+                'The shape of `pred`, `gt` and `trimap` should be equal, '
+                'but they are {}, {} and {}'.format(pred.shape, gt.shape,
+                                                    trimap.shape))
+        pred[trimap == 0] = 0
+        pred[trimap == 255] = 255
+
+        mask = trimap == 128
+        pixels = float(mask.sum())
+        pred = pred / 255.
+        gt = gt / 255.
+        diff = (pred - gt) * mask
+        mse_diff = (diff**2).sum() / pixels if pixels > 0 else 0
+
+        self.mse_diffs += mse_diff
+        self.count += 1
+
+        return mse_diff
+
+    def evaluate(self):
+        mse = self.mse_diffs / self.count if self.count > 0 else 0
+        return mse
+
+
+class SAD:
+    """
+    Only the unknown region is calculated if a trimap is provided.
+    """
+
+    def __init__(self):
+        self.sad_diffs = 0
+        self.count = 0
+
+    def update(self, pred, gt, trimap=None):
+        """
+        Update the metric.
+        Args:
+            pred (np.ndarray): The value range is [0., 255.].
+            gt (np.ndarray): The value range is [0., 255.].
+            trimap (np.ndarray, optional): The value is in {0, 128, 255}. Default: None.
+        """
+        if trimap is None:
+            trimap = np.ones_like(gt) * 128
+        if not (pred.shape == gt.shape == trimap.shape):
+            raise ValueError(
+                'The shape of `pred`, `gt` and `trimap` should be equal, '
+                'but they are {}, {} and {}'.format(pred.shape, gt.shape,
+                                                    trimap.shape))
+        pred[trimap == 0] = 0
+        pred[trimap == 255] = 255
+
+        mask = trimap == 128
+        pred = pred / 255.
+        gt = gt / 255.
+        diff = (pred - gt) * mask
+        sad_diff = (np.abs(diff)).sum()
+
+        sad_diff /= 1000
+        self.sad_diffs += sad_diff
+        self.count += 1
+
+        return sad_diff
+
+    def evaluate(self):
+        sad = self.sad_diffs / self.count if self.count > 0 else 0
+        return sad
+
+
+class Grad:
+    """
+    Only the unknown region is calculated if a trimap is provided.
+    Refer to: https://github.com/open-mmlab/mmediting/blob/master/mmedit/core/evaluation/metrics.py
+    """
+
+    def __init__(self):
+        self.grad_diffs = 0
+        self.count = 0
+
+    def gaussian(self, x, sigma):
+        return np.exp(-x**2 / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))
+
+    def dgaussian(self, x, sigma):
+        return -x * self.gaussian(x, sigma) / sigma**2
+
+    def gauss_filter(self, sigma, epsilon=1e-2):
+        half_size = np.ceil(
+            sigma * np.sqrt(-2 * np.log(np.sqrt(2 * np.pi) * sigma * epsilon)))
+        size = int(2 * half_size + 1)
+
+        # create filter in x axis
+        filter_x = np.zeros((size, size))
+        for i in range(size):
+            for j in range(size):
+                filter_x[i, j] = self.gaussian(
+                    i - half_size, sigma) * self.dgaussian(j - half_size, sigma)
+
+        # normalize filter
+        norm = np.sqrt((filter_x**2).sum())
+        filter_x = filter_x / norm
+        filter_y = np.transpose(filter_x)
+
+        return filter_x, filter_y
+
+    def gauss_gradient(self, img, sigma):
+        filter_x, filter_y = self.gauss_filter(sigma)
+        img_filtered_x = cv2.filter2D(
+            img, -1, filter_x, borderType=cv2.BORDER_REPLICATE)
+        img_filtered_y = cv2.filter2D(
+            img, -1, filter_y, borderType=cv2.BORDER_REPLICATE)
+        return np.sqrt(img_filtered_x**2 + img_filtered_y**2)
+
+    def update(self, pred, gt, trimap=None, sigma=1.4):
+        """
+        Update the metric.
+        Args:
+            pred (np.ndarray): The value range is [0., 255.].
+            gt (np.ndarray): The value range is [0, 255].
+            trimap (np.ndarray, optional): The value is in {0, 128, 255}. Default: None.
+            sigma (float, optional): Standard deviation of the Gaussian kernel. Default: 1.4.
+        """
+        if trimap is None:
+            trimap = np.ones_like(gt) * 128
+        if not (pred.shape == gt.shape == trimap.shape):
+            raise ValueError(
+                'The shape of `pred`, `gt` and `trimap` should be equal, '
+                'but they are {}, {} and {}'.format(pred.shape, gt.shape,
+                                                    trimap.shape))
+        pred[trimap == 0] = 0
+        pred[trimap == 255] = 255
+
+        gt = gt.squeeze()
+        pred = pred.squeeze()
+        gt = gt.astype(np.float64)
+        pred = pred.astype(np.float64)
+        gt_normed = np.zeros_like(gt)
+        pred_normed = np.zeros_like(pred)
+        cv2.normalize(gt, gt_normed, 1., 0., cv2.NORM_MINMAX)
+        cv2.normalize(pred, pred_normed, 1., 0., cv2.NORM_MINMAX)
+
+        gt_grad = self.gauss_gradient(gt_normed, sigma).astype(np.float32)
+        pred_grad = self.gauss_gradient(pred_normed, sigma).astype(np.float32)
+
+        grad_diff = ((gt_grad - pred_grad)**2 * (trimap == 128)).sum()
+
+        grad_diff /= 1000
+        self.grad_diffs += grad_diff
+        self.count += 1
+
+        return grad_diff
+
+    def evaluate(self):
+        grad = self.grad_diffs / self.count if self.count > 0 else 0
+        return grad
+
+
+class Conn:
+    """
+    Only the unknown region is calculated if a trimap is provided.
+    Refer to: https://github.com/open-mmlab/mmediting/blob/master/mmedit/core/evaluation/metrics.py
+    """
+
+    def __init__(self):
+        self.conn_diffs = 0
+        self.count = 0
+
+    def update(self, pred, gt, trimap=None, step=0.1):
+        """
+        Update the metric.
+        Args:
+            pred (np.ndarray): The value range is [0., 255.].
+            gt (np.ndarray): The value range is [0, 255].
+            trimap (np.ndarray, optional): The value is in {0, 128, 255}. Default: None.
+            step (float, optional): Step of threshold when computing intersection between
+                `gt` and `pred`. Default: 0.1.
+        """
+        if trimap is None:
+            trimap = np.ones_like(gt) * 128
+        if not (pred.shape == gt.shape == trimap.shape):
+            raise ValueError(
+                'The shape of `pred`, `gt` and `trimap` should be equal, 
' + 'but they are {}, {} and {}'.format(pred.shape, gt.shape, + trimap.shape)) + pred[trimap == 0] = 0 + pred[trimap == 255] = 255 + + gt = gt.squeeze() + pred = pred.squeeze() + gt = gt.astype(np.float32) / 255 + pred = pred.astype(np.float32) / 255 + + thresh_steps = np.arange(0, 1 + step, step) + round_down_map = -np.ones_like(gt) + for i in range(1, len(thresh_steps)): + gt_thresh = gt >= thresh_steps[i] + pred_thresh = pred >= thresh_steps[i] + intersection = (gt_thresh & pred_thresh).astype(np.uint8) + + # connected components + _, output, stats, _ = cv2.connectedComponentsWithStats( + intersection, connectivity=4) + # start from 1 in dim 0 to exclude background + size = stats[1:, -1] + + # largest connected component of the intersection + omega = np.zeros_like(gt) + if len(size) != 0: + max_id = np.argmax(size) + # plus one to include background + omega[output == max_id + 1] = 1 + + mask = (round_down_map == -1) & (omega == 0) + round_down_map[mask] = thresh_steps[i - 1] + round_down_map[round_down_map == -1] = 1 + + gt_diff = gt - round_down_map + pred_diff = pred - round_down_map + # only calculate difference larger than or equal to 0.15 + gt_phi = 1 - gt_diff * (gt_diff >= 0.15) + pred_phi = 1 - pred_diff * (pred_diff >= 0.15) + + conn_diff = np.sum(np.abs(gt_phi - pred_phi) * (trimap == 128)) + + conn_diff /= 1000 + self.conn_diffs += conn_diff + self.count += 1 + + return conn_diff + + def evaluate(self): + conn = self.conn_diffs / self.count if self.count > 0 else 0 + return conn diff --git a/ppmatting/ml/__init__.py b/ppmatting/ml/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..612dff101f358f74db3eca601f0b9573ca6d93cb --- /dev/null +++ b/ppmatting/ml/__init__.py @@ -0,0 +1 @@ +from .methods import CloseFormMatting, KNNMatting, LearningBasedMatting, FastMatting, RandomWalksMatting diff --git a/ppmatting/ml/methods.py b/ppmatting/ml/methods.py new file mode 100644 index 0000000000000000000000000000000000000000..61d5fea2475552c14d29fe44fd08cf436e55bdbd --- /dev/null +++ b/ppmatting/ml/methods.py @@ -0,0 +1,97 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
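All four metric classes above share one protocol: `update(pred, gt, trimap)` accumulates a per-image value and returns it, and `evaluate()` returns the running average over every update. A small sketch on synthetic arrays; note that `update` clamps `pred` in place inside the known trimap regions, hence the copies:

```python
import numpy as np
from ppmatting.metrics import MSE, SAD

rng = np.random.default_rng(0)
gt = rng.integers(0, 256, size=(64, 64)).astype(np.float64)   # fake ground-truth alpha
pred = np.clip(gt + rng.normal(0.0, 5.0, size=gt.shape), 0, 255)  # noisy prediction
trimap = np.full(gt.shape, 128)  # treat the whole image as the unknown region

mse_metric, sad_metric = MSE(), SAD()
mse_metric.update(pred.copy(), gt, trimap)
sad_metric.update(pred.copy(), gt, trimap)
print(mse_metric.evaluate(), sad_metric.evaluate())
```

In `evaluate_ml` above, the same pattern is driven once per dataloader sample with `alpha_pred` and `alpha_gt` scaled to [0, 255].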
+ +import pymatting +from paddleseg.cvlibs import manager + + +class BaseMLMatting(object): + def __init__(self, alpha_estimator, **kargs): + self.alpha_estimator = alpha_estimator + self.kargs = kargs + + def __call__(self, image, trimap): + image = self.__to_float64(image) + trimap = self.__to_float64(trimap) + alpha_matte = self.alpha_estimator(image, trimap, **self.kargs) + return alpha_matte + + def __to_float64(self, x): + x_dtype = x.dtype + assert x_dtype in ["float32", "float64"] + x = x.astype("float64") + return x + + +@manager.MODELS.add_component +class CloseFormMatting(BaseMLMatting): + def __init__(self, **kargs): + cf_alpha_estimator = pymatting.estimate_alpha_cf + super().__init__(cf_alpha_estimator, **kargs) + + +@manager.MODELS.add_component +class KNNMatting(BaseMLMatting): + def __init__(self, **kargs): + knn_alpha_estimator = pymatting.estimate_alpha_knn + super().__init__(knn_alpha_estimator, **kargs) + + +@manager.MODELS.add_component +class LearningBasedMatting(BaseMLMatting): + def __init__(self, **kargs): + lbdm_alpha_estimator = pymatting.estimate_alpha_lbdm + super().__init__(lbdm_alpha_estimator, **kargs) + + +@manager.MODELS.add_component +class FastMatting(BaseMLMatting): + def __init__(self, **kargs): + lkm_alpha_estimator = pymatting.estimate_alpha_lkm + super().__init__(lkm_alpha_estimator, **kargs) + + +@manager.MODELS.add_component +class RandomWalksMatting(BaseMLMatting): + def __init__(self, **kargs): + rw_alpha_estimator = pymatting.estimate_alpha_rw + super().__init__(rw_alpha_estimator, **kargs) + + +if __name__ == "__main__": + from pymatting.util.util import load_image, save_image, stack_images + from pymatting.foreground.estimate_foreground_ml import estimate_foreground_ml + import cv2 + + root = "/mnt/liuyi22/PaddlePaddle/PaddleSeg/Matting/data/examples/" + image_path = root + "lemur.png" + trimap_path = root + "lemur_trimap.png" + cutout_path = root + "lemur_cutout.png" + image = cv2.cvtColor( + cv2.imread(image_path).astype("float64"), cv2.COLOR_BGR2RGB) / 255.0 + + cv2.imwrite("image.png", (image * 255).astype('uint8')) + trimap = load_image(trimap_path, "GRAY") + print(image.shape, trimap.shape) + print(image.dtype, trimap.dtype) + cf = CloseFormMatting() + alpha = cf(image, trimap) + + # alpha = pymatting.estimate_alpha_lkm(image, trimap) + + foreground = estimate_foreground_ml(image, alpha) + + cutout = stack_images(foreground, alpha) + + save_image(cutout_path, cutout) diff --git a/ppmatting/models/__init__.py b/ppmatting/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d446649bc75b44f5ff3f9183e22f057f128b5fa2 --- /dev/null +++ b/ppmatting/models/__init__.py @@ -0,0 +1,7 @@ +from .backbone import * +from .losses import * +from .modnet import MODNet +from .human_matting import HumanMatting +from .dim import DIM +from .ppmatting import PPMatting +from .gca import GCABaseline, GCA diff --git a/ppmatting/models/backbone/__init__.py b/ppmatting/models/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b08005b31477e57488132cd2f5d3692c6e687b4f --- /dev/null +++ b/ppmatting/models/backbone/__init__.py @@ -0,0 +1,5 @@ +from .mobilenet_v2 import * +from .hrnet import * +from .resnet_vd import * +from .vgg import * +from .gca_enc import * \ No newline at end of file diff --git a/ppmatting/models/backbone/gca_enc.py b/ppmatting/models/backbone/gca_enc.py new file mode 100644 index 0000000000000000000000000000000000000000..2afeb5df8c398d89ac1d4fe8e411571afebec5b6 --- 
/dev/null +++ b/ppmatting/models/backbone/gca_enc.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The gca code was heavily based on https://github.com/Yaoyi-Li/GCA-Matting +# and https://github.com/open-mmlab/mmediting + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddleseg.cvlibs import manager, param_init +from paddleseg.utils import utils + +from ppmatting.models.layers import GuidedCxtAtten + + +class ResNet_D(nn.Layer): + def __init__(self, + input_channels, + layers, + late_downsample=False, + pretrained=None): + + super().__init__() + + self.pretrained = pretrained + + self._norm_layer = nn.BatchNorm + self.inplanes = 64 + self.late_downsample = late_downsample + self.midplanes = 64 if late_downsample else 32 + self.start_stride = [1, 2, 1, 2] if late_downsample else [2, 1, 2, 1] + self.conv1 = nn.utils.spectral_norm( + nn.Conv2D( + input_channels, + 32, + kernel_size=3, + stride=self.start_stride[0], + padding=1, + bias_attr=False)) + self.conv2 = nn.utils.spectral_norm( + nn.Conv2D( + 32, + self.midplanes, + kernel_size=3, + stride=self.start_stride[1], + padding=1, + bias_attr=False)) + self.conv3 = nn.utils.spectral_norm( + nn.Conv2D( + self.midplanes, + self.inplanes, + kernel_size=3, + stride=self.start_stride[2], + padding=1, + bias_attr=False)) + self.bn1 = self._norm_layer(32) + self.bn2 = self._norm_layer(self.midplanes) + self.bn3 = self._norm_layer(self.inplanes) + self.activation = nn.ReLU() + self.layer1 = self._make_layer( + BasicBlock, 64, layers[0], stride=self.start_stride[3]) + self.layer2 = self._make_layer(BasicBlock, 128, layers[1], stride=2) + self.layer3 = self._make_layer(BasicBlock, 256, layers[2], stride=2) + self.layer_bottleneck = self._make_layer( + BasicBlock, 512, layers[3], stride=2) + + self.init_weight() + + def _make_layer(self, block, planes, block_num, stride=1): + if block_num == 0: + return nn.Sequential(nn.Identity()) + norm_layer = self._norm_layer + downsample = None + if stride != 1: + downsample = nn.Sequential( + nn.AvgPool2D(2, stride), + nn.utils.spectral_norm( + conv1x1(self.inplanes, planes * block.expansion)), + norm_layer(planes * block.expansion), ) + elif self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.utils.spectral_norm( + conv1x1(self.inplanes, planes * block.expansion, stride)), + norm_layer(planes * block.expansion), ) + + layers = [block(self.inplanes, planes, stride, downsample, norm_layer)] + self.inplanes = planes * block.expansion + for _ in range(1, block_num): + layers.append(block(self.inplanes, planes, norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.activation(x) + x = self.conv2(x) + x = self.bn2(x) + x1 = self.activation(x) # N x 32 x 256 x 256 + x = self.conv3(x1) + x = self.bn3(x) + x2 = self.activation(x) # N x 64 x 128 x 128 + + x3 = self.layer1(x2) # N x 64 x 128 x 128 + x4 = 
self.layer2(x3) # N x 128 x 64 x 64 + x5 = self.layer3(x4) # N x 256 x 32 x 32 + x = self.layer_bottleneck(x5) # N x 512 x 16 x 16 + + return x, (x1, x2, x3, x4, x5) + + def init_weight(self): + + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + + if hasattr(layer, "weight_orig"): + param = layer.weight_orig + else: + param = layer.weight + param_init.xavier_uniform(param) + + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + + elif isinstance(layer, BasicBlock): + param_init.constant_init(layer.bn2.weight, value=0.0) + + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +@manager.MODELS.add_component +class ResShortCut_D(ResNet_D): + def __init__(self, + input_channels, + layers, + late_downsample=False, + pretrained=None): + super().__init__( + input_channels, + layers, + late_downsample=late_downsample, + pretrained=pretrained) + + self.shortcut_inplane = [input_channels, self.midplanes, 64, 128, 256] + self.shortcut_plane = [32, self.midplanes, 64, 128, 256] + + self.shortcut = nn.LayerList() + for stage, inplane in enumerate(self.shortcut_inplane): + self.shortcut.append( + self._make_shortcut(inplane, self.shortcut_plane[stage])) + + def _make_shortcut(self, inplane, planes): + return nn.Sequential( + nn.utils.spectral_norm( + nn.Conv2D( + inplane, planes, kernel_size=3, padding=1, + bias_attr=False)), + nn.ReLU(), + self._norm_layer(planes), + nn.utils.spectral_norm( + nn.Conv2D( + planes, planes, kernel_size=3, padding=1, bias_attr=False)), + nn.ReLU(), + self._norm_layer(planes)) + + def forward(self, x): + + out = self.conv1(x) + out = self.bn1(out) + out = self.activation(out) + out = self.conv2(out) + out = self.bn2(out) + x1 = self.activation(out) # N x 32 x 256 x 256 + out = self.conv3(x1) + out = self.bn3(out) + out = self.activation(out) + + x2 = self.layer1(out) # N x 64 x 128 x 128 + x3 = self.layer2(x2) # N x 128 x 64 x 64 + x4 = self.layer3(x3) # N x 256 x 32 x 32 + out = self.layer_bottleneck(x4) # N x 512 x 16 x 16 + + fea1 = self.shortcut[0](x) # input image and trimap + fea2 = self.shortcut[1](x1) + fea3 = self.shortcut[2](x2) + fea4 = self.shortcut[3](x3) + fea5 = self.shortcut[4](x4) + + return out, { + 'shortcut': (fea1, fea2, fea3, fea4, fea5), + 'image': x[:, :3, ...] 
+ } + + +@manager.MODELS.add_component +class ResGuidedCxtAtten(ResNet_D): + def __init__(self, + input_channels, + layers, + late_downsample=False, + pretrained=None): + super().__init__( + input_channels, + layers, + late_downsample=late_downsample, + pretrained=pretrained) + self.input_channels = input_channels + self.shortcut_inplane = [input_channels, self.midplanes, 64, 128, 256] + self.shortcut_plane = [32, self.midplanes, 64, 128, 256] + + self.shortcut = nn.LayerList() + for stage, inplane in enumerate(self.shortcut_inplane): + self.shortcut.append( + self._make_shortcut(inplane, self.shortcut_plane[stage])) + + self.guidance_head = nn.Sequential( + nn.Pad2D( + 1, mode="reflect"), + nn.utils.spectral_norm( + nn.Conv2D( + 3, 16, kernel_size=3, padding=0, stride=2, + bias_attr=False)), + nn.ReLU(), + self._norm_layer(16), + nn.Pad2D( + 1, mode="reflect"), + nn.utils.spectral_norm( + nn.Conv2D( + 16, 32, kernel_size=3, padding=0, stride=2, + bias_attr=False)), + nn.ReLU(), + self._norm_layer(32), + nn.Pad2D( + 1, mode="reflect"), + nn.utils.spectral_norm( + nn.Conv2D( + 32, + 128, + kernel_size=3, + padding=0, + stride=2, + bias_attr=False)), + nn.ReLU(), + self._norm_layer(128)) + + self.gca = GuidedCxtAtten(128, 128) + + self.init_weight() + + def init_weight(self): + + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + initializer = nn.initializer.XavierUniform() + if hasattr(layer, "weight_orig"): + param = layer.weight_orig + else: + param = layer.weight + initializer(param, param.block) + + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + + elif isinstance(layer, BasicBlock): + param_init.constant_init(layer.bn2.weight, value=0.0) + + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + def _make_shortcut(self, inplane, planes): + return nn.Sequential( + nn.utils.spectral_norm( + nn.Conv2D( + inplane, planes, kernel_size=3, padding=1, + bias_attr=False)), + nn.ReLU(), + self._norm_layer(planes), + nn.utils.spectral_norm( + nn.Conv2D( + planes, planes, kernel_size=3, padding=1, bias_attr=False)), + nn.ReLU(), + self._norm_layer(planes)) + + def forward(self, x): + + out = self.conv1(x) + out = self.bn1(out) + out = self.activation(out) + out = self.conv2(out) + out = self.bn2(out) + x1 = self.activation(out) # N x 32 x 256 x 256 + out = self.conv3(x1) + out = self.bn3(out) + out = self.activation(out) + + im_fea = self.guidance_head( + x[:, :3, ...]) # downsample origin image and extract features + if self.input_channels == 6: + unknown = F.interpolate( + x[:, 4:5, ...], scale_factor=1 / 8, mode='nearest') + else: + unknown = x[:, 3:, ...].equal(paddle.to_tensor([1.])) + unknown = paddle.cast(unknown, dtype='float32') + unknown = F.interpolate(unknown, scale_factor=1 / 8, mode='nearest') + + x2 = self.layer1(out) # N x 64 x 128 x 128 + x3 = self.layer2(x2) # N x 128 x 64 x 64 + x3 = self.gca(im_fea, x3, unknown) # contextual attention + x4 = self.layer3(x3) # N x 256 x 32 x 32 + out = self.layer_bottleneck(x4) # N x 512 x 16 x 16 + + fea1 = self.shortcut[0](x) # input image and trimap + fea2 = self.shortcut[1](x1) + fea3 = self.shortcut[2](x2) + fea4 = self.shortcut[3](x3) + fea5 = self.shortcut[4](x4) + + return out, { + 'shortcut': (fea1, fea2, fea3, fea4, fea5), + 'image_fea': im_fea, + 'unknown': unknown, + } + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + 
stride=1, + downsample=None, + norm_layer=None): + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = nn.utils.spectral_norm(conv3x3(inplanes, planes, stride)) + self.bn1 = norm_layer(planes) + self.activation = nn.ReLU() + self.conv2 = nn.utils.spectral_norm(conv3x3(planes, planes)) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.activation(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.activation(out) + + return out + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias_attr=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D( + in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False) diff --git a/ppmatting/models/backbone/hrnet.py b/ppmatting/models/backbone/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..96e23a77e656142a97c573feb501f983aecebbef --- /dev/null +++ b/ppmatting/models/backbone/hrnet.py @@ -0,0 +1,835 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager, param_init +from paddleseg.models import layers +from paddleseg.utils import utils + +__all__ = [ + "HRNet_W18_Small_V1", "HRNet_W18_Small_V2", "HRNet_W18", "HRNet_W30", + "HRNet_W32", "HRNet_W40", "HRNet_W44", "HRNet_W48", "HRNet_W60", "HRNet_W64" +] + + +class HRNet(nn.Layer): + """ + The HRNet implementation based on PaddlePaddle. + + The original article refers to + Jingdong Wang, et, al. "HRNet:Deep High-Resolution Representation Learning for Visual Recognition" + (https://arxiv.org/pdf/1908.07919.pdf). + + Args: + pretrained (str, optional): The path of pretrained model. + stage1_num_modules (int, optional): Number of modules for stage1. Default 1. + stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default (4). + stage1_num_channels (list, optional): Number of channels per branch for stage1. Default (64). + stage2_num_modules (int, optional): Number of modules for stage2. Default 1. + stage2_num_blocks (list, optional): Number of blocks per module for stage2. Default (4, 4). + stage2_num_channels (list, optional): Number of channels per branch for stage2. Default (18, 36). + stage3_num_modules (int, optional): Number of modules for stage3. Default 4. + stage3_num_blocks (list, optional): Number of blocks per module for stage3. Default (4, 4, 4). 
+        stage3_num_channels (list, optional): Number of channels per branch for stage3. Default (18, 36, 72).
+        stage4_num_modules (int, optional): Number of modules for stage4. Default 3.
+        stage4_num_blocks (list, optional): Number of blocks per module for stage4. Default (4, 4, 4, 4).
+        stage4_num_channels (list, optional): Number of channels per branch for stage4. Default (18, 36, 72, 144).
+        has_se (bool, optional): Whether to use Squeeze-and-Excitation module. Default False.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+    """
+
+    def __init__(self,
+                 input_channels=3,
+                 pretrained=None,
+                 stage1_num_modules=1,
+                 stage1_num_blocks=(4, ),
+                 stage1_num_channels=(64, ),
+                 stage2_num_modules=1,
+                 stage2_num_blocks=(4, 4),
+                 stage2_num_channels=(18, 36),
+                 stage3_num_modules=4,
+                 stage3_num_blocks=(4, 4, 4),
+                 stage3_num_channels=(18, 36, 72),
+                 stage4_num_modules=3,
+                 stage4_num_blocks=(4, 4, 4, 4),
+                 stage4_num_channels=(18, 36, 72, 144),
+                 has_se=False,
+                 align_corners=False,
+                 padding_same=True):
+        super(HRNet, self).__init__()
+        self.pretrained = pretrained
+        self.stage1_num_modules = stage1_num_modules
+        self.stage1_num_blocks = stage1_num_blocks
+        self.stage1_num_channels = stage1_num_channels
+        self.stage2_num_modules = stage2_num_modules
+        self.stage2_num_blocks = stage2_num_blocks
+        self.stage2_num_channels = stage2_num_channels
+        self.stage3_num_modules = stage3_num_modules
+        self.stage3_num_blocks = stage3_num_blocks
+        self.stage3_num_channels = stage3_num_channels
+        self.stage4_num_modules = stage4_num_modules
+        self.stage4_num_blocks = stage4_num_blocks
+        self.stage4_num_channels = stage4_num_channels
+        self.has_se = has_se
+        self.align_corners = align_corners
+
+        self.feat_channels = [i for i in stage4_num_channels]
+        self.feat_channels = [64] + self.feat_channels
+
+        self.conv_layer1_1 = layers.ConvBNReLU(
+            in_channels=input_channels,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.conv_layer1_2 = layers.ConvBNReLU(
+            in_channels=64,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.la1 = Layer1(
+            num_channels=64,
+            num_blocks=self.stage1_num_blocks[0],
+            num_filters=self.stage1_num_channels[0],
+            has_se=has_se,
+            name="layer2",
+            padding_same=padding_same)
+
+        self.tr1 = TransitionLayer(
+            in_channels=[self.stage1_num_channels[0] * 4],
+            out_channels=self.stage2_num_channels,
+            name="tr1",
+            padding_same=padding_same)
+
+        self.st2 = Stage(
+            num_channels=self.stage2_num_channels,
+            num_modules=self.stage2_num_modules,
+            num_blocks=self.stage2_num_blocks,
+            num_filters=self.stage2_num_channels,
+            has_se=self.has_se,
+            name="st2",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.tr2 = TransitionLayer(
+            in_channels=self.stage2_num_channels,
+            out_channels=self.stage3_num_channels,
+            name="tr2",
+            padding_same=padding_same)
+        self.st3 = Stage(
+            num_channels=self.stage3_num_channels,
+            num_modules=self.stage3_num_modules,
+            num_blocks=self.stage3_num_blocks,
+            num_filters=self.stage3_num_channels,
+            has_se=self.has_se,
+            name="st3",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.tr3 = TransitionLayer(
+            in_channels=self.stage3_num_channels,
+            out_channels=self.stage4_num_channels,
+            name="tr3",
+            padding_same=padding_same)
+        self.st4 = Stage(
num_channels=self.stage4_num_channels, + num_modules=self.stage4_num_modules, + num_blocks=self.stage4_num_blocks, + num_filters=self.stage4_num_channels, + has_se=self.has_se, + name="st4", + align_corners=align_corners, + padding_same=padding_same) + + self.init_weight() + + def forward(self, x): + feat_list = [] + conv1 = self.conv_layer1_1(x) + feat_list.append(conv1) + conv2 = self.conv_layer1_2(conv1) + + la1 = self.la1(conv2) + + tr1 = self.tr1([la1]) + st2 = self.st2(tr1) + + tr2 = self.tr2(st2) + st3 = self.st3(tr2) + + tr3 = self.tr3(st3) + st4 = self.st4(tr3) + + feat_list = feat_list + st4 + + return feat_list + + def init_weight(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + param_init.normal_init(layer.weight, std=0.001) + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class Layer1(nn.Layer): + def __init__(self, + num_channels, + num_filters, + num_blocks, + has_se=False, + name=None, + padding_same=True): + super(Layer1, self).__init__() + + self.bottleneck_block_list = [] + + for i in range(num_blocks): + bottleneck_block = self.add_sublayer( + "bb_{}_{}".format(name, i + 1), + BottleneckBlock( + num_channels=num_channels if i == 0 else num_filters * 4, + num_filters=num_filters, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False, + name=name + '_' + str(i + 1), + padding_same=padding_same)) + self.bottleneck_block_list.append(bottleneck_block) + + def forward(self, x): + conv = x + for block_func in self.bottleneck_block_list: + conv = block_func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, in_channels, out_channels, name=None, padding_same=True): + super(TransitionLayer, self).__init__() + + num_in = len(in_channels) + num_out = len(out_channels) + self.conv_bn_func_list = [] + for i in range(num_out): + residual = None + if i < num_in: + if in_channels[i] != out_channels[i]: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + layers.ConvBNReLU( + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=3, + padding=1 if not padding_same else 'same', + bias_attr=False)) + else: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + layers.ConvBNReLU( + in_channels=in_channels[-1], + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + self.conv_bn_func_list.append(residual) + + def forward(self, x): + outs = [] + for idx, conv_bn_func in enumerate(self.conv_bn_func_list): + if conv_bn_func is None: + outs.append(x[idx]) + else: + if idx < len(x): + outs.append(conv_bn_func(x[idx])) + else: + outs.append(conv_bn_func(x[-1])) + return outs + + +class Branches(nn.Layer): + def __init__(self, + num_blocks, + in_channels, + out_channels, + has_se=False, + name=None, + padding_same=True): + super(Branches, self).__init__() + + self.basic_block_list = [] + + for i in range(len(out_channels)): + self.basic_block_list.append([]) + for j in range(num_blocks[i]): + in_ch = in_channels[i] if j == 0 else out_channels[i] + basic_block_func = self.add_sublayer( + "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), + BasicBlock( + num_channels=in_ch, + num_filters=out_channels[i], + has_se=has_se, + name=name + '_branch_layer_' + str(i + 1) + '_' + + str(j + 
1), + padding_same=padding_same)) + self.basic_block_list[i].append(basic_block_func) + + def forward(self, x): + outs = [] + for idx, input in enumerate(x): + conv = input + for basic_block_func in self.basic_block_list[idx]: + conv = basic_block_func(conv) + outs.append(conv) + return outs + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False, + name=None, + padding_same=True): + super(BottleneckBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + bias_attr=False) + + self.conv2 = layers.ConvBNReLU( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding=1 if not padding_same else 'same', + bias_attr=False) + + self.conv3 = layers.ConvBN( + in_channels=num_filters, + out_channels=num_filters * 4, + kernel_size=1, + bias_attr=False) + + if self.downsample: + self.conv_down = layers.ConvBN( + in_channels=num_channels, + out_channels=num_filters * 4, + kernel_size=1, + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16, + name=name + '_fc') + + self.add = layers.Add() + self.relu = layers.Activation("relu") + + def forward(self, x): + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv3 = self.se(conv3) + + y = self.add(conv3, residual) + y = self.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride=1, + has_se=False, + downsample=False, + name=None, + padding_same=True): + super(BasicBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding=1 if not padding_same else 'same', + bias_attr=False) + self.conv2 = layers.ConvBN( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + padding=1 if not padding_same else 'same', + bias_attr=False) + + if self.downsample: + self.conv_down = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16, + name=name + '_fc') + + self.add = layers.Add() + self.relu = layers.Activation("relu") + + def forward(self, x): + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv2 = self.se(conv2) + + y = self.add(conv2, residual) + y = self.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = nn.Linear( + med_ch, + num_filters, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + def forward(self, x): 
+ pool = self.pool2d_gap(x) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = x * excitation + return out + + +class Stage(nn.Layer): + def __init__(self, + num_channels, + num_modules, + num_blocks, + num_filters, + has_se=False, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(Stage, self).__init__() + + self._num_modules = num_modules + + self.stage_func_list = [] + for i in range(num_modules): + if i == num_modules - 1 and not multi_scale_output: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + multi_scale_output=False, + name=name + '_' + str(i + 1), + align_corners=align_corners, + padding_same=padding_same)) + else: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + name=name + '_' + str(i + 1), + align_corners=align_corners, + padding_same=padding_same)) + + self.stage_func_list.append(stage_func) + + def forward(self, x): + out = x + for idx in range(self._num_modules): + out = self.stage_func_list[idx](out) + return out + + +class HighResolutionModule(nn.Layer): + def __init__(self, + num_channels, + num_blocks, + num_filters, + has_se=False, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(HighResolutionModule, self).__init__() + + self.branches_func = Branches( + num_blocks=num_blocks, + in_channels=num_channels, + out_channels=num_filters, + has_se=has_se, + name=name, + padding_same=padding_same) + + self.fuse_func = FuseLayers( + in_channels=num_filters, + out_channels=num_filters, + multi_scale_output=multi_scale_output, + name=name, + align_corners=align_corners, + padding_same=padding_same) + + def forward(self, x): + out = self.branches_func(x) + out = self.fuse_func(out) + return out + + +class FuseLayers(nn.Layer): + def __init__(self, + in_channels, + out_channels, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(FuseLayers, self).__init__() + + self._actual_ch = len(in_channels) if multi_scale_output else 1 + self._in_channels = in_channels + self.align_corners = align_corners + + self.residual_func_list = [] + for i in range(self._actual_ch): + for j in range(len(in_channels)): + if j > i: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), + layers.ConvBN( + in_channels=in_channels[j], + out_channels=out_channels[i], + kernel_size=1, + bias_attr=False)) + self.residual_func_list.append(residual_func) + elif j < i: + pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + layers.ConvBN( + in_channels=pre_num_filters, + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + pre_num_filters = out_channels[i] + else: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + layers.ConvBNReLU( + in_channels=pre_num_filters, + 
out_channels=out_channels[j], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + pre_num_filters = out_channels[j] + self.residual_func_list.append(residual_func) + + def forward(self, x): + outs = [] + residual_func_idx = 0 + for i in range(self._actual_ch): + residual = x[i] + residual_shape = paddle.shape(residual)[-2:] + for j in range(len(self._in_channels)): + if j > i: + y = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + y = F.interpolate( + y, + residual_shape, + mode='bilinear', + align_corners=self.align_corners) + residual = residual + y + elif j < i: + y = x[j] + for k in range(i - j): + y = self.residual_func_list[residual_func_idx](y) + residual_func_idx += 1 + + residual = residual + y + + residual = F.relu(residual) + outs.append(residual) + + return outs + + +@manager.BACKBONES.add_component +def HRNet_W18_Small_V1(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[1], + stage1_num_channels=[32], + stage2_num_modules=1, + stage2_num_blocks=[2, 2], + stage2_num_channels=[16, 32], + stage3_num_modules=1, + stage3_num_blocks=[2, 2, 2], + stage3_num_channels=[16, 32, 64], + stage4_num_modules=1, + stage4_num_blocks=[2, 2, 2, 2], + stage4_num_channels=[16, 32, 64, 128], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W18_Small_V2(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[2], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[2, 2], + stage2_num_channels=[18, 36], + stage3_num_modules=3, + stage3_num_blocks=[2, 2, 2], + stage3_num_channels=[18, 36, 72], + stage4_num_modules=2, + stage4_num_blocks=[2, 2, 2, 2], + stage4_num_channels=[18, 36, 72, 144], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W18(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[18, 36], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[18, 36, 72], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[18, 36, 72, 144], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W30(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[30, 60], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[30, 60, 120], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[30, 60, 120, 240], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W32(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[32, 64], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[32, 64, 128], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[32, 64, 128, 256], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W40(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[40, 80], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[40, 80, 160], + stage4_num_modules=3, + 
stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[40, 80, 160, 320], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W44(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[44, 88], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[44, 88, 176], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[44, 88, 176, 352], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W48(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[48, 96], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[48, 96, 192], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[48, 96, 192, 384], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W60(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[60, 120], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[60, 120, 240], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[60, 120, 240, 480], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W64(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[64, 128], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[64, 128, 256], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[64, 128, 256, 512], + **kwargs) + return model diff --git a/ppmatting/models/backbone/mobilenet_v2.py b/ppmatting/models/backbone/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..c7b947d78797b856f7628e19361f1e2f4261b6cd --- /dev/null +++ b/ppmatting/models/backbone/mobilenet_v2.py @@ -0,0 +1,242 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
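The `HRNet_W*` factories above differ only in per-stage channel widths; every variant returns five feature maps from `forward` (the stride-2 stem convolution plus the four stage-4 branches), with the channel counts mirrored in `feat_channels`. A quick sanity-check sketch, assuming PaddlePaddle is installed and random weights (no `pretrained` path) are acceptable:

```python
import paddle
from ppmatting.models.backbone.hrnet import HRNet_W18

backbone = HRNet_W18()
x = paddle.rand([1, 3, 512, 512])
feats = backbone(x)

# Expect channels [64, 18, 36, 72, 144] at strides 2, 4, 8, 16 and 32.
print(backbone.feat_channels)
for f in feats:
    print(f.shape)
```

This multi-resolution feature list is what the matting decoders above (e.g. PP-Matting) consume to predict both a coarse semantic map and a detailed alpha matte.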
+ +import math + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from paddleseg import utils +from paddleseg.cvlibs import manager + +MODEL_URLS = { + "MobileNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", + "MobileNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", + "MobileNetV2_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", + "MobileNetV2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_pretrained.pdparams", + "MobileNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", + "MobileNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +} + +__all__ = ["MobileNetV2"] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, num_channels, num_in_filter, num_filters, stride, + filter_size, padding, expansion_factor, name): + super(InvertedResidualUnit, self).__init__() + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand") + + self._bottleneck_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise") + + self._linear_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear") + + def forward(self, inputs, ifshortcut): + y = self._expand_conv(inputs, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit( + num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1") + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer( + name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + 
padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1))) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +@manager.BACKBONES.add_component +class MobileNet(nn.Layer): + def __init__(self, + input_channels=3, + scale=1.0, + pretrained=None, + prefix_name=""): + super(MobileNet, self).__init__() + self.scale = scale + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer( + num_channels=input_channels, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1") + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer( + prefix_name + "conv" + str(i), + sublayer=InvresiBlocks( + in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + "conv" + str(i))) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer( + num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9") + + self.feat_channels = [int(i * scale) for i in [16, 24, 32, 96, 1280]] + self.pretrained = pretrained + self.init_weight() + + def forward(self, inputs): + feat_list = [] + y = self.conv1(inputs, if_act=True) + + block_index = 0 + for block in self.block_list: + y = block(y) + if block_index in [0, 1, 2, 4]: + feat_list.append(y) + block_index += 1 + y = self.conv9(y, if_act=True) + feat_list.append(y) + return feat_list + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + +@manager.BACKBONES.add_component +def MobileNetV2(**kwargs): + model = MobileNet(scale=1.0, **kwargs) + return model diff --git a/ppmatting/models/backbone/resnet_vd.py b/ppmatting/models/backbone/resnet_vd.py new file mode 100644 index 0000000000000000000000000000000000000000..0fdd9a57664ad80ee59846060cd7f768f757feae --- /dev/null +++ b/ppmatting/models/backbone/resnet_vd.py @@ -0,0 +1,368 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
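Both the MobileNetV2 above and the ResNet/VGG backbones that follow obey the same contract: `forward` returns a list of intermediate feature maps and `feat_channels` reports their channel counts, which is how the matting heads size their decoders. A minimal smoke test of that contract; the module path and input shape are illustrative assumptions, not something this diff pins down:

```python
import paddle

# Assumed import path, mirroring this repository's layout.
from ppmatting.models.backbone.mobilenet_v2 import MobileNetV2

backbone = MobileNetV2(pretrained=None)  # scale=1.0 variant
feats = backbone(paddle.randn([1, 3, 512, 512]))
# Five taps: four intermediate stages plus the final 1x1 conv output.
for feat, channels in zip(feats, backbone.feat_channels):
    print(feat.shape, channels)
```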
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import manager +from paddleseg.models import layers +from paddleseg.utils import utils + +__all__ = [ + "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd" +] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + groups=1, + is_vd_mode=False, + act=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 if dilation == 1 else 0, + dilation=dilation, + groups=groups, + bias_attr=False) + + self._batch_norm = layers.SyncBatchNorm(out_channels) + self._act_op = layers.Activation(act=act) + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + y = self._act_op(y) + + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + dilation=1): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu') + + self.dilation = dilation + + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + dilation=dilation) + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first or stride == 1 else True) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + + #################################################################### + # If given dilation rate > 1, using corresponding padding. + # The performance drops down without the follow padding. 
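+        # ConvBNLayer builds conv1 with padding=0 whenever dilation != 1,
+        # so the padding is applied manually here: a 3x3 kernel with
+        # dilation d covers 2d + 1 pixels, and padding each border by d
+        # (e.g. d=2 -> effective 5x5, pad 2) keeps the feature map size.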
+        if self.dilation > 1:
+            padding = self.dilation
+            y = F.pad(y, [padding, padding, padding, padding])
+        #####################################################################
+
+        conv1 = self.conv1(y)
+        conv2 = self.conv2(conv1)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+
+        y = paddle.add(x=short, y=conv2)
+        y = F.relu(y)
+        return y
+
+
+class BasicBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 shortcut=True,
+                 if_first=False):
+        super(BasicBlock, self).__init__()
+        self.stride = stride
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu')
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            act=None)
+
+        if not shortcut:
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=1,
+                is_vd_mode=False if if_first or stride == 1 else True)
+
+        self.shortcut = shortcut
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+        y = paddle.add(x=short, y=conv1)
+        y = F.relu(y)
+
+        return y
+
+
+class ResNet_vd(nn.Layer):
+    """
+    The ResNet_vd implementation based on PaddlePaddle.
+
+    The original article refers to
+    Tong He, et al. "Bag of Tricks for Image Classification with Convolutional Neural Networks"
+    (https://arxiv.org/pdf/1812.01187.pdf).
+
+    Args:
+        layers (int, optional): The layers of ResNet_vd. The supported layers are (18, 34, 50, 101, 152, 200). Default: 50.
+        output_stride (int, optional): The stride of output features compared to input images. It is 8, 16 or 32. Default: 32.
+        multi_grid (tuple|list, optional): The grid of stage4. Default: (1, 1, 1).
+        pretrained (str, optional): The path of the pretrained model.
+ + """ + + def __init__(self, + input_channels=3, + layers=50, + output_stride=32, + multi_grid=(1, 1, 1), + pretrained=None): + super(ResNet_vd, self).__init__() + + self.conv1_logit = None # for gscnn shape stream + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + # for channels of four returned stages + self.feat_channels = [c * 4 for c in num_filters + ] if layers >= 50 else num_filters + self.feat_channels = [64] + self.feat_channels + + dilation_dict = None + if output_stride == 8: + dilation_dict = {2: 2, 3: 4} + elif output_stride == 16: + dilation_dict = {3: 2} + + self.conv1_1 = ConvBNLayer( + in_channels=input_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu') + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu') + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu') + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + # self.block_list = [] + self.stage_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + block_list = [] + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + ############################################################################### + # Add dilation rate for some segmentation tasks, if dilation_dict is not None. 
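+                    # e.g. output_stride=8 gives dilation_dict={2: 2, 3: 4}:
+                    # the last two stages keep stride 1 and dilate their 3x3
+                    # convs by 2 and 4 instead, so features stay at 1/8
+                    # resolution. multi_grid then scales stage 4 per block,
+                    # e.g. (1, 2, 4) turns a base rate of 4 into 4, 8, 16.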
+ dilation_rate = dilation_dict[ + block] if dilation_dict and block in dilation_dict else 1 + + # Actually block here is 'stage', and i is 'block' in 'stage' + # At the stage 4, expand the the dilation_rate if given multi_grid + if block == 3: + dilation_rate = dilation_rate * multi_grid[i] + ############################################################################### + + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 and + dilation_rate == 1 else 1, + shortcut=shortcut, + if_first=block == i == 0, + dilation=dilation_rate)) + + block_list.append(bottleneck_block) + shortcut = True + self.stage_list.append(block_list) + else: + for block in range(len(depth)): + shortcut = False + block_list = [] + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0)) + block_list.append(basic_block) + shortcut = True + self.stage_list.append(block_list) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, inputs): + feat_list = [] + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + feat_list.append(y) + + y = self.pool2d_max(y) + + # A feature list saves the output feature map of each stage. + for stage in self.stage_list: + for block in stage: + y = block(y) + feat_list.append(y) + + return feat_list + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + +@manager.BACKBONES.add_component +def ResNet18_vd(**args): + model = ResNet_vd(layers=18, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet34_vd(**args): + model = ResNet_vd(layers=34, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet50_vd(**args): + model = ResNet_vd(layers=50, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet101_vd(**args): + model = ResNet_vd(layers=101, **args) + return model + + +def ResNet152_vd(**args): + model = ResNet_vd(layers=152, **args) + return model + + +def ResNet200_vd(**args): + model = ResNet_vd(layers=200, **args) + return model diff --git a/ppmatting/models/backbone/vgg.py b/ppmatting/models/backbone/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..64b529bf0c3e25cb82ea4b4c31bec9ef30d2da59 --- /dev/null +++ b/ppmatting/models/backbone/vgg.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
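ResNet_vd trades stride-2 downsampling for dilation when `output_stride` is 8 or 16, so callers get higher-resolution stage features without any change to `feat_channels`. A hedged sketch of instantiating one of the registered variants (arguments and shapes are illustrative):

```python
import paddle
from ppmatting.models.backbone.resnet_vd import ResNet50_vd

backbone = ResNet50_vd(output_stride=32, pretrained=None)
feats = backbone(paddle.randn([1, 3, 512, 512]))

# Stem output plus the four residual stages:
print(backbone.feat_channels)          # [64, 256, 512, 1024, 2048]
print([list(f.shape) for f in feats])
```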
+
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils
+
+
+class ConvBlock(nn.Layer):
+    def __init__(self, input_channels, output_channels, groups, name=None):
+        super(ConvBlock, self).__init__()
+
+        self.groups = groups
+        self._conv_1 = Conv2D(
+            in_channels=input_channels,
+            out_channels=output_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            weight_attr=ParamAttr(name=name + "1_weights"),
+            bias_attr=False)
+        if groups == 2 or groups == 3 or groups == 4:
+            self._conv_2 = Conv2D(
+                in_channels=output_channels,
+                out_channels=output_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                weight_attr=ParamAttr(name=name + "2_weights"),
+                bias_attr=False)
+        if groups == 3 or groups == 4:
+            self._conv_3 = Conv2D(
+                in_channels=output_channels,
+                out_channels=output_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                weight_attr=ParamAttr(name=name + "3_weights"),
+                bias_attr=False)
+        if groups == 4:
+            self._conv_4 = Conv2D(
+                in_channels=output_channels,
+                out_channels=output_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                weight_attr=ParamAttr(name=name + "4_weights"),
+                bias_attr=False)
+
+        self._pool = MaxPool2D(
+            kernel_size=2, stride=2, padding=0, return_mask=True)
+
+    def forward(self, inputs):
+        x = self._conv_1(inputs)
+        x = F.relu(x)
+        if self.groups == 2 or self.groups == 3 or self.groups == 4:
+            x = self._conv_2(x)
+            x = F.relu(x)
+        if self.groups == 3 or self.groups == 4:
+            x = self._conv_3(x)
+            x = F.relu(x)
+        if self.groups == 4:
+            x = self._conv_4(x)
+            x = F.relu(x)
+        skip = x
+        x, max_indices = self._pool(x)
+        return x, max_indices, skip
+
+
+class VGGNet(nn.Layer):
+    def __init__(self, input_channels=3, layers=11, pretrained=None):
+        super(VGGNet, self).__init__()
+        self.pretrained = pretrained
+
+        self.layers = layers
+        self.vgg_configure = {
+            11: [1, 1, 2, 2, 2],
+            13: [2, 2, 2, 2, 2],
+            16: [2, 2, 3, 3, 3],
+            19: [2, 2, 4, 4, 4]
+        }
+        assert self.layers in self.vgg_configure.keys(), \
+            "supported layers are {} but input layer is {}".format(
+                self.vgg_configure.keys(), layers)
+        self.groups = self.vgg_configure[self.layers]
+
+        # For matting, the first convolution takes a 4-channel input
+        # (image + trimap); the extra channel is simply zero-initialized.
+        self._conv_block_1 = ConvBlock(
+            input_channels, 64, self.groups[0], name="conv1_")
+        self._conv_block_2 = ConvBlock(64, 128, self.groups[1], name="conv2_")
+        self._conv_block_3 = ConvBlock(128, 256, self.groups[2], name="conv3_")
+        self._conv_block_4 = ConvBlock(256, 512, self.groups[3], name="conv4_")
+        self._conv_block_5 = ConvBlock(512, 512, self.groups[4], name="conv5_")
+
+        # This layer should be initialized from the converted parameters of
+        # VGG fc6; initialization is left out for now.
+        self._conv_6 = Conv2D(
+            512, 512, kernel_size=3, padding=1, bias_attr=False)
+
+        self.init_weight()
+
+    def forward(self, inputs):
+        fea_list = []
+        ids_list = []
+        x, ids, skip = self._conv_block_1(inputs)
+        fea_list.append(skip)
+        ids_list.append(ids)
+        x, ids, skip = self._conv_block_2(x)
+        fea_list.append(skip)
+        ids_list.append(ids)
+        x, ids, skip = self._conv_block_3(x)
+        fea_list.append(skip)
+        ids_list.append(ids)
+        x, ids, skip = self._conv_block_4(x)
+        fea_list.append(skip)
+        ids_list.append(ids)
+        x, ids, skip = self._conv_block_5(x)
+        fea_list.append(skip)
+        ids_list.append(ids)
+        x = F.relu(self._conv_6(x))
+        fea_list.append(x)
+        return fea_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+@manager.BACKBONES.add_component
+def VGG11(**args):
+    model = VGGNet(layers=11, **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def VGG13(**args):
+    model = VGGNet(layers=13, **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def VGG16(**args):
+    model = VGGNet(layers=16, **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def VGG19(**args):
+    model = VGGNet(layers=19, **args)
+    return model
diff --git a/ppmatting/models/dim.py b/ppmatting/models/dim.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d9ae654322242f785407e61ff7b8405d6b443b4
--- /dev/null
+++ b/ppmatting/models/dim.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddleseg.models import layers
+from paddleseg import utils
+from paddleseg.cvlibs import manager
+
+from ppmatting.models.losses import MRSD
+
+
+@manager.MODELS.add_component
+class DIM(nn.Layer):
+    """
+    The DIM implementation based on PaddlePaddle.
+
+    The original article refers to
+    Ning Xu, et al. "Deep Image Matting"
+    (https://arxiv.org/pdf/1703.03872.pdf).
+
+    Args:
+        backbone: backbone model.
+        stage (int, optional): The stage of the model. Default: 3.
+        decoder_input_channels (int, optional): The channels of the decoder input. Default: 512.
+        pretrained (str, optional): The path of the pretrained model. Default: None.
+ + """ + + def __init__(self, + backbone, + stage=3, + decoder_input_channels=512, + pretrained=None): + super().__init__() + self.backbone = backbone + self.pretrained = pretrained + self.stage = stage + self.loss_func_dict = None + + decoder_output_channels = [64, 128, 256, 512] + self.decoder = Decoder( + input_channels=decoder_input_channels, + output_channels=decoder_output_channels) + if self.stage == 2: + for param in self.backbone.parameters(): + param.stop_gradient = True + for param in self.decoder.parameters(): + param.stop_gradient = True + if self.stage >= 2: + self.refine = Refine() + self.init_weight() + + def forward(self, inputs): + input_shape = paddle.shape(inputs['img'])[-2:] + x = paddle.concat([inputs['img'], inputs['trimap'] / 255], axis=1) + fea_list = self.backbone(x) + + # decoder stage + up_shape = [] + for i in range(5): + up_shape.append(paddle.shape(fea_list[i])[-2:]) + alpha_raw = self.decoder(fea_list, up_shape) + alpha_raw = F.interpolate( + alpha_raw, input_shape, mode='bilinear', align_corners=False) + logit_dict = {'alpha_raw': alpha_raw} + if self.stage < 2: + return logit_dict + + if self.stage >= 2: + # refine stage + refine_input = paddle.concat([inputs['img'], alpha_raw], axis=1) + alpha_refine = self.refine(refine_input) + + # finally alpha + alpha_pred = alpha_refine + alpha_raw + alpha_pred = F.interpolate( + alpha_pred, input_shape, mode='bilinear', align_corners=False) + if not self.training: + alpha_pred = paddle.clip(alpha_pred, min=0, max=1) + logit_dict['alpha_pred'] = alpha_pred + if self.training: + loss_dict = self.loss(logit_dict, inputs) + return logit_dict, loss_dict + else: + return alpha_pred + + def loss(self, logit_dict, label_dict, loss_func_dict=None): + if loss_func_dict is None: + if self.loss_func_dict is None: + self.loss_func_dict = defaultdict(list) + self.loss_func_dict['alpha_raw'].append(MRSD()) + self.loss_func_dict['comp'].append(MRSD()) + self.loss_func_dict['alpha_pred'].append(MRSD()) + else: + self.loss_func_dict = loss_func_dict + + loss = {} + mask = label_dict['trimap'] == 128 + loss['all'] = 0 + + if self.stage != 2: + loss['alpha_raw'] = self.loss_func_dict['alpha_raw'][0]( + logit_dict['alpha_raw'], label_dict['alpha'], mask) + loss['alpha_raw'] = 0.5 * loss['alpha_raw'] + loss['all'] = loss['all'] + loss['alpha_raw'] + + if self.stage == 1 or self.stage == 3: + comp_pred = logit_dict['alpha_raw'] * label_dict['fg'] + \ + (1 - logit_dict['alpha_raw']) * label_dict['bg'] + loss['comp'] = self.loss_func_dict['comp'][0]( + comp_pred, label_dict['img'], mask) + loss['comp'] = 0.5 * loss['comp'] + loss['all'] = loss['all'] + loss['comp'] + + if self.stage == 2 or self.stage == 3: + loss['alpha_pred'] = self.loss_func_dict['alpha_pred'][0]( + logit_dict['alpha_pred'], label_dict['alpha'], mask) + loss['all'] = loss['all'] + loss['alpha_pred'] + + return loss + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +# bilinear interpolate skip connect +class Up(nn.Layer): + def __init__(self, input_channels, output_channels): + super().__init__() + self.conv = layers.ConvBNReLU( + input_channels, + output_channels, + kernel_size=5, + padding=2, + bias_attr=False) + + def forward(self, x, skip, output_shape): + x = F.interpolate( + x, size=output_shape, mode='bilinear', align_corners=False) + x = x + skip + x = self.conv(x) + x = F.relu(x) + + return x + + +class Decoder(nn.Layer): + def __init__(self, input_channels, output_channels=(64, 128, 256, 512)): + 
super().__init__() + self.deconv6 = nn.Conv2D( + input_channels, input_channels, kernel_size=1, bias_attr=False) + self.deconv5 = Up(input_channels, output_channels[-1]) + self.deconv4 = Up(output_channels[-1], output_channels[-2]) + self.deconv3 = Up(output_channels[-2], output_channels[-3]) + self.deconv2 = Up(output_channels[-3], output_channels[-4]) + self.deconv1 = Up(output_channels[-4], 64) + + self.alpha_conv = nn.Conv2D( + 64, 1, kernel_size=5, padding=2, bias_attr=False) + + def forward(self, fea_list, shape_list): + x = fea_list[-1] + x = self.deconv6(x) + x = self.deconv5(x, fea_list[4], shape_list[4]) + x = self.deconv4(x, fea_list[3], shape_list[3]) + x = self.deconv3(x, fea_list[2], shape_list[2]) + x = self.deconv2(x, fea_list[1], shape_list[1]) + x = self.deconv1(x, fea_list[0], shape_list[0]) + alpha = self.alpha_conv(x) + alpha = F.sigmoid(alpha) + + return alpha + + +class Refine(nn.Layer): + def __init__(self): + super().__init__() + self.conv1 = layers.ConvBNReLU( + 4, 64, kernel_size=3, padding=1, bias_attr=False) + self.conv2 = layers.ConvBNReLU( + 64, 64, kernel_size=3, padding=1, bias_attr=False) + self.conv3 = layers.ConvBNReLU( + 64, 64, kernel_size=3, padding=1, bias_attr=False) + self.alpha_pred = layers.ConvBNReLU( + 64, 1, kernel_size=3, padding=1, bias_attr=False) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + alpha = self.alpha_pred(x) + + return alpha diff --git a/ppmatting/models/gca.py b/ppmatting/models/gca.py new file mode 100644 index 0000000000000000000000000000000000000000..369a913570682f85ea696beaf3b78b7c2ec88141 --- /dev/null +++ b/ppmatting/models/gca.py @@ -0,0 +1,305 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
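The DIM model defined in dim.py above takes a dict input and concatenates `img` with the normalized `trimap` into a 4-channel tensor before the encoder, so the backbone must be built with `input_channels=4`. A sketch of stage-3 inference under assumed shapes; nothing here is prescribed by the repo's configs:

```python
import paddle
from ppmatting.models.backbone.vgg import VGG16
from ppmatting.models.dim import DIM

# Stage 3 = encoder-decoder plus the Refine head, trained end to end.
model = DIM(backbone=VGG16(input_channels=4), stage=3)
model.eval()

img = paddle.rand([1, 3, 320, 320])  # toy image
trimap = paddle.randint(0, 256, [1, 1, 320, 320]).astype('float32')
alpha = model({'img': img, 'trimap': trimap})  # (1, 1, 320, 320), clipped to [0, 1]
```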
+ +# The gca code was heavily based on https://github.com/Yaoyi-Li/GCA-Matting +# and https://github.com/open-mmlab/mmediting + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddleseg.models import layers +from paddleseg import utils +from paddleseg.cvlibs import manager, param_init + +from ppmatting.models.layers import GuidedCxtAtten + + +@manager.MODELS.add_component +class GCABaseline(nn.Layer): + def __init__(self, backbone, pretrained=None): + super().__init__() + self.encoder = backbone + self.decoder = ResShortCut_D_Dec([2, 3, 3, 2]) + + def forward(self, inputs): + + x = paddle.concat([inputs['img'], inputs['trimap'] / 255], axis=1) + embedding, mid_fea = self.encoder(x) + alpha_pred = self.decoder(embedding, mid_fea) + + if self.training: + logit_dict = {'alpha_pred': alpha_pred, } + loss_dict = {} + alpha_gt = inputs['alpha'] + loss_dict["alpha"] = F.l1_loss(alpha_pred, alpha_gt) + loss_dict["all"] = loss_dict["alpha"] + return logit_dict, loss_dict + + return alpha_pred + + +@manager.MODELS.add_component +class GCA(GCABaseline): + def __init__(self, backbone, pretrained=None): + super().__init__(backbone, pretrained) + self.decoder = ResGuidedCxtAtten_Dec([2, 3, 3, 2]) + + +def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1): + """5x5 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=5, + stride=stride, + padding=2, + groups=groups, + bias_attr=False, + dilation=dilation) + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias_attr=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D( + in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + upsample=None, + norm_layer=None, + large_kernel=False): + super().__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm + self.stride = stride + conv = conv5x5 if large_kernel else conv3x3 + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + if self.stride > 1: + self.conv1 = nn.utils.spectral_norm( + nn.Conv2DTranspose( + inplanes, + inplanes, + kernel_size=4, + stride=2, + padding=1, + bias_attr=False)) + else: + self.conv1 = nn.utils.spectral_norm(conv(inplanes, inplanes)) + self.bn1 = norm_layer(inplanes) + self.activation = nn.LeakyReLU(0.2) + self.conv2 = nn.utils.spectral_norm(conv(inplanes, planes)) + self.bn2 = norm_layer(planes) + self.upsample = upsample + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.activation(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.upsample is not None: + identity = self.upsample(x) + + out += identity + out = self.activation(out) + + return out + + +class ResNet_D_Dec(nn.Layer): + def __init__(self, + layers=[3, 4, 4, 2], + norm_layer=None, + large_kernel=False, + late_downsample=False): + super().__init__() + + if norm_layer is None: + norm_layer = nn.BatchNorm + self._norm_layer = norm_layer + self.large_kernel = large_kernel + self.kernel_size = 5 if self.large_kernel else 3 + + self.inplanes = 512 if layers[0] > 0 else 256 + self.late_downsample = late_downsample + self.midplanes = 64 if late_downsample else 32 + + self.conv1 = 
nn.utils.spectral_norm(
+            nn.Conv2DTranspose(
+                self.midplanes,
+                32,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+                bias_attr=False))
+        self.bn1 = norm_layer(32)
+        self.leaky_relu = nn.LeakyReLU(0.2)
+        self.conv2 = nn.Conv2D(
+            32,
+            1,
+            kernel_size=self.kernel_size,
+            stride=1,
+            padding=self.kernel_size // 2)
+        self.upsample = nn.UpsamplingNearest2D(scale_factor=2)
+        self.tanh = nn.Tanh()
+        self.layer1 = self._make_layer(BasicBlock, 256, layers[0], stride=2)
+        self.layer2 = self._make_layer(BasicBlock, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(BasicBlock, 64, layers[2], stride=2)
+        self.layer4 = self._make_layer(
+            BasicBlock, self.midplanes, layers[3], stride=2)
+
+        self.init_weight()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        if blocks == 0:
+            return nn.Sequential(nn.Identity())
+        norm_layer = self._norm_layer
+        upsample = None
+        if stride != 1:
+            upsample = nn.Sequential(
+                nn.UpsamplingNearest2D(scale_factor=2),
+                nn.utils.spectral_norm(
+                    conv1x1(self.inplanes, planes * block.expansion)),
+                norm_layer(planes * block.expansion), )
+        elif self.inplanes != planes * block.expansion:
+            upsample = nn.Sequential(
+                nn.utils.spectral_norm(
+                    conv1x1(self.inplanes, planes * block.expansion)),
+                norm_layer(planes * block.expansion), )
+
+        layers = [
+            block(self.inplanes, planes, stride, upsample, norm_layer,
+                  self.large_kernel)
+        ]
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(
+                block(
+                    self.inplanes,
+                    planes,
+                    norm_layer=norm_layer,
+                    large_kernel=self.large_kernel))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x, mid_fea):
+        x = self.layer1(x)  # N x 256 x 32 x 32
+        x = self.layer2(x)  # N x 128 x 64 x 64
+        x = self.layer3(x)  # N x 64 x 128 x 128
+        x = self.layer4(x)  # N x 32 x 256 x 256
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.leaky_relu(x)
+        x = self.conv2(x)
+
+        alpha = (self.tanh(x) + 1.0) / 2.0
+
+        return alpha
+
+    def init_weight(self):
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+
+                if hasattr(layer, "weight_orig"):
+                    param = layer.weight_orig
+                else:
+                    param = layer.weight
+                param_init.xavier_uniform(param)
+
+            elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(layer.weight, value=1.0)
+                param_init.constant_init(layer.bias, value=0.0)
+
+            elif isinstance(layer, BasicBlock):
+                param_init.constant_init(layer.bn2.weight, value=0.0)
+
+
+class ResShortCut_D_Dec(ResNet_D_Dec):
+    def __init__(self,
+                 layers,
+                 norm_layer=None,
+                 large_kernel=False,
+                 late_downsample=False):
+        super().__init__(
+            layers, norm_layer, large_kernel, late_downsample=late_downsample)
+
+    def forward(self, x, mid_fea):
+        fea1, fea2, fea3, fea4, fea5 = mid_fea['shortcut']
+        x = self.layer1(x) + fea5
+        x = self.layer2(x) + fea4
+        x = self.layer3(x) + fea3
+        x = self.layer4(x) + fea2
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.leaky_relu(x) + fea1
+        x = self.conv2(x)
+
+        alpha = (self.tanh(x) + 1.0) / 2.0
+
+        return alpha
+
+
+class ResGuidedCxtAtten_Dec(ResNet_D_Dec):
+    def __init__(self,
+                 layers,
+                 norm_layer=None,
+                 large_kernel=False,
+                 late_downsample=False):
+        super().__init__(
+            layers, norm_layer, large_kernel, late_downsample=late_downsample)
+        self.gca = GuidedCxtAtten(128, 128)
+
+    def forward(self, x, mid_fea):
+        fea1, fea2, fea3, fea4, fea5 = mid_fea['shortcut']
+        im = mid_fea['image_fea']
+        x = self.layer1(x) + fea5  # N x 256 x 32 x 32
+        x = self.layer2(x) + fea4  # N x 128
x 64 x 64 + x = self.gca(im, x, mid_fea['unknown']) # contextual attention + x = self.layer3(x) + fea3 # N x 64 x 128 x 128 + x = self.layer4(x) + fea2 # N x 32 x 256 x 256 + x = self.conv1(x) + x = self.bn1(x) + x = self.leaky_relu(x) + fea1 + x = self.conv2(x) + + alpha = (self.tanh(x) + 1.0) / 2.0 + + return alpha diff --git a/ppmatting/models/human_matting.py b/ppmatting/models/human_matting.py new file mode 100644 index 0000000000000000000000000000000000000000..cf315edfa563fe231a119dd15b749c41157c988c --- /dev/null +++ b/ppmatting/models/human_matting.py @@ -0,0 +1,454 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +import time + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddleseg +from paddleseg.models import layers +from paddleseg import utils +from paddleseg.cvlibs import manager + +from ppmatting.models.losses import MRSD + + +def conv_up_psp(in_channels, out_channels, up_sample): + return nn.Sequential( + layers.ConvBNReLU( + in_channels, out_channels, 3, padding=1), + nn.Upsample( + scale_factor=up_sample, mode='bilinear', align_corners=False)) + + +@manager.MODELS.add_component +class HumanMatting(nn.Layer): + """A model for """ + + def __init__(self, + backbone, + pretrained=None, + backbone_scale=0.25, + refine_kernel_size=3, + if_refine=True): + super().__init__() + if if_refine: + if backbone_scale > 0.5: + raise ValueError( + 'Backbone_scale should not be greater than 1/2, but it is {}' + .format(backbone_scale)) + else: + backbone_scale = 1 + + self.backbone = backbone + self.backbone_scale = backbone_scale + self.pretrained = pretrained + self.if_refine = if_refine + if if_refine: + self.refiner = Refiner(kernel_size=refine_kernel_size) + self.loss_func_dict = None + + self.backbone_channels = backbone.feat_channels + ###################### + ### Decoder part - Glance + ###################### + self.psp_module = layers.PPModule( + self.backbone_channels[-1], + 512, + bin_sizes=(1, 3, 5), + dim_reduction=False, + align_corners=False) + self.psp4 = conv_up_psp(512, 256, 2) + self.psp3 = conv_up_psp(512, 128, 4) + self.psp2 = conv_up_psp(512, 64, 8) + self.psp1 = conv_up_psp(512, 64, 16) + # stage 5g + self.decoder5_g = nn.Sequential( + layers.ConvBNReLU( + 512 + self.backbone_channels[-1], 512, 3, padding=1), + layers.ConvBNReLU( + 512, 512, 3, padding=2, dilation=2), + layers.ConvBNReLU( + 512, 256, 3, padding=2, dilation=2), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 4g + self.decoder4_g = nn.Sequential( + layers.ConvBNReLU( + 512, 256, 3, padding=1), + layers.ConvBNReLU( + 256, 256, 3, padding=1), + layers.ConvBNReLU( + 256, 128, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 3g + self.decoder3_g = nn.Sequential( + layers.ConvBNReLU( + 256, 128, 3, padding=1), + layers.ConvBNReLU( + 128, 128, 3, padding=1), + layers.ConvBNReLU( + 128, 64, 3, 
padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 2g + self.decoder2_g = nn.Sequential( + layers.ConvBNReLU( + 128, 128, 3, padding=1), + layers.ConvBNReLU( + 128, 128, 3, padding=1), + layers.ConvBNReLU( + 128, 64, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 1g + self.decoder1_g = nn.Sequential( + layers.ConvBNReLU( + 128, 64, 3, padding=1), + layers.ConvBNReLU( + 64, 64, 3, padding=1), + layers.ConvBNReLU( + 64, 64, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 0g + self.decoder0_g = nn.Sequential( + layers.ConvBNReLU( + 64, 64, 3, padding=1), + layers.ConvBNReLU( + 64, 64, 3, padding=1), + nn.Conv2D( + 64, 3, 3, padding=1)) + + ########################## + ### Decoder part - FOCUS + ########################## + self.bridge_block = nn.Sequential( + layers.ConvBNReLU( + self.backbone_channels[-1], 512, 3, dilation=2, padding=2), + layers.ConvBNReLU( + 512, 512, 3, dilation=2, padding=2), + layers.ConvBNReLU( + 512, 512, 3, dilation=2, padding=2)) + # stage 5f + self.decoder5_f = nn.Sequential( + layers.ConvBNReLU( + 512 + self.backbone_channels[-1], 512, 3, padding=1), + layers.ConvBNReLU( + 512, 512, 3, padding=2, dilation=2), + layers.ConvBNReLU( + 512, 256, 3, padding=2, dilation=2), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 4f + self.decoder4_f = nn.Sequential( + layers.ConvBNReLU( + 256 + self.backbone_channels[-2], 256, 3, padding=1), + layers.ConvBNReLU( + 256, 256, 3, padding=1), + layers.ConvBNReLU( + 256, 128, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 3f + self.decoder3_f = nn.Sequential( + layers.ConvBNReLU( + 128 + self.backbone_channels[-3], 128, 3, padding=1), + layers.ConvBNReLU( + 128, 128, 3, padding=1), + layers.ConvBNReLU( + 128, 64, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 2f + self.decoder2_f = nn.Sequential( + layers.ConvBNReLU( + 64 + self.backbone_channels[-4], 128, 3, padding=1), + layers.ConvBNReLU( + 128, 128, 3, padding=1), + layers.ConvBNReLU( + 128, 64, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 1f + self.decoder1_f = nn.Sequential( + layers.ConvBNReLU( + 64 + self.backbone_channels[-5], 64, 3, padding=1), + layers.ConvBNReLU( + 64, 64, 3, padding=1), + layers.ConvBNReLU( + 64, 64, 3, padding=1), + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + # stage 0f + self.decoder0_f = nn.Sequential( + layers.ConvBNReLU( + 64, 64, 3, padding=1), + layers.ConvBNReLU( + 64, 64, 3, padding=1), + nn.Conv2D( + 64, 1 + 1 + 32, 3, padding=1)) + self.init_weight() + + def forward(self, data): + src = data['img'] + src_h, src_w = paddle.shape(src)[2:] + if self.if_refine: + # It is not need when exporting. 
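+            # The refiner works at 1/2 and full resolution on top of a
+            # backbone running at backbone_scale <= 1/2, so both sides of
+            # the input must be divisible by 4 for the up- and downsampled
+            # feature sizes to line up again.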
+ if isinstance(src_h, paddle.Tensor): + if (src_h % 4 != 0) or (src_w % 4) != 0: + raise ValueError( + 'The input image must have width and height that are divisible by 4' + ) + + # Downsample src for backbone + src_sm = F.interpolate( + src, + scale_factor=self.backbone_scale, + mode='bilinear', + align_corners=False) + + # Base + fea_list = self.backbone(src_sm) + ########################## + ### Decoder part - GLANCE + ########################## + #psp: N, 512, H/32, W/32 + psp = self.psp_module(fea_list[-1]) + #d6_g: N, 512, H/16, W/16 + d5_g = self.decoder5_g(paddle.concat((psp, fea_list[-1]), 1)) + #d5_g: N, 512, H/8, W/8 + d4_g = self.decoder4_g(paddle.concat((self.psp4(psp), d5_g), 1)) + #d4_g: N, 256, H/4, W/4 + d3_g = self.decoder3_g(paddle.concat((self.psp3(psp), d4_g), 1)) + #d4_g: N, 128, H/2, W/2 + d2_g = self.decoder2_g(paddle.concat((self.psp2(psp), d3_g), 1)) + #d2_g: N, 64, H, W + d1_g = self.decoder1_g(paddle.concat((self.psp1(psp), d2_g), 1)) + #d0_g: N, 3, H, W + d0_g = self.decoder0_g(d1_g) + # The 1st channel is foreground. The 2nd is transition region. The 3rd is background. + # glance_sigmoid = F.sigmoid(d0_g) + glance_sigmoid = F.softmax(d0_g, axis=1) + + ########################## + ### Decoder part - FOCUS + ########################## + bb = self.bridge_block(fea_list[-1]) + #bg: N, 512, H/32, W/32 + d5_f = self.decoder5_f(paddle.concat((bb, fea_list[-1]), 1)) + #d5_f: N, 256, H/16, W/16 + d4_f = self.decoder4_f(paddle.concat((d5_f, fea_list[-2]), 1)) + #d4_f: N, 128, H/8, W/8 + d3_f = self.decoder3_f(paddle.concat((d4_f, fea_list[-3]), 1)) + #d3_f: N, 64, H/4, W/4 + d2_f = self.decoder2_f(paddle.concat((d3_f, fea_list[-4]), 1)) + #d2_f: N, 64, H/2, W/2 + d1_f = self.decoder1_f(paddle.concat((d2_f, fea_list[-5]), 1)) + #d1_f: N, 64, H, W + d0_f = self.decoder0_f(d1_f) + #d0_f: N, 1, H, W + focus_sigmoid = F.sigmoid(d0_f[:, 0:1, :, :]) + pha_sm = self.fusion(glance_sigmoid, focus_sigmoid) + err_sm = d0_f[:, 1:2, :, :] + err_sm = paddle.clip(err_sm, 0., 1.) + hid_sm = F.relu(d0_f[:, 2:, :, :]) + + # Refiner + if self.if_refine: + pha = self.refiner( + src=src, pha=pha_sm, err=err_sm, hid=hid_sm, tri=glance_sigmoid) + # Clamp outputs + pha = paddle.clip(pha, 0., 1.) 
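+        # At this point glance_sigmoid is the 3-way trimap softmax,
+        # focus_sigmoid the transition-region alpha, pha_sm their fusion,
+        # err_sm a self-estimated error map and hid_sm hidden features;
+        # together with glance_sigmoid as a trimap, the last three are
+        # exactly what the refiner above consumed.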
+
+        if self.training:
+            logit_dict = {
+                'glance': glance_sigmoid,
+                'focus': focus_sigmoid,
+                'fusion': pha_sm,
+                'error': err_sm
+            }
+            if self.if_refine:
+                logit_dict['refine'] = pha
+            loss_dict = self.loss(logit_dict, data)
+            return logit_dict, loss_dict
+        else:
+            return pha if self.if_refine else pha_sm
+
+    def loss(self, logit_dict, label_dict, loss_func_dict=None):
+        if loss_func_dict is None:
+            if self.loss_func_dict is None:
+                self.loss_func_dict = defaultdict(list)
+                self.loss_func_dict['glance'].append(nn.NLLLoss())
+                self.loss_func_dict['focus'].append(MRSD())
+                self.loss_func_dict['cm'].append(MRSD())
+                self.loss_func_dict['err'].append(paddleseg.models.MSELoss())
+                self.loss_func_dict['refine'].append(paddleseg.models.L1Loss())
+        else:
+            self.loss_func_dict = loss_func_dict
+
+        loss = {}
+
+        # glance loss computation
+        # get glance label
+        glance_label = F.interpolate(
+            label_dict['trimap'],
+            logit_dict['glance'].shape[2:],
+            mode='nearest',
+            align_corners=False)
+        glance_label_trans = (glance_label == 128).astype('int64')
+        glance_label_bg = (glance_label == 0).astype('int64')
+        glance_label = glance_label_trans + glance_label_bg * 2
+        loss_glance = self.loss_func_dict['glance'][0](
+            paddle.log(logit_dict['glance'] + 1e-6), glance_label.squeeze(1))
+        loss['glance'] = loss_glance
+
+        # focus loss computation
+        focus_label = F.interpolate(
+            label_dict['alpha'],
+            logit_dict['focus'].shape[2:],
+            mode='bilinear',
+            align_corners=False)
+        loss_focus = self.loss_func_dict['focus'][0](
+            logit_dict['focus'], focus_label, glance_label_trans)
+        loss['focus'] = loss_focus
+
+        # collaborative matting loss
+        loss_cm_func = self.loss_func_dict['cm']
+        # fusion_sigmoid loss
+        loss_cm = loss_cm_func[0](logit_dict['fusion'], focus_label)
+        loss['cm'] = loss_cm
+
+        # error loss
+        err = F.interpolate(
+            logit_dict['error'],
+            label_dict['alpha'].shape[2:],
+            mode='bilinear',
+            align_corners=False)
+        err_label = (F.interpolate(
+            logit_dict['fusion'],
+            label_dict['alpha'].shape[2:],
+            mode='bilinear',
+            align_corners=False) - label_dict['alpha']).abs()
+        loss_err = self.loss_func_dict['err'][0](err, err_label)
+        loss['err'] = loss_err
+
+        loss_all = 0.25 * loss_glance + 0.25 * loss_focus + 0.25 * loss_cm + loss_err
+
+        # refine loss
+        if self.if_refine:
+            loss_refine = self.loss_func_dict['refine'][0](logit_dict['refine'],
+                                                           label_dict['alpha'])
+            loss['refine'] = loss_refine
+            loss_all = loss_all + loss_refine
+
+        loss['all'] = loss_all
+        return loss
+
+    def fusion(self, glance_sigmoid, focus_sigmoid):
+        # glance_sigmoid [N, 3, H, W].
+        # In index, 0 is foreground, 1 is transition, 2 is background.
+        # After fusion, the foreground is 1, the background is 0, and the
+        # transition is in (0, 1).
+        index = paddle.argmax(glance_sigmoid, axis=1, keepdim=True)
+        transition_mask = (index == 1).astype('float32')
+        fg = (index == 0).astype('float32')
+        fusion_sigmoid = focus_sigmoid * transition_mask + fg
+        return fusion_sigmoid
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class Refiner(nn.Layer):
+    '''
+    Refiner refines the coarse output to full resolution.
+
+    Args:
+        kernel_size: The convolution kernel_size. Options: [1, 3]. Default: 3.
+ ''' + + def __init__(self, kernel_size=3): + super().__init__() + if kernel_size not in [1, 3]: + raise ValueError("kernel_size must be in [1, 3]") + + self.kernel_size = kernel_size + + channels = [32, 24, 16, 12, 1] + self.conv1 = layers.ConvBNReLU( + channels[0] + 4 + 3, + channels[1], + kernel_size, + padding=0, + bias_attr=False) + self.conv2 = layers.ConvBNReLU( + channels[1], channels[2], kernel_size, padding=0, bias_attr=False) + self.conv3 = layers.ConvBNReLU( + channels[2] + 3, + channels[3], + kernel_size, + padding=0, + bias_attr=False) + self.conv4 = nn.Conv2D( + channels[3], channels[4], kernel_size, padding=0, bias_attr=True) + + def forward(self, src, pha, err, hid, tri): + ''' + Args: + src: (B, 3, H, W) full resolution source image. + pha: (B, 1, Hc, Wc) coarse alpha prediction. + err: (B, 1, Hc, Hc) coarse error prediction. + hid: (B, 32, Hc, Hc) coarse hidden encoding. + tri: (B, 1, Hc, Hc) trimap prediction. + ''' + h_full, w_full = paddle.shape(src)[2:] + h_half, w_half = h_full // 2, w_full // 2 + h_quat, w_quat = h_full // 4, w_full // 4 + + x = paddle.concat([hid, pha, tri], axis=1) + x = F.interpolate( + x, + paddle.concat((h_half, w_half)), + mode='bilinear', + align_corners=False) + y = F.interpolate( + src, + paddle.concat((h_half, w_half)), + mode='bilinear', + align_corners=False) + + if self.kernel_size == 3: + x = F.pad(x, [3, 3, 3, 3]) + y = F.pad(y, [3, 3, 3, 3]) + + x = self.conv1(paddle.concat([x, y], axis=1)) + x = self.conv2(x) + + if self.kernel_size == 3: + x = F.interpolate(x, paddle.concat((h_full + 4, w_full + 4))) + y = F.pad(src, [2, 2, 2, 2]) + else: + x = F.interpolate( + x, paddle.concat((h_full, w_full)), mode='nearest') + y = src + + x = self.conv3(paddle.concat([x, y], axis=1)) + x = self.conv4(x) + + pha = x + return pha diff --git a/ppmatting/models/layers/__init__.py b/ppmatting/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..31eba2cacd64eddaf0734495b5a992a86b7bad37 --- /dev/null +++ b/ppmatting/models/layers/__init__.py @@ -0,0 +1,15 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .gca_module import GuidedCxtAtten diff --git a/ppmatting/models/layers/gca_module.py b/ppmatting/models/layers/gca_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8654efc9bd24de2e127393ad8338d21964e4a5 --- /dev/null +++ b/ppmatting/models/layers/gca_module.py @@ -0,0 +1,211 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# The gca code was heavily based on https://github.com/Yaoyi-Li/GCA-Matting +# and https://github.com/open-mmlab/mmediting + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleseg.cvlibs import param_init + + +class GuidedCxtAtten(nn.Layer): + def __init__(self, + out_channels, + guidance_channels, + kernel_size=3, + stride=1, + rate=2): + super().__init__() + + self.kernel_size = kernel_size + self.rate = rate + self.stride = stride + self.guidance_conv = nn.Conv2D( + in_channels=guidance_channels, + out_channels=guidance_channels // 2, + kernel_size=1) + + self.out_conv = nn.Sequential( + nn.Conv2D( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1, + bias_attr=False), + nn.BatchNorm(out_channels)) + + self.init_weight() + + def init_weight(self): + param_init.xavier_uniform(self.guidance_conv.weight) + param_init.constant_init(self.guidance_conv.bias, value=0.0) + param_init.xavier_uniform(self.out_conv[0].weight) + param_init.constant_init(self.out_conv[1].weight, value=1e-3) + param_init.constant_init(self.out_conv[1].bias, value=0.0) + + def forward(self, img_feat, alpha_feat, unknown=None, softmax_scale=1.): + + img_feat = self.guidance_conv(img_feat) + img_feat = F.interpolate( + img_feat, scale_factor=1 / self.rate, mode='nearest') + + # process unknown mask + unknown, softmax_scale = self.process_unknown_mask(unknown, img_feat, + softmax_scale) + + img_ps, alpha_ps, unknown_ps = self.extract_feature_maps_patches( + img_feat, alpha_feat, unknown) + + self_mask = self.get_self_correlation_mask(img_feat) + + # split tensors by batch dimension; tuple is returned + img_groups = paddle.split(img_feat, 1, axis=0) + img_ps_groups = paddle.split(img_ps, 1, axis=0) + alpha_ps_groups = paddle.split(alpha_ps, 1, axis=0) + unknown_ps_groups = paddle.split(unknown_ps, 1, axis=0) + scale_groups = paddle.split(softmax_scale, 1, axis=0) + groups = (img_groups, img_ps_groups, alpha_ps_groups, unknown_ps_groups, + scale_groups) + + y = [] + + for img_i, img_ps_i, alpha_ps_i, unknown_ps_i, scale_i in zip(*groups): + # conv for compare + similarity_map = self.compute_similarity_map(img_i, img_ps_i) + + gca_score = self.compute_guided_attention_score( + similarity_map, unknown_ps_i, scale_i, self_mask) + + yi = self.propagate_alpha_feature(gca_score, alpha_ps_i) + + y.append(yi) + + y = paddle.concat(y, axis=0) # back to the mini-batch + y = paddle.reshape(y, alpha_feat.shape) + + y = self.out_conv(y) + alpha_feat + + return y + + def extract_feature_maps_patches(self, img_feat, alpha_feat, unknown): + + # extract image feature patches with shape: + # (N, img_h*img_w, img_c, img_ks, img_ks) + img_ks = self.kernel_size + img_ps = self.extract_patches(img_feat, img_ks, self.stride) + + # extract alpha feature patches with shape: + # (N, img_h*img_w, alpha_c, alpha_ks, alpha_ks) + alpha_ps = self.extract_patches(alpha_feat, self.rate * 2, self.rate) + + # extract unknown mask patches with shape: (N, img_h*img_w, 1, 1) + unknown_ps = self.extract_patches(unknown, img_ks, self.stride) + unknown_ps = unknown_ps.squeeze(axis=2) # squeeze channel dimension + unknown_ps = unknown_ps.mean(axis=[2, 3], keepdim=True) + + return img_ps, alpha_ps, unknown_ps + + def extract_patches(self, x, kernel_size, stride): + n, c, _, _ = x.shape + x = self.pad(x, kernel_size, stride) + x = F.unfold(x, [kernel_size, kernel_size], strides=[stride, stride]) + x = 
paddle.transpose(x, (0, 2, 1)) + x = paddle.reshape(x, (n, -1, c, kernel_size, kernel_size)) + + return x + + def pad(self, x, kernel_size, stride): + left = (kernel_size - stride + 1) // 2 + right = (kernel_size - stride) // 2 + pad = (left, right, left, right) + return F.pad(x, pad, mode='reflect') + + def compute_guided_attention_score(self, similarity_map, unknown_ps, scale, + self_mask): + # scale the correlation with predicted scale factor for known and + # unknown area + unknown_scale, known_scale = scale[0] + out = similarity_map * ( + unknown_scale * paddle.greater_than(unknown_ps, + paddle.to_tensor([0.])) + + known_scale * paddle.less_equal(unknown_ps, paddle.to_tensor([0.]))) + # mask itself, self-mask only applied to unknown area + out = out + self_mask * unknown_ps + gca_score = F.softmax(out, axis=1) + + return gca_score + + def propagate_alpha_feature(self, gca_score, alpha_ps): + + alpha_ps = alpha_ps[0] # squeeze dim 0 + if self.rate == 1: + gca_score = self.pad(gca_score, kernel_size=2, stride=1) + alpha_ps = paddle.transpose(alpha_ps, (1, 0, 2, 3)) + out = F.conv2d(gca_score, alpha_ps) / 4. + else: + out = F.conv2d_transpose( + gca_score, alpha_ps, stride=self.rate, padding=1) / 4. + + return out + + def compute_similarity_map(self, img_feat, img_ps): + img_ps = img_ps[0] # squeeze dim 0 + # convolve the feature to get correlation (similarity) map + img_ps_normed = img_ps / paddle.clip(self.l2_norm(img_ps), 1e-4) + img_feat = F.pad(img_feat, (1, 1, 1, 1), mode='reflect') + similarity_map = F.conv2d(img_feat, img_ps_normed) + + return similarity_map + + def get_self_correlation_mask(self, img_feat): + _, _, h, w = img_feat.shape + self_mask = F.one_hot( + paddle.reshape(paddle.arange(h * w), (h, w)), + num_classes=int(h * w)) + + self_mask = paddle.transpose(self_mask, (2, 0, 1)) + self_mask = paddle.reshape(self_mask, (1, h * w, h, w)) + + return self_mask * (-1e4) + + def process_unknown_mask(self, unknown, img_feat, softmax_scale): + + n, _, h, w = img_feat.shape + + if unknown is not None: + unknown = unknown.clone() + unknown = F.interpolate( + unknown, scale_factor=1 / self.rate, mode='nearest') + unknown_mean = unknown.mean(axis=[2, 3]) + known_mean = 1 - unknown_mean + unknown_scale = paddle.clip( + paddle.sqrt(unknown_mean / known_mean), 0.1, 10) + known_scale = paddle.clip( + paddle.sqrt(known_mean / unknown_mean), 0.1, 10) + softmax_scale = paddle.concat([unknown_scale, known_scale], axis=1) + else: + unknown = paddle.ones([n, 1, h, w]) + softmax_scale = paddle.reshape( + paddle.to_tensor([softmax_scale, softmax_scale]), (1, 2)) + softmax_scale = paddle.expand(softmax_scale, (n, 2)) + + return unknown, softmax_scale + + @staticmethod + def l2_norm(x): + x = x**2 + x = x.sum(axis=[1, 2, 3], keepdim=True) + return paddle.sqrt(x) diff --git a/ppmatting/models/losses/__init__.py b/ppmatting/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e309f46c7edd25ff514e670a567b23a14e5fd27 --- /dev/null +++ b/ppmatting/models/losses/__init__.py @@ -0,0 +1 @@ +from .loss import * diff --git a/ppmatting/models/losses/loss.py b/ppmatting/models/losses/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..750cb7b33b075c0c890e72a44ba041ad11b1bc4a --- /dev/null +++ b/ppmatting/models/losses/loss.py @@ -0,0 +1,163 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class MRSD(nn.Layer):
+    def __init__(self, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+
+    def forward(self, logit, label, mask=None):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32 or float64.
+            label (Tensor): Label tensor, the data type is float32 or float64. The shape should be the same as that of logit.
+            mask (Tensor, optional): The mask where the loss is valid. Default: None.
+        """
+        if len(label.shape) == 3:
+            label = label.unsqueeze(1)
+        sd = paddle.square(logit - label)
+        loss = paddle.sqrt(sd + self.eps)
+        if mask is not None:
+            mask = mask.astype('float32')
+            if len(mask.shape) == 3:
+                mask = mask.unsqueeze(1)
+            loss = loss * mask
+            loss = loss.sum() / (mask.sum() + self.eps)
+            mask.stop_gradient = True
+        else:
+            loss = loss.mean()
+
+        return loss
+
+
+@manager.LOSSES.add_component
+class GradientLoss(nn.Layer):
+    def __init__(self, eps=1e-6):
+        super().__init__()
+        self.kernel_x, self.kernel_y = self.sobel_kernel()
+        self.eps = eps
+
+    def forward(self, logit, label, mask=None):
+        if len(label.shape) == 3:
+            label = label.unsqueeze(1)
+        if mask is not None:
+            if len(mask.shape) == 3:
+                mask = mask.unsqueeze(1)
+            logit = logit * mask
+            label = label * mask
+            loss = paddle.sum(
+                F.l1_loss(self.sobel(logit), self.sobel(label), 'none')) / (
+                    mask.sum() + self.eps)
+        else:
+            loss = F.l1_loss(self.sobel(logit), self.sobel(label), 'mean')
+
+        return loss
+
+    def sobel(self, input):
+        """Use the Sobel operator to compute the gradient and return its magnitude."""
+        if not len(input.shape) == 4:
+            raise ValueError("Invalid input shape, we expect NCHW, but it is ",
+                             input.shape)
+
+        n, c, h, w = input.shape
+
+        input_pad = paddle.reshape(input, (n * c, 1, h, w))
+        input_pad = F.pad(input_pad, pad=[1, 1, 1, 1], mode='replicate')
+
+        grad_x = F.conv2d(input_pad, self.kernel_x, padding=0)
+        grad_y = F.conv2d(input_pad, self.kernel_y, padding=0)
+
+        mag = paddle.sqrt(grad_x * grad_x + grad_y * grad_y + self.eps)
+        mag = paddle.reshape(mag, (n, c, h, w))
+
+        return mag
+
+    def sobel_kernel(self):
+        kernel_x = paddle.to_tensor([[-1.0, 0.0, 1.0], [-2.0, 0.0, 2.0],
+                                     [-1.0, 0.0, 1.0]]).astype('float32')
+        kernel_x = kernel_x / kernel_x.abs().sum()
+        kernel_y = kernel_x.transpose([1, 0])
+        kernel_x = kernel_x.unsqueeze(0).unsqueeze(0)
+        kernel_y = kernel_y.unsqueeze(0).unsqueeze(0)
+        kernel_x.stop_gradient = True
+        kernel_y.stop_gradient = True
+        return kernel_x, kernel_y
+
+
+@manager.LOSSES.add_component
+class LaplacianLoss(nn.Layer):
+    """
+    The Laplacian loss refers to
+    https://github.com/JizhiziLi/AIM/blob/master/core/evaluate.py#L83
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.gauss_kernel = self.build_gauss_kernel(
+            size=5, sigma=1.0, n_channels=1)
+
+    def forward(self, logit, label, mask=None):
+        if len(label.shape) == 3:
+            label = label.unsqueeze(1)
+        if mask is not None:
+            if len(mask.shape) == 3:
+                mask = mask.unsqueeze(1)
+            logit = logit * mask
+            label = label * mask
+        pyr_label = self.laplacian_pyramid(label, self.gauss_kernel, 5)
+        pyr_logit = self.laplacian_pyramid(logit, self.gauss_kernel, 5)
+        loss = sum(F.l1_loss(a, b) for a, b in zip(pyr_label, pyr_logit))
+
+        return loss
+
+    def build_gauss_kernel(self, size=5, sigma=1.0, n_channels=1):
+        if size % 2 != 1:
+            raise ValueError("kernel size must be odd")
+        grid = np.float32(np.mgrid[0:size, 0:size].T)
+        gaussian = lambda x: np.exp((x - size // 2)**2 / (-2 * sigma**2))**2
+        kernel = np.sum(gaussian(grid), axis=2)
+        kernel /= np.sum(kernel)
+        kernel = np.tile(kernel, (n_channels, 1, 1))
+        kernel = paddle.to_tensor(kernel[:, None, :, :])
+        kernel.stop_gradient = True
+        return kernel
+
+    def conv_gauss(self, input, kernel):
+        n_channels, _, kh, kw = kernel.shape
+        x = F.pad(input, (kh // 2, kw // 2, kh // 2, kw // 2), mode='replicate')
+        x = F.conv2d(x, kernel, groups=n_channels)
+
+        return x
+
+    def laplacian_pyramid(self, input, kernel, max_levels=5):
+        current = input
+        pyr = []
+        for level in range(max_levels):
+            filtered = self.conv_gauss(current, kernel)
+            diff = current - filtered
+            pyr.append(diff)
+            current = F.avg_pool2d(filtered, 2)
+        pyr.append(current)
+        return pyr
diff --git a/ppmatting/models/modnet.py b/ppmatting/models/modnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecadfdd1a1710980e36a23bc82717e3081ad64e9
--- /dev/null
+++ b/ppmatting/models/modnet.py
@@ -0,0 +1,494 @@
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
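All three losses in loss.py accept an optional mask, which is how the trainers restrict supervision to the trimap's unknown band. A small usage sketch with toy tensors:

```python
import paddle
from ppmatting.models.losses import MRSD

pred = paddle.rand([2, 1, 64, 64])    # predicted alpha
gt = paddle.rand([2, 1, 64, 64])      # ground-truth alpha
trimap = paddle.randint(0, 2, [2, 1, 64, 64]) * 128  # toy trimap

# Only pixels in the unknown region (trimap == 128) contribute.
loss = MRSD()(pred, gt, mask=(trimap == 128))
print(float(loss))
```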
diff --git a/ppmatting/models/modnet.py b/ppmatting/models/modnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecadfdd1a1710980e36a23bc82717e3081ad64e9
--- /dev/null
+++ b/ppmatting/models/modnet.py
@@ -0,0 +1,494 @@
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import scipy.ndimage
+import paddleseg
+from paddleseg.models import layers, losses
+from paddleseg import utils
+from paddleseg.cvlibs import manager, param_init
+
+
+@manager.MODELS.add_component
+class MODNet(nn.Layer):
+    """
+    The MODNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Zhanghan Ke, et al. "Is a Green Screen Really Necessary for Real-Time Portrait Matting?"
+    (https://arxiv.org/pdf/2011.11961.pdf).
+
+    Args:
+        backbone: backbone model.
+        hr_channels(int, optional): The channels of the high-resolution branch. Default: 32.
+        pretrained(str, optional): The path of the pretrained model. Default: None.
+    """
+
+    def __init__(self, backbone, hr_channels=32, pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.pretrained = pretrained
+        self.head = MODNetHead(
+            hr_channels=hr_channels, backbone_channels=backbone.feat_channels)
+        self.init_weight()
+        self.blurer = GaussianBlurLayer(1, 3)
+        self.loss_func_dict = None
+
+    def forward(self, inputs):
+        """
+        If training, return a dict.
+        If evaluation, return the final alpha prediction.
+        """
+        x = inputs['img']
+        feat_list = self.backbone(x)
+        y = self.head(inputs=inputs, feat_list=feat_list)
+        if self.training:
+            loss = self.loss(y, inputs)
+            return y, loss
+        else:
+            return y
+
+    def loss(self, logit_dict, label_dict, loss_func_dict=None):
+        if loss_func_dict is None:
+            if self.loss_func_dict is None:
+                self.loss_func_dict = defaultdict(list)
+                self.loss_func_dict['semantic'].append(
+                    paddleseg.models.MSELoss())
+                self.loss_func_dict['detail'].append(paddleseg.models.L1Loss())
+                self.loss_func_dict['fusion'].append(paddleseg.models.L1Loss())
+                self.loss_func_dict['fusion'].append(paddleseg.models.L1Loss())
+        else:
+            self.loss_func_dict = loss_func_dict
+
+        loss = {}
+        # semantic loss
+        semantic_gt = F.interpolate(
+            label_dict['alpha'],
+            scale_factor=1 / 16,
+            mode='bilinear',
+            align_corners=False)
+        semantic_gt = self.blurer(semantic_gt)
+        # semantic_gt.stop_gradient=True
+        loss['semantic'] = self.loss_func_dict['semantic'][0](
+            logit_dict['semantic'], semantic_gt)
+
+        # detail loss
+        trimap = label_dict['trimap']
+        mask = (trimap == 128).astype('float32')
+        logit_detail = logit_dict['detail'] * mask
+        label_detail = label_dict['alpha'] * mask
+        loss_detail = self.loss_func_dict['detail'][0](logit_detail,
+                                                       label_detail)
+        loss_detail = loss_detail / (mask.mean() + 1e-6)
+        loss['detail'] = 10 * loss_detail
+
+        # fusion loss
+        matte = logit_dict['matte']
+        alpha = label_dict['alpha']
+        transition_mask = label_dict['trimap'] == 128
+        matte_boundary = paddle.where(transition_mask, matte, alpha)
+        # l1 loss
+        loss_fusion_l1 = self.loss_func_dict['fusion'][0](
+            matte, alpha) + 4 * self.loss_func_dict['fusion'][0](matte_boundary,
+                                                                 alpha)
+        # composition loss
+        loss_fusion_comp = self.loss_func_dict['fusion'][1](
+            matte * label_dict['img'], alpha *
+            label_dict['img']) + 4 * self.loss_func_dict['fusion'][1](
+                matte_boundary * label_dict['img'], alpha * label_dict['img'])
+        # consistency loss with the semantic branch
+        transition_mask = F.interpolate(
+            label_dict['trimap'],
+            scale_factor=1 / 16,
+            mode='nearest',
+            align_corners=False)
+        transition_mask = transition_mask == 128
+        matte_con_sem = F.interpolate(
+            matte, scale_factor=1 / 16, mode='bilinear', align_corners=False)
+        matte_con_sem = self.blurer(matte_con_sem)
+        logit_semantic = logit_dict['semantic'].clone()
logit_semantic.stop_gradient = True + matte_con_sem = paddle.where(transition_mask, logit_semantic, + matte_con_sem) + if False: + import cv2 + matte_con_sem_num = matte_con_sem.numpy() + matte_con_sem_num = matte_con_sem_num[0].squeeze() + matte_con_sem_num = (matte_con_sem_num * 255).astype('uint8') + semantic = logit_dict['semantic'].numpy() + semantic = semantic[0].squeeze() + semantic = (semantic * 255).astype('uint8') + transition_mask = transition_mask.astype('uint8') + transition_mask = transition_mask.numpy() + transition_mask = (transition_mask[0].squeeze()) * 255 + cv2.imwrite('matte_con.png', matte_con_sem_num) + cv2.imwrite('semantic.png', semantic) + cv2.imwrite('transition.png', transition_mask) + mse_loss = paddleseg.models.MSELoss() + loss_fusion_con_sem = mse_loss(matte_con_sem, logit_dict['semantic']) + loss_fusion = loss_fusion_l1 + loss_fusion_comp + loss_fusion_con_sem + loss['fusion'] = loss_fusion + loss['fusion_l1'] = loss_fusion_l1 + loss['fusion_comp'] = loss_fusion_comp + loss['fusion_con_sem'] = loss_fusion_con_sem + + loss['all'] = loss['semantic'] + loss['detail'] + loss['fusion'] + + return loss + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class MODNetHead(nn.Layer): + def __init__(self, hr_channels, backbone_channels): + super().__init__() + + self.lr_branch = LRBranch(backbone_channels) + self.hr_branch = HRBranch(hr_channels, backbone_channels) + self.f_branch = FusionBranch(hr_channels, backbone_channels) + self.init_weight() + + def forward(self, inputs, feat_list): + pred_semantic, lr8x, [enc2x, enc4x] = self.lr_branch(feat_list) + pred_detail, hr2x = self.hr_branch(inputs['img'], enc2x, enc4x, lr8x) + pred_matte = self.f_branch(inputs['img'], lr8x, hr2x) + + if self.training: + logit_dict = { + 'semantic': pred_semantic, + 'detail': pred_detail, + 'matte': pred_matte + } + return logit_dict + else: + return pred_matte + + def init_weight(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + param_init.kaiming_uniform(layer.weight) + + +class FusionBranch(nn.Layer): + def __init__(self, hr_channels, enc_channels): + super().__init__() + self.conv_lr4x = Conv2dIBNormRelu( + enc_channels[2], hr_channels, 5, stride=1, padding=2) + + self.conv_f2x = Conv2dIBNormRelu( + 2 * hr_channels, hr_channels, 3, stride=1, padding=1) + self.conv_f = nn.Sequential( + Conv2dIBNormRelu( + hr_channels + 3, int(hr_channels / 2), 3, stride=1, padding=1), + Conv2dIBNormRelu( + int(hr_channels / 2), + 1, + 1, + stride=1, + padding=0, + with_ibn=False, + with_relu=False)) + + def forward(self, img, lr8x, hr2x): + lr4x = F.interpolate( + lr8x, scale_factor=2, mode='bilinear', align_corners=False) + lr4x = self.conv_lr4x(lr4x) + lr2x = F.interpolate( + lr4x, scale_factor=2, mode='bilinear', align_corners=False) + + f2x = self.conv_f2x(paddle.concat((lr2x, hr2x), axis=1)) + f = F.interpolate( + f2x, scale_factor=2, mode='bilinear', align_corners=False) + f = self.conv_f(paddle.concat((f, img), axis=1)) + pred_matte = F.sigmoid(f) + + return pred_matte + + +class HRBranch(nn.Layer): + """ + High Resolution Branch of MODNet + """ + + def __init__(self, hr_channels, enc_channels): + super().__init__() + + self.tohr_enc2x = Conv2dIBNormRelu( + enc_channels[0], hr_channels, 1, stride=1, padding=0) + self.conv_enc2x = Conv2dIBNormRelu( + hr_channels + 3, hr_channels, 3, stride=2, padding=1) + + self.tohr_enc4x = Conv2dIBNormRelu( + enc_channels[1], hr_channels, 1, stride=1, padding=0) + 
self.conv_enc4x = Conv2dIBNormRelu( + 2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1) + + self.conv_hr4x = nn.Sequential( + Conv2dIBNormRelu( + 2 * hr_channels + enc_channels[2] + 3, + 2 * hr_channels, + 3, + stride=1, + padding=1), + Conv2dIBNormRelu( + 2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu( + 2 * hr_channels, hr_channels, 3, stride=1, padding=1)) + + self.conv_hr2x = nn.Sequential( + Conv2dIBNormRelu( + 2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu( + 2 * hr_channels, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu( + hr_channels, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu( + hr_channels, hr_channels, 3, stride=1, padding=1)) + + self.conv_hr = nn.Sequential( + Conv2dIBNormRelu( + hr_channels + 3, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu( + hr_channels, + 1, + 1, + stride=1, + padding=0, + with_ibn=False, + with_relu=False)) + + def forward(self, img, enc2x, enc4x, lr8x): + img2x = F.interpolate( + img, scale_factor=1 / 2, mode='bilinear', align_corners=False) + img4x = F.interpolate( + img, scale_factor=1 / 4, mode='bilinear', align_corners=False) + + enc2x = self.tohr_enc2x(enc2x) + hr4x = self.conv_enc2x(paddle.concat((img2x, enc2x), axis=1)) + + enc4x = self.tohr_enc4x(enc4x) + hr4x = self.conv_enc4x(paddle.concat((hr4x, enc4x), axis=1)) + + lr4x = F.interpolate( + lr8x, scale_factor=2, mode='bilinear', align_corners=False) + hr4x = self.conv_hr4x(paddle.concat((hr4x, lr4x, img4x), axis=1)) + + hr2x = F.interpolate( + hr4x, scale_factor=2, mode='bilinear', align_corners=False) + hr2x = self.conv_hr2x(paddle.concat((hr2x, enc2x), axis=1)) + + pred_detail = None + if self.training: + hr = F.interpolate( + hr2x, scale_factor=2, mode='bilinear', align_corners=False) + hr = self.conv_hr(paddle.concat((hr, img), axis=1)) + pred_detail = F.sigmoid(hr) + + return pred_detail, hr2x + + +class LRBranch(nn.Layer): + def __init__(self, backbone_channels): + super().__init__() + self.se_block = SEBlock(backbone_channels[4], reduction=4) + self.conv_lr16x = Conv2dIBNormRelu( + backbone_channels[4], backbone_channels[3], 5, stride=1, padding=2) + self.conv_lr8x = Conv2dIBNormRelu( + backbone_channels[3], backbone_channels[2], 5, stride=1, padding=2) + self.conv_lr = Conv2dIBNormRelu( + backbone_channels[2], + 1, + 3, + stride=2, + padding=1, + with_ibn=False, + with_relu=False) + + def forward(self, feat_list): + enc2x, enc4x, enc32x = feat_list[0], feat_list[1], feat_list[4] + + enc32x = self.se_block(enc32x) + lr16x = F.interpolate( + enc32x, scale_factor=2, mode='bilinear', align_corners=False) + lr16x = self.conv_lr16x(lr16x) + lr8x = F.interpolate( + lr16x, scale_factor=2, mode='bilinear', align_corners=False) + lr8x = self.conv_lr8x(lr8x) + + pred_semantic = None + if self.training: + lr = self.conv_lr(lr8x) + pred_semantic = F.sigmoid(lr) + + return pred_semantic, lr8x, [enc2x, enc4x] + + +class IBNorm(nn.Layer): + """ + Combine Instance Norm and Batch Norm into One Layer + """ + + def __init__(self, in_channels): + super().__init__() + self.bnorm_channels = in_channels // 2 + self.inorm_channels = in_channels - self.bnorm_channels + + self.bnorm = nn.BatchNorm2D(self.bnorm_channels) + self.inorm = nn.InstanceNorm2D(self.inorm_channels) + + def forward(self, x): + bn_x = self.bnorm(x[:, :self.bnorm_channels, :, :]) + in_x = self.inorm(x[:, self.bnorm_channels:, :, :]) + + return paddle.concat((bn_x, in_x), 1) + + +class Conv2dIBNormRelu(nn.Layer): + """ + Convolution + 
IBNorm + Relu
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias_attr=None,
+                 with_ibn=True,
+                 with_relu=True):
+
+        super().__init__()
+
+        layers = [
+            nn.Conv2D(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                bias_attr=bias_attr)
+        ]
+
+        if with_ibn:
+            layers.append(IBNorm(out_channels))
+
+        if with_relu:
+            layers.append(nn.ReLU())
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+class SEBlock(nn.Layer):
+    """
+    SE block proposed in https://arxiv.org/pdf/1709.01507.pdf
+    """
+
+    def __init__(self, num_channels, reduction=1):
+        super().__init__()
+        self.pool = nn.AdaptiveAvgPool2D(1)
+        self.conv = nn.Sequential(
+            nn.Conv2D(
+                num_channels,
+                int(num_channels // reduction),
+                1,
+                bias_attr=False),
+            nn.ReLU(),
+            nn.Conv2D(
+                int(num_channels // reduction),
+                num_channels,
+                1,
+                bias_attr=False),
+            nn.Sigmoid())
+
+    def forward(self, x):
+        w = self.pool(x)
+        w = self.conv(w)
+        return w * x
+
+
+class GaussianBlurLayer(nn.Layer):
+    """Apply Gaussian blur to a 4D tensor.
+
+    This layer takes a 4D tensor of shape [N, C, H, W] as input.
+    The Gaussian blur is performed on each of the C channels separately.
+    """
+
+    def __init__(self, channels, kernel_size):
+        """
+        Args:
+            channels (int): Number of channels of the input tensor.
+            kernel_size (int): Size of the kernel used in blurring; must be odd.
+        """
+
+        super().__init__()
+        self.channels = channels
+        self.kernel_size = kernel_size
+        if self.kernel_size % 2 == 0:
+            raise ValueError('kernel_size must be odd')
+
+        self.op = nn.Sequential(
+            nn.Pad2D(
+                int(self.kernel_size / 2), mode='reflect'),
+            nn.Conv2D(
+                channels,
+                channels,
+                self.kernel_size,
+                stride=1,
+                padding=0,
+                bias_attr=False,
+                groups=channels))
+
+        self._init_kernel()
+        self.op[1].weight.stop_gradient = True
+
+    def forward(self, x):
+        """
+        Args:
+            x (paddle.Tensor): Input 4D tensor.
+        Returns:
+            paddle.Tensor: Blurred version of the input.
+        """
+        if not len(list(x.shape)) == 4:
+            raise ValueError(
+                "'GaussianBlurLayer' requires a 4D tensor as input")
+        elif not x.shape[1] == self.channels:
+            raise ValueError(
+                "In 'GaussianBlurLayer', the required channel ({}) is "
+                "not the same as input ({})".format(self.channels, x.shape[1]))
+
+        return self.op(x)
+
+    def _init_kernel(self):
+        sigma = 0.3 * ((self.kernel_size - 1) * 0.5 - 1) + 0.8
+
+        n = np.zeros((self.kernel_size, self.kernel_size))
+        i = int(self.kernel_size / 2)
+        n[i, i] = 1
+        kernel = scipy.ndimage.gaussian_filter(n, sigma)
+        kernel = kernel.astype('float32')
+        kernel = kernel[np.newaxis, np.newaxis, :, :]
+        paddle.assign(kernel, self.op[1].weight)
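`GaussianBlurLayer` is easy to sanity-check numerically (a sketch, assuming paddlepaddle and scipy are installed; the check itself is not repository code): the depthwise kernel is a normalized Gaussian, so blurring should roughly preserve the per-channel mean.

```python
# Hypothetical check: a normalized blur kernel keeps the output mean close
# to the input mean (reflect padding only perturbs it slightly at borders).
import paddle
from ppmatting.models.modnet import GaussianBlurLayer

blur = GaussianBlurLayer(channels=1, kernel_size=3)
x = paddle.rand([1, 1, 32, 32])
y = blur(x)
print(float(x.mean()), float(y.mean()))  # nearly identical values
```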
diff --git a/ppmatting/models/ppmatting.py b/ppmatting/models/ppmatting.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ed14528b5e598eda3a8fd6030a51ecc81dc6e3c
--- /dev/null
+++ b/ppmatting/models/ppmatting.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddleseg
+from paddleseg.models import layers
+from paddleseg import utils
+from paddleseg.cvlibs import manager
+
+from ppmatting.models.losses import MRSD, GradientLoss
+from ppmatting.models.backbone import resnet_vd
+
+
+@manager.MODELS.add_component
+class PPMatting(nn.Layer):
+    """
+    The PP-Matting implementation based on PaddlePaddle.
+
+    The original article refers to
+    Guowei Chen, et al. "PP-Matting: High-Accuracy Natural Image Matting"
+    (https://arxiv.org/pdf/2204.09433.pdf).
+
+    Args:
+        backbone: backbone model.
+        pretrained(str, optional): The path of the pretrained model. Default: None.
+    """
+
+    def __init__(self, backbone, pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.pretrained = pretrained
+        self.loss_func_dict = self.get_loss_func_dict()
+
+        self.backbone_channels = backbone.feat_channels
+
+        self.scb = SCB(self.backbone_channels[-1])
+
+        self.hrdb = HRDB(
+            self.backbone_channels[0] + self.backbone_channels[1],
+            scb_channels=self.scb.out_channels,
+            gf_index=[0, 2, 4])
+
+        self.init_weight()
+
+    def forward(self, inputs):
+        x = inputs['img']
+        input_shape = paddle.shape(x)
+        fea_list = self.backbone(x)
+
+        scb_logits = self.scb(fea_list[-1])
+        semantic_map = F.softmax(scb_logits[-1], axis=1)
+
+        fea0 = F.interpolate(
+            fea_list[0], input_shape[2:], mode='bilinear', align_corners=False)
+        fea1 = F.interpolate(
+            fea_list[1], input_shape[2:], mode='bilinear', align_corners=False)
+        hrdb_input = paddle.concat([fea0, fea1], 1)
+        hrdb_logit = self.hrdb(hrdb_input, scb_logits)
+        detail_map = F.sigmoid(hrdb_logit)
+        fusion = self.fusion(semantic_map, detail_map)
+
+        if self.training:
+            logit_dict = {
+                'semantic': semantic_map,
+                'detail': detail_map,
+                'fusion': fusion
+            }
+            loss_dict = self.loss(logit_dict, inputs)
+            return logit_dict, loss_dict
+        else:
+            return fusion
+
+    def get_loss_func_dict(self):
+        loss_func_dict = defaultdict(list)
+        loss_func_dict['semantic'].append(nn.NLLLoss())
+        loss_func_dict['detail'].append(MRSD())
+        loss_func_dict['detail'].append(GradientLoss())
+        loss_func_dict['fusion'].append(MRSD())
+        loss_func_dict['fusion'].append(MRSD())
+        loss_func_dict['fusion'].append(GradientLoss())
+        return loss_func_dict
+
+    def loss(self, logit_dict, label_dict):
+        loss = {}
+
+        # semantic loss computation
+        # get semantic label
+        semantic_label = label_dict['trimap']
+        semantic_label_trans = (semantic_label == 128).astype('int64')
+        semantic_label_bg = (semantic_label == 0).astype('int64')
+        semantic_label = semantic_label_trans + semantic_label_bg * 2
+        loss_semantic = self.loss_func_dict['semantic'][0](
+            paddle.log(logit_dict['semantic'] + 1e-6),
+            semantic_label.squeeze(1))
+        loss['semantic'] = loss_semantic
+
+        # detail loss computation
+        transparent = label_dict['trimap'] == 128
+        detail_alpha_loss = self.loss_func_dict['detail'][0](
+            logit_dict['detail'], label_dict['alpha'], transparent)
+        # gradient loss
+        detail_gradient_loss = self.loss_func_dict['detail'][1](
+            logit_dict['detail'], label_dict['alpha'], transparent)
+        loss_detail = detail_alpha_loss + detail_gradient_loss
+        loss['detail'] = loss_detail
+        loss['detail_alpha'] = detail_alpha_loss
+        loss['detail_gradient'] = detail_gradient_loss
+
+        # fusion loss
+        loss_fusion_func = self.loss_func_dict['fusion']
+        # fusion alpha loss
+        fusion_alpha_loss = loss_fusion_func[0](logit_dict['fusion'],
+                                                label_dict['alpha'])
+        # composition loss
+        comp_pred = logit_dict['fusion'] * label_dict['fg'] + (
+            1 - logit_dict['fusion']) * label_dict['bg']
+        comp_gt = label_dict['alpha'] * label_dict['fg'] + (
+            1 - label_dict['alpha']) * label_dict['bg']
+        fusion_composition_loss = loss_fusion_func[1](comp_pred, comp_gt)
+        # gradient loss
+        fusion_grad_loss = loss_fusion_func[2](logit_dict['fusion'],
+                                               label_dict['alpha'])
+        # total fusion loss
+        loss_fusion = fusion_alpha_loss + fusion_composition_loss + fusion_grad_loss
+        loss['fusion'] = loss_fusion
+        loss['fusion_alpha'] = fusion_alpha_loss
+        loss['fusion_composition'] = fusion_composition_loss
+        loss['fusion_gradient'] = fusion_grad_loss
+
+        loss[
+            'all'] = 0.25 * loss_semantic + 0.25 * loss_detail + 0.25 * loss_fusion
+
+        return loss
+
+    def fusion(self, semantic_map, detail_map):
+        # semantic_map [N, 3, H, W]
+        # In the index, 0 is foreground, 1 is transition and 2 is background.
+        # After fusion, the foreground is 1, the background is 0, and the
+        # transition region lies in [0, 1].
+        index = paddle.argmax(semantic_map, axis=1, keepdim=True)
+        transition_mask = (index == 1).astype('float32')
+        fg = (index == 0).astype('float32')
+        alpha = detail_map * transition_mask + fg
+        return alpha
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class SCB(nn.Layer):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = [512 + in_channels, 512, 256, 128, 128, 64]
+        self.mid_channels = [512, 256, 128, 128, 64, 64]
+        self.out_channels = [256, 128, 64, 64, 64, 3]
+
+        self.psp_module = layers.PPModule(
+            in_channels,
+            512,
+            bin_sizes=(1, 3, 5),
+            dim_reduction=False,
+            align_corners=False)
+
+        psp_upsamples = [2, 4, 8, 16]
+        self.psps = nn.LayerList([
+            self.conv_up_psp(512, self.out_channels[i], psp_upsamples[i])
+            for i in range(4)
+        ])
+
+        scb_list = [
+            self._make_stage(
+                self.in_channels[i],
+                self.mid_channels[i],
+                self.out_channels[i],
+                padding=int(i == 0) + 1,
+                dilation=int(i == 0) + 1)
+            for i in range(len(self.in_channels) - 1)
+        ]
+        scb_list += [
+            nn.Sequential(
+                layers.ConvBNReLU(
+                    self.in_channels[-1], self.mid_channels[-1], 3, padding=1),
+                layers.ConvBNReLU(
+                    self.mid_channels[-1], self.mid_channels[-1], 3, padding=1),
+                nn.Conv2D(
+                    self.mid_channels[-1], self.out_channels[-1], 3, padding=1))
+        ]
+        self.scb_stages = nn.LayerList(scb_list)
+
+    def forward(self, x):
+        psp_x = self.psp_module(x)
+        psps = [psp(psp_x) for psp in self.psps]
+
+        scb_logits = []
+        for i, scb_stage in enumerate(self.scb_stages):
+            if i == 0:
+                x = scb_stage(paddle.concat((psp_x, x), 1))
+            elif i <= len(psps):
+                x = scb_stage(paddle.concat((psps[i - 1], x), 1))
+            else:
+                x = scb_stage(x)
+            scb_logits.append(x)
+        return scb_logits
+
+    def conv_up_psp(self, in_channels, out_channels, up_sample):
+        return nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels, out_channels, 3, padding=1),
+            nn.Upsample(
+                scale_factor=up_sample, mode='bilinear', align_corners=False))
+
+    def _make_stage(self,
+                    in_channels,
+                    mid_channels,
+                    out_channels,
+                    padding=1,
+                    dilation=1):
+        layer_list = [
+            layers.ConvBNReLU(
+                in_channels, mid_channels, 3, padding=1),
+            layers.ConvBNReLU(
+                mid_channels,
+                mid_channels,
+                3,
+                padding=padding,
+                dilation=dilation),
+            layers.ConvBNReLU(
+                mid_channels,
+                out_channels,
+                3,
+                padding=padding,
+                dilation=dilation),
+            nn.Upsample(
+                scale_factor=2, mode='bilinear', align_corners=False)
+        ]
+        return
nn.Sequential(*layer_list) + + +class HRDB(nn.Layer): + """ + The High-Resolution Detail Branch + + Args: + in_channels(int): The number of input channels. + scb_channels(list|tuple): The channels of scb logits + gf_index(list|tuple, optional): Which logit is selected as guidance flow from scb logits. Default: (0, 2, 4) + """ + + def __init__(self, in_channels, scb_channels, gf_index=(0, 2, 4)): + super().__init__() + self.gf_index = gf_index + self.gf_list = nn.LayerList( + [nn.Conv2D(scb_channels[i], 1, 1) for i in gf_index]) + + channels = [64, 32, 16, 8] + self.res_list = [ + resnet_vd.BasicBlock( + in_channels, channels[0], stride=1, shortcut=False) + ] + self.res_list += [ + resnet_vd.BasicBlock( + i, i, stride=1) for i in channels[1:-1] + ] + self.res_list = nn.LayerList(self.res_list) + + self.convs = nn.LayerList([ + nn.Conv2D( + channels[i], channels[i + 1], kernel_size=1) + for i in range(len(channels) - 1) + ]) + self.gates = nn.LayerList( + [GatedSpatailConv2d(i, i) for i in channels[1:]]) + + self.detail_conv = nn.Conv2D(channels[-1], 1, 1, bias_attr=False) + + def forward(self, x, scb_logits): + for i in range(len(self.res_list)): + x = self.res_list[i](x) + x = self.convs[i](x) + gf = self.gf_list[i](scb_logits[self.gf_index[i]]) + gf = F.interpolate( + gf, paddle.shape(x)[-2:], mode='bilinear', align_corners=False) + x = self.gates[i](x, gf) + return self.detail_conv(x) + + +class GatedSpatailConv2d(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + bias_attr=False): + super().__init__() + self._gate_conv = nn.Sequential( + layers.SyncBatchNorm(in_channels + 1), + nn.Conv2D( + in_channels + 1, in_channels + 1, kernel_size=1), + nn.ReLU(), + nn.Conv2D( + in_channels + 1, 1, kernel_size=1), + layers.SyncBatchNorm(1), + nn.Sigmoid()) + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias_attr) + + def forward(self, input_features, gating_features): + cat = paddle.concat([input_features, gating_features], axis=1) + alphas = self._gate_conv(cat) + x = input_features * (alphas + 1) + x = self.conv(x) + return x diff --git a/ppmatting/transforms/__init__.py b/ppmatting/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7986cdd642998fb0638a81c9ea22615faf8bad0b --- /dev/null +++ b/ppmatting/transforms/__init__.py @@ -0,0 +1 @@ +from .transforms import * diff --git a/ppmatting/transforms/transforms.py b/ppmatting/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..73eddcace33e503b1de158b205febdc4f0322308 --- /dev/null +++ b/ppmatting/transforms/transforms.py @@ -0,0 +1,791 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
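The `PPMatting.fusion` rule above is worth pinning down with a toy example (a sketch, assuming only numpy; the arrays are illustrative, not repository data): the semantic branch votes per pixel for foreground, transition, or background, and the detail prediction is only trusted inside the transition region.

```python
# Toy NumPy illustration of the fusion rule: the detail map contributes
# only where the semantic argmax picks class 1 ("transition").
import numpy as np

semantic_map = np.array([  # per-pixel class scores: [fg, transition, bg]
    [0.8, 0.1, 0.1],       # confident foreground  -> alpha 1.0
    [0.1, 0.8, 0.1],       # transition            -> alpha = detail value
    [0.1, 0.1, 0.8],       # confident background  -> alpha 0.0
])
detail_map = np.array([0.9, 0.4, 0.2])

index = semantic_map.argmax(axis=1)
alpha = detail_map * (index == 1) + (index == 0).astype(float)
print(alpha)  # [1.0, 0.4, 0.0]
```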
+
+import os
+import random
+
+import cv2
+import numpy as np
+from paddleseg.transforms import functional
+from paddleseg.cvlibs import manager
+from paddleseg.utils import seg_env
+from PIL import Image
+
+
+@manager.TRANSFORMS.add_component
+class Compose:
+    """
+    Do transformation on input data with corresponding pre-processing and augmentation operations.
+    The shape of input data to all operations is [height, width, channels].
+    """
+
+    def __init__(self, transforms, to_rgb=True):
+        if not isinstance(transforms, list):
+            raise TypeError('The transforms must be a list!')
+        self.transforms = transforms
+        self.to_rgb = to_rgb
+
+    def __call__(self, data):
+        """
+        Args:
+            data (dict): The data to transform.
+
+        Returns:
+            dict: Data after transformation
+        """
+        if 'trans_info' not in data:
+            data['trans_info'] = []
+        for op in self.transforms:
+            data = op(data)
+            if data is None:
+                return None
+
+        data['img'] = np.transpose(data['img'], (2, 0, 1))
+        for key in data.get('gt_fields', []):
+            if len(data[key].shape) == 2:
+                continue
+            data[key] = np.transpose(data[key], (2, 0, 1))
+
+        return data
+
+
+@manager.TRANSFORMS.add_component
+class LoadImages:
+    def __init__(self, to_rgb=True):
+        self.to_rgb = to_rgb
+
+    def __call__(self, data):
+        if isinstance(data['img'], str):
+            data['img'] = cv2.imread(data['img'])
+        for key in data.get('gt_fields', []):
+            if isinstance(data[key], str):
+                data[key] = cv2.imread(data[key], cv2.IMREAD_UNCHANGED)
+            # if alpha or trimap has 3 channels, extract one.
+            if key in ['alpha', 'trimap']:
+                if len(data[key].shape) > 2:
+                    data[key] = data[key][:, :, 0]
+
+        if self.to_rgb:
+            data['img'] = cv2.cvtColor(data['img'], cv2.COLOR_BGR2RGB)
+            for key in data.get('gt_fields', []):
+                if len(data[key].shape) == 2:
+                    continue
+                data[key] = cv2.cvtColor(data[key], cv2.COLOR_BGR2RGB)
+
+        return data
+
+
+@manager.TRANSFORMS.add_component
+class Resize:
+    def __init__(self, target_size=(512, 512), random_interp=False):
+        if isinstance(target_size, list) or isinstance(target_size, tuple):
+            if len(target_size) != 2:
+                raise ValueError(
+                    '`target_size` should include 2 elements, but it is {}'.
+                    format(target_size))
+        else:
+            raise TypeError(
+                "Type of `target_size` is invalid. It should be list or tuple, but it is {}"
+                .format(type(target_size)))
+
+        self.target_size = target_size
+        self.random_interp = random_interp
+        self.interps = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC]
+
+    def __call__(self, data):
+        if self.random_interp:
+            interp = np.random.choice(self.interps)
+        else:
+            interp = cv2.INTER_LINEAR
+        data['trans_info'].append(('resize', data['img'].shape[0:2]))
+        data['img'] = functional.resize(data['img'], self.target_size, interp)
+        for key in data.get('gt_fields', []):
+            if key == 'trimap':
+                data[key] = functional.resize(data[key], self.target_size,
+                                              cv2.INTER_NEAREST)
+            else:
+                data[key] = functional.resize(data[key], self.target_size,
+                                              interp)
+        return data
+
+
+@manager.TRANSFORMS.add_component
+class RandomResize:
+    """
+    Resize the image to a size determined by `scale` and `size`.
+
+    Args:
+        size(tuple|list): The reference size to resize. A tuple or list with length 2.
+        scale(tuple|list, optional): A range of scale based on `size`. A tuple or list with length 2. Default: None.
+ """ + + def __init__(self, size=None, scale=None): + if isinstance(size, list) or isinstance(size, tuple): + if len(size) != 2: + raise ValueError( + '`size` should include 2 elements, but it is {}'.format( + size)) + elif size is not None: + raise TypeError( + "Type of `size` is invalid. It should be list or tuple, but it is {}" + .format(type(size))) + + if scale is not None: + if isinstance(scale, list) or isinstance(scale, tuple): + if len(scale) != 2: + raise ValueError( + '`scale` should include 2 elements, but it is {}'. + format(scale)) + else: + raise TypeError( + "Type of `scale` is invalid. It should be list or tuple, but it is {}" + .format(type(scale))) + self.size = size + self.scale = scale + + def __call__(self, data): + h, w = data['img'].shape[:2] + if self.scale is not None: + scale = np.random.uniform(self.scale[0], self.scale[1]) + else: + scale = 1. + if self.size is not None: + scale_factor = max(self.size[0] / w, self.size[1] / h) + else: + scale_factor = 1 + scale = scale * scale_factor + + w = int(round(w * scale)) + h = int(round(h * scale)) + data['img'] = functional.resize(data['img'], (w, h)) + for key in data.get('gt_fields', []): + if key == 'trimap': + data[key] = functional.resize(data[key], (w, h), + cv2.INTER_NEAREST) + else: + data[key] = functional.resize(data[key], (w, h)) + return data + + +@manager.TRANSFORMS.add_component +class ResizeByLong: + """ + Resize the long side of an image to given size, and then scale the other side proportionally. + + Args: + long_size (int): The target size of long side. + """ + + def __init__(self, long_size): + self.long_size = long_size + + def __call__(self, data): + data['trans_info'].append(('resize', data['img'].shape[0:2])) + data['img'] = functional.resize_long(data['img'], self.long_size) + for key in data.get('gt_fields', []): + if key == 'trimap': + data[key] = functional.resize_long(data[key], self.long_size, + cv2.INTER_NEAREST) + else: + data[key] = functional.resize_long(data[key], self.long_size) + return data + + +@manager.TRANSFORMS.add_component +class ResizeByShort: + """ + Resize the short side of an image to given size, and then scale the other side proportionally. + + Args: + short_size (int): The target size of short side. + """ + + def __init__(self, short_size): + self.short_size = short_size + + def __call__(self, data): + data['trans_info'].append(('resize', data['img'].shape[0:2])) + data['img'] = functional.resize_short(data['img'], self.short_size) + for key in data.get('gt_fields', []): + if key == 'trimap': + data[key] = functional.resize_short(data[key], self.short_size, + cv2.INTER_NEAREST) + else: + data[key] = functional.resize_short(data[key], self.short_size) + return data + + +@manager.TRANSFORMS.add_component +class ResizeToIntMult: + """ + Resize to some int muitple, d.g. 32. + """ + + def __init__(self, mult_int=32): + self.mult_int = mult_int + + def __call__(self, data): + data['trans_info'].append(('resize', data['img'].shape[0:2])) + + h, w = data['img'].shape[0:2] + rw = w - w % self.mult_int + rh = h - h % self.mult_int + data['img'] = functional.resize(data['img'], (rw, rh)) + for key in data.get('gt_fields', []): + if key == 'trimap': + data[key] = functional.resize(data[key], (rw, rh), + cv2.INTER_NEAREST) + else: + data[key] = functional.resize(data[key], (rw, rh)) + + return data + + +@manager.TRANSFORMS.add_component +class Normalize: + """ + Normalize an image. + + Args: + mean (list, optional): The mean value of a data set. Default: [0.5, 0.5, 0.5]. 
+ std (list, optional): The standard deviation of a data set. Default: [0.5, 0.5, 0.5]. + + Raises: + ValueError: When mean/std is not list or any value in std is 0. + """ + + def __init__(self, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)): + self.mean = mean + self.std = std + if not (isinstance(self.mean, + (list, tuple)) and isinstance(self.std, + (list, tuple))): + raise ValueError( + "{}: input type is invalid. It should be list or tuple".format( + self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def __call__(self, data): + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + data['img'] = functional.normalize(data['img'], mean, std) + if 'fg' in data.get('gt_fields', []): + data['fg'] = functional.normalize(data['fg'], mean, std) + if 'bg' in data.get('gt_fields', []): + data['bg'] = functional.normalize(data['bg'], mean, std) + + return data + + +@manager.TRANSFORMS.add_component +class RandomCropByAlpha: + """ + Randomly crop while centered on uncertain area by a certain probability. + + Args: + crop_size (tuple|list): The size you want to crop from image. + p (float): The probability centered on uncertain area. + + """ + + def __init__(self, crop_size=((320, 320), (480, 480), (640, 640)), + prob=0.5): + self.crop_size = crop_size + self.prob = prob + + def __call__(self, data): + idex = np.random.randint(low=0, high=len(self.crop_size)) + crop_w, crop_h = self.crop_size[idex] + + img_h = data['img'].shape[0] + img_w = data['img'].shape[1] + if np.random.rand() < self.prob: + crop_center = np.where((data['alpha'] > 0) & (data['alpha'] < 255)) + center_h_array, center_w_array = crop_center + if len(center_h_array) == 0: + return data + rand_ind = np.random.randint(len(center_h_array)) + center_h = center_h_array[rand_ind] + center_w = center_w_array[rand_ind] + delta_h = crop_h // 2 + delta_w = crop_w // 2 + start_h = max(0, center_h - delta_h) + start_w = max(0, center_w - delta_w) + else: + start_h = 0 + start_w = 0 + if img_h > crop_h: + start_h = np.random.randint(img_h - crop_h + 1) + if img_w > crop_w: + start_w = np.random.randint(img_w - crop_w + 1) + + end_h = min(img_h, start_h + crop_h) + end_w = min(img_w, start_w + crop_w) + + data['img'] = data['img'][start_h:end_h, start_w:end_w] + for key in data.get('gt_fields', []): + data[key] = data[key][start_h:end_h, start_w:end_w] + + return data + + +@manager.TRANSFORMS.add_component +class RandomCrop: + """ + Randomly crop + + Args: + crop_size (tuple|list): The size you want to crop from image. + """ + + def __init__(self, crop_size=((320, 320), (480, 480), (640, 640))): + if not isinstance(crop_size[0], (list, tuple)): + crop_size = [crop_size] + self.crop_size = crop_size + + def __call__(self, data): + idex = np.random.randint(low=0, high=len(self.crop_size)) + crop_w, crop_h = self.crop_size[idex] + img_h, img_w = data['img'].shape[0:2] + + start_h = 0 + start_w = 0 + if img_h > crop_h: + start_h = np.random.randint(img_h - crop_h + 1) + if img_w > crop_w: + start_w = np.random.randint(img_w - crop_w + 1) + + end_h = min(img_h, start_h + crop_h) + end_w = min(img_w, start_w + crop_w) + + data['img'] = data['img'][start_h:end_h, start_w:end_w] + for key in data.get('gt_fields', []): + data[key] = data[key][start_h:end_h, start_w:end_w] + + return data + + +@manager.TRANSFORMS.add_component +class LimitLong: + """ + Limit the long edge of image. 
+ + If the long edge is larger than max_long, resize the long edge + to max_long, while scale the short edge proportionally. + + If the long edge is smaller than min_long, resize the long edge + to min_long, while scale the short edge proportionally. + + Args: + max_long (int, optional): If the long edge of image is larger than max_long, + it will be resize to max_long. Default: None. + min_long (int, optional): If the long edge of image is smaller than min_long, + it will be resize to min_long. Default: None. + """ + + def __init__(self, max_long=None, min_long=None): + if max_long is not None: + if not isinstance(max_long, int): + raise TypeError( + "Type of `max_long` is invalid. It should be int, but it is {}" + .format(type(max_long))) + if min_long is not None: + if not isinstance(min_long, int): + raise TypeError( + "Type of `min_long` is invalid. It should be int, but it is {}" + .format(type(min_long))) + if (max_long is not None) and (min_long is not None): + if min_long > max_long: + raise ValueError( + '`max_long should not smaller than min_long, but they are {} and {}' + .format(max_long, min_long)) + self.max_long = max_long + self.min_long = min_long + + def __call__(self, data): + h, w = data['img'].shape[:2] + long_edge = max(h, w) + target = long_edge + if (self.max_long is not None) and (long_edge > self.max_long): + target = self.max_long + elif (self.min_long is not None) and (long_edge < self.min_long): + target = self.min_long + + data['trans_info'].append(('resize', data['img'].shape[0:2])) + if target != long_edge: + data['img'] = functional.resize_long(data['img'], target) + for key in data.get('gt_fields', []): + if key == 'trimap': + data[key] = functional.resize_long(data[key], target, + cv2.INTER_NEAREST) + else: + data[key] = functional.resize_long(data[key], target) + + return data + + +@manager.TRANSFORMS.add_component +class LimitShort: + """ + Limit the short edge of image. + + If the short edge is larger than max_short, resize the short edge + to max_short, while scale the long edge proportionally. + + If the short edge is smaller than min_short, resize the short edge + to min_short, while scale the long edge proportionally. + + Args: + max_short (int, optional): If the short edge of image is larger than max_short, + it will be resize to max_short. Default: None. + min_short (int, optional): If the short edge of image is smaller than min_short, + it will be resize to min_short. Default: None. + """ + + def __init__(self, max_short=None, min_short=None): + if max_short is not None: + if not isinstance(max_short, int): + raise TypeError( + "Type of `max_short` is invalid. It should be int, but it is {}" + .format(type(max_short))) + if min_short is not None: + if not isinstance(min_short, int): + raise TypeError( + "Type of `min_short` is invalid. 
It should be int, but it is {}" + .format(type(min_short))) + if (max_short is not None) and (min_short is not None): + if min_short > max_short: + raise ValueError( + '`max_short should not smaller than min_short, but they are {} and {}' + .format(max_short, min_short)) + self.max_short = max_short + self.min_short = min_short + + def __call__(self, data): + h, w = data['img'].shape[:2] + short_edge = min(h, w) + target = short_edge + if (self.max_short is not None) and (short_edge > self.max_short): + target = self.max_short + elif (self.min_short is not None) and (short_edge < self.min_short): + target = self.min_short + + data['trans_info'].append(('resize', data['img'].shape[0:2])) + if target != short_edge: + data['img'] = functional.resize_short(data['img'], target) + for key in data.get('gt_fields', []): + if key == 'trimap': + data[key] = functional.resize_short(data[key], target, + cv2.INTER_NEAREST) + else: + data[key] = functional.resize_short(data[key], target) + + return data + + +@manager.TRANSFORMS.add_component +class RandomHorizontalFlip: + """ + Flip an image horizontally with a certain probability. + + Args: + prob (float, optional): A probability of horizontally flipping. Default: 0.5. + """ + + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, data): + if random.random() < self.prob: + data['img'] = functional.horizontal_flip(data['img']) + for key in data.get('gt_fields', []): + data[key] = functional.horizontal_flip(data[key]) + + return data + + +@manager.TRANSFORMS.add_component +class RandomBlur: + """ + Blurring an image by a Gaussian function with a certain probability. + + Args: + prob (float, optional): A probability of blurring an image. Default: 0.1. + """ + + def __init__(self, prob=0.1): + self.prob = prob + + def __call__(self, data): + if self.prob <= 0: + n = 0 + elif self.prob >= 1: + n = 1 + else: + n = int(1.0 / self.prob) + if n > 0: + if np.random.randint(0, n) == 0: + radius = np.random.randint(3, 10) + if radius % 2 != 1: + radius = radius + 1 + if radius > 9: + radius = 9 + data['img'] = cv2.GaussianBlur(data['img'], (radius, radius), 0, + 0) + for key in data.get('gt_fields', []): + if key == 'trimap': + continue + data[key] = cv2.GaussianBlur(data[key], (radius, radius), 0, + 0) + return data + + +@manager.TRANSFORMS.add_component +class RandomDistort: + """ + Distort an image with random configurations. + + Args: + brightness_range (float, optional): A range of brightness. Default: 0.5. + brightness_prob (float, optional): A probability of adjusting brightness. Default: 0.5. + contrast_range (float, optional): A range of contrast. Default: 0.5. + contrast_prob (float, optional): A probability of adjusting contrast. Default: 0.5. + saturation_range (float, optional): A range of saturation. Default: 0.5. + saturation_prob (float, optional): A probability of adjusting saturation. Default: 0.5. + hue_range (int, optional): A range of hue. Default: 18. + hue_prob (float, optional): A probability of adjusting hue. Default: 0.5. 
+ """ + + def __init__(self, + brightness_range=0.5, + brightness_prob=0.5, + contrast_range=0.5, + contrast_prob=0.5, + saturation_range=0.5, + saturation_prob=0.5, + hue_range=18, + hue_prob=0.5): + self.brightness_range = brightness_range + self.brightness_prob = brightness_prob + self.contrast_range = contrast_range + self.contrast_prob = contrast_prob + self.saturation_range = saturation_range + self.saturation_prob = saturation_prob + self.hue_range = hue_range + self.hue_prob = hue_prob + + def __call__(self, data): + brightness_lower = 1 - self.brightness_range + brightness_upper = 1 + self.brightness_range + contrast_lower = 1 - self.contrast_range + contrast_upper = 1 + self.contrast_range + saturation_lower = 1 - self.saturation_range + saturation_upper = 1 + self.saturation_range + hue_lower = -self.hue_range + hue_upper = self.hue_range + ops = [ + functional.brightness, functional.contrast, functional.saturation, + functional.hue + ] + random.shuffle(ops) + params_dict = { + 'brightness': { + 'brightness_lower': brightness_lower, + 'brightness_upper': brightness_upper + }, + 'contrast': { + 'contrast_lower': contrast_lower, + 'contrast_upper': contrast_upper + }, + 'saturation': { + 'saturation_lower': saturation_lower, + 'saturation_upper': saturation_upper + }, + 'hue': { + 'hue_lower': hue_lower, + 'hue_upper': hue_upper + } + } + prob_dict = { + 'brightness': self.brightness_prob, + 'contrast': self.contrast_prob, + 'saturation': self.saturation_prob, + 'hue': self.hue_prob + } + + im = data['img'].astype('uint8') + im = Image.fromarray(im) + for id in range(len(ops)): + params = params_dict[ops[id].__name__] + params['im'] = im + prob = prob_dict[ops[id].__name__] + if np.random.uniform(0, 1) < prob: + im = ops[id](**params) + data['img'] = np.asarray(im) + + for key in data.get('gt_fields', []): + if key in ['alpha', 'trimap']: + continue + else: + im = data[key].astype('uint8') + im = Image.fromarray(im) + for id in range(len(ops)): + params = params_dict[ops[id].__name__] + params['im'] = im + prob = prob_dict[ops[id].__name__] + if np.random.uniform(0, 1) < prob: + im = ops[id](**params) + data[key] = np.asarray(im) + return data + + +@manager.TRANSFORMS.add_component +class Padding: + """ + Add bottom-right padding to a raw image or annotation image. + + Args: + target_size (list|tuple): The target size after padding. + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + + Raises: + TypeError: When target_size is neither list nor tuple. + ValueError: When the length of target_size is not 2. + """ + + def __init__(self, target_size, im_padding_value=(127.5, 127.5, 127.5)): + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + '`target_size` should include 2 elements, but it is {}'. + format(target_size)) + else: + raise TypeError( + "Type of target_size is invalid. 
It should be list or tuple, now is {}" + .format(type(target_size))) + + self.target_size = target_size + self.im_padding_value = im_padding_value + + def __call__(self, data): + im_height, im_width = data['img'].shape[0], data['img'].shape[1] + target_height = self.target_size[1] + target_width = self.target_size[0] + pad_height = max(0, target_height - im_height) + pad_width = max(0, target_width - im_width) + data['trans_info'].append(('padding', data['img'].shape[0:2])) + if (pad_height == 0) and (pad_width == 0): + return data + else: + data['img'] = cv2.copyMakeBorder( + data['img'], + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.im_padding_value) + for key in data.get('gt_fields', []): + if key in ['trimap', 'alpha']: + value = 0 + else: + value = self.im_padding_value + data[key] = cv2.copyMakeBorder( + data[key], + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=value) + return data + + +@manager.TRANSFORMS.add_component +class RandomSharpen: + def __init__(self, prob=0.1): + if prob < 0: + self.prob = 0 + elif prob > 1: + self.prob = 1 + else: + self.prob = prob + + def __call__(self, data): + if np.random.rand() > self.prob: + return data + + radius = np.random.choice([0, 3, 5, 7, 9]) + w = np.random.uniform(0.1, 0.5) + blur_img = cv2.GaussianBlur(data['img'], (radius, radius), 5) + data['img'] = cv2.addWeighted(data['img'], 1 + w, blur_img, -w, 0) + for key in data.get('gt_fields', []): + if key == 'trimap' or key == 'alpha': + continue + blur_img = cv2.GaussianBlur(data[key], (0, 0), 5) + data[key] = cv2.addWeighted(data[key], 1.5, blur_img, -0.5, 0) + + return data + + +@manager.TRANSFORMS.add_component +class RandomNoise: + def __init__(self, prob=0.1): + if prob < 0: + self.prob = 0 + elif prob > 1: + self.prob = 1 + else: + self.prob = prob + + def __call__(self, data): + if np.random.rand() > self.prob: + return data + mean = np.random.uniform(0, 0.04) + var = np.random.uniform(0, 0.001) + noise = np.random.normal(mean, var**0.5, data['img'].shape) * 255 + data['img'] = data['img'] + noise + data['img'] = np.clip(data['img'], 0, 255) + + return data + + +@manager.TRANSFORMS.add_component +class RandomReJpeg: + def __init__(self, prob=0.1): + if prob < 0: + self.prob = 0 + elif prob > 1: + self.prob = 1 + else: + self.prob = prob + + def __call__(self, data): + if np.random.rand() > self.prob: + return data + q = np.random.randint(70, 95) + img = data['img'].astype('uint8') + + # Ensure no conflicts between processes + tmp_name = str(os.getpid()) + '.jpg' + tmp_name = os.path.join(seg_env.TMP_HOME, tmp_name) + cv2.imwrite(tmp_name, img, [int(cv2.IMWRITE_JPEG_QUALITY), q]) + data['img'] = cv2.imread(tmp_name) + + return data diff --git a/ppmatting/utils/__init__.py b/ppmatting/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..79717c71036b5b730cce8548bc27f6fef7222c21 --- /dev/null +++ b/ppmatting/utils/__init__.py @@ -0,0 +1,2 @@ +from .estimate_foreground_ml import estimate_foreground_ml +from .utils import get_files, get_image_list, mkdir diff --git a/ppmatting/utils/estimate_foreground_ml.py b/ppmatting/utils/estimate_foreground_ml.py new file mode 100644 index 0000000000000000000000000000000000000000..05bffb6c31a5042fd96c028013c81f7533f3675d --- /dev/null +++ b/ppmatting/utils/estimate_foreground_ml.py @@ -0,0 +1,236 @@ +import numpy as np +from numba import njit, prange + +# The foreground estimation refer to pymatting 
[https://github.com/pymatting/pymatting/blob/master/pymatting/foreground/estimate_foreground_ml.py] + + +@njit("void(f4[:, :, :], f4[:, :, :])", cache=True, nogil=True, parallel=True) +def _resize_nearest_multichannel(dst, src): + """ + Internal method. + + Resize image src to dst using nearest neighbors filtering. + Images must have multiple color channels, i.e. :code:`len(shape) == 3`. + + Parameters + ---------- + dst: numpy.ndarray of type np.float32 + output image + src: numpy.ndarray of type np.float32 + input image + """ + h_src, w_src, depth = src.shape + h_dst, w_dst, depth = dst.shape + + for y_dst in prange(h_dst): + for x_dst in range(w_dst): + x_src = max(0, min(w_src - 1, x_dst * w_src // w_dst)) + y_src = max(0, min(h_src - 1, y_dst * h_src // h_dst)) + + for c in range(depth): + dst[y_dst, x_dst, c] = src[y_src, x_src, c] + + +@njit("void(f4[:, :], f4[:, :])", cache=True, nogil=True, parallel=True) +def _resize_nearest(dst, src): + """ + Internal method. + + Resize image src to dst using nearest neighbors filtering. + Images must be grayscale, i.e. :code:`len(shape) == 3`. + + Parameters + ---------- + dst: numpy.ndarray of type np.float32 + output image + src: numpy.ndarray of type np.float32 + input image + """ + h_src, w_src = src.shape + h_dst, w_dst = dst.shape + + for y_dst in prange(h_dst): + for x_dst in range(w_dst): + x_src = max(0, min(w_src - 1, x_dst * w_src // w_dst)) + y_src = max(0, min(h_src - 1, y_dst * h_src // h_dst)) + + dst[y_dst, x_dst] = src[y_src, x_src] + + +# TODO +# There should be an option to switch @njit(parallel=True) on or off. +# parallel=True would be faster, but might cause race conditions. +# User should have the option to turn it on or off. +@njit( + "Tuple((f4[:, :, :], f4[:, :, :]))(f4[:, :, :], f4[:, :], f4, i4, i4, i4, f4)", + cache=True, + nogil=True) +def _estimate_fb_ml( + input_image, + input_alpha, + regularization, + n_small_iterations, + n_big_iterations, + small_size, + gradient_weight, ): + h0, w0, depth = input_image.shape + + dtype = np.float32 + + w_prev = 1 + h_prev = 1 + + F_prev = np.empty((h_prev, w_prev, depth), dtype=dtype) + B_prev = np.empty((h_prev, w_prev, depth), dtype=dtype) + + n_levels = int(np.ceil(np.log2(max(w0, h0)))) + + for i_level in range(n_levels + 1): + w = round(w0**(i_level / n_levels)) + h = round(h0**(i_level / n_levels)) + + image = np.empty((h, w, depth), dtype=dtype) + alpha = np.empty((h, w), dtype=dtype) + + _resize_nearest_multichannel(image, input_image) + _resize_nearest(alpha, input_alpha) + + F = np.empty((h, w, depth), dtype=dtype) + B = np.empty((h, w, depth), dtype=dtype) + + _resize_nearest_multichannel(F, F_prev) + _resize_nearest_multichannel(B, B_prev) + + if w <= small_size and h <= small_size: + n_iter = n_small_iterations + else: + n_iter = n_big_iterations + + b = np.zeros((2, depth), dtype=dtype) + + dx = [-1, 1, 0, 0] + dy = [0, 0, -1, 1] + + for i_iter in range(n_iter): + for y in prange(h): + for x in range(w): + a0 = alpha[y, x] + a1 = 1.0 - a0 + + a00 = a0 * a0 + a01 = a0 * a1 + # a10 = a01 can be omitted due to symmetry of matrix + a11 = a1 * a1 + + for c in range(depth): + b[0, c] = a0 * image[y, x, c] + b[1, c] = a1 * image[y, x, c] + + for d in range(4): + x2 = max(0, min(w - 1, x + dx[d])) + y2 = max(0, min(h - 1, y + dy[d])) + + gradient = abs(a0 - alpha[y2, x2]) + + da = regularization + gradient_weight * gradient + + a00 += da + a11 += da + + for c in range(depth): + b[0, c] += da * F[y2, x2, c] + b[1, c] += da * B[y2, x2, c] + + determinant = a00 * a11 - a01 
* a01 + + inv_det = 1.0 / determinant + + b00 = inv_det * a11 + b01 = inv_det * -a01 + b11 = inv_det * a00 + + for c in range(depth): + F_c = b00 * b[0, c] + b01 * b[1, c] + B_c = b01 * b[0, c] + b11 * b[1, c] + + F_c = max(0.0, min(1.0, F_c)) + B_c = max(0.0, min(1.0, B_c)) + + F[y, x, c] = F_c + B[y, x, c] = B_c + + F_prev = F + B_prev = B + + w_prev = w + h_prev = h + + return F, B + + +def estimate_foreground_ml( + image, + alpha, + regularization=1e-5, + n_small_iterations=10, + n_big_iterations=2, + small_size=32, + return_background=False, + gradient_weight=1.0, ): + """Estimates the foreground of an image given its alpha matte. + + See :cite:`germer2020multilevel` for reference. + + Parameters + ---------- + image: numpy.ndarray + Input image with shape :math:`h \\times w \\times d` + alpha: numpy.ndarray + Input alpha matte shape :math:`h \\times w` + regularization: float + Regularization strength :math:`\\epsilon`, defaults to :math:`10^{-5}`. + Higher regularization results in smoother colors. + n_small_iterations: int + Number of iterations performed on small scale, defaults to :math:`10` + n_big_iterations: int + Number of iterations performed on large scale, defaults to :math:`2` + small_size: int + Threshold that determines at which size `n_small_iterations` should be used + return_background: bool + Whether to return the estimated background in addition to the foreground + gradient_weight: float + Larger values enforce smoother foregrounds, defaults to :math:`1` + + Returns + ------- + F: numpy.ndarray + Extracted foreground + B: numpy.ndarray + Extracted background + + Example + ------- + >>> from pymatting import * + >>> image = load_image("data/lemur/lemur.png", "RGB") + >>> alpha = load_image("data/lemur/lemur_alpha.png", "GRAY") + >>> F = estimate_foreground_ml(image, alpha, return_background=False) + >>> F, B = estimate_foreground_ml(image, alpha, return_background=True) + + See Also + ---- + stack_images: This function can be used to place the foreground on a new background. + """ + + foreground, background = _estimate_fb_ml( + image.astype(np.float32), + alpha.astype(np.float32), + regularization, + n_small_iterations, + n_big_iterations, + small_size, + gradient_weight, ) + + if return_background: + return foreground, background + + return foreground diff --git a/ppmatting/utils/utils.py b/ppmatting/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..13513cb193757b63043f44a2c145b3e9b6fad82e --- /dev/null +++ b/ppmatting/utils/utils.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
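To see what `estimate_foreground_ml` buys in this pipeline, a compositing sketch helps (assuming numpy and numba are installed; `img` and `alpha` are hypothetical placeholders for the model's input image and predicted matte): compositing with the estimated foreground avoids the color bleeding you get from multiplying the raw image by alpha.

```python
# Hypothetical composite step, not repository code. `img` (HxWx3, float in
# [0, 1]) and `alpha` (HxW, float in [0, 1]) stand in for real predictions.
import numpy as np
from ppmatting.utils import estimate_foreground_ml

img = np.random.rand(64, 64, 3).astype(np.float32)
alpha = np.random.rand(64, 64).astype(np.float32)

fg = estimate_foreground_ml(img, alpha)  # HxWx3 estimated foreground
bg = np.zeros_like(img)                  # new background (black here)
composite = alpha[:, :, None] * fg + (1 - alpha[:, :, None]) * bg
```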
+
+import os
+
+
+def get_files(root_path):
+    res = []
+    for root, dirs, files in os.walk(root_path, followlinks=True):
+        for f in files:
+            if f.endswith(('.jpg', '.png', '.jpeg', '.JPG')):
+                res.append(os.path.join(root, f))
+    return res
+
+
+def get_image_list(image_path):
+    """Get image list"""
+    valid_suffix = [
+        '.JPEG', '.jpeg', '.JPG', '.jpg', '.BMP', '.bmp', '.PNG', '.png'
+    ]
+    image_list = []
+    image_dir = None
+    if os.path.isfile(image_path):
+        image_dir = None
+        if os.path.splitext(image_path)[-1] in valid_suffix:
+            image_list.append(image_path)
+        else:
+            image_dir = os.path.dirname(image_path)
+            with open(image_path, 'r') as f:
+                for line in f:
+                    line = line.strip()
+                    if len(line.split()) > 1:
+                        raise RuntimeError(
+                            'There should be only one image path per line in `image_path` file. Wrong line: {}'
+                            .format(line))
+                    image_list.append(os.path.join(image_dir, line))
+    elif os.path.isdir(image_path):
+        image_dir = image_path
+        for root, dirs, files in os.walk(image_path):
+            for f in files:
+                if '.ipynb_checkpoints' in root:
+                    continue
+                if os.path.splitext(f)[-1] in valid_suffix:
+                    image_list.append(os.path.join(root, f))
+        image_list.sort()
+    else:
+        raise FileNotFoundError(
+            '`image_path` is not found. It should be an image file or a directory including images'
+        )
+
+    if len(image_list) == 0:
+        raise RuntimeError('There is no image file in `image_path`')
+
+    return image_list, image_dir
+
+
+def mkdir(path):
+    sub_dir = os.path.dirname(path)
+    if not os.path.exists(sub_dir):
+        os.makedirs(sub_dir)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ab502573c9246d360637b870cff0be0f0766c300
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+pyyaml >= 5.1
+visualdl >= 2.0.0
+opencv-python
+tqdm
+filelock
+scipy
+prettytable
+scikit-learn
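Finally, the transforms above compose into the preprocessing used at inference time. A minimal sketch (assuming the repository root is on `PYTHONPATH`; the image path and the exact transform chain are illustrative, mirroring what the eval configs typically assemble):

```python
# Hedged preprocessing sketch using the transforms defined above; the
# image path is a placeholder, not a file shipped with the repository.
import ppmatting.transforms as T

transforms = T.Compose([
    T.LoadImages(),                  # read BGR from disk, convert to RGB
    T.ResizeByShort(512),            # short side to 512, keep aspect ratio
    T.ResizeToIntMult(mult_int=32),  # crop sides to a multiple of 32
    T.Normalize(),                   # scale and standardize pixel values
])

data = transforms({'img': 'image/person.jpg'})
print(data['img'].shape)   # CHW float array, ready for the model
print(data['trans_info'])  # resize records used to restore the full size
```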