Spaces:
Running
on
Zero
Running
on
Zero
# -*- coding: utf-8 -*-
"""
This file may have been modified by Bytedance Ltd. and/or its affiliates (“Bytedance's Modifications”).
All Bytedance's Modifications are Copyright (year) Bytedance Ltd. and/or its affiliates.
Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/config.py
"""
from detectron2.config import CfgNode as CN | |
def add_maskformer2_config(cfg):
    """
    Register the MaskFormer / Mask2Former default options on ``cfg``.

    ``cfg`` is modified in place and nothing is returned. The ``INPUT``,
    ``INPUT.CROP``, ``SOLVER``, ``MODEL`` and ``MODEL.SEM_SEG_HEAD`` nodes are
    written to without being created here, so they must already exist on
    ``cfg``. The ``MODEL.MASK_FORMER``, ``MODEL.MASK_FORMER.TEST`` and
    ``MODEL.SWIN`` nodes are created by this function.
    """
    # NOTE: options inherited from the original MaskFormer.

    # ---- data ----
    input_cfg = cfg.INPUT
    # Which dataset mapper to use.
    input_cfg.DATASET_MAPPER_NAME = "mask_former_semantic"
    # Color augmentation.
    input_cfg.COLOR_AUG_SSD = False
    # Random cropping is retried until no single category in the semantic
    # segmentation GT covers more than `SINGLE_CATEGORY_MAX_AREA` of the crop.
    input_cfg.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
    # Pad image and segmentation GT in the dataset mapper.
    input_cfg.SIZE_DIVISIBILITY = -1

    # ---- solver ----
    solver_cfg = cfg.SOLVER
    # Weight decay applied to embeddings.
    solver_cfg.WEIGHT_DECAY_EMBED = 0.0
    # Optimizer.
    solver_cfg.OPTIMIZER = "ADAMW"
    solver_cfg.BACKBONE_MULTIPLIER = 0.1

    # ---- mask_former model ----
    model_cfg = cfg.MODEL
    model_cfg.MASK_FORMER = CN()
    mask_former = model_cfg.MASK_FORMER

    # Loss.
    mask_former.DEEP_SUPERVISION = True
    mask_former.NO_OBJECT_WEIGHT = 0.1
    mask_former.CLASS_WEIGHT = 1.0
    mask_former.DICE_WEIGHT = 1.0
    mask_former.MASK_WEIGHT = 20.0

    # Transformer.
    mask_former.NHEADS = 8
    mask_former.DROPOUT = 0.1
    mask_former.DIM_FEEDFORWARD = 2048
    mask_former.ENC_LAYERS = 0
    mask_former.DEC_LAYERS = 6
    mask_former.PRE_NORM = False

    mask_former.HIDDEN_DIM = 256
    mask_former.NUM_OBJECT_QUERIES = 100

    mask_former.TRANSFORMER_IN_FEATURE = "res5"
    mask_former.ENFORCE_INPUT_PROJ = False

    # Inference.
    mask_former.TEST = CN()
    test_cfg = mask_former.TEST
    test_cfg.SEMANTIC_ON = True
    test_cfg.INSTANCE_ON = False
    test_cfg.PANOPTIC_ON = False
    test_cfg.OBJECT_MASK_THRESHOLD = 0.0
    test_cfg.OVERLAP_THRESHOLD = 0.0
    test_cfg.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False

    # Sometimes `backbone.size_divisibility` is set to 0 for some backbones
    # (e.g. ResNet); this option can be used to override it.
    mask_former.SIZE_DIVISIBILITY = 32

    # ---- pixel decoder ----
    sem_seg_head = model_cfg.SEM_SEG_HEAD
    sem_seg_head.MASK_DIM = 256
    # Transformer layers added inside the pixel decoder.
    sem_seg_head.TRANSFORMER_ENC_LAYERS = 0
    # Pixel decoder implementation.
    sem_seg_head.PIXEL_DECODER_NAME = "BasePixelDecoder"

    # ---- swin transformer backbone ----
    model_cfg.SWIN = CN()
    swin = model_cfg.SWIN
    swin.PRETRAIN_IMG_SIZE = 224
    swin.PATCH_SIZE = 4
    swin.EMBED_DIM = 96
    swin.DEPTHS = [2, 2, 6, 2]
    swin.NUM_HEADS = [3, 6, 12, 24]
    swin.WINDOW_SIZE = 7
    swin.MLP_RATIO = 4.0
    swin.QKV_BIAS = True
    swin.QK_SCALE = None
    swin.DROP_RATE = 0.0
    swin.ATTN_DROP_RATE = 0.0
    swin.DROP_PATH_RATE = 0.3
    swin.APE = False
    swin.PATCH_NORM = True
    swin.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
    swin.USE_CHECKPOINT = False

    # NOTE: extra options introduced by maskformer2.

    # Transformer module.
    mask_former.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"

    # LSJ aug.
    input_cfg.IMAGE_SIZE = 1024
    input_cfg.MIN_SCALE = 0.1
    input_cfg.MAX_SCALE = 2.0

    # MSDeformAttn encoder.
    sem_seg_head.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
    sem_seg_head.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
    sem_seg_head.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8

    # ---- point loss ----
    # Number of points sampled during training for a mask point head.
    mask_former.TRAIN_NUM_POINTS = 112 * 112
    # Oversampling parameter for PointRend point sampling during training.
    # Parameter `k` in the original paper.
    mask_former.OVERSAMPLE_RATIO = 3.0
    # Importance sampling parameter for PointRend point sampling during
    # training. Parameter `beta` in the original paper.
    mask_former.IMPORTANCE_SAMPLE_RATIO = 0.75
def add_fcclip_config(cfg):
    """
    Create the ``MODEL.FC_CLIP`` node on ``cfg`` and fill in its defaults.

    ``cfg`` is modified in place and nothing is returned. ``cfg.MODEL`` must
    already exist.
    """
    # FC-CLIP model config
    cfg.MODEL.FC_CLIP = CN()
    fc_clip = cfg.MODEL.FC_CLIP

    fc_clip.CLIP_MODEL_NAME = "convnext_large_d_320"
    fc_clip.CLIP_PRETRAINED_WEIGHTS = "laion2b_s29b_b131k_ft_soup"
    fc_clip.EMBED_DIM = 768
    fc_clip.GEOMETRIC_ENSEMBLE_ALPHA = 0.4
    fc_clip.GEOMETRIC_ENSEMBLE_BETA = 0.8
    fc_clip.ENSEMBLE_ON_VALID_MASK = False
def add_mask_adapter_config(cfg):
    """
    Create the ``MODEL.MASK_ADAPTER`` node on ``cfg`` and set dataloader options.

    ``cfg`` is modified in place and nothing is returned. ``cfg.MODEL`` and
    ``cfg.DATALOADER`` must already exist; the ``DATALOADER`` entries are
    written directly onto the existing node.
    """
    # Mask-Adapter model config
    cfg.MODEL.MASK_ADAPTER = CN()
    adapter = cfg.MODEL.MASK_ADAPTER

    adapter.MASK_IN_CHANNELS = 16
    adapter.NUM_CHANNELS = 768
    adapter.USE_CHECKPOINT = False
    adapter.NUM_OUTPUT_MAPS = 16
    adapter.MASK_THRESHOLD = 0.45
    adapter.TRAIN_MAFT = False
    adapter.NAME = "MASKAdapterHead"

    # Dataloader options; each list holds one entry per dataset.
    loader = cfg.DATALOADER
    loader.DATASET_RATIO = [1, 1]
    loader.USE_DIFF_BS_SIZE = True
    loader.DATASET_BS = [2, 2]
    loader.USE_RFS = [False, False]
    loader.MULTI_DATASET_GROUPING = True
    loader.DATASET_ANN = ['box', 'box']