# Shared normalization-layer settings. The same values are repeated inline in
# the backbone and in every head of the `model` definition.
norm_cfg = dict(type='SyncBN', requires_grad=True)
# Segmentor definition: an EncoderDecoder with a VisionTransformer backbone,
# one main decode head and three auxiliary heads. All heads share the same
# architecture settings and differ only in `in_index` and the loss weight.
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='VisionTransformer',
        model_name='vit_base_patch16_224',
        img_size=768,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        num_classes=104,
        drop_rate=0.0,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        pos_embed_interp=True,
        align_corners=False),
    # Main head. NOTE(review): in_index=11 presumably selects the last of the
    # 12 backbone layers -- confirm against the head implementation.
    decode_head=dict(
        type='VisionTransformerUpHead',
        in_channels=768,
        channels=512,
        in_index=11,
        img_size=768,
        embed_dim=768,
        num_classes=104,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        num_conv=2,
        upsampling_method='bilinear',
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # Auxiliary heads on in_index 5, 7 and 9, each with loss_weight=0.4.
    auxiliary_head=[
        dict(
            type='VisionTransformerUpHead',
            in_channels=768,
            channels=512,
            in_index=5,
            img_size=768,
            embed_dim=768,
            num_classes=104,
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='VisionTransformerUpHead',
            in_channels=768,
            channels=512,
            in_index=7,
            img_size=768,
            embed_dim=768,
            num_classes=104,
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='VisionTransformerUpHead',
            in_channels=768,
            channels=512,
            in_index=9,
            img_size=768,
            embed_dim=768,
            num_classes=104,
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    ])
# No extra training-time options are set.
train_cfg = dict()
# Inference uses mode='slide' with 768x768 crops and a stride of 512 pixels.
test_cfg = dict(mode='slide', crop_size=(768, 768), stride=(512, 512))
# Dataset basics. These values are repeated inline in the pipelines and in
# the `data` dict of this config.
dataset_type = 'CustomDataset'
data_root = './data/FoodSeg103/Images/'
# Per-channel normalization statistics; to_rgb=True is passed to 'Normalize'.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# Training crop size; matches the model's img_size of 768.
crop_size = (768, 768)
# Training pipeline: load image and annotation, random rescale, random crop
# to 768x768, random flip, photometric distortion, normalize, pad to the crop
# size, then bundle and collect the image and its segmentation map.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(768, 768), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    # Label padding uses 255 (seg_pad_val), image padding uses 0.
    dict(type='Pad', size=(768, 768), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
# Test pipeline: load the image, then apply the MultiScaleFlipAug wrapper
# with one image scale and flip=False around resize, normalize and
# tensor conversion.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ]),
]
# Data loading settings. The train split reads from img_dir/train and
# ann_dir/train; both `val` and `test` read from the img_dir/test and
# ann_dir/test directories and share the same evaluation pipeline.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type='CustomDataset',
        data_root='./data/FoodSeg103/Images/',
        img_dir='img_dir/train',
        ann_dir='ann_dir/train',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations'),
            dict(
                type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
            dict(type='RandomCrop', crop_size=(768, 768), cat_max_ratio=0.75),
            dict(type='RandomFlip', prob=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size=(768, 768), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg']),
        ]),
    val=dict(
        type='CustomDataset',
        data_root='./data/FoodSeg103/Images/',
        img_dir='img_dir/test',
        ann_dir='ann_dir/test',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2049, 1025),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img']),
                ]),
        ]),
    test=dict(
        type='CustomDataset',
        data_root='./data/FoodSeg103/Images/',
        img_dir='img_dir/test',
        ann_dir='ann_dir/test',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2049, 1025),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img']),
                ]),
        ]))
# Runtime settings: logging, distributed backend, checkpoints to load or
# resume from, workflow and cuDNN benchmarking flag.
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
# No checkpoint is loaded or resumed by default.
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
# Optimization schedule: SGD with a 'poly' learning-rate policy, run by
# iteration count (80000 iterations) rather than by epoch.
optimizer = dict(
    type='SGD',
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0,
    # The custom key 'head' carries a 10x learning-rate multiplier.
    paramwise_cfg=dict(custom_keys=dict(head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
runner = dict(type='IterBasedRunner', max_iters=80000)
# Checkpointing and evaluation share the same 4000-iteration interval.
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
# Miscellaneous run settings: unused-parameter detection flag, output
# directory, and the GPU id range (a single id, 0).
find_unused_parameters = True
work_dir = 'checkpoints/SETR_NAIVE'
gpu_ids = range(0, 1)
|