# Training/evaluation configuration for an 'EncoderDecoder' segmentor built
# from a 'VisionTransformer' backbone and 'VisionTransformerUpHead' heads,
# using the images under ./data/FoodSeg103/Images/.
# NOTE(review): the layout looks like an MMSegmentation/MMCV-style config and
# work_dir suggests the SETR "naive" variant -- confirm against the loader.
#
# Values that must stay in sync (normalisation layer, dataset type/root,
# image normalisation, crop size, pipelines) are defined once and referenced
# everywhere else, so changing one variable changes every place that uses it.

# --------------------------------------------------------------------------
# Model
# --------------------------------------------------------------------------
# Normalisation layer settings shared by the backbone and every head.
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='VisionTransformer',
        model_name='vit_base_patch16_224',
        img_size=768,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        # NOTE(review): 104 presumably = 103 food classes + background; confirm.
        num_classes=104,
        drop_rate=0.0,
        norm_cfg=norm_cfg,
        pos_embed_interp=True,
        align_corners=False),
    # Main head; in_index=11 presumably selects the last of the 12 backbone
    # layers (0-based) -- TODO confirm against the head implementation.
    decode_head=dict(
        type='VisionTransformerUpHead',
        in_channels=768,
        channels=512,
        in_index=11,
        img_size=768,
        embed_dim=768,
        num_classes=104,
        norm_cfg=norm_cfg,
        num_conv=2,
        upsampling_method='bilinear',
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # Three auxiliary heads that are identical except for in_index (5, 7, 9);
    # each carries a loss weight of 0.4 versus 1.0 for the main head.
    # A comprehension is used so no extra module-level name is created.
    auxiliary_head=[
        dict(
            type='VisionTransformerUpHead',
            in_channels=768,
            channels=512,
            in_index=aux_in_index,
            img_size=768,
            embed_dim=768,
            num_classes=104,
            norm_cfg=norm_cfg,
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4))
        for aux_in_index in (5, 7, 9)
    ])
train_cfg = dict()
# 'slide' mode with 768x768 windows and a 512 stride.  Kept as literals
# because crop_size is only defined further down.
test_cfg = dict(mode='slide', crop_size=(768, 768), stride=(512, 512))

# --------------------------------------------------------------------------
# Dataset and pipelines
# --------------------------------------------------------------------------
dataset_type = 'CustomDataset'
data_root = './data/FoodSeg103/Images/'
# Per-channel mean/std used by the 'Normalize' step.
# NOTE(review): these look like the usual ImageNet RGB statistics -- confirm.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# Used for both the random crop and the padding target size during training.
crop_size = (768, 768)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    # Images are padded with 0 and segmentation maps with 255.
    # NOTE(review): 255 is presumably the ignore label -- confirm.
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
# Both 'val' and 'test' read from the img_dir/test and ann_dir/test folders
# and share test_pipeline.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='img_dir/train',
        ann_dir='ann_dir/train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='img_dir/test',
        ann_dir='ann_dir/test',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='img_dir/test',
        ann_dir='ann_dir/test',
        pipeline=test_pipeline))

# --------------------------------------------------------------------------
# Runtime
# --------------------------------------------------------------------------
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True

# --------------------------------------------------------------------------
# Optimisation schedule
# --------------------------------------------------------------------------
# SGD with no weight decay; parameters matched by the 'head' custom key get
# an lr_mult of 10.0.
optimizer = dict(
    type='SGD',
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0,
    paramwise_cfg=dict(custom_keys=dict(head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
runner = dict(type='IterBasedRunner', max_iters=80000)
# Checkpointing and evaluation use the same interval (4000).
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU')
find_unused_parameters = True
work_dir = 'checkpoints/SETR_NAIVE'
# range(0, 1) contains the single id 0.
gpu_ids = range(0, 1)