Commit 4a3ad95 · diaoqishuai committed · 0 parent(s)
first commit
Files changed:
- .gitignore +145 -0
- LICENSE +21 -0
- README.md +117 -0
- config.py +273 -0
- configs/MetaFG_0_224.yaml +5 -0
- configs/MetaFG_1_224.yaml +5 -0
- configs/MetaFG_2_224.yaml +5 -0
- configs/MetaFG_meta_0_224.yaml +8 -0
- configs/MetaFG_meta_1_224.yaml +8 -0
- configs/MetaFG_meta_2_224.yaml +8 -0
- configs/MetaFG_meta_attribute_1_224.yaml +8 -0
- configs/MetaFG_meta_bert_0_224.yaml +8 -0
- configs/MetaFG_meta_bert_1_224.yaml +8 -0
- data/__init__.py +1 -0
- data/build.py +169 -0
- data/cached_image_folder.py +251 -0
- data/dataset_fg.py +457 -0
- data/samplers.py +29 -0
- data/zipreader.py +103 -0
- figs/overview.png +0 -0
- get_flops.py +62 -0
- logger.py +47 -0
- lr_scheduler.py +102 -0
- main.py +403 -0
- models/MBConv.py +169 -0
- models/MHSA.py +161 -0
- models/MetaFG.py +213 -0
- models/MetaFG_meta.py +268 -0
- models/__init__.py +1 -0
- models/build.py +20 -0
- models/meta_encoder.py +21 -0
- optimizer.py +70 -0
- utils.py +173 -0
.gitignore
ADDED
@@ -0,0 +1,145 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
/output
/imagenet
/imagenet_raw
/pretrained_model
/inaturelist2021
/inaturelist2021_mini
/save_model
/inaturelist2017
/inaturelist2018
/cub-200
/stanfordcars
/oxfordflower
/stanforddogs
/nabirds
/aircraft
/datasets
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 dqshuai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,117 @@
# MetaFG
A repository for the code used to create and train the model defined in "MetaFormer: A Unified Meta Framework for Fine-Grained Recognition".

## Model zoo
| name | resolution | 1k model | 21k model | iNat21 model |
| :--------: | :----------: | :--------: | :----------: | :------------: |
| MetaFormer-0 | 384x384 | [metafg_0_1k_384](https://drive.google.com/file/d/1r62S3CJFRWV_qA5udC9MOFOJYwRf8mE2/view?usp=sharing) | [metafg_0_21k_384](https://drive.google.com/file/d/1wVmlPjNTA6JKHcF3ROGorEVPxKVO83Ss/view?usp=sharing) | [metafg_0_inat21_384](https://drive.google.com/file/d/11gCk_IuSN7krdkOUSWSM4xlf8GGknmxc/view?usp=sharing) |
| MetaFormer-1 | 384x384 | [metafg_1_1k_384](https://drive.google.com/file/d/12OTmZg4J6fMGvs-colOTDfmhdA5EMMvo/view?usp=sharing) | [metafg_1_21k_384](https://drive.google.com/file/d/13dsarbtsNrkhpG5XpCRlN5ogXDGXO3Z_/view?usp=sharing) | [metafg_1_inat21_384](https://drive.google.com/file/d/1ATUIrDxaQaGqx4lJ8HE2IwX_evMhblPu/view?usp=sharing) |
| MetaFormer-2 | 384x384 | [metafg_2_1k_384](https://drive.google.com/file/d/167oBaseORq32aFA3Ex6lpHuasvu2PMb8/view?usp=sharing) | [metafg_2_21k_384](https://drive.google.com/file/d/1PnpntloQaYduEokFGQ6y79G7DdyjD_u3/view?usp=sharing) | [metafg_2_inat21_384](https://drive.google.com/file/d/17sUNST7ivQhonBAfZEiTOLAgtaHa4F3e/view?usp=sharing) |
## Usage
#### Python modules
* install PyTorch and torchvision
```
pip install torch==1.5.1 torchvision==0.6.1
```
* install `timm`
```
pip install timm==0.4.5
```
* install `Apex`
```
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```
* install other requirements
```
pip install opencv-python==4.5.1.48 yacs==0.1.8
```
#### Data preparation
Download [iNat21/18/17](https://github.com/visipedia/inat_comp), [CUB](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [NABirds](https://dl.allaboutbirds.org/nabirds), [Stanford Cars](https://ai.stanford.edu/~jkrause/cars/car_dataset.html), and [Aircraft](https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/), put them in their respective folders (\<root\>/datasets/<dataset_name>) and unzip the files. The folder structure is as follows:
```
datasets
|————inaturelist2021
| └——————train
| └——————val
| └——————train.json
| └——————val.json
|————inaturelist2018
| └——————train_val_images
| └——————train2018.json
| └——————val2018.json
| └——————train2018_locations.json
| └——————val2018_locations.json
| └——————categories.json
|————inaturelist2017
| └——————train_val_images
| └——————train2017.json
| └——————val2017.json
| └——————train2017_locations.json
| └——————val2017_locations.json
|————cub-200
| └——————...
|————nabirds
| └——————...
|————stanfordcars
| └——————car_ims
| └——————cars_annos.mat
|————aircraft
| └——————...
```
#### Training
You can download pre-trained models from the model zoo and put them under \<root\>/pretrained_model.
To train MetaFG on a dataset, run:
```
python3 -m torch.distributed.launch --nproc_per_node <num-of-gpus-to-use> --master_port 12345 main.py --cfg <config-file> --dataset <dataset-name> --pretrain <pretrained-model-path> [--batch-size <batch-size-per-gpu> --output <output-directory> --tag <job-tag>]
```
\<dataset-name\>: inaturelist2021, inaturelist2018, inaturelist2017, cub-200, nabirds, stanfordcars, aircraft
For CUB-200-2011, run:
```
python3 -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --cfg ./configs/MetaFG_1_224.yaml --batch-size 32 --tag cub-200_v1 --lr 5e-5 --min-lr 5e-7 --warmup-lr 5e-8 --epochs 300 --warmup-epochs 20 --dataset cub-200 --pretrain ./pretrained_model/<xxxx>.pth --accumulation-steps 2 --opts DATA.IMG_SIZE 384
```
Note that the given learning rate is scaled linearly with the total batch size: effective_lr = lr * total_batch_size / 512.
#### Eval
To evaluate a model on a dataset, run:
```
python3 -m torch.distributed.launch --nproc_per_node <num-of-gpus-to-use> --master_port 12345 main.py --eval --cfg <config-file> --dataset <dataset-name> --resume <checkpoint> [--batch-size <batch-size-per-gpu>]
```
## Main Results
#### ImageNet-1k
| Name | Resolution | #Param | #FLOPs | Throughput | Top-1 acc |
| :--------: | :----------: | :--------: | :----------: | :------------: | :------------: |
| MetaFormer-0 | 224x224 | 28M | 4.6G | 840.1 | 82.9 |
| MetaFormer-1 | 224x224 | 45M | 8.5G | 444.8 | 83.9 |
| MetaFormer-2 | 224x224 | 81M | 16.9G | 438.9 | 84.1 |
| MetaFormer-0 | 384x384 | 28M | 13.4G | 349.4 | 84.2 |
| MetaFormer-1 | 384x384 | 45M | 24.7G | 165.3 | 84.4 |
| MetaFormer-2 | 384x384 | 81M | 49.7G | 132.7 | 84.6 |
#### Fine-grained datasets
Results on fine-grained datasets with different pre-trained models.
| Name | Pretrain | CUB | NABirds | iNat2017 | iNat2018 | Cars | Aircraft |
| :--------: | :----------: | :--------: | :----------: | :------------: | :------------: | :--------: | :--------: |
| MetaFormer-0 | ImageNet-1k | 89.6 | 89.1 | 75.7 | 79.5 | 95.0 | 91.2 |
| MetaFormer-0 | ImageNet-21k | 89.7 | 89.5 | 75.8 | 79.9 | 94.6 | 91.2 |
| MetaFormer-0 | iNaturalist 2021 | 91.8 | 91.5 | 78.3 | 82.9 | 95.1 | 87.4 |
| MetaFormer-1 | ImageNet-1k | 89.7 | 89.4 | 78.2 | 81.9 | 94.9 | 90.8 |
| MetaFormer-1 | ImageNet-21k | 91.3 | 91.6 | 79.4 | 83.2 | 95.0 | 92.6 |
| MetaFormer-1 | iNaturalist 2021 | 92.3 | 92.7 | 82.0 | 87.5 | 95.0 | 92.5 |
| MetaFormer-2 | ImageNet-1k | 89.7 | 89.7 | 79.0 | 82.6 | 95.0 | 92.4 |
| MetaFormer-2 | ImageNet-21k | 91.8 | 92.2 | 80.4 | 84.3 | 95.1 | 92.9 |
| MetaFormer-2 | iNaturalist 2021 | 92.9 | 93.0 | 82.8 | 87.7 | 95.4 | 92.8 |

Results on iNaturalist 2017, iNaturalist 2018, and iNaturalist 2021 with meta-information.
| Name | Pretrain | Meta added | iNat2017 | iNat2018 | iNat2021 |
| :--------: | :----------: | :--------: | :----------: | :------------: | :------------: |
| MetaFormer-0 | ImageNet-1k | N | 75.7 | 79.5 | 88.4 |
| MetaFormer-0 | ImageNet-1k | Y | 79.8 (+4.1) | 85.4 (+5.9) | 92.6 (+4.2) |
| MetaFormer-1 | ImageNet-1k | N | 78.2 | 81.9 | 90.2 |
| MetaFormer-1 | ImageNet-1k | Y | 81.3 (+3.1) | 86.5 (+4.6) | 93.4 (+3.2) |
| MetaFormer-2 | ImageNet-1k | N | 79.0 | 82.6 | 89.8 |
| MetaFormer-2 | ImageNet-1k | Y | 82.0 (+3.0) | 86.8 (+4.2) | 93.2 (+3.4) |
| MetaFormer-2 | ImageNet-21k | N | 80.4 | 84.3 | 90.3 |
| MetaFormer-2 | ImageNet-21k | Y | 83.4 (+3.0) | 88.7 (+4.4) | 93.6 (+3.3) |
## Citation
## Acknowledgement
Many thanks to [Swin Transformer](https://github.com/microsoft/Swin-Transformer). Part of the code is borrowed from it.
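As a quick illustration of the batch-size scaling rule noted in the README above, here is a minimal sketch. The exact formula lives in main.py (not shown in this diff), so treat the scaling as an assumption based on the Swin Transformer convention this repo borrows from:

```python
# Hypothetical illustration of linear LR scaling (Swin Transformer convention).
# Assumed values mirror the CUB-200 command above: 8 GPUs, batch 32 per GPU,
# gradient accumulation 2, base lr 5e-5.
base_lr = 5e-5
batch_size_per_gpu = 32
num_gpus = 8
accumulation_steps = 2

total_batch_size = batch_size_per_gpu * num_gpus * accumulation_steps  # 512
effective_lr = base_lr * total_batch_size / 512.0                      # 5e-5

print(f"total batch {total_batch_size}, effective lr {effective_lr:.2e}")
```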
config.py
ADDED
@@ -0,0 +1,273 @@
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------

import os
import yaml
from yacs.config import CfgNode as CN

_C = CN()

# Base config files
_C.BASE = ['']

# -----------------------------------------------------------------------------
# Data settings
# -----------------------------------------------------------------------------
_C.DATA = CN()
# Batch size for a single GPU, could be overwritten by command line argument
_C.DATA.BATCH_SIZE = 128
# Path to dataset, could be overwritten by command line argument
_C.DATA.DATA_PATH = ''
# Dataset name
_C.DATA.DATASET = 'imagenet'
# Input image size
_C.DATA.IMG_SIZE = 224
# Interpolation to resize image (random, bilinear, bicubic)
_C.DATA.INTERPOLATION = 'bicubic'
_C.DATA.TRAIN_INTERPOLATION = 'bicubic'
# Use zipped dataset instead of folder dataset
# could be overwritten by command line argument
_C.DATA.ZIP_MODE = False
# Cache data in memory, could be overwritten by command line argument
_C.DATA.CACHE_MODE = 'part'
# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.
_C.DATA.PIN_MEMORY = True
# Number of data loading threads
_C.DATA.NUM_WORKERS = 8
# hdfs data dir
_C.DATA.TRAIN_PATH = None
_C.DATA.VAL_PATH = None
# arnold dataset parallel
_C.DATA.NUM_READERS = 4

# meta info
_C.DATA.ADD_META = False
_C.DATA.FUSION = 'early'
_C.DATA.MASK_PROB = 0.0
_C.DATA.MASK_TYPE = 'constant'
_C.DATA.LATE_FUSION_LAYER = -1

# -----------------------------------------------------------------------------
# Model settings
# -----------------------------------------------------------------------------
_C.MODEL = CN()
# Model type
_C.MODEL.TYPE = ''
# Model name
_C.MODEL.NAME = ''
# Checkpoint to resume, could be overwritten by command line argument
_C.MODEL.RESUME = ''
# Number of classes, overwritten in data preparation
_C.MODEL.NUM_CLASSES = 1000
# Dropout rate
_C.MODEL.DROP_RATE = 0.0
# Drop path rate
_C.MODEL.DROP_PATH_RATE = 0.1
# Label smoothing
_C.MODEL.LABEL_SMOOTHING = 0.1
# Pretrain
_C.MODEL.PRETRAINED = None
_C.MODEL.DORP_HEAD = True
_C.MODEL.DORP_META = True

_C.MODEL.ONLY_LAST_CLS = False
_C.MODEL.EXTRA_TOKEN_NUM = 1
_C.MODEL.META_DIMS = []

# -----------------------------------------------------------------------------
# Training settings
# -----------------------------------------------------------------------------
_C.TRAIN = CN()
_C.TRAIN.START_EPOCH = 0
_C.TRAIN.EPOCHS = 300
_C.TRAIN.WARMUP_EPOCHS = 20
_C.TRAIN.WEIGHT_DECAY = 0.05
_C.TRAIN.BASE_LR = 5e-4
_C.TRAIN.WARMUP_LR = 5e-7
_C.TRAIN.MIN_LR = 5e-6
# Clip gradient norm
_C.TRAIN.CLIP_GRAD = 5.0
# Auto resume from latest checkpoint
_C.TRAIN.AUTO_RESUME = True
# Gradient accumulation steps
# could be overwritten by command line argument
_C.TRAIN.ACCUMULATION_STEPS = 0
# Whether to use gradient checkpointing to save memory
# could be overwritten by command line argument
_C.TRAIN.USE_CHECKPOINT = False

# LR scheduler
_C.TRAIN.LR_SCHEDULER = CN()
_C.TRAIN.LR_SCHEDULER.NAME = 'cosine'
# Epoch interval to decay LR, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30
# LR decay rate, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1

# Optimizer
_C.TRAIN.OPTIMIZER = CN()
_C.TRAIN.OPTIMIZER.NAME = 'adamw'
# Optimizer epsilon
_C.TRAIN.OPTIMIZER.EPS = 1e-8
# Optimizer betas
_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999)
# SGD momentum
_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9

# -----------------------------------------------------------------------------
# Augmentation settings
# -----------------------------------------------------------------------------
_C.AUG = CN()
# Color jitter factor
_C.AUG.COLOR_JITTER = 0.4
# Use AutoAugment policy. "v0" or "original"
_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1'
# Random erase prob
_C.AUG.REPROB = 0.25
# Random erase mode
_C.AUG.REMODE = 'pixel'
# Random erase count
_C.AUG.RECOUNT = 1
# Mixup alpha, mixup enabled if > 0
_C.AUG.MIXUP = 0.8
# Cutmix alpha, cutmix enabled if > 0
_C.AUG.CUTMIX = 1.0
# Cutmix min/max ratio, overrides alpha and enables cutmix if set
_C.AUG.CUTMIX_MINMAX = None
# Probability of performing mixup or cutmix when either/both is enabled
_C.AUG.MIXUP_PROB = 1.0
# Probability of switching to cutmix when both mixup and cutmix enabled
_C.AUG.MIXUP_SWITCH_PROB = 0.5
# How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
_C.AUG.MIXUP_MODE = 'batch'

# -----------------------------------------------------------------------------
# Testing settings
# -----------------------------------------------------------------------------
_C.TEST = CN()
# Whether to use center crop when testing
_C.TEST.CROP = True

# -----------------------------------------------------------------------------
# Misc
# -----------------------------------------------------------------------------
# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2')
# overwritten by command line argument
_C.AMP_OPT_LEVEL = ''
# Path to output folder, overwritten by command line argument
_C.OUTPUT = ''
# Tag of experiment, overwritten by command line argument
_C.TAG = 'default'
# Frequency to save checkpoint
_C.SAVE_FREQ = 1
# Frequency to log info
_C.PRINT_FREQ = 10
# Fixed random seed
_C.SEED = 0
# Perform evaluation only, overwritten by command line argument
_C.EVAL_MODE = False
# Test throughput only, overwritten by command line argument
_C.THROUGHPUT_MODE = False
# Local rank for DistributedDataParallel, given by command line argument
_C.LOCAL_RANK = 0


def _update_config_from_file(config, cfg_file):
    config.defrost()
    with open(cfg_file, 'r') as f:
        yaml_cfg = yaml.load(f, Loader=yaml.FullLoader)

    for cfg in yaml_cfg.setdefault('BASE', ['']):
        if cfg:
            _update_config_from_file(
                config, os.path.join(os.path.dirname(cfg_file), cfg)
            )
    print('=> merge config from {}'.format(cfg_file))
    config.merge_from_file(cfg_file)
    config.freeze()


def update_config(config, args):
    _update_config_from_file(config, args.cfg)

    config.defrost()
    if args.opts:
        config.merge_from_list(args.opts)

    # merge from specific arguments
    if args.batch_size:
        config.DATA.BATCH_SIZE = args.batch_size
    if args.data_path:
        config.DATA.DATA_PATH = args.data_path
    if args.zip:
        config.DATA.ZIP_MODE = True
    if args.cache_mode:
        config.DATA.CACHE_MODE = args.cache_mode
    if args.resume:
        config.MODEL.RESUME = args.resume
    if args.accumulation_steps:
        config.TRAIN.ACCUMULATION_STEPS = args.accumulation_steps
    if args.use_checkpoint:
        config.TRAIN.USE_CHECKPOINT = True
    if args.amp_opt_level:
        config.AMP_OPT_LEVEL = args.amp_opt_level
    if args.output:
        config.OUTPUT = args.output
    if args.tag:
        config.TAG = args.tag
    if args.eval:
        config.EVAL_MODE = True
    if args.throughput:
        config.THROUGHPUT_MODE = True

    if args.num_workers is not None:
        config.DATA.NUM_WORKERS = args.num_workers

    # set lr and weight decay
    if args.lr is not None:
        config.TRAIN.BASE_LR = args.lr
    if args.min_lr is not None:
        config.TRAIN.MIN_LR = args.min_lr
    if args.warmup_lr is not None:
        config.TRAIN.WARMUP_LR = args.warmup_lr
    if args.warmup_epochs is not None:
        config.TRAIN.WARMUP_EPOCHS = args.warmup_epochs
    if args.weight_decay is not None:
        config.TRAIN.WEIGHT_DECAY = args.weight_decay

    if args.epochs is not None:
        config.TRAIN.EPOCHS = args.epochs
    if args.dataset is not None:
        config.DATA.DATASET = args.dataset
    if args.lr_scheduler_name is not None:
        config.TRAIN.LR_SCHEDULER.NAME = args.lr_scheduler_name
    if args.pretrain is not None:
        config.MODEL.PRETRAINED = args.pretrain

    # set local rank for distributed training
    config.LOCAL_RANK = args.local_rank

    # output folder
    config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.NAME, config.TAG)

    config.freeze()


def get_config(args):
    """Get a yacs CfgNode object with default values."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    config = _C.clone()
    update_config(config, args)

    return config
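For orientation, here is a minimal sketch of how get_config() composes defaults, yaml (including BASE inheritance), and command-line overrides. The argparse namespace below is hypothetical; the real parser lives in main.py, which is not part of this section:

```python
# Hypothetical driver for config.py's get_config(); the attribute names mirror
# what update_config() reads, but the actual CLI flags are defined in main.py.
import argparse
from config import get_config

args = argparse.Namespace(
    cfg='configs/MetaFG_0_224.yaml', opts=None, batch_size=64, data_path=None,
    zip=False, cache_mode=None, resume=None, accumulation_steps=None,
    use_checkpoint=False, amp_opt_level=None, output='output', tag='demo',
    eval=False, throughput=False, num_workers=None, lr=None, min_lr=None,
    warmup_lr=None, warmup_epochs=None, weight_decay=None, epochs=None,
    dataset='cub-200', lr_scheduler_name=None, pretrain=None, local_rank=0,
)
config = get_config(args)   # defaults -> yaml (BASE chain) -> CLI overrides
print(config.MODEL.NAME)    # 'MetaFG_0'
print(config.OUTPUT)        # 'output/MetaFG_0/demo'
```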
configs/MetaFG_0_224.yaml
ADDED
@@ -0,0 +1,5 @@
DATA:
  IMG_SIZE: 224
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_0
configs/MetaFG_1_224.yaml
ADDED
@@ -0,0 +1,5 @@
DATA:
  IMG_SIZE: 224
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_1
configs/MetaFG_2_224.yaml
ADDED
@@ -0,0 +1,5 @@
DATA:
  IMG_SIZE: 224
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_2
configs/MetaFG_meta_0_224.yaml
ADDED
@@ -0,0 +1,8 @@
DATA:
  IMG_SIZE: 224
  ADD_META: True
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_meta_0
  EXTRA_TOKEN_NUM: 3
  META_DIMS: [ 4, 3 ]
configs/MetaFG_meta_1_224.yaml
ADDED
@@ -0,0 +1,8 @@
DATA:
  IMG_SIZE: 224
  ADD_META: True
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_meta_1
  EXTRA_TOKEN_NUM: 3
  META_DIMS: [ 4, 3 ]
configs/MetaFG_meta_2_224.yaml
ADDED
@@ -0,0 +1,8 @@
DATA:
  IMG_SIZE: 224
  ADD_META: True
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_meta_2
  EXTRA_TOKEN_NUM: 3
  META_DIMS: [ 4, 3 ]
configs/MetaFG_meta_attribute_1_224.yaml
ADDED
@@ -0,0 +1,8 @@
DATA:
  IMG_SIZE: 224
  ADD_META: True
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_meta_1
  EXTRA_TOKEN_NUM: 2
  META_DIMS: [ 312, ]
configs/MetaFG_meta_bert_0_224.yaml
ADDED
@@ -0,0 +1,8 @@
DATA:
  IMG_SIZE: 224
  ADD_META: True
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_meta_0
  EXTRA_TOKEN_NUM: 33
  META_DIMS: [ 768, ]
configs/MetaFG_meta_bert_1_224.yaml
ADDED
@@ -0,0 +1,8 @@
DATA:
  IMG_SIZE: 224
  ADD_META: True
MODEL:
  TYPE: MetaFG
  NAME: MetaFG_meta_1
  EXTRA_TOKEN_NUM: 33
  META_DIMS: [ 768, ]
data/__init__.py
ADDED
@@ -0,0 +1 @@
from .build import build_loader
data/build.py
ADDED
@@ -0,0 +1,169 @@
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------

import os
import torch
import numpy as np
import torch.distributed as dist
from torchvision import datasets, transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.data import Mixup
from timm.data import create_transform
from timm.data.transforms import _pil_interp

from .cached_image_folder import CachedImageFolder
from .samplers import SubsetRandomSampler
from .dataset_fg import DatasetMeta


def build_loader(config):
    config.defrost()
    dataset_train, config.MODEL.NUM_CLASSES = build_dataset(is_train=True, config=config)
    config.freeze()
    print(f"local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} successfully build train dataset")
    dataset_val, _ = build_dataset(is_train=False, config=config)
    print(f"local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} successfully build val dataset")

    num_tasks = dist.get_world_size()
    global_rank = dist.get_rank()
    if config.DATA.ZIP_MODE and config.DATA.CACHE_MODE == 'part':
        indices = np.arange(dist.get_rank(), len(dataset_train), dist.get_world_size())
        sampler_train = SubsetRandomSampler(indices)
    else:
        sampler_train = torch.utils.data.DistributedSampler(
            dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
        )

    indices = np.arange(dist.get_rank(), len(dataset_val), dist.get_world_size())
    sampler_val = SubsetRandomSampler(indices)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=config.DATA.BATCH_SIZE,
        num_workers=config.DATA.NUM_WORKERS,
        pin_memory=config.DATA.PIN_MEMORY,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, sampler=sampler_val,
        batch_size=config.DATA.BATCH_SIZE,
        shuffle=False,
        num_workers=config.DATA.NUM_WORKERS,
        pin_memory=config.DATA.PIN_MEMORY,
        drop_last=False
    )

    # setup mixup / cutmix
    mixup_fn = None
    mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None
    if mixup_active:
        mixup_fn = Mixup(
            mixup_alpha=config.AUG.MIXUP, cutmix_alpha=config.AUG.CUTMIX, cutmix_minmax=config.AUG.CUTMIX_MINMAX,
            prob=config.AUG.MIXUP_PROB, switch_prob=config.AUG.MIXUP_SWITCH_PROB, mode=config.AUG.MIXUP_MODE,
            label_smoothing=config.MODEL.LABEL_SMOOTHING, num_classes=config.MODEL.NUM_CLASSES)

    return dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn


def build_dataset(is_train, config):
    transform = build_transform(is_train, config)
    if config.DATA.DATASET == 'imagenet':
        prefix = 'train' if is_train else 'val'
        if config.DATA.ZIP_MODE:
            ann_file = prefix + "_map.txt"
            prefix = prefix + ".zip@/"
            dataset = CachedImageFolder(config.DATA.DATA_PATH, ann_file, prefix, transform,
                                        cache_mode=config.DATA.CACHE_MODE if is_train else 'part')
        else:
            # root = os.path.join(config.DATA.DATA_PATH, prefix)
            root = './datasets/imagenet'
            dataset = datasets.ImageFolder(root, transform=transform)
        nb_classes = 1000
    elif config.DATA.DATASET == 'inaturelist2021':
        root = './datasets/inaturelist2021'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET,
                              class_ratio=config.DATA.CLASS_RATIO, per_sample=config.DATA.PER_SAMPLE)
        nb_classes = 10000
    elif config.DATA.DATASET == 'inaturelist2021_mini':
        root = './datasets/inaturelist2021_mini'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 10000
    elif config.DATA.DATASET == 'inaturelist2017':
        root = './datasets/inaturelist2017'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 5089
    elif config.DATA.DATASET == 'inaturelist2018':
        root = './datasets/inaturelist2018'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 8142
    elif config.DATA.DATASET == 'cub-200':
        root = './datasets/cub-200'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 200
    elif config.DATA.DATASET == 'stanfordcars':
        root = './datasets/stanfordcars'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 196
    elif config.DATA.DATASET == 'oxfordflower':
        root = './datasets/oxfordflower'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 102
    elif config.DATA.DATASET == 'stanforddogs':
        root = './datasets/stanforddogs'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 120
    elif config.DATA.DATASET == 'nabirds':
        root = './datasets/nabirds'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 555
    elif config.DATA.DATASET == 'aircraft':
        root = './datasets/aircraft'
        dataset = DatasetMeta(root=root, transform=transform, train=is_train, aux_info=config.DATA.ADD_META,
                              dataset=config.DATA.DATASET)
        nb_classes = 100
    else:
        raise NotImplementedError("We only support ImageNet and inaturelist.")

    return dataset, nb_classes


def build_transform(is_train, config):
    resize_im = config.DATA.IMG_SIZE > 32
    if is_train:
        # this should always dispatch to transforms_imagenet_train
        transform = create_transform(
            input_size=config.DATA.IMG_SIZE,
            is_training=True,
            color_jitter=config.AUG.COLOR_JITTER if config.AUG.COLOR_JITTER > 0 else None,
            auto_augment=config.AUG.AUTO_AUGMENT if config.AUG.AUTO_AUGMENT != 'none' else None,
            re_prob=config.AUG.REPROB,
            re_mode=config.AUG.REMODE,
            re_count=config.AUG.RECOUNT,
            interpolation=config.DATA.TRAIN_INTERPOLATION,
        )
        if not resize_im:
            # replace RandomResizedCropAndInterpolation with RandomCrop
            transform.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4)
        return transform

    t = []
    if resize_im:
        if config.TEST.CROP:
            size = int((256 / 224) * config.DATA.IMG_SIZE)
            t.append(
                transforms.Resize(size, interpolation=_pil_interp(config.DATA.INTERPOLATION)),
                # to maintain same ratio w.r.t. 224 images
            )
            t.append(transforms.CenterCrop(config.DATA.IMG_SIZE))
        else:
            t.append(
                transforms.Resize((config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
                                  interpolation=_pil_interp(config.DATA.INTERPOLATION))
            )

    t.append(transforms.ToTensor())
    t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))
    return transforms.Compose(t)
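The validation sampler above partitions indices by striding over ranks rather than using DistributedSampler. A standalone sketch of that behavior (4 ranks and 10 samples assumed for illustration; no process group needed). Note also that the inaturelist2021 branch references config.DATA.CLASS_RATIO and config.DATA.PER_SAMPLE, which have no defaults in config.py in this commit:

```python
# Standalone sketch of the rank-strided index split used for the val sampler
# above: np.arange(rank, len(dataset), world_size). Each rank takes every
# world_size-th sample, so together the ranks cover the dataset exactly once.
import numpy as np

world_size, n_samples = 4, 10
for rank in range(world_size):
    indices = np.arange(rank, n_samples, world_size)
    print(f"rank {rank}: {indices.tolist()}")
# rank 0: [0, 4, 8]
# rank 1: [1, 5, 9]
# rank 2: [2, 6]
# rank 3: [3, 7]
```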
data/cached_image_folder.py
ADDED
@@ -0,0 +1,251 @@
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------

import io
import os
import time
import torch.distributed as dist
import torch.utils.data as data
from PIL import Image

from .zipreader import is_zip_path, ZipReader


def has_file_allowed_extension(filename, extensions):
    """Checks if a file is an allowed extension.
    Args:
        filename (string): path to a file
    Returns:
        bool: True if the filename ends with a known image extension
    """
    filename_lower = filename.lower()
    return any(filename_lower.endswith(ext) for ext in extensions)


def find_classes(dir):
    classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
    classes.sort()
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    return classes, class_to_idx


def make_dataset(dir, class_to_idx, extensions):
    images = []
    dir = os.path.expanduser(dir)
    for target in sorted(os.listdir(dir)):
        d = os.path.join(dir, target)
        if not os.path.isdir(d):
            continue

        for root, _, fnames in sorted(os.walk(d)):
            for fname in sorted(fnames):
                if has_file_allowed_extension(fname, extensions):
                    path = os.path.join(root, fname)
                    item = (path, class_to_idx[target])
                    images.append(item)

    return images


def make_dataset_with_ann(ann_file, img_prefix, extensions):
    images = []
    with open(ann_file, "r") as f:
        contents = f.readlines()
        for line_str in contents:
            path_contents = [c for c in line_str.split('\t')]
            im_file_name = path_contents[0]
            class_index = int(path_contents[1])

            assert str.lower(os.path.splitext(im_file_name)[-1]) in extensions
            item = (os.path.join(img_prefix, im_file_name), class_index)

            images.append(item)

    return images


class DatasetFolder(data.Dataset):
    """A generic data loader where the samples are arranged in this way: ::
        root/class_x/xxx.ext
        root/class_x/xxy.ext
        root/class_x/xxz.ext
        root/class_y/123.ext
        root/class_y/nsdf3.ext
        root/class_y/asd932_.ext
    Args:
        root (string): Root directory path.
        loader (callable): A function to load a sample given its path.
        extensions (list[string]): A list of allowed extensions.
        transform (callable, optional): A function/transform that takes in
            a sample and returns a transformed version.
            E.g, ``transforms.RandomCrop`` for images.
        target_transform (callable, optional): A function/transform that takes
            in the target and transforms it.
    Attributes:
        samples (list): List of (sample path, class_index) tuples
    """

    def __init__(self, root, loader, extensions, ann_file='', img_prefix='', transform=None, target_transform=None,
                 cache_mode="no"):
        # image folder mode
        if ann_file == '':
            _, class_to_idx = find_classes(root)
            samples = make_dataset(root, class_to_idx, extensions)
        # zip mode
        else:
            samples = make_dataset_with_ann(os.path.join(root, ann_file),
                                            os.path.join(root, img_prefix),
                                            extensions)

        if len(samples) == 0:
            raise (RuntimeError("Found 0 files in subfolders of: " + root + "\n" +
                                "Supported extensions are: " + ",".join(extensions)))

        self.root = root
        self.loader = loader
        self.extensions = extensions

        self.samples = samples
        self.labels = [y_1k for _, y_1k in samples]
        self.classes = list(set(self.labels))

        self.transform = transform
        self.target_transform = target_transform

        self.cache_mode = cache_mode
        if self.cache_mode != "no":
            self.init_cache()

    def init_cache(self):
        assert self.cache_mode in ["part", "full"]
        n_sample = len(self.samples)
        global_rank = dist.get_rank()
        world_size = dist.get_world_size()

        samples_bytes = [None for _ in range(n_sample)]
        start_time = time.time()
        for index in range(n_sample):
            if index % (n_sample // 10) == 0:
                t = time.time() - start_time
                print(f'global_rank {dist.get_rank()} cached {index}/{n_sample} takes {t:.2f}s per block')
                start_time = time.time()
            path, target = self.samples[index]
            if self.cache_mode == "full":
                samples_bytes[index] = (ZipReader.read(path), target)
            elif self.cache_mode == "part" and index % world_size == global_rank:
                samples_bytes[index] = (ZipReader.read(path), target)
            else:
                samples_bytes[index] = (path, target)
        self.samples = samples_bytes

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        """
        path, target = self.samples[index]
        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return sample, target

    def __len__(self):
        return len(self.samples)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str


IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif']


def pil_loader(path):
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    if isinstance(path, bytes):
        img = Image.open(io.BytesIO(path))
    elif is_zip_path(path):
        data = ZipReader.read(path)
        img = Image.open(io.BytesIO(data))
    else:
        with open(path, 'rb') as f:
            img = Image.open(f)
    return img.convert('RGB')


def accimage_loader(path):
    import accimage
    try:
        return accimage.Image(path)
    except IOError:
        # Potentially a decoding problem, fall back to PIL.Image
        return pil_loader(path)


def default_img_loader(path):
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)


class CachedImageFolder(DatasetFolder):
    """A generic data loader where the images are arranged in this way: ::
        root/dog/xxx.png
        root/dog/xxy.png
        root/dog/xxz.png
        root/cat/123.png
        root/cat/nsdf3.png
        root/cat/asd932_.png
    Args:
        root (string): Root directory path.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        loader (callable, optional): A function to load an image given its path.
    Attributes:
        imgs (list): List of (image path, class_index) tuples
    """

    def __init__(self, root, ann_file='', img_prefix='', transform=None, target_transform=None,
                 loader=default_img_loader, cache_mode="no"):
        super(CachedImageFolder, self).__init__(root, loader, IMG_EXTENSIONS,
                                                ann_file=ann_file, img_prefix=img_prefix,
                                                transform=transform, target_transform=target_transform,
                                                cache_mode=cache_mode)
        self.imgs = self.samples

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        path, target = self.samples[index]
        image = self.loader(path)
        if self.transform is not None:
            img = self.transform(image)
        else:
            img = image
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target
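A minimal usage sketch for the zip-mode path above. The file names are hypothetical; the layout mirrors how build.py wires it up for ImageNet, with a `<prefix>_map.txt` annotation list and a `<prefix>.zip@/` image prefix:

```python
# Hypothetical zip-mode usage of CachedImageFolder, following build.py.
# Assumes ./datasets/imagenet contains train.zip plus a tab-separated
# train_map.txt of "<relative/path.jpg>\t<class_index>" lines.
from torchvision import transforms
from data.cached_image_folder import CachedImageFolder

dataset = CachedImageFolder(
    root='./datasets/imagenet',
    ann_file='train_map.txt',   # annotation list, one "path\tlabel" per line
    img_prefix='train.zip@/',   # '@/' marks a path inside the zip archive
    transform=transforms.ToTensor(),
    cache_mode='no',            # 'part'/'full' pre-read bytes via ZipReader
)
img, target = dataset[0]
```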
data/dataset_fg.py
ADDED
@@ -0,0 +1,457 @@
import torch.utils.data as data

import os
import re
import csv
import json
import torch
import tarfile
import pickle
import numpy as np
import pandas as pd
import random
random.seed(2021)
from PIL import Image
from scipy import io as scio
from math import radians, cos, sin, asin, sqrt, pi

IMG_EXTENSIONS = ['.png', '.jpg', '.jpeg']


def get_spatial_info(latitude, longitude):
    if latitude and longitude:
        latitude = radians(latitude)
        longitude = radians(longitude)
        x = cos(latitude)*cos(longitude)
        y = cos(latitude)*sin(longitude)
        z = sin(latitude)
        return [x, y, z]
    else:
        return [0, 0, 0]


def get_temporal_info(date, miss_hour=False):
    try:
        if date:
            if miss_hour:
                pattern = re.compile(r'(\d*)-(\d*)-(\d*)', re.I)
            else:
                pattern = re.compile(r'(\d*)-(\d*)-(\d*) (\d*):(\d*):(\d*)', re.I)
            m = pattern.match(date.strip())

            if m:
                year = int(m.group(1))
                month = int(m.group(2))
                day = int(m.group(3))
                x_month = sin(2*pi*month/12)
                y_month = cos(2*pi*month/12)
                if miss_hour:
                    x_hour = 0
                    y_hour = 0
                else:
                    hour = int(m.group(4))
                    x_hour = sin(2*pi*hour/24)
                    y_hour = cos(2*pi*hour/24)
                return [x_month, y_month, x_hour, y_hour]
            else:
                return [0, 0, 0, 0]
        else:
            return [0, 0, 0, 0]
    except:
        return [0, 0, 0, 0]


def load_file(root, dataset):
    if dataset == 'inaturelist2017':
        year_flag = 7
    elif dataset == 'inaturelist2018':
        year_flag = 8

    if dataset == 'inaturelist2018':
        with open(os.path.join(root, 'categories.json'), 'r') as f:
            map_label = json.load(f)
        map_2018 = dict()
        for _map in map_label:
            map_2018[int(_map['id'])] = _map['name'].strip().lower()
    with open(os.path.join(root, f'val201{year_flag}_locations.json'), 'r') as f:
        val_location = json.load(f)
    val_id2meta = dict()
    for meta_info in val_location:
        val_id2meta[meta_info['id']] = meta_info
    with open(os.path.join(root, f'train201{year_flag}_locations.json'), 'r') as f:
        train_location = json.load(f)
    train_id2meta = dict()
    for meta_info in train_location:
        train_id2meta[meta_info['id']] = meta_info
    with open(os.path.join(root, f'val201{year_flag}.json'), 'r') as f:
        val_class_info = json.load(f)
    with open(os.path.join(root, f'train201{year_flag}.json'), 'r') as f:
        train_class_info = json.load(f)

    if dataset == 'inaturelist2017':
        categories_2017 = [x['name'].strip().lower() for x in val_class_info['categories']]
        class_to_idx = {c: idx for idx, c in enumerate(categories_2017)}
        id2label = dict()
        for categorie in val_class_info['categories']:
            id2label[int(categorie['id'])] = categorie['name'].strip().lower()
    elif dataset == 'inaturelist2018':
        categories_2018 = [x['name'].strip().lower() for x in map_label]
        class_to_idx = {c: idx for idx, c in enumerate(categories_2018)}
        id2label = dict()
        for categorie in val_class_info['categories']:
            name = map_2018[int(categorie['name'])]
            id2label[int(categorie['id'])] = name.strip().lower()

    return train_class_info, train_id2meta, val_class_info, val_id2meta, class_to_idx, id2label


def find_images_and_targets_cub200(root, dataset, istrain=False, aux_info=False):
    imageid2label = {}
    with open(os.path.join(os.path.join(root, 'CUB_200_2011'), 'image_class_labels.txt'), 'r') as f:
        for line in f:
            image_id, label = line.split()
            imageid2label[int(image_id)] = int(label)-1
    imageid2split = {}
    with open(os.path.join(os.path.join(root, 'CUB_200_2011'), 'train_test_split.txt'), 'r') as f:
        for line in f:
            image_id, split = line.split()
            imageid2split[int(image_id)] = int(split)
    images_and_targets = []
    images_info = []
    images_root = os.path.join(os.path.join(root, 'CUB_200_2011'), 'images')
    bert_embedding_root = os.path.join(root, 'bert_embedding_cub')
    text_root = os.path.join(root, 'text_c10')
    with open(os.path.join(os.path.join(root, 'CUB_200_2011'), 'images.txt'), 'r') as f:
        for line in f:
            image_id, file_name = line.split()
            file_path = os.path.join(images_root, file_name)
            target = imageid2label[int(image_id)]
            if aux_info:
                with open(os.path.join(bert_embedding_root, file_name.replace('.jpg', '.pickle')), 'rb') as f_bert:
                    bert_embedding = pickle.load(f_bert)
                    bert_embedding = bert_embedding['embedding_words']
                text_list = []
                with open(os.path.join(text_root, file_name.replace('.jpg', '.txt')), 'r') as f_text:
                    for line in f_text:
                        line = line.encode(encoding='UTF-8', errors='strict')
                        line = line.replace(b'\xef\xbf\xbd\xef\xbf\xbd', b' ')
                        line = line.decode('UTF-8', 'strict')
                        text_list.append(line)
            if istrain and imageid2split[int(image_id)] == 1:
                if aux_info:
                    images_and_targets.append([file_path, target, bert_embedding])
                    images_info.append({'text_list': text_list})
                else:
                    images_and_targets.append([file_path, target])
            elif not istrain and imageid2split[int(image_id)] == 0:
                if aux_info:
                    images_and_targets.append([file_path, target, bert_embedding])
                    images_info.append({'text_list': text_list})
                else:
                    images_and_targets.append([file_path, target])
    return images_and_targets, None, images_info


def find_images_and_targets_cub200_attribute(root, dataset, istrain=False, aux_info=False):
    imageid2label = {}
    with open(os.path.join(os.path.join(root, 'CUB_200_2011'), 'image_class_labels.txt'), 'r') as f:
        for line in f:
            image_id, label = line.split()
            imageid2label[int(image_id)] = int(label)-1
    imageid2split = {}
    with open(os.path.join(os.path.join(root, 'CUB_200_2011'), 'train_test_split.txt'), 'r') as f:
        for line in f:
            image_id, split = line.split()
            imageid2split[int(image_id)] = int(split)
    images_and_targets = []
    images_info = []
    images_root = os.path.join(os.path.join(root, 'CUB_200_2011'), 'images')
    attributes_root = os.path.join(os.path.join(root, 'CUB_200_2011'), 'attributes')
    imageid2attribute = {}
    with open(os.path.join(attributes_root, 'image_attribute_labels.txt'), 'r') as f:
        for line in f:
            if len(line.split()) == 6:
                image_id, attribute_id, is_present, _, _, _ = line.split()
            else:
                image_id, attribute_id, is_present, certainty_id, time = line.split()
            if int(image_id) not in imageid2attribute:
                imageid2attribute[int(image_id)] = [0 for i in range(312)]
            imageid2attribute[int(image_id)][int(attribute_id)-1] = int(is_present)
    with open(os.path.join(os.path.join(root, 'CUB_200_2011'), 'images.txt'), 'r') as f:
        for line in f:
            image_id, file_name = line.split()
            file_path = os.path.join(images_root, file_name)
            target = imageid2label[int(image_id)]
            if aux_info:
                pass
            if istrain and imageid2split[int(image_id)] == 1:
                if aux_info:
                    images_and_targets.append([file_path, target, imageid2attribute[int(image_id)]])
                    images_info.append({'attributes': imageid2attribute[int(image_id)]})
                else:
                    images_and_targets.append([file_path, target])
            elif not istrain and imageid2split[int(image_id)] == 0:
                if aux_info:
                    images_and_targets.append([file_path, target, imageid2attribute[int(image_id)]])
                    images_info.append({'attributes': imageid2attribute[int(image_id)]})
                else:
                    images_and_targets.append([file_path, target])
    return images_and_targets, None, images_info


def find_images_and_targets_oxfordflower(root, dataset, istrain=False, aux_info=False):
    imagelabels = scio.loadmat(os.path.join(root, 'imagelabels.mat'))
    imagelabels = imagelabels['labels'][0]
    train_val_split = scio.loadmat(os.path.join(root, 'setid.mat'))
    train_data = train_val_split['trnid'][0].tolist()
    val_data = train_val_split['valid'][0].tolist()
    test_data = train_val_split['tstid'][0].tolist()
    images_and_targets = []
    images_info = []
    images_root = os.path.join(root, 'jpg')
    bert_embedding_root = os.path.join(root, 'bert_embedding_flower')
    if istrain:
        all_data = train_data + val_data
    else:
        all_data = test_data
    for data in all_data:
        file_path = os.path.join(images_root, f'image_{str(data).zfill(5)}.jpg')
        target = int(imagelabels[int(data)-1])-1
        if aux_info:
            with open(os.path.join(bert_embedding_root, f'image_{str(data).zfill(5)}.pickle'), 'rb') as f_bert:
                bert_embedding = pickle.load(f_bert)
                bert_embedding = bert_embedding['embedding_full']
            images_and_targets.append([file_path, target, bert_embedding])
        else:
            images_and_targets.append([file_path, target])
    return images_and_targets, None, images_info


def find_images_and_targets_stanforddogs(root, dataset, istrain=False, aux_info=False):
    if istrain:
        anno_data = scio.loadmat(os.path.join(root, 'train_list.mat'))
    else:
        anno_data = scio.loadmat(os.path.join(root, 'test_list.mat'))
    images_and_targets = []
    images_info = []
    for file, label in zip(anno_data['file_list'], anno_data['labels']):
        file_path = os.path.join(os.path.join(root, 'Images'), file[0][0])
        target = int(label[0])-1
        images_and_targets.append([file_path, target])
    return images_and_targets, None, images_info


def find_images_and_targets_nabirds(root, dataset, istrain=False, aux_info=False):
    root = os.path.join(root, 'nabirds')
    image_paths = pd.read_csv(os.path.join(root, 'images.txt'), sep=' ', names=['img_id', 'filepath'])
    image_class_labels = pd.read_csv(os.path.join(root, 'image_class_labels.txt'), sep=' ', names=['img_id', 'target'])
    label_list = list(set(image_class_labels['target']))
|
232 |
+
label_list = sorted(label_list)
|
233 |
+
label_map = {k: i for i, k in enumerate(label_list)}
|
234 |
+
train_test_split = pd.read_csv(os.path.join(root, 'train_test_split.txt'), sep=' ', names=['img_id', 'is_training_img'])
|
235 |
+
data = image_paths.merge(image_class_labels, on='img_id')
|
236 |
+
data = data.merge(train_test_split, on='img_id')
|
237 |
+
if istrain:
|
238 |
+
data = data[data.is_training_img == 1]
|
239 |
+
else:
|
240 |
+
data = data[data.is_training_img == 0]
|
241 |
+
images_and_targets = []
|
242 |
+
images_info = []
|
243 |
+
for index,row in data.iterrows():
|
244 |
+
file_path = os.path.join(os.path.join(root,'images'),row['filepath'])
|
245 |
+
target = int(label_map[row['target']])
|
246 |
+
images_and_targets.append([file_path,target])
|
247 |
+
return images_and_targets,None,images_info
|
248 |
+
def find_images_and_targets_stanfordcars_v1(root,dataset,istrain=False,aux_info=False):
|
249 |
+
if istrain:
|
250 |
+
flag = 'train'
|
251 |
+
else:
|
252 |
+
flag = 'test'
|
253 |
+
if istrain:
|
254 |
+
anno_data = scio.loadmat(os.path.join(os.path.join(root,'devkit'),f'cars_{flag}_annos.mat'))
|
255 |
+
else:
|
256 |
+
anno_data = scio.loadmat(os.path.join(os.path.join(root,'devkit'),f'cars_{flag}_annos_withlabels.mat'))
|
257 |
+
annotation = anno_data['annotations']
|
258 |
+
images_and_targets = []
|
259 |
+
images_info = []
|
260 |
+
for r in annotation[0]:
|
261 |
+
_,_,_,_,label,name = r
|
262 |
+
file_path = os.path.join(os.path.join(root,f'cars_{flag}'),name[0])
|
263 |
+
target = int(label[0][0])-1
|
264 |
+
images_and_targets.append([file_path,target])
|
265 |
+
return images_and_targets,None,images_info
|
266 |
+
def find_images_and_targets_stanfordcars(root,dataset,istrain=False,aux_info=False):
|
267 |
+
anno_data = scio.loadmat(os.path.join(root,'cars_annos.mat'))
|
268 |
+
annotation = anno_data['annotations']
|
269 |
+
images_and_targets = []
|
270 |
+
images_info = []
|
271 |
+
for r in annotation[0]:
|
272 |
+
name,_,_,_,_,label,split = r
|
273 |
+
file_path = os.path.join(root,name[0])
|
274 |
+
target = int(label[0][0])-1
|
275 |
+
if istrain and int(split[0][0])==0:
|
276 |
+
images_and_targets.append([file_path,target])
|
277 |
+
elif not istrain and int(split[0][0])==1:
|
278 |
+
images_and_targets.append([file_path,target])
|
279 |
+
return images_and_targets,None,images_info
|
280 |
+
def find_images_and_targets_aircraft(root,dataset,istrain=False,aux_info=False):
|
281 |
+
file_root = os.path.join(root,'fgvc-aircraft-2013b','data')
|
282 |
+
if istrain:
|
283 |
+
data_file = os.path.join(file_root,'images_variant_trainval.txt')
|
284 |
+
else:
|
285 |
+
data_file = os.path.join(file_root,'images_variant_test.txt')
|
286 |
+
classes = set()
|
287 |
+
with open(data_file,'r') as f:
|
288 |
+
for line in f:
|
289 |
+
class_name = '_'.join(line.split()[1:])
|
290 |
+
classes.add(class_name)
|
291 |
+
classes = sorted(list(classes))
|
292 |
+
class_to_idx = {name:ind for ind,name in enumerate(classes)}
|
293 |
+
|
294 |
+
images_and_targets = []
|
295 |
+
images_info = []
|
296 |
+
with open(data_file,'r') as f:
|
297 |
+
images_root = os.path.join(file_root,'images')
|
298 |
+
for line in f:
|
299 |
+
image_file = line.split()[0]
|
300 |
+
class_name = '_'.join(line.split()[1:])
|
301 |
+
file_path = os.path.join(images_root,f'{image_file}.jpg')
|
302 |
+
target = class_to_idx[class_name]
|
303 |
+
images_and_targets.append([file_path,target])
|
304 |
+
return images_and_targets,class_to_idx,images_info
|
305 |
+
|
306 |
+
def find_images_and_targets_2017_2018(root,dataset,istrain=False,aux_info=False):
|
307 |
+
train_class_info,train_id2meta,val_class_info,val_id2meta,class_to_idx,id2label = load_file(root,dataset)
|
308 |
+
miss_hour = (dataset == 'inaturelist2017')
|
309 |
+
|
310 |
+
class_info = train_class_info if istrain else val_class_info
|
311 |
+
id2meta = train_id2meta if istrain else val_id2meta
|
312 |
+
images_and_targets = []
|
313 |
+
images_info = []
|
314 |
+
if aux_info:
|
315 |
+
temporal_info = []
|
316 |
+
spatial_info = []
|
317 |
+
for image,annotation in zip(class_info['images'],class_info['annotations']):
|
318 |
+
file_path = os.path.join(root,image['file_name'])
|
319 |
+
id_name = id2label[int(annotation['category_id'])]
|
320 |
+
target = class_to_idx[id_name]
|
321 |
+
image_id = image['id']
|
322 |
+
date = id2meta[image_id]['date']
|
323 |
+
latitude = id2meta[image_id]['lat']
|
324 |
+
longitude = id2meta[image_id]['lon']
|
325 |
+
location_uncertainty = id2meta[image_id]['loc_uncert']
|
326 |
+
images_info.append({'date':date,
|
327 |
+
'latitude':latitude,
|
328 |
+
'longitude':longitude,
|
329 |
+
'location_uncertainty':location_uncertainty,
|
330 |
+
'target':target})
|
331 |
+
if aux_info:
|
332 |
+
temporal_info = get_temporal_info(date,miss_hour=miss_hour)
|
333 |
+
spatial_info = get_spatial_info(latitude,longitude)
|
334 |
+
images_and_targets.append((file_path,target,temporal_info+spatial_info))
|
335 |
+
else:
|
336 |
+
images_and_targets.append((file_path,target))
|
337 |
+
return images_and_targets,class_to_idx,images_info
|
338 |
+
def find_images_and_targets(root,istrain=False,aux_info=False):
|
339 |
+
if os.path.exists(os.path.join(root,'train.json')):
|
340 |
+
with open(os.path.join(root,'train.json'),'r') as f:
|
341 |
+
train_class_info = json.load(f)
|
342 |
+
elif os.path.exists(os.path.join(root,'train_mini.json')):
|
343 |
+
with open(os.path.join(root,'train_mini.json'),'r') as f:
|
344 |
+
train_class_info = json.load(f)
|
345 |
+
else:
|
346 |
+
raise ValueError(f'not eixst file {root}/train.json or {root}/train_mini.json')
|
347 |
+
with open(os.path.join(root,'val.json'),'r') as f:
|
348 |
+
val_class_info = json.load(f)
|
349 |
+
categories_2021 = [x['name'].strip().lower() for x in val_class_info['categories']]
|
350 |
+
class_to_idx = {c: idx for idx, c in enumerate(categories_2021)}
|
351 |
+
id2label = dict()
|
352 |
+
for categorie in train_class_info['categories']:
|
353 |
+
id2label[int(categorie['id'])] = categorie['name'].strip().lower()
|
354 |
+
class_info = train_class_info if istrain else val_class_info
|
355 |
+
|
356 |
+
images_and_targets = []
|
357 |
+
images_info = []
|
358 |
+
if aux_info:
|
359 |
+
temporal_info = []
|
360 |
+
spatial_info = []
|
361 |
+
|
362 |
+
for image,annotation in zip(class_info['images'],class_info['annotations']):
|
363 |
+
file_path = os.path.join(root,image['file_name'])
|
364 |
+
id_name = id2label[int(annotation['category_id'])]
|
365 |
+
target = class_to_idx[id_name]
|
366 |
+
date = image['date']
|
367 |
+
latitude = image['latitude']
|
368 |
+
longitude = image['longitude']
|
369 |
+
location_uncertainty = image['location_uncertainty']
|
370 |
+
images_info.append({'date':date,
|
371 |
+
'latitude':latitude,
|
372 |
+
'longitude':longitude,
|
373 |
+
'location_uncertainty':location_uncertainty,
|
374 |
+
'target':target})
|
375 |
+
if aux_info:
|
376 |
+
temporal_info = get_temporal_info(date)
|
377 |
+
spatial_info = get_spatial_info(latitude,longitude)
|
378 |
+
images_and_targets.append((file_path,target,temporal_info+spatial_info))
|
379 |
+
else:
|
380 |
+
images_and_targets.append((file_path,target))
|
381 |
+
return images_and_targets,class_to_idx,images_info
|
382 |
+
|
383 |
+
|
384 |
+
class DatasetMeta(data.Dataset):
|
385 |
+
def __init__(
|
386 |
+
self,
|
387 |
+
root,
|
388 |
+
load_bytes=False,
|
389 |
+
transform=None,
|
390 |
+
train=False,
|
391 |
+
aux_info=False,
|
392 |
+
dataset='inaturelist2021',
|
393 |
+
class_ratio=1.0,
|
394 |
+
per_sample=1.0):
|
395 |
+
self.aux_info = aux_info
|
396 |
+
self.dataset = dataset
|
397 |
+
if dataset in ['inaturelist2021','inaturelist2021_mini']:
|
398 |
+
images, class_to_idx,images_info = find_images_and_targets(root,train,aux_info)
|
399 |
+
elif dataset in ['inaturelist2017','inaturelist2018']:
|
400 |
+
images, class_to_idx,images_info = find_images_and_targets_2017_2018(root,dataset,train,aux_info)
|
401 |
+
elif dataset == 'cub-200':
|
402 |
+
images, class_to_idx,images_info = find_images_and_targets_cub200(root,dataset,train,aux_info)
|
403 |
+
elif dataset == 'stanfordcars':
|
404 |
+
images, class_to_idx,images_info = find_images_and_targets_stanfordcars(root,dataset,train)
|
405 |
+
elif dataset == 'oxfordflower':
|
406 |
+
images, class_to_idx,images_info = find_images_and_targets_oxfordflower(root,dataset,train,aux_info)
|
407 |
+
elif dataset == 'stanforddogs':
|
408 |
+
images,class_to_idx,images_info = find_images_and_targets_stanforddogs(root,dataset,train)
|
409 |
+
elif dataset == 'nabirds':
|
410 |
+
images,class_to_idx,images_info = find_images_and_targets_nabirds(root,dataset,train)
|
411 |
+
elif dataset == 'aircraft':
|
412 |
+
images,class_to_idx,images_info = find_images_and_targets_aircraft(root,dataset,train)
|
413 |
+
if len(images) == 0:
|
414 |
+
raise RuntimeError(f'Found 0 images in subfolders of {root}. '
|
415 |
+
f'Supported image extensions are {", ".join(IMG_EXTENSIONS)}')
|
416 |
+
self.root = root
|
417 |
+
self.samples = images
|
418 |
+
self.imgs = self.samples # torchvision ImageFolder compat
|
419 |
+
self.class_to_idx = class_to_idx
|
420 |
+
self.images_info = images_info
|
421 |
+
self.load_bytes = load_bytes
|
422 |
+
self.transform = transform
|
423 |
+
|
424 |
+
|
425 |
+
def __getitem__(self, index):
|
426 |
+
if self.aux_info:
|
427 |
+
path, target,aux_info = self.samples[index]
|
428 |
+
else:
|
429 |
+
path, target = self.samples[index]
|
430 |
+
img = open(path, 'rb').read() if self.load_bytes else Image.open(path).convert('RGB')
|
431 |
+
if self.transform is not None:
|
432 |
+
img = self.transform(img)
|
433 |
+
if self.aux_info:
|
434 |
+
if type(aux_info) is np.ndarray:
|
435 |
+
select_index = np.random.randint(aux_info.shape[0])
|
436 |
+
return img, target, aux_info[select_index,:]
|
437 |
+
else:
|
438 |
+
return img, target, np.asarray(aux_info).astype(np.float64)
|
439 |
+
else:
|
440 |
+
return img, target
|
441 |
+
|
442 |
+
def __len__(self):
|
443 |
+
return len(self.samples)
|
444 |
+
if __name__ == '__main__':
|
445 |
+
# train_dataset = DatasetPre('./fgvc_previous','./fgvc_previous',train=True,aux_info=True)
|
446 |
+
# import ipdb;ipdb.set_trace()
|
447 |
+
# train_dataset = DatasetMeta('./nabirds',train=True,aux_info=False,dataset='nabirds')
|
448 |
+
# find_images_and_targets_stanforddogs('./stanforddogs',None,istrain=True)
|
449 |
+
# find_images_and_targets_oxfordflower('./oxfordflower',None,istrain=True)
|
450 |
+
find_images_and_targets_ablation('./inaturelist2021',True,True,0.5,1.0)
|
451 |
+
# find_images_and_targets_cub200('./cub-200','cub-200',True,True)
|
452 |
+
# find_images_and_targets_aircraft('./aircraft','aircraft',True)
|
453 |
+
# train_dataset = DatasetMeta('./aircraft',train=False,aux_info=False,dataset='aircraft')
|
454 |
+
import ipdb;ipdb.set_trace()
|
455 |
+
# find_images_and_targets_2017('')
|
456 |
+
|
457 |
+
|
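A quick usage sketch for DatasetMeta (assumptions: an iNaturalist-2021-style layout with train.json/val.json already sits under ./inaturelist2021, as in the __main__ block above, and the transform here is a stand-in for the real pipeline built in data/build.py):

    from torchvision import transforms
    from data.dataset_fg import DatasetMeta

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    # With aux_info=True, __getitem__ returns (img, target, meta): for iNaturalist the
    # meta vector is get_temporal_info(date) + get_spatial_info(lat, lon); for CUB it is
    # a BERT embedding, from which one sentence row is sampled at random per access.
    train_set = DatasetMeta('./inaturelist2021', transform=transform, train=True,
                            aux_info=True, dataset='inaturelist2021')
    img, target, meta = train_set[0]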
data/samplers.py
ADDED
@@ -0,0 +1,29 @@
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+
+import torch
+
+
+class SubsetRandomSampler(torch.utils.data.Sampler):
+    r"""Samples elements randomly from a given list of indices, without replacement.
+
+    Arguments:
+        indices (sequence): a sequence of indices
+    """
+
+    def __init__(self, indices):
+        self.epoch = 0
+        self.indices = indices
+
+    def __iter__(self):
+        return (self.indices[i] for i in torch.randperm(len(self.indices)))
+
+    def __len__(self):
+        return len(self.indices)
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
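Unlike torch.utils.data.DistributedSampler, set_epoch here only records the epoch number; the permutation in __iter__ comes from the global RNG, so it already differs across epochs. A minimal sketch of how main.py-style code drives it (the toy integer dataset is illustrative only):

    import torch
    from data.samplers import SubsetRandomSampler

    sampler = SubsetRandomSampler(list(range(8)))
    loader = torch.utils.data.DataLoader(list(range(8)), batch_size=4, sampler=sampler)
    for epoch in range(2):
        sampler.set_epoch(epoch)   # kept for API parity with DistributedSampler
        for batch in loader:
            print(epoch, batch)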
data/zipreader.py
ADDED
@@ -0,0 +1,103 @@
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+
+import os
+import zipfile
+import io
+import numpy as np
+from PIL import Image
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+def is_zip_path(img_or_path):
+    """judge if this is a zip path"""
+    return '.zip@' in img_or_path
+
+
+class ZipReader(object):
+    """A class to read zipped files"""
+    zip_bank = dict()
+
+    def __init__(self):
+        super(ZipReader, self).__init__()
+
+    @staticmethod
+    def get_zipfile(path):
+        zip_bank = ZipReader.zip_bank
+        if path not in zip_bank:
+            zfile = zipfile.ZipFile(path, 'r')
+            zip_bank[path] = zfile
+        return zip_bank[path]
+
+    @staticmethod
+    def split_zip_style_path(path):
+        # use find() rather than index() so the assert below can actually fire
+        pos_at = path.find('@')
+        assert pos_at != -1, "character '@' is not found from the given path '%s'" % path
+
+        zip_path = path[0: pos_at]
+        folder_path = path[pos_at + 1:]
+        folder_path = str.strip(folder_path, '/')
+        return zip_path, folder_path
+
+    @staticmethod
+    def list_folder(path):
+        zip_path, folder_path = ZipReader.split_zip_style_path(path)
+
+        zfile = ZipReader.get_zipfile(zip_path)
+        folder_list = []
+        for file_folder_name in zfile.namelist():
+            file_folder_name = str.strip(file_folder_name, '/')
+            if file_folder_name.startswith(folder_path) and \
+               len(os.path.splitext(file_folder_name)[-1]) == 0 and \
+               file_folder_name != folder_path:
+                if len(folder_path) == 0:
+                    folder_list.append(file_folder_name)
+                else:
+                    folder_list.append(file_folder_name[len(folder_path) + 1:])
+
+        return folder_list
+
+    @staticmethod
+    def list_files(path, extension=None):
+        if extension is None:
+            extension = ['.*']
+        zip_path, folder_path = ZipReader.split_zip_style_path(path)
+
+        zfile = ZipReader.get_zipfile(zip_path)
+        file_lists = []
+        for file_folder_name in zfile.namelist():
+            file_folder_name = str.strip(file_folder_name, '/')
+            if file_folder_name.startswith(folder_path) and \
+               str.lower(os.path.splitext(file_folder_name)[-1]) in extension:
+                if len(folder_path) == 0:
+                    file_lists.append(file_folder_name)
+                else:
+                    file_lists.append(file_folder_name[len(folder_path) + 1:])
+
+        return file_lists
+
+    @staticmethod
+    def read(path):
+        zip_path, path_img = ZipReader.split_zip_style_path(path)
+        zfile = ZipReader.get_zipfile(zip_path)
+        data = zfile.read(path_img)
+        return data
+
+    @staticmethod
+    def imread(path):
+        zip_path, path_img = ZipReader.split_zip_style_path(path)
+        zfile = ZipReader.get_zipfile(zip_path)
+        data = zfile.read(path_img)
+        try:
+            im = Image.open(io.BytesIO(data))
+        except Exception:
+            print("ERROR IMG LOADED: ", path_img)
+            random_img = np.random.rand(224, 224, 3) * 255
+            im = Image.fromarray(np.uint8(random_img))
+        return im
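The '@' convention splits an archive path from a member path, e.g. 'train.zip@/class_a/img_0.jpg' (the archive and member names here are made up for illustration). A short sketch under that assumption:

    from data.zipreader import ZipReader, is_zip_path

    path = 'train.zip@/class_a/img_0.jpg'
    assert is_zip_path(path)
    zip_path, member = ZipReader.split_zip_style_path(path)   # ('train.zip', 'class_a/img_0.jpg')
    img = ZipReader.imread(path)   # PIL image; decode failures fall back to a random 224x224 image

Opened archives are cached in ZipReader.zip_bank, so each zip is opened only once per process.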
figs/overview.png
ADDED
(binary file: model overview figure)
get_flops.py
ADDED
@@ -0,0 +1,62 @@
+import argparse
+import torch
+from timm.models import create_model
+from models.CoAt import *
+
+try:
+    from mmcv.cnn import get_model_complexity_info
+    from mmcv.cnn.utils.flops_counter import get_model_complexity_info, flops_to_string, params_to_string
+except ImportError:
+    raise ImportError('Please upgrade mmcv to >0.6.2')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Get FLOPS of a classification model')
+    parser.add_argument('model', help='model name (resolvable by timm create_model)')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[224,],
+        help='input image size')
+    args = parser.parse_args()
+    return args
+
+def get_flops(model, input_shape):
+    flops, params = get_model_complexity_info(model, input_shape, as_strings=False)
+    return flops_to_string(flops), params_to_string(params)
+
+
+def main():
+    args = parse_args()
+
+    if len(args.shape) == 1:
+        input_shape = (3, args.shape[0], args.shape[0])
+    elif len(args.shape) == 2:
+        input_shape = (3,) + tuple(args.shape)
+    else:
+        raise ValueError('invalid input shape')
+
+    model = create_model(
+        args.model,
+        pretrained=False,
+        num_classes=1000,
+        img_size=args.shape[0],
+    )
+    model.name = args.model
+    if torch.cuda.is_available():
+        model.cuda()
+    model.eval()
+
+    flops, params = get_flops(model, input_shape)
+
+    split_line = '=' * 30
+    print(f'{split_line}\nInput shape: {input_shape}\n'
+          f'Flops: {flops}\nParams: {params}\n{split_line}')
+    print('!!!Please be cautious if you use the results in papers. '
+          'You may need to check if all ops are supported and verify that the '
+          'flops computation is correct.')
+
+
+if __name__ == '__main__':
+    main()
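The script expects a model name that timm's create_model can resolve and is run as, e.g., `python get_flops.py <model_name> --shape 224` (a single int gives a square (3, H, H) input; two ints give (3, H, W)). The core call is mmcv's complexity counter; a stand-alone sketch with a torchvision model as a stand-in:

    from mmcv.cnn import get_model_complexity_info
    from torchvision.models import resnet18   # stand-in model; the script builds its model via timm

    model = resnet18().eval()
    # as_strings=False returns raw floats, matching get_flops() above
    flops, params = get_model_complexity_info(model, (3, 224, 224), as_strings=False)
    print(f'{flops / 1e9:.2f} GFLOPs, {params / 1e6:.2f} M params')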
logger.py
ADDED
@@ -0,0 +1,47 @@
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+
+import os
+import sys
+import logging
+import functools
+from termcolor import colored
+
+
+@functools.lru_cache()
+def create_logger(output_dir, dist_rank=0, name='', local_rank=0):
+    # create logger
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.propagate = False
+
+    # create formatter
+    fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s'
+    color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \
+        colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s'
+
+    # create console handlers for master process
+    # if dist_rank == 0:
+    #     console_handler = logging.StreamHandler(sys.stdout)
+    #     console_handler.setLevel(logging.DEBUG)
+    #     console_handler.setFormatter(
+    #         logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+    #     logger.addHandler(console_handler)
+
+    if local_rank == 0:
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setLevel(logging.DEBUG)
+        console_handler.setFormatter(
+            logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+        logger.addHandler(console_handler)
+    # create file handlers
+    file_handler = logging.FileHandler(os.path.join(output_dir, f'log_rank{dist_rank}.txt'), mode='a')
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S'))
+    logger.addHandler(file_handler)
+
+    return logger
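Because create_logger is wrapped in functools.lru_cache, repeated calls with the same arguments return the same logger without stacking duplicate handlers. A minimal sketch (the output directory name is arbitrary; main.py creates config.OUTPUT before logging):

    import os
    from logger import create_logger

    os.makedirs('output', exist_ok=True)
    logger = create_logger(output_dir='output', dist_rank=0, name='demo', local_rank=0)
    logger.info('echoed to stdout (local_rank 0) and appended to output/log_rank0.txt')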
lr_scheduler.py
ADDED
@@ -0,0 +1,102 @@
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# --------------------------------------------------------
+
+import torch
+from timm.scheduler.cosine_lr import CosineLRScheduler
+from timm.scheduler.step_lr import StepLRScheduler
+from timm.scheduler.scheduler import Scheduler
+
+
+def build_scheduler(config, optimizer, n_iter_per_epoch):
+    num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch)
+    warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch)
+    decay_steps = int(config.TRAIN.LR_SCHEDULER.DECAY_EPOCHS * n_iter_per_epoch)
+
+    lr_scheduler = None
+    if config.TRAIN.LR_SCHEDULER.NAME == 'cosine':
+        lr_scheduler = CosineLRScheduler(
+            optimizer,
+            t_initial=num_steps,
+            t_mul=1.,
+            lr_min=config.TRAIN.MIN_LR,
+            warmup_lr_init=config.TRAIN.WARMUP_LR,
+            warmup_t=warmup_steps,
+            cycle_limit=1,
+            t_in_epochs=False,
+        )
+    elif config.TRAIN.LR_SCHEDULER.NAME == 'linear':
+        lr_scheduler = LinearLRScheduler(
+            optimizer,
+            t_initial=num_steps,
+            lr_min_rate=0.01,
+            warmup_lr_init=config.TRAIN.WARMUP_LR,
+            warmup_t=warmup_steps,
+            t_in_epochs=False,
+        )
+    elif config.TRAIN.LR_SCHEDULER.NAME == 'step':
+        lr_scheduler = StepLRScheduler(
+            optimizer,
+            decay_t=decay_steps,
+            decay_rate=config.TRAIN.LR_SCHEDULER.DECAY_RATE,
+            warmup_lr_init=config.TRAIN.WARMUP_LR,
+            warmup_t=warmup_steps,
+            t_in_epochs=False,
+        )
+
+    return lr_scheduler
+
+
+class LinearLRScheduler(Scheduler):
+    def __init__(self,
+                 optimizer: torch.optim.Optimizer,
+                 t_initial: int,
+                 lr_min_rate: float,
+                 warmup_t=0,
+                 warmup_lr_init=0.,
+                 t_in_epochs=True,
+                 noise_range_t=None,
+                 noise_pct=0.67,
+                 noise_std=1.0,
+                 noise_seed=42,
+                 initialize=True,
+                 ) -> None:
+        super().__init__(
+            optimizer, param_group_field="lr",
+            noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed,
+            initialize=initialize)
+
+        self.t_initial = t_initial
+        self.lr_min_rate = lr_min_rate
+        self.warmup_t = warmup_t
+        self.warmup_lr_init = warmup_lr_init
+        self.t_in_epochs = t_in_epochs
+        if self.warmup_t:
+            self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
+            super().update_groups(self.warmup_lr_init)
+        else:
+            self.warmup_steps = [1 for _ in self.base_values]
+
+    def _get_lr(self, t):
+        if t < self.warmup_t:
+            lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
+        else:
+            t = t - self.warmup_t
+            total_t = self.t_initial - self.warmup_t
+            lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values]
+        return lrs
+
+    def get_epoch_values(self, epoch: int):
+        if self.t_in_epochs:
+            return self._get_lr(epoch)
+        else:
+            return None
+
+    def get_update_values(self, num_updates: int):
+        if not self.t_in_epochs:
+            return self._get_lr(num_updates)
+        else:
+            return None
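After warmup, LinearLRScheduler decays each base value v linearly from v down to v * lr_min_rate over the remaining t_initial - warmup_t steps. A hedged numeric check of _get_lr, assuming warmup_lr_init = 0, v = 1e-3, lr_min_rate = 0.01, warmup_t = 100 and t_initial = 1100:

    v, lr_min_rate, warmup_t, t_initial = 1e-3, 0.01, 100, 1100
    for t in (0, 100, 600, 1100):
        if t < warmup_t:
            lr = 0.0 + t * (v - 0.0) / warmup_t            # linear warmup from 0
        else:
            frac = (t - warmup_t) / (t_initial - warmup_t)
            lr = v - (v - v * lr_min_rate) * frac          # linear decay to v * lr_min_rate
        print(t, lr)
    # 0 -> 0.0, 100 -> 1e-3, 600 -> 5.05e-4, 1100 -> 1e-5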
main.py
ADDED
@@ -0,0 +1,403 @@
+import os
+import time
+import argparse
+import datetime
+import numpy as np
+
+import torch
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+
+from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
+from timm.utils import accuracy, AverageMeter
+
+from config import get_config
+from models import build_model
+from data import build_loader
+from lr_scheduler import build_scheduler
+from optimizer import build_optimizer
+from logger import create_logger
+from utils import load_checkpoint, save_checkpoint, get_grad_norm, auto_resume_helper, reduce_tensor, load_pretained
+from torch.utils.tensorboard import SummaryWriter
+try:
+    # noinspection PyUnresolvedReferences
+    from apex import amp
+except ImportError:
+    amp = None
+
+
+def parse_option():
+    parser = argparse.ArgumentParser('MetaFG training and evaluation script', add_help=False)
+    parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file')
+    parser.add_argument(
+        "--opts",
+        help="Modify config options by adding 'KEY VALUE' pairs. ",
+        default=None,
+        nargs='+',
+    )
+
+    # easy config modification
+    parser.add_argument('--batch-size', type=int, help="batch size for single GPU")
+    parser.add_argument('--data-path', default='./imagenet', type=str, help='path to dataset')
+    parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset')
+    parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'],
+                        help='no: no cache, '
+                             'full: cache all data, '
+                             'part: sharding the dataset into nonoverlapping pieces and only cache one piece')
+    parser.add_argument('--resume', help='resume from checkpoint')
+    parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps")
+    parser.add_argument('--use-checkpoint', action='store_true',
+                        help="whether to use gradient checkpointing to save memory")
+    parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'],
+                        help='mixed precision opt level, if O0, no amp is used')
+    parser.add_argument('--output', default='output', type=str, metavar='PATH',
+                        help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)')
+    parser.add_argument('--tag', help='tag of experiment')
+    parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
+    parser.add_argument('--throughput', action='store_true', help='Test throughput only')
+
+    parser.add_argument('--num-workers', type=int,
+                        help='num of workers for the dataloader')
+
+    parser.add_argument('--lr', type=float, metavar='LR',
+                        help='learning rate')
+    parser.add_argument('--weight-decay', type=float,
+                        help='weight decay (default: 0.05 for adamw)')
+
+    parser.add_argument('--min-lr', type=float,
+                        help='minimum learning rate')
+    parser.add_argument('--warmup-lr', type=float,
+                        help='warmup learning rate')
+    parser.add_argument('--epochs', type=int,
+                        help='number of training epochs')
+    parser.add_argument('--warmup-epochs', type=int,
+                        help='number of warmup epochs')
+
+    parser.add_argument('--dataset', type=str,
+                        help='dataset name')
+    parser.add_argument('--lr-scheduler-name', type=str,
+                        help='lr scheduler name: cosine, linear or step')
+
+    parser.add_argument('--pretrain', type=str,
+                        help='path to pretrained weights')
+
+    parser.add_argument('--tensorboard', action='store_true', help='using tensorboard')
+
+    # distributed training
+    parser.add_argument("--local_rank", type=int, required=True, help='local rank for DistributedDataParallel')
+
+    args, unparsed = parser.parse_known_args()
+
+    config = get_config(args)
+
+    return args, config
+
+
+def main(config):
+    dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn = build_loader(config)
+    logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}")
+    model = build_model(config)
+    model.cuda()
+    logger.info(str(model))
+
+    optimizer = build_optimizer(config, model)
+    if config.AMP_OPT_LEVEL != "O0":
+        model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL)
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False)
+    model_without_ddp = model.module
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    logger.info(f"number of params: {n_parameters}")
+    if hasattr(model_without_ddp, 'flops'):
+        flops = model_without_ddp.flops()
+        logger.info(f"number of GFLOPs: {flops / 1e9}")
+    lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train))
+    if config.AUG.MIXUP > 0.:
+        # smoothing is handled with mixup label transform
+        criterion = SoftTargetCrossEntropy()
+    elif config.MODEL.LABEL_SMOOTHING > 0.:
+        criterion = LabelSmoothingCrossEntropy(smoothing=config.MODEL.LABEL_SMOOTHING)
+    else:
+        criterion = torch.nn.CrossEntropyLoss()
+
+    max_accuracy = 0.0
+    if config.MODEL.PRETRAINED:
+        load_pretained(config, model_without_ddp, logger)
+        if config.EVAL_MODE:
+            acc1, acc5, loss = validate(config, data_loader_val, model)
+            logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
+            return
+
+    if config.TRAIN.AUTO_RESUME:
+        resume_file = auto_resume_helper(config.OUTPUT)
+        if resume_file:
+            if config.MODEL.RESUME:
+                logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}")
+            config.defrost()
+            config.MODEL.RESUME = resume_file
+            config.freeze()
+            logger.info(f'auto resuming from {resume_file}')
+        else:
+            logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume')
+
+    if config.MODEL.RESUME:
+        logger.info(f"**********normal test***********")
+        max_accuracy = load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger)
+        acc1, acc5, loss = validate(config, data_loader_val, model)
+        logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
+        if config.DATA.ADD_META:
+            logger.info(f"**********mask meta test***********")
+            acc1, acc5, loss = validate(config, data_loader_val, model, mask_meta=True)
+            logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
+        if config.EVAL_MODE:
+            return
+
+    if config.THROUGHPUT_MODE:
+        throughput(data_loader_val, model, logger)
+        return
+
+    logger.info("Start training")
+    start_time = time.time()
+    for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS):
+        data_loader_train.sampler.set_epoch(epoch)
+        train_one_epoch_local_data(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
+        if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)):
+            save_checkpoint(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger)
+
+        logger.info(f"**********normal test***********")
+        acc1, acc5, loss = validate(config, data_loader_val, model)
+        logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
+        max_accuracy = max(max_accuracy, acc1)
+        logger.info(f'Max accuracy: {max_accuracy:.2f}%')
+        if config.DATA.ADD_META:
+            logger.info(f"**********mask meta test***********")
+            acc1, acc5, loss = validate(config, data_loader_val, model, mask_meta=True)
+            logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%")
+    # data_loader_train.terminate()
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    logger.info('Training time {}'.format(total_time_str))
+def train_one_epoch_local_data(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler, tb_logger=None):
+    model.train()
+    if hasattr(model.module, 'cur_epoch'):
+        model.module.cur_epoch = epoch
+        model.module.total_epoch = config.TRAIN.EPOCHS
+    optimizer.zero_grad()
+
+    num_steps = len(data_loader)
+    batch_time = AverageMeter()
+    loss_meter = AverageMeter()
+    norm_meter = AverageMeter()
+
+    start = time.time()
+    end = time.time()
+    for idx, data in enumerate(data_loader):
+        if config.DATA.ADD_META:
+            samples, targets, meta = data
+            meta = [m.float() for m in meta]
+            meta = torch.stack(meta, dim=0)
+            meta = meta.cuda(non_blocking=True)
+        else:
+            samples, targets = data
+            meta = None
+
+        samples = samples.cuda(non_blocking=True)
+        targets = targets.cuda(non_blocking=True)
+
+        if mixup_fn is not None:
+            samples, targets = mixup_fn(samples, targets)
+        if config.DATA.ADD_META:
+            outputs = model(samples, meta)
+        else:
+            outputs = model(samples)
+
+        if config.TRAIN.ACCUMULATION_STEPS > 1:
+            loss = criterion(outputs, targets)
+            loss = loss / config.TRAIN.ACCUMULATION_STEPS
+            if config.AMP_OPT_LEVEL != "O0":
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                if config.TRAIN.CLIP_GRAD:
+                    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD)
+                else:
+                    grad_norm = get_grad_norm(amp.master_params(optimizer))
+            else:
+                loss.backward()
+                if config.TRAIN.CLIP_GRAD:
+                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
+                else:
+                    grad_norm = get_grad_norm(model.parameters())
+            if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0:
+                optimizer.step()
+                optimizer.zero_grad()
+                lr_scheduler.step_update(epoch * num_steps + idx)
+        else:
+            loss = criterion(outputs, targets)
+            optimizer.zero_grad()
+            if config.AMP_OPT_LEVEL != "O0":
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                if config.TRAIN.CLIP_GRAD:
+                    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD)
+                else:
+                    grad_norm = get_grad_norm(amp.master_params(optimizer))
+            else:
+                loss.backward()
+                if config.TRAIN.CLIP_GRAD:
+                    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD)
+                else:
+                    grad_norm = get_grad_norm(model.parameters())
+            optimizer.step()
+            lr_scheduler.step_update(epoch * num_steps + idx)
+
+        torch.cuda.synchronize()
+
+        loss_meter.update(loss.item(), targets.size(0))
+        norm_meter.update(grad_norm)
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if idx % config.PRINT_FREQ == 0:
+            lr = optimizer.param_groups[0]['lr']
+            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
+            etas = batch_time.avg * (num_steps - idx)
+            logger.info(
+                f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t'
+                f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t'
+                f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
+                f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
+                f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t'
+                f'mem {memory_used:.0f}MB')
+    epoch_time = time.time() - start
+    logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}")
+@torch.no_grad()
+def validate(config, data_loader, model, mask_meta=False):
+    criterion = torch.nn.CrossEntropyLoss()
+    model.eval()
+
+    batch_time = AverageMeter()
+    loss_meter = AverageMeter()
+    acc1_meter = AverageMeter()
+    acc5_meter = AverageMeter()
+
+    end = time.time()
+    for idx, data in enumerate(data_loader):
+        if config.DATA.ADD_META:
+            images, target, meta = data
+            meta = [m.float() for m in meta]
+            meta = torch.stack(meta, dim=0)
+            if mask_meta:
+                meta = torch.zeros_like(meta)
+            meta = meta.cuda(non_blocking=True)
+        else:
+            images, target = data
+            meta = None
+
+        images = images.cuda(non_blocking=True)
+        target = target.cuda(non_blocking=True)
+
+        # compute output
+        if config.DATA.ADD_META:
+            output = model(images, meta)
+        else:
+            output = model(images)
+
+        # measure accuracy and record loss
+        loss = criterion(output, target)
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+
+        acc1 = reduce_tensor(acc1)
+        acc5 = reduce_tensor(acc5)
+        loss = reduce_tensor(loss)
+
+        loss_meter.update(loss.item(), target.size(0))
+        acc1_meter.update(acc1.item(), target.size(0))
+        acc5_meter.update(acc5.item(), target.size(0))
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if idx % config.PRINT_FREQ == 0:
+            memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
+            logger.info(
+                f'Test: [{idx}/{len(data_loader)}]\t'
+                f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t'
+                f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t'
+                f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t'
+                f'Mem {memory_used:.0f}MB')
+    logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}')
+    return acc1_meter.avg, acc5_meter.avg, loss_meter.avg
+
+
+@torch.no_grad()
+def throughput(data_loader, model, logger):
+    model.eval()
+
+    for idx, (images, _) in enumerate(data_loader):
+        images = images.cuda(non_blocking=True)
+        batch_size = images.shape[0]
+        for i in range(50):
+            model(images)
+        torch.cuda.synchronize()
+        logger.info("throughput averaged over 30 runs")
+        tic1 = time.time()
+        for i in range(30):
+            model(images)
+        torch.cuda.synchronize()
+        tic2 = time.time()
+        logger.info(f"batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}")
+        return
+
+
+if __name__ == '__main__':
+    _, config = parse_option()
+
+    if config.AMP_OPT_LEVEL != "O0":
+        assert amp is not None, "amp not installed!"
+
+    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        rank = int(os.environ["RANK"])
+        world_size = int(os.environ['WORLD_SIZE'])
+        print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}")
+    else:
+        rank = -1
+        world_size = -1
+    torch.cuda.set_device(config.LOCAL_RANK)
+    torch.distributed.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
+    torch.distributed.barrier()
+
+    seed = config.SEED + dist.get_rank()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    cudnn.benchmark = True
+
+    # linear scale the learning rate according to total batch size, may not be optimal
+    linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
+    linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
+    linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0
+    # gradient accumulation also needs to scale the learning rate
+    if config.TRAIN.ACCUMULATION_STEPS > 1:
+        linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS
+        linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS
+        linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS
+    config.defrost()
+    config.TRAIN.BASE_LR = linear_scaled_lr
+    config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr
+    config.TRAIN.MIN_LR = linear_scaled_min_lr
+    config.freeze()
+
+    os.makedirs(config.OUTPUT, exist_ok=True)
+    logger = create_logger(output_dir=config.OUTPUT, dist_rank=dist.get_rank(), name=f"{config.MODEL.NAME}", local_rank=config.LOCAL_RANK)
+
+    if dist.get_rank() == 0:
+        path = os.path.join(config.OUTPUT, "config.json")
+        with open(path, "w") as f:
+            f.write(config.dump())
+        logger.info(f"Full config saved to {path}")
+
+    # print config
+    logger.info(config.dump())
+
+    main(config)
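The `--local_rank` argument is supplied by the launcher, so a typical (illustrative) invocation is `python -m torch.distributed.launch --nproc_per_node 8 main.py --cfg configs/MetaFG_0_224.yaml --batch-size 32 --data-path ./imagenet`. The linear LR scaling in `__main__` normalizes to a reference total batch of 512; for example, assuming BASE_LR = 5e-4 (a placeholder value, not shown in this commit), 32 images per GPU, 8 GPUs and no gradient accumulation:

    base_lr, batch_per_gpu, world_size, accum_steps = 5e-4, 32, 8, 1
    scaled_lr = base_lr * batch_per_gpu * world_size / 512.0 * accum_steps
    print(scaled_lr)   # 2.5e-4: the total batch of 256 is half of 512, so the LR is halved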
models/MBConv.py
ADDED
@@ -0,0 +1,169 @@
+import math
+from functools import partial
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+class SwishImplementation(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, i):
+        result = i * torch.sigmoid(i)
+        ctx.save_for_backward(i)
+        return result
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        i = ctx.saved_tensors[0]
+        sigmoid_i = torch.sigmoid(i)
+        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
+
+class MemoryEfficientSwish(nn.Module):
+    def forward(self, x):
+        return SwishImplementation.apply(x)
+
+
+def drop_connect(inputs, p, training):
+    """ Drop connect: randomly drop whole samples and rescale the survivors. """
+    if not training: return inputs
+    batch_size = inputs.shape[0]
+    keep_prob = 1 - p
+    random_tensor = keep_prob
+    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
+    binary_tensor = torch.floor(random_tensor)
+    output = inputs / keep_prob * binary_tensor
+    return output
+
+
+def get_same_padding_conv2d(image_size=None):
+    return partial(Conv2dStaticSamePadding, image_size=image_size)
+
+def get_width_and_height_from_size(x):
+    """ Obtains width and height from an int or tuple """
+    if isinstance(x, int): return x, x
+    if isinstance(x, list) or isinstance(x, tuple): return x
+    else: raise TypeError()
+
+def calculate_output_image_size(input_image_size, stride):
+    """
+    Calculates the output image size of a Conv2dSamePadding with a given stride.
+    """
+    if input_image_size is None: return None
+    image_height, image_width = get_width_and_height_from_size(input_image_size)
+    stride = stride if isinstance(stride, int) else stride[0]
+    image_height = int(math.ceil(image_height / stride))
+    image_width = int(math.ceil(image_width / stride))
+    return [image_height, image_width]
+
+
+class Conv2dStaticSamePadding(nn.Conv2d):
+    """ 2D convolution with TensorFlow-style 'same' padding, for a fixed image size """
+
+    def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs):
+        super().__init__(in_channels, out_channels, kernel_size, **kwargs)
+        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
+
+        # Calculate padding based on image size and save it
+        assert image_size is not None
+        ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
+        kh, kw = self.weight.size()[-2:]
+        sh, sw = self.stride
+        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
+        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
+        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        if pad_h > 0 or pad_w > 0:
+            self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
+        else:
+            self.static_padding = Identity()
+
+    def forward(self, x):
+        x = self.static_padding(x)
+        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+        return x
+
+class Identity(nn.Module):
+    def __init__(self, ):
+        super(Identity, self).__init__()
+
+    def forward(self, input):
+        return input
+
+# MBConvBlock
+class MBConvBlock(nn.Module):
+    '''
+    Mobile inverted bottleneck block, e.g. ksize 3*3, 32 input channels, 16 output channels, stride 1.
+    '''
+    def __init__(self, ksize, input_filters, output_filters, expand_ratio=1, stride=1, image_size=224, drop_connect_rate=0.):
+        super().__init__()
+        self._bn_mom = 0.1
+        self._bn_eps = 0.01
+        self._se_ratio = 0.25
+        self._input_filters = input_filters
+        self._output_filters = output_filters
+        self._expand_ratio = expand_ratio
+        self._kernel_size = ksize
+        self._stride = stride
+        self._drop_connect_rate = drop_connect_rate
+        inp = self._input_filters
+        oup = self._input_filters * self._expand_ratio
+        if self._expand_ratio != 1:
+            self._expand_conv = nn.Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
+            self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
+
+        # Depthwise convolution (padding=1 assumes a 3x3 kernel)
+        k = self._kernel_size
+        s = self._stride
+        self._depthwise_conv = nn.Conv2d(in_channels=oup, out_channels=oup, groups=oup,
+                                         kernel_size=k, stride=s, padding=1, bias=False)
+
+        self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
+        # Squeeze and Excitation layer, if desired
+        num_squeezed_channels = max(1, int(self._input_filters * self._se_ratio))
+        self._se_reduce = nn.Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
+        self._se_expand = nn.Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
+
+        # Output phase
+        final_oup = self._output_filters
+        self._project_conv = nn.Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
+        self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
+        self._swish = MemoryEfficientSwish()
+
+    def forward(self, inputs):
+        """
+        :param inputs: input tensor
+        :return: output of block
+        """
+
+        # Expansion and Depthwise Convolution
+        x = inputs
+        if self._expand_ratio != 1:
+            expand = self._expand_conv(inputs)
+            bn0 = self._bn0(expand)
+            x = self._swish(bn0)
+        depthwise = self._depthwise_conv(x)
+        bn1 = self._bn1(depthwise)
+        x = self._swish(bn1)
+        # Squeeze and Excitation
+        x_squeezed = F.adaptive_avg_pool2d(x, 1)
+        x_squeezed = self._se_reduce(x_squeezed)
+        x_squeezed = self._swish(x_squeezed)
+        x_squeezed = self._se_expand(x_squeezed)
+        x = torch.sigmoid(x_squeezed) * x
+
+        x = self._bn2(self._project_conv(x))
+
+        # Skip connection and drop connect
+        input_filters, output_filters = self._input_filters, self._output_filters
+        if self._stride == 1 and input_filters == output_filters:
+            if self._drop_connect_rate != 0:
+                x = drop_connect(x, p=self._drop_connect_rate, training=self.training)
+            x = x + inputs  # skip connection
+        return x
+if __name__ == '__main__':
+    input = torch.randn(1, 3, 112, 112)
+    mbconv = MBConvBlock(ksize=3, input_filters=3, output_filters=3, expand_ratio=4, stride=1)
+    print(mbconv)
+    out = mbconv(input)
+    print(out.shape)
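drop_connect keeps each sample with probability 1 - p (the floor of keep_prob + U[0,1) is a Bernoulli(keep_prob) draw) and rescales survivors by 1/keep_prob, so the expected activation is unchanged. A quick sanity check of that invariant:

    import torch
    from models.MBConv import drop_connect

    x = torch.ones(10000, 1, 1, 1)
    y = drop_connect(x, p=0.2, training=True)
    print(y.mean().item())                    # ~1.0: expectation is preserved
    print((y == 0).float().mean().item())     # ~0.2: fraction of dropped samples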
models/MHSA.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x, H=None, W=None):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)
        return x

class Relative_Attention(nn.Module):
    def __init__(self, dim, img_size, extra_token_num=1, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        self.extra_token_num = extra_token_num
        head_dim = dim // num_heads
        self.img_size = img_size  # h, w
        self.scale = qk_scale or head_dim ** -0.5
        # define a parameter table of relative position bias, plus one entry for the cls-token bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * img_size[0] - 1) * (2 * img_size[1] - 1) + 1, num_heads))  # (2h-1)*(2w-1)+1, nH

        # get pair-wise relative position index for each token
        coords_h = torch.arange(self.img_size[0])
        coords_w = torch.arange(self.img_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, h, w
        coords_flatten = torch.flatten(coords, 1)  # 2, h*w
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, h*w, h*w
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # h*w, h*w, 2
        relative_coords[:, :, 0] += self.img_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.img_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.img_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # h*w, h*w
        relative_position_index = F.pad(relative_position_index, (extra_token_num, 0, extra_token_num, 0))
        relative_position_index = relative_position_index.long()
        self.register_buffer("relative_position_index", relative_position_index)
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """
        Args:
            x: input features with shape of (B, N, C)
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.img_size[0] * self.img_size[1] + self.extra_token_num,
            self.img_size[0] * self.img_size[1] + self.extra_token_num, -1)  # h*w+extra, h*w+extra, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, h*w+extra, h*w+extra
        attn = attn + relative_position_bias.unsqueeze(0)

        attn = self.softmax(attn)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

class OverlapPatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
                              padding=(patch_size[0] // 2, patch_size[1] // 2))
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x, H, W

class MHSABlock(nn.Module):
    def __init__(self, input_dim, output_dim, image_size, stride, num_heads, extra_token_num=1, mlp_ratio=4.,
                 qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        if stride != 1:
            self.patch_embed = OverlapPatchEmbed(patch_size=3, stride=stride, in_chans=input_dim, embed_dim=output_dim)
            self.img_size = image_size // 2
        else:
            self.patch_embed = None
            self.img_size = image_size
        self.img_size = to_2tuple(self.img_size)

        self.norm1 = norm_layer(output_dim)
        self.attn = Relative_Attention(
            output_dim, self.img_size, extra_token_num=extra_token_num, num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(output_dim)
        mlp_hidden_dim = int(output_dim * mlp_ratio)
        self.mlp = Mlp(in_features=output_dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x, H, W, extra_tokens=None):
        # The extra tokens (cls/meta) are prepended only in the first block of a
        # stage, i.e. when the patch embedding downsamples the 4-D feature map;
        # later blocks receive the token sequence unchanged.
        if self.patch_embed is not None:
            x, _, _ = self.patch_embed(x)
            extra_tokens = [token.expand(x.shape[0], -1, -1) for token in extra_tokens]
            extra_tokens.append(x)
            x = torch.cat(extra_tokens, dim=1)
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x), H // 2, W // 2))
        return x
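For orientation, a minimal smoke test for MHSABlock with hypothetical shapes (this file ships no __main__ of its own). It mirrors how MetaFG.py drives the first block of an attention stage: a 4-D conv feature map goes in, a class token rides along, and the block downsamples by 2:

import torch
import torch.nn as nn

# Hypothetical shapes: a 28x28 feature map with 192 channels entering an
# attention stage that downsamples to 14x14 with 384 channels.
block = MHSABlock(input_dim=192, output_dim=384, image_size=28, stride=2,
                  num_heads=8, extra_token_num=1)
cls_token = nn.Parameter(torch.zeros(1, 1, 384))
feat = torch.randn(2, 192, 28, 28)        # B, C, H, W from the conv stages
out = block(feat, 28, 28, [cls_token])    # H, W refer to the input resolution
print(out.shape)                          # torch.Size([2, 197, 384]) = 1 cls + 14*14 tokens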
models/MetaFG.py
ADDED
@@ -0,0 +1,213 @@
import math
import torch
import torch.nn as nn

from timm.models.helpers import load_pretrained
from timm.models.registry import register_model
from timm.models.layers import trunc_normal_
import numpy as np
from .MBConv import MBConvBlock
from .MHSA import MHSABlock, Mlp

def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225),
        'classifier': 'head',
        **kwargs
    }

default_cfgs = {
    'MetaFG_0': _cfg(),
    'MetaFG_1': _cfg(),
    'MetaFG_2': _cfg(),
}

def make_blocks(stage_index, depths, embed_dims, img_size, dpr, extra_token_num=1, num_heads=8, mlp_ratio=4., stage_type='conv'):
    stage_name = f'stage_{stage_index}'
    blocks = []
    for block_idx in range(depths[stage_index]):
        stride = 2 if block_idx == 0 and stage_index != 1 else 1
        in_chans = embed_dims[stage_index] if block_idx != 0 else embed_dims[stage_index - 1]
        out_chans = embed_dims[stage_index]
        image_size = img_size if block_idx == 0 or stage_index == 1 else img_size // 2
        drop_path_rate = dpr[sum(depths[1:stage_index]) + block_idx]
        if stage_type == 'conv':
            blocks.append(MBConvBlock(ksize=3, input_filters=in_chans, output_filters=out_chans,
                                      image_size=image_size, expand_ratio=int(mlp_ratio), stride=stride,
                                      drop_connect_rate=drop_path_rate))
        elif stage_type == 'mhsa':
            blocks.append(MHSABlock(input_dim=in_chans, output_dim=out_chans,
                                    image_size=image_size, stride=stride, num_heads=num_heads,
                                    extra_token_num=extra_token_num, mlp_ratio=mlp_ratio,
                                    drop_path=drop_path_rate))
        else:
            raise NotImplementedError("We only support conv and mhsa")
    return blocks


class MetaFG(nn.Module):
    def __init__(self, img_size=224, in_chans=3, num_classes=1000,
                 conv_embed_dims=[64, 96, 192], attn_embed_dims=[384, 768],
                 conv_depths=[2, 2, 3], attn_depths=[5, 2], num_heads=32, extra_token_num=1, mlp_ratio=4.,
                 conv_norm_layer=nn.BatchNorm2d, attn_norm_layer=nn.LayerNorm,
                 conv_act_layer=nn.ReLU, attn_act_layer=nn.GELU,
                 qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
                 meta_dims=[],
                 only_last_cls=False,
                 use_checkpoint=False):
        super().__init__()
        self.only_last_cls = only_last_cls
        self.img_size = img_size
        self.num_classes = num_classes
        stem_chs = (3 * (conv_embed_dims[0] // 4), conv_embed_dims[0])
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(conv_depths[1:] + attn_depths))]
        # stage_0
        self.stage_0 = nn.Sequential(*[
            nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False),
            conv_norm_layer(stem_chs[0]),
            conv_act_layer(inplace=True),
            nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False),
            conv_norm_layer(stem_chs[1]),
            conv_act_layer(inplace=True),
            nn.Conv2d(stem_chs[1], conv_embed_dims[0], 3, stride=1, padding=1, bias=False)])
        self.bn1 = conv_norm_layer(conv_embed_dims[0])
        self.act1 = conv_act_layer(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # stage_1
        self.stage_1 = nn.ModuleList(make_blocks(1, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 4,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='conv'))
        # stage_2
        self.stage_2 = nn.ModuleList(make_blocks(2, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 4,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='conv'))

        # stage_3
        self.cls_token_1 = nn.Parameter(torch.zeros(1, 1, attn_embed_dims[0]))
        self.stage_3 = nn.ModuleList(make_blocks(3, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 8,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='mhsa'))

        # stage_4
        self.cls_token_2 = nn.Parameter(torch.zeros(1, 1, attn_embed_dims[1]))
        self.stage_4 = nn.ModuleList(make_blocks(4, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 16,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='mhsa'))
        self.norm_2 = attn_norm_layer(attn_embed_dims[1])
        # Aggregate
        if not self.only_last_cls:
            self.cl_1_fc = nn.Sequential(*[Mlp(in_features=attn_embed_dims[0], out_features=attn_embed_dims[1]),
                                           attn_norm_layer(attn_embed_dims[1])])
            self.aggregate = torch.nn.Conv1d(in_channels=2, out_channels=1, kernel_size=1)
            self.norm_1 = attn_norm_layer(attn_embed_dims[0])
            self.norm = attn_norm_layer(attn_embed_dims[1])

        # Classifier head
        self.head = nn.Linear(attn_embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.cls_token_1, std=.02)
        trunc_normal_(self.cls_token_2, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            # fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            # fan_out //= m.groups
            # m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            # if m.bias is not None:
            #     m.bias.data.zero_()
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'cls_token_1', 'cls_token_2'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x, meta=None):
        extra_tokens_1 = [self.cls_token_1]
        extra_tokens_2 = [self.cls_token_2]
        B = x.shape[0]
        x = self.stage_0(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.maxpool(x)
        for blk in self.stage_1:
            x = blk(x)
        for blk in self.stage_2:
            x = blk(x)
        H0, W0 = self.img_size // 8, self.img_size // 8
        for ind, blk in enumerate(self.stage_3):
            if ind == 0:
                x = blk(x, H0, W0, extra_tokens_1)
            else:
                x = blk(x, H0, W0)
        if not self.only_last_cls:
            cls_1 = x[:, :1, :]
            cls_1 = self.norm_1(cls_1)
            cls_1 = self.cl_1_fc(cls_1)
        x = x[:, 1:, :]
        H1, W1 = self.img_size // 16, self.img_size // 16
        x = x.reshape(B, H1, W1, -1).permute(0, 3, 1, 2).contiguous()
        for ind, blk in enumerate(self.stage_4):
            if ind == 0:
                x = blk(x, H1, W1, extra_tokens_2)
            else:
                x = blk(x, H1, W1)
        cls_2 = x[:, :1, :]
        cls_2 = self.norm_2(cls_2)
        if not self.only_last_cls:
            cls = torch.cat((cls_1, cls_2), dim=1)  # B, 2, C
            cls = self.aggregate(cls).squeeze(dim=1)  # B, C
            cls = self.norm(cls)
        else:
            cls = cls_2.squeeze(dim=1)
        return cls

    def forward(self, x, meta=None):
        x = self.forward_features(x, meta)
        x = self.head(x)
        return x

@register_model
def MetaFG_0(pretrained=False, **kwargs):
    model = MetaFG(conv_embed_dims=[64, 96, 192], attn_embed_dims=[384, 768],
                   conv_depths=[2, 2, 3], attn_depths=[5, 2], num_heads=8, mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['MetaFG_0']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model

@register_model
def MetaFG_1(pretrained=False, **kwargs):
    model = MetaFG(conv_embed_dims=[64, 96, 192], attn_embed_dims=[384, 768],
                   conv_depths=[2, 2, 6], attn_depths=[14, 2], num_heads=8, mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['MetaFG_1']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model

@register_model
def MetaFG_2(pretrained=False, **kwargs):
    model = MetaFG(conv_embed_dims=[128, 128, 256], attn_embed_dims=[512, 1024],
                   conv_depths=[2, 2, 6], attn_depths=[14, 2], num_heads=8, mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['MetaFG_2']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model

if __name__ == "__main__":
    x = torch.randn([2, 3, 224, 224])
    model = MetaFG()
    # import ipdb; ipdb.set_trace()  # debugging hook, disabled so the smoke test runs non-interactively
    output = model(x)
    print(output.shape)
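Since the variants are registered with timm via @register_model, they can also be built by name. A minimal sketch, assuming this module has been imported so timm's registry is populated (num_classes=200 is an illustrative value):

import torch
from timm.models import create_model

model = create_model('MetaFG_0', pretrained=False, num_classes=200, img_size=224)
logits = model(torch.randn(2, 3, 224, 224))
print(logits.shape)  # torch.Size([2, 200])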
models/MetaFG_meta.py
ADDED
@@ -0,0 +1,268 @@
import math
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.helpers import load_pretrained
from timm.models.registry import register_model
from timm.models.layers import trunc_normal_
import numpy as np
from .MBConv import MBConvBlock
from .MHSA import MHSABlock, Mlp
from .meta_encoder import ResNormLayer

def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225),
        'classifier': 'head',
        **kwargs
    }

default_cfgs = {
    'MetaFG_0': _cfg(),
    'MetaFG_1': _cfg(),
    'MetaFG_2': _cfg(),
}

def make_blocks(stage_index, depths, embed_dims, img_size, dpr, extra_token_num=1, num_heads=8, mlp_ratio=4., stage_type='conv'):
    stage_name = f'stage_{stage_index}'
    blocks = []
    for block_idx in range(depths[stage_index]):
        stride = 2 if block_idx == 0 and stage_index != 1 else 1
        in_chans = embed_dims[stage_index] if block_idx != 0 else embed_dims[stage_index - 1]
        out_chans = embed_dims[stage_index]
        image_size = img_size if block_idx == 0 or stage_index == 1 else img_size // 2
        drop_path_rate = dpr[sum(depths[1:stage_index]) + block_idx]
        if stage_type == 'conv':
            blocks.append(MBConvBlock(ksize=3, input_filters=in_chans, output_filters=out_chans,
                                      image_size=image_size, expand_ratio=int(mlp_ratio), stride=stride,
                                      drop_connect_rate=drop_path_rate))
        elif stage_type == 'mhsa':
            blocks.append(MHSABlock(input_dim=in_chans, output_dim=out_chans,
                                    image_size=image_size, stride=stride, num_heads=num_heads,
                                    extra_token_num=extra_token_num, mlp_ratio=mlp_ratio,
                                    drop_path=drop_path_rate))
        else:
            raise NotImplementedError("We only support conv and mhsa")
    return blocks


class MetaFG_Meta(nn.Module):
    def __init__(self, img_size=224, in_chans=3, num_classes=1000,
                 conv_embed_dims=[64, 96, 192], attn_embed_dims=[384, 768],
                 conv_depths=[2, 2, 3], attn_depths=[5, 2], num_heads=32, extra_token_num=3, mlp_ratio=4.,
                 conv_norm_layer=nn.BatchNorm2d, attn_norm_layer=nn.LayerNorm,
                 conv_act_layer=nn.ReLU, attn_act_layer=nn.GELU,
                 qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
                 add_meta=True, meta_dims=[4, 3], mask_prob=1.0, mask_type='linear',
                 only_last_cls=False,
                 use_checkpoint=False):
        super().__init__()
        self.only_last_cls = only_last_cls
        self.img_size = img_size
        self.num_classes = num_classes
        self.add_meta = add_meta
        self.meta_dims = meta_dims
        self.cur_epoch = -1
        self.total_epoch = -1
        self.mask_prob = mask_prob
        self.mask_type = mask_type
        self.attn_embed_dims = attn_embed_dims
        self.extra_token_num = extra_token_num
        if self.add_meta:
            # assert len(meta_dims) == extra_token_num - 1
            for ind, meta_dim in enumerate(meta_dims):
                meta_head_1 = nn.Sequential(
                    nn.Linear(meta_dim, attn_embed_dims[0]),
                    nn.ReLU(inplace=True),
                    nn.LayerNorm(attn_embed_dims[0]),
                    ResNormLayer(attn_embed_dims[0]),
                ) if meta_dim > 0 else nn.Identity()
                meta_head_2 = nn.Sequential(
                    nn.Linear(meta_dim, attn_embed_dims[1]),
                    nn.ReLU(inplace=True),
                    nn.LayerNorm(attn_embed_dims[1]),
                    ResNormLayer(attn_embed_dims[1]),
                ) if meta_dim > 0 else nn.Identity()
                setattr(self, f"meta_{ind+1}_head_1", meta_head_1)
                setattr(self, f"meta_{ind+1}_head_2", meta_head_2)

        stem_chs = (3 * (conv_embed_dims[0] // 4), conv_embed_dims[0])
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(conv_depths[1:] + attn_depths))]
        # stage_0
        self.stage_0 = nn.Sequential(*[
            nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False),
            conv_norm_layer(stem_chs[0]),
            conv_act_layer(inplace=True),
            nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False),
            conv_norm_layer(stem_chs[1]),
            conv_act_layer(inplace=True),
            nn.Conv2d(stem_chs[1], conv_embed_dims[0], 3, stride=1, padding=1, bias=False)])
        self.bn1 = conv_norm_layer(conv_embed_dims[0])
        self.act1 = conv_act_layer(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # stage_1
        self.stage_1 = nn.ModuleList(make_blocks(1, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 4,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='conv'))
        # stage_2
        self.stage_2 = nn.ModuleList(make_blocks(2, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 4,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='conv'))

        # stage_3
        self.cls_token_1 = nn.Parameter(torch.zeros(1, 1, attn_embed_dims[0]))
        self.stage_3 = nn.ModuleList(make_blocks(3, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 8,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='mhsa'))
        # stage_4
        self.cls_token_2 = nn.Parameter(torch.zeros(1, 1, attn_embed_dims[1]))
        self.stage_4 = nn.ModuleList(make_blocks(4, conv_depths + attn_depths, conv_embed_dims + attn_embed_dims, img_size // 16,
                                                 dpr=dpr, num_heads=num_heads, extra_token_num=extra_token_num, mlp_ratio=mlp_ratio, stage_type='mhsa'))
        self.norm_2 = attn_norm_layer(attn_embed_dims[1])

        # Aggregate
        if not self.only_last_cls:
            self.cl_1_fc = nn.Sequential(*[Mlp(in_features=attn_embed_dims[0], out_features=attn_embed_dims[1]),
                                           attn_norm_layer(attn_embed_dims[1])])
            self.aggregate = torch.nn.Conv1d(in_channels=2, out_channels=1, kernel_size=1)
            self.norm = attn_norm_layer(attn_embed_dims[1])
            self.norm_1 = attn_norm_layer(attn_embed_dims[0])
        # Classifier head
        self.head = nn.Linear(attn_embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.cls_token_1, std=.02)
        trunc_normal_(self.cls_token_2, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            # fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            # fan_out //= m.groups
            # m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            # if m.bias is not None:
            #     m.bias.data.zero_()
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'cls_token_1', 'cls_token_2'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x, meta=None):
        B = x.shape[0]
        extra_tokens_1 = [self.cls_token_1]
        extra_tokens_2 = [self.cls_token_2]
        if self.add_meta:
            assert meta is not None, 'meta is None'
            if len(self.meta_dims) > 1:
                metas = torch.split(meta, self.meta_dims, dim=1)
            else:
                metas = (meta,)
            for ind, cur_meta in enumerate(metas):
                meta_head_1 = getattr(self, f"meta_{ind+1}_head_1")
                meta_head_2 = getattr(self, f"meta_{ind+1}_head_2")
                meta_1 = meta_head_1(cur_meta)
                meta_1 = meta_1.reshape(B, -1, self.attn_embed_dims[0])
                meta_2 = meta_head_2(cur_meta)
                meta_2 = meta_2.reshape(B, -1, self.attn_embed_dims[1])
                extra_tokens_1.append(meta_1)
                extra_tokens_2.append(meta_2)

        x = self.stage_0(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.maxpool(x)
        for blk in self.stage_1:
            x = blk(x)
        for blk in self.stage_2:
            x = blk(x)
        H0, W0 = self.img_size // 8, self.img_size // 8
        for ind, blk in enumerate(self.stage_3):
            if ind == 0:
                x = blk(x, H0, W0, extra_tokens_1)
            else:
                x = blk(x, H0, W0)
        if not self.only_last_cls:
            cls_1 = x[:, :1, :]
            cls_1 = self.norm_1(cls_1)
            cls_1 = self.cl_1_fc(cls_1)

        x = x[:, self.extra_token_num:, :]
        H1, W1 = self.img_size // 16, self.img_size // 16
        x = x.reshape(B, H1, W1, -1).permute(0, 3, 1, 2).contiguous()
        for ind, blk in enumerate(self.stage_4):
            if ind == 0:
                x = blk(x, H1, W1, extra_tokens_2)
            else:
                x = blk(x, H1, W1)
        cls_2 = x[:, :1, :]
        cls_2 = self.norm_2(cls_2)
        if not self.only_last_cls:
            cls = torch.cat((cls_1, cls_2), dim=1)  # B, 2, C
            cls = self.aggregate(cls).squeeze(dim=1)  # B, C
            cls = self.norm(cls)
        else:
            cls = cls_2.squeeze(dim=1)
        return cls

    def forward(self, x, meta=None):
        if meta is not None:
            if self.mask_type == 'linear':
                cur_mask_prob = self.mask_prob - self.cur_epoch / self.total_epoch
            else:
                cur_mask_prob = self.mask_prob
            if cur_mask_prob != 0 and self.training:
                mask = torch.ones_like(meta)
                mask_index = torch.randperm(meta.size(0))[:int(meta.size(0) * cur_mask_prob)]
                mask[mask_index] = 0
                meta = mask * meta
        x = self.forward_features(x, meta)
        x = self.head(x)
        return x

@register_model
def MetaFG_meta_0(pretrained=False, **kwargs):
    model = MetaFG_Meta(conv_embed_dims=[64, 96, 192], attn_embed_dims=[384, 768],
                        conv_depths=[2, 2, 3], attn_depths=[5, 2], num_heads=8, mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['MetaFG_0']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model

@register_model
def MetaFG_meta_1(pretrained=False, **kwargs):
    model = MetaFG_Meta(conv_embed_dims=[64, 96, 192], attn_embed_dims=[384, 768],
                        conv_depths=[2, 2, 6], attn_depths=[14, 2], num_heads=8, mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['MetaFG_1']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model

@register_model
def MetaFG_meta_2(pretrained=False, **kwargs):
    model = MetaFG_Meta(conv_embed_dims=[128, 128, 256], attn_embed_dims=[512, 1024],
                        conv_depths=[2, 2, 6], attn_depths=[14, 2], num_heads=8, mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['MetaFG_2']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model

if __name__ == "__main__":
    x = torch.randn([2, 3, 224, 224])
    meta = torch.randn([2, 7])
    model = MetaFG_Meta()
    # import ipdb; ipdb.set_trace()  # debugging hook, disabled so the smoke test runs non-interactively
    output = model(x, meta)
    print(output.shape)
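A minimal sketch of the meta-data path, mirroring the __main__ above: with the default meta_dims=[4, 3] the 7-dim meta vector is split into two chunks, each projected into one extra token per attention stage. What the chunks encode is dataset-specific and not fixed by this file:

import torch

model = MetaFG_Meta(meta_dims=[4, 3], extra_token_num=3)  # 1 cls token + 2 meta tokens
x = torch.randn(2, 3, 224, 224)
meta = torch.randn(2, 7)                                  # 4 + 3 = 7 meta features
logits = model(x, meta)
print(logits.shape)  # torch.Size([2, 1000])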
models/__init__.py
ADDED
@@ -0,0 +1 @@
from .build import build_model
models/build.py
ADDED
@@ -0,0 +1,20 @@
from timm.models import create_model
from .MetaFG import *
from .MetaFG_meta import *

def build_model(config):
    model_type = config.MODEL.TYPE
    if model_type == 'MetaFG':
        model = create_model(
            config.MODEL.NAME,
            pretrained=False,
            num_classes=config.MODEL.NUM_CLASSES,
            drop_path_rate=config.MODEL.DROP_PATH_RATE,
            img_size=config.DATA.IMG_SIZE,
            only_last_cls=config.MODEL.ONLY_LAST_CLS,
            extra_token_num=config.MODEL.EXTRA_TOKEN_NUM,
            meta_dims=config.MODEL.META_DIMS
        )
    else:
        raise NotImplementedError(f"Unknown model: {model_type}")

    return model
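build_model only reads the handful of config fields shown above. A minimal stand-in for a quick sanity check, assuming the config.py from this commit exposes a yacs CfgNode (the field names below are exactly those read by build_model; the values are illustrative, and the real defaults live in config.py):

from yacs.config import CfgNode as CN

config = CN()
config.MODEL = CN()
config.MODEL.TYPE = 'MetaFG'
config.MODEL.NAME = 'MetaFG_meta_1'
config.MODEL.NUM_CLASSES = 200
config.MODEL.DROP_PATH_RATE = 0.1
config.MODEL.ONLY_LAST_CLS = False
config.MODEL.EXTRA_TOKEN_NUM = 3
config.MODEL.META_DIMS = [4, 3]
config.DATA = CN()
config.DATA.IMG_SIZE = 224

model = build_model(config)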
models/meta_encoder.py
ADDED
@@ -0,0 +1,21 @@
import torch.nn as nn

class ResNormLayer(nn.Module):
    def __init__(self, linear_size):
        super(ResNormLayer, self).__init__()
        self.l_size = linear_size
        self.nonlin1 = nn.ReLU(inplace=True)
        self.nonlin2 = nn.ReLU(inplace=True)
        self.norm_fn1 = nn.LayerNorm(self.l_size)
        self.norm_fn2 = nn.LayerNorm(self.l_size)
        self.w1 = nn.Linear(self.l_size, self.l_size)
        self.w2 = nn.Linear(self.l_size, self.l_size)

    def forward(self, x):
        y = self.w1(x)
        y = self.nonlin1(y)
        y = self.norm_fn1(y)
        y = self.w2(y)
        y = self.nonlin2(y)
        y = self.norm_fn2(y)
        out = x + y
        return out
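ResNormLayer is consumed by the meta heads in MetaFG_meta.py. A minimal sketch of that pipeline, with an illustrative 4-dim meta chunk projected to a 384-dim token:

import torch
import torch.nn as nn

meta_head = nn.Sequential(      # same structure as meta_1_head_1 in MetaFG_meta.py
    nn.Linear(4, 384),
    nn.ReLU(inplace=True),
    nn.LayerNorm(384),
    ResNormLayer(384),
)
token = meta_head(torch.randn(2, 4)).reshape(2, 1, 384)  # one extra token per sample
print(token.shape)  # torch.Size([2, 1, 384])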
optimizer.py
ADDED
@@ -0,0 +1,70 @@
from torch import optim as optim


def build_optimizer(config, model):
    """
    Build optimizer, set weight decay of normalization to 0 by default.
    """
    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()
    parameters = set_weight_decay(model, skip, skip_keywords, config.TRAIN.BASE_LR)

    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    optimizer = None
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                                lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)

    return optimizer

# def set_weight_decay(model, skip_list=(), skip_keywords=(), lr=0.0):
#     has_decay = []
#     no_decay = []
#     high_lr = []
#     for name, param in model.named_parameters():
#         if not param.requires_grad:
#             continue  # frozen weights
#         if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
#                 check_keywords_in_name(name, skip_keywords):
#             if 'meta' in name:
#                 high_lr.append(param)
#             else:
#                 no_decay.append(param)
#             # print(f"{name} has no weight decay")
#         else:
#             has_decay.append(param)
#     return [{'params': has_decay},
#             # {'params': high_lr, 'weight_decay': 0., 'lr': lr*10},
#             {'params': high_lr, 'lr': lr*20},
#             {'params': no_decay, 'weight_decay': 0.}]

def set_weight_decay(model, skip_list=(), skip_keywords=(), lr=0.0):
    has_decay = []
    no_decay = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            no_decay.append(param)
            # print(f"{name} has no weight decay")
        else:
            has_decay.append(param)
    return [{'params': has_decay},
            {'params': no_decay, 'weight_decay': 0.}]


def check_keywords_in_name(name, keywords=()):
    isin = False
    for keyword in keywords:
        if keyword in name:
            isin = True
    return isin
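A quick demonstration of the grouping rule in set_weight_decay: 1-D parameters (norm weights and biases), names ending in ".bias", and anything in the skip list get weight_decay=0, while everything else keeps the global decay:

import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
groups = set_weight_decay(model)
# Linear.weight (2-D) is decayed; Linear.bias, LayerNorm.weight, LayerNorm.bias are not.
print(len(groups[0]['params']), len(groups[1]['params']))  # 1 3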
utils.py
ADDED
@@ -0,0 +1,173 @@
import os
import torch
import importlib
import torch.distributed as dist

try:
    # noinspection PyUnresolvedReferences
    from apex import amp
except ImportError:
    amp = None

def relative_bias_interpolate(checkpoint, config):
    for k in list(checkpoint['model']):
        if 'relative_position_index' in k:
            del checkpoint['model'][k]
        if 'relative_position_bias_table' in k:
            relative_position_bias_table = checkpoint['model'][k]
            cls_bias = relative_position_bias_table[:1, :]
            relative_position_bias_table = relative_position_bias_table[1:, :]
            size = int(relative_position_bias_table.shape[0] ** 0.5)
            img_size = (size + 1) // 2
            if 'stage_3' in k:
                downsample_ratio = 16
            elif 'stage_4' in k:
                downsample_ratio = 32
            new_img_size = config.DATA.IMG_SIZE // downsample_ratio
            new_size = 2 * new_img_size - 1
            if new_size == size:
                continue
            relative_position_bias_table = relative_position_bias_table.reshape(size, size, -1)
            relative_position_bias_table = relative_position_bias_table.unsqueeze(0).permute(0, 3, 1, 2)  # bs, nhead, h, w
            relative_position_bias_table = torch.nn.functional.interpolate(
                relative_position_bias_table, size=(new_size, new_size), mode='bicubic', align_corners=False)
            relative_position_bias_table = relative_position_bias_table.permute(0, 2, 3, 1)
            relative_position_bias_table = relative_position_bias_table.squeeze(0).reshape(new_size * new_size, -1)
            relative_position_bias_table = torch.cat((cls_bias, relative_position_bias_table), dim=0)
            checkpoint['model'][k] = relative_position_bias_table
    return checkpoint


def load_pretained(config, model, logger=None, strict=False):
    if logger is not None:
        logger.info(f"==============> pretrain from {config.MODEL.PRETRAINED}....................")
    checkpoint = torch.load(config.MODEL.PRETRAINED, map_location='cpu')
    if 'model' not in checkpoint:
        if 'state_dict_ema' in checkpoint:
            checkpoint['model'] = checkpoint['state_dict_ema']
        else:
            checkpoint['model'] = checkpoint
    if config.MODEL.DORP_HEAD:
        if 'head.weight' in checkpoint['model'] and 'head.bias' in checkpoint['model']:
            if logger is not None:
                logger.info(f"==============> drop head....................")
            del checkpoint['model']['head.weight']
            del checkpoint['model']['head.bias']
        if 'head.fc.weight' in checkpoint['model'] and 'head.fc.bias' in checkpoint['model']:
            if logger is not None:
                logger.info(f"==============> drop head....................")
            del checkpoint['model']['head.fc.weight']
            del checkpoint['model']['head.fc.bias']
    if config.MODEL.DORP_META:
        if logger is not None:
            logger.info(f"==============> drop meta head....................")
        for k in list(checkpoint['model']):
            if 'meta' in k:
                del checkpoint['model'][k]

    checkpoint = relative_bias_interpolate(checkpoint, config)
    if 'point_coord' in checkpoint['model']:
        if logger is not None:
            logger.info(f"==============> drop point coord....................")
        del checkpoint['model']['point_coord']
    msg = model.load_state_dict(checkpoint['model'], strict=strict)
    del checkpoint
    torch.cuda.empty_cache()


def load_checkpoint(config, model, optimizer, lr_scheduler, logger):
    logger.info(f"==============> Resuming from {config.MODEL.RESUME}....................")
    if config.MODEL.RESUME.startswith('https'):
        checkpoint = torch.hub.load_state_dict_from_url(
            config.MODEL.RESUME, map_location='cpu', check_hash=True)
    else:
        checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
    if 'model' not in checkpoint:
        if 'state_dict_ema' in checkpoint:
            checkpoint['model'] = checkpoint['state_dict_ema']
        else:
            checkpoint['model'] = checkpoint
    msg = model.load_state_dict(checkpoint['model'], strict=False)
    logger.info(msg)
    max_accuracy = 0.0
    if not config.EVAL_MODE and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        config.defrost()
        config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
        config.freeze()
        if 'amp' in checkpoint and config.AMP_OPT_LEVEL != "O0" and checkpoint['config'].AMP_OPT_LEVEL != "O0":
            amp.load_state_dict(checkpoint['amp'])
        logger.info(f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})")
        if 'max_accuracy' in checkpoint:
            max_accuracy = checkpoint['max_accuracy']

    del checkpoint
    torch.cuda.empty_cache()
    return max_accuracy


def save_checkpoint(config, epoch, model, max_accuracy, optimizer, lr_scheduler, logger):
    save_state = {'model': model.state_dict(),
                  'optimizer': optimizer.state_dict(),
                  'lr_scheduler': lr_scheduler.state_dict(),
                  'max_accuracy': max_accuracy,
                  'epoch': epoch,
                  'config': config}
    if config.AMP_OPT_LEVEL != "O0":
        save_state['amp'] = amp.state_dict()

    save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth')
    logger.info(f"{save_path} saving......")
    torch.save(save_state, save_path)
    logger.info(f"{save_path} saved !!!")

    latest_save_path = os.path.join(config.OUTPUT, 'latest.pth')
    logger.info(f"{latest_save_path} saving......")
    torch.save(save_state, latest_save_path)
    logger.info(f"{latest_save_path} saved !!!")


def get_grad_norm(parameters, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
    total_norm = total_norm ** (1. / norm_type)
    return total_norm


def auto_resume_helper(output_dir):
    checkpoints = os.listdir(output_dir)
    checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth')]
    print(f"All checkpoints found in {output_dir}: {checkpoints}")
    if len(checkpoints) > 0:
        latest_checkpoint = max([os.path.join(output_dir, d) for d in checkpoints], key=os.path.getmtime)
        print(f"The latest checkpoint found: {latest_checkpoint}")
        resume_file = latest_checkpoint
    else:
        resume_file = None
    return resume_file


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt


def load_ext(name, funcs):
    ext = importlib.import_module(name)
    for fun in funcs:
        assert hasattr(ext, fun), f'{fun} miss in module {name}'
    return ext
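A shape check for relative_bias_interpolate with illustrative numbers: a bias table pre-trained at 224 (the stage_4 grid is 7x7, so the table has (2*7-1)**2 + 1 rows) is resized for fine-tuning at 384. The `_Cfg` class is a hypothetical stand-in for the yacs config that is passed in practice:

import torch

old_table = torch.zeros((2 * 7 - 1) ** 2 + 1, 8)  # pre-trained at 224: stage_4 grid is 7x7
ckpt = {'model': {'stage_4.0.attn.relative_position_bias_table': old_table}}

class _Cfg:               # hypothetical stand-in for the yacs config
    class DATA:
        IMG_SIZE = 384    # fine-tune resolution: stage_4 grid becomes 12x12

new_table = relative_bias_interpolate(ckpt, _Cfg)['model']['stage_4.0.attn.relative_position_bias_table']
print(new_table.shape)    # torch.Size([530, 8]) = (2*12-1)**2 + 1 rows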