File size: 2,253 Bytes
231edce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Training-stage settings; training is enabled.
TRAIN:
  ENABLE: True
  DATASET: imagenet
  BATCH_SIZE: 256
  # NOTE(review): the unit of the two periods below is not stated in this file
  # (presumably epochs) - confirm against the code that reads them.
  EVAL_PERIOD: 10
  CHECKPOINT_PERIOD: 1
  AUTO_RESUME: True

# Input data settings.
DATA:
  # The dataset location is commented out, so this file does not set it.
  # PATH_TO_DATA_DIR: path-to-imagenet-dir
  # Three-element MEAN/STD (presumably per-channel normalisation constants,
  # matching INPUT_CHANNEL_NUM: [3] - confirm).
  MEAN: [0.485, 0.456, 0.406]
  STD: [0.229, 0.224, 0.225]
  # NOTE(review): NUM_FRAMES is 64 while TRAIN.DATASET is imagenet; confirm
  # the image pipeline ignores or expects this value.
  NUM_FRAMES: 64
  # Train and test crops use the same size.
  TRAIN_CROP_SIZE: 224
  TEST_CROP_SIZE: 224
  INPUT_CHANNEL_NUM: [3]
# MViT backbone settings.
MVIT:
  PATCH_2D: False
  ZERO_DECAY_POS_CLS: False
  MODE: "conv"
  CLS_EMBED_ON: False
  # Three-element patch kernel/stride/padding; PATCH_2D is False above.
  PATCH_KERNEL: [3, 7, 7]
  PATCH_STRIDE: [2, 4, 4]
  PATCH_PADDING: [1, 3, 3]
  EMBED_DIM: 96
  NUM_HEADS: 1
  MLP_RATIO: 4.0
  QKV_BIAS: True
  DROPPATH_RATE: 0.1
  DROPOUT_RATE: 0.0
  DEPTH: 16
  LAYER_SCALE_INIT_VALUE: 0.0
  HEAD_INIT_SCALE: 1.0
  USE_MEAN_POOLING: False
  # Only USE_ABS_POS is switched on among the positional-embedding flags below.
  USE_ABS_POS: True
  USE_FIXED_SINCOS_POS: False
  SEP_POS_EMBED: False
  REL_POS_SPATIAL: False
  REL_POS_TEMPORAL: False
  REL_POS_ZERO_INIT: False
  RESIDUAL_POOLING: False
  NORM: "layernorm"
  NORM_STEM: False
  # DIM_MUL, HEAD_MUL, POOL_Q_STRIDE and REV.BUFFER_LAYERS all refer to the
  # same three indices (1, 3, 14); keep them in sync when editing.
  # Presumably these are block indices within DEPTH: 16 - confirm against the
  # model code.
  DIM_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
  HEAD_MUL: [[1, 2.0], [3, 2.0], [14, 2.0]]
  # Explicit null rather than True/False.
  # NOTE(review): confirm the consumer accepts null for this key.
  POOL_FIRST: null
  POOL_KVQ_KERNEL: [1, 3, 3]
  POOL_KV_STRIDE_ADAPTIVE: [1, 4, 4]
  POOL_Q_STRIDE: [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]]
  SEPARATE_QKV: True
  # REV sub-section (presumably the reversible variant of the model - confirm);
  # it is enabled here.
  REV:
    ENABLE: True
    RESPATH_FUSE: "concat"
    BUFFER_LAYERS: [1, 3, 14]
    RES_PATH: "conv"
    PRE_Q_FUSION: "concat_linear_2"
# Detection is switched off for this configuration.
DETECTION:
  ENABLE: False
# Data augmentation settings; augmentation is enabled.
AUG:
  ENABLE: True
  COLOR_JITTER: 0.4
  # Augmentation policy given as a single spec string
  # (presumably a RandAugment-style descriptor - confirm with the consumer).
  AA_TYPE: rand-m9-n6-mstd0.5-inc1
  INTERPOLATION: bicubic
  # RE_* keys (presumably random-erasing parameters - confirm).
  RE_PROB: 0.25
  RE_MODE: pixel
  RE_COUNT: 1
  RE_SPLIT: False
# Mixup settings; mixup is enabled, with a separate CUTMIX_ALPHA value.
MIXUP:
  ENABLE: True
  ALPHA: 0.8
  CUTMIX_ALPHA: 1.0
  # PROB is 1.0 and SWITCH_PROB is 0.5
  # (presumably: always mix, choosing between the two modes evenly - confirm).
  PROB: 1.0
  SWITCH_PROB: 0.5
  LABEL_SMOOTH_VALUE: 0.1
# Optimizer and learning-rate schedule settings.
SOLVER:
  BASE_LR_SCALE_NUM_SHARDS: True
  BASE_LR: 0.00025
  LR_POLICY: cosine
  MAX_EPOCH: 300
  MOMENTUM: 0.9
  WEIGHT_DECAY: 0.05
  # NOTE(review): warm-up covers 70 of the 300 epochs; confirm this is intended.
  WARMUP_EPOCHS: 70.0
  # Exponent-form floats carry an explicit decimal point: YAML 1.1 parsers
  # (e.g. PyYAML) resolve a bare `1e-8` as a string, whereas `1.0e-8` is a
  # float under both YAML 1.1 and 1.2.
  WARMUP_START_LR: 1.0e-8
  OPTIMIZING_METHOD: adamw
  COSINE_AFTER_WARMUP: True
  COSINE_END_LR: 1.0e-6
  ZERO_WD_1D_PARAM: True
  CLIP_GRAD_L2NORM: 1.0
# Model selection and classification-head settings.
MODEL:
  NUM_CLASSES: 1000
  # ARCH / MODEL_NAME select the model
  # (presumably looked up by name in the consumer's registry - confirm).
  ARCH: mvit
  MODEL_NAME: MViT
  LOSS_FUNC: soft_cross_entropy
  DROPOUT_RATE: 0.0
  HEAD_ACT: "softmax"
  DETACH_FINAL_FC: False
# Only NUM_MLP_LAYERS is set in the CONTRASTIVE section of this file.
CONTRASTIVE:
  NUM_MLP_LAYERS: 1
# Testing is disabled; DATASET and BATCH_SIZE carry the same values as TRAIN.
TEST:
  ENABLE: False
  DATASET: imagenet
  BATCH_SIZE: 256
# Data-loading settings.
DATA_LOADER:
  # NOTE(review): whether NUM_WORKERS is per process or total is not stated
  # here - confirm.
  NUM_WORKERS: 8
  PIN_MEMORY: True
# Run-level settings: 2 GPUs, 1 shard, fixed RNG seed.
NUM_GPUS: 2
NUM_SHARDS: 1
RNG_SEED: 0
# OUTPUT_DIR is "." (a relative path; presumably resolved against the working
# directory of the launching process - confirm).
OUTPUT_DIR: .