# NOTE: the lines below are file-viewer residue kept as comments; they are not
# part of the configuration. The viewer's 1-64 line-number gutter was dropped.
# File size: 1,308 Bytes
# Revision ids shown in the viewer gutter: 98d74e9, 4773d08
# Top-level boolean switches.
# NOTE(review): the names suggest automatic mixed precision and the cuDNN
# benchmark / deterministic modes — confirm against the code that reads them.
AMP: true
CUDNN_BENCHMARK: true
CUDNN_DETERMINISTIC: false
# Data section. Notes on what a key means are inferred from its name only —
# confirm against the code that consumes this config.
DATA:
  # Four *_INDEX keys (EOS/MASK/SOS/UNK) hold small integers 0-3; presumably
  # special-token ids for the tokenizer model named below — TODO confirm.
  EOS_INDEX: 2
  IMAGE_CROP_SIZE: 224
  # Transform names, one per sequence entry, in the order listed.
  IMAGE_TRANSFORM_TRAIN:
    - random_resized_crop
    - horizontal_flip
    - color_jitter
    - normalize
  IMAGE_TRANSFORM_VAL:
    - smallest_resize
    - center_crop
    - normalize
  MASKED_LM:
    MASK_PROBABILITY: 0.85
    MASK_PROPORTION: 0.15
    REPLACE_PROBABILITY: 0.1
  MASK_INDEX: 3
  MAX_CAPTION_LENGTH: 50
  # Single-quoted because the value contains a `*` wildcard character; the
  # loaded string is unchanged.
  ROOT: 'datasets/redcaps/tarfiles/*.tar'
  SOS_INDEX: 1
  TOKENIZER_MODEL: 'datasets/common_30k.model'
  UNK_INDEX: 0
  # Written with a decimal point so it loads as a float, not an integer.
  USE_PERCENTAGE: 100.0
  USE_SINGLE_CAPTION: false
  VOCAB_SIZE: 30000
# Model section: a decoder sub-mapping plus textual and visual sub-mappings.
# NOTE(review): which keys are actually read likely depends on the selected
# NAME values (e.g. BEAM_SIZE vs. NUCLEUS_SIZE) — confirm against the consumer.
MODEL:
  DECODER:
    BEAM_SIZE: 5
    MAX_DECODING_STEPS: 30
    NAME: 'nucleus_sampling'
    NUCLEUS_SIZE: 0.9
  LABEL_SMOOTHING: 0.1
  NAME: 'virtex_web'
  TEXTUAL:
    DROPOUT: 0.1
    # Single-quoted because the value embeds `::`; the loaded string is
    # unchanged.
    NAME: 'transdec_prenorm::L6_H512_A8_F2048'
  VISUAL:
    FEATURE_SIZE: 2048
    FROZEN: false
    NAME: 'torchvision::resnet50'
    PRETRAINED: false
# Optimizer / schedule section. Notes on key meaning are inferred from names
# only — confirm against the code that consumes this config.
OPTIM:
  BATCH_SIZE: 256
  CLIP_GRAD_NORM: 10.0
  # CNN_LR and LR currently hold the same value (0.0005).
  CNN_LR: 0.0005
  LOOKAHEAD:
    ALPHA: 0.5
    STEPS: 5
    USE: false
  LR: 0.0005
  LR_DECAY_NAME: 'cosine'
  LR_GAMMA: 0.1
  # Explicit empty list (a bare `LR_STEPS:` would load as null instead).
  LR_STEPS: []
  # Single-quoted so the `.`, `*`, `(`, `|` characters are taken literally by
  # any YAML parser. Looks like a regular expression — TODO confirm what it is
  # matched against.
  NO_DECAY: '.*textual.(embedding|transformer).*(norm.*|bias)'
  NUM_ITERATIONS: 1500000
  OPTIMIZER_NAME: 'adamw'
  SGD_MOMENTUM: 0.9
  WARMUP_STEPS: 10000
  WEIGHT_DECAY: 0.01
# Integer seed value; presumably used to seed random number generators —
# confirm against the code that reads it.
RANDOM_SEED: 0