ofirab committed
Commit 625d13e · verified · 1 Parent(s): e0fc165

Upload config

Files changed (2)
  1. config.json +3 -0
  2. configuration_visfocus.py +94 -0
config.json CHANGED
@@ -2,6 +2,9 @@
   "architectures": [
     "VisFocus_VQAConcat_LV"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_visfocus.VisFocusConfig"
+  },
   "cache_dir": null,
   "do_lower_case": true,
   "freeze_modules": [],
configuration_visfocus.py ADDED
from typing import List, Optional, Union

from transformers import PretrainedConfig, T5Config


class SwinVilmaConfig(PretrainedConfig):
    """Configuration for the Swin-based vision encoder with vision-language cross-attention."""

    model_type = "swin_vilma"

    def __init__(
        self,
        patch_size: int = 4,
        in_chans: int = 3,
        embed_dim: int = 96,
        depths: List[int] = [2, 2, 18, 2],
        num_heads: List[int] = [3, 6, 12, 24],
        window_size: int = 24,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        ape: bool = False,
        patch_norm: bool = True,
        pretrained_window_sizes: List[int] = [0, 0, 0, 0],
        vl_cross_attn_layers: List[int] = [3],
        vl_alpha: float = 0.5,
        lm_d_model: int = 768,
        text_embedder: str = "t5-base",
        downsampling_method: str = "merge_attention_v3",
        vision_name: str = "swin_small_patch4_window7_224_22k",
        image_size: List[int] = [1536, 768],
        drop_path_rate: float = 0.3,
        drop_rate: float = 0.0,
        resume_from: str = "",
        use_checkpoint: bool = False,
        do_shift: bool = True,
        input_type: str = "rgb",
        vl_learned_ape: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_heads = num_heads
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.ape = ape
        self.patch_norm = patch_norm
        self.pretrained_window_sizes = pretrained_window_sizes
        self.vl_cross_attn_layers = vl_cross_attn_layers
        self.vl_alpha = vl_alpha
        self.lm_d_model = lm_d_model
        self.text_embedder = text_embedder
        self.downsampling_method = downsampling_method
        self.vision_name = vision_name
        self.image_size = image_size
        self.drop_path_rate = drop_path_rate
        self.drop_rate = drop_rate
        self.resume_from = resume_from
        self.use_checkpoint = use_checkpoint
        self.do_shift = do_shift
        self.input_type = input_type
        self.vl_learned_ape = vl_learned_ape


class VisFocusConfig(PretrainedConfig):
    """Top-level VisFocus configuration, composing a SwinVilmaConfig vision tower with a T5Config language model."""

    model_type = "visfocus"

    def __init__(
        self,
        initializer_factor: float = 1.0,
        initializer_range: float = 0.02,
        max_seq_length: int = 2048,
        generate_max_new_tokens_len: int = 256,
        model_name_or_path: str = "",
        variant: str = "vf-base",
        image_size: List[int] = [1536, 768],
        seed: int = 42,
        do_lower_case: bool = True,
        hidden_dropout_prob: float = 0.1,
        vision_config: Optional[Union[dict, SwinVilmaConfig]] = None,
        lm_config: Optional[Union[dict, T5Config]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.initializer_factor = initializer_factor
        self.initializer_range = initializer_range
        self.max_seq_length = max_seq_length
        self.generate_max_new_tokens_len = generate_max_new_tokens_len
        self.model_name_or_path = model_name_or_path
        self.variant = variant
        self.image_size = image_size
        self.seed = seed
        self.do_lower_case = do_lower_case
        self.hidden_dropout_prob = hidden_dropout_prob
        # Accept the nested configs as dicts (the form they take when reloaded
        # from config.json) or as ready config objects; fall back to defaults.
        if isinstance(vision_config, dict):
            vision_config = SwinVilmaConfig(**vision_config)
        self.vision_config = vision_config if vision_config is not None else SwinVilmaConfig()
        if isinstance(lm_config, dict):
            lm_config = T5Config(**lm_config)
        self.lm_config = lm_config if lm_config is not None else T5Config()
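
For local use the config can also be registered with the Auto classes and round-tripped through save_pretrained/from_pretrained; with a recent transformers release the nested vision and language-model configs are serialized into config.json as dicts. A short usage sketch, assuming this file sits next to config.json as the auto_map entry expects (the output directory is a placeholder):

    from transformers import AutoConfig

    from configuration_visfocus import VisFocusConfig

    # Optional: make AutoConfig resolve model_type "visfocus" locally,
    # mirroring what the auto_map entry does for Hub downloads.
    AutoConfig.register("visfocus", VisFocusConfig)

    config = VisFocusConfig(variant="vf-base", image_size=[1536, 768])
    config.save_pretrained("./visfocus-ckpt")  # placeholder path; writes config.json

    reloaded = VisFocusConfig.from_pretrained("./visfocus-ckpt")
    assert reloaded.vision_config.window_size == config.vision_config.window_size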