Upload config
Files changed:
- config.json (+3 -0)
- configuration_visfocus.py (+94 -0)
config.json
CHANGED
@@ -2,6 +2,9 @@
   "architectures": [
     "VisFocus_VQAConcat_LV"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_visfocus.VisFocusConfig"
+  },
   "cache_dir": null,
   "do_lower_case": true,
   "freeze_modules": [],
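The added "auto_map" entry is what lets AutoConfig resolve the custom configuration class shipped with the repository. A minimal loading sketch, assuming a placeholder repository id (the real Hub id is not shown in this diff) and an explicit opt-in to remote code:

from transformers import AutoConfig

# Placeholder repo id; the auto_map above points AutoConfig at the
# VisFocusConfig class inside the repo's configuration_visfocus.py.
config = AutoConfig.from_pretrained(
    "your-org/visfocus-vqa",   # hypothetical, substitute the actual repo id
    trust_remote_code=True,    # required to run the repo's custom config code
)
print(type(config).__name__)   # -> VisFocusConfig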
configuration_visfocus.py
ADDED
@@ -0,0 +1,94 @@
from typing import List

from transformers import PretrainedConfig, T5Config


class SwinVilmaConfig(PretrainedConfig):
    model_type = "swin_vilma"

    def __init__(
        self,
        patch_size: int = 4,
        in_chans: int = 3,
        embed_dim: int = 96,
        depths: List[int] = [2, 2, 18, 2],
        num_heads: List[int] = [3, 6, 12, 24],
        window_size: int = 24,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        ape: bool = False,
        patch_norm: bool = True,
        pretrained_window_sizes: List[int] = [0, 0, 0, 0],
        vl_cross_attn_layers: List[int] = [3],
        vl_alpha: float = 0.5,
        lm_d_model: int = 768,
        text_embedder: str = "t5-base",
        downsampling_method: str = "merge_attention_v3",
        vision_name: str = "swin_small_patch4_window7_224_22k",
        image_size: List[int] = [1536, 768],
        drop_path_rate: float = 0.3,
        drop_rate: float = 0.0,
        resume_from: str = "",
        use_checkpoint: bool = False,
        do_shift: bool = True,
        input_type: str = "rgb",
        vl_learned_ape: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_heads = num_heads
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.ape = ape
        self.patch_norm = patch_norm
        self.pretrained_window_sizes = pretrained_window_sizes
        self.vl_cross_attn_layers = vl_cross_attn_layers
        self.vl_alpha = vl_alpha
        self.lm_d_model = lm_d_model
        self.text_embedder = text_embedder
        self.downsampling_method = downsampling_method
        self.vision_name = vision_name
        self.image_size = image_size
        self.drop_path_rate = drop_path_rate
        self.drop_rate = drop_rate
        self.resume_from = resume_from
        self.use_checkpoint = use_checkpoint
        self.do_shift = do_shift
        self.input_type = input_type
        self.vl_learned_ape = vl_learned_ape


class VisFocusConfig(PretrainedConfig):
    model_type = "visfocus"

    def __init__(
        self,
        initializer_factor: float = 1.0,
        initializer_range: float = 0.02,
        max_seq_length: int = 2048,
        generate_max_new_tokens_len: int = 256,
        model_name_or_path: str = "",
        variant: str = "vf-base",
        image_size: List[int] = [1536, 768],
        seed: int = 42,
        do_lower_case: bool = True,
        hidden_dropout_prob: float = 0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.initializer_factor = initializer_factor
        self.initializer_range = initializer_range
        self.max_seq_length = max_seq_length
        self.generate_max_new_tokens_len = generate_max_new_tokens_len
        self.model_name_or_path = model_name_or_path
        self.variant = variant
        self.image_size = image_size
        self.seed = seed
        self.do_lower_case = do_lower_case
        self.hidden_dropout_prob = hidden_dropout_prob
        self.vision_config = SwinVilmaConfig()
        self.lm_config = T5Config()
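Below is a rough usage sketch of the new configuration classes, assuming they are imported from the module added above; the override values and the output directory are illustrative only, and the exact serialization of the nested sub-configs can vary with the installed transformers version.

from transformers import AutoConfig
from configuration_visfocus import VisFocusConfig  # the module added above

# Instantiate with a couple of illustrative overrides; unspecified fields keep
# the defaults from __init__, and the nested sub-configs are created as well.
config = VisFocusConfig(max_seq_length=1024, seed=0)
print(config.vision_config.window_size)  # 24, SwinVilmaConfig default
print(config.lm_config.d_model)          # 512, plain T5Config default

# Make the class discoverable by the auto classes: registering the model_type
# lets AutoConfig resolve it locally, while register_for_auto_class is what
# causes save_pretrained/push_to_hub to emit the "auto_map" entry shown in
# config.json above (behaviour depends on the transformers version).
AutoConfig.register("visfocus", VisFocusConfig)
VisFocusConfig.register_for_auto_class("AutoConfig")

config.save_pretrained("./visfocus-config")          # hypothetical local dir
reloaded = AutoConfig.from_pretrained("./visfocus-config")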