Upload config
Files changed:
- config.json (+3 -0)
- configuration_visfocus.py (+94 -0)
config.json
CHANGED
@@ -2,6 +2,9 @@
   "architectures": [
     "VisFocus_VQAConcat_LV"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_visfocus.VisFocusConfig"
+  },
   "cache_dir": null,
   "do_lower_case": true,
   "freeze_modules": [],
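The added "auto_map" entry is what lets AutoConfig resolve the custom configuration class shipped with the repository. A minimal loading sketch, assuming a placeholder repository id (the real Hub id is not shown in this diff) and an explicit opt-in to remote code:

from transformers import AutoConfig

# Placeholder repo id; the auto_map above points AutoConfig at the
# VisFocusConfig class inside the repo's configuration_visfocus.py.
config = AutoConfig.from_pretrained(
    "your-org/visfocus-vqa",   # hypothetical, substitute the actual repo id
    trust_remote_code=True,    # required to run the repo's custom config code
)
print(type(config).__name__)   # -> VisFocusConfig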
configuration_visfocus.py
ADDED
@@ -0,0 +1,94 @@
from typing import List

from transformers import PretrainedConfig, T5Config


class SwinVilmaConfig(PretrainedConfig):
    model_type = "swin_vilma"

    def __init__(
        self,
        patch_size: int = 4,
        in_chans: int = 3,
        embed_dim: int = 96,
        depths: List[int] = [2, 2, 18, 2],
        num_heads: List[int] = [3, 6, 12, 24],
        window_size: int = 24,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        ape: bool = False,
        patch_norm: bool = True,
        pretrained_window_sizes: List[int] = [0, 0, 0, 0],
        vl_cross_attn_layers: List[int] = [3],
        vl_alpha: float = 0.5,
        lm_d_model: int = 768,
        text_embedder: str = "t5-base",
        downsampling_method: str = "merge_attention_v3",
        vision_name: str = "swin_small_patch4_window7_224_22k",
        image_size: List[int] = [1536, 768],
        drop_path_rate: float = 0.3,
        drop_rate: float = 0.0,
        resume_from: str = "",
        use_checkpoint: bool = False,
        do_shift: bool = True,
        input_type: str = "rgb",
        vl_learned_ape: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_heads = num_heads
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.ape = ape
        self.patch_norm = patch_norm
        self.pretrained_window_sizes = pretrained_window_sizes
        self.vl_cross_attn_layers = vl_cross_attn_layers
        self.vl_alpha = vl_alpha
        self.lm_d_model = lm_d_model
        self.text_embedder = text_embedder
        self.downsampling_method = downsampling_method
        self.vision_name = vision_name
        self.image_size = image_size
        self.drop_path_rate = drop_path_rate
        self.drop_rate = drop_rate
        self.resume_from = resume_from
        self.use_checkpoint = use_checkpoint
        self.do_shift = do_shift
        self.input_type = input_type
        self.vl_learned_ape = vl_learned_ape


class VisFocusConfig(PretrainedConfig):
    model_type = "visfocus"

    def __init__(
        self,
        initializer_factor: float = 1.0,
        initializer_range: float = 0.02,
        max_seq_length: int = 2048,
        generate_max_new_tokens_len: int = 256,
        model_name_or_path: str = "",
        variant: str = "vf-base",
        image_size: List[int] = [1536, 768],
        seed: int = 42,
        do_lower_case: bool = True,
        hidden_dropout_prob: float = 0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.initializer_factor = initializer_factor
        self.initializer_range = initializer_range
        self.max_seq_length = max_seq_length
        self.generate_max_new_tokens_len = generate_max_new_tokens_len
        self.model_name_or_path = model_name_or_path
        self.variant = variant
        self.image_size = image_size
        self.seed = seed
        self.do_lower_case = do_lower_case
        self.hidden_dropout_prob = hidden_dropout_prob
        self.vision_config = SwinVilmaConfig()
        self.lm_config = T5Config()
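Below is a rough usage sketch of the new configuration classes, assuming they are imported from the module added above; the override values and the output directory are illustrative only, and the exact serialization of the nested sub-configs can vary with the installed transformers version.

from transformers import AutoConfig
from configuration_visfocus import VisFocusConfig  # the module added above

# Instantiate with a couple of illustrative overrides; unspecified fields keep
# the defaults from __init__, and the nested sub-configs are created as well.
config = VisFocusConfig(max_seq_length=1024, seed=0)
print(config.vision_config.window_size)  # 24, SwinVilmaConfig default
print(config.lm_config.d_model)          # 512, plain T5Config default

# Make the class discoverable by the auto classes: registering the model_type
# lets AutoConfig resolve it locally, while register_for_auto_class is what
# causes save_pretrained/push_to_hub to emit the "auto_map" entry shown in
# config.json above (behaviour depends on the transformers version).
AutoConfig.register("visfocus", VisFocusConfig)
VisFocusConfig.register_for_auto_class("AutoConfig")

config.save_pretrained("./visfocus-config")          # hypothetical local dir
reloaded = AutoConfig.from_pretrained("./visfocus-config")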