{
  "architectures": [
    "VisFocusModelForImageTextToText"
  ],
  "auto_map": {
    "AutoConfig": "configuration_visfocus.VisFocusConfig",
    "AutoModel": "modeling_visfocus.VisFocusPreTrainedModel",
    "AutoModelForConditionalGeneration": "modeling_visfocus.VisFocusModelForImageTextToText",
    "AutoModelForImageTextToText": "modeling_visfocus.VisFocusModelForImageTextToText"
  },
  "cache_dir": null,
  "do_lower_case": true,
  "freeze_modules": [],
  "generate_max_new_tokens_len": 256,
  "hidden_dropout_prob": 0.1,
  "image_size": [
    1536,
    768
  ],
  "initializer_factor": "vf-base",
  "initializer_range": 0.02,
  "lm_config": {
    "model_type": "t5"
  },
  "lora": null,
  "matcher_type": "default",
  "max_seq_length": 2048,
  "model_name_or_path": "",
  "model_type": "visfocus",
  "seed": 42,
  "torch_dtype": "float32",
  "transformers_version": "4.46.0.dev0",
  "unfreeze_modules": [],
  "variant": "vf-base",
  "vision": {
    "model": {
      "drop_path_rate": 0.3,
      "drop_rate": 0.0,
      "image_size": [
        1536,
        768
      ],
      "name": "swin_small_patch4_window7_224_22k",
      "swinv2": {
        "ape": false,
        "depths": [
          2,
          2,
          18,
          2
        ],
        "downsampling_method": "merge_attention_v3",
        "embed_dim": 96,
        "in_chans": 3,
        "lm_d_model": 768,
        "mlp_ratio": 4.0,
        "num_heads": [
          3,
          6,
          12,
          24
        ],
        "patch_norm": true,
        "patch_size": 4,
        "pretrained_window_sizes": [
          0,
          0,
          0,
          0
        ],
        "qkv_bias": true,
        "text_embedder": "t5-base",
        "vl_alpha": 0.5,
        "vl_cross_attn_layers": [
          3
        ],
        "window_size": 24
      },
      "type": "swinv2",
      "vision_resume_from": "https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pth"
    },
    "train": {
      "use_checkpoint": false
    }
  },
  "vision_config": {
    "model_type": "swin_vilma"
  },
  "vl_l1_loss": null,
  "vqa_method": null
}