{"decoder_vocab_size": 259, "decoder_num_layers": 30, "decoder_num_heads": 9, "decoder_num_kv_heads": 3, "decoder_embed_dim": 576, "decoder_max_seq_len": 384, "decoder_intermediate_dim": 1536, "decoder_attn_dropout": 0.0, "decoder_norm_eps": 1e-05, "decoder_rope_base": 10000, "decoder_scale_factor": 32, "decoder_encoder_max_seq_len": 19200, "decoder_fusion_interval": 3, "encoder_encoder": "swin_base_patch4_window12_384.ms_in22k", "encoder_input_size": [2560, 1920], "encoder_name": "swin_base_patch4_window12_384.ms_in22k"}