from transformers import PretrainedConfig

from torchscale.architecture.config import EncoderConfig


class ViVQAConfig(PretrainedConfig):
    model_type = "vivqa"

    def __init__(
        self,
        drop_path_rate: float = 0.0,
        mlp_ratio: float = 4.0,
        encoder_layers: int = 6,
        encoder_attention_heads: int = 6,
        multiway: bool = True,
        layernorm_embedding: bool = False,
        normalize_output: bool = True,
        no_output_layer: bool = True,
        encoder_embed_dim: int = 768,
        **kwargs
    ):
        # Build the torchscale encoder configuration from the constructor
        # arguments; use encoder_embed_dim rather than a hard-coded 768 so the
        # embedding and FFN widths follow the argument.
        args = EncoderConfig(
            multiway=multiway,
            layernorm_embedding=layernorm_embedding,
            normalize_output=normalize_output,
            no_output_layer=no_output_layer,
            drop_path_rate=drop_path_rate,
            encoder_embed_dim=encoder_embed_dim,
            encoder_attention_heads=encoder_attention_heads,
            encoder_ffn_embed_dim=int(encoder_embed_dim * mlp_ratio),
            encoder_layers=encoder_layers,
        )
        # Copy every EncoderConfig field onto this config so it is serialized
        # by PretrainedConfig (to_dict / save_pretrained).
        for key, value in args.__dict__.items():
            setattr(self, key, value)

        super().__init__(**kwargs)
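

# Minimal usage sketch (an assumption, not part of the upstream API): build the
# config and inspect the encoder fields copied over from EncoderConfig.
if __name__ == "__main__":
    config = ViVQAConfig(encoder_layers=12, encoder_attention_heads=12)
    print(config.encoder_embed_dim, config.encoder_ffn_embed_dim, config.encoder_layers)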