# Model configuration for `vit_roberta_image2text`.
#
# Three nested sections follow: `encoder`, `decoder`, and `generation`.
# NOTE(review): judging by the name, this pairs a ViT-style image encoder
# with a RoBERTa-style text decoder — confirm against the consuming code.
name: vit_roberta_image2text
config_type: model

# Encoder hyperparameters (image side: note image_size / patch_size /
# num_channels below).
encoder:
  name: null
  config_type: model
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  intermediate_size: 3072
  hidden_act: gelu
  # Both dropout probabilities are disabled (0.0) for the encoder.
  hidden_dropout_prob: 0.0
  attention_probs_dropout_prob: 0.0
  initializer_range: 0.02
  layer_norm_eps: 1.0e-12
  image_size: 224
  patch_size: 16
  num_channels: 3
  qkv_bias: true
  encoder_stride: 16

# Decoder hyperparameters (text side). `hidden_size` matches the encoder's
# (768), and cross-attention is enabled.
decoder:
  name: null
  config_type: model
  is_decoder: true
  add_cross_attention: true
  attention_probs_dropout_prob: 0.1
  bos_token_id: 0
  eos_token_id: 2
  classifier_dropout: null
  gradient_checkpointing: false
  hidden_act: gelu
  hidden_dropout_prob: 0.1
  hidden_size: 768
  initializer_range: 0.02
  intermediate_size: 3072
  layer_norm_eps: 1.0e-12
  max_position_embeddings: 514
  num_attention_heads: 12
  num_hidden_layers: 12
  # NOTE(review): pad_token_id here (2) equals eos_token_id above and
  # differs from generation.pad_token_id (1) — confirm which value the
  # tokenizer actually uses before relying on either.
  pad_token_id: 2
  position_embedding_type: absolute
  type_vocab_size: 1
  use_cache: true
  vocab_size: 42000

# Text-generation settings (beam search with 4 beams, outputs capped at
# max_length 64).
generation:
  bos_token_id: 0
  decoder_start_token_id: 0
  return_dict_in_generate: false
  early_stopping: true
  eos_token_id: 2
  length_penalty: 2.0
  max_length: 64
  no_repeat_ngram_size: 3
  num_beams: 4
  pad_token_id: 1
|