# Model configuration. Every `_target_` value is a dotted import path;
# presumably each mapping is instantiated by a Hydra-style loader with its
# sibling keys passed as constructor arguments -- TODO confirm with the caller.
#
# NOTE(review): the nesting below was rebuilt from key order, blank-line
# grouping, and the fact that keys such as `d_model`, `dropout` and
# `num_layers` repeat (so they cannot share one mapping). Check each section
# against the constructor of the class its `_target_` names.
# NOTE(review): the leading lines of the original held no readable content;
# if a loader directive comment used to live there, restore it by hand.
model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.timm.TimmBackbone
      name: repvit_m1.dist_in1k
      features:
        - layer0
        - layer1
        - layer2
        - layer3
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      # Four widths for the four trunk features listed above; presumably
      # ordered deepest-first -- TODO confirm against FpnNeck.
      backbone_channel_list: [384, 192, 96, 48]
      fpn_top_down_levels: [2, 3]
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttentionv2
        rope_theta: 10000.0
        q_sizes: [64, 64]
        k_sizes: [16, 16]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Same value as memory_encoder.out_dim below; presumably the two
        # must stay in sync -- TODO confirm.
        kv_in_dim: 64
    num_layers: 2

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with a mantissa dot so YAML 1.1 resolvers (e.g. PyYAML)
        # read it as a float; a bare `1e-6` is resolved as a string there.
        layer_scale_init_value: 1.0e-6
        use_dwconv: true
      num_layers: 2

  spatial_perceiver:
    _target_: sam2.modeling.perceiver.PerceiverResampler
    depth: 2
    dim: 64
    dim_head: 64
    heads: 1
    ff_mult: 4
    hidden_dropout_p: 0.0
    attention_dropout_p: 0.0
    pos_enc_at_key_value: true
    concat_kv_latents: false
    num_latents: 256
    num_latents_2d: 256
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    use_self_attn: true

  # Scalar options that sit directly on the model mapping. The blank-line
  # groups mirror the grouping of the original file.
  num_maskmem: 7
  image_size: 1024

  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true

  directly_add_no_mem_embed: true

  use_high_res_features_in_sam: true

  multimask_output_in_sam: true

  iou_prediction_use_sigmoid: true

  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true

  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true

  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true

  compile_image_encoder: false