"""SigLIP-backed visual tokenizer: config + model, registered with HF Auto classes."""
from transformers import AutoConfig, AutoModel
from transformers import SiglipVisionModel, SiglipImageProcessor

from .base_visual_tokenizer import BaseVisualTokenizerConfig, BaseVisualTokenizer

MODEL_TYPE = "siglip_visual_tokenizer"


class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    """Configuration for :class:`SiglipVisualTokenizer`."""

    model_type = MODEL_TYPE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # SigLIP's vision transformer has no CLS token, so there is never
        # anything to drop — force the flag off if it was set.
        if self.drop_cls_token:
            self.drop_cls_token = False
        if self.depths:
            # Only a single-stage depth spec is supported; it overrides the
            # backbone's number of hidden layers.
            assert len(self.depths) == 1
            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]


class SiglipVisualTokenizer(BaseVisualTokenizer):
    """Visual tokenizer built on a SigLIP vision backbone."""

    config_class = SiglipVisualTokenizerConfig
    supports_gradient_checkpointing = True
    _no_split_modules = ["SiglipVisionTransformer"]
    _image_processor_class = SiglipImageProcessor
    _image_processor_kwargs = {}
    _backbone_class = SiglipVisionModel
    _backbone_name_or_path = "google/siglip-so400m-patch14-384"

    def get_monitor_tensors(self):
        """Return a dict of representative weight tensors (bottom/top encoder
        layers and the head) for training-time monitoring."""
        return dict(
            backbone_bottom=self.backbone.vision_model.encoder.layers[0].self_attn.k_proj.weight,
            backbone_top=self.backbone.vision_model.encoder.layers[-1].self_attn.out_proj.weight,
            head=self.head[0].weight,
        )

    def get_image_size(self):
        """Return the ``(height, width)`` expected by the image processor."""
        height = self.image_processor.size["height"]
        width = self.image_processor.size["width"]
        return height, width


AutoConfig.register(MODEL_TYPE, SiglipVisualTokenizerConfig)
AutoModel.register(SiglipVisualTokenizerConfig, SiglipVisualTokenizer)