from transformers import AutoConfig, AutoModel, CLIPVisionModel, CLIPImageProcessor

from .base_visual_tokenizer import BaseVisualTokenizerConfig, BaseVisualTokenizer

MODEL_TYPE = "clip_visual_tokenizer"

class ClipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    """Configuration for the CLIP-based visual tokenizer."""

    model_type = MODEL_TYPE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if self.depths:
            # CLIP's vision encoder is a single stack of layers, so at most one
            # depth entry is accepted; it overrides the backbone's num_hidden_layers.
            assert len(self.depths) == 1
            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]

class ClipVisualTokenizer(BaseVisualTokenizer):
    """Visual tokenizer backed by a CLIP vision encoder."""

    config_class = ClipVisualTokenizerConfig
    supports_gradient_checkpointing = True
    _no_split_modules = ["CLIPEncoderLayer"]
    _image_processor_class = CLIPImageProcessor
    _image_processor_kwargs = dict(do_center_crop=False)
    _backbone_class = CLIPVisionModel
    _backbone_name_or_path = "openai/clip-vit-large-patch14-336"

    def get_monitor_tensors(self):
        # Representative weights from the bottom and top encoder layers and from
        # the visual head, used to monitor training (e.g. weight/gradient stats).
        return dict(
            backbone_bottom=self.backbone.vision_model.encoder.layers[0].self_attn.k_proj.weight,
            backbone_top=self.backbone.vision_model.encoder.layers[-1].self_attn.out_proj.weight,
            head=self.head[0].weight
        )

    def get_image_size(self):
        # Expected input resolution, taken from the image processor's crop size.
        height = self.image_processor.crop_size["height"]
        width = self.image_processor.crop_size["width"]
        return height, width

# Expose the tokenizer through the transformers Auto classes.
AutoConfig.register(MODEL_TYPE, ClipVisualTokenizerConfig)
AutoModel.register(ClipVisualTokenizerConfig, ClipVisualTokenizer)
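
# Usage sketch (illustrative only, not part of the original module): once the
# registrations above have run, the tokenizer can be built through the Auto
# classes. Any extra config kwargs required by BaseVisualTokenizerConfig are
# assumptions here and may differ in practice.
#
#   from transformers import AutoConfig, AutoModel
#
#   config = AutoConfig.for_model("clip_visual_tokenizer")
#   visual_tokenizer = AutoModel.from_config(config)
#   print(visual_tokenizer.get_image_size())  # e.g. (336, 336) for the default backbone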