from transformers import AutoConfig, AutoModel
from transformers import CLIPVisionModel, CLIPImageProcessor

from .base_visual_tokenizer import BaseVisualTokenizerConfig, BaseVisualTokenizer

MODEL_TYPE = "clip_visual_tokenizer"


class ClipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    model_type = MODEL_TYPE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if self.depths:
            # CLIP's vision encoder is a single stage, so only one depth is
            # valid; it truncates the backbone to that many hidden layers.
            assert len(self.depths) == 1
            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
class ClipVisualTokenizer(BaseVisualTokenizer):
    config_class = ClipVisualTokenizerConfig
    supports_gradient_checkpointing = True
    _no_split_modules = ["CLIPEncoderLayer"]
    _image_processor_class = CLIPImageProcessor
    _image_processor_kwargs = dict(do_center_crop=False)
    _backbone_class = CLIPVisionModel
    _backbone_name_or_path = "openai/clip-vit-large-patch14-336"

    def get_monitor_tensors(self):
        # Representative weights from the bottom and top of the backbone,
        # plus the projection head, for monitoring during training.
        return dict(
            backbone_bottom=self.backbone.vision_model.encoder.layers[0].self_attn.k_proj.weight,
            backbone_top=self.backbone.vision_model.encoder.layers[-1].self_attn.out_proj.weight,
            head=self.head[0].weight
        )
    def get_image_size(self):
        height = self.image_processor.crop_size["height"]
        width = self.image_processor.crop_size["width"]
        return height, width


AutoConfig.register(MODEL_TYPE, ClipVisualTokenizerConfig)
AutoModel.register(ClipVisualTokenizerConfig, ClipVisualTokenizer)
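
# --- Usage sketch (illustrative; not part of the original file) ---
# Once this module has been imported, the registrations above let the
# tokenizer be built through the standard transformers Auto* factories.
# Any config fields beyond those defined here are assumptions about
# BaseVisualTokenizerConfig:
#
#     from transformers import AutoConfig, AutoModel
#
#     config = AutoConfig.for_model(MODEL_TYPE)   # "clip_visual_tokenizer"
#     tokenizer = AutoModel.from_config(config)
#     height, width = tokenizer.get_image_size()  # (336, 336) for the default
#                                                 # CLIP-ViT-L/14-336 backbone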