|
|
|
|
|
|
|
|
|
|
|
from enum import Enum
|
|
from typing import Union
|
|
|
|
import torch
|
|
|
|
from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
|
|
|
|
|
|
class Weights(Enum):
    """Identifiers of the pretrained weight sets accepted by the model factories."""

    # Each member's value is the same string as its name, so a member can be
    # looked up either by name (``Weights["LVD142M"]``) or by value
    # (``Weights("LVD142M")``).
    LVD142M = "LVD142M"
|
|
|
|
|
|
def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    """
    Build a DINOv2 vision transformer and optionally load its pretrained checkpoint.

    Args:
        arch_name: Name of the constructor looked up in the ``vision_transformer``
            module (for example ``"vit_large"``).
        img_size, patch_size, init_values, ffn_layer, block_chunks,
        num_register_tokens, interpolate_antialias, interpolate_offset:
            Forwarded as keyword arguments to the constructor.
        pretrained: When true, a checkpoint is fetched from a URL built on
            ``_DINOV2_BASE_URL`` and loaded into the model.
        weights: A ``Weights`` member or the name of one. In this function it is
            only validated; it does not influence the checkpoint URL.
        **kwargs: Additional constructor keyword arguments; they take precedence
            over the named ones above. A non-``None`` ``modulation_dim`` also
            switches checkpoint loading to the key-remapped, non-strict mode.

    Returns:
        The constructed model (with the checkpoint loaded when ``pretrained``).

    Raises:
        AssertionError: If ``weights`` is a string that names no ``Weights`` member.
    """
    # Imported lazily, inside the function, exactly as the original does.
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)

    # Named settings first, then caller overrides layered on top.
    vit_kwargs = {
        "img_size": img_size,
        "patch_size": patch_size,
        "init_values": init_values,
        "ffn_layer": ffn_layer,
        "block_chunks": block_chunks,
        "num_register_tokens": num_register_tokens,
        "interpolate_antialias": interpolate_antialias,
        "interpolate_offset": interpolate_offset,
        **kwargs,
    }
    build_model = vars(vits)[arch_name]
    model = build_model(**vit_kwargs)

    if not pretrained:
        return model

    model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
    url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
    checkpoint = torch.hub.load_state_dict_from_url(url, map_location="cpu")

    uses_modulation = vit_kwargs.get("modulation_dim") is not None
    filtered_state = {}
    for key, tensor in checkpoint.items():
        # Entries whose key mentions the mask token are never loaded.
        if 'mask_token' in key:
            continue
        if uses_modulation:
            # NOTE(review): presumably the modulated blocks wrap each norm layer
            # in a module exposing it as ``.norm`` -- confirm against the model.
            key = key.replace('norm1', 'norm1.norm').replace('norm2', 'norm2.norm')
        filtered_state[key] = tensor

    # Strict key matching is relaxed only in the modulation case.
    model.load_state_dict(filtered_state, strict=not uses_modulation)

    return model
|
|
|
|
|
|
def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-S/14 backbone, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_small"``;
    ``pretrained``, ``weights`` and any extra keyword arguments are passed
    through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-B/14 backbone, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_base"``;
    ``pretrained``, ``weights`` and any extra keyword arguments are passed
    through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-L/14 backbone, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_large"``;
    ``pretrained``, ``weights`` and any extra keyword arguments are passed
    through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-g/14 backbone, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_giant2"`` and
    ``ffn_layer="swiglufused"``; ``pretrained``, ``weights`` and any extra
    keyword arguments are passed through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-S/14 backbone with registers, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_small"``, four
    register tokens, antialiased interpolation and a zero interpolation
    offset; ``pretrained``, ``weights`` and any extra keyword arguments are
    passed through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-B/14 backbone with registers, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_base"``, four
    register tokens, antialiased interpolation and a zero interpolation
    offset; ``pretrained``, ``weights`` and any extra keyword arguments are
    passed through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-L/14 backbone with registers, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_large"``, four
    register tokens, antialiased interpolation and a zero interpolation
    offset; ``pretrained``, ``weights`` and any extra keyword arguments are
    passed through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|
|
|
|
def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Create the DINOv2 ViT-g/14 backbone with registers, optionally pretrained on the LVD-142M dataset.

    Delegates to ``_make_dinov2_model`` with ``arch_name="vit_giant2"``,
    ``ffn_layer="swiglufused"``, four register tokens, antialiased
    interpolation and a zero interpolation offset; ``pretrained``,
    ``weights`` and any extra keyword arguments are passed through unchanged.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
|
|
|