|
from pathlib import Path |
|
from typing import Optional, Tuple
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
from PIL import Image |
|
import cv2
|
import warnings |
|
|
|
import torch |
|
from torch import Tensor |
|
from torch.nn import functional as F |
|
|
|
import open_clip |
|
from open_clip import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD |
|
from open_clip.transformer import _expand_token |
|
from timm.layers import resample_abs_pos_embed |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def hooked_attention_forward(
    self,
    x,
    k_x=None,
    v_x=None,
    attn_mask: Optional[torch.Tensor] = None,
    need_weights: bool = False,
):
    """Self-attention forward for a module exposing a fused ``in_proj_weight``/``out_proj``
    that caches the post-softmax attention map in ``self.attention_map``.

    ``k_x``, ``v_x`` and ``need_weights`` are accepted only for interface compatibility with
    the hooked resblock call below; queries, keys and values are all projected from ``x``.
    """
|
L, N, C = x.shape |
|
q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1) |
|
q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) |
|
k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) |
|
v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1) |
|
|
|
head_dim = q.shape[-1] |
|
scale = float(head_dim) ** -0.5 |
|
q = q * scale |
|
attn = torch.bmm(q, k.transpose(-1, -2)) |
|
|
|
if attn_mask is not None: |
|
if attn_mask.dtype == torch.bool: |
|
new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype) |
|
new_attn_mask.masked_fill_(attn_mask, float("-inf")) |
|
attn_mask = new_attn_mask |
|
attn += attn_mask |
|
|
|
attn = attn.softmax(dim=-1) |
|
|
|
self.attention_map = attn |
|
|
|
x = torch.bmm(attn, v) |
|
x = x.transpose(0, 1).reshape(L, N, C) |
|
x = self.out_proj(x) |
|
return x |
|
|
|
|
|
def hooked_attention_timm_forward(self, x, attn_mask=None): |
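    """Replacement for timm's ``Attention.forward`` that always takes the explicit
    softmax-attention path and caches the attention map in ``self.attention_map``.
    ``attn_mask`` is accepted for interface compatibility but is not applied.
    """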
|
B, N, C = x.shape |
|
qkv = ( |
|
self.qkv(x) |
|
.reshape(B, N, 3, self.num_heads, self.head_dim) |
|
.permute(2, 0, 3, 1, 4) |
|
) |
|
q, k, v = qkv.unbind(0) |
|
q, k = self.q_norm(q), self.k_norm(k) |
|
|
|
q = q * self.scale |
|
attn = q @ k.transpose(-2, -1) |
|
attn = attn.softmax(dim=-1) |
|
attn = self.attn_drop(attn) |
|
x = attn @ v |
|
|
|
|
|
self.attention_map = attn |
|
|
|
x = x.transpose(1, 2).reshape(B, N, C) |
|
x = self.proj(x) |
|
x = self.proj_drop(x) |
|
return x |
|
|
|
|
|
|
|
|
|
def hooked_resblock_forward(self, q_x, k_x=None, v_x=None, attn_mask=None): |
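    """Residual attention block forward (self-attention only) that caches the
    post-attention features in ``self.feat_post_attn`` and the block output in
    ``self.feat_post_mlp``.
    """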
|
assert k_x is None and v_x is None, "k_x and v_x must be None" |
|
|
|
|
|
x = q_x + self.ls1( |
|
self.attn( |
|
self.norm1(q_x), |
|
k_x=k_x, |
|
v_x=v_x, |
|
attn_mask=attn_mask, |
|
) |
|
) |
|
|
|
self.feat_post_attn = x |
|
x = x + self.ls2(self.mlp(self.norm2(x))) |
|
|
|
|
|
self.feat_post_mlp = x |
|
return x |
|
|
|
|
|
|
|
|
|
|
|
def hooked_torch_multi_head_attention_forward( |
|
self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None |
|
): |
|
r""" |
|
Args: |
|
query, key, value: map a query and a set of key-value pairs to an output. |
|
See "Attention Is All You Need" for more details. |
|
key_padding_mask: if provided, specified padding elements in the key will |
|
be ignored by the attention. When given a binary mask and a value is True, |
|
the corresponding value on the attention layer will be ignored. When given |
|
a byte mask and a value is non-zero, the corresponding value on the attention |
|
layer will be ignored |
|
need_weights: output attn_output_weights. |
|
attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all |
|
the batches while a 3D mask allows to specify a different mask for the entries of each batch. |
|
|
|
Shape: |
|
- Inputs: |
|
- query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is |
|
the embedding dimension. |
|
- key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is |
|
the embedding dimension. |
|
- value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is |
|
the embedding dimension. |
|
- key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. |
|
If a ByteTensor is provided, the non-zero positions will be ignored while the position |
|
with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the |
|
value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. |
|
- attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. |
|
3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, |
|
          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
|
positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend |
|
while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` |
|
is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor |
|
is provided, it will be added to the attention weight. |
|
|
|
- Outputs: |
|
- attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, |
|
E is the embedding dimension. |
|
- attn_output_weights: :math:`(N, L, S)` where N is the batch size, |
|
L is the target sequence length, S is the source sequence length. |
|
""" |
|
if not self._qkv_same_embed_dim: |
|
out, _attn_maps = hooked_torch_func_multi_head_attention_forward( |
|
query, |
|
key, |
|
value, |
|
self.embed_dim, |
|
self.num_heads, |
|
self.in_proj_weight, |
|
self.in_proj_bias, |
|
self.bias_k, |
|
self.bias_v, |
|
self.add_zero_attn, |
|
self.dropout, |
|
self.out_proj.weight, |
|
self.out_proj.bias, |
|
training=self.training, |
|
key_padding_mask=key_padding_mask, |
|
need_weights=True, |
|
attn_mask=attn_mask, |
|
use_separate_proj_weight=True, |
|
q_proj_weight=self.q_proj_weight, |
|
k_proj_weight=self.k_proj_weight, |
|
v_proj_weight=self.v_proj_weight, |
|
) |
|
|
|
self.attention_maps = _attn_maps |
|
return out, _attn_maps |
|
else: |
|
out, _attn_maps = hooked_torch_func_multi_head_attention_forward( |
|
query, |
|
key, |
|
value, |
|
self.embed_dim, |
|
self.num_heads, |
|
self.in_proj_weight, |
|
self.in_proj_bias, |
|
self.bias_k, |
|
self.bias_v, |
|
self.add_zero_attn, |
|
self.dropout, |
|
self.out_proj.weight, |
|
self.out_proj.bias, |
|
training=self.training, |
|
key_padding_mask=key_padding_mask, |
|
need_weights=True, |
|
attn_mask=attn_mask, |
|
) |
|
|
|
self.attention_maps = _attn_maps |
|
return out, _attn_maps |
|
|
|
|
|
def hooked_torch_func_multi_head_attention_forward( |
|
query: Tensor, |
|
key: Tensor, |
|
value: Tensor, |
|
embed_dim_to_check: int, |
|
num_heads: int, |
|
in_proj_weight: Tensor, |
|
in_proj_bias: Tensor, |
|
bias_k: Optional[Tensor], |
|
bias_v: Optional[Tensor], |
|
add_zero_attn: bool, |
|
dropout_p: float, |
|
out_proj_weight: Tensor, |
|
out_proj_bias: Tensor, |
|
training: bool = True, |
|
key_padding_mask: Optional[Tensor] = None, |
|
need_weights: bool = True, |
|
attn_mask: Optional[Tensor] = None, |
|
use_separate_proj_weight: bool = False, |
|
q_proj_weight: Optional[Tensor] = None, |
|
k_proj_weight: Optional[Tensor] = None, |
|
v_proj_weight: Optional[Tensor] = None, |
|
static_k: Optional[Tensor] = None, |
|
static_v: Optional[Tensor] = None, |
|
) -> Tuple[Tensor, Optional[Tensor]]: |
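    """Variant of ``torch.nn.functional.multi_head_attention_forward`` that returns the
    per-head attention weights of shape ``(bsz * num_heads, tgt_len, src_len)`` without
    averaging them over heads.
    """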
|
if not torch.jit.is_scripting(): |
|
tens_ops = ( |
|
query, |
|
key, |
|
value, |
|
in_proj_weight, |
|
in_proj_bias, |
|
bias_k, |
|
bias_v, |
|
out_proj_weight, |
|
out_proj_bias, |
|
) |
|
if any([type(t) is not Tensor for t in tens_ops]) and F.has_torch_function( |
|
tens_ops |
|
): |
|
return F.handle_torch_function( |
|
                hooked_torch_func_multi_head_attention_forward,
|
tens_ops, |
|
query, |
|
key, |
|
value, |
|
embed_dim_to_check, |
|
num_heads, |
|
in_proj_weight, |
|
in_proj_bias, |
|
bias_k, |
|
bias_v, |
|
add_zero_attn, |
|
dropout_p, |
|
out_proj_weight, |
|
out_proj_bias, |
|
training=training, |
|
key_padding_mask=key_padding_mask, |
|
need_weights=need_weights, |
|
attn_mask=attn_mask, |
|
use_separate_proj_weight=use_separate_proj_weight, |
|
q_proj_weight=q_proj_weight, |
|
k_proj_weight=k_proj_weight, |
|
v_proj_weight=v_proj_weight, |
|
static_k=static_k, |
|
static_v=static_v, |
|
) |
|
tgt_len, bsz, embed_dim = query.size() |
|
assert embed_dim == embed_dim_to_check |
|
|
|
assert key.size(0) == value.size(0) and key.size(1) == value.size(1) |
|
|
|
head_dim = embed_dim // num_heads |
|
assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" |
|
scaling = float(head_dim) ** -0.5 |
|
|
|
if not use_separate_proj_weight: |
|
if torch.equal(query, key) and torch.equal(key, value): |
|
|
|
q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1) |
|
|
|
elif torch.equal(key, value): |
|
|
|
|
|
_b = in_proj_bias |
|
_start = 0 |
|
_end = embed_dim |
|
_w = in_proj_weight[_start:_end, :] |
|
if _b is not None: |
|
_b = _b[_start:_end] |
|
q = F.linear(query, _w, _b) |
|
|
|
if key is None: |
|
assert value is None |
|
k = None |
|
v = None |
|
else: |
|
|
|
|
|
_b = in_proj_bias |
|
_start = embed_dim |
|
_end = None |
|
_w = in_proj_weight[_start:, :] |
|
if _b is not None: |
|
_b = _b[_start:] |
|
k, v = F.linear(key, _w, _b).chunk(2, dim=-1) |
|
|
|
else: |
|
|
|
_b = in_proj_bias |
|
_start = 0 |
|
_end = embed_dim |
|
_w = in_proj_weight[_start:_end, :] |
|
if _b is not None: |
|
_b = _b[_start:_end] |
|
q = F.linear(query, _w, _b) |
|
|
|
|
|
_b = in_proj_bias |
|
_start = embed_dim |
|
_end = embed_dim * 2 |
|
_w = in_proj_weight[_start:_end, :] |
|
if _b is not None: |
|
_b = _b[_start:_end] |
|
k = F.linear(key, _w, _b) |
|
|
|
|
|
_b = in_proj_bias |
|
_start = embed_dim * 2 |
|
_end = None |
|
_w = in_proj_weight[_start:, :] |
|
if _b is not None: |
|
_b = _b[_start:] |
|
v = F.linear(value, _w, _b) |
|
else: |
|
q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight) |
|
len1, len2 = q_proj_weight_non_opt.size() |
|
assert len1 == embed_dim and len2 == query.size(-1) |
|
|
|
k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight) |
|
len1, len2 = k_proj_weight_non_opt.size() |
|
assert len1 == embed_dim and len2 == key.size(-1) |
|
|
|
v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight) |
|
len1, len2 = v_proj_weight_non_opt.size() |
|
assert len1 == embed_dim and len2 == value.size(-1) |
|
|
|
if in_proj_bias is not None: |
|
q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim]) |
|
k = F.linear( |
|
key, k_proj_weight_non_opt, in_proj_bias[embed_dim : (embed_dim * 2)] |
|
) |
|
v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2) :]) |
|
else: |
|
q = F.linear(query, q_proj_weight_non_opt, in_proj_bias) |
|
k = F.linear(key, k_proj_weight_non_opt, in_proj_bias) |
|
v = F.linear(value, v_proj_weight_non_opt, in_proj_bias) |
|
q = q * scaling |
|
|
|
if attn_mask is not None: |
|
assert ( |
|
attn_mask.dtype == torch.float32 |
|
or attn_mask.dtype == torch.float64 |
|
or attn_mask.dtype == torch.float16 |
|
or attn_mask.dtype == torch.uint8 |
|
or attn_mask.dtype == torch.bool |
|
), "Only float, byte, and bool types are supported for attn_mask, not {}".format( |
|
attn_mask.dtype |
|
) |
|
if attn_mask.dtype == torch.uint8: |
|
warnings.warn( |
|
"Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." |
|
) |
|
attn_mask = attn_mask.to(torch.bool) |
|
|
|
if attn_mask.dim() == 2: |
|
attn_mask = attn_mask.unsqueeze(0) |
|
if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: |
|
raise RuntimeError("The size of the 2D attn_mask is not correct.") |
|
elif attn_mask.dim() == 3: |
|
if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]: |
|
raise RuntimeError("The size of the 3D attn_mask is not correct.") |
|
else: |
|
raise RuntimeError( |
|
"attn_mask's dimension {} is not supported".format(attn_mask.dim()) |
|
) |
|
|
|
|
|
|
|
if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: |
|
warnings.warn( |
|
"Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." |
|
) |
|
key_padding_mask = key_padding_mask.to(torch.bool) |
|
|
|
if bias_k is not None and bias_v is not None: |
|
if static_k is None and static_v is None: |
|
k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) |
|
v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) |
|
if attn_mask is not None: |
|
                attn_mask = F.pad(attn_mask, (0, 1))
|
if key_padding_mask is not None: |
|
                key_padding_mask = F.pad(key_padding_mask, (0, 1))
|
else: |
|
assert static_k is None, "bias cannot be added to static key." |
|
assert static_v is None, "bias cannot be added to static value." |
|
else: |
|
assert bias_k is None |
|
assert bias_v is None |
|
|
|
q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) |
|
if k is not None: |
|
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) |
|
if v is not None: |
|
v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) |
|
|
|
if static_k is not None: |
|
assert static_k.size(0) == bsz * num_heads |
|
assert static_k.size(2) == head_dim |
|
k = static_k |
|
|
|
if static_v is not None: |
|
assert static_v.size(0) == bsz * num_heads |
|
assert static_v.size(2) == head_dim |
|
v = static_v |
|
|
|
src_len = k.size(1) |
|
|
|
if key_padding_mask is not None: |
|
assert key_padding_mask.size(0) == bsz |
|
assert key_padding_mask.size(1) == src_len |
|
|
|
if add_zero_attn: |
|
src_len += 1 |
|
k = torch.cat( |
|
[ |
|
k, |
|
torch.zeros( |
|
(k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device |
|
), |
|
], |
|
dim=1, |
|
) |
|
v = torch.cat( |
|
[ |
|
v, |
|
torch.zeros( |
|
(v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device |
|
), |
|
], |
|
dim=1, |
|
) |
|
if attn_mask is not None: |
|
            attn_mask = F.pad(attn_mask, (0, 1))
|
if key_padding_mask is not None: |
|
            key_padding_mask = F.pad(key_padding_mask, (0, 1))
|
|
|
attn_output_weights = torch.bmm(q, k.transpose(1, 2)) |
|
assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] |
|
|
|
if attn_mask is not None: |
|
if attn_mask.dtype == torch.bool: |
|
attn_output_weights.masked_fill_(attn_mask, float("-inf")) |
|
else: |
|
attn_output_weights += attn_mask |
|
|
|
if key_padding_mask is not None: |
|
attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) |
|
attn_output_weights = attn_output_weights.masked_fill( |
|
key_padding_mask.unsqueeze(1).unsqueeze(2), |
|
float("-inf"), |
|
) |
|
attn_output_weights = attn_output_weights.view( |
|
bsz * num_heads, tgt_len, src_len |
|
) |
|
|
|
attn_output_weights = F.softmax(attn_output_weights, dim=-1) |
|
attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
attn_output = torch.bmm(attn_output_weights, v) |
|
assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] |
|
attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) |
|
attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias) |
|
|
|
if need_weights: |
|
|
|
|
|
|
|
|
|
return attn_output, attn_output_weights |
|
else: |
|
return attn_output, None |
|
|
|
|
|
|
|
def hooked_resblock_timm_forward(self, x: torch.Tensor) -> torch.Tensor: |
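    """timm ``Block.forward`` clone that caches the post-attention features in
    ``self.feat_post_attn`` and the block output in ``self.feat_post_mlp``.
    """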
|
x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) |
|
self.feat_post_attn = x |
|
x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) |
|
self.feat_post_mlp = x |
|
return x |
|
|
|
|
|
|
|
def hooked_attentional_pooler_timm_forward(self, x): |
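    """Forward for timm's ``AttentionPoolLatent`` that takes the explicit attention path
    and caches the pooling attention in ``self.attn_probs``.
    """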
|
B, N, C = x.shape |
|
|
|
if self.pos_embed is not None: |
|
|
|
x = x + self.pos_embed.unsqueeze(0).to(x.dtype) |
|
|
|
q_latent = self.latent.expand(B, -1, -1) |
|
q = ( |
|
self.q(q_latent) |
|
.reshape(B, self.latent_len, self.num_heads, self.head_dim) |
|
.transpose(1, 2) |
|
) |
|
|
|
kv = ( |
|
self.kv(x) |
|
.reshape(B, N, 2, self.num_heads, self.head_dim) |
|
.permute(2, 0, 3, 1, 4) |
|
) |
|
k, v = kv.unbind(0) |
|
|
|
q, k = self.q_norm(q), self.k_norm(k) |
|
|
|
q = q * self.scale |
|
attn = q @ k.transpose(-2, -1) |
|
attn = attn.softmax(dim=-1) |
|
x = attn @ v |
|
|
|
|
|
self.attn_probs = attn |
|
|
|
x = x.transpose(1, 2).reshape(B, self.latent_len, C) |
|
x = self.proj(x) |
|
x = self.proj_drop(x) |
|
|
|
x = x + self.mlp(self.norm(x)) |
|
|
|
|
|
if self.pool == "token": |
|
x = x[:, 0] |
|
elif self.pool == "avg": |
|
x = x.mean(1) |
|
return x |
|
|
|
|
|
|
|
def vit_dynamic_size_forward(self, x: torch.Tensor): |
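    """open_clip ``VisionTransformer`` forward that resamples the positional embedding to
    the current patch grid, so inputs whose resolution differs from the training size work.
    """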
|
x = self.conv1(x) |
|
grid_h, grid_w = x.shape[2:] |
|
x = x.reshape(x.shape[0], x.shape[1], -1) |
|
x = x.permute(0, 2, 1) |
|
|
|
|
|
x = torch.cat( |
|
[_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1 |
|
) |
|
|
|
    # Resample the positional embedding when the token count does not match the current
    # patch grid. ``shape[-2]`` is the token dimension of the (tokens, width) parameter;
    # ``squeeze(0)`` keeps it 2D after resampling so the check stays valid on later calls.
    if x.shape[1] != self.positional_embedding.shape[-2]:
        self.positional_embedding.data = resample_abs_pos_embed(
            self.positional_embedding.unsqueeze(0),
            new_size=[grid_h, grid_w],
            num_prefix_tokens=1,
            interpolation="bicubic",
            antialias=True,
        ).squeeze(0)
|
|
|
x = x + self.positional_embedding.to(x.dtype) |
|
|
|
x = self.patch_dropout(x) |
|
x = self.ln_pre(x) |
|
|
|
x = x.permute(1, 0, 2) |
|
x = self.transformer(x) |
|
x = x.permute(1, 0, 2) |
|
|
|
if self.attn_pool is not None: |
|
if self.attn_pool_contrastive is not None: |
|
|
|
x = self.ln_post(x) |
|
tokens = self.attn_pool(x) |
|
if self.attn_pool_type == "parallel": |
|
pooled = self.attn_pool_contrastive(x) |
|
else: |
|
assert self.attn_pool_type == "cascade" |
|
pooled = self.attn_pool_contrastive(tokens) |
|
else: |
|
|
|
x = self.attn_pool(x) |
|
x = self.ln_post(x) |
|
pooled, tokens = self._global_pool(x) |
|
elif self.final_ln_after_pool: |
|
pooled, tokens = self._global_pool(x) |
|
pooled = self.ln_post(pooled) |
|
else: |
|
x = self.ln_post(x) |
|
pooled, tokens = self._global_pool(x) |
|
|
|
if self.proj is not None: |
|
pooled = pooled @ self.proj |
|
|
|
if self.output_tokens: |
|
return pooled, tokens |
|
|
|
return pooled |
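

# Minimal usage sketch: the hooked forwards above are meant to be bound onto an existing
# open_clip model with ``types.MethodType`` so that attention maps and intermediate features
# can be read off the modules after a forward pass. The helper name ``apply_hooks`` and the
# ``model.visual.transformer.resblocks`` layout are illustrative assumptions, not part of this
# module's API; the blocks must expose the attribute names used above (``norm1``, ``ls1``,
# ``mlp``, and an ``attn`` with a fused ``in_proj_weight``), otherwise use the timm variants.
def apply_hooks(model):
    import types

    visual = model.visual
    # dynamic-resolution forward for the vision tower
    visual.forward = types.MethodType(vit_dynamic_size_forward, visual)
    for block in visual.transformer.resblocks:
        # cache per-block token features and per-head attention maps
        block.forward = types.MethodType(hooked_resblock_forward, block)
        block.attn.forward = types.MethodType(hooked_attention_forward, block.attn)
    return model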
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def min_max(logits): |
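    """Min-max normalize each heatmap in ``logits`` (shape ``(B, num_prompt, ...)``) to [0, 1]."""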
|
B, num_prompt = logits.shape[:2] |
|
logits_min = ( |
|
logits.reshape(B, num_prompt, -1).min(dim=-1, keepdim=True)[0].unsqueeze(-1) |
|
) |
|
logits_max = ( |
|
logits.reshape(B, num_prompt, -1).max(dim=-1, keepdim=True)[0].unsqueeze(-1) |
|
) |
|
logits = (logits - logits_min) / (logits_max - logits_min) |
|
return logits |
|
|
|
|
|
def visualize(image, heatmaps, alpha=0.6, save_path: Optional[Path] = None):
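    """Display ``image`` and then each heatmap in ``heatmaps`` blended over it with a JET colormap.

    ``heatmaps`` is expected to be normalized to [0, 1] (see ``min_max``) with spatial shape
    ``(..., H, W)``; ``alpha`` controls the blend and ``save_path`` optionally writes the overlay.
    """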
|
|
|
    H, W = heatmaps.shape[-2:]  # heatmaps are (..., H, W)
|
if isinstance(image, Image.Image): |
|
image = image.resize((W, H)) |
|
elif isinstance(image, torch.Tensor): |
|
if image.ndim > 3: |
|
image = image.squeeze(0) |
|
|
|
image_unormed = ( |
|
image.detach().cpu() * torch.Tensor(OPENAI_DATASET_STD)[:, None, None] |
|
) + torch.Tensor(OPENAI_DATASET_MEAN)[:, None, None] |
|
|
|
image = Image.fromarray( |
|
(image_unormed.permute(1, 2, 0).numpy() * 255).astype("uint8") |
|
) |
|
else: |
|
raise f"image should be either of type PIL.Image.Image or torch.Tensor but found {type(image)}" |
|
|
|
|
|
plt.imshow(image) |
|
plt.axis("off") |
|
plt.tight_layout() |
|
plt.show() |
|
|
|
if heatmaps.ndim > 3: |
|
heatmaps = heatmaps.squeeze(0) |
|
heatmaps = heatmaps.detach().cpu().numpy() |
|
|
|
img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) |
|
heatmaps = (heatmaps * 255).astype("uint8") |
|
heat_maps = [cv2.applyColorMap(logit, cv2.COLORMAP_JET) for logit in heatmaps] |
|
|
|
vizs = [(1 - alpha) * img_cv + alpha * heat_map for heat_map in heat_maps] |
|
for i, viz in enumerate(vizs): |
|
viz = cv2.cvtColor(viz.astype("uint8"), cv2.COLOR_BGR2RGB) |
|
plt.imshow(viz) |
|
plt.axis("off") |
|
plt.tight_layout() |
|
|
|
        plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
        if save_path is not None:
            # save before show(): show() may close the figure, leaving savefig blank
            plt.savefig(save_path, bbox_inches="tight", pad_inches=0)
            print(f"Saved visualization at {save_path}")
        plt.show()
|
|
|
|
|
def list_pretrained(): |
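    """Print and return the ``open_clip`` pretrained ``(model_name, tag)`` pairs, skipping
    the ResNet ("RN") and ConvNeXt checkpoints that these hooks do not support.
    """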
|
openclip_list_ = open_clip.list_pretrained() |
|
    filtered_list = []
|
unsupported_models = [ |
|
"RN", |
|
"convnext", |
|
] |
|
    _str = f"{'model_name':<25}: pretrained\n"
|
for model_name, pretrained in openclip_list_: |
|
for unsup_model in unsupported_models: |
|
if unsup_model in model_name: |
|
skip = True |
|
break |
|
else: |
|
skip = False |
|
if not skip: |
|
filtered_list.append((model_name, pretrained)) |
|
            _str += f"{model_name:<25}: {pretrained}\n"
|
|
|
print(_str) |
|
return filtered_list |
|
|
|
|
|
if __name__ == "__main__": |
|
list_pretrained() |
|
|