|
import torch |
|
import torch.nn as nn |
|
|
|
from . import register_connector |
|
from .base import Connector |
|
|
|
|
|
|
|
|
|
class MoFMLP(nn.Module):
    """Project two vision feature streams and interleave them token by token.

    Each stream has its own two-layer MLP (``Linear -> GELU -> Linear``)
    mapping ``config.vision_hidden_size`` to ``config.hidden_size``. The two
    projected sequences are then woven together along the sequence axis.
    """

    def __init__(self, config):
        """Build the two projection MLPs.

        Args:
            config: Object exposing ``vision_hidden_size`` (input width of
                both streams) and ``hidden_size`` (output width).
        """
        super().__init__()

        def build_projector():
            # Both branches share the same architecture but not weights.
            return nn.Sequential(
                nn.Linear(config.vision_hidden_size, config.hidden_size),
                nn.GELU(),
                nn.Linear(config.hidden_size, config.hidden_size),
            )

        # Attribute names are part of the state_dict keys; keep them stable.
        self.clip = build_projector()
        self.dinov2 = build_projector()

    def forward(self, x):
        """Project and interleave the two feature streams.

        Args:
            x: Indexable pair; ``x[0]`` is fed to the ``clip`` branch and
                ``x[1]`` to the ``dinov2`` branch. Both are assumed to be
                ``(batch, seq_len, vision_hidden_size)`` tensors -- TODO
                confirm against the caller.

        Returns:
            Tensor of shape ``(batch, len_clip + len_dinov2, hidden_size)``
            on ``x[0]``'s device and with ``x[0]``'s dtype. Even sequence
            positions hold the ``clip`` projections and odd positions hold
            the ``dinov2`` projections.
        """
        image_features_clip = self.clip(x[0])
        image_features_dinov2 = self.dinov2(x[1])

        bs = image_features_clip.size(0)
        total_len = image_features_clip.size(1) + image_features_dinov2.size(1)
        dim = image_features_clip.size(-1)

        # Allocate directly on the target device/dtype instead of creating a
        # default (CPU, float32) tensor first and then converting it.
        merged_features = torch.empty(
            bs, total_len, dim, device=x[0].device, dtype=x[0].dtype
        )
        # Strided slice assignment interleaves the streams: clip, dinov2, ...
        merged_features[:, 0::2] = image_features_clip
        merged_features[:, 1::2] = image_features_dinov2

        return merged_features
|
|
|
|
|
|
|
|
|
@register_connector('mof_mlp')
class MoFMLPConnector(Connector):
    """Connector registered under the name ``'mof_mlp'``.

    Wraps a ``MoFMLP`` module and stores it on ``self._connector``.
    """

    def __init__(self, config):
        """Create the wrapped ``MoFMLP`` from ``config``.

        Args:
            config: Passed unchanged to ``MoFMLP``.
        """
        # The base class is initialised without arguments, as in the original.
        super().__init__()

        projector = MoFMLP(config)
        self._connector = projector
|
|