Upload M3DCLIP

Browse files

Files changed (5) hide show

README.md +3 -3
config.json +35 -35
configuration_m3d_clip.py +42 -42
model.safetensors +1 -1
modeling_m3d_clip.py +225 -225

README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 ---
 license: apache-2.0
-metrics:
-- accuracy
-pipeline_tag: image-feature-extraction
 tags:
 - 3D medical CLIP
 - Image-text retrieval
 ---
 M3D-CLIP is one of the works in the [M3D](https://github.com/BAAI-DCAI/M3D) series.

 ---
 license: apache-2.0
 tags:
 - 3D medical CLIP
 - Image-text retrieval
+metrics:
+- accuracy
+pipeline_tag: image-feature-extraction
 ---
 M3D-CLIP is one of the works in the [M3D](https://github.com/BAAI-DCAI/M3D) series.

config.json CHANGED Viewed

@@ -1,35 +1,35 @@
-{
-  "architectures": [
-    "M3DCLIP"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_m3d_clip.M3DCLIPConfig",
-    "AutoModel": "modeling_m3d_clip.M3DCLIP"
-  },
-  "dropout_rate": 0,
-  "gather_loss": true,
-  "hidden_size": 768,
-  "img_size": [
-    32,
-    256,
-    256
-  ],
-  "in_channels": 1,
-  "language_model_name_or_path": "bert-base-uncased",
-  "local_loss": false,
-  "max_text_len": 128,
-  "mlp_dim": 3072,
-  "model_type": "m3d_clip",
-  "num_heads": 12,
-  "num_layers": 12,
-  "patch_size": [
-    4,
-    16,
-    16
-  ],
-  "pos_embed": "perceptron",
-  "spatial_dims": 3,
-  "torch_dtype": "float32",
-  "transformers_version": "4.40.1",
-  "vocab_size": 30522
-}

+{
+  "architectures": [
+    "M3DCLIP"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_m3d_clip.M3DCLIPConfig",
+    "AutoModel": "modeling_m3d_clip.M3DCLIP"
+  },
+  "dropout_rate": 0,
+  "gather_loss": true,
+  "hidden_size": 768,
+  "img_size": [
+    32,
+    256,
+    256
+  ],
+  "in_channels": 1,
+  "language_model_name_or_path": "bert-base-uncased",
+  "local_loss": false,
+  "max_text_len": 128,
+  "mlp_dim": 3072,
+  "model_type": "m3d_clip",
+  "num_heads": 12,
+  "num_layers": 12,
+  "patch_size": [
+    4,
+    16,
+    16
+  ],
+  "pos_embed": "perceptron",
+  "spatial_dims": 3,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
+  "vocab_size": 30522
+}

configuration_m3d_clip.py CHANGED Viewed

@@ -1,42 +1,42 @@
-from transformers import PretrainedConfig
-class M3DCLIPConfig(PretrainedConfig):
-    model_type = "m3d_clip"
-    def __init__(
-        self,
-        language_model_name_or_path: str = 'bert-base-uncased',
-        local_loss: bool = False,
-        gather_loss: bool = True,
-        in_channels: int = 1,
-        img_size: tuple = (32, 256, 256),
-        patch_size: tuple = (4, 16, 16),
-        hidden_size: int = 768,
-        mlp_dim: int = 3072,
-        num_layers: int = 12,
-        num_heads: int = 12,
-        pos_embed: str = "perceptron",
-        dropout_rate: float = 0,
-        spatial_dims: int = 3,
-        max_text_len: int = 128,
-        vocab_size: int = 30522,
-        **kwargs,
-    ):
-        self.language_model_name_or_path = language_model_name_or_path
-        self.in_channels = in_channels
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.hidden_size = hidden_size
-        self.mlp_dim = mlp_dim
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.pos_embed = pos_embed
-        self.dropout_rate = dropout_rate
-        self.spatial_dims = spatial_dims
-        self.local_loss = local_loss
-        self.gather_loss = gather_loss
-        self.max_text_len = max_text_len
-        self.vocab_size = vocab_size
-        super().__init__(**kwargs)

+from transformers import PretrainedConfig
+class M3DCLIPConfig(PretrainedConfig):
+    model_type = "m3d_clip"
+    def __init__(
+        self,
+        language_model_name_or_path: str = 'bert-base-uncased',
+        local_loss: bool = False,
+        gather_loss: bool = True,
+        in_channels: int = 1,
+        img_size: tuple = (32, 256, 256),
+        patch_size: tuple = (4, 16, 16),
+        hidden_size: int = 768,
+        mlp_dim: int = 3072,
+        num_layers: int = 12,
+        num_heads: int = 12,
+        pos_embed: str = "perceptron",
+        dropout_rate: float = 0,
+        spatial_dims: int = 3,
+        max_text_len: int = 128,
+        vocab_size: int = 30522,
+        **kwargs,
+    ):
+        self.language_model_name_or_path = language_model_name_or_path
+        self.in_channels = in_channels
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.hidden_size = hidden_size
+        self.mlp_dim = mlp_dim
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.pos_embed = pos_embed
+        self.dropout_rate = dropout_rate
+        self.spatial_dims = spatial_dims
+        self.local_loss = local_loss
+        self.gather_loss = gather_loss
+        self.max_text_len = max_text_len
+        self.vocab_size = vocab_size
+        super().__init__(**kwargs)

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6973db8a3018c033117b5f3fe9ff8ca3c6842532993e2c5723e892fcd18a235e
 size 792251956

 version https://git-lfs.github.com/spec/v1
+oid sha256:59abe6b2ad8b7cbab4d2026fcd2b7e0fe8f0d70d17ad5ede819b4100aa59b860
 size 792251956

modeling_m3d_clip.py CHANGED Viewed

@@ -1,225 +1,225 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import PreTrainedModel
-from collections.abc import Sequence
-from monai.networks.blocks.patchembedding import PatchEmbeddingBlock
-from monai.networks.blocks.transformerblock import TransformerBlock
-try:
-    import torch.distributed.nn
-    from torch import distributed as dist
-    has_distributed = True
-except ImportError:
-    has_distributed = False
-from .configuration_m3d_clip import M3DCLIPConfig
-from transformers import BertModel, BertConfig
-def gather_features(
-        image_features,
-        text_features,
-        local_loss=False,
-        gather_with_grad=True,
-        rank=0,
-        world_size=1,
-):
-    assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
-    # We gather tensors from all gpus
-    if gather_with_grad:
-        all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
-        all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
-    else:
-        gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
-        gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
-        dist.all_gather(gathered_image_features, image_features)
-        dist.all_gather(gathered_text_features, text_features)
-        if not local_loss:
-            # ensure grads for local rank when all_* features don't have a gradient
-            gathered_image_features[rank] = image_features
-            gathered_text_features[rank] = text_features
-        all_image_features = torch.cat(gathered_image_features, dim=0)
-        all_text_features = torch.cat(gathered_text_features, dim=0)
-    return all_image_features, all_text_features
-class ViT(nn.Module):
-    """
-    Vision Transformer (ViT), based on: "Dosovitskiy et al.,
-    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
-    ViT supports Torchscript but only works for Pytorch after 1.8.
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        img_size: Sequence[int] | int,
-        patch_size: Sequence[int] | int,
-        hidden_size: int = 768,
-        mlp_dim: int = 3072,
-        num_layers: int = 12,
-        num_heads: int = 12,
-        pos_embed: str = "conv",
-        classification: bool = False,
-        num_classes: int = 2,
-        dropout_rate: float = 0.0,
-        spatial_dims: int = 3,
-        post_activation="Tanh",
-        qkv_bias: bool = False,
-        save_attn: bool = False,
-    ) -> None:
-        """
-        Args:
-            in_channels (int): dimension of input channels.
-            img_size (Union[Sequence[int], int]): dimension of input image.
-            patch_size (Union[Sequence[int], int]): dimension of patch size.
-            hidden_size (int, optional): dimension of hidden layer. Defaults to 768.
-            mlp_dim (int, optional): dimension of feedforward layer. Defaults to 3072.
-            num_layers (int, optional): number of transformer blocks. Defaults to 12.
-            num_heads (int, optional): number of attention heads. Defaults to 12.
-            pos_embed (str, optional): position embedding layer type. Defaults to "conv".
-            classification (bool, optional): bool argument to determine if classification is used. Defaults to False.
-            num_classes (int, optional): number of classes if classification is used. Defaults to 2.
-            dropout_rate (float, optional): faction of the input units to drop. Defaults to 0.0.
-            spatial_dims (int, optional): number of spatial dimensions. Defaults to 3.
-            post_activation (str, optional): add a final acivation function to the classification head
-                when `classification` is True. Default to "Tanh" for `nn.Tanh()`.
-                Set to other values to remove this function.
-            qkv_bias (bool, optional): apply bias to the qkv linear layer in self attention block. Defaults to False.
-            save_attn (bool, optional): to make accessible the attention in self attention block. Defaults to False.
-        Examples::
-            # for single channel input with image size of (96,96,96), conv position embedding and segmentation backbone
-            >>> net = ViT(in_channels=1, img_size=(96,96,96), pos_embed='conv')
-            # for 3-channel with image size of (128,128,128), 24 layers and classification backbone
-            >>> net = ViT(in_channels=3, img_size=(128,128,128), pos_embed='conv', classification=True)
-            # for 3-channel with image size of (224,224), 12 layers and classification backbone
-            >>> net = ViT(in_channels=3, img_size=(224,224), pos_embed='conv', classification=True, spatial_dims=2)
-        """
-        super().__init__()
-        if not (0 <= dropout_rate <= 1):
-            raise ValueError("dropout_rate should be between 0 and 1.")
-        if hidden_size % num_heads != 0:
-            raise ValueError("hidden_size should be divisible by num_heads.")
-        self.hidden_size = hidden_size
-        self.classification = classification
-        self.patch_embedding = PatchEmbeddingBlock(
-            in_channels=in_channels,
-            img_size=img_size,
-            patch_size=patch_size,
-            hidden_size=hidden_size,
-            num_heads=num_heads,
-            pos_embed=pos_embed,
-            dropout_rate=dropout_rate,
-            spatial_dims=spatial_dims,
-        )
-        self.blocks = nn.ModuleList(
-            [
-                TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate, qkv_bias, save_attn)
-                for i in range(num_layers)
-            ]
-        )
-        self.norm = nn.LayerNorm(hidden_size)
-        if self.classification:
-            self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
-            # if post_activation == "Tanh":
-            #     self.classification_head = nn.Sequential(nn.Linear(hidden_size, num_classes), nn.Tanh())
-            # else:
-            #     self.classification_head = nn.Linear(hidden_size, num_classes)  # type: ignore
-    def forward(self, x):
-        x = self.patch_embedding(x)
-        if hasattr(self, "cls_token"):
-            cls_token = self.cls_token.expand(x.shape[0], -1, -1)
-            x = torch.cat((cls_token, x), dim=1)
-        hidden_states_out = []
-        for blk in self.blocks:
-            x = blk(x)
-            hidden_states_out.append(x)
-        x = self.norm(x)
-        # if hasattr(self, "classification_head"):
-        #     x = self.classification_head(x[:, 0])
-        return x, hidden_states_out
-class M3DCLIP(PreTrainedModel):
-    config_class = M3DCLIPConfig
-    def __init__(self, config):
-        super().__init__(config)
-        self.vision_encoder = ViT(
-            in_channels=config.in_channels,
-            img_size=config.img_size,
-            patch_size=config.patch_size,
-            hidden_size=config.hidden_size,
-            mlp_dim=config.mlp_dim,
-            num_layers=config.num_layers,
-            num_heads=config.num_heads,
-            pos_embed=config.pos_embed,
-            dropout_rate=config.dropout_rate,
-            spatial_dims=config.spatial_dims,
-            classification=True,
-        )
-        # configuration = BertConfig()
-        # self.language_encoder = BertModel(configuration)
-        self.language_encoder = BertModel.from_pretrained(config.language_model_name_or_path)
-        self.mm_vision_proj = nn.Linear(config.hidden_size, config.hidden_size)
-        self.mm_language_proj = nn.Linear(config.hidden_size, config.hidden_size)
-        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
-        self.local_loss = config.local_loss
-        self.gather_loss = config.gather_loss
-    def encode_image(self, image):
-        image_feats, _ = self.vision_encoder(image)
-        image_feats = self.mm_vision_proj(image_feats)
-        image_feats = F.normalize(image_feats, dim=-1)
-        return image_feats
-    def encode_text(self, input_id, attention_mask):
-        text_feats = self.language_encoder(input_id, attention_mask=attention_mask)["last_hidden_state"]
-        text_feats = self.mm_language_proj(text_feats)
-        text_feats = F.normalize(text_feats, dim=-1)
-        return text_feats
-    def forward(self, images, input_ids, attention_mask, labels, **kwargs):
-        image_features = self.encode_image(images)[:, 0]
-        text_features = self.encode_text(input_ids, attention_mask)[:, 0]
-        if self.gather_loss:
-            all_image_features, all_text_features = gather_features(image_features, text_features)
-            if self.local_loss:
-                logits_per_image = self.logit_scale * image_features @ all_text_features.T
-                logits_per_text = self.logit_scale * text_features @ all_image_features.T
-            else:
-                logits_per_image = self.logit_scale * all_image_features @ all_text_features.T
-                logits_per_text = logits_per_image.T
-        else:
-            logits_per_image = self.logit_scale * image_features @ text_features.T
-            logits_per_text = self.logit_scale * text_features @ image_features.T
-        loss = (
-                           F.cross_entropy(logits_per_image, labels) +
-                           F.cross_entropy(logits_per_text, labels)
-                   ) / 2
-        ret = {
-            "loss": loss,
-            "logits": (logits_per_image + logits_per_text) / 2.0,
-        }
-        return ret

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from collections.abc import Sequence
+from monai.networks.blocks.patchembedding import PatchEmbeddingBlock
+from monai.networks.blocks.transformerblock import TransformerBlock
+try:
+    import torch.distributed.nn
+    from torch import distributed as dist
+    has_distributed = True
+except ImportError:
+    has_distributed = False
+from .configuration_m3d_clip import M3DCLIPConfig
+from transformers import BertModel, BertConfig
+def gather_features(
+        image_features,
+        text_features,
+        local_loss=False,
+        gather_with_grad=True,
+        rank=0,
+        world_size=1,
+):
+    assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
+    # We gather tensors from all gpus
+    if gather_with_grad:
+        all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
+        all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
+    else:
+        gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
+        gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
+        dist.all_gather(gathered_image_features, image_features)
+        dist.all_gather(gathered_text_features, text_features)
+        if not local_loss:
+            # ensure grads for local rank when all_* features don't have a gradient
+            gathered_image_features[rank] = image_features
+            gathered_text_features[rank] = text_features
+        all_image_features = torch.cat(gathered_image_features, dim=0)
+        all_text_features = torch.cat(gathered_text_features, dim=0)
+    return all_image_features, all_text_features
+class ViT(nn.Module):
+    """
+    Vision Transformer (ViT), based on: "Dosovitskiy et al.,
+    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
+    ViT supports Torchscript but only works for Pytorch after 1.8.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        img_size: Sequence[int] | int,
+        patch_size: Sequence[int] | int,
+        hidden_size: int = 768,
+        mlp_dim: int = 3072,
+        num_layers: int = 12,
+        num_heads: int = 12,
+        pos_embed: str = "conv",
+        classification: bool = False,
+        num_classes: int = 2,
+        dropout_rate: float = 0.0,
+        spatial_dims: int = 3,
+        post_activation="Tanh",
+        qkv_bias: bool = False,
+        save_attn: bool = False,
+    ) -> None:
+        """
+        Args:
+            in_channels (int): dimension of input channels.
+            img_size (Union[Sequence[int], int]): dimension of input image.
+            patch_size (Union[Sequence[int], int]): dimension of patch size.
+            hidden_size (int, optional): dimension of hidden layer. Defaults to 768.
+            mlp_dim (int, optional): dimension of feedforward layer. Defaults to 3072.
+            num_layers (int, optional): number of transformer blocks. Defaults to 12.
+            num_heads (int, optional): number of attention heads. Defaults to 12.
+            pos_embed (str, optional): position embedding layer type. Defaults to "conv".
+            classification (bool, optional): bool argument to determine if classification is used. Defaults to False.
+            num_classes (int, optional): number of classes if classification is used. Defaults to 2.
+            dropout_rate (float, optional): faction of the input units to drop. Defaults to 0.0.
+            spatial_dims (int, optional): number of spatial dimensions. Defaults to 3.
+            post_activation (str, optional): add a final acivation function to the classification head
+                when `classification` is True. Default to "Tanh" for `nn.Tanh()`.
+                Set to other values to remove this function.
+            qkv_bias (bool, optional): apply bias to the qkv linear layer in self attention block. Defaults to False.
+            save_attn (bool, optional): to make accessible the attention in self attention block. Defaults to False.
+        Examples::
+            # for single channel input with image size of (96,96,96), conv position embedding and segmentation backbone
+            >>> net = ViT(in_channels=1, img_size=(96,96,96), pos_embed='conv')
+            # for 3-channel with image size of (128,128,128), 24 layers and classification backbone
+            >>> net = ViT(in_channels=3, img_size=(128,128,128), pos_embed='conv', classification=True)
+            # for 3-channel with image size of (224,224), 12 layers and classification backbone
+            >>> net = ViT(in_channels=3, img_size=(224,224), pos_embed='conv', classification=True, spatial_dims=2)
+        """
+        super().__init__()
+        if not (0 <= dropout_rate <= 1):
+            raise ValueError("dropout_rate should be between 0 and 1.")
+        if hidden_size % num_heads != 0:
+            raise ValueError("hidden_size should be divisible by num_heads.")
+        self.hidden_size = hidden_size
+        self.classification = classification
+        self.patch_embedding = PatchEmbeddingBlock(
+            in_channels=in_channels,
+            img_size=img_size,
+            patch_size=patch_size,
+            hidden_size=hidden_size,
+            num_heads=num_heads,
+            pos_embed=pos_embed,
+            dropout_rate=dropout_rate,
+            spatial_dims=spatial_dims,
+        )
+        self.blocks = nn.ModuleList(
+            [
+                TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate, qkv_bias, save_attn)
+                for i in range(num_layers)
+            ]
+        )
+        self.norm = nn.LayerNorm(hidden_size)
+        if self.classification:
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
+            # if post_activation == "Tanh":
+            #     self.classification_head = nn.Sequential(nn.Linear(hidden_size, num_classes), nn.Tanh())
+            # else:
+            #     self.classification_head = nn.Linear(hidden_size, num_classes)  # type: ignore
+    def forward(self, x):
+        x = self.patch_embedding(x)
+        if hasattr(self, "cls_token"):
+            cls_token = self.cls_token.expand(x.shape[0], -1, -1)
+            x = torch.cat((cls_token, x), dim=1)
+        hidden_states_out = []
+        for blk in self.blocks:
+            x = blk(x)
+            hidden_states_out.append(x)
+        x = self.norm(x)
+        # if hasattr(self, "classification_head"):
+        #     x = self.classification_head(x[:, 0])
+        return x, hidden_states_out
+class M3DCLIP(PreTrainedModel):
+    config_class = M3DCLIPConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.vision_encoder = ViT(
+            in_channels=config.in_channels,
+            img_size=config.img_size,
+            patch_size=config.patch_size,
+            hidden_size=config.hidden_size,
+            mlp_dim=config.mlp_dim,
+            num_layers=config.num_layers,
+            num_heads=config.num_heads,
+            pos_embed=config.pos_embed,
+            dropout_rate=config.dropout_rate,
+            spatial_dims=config.spatial_dims,
+            classification=True,
+        )
+        # configuration = BertConfig()
+        # self.language_encoder = BertModel(configuration)
+        self.language_encoder = BertModel.from_pretrained(config.language_model_name_or_path)
+        self.mm_vision_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.mm_language_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.local_loss = config.local_loss
+        self.gather_loss = config.gather_loss
+    def encode_image(self, image):
+        image_feats, _ = self.vision_encoder(image)
+        image_feats = self.mm_vision_proj(image_feats)
+        image_feats = F.normalize(image_feats, dim=-1)
+        return image_feats
+    def encode_text(self, input_id, attention_mask):
+        text_feats = self.language_encoder(input_id, attention_mask=attention_mask)["last_hidden_state"]
+        text_feats = self.mm_language_proj(text_feats)
+        text_feats = F.normalize(text_feats, dim=-1)
+        return text_feats
+    def forward(self, images, input_ids, attention_mask, labels, **kwargs):
+        image_features = self.encode_image(images)[:, 0]
+        text_features = self.encode_text(input_ids, attention_mask)[:, 0]
+        if self.gather_loss:
+            all_image_features, all_text_features = gather_features(image_features, text_features)
+            if self.local_loss:
+                logits_per_image = self.logit_scale * image_features @ all_text_features.T
+                logits_per_text = self.logit_scale * text_features @ all_image_features.T
+            else:
+                logits_per_image = self.logit_scale * all_image_features @ all_text_features.T
+                logits_per_text = logits_per_image.T
+        else:
+            logits_per_image = self.logit_scale * image_features @ text_features.T
+            logits_per_text = self.logit_scale * text_features @ image_features.T
+        loss = (
+                           F.cross_entropy(logits_per_image, labels) +
+                           F.cross_entropy(logits_per_text, labels)
+                   ) / 2
+        ret = {
+            "loss": loss,
+            "logits": (logits_per_image + logits_per_text) / 2.0,
+        }
+        return ret