upload model
Browse files
- README.md +49 -0
- config.json +43 -0
- configuration_adaptformer.py +80 -0
- model.safetensors +3 -0
- modeling_adaptformer.py +647 -0
- preprocessing_adaptformer.py +99 -0
- preprocessor_config.json +23 -0
README.md
CHANGED
@@ -1,3 +1,52 @@
---
license: mit
tags:
- vision
- image-segmentation
datasets:
- LEVIR-CD
---
# AdaptFormer model fine-tuned on LEVIR-CD

AdaptFormer model fine-tuned on LEVIR-CD at resolution 512x512. It was introduced in the paper [AdaptFormer: An Adaptive Hierarchical Semantic Approach for Change Detection on Remote Sensing Images](https://ieeexplore.ieee.org/document/10497147) by Pang et al. and first released in [this repository](https://github.com/aigzhusmart/AdaptFormer).

## Model description

AdaptFormer is designed to adaptively interpret hierarchical semantics. Instead of a one-size-fits-all approach, it strategizes differently across three semantic depths: straightforward operations for shallow semantics, assimilation of spatial data for medium semantics to emphasize detailed inter-regional changes, and cascaded depthwise attention for in-depth semantics, focusing on high-level representations.

Here is how to use this model to detect changes between a pair of images:

```python
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests
import torch

image_processor = AutoImageProcessor.from_pretrained("deepang/adaptformer-LEVIR-CD", trust_remote_code=True)
model = AutoModel.from_pretrained("deepang/adaptformer-LEVIR-CD", trust_remote_code=True)

image_A = Image.open(requests.get('https://raw.githubusercontent.com/aigzhusmart/AdaptFormer/main/figures/test_2_1_A.png', stream=True).raw)
image_B = Image.open(requests.get('https://raw.githubusercontent.com/aigzhusmart/AdaptFormer/main/figures/test_2_1_B.png', stream=True).raw)
label = Image.open(requests.get('https://raw.githubusercontent.com/aigzhusmart/AdaptFormer/main/figures/test_2_1_label.png', stream=True).raw)

inputs = image_processor(images=(image_A, image_B), return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # shape (batch_size, num_labels, height, width)
pred = logits.argmax(dim=1)[0]
```
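
To inspect the result, the predicted mask can be saved as an image and roughly compared against the downloaded `label`. This is a minimal sketch, assuming the LEVIR-CD label is a binary mask whose non-zero pixels mark changed areas:

```python
import numpy as np
from PIL import Image

# Class 1 is "change"; scale to 0/255 so the mask is viewable as a grayscale image.
pred_mask = Image.fromarray((pred.numpy() * 255).astype(np.uint8))
pred_mask.save("predicted_change_mask.png")

# Rough pixel-level agreement with the reference label, resized to the prediction size.
label_binary = (np.array(label.convert("L").resize(pred_mask.size)) > 127).astype(np.uint8)
print(f"Pixel agreement with label: {(label_binary == pred.numpy()).mean():.3f}")
```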

### License

The license for this model can be found [here](https://github.com/aigzhusmart/AdaptFormer).

### BibTeX entry and citation info

```bibtex
@article{huang2024adaptformer,
  title={AdaptFormer: An Adaptive Hierarchical Semantic Approach for Change Detection on Remote Sensing Images},
  author={Huang, Teng and Hong, Yile and Pang, Yan and Liang, Jiaming and Hong, Jie and Huang, Lin and Zhang, Yuan and Jia, Yan and Savi, Patrizia},
  journal={IEEE Transactions on Instrumentation and Measurement},
  year={2024},
  publisher={IEEE}
}
```
config.json
ADDED
@@ -0,0 +1,43 @@
{
  "architectures": [
    "AdaptFormerForChangeDetection"
  ],
  "auto_map": {
    "AutoConfig": "configuration_adaptformer.AdaptFormerConfig",
    "AutoModel": "modeling_adaptformer.AdaptFormerForChangeDetection",
    "AutoImageProcessor": "preprocessing_adaptformer.AdaptFormerImageProcessor"
  },
  "depths": [
    3,
    3,
    3
  ],
  "embed_dims": [
    64,
    128,
    256
  ],
  "initializer_range": 0.02,
  "mlp_ratios": [
    4,
    4,
    4
  ],
  "model_type": "adaptformer",
  "num_channels": 3,
  "num_classes": 2,
  "num_heads": [
    1,
    2,
    4
  ],
  "semantic_loss_ignore_index": 255,
  "semantic_loss_weight": [
    0,
    0,
    0.5,
    1
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.39.3"
}
configuration_adaptformer.py
ADDED
@@ -0,0 +1,80 @@
""" AdaptFormer model configuration"""

from transformers import PretrainedConfig


class AdaptFormerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AdaptFormerForChangeDetection`].
    It is used to instantiate an AdaptFormer model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield a similar
    configuration to that of the AdaptFormer
    [deepang/adaptformer-LEVIR-CD](https://huggingface.co/deepang/adaptformer-LEVIR-CD)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        num_classes (`int`, *optional*, defaults to 2):
            The number of classes.
        embed_dims (`List[int]`, *optional*, defaults to `[64, 128, 256]`):
            Dimension of each of the encoder blocks.
        num_heads (`List[int]`, *optional*, defaults to `[1, 2, 4]`):
            Number of attention heads for each attention layer in each block of the encoder.
        mlp_ratios (`List[int]`, *optional*, defaults to `[4, 4, 4]`):
            Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
            encoder blocks.
        depths (`List[int]`, *optional*, defaults to `[3, 3, 3]`):
            The number of layers in each encoder block.
        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
            The index that is ignored by the loss function of the semantic segmentation model.
        semantic_loss_weight (`List[float]`, *optional*, defaults to `[0, 0, 0.5, 1]`):
            The weight of the semantic segmentation loss.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example:

    ```python
    >>> from transformers import AutoModel, AutoConfig

    >>> # Initializing an AdaptFormer configuration
    >>> configuration = AutoConfig.from_pretrained("deepang/adaptformer-LEVIR-CD", trust_remote_code=True)

    >>> # Initializing a model from the deepang/adaptformer-LEVIR-CD style configuration
    >>> model = AutoModel.from_config(configuration, trust_remote_code=True)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "adaptformer"

    def __init__(
        self,
        num_channels=3,
        num_classes=2,
        embed_dims=[64, 128, 256],
        num_heads=[1, 2, 4],
        mlp_ratios=[4, 4, 4],
        depths=[3, 3, 3],
        semantic_loss_ignore_index=255,
        semantic_loss_weight=[0, 0, 0.5, 1],
        initializer_range=0.02,
        **kwargs,
    ):
        self.num_channels = num_channels
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.mlp_ratios = mlp_ratios
        self.depths = depths
        self.num_classes = num_classes
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
        self.semantic_loss_weight = semantic_loss_weight
        self.initializer_range = initializer_range

        super().__init__(**kwargs)
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:32a543900a391fcb9b974956cfc96811261f7c6ad4b6393e6907910d99b42e04
size 50178960
modeling_adaptformer.py
ADDED
@@ -0,0 +1,647 @@
""" PyTorch AdaptFormer model."""

import itertools
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops.layers.torch import Rearrange
from transformers import PreTrainedModel
from transformers.modeling_outputs import SemanticSegmenterOutput

from .configuration_adaptformer import AdaptFormerConfig


class SpatialExchange(nn.Module):
    # Swaps every p-th column between the two temporal feature maps so that each
    # branch sees spatial context from the other image.

    def __init__(self, p=1 / 2):
        super().__init__()
        assert p >= 0 and p <= 1
        self.p = int(1 / p)

    def forward(self, x1: torch.Tensor, x2: torch.Tensor):
        _, _, _, w = x1.shape
        exchange_mask = torch.arange(w) % self.p == 0

        out_x1 = torch.zeros_like(x1, device=x1.device)
        out_x2 = torch.zeros_like(x2, device=x1.device)
        out_x1[..., ~exchange_mask] = x1[..., ~exchange_mask]
        out_x2[..., ~exchange_mask] = x2[..., ~exchange_mask]
        out_x1[..., exchange_mask] = x2[..., exchange_mask]
        out_x2[..., exchange_mask] = x1[..., exchange_mask]

        return out_x1, out_x2


class ChannelExchange(nn.Module):
    # Swaps every p-th channel between the two temporal feature maps.

    def __init__(self, p=1 / 2):
        super().__init__()
        assert p >= 0 and p <= 1
        self.p = int(1 / p)

    def forward(self, x1: torch.Tensor, x2: torch.Tensor):
        N, c, _, _ = x1.shape

        exchange_map = torch.arange(c) % self.p == 0
        exchange_mask = exchange_map.unsqueeze(0).expand((N, -1))

        out_x1 = torch.zeros_like(x1, device=x1.device)
        out_x2 = torch.zeros_like(x2, device=x1.device)
        out_x1[~exchange_mask, ...] = x1[~exchange_mask, ...]
        out_x2[~exchange_mask, ...] = x2[~exchange_mask, ...]
        out_x1[exchange_mask, ...] = x2[exchange_mask, ...]
        out_x2[exchange_mask, ...] = x1[exchange_mask, ...]

        return out_x1, out_x2


class CascadedGroupAttention(nn.Module):
    r"""Cascaded Group Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution, corresponding to the window size.
        kernels (List[int]): The kernel size of the dw conv on query.
    """

    def __init__(
        self,
        dim,
        key_dim,
        num_heads=8,
        attn_ratio=4,
        resolution=14,
        kernels=[5, 5, 5, 5],
    ):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim**-0.5
        self.key_dim = key_dim
        self.d = int(attn_ratio * key_dim)
        self.attn_ratio = attn_ratio

        qkvs = []
        dws = []
        for i in range(num_heads):
            qkvs.append(
                nn.Sequential(
                    nn.Conv2d(
                        dim // (num_heads),
                        self.key_dim * 2 + self.d,
                        1,
                        1,
                        0,
                        bias=False,
                    ),
                    nn.BatchNorm2d(self.key_dim * 2 + self.d),
                )
            )
            dws.append(
                nn.Sequential(
                    nn.Conv2d(
                        self.key_dim,
                        self.key_dim,
                        kernels[i],
                        1,
                        kernels[i] // 2,
                        groups=self.key_dim,
                        bias=False,
                    ),
                    nn.BatchNorm2d(self.key_dim),
                )
            )

        self.qkvs = nn.ModuleList(qkvs)
        self.dws = nn.ModuleList(dws)
        self.proj = nn.Sequential(
            nn.ReLU(),
            nn.Conv2d(self.d * num_heads, dim, 1, 1, 0, bias=False),
            nn.BatchNorm2d(dim),
        )
        self.act_gelu = nn.GELU()
        points = list(itertools.product(range(resolution), range(resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets))
        )
        self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(N, N))

    @torch.no_grad()
    def train(self, mode=True):
        super().train(mode)
        if mode and hasattr(self, "ab"):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

    def forward(self, x):
        B, _, H, W = x.shape
        trainingab = self.attention_biases[:, self.attention_bias_idxs]
        feats_in = x.chunk(len(self.qkvs), dim=1)
        feats_out = []
        feat = feats_in[0]
        for i, qkv in enumerate(self.qkvs):
            if i > 0:
                # Cascade: each head processes the sum of its own channel split and
                # the previous head's output.
                feat = feat + feats_in[i]
            feat = qkv(feat)
            q, k, v = feat.view(B, -1, H, W).split(
                [self.key_dim, self.key_dim, self.d], dim=1
            )
            q = self.act_gelu(self.dws[i](q)) + q
            q, k, v = q.flatten(2), k.flatten(2), v.flatten(2)
            attn = (q.transpose(-2, -1) @ k) * self.scale + (
                trainingab[i] if self.training else self.ab[i].to(x.device)
            )
            attn = attn.softmax(dim=-1)
            feat = (v @ attn.transpose(-2, -1)).view(B, self.d, H, W)
            feats_out.append(feat)
        x = self.proj(torch.cat(feats_out, 1))
        return x


class LocalWindowAttention(nn.Module):
    r"""Local Window Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    """

    def __init__(
        self,
        dim,
        key_dim,
        num_heads=8,
        attn_ratio=4,
        resolution=14,
        window_resolution=7,
        kernels=[5, 5, 5, 5],
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.resolution = resolution
        assert window_resolution > 0, "window_size must be greater than 0"
        self.window_resolution = window_resolution

        window_resolution = min(window_resolution, resolution)
        self.attn = CascadedGroupAttention(
            dim,
            key_dim,
            num_heads,
            attn_ratio=attn_ratio,
            resolution=window_resolution,
            kernels=kernels,
        )

    def forward(self, x):
        H = W = self.resolution
        B, C, H_, W_ = x.shape
        # Only check this for classification models
        assert (
            H == H_ and W == W_
        ), "input feature has wrong size, expect {}, got {}".format((H, W), (H_, W_))

        if H <= self.window_resolution and W <= self.window_resolution:
            x = self.attn(x)
        else:
            x = x.permute(0, 2, 3, 1)
            pad_b = (
                self.window_resolution - H % self.window_resolution
            ) % self.window_resolution
            pad_r = (
                self.window_resolution - W % self.window_resolution
            ) % self.window_resolution
            padding = pad_b > 0 or pad_r > 0

            if padding:
                x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))

            pH, pW = H + pad_b, W + pad_r
            nH = pH // self.window_resolution
            nW = pW // self.window_resolution
            # Partition the padded feature map into non-overlapping windows, run
            # cascaded attention inside each window, then merge the windows back.
            x = (
                x.view(B, nH, self.window_resolution, nW, self.window_resolution, C)
                .transpose(2, 3)
                .reshape(B * nH * nW, self.window_resolution, self.window_resolution, C)
                .permute(0, 3, 1, 2)
            )
            x = self.attn(x)
            x = (
                x.permute(0, 2, 3, 1)
                .view(B, nH, nW, self.window_resolution, self.window_resolution, C)
                .transpose(2, 3)
                .reshape(B, pH, pW, C)
            )
            if padding:
                x = x[:, :H, :W].contiguous()
            x = x.permute(0, 3, 1, 2)
        return x


class LocalAgg(nn.Module):

    def __init__(self, channels):
        super(LocalAgg, self).__init__()
        self.bn = nn.BatchNorm2d(channels)
        self.pointwise_conv_0 = nn.Conv2d(channels, channels, kernel_size=1, bias=False)
        self.depthwise_conv = nn.Conv2d(
            channels, channels, padding=1, kernel_size=3, groups=channels, bias=False
        )
        self.pointwise_prenorm_1 = nn.BatchNorm2d(channels)
        self.pointwise_conv_1 = nn.Conv2d(channels, channels, kernel_size=1, bias=False)

    def forward(self, x):
        x = self.bn(x)
        x = self.pointwise_conv_0(x)
        x = self.depthwise_conv(x)
        x = self.pointwise_prenorm_1(x)
        x = self.pointwise_conv_1(x)
        return x


class Mlp(nn.Module):

    def __init__(self, channels, mlp_ratio):
        super(Mlp, self).__init__()
        self.up_proj = nn.Conv2d(
            channels, channels * mlp_ratio, kernel_size=1, bias=False
        )
        self.down_proj = nn.Conv2d(
            channels * mlp_ratio, channels, kernel_size=1, bias=False
        )

    def forward(self, x):
        return self.down_proj(F.gelu(self.up_proj(x)))


class LocalMerge(nn.Module):
    def __init__(self, channels, r, heads, resolution, partial=False):
        super(LocalMerge, self).__init__()
        self.partial = partial
        self.cpe1 = nn.Conv2d(
            channels, channels, kernel_size=3, padding=1, groups=channels, bias=False
        )
        self.local_agg = LocalAgg(channels)
        self.mlp1 = Mlp(channels, r)
        if partial:
            self.cpe2 = nn.Conv2d(
                channels,
                channels,
                kernel_size=3,
                padding=1,
                groups=channels,
                bias=False,
            )
            self.attn = LocalWindowAttention(
                channels,
                16,
                heads,
                attn_ratio=r,
                resolution=resolution,
                window_resolution=7,
                kernels=[5, 5, 5, 5],
            )
            self.mlp2 = Mlp(channels, r)

    def forward(self, x):
        x = self.cpe1(x) + x
        x = self.local_agg(x) + x
        x = self.mlp1(x) + x
        if self.partial:
            x = self.cpe2(x) + x
            x = self.attn(x) + x
            x = self.mlp2(x) + x
        return x


class AdaptFormerEncoderBlock(nn.Module):
    def __init__(
        self, in_chans, embed_dim, num_head, mlp_ratio, depth, resolution, partial
    ):
        super().__init__()

        self.down = nn.Sequential(
            nn.Conv2d(in_chans, embed_dim, kernel_size=2, stride=2),
            nn.GroupNorm(num_groups=1, num_channels=embed_dim),
        )

        self.block = nn.Sequential(
            *[
                LocalMerge(
                    channels=embed_dim,
                    r=mlp_ratio,
                    heads=num_head,
                    resolution=resolution,
                    partial=partial,
                )
                for _ in range(depth)
            ]
        )

    def forward(self, x: torch.Tensor):
        return self.block(self.down(x))


class ChangeDetectionHaed(nn.Module):
    def __init__(self, embedding_dim, in_channels, num_classes):
        super(ChangeDetectionHaed, self).__init__()
        self.in_proj = nn.Sequential(
            nn.Conv2d(
                in_channels=embedding_dim * len(in_channels),
                out_channels=embedding_dim,
                kernel_size=1,
            ),
            nn.BatchNorm2d(embedding_dim),
            nn.ConvTranspose2d(embedding_dim, embedding_dim, 4, stride=2, padding=1),
        )

        self.conv1 = nn.Conv2d(embedding_dim, embedding_dim, 3, 1, 1)
        self.conv2 = nn.Conv2d(embedding_dim, embedding_dim, 3, 1, 1)

        self.out = nn.Conv2d(embedding_dim, num_classes, 3, 1, 1)

    def forward(self, x: torch.Tensor):
        x = self.in_proj(x)
        x = self.conv2(F.relu(self.conv1(x))) * 0.1 + x
        return self.out(x)


class AdaptFormerDecoder(nn.Module):

    def __init__(
        self,
        config: AdaptFormerConfig,
    ):
        super(AdaptFormerDecoder, self).__init__()

        self.in_channels = config.embed_dims
        self.embedding_dim = config.embed_dims[-1]

        self.linear_emb_layers = nn.ModuleList(
            [
                nn.Sequential(
                    Rearrange("n c ... -> n (...) c"),
                    nn.Linear(in_dim, self.embedding_dim),
                )
                for in_dim in self.in_channels
            ]
        )

        self.diff_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(2 * self.embedding_dim, self.embedding_dim, 3, 1, 1),
                    nn.ReLU(),
                    nn.BatchNorm2d(self.embedding_dim),
                    nn.Conv2d(self.embedding_dim, self.embedding_dim, 3, 1, 1),
                    nn.ReLU(),
                )
                for _ in range(3)
            ]
        )

        self.prediction_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(self.embedding_dim, config.num_classes, 3, 1, 1),
                    nn.ReLU(),
                    nn.BatchNorm2d(config.num_classes),
                    nn.Conv2d(config.num_classes, config.num_classes, 3, 1, 1),
                )
                for _ in range(3)
            ]
        )

        self.head = ChangeDetectionHaed(
            self.embedding_dim, self.in_channels, config.num_classes
        )

    def forward(self, pixel_valuesA, pixel_valuesB):
        N, _, H, W = pixel_valuesA[0].shape

        # c3
        pixel_values_c3 = torch.cat([pixel_valuesA[2], pixel_valuesB[2]], dim=0)

        _c3_1, _c3_2 = torch.chunk(
            self.linear_emb_layers[2](pixel_values_c3).permute(0, 2, 1), 2
        )
        _c3_1 = _c3_1.reshape(N, -1, pixel_values_c3.shape[2], pixel_values_c3.shape[3])
        _c3_2 = _c3_2.reshape(N, -1, pixel_values_c3.shape[2], pixel_values_c3.shape[3])

        _c3 = self.diff_layers[2](torch.cat((_c3_1, _c3_2), dim=1))

        p_c3 = self.prediction_layers[2](_c3)
        _c3_up = F.interpolate(_c3, (H, W), mode="bilinear", align_corners=False)

        # c2
        pixel_values_c2 = torch.cat([pixel_valuesA[1], pixel_valuesB[1]], dim=0)
        _c2_1, _c2_2 = torch.chunk(
            self.linear_emb_layers[1](pixel_values_c2).permute(0, 2, 1), 2
        )
        _c2_1 = _c2_1.reshape(N, -1, pixel_values_c2.shape[2], pixel_values_c2.shape[3])
        _c2_2 = _c2_2.reshape(N, -1, pixel_values_c2.shape[2], pixel_values_c2.shape[3])
        _c2 = self.diff_layers[1](torch.cat((_c2_1, _c2_2), dim=1)) + F.interpolate(
            _c3, scale_factor=2, mode="bilinear"
        )
        p_c2 = self.prediction_layers[1](_c2)
        _c2_up = F.interpolate(_c2, (H, W), mode="bilinear", align_corners=False)

        # c1
        pixel_values_c1 = torch.cat([pixel_valuesA[0], pixel_valuesB[0]], dim=0)
        _c1_1, _c1_2 = torch.chunk(
            self.linear_emb_layers[0](pixel_values_c1).permute(0, 2, 1), 2
        )
        _c1_1 = _c1_1.reshape(N, -1, pixel_values_c1.shape[2], pixel_values_c1.shape[3])
        _c1_2 = _c1_2.reshape(N, -1, pixel_values_c1.shape[2], pixel_values_c1.shape[3])
        _c1 = self.diff_layers[0](torch.cat((_c1_1, _c1_2), dim=1)) + F.interpolate(
            _c2, scale_factor=2, mode="bilinear"
        )
        p_c1 = self.prediction_layers[0](_c1)

        cp = self.head(torch.cat((_c3_up, _c2_up, _c1), dim=1))

        return [p_c3, p_c2, p_c1, cp]


class AdaptFormerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = AdaptFormerConfig
    base_model_prefix = "adaptformer"

    def _init_weights(self, m):
        """Initialize the weights"""
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            import math

            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()


class AdaptFormerForChangeDetection(AdaptFormerPreTrainedModel):
    """
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`AdaptFormerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    """

    def __init__(
        self,
        config: AdaptFormerConfig,
    ):
        super().__init__(config)
        self.config = config
        self.block1 = AdaptFormerEncoderBlock(
            in_chans=config.num_channels,
            embed_dim=config.embed_dims[0],
            num_head=config.num_heads[0],
            mlp_ratio=config.mlp_ratios[0],
            depth=config.depths[0],
            resolution=config.embed_dims[2] // 2,
            partial=False,
        )
        self.block2 = AdaptFormerEncoderBlock(
            in_chans=config.embed_dims[0],
            embed_dim=config.embed_dims[1],
            num_head=config.num_heads[1],
            mlp_ratio=config.mlp_ratios[1],
            depth=config.depths[1],
            resolution=config.embed_dims[1] // 2,
            partial=False,
        )
        self.block3 = AdaptFormerEncoderBlock(
            in_chans=config.embed_dims[1],
            embed_dim=config.embed_dims[2],
            num_head=config.num_heads[2],
            mlp_ratio=config.mlp_ratios[2],
            depth=config.depths[2],
            resolution=config.embed_dims[0] // 2,
            partial=True,
        )
        self.spatialex = SpatialExchange()
        self.channelex = ChannelExchange()

        self.decoder = AdaptFormerDecoder(config=config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_valuesA: torch.Tensor,
        pixel_valuesB: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoImageProcessor, AutoModel
        >>> from PIL import Image
        >>> import requests

        >>> image_processor = AutoImageProcessor.from_pretrained("deepang/adaptformer-LEVIR-CD", trust_remote_code=True)
        >>> model = AutoModel.from_pretrained("deepang/adaptformer-LEVIR-CD", trust_remote_code=True)

        >>> image_A = Image.open(requests.get('https://raw.githubusercontent.com/aigzhusmart/AdaptFormer/main/figures/test_2_1_A.png', stream=True).raw)
        >>> image_B = Image.open(requests.get('https://raw.githubusercontent.com/aigzhusmart/AdaptFormer/main/figures/test_2_1_B.png', stream=True).raw)
        >>> label = Image.open(requests.get('https://raw.githubusercontent.com/aigzhusmart/AdaptFormer/main/figures/test_2_1_label.png', stream=True).raw)

        >>> inputs = image_processor(images=(image_A, image_B), return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> logits = outputs.logits.cpu()
        >>> pred = logits.argmax(dim=1)[0]
        ```"""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        # The two temporal images are processed as one doubled batch and split again
        # after each encoder stage so that spatial/channel exchange can mix them.
        x1_1, x2_1 = torch.chunk(
            self.block1(torch.cat((pixel_valuesA, pixel_valuesB), dim=0)), 2
        )

        x1_2, x2_2 = torch.chunk(
            self.block2(torch.cat(self.spatialex(x1_1, x2_1), dim=0)), 2
        )

        x1_3, x2_3 = torch.chunk(
            self.block3(torch.cat(self.channelex(x1_2, x2_2), dim=0)), 2
        )

        hidden_states = self.decoder([x1_1, x1_2, x1_3], [x2_1, x2_2, x2_3])

        loss = None
        if labels is not None:
            loss = 0
            for i, hidden_state in enumerate(hidden_states):
                upsampled_logits = F.interpolate(
                    hidden_state,
                    size=labels.shape[-2:],
                    mode="bilinear",
                    align_corners=False,
                )
                loss += (
                    F.cross_entropy(
                        upsampled_logits,
                        labels.long(),
                        ignore_index=self.config.semantic_loss_ignore_index,
                    )
                    * self.config.semantic_loss_weight[i]
                )

        if not return_dict:
            if output_hidden_states:
                output = (hidden_states[-1], hidden_states)
            else:
                output = (hidden_states[-1],)
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=hidden_states[-1],
            hidden_states=hidden_states if output_hidden_states else None,
        )
preprocessing_adaptformer.py
ADDED
@@ -0,0 +1,99 @@
from typing import Tuple

from transformers import ViTImageProcessor
from transformers.image_processing_utils import BatchFeature
from transformers.image_utils import ImageInput


class AdaptFormerImageProcessor(ViTImageProcessor):
    r"""
    Constructs an AdaptFormer image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(
        self,
        images: Tuple[ImageInput, ImageInput],
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess a pair of images or a pair of image batches.

        Args:
            images (`Tuple[ImageInput, ImageInput]`):
                Image tuple to preprocess. Each element may be a single image or a batch of images with pixel values
                ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                resizing.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
                an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
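        # The two time points are preprocessed independently with identical settings and
        # returned under separate keys, matching the `pixel_valuesA` / `pixel_valuesB`
        # arguments of `AdaptFormerForChangeDetection.forward`.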
        imagesA, imagesB = images
        feature_A = super().preprocess(imagesA, **kwargs)
        feature_B = super().preprocess(imagesB, **kwargs)

        data = {
            "pixel_valuesA": feature_A["pixel_values"],
            "pixel_valuesB": feature_B["pixel_values"],
        }
        return BatchFeature(data=data, tensor_type=kwargs.pop("return_tensors", None))
preprocessor_config.json
ADDED
@@ -0,0 +1,23 @@
{
  "auto_map": {
    "AutoImageProcessor": "preprocessing_adaptformer.AdaptFormerImageProcessor"
  },
  "size": 256,
  "do_center_crop": false,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "AdaptFormerImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3
}