Upload 9 files
- src/dataset.py +50 -0
- src/get_loss.py +79 -0
- src/losses.py +498 -0
- src/models/dino.py +37 -0
- src/models/segmentation_head.py +40 -0
- src/models/unet.py +171 -0
- src/models/vit.py +36 -0
- src/train.py +264 -0
- src/utils.py +58 -0
src/dataset.py
ADDED
@@ -0,0 +1,50 @@
from typing import List, Tuple, Callable
from pathlib import Path
import datasets
import torch
from torch.utils.data import Dataset


class SegmentationDataset(Dataset):
    def __init__(
        self,
        dataset: datasets.Dataset,
        train: bool = True,
        transform: Callable | None = None,
        target_transform: Callable | None = None,
        test_size: float = 0.25,
    ) -> None:
        super().__init__()
        self.dataset = dataset
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        self.test_size = test_size

        total_size = len(dataset)
        indices = list(range(total_size))
        split = int(self.test_size * total_size)

        if train:
            self.indices = indices[split:]
        else:
            self.indices = indices[:split]

    def __len__(self) -> int:
        return len(self.indices)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        item = self.dataset[self.indices[idx]]
        image = item["image"]
        mask = item["mask"]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            mask = self.target_transform(mask)
        return image, mask


def collate_fn(items: List[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor]:
    images = torch.stack([item[0] for item in items])
    masks = torch.stack([item[1] for item in items])
    return images, masks
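A quick usage sketch for this wrapper (illustrative, assuming src/ is on the import path; the transforms come from src/utils.py and the dataset id is the default from src/train.py):

    from datasets import load_dataset
    from torch.utils.data import DataLoader

    from dataset import SegmentationDataset, collate_fn
    from utils import get_transform, mask_transform

    hf_dataset = load_dataset("mattmdjaga/human_parsing_dataset", split="train")
    transform = get_transform(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_ds = SegmentationDataset(hf_dataset, train=True, transform=transform, target_transform=mask_transform)
    loader = DataLoader(train_ds, batch_size=4, collate_fn=collate_fn)
    images, masks = next(iter(loader))  # images: (4, 3, 224, 224), masks: (4, 1, 224, 224)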
src/get_loss.py
ADDED
@@ -0,0 +1,79 @@
from typing import Dict, Callable
import torch.nn as nn
import torch

from losses import SoftDiceLoss, SSLoss, IoULoss, TverskyLoss, FocalTversky_loss, AsymLoss, ExpLog_loss, FocalLoss, LovaszSoftmax, TopKLoss, WeightedCrossEntropyLoss, SoftDiceLoss_v2, IoULoss_v2, TverskyLoss_v2, FocalTversky_loss_v2, AsymLoss_v2, SSLoss_v2


def get_loss(loss_type: str) -> Callable | None:
    if loss_type == "cross_entropy":
        return nn.CrossEntropyLoss()
    elif loss_type == "SoftDiceLoss":
        return SoftDiceLoss()
    elif loss_type == "SSLoss":
        return SSLoss()
    elif loss_type == "IoULoss":
        return IoULoss()
    elif loss_type == "TverskyLoss":
        return TverskyLoss()
    elif loss_type == "FocalTversky_loss":
        tversky_kwargs = {
            "apply_nonlin": None,
            "batch_dice": False,
            "do_bg": True,
            "smooth": 1.0,
            "square": False
        }
        return FocalTversky_loss(tversky_kwargs=tversky_kwargs)
    elif loss_type == "AsymLoss":
        return AsymLoss()
    elif loss_type == "ExpLog_loss":
        soft_dice_kwargs = {
            "smooth": 1.0
        }
        wce_kwargs = {
            "weight": None
        }
        return ExpLog_loss(soft_dice_kwargs=soft_dice_kwargs, wce_kwargs=wce_kwargs)
    elif loss_type == "FocalLoss":
        return FocalLoss()
    elif loss_type == "LovaszSoftmax":
        return LovaszSoftmax()
    elif loss_type == "TopKLoss":
        return TopKLoss()
    elif loss_type == "WeightedCrossEntropyLoss":
        return WeightedCrossEntropyLoss()
    elif loss_type == "SoftDiceLoss_v2":
        return SoftDiceLoss_v2()
    elif loss_type == "IoULoss_v2":
        return IoULoss_v2()
    elif loss_type == "TverskyLoss_v2":
        return TverskyLoss_v2()
    elif loss_type == "FocalTversky_loss_v2":
        return FocalTversky_loss_v2()
    elif loss_type == "AsymLoss_v2":
        return AsymLoss_v2()
    elif loss_type == "SSLoss_v2":
        return SSLoss_v2()
    else:
        raise ValueError(f"Unsupported loss type: {loss_type}")


def get_composite_criterion(losses_config: Dict[str, float]) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]:
    losses = []
    weights = []

    for loss_name, weight in losses_config.items():
        if weight != 0.0:
            loss_fn = get_loss(loss_name)
            if loss_fn is not None:
                losses.append(loss_fn)
                weights.append(weight)

    def composite_loss(output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        total_loss = 0.0
        for loss_fn, weight in zip(losses, weights):
            total_loss += weight * loss_fn(output, target)
        return total_loss

    return composite_loss
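A sketch of how the composite criterion is driven (illustrative; in training the dict comes from the file passed via --losses-path, the keys mirror the names handled in get_loss(), and the weights below are arbitrary):

    import torch
    from get_loss import get_composite_criterion

    losses_config = {"cross_entropy": 1.0, "SoftDiceLoss_v2": 0.5, "FocalLoss": 0.0}  # zero weight -> skipped
    criterion = get_composite_criterion(losses_config)

    logits = torch.randn(2, 18, 224, 224)          # (batch, classes, H, W)
    targets = torch.randint(0, 18, (2, 224, 224))  # integer class mask
    loss = criterion(logits, targets)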
src/losses.py
ADDED
@@ -0,0 +1,498 @@
from typing import Callable, List, Tuple, Dict
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np


def sum_tensor(inp: torch.Tensor, axes: int | List[int], keepdim: bool = False) -> torch.Tensor:
    axes = np.unique(axes).astype(int)
    if keepdim:
        for ax in axes:
            inp = inp.sum(int(ax), keepdim=True)
    else:
        for ax in sorted(axes, reverse=True):
            inp = inp.sum(int(ax))
    return inp


def get_tp_fp_fn(net_output: torch.Tensor, gt: torch.Tensor, axes: int | Tuple[int, ...] | None = None, mask: torch.Tensor | None = None, square: bool = False) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    if axes is None:
        axes = tuple(range(2, len(net_output.size())))
    shp_x = net_output.shape
    shp_y = gt.shape
    with torch.no_grad():
        if len(shp_x) != len(shp_y):
            gt = gt.view((shp_y[0], 1, *shp_y[1:]))
        if all([i == j for i, j in zip(net_output.shape, gt.shape)]):
            y_onehot = gt
        else:
            gt = gt.long()
            y_onehot = torch.zeros(shp_x)
            if net_output.device.type == "cuda":
                y_onehot = y_onehot.cuda(net_output.device.index)
            y_onehot.scatter_(1, gt, 1)
    tp = net_output * y_onehot
    fp = net_output * (1 - y_onehot)
    fn = (1 - net_output) * y_onehot
    if mask is not None:
        tp = torch.stack(tuple(x_i * mask[:, 0] for x_i in torch.unbind(tp, dim=1)), dim=1)
        fp = torch.stack(tuple(x_i * mask[:, 0] for x_i in torch.unbind(fp, dim=1)), dim=1)
        fn = torch.stack(tuple(x_i * mask[:, 0] for x_i in torch.unbind(fn, dim=1)), dim=1)
    if square:
        tp = tp ** 2
        fp = fp ** 2
        fn = fn ** 2
    tp = sum_tensor(tp, axes, keepdim=False)
    fp = sum_tensor(fp, axes, keepdim=False)
    fn = sum_tensor(fn, axes, keepdim=False)
    return tp, fp, fn


def softmax_helper(x: torch.Tensor) -> torch.Tensor:
    rpt = [1 for _ in range(len(x.size()))]
    rpt[1] = x.size(1)
    x_max = x.max(1, keepdim=True)[0].repeat(*rpt)
    e_x = torch.exp(x - x_max)
    return e_x / e_x.sum(1, keepdim=True).repeat(*rpt)


def flatten(tensor: torch.Tensor) -> torch.Tensor:
    C = tensor.size(1)
    axis_order = (1, 0) + tuple(range(2, tensor.dim()))
    transposed = tensor.permute(axis_order).contiguous()
    return transposed.view(C, -1)


class SoftDiceLoss(nn.Module):
    def __init__(self, apply_nonlin: Callable | None = softmax_helper, batch_dice: bool = True, do_bg: bool = False, smooth: float = 1.0, square: bool = True) -> None:
        super().__init__()
        self.square = square
        self.do_bg = do_bg
        self.batch_dice = batch_dice
        self.apply_nonlin = apply_nonlin
        self.smooth = smooth

    def forward(self, x: torch.Tensor, y: torch.Tensor, loss_mask: torch.Tensor | None = None) -> torch.Tensor:
        shp_x = x.shape
        if self.batch_dice:
            axes = [0] + list(range(2, len(shp_x)))
        else:
            axes = list(range(2, len(shp_x)))
        if self.apply_nonlin is not None:
            x = self.apply_nonlin(x)
        tp, fp, fn = get_tp_fp_fn(x, y, axes, loss_mask, self.square)
        dc = (2 * tp + self.smooth) / (2 * tp + fp + fn + self.smooth)
        if not self.do_bg:
            if self.batch_dice:
                dc = dc[1:]
            else:
                dc = dc[:, 1:]
        dc = dc.mean()
        return -dc


class SoftDiceLoss_v2(nn.Module):
    def __init__(self, smooth: float = 1.0) -> None:
        super().__init__()
        self.smooth = smooth

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = F.softmax(logits, dim=1)
        targets = F.one_hot(targets, num_classes=probs.size(1)).permute(0, 3, 1, 2).float()
        intersection = torch.sum(probs * targets, dim=(0, 2, 3))
        union = torch.sum(probs + targets, dim=(0, 2, 3))
        dl = 1 - (2.0 * intersection + self.smooth) / (union + self.smooth)
        dice_loss = torch.mean(dl)
        return dice_loss


class SSLoss(nn.Module):
    def __init__(self, apply_nonlin: Callable | None = softmax_helper, batch_dice: bool = True, do_bg: bool = False, smooth: float = 1., square: bool = True) -> None:
        super().__init__()
        self.square = square
        self.do_bg = do_bg
        self.batch_dice = batch_dice
        self.apply_nonlin = apply_nonlin
        self.smooth = smooth
        self.r = 0.1

    def forward(self, net_output: torch.Tensor, gt: torch.Tensor) -> torch.Tensor:
        shp_x = net_output.shape
        shp_y = gt.shape
        with torch.no_grad():
            if len(shp_x) != len(shp_y):
                gt = gt.view((shp_y[0], 1, *shp_y[1:]))
            if all([i == j for i, j in zip(net_output.shape, gt.shape)]):
                y_onehot = gt
            else:
                gt = gt.long()
                y_onehot = torch.zeros(shp_x)
                if net_output.device.type == "cuda":
                    y_onehot = y_onehot.cuda(net_output.device.index)
                y_onehot.scatter_(1, gt, 1)
        if self.batch_dice:
            axes = [0] + list(range(2, len(shp_x)))
        else:
            axes = list(range(2, len(shp_x)))
        if self.apply_nonlin is not None:
            net_output = self.apply_nonlin(net_output)
        bg_onehot = 1 - y_onehot
        squared_error = (y_onehot - net_output)**2
        specificity_part = sum_tensor(squared_error*y_onehot, axes)/(sum_tensor(y_onehot, axes)+self.smooth)
        sensitivity_part = sum_tensor(squared_error*bg_onehot, axes)/(sum_tensor(bg_onehot, axes)+self.smooth)
        ss = self.r * specificity_part + (1-self.r) * sensitivity_part
        if not self.do_bg:
            if self.batch_dice:
                ss = ss[1:]
            else:
                ss = ss[:, 1:]
        ss = ss.mean()
        return ss


class SSLoss_v2(nn.Module):
    def __init__(self, alpha: float = 0.5) -> None:
        super().__init__()
        self.alpha = alpha

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = F.softmax(logits, dim=1)
        targets = F.one_hot(targets, num_classes=probs.size(1)).permute(0, 3, 1, 2).float()
        intersection = torch.sum(probs * targets, dim=(0, 2, 3))
        cardinality = torch.sum(probs + targets, dim=(0, 2, 3))
        dice_loss = 1 - (2.0 * intersection + 1e-6) / (cardinality + 1e-6)
        ce_loss = F.cross_entropy(probs, targets, reduction='mean')
        loss = self.alpha * dice_loss.mean() + (1 - self.alpha) * ce_loss
        return loss


class IoULoss(nn.Module):
    def __init__(self, apply_nonlin: Callable | None = softmax_helper, batch_dice: bool = True, do_bg: bool = False, smooth: float = 1., square: bool = True) -> None:
        super().__init__()
        self.square = square
        self.do_bg = do_bg
        self.batch_dice = batch_dice
        self.apply_nonlin = apply_nonlin
        self.smooth = smooth

    def forward(self, x: torch.Tensor, y: torch.Tensor, loss_mask: torch.Tensor | None = None) -> torch.Tensor:
        shp_x = x.shape
        if self.batch_dice:
            axes = [0] + list(range(2, len(shp_x)))
        else:
            axes = list(range(2, len(shp_x)))
        if self.apply_nonlin is not None:
            x = self.apply_nonlin(x)
        tp, fp, fn = get_tp_fp_fn(x, y, axes, loss_mask, self.square)
        iou = (tp + self.smooth) / (tp + fp + fn + self.smooth)
        if not self.do_bg:
            if self.batch_dice:
                iou = iou[1:]
            else:
                iou = iou[:, 1:]
        iou = iou.mean()
        return -iou


class IoULoss_v2(nn.Module):
    def __init__(self, smooth: float = 1.0) -> None:
        super().__init__()
        self.smooth = smooth

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = F.softmax(logits, dim=1)
        targets = F.one_hot(targets, num_classes=probs.size(1)).permute(0, 3, 1, 2).float()
        intersection = torch.sum(probs * targets, dim=(0, 2, 3))
        union = torch.sum(probs + targets, dim=(0, 2, 3)) - intersection
        iou = 1 - (intersection + self.smooth) / (union + self.smooth)
        iou_loss = torch.mean(iou)
        return iou_loss


class TverskyLoss(nn.Module):
    def __init__(self, apply_nonlin: Callable | None = softmax_helper, batch_dice: bool = True, do_bg: bool = False, smooth: float = 1., square: bool = True) -> None:
        super().__init__()
        self.square = square
        self.do_bg = do_bg
        self.batch_dice = batch_dice
        self.apply_nonlin = apply_nonlin
        self.smooth = smooth
        self.alpha = 0.3
        self.beta = 0.7

    def forward(self, x: torch.Tensor, y: torch.Tensor, loss_mask: torch.Tensor | None = None) -> torch.Tensor:
        shp_x = x.shape
        if self.batch_dice:
            axes = [0] + list(range(2, len(shp_x)))
        else:
            axes = list(range(2, len(shp_x)))
        if self.apply_nonlin is not None:
            x = self.apply_nonlin(x)
        tp, fp, fn = get_tp_fp_fn(x, y, axes, loss_mask, self.square)
        tversky = (tp + self.smooth) / (tp + self.alpha*fp + self.beta*fn + self.smooth)
        if not self.do_bg:
            if self.batch_dice:
                tversky = tversky[1:]
            else:
                tversky = tversky[:, 1:]
        tversky = tversky.mean()
        return -tversky


class TverskyLoss_v2(nn.Module):
    def __init__(self, alpha: float = 0.5, beta: float = 0.5, smooth: float = 1.0) -> None:
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.smooth = smooth

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = F.softmax(logits, dim=1)
        targets = F.one_hot(targets, num_classes=probs.size(1)).permute(0, 3, 1, 2).float()
        tp = torch.sum(probs * targets, dim=(0, 2, 3))
        fp = torch.sum((1 - targets) * probs, dim=(0, 2, 3))
        fn = torch.sum(targets * (1 - probs), dim=(0, 2, 3))
        tversky = 1 - (tp + self.smooth) / (tp + self.alpha * fp + self.beta * fn + self.smooth)
        tversky_loss = torch.mean(tversky)
        return tversky_loss


class FocalTversky_loss(nn.Module):
    def __init__(self, tversky_kwargs: Dict, gamma: float = 0.75) -> None:
        super().__init__()
        self.gamma = gamma
        self.tversky = TverskyLoss(**tversky_kwargs)

    def forward(self, net_output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        tversky_loss = 1 + self.tversky(net_output, target)
        focal_tversky = torch.pow(tversky_loss, self.gamma)
        return focal_tversky


class FocalTversky_loss_v2(nn.Module):
    def __init__(self, alpha: float = 0.5, beta: float = 0.5, gamma: float = 1.5, smooth: float = 1.0) -> None:
        super().__init__()
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.smooth = smooth

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = F.softmax(logits, dim=1)
        targets = F.one_hot(targets, num_classes=probs.size(1)).permute(0, 3, 1, 2).float()
        tp = torch.sum(probs * targets, dim=(0, 2, 3))
        fp = torch.sum((1 - targets) * probs, dim=(0, 2, 3))
        fn = torch.sum(targets * (1 - probs), dim=(0, 2, 3))
        focal_tversky = (1 - (tp + self.smooth) / (tp + self.alpha * fp + self.beta * fn + self.smooth)) ** self.gamma
        focal_tversky_loss = torch.mean(focal_tversky)
        return focal_tversky_loss


class AsymLoss(nn.Module):
    def __init__(self, apply_nonlin: Callable | None = softmax_helper, batch_dice: bool = True, do_bg: bool = False, smooth: float = 1., square: bool = True) -> None:
        super().__init__()
        self.square = square
        self.do_bg = do_bg
        self.batch_dice = batch_dice
        self.apply_nonlin = apply_nonlin
        self.smooth = smooth
        self.beta = 1.5

    def forward(self, x: torch.Tensor, y: torch.Tensor, loss_mask: torch.Tensor | None = None) -> torch.Tensor:
        shp_x = x.shape
        if self.batch_dice:
            axes = [0] + list(range(2, len(shp_x)))
        else:
            axes = list(range(2, len(shp_x)))
        if self.apply_nonlin is not None:
            x = self.apply_nonlin(x)
        tp, fp, fn = get_tp_fp_fn(x, y, axes, loss_mask, self.square)
        weight = (self.beta**2)/(1+self.beta**2)
        asym = (tp + self.smooth) / (tp + weight*fn + (1-weight)*fp + self.smooth)
        if not self.do_bg:
            if self.batch_dice:
                asym = asym[1:]
            else:
                asym = asym[:, 1:]
        asym = asym.mean()
        return -asym


class AsymLoss_v2(nn.Module):
    def __init__(self, alpha: float = 0.5, gamma: float = 2.0, smooth: float = 1e-5) -> None:
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.smooth = smooth

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = F.softmax(logits, dim=1)
        targets_one_hot = F.one_hot(targets, num_classes=probs.size(1)).permute(0, 3, 1, 2).float()
        pos_loss = -self.alpha * (1 - probs) ** self.gamma * targets_one_hot * torch.log(probs + self.smooth)
        neg_loss = -(1 - self.alpha) * probs ** self.gamma * (1 - targets_one_hot) * torch.log(1 - probs + self.smooth)
        loss = pos_loss + neg_loss
        return loss.mean()


class ExpLog_loss(nn.Module):
    def __init__(self, soft_dice_kwargs: Dict, wce_kwargs: Dict, gamma: float = 0.3) -> None:
        super().__init__()
        self.wce = WeightedCrossEntropyLoss(**wce_kwargs)
        self.dc = SoftDiceLoss_v2(**soft_dice_kwargs)
        self.gamma = gamma

    def forward(self, net_output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # self.dc returns 1 - dice, so the soft Dice coefficient itself is 1 - self.dc(...)
        dc_loss = 1 - self.dc(net_output, target)
        wce_loss = self.wce(net_output, target)
        explog_loss = 0.8*torch.pow(-torch.log(torch.clamp(dc_loss, 1e-6)), self.gamma) + 0.2*wce_loss
        return explog_loss


class FocalLoss(nn.Module):
    def __init__(self, apply_nonlin: Callable | None = softmax_helper, alpha: float | List[float] | np.ndarray | None = None, gamma: int = 2, balance_index: int = 0, smooth: float = 1e-4, size_average: bool = True) -> None:
        super().__init__()
        self.apply_nonlin = apply_nonlin
        self.alpha = alpha
        self.gamma = gamma
        self.balance_index = balance_index
        self.smooth = smooth
        self.size_average = size_average
        if self.smooth is not None:
            if self.smooth < 0 or self.smooth > 1.0:
                raise ValueError("smooth value should be in [0,1]")

    def forward(self, logit: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        if self.apply_nonlin is not None:
            logit = self.apply_nonlin(logit)
        num_class = logit.shape[1]
        if logit.dim() > 2:
            logit = logit.view(logit.size(0), logit.size(1), -1)
            logit = logit.permute(0, 2, 1).contiguous()
            logit = logit.view(-1, logit.size(-1))
        target = torch.squeeze(target, 1)
        target = target.view(-1, 1)
        alpha = self.alpha
        if alpha is None:
            alpha = torch.ones(num_class, 1)
        elif isinstance(alpha, (list, np.ndarray)):
            assert len(alpha) == num_class
            alpha = torch.FloatTensor(alpha).view(num_class, 1)
            alpha = alpha / alpha.sum()
        elif isinstance(alpha, float):
            alpha = torch.ones(num_class, 1)
            alpha = alpha * (1 - self.alpha)
            alpha[self.balance_index] = self.alpha
        else:
            raise TypeError("Not support alpha type")
        if alpha.device != logit.device:
            alpha = alpha.to(logit.device)
        idx = target.cpu().long()
        one_hot_key = torch.FloatTensor(target.size(0), num_class).zero_()
        one_hot_key = one_hot_key.scatter_(1, idx, 1)
        if one_hot_key.device != logit.device:
            one_hot_key = one_hot_key.to(logit.device)
        if self.smooth:
            one_hot_key = torch.clamp(
                one_hot_key, self.smooth/(num_class-1), 1.0 - self.smooth)
        pt = (one_hot_key * logit).sum(1) + self.smooth
        logpt = pt.log()
        gamma = self.gamma
        alpha = alpha[idx]
        alpha = torch.squeeze(alpha)
        loss = -1 * alpha * torch.pow((1 - pt), gamma) * logpt
        if self.size_average:
            loss = loss.mean()
        else:
            loss = loss.sum()
        return loss


def lovasz_grad(gt_sorted: torch.Tensor) -> torch.Tensor:
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1. - intersection / union
    if p > 1:
        jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
    return jaccard


class LovaszSoftmax(nn.Module):
    def __init__(self, reduction: str = "mean") -> None:
        super().__init__()
        self.reduction = reduction

    def prob_flatten(self, input: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        assert input.dim() in [4, 5]
        num_class = input.size(1)
        if input.dim() == 4:
            input = input.permute(0, 2, 3, 1).contiguous()
            input_flatten = input.view(-1, num_class)
        elif input.dim() == 5:
            input = input.permute(0, 2, 3, 4, 1).contiguous()
            input_flatten = input.view(-1, num_class)
        target_flatten = target.view(-1)
        return input_flatten, target_flatten

    def lovasz_softmax_flat(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        num_classes = inputs.size(1)
        losses = []
        for c in range(num_classes):
            target_c = (targets == c).float()
            if num_classes == 1:
                input_c = inputs[:, 0]
            else:
                input_c = inputs[:, c]
            loss_c = (target_c - input_c).abs()
            loss_c_sorted, loss_index = torch.sort(loss_c, 0, descending=True)
            target_c_sorted = target_c[loss_index]
            losses.append(torch.dot(loss_c_sorted, lovasz_grad(target_c_sorted)))
        losses = torch.stack(losses)
        if self.reduction == "none":
            loss = losses
        elif self.reduction == "sum":
            loss = losses.sum()
        else:
            loss = losses.mean()
        return loss

    def forward(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        inputs, targets = self.prob_flatten(inputs, targets)
        losses = self.lovasz_softmax_flat(inputs, targets)
        return losses


class TopKLoss(nn.Module):
    def __init__(self, weight: torch.Tensor | None = None, ignore_index: int = -100, k: int = 10) -> None:
        super().__init__()
        self.k = k
        self.cross_entropy = nn.CrossEntropyLoss(weight=weight, ignore_index=ignore_index, reduction="none")

    def forward(self, inp: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        pixel_losses = self.cross_entropy(inp, target)
        pixel_losses = pixel_losses.view(-1)
        num_voxels = pixel_losses.numel()
        res, _ = torch.topk(pixel_losses, int(num_voxels * self.k / 100), sorted=False)
        return res.mean()


class WeightedCrossEntropyLoss(torch.nn.CrossEntropyLoss):
    def __init__(self, weight: torch.Tensor | None = None) -> None:
        super().__init__()
        self.weight = weight

    def forward(self, inp: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        target = target.long()
        num_classes = inp.size()[1]
        i0 = 1
        i1 = 2
        while i1 < len(inp.shape):
            inp = inp.transpose(i0, i1)
            i0 += 1
            i1 += 1
        inp = inp.contiguous()
        inp = inp.view(-1, num_classes)
        target = target.view(-1,)
        wce_loss = torch.nn.CrossEntropyLoss(weight=self.weight)
        return wce_loss(inp, target)
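A quick sanity check of the two sign conventions in this file (illustrative, assuming src/ is on the import path): the nnU-Net-style losses return negative scores (e.g. SoftDiceLoss returns -dice, so lower is better), while the *_v2 variants return 1 - score in [0, 1].

    import torch
    from losses import SoftDiceLoss, SoftDiceLoss_v2

    logits = torch.randn(2, 18, 64, 64)
    targets = torch.randint(0, 18, (2, 64, 64))

    print(SoftDiceLoss()(logits, targets))     # roughly in [-1, 0]
    print(SoftDiceLoss_v2()(logits, targets))  # roughly in [0, 1]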
src/models/dino.py
ADDED
@@ -0,0 +1,37 @@
from transformers import Dinov2Backbone
import torch
import torch.nn as nn
import torch.nn.functional as F

from src.models.segmentation_head import SegmentationHead


class DINOSegmentationModel(nn.Module):
    def __init__(self, image_size: int = 224, num_classes: int = 18) -> None:
        super().__init__()
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        self.image_size = image_size
        model_name = "facebook/dinov2-small"
        self.backbone = Dinov2Backbone.from_pretrained(model_name)
        for param in self.backbone.parameters():
            param.requires_grad = False
        self.segmentation_head = SegmentationHead(in_channels=384, num_classes=num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, channels, height, width = x.size()
        assert height == width == self.image_size, "The image must match the size required by the DINO model"
        features = self.backbone(pixel_values=x).feature_maps[0]
        masks = self.segmentation_head(features)
        return masks


def main() -> None:
    # model = DINOSegmentationModel()
    model = SegmentationHead(384, 18)
    num_params = sum([p.numel() for p in model.parameters()])
    print(f"params: {num_params/1e6:.2f} M")


if __name__ == "__main__":
    main()
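A shape-check sketch for the frozen-backbone model (illustrative; it downloads facebook/dinov2-small on first run, and only the segmentation head carries trainable parameters):

    import torch
    from src.models.dino import DINOSegmentationModel

    model = DINOSegmentationModel(image_size=224, num_classes=18).eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # expected: torch.Size([1, 18, 224, 224])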
src/models/segmentation_head.py
ADDED
@@ -0,0 +1,40 @@
import torch
import torch.nn as nn


class SegmentationHead(nn.Module):
    def __init__(self, in_channels: int, num_classes: int):
        super().__init__()
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Upsample(size=(64, 64), mode="bilinear"),
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Upsample(size=(128, 128), mode="bilinear"),
            nn.Conv2d(128, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Upsample(size=(224, 224), mode="bilinear"),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, num_classes, kernel_size=3, padding=1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(x)
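Note that the Upsample stages are hard-coded to 64/128/224, so the head always emits 224x224 masks regardless of the incoming feature-map resolution. A small shape check (illustrative) from a 16x16 grid of 384-dim features, the layout DINOv2-small produces for 224x224 inputs:

    import torch
    from src.models.segmentation_head import SegmentationHead

    head = SegmentationHead(in_channels=384, num_classes=18)
    out = head(torch.randn(1, 384, 16, 16))
    print(out.shape)  # torch.Size([1, 18, 224, 224])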
src/models/unet.py
ADDED
@@ -0,0 +1,171 @@
import torch
from torch import nn


class UNet(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        # Downsampler
        self.enc_conv0 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(64)
        )
        self.pool0 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.enc_conv1 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(128)
        )
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.enc_conv2 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(256),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(256),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(256)
        )
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.enc_conv3 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(512),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(512),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(512)
        )
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # bottleneck
        self.bottleneck_conv = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(1024),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(1024),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(1024)
        )

        # Upsampler

        self.upsample0 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=3, padding=1),
        )

        self.dec_conv0 = nn.Sequential(
            nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(512),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(512),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(512)
        )

        self.upsample1 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1),
        )

        self.dec_conv1 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(256),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(256),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(256)
        )

        self.upsample2 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Conv2d(in_channels=256, out_channels=128, kernel_size=3, padding=1),
        )

        self.dec_conv2 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(128)
        )

        self.upsample3 = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, padding=1),
        )

        self.dec_conv3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=18, kernel_size=1, stride=1, padding=0)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # encoder
        e0 = self.enc_conv0(x)
        e1 = self.pool0(e0)
        e1 = self.enc_conv1(e1)
        e2 = self.pool1(e1)
        e2 = self.enc_conv2(e2)
        e3 = self.pool2(e2)
        e3 = self.enc_conv3(e3)

        # bottleneck
        b = self.pool3(e3)
        b = self.bottleneck_conv(b)

        # decoder
        d0 = self.upsample0(b)
        d0 = torch.cat([d0, e3], dim=1)
        d0 = self.dec_conv0(d0)

        d1 = self.upsample1(d0)
        d1 = torch.cat([d1, e2], dim=1)
        d1 = self.dec_conv1(d1)

        d2 = self.upsample2(d1)
        d2 = torch.cat([d2, e1], dim=1)
        d2 = self.dec_conv2(d2)

        d3 = self.upsample3(d2)
        d3 = torch.cat([d3, e0], dim=1)
        d3 = self.dec_conv3(d3)
        return d3
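With four 2x2 poolings, the input height and width should be divisible by 16 so the skip concatenations line up; the decoder then returns 18-class logits at the input resolution. A small shape check (illustrative; the import path matches how src/train.py imports the model):

    import torch
    from models.unet import UNet

    model = UNet()
    out = model(torch.randn(1, 3, 224, 224))
    print(out.shape)  # torch.Size([1, 18, 224, 224])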
src/models/vit.py
ADDED
@@ -0,0 +1,36 @@
import torch
import torch.nn as nn
from transformers import ViTModel

from src.models.segmentation_head import SegmentationHead


class ViTSegmentation(nn.Module):
    def __init__(self, image_size: int = 224, num_classes: int = 18) -> None:
        super().__init__()
        self.mean = [0.5, 0.5, 0.5]
        self.std = [0.5, 0.5, 0.5]
        self.backbone = ViTModel.from_pretrained("google/vit-base-patch16-224")
        self.segmentation_head = SegmentationHead(in_channels=768, num_classes=num_classes)
        for param in self.backbone.parameters():
            param.requires_grad = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, channels, height, width = x.size()
        assert height == width == self.backbone.config.image_size, "The image must match the size required by the ViT model"
        outputs = self.backbone(pixel_values=x).last_hidden_state
        patch_dim = int(height / self.backbone.config.patch_size)
        outputs = outputs[:, 1:, :]
        outputs = outputs.permute(0, 2, 1).view(batch_size, -1, patch_dim, patch_dim)
        masks = self.segmentation_head(outputs)
        return masks


def main() -> None:
    model = ViTSegmentation(image_size=224, num_classes=18)
    num_params = sum([p.numel() for p in model.parameters()])
    print(f"params: {num_params/1e6:.2f} M")


if __name__ == "__main__":
    main()
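A sketch to confirm the freezing behaviour (illustrative; it downloads google/vit-base-patch16-224 on first run): the ViT backbone contributes no trainable parameters, so only the segmentation head is updated during training.

    from src.models.vit import ViTSegmentation

    model = ViTSegmentation(image_size=224, num_classes=18)
    frozen = sum(p.numel() for p in model.backbone.parameters() if not p.requires_grad)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"frozen backbone: {frozen/1e6:.1f} M, trainable head: {trainable/1e6:.1f} M")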
src/train.py
ADDED
@@ -0,0 +1,264 @@
from typing import Tuple
from pathlib import Path
from tqdm import tqdm
from accelerate import Accelerator
from accelerate.utils import set_seed
from matplotlib import cm
import numpy as np
import matplotlib.pyplot as plt
import argparse
import json
import wandb
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

from models.unet import UNet
from dataset import SegmentationDataset, collate_fn
from utils import get_transform, mask_transform, EMA
from get_loss import get_composite_criterion
from models.vit import ViTSegmentation
from models.dino import DINOSegmentationModel


color_map = cm.get_cmap('tab20', 18)
fixed_colors = np.array([color_map(i)[:3] for i in range(18)]) * 255


def mask_to_color(mask: np.ndarray) -> np.ndarray:
    h, w = mask.shape
    color_mask = np.zeros((h, w, 3), dtype=np.uint8)
    for class_idx in range(18):
        color_mask[mask == class_idx] = fixed_colors[class_idx]
    return color_mask


def create_combined_image(
    x: torch.Tensor,
    y: torch.Tensor,
    y_pred: torch.Tensor,
    mean: list[float] = [0.485, 0.456, 0.406],
    std: list[float] = [0.229, 0.224, 0.225]
) -> np.ndarray:
    batch_size, _, height, width = x.shape
    combined_height = height * 3
    combined_width = width * batch_size
    combined_image = np.zeros((combined_height, combined_width, 3), dtype=np.uint8)

    for i in range(batch_size):
        image = x[i].cpu().permute(1, 2, 0).numpy()
        image = (image * std + mean).clip(0, 1)
        image = (image * 255).astype(np.uint8)
        true_mask = y[i].cpu().numpy()
        true_mask_color = mask_to_color(true_mask)
        pred_mask = y_pred[i].cpu().numpy()
        pred_mask_color = mask_to_color(pred_mask)
        combined_image[:height, i * width:(i + 1) * width, :] = image
        combined_image[height:2 * height, i * width:(i + 1) * width, :] = true_mask_color
        combined_image[2 * height:, i * width:(i + 1) * width, :] = pred_mask_color
    return combined_image


def compute_metrics(y_pred: torch.Tensor, y: torch.Tensor, num_classes: int = 18) -> Tuple[float, float, float, float, float, float]:
    pred_mask = y_pred.unsqueeze(-1) == torch.arange(num_classes, device=y_pred.device).reshape(1, 1, 1, -1)
    target_mask = y.unsqueeze(-1) == torch.arange(num_classes, device=y.device).reshape(1, 1, 1, -1)
    class_present = (target_mask.sum(dim=(0, 1, 2)) > 0).float()
    tp = (pred_mask & target_mask).sum(dim=(0, 1, 2)).float()
    fp = (pred_mask & ~target_mask).sum(dim=(0, 1, 2)).float()
    fn = (~pred_mask & target_mask).sum(dim=(0, 1, 2)).float()
    tn = (~pred_mask & ~target_mask).sum(dim=(0, 1, 2)).float()
    overall_tp = tp.sum()
    overall_fp = fp.sum()
    overall_fn = fn.sum()
    overall_tn = tn.sum()
    precision = tp / (tp + fp).clamp(min=1e-8)
    recall = tp / (tp + fn).clamp(min=1e-8)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    macro_precision = ((precision * class_present).sum() / class_present.sum().clamp(min=1e-8)).item()
    macro_recall = ((recall * class_present).sum() / class_present.sum().clamp(min=1e-8)).item()
    macro_accuracy = accuracy.mean().item()
    micro_precision = (overall_tp / (overall_tp + overall_fp).clamp(min=1e-8)).item()
    micro_recall = (overall_tp / (overall_tp + overall_fn).clamp(min=1e-8)).item()
    global_accuracy = ((y_pred == y).sum() / (y.shape[0] * y.shape[1] * y.shape[2])).item()
    return macro_precision, macro_recall, macro_accuracy, micro_precision, micro_recall, global_accuracy


def parse_args():
    parser = argparse.ArgumentParser(description="Train a model on human parsing dataset")
    parser.add_argument("--data-path", type=str, default="mattmdjaga/human_parsing_dataset", help="Path to the data")
    parser.add_argument("--batch-size", type=int, default=32, help="Batch size for training and testing")
    parser.add_argument("--pin-memory", type=bool, default=True, help="Pin memory for DataLoader")
    parser.add_argument("--num-workers", type=int, default=0, help="Number of workers for DataLoader")
    parser.add_argument("--num-epochs", type=int, default=15, help="Number of training epochs")
    parser.add_argument("--optimizer", type=str, default="AdamW", help="Optimizer type")
    parser.add_argument("--learning-rate", type=float, default=1e-4, help="Learning rate for the optimizer")
    parser.add_argument("--max-norm", type=float, default=1.0, help="Maximum gradient norm for clipping")
    parser.add_argument("--logs-dir", type=str, default="dino-logs", help="Directory for saving logs")
    parser.add_argument("--model", type=str, default="dino", choices=["unet", "vit", "dino"], help="Model class name")
    parser.add_argument("--losses-path", type=str, default="losses_config.json", help="Path to the losses")
    parser.add_argument("--mixed-precision", type=str, default="fp16", choices=["fp16", "bf16", "fp8", "no"], help="Value of the mixed precision")
    parser.add_argument("--gradient-accumulation-steps", type=int, default=2, help="Value of the gradient accumulation steps")
    parser.add_argument("--project-name", type=str, default="human_parsing_segmentation_ttk", help="WandB project name")
    parser.add_argument("--save-frequency", type=int, default=4, help="Frequency of saving model weights")
    parser.add_argument("--log-steps", type=int, default=400, help="Number of steps between logging training images and metrics")
    parser.add_argument("--seed", type=int, default=42, help="Value of the seed")
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    set_seed(args.seed)
    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision)

    with open(args.losses_path, "r") as fp:
        losses_config = json.load(fp)

    with accelerator.main_process_first():
        logs_dir = Path(args.logs_dir)
        logs_dir.mkdir(exist_ok=True)
        wandb.init(project=args.project_name, dir=logs_dir)
        wandb.save(args.losses_path)

    optimizer_class = getattr(torch.optim, args.optimizer)

    if args.model == "unet":
        model = UNet().to(accelerator.device)
        optimizer = optimizer_class(model.parameters(), lr=args.learning_rate)
    elif args.model == "vit":
        model = ViTSegmentation().to(accelerator.device)
        optimizer = optimizer_class(model.parameters(), lr=args.learning_rate)
    elif args.model == "dino":
        model = DINOSegmentationModel().to(accelerator.device)
        optimizer = optimizer_class(model.segmentation_head.parameters(), lr=args.learning_rate)
    else:
        raise NotImplementedError("Incorrect model name")

    transform = get_transform(model.mean, model.std)

    dataset = load_dataset(args.data_path, split="train")
    train_dataset = SegmentationDataset(dataset, train=True, transform=transform, target_transform=mask_transform)
    valid_dataset = SegmentationDataset(dataset, train=False, transform=transform, target_transform=mask_transform)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=args.pin_memory, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=args.pin_memory, collate_fn=collate_fn)

    criterion = get_composite_criterion(losses_config)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_epochs * len(train_loader))

    model, optimizer, train_loader, lr_scheduler = accelerator.prepare(model, optimizer, train_loader, lr_scheduler)

    best_accuracy = 0

    print(f"params: {sum([p.numel() for p in model.parameters()])/1e6:.2f} M")
    print(f"trainable params: {sum([p.numel() for p in model.parameters() if p.requires_grad])/1e6:.2f} M")

    train_loss_ema, train_macro_precision_ema, train_macro_recall_ema, train_macro_accuracy_ema, train_micro_precision_ema, train_micro_recall_ema, train_global_accuracy_ema = EMA(), EMA(), EMA(), EMA(), EMA(), EMA(), EMA()
    for epoch in range(1, args.num_epochs + 1):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train epoch {epoch}/{args.num_epochs}")
        for index, (x, y) in enumerate(pbar):
            x, y = x.to(accelerator.device), y.squeeze(1).to(accelerator.device)
            with accelerator.accumulate(model):
                with accelerator.autocast():
                    output = model(x)
                    loss = criterion(output, y)
                accelerator.backward(loss)
                train_loss = loss.item()
                grad_norm = None
                _, y_pred = output.max(dim=1)
                train_macro_precision, train_macro_recall, train_macro_accuracy, train_micro_precision, train_micro_recall, train_global_accuracy = compute_metrics(y_pred, y)
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), args.max_norm).item()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            if (index + 1) % args.log_steps == 0 and accelerator.is_main_process:
                images_to_log = []
                combined_image = create_combined_image(x, y, y_pred)
                images_to_log.append(wandb.Image(combined_image, caption=f"Combined Image (Train, Epoch {epoch}, Batch {index})"))
                wandb.log({"train_samples": images_to_log})
            pbar.set_postfix({"loss": train_loss_ema(train_loss), "macro_precision": train_macro_precision_ema(train_macro_precision), "macro_recall": train_macro_recall_ema(train_macro_recall), "macro_accuracy": train_macro_accuracy_ema(train_macro_accuracy), "micro_precision": train_micro_precision_ema(train_micro_precision), "micro_recall": train_micro_recall_ema(train_micro_recall), "global_accuracy": train_global_accuracy_ema(train_global_accuracy)})
            log_data = {
                "train/epoch": epoch,
                "train/loss": train_loss,
                "train/macro_accuracy": train_macro_accuracy,
                "train/learning_rate": optimizer.param_groups[0]["lr"],
                "train/macro_precision": train_macro_precision,
                "train/macro_recall": train_macro_recall,
                "train/micro_precision": train_micro_precision,
                "train/micro_recall": train_micro_recall,
                "train/global_accuracy": train_global_accuracy,
            }
            if grad_norm is not None:
                log_data["train/grad_norm"] = grad_norm
            if accelerator.is_main_process:
                wandb.log(log_data)
        accelerator.wait_for_everyone()

        model.eval()
        valid_loss, valid_macro_accuracies, valid_macro_precisions, valid_macro_recalls, valid_global_accuracies, valid_micro_precisions, valid_micro_recalls = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
        with torch.inference_mode():
            pbar = tqdm(valid_loader, desc=f"Val epoch {epoch}/{args.num_epochs}")
            for index, (x, y) in enumerate(pbar):
                x, y = x.to(accelerator.device), y.squeeze(1).to(accelerator.device)
                output = model(x)
                _, y_pred = output.max(dim=1)
                if (index + 1) % args.log_steps == 0 and accelerator.is_main_process:
                    images_to_log = []
                    combined_image = create_combined_image(x, y, y_pred)
                    images_to_log.append(wandb.Image(combined_image, caption=f"Combined Image (Validation, Epoch {epoch})"))
                    wandb.log({"valid_samples": images_to_log})
                valid_macro_precision, valid_macro_recall, valid_macro_accuracy, valid_micro_precision, valid_micro_recall, valid_global_accuracy = compute_metrics(y_pred, y)
                valid_macro_precisions += valid_macro_precision
                valid_macro_recalls += valid_macro_recall
                valid_macro_accuracies += valid_macro_accuracy
                valid_micro_precisions += valid_micro_precision
                valid_micro_recalls += valid_micro_recall
                valid_global_accuracies += valid_global_accuracy
                loss = criterion(output, y)
                valid_loss += loss.item()
        valid_loss = valid_loss / len(valid_loader)
        valid_macro_accuracies = valid_macro_accuracies / len(valid_loader)
        valid_macro_precisions = valid_macro_precisions / len(valid_loader)
        valid_macro_recalls = valid_macro_recalls / len(valid_loader)
        valid_global_accuracies = valid_global_accuracies / len(valid_loader)
        valid_micro_precisions = valid_micro_precisions / len(valid_loader)
        valid_micro_recalls = valid_micro_recalls / len(valid_loader)
        accelerator.print(f"loss: {valid_loss:.3f}, valid_macro_precision: {valid_macro_precisions:.3f}, valid_macro_recall: {valid_macro_recalls:.3f}, valid_macro_accuracy: {valid_macro_accuracies:.3f}, valid_micro_precision: {valid_micro_precisions:.3f}, valid_micro_recall: {valid_micro_recalls:.3f}, valid_global_accuracy: {valid_global_accuracies:.3f}")
        if accelerator.is_main_process:
            wandb.log(
                {
                    "val/epoch": epoch,
                    "val/loss": valid_loss,
                    "val/macro_accuracy": valid_macro_accuracies,
                    "val/macro_precision": valid_macro_precisions,
                    "val/macro_recall": valid_macro_recalls,
                    "val/global_accuracy": valid_global_accuracies,
                    "val/micro_precision": valid_micro_precisions,
                    "val/micro_recall": valid_micro_recalls,
                }
            )
            if valid_global_accuracies > best_accuracy:
                best_accuracy = valid_global_accuracies
                if args.model in ["dino", "vit"]:
                    accelerator.save(model.segmentation_head.state_dict(), logs_dir / f"checkpoint-best.pth")
                else:
                    accelerator.save(model.state_dict(), logs_dir / f"checkpoint-best.pth")
                accelerator.print(f"new best_accuracy {best_accuracy}, {epoch=}")
            if epoch % args.save_frequency == 0:
                if args.model in ["dino", "vit"]:
                    accelerator.save(model.segmentation_head.state_dict(), logs_dir / f"checkpoint-{epoch:09}.pth")
                else:
                    accelerator.save(model.state_dict(), logs_dir / f"checkpoint-{epoch:09}.pth")
        accelerator.wait_for_everyone()

    accelerator.wait_for_everyone()
    wandb.finish()


if __name__ == "__main__":
    main()
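For the "dino" and "vit" models only the segmentation head is written to checkpoint-best.pth, so inference means rebuilding the full model and loading the head back in. A minimal sketch, assuming the default --logs-dir ("dino-logs") was used and src/ is on the import path:

    import torch
    from models.dino import DINOSegmentationModel

    model = DINOSegmentationModel()
    state_dict = torch.load("dino-logs/checkpoint-best.pth", map_location="cpu")  # hypothetical path from the defaults
    model.segmentation_head.load_state_dict(state_dict)
    model.eval()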
src/utils.py
ADDED
@@ -0,0 +1,58 @@
import torch
import torchvision.transforms as T
import PIL.Image
from typing import List


size = (224, 224)


class ResizeWithPadding:
    def __init__(self, target_size: int = 224, fill: int = 0, mode: str = "RGB") -> None:
        self.target_size = target_size
        self.fill = fill
        self.mode = mode

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        original_width, original_height = image.size
        aspect_ratio = original_width / original_height
        if aspect_ratio > 1:
            new_width = self.target_size
            new_height = int(self.target_size / aspect_ratio)
        else:
            new_height = self.target_size
            new_width = int(self.target_size * aspect_ratio)
        resized_image = image.resize((new_width, new_height), PIL.Image.BICUBIC if self.mode == "RGB" else PIL.Image.NEAREST)
        delta_w = self.target_size - new_width
        delta_h = self.target_size - new_height
        padding = (delta_w // 2, delta_h // 2, delta_w - delta_w // 2, delta_h - delta_h // 2)
        padded_image = PIL.Image.new(self.mode, (self.target_size, self.target_size), self.fill)
        padded_image.paste(resized_image, (padding[0], padding[1]))
        return padded_image


def get_transform(mean: List[float], std: List[float]) -> T.Compose:
    return T.Compose([
        ResizeWithPadding(),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ])


mask_transform = T.Compose([
    ResizeWithPadding(mode="L"),
    T.ToTensor(),
    T.Lambda(lambda x: (x * 255).long()),
])


class EMA:
    def __init__(self, alpha: float = 0.9) -> None:
        self.value = None
        self.alpha = alpha

    def __call__(self, value: float) -> float:
        if self.value is None:
            self.value = value
        else:
            self.value = self.alpha * self.value + (1 - self.alpha) * value
        return self.value
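Both transforms letterbox inputs to 224x224: the image path is normalized to a float tensor, while the mask path keeps integer class ids (ToTensor divides by 255, the Lambda scales back and casts to long). A small illustrative check on synthetic non-square inputs:

    import PIL.Image
    from utils import get_transform, mask_transform

    image = PIL.Image.new("RGB", (300, 400))
    mask = PIL.Image.new("L", (300, 400))

    x = get_transform(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(image)
    y = mask_transform(mask)
    print(x.shape, x.dtype)  # torch.Size([3, 224, 224]) torch.float32
    print(y.shape, y.dtype)  # torch.Size([1, 224, 224]) torch.int64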