Spaces:

henry000
/

YOLO

Running

App Files Community

henry000 committed on Jun 7, 2024

Commit

2dd2ae5

1 Parent(s): 0a3c9de

✨ [New] Framework anc2box -> anc2vec + vec2box

Browse files

Files changed (8) hide show

yolo/config/model/v9-c.yaml +4 -13
yolo/lazy.py +5 -2
yolo/model/module.py +17 -36
yolo/model/yolo.py +3 -3
yolo/tools/format_converters.py +1 -1
yolo/tools/loss_functions.py +32 -42
yolo/tools/solver.py +12 -7
yolo/utils/bounding_box_utils.py +47 -8

yolo/config/model/v9-c.yaml CHANGED Viewed

@@ -1,6 +1,5 @@
 anchor:
   reg_max: 16
-  strides: [8, 16, 32]
 model:
   backbone:
@@ -120,23 +119,15 @@ model:
     - MultiheadDetection:
         source: [A3, A4, A5]
-        tags: aux_head
-    - Anchor2Box:
-        source: aux_head
-        output: True
         args:
             reg_max: ${model.anchor.reg_max}
-            strides: ${model.anchor.strides}
-        tags: aux_bbox
   detection:
     - MultiheadDetection:
         source: [P3, P4, P5]
-        tags: reg_head
-    - Anchor2Box:
-        source: reg_head
-        output: True
         args:
             reg_max: ${model.anchor.reg_max}
-            strides: ${model.anchor.strides}
-        tags: reg_bbox

 anchor:
   reg_max: 16
 model:
   backbone:
     - MultiheadDetection:
         source: [A3, A4, A5]
+        tags: AUX
         args:
             reg_max: ${model.anchor.reg_max}
+        output: True
   detection:
     - MultiheadDetection:
         source: [P3, P4, P5]
+        tags: Main
         args:
             reg_max: ${model.anchor.reg_max}
+        output: True

yolo/lazy.py CHANGED Viewed

@@ -11,6 +11,7 @@ from yolo.config.config import Config
 from yolo.model.yolo import create_model
 from yolo.tools.data_loader import create_dataloader
 from yolo.tools.solver import ModelTester, ModelTrainer
 from yolo.utils.deploy_utils import FastModelLoader
 from yolo.utils.logging_utils import custom_logger, validate_log_directory
@@ -27,12 +28,14 @@ def main(cfg: Config):
     else:
         model = create_model(cfg.model, class_num=cfg.class_num, weight_path=cfg.weight).to(device)
     if cfg.task.task == "train":
-        trainer = ModelTrainer(cfg, model, save_path, device)
         trainer.solve(dataloader)
     if cfg.task.task == "inference":
-        tester = ModelTester(cfg, model, save_path, device)
         tester.solve(dataloader)

 from yolo.model.yolo import create_model
 from yolo.tools.data_loader import create_dataloader
 from yolo.tools.solver import ModelTester, ModelTrainer
+from yolo.utils.bounding_box_utils import Vec2Box
 from yolo.utils.deploy_utils import FastModelLoader
 from yolo.utils.logging_utils import custom_logger, validate_log_directory
     else:
         model = create_model(cfg.model, class_num=cfg.class_num, weight_path=cfg.weight).to(device)
+    vec2box = Vec2Box(model, cfg.image_size, device)
     if cfg.task.task == "train":
+        trainer = ModelTrainer(cfg, model, vec2box, save_path, device)
         trainer.solve(dataloader)
     if cfg.task.task == "inference":
+        tester = ModelTester(cfg, model, vec2box, save_path, device)
         tester.solve(dataloader)

yolo/model/module.py CHANGED Viewed

@@ -58,7 +58,7 @@ class Detection(nn.Module):
         anchor_channels = 4 * reg_max
         first_neck, in_channels = in_channels
-        anchor_neck = max(round_up(first_neck // 4, groups), anchor_channels, 16)
         class_neck = max(first_neck, min(num_classes * 2, 128))
         self.anchor_conv = nn.Sequential(
@@ -70,13 +70,16 @@ class Detection(nn.Module):
             Conv(in_channels, class_neck, 3), Conv(class_neck, class_neck, 3), nn.Conv2d(class_neck, num_classes, 1)
         )
         self.anchor_conv[-1].bias.data.fill_(1.0)
         self.class_conv[-1].bias.data.fill_(-10)
-    def forward(self, x: List[Tensor]) -> List[Tensor]:
         anchor_x = self.anchor_conv(x)
         class_x = self.class_conv(x)
-        return torch.cat([anchor_x, class_x], dim=1)
 class MultiheadDetection(nn.Module):
@@ -92,40 +95,18 @@ class MultiheadDetection(nn.Module):
         return [head(x) for x, head in zip(x_list, self.heads)]
-class Anchor2Box(nn.Module):
-    def __init__(self, reg_max, strides, num_classes: int) -> None:
         super().__init__()
-        self.reg_max = reg_max
-        self.strides = strides
-        # TODO: read by cfg!
-        image_size = [640, 640]
-        self.num_classes = num_classes
-        self.anchors, self.scaler = generate_anchors(image_size, self.strides)
-        reverse_reg = torch.arange(self.reg_max, dtype=torch.float32)
-        self.reverse_reg = nn.Parameter(reverse_reg, requires_grad=False)
-        self.anchors = nn.Parameter(self.anchors, requires_grad=False)
-        self.scaler = nn.Parameter(self.scaler, requires_grad=False)
-    def forward(self, predicts: List[Tensor]) -> Tensor:
-        """
-        args:
-            [B x AnchorClass x h1 x w1, B x AnchorClass x h2 x w2, B x AnchorClass x h3 x w3] // AnchorClass = 4 * 16 + 80
-        return:
-            [B x HW x ClassBbox] // HW = h1*w1 + h2*w2 + h3*w3, ClassBox = 80 + 4 (xyXY)
-        """
-        preds = []
-        for pred in predicts:
-            preds.append(rearrange(pred, "B AC h w -> B (h w) AC"))  # B x AC x h x w-> B x hw x AC
-        preds = torch.concat(preds, dim=1)  # -> B x (H W) x AC
-        preds_anc, preds_cls = torch.split(preds, (self.reg_max * 4, self.num_classes), dim=-1)
-        preds_anc = rearrange(preds_anc, "B  hw (P R)-> B hw P R", P=4)
-        pred_LTRB = preds_anc.softmax(dim=-1) @ self.reverse_reg * self.scaler.view(1, -1, 1)
-        lt, rb = pred_LTRB.chunk(2, dim=-1)
-        preds_box = torch.cat([self.anchors - lt, self.anchors + rb], dim=-1)
-        predicts = torch.cat([preds_cls, preds_box], dim=-1)
-        return predicts, preds_anc
 # ----------- Backbone Class ----------- #

         anchor_channels = 4 * reg_max
         first_neck, in_channels = in_channels
+        anchor_neck = max(round_up(first_neck // 4, groups), anchor_channels, reg_max)
         class_neck = max(first_neck, min(num_classes * 2, 128))
         self.anchor_conv = nn.Sequential(
             Conv(in_channels, class_neck, 3), Conv(class_neck, class_neck, 3), nn.Conv2d(class_neck, num_classes, 1)
         )
+        self.anc2vec = Anchor2Vec(reg_max=reg_max)
         self.anchor_conv[-1].bias.data.fill_(1.0)
         self.class_conv[-1].bias.data.fill_(-10)
+    def forward(self, x: Tensor) -> Tuple[Tensor]:
         anchor_x = self.anchor_conv(x)
         class_x = self.class_conv(x)
+        anchor_x, vector_x = self.anc2vec(anchor_x)
+        return class_x, anchor_x, vector_x
 class MultiheadDetection(nn.Module):
         return [head(x) for x, head in zip(x_list, self.heads)]
+class Anchor2Vec(nn.Module):
+    def __init__(self, reg_max: int = 16) -> None:
         super().__init__()
+        reverse_reg = torch.arange(reg_max, dtype=torch.float32).view(1, reg_max, 1, 1, 1)
+        self.anc2vec = nn.Conv3d(in_channels=reg_max, out_channels=1, kernel_size=1, bias=False)
+        self.anc2vec.weight = nn.Parameter(reverse_reg, requires_grad=False)
+    def forward(self, anchor_x: Tensor) -> Tensor:
+        anchor_x = rearrange(anchor_x, "B (P R) h w -> B R P h w", P=4)
+        vector_x = anchor_x.softmax(dim=1)
+        vector_x = self.anc2vec(vector_x).squeeze(1)
+        return anchor_x, vector_x
 # ----------- Backbone Class ----------- #

yolo/model/yolo.py CHANGED Viewed

@@ -66,7 +66,7 @@ class YOLO(nn.Module):
     def forward(self, x):
         y = {0: x}
-        output = []
         for index, layer in enumerate(self.model, start=1):
             if isinstance(layer.source, list):
                 model_input = [y[idx] for idx in layer.source]
@@ -77,7 +77,7 @@ class YOLO(nn.Module):
             if layer.usable:
                 y[index] = x
             if layer.output:
-                output.append(x)
         return output
     def get_out_channels(self, layer_type: str, layer_args: dict, output_dim: list, source: Union[int, list]):
@@ -131,7 +131,7 @@ def create_model(model_cfg: ModelConfig, class_num: int = 80, weight_path: str =
     logger.info("✅ Success load model")
     if weight_path:
         if os.path.exists(weight_path):
-            model.model.load_state_dict(torch.load(weight_path), strict=False)
             logger.info("✅ Success load model weight")
         else:
             logger.info(f"🌐 Weight {weight_path} not found, try downloading")

     def forward(self, x):
         y = {0: x}
+        output = dict()
         for index, layer in enumerate(self.model, start=1):
             if isinstance(layer.source, list):
                 model_input = [y[idx] for idx in layer.source]
             if layer.usable:
                 y[index] = x
             if layer.output:
+                output[layer.tags] = x
         return output
     def get_out_channels(self, layer_type: str, layer_args: dict, output_dim: list, source: Union[int, list]):
     logger.info("✅ Success load model")
     if weight_path:
         if os.path.exists(weight_path):
+            model.model.load_state_dict(torch.load(weight_path))
             logger.info("✅ Success load model weight")
         else:
             logger.info(f"🌐 Weight {weight_path} not found, try downloading")

yolo/tools/format_converters.py CHANGED Viewed

@@ -17,7 +17,7 @@ def convert_weight(old_state_dict, new_state_dict, model_size: int = 38):
                     continue
                 _, _, conv_name, conv_idx, *details = weight_name.split(".")
                 if conv_name == "cv4" or conv_name == "cv5":
-                    layer_idx = 39
                 else:
                     layer_idx = 37

                     continue
                 _, _, conv_name, conv_idx, *details = weight_name.split(".")
                 if conv_name == "cv4" or conv_name == "cv5":
+                    layer_idx = 38
                 else:
                     layer_idx = 37

yolo/tools/loss_functions.py CHANGED Viewed

@@ -2,14 +2,12 @@ from typing import Any, Dict, List, Tuple
 import torch
 import torch.nn.functional as F
-from einops import rearrange
 from loguru import logger
 from torch import Tensor, nn
 from torch.nn import BCEWithLogitsLoss
-from yolo.config.config import Config
-from yolo.utils.bounding_box_utils import BoxMatcher, calculate_iou, generate_anchors
-from yolo.utils.module_utils import divide_into_chunks
 class BCELoss(nn.Module):
@@ -40,10 +38,9 @@ class BoxLoss(nn.Module):
 class DFLoss(nn.Module):
-    def __init__(self, anchors: Tensor, scaler: Tensor, reg_max: int) -> None:
         super().__init__()
-        self.anchors = anchors
-        self.scaler = scaler
         self.reg_max = reg_max
     def forward(
@@ -51,8 +48,9 @@ class DFLoss(nn.Module):
     ) -> Any:
         valid_bbox = valid_masks[..., None].expand(-1, -1, 4)
         bbox_lt, bbox_rb = targets_bbox.chunk(2, -1)
-        anchors_norm = (self.anchors / self.scaler[:, None])[None]
-        targets_dist = torch.cat(((anchors_norm - bbox_lt), (bbox_rb - anchors_norm)), -1).clamp(0, self.reg_max - 1.01)
         picked_targets = targets_dist[valid_bbox].view(-1)
         picked_predict = predicts_anc[valid_bbox].view(-1, self.reg_max)
@@ -68,42 +66,31 @@ class DFLoss(nn.Module):
 class YOLOLoss:
-    def __init__(self, cfg: Config) -> None:
-        self.reg_max = cfg.model.anchor.reg_max
-        self.class_num = cfg.class_num
-        self.image_size = list(cfg.image_size)
-        self.strides = cfg.model.anchor.strides
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.anchors, self.scaler = generate_anchors(self.image_size, self.strides)
-        self.anchors = self.anchors.to(device)
-        self.scaler = self.scaler.to(device)
         self.cls = BCELoss()
-        self.dfl = DFLoss(self.anchors, self.scaler, self.reg_max)
         self.iou = BoxLoss()
-        self.matcher = BoxMatcher(cfg.task.loss.matcher, self.class_num, self.anchors)
     def separate_anchor(self, anchors):
         """
         separate anchor and bbouding box
         """
         anchors_cls, anchors_box = torch.split(anchors, (self.class_num, 4), dim=-1)
-        anchors_box = anchors_box / self.scaler[None, :, None]
         return anchors_cls, anchors_box
-    def __call__(
-        self, predicts_box: List[Tensor], predicts_anc: Tensor, targets: Tensor
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        # Batch_Size x (Anchor + Class) x H x W
-        # TODO: check datatype, why targets has a little bit error with origin version
         # For each predicted targets, assign a best suitable ground truth box.
-        align_targets, valid_masks = self.matcher(targets, predicts_box)
         targets_cls, targets_bbox = self.separate_anchor(align_targets)
-        predicts_cls, predicts_bbox = self.separate_anchor(predicts_box)
         cls_norm = targets_cls.sum()
         box_norm = targets_cls.sum(-1)[valid_masks]
@@ -111,7 +98,7 @@ class YOLOLoss:
         ## -- CLS -- ##
         loss_cls = self.cls(predicts_cls, targets_cls, cls_norm)
         ## -- IOU -- ##
-        loss_iou = self.iou(predicts_bbox, targets_bbox, valid_masks, box_norm, cls_norm)
         ## -- DFL -- ##
         loss_dfl = self.dfl(predicts_anc, targets_bbox, valid_masks, box_norm, cls_norm)
@@ -119,19 +106,22 @@ class YOLOLoss:
 class DualLoss:
-    def __init__(self, cfg: Config) -> None:
-        self.loss = YOLOLoss(cfg)
-        self.aux_rate = cfg.task.loss.aux
-        self.iou_rate = cfg.task.loss.objective["BoxLoss"]
-        self.dfl_rate = cfg.task.loss.objective["DFLoss"]
-        self.cls_rate = cfg.task.loss.objective["BCELoss"]
-    def __call__(self, predicts: List[Tensor], targets: Tensor) -> Tuple[Tensor, Dict[str, Tensor]]:
         # TODO: Need Refactor this region, make it flexible!
-        aux_iou, aux_dfl, aux_cls = self.loss(*predicts[0], targets)
-        main_iou, main_dfl, main_cls = self.loss(*predicts[1], targets)
         loss_dict = {
             "BoxLoss": self.iou_rate * (aux_iou * self.aux_rate + main_iou),
@@ -142,7 +132,7 @@ class DualLoss:
         return loss_sum, loss_dict
-def get_loss_function(cfg: Config) -> YOLOLoss:
-    loss_function = DualLoss(cfg)
     logger.info("✅ Success load loss function")
     return loss_function

 import torch
 import torch.nn.functional as F
 from loguru import logger
 from torch import Tensor, nn
 from torch.nn import BCEWithLogitsLoss
+from yolo.config.config import Config, LossConfig
+from yolo.utils.bounding_box_utils import BoxMatcher, Vec2Box, calculate_iou
 class BCELoss(nn.Module):
 class DFLoss(nn.Module):
+    def __init__(self, anchors_norm: Tensor, reg_max: int) -> None:
         super().__init__()
+        self.anchors_norm = anchors_norm
         self.reg_max = reg_max
     def forward(
     ) -> Any:
         valid_bbox = valid_masks[..., None].expand(-1, -1, 4)
         bbox_lt, bbox_rb = targets_bbox.chunk(2, -1)
+        targets_dist = torch.cat(((self.anchors_norm - bbox_lt), (bbox_rb - self.anchors_norm)), -1).clamp(
+            0, self.reg_max - 1.01
+        )
         picked_targets = targets_dist[valid_bbox].view(-1)
         picked_predict = predicts_anc[valid_bbox].view(-1, self.reg_max)
 class YOLOLoss:
+    def __init__(self, loss_cfg: LossConfig, vec2box: Vec2Box, class_num: int = 80, reg_max: int = 16) -> None:
+        self.class_num = class_num
+        self.vec2box = vec2box
         self.cls = BCELoss()
+        self.dfl = DFLoss(vec2box.anchor_norm, reg_max)
         self.iou = BoxLoss()
+        self.matcher = BoxMatcher(loss_cfg.matcher, self.class_num, vec2box.anchor_grid)
     def separate_anchor(self, anchors):
         """
         separate anchor and bbouding box
         """
         anchors_cls, anchors_box = torch.split(anchors, (self.class_num, 4), dim=-1)
+        anchors_box = anchors_box / self.vec2box.scaler[None, :, None]
         return anchors_cls, anchors_box
+    def __call__(self, predicts: List[Tensor], targets: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        predicts_cls, predicts_anc, predicts_box = predicts
         # For each predicted targets, assign a best suitable ground truth box.
+        align_targets, valid_masks = self.matcher(targets, (predicts_cls, predicts_box))
         targets_cls, targets_bbox = self.separate_anchor(align_targets)
+        predicts_box = predicts_box / self.vec2box.scaler[None, :, None]
         cls_norm = targets_cls.sum()
         box_norm = targets_cls.sum(-1)[valid_masks]
         ## -- CLS -- ##
         loss_cls = self.cls(predicts_cls, targets_cls, cls_norm)
         ## -- IOU -- ##
+        loss_iou = self.iou(predicts_box, targets_bbox, valid_masks, box_norm, cls_norm)
         ## -- DFL -- ##
         loss_dfl = self.dfl(predicts_anc, targets_bbox, valid_masks, box_norm, cls_norm)
 class DualLoss:
+    def __init__(self, cfg: Config, vec2box) -> None:
+        loss_cfg = cfg.task.loss
+        self.loss = YOLOLoss(loss_cfg, vec2box, class_num=cfg.class_num, reg_max=cfg.model.anchor.reg_max)
+        self.aux_rate = loss_cfg.aux
+        self.iou_rate = loss_cfg.objective["BoxLoss"]
+        self.dfl_rate = loss_cfg.objective["DFLoss"]
+        self.cls_rate = loss_cfg.objective["BCELoss"]
+    def __call__(
+        self, aux_predicts: List[Tensor], main_predicts: List[Tensor], targets: Tensor
+    ) -> Tuple[Tensor, Dict[str, Tensor]]:
         # TODO: Need Refactor this region, make it flexible!
+        aux_iou, aux_dfl, aux_cls = self.loss(aux_predicts, targets)
+        main_iou, main_dfl, main_cls = self.loss(main_predicts, targets)
         loss_dict = {
             "BoxLoss": self.iou_rate * (aux_iou * self.aux_rate + main_iou),
         return loss_sum, loss_dict
+def get_loss_function(cfg: Config, vec2box) -> DualLoss:
+    loss_function = DualLoss(cfg, vec2box)
     logger.info("✅ Success load loss function")
     return loss_function

yolo/tools/solver.py CHANGED Viewed

@@ -10,7 +10,7 @@ from yolo.model.yolo import YOLO
 from yolo.tools.data_loader import StreamDataLoader, create_dataloader
 from yolo.tools.drawer import draw_bboxes
 from yolo.tools.loss_functions import get_loss_function
-from yolo.utils.bounding_box_utils import bbox_nms, calculate_map
 from yolo.utils.logging_utils import ProgressTracker
 from yolo.utils.model_utils import (
     ExponentialMovingAverage,
@@ -20,13 +20,14 @@ from yolo.utils.model_utils import (
 class ModelTrainer:
-    def __init__(self, cfg: Config, model: YOLO, save_path: str, device):
         train_cfg: TrainConfig = cfg.task
         self.model = model
         self.device = device
         self.optimizer = create_optimizer(model, train_cfg.optimizer)
         self.scheduler = create_scheduler(self.optimizer, train_cfg.scheduler)
-        self.loss_fn = get_loss_function(cfg)
         self.progress = ProgressTracker(cfg.name, save_path, cfg.use_wandb)
         self.num_epochs = cfg.task.epoch
@@ -45,7 +46,9 @@ class ModelTrainer:
         with autocast():
             outputs = self.model(data)
-            loss, loss_item = self.loss_fn(outputs, targets)
         self.scaler.scale(loss).backward()
         self.scaler.step(self.optimizer)
@@ -96,9 +99,10 @@ class ModelTrainer:
 class ModelTester:
-    def __init__(self, cfg: Config, model: YOLO, save_path: str, device):
         self.model = model
         self.device = device
         self.progress = ProgressTracker(cfg, save_path, cfg.use_wandb)
         self.nms = cfg.task.nms
@@ -112,8 +116,9 @@ class ModelTester:
             for idx, images in enumerate(dataloader):
                 images = images.to(self.device)
                 with torch.no_grad():
-                    raw_output = self.model(images)
-                nms_out = bbox_nms(raw_output[-1][0], self.nms)
                 draw_bboxes(
                     images[0],
                     nms_out[0],

 from yolo.tools.data_loader import StreamDataLoader, create_dataloader
 from yolo.tools.drawer import draw_bboxes
 from yolo.tools.loss_functions import get_loss_function
+from yolo.utils.bounding_box_utils import Vec2Box, bbox_nms, calculate_map
 from yolo.utils.logging_utils import ProgressTracker
 from yolo.utils.model_utils import (
     ExponentialMovingAverage,
 class ModelTrainer:
+    def __init__(self, cfg: Config, model: YOLO, vec2box: Vec2Box, save_path: str, device):
         train_cfg: TrainConfig = cfg.task
         self.model = model
+        self.vec2box = vec2box
         self.device = device
         self.optimizer = create_optimizer(model, train_cfg.optimizer)
         self.scheduler = create_scheduler(self.optimizer, train_cfg.scheduler)
+        self.loss_fn = get_loss_function(cfg, vec2box)
         self.progress = ProgressTracker(cfg.name, save_path, cfg.use_wandb)
         self.num_epochs = cfg.task.epoch
         with autocast():
             outputs = self.model(data)
+            aux_predicts = self.vec2box(outputs["AUX"])
+            main_predicts = self.vec2box(outputs["Main"])
+            loss, loss_item = self.loss_fn(aux_predicts, main_predicts, targets)
         self.scaler.scale(loss).backward()
         self.scaler.step(self.optimizer)
 class ModelTester:
+    def __init__(self, cfg: Config, model: YOLO, vec2box: Vec2Box, save_path: str, device):
         self.model = model
         self.device = device
+        self.vec2box = vec2box
         self.progress = ProgressTracker(cfg, save_path, cfg.use_wandb)
         self.nms = cfg.task.nms
             for idx, images in enumerate(dataloader):
                 images = images.to(self.device)
                 with torch.no_grad():
+                    outputs = self.model(images)
+                outputs = self.vec2box(outputs["Main"])
+                nms_out = bbox_nms(outputs[0], outputs[2], self.nms)
                 draw_bboxes(
                     images[0],
                     nms_out[0],

yolo/utils/bounding_box_utils.py CHANGED Viewed

@@ -106,12 +106,23 @@ def transform_bbox(bbox: Tensor, indicator="xywh -> xyxy"):
     return bbox.to(dtype=data_type)
-def generate_anchors(image_size: List[int], strides: List[int]):
     W, H = image_size
     anchors = []
     scaler = []
-    for stride in strides:
-        anchor_num = W // stride * H // stride
         scaler.append(torch.full((anchor_num,), stride))
         shift = stride // 2
         x = torch.arange(0, W, stride) + shift
@@ -207,13 +218,13 @@ class BoxMatcher:
         unique_indices = target_matrix.argmax(dim=1)
         return unique_indices[..., None]
-    def __call__(self, target: Tensor, predict: Tensor) -> Tuple[Tensor, Tensor]:
         """
         1. For each anchor prediction, find the highest suitability targets
         2. Select the targets
         2. Noramlize the class probilities of targets
         """
-        predict_cls, predict_bbox = predict.split(self.class_num, dim=-1)  # B, HW x (C B) -> B x HW x C, B x HW x B
         target_cls, target_bbox = target.split([1, 4], dim=-1)  # B x N x (C B) -> B x N x C, B x N x B
         target_cls = target_cls.long().clamp(0)
@@ -251,9 +262,37 @@ class BoxMatcher:
         return torch.cat([align_cls, align_bbox], dim=-1), valid_mask.bool()
-def bbox_nms(predicts: Tensor, nms_cfg: NMSConfig):
     # TODO change function to class or set 80 to class_num instead of a number
-    cls_dist, bbox = torch.split(predicts, [80, 4], dim=-1)
     cls_dist = cls_dist.sigmoid()
     # filter class by confidence
@@ -266,7 +305,7 @@ def bbox_nms(predicts: Tensor, nms_cfg: NMSConfig):
     batch_idx, *_ = torch.where(valid_mask)
     nms_idx = batched_nms(valid_box, valid_cls, batch_idx, nms_cfg.min_iou)
     predicts_nms = []
-    for idx in range(predicts.size(0)):
         instance_idx = nms_idx[idx == batch_idx[nms_idx]]
         predict_nms = torch.cat(

     return bbox.to(dtype=data_type)
+def generate_anchors(image_size: List[int], anchors_list: List[Tuple[int]]):
+    """
+    Find the anchor maps for each w, h.
+    Args:
+        anchors_list List[[w1, h1], [w2, h2], ...]: the anchor num for each predicted anchor
+    Returns:
+        all_anchors [HW x 2]:
+        all_scalers [HW]: The index of the best targets for each anchors
+    """
     W, H = image_size
     anchors = []
     scaler = []
+    for anchor_wh in anchors_list:
+        stride = W // anchor_wh[0]
+        anchor_num = anchor_wh[0] * anchor_wh[1]
         scaler.append(torch.full((anchor_num,), stride))
         shift = stride // 2
         x = torch.arange(0, W, stride) + shift
         unique_indices = target_matrix.argmax(dim=1)
         return unique_indices[..., None]
+    def __call__(self, target: Tensor, predict: Tuple[Tensor]) -> Tuple[Tensor, Tensor]:
         """
         1. For each anchor prediction, find the highest suitability targets
         2. Select the targets
         2. Noramlize the class probilities of targets
         """
+        predict_cls, predict_bbox = predict
         target_cls, target_bbox = target.split([1, 4], dim=-1)  # B x N x (C B) -> B x N x C, B x N x B
         target_cls = target_cls.long().clamp(0)
         return torch.cat([align_cls, align_bbox], dim=-1), valid_mask.bool()
+class Vec2Box:
+    def __init__(self, model, image_size, device):
+        dummy_input = torch.zeros(1, 3, *image_size).to(device)
+        dummy_output = model(dummy_input)
+        anchors_num = []
+        for predict_head in dummy_output["Main"]:
+            _, _, *anchor_num = predict_head[2].shape
+            anchors_num.append(anchor_num)
+        anchor_grid, scaler = generate_anchors(image_size, anchors_num)
+        self.anchor_grid, self.scaler = anchor_grid.to(device), scaler.to(device)
+        self.anchor_norm = (anchor_grid / scaler[:, None])[None].to(device)
+    def __call__(self, predicts):
+        preds_cls, preds_anc, preds_box = [], [], []
+        for layer_output in predicts:
+            pred_cls, pred_anc, pred_box = layer_output
+            preds_cls.append(rearrange(pred_cls, "B C h w -> B (h w) C"))
+            preds_anc.append(rearrange(pred_anc, "B A R h w -> B (h w) R A"))
+            preds_box.append(rearrange(pred_box, "B X h w -> B (h w) X"))
+        preds_cls = torch.concat(preds_cls, dim=1)
+        preds_anc = torch.concat(preds_anc, dim=1)
+        preds_box = torch.concat(preds_box, dim=1)
+        pred_LTRB = preds_box * self.scaler.view(1, -1, 1)
+        lt, rb = pred_LTRB.chunk(2, dim=-1)
+        preds_box = torch.cat([self.anchor_grid - lt, self.anchor_grid + rb], dim=-1)
+        return preds_cls, preds_anc, preds_box
+def bbox_nms(cls_dist: Tensor, bbox: Tensor, nms_cfg: NMSConfig):
     # TODO change function to class or set 80 to class_num instead of a number
     cls_dist = cls_dist.sigmoid()
     # filter class by confidence
     batch_idx, *_ = torch.where(valid_mask)
     nms_idx = batched_nms(valid_box, valid_cls, batch_idx, nms_cfg.min_iou)
     predicts_nms = []
+    for idx in range(cls_dist.size(0)):
         instance_idx = nms_idx[idx == batch_idx[nms_idx]]
         predict_nms = torch.cat(