image-matching-webui

Sleeping

File size: 6,459 Bytes

import logging
import os
import cv2
import torch
from copy import deepcopy
import torch.nn.functional as F
from torchvision.transforms import ToTensor
import math

from alnet import ALNet
from soft_detect import DKD
import time

# Directory that holds the pretrained checkpoints, resolved next to this file.
_MODELS_DIR = os.path.join(os.path.split(__file__)[0], "models")

# Per-variant encoder settings, in the order (c1, c2, c3, c4, dim, single_head).
_VARIANTS = {
    "alike-t": (8, 16, 32, 64, 64, True),
    "alike-s": (8, 16, 48, 96, 96, True),
    "alike-n": (16, 32, 64, 128, 128, True),
    "alike-l": (32, 64, 128, 128, 128, False),
}

# Keyword-argument presets for ``ALike``; every variant uses a detection
# radius of 2 and a checkpoint named ``<variant>.pth`` under ``models/``.
configs = {
    variant: {
        "c1": c1,
        "c2": c2,
        "c3": c3,
        "c4": c4,
        "dim": dim,
        "single_head": single_head,
        "radius": 2,
        "model_path": os.path.join(_MODELS_DIR, f"{variant}.pth"),
    }
    for variant, (c1, c2, c3, c4, dim, single_head) in _VARIANTS.items()
}


class ALike(ALNet):
    """ALNet feature network combined with a DKD keypoint detector.

    ``extract_dense_map`` runs the ``ALNet`` backbone on a batched image
    tensor, and ``forward`` wraps it into a NumPy-in / NumPy-out pipeline
    that also runs the ``DKD`` detector on the dense maps.
    """

    def __init__(
        self,
        # ================================== feature encoder
        c1: int = 32,
        c2: int = 64,
        c3: int = 128,
        c4: int = 128,
        dim: int = 128,
        single_head: bool = False,
        # ================================== detect parameters
        radius: int = 2,
        top_k: int = 500,
        scores_th: float = 0.5,
        n_limit: int = 5000,
        device: str = "cpu",
        model_path: str = "",
    ):
        """
        :param c1: forwarded to ``ALNet`` (encoder setting)
        :param c2: forwarded to ``ALNet`` (encoder setting)
        :param c3: forwarded to ``ALNet`` (encoder setting)
        :param c4: forwarded to ``ALNet`` (encoder setting)
        :param dim: forwarded to ``ALNet`` (encoder setting)
        :param single_head: forwarded to ``ALNet`` (encoder setting)
        :param radius: forwarded to ``DKD``
        :param top_k: forwarded to ``DKD``
        :param scores_th: forwarded to ``DKD``
        :param n_limit: forwarded to ``DKD``
        :param device: device the module lives on and that ``forward`` sends its input to
        :param model_path: checkpoint (state dict) to load; "" skips loading
        """
        super().__init__(c1, c2, c3, c4, dim, single_head)
        self.radius = radius
        self.top_k = top_k
        self.n_limit = n_limit
        self.scores_th = scores_th
        self.dkd = DKD(
            radius=self.radius,
            top_k=self.top_k,
            scores_th=self.scores_th,
            n_limit=self.n_limit,
        )
        self.device = device

        # Always place the module on the requested device: forward() sends its
        # input to self.device, so a model built without a checkpoint must
        # live there as well or the first call fails with a device mismatch.
        self.to(self.device)

        if model_path != "":
            # NOTE: torch.load unpickles the file; only load trusted checkpoints.
            state_dict = torch.load(model_path, map_location=self.device)
            self.load_state_dict(state_dict)
            self.eval()
            logging.info(f"Loaded model parameters from {model_path}")
            logging.info(
                f"Number of model parameters: {sum(p.numel() for p in self.parameters() if p.requires_grad) / 1e3}KB"
            )

    def extract_dense_map(self, image, ret_dict=False):
        """
        Run the backbone on a batched image tensor and return the dense maps.

        :param image: tensor of shape BxCxHxW
        :param ret_dict: return a dictionary instead of a tuple
        :return: ``(descriptor_map, scores_map)``, or a dictionary with the keys
                 'descriptor_map' and 'scores_map' when ``ret_dict`` is True;
                 both maps are cropped back to the input HxW
        """
        # ====================================================
        # The image size should be an integer multiple of 2^5; if it is not,
        # zero-pad the bottom/right edges up to the next multiple.
        _, _, h, w = image.shape
        h_ = math.ceil(h / 32) * 32
        w_ = math.ceil(w / 32) * 32
        padded = h_ != h or w_ != w
        if padded:
            # The pad tuple is (left, right, top, bottom) for the last two
            # dims; F.pad keeps the dtype and device of the input.
            image = F.pad(image, (0, w_ - w, 0, h_ - h))
        # ====================================================

        scores_map, descriptor_map = super().forward(image)

        # ====================================================
        # Drop the rows/columns that only exist because of the padding.
        if padded:
            descriptor_map = descriptor_map[:, :, :h, :w]
            scores_map = scores_map[:, :, :h, :w]  # Bx1xHxW
        # ====================================================

        # BxCxHxW, L2-normalised along the channel dimension
        descriptor_map = F.normalize(descriptor_map, p=2, dim=1)

        if ret_dict:
            return {
                "descriptor_map": descriptor_map,
                "scores_map": scores_map,
            }
        return descriptor_map, scores_map

    def forward(self, img, image_size_max=99999, sort=False, sub_pixel=False):
        """
        :param img: np.array HxWx3, RGB
        :param image_size_max: maximum image size, otherwise, the image will be resized
        :param sort: sort keypoints by scores
        :param sub_pixel: whether to use sub-pixel accuracy
        :return: a dictionary with 'keypoints', 'descriptors', 'scores',
                 'scores_map', and 'time' (seconds spent on extraction)
        """
        H, W, three = img.shape
        assert three == 3, "input image shape should be [HxWx3]"

        # ==================== image size constraint
        # Work on a copy so the tensor built below never shares memory with
        # the caller's array.
        image = deepcopy(img)
        max_hw = max(H, W)
        if max_hw > image_size_max:
            ratio = float(image_size_max / max_hw)
            image = cv2.resize(image, dsize=None, fx=ratio, fy=ratio)

        # ==================== convert image to tensor
        # HxWx3 -> 1x3xHxW float32, divided by 255
        image = (
            torch.from_numpy(image)
            .to(self.device)
            .to(torch.float32)
            .permute(2, 0, 1)[None]
            / 255.0
        )

        # ==================== extract keypoints
        # NOTE: plain wall-clock timing; no device synchronisation is done here.
        start = time.time()

        with torch.no_grad():
            descriptor_map, scores_map = self.extract_dense_map(image)
            keypoints, descriptors, scores, _ = self.dkd(
                scores_map, descriptor_map, sub_pixel=sub_pixel
            )
            # Only the first (and only) batch element is used.
            keypoints, descriptors, scores = keypoints[0], descriptors[0], scores[0]
            # Rescale to pixel coordinates of the ORIGINAL image, so the result
            # does not depend on the optional resize above.
            # NOTE(review): this presumes DKD returns coordinates normalised to
            # [-1, 1] -- confirm against soft_detect.
            keypoints = (keypoints + 1) / 2 * keypoints.new_tensor([[W - 1, H - 1]])

        if sort:
            indices = torch.argsort(scores, descending=True)
            keypoints = keypoints[indices]
            descriptors = descriptors[indices]
            scores = scores[indices]

        end = time.time()

        return {
            "keypoints": keypoints.cpu().numpy(),
            "descriptors": descriptors.cpu().numpy(),
            "scores": scores.cpu().numpy(),
            "scores_map": scores_map.cpu().numpy(),
            "time": end - start,
        }


if __name__ == "__main__":
    # Quick complexity report for a network built with the "alike-l" sizes.
    import numpy as np
    from thop import profile

    model = ALike(c1=32, c2=64, c3=128, c4=128, dim=128, single_head=False)

    # Random HxWx3 float32 array standing in for an RGB image.
    dummy_image = np.random.random((640, 480, 3)).astype(np.float32)

    # The positional inputs map onto forward(img, image_size_max, sort).
    profile_inputs = (dummy_image, 9999, False)
    flops, params = profile(model, inputs=profile_inputs, verbose=False)

    gflops = flops / 1e9
    kilo_params = params / 1e3
    print("{:<30}  {:<8} GFLops".format("Computational complexity: ", gflops))
    print("{:<30}  {:<8} KB".format("Number of parameters: ", kilo_params))