|
import numpy as np |
|
import cv2 |
|
from typing import Tuple, List, Union |
|
from .base_onnx import BaseONNX |
|
|
|
class RTMPOSE_ONNX(BaseONNX):
    """Top-down keypoint estimator running an RTMPose-style SimCC model via ONNX.

    The pipeline implemented by :meth:`pred` is: bbox -> padded center/scale ->
    affine crop to ``self.input_size`` -> normalisation -> ONNX inference ->
    SimCC arg-max decoding -> mapping the keypoints back to image coordinates.

    ``self.session``, ``self.input_name`` and ``self.input_size`` are read in
    this class but never assigned here; they are presumably set up by
    ``BaseONNX.__init__`` (TODO confirm against ``base_onnx``).
    """

    # Labels drawn next to each keypoint by ``draw_pred``; entry ``i`` labels
    # keypoint ``i``.
    bone_names = [
        "A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8",
        "J0", "J1", "J2", "J3", "J4", "J5", "J6", "J7", "J8",
        "B0", "C0", "D0", "E0", "F0", "G0", "H0", "I0",
        "B8", "C8", "D8", "E8", "F8", "G8", "H8", "I8",
    ]

    # Per-channel mean / std applied in ``preprocess_image`` after the image
    # has been converted to RGB channel order.
    _PIXEL_MEAN = np.array([123.675, 116.28, 103.53])
    _PIXEL_STD = np.array([58.395, 57.12, 57.375])

    # Number of SimCC bins per input pixel along each axis. ``2.0`` reproduces
    # the previously hard-coded ``input_size * 2`` normalisation; override it
    # on a subclass or instance for models exported with another ratio.
    simcc_split_ratio = 2.0

    def __init__(self, model_path, input_size=(256, 256), padding=1.25):
        """Create the estimator.

        Args:
            model_path: Forwarded unchanged to ``BaseONNX.__init__``.
            input_size: Model input size, unpacked as ``(w, h)`` elsewhere in
                this class. Forwarded unchanged to ``BaseONNX.__init__``.
            padding: Factor the bbox width/height is multiplied by in
                ``get_bbox_center_scale``.
        """
        super().__init__(model_path, input_size)
        self.padding = padding

    def get_bbox_center_scale(self, bbox: List[int]):
        """Convert a bounding box to its center and padded scale.

        Args:
            bbox: Bounding box in format ``[x1, y1, x2, y2]``.

        Returns:
            tuple: A tuple containing:
                - center (numpy.ndarray): Center coordinates ``[x, y]``.
                - scale (numpy.ndarray): ``[width, height]`` of the box
                  multiplied by ``self.padding``.
        """
        x1, y1, x2, y2 = bbox
        center = np.array([(x1 + x2) / 2.0, (y1 + y2) / 2.0])
        scale = np.array([x2 - x1, y2 - y1]) * self.padding
        return center, scale

    @staticmethod
    def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
        """Rotate a 2D point around the origin.

        Args:
            pt (np.ndarray): 2D point coordinates (x, y) in shape (2, ).
            angle_rad (float): Rotation angle in radians.

        Returns:
            np.ndarray: Rotated point in shape (2, ).
        """
        sn, cs = np.sin(angle_rad), np.cos(angle_rad)
        rot_mat = np.array([[cs, -sn], [sn, cs]])
        return rot_mat @ pt

    @staticmethod
    def _get_3rd_point(a: np.ndarray, b: np.ndarray):
        """Derive the third point needed to define an affine transform.

        Three point pairs are required to compute an affine matrix. Given 2D
        points ``a`` and ``b``, the third point is ``b`` plus the vector
        ``a - b`` with its components mapped ``(dx, dy) -> (-dy, dx)``, i.e.
        rotated by 90 degrees about ``b``.

        Args:
            a (np.ndarray): The 1st point (x, y) in shape (2, ).
            b (np.ndarray): The 2nd point (x, y) in shape (2, ).

        Returns:
            np.ndarray: The 3rd point.
        """
        direction = a - b
        return b + np.r_[-direction[1], direction[0]]

    @staticmethod
    def get_warp_matrix(
        center: np.ndarray,
        scale: np.ndarray,
        rot: float,
        output_size: Tuple[int, int],
        shift: Tuple[float, float] = (0., 0.),
        inv: bool = False,
        fix_aspect_ratio: bool = True,
    ) -> np.ndarray:
        """Compute the affine matrix that warps a bbox area to ``output_size``.

        Args:
            center (np.ndarray[2, ]): Center of the bounding box (x, y).
            scale (np.ndarray[2, ]): Scale of the bounding box as
                [width, height].
            rot (float): Rotation angle in degrees.
            output_size (np.ndarray[2, ] | list(2,)): Destination size
                (width, height).
            shift: Translation expressed as a ratio of ``scale``; the source
                center is moved by ``scale * shift``. Default (0., 0.).
            inv (bool): Direction of the transform
                (``False``: src->dst, ``True``: dst->src).
            fix_aspect_ratio (bool): When True the third control point is
                derived from the first two via ``_get_3rd_point``, so only
                the width drives the scaling. When False the height gets its
                own control point. Defaults to True.

        Returns:
            np.ndarray: A 2x3 transformation matrix.
        """
        assert len(center) == 2
        assert len(scale) == 2
        assert len(output_size) == 2
        assert len(shift) == 2

        shift = np.array(shift)
        src_w, src_h = scale[:2]
        dst_w, dst_h = output_size[:2]

        rot_rad = np.deg2rad(rot)
        src_dir = RTMPOSE_ONNX._rotate_point(np.array([src_w * -0.5, 0.]), rot_rad)
        dst_dir = np.array([dst_w * -0.5, 0.])

        # Control points in the source image: the (shifted) center and the
        # point half a width away along the rotated x axis.
        src = np.zeros((3, 2), dtype=np.float32)
        src[0, :] = center + scale * shift
        src[1, :] = center + src_dir + scale * shift

        # Matching control points in the destination image.
        dst = np.zeros((3, 2), dtype=np.float32)
        dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
        dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

        if fix_aspect_ratio:
            src[2, :] = RTMPOSE_ONNX._get_3rd_point(src[0, :], src[1, :])
            dst[2, :] = RTMPOSE_ONNX._get_3rd_point(dst[0, :], dst[1, :])
        else:
            src_dir_2 = RTMPOSE_ONNX._rotate_point(np.array([0., src_h * -0.5]), rot_rad)
            dst_dir_2 = np.array([0., dst_h * -0.5])
            src[2, :] = center + src_dir_2 + scale * shift
            dst[2, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir_2

        if inv:
            return cv2.getAffineTransform(np.float32(dst), np.float32(src))
        return cv2.getAffineTransform(np.float32(src), np.float32(dst))

    def get_warp_size_with_input_size(self,
                                      bbox_center: List[int],
                                      bbox_scale: List[int],
                                      inv: bool = False,
                                      ):
        """Build the affine matrix between a bbox and the model input.

        Despite its name this method returns the warp *matrix*, not a size.
        The bbox scale is first enlarged along one axis so that it has the
        same aspect ratio as ``self.input_size``.

        Args:
            bbox_center: Bbox center ``[x, y]``.
            bbox_scale: Bbox scale ``[w, h]``.
            inv: ``False`` for image -> model input, ``True`` for the
                opposite direction.

        Returns:
            np.ndarray: A 2x3 transformation matrix.
        """
        w, h = self.input_size
        warp_size = self.input_size

        # Grow the shorter side of the box to match the input aspect ratio.
        scale_w, scale_h = bbox_scale
        aspect_ratio = w / h
        if scale_w > scale_h * aspect_ratio:
            bbox_scale = [scale_w, scale_w / aspect_ratio]
        else:
            bbox_scale = [scale_h * aspect_ratio, scale_h]

        center = np.array(bbox_center, dtype=np.float32)
        scale = np.array(bbox_scale, dtype=np.float32)
        rot = 0.0

        return self.get_warp_matrix(center, scale, rot, output_size=warp_size, inv=inv)

    def topdown_affine(self, img: cv2.UMat, bbox_center: List[int], bbox_scale: List[int]):
        """Warp the bbox region of ``img`` to the model input size.

        Simplified top-down affine transform (no rotation, no shift).

        Args:
            img: Input image.
            bbox_center: Bbox center ``[x, y]``.
            bbox_scale: Bbox scale ``[w, h]``.

        Returns:
            The warped image, sized ``self.input_size``.
        """
        warp_mat = self.get_warp_size_with_input_size(bbox_center, bbox_scale)
        return cv2.warpAffine(img, warp_mat, self.input_size, flags=cv2.INTER_LINEAR)

    def get_simcc_maximum(self, simcc_x, simcc_y):
        """Decode SimCC outputs into normalised keypoints and scores.

        Only the first batch element (index 0) of each output is decoded.

        Args:
            simcc_x: SimCC output for the x axis, indexed ``[batch, K, bins]``.
            simcc_y: SimCC output for the y axis, indexed ``[batch, K, bins]``.

        Returns:
            tuple: A tuple containing:
                - keypoints (np.ndarray): ``[K, 2]`` arg-max bin indices
                  divided by ``input_size * simcc_split_ratio``.
                - scores (np.ndarray): ``[K]`` product of the per-axis maxima.
        """
        x_indices = np.argmax(simcc_x[0], axis=1)
        y_indices = np.argmax(simcc_y[0], axis=1)

        input_w, input_h = self.input_size

        # Bin index -> fraction of the model input width / height.
        x_coords = x_indices / (input_w * self.simcc_split_ratio)
        y_coords = y_indices / (input_h * self.simcc_split_ratio)
        keypoints = np.stack([x_coords, y_coords], axis=1)

        scores = np.max(simcc_x[0], axis=1) * np.max(simcc_y[0], axis=1)
        return keypoints, scores

    def preprocess_image(self, img_bgr: cv2.UMat, bbox_center: List[int], bbox_scale: List[int]):
        """Turn an image and a bbox into the model's input tensor.

        Args:
            img_bgr (cv2.UMat): Input image in BGR channel order.
            bbox_center (list[int, int]): Bbox center ``[x, y]``.
            bbox_scale (list[int, int]): Bbox scale ``[w, h]``.

        Returns:
            np.ndarray: float32 array of shape ``(1, 3, H, W)``: the warped
            crop converted to RGB, normalised per channel, transposed to
            channel-first order and given a leading batch axis.
        """
        affine_img_bgr = self.topdown_affine(img_bgr, bbox_center, bbox_scale)
        affine_img_rgb = cv2.cvtColor(affine_img_bgr, cv2.COLOR_BGR2RGB)

        normalized = (affine_img_rgb - self._PIXEL_MEAN) / self._PIXEL_STD

        chw = np.transpose(normalized.astype(np.float32), (2, 0, 1))
        return np.expand_dims(chw, axis=0)

    def run_inference(self, image: np.ndarray):
        """Run the ONNX session on a preprocessed input tensor.

        Args:
            image (np.ndarray): Input tensor, as produced by
                ``preprocess_image``.

        Returns:
            tuple: ``(simcc_x, simcc_y)``, the two outputs of the session.
        """
        outputs = self.session.run(None, {self.input_name: image})
        # Output signature noted by the original author:
        #   simcc_x: float32[batch, MatMulsimcc_x_dim_1, 512]
        #   simcc_y: float32[batch, MatMulsimcc_x_dim_1, 512]
        simcc_x, simcc_y = outputs
        return simcc_x, simcc_y

    def pred(self, image: Union[cv2.UMat, str], bbox: List[int]) -> Tuple[np.ndarray, np.ndarray]:
        """Predict the keypoints inside ``bbox`` of an image.

        Args:
            image (str | cv2.UMat): Image path, or an already loaded BGR image.
            bbox (list[int, int, int, int]): Bounding box ``[x1, y1, x2, y2]``.

        Returns:
            keypoints (np.ndarray): Predicted keypoints ``[K, 2]`` in the
                coordinates of the input image.
            scores (np.ndarray): Predicted scores ``[K]``.

        Raises:
            OSError: If ``image`` is a path and ``cv2.imread`` cannot load it.
        """
        if isinstance(image, str):
            img_bgr = cv2.imread(image)
            if img_bgr is None:
                # cv2.imread signals failure by returning None instead of
                # raising; fail here with the offending path rather than deep
                # inside the warp.
                raise OSError(f"cv2.imread could not load image: {image!r}")
        else:
            # No defensive copy: the steps below only read from the image and
            # produce new arrays.
            img_bgr = image

        bbox_center, bbox_scale = self.get_bbox_center_scale(bbox)

        model_input = self.preprocess_image(img_bgr, bbox_center, bbox_scale)
        simcc_x, simcc_y = self.run_inference(model_input)

        keypoints, scores = self.get_simcc_maximum(simcc_x, simcc_y)
        keypoints = self.transform_keypoints_to_original(
            keypoints, bbox_center, bbox_scale, self.input_size)

        return keypoints, scores

    def transform_keypoints_to_original(self, keypoints, center, scale, output_size):
        """Map normalised keypoints back to original-image coordinates.

        Args:
            keypoints: Predicted keypoints ``[N, 2]`` as fractions of the
                model input width / height.
            center: Bbox center ``[x, y]``.
            scale: Bbox scale ``[w, h]``.
            output_size: Model input size ``(w, h)``. Only used to convert
                the fractions to pixels; the inverse warp itself is always
                built for ``self.input_size``.

        Returns:
            np.ndarray: Transformed keypoints ``[N, 2]``.
        """
        target_coords = keypoints.copy()

        # Fractions -> pixel coordinates in the model input.
        target_coords[:, 0] = target_coords[:, 0] * output_size[0]
        target_coords[:, 1] = target_coords[:, 1] * output_size[1]

        # Inverse (model input -> image) affine matrix, shape 2x3.
        warp_mat = self.get_warp_size_with_input_size(center, scale, inv=True)

        # Apply it to homogeneous coordinates [x, y, 1].
        ones = np.ones((len(target_coords), 1))
        target_coords_homogeneous = np.hstack([target_coords, ones])
        return target_coords_homogeneous @ warp_mat.T

    def draw_pred(self, img: cv2.UMat, keypoints: np.ndarray, scores: np.ndarray,
                  is_rgb: bool = True, score_threshold: float = 0.3) -> cv2.UMat:
        """Draw keypoints and their labels on an image.

        Each keypoint whose score exceeds ``score_threshold`` is drawn as a
        filled circle with its label next to it. Colours are drawn from
        ``np.random`` on every call.

        Args:
            img: Image to draw on. When ``is_rgb`` is True the drawing is
                done on this object directly.
            keypoints: Keypoint coordinates ``[N, 2]``.
            scores: Keypoint scores ``[N]``.
            is_rgb: When False, the channel order of ``img`` is swapped with
                ``cv2.cvtColor`` before drawing.
            score_threshold: Keypoints with a score at or below this value
                are skipped. Defaults to 0.3.

        Returns:
            The image with the keypoints drawn on it.
        """
        if not is_rgb:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # One colour per keypoint, so the table can never be out-indexed
        # (it used to be fixed at 34 rows).
        colors = np.random.randint(0, 256, (len(keypoints), 3))
        label_count = len(self.bone_names)

        for i, (point, score) in enumerate(zip(keypoints, scores)):
            if score <= score_threshold:
                continue

            x, y = map(int, point)
            color = tuple(int(channel) for channel in colors[i])
            # Fall back to the index when there are more keypoints than names.
            label = self.bone_names[i] if i < label_count else str(i)

            cv2.circle(img, (x, y), 5, color, -1)
            cv2.putText(img, label, (x + 5, y + 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, color, 1)
        return img
|
|
|
|