from __future__ import division import time import torch import numpy as np import cv2 import os import sys import random import pickle as pkl import argparse from util import * from darknet import Darknet from preprocess import letterbox_image import preprocess cur_dir = os.path.dirname(os.path.realpath(__file__)) project_root = os.path.join(cur_dir, '../../../') chk_root = os.path.join(project_root, 'checkpoint/') data_root = os.path.join(project_root, 'data/') sys.path.insert(0, project_root) sys.path.pop(0) def prep_image(img, inp_dim): """ Prepare image for inputting to the neural network. Returns a Variable """ ori_img = img dim = ori_img.shape[1], ori_img.shape[0] img = cv2.resize(ori_img, (inp_dim, inp_dim)) img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) return img_, ori_img, dim def write(x, img, colors): x = [int(i) for i in x] c1 = tuple(x[0:2]) c2 = tuple(x[2:4]) label = 'People {}'.format(0) color = (0, 0, 255) cv2.rectangle(img, c1, c2, color, 2) t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 cv2.rectangle(img, c1, c2, color, -1) cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1) return img def arg_parse(): """" Parse arguements to the detect module """ parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') parser.add_argument('--confidence', dest='confidence', type=float, default=0.70, help='Object Confidence to filter predictions') parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold') parser.add_argument('--reso', dest='reso', default=416, type=int, help='Input resolution of the network. ' 'Increase to increase accuracy. Decrease to increase speed. (160, 416)') parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights', help='The path' 'of model weight file') parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg', help='weight file') parser.add_argument('-a', '--animation', action='store_true', help='output animation') parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path') parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png', help='The input video path') parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. 

def load_model(args=None, CUDA=None, inp_dim=416):
    if args is None:
        args = arg_parse()
    if CUDA is None:
        CUDA = torch.cuda.is_available()

    # Set up the neural network
    model = Darknet(args.cfg_file)
    model.load_weights(args.weight_file)

    # The input resolution must be a multiple of 32 and greater than 32
    model.net_info["height"] = inp_dim
    assert inp_dim % 32 == 0
    assert inp_dim > 32

    # If there's a GPU available, put the model on GPU
    if CUDA:
        model.cuda()

    # Set the model in evaluation mode
    model.eval()

    return model


def yolo_human_det(img, model=None, reso=416, confidence=0.70):
    args = arg_parse()
    # args.reso = reso
    inp_dim = reso
    num_classes = 80

    CUDA = torch.cuda.is_available()
    if model is None:
        model = load_model(args, CUDA, inp_dim)

    if isinstance(img, str):
        assert os.path.isfile(img), 'The image path does not exist'
        img = cv2.imread(img)

    img, ori_img, img_dim = preprocess.prep_image(img, inp_dim)
    img_dim = torch.FloatTensor(img_dim).repeat(1, 2)

    with torch.no_grad():
        if CUDA:
            img_dim = img_dim.cuda()
            img = img.cuda()

        output = model(img, CUDA)
        output = write_results(output, confidence, num_classes, nms=True,
                               nms_conf=args.nms_thresh, det_hm=True)
        if len(output) == 0:
            return None, None

        # Undo the letterbox transform: map network coordinates back to the
        # original image, then clip the boxes to the image boundaries.
        img_dim = img_dim.repeat(output.size(0), 1)
        scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1)

        output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2
        output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2
        output[:, 1:5] /= scaling_factor

        for i in range(output.shape[0]):
            output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0])
            output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1])

    bboxs = []
    scores = []
    for i in range(len(output)):
        item = output[i]
        bbox = item[1:5].cpu().numpy()
        # Round the float32 coordinates to two decimal places
        bbox = [round(v, 2) for v in list(bbox)]
        score = item[5].cpu().numpy()
        bboxs.append(bbox)
        scores.append(score)

    scores = np.expand_dims(np.array(scores), 1)
    bboxs = np.array(bboxs)

    return bboxs, scores
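
if __name__ == '__main__':
    # Minimal usage sketch (an addition for illustration, not part of the
    # original module). It assumes the default weight/cfg paths resolved by
    # `arg_parse` exist and that the bundled sample image is readable.
    demo_args = arg_parse()
    demo_model = load_model(demo_args, inp_dim=demo_args.reso)
    bboxs, scores = yolo_human_det(demo_args.image, model=demo_model,
                                   reso=demo_args.reso,
                                   confidence=demo_args.confidence)
    if bboxs is None:
        print('No person detected in {}'.format(demo_args.image))
    else:
        frame = cv2.imread(demo_args.image)
        draw_bboxs(frame, bboxs)
        cv2.imwrite('detection_demo.png', frame)
        print('Detected {} person(s), saved detection_demo.png'.format(len(bboxs)))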