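"""Human detection with YOLOv3.

Loads a Darknet YOLOv3 model and returns person bounding boxes and
confidence scores for a single image. A usage sketch sits at the end
of the file.
"""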
from __future__ import division
import time
import torch
import numpy as np
import cv2
import os
import sys
import random
import pickle as pkl
import argparse
from util import *
from darknet import Darknet
from preprocess import letterbox_image
import preprocess

cur_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.join(cur_dir, '../../../')
chk_root = os.path.join(project_root, 'checkpoint/')
data_root = os.path.join(project_root, 'data/')

# Bracket for project-root imports; with nothing imported in between,
# this insert/pop pair is effectively a no-op as written.
sys.path.insert(0, project_root)
sys.path.pop(0)


def prep_image(img, inp_dim):
    """
    Prepare an image for input to the neural network.
    Returns a normalized (1, 3, inp_dim, inp_dim) float tensor, the
    original image, and the original (width, height).
    """
    ori_img = img
    dim = ori_img.shape[1], ori_img.shape[0]  # original (width, height)
    img = cv2.resize(ori_img, (inp_dim, inp_dim))
    # BGR -> RGB, HWC -> CHW, scale to [0, 1], and add a batch dimension
    img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
    img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
    return img_, ori_img, dim


def write(x, img, colors):
    x = [int(i) for i in x]
    c1 = tuple(x[0:2])  # top-left corner
    c2 = tuple(x[2:4])  # bottom-right corner
    label = 'People {}'.format(0)  # fixed display label
    color = (0, 0, 255)
    cv2.rectangle(img, c1, c2, color, 2)
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
    c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
    cv2.rectangle(img, c1, c2, color, -1)  # filled background for the label
    cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
    return img


def arg_parse():
    """
    Parse arguments for the detection module.
    """
parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo')
parser.add_argument('--confidence', dest='confidence', type=float, default=0.70,
help='Object Confidence to filter predictions')
parser.add_argument('--nms-thresh', dest='nms_thresh', type=float, default=0.4, help='NMS Threshold')
    parser.add_argument('--reso', dest='reso', default=416, type=int,
                        help='Input resolution of the network. Increase for accuracy, '
                             'decrease for speed. Must be a multiple of 32 (e.g. 160-416).')
    parser.add_argument('-wf', '--weight-file', type=str, default=chk_root + 'yolov3/yolov3.weights',
                        help='The path of the model weight file')
    parser.add_argument('-cf', '--cfg-file', type=str, default=cur_dir + '/cfg/yolov3.cfg',
                        help='The path of the model config file')
parser.add_argument('-a', '--animation', action='store_true', help='output animation')
parser.add_argument('-v', '--video', type=str, default='camera', help='The input video path')
    parser.add_argument('-i', '--image', type=str, default=cur_dir + '/data/dog-cycle-car.png',
                        help='The input image path')
parser.add_argument('-np', '--num-person', type=int, default=1, help='number of estimated human poses. [1, 2]')
return parser.parse_args()


def load_model(args=None, CUDA=None, inp_dim=416):
if args is None:
args = arg_parse()
if CUDA is None:
CUDA = torch.cuda.is_available()
# Set up the neural network
model = Darknet(args.cfg_file)
model.load_weights(args.weight_file)
    model.net_info["height"] = inp_dim
    assert inp_dim % 32 == 0, 'Input resolution must be a multiple of 32'
    assert inp_dim > 32, 'Input resolution must be greater than 32'
    # If there's a GPU available, put the model on GPU
if CUDA:
model.cuda()
# Set the model in evaluation mode
model.eval()
return model


def yolo_human_det(img, model=None, reso=416, confidence=0.70):
args = arg_parse()
# args.reso = reso
inp_dim = reso
num_classes = 80
CUDA = torch.cuda.is_available()
if model is None:
model = load_model(args, CUDA, inp_dim)
    if isinstance(img, str):
        assert os.path.isfile(img), 'The image path does not exist'
        img = cv2.imread(img)
img, ori_img, img_dim = preprocess.prep_image(img, inp_dim)
img_dim = torch.FloatTensor(img_dim).repeat(1, 2)
with torch.no_grad():
if CUDA:
img_dim = img_dim.cuda()
img = img.cuda()
output = model(img, CUDA)
output = write_results(output, confidence, num_classes, nms=True, nms_conf=args.nms_thresh, det_hm=True)
if len(output) == 0:
return None, None
    # Undo the letterbox transform: remove the padding offsets, rescale the
    # box coordinates to the original image size, then clip them to its bounds
    img_dim = img_dim.repeat(output.size(0), 1)
    scaling_factor = torch.min(inp_dim / img_dim, 1)[0].view(-1, 1)
    output[:, [1, 3]] -= (inp_dim - scaling_factor * img_dim[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * img_dim[:, 1].view(-1, 1)) / 2
    output[:, 1:5] /= scaling_factor
    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, img_dim[i, 0])
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, img_dim[i, 1])
bboxs = []
scores = []
for i in range(len(output)):
item = output[i]
bbox = item[1:5].cpu().numpy()
        # Round the float32 coordinates to two decimal places
        bbox = [round(i, 2) for i in list(bbox)]
score = item[5].cpu().numpy()
bboxs.append(bbox)
scores.append(score)
scores = np.expand_dims(np.array(scores), 1)
bboxs = np.array(bboxs)
return bboxs, scores
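

# Minimal usage sketch (assumptions: yolov3.weights is present at the
# default checkpoint path, and the repo's sample image exists; swap in
# your own image path as needed).
if __name__ == '__main__':
    demo_args = arg_parse()
    bboxs, scores = yolo_human_det(demo_args.image, reso=demo_args.reso,
                                   confidence=demo_args.confidence)
    if bboxs is None:
        print('No person detected')
    else:
        for box, score in zip(bboxs, scores):
            print('bbox (x1, y1, x2, y2): {}, score: {}'.format(box, float(score)))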