File size: 3,113 Bytes
a57c6eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 8 13:31:25 2020

@author: infguo
"""

import os
from typing import Dict
import argparse
import json
import traceback
import hashlib

import cv2
from moviepy.editor import VideoFileClip

# TODO
# Compared with the original source, this module modifies insight_face's
# output interface to pass the gender score through, so that later stages
# can make more accurate decisions.


def inference(frame, app, max_face=10):
    """Run face recognition on a single frame.

    Args:
        frame: image (BGR ndarray) handed to the detector.
        app: prepared detector exposing ``get(frame, max_num=...)``
            (e.g. an insightface ``FaceAnalysis`` instance).
        max_face: upper bound on detections forwarded as ``max_num``.

    Returns:
        The detector's list of faces, or ``None`` when detection raises
        or finds nothing.
    """
    try:
        detected = app.get(frame, max_num=max_face)
    except Exception as err:
        # Best-effort: a frame that crashes the detector is simply skipped.
        print("is discarded due to exception {}!".format(err))
        return None

    # No landmarks could be detected; discard this frame.
    if not detected:
        return None
    return detected


def predict(app, video_path, video_map, sample_fps):
    """Run face detection over a video at a reduced frame rate.

    Args:
        app: prepared insightface ``FaceAnalysis`` instance (anything with a
            compatible ``get(frame, max_num=...)`` method).
        video_path: path to the video file on disk.
        video_map: dict describing the video; must contain
            ``"video_file_hash_code"`` (expected MD5 of the file),
            ``"content_box"`` (crop box forwarded to ``VideoFileClip.crop``)
            and ``"sample_fps"`` (frame rate used to iterate the clip).
        sample_fps: target detection rate; detections are recorded roughly
            ``sample_fps`` times per second of video.

    Returns:
        The same ``video_map`` dict, augmented with ``"detect_fps"`` and a
        ``"face_detections"`` list of ``{"frame_idx", "faces"}`` records
        (``faces`` is ``None`` for frames with no detection).

    Raises:
        AssertionError: if the file's MD5 does not match
            ``video_map["video_file_hash_code"]``.
    """
    # Verify the file on disk is the one the metadata describes.
    with open(video_path, "rb") as fd:
        video_hash_code = hashlib.md5(fd.read()).hexdigest()
    assert video_hash_code == video_map["video_file_hash_code"], (
        "video file hash mismatch for {}".format(video_path)
    )

    # Capture the video and restrict it to the region of interest.
    video = VideoFileClip(video_path)
    video = video.crop(*video_map["content_box"])
    fps = video.fps
    duration = video.duration
    total_frames = int(duration * fps)
    width, height = video.size
    print("fps, frame_count, width, height:", fps, total_frames, width, height)

    video_map["detect_fps"] = sample_fps
    video_map["face_detections"] = []

    cnt_frame, step = 0, 0

    # NOTE(review): the iteration rate comes from video_map["sample_fps"],
    # while the `sample_fps` argument only controls how frames are thinned
    # below (and is stored as "detect_fps") — confirm the two keys are
    # intentionally distinct.
    fps = video_map["sample_fps"]
    for frame in video.iter_frames(fps=fps):
        if cnt_frame >= step:
            step += fps / sample_fps
            # moviepy yields RGB frames; the detector expects BGR (OpenCV order).
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            # max_face=0 is forwarded as insightface's max_num=0,
            # i.e. no cap on the number of detected faces.
            faces = inference(frame, app, max_face=0)
            if faces:
                # Convert numpy values to JSON-serializable types in place.
                for f in faces:
                    f["bbox"] = f["bbox"].tolist()
                    f["kps"] = f["kps"].tolist()
                    f["embedding"] = f["embedding"].tolist()
                    f["det_score"] = str(f["det_score"])
                    f["gender"] = str(f["gender"])
                    f["age"] = str(f["age"])
            else:
                faces = None
            video_map["face_detections"].append(
                {"frame_idx": cnt_frame, "faces": faces}
            )

        cnt_frame += 1
    return video_map


class InsightfacePredictor(object):
    """Callable wrapper that runs insightface detection over a whole video.

    Construction loads the detection / gender-age / recognition models onto
    GPU 0; calling the instance delegates to :func:`predict`.
    """

    def __init__(
        self,
        sample_fps=10,
    ) -> None:
        """Load the insightface models.

        Args:
            sample_fps: detection rate (frames analysed per second of video),
                forwarded to :func:`predict` on every call.
        """
        # BUG FIX: FaceAnalysis was only imported inside predict()'s function
        # scope, so referencing it here raised NameError. Import it where it
        # is used; keeping it local also defers the heavy dependency until a
        # predictor is actually constructed.
        from insightface.app import FaceAnalysis

        self.sample_fps = sample_fps
        self.app = FaceAnalysis(
            allowed_modules=["detection", "genderage", "recognition"],
            providers=["CUDAExecutionProvider"],
            provider_options=[{"device_id": "0"}],
        )
        self.app.prepare(ctx_id=0, det_thresh=0.3, det_size=(640, 640))

    def __call__(self, video_path, video_map) -> Dict:
        """Detect faces in ``video_path`` and record results into ``video_map``.

        Returns:
            The augmented ``video_map`` dict (see :func:`predict`).
        """
        # BUG FIX: predict() takes the FaceAnalysis app as its first
        # positional argument; the previous call omitted it, shifting every
        # argument and raising a TypeError at runtime.
        video_info = predict(
            self.app, video_path, video_map, sample_fps=self.sample_fps
        )
        return video_info