|
""" |
|
File: app_utils.py |
|
Author: Elena Ryumina and Dmitry Ryumin |
|
Description: This module contains utility functions for facial expression recognition application. |
|
License: MIT License |
|
""" |
|
|
|
import torch |
|
import numpy as np |
|
import mediapipe as mp |
|
from PIL import Image |
|
import cv2 |
|
|
|
|
|
from app.model import pth_model_static, pth_model_dynamic, pth_processing |
|
from app.face_utils import get_box, display_info |
|
from app.config import DICT_EMO, config_data |
|
from app.plot import statistics_plot |
|
|
|
|
|
# Module-level alias for MediaPipe's Face Mesh solution; the functions in this
# module open `mp_face_mesh.FaceMesh(...)` as a context manager per call.
mp_face_mesh = mp.solutions.face_mesh
|
|
|
|
|
def preprocess_image_and_predict(inp):
    """Detect a face in a still image and classify its expression.

    Args:
        inp: Image to analyse; anything `np.array` can turn into an array with
            at least two dimensions (e.g. a PIL image or an ndarray). It is
            passed to MediaPipe unchanged, so it is presumably expected to be
            an RGB image -- TODO confirm against the caller.

    Returns:
        A ``(cur_face, confidences)`` tuple where ``cur_face`` is the face
        region cropped from the input array and ``confidences`` maps
        ``DICT_EMO[i]`` (for ``i`` in ``0..6``) to the softmax score of the
        static model. ``(None, None)`` is returned when the input is missing,
        is not at least 2-D, no face is detected, or the detected box yields
        an empty crop.
    """
    # Check for a missing image *before* converting: np.array(None) is a
    # 0-d object array, never None, so a check after conversion cannot fire.
    if inp is None:
        return None, None

    inp = np.array(inp)
    if inp.ndim < 2:
        # No height/width to unpack (scalar or 1-D input).
        return None, None
    h, w = inp.shape[:2]

    with mp_face_mesh.FaceMesh(
        max_num_faces=1,
        refine_landmarks=False,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
    ) as face_mesh:
        results = face_mesh.process(inp)

        if not results.multi_face_landmarks:
            # Previously this path fell through to the final return with
            # unbound locals and raised UnboundLocalError.
            return None, None

        # max_num_faces=1, so there is at most one landmark set.
        fl = results.multi_face_landmarks[0]
        startX, startY, endX, endY = get_box(fl, w, h)
        cur_face = inp[startY:endY, startX:endX]
        if cur_face.size == 0:
            # Degenerate box: nothing to classify.
            return None, None

        cur_face_n = pth_processing(Image.fromarray(cur_face)).to(config_data.DEVICE)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            logits = pth_model_static(cur_face_n)
            prediction = (
                torch.nn.functional.softmax(logits, dim=1).cpu().detach().numpy()[0]
            )

        confidences = {DICT_EMO[i]: float(prediction[i]) for i in range(7)}

    return cur_face, confidences
|
|
|
|
|
def preprocess_video_and_predict(video):
    """Run per-frame facial expression recognition over a video file.

    Every frame is searched for a single face. On every fifth frame (frames
    1, 6, 11, ...) that contains a face, features from the static model are
    pushed into a 10-step sliding window and fed to the dynamic model; the
    frames in between reuse the most recent dynamic-model output. The face
    crop of each frame is resized to 224x224, labelled with the frame number
    and written to ``result.mp4`` in the current working directory.

    Args:
        video: Path (or other source accepted by ``cv2.VideoCapture``) of the
            video to analyse. ``None`` is treated as "no video".

    Returns:
        ``(video, path_save_video, stat)`` where ``path_save_video`` is
        ``'result.mp4'`` and ``stat`` is whatever ``statistics_plot`` returns
        for the collected frame numbers and per-frame score vectors.
        ``(None, None, None)`` is returned when ``video`` is ``None``, the
        source cannot be opened, or ``statistics_plot`` returns a falsy value.

    Notes:
        Frames without a usable face contribute a NaN score vector and a black
        224x224 placeholder frame, so the output video keeps one frame per
        input frame.
    """
    if video is None:
        return None, None, None

    path_save_video = 'result.mp4'

    lstm_features = []
    count_frame = 1
    probs = []
    frames = []
    last_output = None

    cap = cv2.VideoCapture(video)
    vid_writer = None
    try:
        if not cap.isOpened():
            return None, None, None

        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = np.round(cap.get(cv2.CAP_PROP_FPS))

        vid_writer = cv2.VideoWriter(
            path_save_video, cv2.VideoWriter_fourcc(*'mp4v'), fps, (224, 224)
        )

        # torch.no_grad(): inference only, no autograd bookkeeping needed.
        with mp_face_mesh.FaceMesh(
            max_num_faces=1,
            refine_landmarks=False,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        ) as face_mesh, torch.no_grad():

            while cap.isOpened():
                _, frame = cap.read()
                if frame is None:
                    break

                # cvtColor returns a new array, so the read-only flag has to
                # be set on the converted image for it to reach MediaPipe.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_rgb.flags.writeable = False
                results = face_mesh.process(frame_rgb)
                frame_rgb.flags.writeable = True

                # Reset per frame so a crop from an earlier frame is never
                # re-converted and re-annotated.
                cur_face = None
                if results.multi_face_landmarks:
                    # max_num_faces=1, so there is at most one landmark set.
                    fl = results.multi_face_landmarks[0]
                    startX, startY, endX, endY = get_box(fl, w, h)
                    crop = frame_rgb[startY:endY, startX:endX]
                    if crop.size > 0:
                        cur_face = crop

                if cur_face is not None:
                    if (count_frame - 1) % 5 == 0:
                        cur_face_copy = pth_processing(
                            Image.fromarray(cur_face)
                        ).to(config_data.DEVICE)
                        features = (
                            torch.nn.functional.relu(
                                pth_model_static.extract_features(cur_face_copy)
                            )
                            .cpu()
                            .detach()
                            .numpy()
                        )

                        if not lstm_features:
                            # Prime the window by repeating the first features.
                            lstm_features = [features] * 10
                        else:
                            lstm_features = lstm_features[1:] + [features]

                        lstm_f = torch.from_numpy(np.vstack(lstm_features))
                        lstm_f = torch.unsqueeze(lstm_f, 0).to(config_data.DEVICE)
                        output = pth_model_dynamic(lstm_f).cpu().detach().numpy()
                        last_output = output
                    elif last_output is not None:
                        output = last_output
                    else:
                        # No prediction yet. Shape (1, 7) so that output[0]
                        # is a 7-vector like every other entry of `probs`
                        # (np.zeros((7))[0] would be a scalar).
                        output = np.zeros((1, 7))

                    probs.append(output[0])

                    out_frame = cv2.cvtColor(cur_face, cv2.COLOR_RGB2BGR)
                    out_frame = cv2.resize(
                        out_frame, (224, 224), interpolation=cv2.INTER_AREA
                    )
                else:
                    probs.append(np.full(7, np.nan))
                    # Placeholder keeps the output in step with the input.
                    out_frame = np.zeros((224, 224, 3), dtype=np.uint8)

                frames.append(count_frame)

                out_frame = display_info(
                    out_frame, 'Frame: {}'.format(count_frame), box_scale=.3
                )
                vid_writer.write(out_frame)
                count_frame += 1
    finally:
        # Release the capture and the writer even if a frame fails.
        cap.release()
        if vid_writer is not None:
            vid_writer.release()

    stat = statistics_plot(frames, probs)

    if not stat:
        return None, None, None

    return video, path_save_video, stat