Spaces: Build error
Commit c794a89 · Parent: 6a66599
Upload landmark_utils.py

landmark_utils.py · ADDED · +309 -0
@@ -0,0 +1,309 @@
from collections import OrderedDict

import cv2
import dlib
import numpy as np
from tqdm import tqdm

# dlib frontal face detector and 68-point landmark predictor.
# The pretrained model file must be available in the working directory.
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

# Index ranges of each facial region within the 68-point landmark layout.
FACIAL_LANDMARKS_68_IDXS = OrderedDict([
    ("mouth", (48, 68)),
    ("inner_mouth", (60, 68)),
    ("right_eyebrow", (17, 22)),
    ("left_eyebrow", (22, 27)),
    ("right_eye", (36, 42)),
    ("left_eye", (42, 48)),
    ("nose", (27, 36)),
    ("jaw", (0, 17))
])
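
Note (a reviewer sketch, not part of the uploaded file): each entry of FACIAL_LANDMARKS_68_IDXS is a half-open index range into a 68x2 landmark array, so a region can be sliced out directly:

import numpy as np

# Hypothetical illustration: slice one facial region from a 68x2 array.
shape = np.zeros((68, 2), dtype=int)  # placeholder; normally from predict_single_frame below
lStart, lEnd = FACIAL_LANDMARKS_68_IDXS["left_eye"]  # (42, 48)
left_eye_pts = shape[lStart:lEnd]  # six (x, y) points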

def shape_to_face(shape, width, height, scale=1.2):
    """
    Recalculate the face bounding box from the coarse landmark locations (shape).
    :param shape: landmark locations
    :param width: frame width
    :param height: frame height
    :param scale: scale factor used to enlarge the face bounding box
    :return:
        face_new: new face bounding box (1*4 list [x1, y1, x2, y2])
        # face_center: the center coordinate of the face (1*2 list [x_c, y_c])
        face_size: side length of the square face box (int, width == height == size)
    """
    x_min, y_min = np.min(shape, axis=0)
    x_max, y_max = np.max(shape, axis=0)

    x_center = (x_min + x_max) // 2
    y_center = (y_min + y_max) // 2

    face_size = int(max(x_max - x_min, y_max - y_min) * scale)
    # Force face_size to be even, so that after cropping the face
    # stays exactly face_size pixels on each side.
    face_size = face_size // 2 * 2

    x1 = max(x_center - face_size // 2, 0)
    y1 = max(y_center - face_size // 2, 0)

    # Shrink the box if it would extend past the frame border.
    face_size = min(width - x1, face_size)
    face_size = min(height - y1, face_size)

    x2 = x1 + face_size
    y2 = y1 + face_size

    face_new = [int(x1), int(y1), int(x2), int(y2)]
    return face_new, face_size
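
A minimal usage sketch for shape_to_face (illustrative only; the image path and landmarks are placeholders):

import cv2
import numpy as np

# Hypothetical example: crop a square, landmark-centered face region.
frame = cv2.imread("example.jpg")                        # assumed input image
h, w = frame.shape[:2]
shape = np.random.randint(50, min(h, w) - 50, (68, 2))   # placeholder landmarks
face, size = shape_to_face(shape, w, h, scale=1.2)
crop = frame[face[1]:face[3], face[0]:face[2]]           # size x size crop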

def predict_single_frame(frame):
    """
    :param frame: a full frame of the video
    :return:
        face_num: the number of detected faces (used to verify that a face was found)
        shape: landmark locations
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = detector(gray, 0)
    if len(faces) < 1:
        return 0, None
    # Only the first detected face is used.
    face = faces[0]

    landmarks = predictor(frame, face)
    face_landmark_list = [(p.x, p.y) for p in landmarks.parts()]
    shape = np.array(face_landmark_list)

    return 1, shape
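
Usage sketch (the video path is hypothetical, not from the file):

import cv2

# Hypothetical example: run landmark detection on the first frame of a clip.
cap = cv2.VideoCapture("input.mp4")  # assumed path
ok, frame = cap.read()
cap.release()
if ok:
    face_num, shape = predict_single_frame(frame)
    if face_num:
        print(shape.shape)  # (68, 2) integer landmark coordinates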

def landmark_align(shape):
    desiredLeftEye = (0.35, 0.25)
    desiredFaceWidth = 2
    desiredFaceHeight = 2
    (lStart, lEnd) = FACIAL_LANDMARKS_68_IDXS["left_eye"]
    (rStart, rEnd) = FACIAL_LANDMARKS_68_IDXS["right_eye"]

    leftEyePts = shape[lStart:lEnd]
    rightEyePts = shape[rStart:rEnd]

    # compute the center of mass for each eye
    leftEyeCenter = leftEyePts.mean(axis=0)
    rightEyeCenter = rightEyePts.mean(axis=0)
    # compute the angle between the eye centroids
    dY = rightEyeCenter[1] - leftEyeCenter[1]
    dX = rightEyeCenter[0] - leftEyeCenter[0]
    angle = np.degrees(np.arctan2(dY, dX))  # - 180

    # compute the desired right eye x-coordinate based on the
    # desired x-coordinate of the left eye
    desiredRightEyeX = 1.0 - desiredLeftEye[0]

    # determine the scale of the new resulting image by taking
    # the ratio of the distance between the eyes in the *current*
    # image to the distance between the eyes in the *desired* image
    dist = np.sqrt((dX ** 2) + (dY ** 2))
    desiredDist = (desiredRightEyeX - desiredLeftEye[0])
    desiredDist *= desiredFaceWidth
    scale = desiredDist / dist

    # compute the center (x, y)-coordinates (i.e., the median point)
    # between the two eyes in the input image
    eyesCenter = ((leftEyeCenter[0] + rightEyeCenter[0]) // 2,
                  (leftEyeCenter[1] + rightEyeCenter[1]) // 2)

    # grab the rotation matrix for rotating and scaling the face
    M = cv2.getRotationMatrix2D(eyesCenter, angle, scale)

    # update the translation component of the matrix
    tX = 0  # desiredFaceWidth * 0.5
    tY = desiredFaceHeight * desiredLeftEye[1]
    M[0, 2] += (tX - eyesCenter[0])
    M[1, 2] += (tY - eyesCenter[1])

    # apply the affine transform to the landmarks in homogeneous coordinates
    n, d = shape.shape
    temp = np.zeros((n, d + 1), dtype="int")
    temp[:, 0:2] = shape
    temp[:, 2] = 1
    aligned_landmarks = np.matmul(M, temp.T)
    return aligned_landmarks.T
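
For intuition (a sketch, not from the file): M rotates the inter-eye axis to horizontal, scales the inter-eye distance to desiredDist, and shifts the eye midpoint toward the canonical position, so after alignment the two eye centers share (up to floating-point error) the same y-coordinate:

import numpy as np

# Hypothetical check that the eye axis is horizontal after alignment.
rng = np.random.default_rng(0)
shape = rng.integers(100, 300, (68, 2))   # placeholder landmarks
aligned = landmark_align(shape)
left_eye = aligned[42:48].mean(axis=0)    # "left_eye" range per the dict above
right_eye = aligned[36:42].mean(axis=0)
print(right_eye[1] - left_eye[1])         # ~0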

def check_and_merge(location, forward, feedback, P_predict, status_fw=None, status_fb=None):
    num_pts = 68
    check = [True] * num_pts

    target = location[1]
    forward_predict = forward[1]

    # Feedback check for robustness: track forward then backward and
    # measure how far each point drifts from its starting position.
    forward_base = forward[0]  # Also equal to location[0]
    feedback_predict = feedback[0]
    feedback_diff = feedback_predict - forward_base
    feedback_dist = np.linalg.norm(feedback_diff, axis=1, keepdims=True)

    # For Kalman filtering: estimate the measurement noise from the ratio
    # of detected displacement to predicted displacement.
    detect_diff = location[1] - location[0]
    detect_dist = np.linalg.norm(detect_diff, axis=1, keepdims=True)
    predict_diff = forward[1] - forward[0]
    predict_dist = np.linalg.norm(predict_diff, axis=1, keepdims=True)
    predict_dist[np.where(predict_dist == 0)] = 1  # Avoid division by zero
    P_detect = (detect_dist / predict_dist).reshape(num_pts)

    for ipt in range(num_pts):
        if feedback_dist[ipt] > 2:  # Threshold for the float coordinates
            check[ipt] = False

    # Points whose optical-flow status flag is 0 failed to track.
    if status_fw is not None and np.sum(status_fw) != num_pts:
        for ipt in range(num_pts):
            if status_fw[ipt][0] == 0:
                check[ipt] = False
    if status_fb is not None and np.sum(status_fb) != num_pts:
        for ipt in range(num_pts):
            if status_fb[ipt][0] == 0:
                check[ipt] = False
    location_merge = target.copy()
    # Merge the results:
    """
    Use a Kalman filter to combine the tracked result with the detected result.
    """

    Q = 0.3  # Process variance

    for ipt in range(num_pts):
        if check[ipt]:
            # Kalman update
            P_predict[ipt] += Q
            K = P_predict[ipt] / (P_predict[ipt] + P_detect[ipt])
            location_merge[ipt] = forward_predict[ipt] + K * (target[ipt] - forward_predict[ipt])
            # Update P_predict with the current gain K
            P_predict[ipt] = (1 - K) * P_predict[ipt]
    return location_merge, check, P_predict
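
A one-point worked example of the merge step (illustrative numbers, not from the file): with P_predict = 0, Q = 0.3, and P_detect = 0.7, the gain is K = 0.3 / (0.3 + 0.7) = 0.3, so the merged point moves 30% of the way from the tracked prediction toward the new detection; a noisier detection (larger P_detect) drives K toward 0, trusting the tracker more.

# Hypothetical single-point illustration of the Kalman merge:
P_predict, Q, P_detect = 0.0, 0.3, 0.7
P_predict += Q                           # 0.3
K = P_predict / (P_predict + P_detect)   # 0.3
pred, det = 100.0, 110.0
merged = pred + K * (det - pred)         # 103.0
P_predict = (1 - K) * P_predict          # 0.21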

def detect_frames_track(frames, fps, video):
    # fps and video are currently unused in this function.
    frames_num = len(frames)
    assert frames_num != 0
    frame_height, frame_width = frames[0].shape[:2]
    """
    Pre-process:
    Detect the original landmarks, then normalize each face to a fixed
    width, together with its corresponding landmark locations and
    scale parameter.
    """
    face_size_normalized = 400
    faces = []
    locations = []
    shapes_origin = []
    shapes_para = []  # Used to recover the shape in the whole frame ([x1, y1, scale_shape])
    face_size = 0
    skipped = 0

    """
    Use single-frame face detection with dlib (CPU).
    """
    # ----------------------------------------------------------------------------#

    print("Detecting:")
    for i in tqdm(range(frames_num)):
        frame = frames[i]
        face_num, shape = predict_single_frame(frame)

        if face_num == 0:
            # No face detected: reuse the last successful detection,
            # or skip the frame if there is none yet.
            if len(shapes_origin) == 0:
                skipped += 1
                # print("Skipped", skipped, "Frame_num", frames_num)
                continue
            shape = shapes_origin[i - 1 - skipped]

        face, face_size = shape_to_face(shape, frame_width, frame_height, 1.2)
        faceFrame = frame[face[1]: face[3],
                          face[0]: face[2]]
        # Upsample small faces with cubic interpolation; downsample
        # large ones with area interpolation.
        if face_size < face_size_normalized:
            inter_para = cv2.INTER_CUBIC
        else:
            inter_para = cv2.INTER_AREA
        face_norm = cv2.resize(faceFrame, (face_size_normalized, face_size_normalized), interpolation=inter_para)
        scale_shape = face_size_normalized / face_size
        shape_norm = np.rint((shape - np.array([face[0], face[1]])) * scale_shape).astype(int)
        faces.append(face_norm)
        shapes_para.append([face[0], face[1], scale_shape])
        shapes_origin.append(shape)
        locations.append(shape_norm)

    """
    Calibration module.
    """
    segment_length = 2
    locations_sum = len(locations)
    if locations_sum == 0:
        return []
    locations_track = [locations[0]]
    num_pts = 68
    P_predict = np.array([0] * num_pts).reshape(num_pts).astype(float)
    print("Tracking:")
    for i in tqdm(range(locations_sum - 1)):
        faces_seg = faces[i:i + segment_length]
        locations_seg = locations[i:i + segment_length]

        # ----------------------------------------------------------------------#
        """
        NumPy version (DEPRECATED)
        """

        # locations_track_start = [locations_track[i]]
        # forward_pts, feedback_pts = track_bidirectional(faces_seg, locations_track_start)
        #
        # forward_pts = np.rint(forward_pts).astype(int)
        # feedback_pts = np.rint(feedback_pts).astype(int)
        # merge_pt, check, P_predict = check_and_merge(locations_seg, forward_pts, feedback_pts, P_predict)

        # ----------------------------------------------------------------------#
        """
        OpenCV version
        """

        lk_params = dict(winSize=(15, 15),
                         maxLevel=3,
                         criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
        # Use the tracked current location as input, and the next frame's
        # detected location as the auxiliary initialization.

        start_pt = locations_track[i].astype(np.float32)
        target_pt = locations_seg[1].astype(np.float32)

        forward_pt, status_fw, err_fw = cv2.calcOpticalFlowPyrLK(faces_seg[0], faces_seg[1],
                                                                 start_pt, target_pt, **lk_params,
                                                                 flags=cv2.OPTFLOW_USE_INITIAL_FLOW)
        feedback_pt, status_fb, err_fb = cv2.calcOpticalFlowPyrLK(faces_seg[1], faces_seg[0],
                                                                  forward_pt, start_pt, **lk_params,
                                                                  flags=cv2.OPTFLOW_USE_INITIAL_FLOW)

        forward_pts = [locations_track[i].copy(), forward_pt]
        feedback_pts = [feedback_pt, forward_pt.copy()]

        forward_pts = np.rint(forward_pts).astype(int)
        feedback_pts = np.rint(feedback_pts).astype(int)

        merge_pt, check, P_predict = check_and_merge(locations_seg, forward_pts, feedback_pts, P_predict, status_fw,
                                                     status_fb)

        # ----------------------------------------------------------------------#

        locations_track.append(merge_pt)

    """
    If using visualization, write the results to the visualization output folder.
    """
    if locations_sum != frames_num:
        print("INFO: Landmark detection failed in some frames, so visualization is "
              "disabled for this video. This will be optimized in a future version.")

    aligned_landmarks = []
    for loc in locations_track:
        shape = landmark_align(loc)
        shape = shape.ravel()
        shape = shape.tolist()
        aligned_landmarks.append(shape)

    return aligned_landmarks
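
An end-to-end driver sketch (the file name and frame extraction are illustrative, not part of the upload):

import cv2

# Hypothetical example: read a clip and extract aligned landmarks per frame.
cap = cv2.VideoCapture("input.mp4")      # assumed path
fps = cap.get(cv2.CAP_PROP_FPS)
frames = []
while True:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(frame)
cap.release()

landmarks = detect_frames_track(frames, fps, "input.mp4")
print(len(landmarks), "frames of aligned landmarks")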