David Driscoll committed
Commit · bf45aa3 · 1 Parent(s): 473b2d5
Add model
app.py CHANGED
@@ -7,7 +7,8 @@ from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
 from PIL import Image
 import mediapipe as mp
 from fer import FER  # Facial emotion recognition
-from transformers import AutoFeatureExtractor, AutoModel
+from transformers import AutoFeatureExtractor, AutoModel  # (Unused now for facial recognition)
+import onnxruntime as rt  # New import for ONNX Runtime
 
 # -----------------------------
 # Configuration
@@ -49,25 +50,20 @@ emotion_detector = FER(mtcnn=True)
 object_categories = FasterRCNN_ResNet50_FPN_Weights.DEFAULT.meta["categories"]
 
 # -----------------------------
-# Facial Recognition Model (
+# Facial Recognition Model (Marltgap/FaceTransformerOctupletLoss ONNX)
 # -----------------------------
-
-facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
-facial_recognition_model.to(device)
-facial_recognition_model.eval()
+facial_recognition_onnx = rt.InferenceSession("FaceTransformerOctupletLoss.onnx", providers=rt.get_available_providers())
 
 # -----------------------------
 # Overlay Drawing Functions
 # -----------------------------
 def draw_posture_overlay(raw_frame, landmarks):
-    # Draw connector lines using MediaPipe's POSE_CONNECTIONS
     for connection in mp_pose.POSE_CONNECTIONS:
         start_idx, end_idx = connection
         if start_idx < len(landmarks) and end_idx < len(landmarks):
             start_point = landmarks[start_idx]
             end_point = landmarks[end_idx]
             cv2.line(raw_frame, start_point, end_point, (50, 205, 50), 2)
-    # Draw landmark points in lime green (BGR: (50,205,50))
     for (x, y) in landmarks:
         cv2.circle(raw_frame, (x, y), 4, (50, 205, 50), -1)
     return raw_frame
@@ -90,7 +86,6 @@ def compute_posture_overlay(image):
     if pose_results.pose_landmarks:
         landmarks = []
         for lm in pose_results.pose_landmarks.landmark:
-            # Scale landmarks back to the original image size
             x = int(lm.x * small_w * (w / small_w))
             y = int(lm.y * small_h * (h / small_h))
             landmarks.append((x, y))
@@ -101,7 +96,6 @@ def compute_posture_overlay(image):
     return landmarks, text
 
 def compute_emotion_overlay(image):
-    # Use the FER package (exactly as in your provided code)
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
     frame_rgb_small = cv2.cvtColor(frame_bgr_small, cv2.COLOR_BGR2RGB)
@@ -157,8 +151,8 @@ def compute_faces_overlay(image):
 
 def compute_facial_recognition_vector(image):
     """
-    Detects a face using MediaPipe, crops it,
-    using
+    Detects a face using MediaPipe, crops and resizes it to 112x112, then computes its embedding
+    vector using the Marltgap FaceTransformerOctupletLoss ONNX model.
     """
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
@@ -173,15 +167,14 @@ def compute_facial_recognition_vector(image):
         box_w = int(bbox.width * w)
         box_h = int(bbox.height * h)
         face_crop = frame_rgb_small[y:y+box_h, x:x+box_w]
-
-
-
-
-        #
-
-
-
-        vector_str = np.array2string(vector_np, precision=2, separator=',')
+        # Resize the face crop to the required dimensions: 112x112
+        face_crop_resized = cv2.resize(face_crop, (112, 112))
+        # Convert image to float32 (values between 0 and 255)
+        input_image = face_crop_resized.astype(np.float32)
+        # Run inference using the ONNX model
+        outputs = facial_recognition_onnx.run(None, {"input_image": input_image})
+        embedding = outputs[0][0]  # Assuming the output shape is (1, 512)
+        vector_str = np.array2string(embedding, precision=2, separator=',')
         return face_crop, vector_str
     else:
         return np.array(image), "No face detected"
@@ -246,21 +239,16 @@ def analyze_facial_recognition(image):
 
 def analyze_all(image):
     current_frame = np.array(image).copy()
-    # Posture Analysis
     landmarks, posture_text = compute_posture_overlay(image)
     if landmarks:
         current_frame = draw_posture_overlay(current_frame, landmarks)
-    # Emotion Analysis
     emotion_text = compute_emotion_overlay(image)
-    # Object Detection
     boxes_obj, objects_text, object_list_text = compute_objects_overlay(image)
     if boxes_obj:
         current_frame = draw_boxes_overlay(current_frame, boxes_obj, (255, 255, 0))
-    # Face Detection
     boxes_face, faces_text = compute_faces_overlay(image)
     if boxes_face:
         current_frame = draw_boxes_overlay(current_frame, boxes_face, (0, 0, 255))
-    # Combined Analysis Text
     combined_text = (
         f"<b>Posture Analysis:</b> {posture_text}<br>"
         f"<b>Emotion Analysis:</b> {emotion_text}<br>"
@@ -350,7 +338,7 @@ facial_recognition_interface = gr.Interface(
     inputs=gr.Image(label="Upload a Face Image for Facial Recognition"),
     outputs=[gr.Image(type="numpy", label="Cropped Face"), gr.HTML(label="Facial Recognition")],
     title="Facial Recognition",
-    description="Extracts and outputs the facial vector using
+    description="Extracts and outputs the facial vector using the Marltgap FaceTransformerOctupletLoss ONNX model.",
     live=False
 )
 
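Note on the new ONNX model: the added code assumes the exported graph exposes an input named "input_image" that accepts a single 112x112x3 float32 face crop (no batch dimension) and returns a 512-dimensional embedding. A minimal sketch, using only standard onnxruntime session metadata, for checking those assumptions against the downloaded FaceTransformerOctupletLoss.onnx file:

import onnxruntime as rt

# Load the same file the app loads; providers fall back to CPU if no GPU is available.
session = rt.InferenceSession("FaceTransformerOctupletLoss.onnx",
                              providers=rt.get_available_providers())

# Print declared input/output names, shapes, and dtypes so the "input_image"
# key and the (1, 512) output assumption can be verified before wiring it in.
for inp in session.get_inputs():
    print("input :", inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print("output:", out.name, out.shape, out.type)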
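Note on using the embedding: compute_facial_recognition_vector only stringifies the vector for display; comparing two faces would typically use cosine similarity between their embeddings. A small illustrative sketch (the face_a/face_b crops and the 0.5 threshold are hypothetical, not part of this commit):

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # 1.0 means the vectors point in the same direction; 0.0 means orthogonal.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical usage with two 112x112x3 float32 face crops, face_a and face_b:
# emb_a = facial_recognition_onnx.run(None, {"input_image": face_a})[0][0]
# emb_b = facial_recognition_onnx.run(None, {"input_image": face_b})[0][0]
# same_person = cosine_similarity(emb_a, emb_b) > 0.5  # threshold chosen arbitrarily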