David Driscoll committed
Commit bf45aa3 · 1 Parent(s): 473b2d5
Files changed (1):
  1. app.py +15 -27
app.py CHANGED
@@ -7,7 +7,8 @@ from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
 from PIL import Image
 import mediapipe as mp
 from fer import FER  # Facial emotion recognition
-from transformers import AutoFeatureExtractor, AutoModel
+from transformers import AutoFeatureExtractor, AutoModel  # (Unused now for facial recognition)
+import onnxruntime as rt  # New import for ONNX Runtime
 
 # -----------------------------
 # Configuration
@@ -49,25 +50,20 @@ emotion_detector = FER(mtcnn=True)
 object_categories = FasterRCNN_ResNet50_FPN_Weights.DEFAULT.meta["categories"]
 
 # -----------------------------
-# Facial Recognition Model (DINO-ViT)
+# Facial Recognition Model (Marltgap/FaceTransformerOctupletLoss ONNX)
 # -----------------------------
-facial_recognition_extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
-facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
-facial_recognition_model.to(device)
-facial_recognition_model.eval()
+facial_recognition_onnx = rt.InferenceSession("FaceTransformerOctupletLoss.onnx", providers=rt.get_available_providers())
 
 # -----------------------------
 # Overlay Drawing Functions
 # -----------------------------
 def draw_posture_overlay(raw_frame, landmarks):
-    # Draw connector lines using MediaPipe's POSE_CONNECTIONS
     for connection in mp_pose.POSE_CONNECTIONS:
         start_idx, end_idx = connection
         if start_idx < len(landmarks) and end_idx < len(landmarks):
             start_point = landmarks[start_idx]
             end_point = landmarks[end_idx]
             cv2.line(raw_frame, start_point, end_point, (50, 205, 50), 2)
-    # Draw landmark points in lime green (BGR: (50,205,50))
     for (x, y) in landmarks:
         cv2.circle(raw_frame, (x, y), 4, (50, 205, 50), -1)
     return raw_frame
@@ -90,7 +86,6 @@ def compute_posture_overlay(image):
     if pose_results.pose_landmarks:
         landmarks = []
         for lm in pose_results.pose_landmarks.landmark:
-            # Scale landmarks back to the original image size
             x = int(lm.x * small_w * (w / small_w))
             y = int(lm.y * small_h * (h / small_h))
             landmarks.append((x, y))
@@ -101,7 +96,6 @@ def compute_posture_overlay(image):
     return landmarks, text
 
 def compute_emotion_overlay(image):
-    # Use the FER package (exactly as in your provided code)
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
     frame_rgb_small = cv2.cvtColor(frame_bgr_small, cv2.COLOR_BGR2RGB)
@@ -157,8 +151,8 @@ def compute_faces_overlay(image):
 
 def compute_facial_recognition_vector(image):
     """
-    Detects a face using MediaPipe, crops it, and computes its embedding vector
-    using facebook/dino-vitb16. The raw vector is returned as a string.
+    Detects a face using MediaPipe, crops and resizes it to 112x112, then computes its embedding
+    vector using the Marltgap FaceTransformerOctupletLoss ONNX model.
     """
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
@@ -173,15 +167,14 @@ def compute_facial_recognition_vector(image):
         box_w = int(bbox.width * w)
         box_h = int(bbox.height * h)
         face_crop = frame_rgb_small[y:y+box_h, x:x+box_w]
-        face_image = Image.fromarray(face_crop)
-        inputs = facial_recognition_extractor(face_image, return_tensors="pt").to(device)
-        with torch.no_grad():
-            outputs = facial_recognition_model(**inputs)
-        # Mean pooling of the last hidden state to obtain a vector representation
-        vector = outputs.last_hidden_state.mean(dim=1).squeeze()
-        vector_np = vector.cpu().numpy()
-        # Format vector as a string with limited decimal places
-        vector_str = np.array2string(vector_np, precision=2, separator=',')
+        # Resize the face crop to the required dimensions: 112x112
+        face_crop_resized = cv2.resize(face_crop, (112, 112))
+        # Convert image to float32 (values between 0 and 255)
+        input_image = face_crop_resized.astype(np.float32)
+        # Run inference using the ONNX model
+        outputs = facial_recognition_onnx.run(None, {"input_image": input_image})
+        embedding = outputs[0][0]  # Assuming the output shape is (1, 512)
+        vector_str = np.array2string(embedding, precision=2, separator=',')
         return face_crop, vector_str
     else:
         return np.array(image), "No face detected"
@@ -246,21 +239,16 @@ def analyze_facial_recognition(image):
 
 def analyze_all(image):
     current_frame = np.array(image).copy()
-    # Posture Analysis
     landmarks, posture_text = compute_posture_overlay(image)
     if landmarks:
         current_frame = draw_posture_overlay(current_frame, landmarks)
-    # Emotion Analysis
     emotion_text = compute_emotion_overlay(image)
-    # Object Detection
     boxes_obj, objects_text, object_list_text = compute_objects_overlay(image)
     if boxes_obj:
         current_frame = draw_boxes_overlay(current_frame, boxes_obj, (255, 255, 0))
-    # Face Detection
     boxes_face, faces_text = compute_faces_overlay(image)
     if boxes_face:
         current_frame = draw_boxes_overlay(current_frame, boxes_face, (0, 0, 255))
-    # Combined Analysis Text
     combined_text = (
         f"<b>Posture Analysis:</b> {posture_text}<br>"
         f"<b>Emotion Analysis:</b> {emotion_text}<br>"
@@ -350,7 +338,7 @@ facial_recognition_interface = gr.Interface(
     inputs=gr.Image(label="Upload a Face Image for Facial Recognition"),
     outputs=[gr.Image(type="numpy", label="Cropped Face"), gr.HTML(label="Facial Recognition")],
     title="Facial Recognition",
-    description="Extracts and outputs the facial vector using facebook/dino-vitb16.",
+    description="Extracts and outputs the facial vector using the Marltgap FaceTransformerOctupletLoss ONNX model.",
     live=False
 )
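Note on the new inference path: compute_facial_recognition_vector hard-codes the ONNX input name "input_image" and treats the first output as a (1, 512) embedding. A minimal sketch of how that signature could be checked against the exported graph at startup, using only standard ONNX Runtime session introspection (the 112x112 input and 512-dimensional output here are assumptions carried over from the diff, not values confirmed from the model file):

import numpy as np
import onnxruntime as rt

# Load the same model file the app uses and list its declared inputs/outputs.
session = rt.InferenceSession("FaceTransformerOctupletLoss.onnx",
                              providers=rt.get_available_providers())
for inp in session.get_inputs():
    print("input:", inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print("output:", out.name, out.shape, out.type)

# Dry run with a dummy 112x112 RGB face (float32, 0-255 range) to confirm the embedding size.
dummy_face = np.zeros((112, 112, 3), dtype=np.float32)
embedding = session.run(None, {session.get_inputs()[0].name: dummy_face})[0]
print("embedding shape:", embedding.shape)  # the app code assumes this is (1, 512)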
 
 
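For context, embeddings from face-recognition models of this kind are normally compared with cosine similarity rather than read as raw strings; a small, self-contained sketch (the 512-dimensional size and the 0.5 threshold are illustrative assumptions, not values taken from this repository):

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine similarity between two embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# emb_a and emb_b stand in for vectors returned by compute_facial_recognition_vector.
emb_a = np.random.rand(512).astype(np.float32)
emb_b = np.random.rand(512).astype(np.float32)

SAME_PERSON_THRESHOLD = 0.5  # illustrative value; would need tuning on real data
print("same person?", cosine_similarity(emb_a, emb_b) >= SAME_PERSON_THRESHOLD)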