David Driscoll committed
Commit bf45aa3 · 1 Parent(s): 473b2d5
Files changed (1):
  1. app.py +15 -27
app.py CHANGED
@@ -7,7 +7,8 @@ from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
 from PIL import Image
 import mediapipe as mp
 from fer import FER  # Facial emotion recognition
-from transformers import AutoFeatureExtractor, AutoModel
+from transformers import AutoFeatureExtractor, AutoModel  # (Unused now for facial recognition)
+import onnxruntime as rt  # New import for ONNX Runtime
 
 # -----------------------------
 # Configuration
@@ -49,25 +50,20 @@ emotion_detector = FER(mtcnn=True)
 object_categories = FasterRCNN_ResNet50_FPN_Weights.DEFAULT.meta["categories"]
 
 # -----------------------------
-# Facial Recognition Model (DINO-ViT)
+# Facial Recognition Model (Marltgap/FaceTransformerOctupletLoss ONNX)
 # -----------------------------
-facial_recognition_extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
-facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
-facial_recognition_model.to(device)
-facial_recognition_model.eval()
+facial_recognition_onnx = rt.InferenceSession("FaceTransformerOctupletLoss.onnx", providers=rt.get_available_providers())
 
 # -----------------------------
 # Overlay Drawing Functions
 # -----------------------------
 def draw_posture_overlay(raw_frame, landmarks):
-    # Draw connector lines using MediaPipe's POSE_CONNECTIONS
     for connection in mp_pose.POSE_CONNECTIONS:
         start_idx, end_idx = connection
         if start_idx < len(landmarks) and end_idx < len(landmarks):
             start_point = landmarks[start_idx]
             end_point = landmarks[end_idx]
             cv2.line(raw_frame, start_point, end_point, (50, 205, 50), 2)
-    # Draw landmark points in lime green (BGR: (50,205,50))
     for (x, y) in landmarks:
         cv2.circle(raw_frame, (x, y), 4, (50, 205, 50), -1)
     return raw_frame
@@ -90,7 +86,6 @@ def compute_posture_overlay(image):
     if pose_results.pose_landmarks:
         landmarks = []
         for lm in pose_results.pose_landmarks.landmark:
-            # Scale landmarks back to the original image size
             x = int(lm.x * small_w * (w / small_w))
             y = int(lm.y * small_h * (h / small_h))
             landmarks.append((x, y))
@@ -101,7 +96,6 @@ def compute_posture_overlay(image):
     return landmarks, text
 
 def compute_emotion_overlay(image):
-    # Use the FER package (exactly as in your provided code)
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
     frame_rgb_small = cv2.cvtColor(frame_bgr_small, cv2.COLOR_BGR2RGB)
@@ -157,8 +151,8 @@ def compute_faces_overlay(image):
 
 def compute_facial_recognition_vector(image):
     """
-    Detects a face using MediaPipe, crops it, and computes its embedding vector
-    using facebook/dino-vitb16. The raw vector is returned as a string.
+    Detects a face using MediaPipe, crops and resizes it to 112x112, then computes its embedding
+    vector using the Marltgap FaceTransformerOctupletLoss ONNX model.
     """
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
@@ -173,15 +167,14 @@ def compute_facial_recognition_vector(image):
         box_w = int(bbox.width * w)
         box_h = int(bbox.height * h)
         face_crop = frame_rgb_small[y:y+box_h, x:x+box_w]
-        face_image = Image.fromarray(face_crop)
-        inputs = facial_recognition_extractor(face_image, return_tensors="pt").to(device)
-        with torch.no_grad():
-            outputs = facial_recognition_model(**inputs)
-        # Mean pooling of the last hidden state to obtain a vector representation
-        vector = outputs.last_hidden_state.mean(dim=1).squeeze()
-        vector_np = vector.cpu().numpy()
-        # Format vector as a string with limited decimal places
-        vector_str = np.array2string(vector_np, precision=2, separator=',')
+        # Resize the face crop to the required dimensions: 112x112
+        face_crop_resized = cv2.resize(face_crop, (112, 112))
+        # Convert image to float32 (values between 0 and 255)
+        input_image = face_crop_resized.astype(np.float32)
+        # Run inference using the ONNX model
+        outputs = facial_recognition_onnx.run(None, {"input_image": input_image})
+        embedding = outputs[0][0]  # Assuming the output shape is (1, 512)
+        vector_str = np.array2string(embedding, precision=2, separator=',')
         return face_crop, vector_str
     else:
         return np.array(image), "No face detected"
@@ -246,21 +239,16 @@ def analyze_facial_recognition(image):
 
 def analyze_all(image):
     current_frame = np.array(image).copy()
-    # Posture Analysis
     landmarks, posture_text = compute_posture_overlay(image)
     if landmarks:
         current_frame = draw_posture_overlay(current_frame, landmarks)
-    # Emotion Analysis
     emotion_text = compute_emotion_overlay(image)
-    # Object Detection
     boxes_obj, objects_text, object_list_text = compute_objects_overlay(image)
     if boxes_obj:
         current_frame = draw_boxes_overlay(current_frame, boxes_obj, (255, 255, 0))
-    # Face Detection
     boxes_face, faces_text = compute_faces_overlay(image)
     if boxes_face:
         current_frame = draw_boxes_overlay(current_frame, boxes_face, (0, 0, 255))
-    # Combined Analysis Text
     combined_text = (
         f"<b>Posture Analysis:</b> {posture_text}<br>"
         f"<b>Emotion Analysis:</b> {emotion_text}<br>"
@@ -350,7 +338,7 @@ facial_recognition_interface = gr.Interface(
     inputs=gr.Image(label="Upload a Face Image for Facial Recognition"),
     outputs=[gr.Image(type="numpy", label="Cropped Face"), gr.HTML(label="Facial Recognition")],
     title="Facial Recognition",
-    description="Extracts and outputs the facial vector using facebook/dino-vitb16.",
+    description="Extracts and outputs the facial vector using the Marltgap FaceTransformerOctupletLoss ONNX model.",
     live=False
 )
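Note on the new inference path: compute_facial_recognition_vector hard-codes the ONNX input name "input_image" and treats the first output as a (1, 512) embedding. A minimal sketch of how that signature could be checked against the exported graph at startup, using only standard ONNX Runtime session introspection (the 112x112 input and 512-dimensional output here are assumptions carried over from the diff, not values confirmed from the model file):

import numpy as np
import onnxruntime as rt

# Load the same model file the app uses and list its declared inputs/outputs.
session = rt.InferenceSession("FaceTransformerOctupletLoss.onnx",
                              providers=rt.get_available_providers())
for inp in session.get_inputs():
    print("input:", inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print("output:", out.name, out.shape, out.type)

# Dry run with a dummy 112x112 RGB face (float32, 0-255 range) to confirm the embedding size.
dummy_face = np.zeros((112, 112, 3), dtype=np.float32)
embedding = session.run(None, {session.get_inputs()[0].name: dummy_face})[0]
print("embedding shape:", embedding.shape)  # the app code assumes this is (1, 512)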
 
 
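For context, embeddings from face-recognition models of this kind are normally compared with cosine similarity rather than read as raw strings; a small, self-contained sketch (the 512-dimensional size and the 0.5 threshold are illustrative assumptions, not values taken from this repository):

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine similarity between two embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# emb_a and emb_b stand in for vectors returned by compute_facial_recognition_vector.
emb_a = np.random.rand(512).astype(np.float32)
emb_b = np.random.rand(512).astype(np.float32)

SAME_PERSON_THRESHOLD = 0.5  # illustrative value; would need tuning on real data
print("same person?", cosine_similarity(emb_a, emb_b) >= SAME_PERSON_THRESHOLD)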