Spaces:

ddriscoll
/

SOC3242-01_Group_3_Interactive

Sleeping

File size: 9,819 Bytes

import gradio as gr
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from PIL import Image
import mediapipe as mp
from fer import FER  # Facial emotion recognition

# -----------------------------
# Constants
# -----------------------------
SKIP_RATE = 5  # Run heavy detection every 5 frames

# -----------------------------
# Initialize Models and Helpers
# -----------------------------

# MediaPipe Pose for posture analysis
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()
mp_drawing = mp.solutions.drawing_utils

# MediaPipe Face Detection for face detection
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)

# Object Detection Model: Faster R-CNN (pretrained on COCO)
object_detection_model = models.detection.fasterrcnn_resnet50_fpn(
    weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT
)
object_detection_model.eval()
obj_transform = transforms.Compose([transforms.ToTensor()])

# Facial Emotion Detection using FER (requires TensorFlow)
emotion_detector = FER(mtcnn=True)

# -----------------------------
# Define Analysis Functions with Frame Skipping
# -----------------------------

def analyze_posture(image):
    """
    Processes an image from the webcam with MediaPipe Pose.
    Runs heavy detection every SKIP_RATE frames; otherwise, returns last result.
    """
    if not hasattr(analyze_posture, "counter"):
        analyze_posture.counter = 0
        analyze_posture.last_output = None
    analyze_posture.counter += 1

    # If first frame or time to run detection:
    if analyze_posture.counter % SKIP_RATE == 0 or analyze_posture.last_output is None:
        # Convert from PIL (RGB) to OpenCV BGR format
        frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        output_frame = frame.copy()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        posture_result = "No posture detected"
        pose_results = pose.process(frame_rgb)
        if pose_results.pose_landmarks:
            posture_result = "Posture detected"
            mp_drawing.draw_landmarks(
                output_frame, pose_results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
                mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2)
            )

        annotated_image = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
        result = (annotated_image, f"Posture Analysis: {posture_result}")
        analyze_posture.last_output = result
        return result
    else:
        # For frames in between, return last result
        return analyze_posture.last_output

def analyze_emotion(image):
    """
    Uses FER to detect facial emotions from the webcam image.
    Runs heavy detection every SKIP_RATE frames.
    """
    if not hasattr(analyze_emotion, "counter"):
        analyze_emotion.counter = 0
        analyze_emotion.last_output = None
    analyze_emotion.counter += 1

    if analyze_emotion.counter % SKIP_RATE == 0 or analyze_emotion.last_output is None:
        frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        emotions = emotion_detector.detect_emotions(frame_rgb)
        if emotions:
            top_emotion, score = max(emotions[0]["emotions"].items(), key=lambda x: x[1])
            emotion_text = f"{top_emotion} ({score:.2f})"
        else:
            emotion_text = "No face detected for emotion analysis"
        annotated_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = (annotated_image, f"Emotion Analysis: {emotion_text}")
        analyze_emotion.last_output = result
        return result
    else:
        return analyze_emotion.last_output

def analyze_objects(image):
    """
    Uses Faster R-CNN to detect objects in the webcam image.
    Heavy detection is run every SKIP_RATE frames.
    """
    if not hasattr(analyze_objects, "counter"):
        analyze_objects.counter = 0
        analyze_objects.last_output = None
    analyze_objects.counter += 1

    if analyze_objects.counter % SKIP_RATE == 0 or analyze_objects.last_output is None:
        frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        output_frame = frame.copy()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image_pil = Image.fromarray(frame_rgb)
        img_tensor = obj_transform(image_pil)

        with torch.no_grad():
            detections = object_detection_model([img_tensor])[0]

        threshold = 0.8
        detected_boxes = detections["boxes"][detections["scores"] > threshold]
        for box in detected_boxes:
            box = box.int().cpu().numpy()
            cv2.rectangle(output_frame, (box[0], box[1]), (box[2], box[3]), (255, 255, 0), 2)

        object_result = f"Detected {len(detected_boxes)} object(s)" if len(detected_boxes) else "No objects detected"
        annotated_image = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
        result = (annotated_image, f"Object Detection: {object_result}")
        analyze_objects.last_output = result
        return result
    else:
        return analyze_objects.last_output

def analyze_faces(image):
    """
    Uses MediaPipe to detect faces in the webcam image.
    Runs heavy detection every SKIP_RATE frames.
    """
    if not hasattr(analyze_faces, "counter"):
        analyze_faces.counter = 0
        analyze_faces.last_output = None
    analyze_faces.counter += 1

    if analyze_faces.counter % SKIP_RATE == 0 or analyze_faces.last_output is None:
        frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        output_frame = frame.copy()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_results = face_detection.process(frame_rgb)

        face_result = "No faces detected"
        if face_results.detections:
            face_result = f"Detected {len(face_results.detections)} face(s)"
            h, w, _ = output_frame.shape
            for detection in face_results.detections:
                bbox = detection.location_data.relative_bounding_box
                x = int(bbox.xmin * w)
                y = int(bbox.ymin * h)
                box_w = int(bbox.width * w)
                box_h = int(bbox.height * h)
                cv2.rectangle(output_frame, (x, y), (x + box_w, y + box_h), (0, 0, 255), 2)

        annotated_image = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
        result = (annotated_image, f"Face Detection: {face_result}")
        analyze_faces.last_output = result
        return result
    else:
        return analyze_faces.last_output

# -----------------------------
# Custom CSS for a High-Tech Look (with white fonts)
# -----------------------------
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@400;700&display=swap');
body {
    background-color: #0e0e0e;
    color: #ffffff;
    font-family: 'Orbitron', sans-serif;
    margin: 0;
    padding: 0;
}
.gradio-container {
    background: linear-gradient(135deg, #1e1e2f, #3e3e55);
    border-radius: 10px;
    padding: 20px;
    max-width: 1200px;
    margin: auto;
}
.gradio-title {
    font-size: 2.5em;
    color: #ffffff;
    text-align: center;
    margin-bottom: 0.2em;
}
.gradio-description {
    font-size: 1.2em;
    text-align: center;
    margin-bottom: 1em;
    color: #ffffff;
}
"""

# -----------------------------
# Create Individual Interfaces for Each Analysis (using real-time webcam input)
# -----------------------------
posture_interface = gr.Interface(
    fn=analyze_posture,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Capture Your Posture"),
    outputs=[gr.Image(type="numpy", label="Annotated Output"), gr.Textbox(label="Posture Analysis")],
    title="Posture Analysis",
    description="Detects your posture using MediaPipe.",
    live=True
)

emotion_interface = gr.Interface(
    fn=analyze_emotion,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Capture Your Face"),
    outputs=[gr.Image(type="numpy", label="Annotated Output"), gr.Textbox(label="Emotion Analysis")],
    title="Emotion Analysis",
    description="Detects facial emotions using FER.",
    live=True
)

objects_interface = gr.Interface(
    fn=analyze_objects,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Capture the Scene"),
    outputs=[gr.Image(type="numpy", label="Annotated Output"), gr.Textbox(label="Object Detection")],
    title="Object Detection",
    description="Detects objects using a pretrained Faster R-CNN.",
    live=True
)

faces_interface = gr.Interface(
    fn=analyze_faces,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Capture Your Face"),
    outputs=[gr.Image(type="numpy", label="Annotated Output"), gr.Textbox(label="Face Detection")],
    title="Face Detection",
    description="Detects faces using MediaPipe.",
    live=True
)

# -----------------------------
# Create a Tabbed Interface for All Analyses
# -----------------------------
tabbed_interface = gr.TabbedInterface(
    interface_list=[posture_interface, emotion_interface, objects_interface, faces_interface],
    tab_names=["Posture", "Emotion", "Objects", "Faces"]
)

# -----------------------------
# Wrap Everything in a Blocks Layout with Custom CSS
# -----------------------------
demo = gr.Blocks(css=custom_css)
with demo:
    gr.Markdown("<h1 class='gradio-title'>Real-Time Multi-Analysis App</h1>")
    gr.Markdown("<p class='gradio-description'>Experience a high-tech cinematic interface for real-time analysis of your posture, emotions, objects, and faces using your webcam.</p>")
    tabbed_interface.render()

if __name__ == "__main__":
    demo.launch()