Sagnik1750 committed on
Commit b337ab5 · verified · 1 Parent(s): 0340596

Create app.py

Files changed (1)
app.py +460 -0
app.py ADDED
@@ -0,0 +1,460 @@
+ import cv2
+ import mediapipe as mp
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from facenet_pytorch import MTCNN
+ from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoModelForAudioClassification
+ from PIL import Image
+ import moviepy.editor as moviepy
+ import librosa
+ import os
+ import gradio as gr
+ import tempfile
+
+ # Initialize device
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print(f"Using device: {device}")
+
+ # Initialize visual models
+ mp_pose = mp.solutions.pose
+ pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)
+ mtcnn = MTCNN(device=device)
+ face_model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
+ face_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
+
+ # Initialize audio model
+ audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+ audio_processor = AutoFeatureExtractor.from_pretrained(audio_model_name)
+ audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
+ audio_sampling_rate = 16000
+
+ def calculate_angle(a, b, c):
+     """Calculates the angle (in degrees) at vertex b formed by points a, b, and c."""
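+     # Example: calculate_angle([0, 1], [0, 0], [1, 0]) -> 90.0, since the vectors b->a and b->c are perpendicular.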
+     a, b, c = np.array(a), np.array(b), np.array(c)
+     ba, bc = a - b, c - b
+     cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
+     return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
+
+ def detect_emotions(frame):
+     """Detects the facial emotion of the first detected face in a given frame."""
+     img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     faces, _ = mtcnn.detect(img)
+
+     if faces is None or len(faces) == 0:
+         return "Neutral"  # Default to neutral if no face is detected
+
+     face = img.crop((faces[0][0], faces[0][1], faces[0][2], faces[0][3]))
+     inputs = face_extractor(images=face, return_tensors="pt").to(device)
+     with torch.no_grad():
+         outputs = face_model(**inputs)
+     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+     return face_model.config.id2label[torch.argmax(probs).item()]
+
+ def classify_posture(back_angle, neck_angle):
+     """Classifies posture based on back and neck angles."""
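+     # Angle thresholds (in degrees) are heuristic cut-offs; rules are checked top to bottom, so "Confident" takes precedence when both of its conditions hold.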
+     if back_angle > 170 and neck_angle > 150:
+         return "Confident"
+     elif back_angle < 160 and neck_angle < 140:
+         return "Nervous"
+     elif back_angle < 150:
+         return "Defensive"
+     elif neck_angle < 130:
+         return "Serious"
+     else:
+         return "Attentive"
+
+ def extract_audio(video_path):
+     """Extracts audio from a video file and saves it as WAV."""
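+     # Assumes the clip has an audio track; for a silent video, video.audio is None and this step would fail.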
+     audio_path = tempfile.mktemp(suffix='.wav')
+     video = moviepy.VideoFileClip(video_path)
+     video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
+     return audio_path
+
+ def analyze_audio_emotion(audio_path):
+     """Analyzes emotion in an audio file and returns (emotion counts, per-chunk emotion sequence)."""
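+     # The audio is split into 5-second chunks and each chunk is classified independently; the sequence is later aligned with video frames.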
+     # Load audio
+     y, sr = librosa.load(audio_path, sr=audio_sampling_rate)
+
+     # Process audio in chunks to avoid memory issues
+     chunk_length = audio_sampling_rate * 5  # 5 seconds
+     emotion_counts = {}
+     audio_emotions = []
+
+     # Process audio in chunks
+     for i in range(0, len(y), chunk_length):
+         chunk = y[i:min(i + chunk_length, len(y))]
+
+         # Skip chunks that are too short
+         if len(chunk) < audio_sampling_rate:
+             continue
+
+         # Process audio with the model
+         inputs = audio_processor(chunk, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
+         with torch.no_grad():
+             outputs = audio_model(**inputs)
+
+         # Get prediction
+         predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
+         emotion = audio_model.config.id2label[predicted_class_id]
+         audio_emotions.append(emotion)
+         emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
+
+     return emotion_counts, audio_emotions
+
+ def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
+     """Draws multimodal emotion and posture sentiment on the frame."""
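+     # Blends a semi-transparent white panel with the labels onto `frame` in place (60% overlay / 40% original frame).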
+     overlay = frame.copy()
+     cv2.rectangle(overlay, (10, 10), (450, 200), (255, 255, 255), -1)
+
+     # Display current emotions
+     cv2.putText(overlay, f'Face Emotion: {face_emotion}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
+     cv2.putText(overlay, f'Posture: {posture_label}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
+     cv2.putText(overlay, f'Audio Emotion: {audio_emotion}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
+
+     # Display major emotion
+     cv2.putText(overlay, f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
+
+     # Add explanation
+     reason_text = 'Weighted combination of face, posture, and audio analysis'
+     cv2.putText(overlay, f'Analysis: {reason_text}', (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+
+     # Blend overlay with original frame
+     cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
+
+ def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
+     """Generates charts for all emotion modalities."""
+     # Create a figure with 3 subplots
+     fig, axs = plt.subplots(1, 3, figsize=(18, 6))
+
+     # Face emotions pie chart
+     labels, sizes = zip(*face_emotion_counts.items()) if face_emotion_counts else (["None"], [1])
+     axs[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Blues'))
+     axs[0].set_title("Facial Emotions")
+
+     # Posture pie chart
+     labels, sizes = zip(*posture_counts.items()) if posture_counts else (["None"], [1])
+     axs[1].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Greens'))
+     axs[1].set_title("Posture Analysis")
+
+     # Audio emotions pie chart
+     labels, sizes = zip(*audio_emotion_counts.items()) if audio_emotion_counts else (["None"], [1])
+     axs[2].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Reds'))
+     axs[2].set_title("Audio Emotions")
+
+     plt.tight_layout()
+
+     # Save to a temporary file
+     chart_path = tempfile.mktemp(suffix='.jpg')
+     plt.savefig(chart_path)
+     plt.close()
+
+     # Create combined emotions bar chart
+     # Combine all emotions across modalities
+     all_emotions = set()
+     for counts in [face_emotion_counts, audio_emotion_counts]:
+         all_emotions.update(counts.keys())
+
+     # Prepare data for each emotion across modalities
+     emotions = list(all_emotions)
+     face_values = [face_emotion_counts.get(e, 0) for e in emotions]
+     audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]
+
+     # Normalize values
+     if sum(face_values) > 0:
+         face_values = [v / sum(face_values) * 100 for v in face_values]
+     if sum(audio_values) > 0:
+         audio_values = [v / sum(audio_values) * 100 for v in audio_values]
+
+     # Create bar chart
+     x = np.arange(len(emotions))
+     width = 0.35
+
+     fig, ax = plt.subplots(figsize=(14, 8))
+     ax.bar(x - width / 2, face_values, width, label='Face')
+     ax.bar(x + width / 2, audio_values, width, label='Audio')
+
+     ax.set_title('Emotion Distribution by Modality')
+     ax.set_xlabel('Emotions')
+     ax.set_ylabel('Percentage (%)')
+     ax.set_xticks(x)
+     ax.set_xticklabels(emotions)
+     ax.legend()
+
+     plt.tight_layout()
+
+     # Save to a temporary file
+     comparison_path = tempfile.mktemp(suffix='.jpg')
+     plt.savefig(comparison_path)
+     plt.close()
+
+     return chart_path, comparison_path
+
+ def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
+     """Calculates a combined sentiment score from all modalities."""
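+     # Note: the face, posture, and audio models may use different label casing (e.g. "Neutral" vs "neutral"); differently-cased labels are counted as separate emotions here.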
+     # Define emotion categories and weights
+     modality_weights = {
+         "face": 0.4,
+         "posture": 0.2,
+         "audio": 0.4
+     }
+
+     # Map posture labels to emotional states for better combination
+     posture_emotion_mapping = {
+         "Confident": "Happy",
+         "Nervous": "Fearful",
+         "Defensive": "Angry",
+         "Serious": "Neutral",
+         "Attentive": "Neutral"
+     }
+
+     # Convert posture counts to emotion counts
+     posture_emotion_counts = {}
+     for posture, count in posture_counts.items():
+         emotion = posture_emotion_mapping.get(posture, "Neutral")
+         posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count
+
+     # Get all unique emotions across all modalities
+     all_emotions = set()
+     for counts in [face_emotion_counts, posture_emotion_counts, audio_emotion_counts]:
+         all_emotions.update(counts.keys())
+
+     # Calculate total frames/samples for each modality
+     face_total = sum(face_emotion_counts.values())
+     posture_total = sum(posture_counts.values())
+     audio_total = sum(audio_emotion_counts.values())
+
+     # Calculate weighted emotion scores
+     combined_scores = {}
+
+     for emotion in all_emotions:
+         # Get normalized scores from each modality (or 0 if not present)
+         face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
+         posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
+         audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0
+
+         # Calculate weighted score
+         weighted_score = (
+             face_score * modality_weights["face"] +
+             posture_score * modality_weights["posture"] +
+             audio_score * modality_weights["audio"]
+         )
+
+         combined_scores[emotion] = weighted_score
+
+     # Normalize to percentages
+     total_score = sum(combined_scores.values())
+     if total_score > 0:
+         for emotion in combined_scores:
+             combined_scores[emotion] = (combined_scores[emotion] / total_score) * 100
+
+     # Get the major emotion
+     major_emotion = max(combined_scores.items(), key=lambda x: x[1]) if combined_scores else ("Unknown", 0)
+
+     return combined_scores, major_emotion[0], major_emotion[1]
+
+ def process_video_for_gradio(video_path, progress=gr.Progress()):
+     """Processes the video for Gradio interface with progress updates."""
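+     # Only every `sample_rate`-th frame is analyzed and written out, so for long videos the annotated output covers a sampled subset of frames.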
+     # Extract audio first
+     progress(0.1, "Extracting audio from video...")
+     audio_path = extract_audio(video_path)
+
+     # Analyze audio emotions
+     progress(0.2, "Analyzing audio emotions...")
+     audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
+
+     # Process video frames
+     progress(0.3, "Starting video frame analysis...")
+     cap = cv2.VideoCapture(video_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30  # fall back to 30 fps if the property is unavailable
+     frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     # Create a temporary file for the output video
+     output_path = tempfile.mktemp(suffix='.mp4')
+     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
+
+     # Initialize counters
+     face_emotion_counts = {}
+     posture_counts = {}
+     total_frames = 0
+     frame_index = 0
+
+     # Get total frames for progress tracking
+     total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+     # For very long videos, we might want to sample frames
+     sample_rate = max(1, total_video_frames // 300)  # Process at most ~300 frames
+
+     # Calculate frames per audio segment
+     audio_segments = len(audio_emotions_sequence)
+     frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
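+     # Each analyzed frame is mapped to the 5-second audio chunk that roughly covers it.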
+     current_audio_index = 0
+
+     # Current audio emotion
+     current_audio_emotion = audio_emotions_sequence[0] if audio_emotions_sequence else "Unknown"
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         frame_index += 1
+
+         # Skip frames according to sample rate
+         if frame_index % sample_rate != 0:
+             continue
+
+         # Update progress
+         progress_value = 0.3 + (0.6 * frame_index / total_video_frames)
+         progress(progress_value, f"Processing frame {frame_index}/{total_video_frames}")
+
+         # Track the frame
+         total_frames += 1
+
+         # Update current audio emotion based on frame index
+         current_audio_index = min(frame_index // frames_per_audio, len(audio_emotions_sequence) - 1)
+         if current_audio_index >= 0 and current_audio_index < len(audio_emotions_sequence):
+             current_audio_emotion = audio_emotions_sequence[current_audio_index]
+
+         # Process the frame for face and posture
+         rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         result = pose.process(rgb_frame)
+
+         posture_label = "Unknown"
+         if result.pose_landmarks:
+             landmarks = result.pose_landmarks.landmark
+             try:
+                 shoulder = [landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].x, landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].y]
+                 hip = [landmarks[mp_pose.PoseLandmark.LEFT_HIP].x, landmarks[mp_pose.PoseLandmark.LEFT_HIP].y]
+                 knee = [landmarks[mp_pose.PoseLandmark.LEFT_KNEE].x, landmarks[mp_pose.PoseLandmark.LEFT_KNEE].y]
+                 ear = [landmarks[mp_pose.PoseLandmark.LEFT_EAR].x, landmarks[mp_pose.PoseLandmark.LEFT_EAR].y]
+
+                 back_angle = calculate_angle(shoulder, hip, knee)
+                 neck_angle = calculate_angle(ear, shoulder, hip)
+                 posture_label = classify_posture(back_angle, neck_angle)
+             except Exception:
+                 # If any landmark is missing, use default
+                 posture_label = "Unknown"
+
+         # Update posture counts
+         posture_counts[posture_label] = posture_counts.get(posture_label, 0) + 1
+
+         # Detect face emotion
+         try:
+             face_emotion = detect_emotions(frame)
+         except Exception as e:
+             face_emotion = "Neutral"
+             print(f"Face detection error: {e}")
+
+         # Update face emotion counts
+         face_emotion_counts[face_emotion] = face_emotion_counts.get(face_emotion, 0) + 1
+
+         # Calculate current major emotion
+         combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
+             face_emotion_counts, posture_counts, audio_emotion_counts
+         )
+
+         # Draw sentiment info on the frame
+         draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, current_audio_emotion, major_emotion, major_emotion_percent)
+
+         # Write the frame to output video
+         out.write(frame)
+
+     # Release resources
+     cap.release()
+     out.release()
+
+     # Generate charts
+     progress(0.9, "Generating emotion charts...")
+     chart_path, comparison_path = generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts)
+
+     # Clean up temporary audio file
+     try:
+         os.remove(audio_path)
+     except OSError:
+         pass
+
+     progress(1.0, "Analysis complete!")
+
+     # Prepare result summary
+     combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
+         face_emotion_counts, posture_counts, audio_emotion_counts
+     )
+
+     result_summary = f"""
+ # Video Sentiment Analysis Results
+
+ ## Overall Sentiment
+ The dominant emotion in this video is: **{major_emotion}** ({major_emotion_percent:.1f}%)
+
+ ## Emotion Distribution
+
+ ### Face Emotions:
+ {', '.join([f"{emotion}: {count}" for emotion, count in face_emotion_counts.items()])}
+
+ ### Posture Analysis:
+ {', '.join([f"{posture}: {count}" for posture, count in posture_counts.items()])}
+
+ ### Audio Emotions:
+ {', '.join([f"{emotion}: {count}" for emotion, count in audio_emotion_counts.items()])}
+
+ ### Combined Emotion Scores:
+ {', '.join([f"{emotion}: {score:.1f}%" for emotion, score in combined_scores.items()])}
+ """
+
+     return output_path, chart_path, comparison_path, result_summary
+ # Create Gradio interface
+ def create_gradio_interface():
+     with gr.Blocks(title="Multimodal Video Sentiment Analysis") as demo:
+         gr.Markdown("# 📹 Multimodal Video Sentiment Analysis")
+         gr.Markdown("""
+         This app analyzes videos for emotions using three modalities:
+         - 😊 **Facial Expressions**: Detects emotions from faces
+         - 🧍‍♂️ **Body Posture**: Identifies emotional cues from posture
+         - 🔊 **Audio Tone**: Analyzes voice for emotional content
+
+         Upload a video to see the combined analysis!
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 video_input = gr.Video(label="Upload Video")
+                 analyze_btn = gr.Button("Analyze Video", variant="primary")
+
+             with gr.Column(scale=2):
+                 with gr.Tabs():
+                     with gr.TabItem("Results Summary"):
+                         result_text = gr.Markdown(label="Analysis Results")
+
+                     with gr.TabItem("Processed Video"):
+                         video_output = gr.Video(label="Processed Video")
+
+                     with gr.TabItem("Emotion Charts"):
+                         chart_output = gr.Image(label="Emotion Distribution")
+                         comparison_output = gr.Image(label="Modality Comparison")
+
+         analyze_btn.click(
+             process_video_for_gradio,
+             inputs=[video_input],
+             outputs=[video_output, chart_output, comparison_output, result_text]
+         )
+
+         gr.Markdown("""
+         ## How it works
+
+         1. **Visual Analysis**: The app processes video frames to detect faces and body posture
+         2. **Audio Analysis**: The audio is extracted and analyzed for emotional tone
+         3. **Combined Analysis**: The results are weighted and combined for a holistic emotional assessment
+
+         The app uses pretrained models for each modality and combines their outputs using a weighted approach.
+         """)
+
+     return demo
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     demo = create_gradio_interface()
+     demo.launch()
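+     # Note: demo.launch(share=True) can be used instead to expose a temporary public link when running locally.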