Spaces:

Sagnik1750
/

emotion-analysis-ui

Sleeping

App Files Files Community

Sagnik1750 committed on Mar 6

Commit

b353d28

verified ·

1 Parent(s): a18c5a8

Update app.py

Browse files

Files changed (1) hide show

app.py +317 -154

app.py CHANGED Viewed

@@ -1,163 +1,326 @@
-import torch
-import torchaudio
 import cv2
-import librosa
 import numpy as np
-import gradio as gr
 import matplotlib.pyplot as plt
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
-from deepface import DeepFace
-from moviepy.editor import VideoFileClip
-# --- Load Pretrained Models ---
-# Speech-to-Text
-asr_model_name = "facebook/wav2vec2-large-960h"
-asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
-asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name).to("cpu")
-# Sentiment Analysis (Text)
-emotion_model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
-emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
-emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name).to("cpu")
-# Emotion Categories
-emotion_labels = {
-    0: "Neutral", 1: "Happy", 2: "Sad", 3: "Surprise", 4: "Fear",
-    5: "Disgust", 6: "Anger", 7: "Contempt"
-}
-# --- Extract Audio from Video ---
-def extract_audio(video_path, audio_output_path="temp_audio.wav"):
-    video = VideoFileClip(video_path)
-    video.audio.write_audiofile(audio_output_path, codec="pcm_s16le")
-    return audio_output_path
-# --- Extract Frames for Facial & Posture Analysis ---
-def extract_frames(video_path, interval=10):
-    cap = cv2.VideoCapture(video_path)
-    frames = []
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        frames.append(frame)
-    cap.release()
-    return frames[::interval]  # Process every nth frame
-# --- Normalize Emotion Percentages to 100% ---
-def normalize_emotion_percentages(emotion_counts):
-    print("Raw emotion counts:", emotion_counts)  # Debugging
-    total = sum(emotion_counts.values())
-    if total > 0:
-        normalized_counts = {k: round((v / total) * 100, 1) for k, v in emotion_counts.items()}
-        # Adjust the highest emotion to ensure total = 100%
-        total_after = sum(normalized_counts.values())
-        if total_after != 100:
-            diff = 100 - total_after
-            max_emotion = max(normalized_counts, key=normalized_counts.get)
-            normalized_counts[max_emotion] += diff
-        print("Normalized emotion counts:", normalized_counts)  # Debugging
-        return normalized_counts
-    else:
-        return {k: 0 for k in emotion_counts}
-# --- Facial Emotion Analysis ---
-def analyze_facial_emotion(frames):
-    emotion_counts = {key: 0 for key in emotion_labels.values()}
-    for frame in frames:
-        try:
-            result = DeepFace.analyze(frame, actions=["emotion"], enforce_detection=False)
-            detected_emotion = result[0]["dominant_emotion"].capitalize()
-            print("Detected emotion:", detected_emotion)  # Debugging
-            if detected_emotion in emotion_counts:
-                emotion_counts[detected_emotion] += 1
-        except Exception:
-            continue
-    return normalize_emotion_percentages(emotion_counts)
-# --- Speech-to-Text ---
-def transcribe_audio(audio_path):
-    speech, sr = librosa.load(audio_path, sr=16000)
-    input_values = asr_processor(speech, return_tensors="pt", sampling_rate=16000).input_values
-    with torch.no_grad():
-        logits = asr_model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return asr_processor.batch_decode(predicted_ids)[0]
-# --- Sentiment Analysis from Text ---
-def analyze_audio_emotion(text):
-    inputs = emotion_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
-    with torch.no_grad():
-        logits = emotion_model(**inputs).logits
-    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
-    predicted_emotion = emotion_labels[torch.argmax(logits, dim=-1).item()]
-    return predicted_emotion, probabilities
-# --- Full Analysis Pipeline ---
-def analyze_video(video_path):
-    # Extract Audio from Video
-    audio_path = extract_audio(video_path)
-    # Extract Frames for Facial & Posture Analysis
-    frames = extract_frames(video_path)
-    # Facial Emotion Analysis
-    facial_emotions = analyze_facial_emotion(frames)
-    # Audio Analysis
-    transcription = transcribe_audio(audio_path)
-    audio_emotion, audio_probabilities = analyze_audio_emotion(transcription)
-    # Combine Emotion Scores
-    final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"
-    # Display Emotion Pie Chart
-    plt.figure(figsize=(5, 5))
-    plt.pie(facial_emotions.values(), labels=facial_emotions.keys(), autopct="%1.1f%%", colors=plt.cm.Paired.colors)
-    plt.title("Facial Emotion Distribution")
-    plt.savefig("emotion_pie_chart.png")
-    return (
-        transcription,
-        audio_emotion,
-        final_emotion,
-        facial_emotions,
-        "emotion_pie_chart.png"
-    )
-# --- Gradio UI ---
-theme_css = """
-    body { font-family: Arial, sans-serif; background: #f4f4f4; }
-    .gradio-container { max-width: 800px; margin: auto; padding: 20px; background: white; border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
-    .gr-box { border-radius: 10px; padding: 15px; background: #fff; }
-    h1 { color: #333; text-align: center; }
-"""
-interface = gr.Interface(
-    fn=analyze_video,
-    inputs=gr.Video(),
-    outputs=[
-        gr.Textbox(label="Transcribed Speech"),
-        gr.Textbox(label="Predicted Audio Emotion"),
-        gr.Textbox(label="Major Detected Emotion (Face + Posture)"),
-        gr.Label(label="Facial Emotion Distribution"),
-        gr.Image(label="Facial Emotion Pie Chart"),
-    ],
-    title="🎭 Multi-Modal Emotion Analysis",
-    description="📌 Upload a video and get analyzed emotions from **facial expressions, posture, and voice** in one step.\n\n🚀 Features:\n- Facial Emotion Analysis\n- Audio-Based Sentiment Detection\n- Real-Time Processing\n- Visual Pie Chart Representation",
-    theme="compact",
-    css=theme_css
-)
-interface.launch()

 import cv2
+import mediapipe as mp
+import torch
 import numpy as np
 import matplotlib.pyplot as plt
+import seaborn as sns
+from facenet_pytorch import MTCNN
+from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoProcessor, AutoModelForAudioClassification
+from PIL import Image
+import moviepy.editor as moviepy
+import librosa
+import os
+# Initialize device
+# Use CUDA when torch reports it is available; otherwise fall back to the CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Initialize visual models
+# MediaPipe pose solution and a Pose instance built with its default options.
+mp_pose = mp.solutions.pose
+pose = mp_pose.Pose()
+# MTCNN face detector; detect_emotions() calls mtcnn.detect() to get face boxes.
+mtcnn = MTCNN(device=device)
+# Facial-expression classifier and its preprocessor, both loaded from the same checkpoint name.
+face_model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
+face_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
+# Initialize audio model
+# Speech-emotion classifier and its processor, both loaded from audio_model_name.
+audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+audio_processor = AutoProcessor.from_pretrained(audio_model_name)
+audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
+# Sample rate used both when loading audio with librosa and when calling audio_processor.
+audio_sampling_rate = 16000
+def calculate_angle(a, b, c):
+    """Return the angle, in degrees, at vertex ``b`` between rays ``b->a`` and ``b->c``.
+
+    Args:
+        a: Coordinates of the first end point (any sequence np.array accepts).
+        b: Coordinates of the vertex.
+        c: Coordinates of the second end point.
+
+    Returns:
+        The angle in degrees in the range [0, 180]. When ``a`` or ``c``
+        coincides with ``b`` the angle is undefined and NaN is returned.
+    """
+    a, b, c = np.array(a), np.array(b), np.array(c)
+    ba, bc = a - b, c - b
+    norm_product = np.linalg.norm(ba) * np.linalg.norm(bc)
+    if norm_product == 0:
+        # Degenerate ray: return NaN explicitly instead of dividing 0 by 0,
+        # which would yield the same NaN but also emit a RuntimeWarning.
+        return float("nan")
+    cosine_angle = np.dot(ba, bc) / norm_product
+    # Clip guards against rounding pushing the cosine slightly outside [-1, 1].
+    return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
+def detect_emotions(frame):
+    """Classify the facial expression of the first face detected in a frame.
+
+    Args:
+        frame: Image array in BGR channel order (it is converted to RGB
+            before detection).
+
+    Returns:
+        The ``face_model.config.id2label`` entry for the highest-scoring
+        class, or ``"Neutral"`` when no face is detected.
+    """
+    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    faces, _ = mtcnn.detect(img)
+    if faces is None or len(faces) == 0:
+        return "Neutral"  # Default to neutral if no face is detected
+    # Only the first detected box is classified.
+    face = img.crop(faces[0])
+    inputs = face_extractor(images=face, return_tensors="pt").to(device)
+    # Inference only: skip autograd bookkeeping, matching analyze_audio_emotion.
+    with torch.no_grad():
+        outputs = face_model(**inputs)
+    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    return face_model.config.id2label[torch.argmax(probs).item()]
+def classify_posture(back_angle, neck_angle):
+    """Translate back and neck angles into a posture label.
+
+    The rules are checked in order and the first match wins:
+    "Confident", "Nervous", "Defensive", "Serious", then "Attentive"
+    as the fallback.
+    """
+    if back_angle > 170 and neck_angle > 150:
+        return "Confident"
+    if back_angle < 160 and neck_angle < 140:
+        return "Nervous"
+    if back_angle < 150:
+        return "Defensive"
+    return "Serious" if neck_angle < 130 else "Attentive"
+def extract_audio(video_path):
+    """Extract the audio track of a video and save it as a WAV file.
+
+    Args:
+        video_path: Path of the video file to read.
+
+    Returns:
+        The path of the written WAV file (always ``"extracted_audio.wav"``).
+
+    Raises:
+        ValueError: If the video has no audio track.
+    """
+    audio_path = "extracted_audio.wav"
+    video = moviepy.VideoFileClip(video_path)
+    try:
+        if video.audio is None:
+            # Fail with a clear message instead of an AttributeError on None.
+            raise ValueError(f"No audio track found in video: {video_path}")
+        video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False)
+    finally:
+        # Release the clip's resources even when writing fails.
+        video.close()
+    return audio_path
+def analyze_audio_emotion(audio_path):
+    """Classify the emotion of an audio file in consecutive 5-second windows.
+
+    Returns a tuple ``(emotion_counts, audio_emotions)``: a dict mapping each
+    predicted label to how many windows received it, and the list of
+    predicted labels in window order. Windows shorter than one second of
+    samples are skipped.
+    """
+    waveform, _ = librosa.load(audio_path, sr=audio_sampling_rate)
+    window = audio_sampling_rate * 5  # samples per 5-second window
+    tally = {}
+    sequence = []
+    for start in range(0, len(waveform), window):
+        # Slicing clamps at the end of the array, so the last window may be short.
+        segment = waveform[start:start + window]
+        if len(segment) >= audio_sampling_rate:
+            features = audio_processor(segment, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
+            with torch.no_grad():
+                logits = audio_model(**features).logits
+            label = audio_model.config.id2label[torch.argmax(logits, dim=1).item()]
+            sequence.append(label)
+            tally[label] = tally.get(label, 0) + 1
+    return tally, sequence
+def map_emotion_labels(emotion, source="face"):
+    """Standardize a model label to the shared display vocabulary.
+
+    Args:
+        emotion: Raw label string produced by one of the models.
+        source: Mapping to apply: ``"face"``, ``"audio"`` or ``"posture"``.
+            Any other value returns ``emotion`` unchanged.
+
+    Returns:
+        The mapped label, or ``emotion`` unchanged when the chosen mapping
+        has no entry for it. Face and audio lookups are case-insensitive;
+        posture lookups are exact.
+    """
+    # Mapping dictionaries for different models
+    face_mapping = {
+        "happy": "Happy",
+        "sad": "Sad",
+        "angry": "Angry",
+        "surprise": "Surprised",
+        "fear": "Fearful",
+        "disgust": "Disgusted",
+        "neutral": "Neutral"
+    }
+    audio_mapping = {
+        "anger": "Angry",
+        "disgust": "Disgusted",
+        "fear": "Fearful",
+        "joy": "Happy",
+        "neutral": "Neutral",
+        "sadness": "Sad",
+        "surprise": "Surprised",
+        # Adjective-style spellings, which the configured audio checkpoint is
+        # documented to emit -- verify against audio_model.config.id2label.
+        # Without them audio labels pass through unmapped and never line up
+        # with the face labels.
+        "angry": "Angry",
+        "fearful": "Fearful",
+        "happy": "Happy",
+        "sad": "Sad",
+        "surprised": "Surprised",
+        "calm": "Calm"
+    }
+    posture_mapping = {
+        "Confident": "Confident",
+        "Nervous": "Nervous",
+        "Defensive": "Defensive",
+        "Serious": "Serious",
+        "Attentive": "Attentive"
+    }
+    if source == "face":
+        return face_mapping.get(emotion.lower(), emotion)
+    elif source == "audio":
+        return audio_mapping.get(emotion.lower(), emotion)
+    elif source == "posture":
+        return posture_mapping.get(emotion, emotion)
+    return emotion
+def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
+    """Overlay the per-modality labels and the major emotion on ``frame`` in place.
+
+    A filled white panel with five text lines is drawn on a copy of the
+    frame, and that copy is then blended back into ``frame`` (60% overlay,
+    40% original). Nothing is returned.
+    """
+    canvas = frame.copy()
+    cv2.rectangle(canvas, (10, 10), (450, 200), (255, 255, 255), -1)
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    reason_text = 'Weighted combination of face, posture, and audio analysis'
+    # (text, baseline y, font scale, color, thickness) for each line, top to bottom.
+    captions = [
+        (f'Face Emotion: {face_emotion}', 40, 0.6, (0, 0, 255), 2),
+        (f'Posture: {posture_label}', 70, 0.6, (0, 0, 255), 2),
+        (f'Audio Emotion: {audio_emotion}', 100, 0.6, (0, 0, 255), 2),
+        (f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', 130, 0.6, (255, 0, 0), 2),
+        (f'Analysis: {reason_text}', 160, 0.5, (0, 255, 0), 1),
+    ]
+    for text, y, scale, color, thickness in captions:
+        cv2.putText(canvas, text, (20, y), font, scale, color, thickness)
+    # Write the blend into ``frame`` itself so the caller's array is updated.
+    cv2.addWeighted(canvas, 0.6, frame, 0.4, 0, frame)
+def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
+    """Render the per-modality charts and save them as JPEG files.
+
+    Writes two files to the current working directory:
+    ``multimodal_emotion_analysis.jpg`` (three pie charts: face, posture,
+    audio) and ``emotion_comparison.jpg`` (face vs. audio percentages per
+    emotion as grouped bars).
+
+    Args:
+        face_emotion_counts: Dict mapping facial emotion label to count.
+        posture_counts: Dict mapping posture label to count.
+        audio_emotion_counts: Dict mapping audio emotion label to count.
+    """
+    # Pie charts: one panel per modality.
+    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
+    panels = [
+        (face_emotion_counts, 'Blues', "Facial Emotions"),
+        (posture_counts, 'Greens', "Posture Analysis"),
+        (audio_emotion_counts, 'Reds', "Audio Emotions"),
+    ]
+    for ax, (counts, palette, title) in zip(axs, panels):
+        # An empty modality is drawn as a single "None" slice.
+        labels, sizes = zip(*counts.items()) if counts else (["None"], [1])
+        ax.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette(palette))
+        ax.set_title(title)
+    fig.tight_layout()
+    fig.savefig("multimodal_emotion_analysis.jpg")
+    # Close this specific figure so repeated calls do not accumulate figures.
+    plt.close(fig)
+    # Combine all emotions across modalities; sorted for a stable x-axis order.
+    emotions = sorted(set(face_emotion_counts) | set(audio_emotion_counts))
+    face_values = [face_emotion_counts.get(e, 0) for e in emotions]
+    audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]
+    # Normalize to percentages; totals are computed once, not per element.
+    face_total = sum(face_values)
+    if face_total > 0:
+        face_values = [v / face_total * 100 for v in face_values]
+    audio_total = sum(audio_values)
+    if audio_total > 0:
+        audio_values = [v / audio_total * 100 for v in audio_values]
+    # Grouped bar chart: face and audio side by side for each emotion.
+    x = np.arange(len(emotions))
+    width = 0.35
+    fig, ax = plt.subplots(figsize=(14, 8))
+    ax.bar(x - width/2, face_values, width, label='Face')
+    ax.bar(x + width/2, audio_values, width, label='Audio')
+    ax.set_title('Emotion Distribution by Modality')
+    ax.set_xlabel('Emotions')
+    ax.set_ylabel('Percentage (%)')
+    ax.set_xticks(x)
+    ax.set_xticklabels(emotions)
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig("emotion_comparison.jpg")
+    plt.close(fig)
+def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
+    """Combine face, posture and audio counts into one weighted emotion score.
+
+    Each modality's counts are normalized to fractions of that modality's
+    total, weighted (face 0.4, posture 0.2, audio 0.4), summed per emotion
+    and rescaled so the scores add up to 100. Posture labels are first
+    translated to emotions (unknown postures count as "Neutral"). A modality
+    with no samples contributes nothing.
+
+    Args:
+        face_emotion_counts: Dict mapping facial emotion label to count.
+        posture_counts: Dict mapping posture label to count.
+        audio_emotion_counts: Dict mapping audio emotion label to count.
+
+    Returns:
+        A tuple ``(combined_scores, major_emotion, major_emotion_percent)``.
+        ``combined_scores`` maps emotion to percentage, with keys in sorted
+        order. When no emotion is present the result is
+        ``({}, "Unknown", 0)``. Ties for the major emotion go to the
+        alphabetically first label.
+    """
+    # Define emotion categories and weights
+    modality_weights = {
+        "face": 0.4,
+        "posture": 0.2,
+        "audio": 0.4
+    }
+    # Map posture labels to emotional states for better combination
+    posture_emotion_mapping = {
+        "Confident": "Happy",
+        "Nervous": "Fearful",
+        "Defensive": "Angry",
+        "Serious": "Neutral",
+        "Attentive": "Neutral"
+    }
+    # Convert posture counts to emotion counts
+    posture_emotion_counts = {}
+    for posture, count in posture_counts.items():
+        emotion = posture_emotion_mapping.get(posture, "Neutral")
+        posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count
+    # Sorted so that dict order and tie-breaking do not depend on set
+    # iteration order, which varies with the string hash seed between runs.
+    all_emotions = sorted(
+        set(face_emotion_counts) | set(posture_emotion_counts) | set(audio_emotion_counts)
+    )
+    # Calculate total frames/samples for each modality
+    face_total = sum(face_emotion_counts.values())
+    posture_total = sum(posture_counts.values())
+    audio_total = sum(audio_emotion_counts.values())
+    # Calculate weighted emotion scores
+    combined_scores = {}
+    for emotion in all_emotions:
+        # Normalized share within each modality (0 when the modality is empty)
+        face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
+        posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
+        audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0
+        combined_scores[emotion] = (
+            face_score * modality_weights["face"] +
+            posture_score * modality_weights["posture"] +
+            audio_score * modality_weights["audio"]
+        )
+    # Normalize to percentages
+    total_score = sum(combined_scores.values())
+    if total_score > 0:
+        combined_scores = {
+            emotion: (score / total_score) * 100
+            for emotion, score in combined_scores.items()
+        }
+    if not combined_scores:
+        return combined_scores, "Unknown", 0
+    # max() keeps the first maximum, i.e. the alphabetically first on a tie.
+    major_emotion = max(combined_scores, key=combined_scores.get)
+    return combined_scores, major_emotion, combined_scores[major_emotion]
+def process_video(input_path):
+    """Processes the video with multimodal sentiment analysis."""
+    # Extract audio first
+    print("Extracting audio from video...")
+    audio_path = extract_audio(input_path)
+    # Analyze audio emotions
+    print("Analyzing audio emotions...")
+    audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
+    # Process video frames
+    print("Processing video frames...")
+    cap = cv2.VideoCapture(input_path)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
+    # Initialize counters
+    face_emotion_counts = {}
+    posture_counts = {}
+    total_frames = 0
+    frame_index = 0
+    # Get total frames for progress tracking
+    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Calculate frames per audio segment
+    audio_segments = len(audio_emotions_sequence)
+    frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
+    current_audio_index = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
+        # Update progress
+        frame_index += 1
+        if frame_index % 30 == 0:  # Show progress every 30 frames
+            print(f"Processing frame {frame_index}/{total_video_frames}