Spaces:

Sagnik1750
/

emotion-analysis-ui

Sleeping

App Files Files Community

Sagnik1750 committed on Mar 5

Commit

69ca061

verified ·

1 Parent(s): 096df1d

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -99

app.py CHANGED Viewed

@@ -1,111 +1,164 @@
-import gradio as gr
-import cv2
 import torch
 import numpy as np
-import mediapipe as mp
 import matplotlib.pyplot as plt
-import seaborn as sns
-from facenet_pytorch import MTCNN
-from transformers import AutoFeatureExtractor, AutoModelForImageClassification
-from PIL import Image
-import os
-from collections import Counter
-# Load models
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-mtcnn = MTCNN(device=device)
-model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
-extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
-# Emotion labels
-affectnet_labels = {
-    0: "neutral", 1: "happy", 2: "sad", 3: "surprise", 4: "fear",
-    5: "disgust", 6: "anger", 7: "contempt"
 }
-def detect_emotions(frame):
-    """Detects facial emotions in a given frame."""
-    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-    faces, _ = mtcnn.detect(img)
-    if faces is None or len(faces) == 0:
-        return "No Face Detected"
-    face = img.crop(faces[0])
-    inputs = extractor(images=face, return_tensors="pt").to(device)
-    outputs = model(**inputs)
-    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-    return model.config.id2label[torch.argmax(probs).item()]
-def process_video(input_path):
-    """Processes video, overlays emotions, and creates a summary chart."""
-    cap = cv2.VideoCapture(input_path)
-    fps = int(cap.get(cv2.CAP_PROP_FPS))
-    frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
-    emotion_counts = []
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        emotion = detect_emotions(frame)
-        emotion_counts.append(emotion)
-        # Overlay emotion
-        overlay = frame.copy()
-        cv2.rectangle(overlay, (10, 10), (350, 80), (255, 255, 255), -1)
-        cv2.putText(overlay, f'Emotion: {emotion}', (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
-        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
-        out.write(frame)
     cap.release()
-    out.release()
-    cv2.destroyAllWindows()
-    # Find major emotion
-    emotion_counter = Counter(emotion_counts)
-    major_emotion = emotion_counter.most_common(1)[0][0] if emotion_counter else "No Face Detected"
-    # Generate emotion distribution pie chart
     plt.figure(figsize=(5, 5))
-    labels, sizes = zip(*emotion_counter.items())
-    plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
-    plt.title("Emotion Distribution")
-    plt.savefig("emotion_distribution.jpg")
-    return "output_video.mp4", plt, major_emotion
-# Gradio Web Interface
-with gr.Blocks(css="""
-    .gradio-container { max-width: 750px !important; margin: auto; background-color: #f8f9fa; padding: 20px; border-radius: 15px; }
-    .gradio-container h1 { font-size: 22px; text-align: center; color: #333; }
-    .gradio-container .gr-button { background-color: #007bff; color: white; border-radius: 10px; padding: 8px 15px; }
-    .gradio-container .gr-textbox { font-size: 16px; font-weight: bold; color: #007bff; }
-    .gradio-container .gr-file { border-radius: 10px; padding: 5px; }
-    @media screen and (max-width: 768px) {
-        .gradio-container { width: 100%; padding: 10px; }
-        .gradio-container h1 { font-size: 18px; }
-    }
-""") as demo:
-    gr.Markdown("# 🎭 Emotion Analysis from Video 🎥")
-    gr.Markdown("Upload a video, and the AI will detect emotions in each frame, providing a processed video, an emotion distribution chart, and the major detected emotion.")
-    with gr.Row():
-        video_input = gr.File(label="📤 Upload Video (MP4, MOV, AVI)")
-    with gr.Row():
-        process_button = gr.Button("🚀 Analyze")
-    with gr.Row():
-        video_output = gr.File(label="📥 Processed Video")
-        emotion_chart = gr.Plot(label="📊 Emotion Distribution Chart")
-    major_emotion_output = gr.Textbox(label="🔥 Major Emotion Detected", interactive=False)
-    process_button.click(fn=process_video, inputs=video_input, outputs=[video_output, emotion_chart, major_emotion_output])
-demo.launch()

 import torch
+import torchaudio
+import cv2
+import librosa
 import numpy as np
+import gradio as gr
 import matplotlib.pyplot as plt
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
+from deepface import DeepFace
+from moviepy.editor import VideoFileClip
# --- Load Pretrained Models ---
# Both models are loaded once at import time and placed on the CPU.

# Speech-to-Text: Wav2Vec2 CTC model with its matching processor.
asr_model_name = "facebook/wav2vec2-large-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
asr_model = asr_model.to("cpu")

# Sentiment Analysis (Text): sequence classifier with its matching tokenizer.
emotion_model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
emotion_model = emotion_model.to("cpu")
# Emotion Categories
# Index -> display name; the position in the tuple is the category index.
emotion_labels = dict(
    enumerate(
        (
            "Neutral",
            "Happy",
            "Sad",
            "Surprise",
            "Fear",
            "Disgust",
            "Anger",
            "Contempt",
        )
    )
)
# --- Extract Audio from Video ---
def extract_audio(video_path, audio_output_path="temp_audio.wav"):
    """Write the audio track of a video to a 16-bit PCM WAV file.

    Args:
        video_path: Path of the video file to read.
        audio_output_path: Destination path for the WAV file.

    Returns:
        ``audio_output_path``, for convenient chaining.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_output_path, codec="pcm_s16le")
    finally:
        # Release the underlying reader processes/file handles in every case.
        video.close()
    return audio_output_path
# --- Extract Frames for Facial & Posture Analysis ---
def extract_frames(video_path, interval=10):
    """Return every ``interval``-th frame of a video as an RGB array.

    Only the sampled frames are colour-converted and kept, so memory use
    scales with the number of returned frames rather than with the full
    length of the video.

    Args:
        video_path: Path of the video file to read.
        interval: Sampling step; frame 0, ``interval``, ``2 * interval``, ...
            are returned. Must be at least 1.

    Returns:
        A list of frames converted from OpenCV's BGR order to RGB.

    Raises:
        ValueError: If ``interval`` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_index = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % interval == 0:  # Process every nth frame
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_index += 1
    finally:
        # Always free the capture handle, even if decoding raised.
        cap.release()
    return frames
# --- Normalize Emotion Percentages to 100% ---
def normalize_emotion_percentages(emotion_counts):
    """Convert raw emotion counts into percentages that sum to 100.

    Each share is rounded to one decimal place. Any rounding residue is
    added to the largest share (the first one on ties) so the total is 100,
    and the adjusted value is rounded again to avoid floating-point
    artifacts such as ``33.39999999999999``.

    Args:
        emotion_counts: Mapping of emotion name to a non-negative count.

    Returns:
        A new dict with the same keys. Values are percentages rounded to one
        decimal place, or ``0`` for every key when the counts sum to zero.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)
    log.debug("Raw emotion counts: %s", emotion_counts)

    total = sum(emotion_counts.values())
    if total <= 0:
        return {k: 0 for k in emotion_counts}

    normalized_counts = {
        k: round((v / total) * 100, 1) for k, v in emotion_counts.items()
    }

    # Residue is a multiple of 0.1 in exact arithmetic; rounding removes the
    # float noise so that "no residue" is reliably detected as 0.0.
    residual = round(100 - sum(normalized_counts.values()), 1)
    if residual:
        max_emotion = max(normalized_counts, key=normalized_counts.get)
        normalized_counts[max_emotion] = round(
            normalized_counts[max_emotion] + residual, 1
        )

    log.debug("Normalized emotion counts: %s", normalized_counts)
    return normalized_counts
# --- Facial Emotion Analysis ---
def analyze_facial_emotion(frames):
    """Tally the dominant facial emotion per frame and return percentages.

    Args:
        frames: Iterable of RGB image arrays (as produced by
            ``extract_frames``).

    Returns:
        Dict mapping every name in ``emotion_labels`` to its share of the
        analysed frames, as returned by ``normalize_emotion_percentages``.
        Frames whose analysis fails, or whose emotion is not in the label
        table, do not contribute.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    # DeepFace names this emotion "angry"; the label table calls it "Anger".
    # Without the alias every angry frame would be dropped from the tally.
    aliases = {"Angry": "Anger"}

    emotion_counts = {key: 0 for key in emotion_labels.values()}
    for frame in frames:
        try:
            # DeepFace documents numpy inputs as BGR, so undo the RGB
            # conversion applied when the frames were extracted.
            result = DeepFace.analyze(
                cv2.cvtColor(frame, cv2.COLOR_RGB2BGR),
                actions=["emotion"],
                enforce_detection=False,
            )
        except Exception as exc:  # best-effort: one bad frame must not abort
            log.warning("Facial analysis failed for a frame: %s", exc)
            continue

        # Newer DeepFace versions return a list (one entry per face), older
        # ones a single dict; accept both.
        faces = [result] if isinstance(result, dict) else result
        if not faces:
            continue

        detected_emotion = str(faces[0].get("dominant_emotion", "")).capitalize()
        detected_emotion = aliases.get(detected_emotion, detected_emotion)
        log.debug("Detected emotion: %s", detected_emotion)
        if detected_emotion in emotion_counts:
            emotion_counts[detected_emotion] += 1
    return normalize_emotion_percentages(emotion_counts)
# --- Speech-to-Text ---
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    The audio is loaded at 16 kHz, passed through the model in a single
    forward pass, and greedily decoded (arg-max per time step).

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcript, or an empty string when the file contains
        no samples.
    """
    speech, _ = librosa.load(audio_path, sr=16000)
    if speech.size == 0:
        # Nothing to feed the model; an empty input would make it raise.
        return ""

    input_values = asr_processor(
        speech, return_tensors="pt", sampling_rate=16000
    ).input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return asr_processor.batch_decode(predicted_ids)[0]
# --- Sentiment Analysis from Text ---
def analyze_audio_emotion(text):
    """Classify the emotion expressed in a piece of transcribed text.

    Args:
        text: Transcript to classify.

    Returns:
        A ``(predicted_emotion, probabilities)`` tuple: the capitalised name
        of the highest-scoring class and the softmax probabilities over all
        classes of the text model.
    """
    inputs = emotion_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

    # Decode with the classifier's own label table. The facial
    # ``emotion_labels`` mapping belongs to a different label set and would
    # assign the wrong name to every class index.
    predicted_index = torch.argmax(logits, dim=-1).item()
    predicted_emotion = str(emotion_model.config.id2label[predicted_index]).capitalize()
    return predicted_emotion, probabilities
# --- Full Analysis Pipeline ---
def analyze_video(video_path):
    """Run the audio and facial analyses on a video and build the UI outputs.

    Args:
        video_path: Path of the uploaded video.

    Returns:
        A 5-tuple, in the order expected by the Gradio interface:
        transcript, emotion predicted from the transcript, most frequent
        facial emotion, facial emotion distribution as 0-1 confidences
        (the scale ``gr.Label`` displays as percentages), and the path of
        the saved pie chart image.

    Raises:
        ValueError: If no video path is supplied.
    """
    if not video_path:
        raise ValueError("No video was provided for analysis.")

    # Extract Audio from Video
    audio_path = extract_audio(video_path)
    # Extract Frames for Facial & Posture Analysis
    frames = extract_frames(video_path)
    # Facial Emotion Analysis
    facial_emotions = analyze_facial_emotion(frames)
    # Audio Analysis
    transcription = transcribe_audio(audio_path)
    audio_emotion, _ = analyze_audio_emotion(transcription)
    # Combine Emotion Scores
    final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"

    # Display Emotion Pie Chart
    chart_path = "emotion_pie_chart.png"
    # Zero-sized wedges only add overlapping "0.0%" labels, and an all-zero
    # distribution cannot be normalised by pie(), so plot non-zero shares only.
    visible = {name: share for name, share in facial_emotions.items() if share > 0}
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        if visible:
            ax.pie(
                list(visible.values()),
                labels=list(visible.keys()),
                autopct="%1.1f%%",
                colors=plt.cm.Paired.colors,
            )
        else:
            ax.text(0.5, 0.5, "No facial emotion detected", ha="center", va="center")
            ax.axis("off")
        ax.set_title("Facial Emotion Distribution")
        fig.savefig(chart_path)
    finally:
        # Close the figure so repeated requests do not accumulate figures.
        plt.close(fig)

    # gr.Label treats dict values as confidences in [0, 1].
    label_confidences = {name: share / 100 for name, share in facial_emotions.items()}

    return (
        transcription,
        audio_emotion,
        final_emotion,
        label_confidences,
        chart_path,
    )
# --- Gradio UI ---
# Custom stylesheet passed to the interface below.
theme_css = """
    body { font-family: Arial, sans-serif; background: #f4f4f4; }
    .gradio-container { max-width: 800px; margin: auto; padding: 20px; background: white; border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
    .gr-box { border-radius: 10px; padding: 15px; background: #fff; }
    h1 { color: #333; text-align: center; }
"""

# Output widgets, in the same order as the tuple returned by analyze_video.
_output_components = [
    gr.Textbox(label="Transcribed Speech"),
    gr.Textbox(label="Predicted Audio Emotion"),
    gr.Textbox(label="Major Detected Emotion (Face + Posture)"),
    gr.Label(label="Facial Emotion Distribution"),
    gr.Image(label="Facial Emotion Pie Chart"),
]

_interface_title = "🎭 Multi-Modal Emotion Analysis"
_interface_description = "📌 Upload a video and get analyzed emotions from **facial expressions, posture, and voice** in one step.\n\n🚀 Features:\n- Facial Emotion Analysis\n- Audio-Based Sentiment Detection\n- Real-Time Processing\n- Visual Pie Chart Representation"

interface = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=_output_components,
    title=_interface_title,
    description=_interface_description,
    theme="compact",
    css=theme_css,
)

# Start the web app as soon as the module is executed.
interface.launch()