Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,111 +1,164 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import cv2
|
3 |
import torch
|
|
|
|
|
|
|
4 |
import numpy as np
|
5 |
-
import
|
6 |
import matplotlib.pyplot as plt
|
7 |
-
import
|
8 |
-
from
|
9 |
-
from
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
24 |
}
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
return model.config.id2label[torch.argmax(probs).item()]
|
39 |
-
|
40 |
-
def process_video(input_path):
|
41 |
-
"""Processes video, overlays emotions, and creates a summary chart."""
|
42 |
-
cap = cv2.VideoCapture(input_path)
|
43 |
-
fps = int(cap.get(cv2.CAP_PROP_FPS))
|
44 |
-
frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
45 |
-
out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
|
46 |
-
|
47 |
-
emotion_counts = []
|
48 |
-
|
49 |
while cap.isOpened():
|
50 |
ret, frame = cap.read()
|
51 |
if not ret:
|
52 |
break
|
53 |
-
|
54 |
-
|
55 |
-
emotion_counts.append(emotion)
|
56 |
-
|
57 |
-
# Overlay emotion
|
58 |
-
overlay = frame.copy()
|
59 |
-
cv2.rectangle(overlay, (10, 10), (350, 80), (255, 255, 255), -1)
|
60 |
-
cv2.putText(overlay, f'Emotion: {emotion}', (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
|
61 |
-
cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
|
62 |
-
|
63 |
-
out.write(frame)
|
64 |
|
65 |
cap.release()
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
plt.figure(figsize=(5, 5))
|
75 |
-
labels,
|
76 |
-
plt.
|
77 |
-
plt.
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
}
|
93 |
-
"""
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
+
import torchaudio
|
3 |
+
import cv2
|
4 |
+
import librosa
|
5 |
import numpy as np
|
6 |
+
import gradio as gr
|
7 |
import matplotlib.pyplot as plt
|
8 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
|
9 |
+
from deepface import DeepFace
|
10 |
+
from moviepy.editor import VideoFileClip
|
11 |
+
|
12 |
+
# --- Pretrained models (loaded once at import time, pinned to CPU) ---
# Speech-to-text: Wav2Vec2 CTC checkpoint and its matching processor.
asr_model_name = "facebook/wav2vec2-large-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
asr_model = asr_model.to("cpu")

# Text emotion classifier and its tokenizer.
emotion_model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
emotion_model = emotion_model.to("cpu")

# Emotion categories, keyed by consecutive integer ids starting at 0.
emotion_labels = dict(enumerate((
    "Neutral", "Happy", "Sad", "Surprise", "Fear",
    "Disgust", "Anger", "Contempt",
)))
|
28 |
|
29 |
+
# --- Extract Audio from Video ---
|
30 |
+
def extract_audio(video_path, audio_output_path="temp_audio.wav"):
    """Write the audio track of a video to a 16-bit PCM WAV file.

    Args:
        video_path: Path of the video file to read.
        audio_output_path: Destination path for the extracted WAV file.

    Returns:
        ``audio_output_path``, once the file has been written.

    Raises:
        ValueError: If the video has no audio track.
    """
    # The context manager closes the clip even when writing fails, so the
    # underlying reader resources are not leaked between requests.
    with VideoFileClip(video_path) as video:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_output_path, codec="pcm_s16le")
    return audio_output_path
|
34 |
+
|
35 |
+
# --- Extract Frames for Facial & Posture Analysis ---
|
36 |
+
def extract_frames(video_path, interval=10):
    """Return every ``interval``-th frame of a video in RGB channel order.

    Args:
        video_path: Path of the video file to read.
        interval: Sampling step; frames 0, interval, 2 * interval, ... are
            kept. Must be a positive integer.

    Returns:
        List of the sampled frames, converted from BGR to RGB.

    Raises:
        ValueError: If ``interval`` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        index = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Convert and store only the sampled frames instead of keeping
            # the whole decoded video in memory and slicing afterwards.
            if index % interval == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            index += 1
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()
    return frames
|
49 |
+
|
50 |
+
# --- Normalize Emotion Percentages to 100% ---
|
51 |
+
def normalize_emotion_percentages(emotion_counts):
    """Convert raw emotion counts into percentages that add up to 100.

    Each share is rounded to one decimal place. Any rounding residual is
    added to the largest share (the first one on ties) so the total is 100.

    Args:
        emotion_counts: Mapping of emotion name to a non-negative count.

    Returns:
        A new dict with the same keys. Values are percentages rounded to one
        decimal, or ``0`` for every key when the counts sum to zero.
    """
    import logging

    logger = logging.getLogger(__name__)
    logger.debug("Raw emotion counts: %s", emotion_counts)

    total = sum(emotion_counts.values())
    if total <= 0:
        return {name: 0 for name in emotion_counts}

    normalized = {
        name: round((count / total) * 100, 1)
        for name, count in emotion_counts.items()
    }

    # Round the residual and the corrected share: plain float arithmetic
    # would yield values such as 33.39999999999999, and would also trigger
    # the adjustment for totals like 99.99999999999999.
    residual = round(100 - sum(normalized.values()), 1)
    if residual:
        top = max(normalized, key=normalized.get)
        normalized[top] = round(normalized[top] + residual, 1)

    logger.debug("Normalized emotion counts: %s", normalized)
    return normalized
|
68 |
+
|
69 |
+
# --- Facial Emotion Analysis ---
|
70 |
+
def analyze_facial_emotion(frames):
    """Count the dominant facial emotion per frame and return percentages.

    Args:
        frames: Iterable of image arrays to pass to ``DeepFace.analyze``.

    Returns:
        Dict mapping every name in ``emotion_labels`` to its share of the
        counted frames, as produced by ``normalize_emotion_percentages``.
    """
    import logging

    logger = logging.getLogger(__name__)

    # DeepFace reports "angry", which capitalizes to "Angry", while the
    # category table uses "Anger"; without this alias those frames are
    # silently dropped from the counts.
    aliases = {"Angry": "Anger"}
    emotion_counts = {label: 0 for label in emotion_labels.values()}

    for frame in frames:
        try:
            result = DeepFace.analyze(frame, actions=["emotion"], enforce_detection=False)
            detected_emotion = result[0]["dominant_emotion"].capitalize()
        except Exception:
            # Best effort: one bad frame must not abort the whole analysis,
            # but leave a trace instead of swallowing the error silently.
            logger.exception("Facial emotion analysis failed for a frame; skipping it")
            continue

        detected_emotion = aliases.get(detected_emotion, detected_emotion)
        logger.debug("Detected emotion: %s", detected_emotion)
        if detected_emotion in emotion_counts:
            emotion_counts[detected_emotion] += 1

    return normalize_emotion_percentages(emotion_counts)
|
84 |
+
|
85 |
+
# --- Speech-to-Text ---
|
86 |
+
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The decoded transcription, or an empty string when the file holds
        no samples.
    """
    sample_rate = 16000  # same rate is used for loading and for the processor

    speech, _ = librosa.load(audio_path, sr=sample_rate)
    if speech.size == 0:
        # Nothing to transcribe; skip the model call on an empty waveform.
        return ""

    input_values = asr_processor(
        speech, return_tensors="pt", sampling_rate=sample_rate
    ).input_values

    with torch.no_grad():
        logits = asr_model(input_values).logits

    # Greedy CTC decoding: pick the most likely token at every time step.
    predicted_ids = torch.argmax(logits, dim=-1)
    return asr_processor.batch_decode(predicted_ids)[0]
|
95 |
+
|
96 |
+
# --- Sentiment Analysis from Text ---
|
97 |
+
def analyze_audio_emotion(text):
    """Classify the emotion expressed in a piece of text.

    Args:
        text: The text to classify (here, the speech transcription).

    Returns:
        Tuple ``(predicted_emotion, probabilities)``: the capitalized label
        of the highest-scoring class, and the softmax probabilities of all
        classes as a list.
    """
    inputs = emotion_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

    # Decode the class id with the classifier's own label table. The
    # ``emotion_labels`` dict describes the facial categories and does not
    # line up with this model's output classes.
    predicted_id = torch.argmax(logits, dim=-1).item()
    predicted_emotion = emotion_model.config.id2label[predicted_id].capitalize()

    return predicted_emotion, probabilities
|
106 |
+
|
107 |
+
# --- Full Analysis Pipeline ---
|
108 |
+
def analyze_video(video_path):
    """Run the full audio + facial emotion pipeline on one video.

    Args:
        video_path: Path of the video file to analyze.

    Returns:
        A 5-tuple:
        ``(transcription, audio_emotion, final_emotion, label_scores,
        chart_path)`` where ``label_scores`` maps each facial emotion to its
        share as a fraction in [0, 1] and ``chart_path`` is the saved pie
        chart image.
    """
    import os
    import tempfile

    # Extract and transcribe the audio inside a private temporary directory
    # so the WAV file is cleaned up and parallel runs cannot overwrite each
    # other's audio.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = extract_audio(video_path, os.path.join(workdir, "audio.wav"))
        transcription = transcribe_audio(audio_path)
    audio_emotion, _ = analyze_audio_emotion(transcription)

    # Facial emotion analysis on the sampled frames.
    frames = extract_frames(video_path)
    facial_emotions = analyze_facial_emotion(frames)

    # The dominant facial emotion is reported as the overall result.
    final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"

    # gr.Label expects confidences between 0 and 1, while the facial shares
    # are percentages; scale them for display.
    label_scores = {name: share / 100 for name, share in facial_emotions.items()}

    # Pie chart: draw only non-zero shares (zero-width wedges just stack
    # labels on top of each other) and always close the figure, otherwise
    # matplotlib keeps one figure alive per request.
    chart_path = "emotion_pie_chart.png"
    visible = {name: share for name, share in facial_emotions.items() if share > 0}
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        if visible:
            ax.pie(
                list(visible.values()),
                labels=list(visible.keys()),
                autopct="%1.1f%%",
                colors=plt.cm.Paired.colors,
            )
        else:
            ax.text(0.5, 0.5, "No facial emotion detected", ha="center", va="center")
            ax.axis("off")
        ax.set_title("Facial Emotion Distribution")
        fig.savefig(chart_path)
    finally:
        plt.close(fig)

    return (
        transcription,
        audio_emotion,
        final_emotion,
        label_scores,
        chart_path,
    )
|
138 |
+
|
139 |
+
# --- Gradio UI ---
|
140 |
+
# Custom CSS passed to the interface via its ``css`` argument.
theme_css = """
body { font-family: Arial, sans-serif; background: #f4f4f4; }
.gradio-container { max-width: 800px; margin: auto; padding: 20px; background: white; border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
.gr-box { border-radius: 10px; padding: 15px; background: #fff; }
h1 { color: #333; text-align: center; }
"""

# One video input; the output widgets are listed in the same order as the
# tuple returned by analyze_video.
interface = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=[
        gr.Textbox(label="Transcribed Speech"),
        gr.Textbox(label="Predicted Audio Emotion"),
        gr.Textbox(label="Major Detected Emotion (Face + Posture)"),
        gr.Label(label="Facial Emotion Distribution"),
        gr.Image(label="Facial Emotion Pie Chart"),
    ],
    title="🎭 Multi-Modal Emotion Analysis",
    description=(
        "📌 Upload a video and get analyzed emotions from "
        "**facial expressions, posture, and voice** in one step.\n\n"
        "🚀 Features:\n"
        "- Facial Emotion Analysis\n"
        "- Audio-Based Sentiment Detection\n"
        "- Real-Time Processing\n"
        "- Visual Pie Chart Representation"
    ),
    theme="compact",
    css=theme_css,
)

# Start the web app as soon as the module is executed.
interface.launch()
|
164 |
+
|