Update app.py
app.py
CHANGED
@@ -1,483 +1,111 @@
Old version (lines removed in this commit are prefixed with "-"):

 import cv2
-import mediapipe as mp
 import torch
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from facenet_pytorch import MTCNN
-from transformers import AutoFeatureExtractor, AutoModelForImageClassification
 from PIL import Image
-import moviepy.editor as moviepy
-import librosa
 import os
-
-import tempfile

-#
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-print(f"Using device: {device}")
-
-# Initialize visual models
-mp_pose = mp.solutions.pose
-pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)
 mtcnn = MTCNN(device=device)
-
-
-
-# Initialize audio model
-audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
-audio_processor = AutoFeatureExtractor.from_pretrained(audio_model_name)
-audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
-audio_sampling_rate = 16000
-import os
-
-def analyze_video(video_path):
-    if not video_path:
-        return "Error: No video input received!"
-
-    # Check file size (limit to ~50MB for Hugging Face Spaces)
-    max_size_mb = 50
-    file_size_mb = os.path.getsize(video_path) / (1024 * 1024)
-
-    if file_size_mb > max_size_mb:
-        return f"Error: File size ({file_size_mb:.2f}MB) exceeds the {max_size_mb}MB limit."

-
-
-
-
-
-    audio_emotion, _ = analyze_audio_emotion(transcription)
-    final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"
-
-    return transcription, audio_emotion, final_emotion, facial_emotions, "emotion_pie_chart.png"
-
-
-def calculate_angle(a, b, c):
-    """Calculates the angle between three points."""
-    a, b, c = np.array(a), np.array(b), np.array(c)
-    ba, bc = a - b, c - b
-    cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
-    return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))

 def detect_emotions(frame):
     """Detects facial emotions in a given frame."""
     img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
     faces, _ = mtcnn.detect(img)
-
     if faces is None or len(faces) == 0:
-        return "
-
-    face = img.crop((faces[0][0], faces[0][1], faces[0][2], faces[0][3]))
-    inputs = face_extractor(images=face, return_tensors="pt").to(device)
-    outputs = face_model(**inputs)
-    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-    return face_model.config.id2label[torch.argmax(probs).item()]
-
-def classify_posture(back_angle, neck_angle):
-    """Classifies posture based on back and neck angles."""
-    if back_angle > 170 and neck_angle > 150:
-        return "Confident"
-    elif back_angle < 160 and neck_angle < 140:
-        return "Nervous"
-    elif back_angle < 150:
-        return "Defensive"
-    elif neck_angle < 130:
-        return "Serious"
-    else:
-        return "Attentive"
-
-def extract_audio(video_path):
-    """Extracts audio from video file and saves it as WAV."""
-    audio_path = tempfile.mktemp(suffix='.wav')
-    video = moviepy.VideoFileClip(video_path)
-    video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
-    return audio_path
-
-def analyze_audio_emotion(audio_path):
-    """Analyzes emotion from audio file and returns emotion counts."""
-    # Load audio
-    y, sr = librosa.load(audio_path, sr=audio_sampling_rate)
-
-    # Process audio in chunks to avoid memory issues
-    chunk_length = audio_sampling_rate * 5  # 5 seconds
-    emotion_counts = {}
-    audio_emotions = []
-
-    # Process audio in chunks
-    for i in range(0, len(y), chunk_length):
-        chunk = y[i:min(i+chunk_length, len(y))]
-
-        # Skip chunks that are too short
-        if len(chunk) < audio_sampling_rate:
-            continue
-
-        # Process audio with the model
-        inputs = audio_processor(chunk, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
-        with torch.no_grad():
-            outputs = audio_model(**inputs)
-
-        # Get prediction
-        predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
-        emotion = audio_model.config.id2label[predicted_class_id]
-        audio_emotions.append(emotion)
-        emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
-
-    return emotion_counts, audio_emotions
-
-def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
-    """Draws multimodal emotion and posture sentiment on the frame."""
-    overlay = frame.copy()
-    cv2.rectangle(overlay, (10, 10), (450, 200), (255, 255, 255), -1)
-
-    # Display current emotions
-    cv2.putText(overlay, f'Face Emotion: {face_emotion}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-    cv2.putText(overlay, f'Posture: {posture_label}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-    cv2.putText(overlay, f'Audio Emotion: {audio_emotion}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-
-    # Display major emotion
-    cv2.putText(overlay, f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
-
-    # Add explanation
-    reason_text = 'Weighted combination of face, posture, and audio analysis'
-    cv2.putText(overlay, f'Analysis: {reason_text}', (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-
-    # Blend overlay with original frame
-    cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
-
-def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
-    """Generates charts for all emotion modalities."""
-    # Create a figure with 3 subplots
-    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
-
-    # Face emotions pie chart
-    labels, sizes = zip(*face_emotion_counts.items()) if face_emotion_counts else (["None"], [1])
-    axs[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Blues'))
-    axs[0].set_title("Facial Emotions")
-
-    # Posture pie chart
-    labels, sizes = zip(*posture_counts.items()) if posture_counts else (["None"], [1])
-    axs[1].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Greens'))
-    axs[1].set_title("Posture Analysis")
-
-    # Audio emotions pie chart
-    labels, sizes = zip(*audio_emotion_counts.items()) if audio_emotion_counts else (["None"], [1])
-    axs[2].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Reds'))
-    axs[2].set_title("Audio Emotions")
-
-    plt.tight_layout()
-
-    # Save to a temporary file
-    chart_path = tempfile.mktemp(suffix='.jpg')
-    plt.savefig(chart_path)
-    plt.close()
-
-    # Create combined emotions bar chart
-    plt.figure(figsize=(12, 6))
-
-    # Combine all emotions across modalities
-    all_emotions = set()
-    for counts in [face_emotion_counts, audio_emotion_counts]:
-        all_emotions.update(counts.keys())
-
-    # Prepare data for each emotion across modalities
-    emotions = list(all_emotions)
-    face_values = [face_emotion_counts.get(e, 0) for e in emotions]
-    audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]
-
-    # Normalize values
-    if sum(face_values) > 0:
-        face_values = [v/sum(face_values)*100 for v in face_values]
-    if sum(audio_values) > 0:
-        audio_values = [v/sum(audio_values)*100 for v in audio_values]
-
-    # Create bar chart
-    x = np.arange(len(emotions))
-    width = 0.35
-
-    fig, ax = plt.subplots(figsize=(14, 8))
-    ax.bar(x - width/2, face_values, width, label='Face')
-    ax.bar(x + width/2, audio_values, width, label='Audio')
-
-    ax.set_title('Emotion Distribution by Modality')
-    ax.set_xlabel('Emotions')
-    ax.set_ylabel('Percentage (%)')
-    ax.set_xticks(x)
-    ax.set_xticklabels(emotions)
-    ax.legend()

-
-
-
-
-    plt.savefig(comparison_path)
-    plt.close()
-
-    return chart_path, comparison_path
-
-def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
-    """Calculates a combined sentiment score from all modalities."""
-    # Define emotion categories and weights
-    modality_weights = {
-        "face": 0.4,
-        "posture": 0.2,
-        "audio": 0.4
-    }
-
-    # Map posture labels to emotional states for better combination
-    posture_emotion_mapping = {
-        "Confident": "Happy",
-        "Nervous": "Fearful",
-        "Defensive": "Angry",
-        "Serious": "Neutral",
-        "Attentive": "Neutral"
-    }
-
-    # Convert posture counts to emotion counts
-    posture_emotion_counts = {}
-    for posture, count in posture_counts.items():
-        emotion = posture_emotion_mapping.get(posture, "Neutral")
-        posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count
-
-    # Get all unique emotions across all modalities
-    all_emotions = set()
-    for counts in [face_emotion_counts, posture_emotion_counts, audio_emotion_counts]:
-        all_emotions.update(counts.keys())
-
-    # Calculate total frames/samples for each modality
-    face_total = sum(face_emotion_counts.values())
-    posture_total = sum(posture_counts.values())
-    audio_total = sum(audio_emotion_counts.values())
-
-    # Calculate weighted emotion scores
-    combined_scores = {}
-
-    for emotion in all_emotions:
-        # Get normalized scores from each modality (or 0 if not present)
-        face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
-        posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
-        audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0
-
-        # Calculate weighted score
-        weighted_score = (
-            face_score * modality_weights["face"] +
-            posture_score * modality_weights["posture"] +
-            audio_score * modality_weights["audio"]
-        )
-
-        combined_scores[emotion] = weighted_score
-
-    # Normalize to percentages
-    total_score = sum(combined_scores.values())
-    if total_score > 0:
-        for emotion in combined_scores:
-            combined_scores[emotion] = (combined_scores[emotion] / total_score) * 100
-
-    # Get the major emotion
-    major_emotion = max(combined_scores.items(), key=lambda x: x[1]) if combined_scores else ("Unknown", 0)

-    return

-def
-    """Processes
-
-    progress(0.1, "Extracting audio from video...")
-    audio_path = extract_audio(video_path)
-
-    # Analyze audio emotions
-    progress(0.2, "Analyzing audio emotions...")
-    audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
-
-    # Process video frames
-    progress(0.3, "Starting video frame analysis...")
-    cap = cv2.VideoCapture(video_path)
     fps = int(cap.get(cv2.CAP_PROP_FPS))
     frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-    # Create a temporary file for the output video
-    output_path = tempfile.mktemp(suffix='.mp4')
-    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
-
-    # Initialize counters
-    face_emotion_counts = {}
-    posture_counts = {}
-    total_frames = 0
-    frame_index = 0

-
-    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    # For very long videos, we might want to sample frames
-    sample_rate = max(1, total_video_frames // 300)  # Process at most ~300 frames
-
-    # Calculate frames per audio segment
-    audio_segments = len(audio_emotions_sequence)
-    frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
-    current_audio_index = 0

-    # Current audio emotion
-    current_audio_emotion = audio_emotions_sequence[0] if audio_emotions_sequence else "Unknown"
-
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break

-
-
-
-
-
-
-
-
-
-
-        # Track the frame
-        total_frames += 1
-
-        # Update current audio emotion based on frame index
-        current_audio_index = min(frame_index // frames_per_audio, len(audio_emotions_sequence) - 1)
-        if current_audio_index >= 0 and current_audio_index < len(audio_emotions_sequence):
-            current_audio_emotion = audio_emotions_sequence[current_audio_index]
-
-        # Process the frame for face and posture
-        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        result = pose.process(rgb_frame)
-
-        posture_label = "Unknown"
-        if result.pose_landmarks:
-            landmarks = result.pose_landmarks.landmark
-            try:
-                shoulder = [landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].x, landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].y]
-                hip = [landmarks[mp_pose.PoseLandmark.LEFT_HIP].x, landmarks[mp_pose.PoseLandmark.LEFT_HIP].y]
-                knee = [landmarks[mp_pose.PoseLandmark.LEFT_KNEE].x, landmarks[mp_pose.PoseLandmark.LEFT_KNEE].y]
-                ear = [landmarks[mp_pose.PoseLandmark.LEFT_EAR].x, landmarks[mp_pose.PoseLandmark.LEFT_EAR].y]
-
-                back_angle = calculate_angle(shoulder, hip, knee)
-                neck_angle = calculate_angle(ear, shoulder, hip)
-                posture_label = classify_posture(back_angle, neck_angle)
-            except:
-                # If any landmark is missing, use default
-                posture_label = "Unknown"
-
-        # Update posture counts
-        posture_counts[posture_label] = posture_counts.get(posture_label, 0) + 1
-
-        # Detect face emotion
-        try:
-            face_emotion = detect_emotions(frame)
-        except Exception as e:
-            face_emotion = "Neutral"
-            print(f"Face detection error: {e}")
-
-        # Update face emotion counts
-        face_emotion_counts[face_emotion] = face_emotion_counts.get(face_emotion, 0) + 1
-
-        # Calculate current major emotion
-        combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
-            face_emotion_counts, posture_counts, audio_emotion_counts
-        )
-
-        # Draw sentiment info on the frame
-        draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, current_audio_emotion, major_emotion, major_emotion_percent)
-
-        # Write the frame to output video
         out.write(frame)
-
-    # Release resources
     cap.release()
     out.release()

-
-
-    chart_path, comparison_path = generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts)
-
-    # Clean up temporary audio file
-    try:
-        os.remove(audio_path)
-    except:
-        pass
-
-    progress(1.0, "Analysis complete!")
-
-    # Prepare result summary
-    combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
-        face_emotion_counts, posture_counts, audio_emotion_counts
-    )
-
-    result_summary = f"""
-    # Video Sentiment Analysis Results
-
-    ## Overall Sentiment
-    The dominant emotion in this video is: **{major_emotion}** ({major_emotion_percent:.1f}%)
-
-    ## Emotion Distribution
-
-    ### Face Emotions:
-    {', '.join([f"{emotion}: {count}" for emotion, count in face_emotion_counts.items()])}
-
-    ### Posture Analysis:
-    {', '.join([f"{posture}: {count}" for posture, count in posture_counts.items()])}
-
-    ### Audio Emotions:
-    {', '.join([f"{emotion}: {count}" for emotion, count in audio_emotion_counts.items()])}
-
-    ### Combined Emotion Scores:
-    {', '.join([f"{emotion}: {score:.1f}%" for emotion, score in combined_scores.items()])}
-    """
-
-    return output_path, chart_path, comparison_path, result_summary

-
-
-
-    gr.Markdown("# Multimodal Video Sentiment Analysis")
-    gr.Markdown("""
-    This app analyzes videos for emotions using three modalities:
-    - **Facial Expressions**: Detects emotions from faces
-    - **Body Posture**: Identifies emotional cues from posture
-    - **Audio Tone**: Analyzes voice for emotional content
-
-    Upload a video to see the combined analysis!
-    """)
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            video_input = gr.Video(label="Upload Video")
-            analyze_btn = gr.Button("Analyze Video", variant="primary")
-
-        with gr.Column(scale=2):
-            with gr.Tabs():
-                with gr.TabItem("Results Summary"):
-                    result_text = gr.Markdown(label="Analysis Results")
-
-                with gr.TabItem("Processed Video"):
-                    video_output = gr.Video(label="Processed Video")
-
-                with gr.TabItem("Emotion Charts"):
-                    chart_output = gr.Image(label="Emotion Distribution")
-                    comparison_output = gr.Image(label="Modality Comparison")
-
-    analyze_btn.click(
-        process_video_for_gradio,
-        inputs=[video_input],
-        outputs=[video_output, chart_output, comparison_output, result_text]
-    )
-
-    gr.Markdown("""
-    ## How it works
-
-    1. **Visual Analysis**: The app processes video frames to detect faces and body posture
-    2. **Audio Analysis**: The audio is extracted and analyzed for emotional tone
-    3. **Combined Analysis**: The results are weighted and combined for a holistic emotional assessment
-
-    The app uses pretrained models for each modality and combines their outputs using a weighted approach.
-    """)

-

-
-if __name__ == "__main__":
-    demo = create_gradio_interface()
-    demo.launch()
New version (lines added in this commit are prefixed with "+"):

+import gradio as gr
 import cv2
 import torch
 import numpy as np
+import mediapipe as mp
 import matplotlib.pyplot as plt
 import seaborn as sns
 from facenet_pytorch import MTCNN
+from transformers import AutoFeatureExtractor, AutoModelForImageClassification
 from PIL import Image
 import os
+from collections import Counter

+# Load models
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 mtcnn = MTCNN(device=device)
+model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
+extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")

+# Emotion labels
+affectnet_labels = {
+    0: "neutral", 1: "happy", 2: "sad", 3: "surprise", 4: "fear",
+    5: "disgust", 6: "anger", 7: "contempt"
+}

 def detect_emotions(frame):
     """Detects facial emotions in a given frame."""
     img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
     faces, _ = mtcnn.detect(img)
     if faces is None or len(faces) == 0:
+        return "No Face Detected"

+    face = img.crop(faces[0])
+    inputs = extractor(images=face, return_tensors="pt").to(device)
+    outputs = model(**inputs)
+    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

+    return model.config.id2label[torch.argmax(probs).item()]

+def process_video(input_path):
+    """Processes video, overlays emotions, and creates a summary chart."""
+    cap = cv2.VideoCapture(input_path)
     fps = int(cap.get(cv2.CAP_PROP_FPS))
     frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

+    emotion_counts = []

     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break

+        emotion = detect_emotions(frame)
+        emotion_counts.append(emotion)
+
+        # Overlay emotion
+        overlay = frame.copy()
+        cv2.rectangle(overlay, (10, 10), (350, 80), (255, 255, 255), -1)
+        cv2.putText(overlay, f'Emotion: {emotion}', (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
+        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
+
         out.write(frame)
+
     cap.release()
     out.release()
+    cv2.destroyAllWindows()
+
+    # Find major emotion
+    emotion_counter = Counter(emotion_counts)
+    major_emotion = emotion_counter.most_common(1)[0][0] if emotion_counter else "No Face Detected"
+
+    # Generate emotion distribution pie chart
+    plt.figure(figsize=(5, 5))
+    labels, sizes = zip(*emotion_counter.items())
+    plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
+    plt.title("Emotion Distribution")
+    plt.savefig("emotion_distribution.jpg")
+
+    return "output_video.mp4", plt, major_emotion
+
+# Gradio Web Interface
+with gr.Blocks(css="""
+.gradio-container { max-width: 750px !important; margin: auto; background-color: #f8f9fa; padding: 20px; border-radius: 15px; }
+.gradio-container h1 { font-size: 22px; text-align: center; color: #333; }
+.gradio-container .gr-button { background-color: #007bff; color: white; border-radius: 10px; padding: 8px 15px; }
+.gradio-container .gr-textbox { font-size: 16px; font-weight: bold; color: #007bff; }
+.gradio-container .gr-file { border-radius: 10px; padding: 5px; }
+@media screen and (max-width: 768px) {
+    .gradio-container { width: 100%; padding: 10px; }
+    .gradio-container h1 { font-size: 18px; }
+}
+""") as demo:
+    gr.Markdown("# Emotion Analysis from Video")
+    gr.Markdown("Upload a video, and the AI will detect emotions in each frame, providing a processed video, an emotion distribution chart, and the major detected emotion.")
+
+    with gr.Row():
+        video_input = gr.File(label="Upload Video (MP4, MOV, AVI)")

+    with gr.Row():
+        process_button = gr.Button("Analyze")

+    with gr.Row():
+        video_output = gr.File(label="Processed Video")
+        emotion_chart = gr.Plot(label="Emotion Distribution Chart")

+    major_emotion_output = gr.Textbox(label="Major Emotion Detected", interactive=False)
+
+    process_button.click(fn=process_video, inputs=video_input, outputs=[video_output, emotion_chart, major_emotion_output])

+demo.launch()
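One detail worth flagging in the new process_video: the second value it returns to gr.Plot is the plt module rather than a Figure object, and zip(*emotion_counter.items()) raises a ValueError if no frames were read. A minimal sketch of building the chart as an explicit Figure instead, assuming the same Gradio wiring (build_emotion_chart is a hypothetical helper, not part of this commit):

import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

def build_emotion_chart(emotion_labels):
    """Builds the pie chart as a Figure; gr.Plot can render a Matplotlib Figure directly."""
    counter = Counter(emotion_labels)
    fig, ax = plt.subplots(figsize=(5, 5))
    if counter:
        labels, sizes = zip(*counter.items())
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
    else:
        # Avoid unpacking an empty Counter when no frames were decoded
        ax.text(0.5, 0.5, "No frames analyzed", ha='center', va='center')
    ax.set_title("Emotion Distribution")
    fig.savefig("emotion_distribution.jpg")
    return fig

process_video could then return fig in place of plt, leaving the click wiring unchanged.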