Create app.py
app.py
ADDED
@@ -0,0 +1,460 @@
import cv2
import mediapipe as mp
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from facenet_pytorch import MTCNN
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoModelForAudioClassification
from PIL import Image
import moviepy.editor as moviepy
import librosa
import os
import gradio as gr
import tempfile

# Initialize device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Initialize visual models
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)
mtcnn = MTCNN(device=device)
face_model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
face_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")

# Initialize audio model
audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
audio_processor = AutoFeatureExtractor.from_pretrained(audio_model_name)
audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
audio_sampling_rate = 16000

def calculate_angle(a, b, c):
    """Calculates the angle between three points."""
    a, b, c = np.array(a), np.array(b), np.array(c)
    ba, bc = a - b, c - b
    cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
    return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
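# Quick illustrative sanity check for calculate_angle: the points (0, 1), (0, 0)
# and (1, 0) meet at a right angle at the middle point, so
# calculate_angle([0, 1], [0, 0], [1, 0]) should return 90.0.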

def detect_emotions(frame):
    """Detects facial emotions in a given frame."""
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    faces, _ = mtcnn.detect(img)

    if faces is None or len(faces) == 0:
        return "Neutral"  # Default to neutral if no face is detected

    face = img.crop((faces[0][0], faces[0][1], faces[0][2], faces[0][3]))
    inputs = face_extractor(images=face, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = face_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return face_model.config.id2label[torch.argmax(probs).item()]
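# Hedged usage sketch for detect_emotions on a single image; "face_test.jpg" is a
# placeholder path, not a file shipped with this Space:
#   test_frame = cv2.imread("face_test.jpg")   # BGR array, as OpenCV loads it
#   if test_frame is not None:
#       print(detect_emotions(test_frame))     # prints one of the model's emotion labels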

def classify_posture(back_angle, neck_angle):
    """Classifies posture based on back and neck angles."""
    if back_angle > 170 and neck_angle > 150:
        return "Confident"
    elif back_angle < 160 and neck_angle < 140:
        return "Nervous"
    elif back_angle < 150:
        return "Defensive"
    elif neck_angle < 130:
        return "Serious"
    else:
        return "Attentive"
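# Illustrative checks of the thresholds above (angles in degrees):
#   classify_posture(175, 155) -> "Confident"   (upright back, open neck)
#   classify_posture(155, 135) -> "Nervous"     (both angles collapsed)
#   classify_posture(145, 145) -> "Defensive"   (back angle below 150)
#   classify_posture(165, 125) -> "Serious"     (neck angle below 130)
#   classify_posture(165, 145) -> "Attentive"   (everything else)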

def extract_audio(video_path):
    """Extracts audio from video file and saves it as WAV."""
    audio_path = tempfile.mktemp(suffix='.wav')
    video = moviepy.VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
    return audio_path

def analyze_audio_emotion(audio_path):
    """Analyzes emotion from audio file and returns emotion counts."""
    # Load audio
    y, sr = librosa.load(audio_path, sr=audio_sampling_rate)

    # Process audio in chunks to avoid memory issues
    chunk_length = audio_sampling_rate * 5  # 5 seconds
    emotion_counts = {}
    audio_emotions = []

    # Process audio in chunks
    for i in range(0, len(y), chunk_length):
        chunk = y[i:min(i+chunk_length, len(y))]

        # Skip chunks that are too short
        if len(chunk) < audio_sampling_rate:
            continue

        # Process audio with the model
        inputs = audio_processor(chunk, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = audio_model(**inputs)

        # Get prediction
        predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
        emotion = audio_model.config.id2label[predicted_class_id]
        audio_emotions.append(emotion)
        emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1

    return emotion_counts, audio_emotions
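# Hedged sketch for testing the audio branch on its own; "sample.mp4" is a
# placeholder clip with an audio track, not part of this repository:
#   wav_path = extract_audio("sample.mp4")
#   counts, sequence = analyze_audio_emotion(wav_path)
#   print(counts)   # per-chunk label counts, e.g. {"neutral": 3, "happy": 1}
#   os.remove(wav_path)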

def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
    """Draws multimodal emotion and posture sentiment on the frame."""
    overlay = frame.copy()
    cv2.rectangle(overlay, (10, 10), (450, 200), (255, 255, 255), -1)

    # Display current emotions
    cv2.putText(overlay, f'Face Emotion: {face_emotion}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
    cv2.putText(overlay, f'Posture: {posture_label}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
    cv2.putText(overlay, f'Audio Emotion: {audio_emotion}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

    # Display major emotion
    cv2.putText(overlay, f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)

    # Add explanation
    reason_text = 'Weighted combination of face, posture, and audio analysis'
    cv2.putText(overlay, f'Analysis: {reason_text}', (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    # Blend overlay with original frame
    cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)

def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
    """Generates charts for all emotion modalities."""
    # Create a figure with 3 subplots
    fig, axs = plt.subplots(1, 3, figsize=(18, 6))

    # Face emotions pie chart
    labels, sizes = zip(*face_emotion_counts.items()) if face_emotion_counts else (["None"], [1])
    axs[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Blues'))
    axs[0].set_title("Facial Emotions")

    # Posture pie chart
    labels, sizes = zip(*posture_counts.items()) if posture_counts else (["None"], [1])
    axs[1].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Greens'))
    axs[1].set_title("Posture Analysis")

    # Audio emotions pie chart
    labels, sizes = zip(*audio_emotion_counts.items()) if audio_emotion_counts else (["None"], [1])
    axs[2].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Reds'))
    axs[2].set_title("Audio Emotions")

    plt.tight_layout()

    # Save to a temporary file
    chart_path = tempfile.mktemp(suffix='.jpg')
    plt.savefig(chart_path)
    plt.close()

    # Create combined emotions bar chart: combine all emotions across modalities
    all_emotions = set()
    for counts in [face_emotion_counts, audio_emotion_counts]:
        all_emotions.update(counts.keys())

    # Prepare data for each emotion across modalities
    emotions = list(all_emotions)
    face_values = [face_emotion_counts.get(e, 0) for e in emotions]
    audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]

    # Normalize values
    if sum(face_values) > 0:
        face_values = [v/sum(face_values)*100 for v in face_values]
    if sum(audio_values) > 0:
        audio_values = [v/sum(audio_values)*100 for v in audio_values]

    # Create bar chart
    x = np.arange(len(emotions))
    width = 0.35

    fig, ax = plt.subplots(figsize=(14, 8))
    ax.bar(x - width/2, face_values, width, label='Face')
    ax.bar(x + width/2, audio_values, width, label='Audio')

    ax.set_title('Emotion Distribution by Modality')
    ax.set_xlabel('Emotions')
    ax.set_ylabel('Percentage (%)')
    ax.set_xticks(x)
    ax.set_xticklabels(emotions)
    ax.legend()

    plt.tight_layout()

    # Save to a temporary file
    comparison_path = tempfile.mktemp(suffix='.jpg')
    plt.savefig(comparison_path)
    plt.close()

    return chart_path, comparison_path

def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
    """Calculates a combined sentiment score from all modalities."""
    # Define emotion categories and weights
    modality_weights = {
        "face": 0.4,
        "posture": 0.2,
        "audio": 0.4
    }

    # Map posture labels to emotional states for better combination
    posture_emotion_mapping = {
        "Confident": "Happy",
        "Nervous": "Fearful",
        "Defensive": "Angry",
        "Serious": "Neutral",
        "Attentive": "Neutral"
    }

    # Convert posture counts to emotion counts
    posture_emotion_counts = {}
    for posture, count in posture_counts.items():
        emotion = posture_emotion_mapping.get(posture, "Neutral")
        posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count

    # Get all unique emotions across all modalities
    all_emotions = set()
    for counts in [face_emotion_counts, posture_emotion_counts, audio_emotion_counts]:
        all_emotions.update(counts.keys())

    # Calculate total frames/samples for each modality
    face_total = sum(face_emotion_counts.values())
    posture_total = sum(posture_counts.values())
    audio_total = sum(audio_emotion_counts.values())

    # Calculate weighted emotion scores
    combined_scores = {}

    for emotion in all_emotions:
        # Get normalized scores from each modality (or 0 if not present)
        face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
        posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
        audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0

        # Calculate weighted score
        weighted_score = (
            face_score * modality_weights["face"] +
            posture_score * modality_weights["posture"] +
            audio_score * modality_weights["audio"]
        )

        combined_scores[emotion] = weighted_score

    # Normalize to percentages
    total_score = sum(combined_scores.values())
    if total_score > 0:
        for emotion in combined_scores:
            combined_scores[emotion] = (combined_scores[emotion] / total_score) * 100

    # Get the major emotion
    major_emotion = max(combined_scores.items(), key=lambda x: x[1]) if combined_scores else ("Unknown", 0)

    return combined_scores, major_emotion[0], major_emotion[1]
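# Worked example with illustrative counts (not real model output):
#   face    {"happy": 6, "neutral": 4}        -> shares 0.6 / 0.4, weight 0.4
#   posture {"Confident": 5, "Attentive": 5}  -> mapped to {"Happy": 5, "Neutral": 5}, weight 0.2
#   audio   {"happy": 2, "sad": 2}            -> shares 0.5 / 0.5, weight 0.4
# Weighted scores: happy 0.44, sad 0.20, neutral 0.16, Happy 0.10, Neutral 0.10
# (they sum to 1.0, so the percentages are 44 / 20 / 16 / 10 / 10 and "happy" wins).
# Note that labels are merged across modalities only when the strings match exactly.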

def process_video_for_gradio(video_path, progress=gr.Progress()):
    """Processes the video for the Gradio interface with progress updates."""
    # Extract audio first
    progress(0.1, "Extracting audio from video...")
    audio_path = extract_audio(video_path)

    # Analyze audio emotions
    progress(0.2, "Analyzing audio emotions...")
    audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)

    # Process video frames
    progress(0.3, "Starting video frame analysis...")
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 25  # fall back to 25 if FPS metadata is missing
    frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Get total frames for progress tracking and frame sampling
    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # For very long videos, sample frames: process at most ~300 frames
    sample_rate = max(1, total_video_frames // 300)

    # Create a temporary file for the output video; since only every sample_rate-th
    # frame is written, reduce the output FPS so playback stays close to real time
    output_path = tempfile.mktemp(suffix='.mp4')
    output_fps = max(1, round(fps / sample_rate))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), output_fps, (frame_width, frame_height))

    # Initialize counters
    face_emotion_counts = {}
    posture_counts = {}
    total_frames = 0
    frame_index = 0

    # Calculate frames per audio segment so each frame maps to an audio chunk
    audio_segments = len(audio_emotions_sequence)
    frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
    current_audio_index = 0

    # Current audio emotion
    current_audio_emotion = audio_emotions_sequence[0] if audio_emotions_sequence else "Unknown"
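    # Alignment arithmetic with illustrative numbers: a 300-frame video whose audio
    # produced 4 five-second chunks gives frames_per_audio = 300 // 4 = 75, so frame
    # 160 is paired with audio chunk 160 // 75 = 2 (the third chunk).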

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_index += 1

        # Skip frames according to sample rate
        if frame_index % sample_rate != 0:
            continue

        # Update progress
        progress_value = 0.3 + (0.6 * frame_index / total_video_frames)
        progress(progress_value, f"Processing frame {frame_index}/{total_video_frames}")

        # Track the frame
        total_frames += 1

        # Update current audio emotion based on frame index
        current_audio_index = min(frame_index // frames_per_audio, len(audio_emotions_sequence) - 1)
        if current_audio_index >= 0 and current_audio_index < len(audio_emotions_sequence):
            current_audio_emotion = audio_emotions_sequence[current_audio_index]

        # Process the frame for face and posture
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = pose.process(rgb_frame)

        posture_label = "Unknown"
        if result.pose_landmarks:
            landmarks = result.pose_landmarks.landmark
            try:
                shoulder = [landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].x, landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].y]
                hip = [landmarks[mp_pose.PoseLandmark.LEFT_HIP].x, landmarks[mp_pose.PoseLandmark.LEFT_HIP].y]
                knee = [landmarks[mp_pose.PoseLandmark.LEFT_KNEE].x, landmarks[mp_pose.PoseLandmark.LEFT_KNEE].y]
                ear = [landmarks[mp_pose.PoseLandmark.LEFT_EAR].x, landmarks[mp_pose.PoseLandmark.LEFT_EAR].y]

                back_angle = calculate_angle(shoulder, hip, knee)
                neck_angle = calculate_angle(ear, shoulder, hip)
                posture_label = classify_posture(back_angle, neck_angle)
            except Exception:
                # If any landmark is missing, use default
                posture_label = "Unknown"

        # Update posture counts
        posture_counts[posture_label] = posture_counts.get(posture_label, 0) + 1

        # Detect face emotion
        try:
            face_emotion = detect_emotions(frame)
        except Exception as e:
            face_emotion = "Neutral"
            print(f"Face detection error: {e}")

        # Update face emotion counts
        face_emotion_counts[face_emotion] = face_emotion_counts.get(face_emotion, 0) + 1

        # Calculate current major emotion from the running tallies so far
        combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
            face_emotion_counts, posture_counts, audio_emotion_counts
        )

        # Draw sentiment info on the frame
        draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, current_audio_emotion, major_emotion, major_emotion_percent)

        # Write the frame to output video
        out.write(frame)

    # Release resources
    cap.release()
    out.release()

    # Generate charts
    progress(0.9, "Generating emotion charts...")
    chart_path, comparison_path = generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts)

    # Clean up temporary audio file
    try:
        os.remove(audio_path)
    except OSError:
        pass

    progress(1.0, "Analysis complete!")

    # Prepare result summary
    combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
        face_emotion_counts, posture_counts, audio_emotion_counts
    )

    result_summary = f"""
# Video Sentiment Analysis Results

## Overall Sentiment
The dominant emotion in this video is: **{major_emotion}** ({major_emotion_percent:.1f}%)

## Emotion Distribution

### Face Emotions:
{', '.join([f"{emotion}: {count}" for emotion, count in face_emotion_counts.items()])}

### Posture Analysis:
{', '.join([f"{posture}: {count}" for posture, count in posture_counts.items()])}

### Audio Emotions:
{', '.join([f"{emotion}: {count}" for emotion, count in audio_emotion_counts.items()])}

### Combined Emotion Scores:
{', '.join([f"{emotion}: {score:.1f}%" for emotion, score in combined_scores.items()])}
"""

    return output_path, chart_path, comparison_path, result_summary

# Create Gradio interface
def create_gradio_interface():
    with gr.Blocks(title="Multimodal Video Sentiment Analysis") as demo:
        gr.Markdown("# Multimodal Video Sentiment Analysis")
        gr.Markdown("""
This app analyzes videos for emotions using three modalities:
- **Facial Expressions**: Detects emotions from faces
- **Body Posture**: Identifies emotional cues from posture
- **Audio Tone**: Analyzes voice for emotional content

Upload a video to see the combined analysis!
""")

        with gr.Row():
            with gr.Column(scale=1):
                video_input = gr.Video(label="Upload Video")
                analyze_btn = gr.Button("Analyze Video", variant="primary")

            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("Results Summary"):
                        result_text = gr.Markdown(label="Analysis Results")

                    with gr.TabItem("Processed Video"):
                        video_output = gr.Video(label="Processed Video")

                    with gr.TabItem("Emotion Charts"):
                        chart_output = gr.Image(label="Emotion Distribution")
                        comparison_output = gr.Image(label="Modality Comparison")

        analyze_btn.click(
            process_video_for_gradio,
            inputs=[video_input],
            outputs=[video_output, chart_output, comparison_output, result_text]
        )

        gr.Markdown("""
## How it works

1. **Visual Analysis**: The app processes video frames to detect faces and body posture
2. **Audio Analysis**: The audio is extracted and analyzed for emotional tone
3. **Combined Analysis**: The results are weighted and combined for a holistic emotional assessment

The app uses pretrained models for each modality and combines their outputs using a weighted approach.
""")

    return demo

# Launch the Gradio app
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()
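# Rough dependency list inferred from the imports above (package names only, no
# pinned versions; treat this as an assumption rather than a tested requirements.txt):
#   opencv-python, mediapipe, torch, numpy, matplotlib, seaborn, facenet-pytorch,
#   transformers, Pillow, moviepy (<2.0, for moviepy.editor), librosa, gradio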