Sagnik1750 committed on
Commit b337ab5 · verified · 1 Parent(s): 0340596

Create app.py

Files changed (1)
app.py +460 -0
app.py ADDED
@@ -0,0 +1,460 @@
+ import cv2
+ import mediapipe as mp
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from facenet_pytorch import MTCNN
+ from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoModelForAudioClassification
+ from PIL import Image
+ import moviepy.editor as moviepy
+ import librosa
+ import os
+ import gradio as gr
+ import tempfile
+
+ # Initialize device
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print(f"Using device: {device}")
+
+ # Initialize visual models
+ mp_pose = mp.solutions.pose
+ pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)
+ mtcnn = MTCNN(device=device)
+ face_model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
+ face_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
+
+ # Initialize audio model
+ audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+ audio_processor = AutoFeatureExtractor.from_pretrained(audio_model_name)
+ audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
+ audio_sampling_rate = 16000
+
+ def calculate_angle(a, b, c):
+     """Calculates the angle (in degrees) at vertex b formed by points a, b, and c."""
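+     # Example: calculate_angle([0, 1], [0, 0], [1, 0]) -> 90.0, since the vectors b->a and b->c are perpendicular.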
+     a, b, c = np.array(a), np.array(b), np.array(c)
+     ba, bc = a - b, c - b
+     cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
+     return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
+
+ def detect_emotions(frame):
+     """Detects the facial emotion of the first detected face in a given frame."""
+     img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     faces, _ = mtcnn.detect(img)
+
+     if faces is None or len(faces) == 0:
+         return "Neutral"  # Default to neutral if no face is detected
+
+     face = img.crop((faces[0][0], faces[0][1], faces[0][2], faces[0][3]))
+     inputs = face_extractor(images=face, return_tensors="pt").to(device)
+     with torch.no_grad():
+         outputs = face_model(**inputs)
+     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+     return face_model.config.id2label[torch.argmax(probs).item()]
+
+ def classify_posture(back_angle, neck_angle):
+     """Classifies posture based on back and neck angles."""
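+     # Angle thresholds (in degrees) are heuristic cut-offs; rules are checked top to bottom, so "Confident" takes precedence when both of its conditions hold.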
+     if back_angle > 170 and neck_angle > 150:
+         return "Confident"
+     elif back_angle < 160 and neck_angle < 140:
+         return "Nervous"
+     elif back_angle < 150:
+         return "Defensive"
+     elif neck_angle < 130:
+         return "Serious"
+     else:
+         return "Attentive"
+
+ def extract_audio(video_path):
+     """Extracts audio from a video file and saves it as WAV."""
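+     # Assumes the clip has an audio track; for a silent video, video.audio is None and this step would fail.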
+     audio_path = tempfile.mktemp(suffix='.wav')
+     video = moviepy.VideoFileClip(video_path)
+     video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
+     return audio_path
+
+ def analyze_audio_emotion(audio_path):
+     """Analyzes emotion in an audio file and returns (emotion counts, per-chunk emotion sequence)."""
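+     # The audio is split into 5-second chunks and each chunk is classified independently; the sequence is later aligned with video frames.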
+     # Load audio
+     y, sr = librosa.load(audio_path, sr=audio_sampling_rate)
+
+     # Process audio in chunks to avoid memory issues
+     chunk_length = audio_sampling_rate * 5  # 5 seconds
+     emotion_counts = {}
+     audio_emotions = []
+
+     # Process audio in chunks
+     for i in range(0, len(y), chunk_length):
+         chunk = y[i:min(i + chunk_length, len(y))]
+
+         # Skip chunks that are too short
+         if len(chunk) < audio_sampling_rate:
+             continue
+
+         # Process audio with the model
+         inputs = audio_processor(chunk, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
+         with torch.no_grad():
+             outputs = audio_model(**inputs)
+
+         # Get prediction
+         predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
+         emotion = audio_model.config.id2label[predicted_class_id]
+         audio_emotions.append(emotion)
+         emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
+
+     return emotion_counts, audio_emotions
+
+ def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
+     """Draws multimodal emotion and posture sentiment on the frame."""
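+     # Blends a semi-transparent white panel with the labels onto `frame` in place (60% overlay / 40% original frame).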
+     overlay = frame.copy()
+     cv2.rectangle(overlay, (10, 10), (450, 200), (255, 255, 255), -1)
+
+     # Display current emotions
+     cv2.putText(overlay, f'Face Emotion: {face_emotion}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
+     cv2.putText(overlay, f'Posture: {posture_label}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
+     cv2.putText(overlay, f'Audio Emotion: {audio_emotion}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
+
+     # Display major emotion
+     cv2.putText(overlay, f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
+
+     # Add explanation
+     reason_text = 'Weighted combination of face, posture, and audio analysis'
+     cv2.putText(overlay, f'Analysis: {reason_text}', (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+
+     # Blend overlay with original frame
+     cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
+
+ def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
+     """Generates charts for all emotion modalities."""
+     # Create a figure with 3 subplots
+     fig, axs = plt.subplots(1, 3, figsize=(18, 6))
+
+     # Face emotions pie chart
+     labels, sizes = zip(*face_emotion_counts.items()) if face_emotion_counts else (["None"], [1])
+     axs[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Blues'))
+     axs[0].set_title("Facial Emotions")
+
+     # Posture pie chart
+     labels, sizes = zip(*posture_counts.items()) if posture_counts else (["None"], [1])
+     axs[1].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Greens'))
+     axs[1].set_title("Posture Analysis")
+
+     # Audio emotions pie chart
+     labels, sizes = zip(*audio_emotion_counts.items()) if audio_emotion_counts else (["None"], [1])
+     axs[2].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Reds'))
+     axs[2].set_title("Audio Emotions")
+
+     plt.tight_layout()
+
+     # Save to a temporary file
+     chart_path = tempfile.mktemp(suffix='.jpg')
+     plt.savefig(chart_path)
+     plt.close()
+
+     # Create combined emotions bar chart
+     # Combine all emotions across modalities
+     all_emotions = set()
+     for counts in [face_emotion_counts, audio_emotion_counts]:
+         all_emotions.update(counts.keys())
+
+     # Prepare data for each emotion across modalities
+     emotions = list(all_emotions)
+     face_values = [face_emotion_counts.get(e, 0) for e in emotions]
+     audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]
+
+     # Normalize values
+     if sum(face_values) > 0:
+         face_values = [v / sum(face_values) * 100 for v in face_values]
+     if sum(audio_values) > 0:
+         audio_values = [v / sum(audio_values) * 100 for v in audio_values]
+
+     # Create bar chart
+     x = np.arange(len(emotions))
+     width = 0.35
+
+     fig, ax = plt.subplots(figsize=(14, 8))
+     ax.bar(x - width / 2, face_values, width, label='Face')
+     ax.bar(x + width / 2, audio_values, width, label='Audio')
+
+     ax.set_title('Emotion Distribution by Modality')
+     ax.set_xlabel('Emotions')
+     ax.set_ylabel('Percentage (%)')
+     ax.set_xticks(x)
+     ax.set_xticklabels(emotions)
+     ax.legend()
+
+     plt.tight_layout()
+
+     # Save to a temporary file
+     comparison_path = tempfile.mktemp(suffix='.jpg')
+     plt.savefig(comparison_path)
+     plt.close()
+
+     return chart_path, comparison_path
+
+ def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
+     """Calculates a combined sentiment score from all modalities."""
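+     # Note: the face, posture, and audio models may use different label casing (e.g. "Neutral" vs "neutral"); differently-cased labels are counted as separate emotions here.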
+     # Define emotion categories and weights
+     modality_weights = {
+         "face": 0.4,
+         "posture": 0.2,
+         "audio": 0.4
+     }
+
+     # Map posture labels to emotional states for better combination
+     posture_emotion_mapping = {
+         "Confident": "Happy",
+         "Nervous": "Fearful",
+         "Defensive": "Angry",
+         "Serious": "Neutral",
+         "Attentive": "Neutral"
+     }
+
+     # Convert posture counts to emotion counts
+     posture_emotion_counts = {}
+     for posture, count in posture_counts.items():
+         emotion = posture_emotion_mapping.get(posture, "Neutral")
+         posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count
+
+     # Get all unique emotions across all modalities
+     all_emotions = set()
+     for counts in [face_emotion_counts, posture_emotion_counts, audio_emotion_counts]:
+         all_emotions.update(counts.keys())
+
+     # Calculate total frames/samples for each modality
+     face_total = sum(face_emotion_counts.values())
+     posture_total = sum(posture_counts.values())
+     audio_total = sum(audio_emotion_counts.values())
+
+     # Calculate weighted emotion scores
+     combined_scores = {}
+
+     for emotion in all_emotions:
+         # Get normalized scores from each modality (or 0 if not present)
+         face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
+         posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
+         audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0
+
+         # Calculate weighted score
+         weighted_score = (
+             face_score * modality_weights["face"] +
+             posture_score * modality_weights["posture"] +
+             audio_score * modality_weights["audio"]
+         )
+
+         combined_scores[emotion] = weighted_score
+
+     # Normalize to percentages
+     total_score = sum(combined_scores.values())
+     if total_score > 0:
+         for emotion in combined_scores:
+             combined_scores[emotion] = (combined_scores[emotion] / total_score) * 100
+
+     # Get the major emotion
+     major_emotion = max(combined_scores.items(), key=lambda x: x[1]) if combined_scores else ("Unknown", 0)
+
+     return combined_scores, major_emotion[0], major_emotion[1]
+
+ def process_video_for_gradio(video_path, progress=gr.Progress()):
+     """Processes the video for Gradio interface with progress updates."""
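+     # Only every `sample_rate`-th frame is analyzed and written out, so for long videos the annotated output covers a sampled subset of frames.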
+     # Extract audio first
+     progress(0.1, "Extracting audio from video...")
+     audio_path = extract_audio(video_path)
+
+     # Analyze audio emotions
+     progress(0.2, "Analyzing audio emotions...")
+     audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
+
+     # Process video frames
+     progress(0.3, "Starting video frame analysis...")
+     cap = cv2.VideoCapture(video_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30  # fall back to 30 fps if the property is unavailable
+     frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     # Create a temporary file for the output video
+     output_path = tempfile.mktemp(suffix='.mp4')
+     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
+
+     # Initialize counters
+     face_emotion_counts = {}
+     posture_counts = {}
+     total_frames = 0
+     frame_index = 0
+
+     # Get total frames for progress tracking
+     total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+     # For very long videos, we might want to sample frames
+     sample_rate = max(1, total_video_frames // 300)  # Process at most ~300 frames
+
+     # Calculate frames per audio segment
+     audio_segments = len(audio_emotions_sequence)
+     frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
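+     # Each analyzed frame is mapped to the 5-second audio chunk that roughly covers it.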
+     current_audio_index = 0
+
+     # Current audio emotion
+     current_audio_emotion = audio_emotions_sequence[0] if audio_emotions_sequence else "Unknown"
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         frame_index += 1
+
+         # Skip frames according to sample rate
+         if frame_index % sample_rate != 0:
+             continue
+
+         # Update progress
+         progress_value = 0.3 + (0.6 * frame_index / total_video_frames)
+         progress(progress_value, f"Processing frame {frame_index}/{total_video_frames}")
+
+         # Track the frame
+         total_frames += 1
+
+         # Update current audio emotion based on frame index
+         current_audio_index = min(frame_index // frames_per_audio, len(audio_emotions_sequence) - 1)
+         if current_audio_index >= 0 and current_audio_index < len(audio_emotions_sequence):
+             current_audio_emotion = audio_emotions_sequence[current_audio_index]
+
+         # Process the frame for face and posture
+         rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         result = pose.process(rgb_frame)
+
+         posture_label = "Unknown"
+         if result.pose_landmarks:
+             landmarks = result.pose_landmarks.landmark
+             try:
+                 shoulder = [landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].x, landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].y]
+                 hip = [landmarks[mp_pose.PoseLandmark.LEFT_HIP].x, landmarks[mp_pose.PoseLandmark.LEFT_HIP].y]
+                 knee = [landmarks[mp_pose.PoseLandmark.LEFT_KNEE].x, landmarks[mp_pose.PoseLandmark.LEFT_KNEE].y]
+                 ear = [landmarks[mp_pose.PoseLandmark.LEFT_EAR].x, landmarks[mp_pose.PoseLandmark.LEFT_EAR].y]
+
+                 back_angle = calculate_angle(shoulder, hip, knee)
+                 neck_angle = calculate_angle(ear, shoulder, hip)
+                 posture_label = classify_posture(back_angle, neck_angle)
+             except Exception:
+                 # If any landmark is missing, use default
+                 posture_label = "Unknown"
+
+         # Update posture counts
+         posture_counts[posture_label] = posture_counts.get(posture_label, 0) + 1
+
+         # Detect face emotion
+         try:
+             face_emotion = detect_emotions(frame)
+         except Exception as e:
+             face_emotion = "Neutral"
+             print(f"Face detection error: {e}")
+
+         # Update face emotion counts
+         face_emotion_counts[face_emotion] = face_emotion_counts.get(face_emotion, 0) + 1
+
+         # Calculate current major emotion
+         combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
+             face_emotion_counts, posture_counts, audio_emotion_counts
+         )
+
+         # Draw sentiment info on the frame
+         draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, current_audio_emotion, major_emotion, major_emotion_percent)
+
+         # Write the frame to output video
+         out.write(frame)
+
+     # Release resources
+     cap.release()
+     out.release()
+
+     # Generate charts
+     progress(0.9, "Generating emotion charts...")
+     chart_path, comparison_path = generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts)
+
+     # Clean up temporary audio file
+     try:
+         os.remove(audio_path)
+     except OSError:
+         pass
+
+     progress(1.0, "Analysis complete!")
+
+     # Prepare result summary
+     combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
+         face_emotion_counts, posture_counts, audio_emotion_counts
+     )
+
+     result_summary = f"""
+ # Video Sentiment Analysis Results
+
+ ## Overall Sentiment
+ The dominant emotion in this video is: **{major_emotion}** ({major_emotion_percent:.1f}%)
+
+ ## Emotion Distribution
+
+ ### Face Emotions:
+ {', '.join([f"{emotion}: {count}" for emotion, count in face_emotion_counts.items()])}
+
+ ### Posture Analysis:
+ {', '.join([f"{posture}: {count}" for posture, count in posture_counts.items()])}
+
+ ### Audio Emotions:
+ {', '.join([f"{emotion}: {count}" for emotion, count in audio_emotion_counts.items()])}
+
+ ### Combined Emotion Scores:
+ {', '.join([f"{emotion}: {score:.1f}%" for emotion, score in combined_scores.items()])}
+ """
+
+     return output_path, chart_path, comparison_path, result_summary
+ # Create Gradio interface
+ def create_gradio_interface():
+     with gr.Blocks(title="Multimodal Video Sentiment Analysis") as demo:
+         gr.Markdown("# 📹 Multimodal Video Sentiment Analysis")
+         gr.Markdown("""
+         This app analyzes videos for emotions using three modalities:
+         - 😊 **Facial Expressions**: Detects emotions from faces
+         - 🧍‍♂️ **Body Posture**: Identifies emotional cues from posture
+         - 🔊 **Audio Tone**: Analyzes voice for emotional content
+
+         Upload a video to see the combined analysis!
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 video_input = gr.Video(label="Upload Video")
+                 analyze_btn = gr.Button("Analyze Video", variant="primary")
+
+             with gr.Column(scale=2):
+                 with gr.Tabs():
+                     with gr.TabItem("Results Summary"):
+                         result_text = gr.Markdown(label="Analysis Results")
+
+                     with gr.TabItem("Processed Video"):
+                         video_output = gr.Video(label="Processed Video")
+
+                     with gr.TabItem("Emotion Charts"):
+                         chart_output = gr.Image(label="Emotion Distribution")
+                         comparison_output = gr.Image(label="Modality Comparison")
+
+         analyze_btn.click(
+             process_video_for_gradio,
+             inputs=[video_input],
+             outputs=[video_output, chart_output, comparison_output, result_text]
+         )
+
+         gr.Markdown("""
+         ## How it works
+
+         1. **Visual Analysis**: The app processes video frames to detect faces and body posture
+         2. **Audio Analysis**: The audio is extracted and analyzed for emotional tone
+         3. **Combined Analysis**: The results are weighted and combined for a holistic emotional assessment
+
+         The app uses pretrained models for each modality and combines their outputs using a weighted approach.
+         """)
+
+     return demo
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     demo = create_gradio_interface()
+     demo.launch()
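+     # Note: demo.launch(share=True) can be used instead to expose a temporary public link when running locally.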