Sagnik1750 committed (verified)
Commit e671129 · 1 Parent(s): 86aff99

Update app.py

Files changed (1)
  1. app.py +74 -446
app.py CHANGED
@@ -1,483 +1,111 @@
 
  import cv2
- import mediapipe as mp
  import torch
  import numpy as np
  import matplotlib.pyplot as plt
  import seaborn as sns
  from facenet_pytorch import MTCNN
- from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoProcessor, AutoModelForAudioClassification, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
  from PIL import Image
- import moviepy.editor as moviepy
- import librosa
  import os
- import gradio as gr
- import tempfile

- # Initialize device
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
- print(f"Using device: {device}")
-
- # Initialize visual models
- mp_pose = mp.solutions.pose
- pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)
  mtcnn = MTCNN(device=device)
- face_model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
- face_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
-
- # Initialize audio model
- audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
- audio_processor = AutoFeatureExtractor.from_pretrained(audio_model_name)
- audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
- audio_sampling_rate = 16000
- import os
-
- def analyze_video(video_path):
-     if not video_path:
-         return "Error: No video input received!"
-
-     # Check file size (limit to ~50MB for Hugging Face Spaces)
-     max_size_mb = 50
-     file_size_mb = os.path.getsize(video_path) / (1024 * 1024)
-
-     if file_size_mb > max_size_mb:
-         return f"Error: File size ({file_size_mb:.2f}MB) exceeds the {max_size_mb}MB limit."

-     # Process the video normally
-     audio_path = extract_audio(video_path)
-     frames = extract_frames(video_path)
-     facial_emotions = analyze_facial_emotion(frames)
-     transcription = transcribe_audio(audio_path)
-     audio_emotion, _ = analyze_audio_emotion(transcription)
-     final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"
-
-     return transcription, audio_emotion, final_emotion, facial_emotions, "emotion_pie_chart.png"
-
-
- def calculate_angle(a, b, c):
-     """Calculates the angle between three points."""
-     a, b, c = np.array(a), np.array(b), np.array(c)
-     ba, bc = a - b, c - b
-     cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
-     return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
 
  def detect_emotions(frame):
      """Detects facial emotions in a given frame."""
      img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
      faces, _ = mtcnn.detect(img)
-
      if faces is None or len(faces) == 0:
-         return "Neutral"  # Default to neutral if no face is detected
-
-     face = img.crop((faces[0][0], faces[0][1], faces[0][2], faces[0][3]))
-     inputs = face_extractor(images=face, return_tensors="pt").to(device)
-     outputs = face_model(**inputs)
-     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-     return face_model.config.id2label[torch.argmax(probs).item()]
-
- def classify_posture(back_angle, neck_angle):
-     """Classifies posture based on back and neck angles."""
-     if back_angle > 170 and neck_angle > 150:
-         return "Confident"
-     elif back_angle < 160 and neck_angle < 140:
-         return "Nervous"
-     elif back_angle < 150:
-         return "Defensive"
-     elif neck_angle < 130:
-         return "Serious"
-     else:
-         return "Attentive"
-
- def extract_audio(video_path):
-     """Extracts audio from video file and saves it as WAV."""
-     audio_path = tempfile.mktemp(suffix='.wav')
-     video = moviepy.VideoFileClip(video_path)
-     video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False, logger=None)
-     return audio_path
-
- def analyze_audio_emotion(audio_path):
-     """Analyzes emotion from audio file and returns emotion counts."""
-     # Load audio
-     y, sr = librosa.load(audio_path, sr=audio_sampling_rate)
-
-     # Process audio in chunks to avoid memory issues
-     chunk_length = audio_sampling_rate * 5  # 5 seconds
-     emotion_counts = {}
-     audio_emotions = []
-
-     # Process audio in chunks
-     for i in range(0, len(y), chunk_length):
-         chunk = y[i:min(i+chunk_length, len(y))]
-
-         # Skip chunks that are too short
-         if len(chunk) < audio_sampling_rate:
-             continue
-
-         # Process audio with the model
-         inputs = audio_processor(chunk, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
-         with torch.no_grad():
-             outputs = audio_model(**inputs)
-
-         # Get prediction
-         predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
-         emotion = audio_model.config.id2label[predicted_class_id]
-         audio_emotions.append(emotion)
-         emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
-
-     return emotion_counts, audio_emotions
-
- def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
-     """Draws multimodal emotion and posture sentiment on the frame."""
-     overlay = frame.copy()
-     cv2.rectangle(overlay, (10, 10), (450, 200), (255, 255, 255), -1)
-
-     # Display current emotions
-     cv2.putText(overlay, f'Face Emotion: {face_emotion}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-     cv2.putText(overlay, f'Posture: {posture_label}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-     cv2.putText(overlay, f'Audio Emotion: {audio_emotion}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-
-     # Display major emotion
-     cv2.putText(overlay, f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
-
-     # Add explanation
-     reason_text = 'Weighted combination of face, posture, and audio analysis'
-     cv2.putText(overlay, f'Analysis: {reason_text}', (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-
-     # Blend overlay with original frame
-     cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
-
- def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
-     """Generates charts for all emotion modalities."""
-     # Create a figure with 3 subplots
-     fig, axs = plt.subplots(1, 3, figsize=(18, 6))
-
-     # Face emotions pie chart
-     labels, sizes = zip(*face_emotion_counts.items()) if face_emotion_counts else (["None"], [1])
-     axs[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Blues'))
-     axs[0].set_title("Facial Emotions")
-
-     # Posture pie chart
-     labels, sizes = zip(*posture_counts.items()) if posture_counts else (["None"], [1])
-     axs[1].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Greens'))
-     axs[1].set_title("Posture Analysis")
-
-     # Audio emotions pie chart
-     labels, sizes = zip(*audio_emotion_counts.items()) if audio_emotion_counts else (["None"], [1])
-     axs[2].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Reds'))
-     axs[2].set_title("Audio Emotions")
-
-     plt.tight_layout()
-
-     # Save to a temporary file
-     chart_path = tempfile.mktemp(suffix='.jpg')
-     plt.savefig(chart_path)
-     plt.close()
-
-     # Create combined emotions bar chart
-     plt.figure(figsize=(12, 6))
-
-     # Combine all emotions across modalities
-     all_emotions = set()
-     for counts in [face_emotion_counts, audio_emotion_counts]:
-         all_emotions.update(counts.keys())
-
-     # Prepare data for each emotion across modalities
-     emotions = list(all_emotions)
-     face_values = [face_emotion_counts.get(e, 0) for e in emotions]
-     audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]
-
-     # Normalize values
-     if sum(face_values) > 0:
-         face_values = [v/sum(face_values)*100 for v in face_values]
-     if sum(audio_values) > 0:
-         audio_values = [v/sum(audio_values)*100 for v in audio_values]
-
-     # Create bar chart
-     x = np.arange(len(emotions))
-     width = 0.35
-
-     fig, ax = plt.subplots(figsize=(14, 8))
-     ax.bar(x - width/2, face_values, width, label='Face')
-     ax.bar(x + width/2, audio_values, width, label='Audio')
-
-     ax.set_title('Emotion Distribution by Modality')
-     ax.set_xlabel('Emotions')
-     ax.set_ylabel('Percentage (%)')
-     ax.set_xticks(x)
-     ax.set_xticklabels(emotions)
-     ax.legend()
 
-     plt.tight_layout()
-
-     # Save to a temporary file
-     comparison_path = tempfile.mktemp(suffix='.jpg')
-     plt.savefig(comparison_path)
-     plt.close()
-
-     return chart_path, comparison_path
-
- def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
-     """Calculates a combined sentiment score from all modalities."""
-     # Define emotion categories and weights
-     modality_weights = {
-         "face": 0.4,
-         "posture": 0.2,
-         "audio": 0.4
-     }
-
-     # Map posture labels to emotional states for better combination
-     posture_emotion_mapping = {
-         "Confident": "Happy",
-         "Nervous": "Fearful",
-         "Defensive": "Angry",
-         "Serious": "Neutral",
-         "Attentive": "Neutral"
-     }
-
-     # Convert posture counts to emotion counts
-     posture_emotion_counts = {}
-     for posture, count in posture_counts.items():
-         emotion = posture_emotion_mapping.get(posture, "Neutral")
-         posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count
-
-     # Get all unique emotions across all modalities
-     all_emotions = set()
-     for counts in [face_emotion_counts, posture_emotion_counts, audio_emotion_counts]:
-         all_emotions.update(counts.keys())
-
-     # Calculate total frames/samples for each modality
-     face_total = sum(face_emotion_counts.values())
-     posture_total = sum(posture_counts.values())
-     audio_total = sum(audio_emotion_counts.values())
-
-     # Calculate weighted emotion scores
-     combined_scores = {}
-
-     for emotion in all_emotions:
-         # Get normalized scores from each modality (or 0 if not present)
-         face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
-         posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
-         audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0
-
-         # Calculate weighted score
-         weighted_score = (
-             face_score * modality_weights["face"] +
-             posture_score * modality_weights["posture"] +
-             audio_score * modality_weights["audio"]
-         )
-
-         combined_scores[emotion] = weighted_score
-
-     # Normalize to percentages
-     total_score = sum(combined_scores.values())
-     if total_score > 0:
-         for emotion in combined_scores:
-             combined_scores[emotion] = (combined_scores[emotion] / total_score) * 100
-
-     # Get the major emotion
-     major_emotion = max(combined_scores.items(), key=lambda x: x[1]) if combined_scores else ("Unknown", 0)

-     return combined_scores, major_emotion[0], major_emotion[1]

- def process_video_for_gradio(video_path, progress=gr.Progress()):
-     """Processes the video for Gradio interface with progress updates."""
-     # Extract audio first
-     progress(0.1, "Extracting audio from video...")
-     audio_path = extract_audio(video_path)
-
-     # Analyze audio emotions
-     progress(0.2, "Analyzing audio emotions...")
-     audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
-
-     # Process video frames
-     progress(0.3, "Starting video frame analysis...")
-     cap = cv2.VideoCapture(video_path)
      fps = int(cap.get(cv2.CAP_PROP_FPS))
      frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-     # Create a temporary file for the output video
-     output_path = tempfile.mktemp(suffix='.mp4')
-     out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
-
-     # Initialize counters
-     face_emotion_counts = {}
-     posture_counts = {}
-     total_frames = 0
-     frame_index = 0

-     # Get total frames for progress tracking
-     total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-     # For very long videos, we might want to sample frames
-     sample_rate = max(1, total_video_frames // 300)  # Process at most ~300 frames
-
-     # Calculate frames per audio segment
-     audio_segments = len(audio_emotions_sequence)
-     frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
-     current_audio_index = 0

-     # Current audio emotion
-     current_audio_emotion = audio_emotions_sequence[0] if audio_emotions_sequence else "Unknown"
-
      while cap.isOpened():
          ret, frame = cap.read()
          if not ret:
              break

-         frame_index += 1
-
-         # Skip frames according to sample rate
-         if frame_index % sample_rate != 0:
-             continue
-
-         # Update progress
-         progress_value = 0.3 + (0.6 * frame_index / total_video_frames)
-         progress(progress_value, f"Processing frame {frame_index}/{total_video_frames}")
-
-         # Track the frame
-         total_frames += 1
-
-         # Update current audio emotion based on frame index
-         current_audio_index = min(frame_index // frames_per_audio, len(audio_emotions_sequence) - 1)
-         if current_audio_index >= 0 and current_audio_index < len(audio_emotions_sequence):
-             current_audio_emotion = audio_emotions_sequence[current_audio_index]
-
-         # Process the frame for face and posture
-         rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-         result = pose.process(rgb_frame)
-
-         posture_label = "Unknown"
-         if result.pose_landmarks:
-             landmarks = result.pose_landmarks.landmark
-             try:
-                 shoulder = [landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].x, landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].y]
-                 hip = [landmarks[mp_pose.PoseLandmark.LEFT_HIP].x, landmarks[mp_pose.PoseLandmark.LEFT_HIP].y]
-                 knee = [landmarks[mp_pose.PoseLandmark.LEFT_KNEE].x, landmarks[mp_pose.PoseLandmark.LEFT_KNEE].y]
-                 ear = [landmarks[mp_pose.PoseLandmark.LEFT_EAR].x, landmarks[mp_pose.PoseLandmark.LEFT_EAR].y]
-
-                 back_angle = calculate_angle(shoulder, hip, knee)
-                 neck_angle = calculate_angle(ear, shoulder, hip)
-                 posture_label = classify_posture(back_angle, neck_angle)
-             except:
-                 # If any landmark is missing, use default
-                 posture_label = "Unknown"
-
-         # Update posture counts
-         posture_counts[posture_label] = posture_counts.get(posture_label, 0) + 1
-
-         # Detect face emotion
-         try:
-             face_emotion = detect_emotions(frame)
-         except Exception as e:
-             face_emotion = "Neutral"
-             print(f"Face detection error: {e}")
-
-         # Update face emotion counts
-         face_emotion_counts[face_emotion] = face_emotion_counts.get(face_emotion, 0) + 1
-
-         # Calculate current major emotion
-         combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
-             face_emotion_counts, posture_counts, audio_emotion_counts
-         )
-
-         # Draw sentiment info on the frame
-         draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, current_audio_emotion, major_emotion, major_emotion_percent)
-
-         # Write the frame to output video
          out.write(frame)
-
-     # Release resources
      cap.release()
      out.release()
 
-     # Generate charts
-     progress(0.9, "Generating emotion charts...")
-     chart_path, comparison_path = generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts)
-
-     # Clean up temporary audio file
-     try:
-         os.remove(audio_path)
-     except:
-         pass
-
-     progress(1.0, "Analysis complete!")
-
-     # Prepare result summary
-     combined_scores, major_emotion, major_emotion_percent = calculate_combined_sentiment(
-         face_emotion_counts, posture_counts, audio_emotion_counts
-     )
-
-     result_summary = f"""
- # Video Sentiment Analysis Results
-
- ## Overall Sentiment
- The dominant emotion in this video is: **{major_emotion}** ({major_emotion_percent:.1f}%)
-
- ## Emotion Distribution
-
- ### Face Emotions:
- {', '.join([f"{emotion}: {count}" for emotion, count in face_emotion_counts.items()])}
-
- ### Posture Analysis:
- {', '.join([f"{posture}: {count}" for posture, count in posture_counts.items()])}
-
- ### Audio Emotions:
- {', '.join([f"{emotion}: {count}" for emotion, count in audio_emotion_counts.items()])}
-
- ### Combined Emotion Scores:
- {', '.join([f"{emotion}: {score:.1f}%" for emotion, score in combined_scores.items()])}
- """
-
-     return output_path, chart_path, comparison_path, result_summary

- # Create Gradio interface
- def create_gradio_interface():
-     with gr.Blocks(title="Multimodal Video Sentiment Analysis") as demo:
-         gr.Markdown("# 📹 Multimodal Video Sentiment Analysis")
-         gr.Markdown("""
-         This app analyzes videos for emotions using three modalities:
-         - 😊 **Facial Expressions**: Detects emotions from faces
-         - 🧍‍♂️ **Body Posture**: Identifies emotional cues from posture
-         - 🔊 **Audio Tone**: Analyzes voice for emotional content
-
-         Upload a video to see the combined analysis!
-         """)
-
-         with gr.Row():
-             with gr.Column(scale=1):
-                 video_input = gr.Video(label="Upload Video")
-                 analyze_btn = gr.Button("Analyze Video", variant="primary")
-
-             with gr.Column(scale=2):
-                 with gr.Tabs():
-                     with gr.TabItem("Results Summary"):
-                         result_text = gr.Markdown(label="Analysis Results")
-
-                     with gr.TabItem("Processed Video"):
-                         video_output = gr.Video(label="Processed Video")
-
-                     with gr.TabItem("Emotion Charts"):
-                         chart_output = gr.Image(label="Emotion Distribution")
-                         comparison_output = gr.Image(label="Modality Comparison")
-
-         analyze_btn.click(
-             process_video_for_gradio,
-             inputs=[video_input],
-             outputs=[video_output, chart_output, comparison_output, result_text]
-         )
-
-         gr.Markdown("""
-         ## How it works
-
-         1. **Visual Analysis**: The app processes video frames to detect faces and body posture
-         2. **Audio Analysis**: The audio is extracted and analyzed for emotional tone
-         3. **Combined Analysis**: The results are weighted and combined for a holistic emotional assessment
-
-         The app uses pretrained models for each modality and combines their outputs using a weighted approach.
-         """)

-     return demo

- # Launch the Gradio app
- if __name__ == "__main__":
-     demo = create_gradio_interface()
-     demo.launch()
 
+ import gradio as gr
  import cv2
  import torch
  import numpy as np
+ import mediapipe as mp
  import matplotlib.pyplot as plt
  import seaborn as sns
  from facenet_pytorch import MTCNN
+ from transformers import AutoFeatureExtractor, AutoModelForImageClassification
  from PIL import Image
  import os
+ from collections import Counter

+ # Load models
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  mtcnn = MTCNN(device=device)
+ model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
+ extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")

+ # Emotion labels
+ affectnet_labels = {
+     0: "neutral", 1: "happy", 2: "sad", 3: "surprise", 4: "fear",
+     5: "disgust", 6: "anger", 7: "contempt"
+ }

  def detect_emotions(frame):
      """Detects facial emotions in a given frame."""
      img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
      faces, _ = mtcnn.detect(img)
      if faces is None or len(faces) == 0:
+         return "No Face Detected"

+     face = img.crop(faces[0])
+     inputs = extractor(images=face, return_tensors="pt").to(device)
+     outputs = model(**inputs)
+     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
+     return model.config.id2label[torch.argmax(probs).item()]

+ def process_video(input_path):
+     """Processes video, overlays emotions, and creates a summary chart."""
+     cap = cv2.VideoCapture(input_path)
      fps = int(cap.get(cv2.CAP_PROP_FPS))
      frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

+     emotion_counts = []

      while cap.isOpened():
          ret, frame = cap.read()
          if not ret:
              break

+         emotion = detect_emotions(frame)
+         emotion_counts.append(emotion)
+
+         # Overlay emotion
+         overlay = frame.copy()
+         cv2.rectangle(overlay, (10, 10), (350, 80), (255, 255, 255), -1)
+         cv2.putText(overlay, f'Emotion: {emotion}', (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
+         cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
+
          out.write(frame)
+
      cap.release()
      out.release()
+     cv2.destroyAllWindows()
+
+     # Find major emotion
+     emotion_counter = Counter(emotion_counts)
+     major_emotion = emotion_counter.most_common(1)[0][0] if emotion_counter else "No Face Detected"
+
+     # Generate emotion distribution pie chart
+     plt.figure(figsize=(5, 5))
+     labels, sizes = zip(*emotion_counter.items())
+     plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
+     plt.title("Emotion Distribution")
+     plt.savefig("emotion_distribution.jpg")
+
+     return "output_video.mp4", plt, major_emotion
+
+ # Gradio Web Interface
+ with gr.Blocks(css="""
+ .gradio-container { max-width: 750px !important; margin: auto; background-color: #f8f9fa; padding: 20px; border-radius: 15px; }
+ .gradio-container h1 { font-size: 22px; text-align: center; color: #333; }
+ .gradio-container .gr-button { background-color: #007bff; color: white; border-radius: 10px; padding: 8px 15px; }
+ .gradio-container .gr-textbox { font-size: 16px; font-weight: bold; color: #007bff; }
+ .gradio-container .gr-file { border-radius: 10px; padding: 5px; }
+ @media screen and (max-width: 768px) {
+     .gradio-container { width: 100%; padding: 10px; }
+     .gradio-container h1 { font-size: 18px; }
+ }
+ """) as demo:
+     gr.Markdown("# 🎭 Emotion Analysis from Video 🎥")
+     gr.Markdown("Upload a video, and the AI will detect emotions in each frame, providing a processed video, an emotion distribution chart, and the major detected emotion.")
+
+     with gr.Row():
+         video_input = gr.File(label="📤 Upload Video (MP4, MOV, AVI)")

+     with gr.Row():
+         process_button = gr.Button("🚀 Analyze")

+     with gr.Row():
+         video_output = gr.File(label="📥 Processed Video")
+         emotion_chart = gr.Plot(label="📊 Emotion Distribution Chart")

+     major_emotion_output = gr.Textbox(label="🔥 Major Emotion Detected", interactive=False)
+
+     process_button.click(fn=process_video, inputs=video_input, outputs=[video_output, emotion_chart, major_emotion_output])

+ demo.launch()
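
A minimal sketch for sanity-checking the new per-frame path locally (illustrative only, not part of the commit: the script name check_frame.py and the test image sample_frame.jpg are assumptions; it repeats the MTCNN + trpakov/vit-face-expression calls from the new app.py rather than importing it, since importing app.py would launch the Gradio app):

# check_frame.py -- hypothetical single-frame check, mirrors detect_emotions() in the new app.py
import cv2
import torch
from PIL import Image
from facenet_pytorch import MTCNN
from transformers import AutoFeatureExtractor, AutoModelForImageClassification

device = 'cuda' if torch.cuda.is_available() else 'cpu'
mtcnn = MTCNN(device=device)
model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")

frame = cv2.imread("sample_frame.jpg")   # assumed test image
img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
boxes, _ = mtcnn.detect(img)             # face bounding boxes or None
if boxes is None or len(boxes) == 0:
    print("No Face Detected")
else:
    face = img.crop(tuple(boxes[0]))     # (left, upper, right, lower)
    inputs = extractor(images=face, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    print(model.config.id2label[logits.argmax(-1).item()])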