Sagnik1750 committed on
Commit
b353d28
·
verified ·
1 Parent(s): a18c5a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +317 -154
app.py CHANGED
@@ -1,163 +1,326 @@
1
- import torch
2
- import torchaudio
3
  import cv2
4
- import librosa
 
5
  import numpy as np
6
- import gradio as gr
7
  import matplotlib.pyplot as plt
8
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
9
- from deepface import DeepFace
10
- from moviepy.editor import VideoFileClip
11
-
12
# --- Pretrained models (everything is placed on the CPU) ---

# Speech-to-text: Wav2Vec2 processor plus a CTC model from the same checkpoint.
asr_model_name = "facebook/wav2vec2-large-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
asr_model = asr_model.to("cpu")

# Text emotion classifier: tokenizer plus sequence-classification model.
emotion_model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
emotion_model = emotion_model.to("cpu")

# Emotion categories used by the rest of the module, keyed by integer id.
emotion_labels = {
    0: "Neutral",
    1: "Happy",
    2: "Sad",
    3: "Surprise",
    4: "Fear",
    5: "Disgust",
    6: "Anger",
    7: "Contempt",
}
28
-
29
- # --- Extract Audio from Video ---
30
def extract_audio(video_path, audio_output_path="temp_audio.wav"):
    """Write the audio track of a video to a WAV file (``pcm_s16le`` codec).

    Args:
        video_path: Path of the video file to read.
        audio_output_path: Destination path of the WAV file.

    Returns:
        ``audio_output_path``, once the file has been written.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_output_path, codec="pcm_s16le")
    finally:
        # Always release the clip's readers, even when writing fails.
        video.close()
    return audio_output_path
34
-
35
- # --- Extract Frames for Facial & Posture Analysis ---
36
def extract_frames(video_path, interval=10):
    """Return every ``interval``-th frame of a video as an RGB array.

    Frames 0, ``interval``, ``2 * interval``, ... are kept, which matches
    slicing the full frame list with ``[::interval]``. Only the kept frames
    are colour-converted and stored, so memory use no longer grows with the
    total number of frames in the video.

    Args:
        video_path: Path of the video file to read.
        interval: Keep one frame out of this many; must be >= 1.

    Returns:
        A list of frames converted from BGR to RGB.

    Raises:
        ValueError: If ``interval`` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_index = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % interval == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_index += 1
    finally:
        # Release the capture handle even if decoding raised.
        cap.release()
    return frames
49
-
50
- # --- Normalize Emotion Percentages to 100% ---
51
def normalize_emotion_percentages(emotion_counts):
    """Convert raw emotion counts to percentages that add up to 100.

    Each share is rounded to one decimal place. Any rounding remainder is
    added to the largest share, and that share is rounded again so it stays
    a clean one-decimal value (e.g. 33.4 rather than 33.400000000000006).

    Args:
        emotion_counts: Mapping of emotion label to a non-negative count.

    Returns:
        A new dict with the same keys. Values are percentages, or all ``0``
        when the counts sum to zero.
    """
    print("Raw emotion counts:", emotion_counts)  # Debugging
    total = sum(emotion_counts.values())
    if total <= 0:
        return {k: 0 for k in emotion_counts}

    normalized_counts = {
        k: round((v / total) * 100, 1) for k, v in emotion_counts.items()
    }

    # Round the remainder too, so float noise is never mistaken for a gap.
    diff = round(100 - sum(normalized_counts.values()), 1)
    if diff:
        max_emotion = max(normalized_counts, key=normalized_counts.get)
        normalized_counts[max_emotion] = round(normalized_counts[max_emotion] + diff, 1)

    print("Normalized emotion counts:", normalized_counts)  # Debugging
    return normalized_counts
68
-
69
- # --- Facial Emotion Analysis ---
70
def analyze_facial_emotion(frames):
    """Count the dominant facial emotion per frame and return percentages.

    Args:
        frames: Iterable of image frames passed to ``DeepFace.analyze``.

    Returns:
        The result of ``normalize_emotion_percentages`` applied to the
        per-category counts (one key per value of ``emotion_labels``).
    """
    emotion_counts = {key: 0 for key in emotion_labels.values()}

    # DeepFace names this emotion "angry", but the module's category is
    # "Anger". Without the alias every angry frame would be dropped.
    label_aliases = {"Angry": "Anger"}

    for frame in frames:
        try:
            result = DeepFace.analyze(frame, actions=["emotion"], enforce_detection=False)
            detected_emotion = result[0]["dominant_emotion"].capitalize()
        except Exception as exc:
            # Best effort: a frame that cannot be analysed is skipped, but
            # the reason is reported instead of being silently discarded.
            print("Skipping frame, facial analysis failed:", exc)
            continue

        print("Detected emotion:", detected_emotion)  # Debugging
        category = label_aliases.get(detected_emotion, detected_emotion)
        if category in emotion_counts:
            emotion_counts[category] += 1

    return normalize_emotion_percentages(emotion_counts)
84
-
85
- # --- Speech-to-Text ---
86
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level ASR model.

    The audio is loaded at 16 kHz, run through ``asr_model`` without
    gradient tracking, and the arg-max token ids are decoded by
    ``asr_processor``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The first decoded transcription string.
    """
    waveform, _ = librosa.load(audio_path, sr=16000)
    encoded = asr_processor(waveform, return_tensors="pt", sampling_rate=16000)

    with torch.no_grad():
        logits = asr_model(encoded.input_values).logits

    token_ids = torch.argmax(logits, dim=-1)
    transcripts = asr_processor.batch_decode(token_ids)
    return transcripts[0]
95
-
96
- # --- Sentiment Analysis from Text ---
97
def analyze_audio_emotion(text):
    """Classify the emotion expressed in a piece of text.

    Args:
        text: The text to classify (here, the speech transcription).

    Returns:
        A tuple ``(predicted_emotion, probabilities)``:
        ``predicted_emotion`` is the label of the highest-scoring class and
        ``probabilities`` is the softmax output converted to a Python list.
    """
    inputs = emotion_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

    # Decode with the classifier's own label table. The module-level
    # ``emotion_labels`` dict lists facial categories and does not describe
    # this model's output classes, so indexing it gives wrong names.
    predicted_id = torch.argmax(logits, dim=-1).item()
    predicted_emotion = emotion_model.config.id2label[predicted_id]

    return predicted_emotion, probabilities
106
-
107
- # --- Full Analysis Pipeline ---
108
def analyze_video(video_path):
    """Run the full audio + facial analysis pipeline on one video.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        A 5-tuple ``(transcription, audio_emotion, final_emotion,
        facial_emotions, chart_path)`` where ``chart_path`` is the file name
        of the saved pie chart (``"emotion_pie_chart.png"``).
    """
    # Audio track and sampled frames
    audio_path = extract_audio(video_path)
    frames = extract_frames(video_path)

    # Facial emotion distribution
    facial_emotions = analyze_facial_emotion(frames)

    # Speech transcription and text-based emotion
    transcription = transcribe_audio(audio_path)
    audio_emotion, _ = analyze_audio_emotion(transcription)

    # The major emotion is the facial category with the largest share.
    final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"

    # Pie chart of the facial distribution
    chart_path = "emotion_pie_chart.png"
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        if any(share > 0 for share in facial_emotions.values()):
            ax.pie(
                list(facial_emotions.values()),
                labels=list(facial_emotions.keys()),
                autopct="%1.1f%%",
                colors=plt.cm.Paired.colors,
            )
        else:
            # A pie cannot be drawn when every share is zero, so show a
            # placeholder message instead.
            ax.text(0.5, 0.5, "No facial emotion detected", ha="center", va="center")
            ax.axis("off")
        ax.set_title("Facial Emotion Distribution")
        fig.savefig(chart_path)
    finally:
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    return (
        transcription,
        audio_emotion,
        final_emotion,
        facial_emotions,
        chart_path,
    )
138
-
139
- # --- Gradio UI ---
140
# Custom CSS handed to the Gradio interface.
theme_css = """
body { font-family: Arial, sans-serif; background: #f4f4f4; }
.gradio-container { max-width: 800px; margin: auto; padding: 20px; background: white; border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
.gr-box { border-radius: 10px; padding: 15px; background: #fff; }
h1 { color: #333; text-align: center; }
"""

# Input first, then the outputs in the order analyze_video returns them.
video_input = gr.Video()
result_components = [
    gr.Textbox(label="Transcribed Speech"),
    gr.Textbox(label="Predicted Audio Emotion"),
    gr.Textbox(label="Major Detected Emotion (Face + Posture)"),
    gr.Label(label="Facial Emotion Distribution"),
    gr.Image(label="Facial Emotion Pie Chart"),
]

interface_description = (
    "📌 Upload a video and get analyzed emotions from **facial expressions, posture, and voice** in one step.\n\n"
    "🚀 Features:\n"
    "- Facial Emotion Analysis\n"
    "- Audio-Based Sentiment Detection\n"
    "- Real-Time Processing\n"
    "- Visual Pie Chart Representation"
)

interface = gr.Interface(
    fn=analyze_video,
    inputs=video_input,
    outputs=result_components,
    title="🎭 Multi-Modal Emotion Analysis",
    description=interface_description,
    theme="compact",
    css=theme_css,
)

# Start the app as soon as the module is executed.
interface.launch()
 
 
 
1
  import cv2
2
+ import mediapipe as mp
3
+ import torch
4
  import numpy as np
 
5
  import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ from facenet_pytorch import MTCNN
8
+ from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoProcessor, AutoModelForAudioClassification
9
+ from PIL import Image
10
+ import moviepy.editor as moviepy
11
+ import librosa
12
+ import os
13
+
14
# Run on the GPU when torch reports one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Visual models: MediaPipe pose estimator, MTCNN face detector, and an
# image classifier with its feature extractor for facial expressions.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()
mtcnn = MTCNN(device=device)
_face_model_name = "trpakov/vit-face-expression"
face_model = AutoModelForImageClassification.from_pretrained(_face_model_name).to(device)
face_extractor = AutoFeatureExtractor.from_pretrained(_face_model_name)

# Audio model: audio classifier, its processor, and the sampling rate
# (in Hz) that audio is loaded at before being fed to it.
audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
audio_processor = AutoProcessor.from_pretrained(audio_model_name)
audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
audio_sampling_rate = 16000
29
+
30
def calculate_angle(a, b, c):
    """Return the angle at vertex ``b`` formed by points ``a``, ``b``, ``c``.

    Args:
        a: Coordinates of the first point.
        b: Coordinates of the vertex.
        c: Coordinates of the third point.

    Returns:
        The angle in degrees, in the range [0, 180]. ``nan`` is returned
        when ``a`` or ``c`` coincides with ``b``, because the angle is then
        undefined.
    """
    a, b, c = np.array(a), np.array(b), np.array(c)
    ba, bc = a - b, c - b

    norm_product = np.linalg.norm(ba) * np.linalg.norm(bc)
    if norm_product == 0:
        # Degenerate input: avoid the 0/0 division and its RuntimeWarning.
        return np.nan

    cosine_angle = np.dot(ba, bc) / norm_product
    # Clip so rounding error cannot push the cosine outside arccos' domain.
    return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
36
+
37
def detect_emotions(frame):
    """Classify the facial expression of the first face found in a frame.

    Args:
        frame: Image array in BGR channel order (it is converted to RGB
            before detection).

    Returns:
        The label from ``face_model.config.id2label`` for the top-scoring
        class, or ``"Neutral"`` when no face is detected.
    """
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    faces, _ = mtcnn.detect(img)

    if faces is None or len(faces) == 0:
        return "Neutral"  # Default to neutral if no face is detected

    face = img.crop(faces[0])
    inputs = face_extractor(images=face, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking, as the audio path does.
    with torch.no_grad():
        outputs = face_model(**inputs)

    # Softmax is monotonic, so the arg-max of the logits is the same class.
    predicted_class_id = torch.argmax(outputs.logits, dim=-1).item()
    return face_model.config.id2label[predicted_class_id]
50
+
51
def classify_posture(back_angle, neck_angle):
    """Map a back angle and a neck angle to a posture label.

    Rules are checked in order and the first match wins; ``"Attentive"`` is
    the fallback when none applies.

    Args:
        back_angle: Back angle, compared against fixed thresholds.
        neck_angle: Neck angle, compared against fixed thresholds.

    Returns:
        One of ``"Confident"``, ``"Nervous"``, ``"Defensive"``,
        ``"Serious"`` or ``"Attentive"``.
    """
    rules = (
        (lambda back, neck: back > 170 and neck > 150, "Confident"),
        (lambda back, neck: back < 160 and neck < 140, "Nervous"),
        (lambda back, neck: back < 150, "Defensive"),
        (lambda back, neck: neck < 130, "Serious"),
    )
    for matches, label in rules:
        if matches(back_angle, neck_angle):
            return label
    return "Attentive"
63
+
64
def extract_audio(video_path):
    """Extract the audio track of a video and save it as a WAV file.

    The audio is written to ``extracted_audio.wav`` with the ``pcm_s16le``
    codec.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The path of the written WAV file.

    Raises:
        ValueError: If the video has no audio track.
    """
    audio_path = "extracted_audio.wav"
    video = moviepy.VideoFileClip(video_path)
    try:
        if video.audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False)
    finally:
        # Always release the clip's readers, even when writing fails.
        video.close()
    return audio_path
70
+
71
def analyze_audio_emotion(audio_path):
    """Classify the emotion of an audio file in consecutive 5-second windows.

    The file is loaded at ``audio_sampling_rate`` and split into windows of
    five seconds. Windows shorter than one second are skipped; every other
    window is classified by ``audio_model``.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A tuple ``(emotion_counts, audio_emotions)``: a dict mapping each
        predicted label to the number of windows it was predicted for, and
        the list of predicted labels in window order.
    """
    waveform, _ = librosa.load(audio_path, sr=audio_sampling_rate)

    window = audio_sampling_rate * 5   # samples per 5-second window
    min_samples = audio_sampling_rate  # anything under 1 second is skipped

    audio_emotions = []
    emotion_counts = {}

    for start in range(0, len(waveform), window):
        # Slicing clamps at the end of the array, so the last window may be short.
        segment = waveform[start:start + window]
        if len(segment) < min_samples:
            continue

        features = audio_processor(
            segment, sampling_rate=audio_sampling_rate, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            logits = audio_model(**features).logits

        label = audio_model.config.id2label[torch.argmax(logits, dim=1).item()]
        audio_emotions.append(label)
        emotion_counts[label] = emotion_counts.get(label, 0) + 1

    return emotion_counts, audio_emotions
101
+
102
def map_emotion_labels(emotion, source="face"):
    """Standardize an emotion label coming from one of the models.

    Face and audio labels are looked up case-insensitively; posture labels
    are looked up as given. A label with no mapping, or an unrecognised
    ``source``, is returned unchanged.

    Args:
        emotion: The raw label string.
        source: ``"face"``, ``"audio"`` or ``"posture"``.

    Returns:
        The standardized label, or ``emotion`` itself when nothing matches.
    """
    face_mapping = {
        "happy": "Happy",
        "sad": "Sad",
        "angry": "Angry",
        "surprise": "Surprised",
        "fear": "Fearful",
        "disgust": "Disgusted",
        "neutral": "Neutral",
    }

    audio_mapping = {
        "anger": "Angry",
        "disgust": "Disgusted",
        "fear": "Fearful",
        "joy": "Happy",
        "neutral": "Neutral",
        "sadness": "Sad",
        "surprise": "Surprised",
        # Adjective-style labels, as listed on the model card of the
        # speech-emotion checkpoint this app loads. Without these entries
        # such labels pass through unmapped and never line up with the
        # face labels above.
        "angry": "Angry",
        "calm": "Calm",
        "fearful": "Fearful",
        "happy": "Happy",
        "sad": "Sad",
        "surprised": "Surprised",
    }

    posture_mapping = {
        "Confident": "Confident",
        "Nervous": "Nervous",
        "Defensive": "Defensive",
        "Serious": "Serious",
        "Attentive": "Attentive",
    }

    if source == "face":
        return face_mapping.get(emotion.lower(), emotion)
    if source == "audio":
        return audio_mapping.get(emotion.lower(), emotion)
    if source == "posture":
        return posture_mapping.get(emotion, emotion)

    return emotion
141
+
142
def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
    """Draw a semi-transparent summary panel onto ``frame`` in place.

    A white rectangle with five text rows (face emotion, posture, audio
    emotion, major emotion with its percentage, and a fixed explanation) is
    drawn on a copy of the frame, which is then blended back into ``frame``.

    Args:
        frame: Image array to annotate; it is modified in place.
        face_emotion: Label shown on the "Face Emotion" row.
        posture_label: Label shown on the "Posture" row.
        audio_emotion: Label shown on the "Audio Emotion" row.
        major_emotion: Label shown on the "Major Emotion" row.
        major_emotion_percent: Number formatted with one decimal place.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    reason_text = 'Weighted combination of face, posture, and audio analysis'

    # (text, baseline y, font scale, colour, thickness) for each row.
    text_rows = (
        (f'Face Emotion: {face_emotion}', 40, 0.6, (0, 0, 255), 2),
        (f'Posture: {posture_label}', 70, 0.6, (0, 0, 255), 2),
        (f'Audio Emotion: {audio_emotion}', 100, 0.6, (0, 0, 255), 2),
        (f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', 130, 0.6, (255, 0, 0), 2),
        (f'Analysis: {reason_text}', 160, 0.5, (0, 255, 0), 1),
    )

    panel = frame.copy()
    cv2.rectangle(panel, (10, 10), (450, 200), (255, 255, 255), -1)
    for text, baseline_y, scale, color, thickness in text_rows:
        cv2.putText(panel, text, (20, baseline_y), font, scale, color, thickness)

    # Blend the annotated copy into the original frame (60% panel, 40% frame).
    cv2.addWeighted(panel, 0.6, frame, 0.4, 0, frame)
161
+
162
def _to_percentages(values):
    """Scale ``values`` to sum to 100; return them unchanged if the sum is not positive."""
    total = sum(values)
    if total > 0:
        return [v / total * 100 for v in values]
    return values


def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
    """Save summary charts for the face, posture and audio modalities.

    Two files are written to the working directory:

    * ``multimodal_emotion_analysis.jpg``: three pie charts, one per
      modality. An empty modality is drawn as a single "None" slice.
    * ``emotion_comparison.jpg``: grouped bars comparing face and audio
      percentages per emotion, with emotions in alphabetical order.

    Args:
        face_emotion_counts: Mapping of emotion label to count (face).
        posture_counts: Mapping of posture label to count.
        audio_emotion_counts: Mapping of emotion label to count (audio).
    """
    pie_specs = (
        (face_emotion_counts, 'Blues', "Facial Emotions"),
        (posture_counts, 'Greens', "Posture Analysis"),
        (audio_emotion_counts, 'Reds', "Audio Emotions"),
    )

    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
    for ax, (counts, palette, title) in zip(axs, pie_specs):
        labels, sizes = zip(*counts.items()) if counts else (["None"], [1])
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette(palette))
        ax.set_title(title)
    fig.tight_layout()
    fig.savefig("multimodal_emotion_analysis.jpg")
    plt.close(fig)

    # Sorted so the bar order is the same on every run.
    emotions = sorted(set(face_emotion_counts) | set(audio_emotion_counts))
    face_values = _to_percentages([face_emotion_counts.get(e, 0) for e in emotions])
    audio_values = _to_percentages([audio_emotion_counts.get(e, 0) for e in emotions])

    x = np.arange(len(emotions))
    width = 0.35

    # Only this figure is created for the bar chart and it is closed
    # explicitly, so no extra figure is left open after the call.
    fig, ax = plt.subplots(figsize=(14, 8))
    ax.bar(x - width / 2, face_values, width, label='Face')
    ax.bar(x + width / 2, audio_values, width, label='Audio')

    ax.set_title('Emotion Distribution by Modality')
    ax.set_xlabel('Emotions')
    ax.set_ylabel('Percentage (%)')
    ax.set_xticks(x)
    ax.set_xticklabels(emotions)
    ax.legend()

    fig.tight_layout()
    fig.savefig("emotion_comparison.jpg")
    plt.close(fig)
223
+
224
def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
    """Combine face, posture and audio counts into one emotion distribution.

    Each modality's counts are turned into proportions and weighted
    (face 0.4, posture 0.2, audio 0.4). Posture labels are first translated
    to emotions; an unrecognised posture counts as ``"Neutral"``. The
    weighted scores are then rescaled to percentages summing to 100. A
    modality with no samples contributes nothing.

    Emotions are processed in alphabetical order, so the result, and which
    emotion wins when scores tie, is the same on every run.

    Args:
        face_emotion_counts: Mapping of emotion label to count (face).
        posture_counts: Mapping of posture label to count.
        audio_emotion_counts: Mapping of emotion label to count (audio).

    Returns:
        A tuple ``(combined_scores, major_emotion, major_percent)``.
        ``combined_scores`` maps each emotion to its percentage. With no
        input at all the result is ``({}, "Unknown", 0)``.
    """
    modality_weights = {
        "face": 0.4,
        "posture": 0.2,
        "audio": 0.4,
    }

    # Map posture labels to emotional states for better combination.
    posture_emotion_mapping = {
        "Confident": "Happy",
        "Nervous": "Fearful",
        "Defensive": "Angry",
        "Serious": "Neutral",
        "Attentive": "Neutral",
    }

    posture_emotion_counts = {}
    for posture, count in posture_counts.items():
        emotion = posture_emotion_mapping.get(posture, "Neutral")
        posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count

    # Sorted rather than raw set order: string hashing varies between
    # interpreter runs, which made ties resolve differently each time.
    all_emotions = sorted(
        set(face_emotion_counts) | set(posture_emotion_counts) | set(audio_emotion_counts)
    )

    face_total = sum(face_emotion_counts.values())
    posture_total = sum(posture_counts.values())
    audio_total = sum(audio_emotion_counts.values())

    combined_scores = {}
    for emotion in all_emotions:
        face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
        posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
        audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0

        combined_scores[emotion] = (
            face_score * modality_weights["face"]
            + posture_score * modality_weights["posture"]
            + audio_score * modality_weights["audio"]
        )

    # Rescale to percentages (skipped when every score is zero).
    total_score = sum(combined_scores.values())
    if total_score > 0:
        combined_scores = {
            emotion: (score / total_score) * 100
            for emotion, score in combined_scores.items()
        }

    if not combined_scores:
        return combined_scores, "Unknown", 0

    # max() keeps the first maximum, i.e. the alphabetically first on a tie.
    major_emotion, major_percent = max(combined_scores.items(), key=lambda item: item[1])
    return combined_scores, major_emotion, major_percent
286
+
287
+ def process_video(input_path):
288
+ """Processes the video with multimodal sentiment analysis."""
289
+ # Extract audio first
290
+ print("Extracting audio from video...")
291
+ audio_path = extract_audio(input_path)
292
+
293
+ # Analyze audio emotions
294
+ print("Analyzing audio emotions...")
295
+ audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
296
+
297
+ # Process video frames
298
+ print("Processing video frames...")
299
+ cap = cv2.VideoCapture(input_path)
300
+ fps = int(cap.get(cv2.CAP_PROP_FPS))
301
+ frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
302
+ out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
303
+
304
+ # Initialize counters
305
+ face_emotion_counts = {}
306
+ posture_counts = {}
307
+ total_frames = 0
308
+ frame_index = 0
309
+
310
+ # Get total frames for progress tracking
311
+ total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
312
+
313
+ # Calculate frames per audio segment
314
+ audio_segments = len(audio_emotions_sequence)
315
+ frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
316
+ current_audio_index = 0
317
 
318
  while cap.isOpened():
319
  ret, frame = cap.read()
320
  if not ret:
321
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
+ # Update progress
324
+ frame_index += 1
325
+ if frame_index % 30 == 0: # Show progress every 30 frames
326
+ print(f"Processing frame {frame_index}/{total_video_frames}