Sagnik1750 committed
Commit 0340596 · verified · 1 Parent(s): b353d28

Delete app.py

Files changed (1): app.py +0 -326
app.py DELETED
@@ -1,326 +0,0 @@
- import cv2
- import mediapipe as mp
- import torch
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- from facenet_pytorch import MTCNN
- from transformers import AutoFeatureExtractor, AutoModelForImageClassification, AutoProcessor, AutoModelForAudioClassification
- from PIL import Image
- import moviepy.editor as moviepy
- import librosa
- import os
-
- # Initialize device
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- # Initialize visual models
- mp_pose = mp.solutions.pose
- pose = mp_pose.Pose()
- mtcnn = MTCNN(device=device)
- face_model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
- face_extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
-
- # Initialize audio model
- audio_model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
- audio_processor = AutoProcessor.from_pretrained(audio_model_name)
- audio_model = AutoModelForAudioClassification.from_pretrained(audio_model_name).to(device)
- audio_sampling_rate = 16000
-
- def calculate_angle(a, b, c):
-     """Calculates the angle between three points."""
-     a, b, c = np.array(a), np.array(b), np.array(c)
-     ba, bc = a - b, c - b
-     cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
-     return np.degrees(np.arccos(np.clip(cosine_angle, -1.0, 1.0)))
-
- def detect_emotions(frame):
-     """Detects facial emotions in a given frame."""
-     img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-     faces, _ = mtcnn.detect(img)
-
-     if faces is None or len(faces) == 0:
-         return "Neutral"  # Default to neutral if no face is detected
-
-     face = img.crop(faces[0])
-     inputs = face_extractor(images=face, return_tensors="pt").to(device)
-     outputs = face_model(**inputs)
-     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-     return face_model.config.id2label[torch.argmax(probs).item()]
-
- def classify_posture(back_angle, neck_angle):
-     """Classifies posture based on back and neck angles."""
-     if back_angle > 170 and neck_angle > 150:
-         return "Confident"
-     elif back_angle < 160 and neck_angle < 140:
-         return "Nervous"
-     elif back_angle < 150:
-         return "Defensive"
-     elif neck_angle < 130:
-         return "Serious"
-     else:
-         return "Attentive"
-
- def extract_audio(video_path):
-     """Extracts audio from video file and saves it as WAV."""
-     audio_path = "extracted_audio.wav"
-     video = moviepy.VideoFileClip(video_path)
-     video.audio.write_audiofile(audio_path, codec='pcm_s16le', verbose=False)
-     return audio_path
-
- def analyze_audio_emotion(audio_path):
-     """Analyzes emotion from audio file and returns emotion counts."""
-     # Load audio
-     y, sr = librosa.load(audio_path, sr=audio_sampling_rate)
-
-     # Process audio in chunks to avoid memory issues
-     chunk_length = audio_sampling_rate * 5  # 5 seconds
-     emotion_counts = {}
-     audio_emotions = []
-
-     # Process audio in chunks
-     for i in range(0, len(y), chunk_length):
-         chunk = y[i:min(i+chunk_length, len(y))]
-
-         # Skip chunks that are too short
-         if len(chunk) < audio_sampling_rate:
-             continue
-
-         # Process audio with the model
-         inputs = audio_processor(chunk, sampling_rate=audio_sampling_rate, return_tensors="pt").to(device)
-         with torch.no_grad():
-             outputs = audio_model(**inputs)
-
-         # Get prediction
-         predicted_class_id = torch.argmax(outputs.logits, dim=1).item()
-         emotion = audio_model.config.id2label[predicted_class_id]
-         audio_emotions.append(emotion)
-         emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
-
-     return emotion_counts, audio_emotions
-
- def map_emotion_labels(emotion, source="face"):
-     """Standardizes emotion labels across different models."""
-     # Mapping dictionaries for different models
-     face_mapping = {
-         "happy": "Happy",
-         "sad": "Sad",
-         "angry": "Angry",
-         "surprise": "Surprised",
-         "fear": "Fearful",
-         "disgust": "Disgusted",
-         "neutral": "Neutral"
-     }
-
-     audio_mapping = {
-         "anger": "Angry",
-         "disgust": "Disgusted",
-         "fear": "Fearful",
-         "joy": "Happy",
-         "neutral": "Neutral",
-         "sadness": "Sad",
-         "surprise": "Surprised"
-     }
-
-     posture_mapping = {
-         "Confident": "Confident",
-         "Nervous": "Nervous",
-         "Defensive": "Defensive",
-         "Serious": "Serious",
-         "Attentive": "Attentive"
-     }
-
-     if source == "face":
-         return face_mapping.get(emotion.lower(), emotion)
-     elif source == "audio":
-         return audio_mapping.get(emotion.lower(), emotion)
-     elif source == "posture":
-         return posture_mapping.get(emotion, emotion)
-
-     return emotion
-
- def draw_multimodal_sentiment_bar(frame, face_emotion, posture_label, audio_emotion, major_emotion, major_emotion_percent):
-     """Draws multimodal emotion and posture sentiment on the frame."""
-     overlay = frame.copy()
-     cv2.rectangle(overlay, (10, 10), (450, 200), (255, 255, 255), -1)
-
-     # Display current emotions
-     cv2.putText(overlay, f'Face Emotion: {face_emotion}', (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-     cv2.putText(overlay, f'Posture: {posture_label}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-     cv2.putText(overlay, f'Audio Emotion: {audio_emotion}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
-
-     # Display major emotion
-     cv2.putText(overlay, f'Major Emotion: {major_emotion} ({major_emotion_percent:.1f}%)', (20, 130), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)
-
-     # Add explanation
-     reason_text = 'Weighted combination of face, posture, and audio analysis'
-     cv2.putText(overlay, f'Analysis: {reason_text}', (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-
-     # Blend overlay with original frame
-     cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
-
- def generate_multimodal_charts(face_emotion_counts, posture_counts, audio_emotion_counts):
-     """Generates charts for all emotion modalities."""
-     # Create a figure with 3 subplots
-     fig, axs = plt.subplots(1, 3, figsize=(18, 6))
-
-     # Face emotions pie chart
-     labels, sizes = zip(*face_emotion_counts.items()) if face_emotion_counts else (["None"], [1])
-     axs[0].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Blues'))
-     axs[0].set_title("Facial Emotions")
-
-     # Posture pie chart
-     labels, sizes = zip(*posture_counts.items()) if posture_counts else (["None"], [1])
-     axs[1].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Greens'))
-     axs[1].set_title("Posture Analysis")
-
-     # Audio emotions pie chart
-     labels, sizes = zip(*audio_emotion_counts.items()) if audio_emotion_counts else (["None"], [1])
-     axs[2].pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('Reds'))
-     axs[2].set_title("Audio Emotions")
-
-     plt.tight_layout()
-     plt.savefig("multimodal_emotion_analysis.jpg")
-     plt.close()
-
-     # Create combined emotions bar chart
-     plt.figure(figsize=(12, 6))
-
-     # Combine all emotions across modalities
-     all_emotions = set()
-     for counts in [face_emotion_counts, audio_emotion_counts]:
-         all_emotions.update(counts.keys())
-
-     # Prepare data for each emotion across modalities
-     emotions = list(all_emotions)
-     face_values = [face_emotion_counts.get(e, 0) for e in emotions]
-     audio_values = [audio_emotion_counts.get(e, 0) for e in emotions]
-
-     # Normalize values
-     if sum(face_values) > 0:
-         face_values = [v/sum(face_values)*100 for v in face_values]
-     if sum(audio_values) > 0:
-         audio_values = [v/sum(audio_values)*100 for v in audio_values]
-
-     # Create bar chart
-     x = np.arange(len(emotions))
-     width = 0.35
-
-     fig, ax = plt.subplots(figsize=(14, 8))
-     ax.bar(x - width/2, face_values, width, label='Face')
-     ax.bar(x + width/2, audio_values, width, label='Audio')
-
-     ax.set_title('Emotion Distribution by Modality')
-     ax.set_xlabel('Emotions')
-     ax.set_ylabel('Percentage (%)')
-     ax.set_xticks(x)
-     ax.set_xticklabels(emotions)
-     ax.legend()
-
-     plt.tight_layout()
-     plt.savefig("emotion_comparison.jpg")
-     plt.close()
-
- def calculate_combined_sentiment(face_emotion_counts, posture_counts, audio_emotion_counts):
-     """Calculates a combined sentiment score from all modalities."""
-     # Define emotion categories and weights
-     modality_weights = {
-         "face": 0.4,
-         "posture": 0.2,
-         "audio": 0.4
-     }
-
-     # Map posture labels to emotional states for better combination
-     posture_emotion_mapping = {
-         "Confident": "Happy",
-         "Nervous": "Fearful",
-         "Defensive": "Angry",
-         "Serious": "Neutral",
-         "Attentive": "Neutral"
-     }
-
-     # Convert posture counts to emotion counts
-     posture_emotion_counts = {}
-     for posture, count in posture_counts.items():
-         emotion = posture_emotion_mapping.get(posture, "Neutral")
-         posture_emotion_counts[emotion] = posture_emotion_counts.get(emotion, 0) + count
-
-     # Get all unique emotions across all modalities
-     all_emotions = set()
-     for counts in [face_emotion_counts, posture_emotion_counts, audio_emotion_counts]:
-         all_emotions.update(counts.keys())
-
-     # Calculate total frames/samples for each modality
-     face_total = sum(face_emotion_counts.values())
-     posture_total = sum(posture_counts.values())
-     audio_total = sum(audio_emotion_counts.values())
-
-     # Calculate weighted emotion scores
-     combined_scores = {}
-
-     for emotion in all_emotions:
-         # Get normalized scores from each modality (or 0 if not present)
-         face_score = face_emotion_counts.get(emotion, 0) / face_total if face_total > 0 else 0
-         posture_score = posture_emotion_counts.get(emotion, 0) / posture_total if posture_total > 0 else 0
-         audio_score = audio_emotion_counts.get(emotion, 0) / audio_total if audio_total > 0 else 0
-
-         # Calculate weighted score
-         weighted_score = (
-             face_score * modality_weights["face"] +
-             posture_score * modality_weights["posture"] +
-             audio_score * modality_weights["audio"]
-         )
-
-         combined_scores[emotion] = weighted_score
-
-     # Normalize to percentages
-     total_score = sum(combined_scores.values())
-     if total_score > 0:
-         for emotion in combined_scores:
-             combined_scores[emotion] = (combined_scores[emotion] / total_score) * 100
-
-     # Get the major emotion
-     major_emotion = max(combined_scores.items(), key=lambda x: x[1]) if combined_scores else ("Unknown", 0)
-
-     return combined_scores, major_emotion[0], major_emotion[1]
-
- def process_video(input_path):
-     """Processes the video with multimodal sentiment analysis."""
-     # Extract audio first
-     print("Extracting audio from video...")
-     audio_path = extract_audio(input_path)
-
-     # Analyze audio emotions
-     print("Analyzing audio emotions...")
-     audio_emotion_counts, audio_emotions_sequence = analyze_audio_emotion(audio_path)
-
-     # Process video frames
-     print("Processing video frames...")
-     cap = cv2.VideoCapture(input_path)
-     fps = int(cap.get(cv2.CAP_PROP_FPS))
-     frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-     out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
-
-     # Initialize counters
-     face_emotion_counts = {}
-     posture_counts = {}
-     total_frames = 0
-     frame_index = 0
-
-     # Get total frames for progress tracking
-     total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-     # Calculate frames per audio segment
-     audio_segments = len(audio_emotions_sequence)
-     frames_per_audio = max(1, total_video_frames // audio_segments) if audio_segments > 0 else 1
-     current_audio_index = 0
-
-     while cap.isOpened():
-         ret, frame = cap.read()
-         if not ret:
-             break
-
-         # Update progress
-         frame_index += 1
-         if frame_index % 30 == 0:  # Show progress every 30 frames
-             print(f"Processing frame {frame_index}/{total_video_frames}