ans123 committed
Commit 2f82426 · verified · 1 Parent(s): 10cf743

Create app.py
Files changed (1): app.py (+1312, -0)
app.py (added, 1312 lines):
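# app.py overview: a Gradio app that analyzes the facial expressions of people watching
# advertisements. It combines dlib 68-point landmarks, DeepFace emotion classification and
# Google Gemini interpretation, and exposes two endpoints: a video-file API and a live
# webcam API (both defined further below).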
import gradio as gr
import cv2
import numpy as np
import pandas as pd
import time
import dlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.collections import LineCollection
import os
import datetime
import tempfile
from typing import Dict, List, Tuple, Optional, Union, Any
import google.generativeai as genai
from PIL import Image
import json
import warnings
from deepface import DeepFace
import base64
import io
from pathlib import Path

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Constants ---
VIDEO_FPS = 30  # Target FPS for saved video
CSV_FILENAME_TEMPLATE = "facial_analysis_{timestamp}.csv"
VIDEO_FILENAME_TEMPLATE = "processed_{timestamp}.mp4"
TEMP_DIR = Path("temp_frames")
TEMP_DIR.mkdir(exist_ok=True)

# --- Configure Google Gemini API ---
print("Configuring Google Gemini API...")
try:
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    if not GOOGLE_API_KEY:
        raise ValueError("GOOGLE_API_KEY environment variable not set.")

    genai.configure(api_key=GOOGLE_API_KEY)
    # Use gemini-2.0-flash for quick responses
    model = genai.GenerativeModel('gemini-2.0-flash')
    GEMINI_ENABLED = True
    print("Google Gemini API configured successfully.")
except Exception as e:
    print(f"WARNING: Failed to configure Google Gemini API: {e}")
    print("Running with simulated Gemini API responses.")
    GEMINI_ENABLED = False

# --- Initialize dlib and DeepFace for facial analysis ---
print("Initializing dlib face detector and shape predictor...")
try:
    # Initialize dlib's face detector and facial landmark predictor
    face_detector = dlib.get_frontal_face_detector()

    # Path to the shape predictor model file.
    # You need to download this file from:
    # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
    predictor_path = "shape_predictor_68_face_landmarks.dat"

    # Check if the predictor file exists, otherwise inform the user
    if not os.path.exists(predictor_path):
        print(f"WARNING: {predictor_path} not found. Please download from:")
        print("http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2")
        print("Extract and place in the current directory.")
        # Use a placeholder or alternative
        shape_predictor = None
    else:
        shape_predictor = dlib.shape_predictor(predictor_path)

    print("dlib face detector initialized successfully.")
except Exception as e:
    print(f"ERROR initializing dlib face detector: {e}")
    face_detector = None
    shape_predictor = None
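
# If the landmark model is missing, it can be fetched manually before launching the app,
# for example (assumes a shell with wget and bunzip2; any download/extract tool works):
#   wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
#   bunzip2 shape_predictor_68_face_landmarks.dat.bz2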

# --- Metrics Definition ---
metrics = [
    "valence", "arousal", "dominance", "cognitive_load",
    "emotional_stability", "openness", "agreeableness",
    "neuroticism", "conscientiousness", "extraversion",
    "stress_index", "engagement_level"
]

# DeepFace emotion mapping
emotion_mapping = {
    "angry": {"valence": 0.2, "arousal": 0.8, "dominance": 0.7},
    "disgust": {"valence": 0.2, "arousal": 0.6, "dominance": 0.5},
    "fear": {"valence": 0.2, "arousal": 0.8, "dominance": 0.3},
    "happy": {"valence": 0.9, "arousal": 0.7, "dominance": 0.6},
    "sad": {"valence": 0.3, "arousal": 0.4, "dominance": 0.3},
    "surprise": {"valence": 0.6, "arousal": 0.9, "dominance": 0.5},
    "neutral": {"valence": 0.5, "arousal": 0.5, "dominance": 0.5}
}

ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
user_state_columns = ["user_state", "enhanced_user_state"]
all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
initial_metrics_df = pd.DataFrame(columns=all_columns)
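
# Each logged row therefore carries 20 columns: timestamp, frame_number, the 12 metrics
# above, the 4 ad-context fields and the 2 user-state fields.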

# --- Gemini API Functions ---

def call_gemini_api_for_ad(description, detail, ad_type):
    """
    Uses Google Gemini to analyze ad context.
    """
    print(f"Analyzing ad context: '{description}' ({ad_type})")

    if not GEMINI_ENABLED:
        # Simulated response
        analysis = f"Simulated analysis: Ad='{description or 'N/A'}' ({ad_type}), Focus='{detail or 'N/A'}'."
        if not description and not detail:
            analysis = "No ad context provided."
        print(f"Simulated Gemini Result: {analysis}")
        return analysis
    else:
        try:
            prompt = f"""
            Please analyze this advertisement context:
            - Description: {description}
            - Detail focus: {detail}
            - Type/Genre: {ad_type}

            Provide a concise analysis of what emotional and cognitive responses might be expected from viewers.
            Limit your response to 100 words.
            """

            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            print(f"Error calling Gemini for ad context: {e}")
            return f"Error analyzing ad context: {str(e)}"

def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
    """
    Uses Google Gemini to interpret facial metrics and determine user state.
    """
    if not metrics_dict:
        return "No metrics", "No facial data detected"

    if not GEMINI_ENABLED:
        # Basic rule-based simulation for user state
        valence = metrics_dict.get('valence', 0.5)
        arousal = metrics_dict.get('arousal', 0.5)
        cog_load = metrics_dict.get('cognitive_load', 0.5)
        stress = metrics_dict.get('stress_index', 0.5)
        engagement = metrics_dict.get('engagement_level', 0.5)

        # Simple rule-based simulation
        state = "Neutral"
        if valence > 0.65 and arousal > 0.55 and engagement > 0.6:
            state = "Positive, Engaged"
        elif valence < 0.4 and stress > 0.6:
            state = "Stressed, Negative"
        elif cog_load > 0.7 and engagement < 0.4:
            state = "Confused, Disengaged"
        elif arousal < 0.4 and engagement < 0.5:
            state = "Calm, Passive"

        enhanced_state = f"The viewer appears {state.lower()} while watching this content. They are likely not fully connecting with the message."

        return state, enhanced_state
    else:
        try:
            # Format metrics for Gemini
            metrics_formatted = "\n".join([f"- {k.replace('_', ' ').title()}: {v:.2f}" for k, v in metrics_dict.items()
                                           if k not in ('timestamp', 'frame_number')])

            # Include ad context if available
            ad_info = ""
            if ad_context:
                ad_desc = ad_context.get('ad_description', 'N/A')
                ad_type = ad_context.get('ad_type', 'N/A')
                ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"

            prompt = f"""
            Analyze these facial metrics (scale 0-1) of a person watching an advertisement{ad_info}:
            {metrics_formatted}

            Provide two outputs:
            1. User State: A short 1-3 word description of their emotional/cognitive state
            2. Enhanced Analysis: A detailed 1-2 sentence interpretation of their reaction to the content

            Format as JSON: {{"user_state": "STATE", "enhanced_user_state": "DETAILED ANALYSIS"}}
            """

            response = model.generate_content(prompt)

            try:
                # Try to parse as JSON
                result = json.loads(response.text)
                return result.get("user_state", "Uncertain"), result.get("enhanced_user_state", "Analysis unavailable")
            except json.JSONDecodeError:
                # If not valid JSON, try to extract manually
                text = response.text
                if "user_state" in text and "enhanced_user_state" in text:
                    parts = text.split("enhanced_user_state")
                    user_state = parts[0].split("user_state")[1].replace('"', '').replace(':', '').replace(',', '').strip()
                    enhanced = parts[1].replace('"', '').replace(':', '').replace('}', '').strip()
                    return user_state, enhanced
                else:
                    # Just return the raw text as enhanced state
                    return "Analyzed", text

        except Exception as e:
            print(f"Error calling Gemini for metric interpretation: {e}")
            return "Error", f"Error analyzing facial metrics: {str(e)}"

# --- Facial Analysis Functions with dlib and DeepFace ---

def extract_face_landmarks_dlib(image):
    """Extract facial landmarks using dlib"""
    if image is None or face_detector is None or shape_predictor is None:
        return None

    try:
        # Convert to grayscale for dlib
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Detect faces
        faces = face_detector(gray, 0)

        if len(faces) == 0:
            return None

        # Get the largest face by area
        largest_face = faces[0]
        largest_area = (faces[0].right() - faces[0].left()) * (faces[0].bottom() - faces[0].top())

        for face in faces:
            area = (face.right() - face.left()) * (face.bottom() - face.top())
            if area > largest_area:
                largest_face = face
                largest_area = area

        # Get facial landmarks
        landmarks = shape_predictor(gray, largest_face)

        # Return both the face detection rectangle and landmarks
        return {"rect": largest_face, "landmarks": landmarks}

    except Exception as e:
        print(f"Error in dlib landmark extraction: {e}")
        return None

def analyze_face_with_deepface(image):
    """Analyze facial emotions using DeepFace"""
    if image is None:
        return None

    try:
        # Convert to RGB for DeepFace if needed
        if len(image.shape) == 3 and image.shape[2] == 3:
            # Check if BGR and convert to RGB if needed
            if np.mean(image[:, :, 0]) < np.mean(image[:, :, 2]):  # Rough BGR check
                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            else:
                image_rgb = image
        else:
            # Handle single-channel (grayscale) input
            image_rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

        # Save image to temp file (DeepFace sometimes works better with files)
        temp_img = f"temp_frames/temp_analysis_{time.time()}.jpg"
        cv2.imwrite(temp_img, image_rgb)

        # Analyze with DeepFace
        analysis = DeepFace.analyze(
            img_path=temp_img,
            actions=['emotion'],
            enforce_detection=False,  # Don't throw error if face not detected
            detector_backend='opencv'  # Faster detection
        )

        # Remove temporary file
        try:
            os.remove(temp_img)
        except OSError:
            pass

        # Return the first face analysis (assuming single face)
        if isinstance(analysis, list) and len(analysis) > 0:
            return analysis[0]
        else:
            return analysis

    except Exception as e:
        print(f"DeepFace analysis error: {e}")
        return None

def calculate_ear_dlib(landmarks):
    """Calculate Eye Aspect Ratio using dlib landmarks"""
    if landmarks is None:
        return 0.0

    try:
        # dlib's 68-point face model landmark indices
        # Left eye: 36-41, Right eye: 42-47
        LEFT_EYE = range(36, 42)
        RIGHT_EYE = range(42, 48)

        def get_eye_aspect_ratio(eye_points):
            # Compute the euclidean distances between the two sets of vertical landmarks
            v1 = np.linalg.norm(eye_points[1] - eye_points[5])
            v2 = np.linalg.norm(eye_points[2] - eye_points[4])
            # Compute the euclidean distance between the horizontal landmarks
            h = np.linalg.norm(eye_points[0] - eye_points[3])
            # Compute the eye aspect ratio
            return (v1 + v2) / (2.0 * h) if h > 1e-6 else 0.0

        # Extract landmark coordinates
        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])

        # Calculate EAR for left and right eyes
        left_eye_coords = landmark_coords[list(LEFT_EYE)]
        right_eye_coords = landmark_coords[list(RIGHT_EYE)]

        left_ear = get_eye_aspect_ratio(left_eye_coords)
        right_ear = get_eye_aspect_ratio(right_eye_coords)

        # Return average of both eyes
        return (left_ear + right_ear) / 2.0

    except Exception as e:
        print(f"Error calculating EAR: {e}")
        return 0.0

def calculate_mar_dlib(landmarks):
    """Calculate Mouth Aspect Ratio using dlib landmarks"""
    if landmarks is None:
        return 0.0

    try:
        # dlib's 68-point face model landmark indices for mouth
        # Mouth outer: 48-59, Mouth inner: 60-67
        MOUTH_OUTER = range(48, 60)
        MOUTH_INNER = range(60, 68)

        # Extract landmark coordinates
        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])

        # Use specific points for vertical and horizontal measurements
        # Vertical: distance between top and bottom lips
        top_lip = landmark_coords[51]  # Top lip center
        bottom_lip = landmark_coords[57]  # Bottom lip center
        vertical = np.linalg.norm(top_lip - bottom_lip)

        # Horizontal: distance between mouth corners
        left_corner = landmark_coords[48]  # Left mouth corner
        right_corner = landmark_coords[54]  # Right mouth corner
        horizontal = np.linalg.norm(left_corner - right_corner)

        # Calculate ratio
        return vertical / horizontal if horizontal > 1e-6 else 0.0

    except Exception as e:
        print(f"Error calculating MAR: {e}")
        return 0.0

def calculate_eyebrow_position_dlib(landmarks):
    """Calculate eyebrow position using dlib landmarks"""
    if landmarks is None:
        return 0.0

    try:
        # dlib's 68-point face model landmark indices
        # Left eyebrow: 17-21, Right eyebrow: 22-26
        # Left eye: 36-41, Right eye: 42-47
        L_BROW_C = 19  # Center of left eyebrow
        R_BROW_C = 24  # Center of right eyebrow
        L_EYE_C = 37  # Center top of left eye
        R_EYE_C = 43  # Center top of right eye

        # Extract landmark coordinates
        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])

        # Calculate distances between eyebrows and eyes
        l_brow_y = landmark_coords[L_BROW_C][1]
        r_brow_y = landmark_coords[R_BROW_C][1]
        l_eye_y = landmark_coords[L_EYE_C][1]
        r_eye_y = landmark_coords[R_EYE_C][1]

        # Calculate vertical distances (a larger value means the eyebrows are raised higher)
        l_dist = l_eye_y - l_brow_y
        r_dist = r_eye_y - r_brow_y

        # Average the distances and normalize
        avg_dist = (l_dist + r_dist) / 2.0
        # Approximate normalization based on typical face proportions
        # Higher value means eyebrows are raised more
        norm = (avg_dist - 5) / 15  # Adjusted for typical pixel distances

        return max(0.0, min(1.0, norm))

    except Exception as e:
        print(f"Error calculating Eyebrow Position: {e}")
        return 0.0

def estimate_head_pose_dlib(landmarks):
    """Estimate head pose using dlib landmarks"""
    if landmarks is None:
        return 0.0, 0.0

    try:
        # dlib's 68-point face model landmark indices
        NOSE_TIP = 30  # Nose tip
        LEFT_EYE_C = 37  # Left eye center
        RIGHT_EYE_C = 44  # Right eye center

        # Extract landmark coordinates
        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])

        # Get key points
        nose_pt = landmark_coords[NOSE_TIP]
        l_eye_pt = landmark_coords[LEFT_EYE_C]
        r_eye_pt = landmark_coords[RIGHT_EYE_C]

        # Calculate eye midpoint
        eye_mid_x = (l_eye_pt[0] + r_eye_pt[0]) / 2.0
        eye_mid_y = (l_eye_pt[1] + r_eye_pt[1]) / 2.0

        # Calculate tilt
        v_tilt = nose_pt[1] - eye_mid_y  # Vertical tilt
        h_tilt = nose_pt[0] - eye_mid_x  # Horizontal tilt

        # Normalize based on typical facial proportions
        v_tilt_norm = v_tilt / 30.0  # Approximate normalization
        h_tilt_norm = h_tilt / 20.0  # Approximate normalization

        # Clip to range [-1, 1]
        v_tilt_norm = max(-1.0, min(1.0, v_tilt_norm))
        h_tilt_norm = max(-1.0, min(1.0, h_tilt_norm))

        return v_tilt_norm, h_tilt_norm

    except Exception as e:
        print(f"Error estimating Head Pose: {e}")
        return 0.0, 0.0

def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None):
    """
    Calculate facial metrics using a combination of dlib landmarks and DeepFace emotions.
    This provides a more robust approach by integrating both geometric and deep learning methods.
    """
    if ad_context is None:
        ad_context = {}

    # Initialize default metrics
    default_metrics = {m: 0.5 for m in metrics}

    # If no facial data, return defaults
    if not facial_data:
        return default_metrics

    # Extract landmarks from facial data
    landmarks = facial_data.get("landmarks")

    # If we have DeepFace data, use it to influence our metrics
    emotion_weights = None
    dominant_emotion = None

    if deepface_data and "emotion" in deepface_data:
        emotion_weights = deepface_data["emotion"]
        # Find dominant emotion
        dominant_emotion = max(emotion_weights.items(), key=lambda x: x[1])[0]

    # Calculate base geometric features if landmarks are available
    ear = calculate_ear_dlib(landmarks) if landmarks else 0.2
    mar = calculate_mar_dlib(landmarks) if landmarks else 0.5
    eb_pos = calculate_eyebrow_position_dlib(landmarks) if landmarks else 0.5
    v_tilt, h_tilt = estimate_head_pose_dlib(landmarks) if landmarks else (0.0, 0.0)

    # Combine geometric features with emotion weights

    # Step 1: Start with default metrics
    calculated_metrics = default_metrics.copy()

    # Step 2: Update based on geometric features
    cl = max(0, min(1, 1.0 - ear * 2.5))  # Cognitive load: higher when the eyes are more closed

    # Step 3: If we have emotion data from DeepFace, incorporate it
    if dominant_emotion and emotion_weights:
        # Get base values from emotion mapping
        base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})

        # Calculate confidence-weighted emotion values
        confidence = emotion_weights.get(dominant_emotion, 0) / 100.0  # Convert percentage to 0-1

        # Combine geometric and emotion-based metrics with weighted approach
        val = base_vals["valence"] * confidence + (mar * 0.7 * (1.0 - eb_pos) * 0.3) * (1 - confidence)
        arsl = base_vals["arousal"] * confidence + ((mar + (1.0 - ear) + eb_pos) / 3.0) * (1 - confidence)
        dom = base_vals["dominance"] * confidence + (0.5 + v_tilt) * (1 - confidence)
    else:
        # Fall back to geometric features only
        val = max(0, min(1, mar * 2.0 * (1.0 - eb_pos)))
        arsl = max(0, min(1, (mar + (1.0 - ear) + eb_pos) / 3.0))
        dom = max(0, min(1, 0.5 + v_tilt))

    # Illustrative context adjustments from the ad
    ad_type = ad_context.get('ad_type', 'Unknown')
    gem_txt = str(ad_context.get('gemini_ad_analysis', '')).lower()

    # Adjust based on ad context
    val_adj = 0.1 if ad_type == 'Funny' or 'humor' in gem_txt else 0.0
    arsl_adj = 0.1 if ad_type == 'Action' or 'exciting' in gem_txt else 0.0

    # Apply adjustments
    val = max(0, min(1, val + val_adj))
    arsl = max(0, min(1, arsl + arsl_adj))

    # Calculate secondary metrics
    neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
    em_stab = 1.0 - neur
    extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
    openness = max(0, min(1, 0.5 + ((mar - 0.5) * 0.5)))
    agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
    consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
    stress = max(0, min(1, (cl * 0.5) + (eb_pos * 0.3) + ((1.0 - val) * 0.2)))
    engag = max(0, min(1, (arsl * 0.7) + ((1.0 - abs(h_tilt)) * 0.3)))

    # Update the metrics dictionary
    calculated_metrics.update({
        'valence': val,
        'arousal': arsl,
        'dominance': dom,
        'cognitive_load': cl,
        'emotional_stability': em_stab,
        'openness': openness,
        'agreeableness': agree,
        'neuroticism': neur,
        'conscientiousness': consc,
        'extraversion': extr,
        'stress_index': stress,
        'engagement_level': engag
    })

    return calculated_metrics

def update_metrics_visualization(metrics_values):
    """Create a visualization of metrics"""
    if not metrics_values:
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.text(0.5, 0.5, "Waiting for facial metrics...", ha='center', va='center')
        ax.axis('off')
        fig.patch.set_facecolor('#FFFFFF')
        ax.set_facecolor('#FFFFFF')
        return fig

    # Filter out non-metric keys
    filtered_metrics = {k: v for k, v in metrics_values.items()
                        if k in metrics and isinstance(v, (int, float))}

    if not filtered_metrics:
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.text(0.5, 0.5, "No valid metrics available", ha='center', va='center')
        ax.axis('off')
        return fig

    num_metrics = len(filtered_metrics)
    nrows = (num_metrics + 2) // 3
    fig, axs = plt.subplots(nrows, 3, figsize=(10, nrows * 2.5), facecolor='#FFFFFF')
    axs = axs.flatten()

    colors = [(0.1, 0.1, 0.9), (0.9, 0.9, 0.1), (0.9, 0.1, 0.1)]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=100)
    norm = plt.Normalize(0, 1)
    metric_idx = 0

    for key, value in filtered_metrics.items():
        value = max(0.0, min(1.0, value))  # Clip value for safety

        ax = axs[metric_idx]
        ax.set_title(key.replace('_', ' ').title(), fontsize=10)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 0.5)
        ax.set_aspect('equal')
        ax.axis('off')
        ax.set_facecolor('#FFFFFF')

        r = 0.4
        theta = np.linspace(np.pi, 0, 100)
        x_bg = 0.5 + r * np.cos(theta)
        y_bg = 0.1 + r * np.sin(theta)
        ax.plot(x_bg, y_bg, 'k-', linewidth=3, alpha=0.2)

        value_angle = np.pi * (1 - value)
        num_points = max(2, int(100 * value))
        value_theta = np.linspace(np.pi, value_angle, num_points)
        x_val = 0.5 + r * np.cos(value_theta)
        y_val = 0.1 + r * np.sin(value_theta)

        if len(x_val) > 1:
            points = np.array([x_val, y_val]).T.reshape(-1, 1, 2)
            segments = np.concatenate([points[:-1], points[1:]], axis=1)
            segment_values = np.linspace(0, value, len(segments))
            lc = LineCollection(segments, cmap=cmap, norm=norm)
            lc.set_array(segment_values)
            lc.set_linewidth(5)
            ax.add_collection(lc)

        ax.text(0.5, 0.15, f"{value:.2f}", ha='center', va='center', fontsize=11,
                fontweight='bold', bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.2'))

        metric_idx += 1

    for i in range(metric_idx, len(axs)):
        axs[i].axis('off')

    plt.tight_layout(pad=0.5)
    return fig

def annotate_frame(frame, facial_data, metrics_dict=None, enhanced_state=None):
    """
    Add facial landmark annotations and metrics to a frame
    """
    if frame is None:
        return None

    annotated = frame.copy()

    # If we have facial data, draw the landmarks
    if facial_data and "landmarks" in facial_data:
        landmarks = facial_data["landmarks"]
        rect = facial_data.get("rect")

        # Draw face rectangle if available
        if rect:
            x1, y1, x2, y2 = rect.left(), rect.top(), rect.right(), rect.bottom()
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw the 68 facial landmarks
        for i in range(68):
            x, y = landmarks.part(i).x, landmarks.part(i).y
            cv2.circle(annotated, (x, y), 2, (0, 0, 255), -1)

        # Draw connecting lines for different facial features
        # Eyes
        for eye_points in [(36, 41), (42, 47)]:  # Left eye, Right eye
            for i in range(eye_points[0], eye_points[1]):
                pt1 = (landmarks.part(i).x, landmarks.part(i).y)
                pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
                cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
            # Connect last point to first
            pt1 = (landmarks.part(eye_points[1]).x, landmarks.part(eye_points[1]).y)
            pt2 = (landmarks.part(eye_points[0]).x, landmarks.part(eye_points[0]).y)
            cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)

        # Eyebrows
        for brow_points in [(17, 21), (22, 26)]:  # Left eyebrow, Right eyebrow
            for i in range(brow_points[0], brow_points[1]):
                pt1 = (landmarks.part(i).x, landmarks.part(i).y)
                pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
                cv2.line(annotated, pt1, pt2, (255, 255, 0), 1)

        # Nose
        for i in range(27, 35):
            pt1 = (landmarks.part(i).x, landmarks.part(i).y)
            pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
            cv2.line(annotated, pt1, pt2, (255, 0, 255), 1)

        # Mouth outer
        for i in range(48, 59):
            pt1 = (landmarks.part(i).x, landmarks.part(i).y)
            pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
            cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
        # Connect last point to first for mouth
        pt1 = (landmarks.part(59).x, landmarks.part(59).y)
        pt2 = (landmarks.part(48).x, landmarks.part(48).y)
        cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)

        # Mouth inner
        for i in range(60, 67):
            pt1 = (landmarks.part(i).x, landmarks.part(i).y)
            pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
            cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
        # Connect last point to first for inner mouth
        pt1 = (landmarks.part(67).x, landmarks.part(67).y)
        pt2 = (landmarks.part(60).x, landmarks.part(60).y)
        cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)

    # Add metrics summary if available
    if metrics_dict:
        # Format for display
        h, w = annotated.shape[:2]
        y_pos = 30  # Starting Y position

        # Add user state if available
        if enhanced_state:
            # Draw background for text
            text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
            cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
            # Draw text
            cv2.putText(annotated, enhanced_state, (10, y_pos),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
            y_pos += 30

        # Show top 3 metrics (filtered against the global `metrics` name list)
        top_metrics = sorted([(k, v) for k, v in metrics_dict.items() if k in metrics],
                             key=lambda x: x[1], reverse=True)[:3]

        for name, value in top_metrics:
            metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
            text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
            cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
            cv2.putText(annotated, metric_text, (10, y_pos),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
            y_pos += 25

    return annotated

# --- API 1: Video File Processing ---

def process_video_file(
    video_file: Union[str, np.ndarray],
    ad_description: str = "",
    ad_detail: str = "",
    ad_type: str = "Video",
    sampling_rate: int = 5,  # Process every Nth frame
    save_processed_video: bool = True,
    show_progress: bool = True
) -> Tuple[str, str, pd.DataFrame, List[np.ndarray]]:
    """
    Process a video file and analyze facial expressions frame by frame

    Args:
        video_file: Path to video file or video array
        ad_description: Description of the ad being watched
        ad_detail: Detail focus of the ad
        ad_type: Type of ad (Video, Image, Audio, Text, Funny, etc.)
        sampling_rate: Process every Nth frame
        save_processed_video: Whether to save the processed video with annotations
        show_progress: Whether to show processing progress

    Returns:
        Tuple of (csv_path, processed_video_path, metrics_dataframe, processed_frames_list)
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = CSV_FILENAME_TEMPLATE.format(timestamp=timestamp)
    video_path = VIDEO_FILENAME_TEMPLATE.format(timestamp=timestamp) if save_processed_video else None

    # Setup ad context
    gemini_result = call_gemini_api_for_ad(ad_description, ad_detail, ad_type)
    ad_context = {
        "ad_description": ad_description,
        "ad_detail": ad_detail,
        "ad_type": ad_type,
        "gemini_ad_analysis": gemini_result
    }

    # Initialize capture
    if isinstance(video_file, str):
        cap = cv2.VideoCapture(video_file)
    else:
        # Create a temporary file for the video array
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, "temp_video.mp4")

        # Convert video array to file
        if isinstance(video_file, np.ndarray) and len(video_file.shape) == 4:  # Multiple frames
            h, w = video_file[0].shape[:2]
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            temp_writer = cv2.VideoWriter(temp_path, fourcc, 30, (w, h))
            for frame in video_file:
                temp_writer.write(frame)
            temp_writer.release()
            cap = cv2.VideoCapture(temp_path)
        elif isinstance(video_file, np.ndarray) and len(video_file.shape) == 3:  # Single frame
            # For a single frame, just process it directly
            metrics_data = []
            processed_frames = []

            # Process the single frame
            facial_data = extract_face_landmarks_dlib(video_file)
            deepface_data = analyze_face_with_deepface(video_file)

            if facial_data:
                calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
                user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)

                # Create a row for the dataframe
                row = {
                    'timestamp': 0.0,
                    'frame_number': 0,
                    **calculated_metrics,
                    **ad_context,
                    'user_state': user_state,
                    'enhanced_user_state': enhanced_state
                }
                metrics_data.append(row)

                # Annotate the frame
                annotated_frame = annotate_frame(video_file, facial_data, calculated_metrics, enhanced_state)
                processed_frames.append(annotated_frame)

                # Save processed image
                if save_processed_video:
                    cv2.imwrite(video_path.replace('.mp4', '.jpg'), annotated_frame)

            # Create DataFrame and save to CSV
            metrics_df = pd.DataFrame(metrics_data)
            if not metrics_df.empty:
                metrics_df.to_csv(csv_path, index=False)

            return csv_path, video_path.replace('.mp4', '.jpg') if save_processed_video else None, metrics_df, processed_frames
        else:
            print("Error: Invalid video input format")
            return None, None, None, []

    if not cap.isOpened():
        print("Error: Could not open video.")
        return None, None, None, []

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Initialize video writer if saving processed video
    if save_processed_video:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(video_path, fourcc, fps / sampling_rate, (frame_width, frame_height))

    # Process video frames
    metrics_data = []
    processed_frames = []
    frame_count = 0

    if show_progress:
        print(f"Processing video with {total_frames} frames at {fps} FPS")
        print(f"Ad Context: {ad_description} ({ad_type})")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Only process every Nth frame (according to sampling_rate)
        if frame_count % sampling_rate == 0:
            if show_progress and frame_count % (sampling_rate * 10) == 0:
                print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")

            # Extract facial landmarks and analyze with DeepFace
            facial_data = extract_face_landmarks_dlib(frame)
            deepface_data = analyze_face_with_deepface(frame)

            # Calculate metrics if landmarks detected
            if facial_data:
                calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
                user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)

                # Create a row for the dataframe
                row = {
                    'timestamp': frame_count / fps,
                    'frame_number': frame_count,
                    **calculated_metrics,
                    **ad_context,
                    'user_state': user_state,
                    'enhanced_user_state': enhanced_state
                }
                metrics_data.append(row)

                # Annotate the frame
                annotated_frame = annotate_frame(frame, facial_data, calculated_metrics, enhanced_state)

                if save_processed_video:
                    out.write(annotated_frame)
                processed_frames.append(annotated_frame)
            else:
                # No face detected
                if save_processed_video:
                    # Add text to frame
                    no_face_frame = frame.copy()
                    cv2.putText(no_face_frame, "No face detected", (30, 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    out.write(no_face_frame)
                    processed_frames.append(no_face_frame)

        frame_count += 1

    # Release resources
    cap.release()
    if save_processed_video:
        out.release()

    # Create DataFrame and save to CSV
    metrics_df = pd.DataFrame(metrics_data)
    if not metrics_df.empty:
        metrics_df.to_csv(csv_path, index=False)

    if show_progress:
        print(f"Video processing complete. Analyzed {len(metrics_data)} frames.")
        print(f"Results saved to {csv_path}")
        if save_processed_video:
            print(f"Processed video saved to {video_path}")

    # Return results
    return csv_path, video_path, metrics_df, processed_frames
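
# Illustrative call of the video-file API (the file name is a placeholder, not part of this repo):
#   csv_path, video_path, df, frames = process_video_file(
#       "ad_recording.mp4", ad_description="Soda ad", ad_type="Funny", sampling_rate=10)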

# --- API 2: Webcam Processing Function ---

def process_webcam_frame(
    frame: np.ndarray,
    ad_context: Dict[str, Any],
    metrics_data: pd.DataFrame,
    frame_count: int,
    start_time: float
) -> Tuple[np.ndarray, Dict[str, float], str, pd.DataFrame]:
    """
    Process a single webcam frame

    Args:
        frame: Input frame from webcam
        ad_context: Ad context dictionary
        metrics_data: DataFrame to accumulate metrics
        frame_count: Current frame count
        start_time: Start time of the session

    Returns:
        Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_metrics_df)
    """
    if frame is None:
        return None, None, None, metrics_data

    # Extract facial landmarks and analyze with DeepFace
    facial_data = extract_face_landmarks_dlib(frame)
    deepface_data = analyze_face_with_deepface(frame)

    # Calculate metrics if landmarks detected
    if facial_data:
        calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
        user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)

        # Create a row for the dataframe
        current_time = time.time()
        row = {
            'timestamp': current_time - start_time,
            'frame_number': frame_count,
            **calculated_metrics,
            **ad_context,
            'user_state': user_state,
            'enhanced_user_state': enhanced_state
        }

        # Add row to DataFrame
        new_row_df = pd.DataFrame([row], columns=all_columns)
        metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)

        # Annotate the frame
        annotated_frame = annotate_frame(frame, facial_data, calculated_metrics, enhanced_state)

        return annotated_frame, calculated_metrics, enhanced_state, metrics_data
    else:
        # No face detected
        no_face_frame = frame.copy()
        cv2.putText(no_face_frame, "No face detected", (30, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        return no_face_frame, None, "No face detected", metrics_data

def start_webcam_session(
    ad_description: str = "",
    ad_detail: str = "",
    ad_type: str = "Video",
    save_interval: int = 100,  # Save CSV every N frames
    record_video: bool = True
) -> Dict[str, Any]:
    """
    Initialize a webcam session for facial analysis

    Args:
        ad_description: Description of the ad being watched
        ad_detail: Detail focus of the ad
        ad_type: Type of ad
        save_interval: How often to save data to CSV
        record_video: Whether to record processed frames for later saving

    Returns:
        Session context dictionary
    """
    # Generate timestamp for file naming
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = CSV_FILENAME_TEMPLATE.format(timestamp=timestamp)
    video_path = VIDEO_FILENAME_TEMPLATE.format(timestamp=timestamp) if record_video else None

    # Setup ad context
    gemini_result = call_gemini_api_for_ad(ad_description, ad_detail, ad_type)
    ad_context = {
        "ad_description": ad_description,
        "ad_detail": ad_detail,
        "ad_type": ad_type,
        "gemini_ad_analysis": gemini_result
    }

    # Initialize session context
    session = {
        "start_time": time.time(),
        "frame_count": 0,
        "metrics_data": initial_metrics_df.copy(),
        "ad_context": ad_context,
        "csv_path": csv_path,
        "video_path": video_path,
        "save_interval": save_interval,
        "last_saved": 0,
        "record_video": record_video,
        "recorded_frames": [] if record_video else None,
        "timestamps": [] if record_video else None
    }

    return session

def update_webcam_session(
    session: Dict[str, Any],
    frame: np.ndarray
) -> Tuple[np.ndarray, Dict[str, float], str, Dict[str, Any]]:
    """
    Update webcam session with a new frame

    Args:
        session: Session context dictionary
        frame: New frame from webcam

    Returns:
        Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_session)
    """
    # Process the frame
    annotated_frame, metrics, enhanced_state, updated_df = process_webcam_frame(
        frame,
        session["ad_context"],
        session["metrics_data"],
        session["frame_count"],
        session["start_time"]
    )

    # Update session
    session["frame_count"] += 1
    session["metrics_data"] = updated_df

    # Record frame if enabled
    if session["record_video"] and annotated_frame is not None:
        session["recorded_frames"].append(annotated_frame)
        session["timestamps"].append(time.time() - session["start_time"])

    # Save CSV periodically
    if session["frame_count"] - session["last_saved"] >= session["save_interval"]:
        if not updated_df.empty:
            updated_df.to_csv(session["csv_path"], index=False)
        session["last_saved"] = session["frame_count"]

    return annotated_frame, metrics, enhanced_state, session

def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
    """
    End a webcam session and save final results

    Args:
        session: Session context dictionary

    Returns:
        Tuple of (csv_path, video_path)
    """
    # Save final metrics to CSV
    if not session["metrics_data"].empty:
        session["metrics_data"].to_csv(session["csv_path"], index=False)

    # Save recorded video if available
    video_path = None
    if session["record_video"] and session["recorded_frames"]:
        try:
            frames = session["recorded_frames"]
            if frames:
                # Get frame dimensions
                height, width = frames[0].shape[:2]

                # Calculate FPS based on actual timestamps
                if len(session["timestamps"]) > 1:
                    # Calculate average time between frames
                    time_diffs = np.diff(session["timestamps"])
                    avg_frame_time = np.mean(time_diffs)
                    fps = 1.0 / avg_frame_time if avg_frame_time > 0 else 15.0
                else:
                    fps = 15.0  # Default FPS

                # Create video writer
                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                video_path = session["video_path"]
                out = cv2.VideoWriter(video_path, fourcc, fps, (width, height))

                # Write frames
                for frame in frames:
                    out.write(frame)

                out.release()
                print(f"Recorded video saved to {video_path}")
            else:
                print("No frames recorded")
        except Exception as e:
            print(f"Error saving video: {e}")

    print(f"Session ended. Data saved to {session['csv_path']}")
    return session["csv_path"], video_path
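
# Typical lifecycle of the webcam API when driven outside Gradio (sketch; get_frame() is a
# stand-in for whatever supplies BGR frames, e.g. cv2.VideoCapture(0).read()):
#   session = start_webcam_session(ad_description="Soda ad", ad_type="Funny")
#   annotated, metrics_now, state, session = update_webcam_session(session, get_frame())
#   csv_path, video_path = end_webcam_session(session)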

# --- Create Gradio Interface ---

def create_api_interface():
    with gr.Blocks(title="Enhanced Facial Analysis APIs") as iface:
        gr.Markdown("""
# Enhanced Facial Analysis APIs

This interface provides two API endpoints:

1. **Video File API**: Upload and analyze pre-recorded videos
2. **Webcam API**: Analyze live webcam feed in real-time

Both APIs use dlib for facial landmark detection, DeepFace for emotion analysis,
and Google's Gemini API for enhanced interpretations.
""")

        with gr.Tab("Video File API"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_input = gr.Video(label="Upload Video")
                    vid_ad_desc = gr.Textbox(label="Ad Description", placeholder="Enter a description of the advertisement being watched...")
                    vid_ad_detail = gr.Textbox(label="Ad Detail Focus", placeholder="Enter specific aspects to focus on...")
                    vid_ad_type = gr.Radio(
                        ["Video", "Image", "Audio", "Text", "Funny", "Serious", "Action", "Informative"],
                        label="Ad Type/Genre",
                        value="Video"
                    )
                    sampling_rate = gr.Slider(
                        minimum=1, maximum=30, step=1, value=5,
                        label="Sampling Rate (process every N frames)"
                    )
                    save_video = gr.Checkbox(label="Save Processed Video", value=True)
                    process_btn = gr.Button("Process Video", variant="primary")

                with gr.Column(scale=2):
                    output_text = gr.Textbox(label="Processing Results", lines=3)
                    with gr.Row():
                        with gr.Column():
                            output_video = gr.Video(label="Processed Video")
                        with gr.Column():
                            frame_gallery = gr.Gallery(label="Processed Frames",
                                                       show_label=True, columns=2,
                                                       height=400)

                    with gr.Row():
                        with gr.Column():
                            output_plot = gr.Plot(label="Sample Frame Metrics")
                        with gr.Column():
                            output_csv = gr.File(label="Download CSV Results")

            # Define function to handle video processing and show frames
            def handle_video_processing(video, desc, detail, ad_type, rate, save_vid):
                if video is None:
                    return "No video uploaded", None, None, [], None

                try:
                    result_text = "Starting video processing...\n"
                    # Process the video
                    csv_path, video_path, metrics_df, processed_frames = process_video_file(
                        video,
                        ad_description=desc,
                        ad_detail=detail,
                        ad_type=ad_type,
                        sampling_rate=rate,
                        save_processed_video=save_vid,
                        show_progress=True
                    )

                    if metrics_df is None or metrics_df.empty:
                        return "No facial data detected in video", None, None, [], None

                    # Generate a sample metrics visualization
                    sample_row = metrics_df.iloc[0].to_dict()
                    metrics_plot = update_metrics_visualization(sample_row)

                    # Create a gallery of processed frames
                    # Take a subset if there are too many frames (maximum ~20 for display)
                    display_frames = []
                    step = max(1, len(processed_frames) // 20)
                    for i in range(0, len(processed_frames), step):
                        if i < len(processed_frames):
                            # Convert BGR to RGB for display
                            rgb_frame = cv2.cvtColor(processed_frames[i], cv2.COLOR_BGR2RGB)
                            display_frames.append(rgb_frame)

                    # Return results summary
                    processed_count = metrics_df.shape[0]
                    total_count = len(processed_frames)
                    result_text = f"✅ Processed {processed_count} frames out of {total_count} total frames.\n"
                    result_text += f"📊 CSV saved with {len(metrics_df.columns)} metrics columns.\n"
                    if video_path:
                        result_text += f"🎬 Processed video saved to: {video_path}"

                    return result_text, video_path, metrics_plot, display_frames, csv_path
                except Exception as e:
                    return f"❌ Error processing video: {str(e)}", None, None, [], None

            process_btn.click(
                handle_video_processing,
                inputs=[video_input, vid_ad_desc, vid_ad_detail, vid_ad_type, sampling_rate, save_video],
                outputs=[output_text, output_video, output_plot, frame_gallery, output_csv]
            )

        with gr.Tab("Webcam API"):
            with gr.Row():
                with gr.Column(scale=2):
                    webcam_input = gr.Image(sources="webcam", streaming=True, label="Webcam Input", type="numpy")

                    with gr.Row():
                        with gr.Column():
                            web_ad_desc = gr.Textbox(label="Ad Description", placeholder="Enter a description of the advertisement being watched...")
                            web_ad_detail = gr.Textbox(label="Ad Detail Focus", placeholder="Enter specific aspects to focus on...")
                            web_ad_type = gr.Radio(
                                ["Video", "Image", "Audio", "Text", "Funny", "Serious", "Action", "Informative"],
                                label="Ad Type/Genre",
                                value="Video"
                            )
                        with gr.Column():
                            record_video_chk = gr.Checkbox(label="Record Video", value=True)
                            start_session_btn = gr.Button("Start Session", variant="primary")
                            end_session_btn = gr.Button("End Session", variant="stop")
                            session_status = gr.Textbox(label="Session Status", placeholder="Session not started...")

                with gr.Column(scale=2):
                    processed_output = gr.Image(label="Processed Feed", type="numpy", height=360)

                    with gr.Row():
                        with gr.Column():
                            metrics_plot = gr.Plot(label="Current Metrics", height=300)
                        with gr.Column():
                            enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)

                    with gr.Row():
                        download_csv = gr.File(label="Download Session Data")
                        download_video = gr.Video(label="Recorded Session")

            # Session state
            session_data = gr.State(value=None)

            # Define session handlers
            def start_session(desc, detail, ad_type, record_video):
                session = start_webcam_session(
                    ad_description=desc,
                    ad_detail=detail,
                    ad_type=ad_type,
                    record_video=record_video
                )
                return (
                    session,
                    f"Session started at {datetime.datetime.now().strftime('%H:%M:%S')}.\n"
                    f"Ad context: {desc} ({ad_type}).\n"
                    f"Data will be saved to {session['csv_path']}"
                )

            def process_frame(frame, session):
                if session is None:
                    return frame, None, "No active session. Click 'Start Session' to begin.", session

                # Process the frame
                annotated_frame, metrics_values, enhanced_state, updated_session = update_webcam_session(session, frame)

                # Update the metrics plot if metrics are available
                if metrics_values:
                    current_plot = update_metrics_visualization(metrics_values)
                    return annotated_frame, current_plot, enhanced_state, updated_session
                else:
                    # Return the annotated frame (likely with "No face detected")
                    return annotated_frame, None, enhanced_state or "No metrics available", updated_session

            def end_session(session):
                if session is None:
                    return "No active session", None, None

                csv_path, video_path = end_webcam_session(session)
                end_time = datetime.datetime.now().strftime('%H:%M:%S')
                result = f"Session ended at {end_time}.\n"

                if csv_path:
                    result += f"CSV data saved to: {csv_path}\n"
                if video_path:
                    result += f"Video saved to: {video_path}"

                return result, csv_path, video_path

            start_session_btn.click(
                start_session,
                inputs=[web_ad_desc, web_ad_detail, web_ad_type, record_video_chk],
                outputs=[session_data, session_status]
            )

            webcam_input.stream(
                process_frame,
                inputs=[webcam_input, session_data],
                outputs=[processed_output, metrics_plot, enhanced_state_txt, session_data]
            )

            end_session_btn.click(
                end_session,
                inputs=[session_data],
                outputs=[session_status, download_csv, download_video]
            )

    return iface

# Entry point
if __name__ == "__main__":
    print("Starting Enhanced Facial Analysis API server...")
    print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
    print("Facial analysis using dlib and DeepFace")
    iface = create_api_interface()
    iface.launch(debug=True)
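
# To run locally (assumes the dependencies are installed and GOOGLE_API_KEY is exported;
# the package names below are the usual PyPI names for the imports above):
#   pip install gradio opencv-python dlib deepface google-generativeai matplotlib pandas
#   python app.py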