ans123 committed
Commit 2b82b08 · verified · 1 Parent(s): fcf70d2

Update app.py

Files changed (1):
  1. app.py +357 -404

app.py CHANGED
@@ -3,7 +3,6 @@ import cv2
3
  import numpy as np
4
  import pandas as pd
5
  import time
6
- import dlib
7
  import matplotlib.pyplot as plt
8
  from matplotlib.colors import LinearSegmentedColormap
9
  from matplotlib.collections import LineCollection
@@ -19,6 +18,9 @@ from deepface import DeepFace
19
  import base64
20
  import io
21
  from pathlib import Path
22
 
23
  # Suppress warnings for cleaner output
24
  warnings.filterwarnings('ignore')
@@ -47,32 +49,59 @@ except Exception as e:
47
  print("Running with simulated Gemini API responses.")
48
  GEMINI_ENABLED = False
49
 
50
- # --- Initialize dlib and DeepFace for facial analysis ---
51
- print("Initializing dlib face detector and shape predictor...")
 
52
  try:
53
- # Initialize dlib's face detector and facial landmark predictor
54
- face_detector = dlib.get_frontal_face_detector()
55
-
56
- # Paths to shape predictor model file
57
- # You need to download this file from:
58
- # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
59
- predictor_path = "shape_predictor_68_face_landmarks.dat"
60
-
61
- # Check if the predictor file exists, otherwise inform the user
62
- if not os.path.exists(predictor_path):
63
- print(f"WARNING: {predictor_path} not found. Please download from:")
64
- print("http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2")
65
- print("Extract and place in the current directory.")
66
- # Use a placeholder or alternative
67
- shape_predictor = None
68
  else:
69
- shape_predictor = dlib.shape_predictor(predictor_path)
70
 
71
- print("dlib face detector initialized successfully.")
 
 
72
  except Exception as e:
73
- print(f"ERROR initializing dlib face detector: {e}")
74
- face_detector = None
75
- shape_predictor = None
76
 
77
  # --- Metrics Definition ---
78
  metrics = [
@@ -94,12 +123,64 @@ emotion_mapping = {
94
  }
95
 
96
  ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
97
- user_state_columns = ["user_state", "enhanced_user_state"]
98
  all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
99
  initial_metrics_df = pd.DataFrame(columns=all_columns)
100
 
101
- # --- Gemini API Functions ---
102
 
 
103
  def call_gemini_api_for_ad(description, detail, ad_type):
104
  """
105
  Uses Google Gemini to analyze ad context.
@@ -131,40 +212,57 @@ def call_gemini_api_for_ad(description, detail, ad_type):
131
  print(f"Error calling Gemini for ad context: {e}")
132
  return f"Error analyzing ad context: {str(e)}"
133
 
134
- def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
135
  """
136
- Uses Google Gemini to interpret facial metrics and determine user state.
 
137
  """
138
- if not metrics_dict:
139
  return "No metrics", "No facial data detected"
140
 
141
  if not GEMINI_ENABLED:
142
  # Basic rule-based simulation for user state
143
- valence = metrics_dict.get('valence', 0.5)
144
- arousal = metrics_dict.get('arousal', 0.5)
145
- cog_load = metrics_dict.get('cognitive_load', 0.5)
146
- stress = metrics_dict.get('stress_index', 0.5)
147
- engagement = metrics_dict.get('engagement_level', 0.5)
148
 
149
  # Simple rule-based simulation
150
- state = "Neutral"
151
- if valence > 0.65 and arousal > 0.55 and engagement > 0.6:
152
  state = "Positive, Engaged"
153
- elif valence < 0.4 and stress > 0.6:
154
  state = "Stressed, Negative"
155
- elif cog_load > 0.7 and engagement < 0.4:
156
- state = "Confused, Disengaged"
157
- elif arousal < 0.4 and engagement < 0.5:
158
- state = "Calm, Passive"
159
 
160
- enhanced_state = f"The viewer appears {state.lower()} while watching this content. They are likely not fully connecting with the message."
161
 
162
  return state, enhanced_state
163
  else:
164
  try:
165
  # Format metrics for Gemini
166
- metrics_formatted = "\n".join([f"- {k.replace('_', ' ').title()}: {v:.2f}" for k, v in metrics_dict.items()
167
- if k not in ('timestamp', 'frame_number')])
168
 
169
  # Include ad context if available
170
  ad_info = ""
@@ -174,8 +272,9 @@ def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
174
  ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"
175
 
176
  prompt = f"""
177
- Analyze these facial metrics (scale 0-1) of a person watching an advertisement{ad_info}:
178
- {metrics_formatted}
 
179
 
180
  Provide two outputs:
181
  1. User State: A short 1-3 word description of their emotional/cognitive state
@@ -206,45 +305,9 @@ def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
206
  print(f"Error calling Gemini for metric interpretation: {e}")
207
  return "Error", f"Error analyzing facial metrics: {str(e)}"
208
 
209
- # --- Facial Analysis Functions with dlib and DeepFace ---
210
-
211
- def extract_face_landmarks_dlib(image):
212
- """Extract facial landmarks using dlib"""
213
- if image is None or face_detector is None or shape_predictor is None:
214
- return None
215
-
216
- try:
217
- # Convert to grayscale for dlib
218
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
219
-
220
- # Detect faces
221
- faces = face_detector(gray, 0)
222
-
223
- if len(faces) == 0:
224
- return None
225
-
226
- # Get the largest face by area
227
- largest_face = faces[0]
228
- largest_area = (faces[0].right() - faces[0].left()) * (faces[0].bottom() - faces[0].top())
229
-
230
- for face in faces:
231
- area = (face.right() - face.left()) * (face.bottom() - face.top())
232
- if area > largest_area:
233
- largest_face = face
234
- largest_area = area
235
-
236
- # Get facial landmarks
237
- landmarks = shape_predictor(gray, largest_face)
238
-
239
- # Return both the face detection rectangle and landmarks
240
- return {"rect": largest_face, "landmarks": landmarks}
241
-
242
- except Exception as e:
243
- print(f"Error in dlib landmark extraction: {e}")
244
- return None
245
-
246
  def analyze_face_with_deepface(image):
247
- """Analyze facial emotions using DeepFace"""
248
  if image is None:
249
  return None
250
 
@@ -267,7 +330,7 @@ def analyze_face_with_deepface(image):
267
  # Analyze with DeepFace
268
  analysis = DeepFace.analyze(
269
  img_path=temp_img,
270
- actions=['emotion'],
271
  enforce_detection=False, # Don't throw error if face not detected
272
  detector_backend='opencv' # Faster detection
273
  )
@@ -288,159 +351,40 @@ def analyze_face_with_deepface(image):
288
  print(f"DeepFace analysis error: {e}")
289
  return None
290
 
291
- def calculate_ear_dlib(landmarks):
292
- """Calculate Eye Aspect Ratio using dlib landmarks"""
293
- if landmarks is None:
294
- return 0.0
295
-
296
- try:
297
- # dlib's 68-point face model landmark indices
298
- # Left eye: 36-41, Right eye: 42-47
299
- LEFT_EYE = range(36, 42)
300
- RIGHT_EYE = range(42, 48)
301
-
302
- def get_eye_aspect_ratio(eye_points):
303
- # Compute the euclidean distances between the two sets of vertical landmarks
304
- v1 = np.linalg.norm(eye_points[1] - eye_points[5])
305
- v2 = np.linalg.norm(eye_points[2] - eye_points[4])
306
- # Compute the euclidean distance between the horizontal landmarks
307
- h = np.linalg.norm(eye_points[0] - eye_points[3])
308
- # Compute the eye aspect ratio
309
- return (v1 + v2) / (2.0 * h) if h > 1e-6 else 0.0
310
-
311
- # Extract landmark coordinates
312
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
313
-
314
- # Calculate EAR for left and right eyes
315
- left_eye_coords = landmark_coords[list(LEFT_EYE)]
316
- right_eye_coords = landmark_coords[list(RIGHT_EYE)]
317
-
318
- left_ear = get_eye_aspect_ratio(left_eye_coords)
319
- right_ear = get_eye_aspect_ratio(right_eye_coords)
320
-
321
- # Return average of both eyes
322
- return (left_ear + right_ear) / 2.0
323
-
324
- except Exception as e:
325
- print(f"Error calculating EAR: {e}")
326
- return 0.0
327
-
328
- def calculate_mar_dlib(landmarks):
329
- """Calculate Mouth Aspect Ratio using dlib landmarks"""
330
- if landmarks is None:
331
- return 0.0
332
-
333
- try:
334
- # dlib's 68-point face model landmark indices for mouth
335
- # Mouth outer: 48-59, Mouth inner: 60-67
336
- MOUTH_OUTER = range(48, 60)
337
- MOUTH_INNER = range(60, 68)
338
-
339
- # Extract landmark coordinates
340
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
341
-
342
- # Use specific points for vertical and horizontal measurements
343
- # Vertical: distance between top and bottom lips
344
- top_lip = landmark_coords[51] # Top lip center
345
- bottom_lip = landmark_coords[57] # Bottom lip center
346
- vertical = np.linalg.norm(top_lip - bottom_lip)
347
-
348
- # Horizontal: distance between mouth corners
349
- left_corner = landmark_coords[48] # Left mouth corner
350
- right_corner = landmark_coords[54] # Right mouth corner
351
- horizontal = np.linalg.norm(left_corner - right_corner)
352
-
353
- # Calculate ratio
354
- return vertical / horizontal if horizontal > 1e-6 else 0.0
355
-
356
- except Exception as e:
357
- print(f"Error calculating MAR: {e}")
358
- return 0.0
359
-
360
- def calculate_eyebrow_position_dlib(landmarks):
361
- """Calculate eyebrow position using dlib landmarks"""
362
- if landmarks is None:
363
- return 0.0
364
-
365
- try:
366
- # dlib's 68-point face model landmark indices
367
- # Left eyebrow: 17-21, Right eyebrow: 22-26
368
- # Left eye: 36-41, Right eye: 42-47
369
- L_BROW_C = 19 # Center of left eyebrow
370
- R_BROW_C = 24 # Center of right eyebrow
371
- L_EYE_C = 37 # Center top of left eye
372
- R_EYE_C = 43 # Center top of right eye
373
-
374
- # Extract landmark coordinates
375
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
376
-
377
- # Calculate distances between eyebrows and eyes
378
- l_brow_y = landmark_coords[L_BROW_C][1]
379
- r_brow_y = landmark_coords[R_BROW_C][1]
380
- l_eye_y = landmark_coords[L_EYE_C][1]
381
- r_eye_y = landmark_coords[R_EYE_C][1]
382
-
383
- # Calculate vertical distances (smaller value means eyebrows are raised)
384
- l_dist = l_eye_y - l_brow_y
385
- r_dist = r_eye_y - r_brow_y
386
-
387
- # Average the distances and normalize
388
- avg_dist = (l_dist + r_dist) / 2.0
389
- # Approximate normalization based on typical face proportions
390
- # Higher value means eyebrows are raised more
391
- norm = (avg_dist - 5) / 15 # Adjusted for typical pixel distances
392
-
393
- return max(0.0, min(1.0, norm))
394
-
395
- except Exception as e:
396
- print(f"Error calculating Eyebrow Position: {e}")
397
- return 0.0
398
-
399
- def estimate_head_pose_dlib(landmarks):
400
- """Estimate head pose using dlib landmarks"""
401
- if landmarks is None:
402
- return 0.0, 0.0
403
 
404
  try:
405
- # dlib's 68-point face model landmark indices
406
- NOSE_TIP = 30 # Nose tip
407
- LEFT_EYE_C = 37 # Left eye center
408
- RIGHT_EYE_C = 44 # Right eye center
409
-
410
- # Extract landmark coordinates
411
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
412
-
413
- # Get key points
414
- nose_pt = landmark_coords[NOSE_TIP]
415
- l_eye_pt = landmark_coords[LEFT_EYE_C]
416
- r_eye_pt = landmark_coords[RIGHT_EYE_C]
417
-
418
- # Calculate eye midpoint
419
- eye_mid_x = (l_eye_pt[0] + r_eye_pt[0]) / 2.0
420
- eye_mid_y = (l_eye_pt[1] + r_eye_pt[1]) / 2.0
421
 
422
- # Calculate tilt
423
- v_tilt = nose_pt[1] - eye_mid_y # Vertical tilt
424
- h_tilt = nose_pt[0] - eye_mid_x # Horizontal tilt
 
 
 
 
425
 
426
- # Normalize based on typical facial proportions
427
- v_tilt_norm = v_tilt / 30.0 # Approximate normalization
428
- h_tilt_norm = h_tilt / 20.0 # Approximate normalization
429
 
430
- # Clip to range [-1, 1]
431
- v_tilt_norm = max(-1.0, min(1.0, v_tilt_norm))
432
- h_tilt_norm = max(-1.0, min(1.0, h_tilt_norm))
433
 
434
- return v_tilt_norm, h_tilt_norm
435
 
436
  except Exception as e:
437
- print(f"Error estimating Head Pose: {e}")
438
- return 0.0, 0.0
439
 
440
- def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None):
 
441
  """
442
- Calculate facial metrics using a combination of dlib landmarks and DeepFace emotions.
443
- This provides a more robust approach by integrating both geometric and deep learning methods.
444
  """
445
  if ad_context is None:
446
  ad_context = {}
@@ -449,52 +393,44 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
449
  default_metrics = {m: 0.5 for m in metrics}
450
 
451
  # If no facial data, return defaults
452
- if not facial_data:
453
  return default_metrics
454
 
455
- # Extract landmarks from facial data
456
- landmarks = facial_data.get("landmarks")
457
-
458
- # If we have DeepFace data, use it to influence our metrics
459
- emotion_weights = None
460
- dominant_emotion = None
461
-
462
- if deepface_data and "emotion" in deepface_data:
463
- emotion_weights = deepface_data["emotion"]
464
- # Find dominant emotion
465
- dominant_emotion = max(emotion_weights.items(), key=lambda x: x[1])[0]
466
-
467
- # Calculate base geometric features if landmarks are available
468
- ear = calculate_ear_dlib(landmarks) if landmarks else 0.2
469
- mar = calculate_mar_dlib(landmarks) if landmarks else 0.5
470
- eb_pos = calculate_eyebrow_position_dlib(landmarks) if landmarks else 0.5
471
- v_tilt, h_tilt = estimate_head_pose_dlib(landmarks) if landmarks else (0.0, 0.0)
472
-
473
- # Combine geometric features with emotion weights
474
-
475
- # Step 1: Start with default metrics
476
- calculated_metrics = default_metrics.copy()
477
-
478
- # Step 2: Update based on geometric features
479
- cl = max(0, min(1, 1.0 - ear * 2.5)) # Cognitive load: Higher when eyes are more closed
480
-
481
- # Step 3: If we have emotion data from DeepFace, incorporate it
482
- if dominant_emotion and emotion_weights:
483
- # Get base values from emotion mapping
484
- base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})
485
-
486
- # Calculate confidence-weighted emotion values
487
- confidence = emotion_weights.get(dominant_emotion, 0) / 100.0 # Convert percentage to 0-1
488
-
489
- # Combine geometric and emotion-based metrics with weighted approach
490
- val = base_vals["valence"] * confidence + (mar * 0.7 * (1.0 - eb_pos) * 0.3) * (1 - confidence)
491
- arsl = base_vals["arousal"] * confidence + ((mar + (1.0 - ear) + eb_pos) / 3.0) * (1 - confidence)
492
- dom = base_vals["dominance"] * confidence + (0.5 + v_tilt) * (1 - confidence)
493
- else:
494
- # Fallback to geometric features only
495
- val = max(0, min(1, mar * 2.0 * (1.0 - eb_pos)))
496
- arsl = max(0, min(1, (mar + (1.0 - ear) + eb_pos) / 3.0))
497
- dom = max(0, min(1, 0.5 + v_tilt))
498
 
499
  # Illustrative Context Adjustments from ad
500
  ad_type = ad_context.get('ad_type', 'Unknown')
@@ -508,18 +444,25 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
508
  val = max(0, min(1, val + val_adj))
509
  arsl = max(0, min(1, arsl + arsl_adj))
510
 
 
 
 
 
 
 
 
511
  # Calculate secondary metrics
512
  neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
513
  em_stab = 1.0 - neur
514
  extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
515
- open = max(0, min(1, 0.5 + ((mar - 0.5) * 0.5)))
516
  agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
517
  consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
518
- stress = max(0, min(1, (cl * 0.5) + (eb_pos * 0.3) + ((1.0 - val) * 0.2)))
519
- engag = max(0, min(1, (arsl * 0.7) + ((1.0 - abs(h_tilt)) * 0.3)))
520
 
521
- # Update the metrics dictionary
522
- calculated_metrics.update({
523
  'valence': val,
524
  'arousal': arsl,
525
  'dominance': dom,
@@ -532,7 +475,7 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
532
  'extraversion': extr,
533
  'stress_index': stress,
534
  'engagement_level': engag
535
- })
536
 
537
  return calculated_metrics
538
 
@@ -609,83 +552,45 @@ def update_metrics_visualization(metrics_values):
609
  plt.tight_layout(pad=0.5)
610
  return fig
611
 
612
- def annotate_frame(frame, facial_data, metrics=None, enhanced_state=None):
613
  """
614
- Add facial landmark annotations and metrics to a frame
615
  """
616
  if frame is None:
617
  return None
618
 
619
  annotated = frame.copy()
620
 
621
- # If we have facial data, draw the landmarks
622
- if facial_data and "landmarks" in facial_data:
623
- landmarks = facial_data["landmarks"]
624
- rect = facial_data.get("rect")
625
-
626
- # Draw face rectangle if available
627
- if rect:
628
- x1, y1, x2, y2 = rect.left(), rect.top(), rect.right(), rect.bottom()
629
- cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
630
-
631
- # Draw the 68 facial landmarks
632
- for i in range(68):
633
- x, y = landmarks.part(i).x, landmarks.part(i).y
634
- cv2.circle(annotated, (x, y), 2, (0, 0, 255), -1)
635
-
636
- # Draw connecting lines for different facial features
637
- # Eyes
638
- for eye_points in [(36, 41), (42, 47)]: # Left eye, Right eye
639
- for i in range(eye_points[0], eye_points[1]):
640
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
641
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
642
- cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
643
- # Connect last point to first
644
- pt1 = (landmarks.part(eye_points[1]).x, landmarks.part(eye_points[1]).y)
645
- pt2 = (landmarks.part(eye_points[0]).x, landmarks.part(eye_points[0]).y)
646
- cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
647
-
648
- # Eyebrows
649
- for brow_points in [(17, 21), (22, 26)]: # Left eyebrow, Right eyebrow
650
- for i in range(brow_points[0], brow_points[1]):
651
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
652
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
653
- cv2.line(annotated, pt1, pt2, (255, 255, 0), 1)
654
-
655
- # Nose
656
- for i in range(27, 35):
657
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
658
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
659
- cv2.line(annotated, pt1, pt2, (255, 0, 255), 1)
660
-
661
- # Mouth outer
662
- for i in range(48, 59):
663
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
664
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
665
- cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
666
- # Connect last point to first for mouth
667
- pt1 = (landmarks.part(59).x, landmarks.part(59).y)
668
- pt2 = (landmarks.part(48).x, landmarks.part(48).y)
669
- cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
670
-
671
- # Mouth inner
672
- for i in range(60, 67):
673
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
674
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
675
- cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
676
- # Connect last point to first for inner mouth
677
- pt1 = (landmarks.part(67).x, landmarks.part(67).y)
678
- pt2 = (landmarks.part(60).x, landmarks.part(60).y)
679
- cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
680
-
681
- # Add metrics summary if available
682
- if metrics:
683
  # Format for display
684
  h, w = annotated.shape[:2]
685
  y_pos = 30 # Starting Y position
686
 
687
- # Add user state if available
 
  if enhanced_state:
 
  # Draw background for text
690
  text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
691
  cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
@@ -695,21 +600,21 @@ def annotate_frame(frame, facial_data, metrics=None, enhanced_state=None):
695
  y_pos += 30
696
 
697
  # Show top 3 metrics
698
- top_metrics = sorted([(k, v) for k, v in metrics.items() if k in metrics],
699
- key=lambda x: x[1], reverse=True)[:3]
700
-
701
- for name, value in top_metrics:
702
- metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
703
- text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
704
- cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
705
- cv2.putText(annotated, metric_text, (10, y_pos),
706
- cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
707
- y_pos += 25
 
708
 
709
  return annotated
710
 
711
  # --- API 1: Video File Processing ---
712
-
713
  def process_video_file(
714
  video_file: Union[str, np.ndarray],
715
  ad_description: str = "",
@@ -770,12 +675,23 @@ def process_video_file(
770
  processed_frames = []
771
 
772
  # Process the single frame
773
- facial_data = extract_face_landmarks_dlib(video_file)
774
- deepface_data = analyze_face_with_deepface(video_file)
775
 
776
- if facial_data:
777
- calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
778
- user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)
779
 
780
  # Create a row for the dataframe
781
  row = {
@@ -784,12 +700,13 @@ def process_video_file(
784
  **calculated_metrics,
785
  **ad_context,
786
  'user_state': user_state,
787
- 'enhanced_user_state': enhanced_state
 
788
  }
789
  metrics_data.append(row)
790
 
791
  # Annotate the frame
792
- annotated_frame = annotate_frame(video_file, facial_data, calculated_metrics, enhanced_state)
793
  processed_frames.append(annotated_frame)
794
 
795
  # Save processed image
@@ -825,10 +742,13 @@ def process_video_file(
825
  metrics_data = []
826
  processed_frames = []
827
  frame_count = 0
 
 
828
 
829
  if show_progress:
830
  print(f"Processing video with {total_frames} frames at {fps} FPS")
831
  print(f"Ad Context: {ad_description} ({ad_type})")
 
832
 
833
  while True:
834
  ret, frame = cap.read()
@@ -840,14 +760,25 @@ def process_video_file(
840
  if show_progress and frame_count % (sampling_rate * 10) == 0:
841
  print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")
842
 
843
- # Extract facial landmarks and analyze with DeepFace
844
- facial_data = extract_face_landmarks_dlib(frame)
845
- deepface_data = analyze_face_with_deepface(frame)
846
 
847
- # Calculate metrics if landmarks detected
848
- if facial_data:
849
- calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
850
- user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)
851
 
852
  # Create a row for the dataframe
853
  row = {
@@ -856,12 +787,13 @@ def process_video_file(
856
  **calculated_metrics,
857
  **ad_context,
858
  'user_state': user_state,
859
- 'enhanced_user_state': enhanced_state
 
860
  }
861
  metrics_data.append(row)
862
 
863
  # Annotate the frame
864
- annotated_frame = annotate_frame(frame, facial_data, calculated_metrics, enhanced_state)
865
 
866
  if save_processed_video:
867
  out.write(annotated_frame)
@@ -898,14 +830,14 @@ def process_video_file(
898
  return csv_path, video_path, metrics_df, processed_frames
899
 
900
  # --- API 2: Webcam Processing Function ---
901
-
902
  def process_webcam_frame(
903
  frame: np.ndarray,
904
  ad_context: Dict[str, Any],
905
  metrics_data: pd.DataFrame,
906
  frame_count: int,
907
- start_time: float
908
- ) -> Tuple[np.ndarray, Dict[str, float], str, pd.DataFrame]:
 
909
  """
910
  Process a single webcam frame
911
 
@@ -915,21 +847,35 @@ def process_webcam_frame(
915
  metrics_data: DataFrame to accumulate metrics
916
  frame_count: Current frame count
917
  start_time: Start time of the session
 
918
 
919
  Returns:
920
- Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_metrics_df)
921
  """
922
  if frame is None:
923
- return None, None, None, metrics_data
924
 
925
- # Extract facial landmarks and analyze with DeepFace
926
- facial_data = extract_face_landmarks_dlib(frame)
927
- deepface_data = analyze_face_with_deepface(frame)
 
928
 
929
- # Calculate metrics if landmarks detected
930
- if facial_data:
931
- calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
932
- user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)
933
 
934
  # Create a row for the dataframe
935
  current_time = time.time()
@@ -939,7 +885,8 @@ def process_webcam_frame(
939
  **calculated_metrics,
940
  **ad_context,
941
  'user_state': user_state,
942
- 'enhanced_user_state': enhanced_state
 
943
  }
944
 
945
  # Add row to DataFrame
@@ -947,15 +894,15 @@ def process_webcam_frame(
947
  metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)
948
 
949
  # Annotate the frame
950
- annotated_frame = annotate_frame(frame, facial_data, calculated_metrics, enhanced_state)
951
 
952
- return annotated_frame, calculated_metrics, enhanced_state, metrics_data
953
  else:
954
  # No face detected
955
  no_face_frame = frame.copy()
956
  cv2.putText(no_face_frame, "No face detected", (30, 30),
957
  cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
958
- return no_face_frame, None, "No face detected", metrics_data
959
 
960
  def start_webcam_session(
961
  ad_description: str = "",
@@ -1003,7 +950,8 @@ def start_webcam_session(
1003
  "last_saved": 0,
1004
  "record_video": record_video,
1005
  "recorded_frames": [] if record_video else None,
1006
- "timestamps": [] if record_video else None
 
1007
  }
1008
 
1009
  return session
@@ -1011,7 +959,7 @@ def start_webcam_session(
1011
  def update_webcam_session(
1012
  session: Dict[str, Any],
1013
  frame: np.ndarray
1014
- ) -> Tuple[np.ndarray, Dict[str, float], str, Dict[str, Any]]:
1015
  """
1016
  Update webcam session with a new frame
1017
 
@@ -1020,20 +968,22 @@ def update_webcam_session(
1020
  frame: New frame from webcam
1021
 
1022
  Returns:
1023
- Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_session)
1024
  """
1025
  # Process the frame
1026
- annotated_frame, metrics, enhanced_state, updated_df = process_webcam_frame(
1027
  frame,
1028
  session["ad_context"],
1029
  session["metrics_data"],
1030
  session["frame_count"],
1031
- session["start_time"]
 
1032
  )
1033
 
1034
  # Update session
1035
  session["frame_count"] += 1
1036
  session["metrics_data"] = updated_df
 
1037
 
1038
  # Record frame if enabled
1039
  if session["record_video"] and annotated_frame is not None:
@@ -1046,7 +996,7 @@ def update_webcam_session(
1046
  updated_df.to_csv(session["csv_path"], index=False)
1047
  session["last_saved"] = session["frame_count"]
1048
 
1049
- return annotated_frame, metrics, enhanced_state, session
1050
 
1051
  def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
1052
  """
@@ -1100,19 +1050,19 @@ def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
1100
  return session["csv_path"], video_path
1101
 
1102
  # --- Create Gradio Interface ---
1103
-
1104
  def create_api_interface():
1105
- with gr.Blocks(title="Enhanced Facial Analysis APIs") as iface:
1106
- gr.Markdown("""
1107
- # Enhanced Facial Analysis APIs
1108
 
1109
  This interface provides two API endpoints:
1110
 
1111
  1. **Video File API**: Upload and analyze pre-recorded videos
1112
  2. **Webcam API**: Analyze live webcam feed in real-time
1113
 
1114
- Both APIs use dlib for facial landmark detection, DeepFace for emotion analysis,
1115
- and Google's Gemini API for enhanced interpretations.
 
1116
  """)
1117
 
1118
  with gr.Tab("Video File API"):
@@ -1231,6 +1181,9 @@ def create_api_interface():
1231
  with gr.Column():
1232
  enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)
1233
 
1234
  with gr.Row():
1235
  download_csv = gr.File(label="Download Session Data")
1236
  download_video = gr.Video(label="Recorded Session")
@@ -1255,18 +1208,18 @@ def create_api_interface():
1255
 
1256
  def process_frame(frame, session):
1257
  if session is None:
1258
- return frame, None, "No active session. Click 'Start Session' to begin.", session
1259
 
1260
  # Process the frame
1261
- annotated_frame, metrics, enhanced_state, updated_session = update_webcam_session(session, frame)
1262
 
1263
  # Update the metrics plot if metrics available
1264
  if metrics:
1265
  metrics_plot = update_metrics_visualization(metrics)
1266
- return annotated_frame, metrics_plot, enhanced_state, updated_session
1267
  else:
1268
  # Return the annotated frame (likely with "No face detected")
1269
- return annotated_frame, None, enhanced_state or "No metrics available", updated_session
1270
 
1271
  def end_session(session):
1272
  if session is None:
@@ -1292,7 +1245,7 @@ def create_api_interface():
1292
  webcam_input.stream(
1293
  process_frame,
1294
  inputs=[webcam_input, session_data],
1295
- outputs=[processed_output, metrics_plot, enhanced_state_txt, session_data]
1296
  )
1297
 
1298
  end_session_btn.click(
@@ -1305,8 +1258,8 @@ def create_api_interface():
1305
 
1306
  # Entry point
1307
  if __name__ == "__main__":
1308
- print("Starting Enhanced Facial Analysis API server...")
1309
  print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
1310
- print(f"Facial analysis using dlib and DeepFace")
1311
  iface = create_api_interface()
1312
  iface.launch(debug=True)
 
3
  import numpy as np
4
  import pandas as pd
5
  import time
 
6
  import matplotlib.pyplot as plt
7
  from matplotlib.colors import LinearSegmentedColormap
8
  from matplotlib.collections import LineCollection
 
18
  import base64
19
  import io
20
  from pathlib import Path
21
+ import torch
22
+ from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
23
+ from io import BytesIO
24
 
25
  # Suppress warnings for cleaner output
26
  warnings.filterwarnings('ignore')
 
49
  print("Running with simulated Gemini API responses.")
50
  GEMINI_ENABLED = False
51
 
52
+ # --- Initialize LLaVA Vision Model ---
53
+ print("Initializing LLaVA Vision Model...")
54
+ LLAVA_ENABLED = False
55
  try:
56
+ # Check if GPU is available
57
+ if torch.cuda.is_available():
58
+ device = "cuda"
59
  else:
60
+ device = "cpu"
61
+
62
+ # Use a smaller LLaVA model for better performance
63
+ model_id = "llava-hf/llava-1.5-7b-hf"
64
+
65
+ # Initialize the model
66
+ processor = AutoProcessor.from_pretrained(model_id)
67
+ llava_model = AutoModelForCausalLM.from_pretrained(
68
+ model_id,
69
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
70
+ low_cpu_mem_usage=True if device == "cuda" else False,
71
+ ).to(device)
72
+
73
+ # Create a pipeline
74
+ vision_llm = pipeline(
75
+ "image-to-text",
76
+ model=llava_model,
77
+ tokenizer=processor.tokenizer,
78
+ image_processor=processor.image_processor,
79
+ device=device,
80
+ max_new_tokens=512,
81
+ )
82
 
83
+ LLAVA_ENABLED = True
84
+ print(f"LLaVA Vision Model initialized successfully on {device.upper()}")
85
+
86
  except Exception as e:
87
+ print(f"WARNING: Failed to initialize LLaVA Vision Model: {e}")
88
+ print("Running with DeepFace only (no LLaVA vision features).")
89
+ vision_llm = None
90
+
91
+ # --- Initialize OpenCV face detector for backup ---
92
+ print("Initializing OpenCV face detector...")
93
+ try:
94
+ # Use OpenCV's built-in face detector as backup
95
+ face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
96
+
97
+ # Check if the face detector loaded successfully
98
+ if face_cascade.empty():
99
+ print("WARNING: Failed to load face cascade classifier")
100
+ else:
101
+ print("OpenCV face detector initialized successfully.")
102
+ except Exception as e:
103
+ print(f"ERROR initializing OpenCV face detector: {e}")
104
+ face_cascade = None
105
 
106
  # --- Metrics Definition ---
107
  metrics = [
 
123
  }
124
 
125
  ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
126
+ user_state_columns = ["user_state", "enhanced_user_state", "llava_analysis"]
127
  all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
128
  initial_metrics_df = pd.DataFrame(columns=all_columns)
129
 
130
+ # --- LLaVA Vision Analysis Function ---
131
+ def analyze_image_with_llava(image, ad_context=None):
132
+ """
133
+ Use LLaVA vision model to analyze facial expression and emotion in image
134
+ """
135
+ if not LLAVA_ENABLED or vision_llm is None or image is None:
136
+ return "LLaVA analysis not available"
137
+
138
+ try:
139
+ # Convert OpenCV image (BGR) to PIL Image (RGB)
140
+ if len(image.shape) == 3 and image.shape[2] == 3:
141
+ # Check if BGR and convert to RGB if needed
142
+ if np.mean(image[:,:,0]) < np.mean(image[:,:,2]): # Rough BGR check
143
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
144
+ else:
145
+ image_rgb = image
146
+ else:
147
+ # Handle grayscale or other formats
148
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
149
+
150
+ # Convert to PIL Image
151
+ pil_image = Image.fromarray(image_rgb)
152
+
153
+ # Create prompt based on ad context
154
+ ad_info = ""
155
+ if ad_context:
156
+ ad_desc = ad_context.get('ad_description', '')
157
+ ad_type = ad_context.get('ad_type', '')
158
+ if ad_desc:
159
+ ad_info = f" while watching an ad about {ad_desc} (type: {ad_type})"
160
+
161
+ prompt = f"""Analyze this person's facial expression and emotion{ad_info}.
162
+ Describe their emotional state, engagement level, and cognitive state in detail.
163
+ Focus on: valence (positive/negative emotion), arousal (excitement level),
164
+ attention, stress indicators, and overall reaction to what they're seeing.
165
+ """
166
+
167
+ # Process with Vision LLM
168
+ outputs = vision_llm(pil_image, prompt=prompt)
169
+
170
+ # Extract the generated text
171
+ if isinstance(outputs, list) and len(outputs) > 0:
172
+ if isinstance(outputs[0], dict) and "generated_text" in outputs[0]:
173
+ return outputs[0]["generated_text"]
174
+ elif isinstance(outputs[0], str):
175
+ return outputs[0]
176
+
177
+ return str(outputs) if outputs else "No results from LLaVA analysis"
178
+
179
+ except Exception as e:
180
+ print(f"Error in LLaVA analysis: {e}")
181
+ return f"LLaVA analysis error: {str(e)}"
182
 
183
+ # --- Gemini API Functions ---
184
  def call_gemini_api_for_ad(description, detail, ad_type):
185
  """
186
  Uses Google Gemini to analyze ad context.
 
212
  print(f"Error calling Gemini for ad context: {e}")
213
  return f"Error analyzing ad context: {str(e)}"
214
 
215
+ def interpret_metrics_with_gemini(metrics_dict, deepface_results=None, llava_analysis=None, ad_context=None):
216
  """
217
+ Uses Google Gemini to interpret facial metrics, DeepFace results and LLaVA analysis
218
+ to determine user state.
219
  """
220
+ if not metrics_dict and not deepface_results and not llava_analysis:
221
  return "No metrics", "No facial data detected"
222
 
223
  if not GEMINI_ENABLED:
224
  # Basic rule-based simulation for user state
225
+ valence = metrics_dict.get('valence', 0.5) if metrics_dict else 0.5
226
+ arousal = metrics_dict.get('arousal', 0.5) if metrics_dict else 0.5
227
+
228
+ # Extract emotion from DeepFace if available
229
+ dominant_emotion = "neutral"
230
+ if deepface_results and "emotion" in deepface_results:
231
+ emotion_dict = deepface_results["emotion"]
232
+ dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
233
 
234
  # Simple rule-based simulation
235
+ state = dominant_emotion.capitalize() if dominant_emotion != "neutral" else "Neutral"
236
+ if valence > 0.65 and arousal > 0.55:
237
  state = "Positive, Engaged"
238
+ elif valence < 0.4 and arousal > 0.6:
239
  state = "Stressed, Negative"
 
 
 
 
240
 
241
+ enhanced_state = f"The viewer appears {state.lower()} while watching this content."
242
+ if llava_analysis and llava_analysis != "LLaVA analysis not available":
243
+ # Extract a brief summary from LLaVA analysis (first sentence)
244
+ first_sentence = llava_analysis.split('.')[0] + '.'
245
+ enhanced_state += f" {first_sentence}"
246
 
247
  return state, enhanced_state
248
  else:
249
  try:
250
  # Format metrics for Gemini
251
+ metrics_formatted = ""
252
+ if metrics_dict:
253
+ metrics_formatted = "\nMetrics (0-1 scale):\n" + "\n".join([f"- {k.replace('_', ' ').title()}: {v:.2f}" for k, v in metrics_dict.items()
254
+ if k not in ('timestamp', 'frame_number')])
255
+
256
+ # Format DeepFace results
257
+ deepface_formatted = ""
258
+ if deepface_results and "emotion" in deepface_results:
259
+ emotion_dict = deepface_results["emotion"]
260
+ deepface_formatted = "\nDeepFace emotions:\n" + "\n".join([f"- {k.title()}: {v:.2f}" for k, v in emotion_dict.items()])
261
+
262
+ # Format LLaVA analysis
263
+ llava_formatted = ""
264
+ if llava_analysis and llava_analysis != "LLaVA analysis not available":
265
+ llava_formatted = f"\nLLaVA Vision Analysis:\n{llava_analysis}"
266
 
267
  # Include ad context if available
268
  ad_info = ""
 
272
  ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"
273
 
274
  prompt = f"""
275
+ Analyze the facial expression and emotion of a person watching an advertisement{ad_info}.
276
+
277
+ Use these combined inputs:{metrics_formatted}{deepface_formatted}{llava_formatted}
278
 
279
  Provide two outputs:
280
  1. User State: A short 1-3 word description of their emotional/cognitive state
 
305
  print(f"Error calling Gemini for metric interpretation: {e}")
306
  return "Error", f"Error analyzing facial metrics: {str(e)}"
307
 
308
+ # --- DeepFace Analysis Function ---
309
  def analyze_face_with_deepface(image):
310
+ """Analyze facial emotions and attributes using DeepFace"""
311
  if image is None:
312
  return None
313
 
 
330
  # Analyze with DeepFace
331
  analysis = DeepFace.analyze(
332
  img_path=temp_img,
333
+ actions=['emotion', 'age', 'gender', 'race'],
334
  enforce_detection=False, # Don't throw error if face not detected
335
  detector_backend='opencv' # Faster detection
336
  )
 
351
  print(f"DeepFace analysis error: {e}")
352
  return None
353
 
354
+ # --- Face Detection Backup with OpenCV ---
355
+ def detect_face_opencv(image):
356
+ """Detect faces using OpenCV cascade classifier as backup"""
357
+ if image is None or face_cascade is None:
358
+ return None
359
 
360
  try:
361
+ # Convert to grayscale for detection
362
+ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
363
 
364
+ # Detect faces
365
+ faces = face_cascade.detectMultiScale(
366
+ gray,
367
+ scaleFactor=1.1,
368
+ minNeighbors=5,
369
+ minSize=(30, 30)
370
+ )
371
 
372
+ if len(faces) == 0:
373
+ return None
 
374
 
375
+ # Get the largest face by area
376
+ largest_face = max(faces, key=lambda rect: rect[2] * rect[3])
 
377
 
378
+ return {"rect": largest_face}
379
 
380
  except Exception as e:
381
+ print(f"Error in OpenCV face detection: {e}")
382
+ return None
383
 
384
+ # --- Calculate Metrics from DeepFace Results ---
385
+ def calculate_metrics_from_deepface(deepface_results, ad_context=None):
386
  """
387
+ Calculate psychometric metrics from DeepFace analysis results
 
388
  """
389
  if ad_context is None:
390
  ad_context = {}
 
393
  default_metrics = {m: 0.5 for m in metrics}
394
 
395
  # If no facial data, return defaults
396
+ if not deepface_results or "emotion" not in deepface_results:
397
  return default_metrics
398
 
399
+ # Extract emotion data from DeepFace
400
+ emotion_dict = deepface_results["emotion"]
401
+ # Find dominant emotion
402
+ dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
403
+ dominant_score = max(emotion_dict.items(), key=lambda x: x[1])[1] / 100.0 # Convert to 0-1 scale
404
+
405
+ # Get base values from emotion mapping
406
+ base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})
407
+
408
+ # Calculate primary metrics with confidence weighting
409
+ val = base_vals["valence"]
410
+ arsl = base_vals["arousal"]
411
+ dom = base_vals["dominance"]
412
+
413
+ # Add directional adjustments based on specific emotions
414
+ if dominant_emotion == "happy":
415
+ val += 0.1
416
+ elif dominant_emotion == "sad":
417
+ val -= 0.1
418
+ elif dominant_emotion == "angry":
419
+ arsl += 0.1
420
+ dom += 0.1
421
+ elif dominant_emotion == "fear":
422
+ arsl += 0.1
423
+ dom -= 0.1
424
+
425
+ # Adjust for gender and age if available (just examples of potential factors)
426
+ if "gender" in deepface_results:
427
+ gender = deepface_results["gender"]
428
+ gender_score = deepface_results.get("gender_score", 0.5)
429
+ # No real adjustment needed, this is just an example
430
+
431
+ if "age" in deepface_results:
432
+ age = deepface_results["age"]
433
+ # No real adjustment needed, this is just an example
434
 
435
  # Illustrative Context Adjustments from ad
436
  ad_type = ad_context.get('ad_type', 'Unknown')
 
444
  val = max(0, min(1, val + val_adj))
445
  arsl = max(0, min(1, arsl + arsl_adj))
446
 
447
+ # Estimate cognitive load based on emotional intensity
448
+ cl = 0.5 # Default
449
+ if dominant_emotion in ["neutral"]:
450
+ cl = 0.3 # Lower cognitive load for neutral expression
451
+ elif dominant_emotion in ["surprise", "fear"]:
452
+ cl = 0.7 # Higher cognitive load for surprise/fear
453
+
454
  # Calculate secondary metrics
455
  neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
456
  em_stab = 1.0 - neur
457
  extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
458
+ open = max(0, min(1, 0.5 + (val - 0.5) * 0.5))
459
  agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
460
  consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
461
+ stress = max(0, min(1, (cl * 0.5) + ((1.0 - val) * 0.5)))
462
+ engag = max(0, min(1, arsl * 0.7 + (val * 0.3)))
463
 
464
+ # Create metrics dictionary
465
+ calculated_metrics = {
466
  'valence': val,
467
  'arousal': arsl,
468
  'dominance': dom,
 
475
  'extraversion': extr,
476
  'stress_index': stress,
477
  'engagement_level': engag
478
+ }
479
 
480
  return calculated_metrics
481
 
 
552
  plt.tight_layout(pad=0.5)
553
  return fig
554
 
555
+ def annotate_frame(frame, face_data=None, deepface_results=None, metrics=None, enhanced_state=None):
556
  """
557
+ Add facial annotations and metrics to a frame
558
  """
559
  if frame is None:
560
  return None
561
 
562
  annotated = frame.copy()
563
 
564
+ # Draw face rectangle if available
565
+ if face_data and "rect" in face_data:
566
+ x, y, w, h = face_data["rect"]
567
+ cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
568
+ elif deepface_results and "region" in deepface_results:
569
+ region = deepface_results["region"]
570
+ x, y, w, h = region["x"], region["y"], region["w"], region["h"]
571
+ cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
572
+
573
+ # Add emotion and metrics summary
574
+ if deepface_results or metrics:
575
  # Format for display
576
  h, w = annotated.shape[:2]
577
  y_pos = 30 # Starting Y position
578
 
579
+ # Add emotion info if available from DeepFace
580
+ if deepface_results and "dominant_emotion" in deepface_results:
581
+ emotion_text = f"Emotion: {deepface_results['dominant_emotion'].capitalize()}"
582
+ text_size = cv2.getTextSize(emotion_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
583
+ cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
584
+ cv2.putText(annotated, emotion_text, (10, y_pos),
585
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
586
+ y_pos += 30
587
+
588
+ # Add enhanced user state if available
589
  if enhanced_state:
590
+ # Truncate if too long
591
+ if len(enhanced_state) > 60:
592
+ enhanced_state = enhanced_state[:57] + "..."
593
+
594
  # Draw background for text
595
  text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
596
  cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
 
600
  y_pos += 30
601
 
602
  # Show top 3 metrics
603
+ if metrics:
604
+ top_metrics = sorted([(k, v) for k, v in metrics.items() if k in metrics],
605
+ key=lambda x: x[1], reverse=True)[:3]
606
+
607
+ for name, value in top_metrics:
608
+ metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
609
+ text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
610
+ cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
611
+ cv2.putText(annotated, metric_text, (10, y_pos),
612
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
613
+ y_pos += 25
614
 
615
  return annotated
616
 
617
  # --- API 1: Video File Processing ---
 
618
  def process_video_file(
619
  video_file: Union[str, np.ndarray],
620
  ad_description: str = "",
 
675
  processed_frames = []
676
 
677
  # Process the single frame
678
+ deepface_results = analyze_face_with_deepface(video_file)
679
+ face_data = None
680
 
681
+ # Fall back to OpenCV face detection if DeepFace didn't detect a face
682
+ if not deepface_results or "region" not in deepface_results:
683
+ face_data = detect_face_opencv(video_file)
684
+
685
+ # Use LLaVA for additional analysis (once per frame)
686
+ llava_analysis = "LLaVA analysis not available"
687
+ if face_data is not None or (deepface_results and "region" in deepface_results):
688
+ # Only use LLaVA if a face was detected
689
+ llava_analysis = analyze_image_with_llava(video_file, ad_context)
690
+
691
+ # Calculate metrics if face detected
692
+ if deepface_results or face_data:
693
+ calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
694
+ user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
695
 
696
  # Create a row for the dataframe
697
  row = {
 
700
  **calculated_metrics,
701
  **ad_context,
702
  'user_state': user_state,
703
+ 'enhanced_user_state': enhanced_state,
704
+ 'llava_analysis': llava_analysis
705
  }
706
  metrics_data.append(row)
707
 
708
  # Annotate the frame
709
+ annotated_frame = annotate_frame(video_file, face_data, deepface_results, calculated_metrics, enhanced_state)
710
  processed_frames.append(annotated_frame)
711
 
712
  # Save processed image
 
742
  metrics_data = []
743
  processed_frames = []
744
  frame_count = 0
745
+ llava_counter = 0 # To limit LLaVA analysis (it's slow)
746
+ llava_interval = sampling_rate * 10 # Run LLaVA every X frames
747
 
748
  if show_progress:
749
  print(f"Processing video with {total_frames} frames at {fps} FPS")
750
  print(f"Ad Context: {ad_description} ({ad_type})")
751
+ print(f"LLaVA Vision Model: {'Enabled' if LLAVA_ENABLED else 'Disabled'}")
752
 
753
  while True:
754
  ret, frame = cap.read()
 
760
  if show_progress and frame_count % (sampling_rate * 10) == 0:
761
  print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")
762
 
763
+ # Analyze with DeepFace
764
+ deepface_results = analyze_face_with_deepface(frame)
765
+ face_data = None
766
+
767
+ # Fall back to OpenCV face detection if DeepFace didn't detect a face
768
+ if not deepface_results or "region" not in deepface_results:
769
+ face_data = detect_face_opencv(frame)
770
+
771
+ # Use LLaVA for additional analysis (periodically to save time)
772
+ llava_analysis = "LLaVA analysis not available"
773
+ if (face_data is not None or (deepface_results and "region" in deepface_results)) and llava_counter % llava_interval == 0:
774
+ # Only use LLaVA if a face was detected and on the right interval
775
+ llava_analysis = analyze_image_with_llava(frame, ad_context)
776
+ llava_counter += 1
777
 
778
+ # Calculate metrics if face detected
779
+ if deepface_results or face_data:
780
+ calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
781
+ user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
782
 
783
  # Create a row for the dataframe
784
  row = {
 
787
  **calculated_metrics,
788
  **ad_context,
789
  'user_state': user_state,
790
+ 'enhanced_user_state': enhanced_state,
791
+ 'llava_analysis': llava_analysis
792
  }
793
  metrics_data.append(row)
794
 
795
  # Annotate the frame
796
+ annotated_frame = annotate_frame(frame, face_data, deepface_results, calculated_metrics, enhanced_state)
797
 
798
  if save_processed_video:
799
  out.write(annotated_frame)
 
830
  return csv_path, video_path, metrics_df, processed_frames
831
 
832
  # --- API 2: Webcam Processing Function ---
 
833
  def process_webcam_frame(
834
  frame: np.ndarray,
835
  ad_context: Dict[str, Any],
836
  metrics_data: pd.DataFrame,
837
  frame_count: int,
838
+ start_time: float,
839
+ llava_counter: int
840
+ ) -> Tuple[np.ndarray, Dict[str, float], str, str, pd.DataFrame, int]:
841
  """
842
  Process a single webcam frame
843
 
 
847
  metrics_data: DataFrame to accumulate metrics
848
  frame_count: Current frame count
849
  start_time: Start time of the session
850
+ llava_counter: Counter to limit LLaVA calls
851
 
852
  Returns:
853
+ Tuple of (annotated_frame, metrics_dict, enhanced_state, llava_analysis, updated_metrics_df, updated_llava_counter)
854
  """
855
  if frame is None:
856
+ return None, None, None, None, metrics_data, llava_counter
857
+
858
+ # Analyze with DeepFace
859
+ deepface_results = analyze_face_with_deepface(frame)
860
+ face_data = None
861
+
862
+ # Fall back to OpenCV face detection if DeepFace didn't detect a face
863
+ if not deepface_results or "region" not in deepface_results:
864
+ face_data = detect_face_opencv(frame)
865
+
866
+ # Use LLaVA for periodic analysis (it's slow)
867
+ llava_analysis = "LLaVA analysis not available"
868
+ llava_interval = 30 # Run LLaVA every X frames
869
 
870
+ if (face_data is not None or (deepface_results and "region" in deepface_results)) and llava_counter % llava_interval == 0:
871
+ # Only use LLaVA if a face was detected and on the right interval
872
+ llava_analysis = analyze_image_with_llava(frame, ad_context)
873
+ llava_counter += 1
874
 
875
+ # Calculate metrics if face detected
876
+ if deepface_results or face_data:
877
+ calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
878
+ user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
879
 
880
  # Create a row for the dataframe
881
  current_time = time.time()
 
885
  **calculated_metrics,
886
  **ad_context,
887
  'user_state': user_state,
888
+ 'enhanced_user_state': enhanced_state,
889
+ 'llava_analysis': llava_analysis
890
  }
891
 
892
  # Add row to DataFrame
 
894
  metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)
895
 
896
  # Annotate the frame
897
+ annotated_frame = annotate_frame(frame, face_data, deepface_results, calculated_metrics, enhanced_state)
898
 
899
+ return annotated_frame, calculated_metrics, enhanced_state, llava_analysis, metrics_data, llava_counter
900
  else:
901
  # No face detected
902
  no_face_frame = frame.copy()
903
  cv2.putText(no_face_frame, "No face detected", (30, 30),
904
  cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
905
+ return no_face_frame, None, "No face detected", None, metrics_data, llava_counter
906
 
907
  def start_webcam_session(
908
  ad_description: str = "",
 
950
  "last_saved": 0,
951
  "record_video": record_video,
952
  "recorded_frames": [] if record_video else None,
953
+ "timestamps": [] if record_video else None,
954
+ "llava_counter": 0 # Counter to limit LLaVA calls
955
  }
956
 
957
  return session
 
959
  def update_webcam_session(
960
  session: Dict[str, Any],
961
  frame: np.ndarray
962
+ ) -> Tuple[np.ndarray, Dict[str, float], str, str, Dict[str, Any]]:
963
  """
964
  Update webcam session with a new frame
965
 
 
968
  frame: New frame from webcam
969
 
970
  Returns:
971
+ Tuple of (annotated_frame, metrics_dict, enhanced_state, llava_analysis, updated_session)
972
  """
973
  # Process the frame
974
+ annotated_frame, metrics, enhanced_state, llava_analysis, updated_df, updated_llava_counter = process_webcam_frame(
975
  frame,
976
  session["ad_context"],
977
  session["metrics_data"],
978
  session["frame_count"],
979
+ session["start_time"],
980
+ session["llava_counter"]
981
  )
982
 
983
  # Update session
984
  session["frame_count"] += 1
985
  session["metrics_data"] = updated_df
986
+ session["llava_counter"] = updated_llava_counter
987
 
988
  # Record frame if enabled
989
  if session["record_video"] and annotated_frame is not None:
 
996
  updated_df.to_csv(session["csv_path"], index=False)
997
  session["last_saved"] = session["frame_count"]
998
 
999
+ return annotated_frame, metrics, enhanced_state, llava_analysis, session
1000
 
1001
  def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
1002
  """
 
1050
  return session["csv_path"], video_path
1051
 
1052
  # --- Create Gradio Interface ---
 
1053
  def create_api_interface():
1054
+ with gr.Blocks(title="Facial Analysis APIs") as iface:
1055
+ gr.Markdown(f"""
1056
+ # Enhanced Facial Analysis APIs (LLaVA + DeepFace)
1057
 
1058
  This interface provides two API endpoints:
1059
 
1060
  1. **Video File API**: Upload and analyze pre-recorded videos
1061
  2. **Webcam API**: Analyze live webcam feed in real-time
1062
 
1063
+ Both APIs use DeepFace for emotion analysis and Google's Gemini API for enhanced interpretations.
1064
+
1065
+ **LLaVA Vision Model: {'✅ Enabled' if LLAVA_ENABLED else '❌ Disabled'}**
1066
  """)
1067
 
1068
  with gr.Tab("Video File API"):
 
1181
  with gr.Column():
1182
  enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)
1183
 
1184
+ with gr.Row():
1185
+ llava_analysis_txt = gr.Textbox(label="LLaVA Vision Analysis", lines=6)
1186
+
1187
  with gr.Row():
1188
  download_csv = gr.File(label="Download Session Data")
1189
  download_video = gr.Video(label="Recorded Session")
 
1208
 
1209
  def process_frame(frame, session):
1210
  if session is None:
1211
+ return frame, None, "No active session. Click 'Start Session' to begin.", "LLaVA analysis not available", session
1212
 
1213
  # Process the frame
1214
+ annotated_frame, metrics, enhanced_state, llava_analysis, updated_session = update_webcam_session(session, frame)
1215
 
1216
  # Update the metrics plot if metrics available
1217
  if metrics:
1218
  metrics_plot = update_metrics_visualization(metrics)
1219
+ return annotated_frame, metrics_plot, enhanced_state, llava_analysis or "LLaVA analysis not available", updated_session
1220
  else:
1221
  # Return the annotated frame (likely with "No face detected")
1222
+ return annotated_frame, None, enhanced_state or "No metrics available", "LLaVA analysis not available", updated_session
1223
 
1224
  def end_session(session):
1225
  if session is None:
 
1245
  webcam_input.stream(
1246
  process_frame,
1247
  inputs=[webcam_input, session_data],
1248
+ outputs=[processed_output, metrics_plot, enhanced_state_txt, llava_analysis_txt, session_data]
1249
  )
1250
 
1251
  end_session_btn.click(
 
1258
 
1259
  # Entry point
1260
  if __name__ == "__main__":
1261
+ print("Starting Enhanced Facial Analysis API (LLaVA + DeepFace)...")
1262
  print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
1263
+ print(f"LLaVA Vision Model {'enabled' if LLAVA_ENABLED else 'disabled (using DeepFace only)'}")
1264
  iface = create_api_interface()
1265
  iface.launch(debug=True)
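
For reference, a minimal usage sketch of the video-file API defined in this version of app.py. It is not part of the commit; the file path and ad description below are placeholders, and every parameter not shown in the diff is left at its default.

# Hypothetical call into app.py's video-file API (sketch, not from the commit).
from app import process_video_file

csv_path, video_path, metrics_df, frames = process_video_file(
    "viewer_recording.mp4",                       # placeholder path to a recorded viewer video
    ad_description="Sparkling water summer spot", # placeholder ad context
)

# metrics_df follows the column layout declared near the top of app.py:
# ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
print(metrics_df[["timestamp", "valence", "arousal", "user_state"]].head())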