Update app.py
app.py
CHANGED
@@ -3,7 +3,6 @@ import cv2
 import numpy as np
 import pandas as pd
 import time
-import dlib
 import matplotlib.pyplot as plt
 from matplotlib.colors import LinearSegmentedColormap
 from matplotlib.collections import LineCollection
@@ -19,6 +18,9 @@ from deepface import DeepFace
 import base64
 import io
 from pathlib import Path

 # Suppress warnings for cleaner output
 warnings.filterwarnings('ignore')
@@ -47,32 +49,59 @@ except Exception as e:
     print("Running with simulated Gemini API responses.")
     GEMINI_ENABLED = False

-# --- Initialize …
-print("Initializing …
 try:
-    # …
-
-    # Paths to shape predictor model file
-    # You need to download this file from:
-    # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
-    predictor_path = "shape_predictor_68_face_landmarks.dat"
-
-    # Check if the predictor file exists, otherwise inform the user
-    if not os.path.exists(predictor_path):
-        print(f"WARNING: {predictor_path} not found. Please download from:")
-        print("http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2")
-        print("Extract and place in the current directory.")
-        # Use a placeholder or alternative
-        shape_predictor = None
     else:
-        … (dlib predictor-loading lines truncated in the diff view)

 except Exception as e:
-    print(f"…
-    … (old dlib error-handling lines truncated in the diff view)

 # --- Metrics Definition ---
 metrics = [
@@ -94,12 +123,64 @@ emotion_mapping = {
 }

 ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
-user_state_columns = ["user_state", "enhanced_user_state"]
 all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
 initial_metrics_df = pd.DataFrame(columns=all_columns)

-# --- …

 def call_gemini_api_for_ad(description, detail, ad_type):
     """
     Uses Google Gemini to analyze ad context.
@@ -131,40 +212,57 @@ def call_gemini_api_for_ad(description, detail, ad_type):
         print(f"Error calling Gemini for ad context: {e}")
         return f"Error analyzing ad context: {str(e)}"

-def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
     """
-    Uses Google Gemini to interpret facial metrics and …
     """
-    if not metrics_dict:
         return "No metrics", "No facial data detected"

     if not GEMINI_ENABLED:
         # Basic rule-based simulation for user state
-        valence = metrics_dict.get('valence', 0.5)
-        arousal = metrics_dict.get('arousal', 0.5)
-        … (further metric lookups truncated in the diff view)

         # Simple rule-based simulation
-        state = "Neutral"
-        if valence > 0.65 and arousal > 0.55…
             state = "Positive, Engaged"
-        elif valence < 0.4 and …
             state = "Stressed, Negative"
-        elif cog_load > 0.7 and engagement < 0.4:
-            state = "Confused, Disengaged"
-        elif arousal < 0.4 and engagement < 0.5:
-            state = "Calm, Passive"

-        enhanced_state = f"The viewer appears {state.lower()} while watching this content.…

         return state, enhanced_state
     else:
         try:
             # Format metrics for Gemini
-            metrics_formatted = "…
-            … (old metric formatting truncated in the diff view)

             # Include ad context if available
             ad_info = ""
@@ -174,8 +272,9 @@ def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
                 ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"

             prompt = f"""
-            Analyze…
-            …

             Provide two outputs:
             1. User State: A short 1-3 word description of their emotional/cognitive state
@@ -206,45 +305,9 @@ def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
     print(f"Error calling Gemini for metric interpretation: {e}")
     return "Error", f"Error analyzing facial metrics: {str(e)}"

-# --- …
-
-def extract_face_landmarks_dlib(image):
-    """Extract facial landmarks using dlib"""
-    if image is None or face_detector is None or shape_predictor is None:
-        return None
-
-    try:
-        # Convert to grayscale for dlib
-        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-        # Detect faces
-        faces = face_detector(gray, 0)
-
-        if len(faces) == 0:
-            return None
-
-        # Get the largest face by area
-        largest_face = faces[0]
-        largest_area = (faces[0].right() - faces[0].left()) * (faces[0].bottom() - faces[0].top())
-
-        for face in faces:
-            area = (face.right() - face.left()) * (face.bottom() - face.top())
-            if area > largest_area:
-                largest_face = face
-                largest_area = area
-
-        # Get facial landmarks
-        landmarks = shape_predictor(gray, largest_face)
-
-        # Return both the face detection rectangle and landmarks
-        return {"rect": largest_face, "landmarks": landmarks}
-
-    except Exception as e:
-        print(f"Error in dlib landmark extraction: {e}")
-        return None
-
 def analyze_face_with_deepface(image):
-    """Analyze facial emotions using DeepFace"""
     if image is None:
         return None
@@ -267,7 +330,7 @@ def analyze_face_with_deepface(image):
         # Analyze with DeepFace
         analysis = DeepFace.analyze(
             img_path=temp_img,
-            actions=['emotion'],
            enforce_detection=False,  # Don't throw error if face not detected
            detector_backend='opencv'  # Faster detection
        )
@@ -288,159 +351,40 @@ def analyze_face_with_deepface(image):
     print(f"DeepFace analysis error: {e}")
     return None

-… (header and docstring of the old EAR helper truncated in the diff view)
-    try:
-        # dlib's 68-point face model landmark indices
-        # Left eye: 36-41, Right eye: 42-47
-        LEFT_EYE = range(36, 42)
-        RIGHT_EYE = range(42, 48)
-
-        def get_eye_aspect_ratio(eye_points):
-            # Compute the euclidean distances between the two sets of vertical landmarks
-            v1 = np.linalg.norm(eye_points[1] - eye_points[5])
-            v2 = np.linalg.norm(eye_points[2] - eye_points[4])
-            # Compute the euclidean distance between the horizontal landmarks
-            h = np.linalg.norm(eye_points[0] - eye_points[3])
-            # Compute the eye aspect ratio
-            return (v1 + v2) / (2.0 * h) if h > 1e-6 else 0.0
-
-        # Extract landmark coordinates
-        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
-
-        # Calculate EAR for left and right eyes
-        left_eye_coords = landmark_coords[list(LEFT_EYE)]
-        right_eye_coords = landmark_coords[list(RIGHT_EYE)]
-
-        left_ear = get_eye_aspect_ratio(left_eye_coords)
-        right_ear = get_eye_aspect_ratio(right_eye_coords)
-
-        # Return average of both eyes
-        return (left_ear + right_ear) / 2.0
-
-    except Exception as e:
-        print(f"Error calculating EAR: {e}")
-        return 0.0
-
-def calculate_mar_dlib(landmarks):
-    """Calculate Mouth Aspect Ratio using dlib landmarks"""
-    if landmarks is None:
-        return 0.0
-
-    try:
-        # dlib's 68-point face model landmark indices for mouth
-        # Mouth outer: 48-59, Mouth inner: 60-67
-        MOUTH_OUTER = range(48, 60)
-        MOUTH_INNER = range(60, 68)
-
-        # Extract landmark coordinates
-        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
-
-        # Use specific points for vertical and horizontal measurements
-        # Vertical: distance between top and bottom lips
-        top_lip = landmark_coords[51]  # Top lip center
-        bottom_lip = landmark_coords[57]  # Bottom lip center
-        vertical = np.linalg.norm(top_lip - bottom_lip)
-
-        # Horizontal: distance between mouth corners
-        left_corner = landmark_coords[48]  # Left mouth corner
-        right_corner = landmark_coords[54]  # Right mouth corner
-        horizontal = np.linalg.norm(left_corner - right_corner)
-
-        # Calculate ratio
-        return vertical / horizontal if horizontal > 1e-6 else 0.0
-
-    except Exception as e:
-        print(f"Error calculating MAR: {e}")
-        return 0.0
-
-def calculate_eyebrow_position_dlib(landmarks):
-    """Calculate eyebrow position using dlib landmarks"""
-    if landmarks is None:
-        return 0.0
-
-    try:
-        # dlib's 68-point face model landmark indices
-        # Left eyebrow: 17-21, Right eyebrow: 22-26
-        # Left eye: 36-41, Right eye: 42-47
-        L_BROW_C = 19  # Center of left eyebrow
-        R_BROW_C = 24  # Center of right eyebrow
-        L_EYE_C = 37  # Center top of left eye
-        R_EYE_C = 43  # Center top of right eye
-
-        # Extract landmark coordinates
-        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
-
-        # Calculate distances between eyebrows and eyes
-        l_brow_y = landmark_coords[L_BROW_C][1]
-        r_brow_y = landmark_coords[R_BROW_C][1]
-        l_eye_y = landmark_coords[L_EYE_C][1]
-        r_eye_y = landmark_coords[R_EYE_C][1]
-
-        # Calculate vertical distances (smaller value means eyebrows are raised)
-        l_dist = l_eye_y - l_brow_y
-        r_dist = r_eye_y - r_brow_y
-
-        # Average the distances and normalize
-        avg_dist = (l_dist + r_dist) / 2.0
-        # Approximate normalization based on typical face proportions
-        # Higher value means eyebrows are raised more
-        norm = (avg_dist - 5) / 15  # Adjusted for typical pixel distances
-
-        return max(0.0, min(1.0, norm))
-
-    except Exception as e:
-        print(f"Error calculating Eyebrow Position: {e}")
-        return 0.0
-
-def estimate_head_pose_dlib(landmarks):
-    """Estimate head pose using dlib landmarks"""
-    if landmarks is None:
-        return 0.0, 0.0

     try:
-        # …
-        LEFT_EYE_C = 37  # Left eye center
-        RIGHT_EYE_C = 44  # Right eye center
-
-        # Extract landmark coordinates
-        landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
-
-        # Get key points
-        nose_pt = landmark_coords[NOSE_TIP]
-        l_eye_pt = landmark_coords[LEFT_EYE_C]
-        r_eye_pt = landmark_coords[RIGHT_EYE_C]
-
-        # Calculate eye midpoint
-        eye_mid_x = (l_eye_pt[0] + r_eye_pt[0]) / 2.0
-        eye_mid_y = (l_eye_pt[1] + r_eye_pt[1]) / 2.0

-        # …
-        … (tilt computation truncated in the diff view)
-        h_tilt_norm = h_tilt / 20.0  # Approximate normalization

-        # …
-        h_tilt_norm = max(-1.0, min(1.0, h_tilt_norm))

-        return …

     except Exception as e:
-        print(f"Error …
-        return …

-… (old header of calculate_metrics_enhanced(...) truncated in the diff view)
     """
-    Calculate …
-    This provides a more robust approach by integrating both geometric and deep learning methods.
     """
     if ad_context is None:
         ad_context = {}
@@ -449,52 +393,44 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
     default_metrics = {m: 0.5 for m in metrics}

     # If no facial data, return defaults
-    if not …
         return default_metrics

-    # Extract …
-    … (old geometric feature extraction — ear, mar, eb_pos, head tilt — truncated in the diff view)
-
-    # …
-    if …
-        # …
-        val = base_vals["valence"] * confidence + (mar * 0.7 * (1.0 - eb_pos) * 0.3) * (1 - confidence)
-        arsl = base_vals["arousal"] * confidence + ((mar + (1.0 - ear) + eb_pos) / 3.0) * (1 - confidence)
-        dom = base_vals["dominance"] * confidence + (0.5 + v_tilt) * (1 - confidence)
-    else:
-        # Fallback to geometric features only
-        val = max(0, min(1, mar * 2.0 * (1.0 - eb_pos)))
-        arsl = max(0, min(1, (mar + (1.0 - ear) + eb_pos) / 3.0))
-        dom = max(0, min(1, 0.5 + v_tilt))

     # Illustrative Context Adjustments from ad
     ad_type = ad_context.get('ad_type', 'Unknown')
@@ -508,18 +444,25 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
     val = max(0, min(1, val + val_adj))
     arsl = max(0, min(1, arsl + arsl_adj))

     # Calculate secondary metrics
     neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
     em_stab = 1.0 - neur
     extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
-    open = max(0, min(1, 0.5 + (…
     agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
     consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
-    stress = max(0, min(1, (cl * 0.5) + (…
-    engag = max(0, min(1, …

-    # …
-    calculated_metrics …
         'valence': val,
         'arousal': arsl,
         'dominance': dom,
@@ -532,7 +475,7 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
         'extraversion': extr,
         'stress_index': stress,
         'engagement_level': engag
-    }

     return calculated_metrics
@@ -609,83 +552,45 @@ def update_metrics_visualization(metrics_values):
     plt.tight_layout(pad=0.5)
     return fig

-def annotate_frame(frame, …
     """
-    Add facial …
     """
     if frame is None:
         return None

     annotated = frame.copy()

-    # …
-    if …
-    … (old dlib face-rectangle drawing truncated in the diff view)
-        for i in range(68):
-            x, y = landmarks.part(i).x, landmarks.part(i).y
-            cv2.circle(annotated, (x, y), 2, (0, 0, 255), -1)
-
-        # Draw connecting lines for different facial features
-        # Eyes
-        for eye_points in [(36, 41), (42, 47)]:  # Left eye, Right eye
-            for i in range(eye_points[0], eye_points[1]):
-                pt1 = (landmarks.part(i).x, landmarks.part(i).y)
-                pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
-                cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
-            # Connect last point to first
-            pt1 = (landmarks.part(eye_points[1]).x, landmarks.part(eye_points[1]).y)
-            pt2 = (landmarks.part(eye_points[0]).x, landmarks.part(eye_points[0]).y)
-            cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
-
-        # Eyebrows
-        for brow_points in [(17, 21), (22, 26)]:  # Left eyebrow, Right eyebrow
-            for i in range(brow_points[0], brow_points[1]):
-                pt1 = (landmarks.part(i).x, landmarks.part(i).y)
-                pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
-                cv2.line(annotated, pt1, pt2, (255, 255, 0), 1)
-
-        # Nose
-        for i in range(27, 35):
-            pt1 = (landmarks.part(i).x, landmarks.part(i).y)
-            pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
-            cv2.line(annotated, pt1, pt2, (255, 0, 255), 1)
-
-        # Mouth outer
-        for i in range(48, 59):
-            pt1 = (landmarks.part(i).x, landmarks.part(i).y)
-            pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
-            cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
-        # Connect last point to first for mouth
-        pt1 = (landmarks.part(59).x, landmarks.part(59).y)
-        pt2 = (landmarks.part(48).x, landmarks.part(48).y)
-        cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
-
-        # Mouth inner
-        for i in range(60, 67):
-            pt1 = (landmarks.part(i).x, landmarks.part(i).y)
-            pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
-            cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
-        # Connect last point to first for inner mouth
-        pt1 = (landmarks.part(67).x, landmarks.part(67).y)
-        pt2 = (landmarks.part(60).x, landmarks.part(60).y)
-        cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
-
-    # Add metrics summary if available
-    if metrics:
         # Format for display
         h, w = annotated.shape[:2]
         y_pos = 30  # Starting Y position

-        # Add …
         if enhanced_state:
             # Draw background for text
             text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
             cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
@@ -695,21 +600,21 @@ def annotate_frame(frame, facial_data, metrics=None, enhanced_state=None):
             y_pos += 30

         # Show top 3 metrics
-        … (old top-3 metrics drawing truncated in the diff view)

     return annotated

 # --- API 1: Video File Processing ---
-
 def process_video_file(
     video_file: Union[str, np.ndarray],
     ad_description: str = "",
@@ -770,12 +675,23 @@ def process_video_file(
     processed_frames = []

     # Process the single frame
-    … (old single-frame dlib/DeepFace extraction truncated in the diff view)

-    if …
-        … (old metric calculation truncated in the diff view)

     # Create a row for the dataframe
     row = {
@@ -784,12 +700,13 @@ def process_video_file(
         **calculated_metrics,
         **ad_context,
         'user_state': user_state,
-        'enhanced_user_state': enhanced_state
     }
     metrics_data.append(row)

     # Annotate the frame
-    annotated_frame = annotate_frame(video_file, …
     processed_frames.append(annotated_frame)

     # Save processed image
@@ -825,10 +742,13 @@ def process_video_file(
     metrics_data = []
     processed_frames = []
     frame_count = 0

     if show_progress:
         print(f"Processing video with {total_frames} frames at {fps} FPS")
         print(f"Ad Context: {ad_description} ({ad_type})")

     while True:
         ret, frame = cap.read()
@@ -840,14 +760,25 @@ def process_video_file(
         if show_progress and frame_count % (sampling_rate * 10) == 0:
             print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")

-        # …
-        … (old per-frame dlib extraction truncated in the diff view)

-        # Calculate metrics if …
-        if …
-            calculated_metrics = …
-            user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)

             # Create a row for the dataframe
             row = {
@@ -856,12 +787,13 @@ def process_video_file(
                 **calculated_metrics,
                 **ad_context,
                 'user_state': user_state,
-                'enhanced_user_state': enhanced_state
             }
             metrics_data.append(row)

             # Annotate the frame
-            annotated_frame = annotate_frame(frame, …

             if save_processed_video:
                 out.write(annotated_frame)
@@ -898,14 +830,14 @@ def process_video_file(
     return csv_path, video_path, metrics_df, processed_frames

 # --- API 2: Webcam Processing Function ---
-
 def process_webcam_frame(
     frame: np.ndarray,
     ad_context: Dict[str, Any],
     metrics_data: pd.DataFrame,
     frame_count: int,
-    start_time: float
-…
     """
     Process a single webcam frame
@@ -915,21 +847,35 @@ def process_webcam_frame(
         metrics_data: DataFrame to accumulate metrics
         frame_count: Current frame count
         start_time: Start time of the session

     Returns:
-        Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_metrics_df)
     """
     if frame is None:
-        return None, None, None, metrics_data

-    … (old dlib/DeepFace extraction truncated in the diff view)

-    # Calculate metrics if …
-    if …
-        calculated_metrics = …
-        user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)

         # Create a row for the dataframe
         current_time = time.time()
@@ -939,7 +885,8 @@ def process_webcam_frame(
             **calculated_metrics,
             **ad_context,
             'user_state': user_state,
-            'enhanced_user_state': enhanced_state
         }

         # Add row to DataFrame
@@ -947,15 +894,15 @@ def process_webcam_frame(
         metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)

         # Annotate the frame
-        annotated_frame = annotate_frame(frame, …

-        return annotated_frame, calculated_metrics, enhanced_state, metrics_data
     else:
         # No face detected
         no_face_frame = frame.copy()
         cv2.putText(no_face_frame, "No face detected", (30, 30),
                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
-        return no_face_frame, None, "No face detected", metrics_data

 def start_webcam_session(
     ad_description: str = "",
@@ -1003,7 +950,8 @@ def start_webcam_session(
         "last_saved": 0,
         "record_video": record_video,
         "recorded_frames": [] if record_video else None,
-        "timestamps": [] if record_video else None
     }

     return session
@@ -1011,7 +959,7 @@ def start_webcam_session(
 def update_webcam_session(
     session: Dict[str, Any],
     frame: np.ndarray
-) -> Tuple[np.ndarray, Dict[str, float], str, Dict[str, Any]]:
     """
     Update webcam session with a new frame
@@ -1020,20 +968,22 @@ def update_webcam_session(
         frame: New frame from webcam

     Returns:
-        Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_session)
     """
     # Process the frame
-    annotated_frame, metrics, enhanced_state, updated_df = process_webcam_frame(
         frame,
         session["ad_context"],
         session["metrics_data"],
         session["frame_count"],
-        session["start_time"]
     )

     # Update session
     session["frame_count"] += 1
     session["metrics_data"] = updated_df

     # Record frame if enabled
     if session["record_video"] and annotated_frame is not None:
@@ -1046,7 +996,7 @@ def update_webcam_session(
         updated_df.to_csv(session["csv_path"], index=False)
         session["last_saved"] = session["frame_count"]

-    return annotated_frame, metrics, enhanced_state, session

 def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
     """
@@ -1100,19 +1050,19 @@ def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
     return session["csv_path"], video_path

 # --- Create Gradio Interface ---
-
 def create_api_interface():
-    with gr.Blocks(title="…
-        gr.Markdown("""
-        # Enhanced Facial Analysis APIs

         This interface provides two API endpoints:

         1. **Video File API**: Upload and analyze pre-recorded videos
         2. **Webcam API**: Analyze live webcam feed in real-time

-        Both APIs use …
-        …
         """)

         with gr.Tab("Video File API"):
@@ -1231,6 +1181,9 @@ def create_api_interface():
                     with gr.Column():
                         enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)

                 with gr.Row():
                     download_csv = gr.File(label="Download Session Data")
                     download_video = gr.Video(label="Recorded Session")
@@ -1255,18 +1208,18 @@ def create_api_interface():

             def process_frame(frame, session):
                 if session is None:
-                    return frame, None, "No active session. Click 'Start Session' to begin.", session

                 # Process the frame
-                annotated_frame, metrics, enhanced_state, updated_session = update_webcam_session(session, frame)

                 # Update the metrics plot if metrics available
                 if metrics:
                     metrics_plot = update_metrics_visualization(metrics)
-                    return annotated_frame, metrics_plot, enhanced_state, updated_session
                 else:
                     # Return the annotated frame (likely with "No face detected")
-                    return annotated_frame, None, enhanced_state or "No metrics available", updated_session

             def end_session(session):
                 if session is None:
@@ -1292,7 +1245,7 @@ def create_api_interface():
             webcam_input.stream(
                 process_frame,
                 inputs=[webcam_input, session_data],
-                outputs=[processed_output, metrics_plot, enhanced_state_txt, session_data]
             )

             end_session_btn.click(
@@ -1305,8 +1258,8 @@ def create_api_interface():

 # Entry point
 if __name__ == "__main__":
-    print("Starting Enhanced Facial Analysis API …
     print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
-    print(f"…
     iface = create_api_interface()
     iface.launch(debug=True)
|
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
import time
|
|
|
6 |
import matplotlib.pyplot as plt
|
7 |
from matplotlib.colors import LinearSegmentedColormap
|
8 |
from matplotlib.collections import LineCollection
|
|
|
18 |
import base64
|
19 |
import io
|
20 |
from pathlib import Path
|
21 |
+
import torch
|
22 |
+
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
|
23 |
+
from io import BytesIO
|
24 |
|
25 |
# Suppress warnings for cleaner output
|
26 |
warnings.filterwarnings('ignore')
|
|
|
49 |
print("Running with simulated Gemini API responses.")
|
50 |
GEMINI_ENABLED = False
|
51 |
|
52 |
+
# --- Initialize LLaVA Vision Model ---
|
53 |
+
print("Initializing LLaVA Vision Model...")
|
54 |
+
LLAVA_ENABLED = False
|
55 |
try:
|
56 |
+
# Check if GPU is available
|
57 |
+
if torch.cuda.is_available():
|
58 |
+
device = "cuda"
|
|
59 |
else:
|
60 |
+
device = "cpu"
|
61 |
+
|
62 |
+
# Use a smaller LLaVA model for better performance
|
63 |
+
model_id = "llava-hf/llava-1.5-7b-hf"
|
64 |
+
|
65 |
+
# Initialize the model
|
66 |
+
processor = AutoProcessor.from_pretrained(model_id)
|
67 |
+
llava_model = AutoModelForCausalLM.from_pretrained(
|
68 |
+
model_id,
|
69 |
+
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
70 |
+
low_cpu_mem_usage=True if device == "cuda" else False,
|
71 |
+
).to(device)
|
72 |
+
|
73 |
+
# Create a pipeline
|
74 |
+
vision_llm = pipeline(
|
75 |
+
"image-to-text",
|
76 |
+
model=llava_model,
|
77 |
+
tokenizer=processor.tokenizer,
|
78 |
+
image_processor=processor.image_processor,
|
79 |
+
device=device,
|
80 |
+
max_new_tokens=512,
|
81 |
+
)
|
82 |
|
83 |
+
LLAVA_ENABLED = True
|
84 |
+
print(f"LLaVA Vision Model initialized successfully on {device.upper()}")
|
85 |
+
|
86 |
except Exception as e:
|
87 |
+
print(f"WARNING: Failed to initialize LLaVA Vision Model: {e}")
|
88 |
+
print("Running with DeepFace only (no LLaVA vision features).")
|
89 |
+
vision_llm = None
|
90 |
+
|
91 |
+
# --- Initialize OpenCV face detector for backup ---
|
92 |
+
print("Initializing OpenCV face detector...")
|
93 |
+
try:
|
94 |
+
# Use OpenCV's built-in face detector as backup
|
95 |
+
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
|
96 |
+
|
97 |
+
# Check if the face detector loaded successfully
|
98 |
+
if face_cascade.empty():
|
99 |
+
print("WARNING: Failed to load face cascade classifier")
|
100 |
+
else:
|
101 |
+
print("OpenCV face detector initialized successfully.")
|
102 |
+
except Exception as e:
|
103 |
+
print(f"ERROR initializing OpenCV face detector: {e}")
|
104 |
+
face_cascade = None
|
105 |
|
106 |
# --- Metrics Definition ---
|
107 |
metrics = [
|
|
|
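For reference, a minimal standalone sketch of the vision-model call this commit wires up. It assumes the llava-hf/llava-1.5-7b-hf checkpoint, a CUDA GPU, a recent transformers release, and a hypothetical local image frame.jpg; the hf LLaVA checkpoints are normally driven through LlavaForConditionalGeneration with the "USER: <image> ... ASSISTANT:" prompt template rather than AutoModelForCausalLM, so treat this as a sketch of the technique, not the committed code path.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Hedged sketch: exercise the LLaVA checkpoint stand-alone.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to("cuda")

# llava-1.5-hf checkpoints expect the "USER: <image>\n... ASSISTANT:" template.
prompt = "USER: <image>\nDescribe this person's facial expression and apparent emotion. ASSISTANT:"
image = Image.open("frame.jpg")  # hypothetical test frame
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda", torch.float16)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output_ids[0], skip_special_tokens=True))

Because a 7B vision model is slow per call, the committed code only invokes it every llava_interval frames (see the llava_counter logic further down).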
123 |
}
|
124 |
|
125 |
ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
|
126 |
+
user_state_columns = ["user_state", "enhanced_user_state", "llava_analysis"]
|
127 |
all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
|
128 |
initial_metrics_df = pd.DataFrame(columns=all_columns)
|
129 |
|
130 |
+
# --- LLaVA Vision Analysis Function ---
|
131 |
+
def analyze_image_with_llava(image, ad_context=None):
|
132 |
+
"""
|
133 |
+
Use LLaVA vision model to analyze facial expression and emotion in image
|
134 |
+
"""
|
135 |
+
if not LLAVA_ENABLED or vision_llm is None or image is None:
|
136 |
+
return "LLaVA analysis not available"
|
137 |
+
|
138 |
+
try:
|
139 |
+
# Convert OpenCV image (BGR) to PIL Image (RGB)
|
140 |
+
if len(image.shape) == 3 and image.shape[2] == 3:
|
141 |
+
# Check if BGR and convert to RGB if needed
|
142 |
+
if np.mean(image[:,:,0]) < np.mean(image[:,:,2]): # Rough BGR check
|
143 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
144 |
+
else:
|
145 |
+
image_rgb = image
|
146 |
+
else:
|
147 |
+
# Handle grayscale or other formats
|
148 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
149 |
+
|
150 |
+
# Convert to PIL Image
|
151 |
+
pil_image = Image.fromarray(image_rgb)
|
152 |
+
|
153 |
+
# Create prompt based on ad context
|
154 |
+
ad_info = ""
|
155 |
+
if ad_context:
|
156 |
+
ad_desc = ad_context.get('ad_description', '')
|
157 |
+
ad_type = ad_context.get('ad_type', '')
|
158 |
+
if ad_desc:
|
159 |
+
ad_info = f" while watching an ad about {ad_desc} (type: {ad_type})"
|
160 |
+
|
161 |
+
prompt = f"""Analyze this person's facial expression and emotion{ad_info}.
|
162 |
+
Describe their emotional state, engagement level, and cognitive state in detail.
|
163 |
+
Focus on: valence (positive/negative emotion), arousal (excitement level),
|
164 |
+
attention, stress indicators, and overall reaction to what they're seeing.
|
165 |
+
"""
|
166 |
+
|
167 |
+
# Process with Vision LLM
|
168 |
+
outputs = vision_llm(pil_image, prompt=prompt)
|
169 |
+
|
170 |
+
# Extract the generated text
|
171 |
+
if isinstance(outputs, list) and len(outputs) > 0:
|
172 |
+
if isinstance(outputs[0], dict) and "generated_text" in outputs[0]:
|
173 |
+
return outputs[0]["generated_text"]
|
174 |
+
elif isinstance(outputs[0], str):
|
175 |
+
return outputs[0]
|
176 |
+
|
177 |
+
return str(outputs) if outputs else "No results from LLaVA analysis"
|
178 |
+
|
179 |
+
except Exception as e:
|
180 |
+
print(f"Error in LLaVA analysis: {e}")
|
181 |
+
return f"LLaVA analysis error: {str(e)}"
|
182 |
|
183 |
+
# --- Gemini API Functions ---
|
184 |
def call_gemini_api_for_ad(description, detail, ad_type):
|
185 |
"""
|
186 |
Uses Google Gemini to analyze ad context.
|
|
|
212 |
print(f"Error calling Gemini for ad context: {e}")
|
213 |
return f"Error analyzing ad context: {str(e)}"
|
214 |
|
215 |
+
def interpret_metrics_with_gemini(metrics_dict, deepface_results=None, llava_analysis=None, ad_context=None):
|
216 |
"""
|
217 |
+
Uses Google Gemini to interpret facial metrics, DeepFace results and LLaVA analysis
|
218 |
+
to determine user state.
|
219 |
"""
|
220 |
+
if not metrics_dict and not deepface_results and not llava_analysis:
|
221 |
return "No metrics", "No facial data detected"
|
222 |
|
223 |
if not GEMINI_ENABLED:
|
224 |
# Basic rule-based simulation for user state
|
225 |
+
valence = metrics_dict.get('valence', 0.5) if metrics_dict else 0.5
|
226 |
+
arousal = metrics_dict.get('arousal', 0.5) if metrics_dict else 0.5
|
227 |
+
|
228 |
+
# Extract emotion from DeepFace if available
|
229 |
+
dominant_emotion = "neutral"
|
230 |
+
if deepface_results and "emotion" in deepface_results:
|
231 |
+
emotion_dict = deepface_results["emotion"]
|
232 |
+
dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
|
233 |
|
234 |
# Simple rule-based simulation
|
235 |
+
state = dominant_emotion.capitalize() if dominant_emotion != "neutral" else "Neutral"
|
236 |
+
if valence > 0.65 and arousal > 0.55:
|
237 |
state = "Positive, Engaged"
|
238 |
+
elif valence < 0.4 and arousal > 0.6:
|
239 |
state = "Stressed, Negative"
|
|
|
|
|
|
|
|
|
240 |
|
241 |
+
enhanced_state = f"The viewer appears {state.lower()} while watching this content."
|
242 |
+
if llava_analysis and llava_analysis != "LLaVA analysis not available":
|
243 |
+
# Extract a brief summary from LLaVA analysis (first sentence)
|
244 |
+
first_sentence = llava_analysis.split('.')[0] + '.'
|
245 |
+
enhanced_state += f" {first_sentence}"
|
246 |
|
247 |
return state, enhanced_state
|
248 |
else:
|
249 |
try:
|
250 |
# Format metrics for Gemini
|
251 |
+
metrics_formatted = ""
|
252 |
+
if metrics_dict:
|
253 |
+
metrics_formatted = "\nMetrics (0-1 scale):\n" + "\n".join([f"- {k.replace('_', ' ').title()}: {v:.2f}" for k, v in metrics_dict.items()
|
254 |
+
if k not in ('timestamp', 'frame_number')])
|
255 |
+
|
256 |
+
# Format DeepFace results
|
257 |
+
deepface_formatted = ""
|
258 |
+
if deepface_results and "emotion" in deepface_results:
|
259 |
+
emotion_dict = deepface_results["emotion"]
|
260 |
+
deepface_formatted = "\nDeepFace emotions:\n" + "\n".join([f"- {k.title()}: {v:.2f}" for k, v in emotion_dict.items()])
|
261 |
+
|
262 |
+
# Format LLaVA analysis
|
263 |
+
llava_formatted = ""
|
264 |
+
if llava_analysis and llava_analysis != "LLaVA analysis not available":
|
265 |
+
llava_formatted = f"\nLLaVA Vision Analysis:\n{llava_analysis}"
|
266 |
|
267 |
# Include ad context if available
|
268 |
ad_info = ""
|
|
|
272 |
ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"
|
273 |
|
274 |
prompt = f"""
|
275 |
+
Analyze the facial expression and emotion of a person watching an advertisement{ad_info}.
|
276 |
+
|
277 |
+
Use these combined inputs:{metrics_formatted}{deepface_formatted}{llava_formatted}
|
278 |
|
279 |
Provide two outputs:
|
280 |
1. User State: A short 1-3 word description of their emotional/cognitive state
|
|
|
305 |
print(f"Error calling Gemini for metric interpretation: {e}")
|
306 |
return "Error", f"Error analyzing facial metrics: {str(e)}"
|
307 |
|
308 |
+
# --- DeepFace Analysis Function ---
|
309 |
def analyze_face_with_deepface(image):
|
310 |
+
"""Analyze facial emotions and attributes using DeepFace"""
|
311 |
if image is None:
|
312 |
return None
|
313 |
|
|
|
330 |
# Analyze with DeepFace
|
331 |
analysis = DeepFace.analyze(
|
332 |
img_path=temp_img,
|
333 |
+
actions=['emotion', 'age', 'gender', 'race'],
|
334 |
enforce_detection=False, # Don't throw error if face not detected
|
335 |
detector_backend='opencv' # Faster detection
|
336 |
)
|
|
|
351 |
print(f"DeepFace analysis error: {e}")
|
352 |
return None
|
353 |
|
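As a quick illustration of the DeepFace call used by analyze_face_with_deepface above — a sketch assuming deepface is installed and a hypothetical test image; recent DeepFace releases return a list with one dict per detected face:

from deepface import DeepFace

# Hedged sketch: same actions and detector backend as the committed code.
result = DeepFace.analyze(
    img_path="frame.jpg",                          # hypothetical test image
    actions=["emotion", "age", "gender", "race"],
    enforce_detection=False,                       # don't raise if no face is found
    detector_backend="opencv",
)
face = result[0] if isinstance(result, list) else result
print(face["dominant_emotion"])                    # e.g. "happy"
print(face["region"])                              # face box: {'x': ..., 'y': ..., 'w': ..., 'h': ...}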
354 |
+
# --- Face Detection Backup with OpenCV ---
|
355 |
+
def detect_face_opencv(image):
|
356 |
+
"""Detect faces using OpenCV cascade classifier as backup"""
|
357 |
+
if image is None or face_cascade is None:
|
358 |
+
return None
|
359 |
|
360 |
try:
|
361 |
+
# Convert to grayscale for detection
|
362 |
+
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
363 |
|
364 |
+
# Detect faces
|
365 |
+
faces = face_cascade.detectMultiScale(
|
366 |
+
gray,
|
367 |
+
scaleFactor=1.1,
|
368 |
+
minNeighbors=5,
|
369 |
+
minSize=(30, 30)
|
370 |
+
)
|
371 |
|
372 |
+
if len(faces) == 0:
|
373 |
+
return None
|
|
|
374 |
|
375 |
+
# Get the largest face by area
|
376 |
+
largest_face = max(faces, key=lambda rect: rect[2] * rect[3])
|
|
|
377 |
|
378 |
+
return {"rect": largest_face}
|
379 |
|
380 |
except Exception as e:
|
381 |
+
print(f"Error in OpenCV face detection: {e}")
|
382 |
+
return None
|
383 |
|
384 |
+
# --- Calculate Metrics from DeepFace Results ---
|
385 |
+
def calculate_metrics_from_deepface(deepface_results, ad_context=None):
|
386 |
"""
|
387 |
+
Calculate psychometric metrics from DeepFace analysis results
|
|
|
388 |
"""
|
389 |
if ad_context is None:
|
390 |
ad_context = {}
|
|
|
393 |
default_metrics = {m: 0.5 for m in metrics}
|
394 |
|
395 |
# If no facial data, return defaults
|
396 |
+
if not deepface_results or "emotion" not in deepface_results:
|
397 |
return default_metrics
|
398 |
|
399 |
+
# Extract emotion data from DeepFace
|
400 |
+
emotion_dict = deepface_results["emotion"]
|
401 |
+
# Find dominant emotion
|
402 |
+
dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
|
403 |
+
dominant_score = max(emotion_dict.items(), key=lambda x: x[1])[1] / 100.0 # Convert to 0-1 scale
|
404 |
+
|
405 |
+
# Get base values from emotion mapping
|
406 |
+
base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})
|
407 |
+
|
408 |
+
# Calculate primary metrics with confidence weighting
|
409 |
+
val = base_vals["valence"]
|
410 |
+
arsl = base_vals["arousal"]
|
411 |
+
dom = base_vals["dominance"]
|
412 |
+
|
413 |
+
# Add directional adjustments based on specific emotions
|
414 |
+
if dominant_emotion == "happy":
|
415 |
+
val += 0.1
|
416 |
+
elif dominant_emotion == "sad":
|
417 |
+
val -= 0.1
|
418 |
+
elif dominant_emotion == "angry":
|
419 |
+
arsl += 0.1
|
420 |
+
dom += 0.1
|
421 |
+
elif dominant_emotion == "fear":
|
422 |
+
arsl += 0.1
|
423 |
+
dom -= 0.1
|
424 |
+
|
425 |
+
# Adjust for gender and age if available (just examples of potential factors)
|
426 |
+
if "gender" in deepface_results:
|
427 |
+
gender = deepface_results["gender"]
|
428 |
+
gender_score = deepface_results.get("gender_score", 0.5)
|
429 |
+
# No real adjustment needed, this is just an example
|
430 |
+
|
431 |
+
if "age" in deepface_results:
|
432 |
+
age = deepface_results["age"]
|
433 |
+
# No real adjustment needed, this is just an example
|
434 |
|
435 |
# Illustrative Context Adjustments from ad
|
436 |
ad_type = ad_context.get('ad_type', 'Unknown')
|
|
|
444 |
val = max(0, min(1, val + val_adj))
|
445 |
arsl = max(0, min(1, arsl + arsl_adj))
|
446 |
|
447 |
+
# Estimate cognitive load based on emotional intensity
|
448 |
+
cl = 0.5 # Default
|
449 |
+
if dominant_emotion in ["neutral"]:
|
450 |
+
cl = 0.3 # Lower cognitive load for neutral expression
|
451 |
+
elif dominant_emotion in ["surprise", "fear"]:
|
452 |
+
cl = 0.7 # Higher cognitive load for surprise/fear
|
453 |
+
|
454 |
# Calculate secondary metrics
|
455 |
neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
|
456 |
em_stab = 1.0 - neur
|
457 |
extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
|
458 |
+
open = max(0, min(1, 0.5 + (val - 0.5) * 0.5))
|
459 |
agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
|
460 |
consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
|
461 |
+
stress = max(0, min(1, (cl * 0.5) + ((1.0 - val) * 0.5)))
|
462 |
+
engag = max(0, min(1, arsl * 0.7 + (val * 0.3)))
|
463 |
|
464 |
+
# Create metrics dictionary
|
465 |
+
calculated_metrics = {
|
466 |
'valence': val,
|
467 |
'arousal': arsl,
|
468 |
'dominance': dom,
|
|
|
475 |
'extraversion': extr,
|
476 |
'stress_index': stress,
|
477 |
'engagement_level': engag
|
478 |
+
}
|
479 |
|
480 |
return calculated_metrics
|
481 |
|
|
|
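To make the flow above concrete, a small worked example of how a DeepFace emotion vector becomes metric values in calculate_metrics_from_deepface. The emotion_mapping entries and input percentages below are illustrative assumptions, not the values hard-coded elsewhere in app.py; only the formulas mirror the committed code.

# Hedged sketch with made-up numbers, mirroring calculate_metrics_from_deepface().
emotion_mapping_example = {                        # illustrative base values (assumption)
    "happy":   {"valence": 0.8, "arousal": 0.6, "dominance": 0.6},
    "neutral": {"valence": 0.5, "arousal": 0.4, "dominance": 0.5},
    "fear":    {"valence": 0.3, "arousal": 0.8, "dominance": 0.3},
}

deepface_results = {"emotion": {"happy": 72.0, "neutral": 20.0, "fear": 8.0}}   # percentages
dominant = max(deepface_results["emotion"].items(), key=lambda kv: kv[1])[0]    # "happy"
base = emotion_mapping_example.get(dominant, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})

val = base["valence"] + 0.1                        # "happy" bumps valence, as in the diff
arsl = base["arousal"]
engagement = max(0, min(1, arsl * 0.7 + val * 0.3))   # same formula as engagement_level
print(dominant, round(val, 2), round(engagement, 2))  # happy 0.9 0.69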
552 |
plt.tight_layout(pad=0.5)
|
553 |
return fig
|
554 |
|
555 |
+
def annotate_frame(frame, face_data=None, deepface_results=None, metrics=None, enhanced_state=None):
|
556 |
"""
|
557 |
+
Add facial annotations and metrics to a frame
|
558 |
"""
|
559 |
if frame is None:
|
560 |
return None
|
561 |
|
562 |
annotated = frame.copy()
|
563 |
|
564 |
+
# Draw face rectangle if available
|
565 |
+
if face_data and "rect" in face_data:
|
566 |
+
x, y, w, h = face_data["rect"]
|
567 |
+
cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
568 |
+
elif deepface_results and "region" in deepface_results:
|
569 |
+
region = deepface_results["region"]
|
570 |
+
x, y, w, h = region["x"], region["y"], region["w"], region["h"]
|
571 |
+
cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
|
572 |
+
|
573 |
+
# Add emotion and metrics summary
|
574 |
+
if deepface_results or metrics:
|
575 |
# Format for display
|
576 |
h, w = annotated.shape[:2]
|
577 |
y_pos = 30 # Starting Y position
|
578 |
|
579 |
+
# Add emotion info if available from DeepFace
|
580 |
+
if deepface_results and "dominant_emotion" in deepface_results:
|
581 |
+
emotion_text = f"Emotion: {deepface_results['dominant_emotion'].capitalize()}"
|
582 |
+
text_size = cv2.getTextSize(emotion_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
|
583 |
+
cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
|
584 |
+
cv2.putText(annotated, emotion_text, (10, y_pos),
|
585 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
|
586 |
+
y_pos += 30
|
587 |
+
|
588 |
+
# Add enhanced user state if available
|
589 |
if enhanced_state:
|
590 |
+
# Truncate if too long
|
591 |
+
if len(enhanced_state) > 60:
|
592 |
+
enhanced_state = enhanced_state[:57] + "..."
|
593 |
+
|
594 |
# Draw background for text
|
595 |
text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
|
596 |
cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
|
|
|
600 |
y_pos += 30
|
601 |
|
602 |
# Show top 3 metrics
|
603 |
+
if metrics:
|
604 |
+
top_metrics = sorted([(k, v) for k, v in metrics.items() if k in metrics],
|
605 |
+
key=lambda x: x[1], reverse=True)[:3]
|
606 |
+
|
607 |
+
for name, value in top_metrics:
|
608 |
+
metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
|
609 |
+
text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
|
610 |
+
cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
|
611 |
+
cv2.putText(annotated, metric_text, (10, y_pos),
|
612 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
|
613 |
+
y_pos += 25
|
614 |
|
615 |
return annotated
|
616 |
|
617 |
# --- API 1: Video File Processing ---
|
|
|
618 |
def process_video_file(
|
619 |
video_file: Union[str, np.ndarray],
|
620 |
ad_description: str = "",
|
|
|
675 |
processed_frames = []
|
676 |
|
677 |
# Process the single frame
|
678 |
+
deepface_results = analyze_face_with_deepface(video_file)
|
679 |
+
face_data = None
|
680 |
|
681 |
+
# Fall back to OpenCV face detection if DeepFace didn't detect a face
|
682 |
+
if not deepface_results or "region" not in deepface_results:
|
683 |
+
face_data = detect_face_opencv(video_file)
|
684 |
+
|
685 |
+
# Use LLaVA for additional analysis (once per frame)
|
686 |
+
llava_analysis = "LLaVA analysis not available"
|
687 |
+
if face_data is not None or (deepface_results and "region" in deepface_results):
|
688 |
+
# Only use LLaVA if a face was detected
|
689 |
+
llava_analysis = analyze_image_with_llava(video_file, ad_context)
|
690 |
+
|
691 |
+
# Calculate metrics if face detected
|
692 |
+
if deepface_results or face_data:
|
693 |
+
calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
|
694 |
+
user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
|
695 |
|
696 |
# Create a row for the dataframe
|
697 |
row = {
|
|
|
700 |
**calculated_metrics,
|
701 |
**ad_context,
|
702 |
'user_state': user_state,
|
703 |
+
'enhanced_user_state': enhanced_state,
|
704 |
+
'llava_analysis': llava_analysis
|
705 |
}
|
706 |
metrics_data.append(row)
|
707 |
|
708 |
# Annotate the frame
|
709 |
+
annotated_frame = annotate_frame(video_file, face_data, deepface_results, calculated_metrics, enhanced_state)
|
710 |
processed_frames.append(annotated_frame)
|
711 |
|
712 |
# Save processed image
|
|
|
742 |
metrics_data = []
|
743 |
processed_frames = []
|
744 |
frame_count = 0
|
745 |
+
llava_counter = 0 # To limit LLaVA analysis (it's slow)
|
746 |
+
llava_interval = sampling_rate * 10 # Run LLaVA every X frames
|
747 |
|
748 |
if show_progress:
|
749 |
print(f"Processing video with {total_frames} frames at {fps} FPS")
|
750 |
print(f"Ad Context: {ad_description} ({ad_type})")
|
751 |
+
print(f"LLaVA Vision Model: {'Enabled' if LLAVA_ENABLED else 'Disabled'}")
|
752 |
|
753 |
while True:
|
754 |
ret, frame = cap.read()
|
|
|
760 |
if show_progress and frame_count % (sampling_rate * 10) == 0:
|
761 |
print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")
|
762 |
|
763 |
+
# Analyze with DeepFace
|
764 |
+
deepface_results = analyze_face_with_deepface(frame)
|
765 |
+
face_data = None
|
766 |
+
|
767 |
+
# Fall back to OpenCV face detection if DeepFace didn't detect a face
|
768 |
+
if not deepface_results or "region" not in deepface_results:
|
769 |
+
face_data = detect_face_opencv(frame)
|
770 |
+
|
771 |
+
# Use LLaVA for additional analysis (periodically to save time)
|
772 |
+
llava_analysis = "LLaVA analysis not available"
|
773 |
+
if (face_data is not None or (deepface_results and "region" in deepface_results)) and llava_counter % llava_interval == 0:
|
774 |
+
# Only use LLaVA if a face was detected and on the right interval
|
775 |
+
llava_analysis = analyze_image_with_llava(frame, ad_context)
|
776 |
+
llava_counter += 1
|
777 |
|
778 |
+
# Calculate metrics if face detected
|
779 |
+
if deepface_results or face_data:
|
780 |
+
calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
|
781 |
+
user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
|
782 |
|
783 |
# Create a row for the dataframe
|
784 |
row = {
|
|
|
787 |
**calculated_metrics,
|
788 |
**ad_context,
|
789 |
'user_state': user_state,
|
790 |
+
'enhanced_user_state': enhanced_state,
|
791 |
+
'llava_analysis': llava_analysis
|
792 |
}
|
793 |
metrics_data.append(row)
|
794 |
|
795 |
# Annotate the frame
|
796 |
+
annotated_frame = annotate_frame(frame, face_data, deepface_results, calculated_metrics, enhanced_state)
|
797 |
|
798 |
if save_processed_video:
|
799 |
out.write(annotated_frame)
|
|
|
830 |
return csv_path, video_path, metrics_df, processed_frames
|
831 |
|
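A sketch of driving the video-file API directly from Python, without the Gradio UI. Keyword names other than the video path and ad_description are assumptions about the parts of the signature not shown in this diff; the four return values match the return statement above.

# Hedged sketch: call the file-based API programmatically.
csv_path, video_path, metrics_df, frames = process_video_file(
    "sample_ad_reaction.mp4",              # hypothetical input video
    ad_description="30-second sneaker ad",
    ad_type="Video",                       # assumed keyword, mirroring the ad_type used in the function body
)

print(metrics_df[["frame_number", "valence", "arousal", "engagement_level", "user_state"]].head())
print("Per-frame metrics CSV:", csv_path)
print("Annotated video:", video_path)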
832 |
# --- API 2: Webcam Processing Function ---
|
|
|
833 |
def process_webcam_frame(
|
834 |
frame: np.ndarray,
|
835 |
ad_context: Dict[str, Any],
|
836 |
metrics_data: pd.DataFrame,
|
837 |
frame_count: int,
|
838 |
+
start_time: float,
|
839 |
+
llava_counter: int
|
840 |
+
) -> Tuple[np.ndarray, Dict[str, float], str, str, pd.DataFrame, int]:
|
841 |
"""
|
842 |
Process a single webcam frame
|
843 |
|
|
|
847 |
metrics_data: DataFrame to accumulate metrics
|
848 |
frame_count: Current frame count
|
849 |
start_time: Start time of the session
|
850 |
+
llava_counter: Counter to limit LLaVA calls
|
851 |
|
852 |
Returns:
|
853 |
+
Tuple of (annotated_frame, metrics_dict, enhanced_state, llava_analysis, updated_metrics_df, updated_llava_counter)
|
854 |
"""
|
855 |
if frame is None:
|
856 |
+
return None, None, None, None, metrics_data, llava_counter
|
857 |
+
|
858 |
+
# Analyze with DeepFace
|
859 |
+
deepface_results = analyze_face_with_deepface(frame)
|
860 |
+
face_data = None
|
861 |
+
|
862 |
+
# Fall back to OpenCV face detection if DeepFace didn't detect a face
|
863 |
+
if not deepface_results or "region" not in deepface_results:
|
864 |
+
face_data = detect_face_opencv(frame)
|
865 |
+
|
866 |
+
# Use LLaVA for periodic analysis (it's slow)
|
867 |
+
llava_analysis = "LLaVA analysis not available"
|
868 |
+
llava_interval = 30 # Run LLaVA every X frames
|
869 |
|
870 |
+
if (face_data is not None or (deepface_results and "region" in deepface_results)) and llava_counter % llava_interval == 0:
|
871 |
+
# Only use LLaVA if a face was detected and on the right interval
|
872 |
+
llava_analysis = analyze_image_with_llava(frame, ad_context)
|
873 |
+
llava_counter += 1
|
874 |
|
875 |
+
# Calculate metrics if face detected
|
876 |
+
if deepface_results or face_data:
|
877 |
+
calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
|
878 |
+
user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
|
879 |
|
880 |
# Create a row for the dataframe
|
881 |
current_time = time.time()
|
|
|
885 |
**calculated_metrics,
|
886 |
**ad_context,
|
887 |
'user_state': user_state,
|
888 |
+
'enhanced_user_state': enhanced_state,
|
889 |
+
'llava_analysis': llava_analysis
|
890 |
}
|
891 |
|
892 |
# Add row to DataFrame
|
|
|
894 |
metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)
|
895 |
|
896 |
# Annotate the frame
|
897 |
+
annotated_frame = annotate_frame(frame, face_data, deepface_results, calculated_metrics, enhanced_state)
|
898 |
|
899 |
+
return annotated_frame, calculated_metrics, enhanced_state, llava_analysis, metrics_data, llava_counter
|
900 |
else:
|
901 |
# No face detected
|
902 |
no_face_frame = frame.copy()
|
903 |
cv2.putText(no_face_frame, "No face detected", (30, 30),
|
904 |
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
|
905 |
+
return no_face_frame, None, "No face detected", None, metrics_data, llava_counter
|
906 |
|
907 |
def start_webcam_session(
|
908 |
ad_description: str = "",
|
|
|
950 |
"last_saved": 0,
|
951 |
"record_video": record_video,
|
952 |
"recorded_frames": [] if record_video else None,
|
953 |
+
"timestamps": [] if record_video else None,
|
954 |
+
"llava_counter": 0 # Counter to limit LLaVA calls
|
955 |
}
|
956 |
|
957 |
return session
|
|
|
959 |
def update_webcam_session(
|
960 |
session: Dict[str, Any],
|
961 |
frame: np.ndarray
|
962 |
+
) -> Tuple[np.ndarray, Dict[str, float], str, str, Dict[str, Any]]:
|
963 |
"""
|
964 |
Update webcam session with a new frame
|
965 |
|
|
|
968 |
frame: New frame from webcam
|
969 |
|
970 |
Returns:
|
971 |
+
Tuple of (annotated_frame, metrics_dict, enhanced_state, llava_analysis, updated_session)
|
972 |
"""
|
973 |
# Process the frame
|
974 |
+
annotated_frame, metrics, enhanced_state, llava_analysis, updated_df, updated_llava_counter = process_webcam_frame(
|
975 |
frame,
|
976 |
session["ad_context"],
|
977 |
session["metrics_data"],
|
978 |
session["frame_count"],
|
979 |
+
session["start_time"],
|
980 |
+
session["llava_counter"]
|
981 |
)
|
982 |
|
983 |
# Update session
|
984 |
session["frame_count"] += 1
|
985 |
session["metrics_data"] = updated_df
|
986 |
+
session["llava_counter"] = updated_llava_counter
|
987 |
|
988 |
# Record frame if enabled
|
989 |
if session["record_video"] and annotated_frame is not None:
|
|
|
996 |
updated_df.to_csv(session["csv_path"], index=False)
|
997 |
session["last_saved"] = session["frame_count"]
|
998 |
|
999 |
+
return annotated_frame, metrics, enhanced_state, llava_analysis, session
|
1000 |
|
1001 |
def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
|
1002 |
"""
|
|
|
1050 |
return session["csv_path"], video_path
|
1051 |
|
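The three webcam-session helpers are designed to be chained; a minimal sketch of driving them from OpenCV instead of the Gradio UI, assuming a local webcam at index 0:

import cv2

# Hedged sketch: start a session, feed it frames, then finalize it.
session = start_webcam_session(ad_description="30-second sneaker ad")

cap = cv2.VideoCapture(0)
try:
    for _ in range(300):                   # roughly 10 seconds at 30 FPS
        ret, frame = cap.read()
        if not ret:
            break
        annotated, metrics, state, llava_text, session = update_webcam_session(session, frame)
        if annotated is not None:
            cv2.imshow("analysis", annotated)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()
    csv_path, video_path = end_webcam_session(session)
    print("Saved:", csv_path, video_path)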
1052 |
# --- Create Gradio Interface ---
|
|
|
1053 |
def create_api_interface():
|
1054 |
+
with gr.Blocks(title="Facial Analysis APIs") as iface:
|
1055 |
+
gr.Markdown(f"""
|
1056 |
+
# Enhanced Facial Analysis APIs (LLaVA + DeepFace)
|
1057 |
|
1058 |
This interface provides two API endpoints:
|
1059 |
|
1060 |
1. **Video File API**: Upload and analyze pre-recorded videos
|
1061 |
2. **Webcam API**: Analyze live webcam feed in real-time
|
1062 |
|
1063 |
+
Both APIs use DeepFace for emotion analysis and Google's Gemini API for enhanced interpretations.
|
1064 |
+
|
1065 |
+
**LLaVA Vision Model: {'✅ Enabled' if LLAVA_ENABLED else '❌ Disabled'}**
|
1066 |
""")
|
1067 |
|
1068 |
with gr.Tab("Video File API"):
|
|
|
1181 |
with gr.Column():
|
1182 |
enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)
|
1183 |
|
1184 |
+
with gr.Row():
|
1185 |
+
llava_analysis_txt = gr.Textbox(label="LLaVA Vision Analysis", lines=6)
|
1186 |
+
|
1187 |
with gr.Row():
|
1188 |
download_csv = gr.File(label="Download Session Data")
|
1189 |
download_video = gr.Video(label="Recorded Session")
|
|
|
1208 |
|
1209 |
def process_frame(frame, session):
|
1210 |
if session is None:
|
1211 |
+
return frame, None, "No active session. Click 'Start Session' to begin.", "LLaVA analysis not available", session
|
1212 |
|
1213 |
# Process the frame
|
1214 |
+
annotated_frame, metrics, enhanced_state, llava_analysis, updated_session = update_webcam_session(session, frame)
|
1215 |
|
1216 |
# Update the metrics plot if metrics available
|
1217 |
if metrics:
|
1218 |
metrics_plot = update_metrics_visualization(metrics)
|
1219 |
+
return annotated_frame, metrics_plot, enhanced_state, llava_analysis or "LLaVA analysis not available", updated_session
|
1220 |
else:
|
1221 |
# Return the annotated frame (likely with "No face detected")
|
1222 |
+
return annotated_frame, None, enhanced_state or "No metrics available", "LLaVA analysis not available", updated_session
|
1223 |
|
1224 |
def end_session(session):
|
1225 |
if session is None:
|
|
|
1245 |
webcam_input.stream(
|
1246 |
process_frame,
|
1247 |
inputs=[webcam_input, session_data],
|
1248 |
+
outputs=[processed_output, metrics_plot, enhanced_state_txt, llava_analysis_txt, session_data]
|
1249 |
)
|
1250 |
|
1251 |
end_session_btn.click(
|
|
|
1258 |
|
1259 |
# Entry point
|
1260 |
if __name__ == "__main__":
|
1261 |
+
print("Starting Enhanced Facial Analysis API (LLaVA + DeepFace)...")
|
1262 |
print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
|
1263 |
+
print(f"LLaVA Vision Model {'enabled' if LLAVA_ENABLED else 'disabled (using DeepFace only)'}")
|
1264 |
iface = create_api_interface()
|
1265 |
iface.launch(debug=True)
|