ans123 committed
Commit 2b82b08 · verified · 1 Parent(s): fcf70d2

Update app.py

Files changed (1):
  1. app.py +357 -404

app.py CHANGED
@@ -3,7 +3,6 @@ import cv2
3
  import numpy as np
4
  import pandas as pd
5
  import time
6
- import dlib
7
  import matplotlib.pyplot as plt
8
  from matplotlib.colors import LinearSegmentedColormap
9
  from matplotlib.collections import LineCollection
@@ -19,6 +18,9 @@ from deepface import DeepFace
19
  import base64
20
  import io
21
  from pathlib import Path
22
 
23
  # Suppress warnings for cleaner output
24
  warnings.filterwarnings('ignore')
@@ -47,32 +49,59 @@ except Exception as e:
47
  print("Running with simulated Gemini API responses.")
48
  GEMINI_ENABLED = False
49
 
50
- # --- Initialize dlib and DeepFace for facial analysis ---
51
- print("Initializing dlib face detector and shape predictor...")
 
52
  try:
53
- # Initialize dlib's face detector and facial landmark predictor
54
- face_detector = dlib.get_frontal_face_detector()
55
-
56
- # Paths to shape predictor model file
57
- # You need to download this file from:
58
- # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
59
- predictor_path = "shape_predictor_68_face_landmarks.dat"
60
-
61
- # Check if the predictor file exists, otherwise inform the user
62
- if not os.path.exists(predictor_path):
63
- print(f"WARNING: {predictor_path} not found. Please download from:")
64
- print("http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2")
65
- print("Extract and place in the current directory.")
66
- # Use a placeholder or alternative
67
- shape_predictor = None
68
  else:
69
- shape_predictor = dlib.shape_predictor(predictor_path)
70
 
71
- print("dlib face detector initialized successfully.")
 
 
72
  except Exception as e:
73
- print(f"ERROR initializing dlib face detector: {e}")
74
- face_detector = None
75
- shape_predictor = None
76
 
77
  # --- Metrics Definition ---
78
  metrics = [
@@ -94,12 +123,64 @@ emotion_mapping = {
94
  }
95
 
96
  ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
97
- user_state_columns = ["user_state", "enhanced_user_state"]
98
  all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
99
  initial_metrics_df = pd.DataFrame(columns=all_columns)
100
 
101
- # --- Gemini API Functions ---
102
 
 
103
  def call_gemini_api_for_ad(description, detail, ad_type):
104
  """
105
  Uses Google Gemini to analyze ad context.
@@ -131,40 +212,57 @@ def call_gemini_api_for_ad(description, detail, ad_type):
131
  print(f"Error calling Gemini for ad context: {e}")
132
  return f"Error analyzing ad context: {str(e)}"
133
 
134
- def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
135
  """
136
- Uses Google Gemini to interpret facial metrics and determine user state.
 
137
  """
138
- if not metrics_dict:
139
  return "No metrics", "No facial data detected"
140
 
141
  if not GEMINI_ENABLED:
142
  # Basic rule-based simulation for user state
143
- valence = metrics_dict.get('valence', 0.5)
144
- arousal = metrics_dict.get('arousal', 0.5)
145
- cog_load = metrics_dict.get('cognitive_load', 0.5)
146
- stress = metrics_dict.get('stress_index', 0.5)
147
- engagement = metrics_dict.get('engagement_level', 0.5)
148
 
149
  # Simple rule-based simulation
150
- state = "Neutral"
151
- if valence > 0.65 and arousal > 0.55 and engagement > 0.6:
152
  state = "Positive, Engaged"
153
- elif valence < 0.4 and stress > 0.6:
154
  state = "Stressed, Negative"
155
- elif cog_load > 0.7 and engagement < 0.4:
156
- state = "Confused, Disengaged"
157
- elif arousal < 0.4 and engagement < 0.5:
158
- state = "Calm, Passive"
159
 
160
- enhanced_state = f"The viewer appears {state.lower()} while watching this content. They are likely not fully connecting with the message."
161
 
162
  return state, enhanced_state
163
  else:
164
  try:
165
  # Format metrics for Gemini
166
- metrics_formatted = "\n".join([f"- {k.replace('_', ' ').title()}: {v:.2f}" for k, v in metrics_dict.items()
167
- if k not in ('timestamp', 'frame_number')])
168
 
169
  # Include ad context if available
170
  ad_info = ""
@@ -174,8 +272,9 @@ def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
174
  ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"
175
 
176
  prompt = f"""
177
- Analyze these facial metrics (scale 0-1) of a person watching an advertisement{ad_info}:
178
- {metrics_formatted}
 
179
 
180
  Provide two outputs:
181
  1. User State: A short 1-3 word description of their emotional/cognitive state
@@ -206,45 +305,9 @@ def interpret_metrics_with_gemini(metrics_dict, ad_context=None):
206
  print(f"Error calling Gemini for metric interpretation: {e}")
207
  return "Error", f"Error analyzing facial metrics: {str(e)}"
208
 
209
- # --- Facial Analysis Functions with dlib and DeepFace ---
210
-
211
- def extract_face_landmarks_dlib(image):
212
- """Extract facial landmarks using dlib"""
213
- if image is None or face_detector is None or shape_predictor is None:
214
- return None
215
-
216
- try:
217
- # Convert to grayscale for dlib
218
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
219
-
220
- # Detect faces
221
- faces = face_detector(gray, 0)
222
-
223
- if len(faces) == 0:
224
- return None
225
-
226
- # Get the largest face by area
227
- largest_face = faces[0]
228
- largest_area = (faces[0].right() - faces[0].left()) * (faces[0].bottom() - faces[0].top())
229
-
230
- for face in faces:
231
- area = (face.right() - face.left()) * (face.bottom() - face.top())
232
- if area > largest_area:
233
- largest_face = face
234
- largest_area = area
235
-
236
- # Get facial landmarks
237
- landmarks = shape_predictor(gray, largest_face)
238
-
239
- # Return both the face detection rectangle and landmarks
240
- return {"rect": largest_face, "landmarks": landmarks}
241
-
242
- except Exception as e:
243
- print(f"Error in dlib landmark extraction: {e}")
244
- return None
245
-
246
  def analyze_face_with_deepface(image):
247
- """Analyze facial emotions using DeepFace"""
248
  if image is None:
249
  return None
250
 
@@ -267,7 +330,7 @@ def analyze_face_with_deepface(image):
267
  # Analyze with DeepFace
268
  analysis = DeepFace.analyze(
269
  img_path=temp_img,
270
- actions=['emotion'],
271
  enforce_detection=False, # Don't throw error if face not detected
272
  detector_backend='opencv' # Faster detection
273
  )
@@ -288,159 +351,40 @@ def analyze_face_with_deepface(image):
288
  print(f"DeepFace analysis error: {e}")
289
  return None
290
 
291
- def calculate_ear_dlib(landmarks):
292
- """Calculate Eye Aspect Ratio using dlib landmarks"""
293
- if landmarks is None:
294
- return 0.0
295
-
296
- try:
297
- # dlib's 68-point face model landmark indices
298
- # Left eye: 36-41, Right eye: 42-47
299
- LEFT_EYE = range(36, 42)
300
- RIGHT_EYE = range(42, 48)
301
-
302
- def get_eye_aspect_ratio(eye_points):
303
- # Compute the euclidean distances between the two sets of vertical landmarks
304
- v1 = np.linalg.norm(eye_points[1] - eye_points[5])
305
- v2 = np.linalg.norm(eye_points[2] - eye_points[4])
306
- # Compute the euclidean distance between the horizontal landmarks
307
- h = np.linalg.norm(eye_points[0] - eye_points[3])
308
- # Compute the eye aspect ratio
309
- return (v1 + v2) / (2.0 * h) if h > 1e-6 else 0.0
310
-
311
- # Extract landmark coordinates
312
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
313
-
314
- # Calculate EAR for left and right eyes
315
- left_eye_coords = landmark_coords[list(LEFT_EYE)]
316
- right_eye_coords = landmark_coords[list(RIGHT_EYE)]
317
-
318
- left_ear = get_eye_aspect_ratio(left_eye_coords)
319
- right_ear = get_eye_aspect_ratio(right_eye_coords)
320
-
321
- # Return average of both eyes
322
- return (left_ear + right_ear) / 2.0
323
-
324
- except Exception as e:
325
- print(f"Error calculating EAR: {e}")
326
- return 0.0
327
-
328
- def calculate_mar_dlib(landmarks):
329
- """Calculate Mouth Aspect Ratio using dlib landmarks"""
330
- if landmarks is None:
331
- return 0.0
332
-
333
- try:
334
- # dlib's 68-point face model landmark indices for mouth
335
- # Mouth outer: 48-59, Mouth inner: 60-67
336
- MOUTH_OUTER = range(48, 60)
337
- MOUTH_INNER = range(60, 68)
338
-
339
- # Extract landmark coordinates
340
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
341
-
342
- # Use specific points for vertical and horizontal measurements
343
- # Vertical: distance between top and bottom lips
344
- top_lip = landmark_coords[51] # Top lip center
345
- bottom_lip = landmark_coords[57] # Bottom lip center
346
- vertical = np.linalg.norm(top_lip - bottom_lip)
347
-
348
- # Horizontal: distance between mouth corners
349
- left_corner = landmark_coords[48] # Left mouth corner
350
- right_corner = landmark_coords[54] # Right mouth corner
351
- horizontal = np.linalg.norm(left_corner - right_corner)
352
-
353
- # Calculate ratio
354
- return vertical / horizontal if horizontal > 1e-6 else 0.0
355
-
356
- except Exception as e:
357
- print(f"Error calculating MAR: {e}")
358
- return 0.0
359
-
360
- def calculate_eyebrow_position_dlib(landmarks):
361
- """Calculate eyebrow position using dlib landmarks"""
362
- if landmarks is None:
363
- return 0.0
364
-
365
- try:
366
- # dlib's 68-point face model landmark indices
367
- # Left eyebrow: 17-21, Right eyebrow: 22-26
368
- # Left eye: 36-41, Right eye: 42-47
369
- L_BROW_C = 19 # Center of left eyebrow
370
- R_BROW_C = 24 # Center of right eyebrow
371
- L_EYE_C = 37 # Center top of left eye
372
- R_EYE_C = 43 # Center top of right eye
373
-
374
- # Extract landmark coordinates
375
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
376
-
377
- # Calculate distances between eyebrows and eyes
378
- l_brow_y = landmark_coords[L_BROW_C][1]
379
- r_brow_y = landmark_coords[R_BROW_C][1]
380
- l_eye_y = landmark_coords[L_EYE_C][1]
381
- r_eye_y = landmark_coords[R_EYE_C][1]
382
-
383
- # Calculate vertical distances (smaller value means eyebrows are raised)
384
- l_dist = l_eye_y - l_brow_y
385
- r_dist = r_eye_y - r_brow_y
386
-
387
- # Average the distances and normalize
388
- avg_dist = (l_dist + r_dist) / 2.0
389
- # Approximate normalization based on typical face proportions
390
- # Higher value means eyebrows are raised more
391
- norm = (avg_dist - 5) / 15 # Adjusted for typical pixel distances
392
-
393
- return max(0.0, min(1.0, norm))
394
-
395
- except Exception as e:
396
- print(f"Error calculating Eyebrow Position: {e}")
397
- return 0.0
398
-
399
- def estimate_head_pose_dlib(landmarks):
400
- """Estimate head pose using dlib landmarks"""
401
- if landmarks is None:
402
- return 0.0, 0.0
403
 
404
  try:
405
- # dlib's 68-point face model landmark indices
406
- NOSE_TIP = 30 # Nose tip
407
- LEFT_EYE_C = 37 # Left eye center
408
- RIGHT_EYE_C = 44 # Right eye center
409
-
410
- # Extract landmark coordinates
411
- landmark_coords = np.array([[landmarks.part(i).x, landmarks.part(i).y] for i in range(68)])
412
-
413
- # Get key points
414
- nose_pt = landmark_coords[NOSE_TIP]
415
- l_eye_pt = landmark_coords[LEFT_EYE_C]
416
- r_eye_pt = landmark_coords[RIGHT_EYE_C]
417
-
418
- # Calculate eye midpoint
419
- eye_mid_x = (l_eye_pt[0] + r_eye_pt[0]) / 2.0
420
- eye_mid_y = (l_eye_pt[1] + r_eye_pt[1]) / 2.0
421
 
422
- # Calculate tilt
423
- v_tilt = nose_pt[1] - eye_mid_y # Vertical tilt
424
- h_tilt = nose_pt[0] - eye_mid_x # Horizontal tilt
 
 
 
 
425
 
426
- # Normalize based on typical facial proportions
427
- v_tilt_norm = v_tilt / 30.0 # Approximate normalization
428
- h_tilt_norm = h_tilt / 20.0 # Approximate normalization
429
 
430
- # Clip to range [-1, 1]
431
- v_tilt_norm = max(-1.0, min(1.0, v_tilt_norm))
432
- h_tilt_norm = max(-1.0, min(1.0, h_tilt_norm))
433
 
434
- return v_tilt_norm, h_tilt_norm
435
 
436
  except Exception as e:
437
- print(f"Error estimating Head Pose: {e}")
438
- return 0.0, 0.0
439
 
440
- def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None):
 
441
  """
442
- Calculate facial metrics using a combination of dlib landmarks and DeepFace emotions.
443
- This provides a more robust approach by integrating both geometric and deep learning methods.
444
  """
445
  if ad_context is None:
446
  ad_context = {}
@@ -449,52 +393,44 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
449
  default_metrics = {m: 0.5 for m in metrics}
450
 
451
  # If no facial data, return defaults
452
- if not facial_data:
453
  return default_metrics
454
 
455
- # Extract landmarks from facial data
456
- landmarks = facial_data.get("landmarks")
457
-
458
- # If we have DeepFace data, use it to influence our metrics
459
- emotion_weights = None
460
- dominant_emotion = None
461
-
462
- if deepface_data and "emotion" in deepface_data:
463
- emotion_weights = deepface_data["emotion"]
464
- # Find dominant emotion
465
- dominant_emotion = max(emotion_weights.items(), key=lambda x: x[1])[0]
466
-
467
- # Calculate base geometric features if landmarks are available
468
- ear = calculate_ear_dlib(landmarks) if landmarks else 0.2
469
- mar = calculate_mar_dlib(landmarks) if landmarks else 0.5
470
- eb_pos = calculate_eyebrow_position_dlib(landmarks) if landmarks else 0.5
471
- v_tilt, h_tilt = estimate_head_pose_dlib(landmarks) if landmarks else (0.0, 0.0)
472
-
473
- # Combine geometric features with emotion weights
474
-
475
- # Step 1: Start with default metrics
476
- calculated_metrics = default_metrics.copy()
477
-
478
- # Step 2: Update based on geometric features
479
- cl = max(0, min(1, 1.0 - ear * 2.5)) # Cognitive load: Higher when eyes are more closed
480
-
481
- # Step 3: If we have emotion data from DeepFace, incorporate it
482
- if dominant_emotion and emotion_weights:
483
- # Get base values from emotion mapping
484
- base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})
485
-
486
- # Calculate confidence-weighted emotion values
487
- confidence = emotion_weights.get(dominant_emotion, 0) / 100.0 # Convert percentage to 0-1
488
-
489
- # Combine geometric and emotion-based metrics with weighted approach
490
- val = base_vals["valence"] * confidence + (mar * 0.7 * (1.0 - eb_pos) * 0.3) * (1 - confidence)
491
- arsl = base_vals["arousal"] * confidence + ((mar + (1.0 - ear) + eb_pos) / 3.0) * (1 - confidence)
492
- dom = base_vals["dominance"] * confidence + (0.5 + v_tilt) * (1 - confidence)
493
- else:
494
- # Fallback to geometric features only
495
- val = max(0, min(1, mar * 2.0 * (1.0 - eb_pos)))
496
- arsl = max(0, min(1, (mar + (1.0 - ear) + eb_pos) / 3.0))
497
- dom = max(0, min(1, 0.5 + v_tilt))
498
 
499
  # Illustrative Context Adjustments from ad
500
  ad_type = ad_context.get('ad_type', 'Unknown')
@@ -508,18 +444,25 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
508
  val = max(0, min(1, val + val_adj))
509
  arsl = max(0, min(1, arsl + arsl_adj))
510
 
 
 
 
 
 
 
 
511
  # Calculate secondary metrics
512
  neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
513
  em_stab = 1.0 - neur
514
  extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
515
- open = max(0, min(1, 0.5 + ((mar - 0.5) * 0.5)))
516
  agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
517
  consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
518
- stress = max(0, min(1, (cl * 0.5) + (eb_pos * 0.3) + ((1.0 - val) * 0.2)))
519
- engag = max(0, min(1, (arsl * 0.7) + ((1.0 - abs(h_tilt)) * 0.3)))
520
 
521
- # Update the metrics dictionary
522
- calculated_metrics.update({
523
  'valence': val,
524
  'arousal': arsl,
525
  'dominance': dom,
@@ -532,7 +475,7 @@ def calculate_metrics_enhanced(facial_data, deepface_data=None, ad_context=None)
532
  'extraversion': extr,
533
  'stress_index': stress,
534
  'engagement_level': engag
535
- })
536
 
537
  return calculated_metrics
538
 
@@ -609,83 +552,45 @@ def update_metrics_visualization(metrics_values):
609
  plt.tight_layout(pad=0.5)
610
  return fig
611
 
612
- def annotate_frame(frame, facial_data, metrics=None, enhanced_state=None):
613
  """
614
- Add facial landmark annotations and metrics to a frame
615
  """
616
  if frame is None:
617
  return None
618
 
619
  annotated = frame.copy()
620
 
621
- # If we have facial data, draw the landmarks
622
- if facial_data and "landmarks" in facial_data:
623
- landmarks = facial_data["landmarks"]
624
- rect = facial_data.get("rect")
625
-
626
- # Draw face rectangle if available
627
- if rect:
628
- x1, y1, x2, y2 = rect.left(), rect.top(), rect.right(), rect.bottom()
629
- cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
630
-
631
- # Draw the 68 facial landmarks
632
- for i in range(68):
633
- x, y = landmarks.part(i).x, landmarks.part(i).y
634
- cv2.circle(annotated, (x, y), 2, (0, 0, 255), -1)
635
-
636
- # Draw connecting lines for different facial features
637
- # Eyes
638
- for eye_points in [(36, 41), (42, 47)]: # Left eye, Right eye
639
- for i in range(eye_points[0], eye_points[1]):
640
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
641
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
642
- cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
643
- # Connect last point to first
644
- pt1 = (landmarks.part(eye_points[1]).x, landmarks.part(eye_points[1]).y)
645
- pt2 = (landmarks.part(eye_points[0]).x, landmarks.part(eye_points[0]).y)
646
- cv2.line(annotated, pt1, pt2, (0, 255, 255), 1)
647
-
648
- # Eyebrows
649
- for brow_points in [(17, 21), (22, 26)]: # Left eyebrow, Right eyebrow
650
- for i in range(brow_points[0], brow_points[1]):
651
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
652
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
653
- cv2.line(annotated, pt1, pt2, (255, 255, 0), 1)
654
-
655
- # Nose
656
- for i in range(27, 35):
657
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
658
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
659
- cv2.line(annotated, pt1, pt2, (255, 0, 255), 1)
660
-
661
- # Mouth outer
662
- for i in range(48, 59):
663
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
664
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
665
- cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
666
- # Connect last point to first for mouth
667
- pt1 = (landmarks.part(59).x, landmarks.part(59).y)
668
- pt2 = (landmarks.part(48).x, landmarks.part(48).y)
669
- cv2.line(annotated, pt1, pt2, (0, 255, 0), 1)
670
-
671
- # Mouth inner
672
- for i in range(60, 67):
673
- pt1 = (landmarks.part(i).x, landmarks.part(i).y)
674
- pt2 = (landmarks.part(i + 1).x, landmarks.part(i + 1).y)
675
- cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
676
- # Connect last point to first for inner mouth
677
- pt1 = (landmarks.part(67).x, landmarks.part(67).y)
678
- pt2 = (landmarks.part(60).x, landmarks.part(60).y)
679
- cv2.line(annotated, pt1, pt2, (255, 0, 0), 1)
680
-
681
- # Add metrics summary if available
682
- if metrics:
683
  # Format for display
684
  h, w = annotated.shape[:2]
685
  y_pos = 30 # Starting Y position
686
 
687
- # Add user state if available
 
  if enhanced_state:
 
  # Draw background for text
690
  text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
691
  cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
@@ -695,21 +600,21 @@ def annotate_frame(frame, facial_data, metrics=None, enhanced_state=None):
695
  y_pos += 30
696
 
697
  # Show top 3 metrics
698
- top_metrics = sorted([(k, v) for k, v in metrics.items() if k in metrics],
699
- key=lambda x: x[1], reverse=True)[:3]
700
-
701
- for name, value in top_metrics:
702
- metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
703
- text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
704
- cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
705
- cv2.putText(annotated, metric_text, (10, y_pos),
706
- cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
707
- y_pos += 25
 
708
 
709
  return annotated
710
 
711
  # --- API 1: Video File Processing ---
712
-
713
  def process_video_file(
714
  video_file: Union[str, np.ndarray],
715
  ad_description: str = "",
@@ -770,12 +675,23 @@ def process_video_file(
770
  processed_frames = []
771
 
772
  # Process the single frame
773
- facial_data = extract_face_landmarks_dlib(video_file)
774
- deepface_data = analyze_face_with_deepface(video_file)
775
 
776
- if facial_data:
777
- calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
778
- user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)
779
 
780
  # Create a row for the dataframe
781
  row = {
@@ -784,12 +700,13 @@ def process_video_file(
784
  **calculated_metrics,
785
  **ad_context,
786
  'user_state': user_state,
787
- 'enhanced_user_state': enhanced_state
 
788
  }
789
  metrics_data.append(row)
790
 
791
  # Annotate the frame
792
- annotated_frame = annotate_frame(video_file, facial_data, calculated_metrics, enhanced_state)
793
  processed_frames.append(annotated_frame)
794
 
795
  # Save processed image
@@ -825,10 +742,13 @@ def process_video_file(
825
  metrics_data = []
826
  processed_frames = []
827
  frame_count = 0
 
 
828
 
829
  if show_progress:
830
  print(f"Processing video with {total_frames} frames at {fps} FPS")
831
  print(f"Ad Context: {ad_description} ({ad_type})")
 
832
 
833
  while True:
834
  ret, frame = cap.read()
@@ -840,14 +760,25 @@ def process_video_file(
840
  if show_progress and frame_count % (sampling_rate * 10) == 0:
841
  print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")
842
 
843
- # Extract facial landmarks and analyze with DeepFace
844
- facial_data = extract_face_landmarks_dlib(frame)
845
- deepface_data = analyze_face_with_deepface(frame)
846
 
847
- # Calculate metrics if landmarks detected
848
- if facial_data:
849
- calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
850
- user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)
851
 
852
  # Create a row for the dataframe
853
  row = {
@@ -856,12 +787,13 @@ def process_video_file(
856
  **calculated_metrics,
857
  **ad_context,
858
  'user_state': user_state,
859
- 'enhanced_user_state': enhanced_state
 
860
  }
861
  metrics_data.append(row)
862
 
863
  # Annotate the frame
864
- annotated_frame = annotate_frame(frame, facial_data, calculated_metrics, enhanced_state)
865
 
866
  if save_processed_video:
867
  out.write(annotated_frame)
@@ -898,14 +830,14 @@ def process_video_file(
898
  return csv_path, video_path, metrics_df, processed_frames
899
 
900
  # --- API 2: Webcam Processing Function ---
901
-
902
  def process_webcam_frame(
903
  frame: np.ndarray,
904
  ad_context: Dict[str, Any],
905
  metrics_data: pd.DataFrame,
906
  frame_count: int,
907
- start_time: float
908
- ) -> Tuple[np.ndarray, Dict[str, float], str, pd.DataFrame]:
 
909
  """
910
  Process a single webcam frame
911
 
@@ -915,21 +847,35 @@ def process_webcam_frame(
915
  metrics_data: DataFrame to accumulate metrics
916
  frame_count: Current frame count
917
  start_time: Start time of the session
 
918
 
919
  Returns:
920
- Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_metrics_df)
921
  """
922
  if frame is None:
923
- return None, None, None, metrics_data
924
 
925
- # Extract facial landmarks and analyze with DeepFace
926
- facial_data = extract_face_landmarks_dlib(frame)
927
- deepface_data = analyze_face_with_deepface(frame)
 
928
 
929
- # Calculate metrics if landmarks detected
930
- if facial_data:
931
- calculated_metrics = calculate_metrics_enhanced(facial_data, deepface_data, ad_context)
932
- user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, ad_context)
933
 
934
  # Create a row for the dataframe
935
  current_time = time.time()
@@ -939,7 +885,8 @@ def process_webcam_frame(
939
  **calculated_metrics,
940
  **ad_context,
941
  'user_state': user_state,
942
- 'enhanced_user_state': enhanced_state
 
943
  }
944
 
945
  # Add row to DataFrame
@@ -947,15 +894,15 @@ def process_webcam_frame(
947
  metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)
948
 
949
  # Annotate the frame
950
- annotated_frame = annotate_frame(frame, facial_data, calculated_metrics, enhanced_state)
951
 
952
- return annotated_frame, calculated_metrics, enhanced_state, metrics_data
953
  else:
954
  # No face detected
955
  no_face_frame = frame.copy()
956
  cv2.putText(no_face_frame, "No face detected", (30, 30),
957
  cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
958
- return no_face_frame, None, "No face detected", metrics_data
959
 
960
  def start_webcam_session(
961
  ad_description: str = "",
@@ -1003,7 +950,8 @@ def start_webcam_session(
1003
  "last_saved": 0,
1004
  "record_video": record_video,
1005
  "recorded_frames": [] if record_video else None,
1006
- "timestamps": [] if record_video else None
 
1007
  }
1008
 
1009
  return session
@@ -1011,7 +959,7 @@ def start_webcam_session(
1011
  def update_webcam_session(
1012
  session: Dict[str, Any],
1013
  frame: np.ndarray
1014
- ) -> Tuple[np.ndarray, Dict[str, float], str, Dict[str, Any]]:
1015
  """
1016
  Update webcam session with a new frame
1017
 
@@ -1020,20 +968,22 @@ def update_webcam_session(
1020
  frame: New frame from webcam
1021
 
1022
  Returns:
1023
- Tuple of (annotated_frame, metrics_dict, enhanced_state, updated_session)
1024
  """
1025
  # Process the frame
1026
- annotated_frame, metrics, enhanced_state, updated_df = process_webcam_frame(
1027
  frame,
1028
  session["ad_context"],
1029
  session["metrics_data"],
1030
  session["frame_count"],
1031
- session["start_time"]
 
1032
  )
1033
 
1034
  # Update session
1035
  session["frame_count"] += 1
1036
  session["metrics_data"] = updated_df
 
1037
 
1038
  # Record frame if enabled
1039
  if session["record_video"] and annotated_frame is not None:
@@ -1046,7 +996,7 @@ def update_webcam_session(
1046
  updated_df.to_csv(session["csv_path"], index=False)
1047
  session["last_saved"] = session["frame_count"]
1048
 
1049
- return annotated_frame, metrics, enhanced_state, session
1050
 
1051
  def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
1052
  """
@@ -1100,19 +1050,19 @@ def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
1100
  return session["csv_path"], video_path
1101
 
1102
  # --- Create Gradio Interface ---
1103
-
1104
  def create_api_interface():
1105
- with gr.Blocks(title="Enhanced Facial Analysis APIs") as iface:
1106
- gr.Markdown("""
1107
- # Enhanced Facial Analysis APIs
1108
 
1109
  This interface provides two API endpoints:
1110
 
1111
  1. **Video File API**: Upload and analyze pre-recorded videos
1112
  2. **Webcam API**: Analyze live webcam feed in real-time
1113
 
1114
- Both APIs use dlib for facial landmark detection, DeepFace for emotion analysis,
1115
- and Google's Gemini API for enhanced interpretations.
 
1116
  """)
1117
 
1118
  with gr.Tab("Video File API"):
@@ -1231,6 +1181,9 @@ def create_api_interface():
1231
  with gr.Column():
1232
  enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)
1233
 
1234
  with gr.Row():
1235
  download_csv = gr.File(label="Download Session Data")
1236
  download_video = gr.Video(label="Recorded Session")
@@ -1255,18 +1208,18 @@ def create_api_interface():
1255
 
1256
  def process_frame(frame, session):
1257
  if session is None:
1258
- return frame, None, "No active session. Click 'Start Session' to begin.", session
1259
 
1260
  # Process the frame
1261
- annotated_frame, metrics, enhanced_state, updated_session = update_webcam_session(session, frame)
1262
 
1263
  # Update the metrics plot if metrics available
1264
  if metrics:
1265
  metrics_plot = update_metrics_visualization(metrics)
1266
- return annotated_frame, metrics_plot, enhanced_state, updated_session
1267
  else:
1268
  # Return the annotated frame (likely with "No face detected")
1269
- return annotated_frame, None, enhanced_state or "No metrics available", updated_session
1270
 
1271
  def end_session(session):
1272
  if session is None:
@@ -1292,7 +1245,7 @@ def create_api_interface():
1292
  webcam_input.stream(
1293
  process_frame,
1294
  inputs=[webcam_input, session_data],
1295
- outputs=[processed_output, metrics_plot, enhanced_state_txt, session_data]
1296
  )
1297
 
1298
  end_session_btn.click(
@@ -1305,8 +1258,8 @@ def create_api_interface():
1305
 
1306
  # Entry point
1307
  if __name__ == "__main__":
1308
- print("Starting Enhanced Facial Analysis API server...")
1309
  print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
1310
- print(f"Facial analysis using dlib and DeepFace")
1311
  iface = create_api_interface()
1312
  iface.launch(debug=True)
 
3
  import numpy as np
4
  import pandas as pd
5
  import time
 
6
  import matplotlib.pyplot as plt
7
  from matplotlib.colors import LinearSegmentedColormap
8
  from matplotlib.collections import LineCollection
 
18
  import base64
19
  import io
20
  from pathlib import Path
21
+ import torch
22
+ from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
23
+ from io import BytesIO
24
 
25
  # Suppress warnings for cleaner output
26
  warnings.filterwarnings('ignore')
 
49
  print("Running with simulated Gemini API responses.")
50
  GEMINI_ENABLED = False
51
 
52
+ # --- Initialize LLaVA Vision Model ---
53
+ print("Initializing LLaVA Vision Model...")
54
+ LLAVA_ENABLED = False
55
  try:
56
+ # Check if GPU is available
57
+ if torch.cuda.is_available():
58
+ device = "cuda"
59
  else:
60
+ device = "cpu"
61
+
62
+ # Use a smaller LLaVA model for better performance
63
+ model_id = "llava-hf/llava-1.5-7b-hf"
64
+
65
+ # Initialize the model
66
+ processor = AutoProcessor.from_pretrained(model_id)
67
+ llava_model = AutoModelForCausalLM.from_pretrained(
68
+ model_id,
69
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
70
+ low_cpu_mem_usage=True if device == "cuda" else False,
71
+ ).to(device)
72
+
73
+ # Create a pipeline
74
+ vision_llm = pipeline(
75
+ "image-to-text",
76
+ model=llava_model,
77
+ tokenizer=processor.tokenizer,
78
+ image_processor=processor.image_processor,
79
+ device=device,
80
+ max_new_tokens=512,
81
+ )
82
 
83
+ LLAVA_ENABLED = True
84
+ print(f"LLaVA Vision Model initialized successfully on {device.upper()}")
85
+
86
  except Exception as e:
87
+ print(f"WARNING: Failed to initialize LLaVA Vision Model: {e}")
88
+ print("Running with DeepFace only (no LLaVA vision features).")
89
+ vision_llm = None
90
+
91
+ # --- Initialize OpenCV face detector for backup ---
92
+ print("Initializing OpenCV face detector...")
93
+ try:
94
+ # Use OpenCV's built-in face detector as backup
95
+ face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
96
+
97
+ # Check if the face detector loaded successfully
98
+ if face_cascade.empty():
99
+ print("WARNING: Failed to load face cascade classifier")
100
+ else:
101
+ print("OpenCV face detector initialized successfully.")
102
+ except Exception as e:
103
+ print(f"ERROR initializing OpenCV face detector: {e}")
104
+ face_cascade = None
105
 
106
  # --- Metrics Definition ---
107
  metrics = [
 
123
  }
124
 
125
  ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
126
+ user_state_columns = ["user_state", "enhanced_user_state", "llava_analysis"]
127
  all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
128
  initial_metrics_df = pd.DataFrame(columns=all_columns)
129
 
130
+ # --- LLaVA Vision Analysis Function ---
131
+ def analyze_image_with_llava(image, ad_context=None):
132
+ """
133
+ Use LLaVA vision model to analyze facial expression and emotion in image
134
+ """
135
+ if not LLAVA_ENABLED or vision_llm is None or image is None:
136
+ return "LLaVA analysis not available"
137
+
138
+ try:
139
+ # Convert OpenCV image (BGR) to PIL Image (RGB)
140
+ if len(image.shape) == 3 and image.shape[2] == 3:
141
+ # Check if BGR and convert to RGB if needed
142
+ if np.mean(image[:,:,0]) < np.mean(image[:,:,2]): # Rough BGR check
143
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
144
+ else:
145
+ image_rgb = image
146
+ else:
147
+ # Handle grayscale or other formats
148
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
149
+
150
+ # Convert to PIL Image
151
+ pil_image = Image.fromarray(image_rgb)
152
+
153
+ # Create prompt based on ad context
154
+ ad_info = ""
155
+ if ad_context:
156
+ ad_desc = ad_context.get('ad_description', '')
157
+ ad_type = ad_context.get('ad_type', '')
158
+ if ad_desc:
159
+ ad_info = f" while watching an ad about {ad_desc} (type: {ad_type})"
160
+
161
+ prompt = f"""Analyze this person's facial expression and emotion{ad_info}.
162
+ Describe their emotional state, engagement level, and cognitive state in detail.
163
+ Focus on: valence (positive/negative emotion), arousal (excitement level),
164
+ attention, stress indicators, and overall reaction to what they're seeing.
165
+ """
166
+
167
+ # Process with Vision LLM
168
+ outputs = vision_llm(pil_image, prompt=prompt)
169
+
170
+ # Extract the generated text
171
+ if isinstance(outputs, list) and len(outputs) > 0:
172
+ if isinstance(outputs[0], dict) and "generated_text" in outputs[0]:
173
+ return outputs[0]["generated_text"]
174
+ elif isinstance(outputs[0], str):
175
+ return outputs[0]
176
+
177
+ return str(outputs) if outputs else "No results from LLaVA analysis"
178
+
179
+ except Exception as e:
180
+ print(f"Error in LLaVA analysis: {e}")
181
+ return f"LLaVA analysis error: {str(e)}"
182
 
183
+ # --- Gemini API Functions ---
184
  def call_gemini_api_for_ad(description, detail, ad_type):
185
  """
186
  Uses Google Gemini to analyze ad context.
 
212
  print(f"Error calling Gemini for ad context: {e}")
213
  return f"Error analyzing ad context: {str(e)}"
214
 
215
+ def interpret_metrics_with_gemini(metrics_dict, deepface_results=None, llava_analysis=None, ad_context=None):
216
  """
217
+ Uses Google Gemini to interpret facial metrics, DeepFace results and LLaVA analysis
218
+ to determine user state.
219
  """
220
+ if not metrics_dict and not deepface_results and not llava_analysis:
221
  return "No metrics", "No facial data detected"
222
 
223
  if not GEMINI_ENABLED:
224
  # Basic rule-based simulation for user state
225
+ valence = metrics_dict.get('valence', 0.5) if metrics_dict else 0.5
226
+ arousal = metrics_dict.get('arousal', 0.5) if metrics_dict else 0.5
227
+
228
+ # Extract emotion from DeepFace if available
229
+ dominant_emotion = "neutral"
230
+ if deepface_results and "emotion" in deepface_results:
231
+ emotion_dict = deepface_results["emotion"]
232
+ dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
233
 
234
  # Simple rule-based simulation
235
+ state = dominant_emotion.capitalize() if dominant_emotion != "neutral" else "Neutral"
236
+ if valence > 0.65 and arousal > 0.55:
237
  state = "Positive, Engaged"
238
+ elif valence < 0.4 and arousal > 0.6:
239
  state = "Stressed, Negative"
 
 
 
 
240
 
241
+ enhanced_state = f"The viewer appears {state.lower()} while watching this content."
242
+ if llava_analysis and llava_analysis != "LLaVA analysis not available":
243
+ # Extract a brief summary from LLaVA analysis (first sentence)
244
+ first_sentence = llava_analysis.split('.')[0] + '.'
245
+ enhanced_state += f" {first_sentence}"
246
 
247
  return state, enhanced_state
248
  else:
249
  try:
250
  # Format metrics for Gemini
251
+ metrics_formatted = ""
252
+ if metrics_dict:
253
+ metrics_formatted = "\nMetrics (0-1 scale):\n" + "\n".join([f"- {k.replace('_', ' ').title()}: {v:.2f}" for k, v in metrics_dict.items()
254
+ if k not in ('timestamp', 'frame_number')])
255
+
256
+ # Format DeepFace results
257
+ deepface_formatted = ""
258
+ if deepface_results and "emotion" in deepface_results:
259
+ emotion_dict = deepface_results["emotion"]
260
+ deepface_formatted = "\nDeepFace emotions:\n" + "\n".join([f"- {k.title()}: {v:.2f}" for k, v in emotion_dict.items()])
261
+
262
+ # Format LLaVA analysis
263
+ llava_formatted = ""
264
+ if llava_analysis and llava_analysis != "LLaVA analysis not available":
265
+ llava_formatted = f"\nLLaVA Vision Analysis:\n{llava_analysis}"
266
 
267
  # Include ad context if available
268
  ad_info = ""
 
272
  ad_info = f"\nThey are watching an advertisement: {ad_desc} (Type: {ad_type})"
273
 
274
  prompt = f"""
275
+ Analyze the facial expression and emotion of a person watching an advertisement{ad_info}.
276
+
277
+ Use these combined inputs:{metrics_formatted}{deepface_formatted}{llava_formatted}
278
 
279
  Provide two outputs:
280
  1. User State: A short 1-3 word description of their emotional/cognitive state
 
305
  print(f"Error calling Gemini for metric interpretation: {e}")
306
  return "Error", f"Error analyzing facial metrics: {str(e)}"
307
 
308
+ # --- DeepFace Analysis Function ---
309
  def analyze_face_with_deepface(image):
310
+ """Analyze facial emotions and attributes using DeepFace"""
311
  if image is None:
312
  return None
313
 
 
330
  # Analyze with DeepFace
331
  analysis = DeepFace.analyze(
332
  img_path=temp_img,
333
+ actions=['emotion', 'age', 'gender', 'race'],
334
  enforce_detection=False, # Don't throw error if face not detected
335
  detector_backend='opencv' # Faster detection
336
  )
 
351
  print(f"DeepFace analysis error: {e}")
352
  return None
353
 
354
+ # --- Face Detection Backup with OpenCV ---
355
+ def detect_face_opencv(image):
356
+ """Detect faces using OpenCV cascade classifier as backup"""
357
+ if image is None or face_cascade is None:
358
+ return None
359
 
360
  try:
361
+ # Convert to grayscale for detection
362
+ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
363
 
364
+ # Detect faces
365
+ faces = face_cascade.detectMultiScale(
366
+ gray,
367
+ scaleFactor=1.1,
368
+ minNeighbors=5,
369
+ minSize=(30, 30)
370
+ )
371
 
372
+ if len(faces) == 0:
373
+ return None
 
374
 
375
+ # Get the largest face by area
376
+ largest_face = max(faces, key=lambda rect: rect[2] * rect[3])
 
377
 
378
+ return {"rect": largest_face}
379
 
380
  except Exception as e:
381
+ print(f"Error in OpenCV face detection: {e}")
382
+ return None
383
 
384
+ # --- Calculate Metrics from DeepFace Results ---
385
+ def calculate_metrics_from_deepface(deepface_results, ad_context=None):
386
  """
387
+ Calculate psychometric metrics from DeepFace analysis results
 
388
  """
389
  if ad_context is None:
390
  ad_context = {}
 
393
  default_metrics = {m: 0.5 for m in metrics}
394
 
395
  # If no facial data, return defaults
396
+ if not deepface_results or "emotion" not in deepface_results:
397
  return default_metrics
398
 
399
+ # Extract emotion data from DeepFace
400
+ emotion_dict = deepface_results["emotion"]
401
+ # Find dominant emotion
402
+ dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
403
+ dominant_score = max(emotion_dict.items(), key=lambda x: x[1])[1] / 100.0 # Convert to 0-1 scale
404
+
405
+ # Get base values from emotion mapping
406
+ base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})
407
+
408
+ # Calculate primary metrics with confidence weighting
409
+ val = base_vals["valence"]
410
+ arsl = base_vals["arousal"]
411
+ dom = base_vals["dominance"]
412
+
413
+ # Add directional adjustments based on specific emotions
414
+ if dominant_emotion == "happy":
415
+ val += 0.1
416
+ elif dominant_emotion == "sad":
417
+ val -= 0.1
418
+ elif dominant_emotion == "angry":
419
+ arsl += 0.1
420
+ dom += 0.1
421
+ elif dominant_emotion == "fear":
422
+ arsl += 0.1
423
+ dom -= 0.1
424
+
425
+ # Adjust for gender and age if available (just examples of potential factors)
426
+ if "gender" in deepface_results:
427
+ gender = deepface_results["gender"]
428
+ gender_score = deepface_results.get("gender_score", 0.5)
429
+ # No real adjustment needed, this is just an example
430
+
431
+ if "age" in deepface_results:
432
+ age = deepface_results["age"]
433
+ # No real adjustment needed, this is just an example
434
 
435
  # Illustrative Context Adjustments from ad
436
  ad_type = ad_context.get('ad_type', 'Unknown')
 
444
  val = max(0, min(1, val + val_adj))
445
  arsl = max(0, min(1, arsl + arsl_adj))
446
 
447
+ # Estimate cognitive load based on emotional intensity
448
+ cl = 0.5 # Default
449
+ if dominant_emotion in ["neutral"]:
450
+ cl = 0.3 # Lower cognitive load for neutral expression
451
+ elif dominant_emotion in ["surprise", "fear"]:
452
+ cl = 0.7 # Higher cognitive load for surprise/fear
453
+
454
  # Calculate secondary metrics
455
  neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
456
  em_stab = 1.0 - neur
457
  extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
458
+ open = max(0, min(1, 0.5 + (val - 0.5) * 0.5))
459
  agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
460
  consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
461
+ stress = max(0, min(1, (cl * 0.5) + ((1.0 - val) * 0.5)))
462
+ engag = max(0, min(1, arsl * 0.7 + (val * 0.3)))
463
 
464
+ # Create metrics dictionary
465
+ calculated_metrics = {
466
  'valence': val,
467
  'arousal': arsl,
468
  'dominance': dom,
 
475
  'extraversion': extr,
476
  'stress_index': stress,
477
  'engagement_level': engag
478
+ }
479
 
480
  return calculated_metrics
481
 
 
552
  plt.tight_layout(pad=0.5)
553
  return fig
554
 
555
+ def annotate_frame(frame, face_data=None, deepface_results=None, metrics=None, enhanced_state=None):
556
  """
557
+ Add facial annotations and metrics to a frame
558
  """
559
  if frame is None:
560
  return None
561
 
562
  annotated = frame.copy()
563
 
564
+ # Draw face rectangle if available
565
+ if face_data and "rect" in face_data:
566
+ x, y, w, h = face_data["rect"]
567
+ cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
568
+ elif deepface_results and "region" in deepface_results:
569
+ region = deepface_results["region"]
570
+ x, y, w, h = region["x"], region["y"], region["w"], region["h"]
571
+ cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
572
+
573
+ # Add emotion and metrics summary
574
+ if deepface_results or metrics:
575
  # Format for display
576
  h, w = annotated.shape[:2]
577
  y_pos = 30 # Starting Y position
578
 
579
+ # Add emotion info if available from DeepFace
580
+ if deepface_results and "dominant_emotion" in deepface_results:
581
+ emotion_text = f"Emotion: {deepface_results['dominant_emotion'].capitalize()}"
582
+ text_size = cv2.getTextSize(emotion_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
583
+ cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
584
+ cv2.putText(annotated, emotion_text, (10, y_pos),
585
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
586
+ y_pos += 30
587
+
588
+ # Add enhanced user state if available
589
  if enhanced_state:
590
+ # Truncate if too long
591
+ if len(enhanced_state) > 60:
592
+ enhanced_state = enhanced_state[:57] + "..."
593
+
594
  # Draw background for text
595
  text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
596
  cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
 
600
  y_pos += 30
601
 
602
  # Show top 3 metrics
603
+ if metrics:
604
+ top_metrics = sorted([(k, v) for k, v in metrics.items() if k in metrics],
605
+ key=lambda x: x[1], reverse=True)[:3]
606
+
607
+ for name, value in top_metrics:
608
+ metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
609
+ text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
610
+ cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
611
+ cv2.putText(annotated, metric_text, (10, y_pos),
612
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
613
+ y_pos += 25
614
 
615
  return annotated
616
 
617
  # --- API 1: Video File Processing ---
 
618
  def process_video_file(
619
  video_file: Union[str, np.ndarray],
620
  ad_description: str = "",
 
675
  processed_frames = []
676
 
677
  # Process the single frame
678
+ deepface_results = analyze_face_with_deepface(video_file)
679
+ face_data = None
680
 
681
+ # Fall back to OpenCV face detection if DeepFace didn't detect a face
682
+ if not deepface_results or "region" not in deepface_results:
683
+ face_data = detect_face_opencv(video_file)
684
+
685
+ # Use LLaVA for additional analysis (once per frame)
686
+ llava_analysis = "LLaVA analysis not available"
687
+ if face_data is not None or (deepface_results and "region" in deepface_results):
688
+ # Only use LLaVA if a face was detected
689
+ llava_analysis = analyze_image_with_llava(video_file, ad_context)
690
+
691
+ # Calculate metrics if face detected
692
+ if deepface_results or face_data:
693
+ calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
694
+ user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
695
 
696
  # Create a row for the dataframe
697
  row = {
 
700
  **calculated_metrics,
701
  **ad_context,
702
  'user_state': user_state,
703
+ 'enhanced_user_state': enhanced_state,
704
+ 'llava_analysis': llava_analysis
705
  }
706
  metrics_data.append(row)
707
 
708
  # Annotate the frame
709
+ annotated_frame = annotate_frame(video_file, face_data, deepface_results, calculated_metrics, enhanced_state)
710
  processed_frames.append(annotated_frame)
711
 
712
  # Save processed image
 
742
  metrics_data = []
743
  processed_frames = []
744
  frame_count = 0
745
+ llava_counter = 0 # To limit LLaVA analysis (it's slow)
746
+ llava_interval = sampling_rate * 10 # Run LLaVA every X frames
747
 
748
  if show_progress:
749
  print(f"Processing video with {total_frames} frames at {fps} FPS")
750
  print(f"Ad Context: {ad_description} ({ad_type})")
751
+ print(f"LLaVA Vision Model: {'Enabled' if LLAVA_ENABLED else 'Disabled'}")
752
 
753
  while True:
754
  ret, frame = cap.read()
 
760
  if show_progress and frame_count % (sampling_rate * 10) == 0:
761
  print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")
762
 
763
+ # Analyze with DeepFace
764
+ deepface_results = analyze_face_with_deepface(frame)
765
+ face_data = None
766
+
767
+ # Fall back to OpenCV face detection if DeepFace didn't detect a face
768
+ if not deepface_results or "region" not in deepface_results:
769
+ face_data = detect_face_opencv(frame)
770
+
771
+ # Use LLaVA for additional analysis (periodically to save time)
772
+ llava_analysis = "LLaVA analysis not available"
773
+ if (face_data is not None or (deepface_results and "region" in deepface_results)) and llava_counter % llava_interval == 0:
774
+ # Only use LLaVA if a face was detected and on the right interval
775
+ llava_analysis = analyze_image_with_llava(frame, ad_context)
776
+ llava_counter += 1
777
 
778
+ # Calculate metrics if face detected
779
+ if deepface_results or face_data:
780
+ calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
781
+ user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
782
 
783
  # Create a row for the dataframe
784
  row = {
 
787
  **calculated_metrics,
788
  **ad_context,
789
  'user_state': user_state,
790
+ 'enhanced_user_state': enhanced_state,
791
+ 'llava_analysis': llava_analysis
792
  }
793
  metrics_data.append(row)
794
 
795
  # Annotate the frame
796
+ annotated_frame = annotate_frame(frame, face_data, deepface_results, calculated_metrics, enhanced_state)
797
 
798
  if save_processed_video:
799
  out.write(annotated_frame)
 
830
  return csv_path, video_path, metrics_df, processed_frames
831
 
832
  # --- API 2: Webcam Processing Function ---
 
833
  def process_webcam_frame(
834
  frame: np.ndarray,
835
  ad_context: Dict[str, Any],
836
  metrics_data: pd.DataFrame,
837
  frame_count: int,
838
+ start_time: float,
839
+ llava_counter: int
840
+ ) -> Tuple[np.ndarray, Dict[str, float], str, str, pd.DataFrame, int]:
841
  """
842
  Process a single webcam frame
843
 
 
847
  metrics_data: DataFrame to accumulate metrics
848
  frame_count: Current frame count
849
  start_time: Start time of the session
850
+ llava_counter: Counter to limit LLaVA calls
851
 
852
  Returns:
853
+ Tuple of (annotated_frame, metrics_dict, enhanced_state, llava_analysis, updated_metrics_df, updated_llava_counter)
854
  """
855
  if frame is None:
856
+ return None, None, None, None, metrics_data, llava_counter
857
+
858
+ # Analyze with DeepFace
859
+ deepface_results = analyze_face_with_deepface(frame)
860
+ face_data = None
861
+
862
+ # Fall back to OpenCV face detection if DeepFace didn't detect a face
863
+ if not deepface_results or "region" not in deepface_results:
864
+ face_data = detect_face_opencv(frame)
865
+
866
+ # Use LLaVA for periodic analysis (it's slow)
867
+ llava_analysis = "LLaVA analysis not available"
868
+ llava_interval = 30 # Run LLaVA every X frames
869
 
870
+ if (face_data is not None or (deepface_results and "region" in deepface_results)) and llava_counter % llava_interval == 0:
871
+ # Only use LLaVA if a face was detected and on the right interval
872
+ llava_analysis = analyze_image_with_llava(frame, ad_context)
873
+ llava_counter += 1
874
 
875
+ # Calculate metrics if face detected
876
+ if deepface_results or face_data:
877
+ calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
878
+ user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, llava_analysis, ad_context)
879
 
880
  # Create a row for the dataframe
881
  current_time = time.time()
 
885
  **calculated_metrics,
886
  **ad_context,
887
  'user_state': user_state,
888
+ 'enhanced_user_state': enhanced_state,
889
+ 'llava_analysis': llava_analysis
890
  }
891
 
892
  # Add row to DataFrame
 
894
  metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)
895
 
896
  # Annotate the frame
897
+ annotated_frame = annotate_frame(frame, face_data, deepface_results, calculated_metrics, enhanced_state)
898
 
899
+ return annotated_frame, calculated_metrics, enhanced_state, llava_analysis, metrics_data, llava_counter
900
  else:
901
  # No face detected
902
  no_face_frame = frame.copy()
903
  cv2.putText(no_face_frame, "No face detected", (30, 30),
904
  cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
905
+ return no_face_frame, None, "No face detected", None, metrics_data, llava_counter
906
 
907
  def start_webcam_session(
908
  ad_description: str = "",
 
950
  "last_saved": 0,
951
  "record_video": record_video,
952
  "recorded_frames": [] if record_video else None,
953
+ "timestamps": [] if record_video else None,
954
+ "llava_counter": 0 # Counter to limit LLaVA calls
955
  }
956
 
957
  return session
 
959
  def update_webcam_session(
960
  session: Dict[str, Any],
961
  frame: np.ndarray
962
+ ) -> Tuple[np.ndarray, Dict[str, float], str, str, Dict[str, Any]]:
963
  """
964
  Update webcam session with a new frame
965
 
 
968
  frame: New frame from webcam
969
 
970
  Returns:
971
+ Tuple of (annotated_frame, metrics_dict, enhanced_state, llava_analysis, updated_session)
972
  """
973
  # Process the frame
974
+ annotated_frame, metrics, enhanced_state, llava_analysis, updated_df, updated_llava_counter = process_webcam_frame(
975
  frame,
976
  session["ad_context"],
977
  session["metrics_data"],
978
  session["frame_count"],
979
+ session["start_time"],
980
+ session["llava_counter"]
981
  )
982
 
983
  # Update session
984
  session["frame_count"] += 1
985
  session["metrics_data"] = updated_df
986
+ session["llava_counter"] = updated_llava_counter
987
 
988
  # Record frame if enabled
989
  if session["record_video"] and annotated_frame is not None:
 
996
  updated_df.to_csv(session["csv_path"], index=False)
997
  session["last_saved"] = session["frame_count"]
998
 
999
+ return annotated_frame, metrics, enhanced_state, llava_analysis, session
1000
 
1001
  def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
1002
  """
 
1050
  return session["csv_path"], video_path
1051
 
1052
  # --- Create Gradio Interface ---
 
1053
  def create_api_interface():
1054
+ with gr.Blocks(title="Facial Analysis APIs") as iface:
1055
+ gr.Markdown(f"""
1056
+ # Enhanced Facial Analysis APIs (LLaVA + DeepFace)
1057
 
1058
  This interface provides two API endpoints:
1059
 
1060
  1. **Video File API**: Upload and analyze pre-recorded videos
1061
  2. **Webcam API**: Analyze live webcam feed in real-time
1062
 
1063
+ Both APIs use DeepFace for emotion analysis and Google's Gemini API for enhanced interpretations.
1064
+
1065
+ **LLaVA Vision Model: {'✅ Enabled' if LLAVA_ENABLED else '❌ Disabled'}**
1066
  """)
1067
 
1068
  with gr.Tab("Video File API"):
 
1181
  with gr.Column():
1182
  enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)
1183
 
1184
+ with gr.Row():
1185
+ llava_analysis_txt = gr.Textbox(label="LLaVA Vision Analysis", lines=6)
1186
+
1187
  with gr.Row():
1188
  download_csv = gr.File(label="Download Session Data")
1189
  download_video = gr.Video(label="Recorded Session")
 
1208
 
1209
  def process_frame(frame, session):
1210
  if session is None:
1211
+ return frame, None, "No active session. Click 'Start Session' to begin.", "LLaVA analysis not available", session
1212
 
1213
  # Process the frame
1214
+ annotated_frame, metrics, enhanced_state, llava_analysis, updated_session = update_webcam_session(session, frame)
1215
 
1216
  # Update the metrics plot if metrics available
1217
  if metrics:
1218
  metrics_plot = update_metrics_visualization(metrics)
1219
+ return annotated_frame, metrics_plot, enhanced_state, llava_analysis or "LLaVA analysis not available", updated_session
1220
  else:
1221
  # Return the annotated frame (likely with "No face detected")
1222
+ return annotated_frame, None, enhanced_state or "No metrics available", "LLaVA analysis not available", updated_session
1223
 
1224
  def end_session(session):
1225
  if session is None:
 
1245
  webcam_input.stream(
1246
  process_frame,
1247
  inputs=[webcam_input, session_data],
1248
+ outputs=[processed_output, metrics_plot, enhanced_state_txt, llava_analysis_txt, session_data]
1249
  )
1250
 
1251
  end_session_btn.click(
 
1258
 
1259
  # Entry point
1260
  if __name__ == "__main__":
1261
+ print("Starting Enhanced Facial Analysis API (LLaVA + DeepFace)...")
1262
  print(f"Gemini API {'enabled' if GEMINI_ENABLED else 'disabled (using simulation)'}")
1263
+ print(f"LLaVA Vision Model {'enabled' if LLAVA_ENABLED else 'disabled (using DeepFace only)'}")
1264
  iface = create_api_interface()
1265
  iface.launch(debug=True)
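
For reference, a minimal usage sketch of the video-file API defined in this version of app.py. It is not part of the commit; the file path and ad description below are placeholders, and every parameter not shown in the diff is left at its default.

# Hypothetical call into app.py's video-file API (sketch, not from the commit).
from app import process_video_file

csv_path, video_path, metrics_df, frames = process_video_file(
    "viewer_recording.mp4",                       # placeholder path to a recorded viewer video
    ad_description="Sparkling water summer spot", # placeholder ad context
)

# metrics_df follows the column layout declared near the top of app.py:
# ['timestamp', 'frame_number'] + metrics + ad_context_columns + user_state_columns
print(metrics_df[["timestamp", "valence", "arousal", "user_state"]].head())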