Update app.py
app.py
CHANGED
Old version (removed lines and surrounding context):

@@ -3,6 +3,7 @@ import cv2
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.collections import LineCollection
@@ -10,57 +11,80 @@ import os
import datetime
import tempfile
from typing import Dict, List, Tuple, Optional, Union, Any
import google.generativeai as genai
- from
- import json
- import warnings
- from deepface import DeepFace
- import base64
- import io
- from pathlib import Path
- import traceback
- # Suppress warnings for cleaner output
- warnings.filterwarnings('ignore')

# --- Constants ---
- VIDEO_FPS =
CSV_FILENAME_TEMPLATE = "facial_analysis_{timestamp}.csv"
VIDEO_FILENAME_TEMPLATE = "processed_{timestamp}.mp4"
- TEMP_DIR.mkdir(exist_ok=True)

- # ---
- if not GOOGLE_API_KEY:
-     raise ValueError("GOOGLE_API_KEY environment variable not set.")
-     genai.configure(api_key=GOOGLE_API_KEY)
-     # Use gemini-1.5-flash for quick responses
-     model = genai.GenerativeModel('gemini-1.5-flash')
-     GEMINI_ENABLED = True
-     print("Google Gemini API configured successfully.")
- except Exception as e:
-     print(f"WARNING: Failed to configure Google Gemini API: {e}")
-     print("Running with simulated Gemini API responses.")
-     GEMINI_ENABLED = False

# --- Metrics Definition ---
metrics = [
@@ -69,336 +93,408 @@ metrics = [
    "neuroticism", "conscientiousness", "extraversion",
    "stress_index", "engagement_level"
]
-     "disgust": {"valence": 0.2, "arousal": 0.6, "dominance": 0.5},
-     "fear": {"valence": 0.2, "arousal": 0.8, "dominance": 0.3},
-     "happy": {"valence": 0.9, "arousal": 0.7, "dominance": 0.6},
-     "sad": {"valence": 0.3, "arousal": 0.4, "dominance": 0.3},
-     "surprise": {"valence": 0.6, "arousal": 0.9, "dominance": 0.5},
-     "neutral": {"valence": 0.5, "arousal": 0.5, "dominance": 0.5}
- }
ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
- all_columns = ['timestamp', 'frame_number'] + metrics + ad_context_columns +
initial_metrics_df = pd.DataFrame(columns=all_columns)

- # ---
-     Uses Google Gemini to analyze ad context.
-     """
-     print(f"Analyzing ad context: '{description}' ({ad_type})")
-     if not GEMINI_ENABLED:
-         # Simulated response
-         analysis = f"Simulated analysis: Ad='{description or 'N/A'}' ({ad_type}), Focus='{detail or 'N/A'}'."
-         if not description and not detail:
-             analysis = "No ad context provided."
-         print(f"Simulated Gemini Result: {analysis}")
-         return analysis
-     else:
-         try:
-             prompt = f"""
-             Please analyze this advertisement context:
-             - Description: {description}
-             - Detail focus: {detail}
-             - Type/Genre: {ad_type}
-
-             Provide a concise analysis of what emotional and cognitive responses might be expected from viewers.
-             Limit your response to 100 words.
-             """
-             response = model.generate_content(prompt)
-             return response.text
-         except Exception as e:
-             print(f"Error calling Gemini for ad context: {e}")
-             return f"Error analyzing ad context: {str(e)}"

    """
    """
-     if not metrics_dict and not deepface_results:
-         return "No metrics", "No facial data detected"
-     arousal = metrics_dict.get('arousal', 0.5) if metrics_dict else 0.5
-     # Extract emotion from DeepFace if available
-     dominant_emotion = "neutral"
-     if deepface_results and "emotion" in deepface_results:
-         emotion_dict = deepface_results["emotion"]
-         dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
-     #
-     if
-         prompt = f"""
-         Analyze the facial expression and emotion of a person watching an advertisement{ad_info}.
-
-         Use these combined inputs:{metrics_formatted}{deepface_formatted}
-
-         Provide two outputs:
-         1. User State: A short 1-3 word description of their emotional/cognitive state
-         2. Enhanced Analysis: A detailed 1-2 sentence interpretation of their reaction to the content
-
-         Format as JSON: {{"user_state": "STATE", "enhanced_user_state": "DETAILED ANALYSIS"}}
-         """
-         response = model.generate_content(prompt)
-         try:
-             # Try to parse as JSON
-             result = json.loads(response.text)
-             return result.get("user_state", "Uncertain"), result.get("enhanced_user_state", "Analysis unavailable")
-         except json.JSONDecodeError:
-             # If not valid JSON, try to extract manually
-             text = response.text
-             if "user_state" in text and "enhanced_user_state" in text:
-                 parts = text.split("enhanced_user_state")
-                 user_state = parts[0].split("user_state")[1].replace('"', '').replace(':', '').replace(',', '').strip()
-                 enhanced = parts[1].replace('"', '').replace(':', '').replace('}', '').strip()
-                 return user_state, enhanced
-             else:
-                 # Just return the raw text as enhanced state
-                 return "Analyzed", text
-     except Exception as e:
-         print(f"Error calling Gemini for metric interpretation: {e}")
-         traceback.print_exc()
-         return "Error", f"Error analyzing facial metrics: {str(e)}"

- # ---
- def
-     """
-     if
        return None

    try:
-         #
-         # Check if BGR and convert to RGB if needed
-         if np.mean(image[:,:,0]) < np.mean(image[:,:,2]):  # Rough BGR check
-             image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-         else:
-             image_rgb = image
-     else:
-         # Handle grayscale or other formats
-         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-         cv2.imwrite(temp_img, image_rgb)
-         #
-             img_path=temp_img,
-             actions=['emotion'],
-             enforce_detection=False,  # Don't throw error if face not detected
-             detector_backend='opencv'  # Faster detection
-         )
    except Exception as e:
-         print(f"
        return None

- # ---
- def
-     if image is None or face_cascade is None:
        return None
    try:
-             gray,
-             scaleFactor=1.1,
-             minNeighbors=5,
-             minSize=(30, 30)
-         )
-         return
-         return None

    if ad_context is None:
        ad_context = {}
-     dominant_emotion = max(emotion_dict.items(), key=lambda x: x[1])[0]
-     dominant_score = max(emotion_dict.items(), key=lambda x: x[1])[1] / 100.0  # Convert to 0-1 scale
-     # Get base values from emotion mapping
-     base_vals = emotion_mapping.get(dominant_emotion, {"valence": 0.5, "arousal": 0.5, "dominance": 0.5})
-     # Calculate primary metrics with confidence weighting
-     val = base_vals["valence"]
-     arsl = base_vals["arousal"]
-     dom = base_vals["dominance"]
-     # Add directional adjustments based on specific emotions
-     if dominant_emotion == "happy":
-         val += 0.1
-     elif dominant_emotion == "sad":
-         val -= 0.1
-     elif dominant_emotion == "angry":
-         arsl += 0.1
-         dom += 0.1
-     elif dominant_emotion == "fear":
-         arsl += 0.1
-         dom -= 0.1
-     # Illustrative Context Adjustments from ad
-     ad_type = ad_context.get('ad_type', 'Unknown')
    gem_txt = str(ad_context.get('gemini_ad_analysis', '')).lower()
-     val = max(0, min(1,
-     arsl = max(0, min(1,
-     # Estimate cognitive load based on emotional intensity
-     cl = 0.5  # Default
-     if dominant_emotion in ["neutral"]:
-         cl = 0.3  # Lower cognitive load for neutral expression
-     elif dominant_emotion in ["surprise", "fear"]:
-         cl = 0.7  # Higher cognitive load for surprise/fear
-     # Calculate secondary metrics
    neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
    em_stab = 1.0 - neur
    extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
-     open = max(0, min(1, 0.5 + (
    agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
    consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
-     stress = max(0, min(1, (cl * 0.5) + ((1.0 - val) * 0.
-     engag = max(0, min(1, arsl * 0.7 + (
-     #
-         'valence': val,
-         '
-         '
-         '
-         'emotional_stability': em_stab,
-         'openness': open,
-         'agreeableness': agree,
-         'neuroticism': neur,
-         'conscientiousness': consc,
-         'extraversion': extr,
-         'stress_index': stress,
-         'engagement_level': engag
    }
-     return calculated_metrics

- def update_metrics_visualization(metrics_values):
-     """Create a visualization of metrics"""
    if not metrics_values:
        fig, ax = plt.subplots(figsize=(10, 8))
-         ax.text(0.5, 0.5, "Waiting
        ax.axis('off')
        fig.patch.set_facecolor('#FFFFFF')
        ax.set_facecolor('#FFFFFF')
        return fig

-     #
-     if
-         return fig
-     num_metrics = len(
    nrows = (num_metrics + 2) // 3
    fig, axs = plt.subplots(nrows, 3, figsize=(10, nrows * 2.5), facecolor='#FFFFFF')
    axs = axs.flatten()
    colors = [(0.1, 0.1, 0.9), (0.9, 0.9, 0.1), (0.9, 0.1, 0.1)]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=100)
    norm = plt.Normalize(0, 1)
    metric_idx = 0
-     for key, value in
        ax = axs[metric_idx]
        ax.set_title(key.replace('_', ' ').title(), fontsize=10)
@@ -440,69 +536,97 @@ def update_metrics_visualization(metrics_values):
    plt.tight_layout(pad=0.5)
    return fig

- def
-     """
    if frame is None:
        return None

    annotated = frame.copy()
-     text_size = cv2.getTextSize(emotion_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
-     cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
-     cv2.putText(annotated, emotion_text, (10, y_pos),
-                 cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
-     y_pos += 30
-     # Add enhanced user state if available
-     if enhanced_state:
-         # Truncate if too long
-         if len(enhanced_state) > 60:
-             enhanced_state = enhanced_state[:57] + "..."
-         # Draw background for text
-         text_size = cv2.getTextSize(enhanced_state, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
-         cv2.rectangle(annotated, (10, y_pos - 20), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
-         # Draw text
-         cv2.putText(annotated, enhanced_state, (10, y_pos),
-                     cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
-         y_pos += 30
-     # Show top 3 metrics
-     if metrics:
-         top_metrics = sorted([(k, v) for k, v in metrics.items() if k in metrics],
-                              key=lambda x: x[1], reverse=True)[:3]
-         for name, value in top_metrics:
-             metric_text = f"{name.replace('_', ' ').title()}: {value:.2f}"
-             text_size = cv2.getTextSize(metric_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
-             cv2.rectangle(annotated, (10, y_pos - 15), (10 + text_size[0], y_pos + 5), (0, 0, 0), -1)
-             cv2.putText(annotated, metric_text, (10, y_pos),
-                         cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
-             y_pos += 25

    return annotated

- # ---
def process_video_file(
    video_file: Union[str, np.ndarray],
    ad_description: str = "",
@@ -510,8 +634,8 @@ def process_video_file(
    ad_type: str = "Video",
    sampling_rate: int = 5,  # Process every Nth frame
    save_processed_video: bool = True,
- ) -> Tuple[str, str, pd.DataFrame
    """
    Process a video file and analyze facial expressions frame by frame
@@ -522,17 +646,21 @@ def process_video_file(
        ad_type: Type of ad (Video, Image, Audio, Text, Funny, etc.)
        sampling_rate: Process every Nth frame
        save_processed_video: Whether to save the processed video with annotations

    Returns:
-         Tuple of (csv_path, processed_video_path, metrics_dataframe
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = CSV_FILENAME_TEMPLATE.format(timestamp=timestamp)
    video_path = VIDEO_FILENAME_TEMPLATE.format(timestamp=timestamp) if save_processed_video else None

    # Setup ad context
-     gemini_result = call_gemini_api_for_ad(ad_description, ad_detail, ad_type)
    ad_context = {
        "ad_description": ad_description,
        "ad_detail": ad_detail,
@@ -540,6 +668,8 @@ def process_video_file(
        "gemini_ad_analysis": gemini_result
    }

    # Initialize capture
    if isinstance(video_file, str):
        cap = cv2.VideoCapture(video_file)
@@ -549,64 +679,24 @@ def process_video_file(
        temp_path = os.path.join(temp_dir, "temp_video.mp4")

        # Convert video array to file
-         if isinstance(video_file, np.ndarray)
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            temp_writer = cv2.VideoWriter(temp_path, fourcc, 30, (w, h))
            for frame in video_file:
                temp_writer.write(frame)
            temp_writer.release()
-             metrics_data = []
-             processed_frames = []
-             # Process the single frame
-             deepface_results = analyze_face_with_deepface(video_file)
-             face_data = None
-             # Fall back to OpenCV face detection if DeepFace didn't detect a face
-             if not deepface_results or "region" not in deepface_results:
-                 face_data = detect_face_opencv(video_file)
-             # Calculate metrics if face detected
-             if deepface_results or face_data:
-                 calculated_metrics = calculate_metrics_from_deepface(deepface_results, ad_context)
-                 user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, ad_context)
-                 # Create a row for the dataframe
-                 row = {
-                     'timestamp': 0.0,
-                     'frame_number': 0,
-                     **calculated_metrics,
-                     **ad_context,
-                     'user_state': user_state,
-                     'enhanced_user_state': enhanced_state
-                 }
-                 metrics_data.append(row)
-                 # Annotate the frame
-                 annotated_frame = annotate_frame(video_file, face_data, deepface_results, calculated_metrics, enhanced_state)
-                 processed_frames.append(annotated_frame)
-                 # Save processed image
-                 if save_processed_video:
-                     cv2.imwrite(video_path.replace('.mp4', '.jpg'), annotated_frame)
-             # Create DataFrame and save to CSV
-             metrics_df = pd.DataFrame(metrics_data)
-             if not metrics_df.empty:
-                 metrics_df.to_csv(csv_path, index=False)
-             return csv_path, video_path.replace('.mp4', '.jpg') if save_processed_video else None, metrics_df, processed_frames
-         else:
-             print("Error: Invalid video input format")
-             return None, None, None, []

    if not cap.isOpened():
        print("Error: Could not open video.")
-         return None, None, None,

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -617,68 +707,117 @@ def process_video_file(
    # Initialize video writer if saving processed video
    if save_processed_video:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-         out = cv2.VideoWriter(video_path, fourcc, fps

    # Process video frames
    metrics_data = []
-     processed_frames = []
    frame_count = 0

-     while True:
-         ret, frame = cap.read()
-         if not ret:
-             break
-             deepface_results = analyze_face_with_deepface(frame)
-             face_data = None
-             #
-             user_state, enhanced_state = interpret_metrics_with_gemini(calculated_metrics, deepface_results, ad_context)
-             #
-                 'timestamp': frame_count / fps,
-                 'frame_number': frame_count,
-                 **calculated_metrics,
-                 **ad_context,
-                 'user_state': user_state,
-                 'enhanced_user_state': enhanced_state
-             }
-             metrics_data.append(row)
-             #
-             if
-         if

    # Release resources
    cap.release()
@@ -689,26 +828,25 @@ def process_video_file(
    metrics_df = pd.DataFrame(metrics_data)
    if not metrics_df.empty:
        metrics_df.to_csv(csv_path, index=False)
-         print(f"Results saved to {csv_path}")
-         if save_processed_video:
-             print(f"Processed video saved to {video_path}")

    # Return results
-     return csv_path, video_path, metrics_df

- # ---
def process_webcam_frame(
    frame: np.ndarray,
    ad_context: Dict[str, Any],
    metrics_data: pd.DataFrame,
    frame_count: int,
-     start_time: float
    """
-     Process a single webcam frame

    Args:
        frame: Input frame from webcam
@@ -716,68 +854,113 @@ def process_webcam_frame(
        metrics_data: DataFrame to accumulate metrics
        frame_count: Current frame count
        start_time: Start time of the session

    Returns:
-         Tuple of (annotated_frame, metrics_dict,
    """
    if frame is None:
-         return None, None, None, metrics_data

-     #
-     face_data = None
-     #
-     #

        # Create a row for the dataframe
-         current_time = time.time()
        row = {
-             'timestamp':
            'frame_number': frame_count,
-             **calculated_metrics
-             **ad_context,
-             'user_state': user_state,
-             'enhanced_user_state': enhanced_state
        }

        # Add row to DataFrame
        new_row_df = pd.DataFrame([row], columns=all_columns)
        metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)

        # Annotate the frame
-         annotated_frame = annotate_frame(frame,

    else:
        # No face detected
-         cv2.putText(no_face_frame, "No face detected", (30, 30),
-                     cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
-         return no_face_frame, None, "No face detected", metrics_data

def start_webcam_session(
    ad_description: str = "",
    ad_detail: str = "",
    ad_type: str = "Video",
    save_interval: int = 100,  # Save CSV every N frames
) -> Dict[str, Any]:
    """
-     Initialize a webcam session for facial analysis

    Args:
        ad_description: Description of the ad being watched
        ad_detail: Detail focus of the ad
        ad_type: Type of ad
        save_interval: How often to save data to CSV

    Returns:
        Session context dictionary
@@ -785,10 +968,13 @@ def start_webcam_session(
    # Generate timestamp for file naming
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = CSV_FILENAME_TEMPLATE.format(timestamp=timestamp)

    # Setup ad context
-     gemini_result = call_gemini_api_for_ad(ad_description, ad_detail, ad_type)
    ad_context = {
        "ad_description": ad_description,
        "ad_detail": ad_detail,
@@ -803,20 +989,25 @@ def start_webcam_session(
        "metrics_data": initial_metrics_df.copy(),
        "ad_context": ad_context,
        "csv_path": csv_path,
-         "
        "save_interval": save_interval,
        "last_saved": 0,
-         "
-         "
-         "timestamps": [] if record_video else None
    }

    return session

def update_webcam_session(
    session: Dict[str, Any],
    frame: np.ndarray
- ) -> Tuple[np.ndarray,
    """
    Update webcam session with a new frame
@@ -825,33 +1016,47 @@ def update_webcam_session(
        frame: New frame from webcam

    Returns:
-         Tuple of (annotated_frame,
    """
    session["frame_count"] += 1
-     session["metrics_data"] = updated_df

-     #

    # Save CSV periodically
    if session["frame_count"] - session["last_saved"] >= session["save_interval"]:
-         if not
        session["last_saved"] = session["frame_count"]

-     return annotated_frame,

def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
    """
@@ -861,69 +1066,34 @@ def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
        session: Session context dictionary

    Returns:
-         Tuple of (csv_path,
    """
    # Save final metrics to CSV
    if not session["metrics_data"].empty:
        session["metrics_data"].to_csv(session["csv_path"], index=False)

-     # Save recorded video if available
-     video_path = None
-     if session["record_video"] and session["recorded_frames"]:
-         try:
-             frames = session["recorded_frames"]
-             if frames:
-                 # Get frame dimensions
-                 height, width = frames[0].shape[:2]
-                 # Calculate FPS based on actual timestamps
-                 if len(session["timestamps"]) > 1:
-                     # Calculate average time between frames
-                     time_diffs = np.diff(session["timestamps"])
-                     avg_frame_time = np.mean(time_diffs)
-                     fps = 1.0 / avg_frame_time if avg_frame_time > 0 else 15.0
-                 else:
-                     fps = 15.0  # Default FPS
-                 # Create video writer
-                 fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-                 video_path = session["video_path"]
-                 out = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
-                 # Write frames
-                 for frame in frames:
-                     out.write(frame)
-                 out.release()
-                 print(f"Recorded video saved to {video_path}")
-             else:
-                 print("No frames recorded")
-         except Exception as e:
-             print(f"Error saving video: {e}")
    print(f"Session ended. Data saved to {session['csv_path']}")
-     return session["csv_path"],

- # --- Create Gradio Interface ---
def create_api_interface():
-     with gr.Blocks(title="Facial Analysis APIs") as iface:
-         gr.Markdown(
-         # Enhanced Facial Analysis APIs (DeepFace)
-
-         This interface provides two API endpoints:
-
-         1. **Video File API**: Upload and analyze pre-recorded videos
-         2. **Webcam API**: Analyze live webcam feed in real-time
-
-         Both APIs use DeepFace for emotion analysis and Google's Gemini API for enhanced interpretations.
-         """)

        with gr.Tab("Video File API"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_input = gr.Video(label="Upload Video")
-                     vid_ad_desc = gr.Textbox(label="Ad Description"
-                     vid_ad_detail = gr.Textbox(label="Ad Detail Focus"
                    vid_ad_type = gr.Radio(
                        ["Video", "Image", "Audio", "Text", "Funny", "Serious", "Action", "Informative"],
                        label="Ad Type/Genre",
@@ -934,181 +1104,182 @@ def create_api_interface():
                        label="Sampling Rate (process every N frames)"
                    )
                    save_video = gr.Checkbox(label="Save Processed Video", value=True)
-                     process_btn = gr.Button("Process Video"

                with gr.Column(scale=2):
-                     output_text = gr.Textbox(label="Processing Results", lines=3)
                    with gr.Row():

                    with gr.Row():
-                         with gr.Column():
-                             output_csv = gr.File(label="Download CSV Results")

-             # Define function to handle video processing
-             def handle_video_processing(video, desc, detail, ad_type, rate, save_vid):
                if video is None:
-                     return "No video uploaded", None, None,

                try:
-                     csv_path, video_path, metrics_df
                        video,
                        ad_description=desc,
                        ad_detail=detail,
                        ad_type=ad_type,
                        sampling_rate=rate,
                        save_processed_video=save_vid,
                    )

                    if metrics_df is None or metrics_df.empty:
-                         return "No facial data detected in video", None, None,

-                     display_frames = []
-                     step = max(1, len(processed_frames) // 20)
-                     for i in range(0, len(processed_frames), step):
-                         if i < len(processed_frames):
-                             # Convert BGR to RGB for display
-                             rgb_frame = cv2.cvtColor(processed_frames[i], cv2.COLOR_BGR2RGB)
-                             display_frames.append(rgb_frame)

                    if video_path:
-                         result_text += f"

-                     return result_text, video_path,
                except Exception as e:
-                     return f"

            process_btn.click(
                handle_video_processing,
                inputs=[video_input, vid_ad_desc, vid_ad_detail, vid_ad_type, sampling_rate, save_video],
-                 outputs=[output_text, output_video,
            )

        with gr.Tab("Webcam API"):
            with gr.Row():
-                 with gr.Column(scale=
                    webcam_input = gr.Image(sources="webcam", streaming=True, label="Webcam Input", type="numpy")
-                     with gr.Column():
-                         record_video_chk = gr.Checkbox(label="Record Video", value=True)
-                     start_session_btn = gr.Button("Start Session", variant="primary")
-                     end_session_btn = gr.Button("End Session", variant="stop")
-                     session_status = gr.Textbox(label="Session Status", placeholder="Session not started...")

                with gr.Column(scale=2):

                    with gr.Row():
-                         with gr.Column():
-                             enhanced_state_txt = gr.Textbox(label="Enhanced State Analysis", lines=3)

                    with gr.Row():
                        download_csv = gr.File(label="Download Session Data")
-                         download_video = gr.Video(label="Recorded Session")

            # Session state
            session_data = gr.State(value=None)

            # Define session handlers
-             def start_session(desc, detail, ad_type,
-                 f"Ad
-                 f"

            def process_frame(frame, session):
-                 if session is None:
-                     return frame, None,
-                     return annotated_frame, metrics_plot, enhanced_state, updated_session
-                 else:
-                     # Return the annotated frame (likely with "No face detected")
-                     return annotated_frame, None, enhanced_state or "No metrics available", updated_session

            def end_session(session):
                if session is None:
-                     return "No active session", None
-                 csv_path, video_path = end_webcam_session(session)
-                 end_time = datetime.datetime.now().strftime('%H:%M:%S')
-                 result = f"Session ended at {end_time}.\n"

            start_session_btn.click(
                start_session,
-                 inputs=[web_ad_desc, web_ad_detail, web_ad_type,
                outputs=[session_data, session_status]
            )

            webcam_input.stream(
                process_frame,
                inputs=[webcam_input, session_data],
-                 outputs=[processed_output,
            )

            end_session_btn.click(
                end_session,
                inputs=[session_data],
-                 outputs=[session_status, download_csv
            )

    return iface

# Entry point
if __name__ == "__main__":
-     print("Starting Enhanced Facial Analysis API
    iface = create_api_interface()
    iface.launch(debug=True)
New version (added lines and surrounding context):

import numpy as np
import pandas as pd
import time
+ import mediapipe as mp
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.collections import LineCollection
import datetime
import tempfile
from typing import Dict, List, Tuple, Optional, Union, Any
+ import threading
+ import queue
+ import asyncio
+ import librosa
+ import torch
+ from moviepy.editor import VideoFileClip
+ from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import google.generativeai as genai
+ from concurrent.futures import ThreadPoolExecutor

# --- Constants ---
+ VIDEO_FPS = 15  # Estimated/Target FPS for saved video
CSV_FILENAME_TEMPLATE = "facial_analysis_{timestamp}.csv"
VIDEO_FILENAME_TEMPLATE = "processed_{timestamp}.mp4"
+ AUDIO_FILENAME_TEMPLATE = "audio_{timestamp}.wav"

+ # --- MediaPipe Initialization ---
+ mp_face_mesh = mp.solutions.face_mesh
+ mp_drawing = mp.solutions.drawing_utils
+ mp_drawing_styles = mp.solutions.drawing_styles

+ face_mesh = mp_face_mesh.FaceMesh(
+     max_num_faces=1,
+     refine_landmarks=True,
+     min_detection_confidence=0.5,
+     min_tracking_confidence=0.5)

+ # --- Audio Model Initialization ---
+ # We'll initialize this in a function to avoid loading at startup
+ audio_classifier = None
+ audio_feature_extractor = None

+ def initialize_audio_model():
+     global audio_classifier, audio_feature_extractor
+     if audio_classifier is None:
+         print("Loading audio classification model...")
+         model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+         audio_feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
+         audio_classifier = AutoModelForAudioClassification.from_pretrained(model_name)
+         print("Audio model loaded successfully")
+     return audio_classifier, audio_feature_extractor

+ # --- Gemini API Configuration ---
+ # Replace with your Gemini API key
+ GEMINI_API_KEY = "your-gemini-api-key"  # In production, load from environment variable

+ def configure_gemini():
+     genai.configure(api_key=GEMINI_API_KEY)
+
+     # Set up the model
+     generation_config = {
+         "temperature": 0.2,
+         "top_p": 0.8,
+         "top_k": 40,
+         "max_output_tokens": 256,
+     }
+
+     safety_settings = [
+         {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+         {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+         {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+         {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+     ]
+
+     try:
+         model = genai.GenerativeModel(
+             model_name="gemini-1.5-flash",
+             generation_config=generation_config,
+             safety_settings=safety_settings
+         )
+         return model
+     except Exception as e:
+         print(f"Error configuring Gemini: {e}")
+         return None

# --- Metrics Definition ---
metrics = [
    "neuroticism", "conscientiousness", "extraversion",
    "stress_index", "engagement_level"
]
+ audio_metrics = [
+     "audio_valence", "audio_arousal", "audio_intensity",
+     "audio_emotion", "audio_confidence"
+ ]
ad_context_columns = ["ad_description", "ad_detail", "ad_type", "gemini_ad_analysis"]
+ user_state_column = ["user_state", "detailed_user_analysis"]
+ all_columns = ['timestamp', 'frame_number'] + metrics + audio_metrics + ad_context_columns + user_state_column
initial_metrics_df = pd.DataFrame(columns=all_columns)

+ # --- Live Processing Queue ---
+ processing_queue = queue.Queue()
+ results_queue = queue.Queue()
+ # --- Gemini Functions ---
+ def call_gemini_api_for_ad(model, description, detail, ad_type):
+     """Uses Gemini to analyze ad context."""
+     if not model:
+         return "Gemini model not available. Using simulated analysis."
+
+     if not description and not detail:
+         return "No ad context provided."
+
+     prompt = f"""
+     Analyze this advertisement context:
+     - Description: {description or 'N/A'}
+     - Detail/Focus: {detail or 'N/A'}
+     - Type/Genre: {ad_type}
+
+     Provide a concise analysis of how this ad might affect viewer emotions and cognition.
+     Focus on potential emotional triggers, cognitive demands, and engagement patterns.
+     Keep your analysis under 100 words.
    """
+
+     try:
+         response = model.generate_content(prompt)
+         return response.text
+     except Exception as e:
+         print(f"Error calling Gemini API: {e}")
+         return f"Simulated analysis: Ad='{description or 'N/A'}' ({ad_type}), Focus='{detail or 'N/A'}'."
+
+ def interpret_metrics_with_gemini(model, metrics_dict, audio_metrics_dict=None, ad_context=None, timestamp=None):
+     """Uses Gemini to interpret facial and audio metrics -> detailed user state."""
+     if not model:
+         return simple_user_state_analysis(metrics_dict, audio_metrics_dict), "Gemini model not available. Using rule-based analysis."
+
+     if not metrics_dict:
+         return "No response", "No metrics data available"
+
+     metrics_text = "\n".join([f"- {k}: {v:.3f}" for k, v in metrics_dict.items()])
+
+     audio_text = ""
+     if audio_metrics_dict:
+         audio_text = "\n".join([f"- {k}: {v}" for k, v in audio_metrics_dict.items()])
+
+     ad_text = ""
+     if ad_context:
+         ad_text = f"""
+         Ad Context:
+         - Description: {ad_context.get('ad_description', 'N/A')}
+         - Detail/Focus: {ad_context.get('ad_detail', 'N/A')}
+         - Type/Genre: {ad_context.get('ad_type', 'N/A')}
+         """
+
+     timestamp_text = f"Timestamp: {timestamp:.2f} seconds" if timestamp is not None else ""
+
+     prompt = f"""
+     Analyze the following viewer metrics and provide a detailed assessment of their current state:
+
+     {timestamp_text}
+
+     Facial Expression Metrics:
+     {metrics_text}
+
+     {'Audio Expression Metrics:' if audio_text else ''}
+     {audio_text}
+
+     {ad_text}
+
+     First, provide a short 1-5 word state label that summarizes the viewer's current emotional and cognitive state.
+
+     Then, provide a more detailed 2-3 sentence analysis explaining what these metrics suggest about the viewer's:
+     - Emotional state
+     - Cognitive engagement
+     - Likely response to the content
+     - Any notable patterns or anomalies
+
+     Format your response as:
+     USER STATE: [state label]
+
+     DETAILED ANALYSIS: [your analysis]
    """

+     try:
+         response = model.generate_content(prompt)
+         text = response.text.strip()
+
+         # Parse the response
+         state_parts = text.split("USER STATE:", 1)
+         if len(state_parts) > 1:
+             state_text = state_parts[1].split("DETAILED ANALYSIS:", 1)
+             if len(state_text) > 1:
+                 simple_state = state_text[0].strip()
+                 detailed_analysis = state_text[1].strip()
+                 return simple_state, detailed_analysis

+         # Fallback if parsing fails
+         simple_state = text.split('\n')[0].strip()
+         detailed_analysis = ' '.join(text.split('\n')[1:]).strip()
+         return simple_state, detailed_analysis
+     except Exception as e:
+         print(f"Error interpreting metrics with Gemini: {e}")
+         return simple_user_state_analysis(metrics_dict, audio_metrics_dict), "Error generating detailed analysis"
+
+ def simple_user_state_analysis(metrics_dict, audio_metrics_dict=None):
+     """Simple rule-based user state analysis as fallback."""
+     if not metrics_dict:
+         return "No metrics"
+
+     valence = metrics_dict.get('valence', 0.5)
+     arousal = metrics_dict.get('arousal', 0.5)
+     cog_load = metrics_dict.get('cognitive_load', 0.5)
+     stress = metrics_dict.get('stress_index', 0.5)
+     engagement = metrics_dict.get('engagement_level', 0.5)
+
+     # Include audio metrics when available
+     audio_emotion = None
+     audio_valence = 0.5
+     if audio_metrics_dict:
+         audio_emotion = audio_metrics_dict.get('audio_emotion')
+         audio_valence = audio_metrics_dict.get('audio_valence', 0.5)
+         # Blend facial and audio valence
+         valence = (valence * 0.7) + (audio_valence * 0.3)
+
+     # Simple rule-based analysis
+     state = "Neutral"
+     if valence > 0.65 and arousal > 0.55 and engagement > 0.6:
+         state = "Positive, Engaged"
+     elif valence < 0.4 and stress > 0.6:
+         state = "Stressed, Negative"
+     elif cog_load > 0.7 and engagement < 0.4:
+         state = "Confused, Disengaged"
+     elif arousal < 0.4 and engagement < 0.5:
+         state = "Calm, Passive"
+
+     # Override with audio emotion if it's strong
+     if audio_emotion in ["happy", "excited"] and audio_metrics_dict.get('audio_confidence', 0) > 0.7:
+         state = audio_emotion.capitalize()
+     elif audio_emotion in ["angry", "sad", "fearful"] and audio_metrics_dict.get('audio_confidence', 0) > 0.7:
+         state = audio_emotion.capitalize()
+
+     return state
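For orientation, here is a minimal usage sketch of the Gemini helpers introduced above (`configure_gemini`, `call_gemini_api_for_ad`, `interpret_metrics_with_gemini`). It is an editor-added illustration, not part of the commit; the ad description and metric values are invented placeholders.

```python
# Minimal sketch, assuming the functions above are importable from this module.
model = configure_gemini()  # returns None if the API key/model setup fails

ad_analysis = call_gemini_api_for_ad(model, "30-second sneaker spot", "brand recall", "Funny")

sample_metrics = {"valence": 0.72, "arousal": 0.61, "cognitive_load": 0.40,
                  "stress_index": 0.25, "engagement_level": 0.80}
state, detail = interpret_metrics_with_gemini(
    model,
    sample_metrics,
    audio_metrics_dict={"audio_emotion": "happy", "audio_valence": 0.9,
                        "audio_arousal": 0.7, "audio_intensity": 0.6,
                        "audio_confidence": 0.8},
    ad_context={"ad_description": "30-second sneaker spot", "ad_detail": "brand recall",
                "ad_type": "Funny", "gemini_ad_analysis": ad_analysis},
    timestamp=12.5,
)
print(state)   # falls back to simple_user_state_analysis() when no model is available
print(detail)
```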
|
|
|
|
|
248 |
|
249 |
+
+ # --- Audio Analysis Functions ---
+ def extract_audio_from_video(video_path, output_audio_path=None):
+     """Extract audio from video file"""
+     if output_audio_path is None:
+         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         output_audio_path = AUDIO_FILENAME_TEMPLATE.format(timestamp=timestamp)
+
+     try:
+         video = VideoFileClip(video_path)
+         video.audio.write_audiofile(output_audio_path, fps=16000, nbytes=2, codec='pcm_s16le')
+         return output_audio_path
+     except Exception as e:
+         print(f"Error extracting audio: {e}")
        return None
+
+ def analyze_audio_segment(audio_path, start_time, duration=1.0):
+     """Analyze a segment of audio for emotion"""
+     classifier, feature_extractor = initialize_audio_model()
    try:
+         # Load audio segment
+         y, sr = librosa.load(audio_path, sr=16000, offset=start_time, duration=duration)
+
+         if len(y) < 100:  # Too short to analyze
+             return None
+
+         # Extract features
+         inputs = feature_extractor(y, sampling_rate=sr, return_tensors="pt")
+
+         # Get predictions
+         with torch.no_grad():
+             outputs = classifier(**inputs)
+             logits = outputs.logits
+             probabilities = torch.nn.functional.softmax(logits, dim=1)
+
+         # Get the predicted class and its probability
+         predicted_class_idx = torch.argmax(probabilities, dim=1).item()
+         confidence = probabilities[0][predicted_class_idx].item()
+
+         # Map to emotion labels (verify these match your model's labels)
+         emotion_labels = ["angry", "fearful", "happy", "neutral", "sad", "surprised"]
+         predicted_emotion = emotion_labels[predicted_class_idx]
+
+         # Calculate valence and arousal based on emotion
+         emotion_mappings = {
+             "angry": {"valence": 0.2, "arousal": 0.9, "intensity": 0.8},
+             "fearful": {"valence": 0.3, "arousal": 0.8, "intensity": 0.7},
+             "happy": {"valence": 0.9, "arousal": 0.7, "intensity": 0.6},
+             "neutral": {"valence": 0.5, "arousal": 0.5, "intensity": 0.3},
+             "sad": {"valence": 0.2, "arousal": 0.3, "intensity": 0.5},
+             "surprised": {"valence": 0.6, "arousal": 0.8, "intensity": 0.7}
+         }
+
+         valence = emotion_mappings.get(predicted_emotion, {"valence": 0.5})["valence"]
+         arousal = emotion_mappings.get(predicted_emotion, {"arousal": 0.5})["arousal"]
+         intensity = emotion_mappings.get(predicted_emotion, {"intensity": 0.5})["intensity"]
+
+         # Return audio metrics
+         return {
+             "audio_valence": valence,
+             "audio_arousal": arousal,
+             "audio_intensity": intensity,
+             "audio_emotion": predicted_emotion,
+             "audio_confidence": confidence
+         }
    except Exception as e:
+         print(f"Error analyzing audio segment: {e}")
        return None
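The two functions above split audio handling into extraction and per-segment classification. A minimal sketch of how they might be driven over a whole clip follows; it is an editor-added illustration, not part of the commit, and "ad.mp4" plus the one-second step are placeholder choices.

```python
# Sketch: run the audio-emotion analysis above over a clip in one-second steps.
from moviepy.editor import VideoFileClip

video_path = "ad.mp4"  # placeholder path
audio_path = extract_audio_from_video(video_path)
if audio_path:
    clip_duration = VideoFileClip(video_path).duration
    t = 0.0
    while t < clip_duration:
        segment = analyze_audio_segment(audio_path, start_time=t, duration=1.0)
        if segment:  # None when the segment is too short or analysis fails
            print(f"{t:5.1f}s  {segment['audio_emotion']}  ({segment['audio_confidence']:.2f})")
        t += 1.0
```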
|
317 |
|
318 |
+
+ # --- Analysis Functions ---
+ def extract_face_landmarks(image, face_mesh_instance):
+     if image is None or face_mesh_instance is None:
        return None
    try:
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         image_rgb.flags.writeable = False
+         results = face_mesh_instance.process(image_rgb)
+         image_rgb.flags.writeable = True
+         if results.multi_face_landmarks:
+             return results.multi_face_landmarks[0]
+     except Exception as e:
+         print(f"Error in landmark extraction: {e}")
+     return None
+
+ def calculate_ear(landmarks):
+     if not landmarks:
+         return 0.0
+     try:
+         LEFT_EYE = [33, 160, 158, 133, 153, 144]
+         RIGHT_EYE = [362, 385, 387, 263, 373, 380]
+
+         def get_coords(idx_list):
+             return np.array([(landmarks.landmark[i].x, landmarks.landmark[i].y) for i in idx_list])
+
+         left_pts = get_coords(LEFT_EYE)
+         right_pts = get_coords(RIGHT_EYE)
+
+         def ear_aspect(pts):
+             v1 = np.linalg.norm(pts[1] - pts[5])
+             v2 = np.linalg.norm(pts[2] - pts[4])
+             h = np.linalg.norm(pts[0] - pts[3])
+             return (v1 + v2) / (2.0 * h) if h > 1e-6 else 0.0
+
+         return (ear_aspect(left_pts) + ear_aspect(right_pts)) / 2.0
+     except (IndexError, AttributeError) as e:
+         print(f"Error calculating EAR: {e}")
+         return 0.0
+
+ def calculate_mar(landmarks):
+     if not landmarks:
+         return 0.0
+     try:
+         MOUTH = [61, 291, 39, 181, 0, 17, 269, 405]
+         pts = np.array([(landmarks.landmark[i].x, landmarks.landmark[i].y) for i in MOUTH])
+         h = np.mean([np.linalg.norm(pts[1] - pts[7]), np.linalg.norm(pts[2] - pts[6]), np.linalg.norm(pts[3] - pts[5])])
+         w = np.linalg.norm(pts[0] - pts[4])
+         return h / w if w > 1e-6 else 0.0
+     except (IndexError, AttributeError) as e:
+         print(f"Error calculating MAR: {e}")
+         return 0.0
+
+ def calculate_eyebrow_position(landmarks):
+     if not landmarks:
+         return 0.0
+     try:
+         L_BROW = 107
+         R_BROW = 336
+         L_EYE_C = 159
+         R_EYE_C = 386
+
+         l_brow_y = landmarks.landmark[L_BROW].y
+         r_brow_y = landmarks.landmark[R_BROW].y
+         l_eye_y = landmarks.landmark[L_EYE_C].y
+         r_eye_y = landmarks.landmark[R_EYE_C].y
+
+         l_dist = l_eye_y - l_brow_y
+         r_dist = r_eye_y - r_brow_y
+         avg_dist = (l_dist + r_dist) / 2.0
+         norm = (avg_dist - 0.02) / 0.06
+
+         return max(0.0, min(1.0, norm))
+     except (IndexError, AttributeError) as e:
+         print(f"Error calculating Eyebrow Pos: {e}")
+         return 0.0
+
+ def estimate_head_pose(landmarks):
+     if not landmarks:
+         return 0.0, 0.0
+     try:
+         NOSE = 4
+         L_EYE_C = 159
+         R_EYE_C = 386
+
+         nose_pt = np.array([landmarks.landmark[NOSE].x, landmarks.landmark[NOSE].y])
+         l_eye_pt = np.array([landmarks.landmark[L_EYE_C].x, landmarks.landmark[L_EYE_C].y])
+         r_eye_pt = np.array([landmarks.landmark[R_EYE_C].x, landmarks.landmark[R_EYE_C].y])
+
+         eye_mid_y = (l_eye_pt[1] + r_eye_pt[1]) / 2.0
+         eye_mid_x = (l_eye_pt[0] + r_eye_pt[0]) / 2.0
+
+         v_tilt = nose_pt[1] - eye_mid_y
+         h_tilt = nose_pt[0] - eye_mid_x
+
+         v_tilt_norm = max(-1.0, min(1.0, v_tilt * 5.0))
+         h_tilt_norm = max(-1.0, min(1.0, h_tilt * 10.0))
+
+         return v_tilt_norm, h_tilt_norm
+     except (IndexError, AttributeError) as e:
+         print(f"Error estimating Head Pose: {e}")
+         return 0.0, 0.0
+ def calculate_metrics(landmarks, ad_context=None):
    if ad_context is None:
        ad_context = {}
+     if not landmarks:
+         return {m: 0.5 for m in metrics}  # Return defaults if no landmarks
+
+     # Calculate base features
+     ear = calculate_ear(landmarks)
+     mar = calculate_mar(landmarks)
+     eb_pos = calculate_eyebrow_position(landmarks)
+     v_tilt, h_tilt = estimate_head_pose(landmarks)
+
+     # Illustrative Context Adjustments
+     ad_type = ad_context.get('ad_type', 'Unk')
    gem_txt = str(ad_context.get('gemini_ad_analysis', '')).lower()
+     val_mar_w = 2.5 if ad_type == 'Funny' or 'humor' in gem_txt else 2.0
+     val_eb_w = 0.8 if ad_type == 'Serious' or 'sad' in gem_txt else 1.0
+     arsl_base = 0.05 if ad_type == 'Action' or 'exciting' in gem_txt else 0.0
+
+     # Calculate final metrics using base features and context adjustments
+     cl = max(0, min(1, 1.0 - ear * 2.5))
+     val = max(0, min(1, mar * val_mar_w * (val_eb_w * (1.0 - eb_pos))))
+     arsl = max(0, min(1, arsl_base + (mar + (1.0 - ear) + eb_pos) / 3.0))
+     dom = max(0, min(1, 0.5 + v_tilt))
    neur = max(0, min(1, (cl * 0.6) + ((1.0 - val) * 0.4)))
    em_stab = 1.0 - neur
    extr = max(0, min(1, (arsl * 0.5) + (val * 0.5)))
+     open = max(0, min(1, 0.5 + ((mar - 0.5) * 0.5)))
    agree = max(0, min(1, (val * 0.7) + ((1.0 - arsl) * 0.3)))
    consc = max(0, min(1, (1.0 - abs(arsl - 0.5)) * 0.7 + (em_stab * 0.3)))
+     stress = max(0, min(1, (cl * 0.5) + (eb_pos * 0.3) + ((1.0 - val) * 0.2)))
+     engag = max(0, min(1, (arsl * 0.7) + ((1.0 - abs(h_tilt)) * 0.3)))
+
+     # Return dictionary of metrics
+     return {
+         'valence': val, 'arousal': arsl, 'dominance': dom, 'cognitive_load': cl,
+         'emotional_stability': em_stab, 'openness': open, 'agreeableness': agree,
+         'neuroticism': neur, 'conscientiousness': consc, 'extraversion': extr,
+         'stress_index': stress, 'engagement_level': engag
    }
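The new pipeline derives all per-frame metrics from MediaPipe landmarks rather than DeepFace emotions. A short editor-added sketch of how the pieces above combine for a single image follows; "frame.jpg" is a placeholder path, and the global `face_mesh` instance created near the top of the file is reused.

```python
# Sketch: compute the facial metrics above for one image.
import cv2

frame = cv2.imread("frame.jpg")            # BGR image, as the helpers expect
landmarks = extract_face_landmarks(frame, face_mesh)
frame_metrics = calculate_metrics(landmarks, ad_context={"ad_type": "Funny"})
for name, value in frame_metrics.items():  # defaults of 0.5 when no face is found
    print(f"{name:20s} {value:.2f}")
```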
|
|
|
|
|
460 |
|
461 |
+
+ def update_metrics_visualization(metrics_values, audio_metrics=None, title=None):
    if not metrics_values:
        fig, ax = plt.subplots(figsize=(10, 8))
+         ax.text(0.5, 0.5, "Waiting...", ha='center', va='center')
        ax.axis('off')
        fig.patch.set_facecolor('#FFFFFF')
        ax.set_facecolor('#FFFFFF')
        return fig

+     # Combine face and audio metrics for visualization
+     all_metrics = {}
+     for k, v in metrics_values.items():
+         if k not in ('timestamp', 'frame_number', 'user_state', 'detailed_user_analysis'):
+             all_metrics[k] = v

+     if audio_metrics:
+         for k, v in audio_metrics.items():
+             if isinstance(v, (int, float)):
+                 all_metrics[k] = v

+     num_metrics = len(all_metrics)
    nrows = (num_metrics + 2) // 3
    fig, axs = plt.subplots(nrows, 3, figsize=(10, nrows * 2.5), facecolor='#FFFFFF')
    axs = axs.flatten()

+     if title:
+         fig.suptitle(title, fontsize=12)
+
    colors = [(0.1, 0.1, 0.9), (0.9, 0.9, 0.1), (0.9, 0.1, 0.1)]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=100)
    norm = plt.Normalize(0, 1)
    metric_idx = 0

+     for key, value in all_metrics.items():
+         if not isinstance(value, (int, float)):
+             value = 0.5
+         value = max(0.0, min(1.0, value))

        ax = axs[metric_idx]
        ax.set_title(key.replace('_', ' ').title(), fontsize=10)

    plt.tight_layout(pad=0.5)
    return fig

+ def create_user_state_display(state_text, detailed_analysis=None):
+     """Create a visual display of the user state"""
+     fig, ax = plt.subplots(figsize=(10, 2.5))
+     ax.axis('off')
+
+     # Display state
+     ax.text(0.5, 0.8, f"USER STATE: {state_text}",
+             ha='center', va='center', fontsize=14, fontweight='bold',
+             bbox=dict(facecolor='#e6f2ff', alpha=0.7, boxstyle='round,pad=0.5'))
+
+     # Display detailed analysis if available
+     if detailed_analysis:
+         ax.text(0.5, 0.3, detailed_analysis,
+                 ha='center', va='center', fontsize=10,
+                 bbox=dict(facecolor='#f2f2f2', alpha=0.7, boxstyle='round,pad=0.5'))
+
+     plt.tight_layout()
+     return fig
+
+ def annotate_frame(frame, landmarks):
+     """Add facial landmark annotations to a frame"""
    if frame is None:
        return None

    annotated = frame.copy()

+     if landmarks:
+         try:
+             mp_drawing.draw_landmarks(
+                 image=annotated,
+                 landmark_list=landmarks,
+                 connections=mp_face_mesh.FACEMESH_TESSELATION,
+                 landmark_drawing_spec=None,
+                 connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
+             )
+             mp_drawing.draw_landmarks(
+                 image=annotated,
+                 landmark_list=landmarks,
+                 connections=mp_face_mesh.FACEMESH_CONTOURS,
+                 landmark_drawing_spec=None,
+                 connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style()
+             )
+         except Exception as e:
+             print(f"Error drawing landmarks: {e}")

    return annotated
+ # --- Background Processing Functions ---
+ def process_frames_in_background(session):
+     """Background thread for processing frames and updating metrics"""
+     while True:
+         try:
+             # Get task from queue
+             task = processing_queue.get(timeout=1.0)
+             if task.get('command') == 'stop':
+                 break
+
+             frame = task.get('frame')
+             if frame is None:
+                 continue
+
+             # Process the frame
+             result = process_webcam_frame(
+                 frame,
+                 task.get('ad_context', {}),
+                 task.get('metrics_data', initial_metrics_df.copy()),
+                 task.get('frame_count', 0),
+                 task.get('start_time', time.time()),
+                 task.get('audio_path'),
+                 task.get('gemini_model')
+             )
+
+             # Put result in results queue
+             results_queue.put({
+                 'annotated_frame': result[0],
+                 'metrics': result[1],
+                 'audio_metrics': result[2],
+                 'metrics_df': result[3],
+                 'state_fig': result[4],
+                 'metrics_fig': result[5]
+             })
+
+             # Mark task as done
+             processing_queue.task_done()
+         except queue.Empty:
+             continue
+         except Exception as e:
+             print(f"Error in background processing: {e}")
+             continue
|
628 |
+
|
629 |
+
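# Worker lifecycle sketch (illustrative; start_webcam_session / end_webcam_session
# below do exactly this). `processing_queue` and `results_queue` are assumed to be
# module-level queue.Queue instances defined earlier in this file.
def _example_worker_lifecycle(session):
    worker = threading.Thread(target=process_frames_in_background, args=(session,), daemon=True)
    worker.start()
    # ... enqueue {'command': 'process', 'frame': ...} tasks, drain results_queue ...
    processing_queue.put({'command': 'stop'})  # ask the worker to exit cleanly
    worker.join(timeout=2.0)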
# --- Video File Processing with Progress Updates ---
def process_video_file(
    video_file: Union[str, np.ndarray],
    ad_description: str = "",
    ad_detail: str = "",
    ad_type: str = "Video",
    sampling_rate: int = 5,  # Process every Nth frame
    save_processed_video: bool = True,
    progress=gr.Progress()
) -> Tuple[str, str, str, pd.DataFrame]:
    """
    Process a video file and analyze facial expressions frame by frame

    Args:
        video_file: Path to a video file or an array of frames
        ad_description: Description of the ad being watched
        ad_detail: Detail focus of the ad
        ad_type: Type of ad (Video, Image, Audio, Text, Funny, etc.)
        sampling_rate: Process every Nth frame
        save_processed_video: Whether to save the processed video with annotations
        progress: Gradio progress bar

    Returns:
        Tuple of (csv_path, audio_path, processed_video_path, metrics_dataframe)
    """
    # Initialize Gemini model
    gemini_model = configure_gemini()

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = CSV_FILENAME_TEMPLATE.format(timestamp=timestamp)
    audio_path = AUDIO_FILENAME_TEMPLATE.format(timestamp=timestamp)
    video_path = VIDEO_FILENAME_TEMPLATE.format(timestamp=timestamp) if save_processed_video else None

    # Setup ad context
    gemini_result = call_gemini_api_for_ad(gemini_model, ad_description, ad_detail, ad_type)
    ad_context = {
        "ad_description": ad_description,
        "ad_detail": ad_detail,
        "ad_type": ad_type,
        "gemini_ad_analysis": gemini_result
    }

    progress(0, desc="Initializing video processing")

    # Initialize capture
    if isinstance(video_file, str):
        cap = cv2.VideoCapture(video_file)
    else:
        # ... (temporary-directory setup lines unchanged, not shown in this diff) ...
        temp_path = os.path.join(temp_dir, "temp_video.mp4")

        # Convert video array to file
        if isinstance(video_file, np.ndarray):
            # Assuming it's a series of frames
            h, w = video_file[0].shape[:2] if len(video_file) > 0 else (480, 640)
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            temp_writer = cv2.VideoWriter(temp_path, fourcc, 30, (w, h))
            for frame in video_file:
                temp_writer.write(frame)
            temp_writer.release()

            video_file = temp_path
            cap = cv2.VideoCapture(temp_path)

    if not cap.isOpened():
        print("Error: Could not open video.")
        return None, None, None, None

    # Extract audio for analysis
    audio_extracted = extract_audio_from_video(video_file, audio_path)

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Initialize video writer if saving processed video
    if save_processed_video:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(video_path, fourcc, fps, (frame_width, frame_height))

    # Process video frames
    metrics_data = []
    frame_count = 0

    # Create a thread pool for audio processing
    with ThreadPoolExecutor(max_workers=2) as executor:
        # Queue for audio analysis results
        audio_futures = {}

        progress(0.1, desc="Starting frame analysis")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Only process every Nth frame (according to sampling_rate)
            process_this_frame = frame_count % sampling_rate == 0
            frame_timestamp = frame_count / fps

            if process_this_frame:
                progress(min(0.1 + 0.8 * (frame_count / total_frames), 0.9),
                         desc=f"Processing frame {frame_count}/{total_frames}")

                # Extract facial landmarks
                landmarks = extract_face_landmarks(frame, face_mesh)

                # Submit audio analysis task if audio was extracted
                if process_this_frame and audio_extracted and frame_timestamp not in audio_futures:
                    audio_futures[frame_timestamp] = executor.submit(
                        analyze_audio_segment, audio_path, frame_timestamp, 1.0
                    )

                # Get audio analysis results if available
                audio_metrics = None
                if frame_timestamp in audio_futures and audio_futures[frame_timestamp].done():
                    audio_metrics = audio_futures[frame_timestamp].result()

                # Calculate metrics if landmarks detected
                if landmarks:
                    calculated_metrics = calculate_metrics(landmarks, ad_context)
                    user_state, detailed_analysis = interpret_metrics_with_gemini(
                        gemini_model, calculated_metrics, audio_metrics, ad_context, frame_timestamp
                    )

                    # Create a row for the dataframe
                    row = {
                        'timestamp': frame_timestamp,
                        'frame_number': frame_count,
                        **calculated_metrics
                    }

                    # Add audio metrics if available; otherwise the audio columns are
                    # left unset and appear as NaN in the resulting DataFrame
                    if audio_metrics:
                        row.update(audio_metrics)

                    # Add context and state
                    row.update(ad_context)
                    row['user_state'] = user_state
                    row['detailed_user_analysis'] = detailed_analysis

                    metrics_data.append(row)

                    # Annotate the frame with facial landmarks
                    if save_processed_video:
                        annotated_frame = annotate_frame(frame, landmarks)

                        # Add user state text to frame
                        cv2.putText(
                            annotated_frame,
                            f"State: {user_state}",
                            (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.7,
                            (0, 255, 0),
                            2
                        )

                        # Add audio emotion if available
                        if audio_metrics and 'audio_emotion' in audio_metrics:
                            cv2.putText(
                                annotated_frame,
                                f"Audio: {audio_metrics['audio_emotion']}",
                                (10, 60),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                0.7,
                                (255, 0, 0),
                                2
                            )

                        out.write(annotated_frame)
                elif save_processed_video:
                    # If no landmarks detected, still write the original frame to the video
                    out.write(frame)
            elif save_processed_video:
                # For frames not being analyzed, still include them in the output video
                out.write(frame)

            frame_count += 1

        # Wait for all audio analysis to complete
        for future in audio_futures.values():
            if not future.done():
                future.result()  # This will wait for completion

        progress(0.95, desc="Finalizing results")

    # Release resources
    cap.release()
    if save_processed_video:
        out.release()

    metrics_df = pd.DataFrame(metrics_data)
    if not metrics_df.empty:
        metrics_df.to_csv(csv_path, index=False)
        progress(1.0, desc="Processing complete")
    else:
        progress(1.0, desc="No facial data detected")

    # Return results
    return csv_path, audio_path, video_path, metrics_df
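# Usage sketch (illustrative file name and ad context; not executed by the app):
# run the video pipeline outside the Gradio UI. The default gr.Progress() object
# simply receives the progress callbacks.
def _example_offline_video_run():
    csv_path, audio_path, video_path, df = process_video_file(
        "example_ad.mp4",                      # hypothetical local clip
        ad_description="Sample soda commercial",
        ad_detail="brand logo reveal",
        ad_type="Funny",
        sampling_rate=10,                      # analyze every 10th frame
        save_processed_video=False,
    )
    return csv_path, df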
# --- Updated Webcam Processing Function ---
def process_webcam_frame(
    frame: np.ndarray,
    ad_context: Dict[str, Any],
    metrics_data: pd.DataFrame,
    frame_count: int,
    start_time: float,
    audio_path: str = None,
    gemini_model=None
) -> Tuple[np.ndarray, Dict[str, float], Dict[str, Any], pd.DataFrame, object, object]:
    """
    Process a single webcam frame with audio integration

    Args:
        frame: Input frame from webcam
        ad_context: Ad context dictionary
        metrics_data: DataFrame to accumulate metrics
        frame_count: Current frame count
        start_time: Start time of the session
        audio_path: Path to extracted audio file (if available)
        gemini_model: Configured Gemini model instance

    Returns:
        Tuple of (annotated_frame, metrics_dict, audio_metrics, updated_metrics_df, state_fig, metrics_fig)
    """
    if frame is None:
        return None, None, None, metrics_data, None, None

    # Extract facial landmarks
    landmarks = extract_face_landmarks(frame, face_mesh)

    # Get current timestamp
    current_time = time.time()
    elapsed_time = current_time - start_time

    # Analyze audio segment if available
    audio_metrics = None
    if audio_path and os.path.exists(audio_path):
        audio_metrics = analyze_audio_segment(audio_path, elapsed_time, 1.0)

    # Calculate metrics if landmarks detected
    if landmarks:
        calculated_metrics = calculate_metrics(landmarks, ad_context)
        user_state, detailed_analysis = interpret_metrics_with_gemini(
            gemini_model, calculated_metrics, audio_metrics, ad_context, elapsed_time
        )

        # Create a row for the dataframe
        row = {
            'timestamp': elapsed_time,
            'frame_number': frame_count,
            **calculated_metrics
        }

        # Add audio metrics if available; otherwise the audio columns are left
        # unset and appear as NaN in the resulting DataFrame
        if audio_metrics:
            row.update(audio_metrics)

        # Add context and state
        row.update(ad_context)
        row['user_state'] = user_state
        row['detailed_user_analysis'] = detailed_analysis

        # Add row to DataFrame
        new_row_df = pd.DataFrame([row], columns=all_columns)
        metrics_data = pd.concat([metrics_data, new_row_df], ignore_index=True)

        # Create visualizations
        metrics_plot = update_metrics_visualization(
            calculated_metrics,
            audio_metrics,
            title=f"Frame {frame_count} Metrics"
        )
        state_plot = create_user_state_display(user_state, detailed_analysis)

        # Annotate the frame
        annotated_frame = annotate_frame(frame, landmarks)

        # Add user state text to frame
        cv2.putText(
            annotated_frame,
            f"State: {user_state}",
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (0, 255, 0),
            2
        )

        # Add audio emotion if available
        if audio_metrics and 'audio_emotion' in audio_metrics:
            cv2.putText(
                annotated_frame,
                f"Audio: {audio_metrics['audio_emotion']}",
                (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.7,
                (255, 0, 0),
                2
            )

        return annotated_frame, calculated_metrics, audio_metrics, metrics_data, state_plot, metrics_plot
    else:
        # No face detected
        return frame, None, None, metrics_data, None, None
# --- Updated Webcam Session Functions ---
def start_webcam_session(
    ad_description: str = "",
    ad_detail: str = "",
    ad_type: str = "Video",
    save_interval: int = 100,  # Save CSV every N frames
    record_audio: bool = False
) -> Dict[str, Any]:
    """
    Initialize a webcam session for facial analysis with audio recording

    Args:
        ad_description: Description of the ad being watched
        ad_detail: Detail focus of the ad
        ad_type: Type of ad
        save_interval: How often to save data to CSV
        record_audio: Whether to record audio during session

    Returns:
        Session context dictionary
    """
    # Generate timestamp for file naming
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = CSV_FILENAME_TEMPLATE.format(timestamp=timestamp)
    audio_path = AUDIO_FILENAME_TEMPLATE.format(timestamp=timestamp) if record_audio else None

    # Initialize Gemini model
    gemini_model = configure_gemini()

    # Setup ad context
    gemini_result = call_gemini_api_for_ad(gemini_model, ad_description, ad_detail, ad_type)
    ad_context = {
        "ad_description": ad_description,
        "ad_detail": ad_detail,
        "ad_type": ad_type,
        "gemini_ad_analysis": gemini_result
    }

    # Build the session context consumed by update_webcam_session,
    # end_webcam_session and the background processing thread
    session = {
        "start_time": time.time(),
        "frame_count": 0,
        "metrics_data": initial_metrics_df.copy(),
        "ad_context": ad_context,
        "csv_path": csv_path,
        "audio_path": audio_path,
        "save_interval": save_interval,
        "last_saved": 0,
        "gemini_model": gemini_model,
        "processing_thread": None
    }

    # Start background processing thread
    processor = threading.Thread(target=process_frames_in_background, args=(session,))
    processor.daemon = True
    processor.start()
    session["processing_thread"] = processor

    return session

def update_webcam_session(
    session: Dict[str, Any],
    frame: np.ndarray
) -> Tuple[np.ndarray, object, object, Dict[str, Any]]:
    """
    Update webcam session with a new frame

    Args:
        session: Session context dictionary
        frame: New frame from webcam

    Returns:
        Tuple of (annotated_frame, state_plot, metrics_plot, updated_session)
    """
    if session is None:
        return frame, None, None, session

    # Add task to processing queue
    processing_queue.put({
        'command': 'process',
        'frame': frame.copy() if frame is not None else None,
        'ad_context': session["ad_context"],
        'metrics_data': session["metrics_data"],
        'frame_count': session["frame_count"],
        'start_time': session["start_time"],
        'audio_path': session["audio_path"],
        'gemini_model': session["gemini_model"]
    })

    # Update frame count
    session["frame_count"] += 1

    # Get result if available
    try:
        result = results_queue.get_nowait()
        annotated_frame = result.get('annotated_frame', frame)
        state_fig = result.get('state_fig')
        metrics_fig = result.get('metrics_fig')
        session["metrics_data"] = result.get('metrics_df', session["metrics_data"])
        results_queue.task_done()
    except queue.Empty:
        # No result yet, return original frame
        annotated_frame = frame
        state_fig = None
        metrics_fig = None

    # Save CSV periodically
    if session["frame_count"] - session["last_saved"] >= session["save_interval"]:
        if not session["metrics_data"].empty:
            session["metrics_data"].to_csv(session["csv_path"], index=False)
        session["last_saved"] = session["frame_count"]

    return annotated_frame, state_fig, metrics_fig, session

def end_webcam_session(session: Dict[str, Any]) -> Tuple[str, str]:
    """
    End a webcam session and save the collected data

    Args:
        session: Session context dictionary

    Returns:
        Tuple of (csv_path, audio_path)
    """
    if session is None:
        return None, None

    # Stop background processing thread
    if session["processing_thread"] and session["processing_thread"].is_alive():
        processing_queue.put({"command": "stop"})
        session["processing_thread"].join(timeout=2.0)

    # Save final metrics to CSV
    if not session["metrics_data"].empty:
        session["metrics_data"].to_csv(session["csv_path"], index=False)

    print(f"Session ended. Data saved to {session['csv_path']}")
    return session["csv_path"], session["audio_path"]
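# Headless usage sketch (camera index and frame count are illustrative; the
# Gradio webcam tab below drives the same three functions from the browser):
def _example_headless_session(num_frames=100):
    session = start_webcam_session(ad_description="Demo ad", ad_type="Video")
    cap = cv2.VideoCapture(0)
    try:
        for _ in range(num_frames):
            ret, frame = cap.read()
            if not ret:
                break
            annotated, state_fig, metrics_fig, session = update_webcam_session(session, frame)
    finally:
        cap.release()
        csv_path, audio_path = end_webcam_session(session)
    return csv_path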
# --- Create Enhanced Gradio Interface ---
def create_api_interface():
    with gr.Blocks(title="Enhanced Facial Analysis APIs") as iface:
        gr.Markdown("# Enhanced Facial Analysis APIs\nAnalyze facial expressions and audio in videos or webcam feed")

        with gr.Tab("Video File API"):
            with gr.Row():
                with gr.Column(scale=1):
                    video_input = gr.Video(label="Upload Video")
                    vid_ad_desc = gr.Textbox(label="Ad Description")
                    vid_ad_detail = gr.Textbox(label="Ad Detail Focus")
                    vid_ad_type = gr.Radio(
                        ["Video", "Image", "Audio", "Text", "Funny", "Serious", "Action", "Informative"],
                        label="Ad Type/Genre",
                        value="Video"
                    )
                    sampling_rate = gr.Slider(
                        # ... (slider range settings unchanged, not shown in this diff) ...
                        label="Sampling Rate (process every N frames)"
                    )
                    save_video = gr.Checkbox(label="Save Processed Video", value=True)
                    process_btn = gr.Button("Process Video")

                with gr.Column(scale=2):
                    with gr.Row():
                        output_text = gr.Textbox(label="Processing Status")

                    with gr.Row():
                        output_video = gr.Video(label="Processed Video")

                    with gr.Row():
                        output_plot = gr.Plot(label="Metrics Visualization")
                        user_state_plot = gr.Plot(label="User State Analysis")

                    with gr.Row():
                        output_csv = gr.File(label="Download CSV Results")
                        output_audio = gr.Audio(label="Extracted Audio")

            # Define function to handle video processing with live updates
            def handle_video_processing(video, desc, detail, ad_type, rate, save_vid, progress=gr.Progress()):
                if video is None:
                    return "No video uploaded", None, None, None, None, None

                try:
                    progress(0.05, "Starting video processing...")

                    csv_path, audio_path, video_path, metrics_df = process_video_file(
                        video,
                        ad_description=desc,
                        ad_detail=detail,
                        ad_type=ad_type,
                        sampling_rate=rate,
                        save_processed_video=save_vid,
                        progress=progress
                    )

                    if metrics_df is None or metrics_df.empty:
                        return "No facial data detected in video", None, None, None, None, None

                    # Get a sample row for visualization
                    middle_idx = len(metrics_df) // 2
                    sample_row = metrics_df.iloc[middle_idx].to_dict()

                    # Generate visualizations (`metrics` and `audio_metrics` here are the
                    # module-level lists of facial and audio metric column names)
                    metrics_plot = update_metrics_visualization(
                        {k: v for k, v in sample_row.items() if k in metrics},
                        {k: v for k, v in sample_row.items() if k in audio_metrics},
                        title=f"Sample Frame Metrics (Frame {sample_row['frame_number']})"
                    )

                    state_plot = create_user_state_display(
                        sample_row.get('user_state', 'No state'),
                        sample_row.get('detailed_user_analysis', '')
                    )

                    processed_frames = metrics_df.shape[0]
                    total_duration = metrics_df['timestamp'].max() if not metrics_df.empty else 0

                    result_text = "✅ Processing complete!\n"
                    result_text += f"• Analyzed {processed_frames} frames over {total_duration:.2f} seconds\n"
                    result_text += f"• CSV saved to: {csv_path}\n"
                    if audio_path:
                        result_text += f"• Audio extracted to: {audio_path}\n"
                    if video_path:
                        result_text += f"• Processed video saved to: {video_path}\n"

                    return result_text, csv_path, video_path, audio_path, metrics_plot, state_plot
                except Exception as e:
                    return f"Error processing video: {str(e)}", None, None, None, None, None

            process_btn.click(
                handle_video_processing,
                inputs=[video_input, vid_ad_desc, vid_ad_detail, vid_ad_type, sampling_rate, save_video],
                outputs=[output_text, output_csv, output_video, output_audio, output_plot, user_state_plot]
            )

        with gr.Tab("Webcam API"):
            with gr.Row():
                with gr.Column(scale=1):
                    webcam_input = gr.Image(sources="webcam", streaming=True, label="Webcam Input", type="numpy")
                    web_ad_desc = gr.Textbox(label="Ad Description")
                    web_ad_detail = gr.Textbox(label="Ad Detail Focus")
                    web_ad_type = gr.Radio(
                        ["Video", "Image", "Audio", "Text", "Funny", "Serious", "Action", "Informative"],
                        label="Ad Type/Genre",
                        value="Video"
                    )
                    record_audio = gr.Checkbox(label="Record Audio", value=True)
                    start_session_btn = gr.Button("Start Session")
                    end_session_btn = gr.Button("End Session")

                with gr.Column(scale=2):
                    with gr.Row():
                        processed_output = gr.Image(label="Processed Feed", type="numpy")

                    with gr.Row():
                        metrics_plot = gr.Plot(label="Live Metrics")
                        state_plot = gr.Plot(label="User State Analysis")

                    with gr.Row():
                        session_status = gr.Textbox(label="Session Status")
                        download_csv = gr.File(label="Download Session Data")

            # Session state
            session_data = gr.State(value=None)

            # Define session handlers
            def start_session(desc, detail, ad_type, record_audio):
                try:
                    session = start_webcam_session(
                        ad_description=desc,
                        ad_detail=detail,
                        ad_type=ad_type,
                        record_audio=record_audio
                    )

                    status_text = "✅ Session started successfully!\n\n"
                    status_text += f"• Ad Context: {desc} ({ad_type})\n"
                    status_text += f"• Focus: {detail}\n"
                    status_text += f"• Audio Recording: {'Enabled' if record_audio else 'Disabled'}\n"
                    status_text += f"• Data will be saved to: {session['csv_path']}"

                    return session, status_text
                except Exception as e:
                    return None, f"Error starting session: {str(e)}"

            def process_frame(frame, session):
                if session is None or frame is None:
                    return frame, None, None, session

                try:
                    annotated_frame, state_fig, metrics_fig, updated_session = update_webcam_session(session, frame)
                    return annotated_frame, state_fig, metrics_fig, updated_session
                except Exception as e:
                    print(f"Error processing frame: {e}")
                    return frame, None, None, session

            def end_session(session):
                if session is None:
                    return "No active session", None

                try:
                    csv_path, audio_path = end_webcam_session(session)

                    status_text = "✅ Session ended successfully!\n\n"
                    status_text += f"• Data saved to: {csv_path}\n"
                    if audio_path:
                        status_text += f"• Audio saved to: {audio_path}"

                    return status_text, csv_path
                except Exception as e:
                    return f"Error ending session: {str(e)}", None

            start_session_btn.click(
                start_session,
                inputs=[web_ad_desc, web_ad_detail, web_ad_type, record_audio],
                outputs=[session_data, session_status]
            )

            webcam_input.stream(
                process_frame,
                inputs=[webcam_input, session_data],
                outputs=[processed_output, state_plot, metrics_plot, session_data]
            )

            end_session_btn.click(
                end_session,
                inputs=[session_data],
                outputs=[session_status, download_csv]
            )

    return iface

# Entry point
if __name__ == "__main__":
    print("Starting Enhanced Facial Analysis API server...")
    # Pre-initialize models if needed
    # initialize_audio_model()
    iface = create_api_interface()
    iface.launch(debug=True)
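# Note: when hosting this file as a Hugging Face Space, the default launch is
# usually sufficient; debug=True mainly adds verbose console logging, e.g.:
#
#     iface = create_api_interface()
#     iface.launch()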