import os
import cv2
import sounddevice as sd
import mediapipe as mp
import numpy as np
import pandas as pd
import librosa
import threading
import time
import csv
from collections import deque

# --- Configuration ---
SAMPLE_RATE = 16000
AUDIO_CHANNELS = 1
AUDIO_BLOCK_SIZE = 1024            # Samples per audio callback block (see sd.InputStream below)
BUFFER_DURATION_SECONDS = 10       # Keep last 10s of data
PROCESSING_INTERVAL_SECONDS = 4.0
CSV_FILENAME = "metrics_log.csv"

# --- Buffers (use thread-safe versions if needed) ---
frame_buffer = deque(maxlen=int(BUFFER_DURATION_SECONDS * 30))      # Assuming ~30fps
frame_timestamps = deque(maxlen=int(BUFFER_DURATION_SECONDS * 30))
# Audio buffers hold one entry per callback block, not one per sample
audio_buffer = deque(maxlen=int(BUFFER_DURATION_SECONDS * SAMPLE_RATE / AUDIO_BLOCK_SIZE))
audio_timestamps = deque(maxlen=int(BUFFER_DURATION_SECONDS * SAMPLE_RATE / AUDIO_BLOCK_SIZE))  # Timestamps per chunk

# --- MediaPipe Setup ---
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True,  # Crucial for iris/pupil
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

# --- Placeholder Functions (Requires detailed implementation) ---

def analyze_video_window(frames, timestamps):
    print(f"Analyzing {len(frames)} frames...")
    # TODO:
    # - Run MediaPipe Face Mesh + Iris on each frame
    # - Extract face presence, landmarks, blink status, pupil data per frame
    # - Aggregate: % face detected, avg emotion scores (if using FER), avg pupil proxy, total blinks
    # - Return aggregated features
    blink_count = np.random.randint(0, 5)              # Placeholder
    avg_pupil_proxy = np.random.rand()                 # Placeholder
    face_detected_ratio = np.random.rand()             # Placeholder
    avg_valence_proxy = (np.random.rand() - 0.5) * 2   # Placeholder [-1, 1]
    avg_arousal_proxy_face = np.random.rand()          # Placeholder [0, 1]
    return {
        "blink_count": blink_count,
        "avg_pupil_proxy": avg_pupil_proxy,
        "face_detected_ratio": face_detected_ratio,
        "avg_valence_proxy": avg_valence_proxy,
        "avg_arousal_proxy_face": avg_arousal_proxy_face,
    }
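
# --- Example: filling in the analyze_video_window TODO ----------------------
# A minimal sketch (not wired into the pipeline) of how blink counting, face
# presence and a pupil-size proxy could be computed with the Face Mesh instance
# above. The eye-landmark indices, the EAR threshold of 0.21 and the names
# `example_video_features` / `_eye_aspect_ratio` are illustrative assumptions,
# not values mandated by MediaPipe; valence/arousal proxies would still need a
# separate expression model (e.g. FER).
EYE_IDX_RIGHT = [33, 160, 158, 133, 153, 144]   # Assumed EAR points, right eye
EYE_IDX_LEFT = [362, 385, 387, 263, 373, 380]   # Assumed EAR points, left eye
EAR_BLINK_THRESHOLD = 0.21                      # Tunable assumption

def _eye_aspect_ratio(lm, idx):
    """EAR = (|p2-p6| + |p3-p5|) / (2 * |p1-p4|) on normalized landmark coords."""
    pts = np.array([[lm[i].x, lm[i].y] for i in idx])
    vertical = np.linalg.norm(pts[1] - pts[5]) + np.linalg.norm(pts[2] - pts[4])
    horizontal = np.linalg.norm(pts[0] - pts[3])
    return vertical / (2.0 * horizontal + 1e-9)

def example_video_features(frames):
    """Sketch: aggregate blink count, face-detected ratio and pupil proxy."""
    blink_count, frames_with_face = 0, 0
    pupil_proxies = []
    eye_was_closed = False
    for frame in frames:
        results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if not results.multi_face_landmarks:
            continue
        frames_with_face += 1
        lm = results.multi_face_landmarks[0].landmark
        ear = 0.5 * (_eye_aspect_ratio(lm, EYE_IDX_RIGHT) +
                     _eye_aspect_ratio(lm, EYE_IDX_LEFT))
        eye_closed = ear < EAR_BLINK_THRESHOLD
        if eye_closed and not eye_was_closed:
            blink_count += 1  # Count the closing edge of each blink
        eye_was_closed = eye_closed
        # With refine_landmarks=True the mesh has 478 points; 468-477 are iris
        # landmarks. The horizontal extent of one iris serves as a crude,
        # uncalibrated pupil-size proxy.
        if len(lm) > 472:
            pupil_proxies.append(abs(lm[469].x - lm[471].x))
    return {
        "blink_count": blink_count,
        "face_detected_ratio": frames_with_face / max(len(frames), 1),
        "avg_pupil_proxy": float(np.mean(pupil_proxies)) if pupil_proxies else 0.0,
    }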

def analyze_audio_window(audio_chunks, timestamps):
    if not audio_chunks:
        return {"avg_rms": 0, "avg_pitch": 0}  # Default
    print(f"Analyzing {len(audio_chunks)} audio chunks...")
    # TODO:
    # - Concatenate chunks carefully based on timestamps / expected samples
    # - Run librosa: calculate RMS, pitch (e.g. pyin), maybe pauses
    # - Return aggregated features
    full_audio = np.concatenate(audio_chunks).ravel()  # Flatten to 1-D mono
    avg_rms = np.sqrt(np.mean(full_audio ** 2))  # Basic RMS
    # Pitch estimation can be computationally expensive:
    # f0, voiced_flag, voiced_probs = librosa.pyin(
    #     full_audio, fmin=librosa.note_to_hz('C2'),
    #     fmax=librosa.note_to_hz('C7'), sr=SAMPLE_RATE)
    # avg_pitch = np.nanmean(f0[voiced_flag]) if np.any(voiced_flag) else 0
    avg_pitch = np.random.randint(80, 300)  # Placeholder
    return {"avg_rms": avg_rms, "avg_pitch": avg_pitch}

def calculate_final_metrics(video_features, audio_features):
    # TODO: Combine features into the final 0-1 metrics.
    # This requires defining heuristics or a simple model based on the features.
    valence = (video_features.get("avg_valence_proxy", 0) + 1) / 2  # Normalize [-1, 1] to [0, 1]

    # Combine multiple arousal indicators (weights are examples)
    arousal_face = video_features.get("avg_arousal_proxy_face", 0)
    arousal_voice_rms = min(audio_features.get("avg_rms", 0) * 10, 1.0)  # Scale RMS
    arousal_pupil = video_features.get("avg_pupil_proxy", 0.5)  # Assuming pupil proxy is 0-1
    arousal = 0.4 * arousal_face + 0.3 * arousal_voice_rms + 0.3 * arousal_pupil

    # Simple proxy; could add logic based on blink-rate deviations, gaze stability etc.
    engagement = video_features.get("face_detected_ratio", 0)

    # Stress based on negative valence and high arousal
    stress = max(0, (1.0 - valence) * arousal)  # Example heuristic

    # Cognitive load based on blink rate and pupil dilation
    blink_rate = video_features.get("blink_count", 0) / PROCESSING_INTERVAL_SECONDS
    norm_blink_rate = min(blink_rate, 1.0)  # Normalize to expected range (~0-1 Hz)
    cog_load = 0.5 * arousal_pupil + 0.5 * norm_blink_rate  # Example heuristic

    return {
        "Timestamp": time.strftime('%Y-%m-%d %H:%M:%S'),
        "Valence": round(valence, 3),
        "Arousal": round(arousal, 3),
        "Engagement_Proxy": round(engagement, 3),
        "Stress_Proxy": round(stress, 3),
        "Cognitive_Load_Proxy": round(cog_load, 3),
        "Blink_Rate_Hz": round(blink_rate, 3),
        "Pupil_Size_Proxy": round(video_features.get("avg_pupil_proxy", 0), 3),
        # --- Exclude Traits ---
    }

def log_to_csv(filename, metrics_dict):
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=metrics_dict.keys())
        if not file_exists:
            writer.writeheader()  # Write header only once
        writer.writerow(metrics_dict)

# --- Capture Threads (Simplified Example - Needs proper implementation) ---
video_active = True
audio_active = True

def video_capture_thread():
    cap = cv2.VideoCapture(0)
    while video_active:
        ret, frame = cap.read()
        if ret:
            ts = time.time()
            # Make copies to avoid issues if the buffer processes the frame later
            frame_buffer.append(frame.copy())
            frame_timestamps.append(ts)
        time.sleep(1 / 30.0)  # Limit capture rate
    cap.release()
    print("Video thread stopped.")

def audio_capture_callback(indata, frames, time_info, status):
    """Called (from a separate thread) for each audio block."""
    if status:
        print(status)
    ts = time.time()  # Timestamp the arrival of the chunk
    # Make copies to avoid issues if the buffer processes the chunk later
    audio_buffer.append(indata.copy())
    audio_timestamps.append(ts)  # Add timestamp for the chunk

def audio_capture_thread():
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=AUDIO_CHANNELS,
                        blocksize=AUDIO_BLOCK_SIZE, callback=audio_capture_callback):
        print("Audio stream started. Press Ctrl+C to stop.")
        while audio_active:
            sd.sleep(1000)  # Keep thread alive while the stream is running
    print("Audio thread stopped.")
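
# --- Example: lock-guarded timestamped buffer --------------------------------
# A minimal sketch of one way to address the "Needs thread safety - locks!"
# note in the main loop below. It is not wired into the pipeline: to use it,
# the frame/audio deques above would be replaced by instances of this class.
# The class and method names are illustrative, not part of any library API.
class TimestampedBuffer:
    """Bounded deque of (timestamp, item) pairs; appends and reads share a lock."""

    def __init__(self, maxlen):
        self._items = deque(maxlen=maxlen)
        self._lock = threading.Lock()

    def append(self, item, ts=None):
        with self._lock:
            self._items.append((ts if ts is not None else time.time(), item))

    def window(self, start_ts, end_ts):
        """Return items whose timestamps fall in [start_ts, end_ts)."""
        with self._lock:
            return [item for ts, item in self._items if start_ts <= ts < end_ts]

    def drop_before(self, cutoff_ts):
        """Discard items older than cutoff_ts (e.g. data already processed)."""
        with self._lock:
            self._items = deque((p for p in self._items if p[0] >= cutoff_ts),
                                maxlen=self._items.maxlen)

# Hypothetical usage:
#   frame_store = TimestampedBuffer(maxlen=int(BUFFER_DURATION_SECONDS * 30))
#   frame_store.append(frame.copy())                   # in the capture thread
#   frames_in_window = frame_store.window(start, end)  # in the main loop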

# --- Main Processing Logic ---
if __name__ == "__main__":
    print("Starting capture threads...")
    vid_thread = threading.Thread(target=video_capture_thread, daemon=True)
    aud_thread = threading.Thread(target=audio_capture_thread, daemon=True)
    vid_thread.start()
    aud_thread.start()

    last_process_time = time.time()
    try:
        while True:
            current_time = time.time()
            if current_time - last_process_time >= PROCESSING_INTERVAL_SECONDS:
                print(f"\n--- Processing window ending {time.strftime('%H:%M:%S')} ---")
                window_end_time = current_time
                window_start_time = window_end_time - PROCESSING_INTERVAL_SECONDS

                # --- Get data for the window (Needs thread safety - locks!) ---
                # This part is tricky: efficiently select items in the timestamp range.
                # Simple non-thread-safe example:
                frames_in_window = [f for f, ts in zip(list(frame_buffer), list(frame_timestamps))
                                    if window_start_time <= ts < window_end_time]
                audio_in_window = [a for a, ts in zip(list(audio_buffer), list(audio_timestamps))
                                   if window_start_time <= ts < window_end_time]
                # In practice, you'd remove processed items from the buffer

                if not frames_in_window:
                    print("No frames in window, skipping.")
                    last_process_time = current_time  # Or += PROCESSING_INTERVAL_SECONDS
                    continue

                # --- Analyze ---
                video_features = analyze_video_window(frames_in_window, [])  # Pass timestamps if needed
                audio_features = analyze_audio_window(audio_in_window, [])   # Pass timestamps if needed

                # --- Calculate & Log ---
                final_metrics = calculate_final_metrics(video_features, audio_features)
                print("Calculated Metrics:", final_metrics)
                log_to_csv(CSV_FILENAME, final_metrics)

                last_process_time = current_time  # Reset timer accurately

            time.sleep(0.1)  # Prevent busy-waiting

    except KeyboardInterrupt:
        print("Stopping...")
        video_active = False
        audio_active = False
        # Wait for threads to finish
        vid_thread.join(timeout=2.0)
        # Audio thread stops when sd.sleep ends / the stream closes
        print("Done.")