# app.py
import streamlit as st
import cv2
import numpy as np
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained.interfaces import foreign_class
import tempfile
import os


# 1. Load Models with caching
@st.cache_resource
def load_models():
    # Load ViT image classifier for the visual branch
    # (google/vit-base-patch16-224 is an ImageNet checkpoint; an emotion-fine-tuned
    # ViT checkpoint is needed to get actual facial-emotion labels)
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

    # Load SpeechBrain model for audio emotion recognition; this model ships a
    # custom classification interface, so it is loaded with foreign_class
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio"
    )
    return vit_processor, vit_model, audio_classifier


# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame with the ViT model and return the predicted label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]


def process_video(video_path, processor, model, audio_classifier):
    """Process a video and return combined audio/visual results."""
    # Extract the audio track to a temporary WAV file
    video = mp.VideoFileClip(video_path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()

    # Analyze audio: classify_file returns (out_prob, score, index, text_lab)
    audio_prediction = audio_classifier.classify_file(audio_path)
    audio_emotion = audio_prediction[3][0]

    # Analyze video frames
    cap = cv2.VideoCapture(video_path)
    emotions = []

    # Process every 5th frame to reduce computation
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:  # Sample every 5th frame
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file

    # Get most common visual emotion
    visual_emotion = max(set(emotions), key=emotions.count)

    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }
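
# A minimal usage sketch of the pieces above, outside the Streamlit UI, assuming a
# local clip named "sample.mp4" (hypothetical path; the first call to load_models
# downloads the checkpoints):
#
#   processor, model, classifier = load_models()
#   results = process_video("sample.mp4", processor, model, classifier)
#   print(results["audio_emotion"])    # single predicted label for the audio track
#   print(results["visual_emotion"])   # most frequent label across sampled frames
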
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")

st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader(
    "Upload a video file (max 30 seconds)",
    type=["mp4", "mov", "avi"]
)

if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save to temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up temporary video file

    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show emotion timeline
    st.subheader("📈 Emotion Timeline")
    st.line_chart(
        data={"Frame Emotions": results['frame_emotions']},
        use_container_width=True
    )

    st.success("Analysis complete!")

# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🎈")
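
# ---------------------------------------------------------------------------
# Running the app (a sketch, with assumed package names): launch it with
# `streamlit run app.py`. The imports above roughly correspond to
#
#   pip install streamlit opencv-python numpy "moviepy<2" transformers torch speechbrain
#
# where "moviepy<2" is assumed because the moviepy.editor module was removed in
# MoviePy 2.x. The first run also downloads the ViT and SpeechBrain checkpoints,
# so it needs network access.
# ---------------------------------------------------------------------------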