import streamlit as st
import cv2
import moviepy.editor as mp  # requires moviepy < 2.0; v2 dropped the editor module
import torch
from transformers import (
    ViTImageProcessor,
    ViTForImageClassification,
    pipeline,
)

# 1. Load Models
@st.cache_resource
def load_models():
    # Visual model: a ViT checkpoint fine-tuned for facial expressions.
    # (The stock 'google/vit-base-patch16-224' predicts ImageNet classes,
    # not emotions; 'trpakov/vit-face-expression' is one publicly hosted
    # FER fine-tune -- substitute any emotion-classification ViT you prefer.)
    vit_processor = ViTImageProcessor.from_pretrained('trpakov/vit-face-expression')
    vit_model = ViTForImageClassification.from_pretrained('trpakov/vit-face-expression')

    # Audio model: a transformers-native wav2vec2 emotion classifier.
    # (The SpeechBrain IEMOCAP checkpoint cannot be loaded through the
    # transformers pipeline API; it needs SpeechBrain's own interface.)
    audio_analyzer = pipeline(
        "audio-classification",
        model="superb/wav2vec2-base-superb-er"
    )
    return vit_processor, vit_model, audio_analyzer


# 2. Processing Functions
def analyze_frame(frame, processor, model):
    """Classify a single RGB frame and return the predicted label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]


def process_video(video_path, processor, model, audio_analyzer):
    # Extract the audio track to a temporary WAV file
    video = mp.VideoFileClip(video_path)
    audio_emotion = "no audio track"
    if video.audio is not None:
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path, logger=None)
        # The pipeline returns a list of {'label': ..., 'score': ...} dicts
        audio_result = audio_analyzer(audio_path)
        audio_emotion = max(audio_result, key=lambda x: x['score'])['label']
    video.close()

    # Analyze video frames, sampling roughly one frame per second
    # (running ViT on every frame is prohibitively slow)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30
    step = max(int(fps), 1)
    emotions = []
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % step == 0:
            # OpenCV decodes to BGR; the ViT processor expects RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame, processor, model))
        frame_idx += 1
    cap.release()

    return {
        'audio': audio_emotion,
        # Most frequent label across the sampled frames
        'visual': max(set(emotions), key=emotions.count) if emotions else "no frames read",
    }


# 3. Streamlit UI
st.title("Video Sentiment Analyzer 🎥")
st.markdown("""
Analyze emotions from:
- Facial expressions (ViT model)
- Audio tone (wav2vec2 model)
""")

uploaded_file = st.file_uploader("Upload video (max 200MB)", type=["mp4", "avi"])

if uploaded_file:
    # Save the upload to a temp file so OpenCV/moviepy can read it from disk
    with open("temp_video.mp4", "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Load models (cached across reruns by st.cache_resource)
    vit_processor, vit_model, audio_analyzer = load_models()

    # Process video
    with st.spinner("Analyzing video..."):
        result = process_video(
            "temp_video.mp4", vit_processor, vit_model, audio_analyzer
        )

    # Display results side by side
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Emotion", result['audio'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", result['visual'])
    st.success("Analysis complete!")
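
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the script is saved as app.py, and the package
# list below is one working setup, not the only one):
#
#   pip install streamlit opencv-python-headless "moviepy<2" transformers torch
#   streamlit run app.py
#
# Streamlit serves the app on http://localhost:8501 by default. The first run
# also downloads the two Hugging Face checkpoints, so expect a short delay.
# The audio pipeline decodes files via ffmpeg, so having ffmpeg installed
# system-wide is the safest bet.
# ---------------------------------------------------------------------------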