# app.py
import streamlit as st
import cv2
import numpy as np
import pandas as pd
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained import EncoderClassifier
import tempfile
import os
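
# Assumed setup (not stated in the original file): run with `streamlit run app.py`
# after installing the packages implied by the imports above, e.g.
#   pip install streamlit opencv-python-headless moviepy transformers torch speechbrain pandas
# Exact package names/versions are assumptions; note that `moviepy.editor` requires moviepy < 2.0.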
# 1. Load Models with caching
@st.cache_resource
def load_models():
    # Load ViT model for frame classification.
    # NOTE: google/vit-base-patch16-224 is ImageNet-pretrained, so its labels are
    # ImageNet classes, not emotions; for real facial emotion detection, swap in a
    # ViT checkpoint fine-tuned on a facial-expression dataset.
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

    # Load SpeechBrain model for audio emotion recognition
    audio_classifier = EncoderClassifier.from_hparams(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        savedir="pretrained_models/emotion-audio"
    )
    return vit_processor, vit_model, audio_classifier
# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame using the ViT model."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]
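
# Example (assumed quick local check, not part of the app flow; "face.jpg" is a placeholder path):
#   processor, model, _ = load_models()
#   img = cv2.cvtColor(cv2.imread("face.jpg"), cv2.COLOR_BGR2RGB)
#   print(analyze_frame(img, processor, model))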
def process_video(video_path, processor, model, audio_classifier):
    """Process the video and return combined audio/visual results."""
    # Extract the audio track into a temporary WAV file
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        video.close()
        raise ValueError("The uploaded video has no audio track.")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()

    # Analyze audio; classify_batch returns (probabilities, score, index, text_labels)
    audio_signal = audio_classifier.load_audio(audio_path)
    audio_prediction = audio_classifier.classify_batch(audio_signal)
    audio_emotion = audio_prediction[3][0]

    # Analyze video frames, sampling every 5th frame to reduce computation
    cap = cv2.VideoCapture(video_path)
    emotions = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:
            # OpenCV reads BGR; the ViT processor expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file

    # Most common visual emotion across the sampled frames
    visual_emotion = max(set(emotions), key=emotions.count)

    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")
st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")
uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])
if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save the upload to a temporary file so OpenCV and moviepy can read it from disk
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up temporary video file

    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show emotion timeline (st.line_chart needs numeric data, so plot category codes)
    st.subheader("📈 Emotion Timeline")
    frame_series = pd.Series(results['frame_emotions'], dtype="category")
    st.line_chart(frame_series.cat.codes, use_container_width=True)
    st.caption("Codes: " + ", ".join(f"{i} = {label}" for i, label in enumerate(frame_series.cat.categories)))

    st.success("Analysis complete!")
# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🎈")