# app.py
import streamlit as st
import cv2
import pandas as pd
import moviepy.editor as mp
import torch
from transformers import ViTImageProcessor, ViTForImageClassification
# foreign_class loads models that ship a custom interface; in SpeechBrain < 1.0
# the import path was speechbrain.pretrained.interfaces instead
from speechbrain.inference.interfaces import foreign_class
import tempfile
import os
# 1. Load models, cached so they are not re-downloaded on every Streamlit rerun
@st.cache_resource
def load_models():
    # ViT backbone for frame classification.
    # NOTE: this checkpoint is fine-tuned on ImageNet-1k, so id2label contains
    # object classes, not emotions; swap in a ViT checkpoint fine-tuned for
    # facial expressions to get labels like "happy" or "angry".
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
    # SpeechBrain model for audio emotion recognition. This model ships its own
    # interface class, so it is loaded via foreign_class rather than EncoderClassifier.
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio",
    )
    return vit_processor, vit_model, audio_classifier
# 2. Video processing functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame with the ViT model and return the top label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]
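
# Optional throughput sketch (an assumption, not part of the original flow): the
# image processor also accepts a list of frames, so sampled frames could be
# classified in one batched forward pass instead of one call per frame:
#
#   inputs = processor(images=frames, return_tensors="pt")  # frames: list of RGB arrays
#   with torch.no_grad():
#       logits = model(**inputs).logits
#   labels = [model.config.id2label[i.item()] for i in logits.argmax(-1)]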
def process_video(video_path, processor, model, audio_classifier):
    """Process the video and return combined audio/visual results."""
    # Extract the audio track to a temporary WAV file
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        video.close()
        raise ValueError("The uploaded video has no audio track.")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()

    # Analyze the audio; classify_file returns (out_prob, score, index, text_lab)
    out_prob, score, index, text_lab = audio_classifier.classify_file(audio_path)
    audio_emotion = text_lab[0]

    # Analyze video frames, sampling every 5th frame to reduce computation
    cap = cv2.VideoCapture(video_path)
    emotions = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:
            # OpenCV decodes frames as BGR; the ViT processor expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up the temporary audio file

    # Most common visual emotion across the sampled frames
    visual_emotion = max(set(emotions), key=emotions.count)
    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions,
    }
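
# Local smoke-test sketch (assumptions: a short "sample.mp4" with an audio track
# sits next to this file; it is left commented out because importing this module
# outside Streamlit would still execute the UI code below):
#
#   vit_processor, vit_model, audio_classifier = load_models()
#   print(process_video("sample.mp4", vit_processor, vit_model, audio_classifier))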
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")
st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])
if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save the upload to a temporary file so OpenCV/MoviePy can read it from disk
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models (cached across reruns)
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up the temporary video file

    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show the emotion timeline; st.line_chart needs numeric data, so map the
    # string labels to category codes and print a code-to-label legend below
    st.subheader("📈 Emotion Timeline")
    frame_series = pd.Series(results['frame_emotions'], dtype="category")
    st.line_chart(frame_series.cat.codes, use_container_width=True)
    st.caption(", ".join(f"{code} = {label}" for code, label in enumerate(frame_series.cat.categories)))

    st.success("Analysis complete!")
# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🎈")