# app.py
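# A Streamlit app for multimodal video sentiment analysis: facial emotion from
# sampled frames (ViT) plus speech emotion from the audio track (SpeechBrain).
# Assumed setup (hypothetical pins, untested): speechbrain < 1.0 for the
# speechbrain.pretrained import path, moviepy 1.x for moviepy.editor, e.g.
#   pip install streamlit opencv-python-headless moviepy==1.0.3 transformers torch speechbrain==0.5.16 pandas
# Run with:
#   streamlit run app.py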
import streamlit as st
import cv2
import numpy as np
import pandas as pd
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained.interfaces import foreign_class
import tempfile
import os
# 1. Load models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    # Load ViT model for frame classification.
    # NOTE: this base checkpoint is trained on ImageNet classes, not facial
    # emotions; a ViT checkpoint fine-tuned on a facial-expression dataset
    # should be substituted for meaningful emotion labels.
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
    # Load the SpeechBrain model for audio emotion recognition. This model
    # ships a custom interface, so it is loaded via foreign_class rather
    # than EncoderClassifier (per the model card).
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio"
    )
    return vit_processor, vit_model, audio_classifier
# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame with the ViT model and return the top label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]
def process_video(video_path, processor, model, audio_classifier):
    """Process a video and return combined audio/visual results."""
    # Extract the audio track from the video
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        video.close()
        raise ValueError("The uploaded video has no audio track.")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()
    # Analyze audio; classify_file returns (out_prob, score, index, text_lab),
    # so index 3 holds the predicted label strings
    audio_prediction = audio_classifier.classify_file(audio_path)
    audio_emotion = audio_prediction[3][0]
    # Analyze video frames, sampling every 5th frame to reduce computation
    cap = cv2.VideoCapture(video_path)
    emotions = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:
            # OpenCV reads BGR; the ViT processor expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file
    if not emotions:
        raise ValueError("No frames could be read from the video.")
    # Most common emotion across the sampled frames
    visual_emotion = max(set(emotions), key=emotions.count)
    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")
st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")
uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])
if uploaded_file:
    # Display video preview
    st.video(uploaded_file)
    # Save the upload to a temporary file so OpenCV/MoviePy can read it from disk
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name
    # Load models
    vit_processor, vit_model, audio_classifier = load_models()
    # Process video; show an error instead of crashing if analysis fails
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        except Exception as e:
            st.error(f"Analysis failed: {e}")
            st.stop()
        finally:
            os.unlink(video_path)  # Clean up temporary video file
    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])
    # Show the emotion timeline; st.line_chart cannot plot string labels,
    # so encode the per-frame emotions as category codes
    st.subheader("📊 Emotion Timeline")
    emotion_series = pd.Series(results['frame_emotions'], dtype="category")
    st.line_chart(emotion_series.cat.codes, use_container_width=True)
    st.caption("Codes: " + ", ".join(f"{code} = {label}" for code, label in enumerate(emotion_series.cat.categories)))
    st.success("Analysis complete!")
# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) π€ & [Streamlit](https://streamlit.io/) π") |