# app.py
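# Run with: streamlit run app.py
# Assumed dependencies (not pinned anywhere in the original): streamlit,
# opencv-python, pandas, moviepy<2 (for the moviepy.editor import),
# transformers, torch, and speechbrain.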
import streamlit as st
import cv2
import pandas as pd
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
# This SpeechBrain model ships its own interface class, loaded via foreign_class
from speechbrain.pretrained.interfaces import foreign_class
import tempfile
import os

# 1. Load Models with caching
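# st.cache_resource keeps one instance of the models per server process, so
# downloads and initialization happen on the first run, not on every rerun.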
@st.cache_resource
def load_models():
    # Load a ViT model for facial emotion detection. The original checkpoint,
    # 'google/vit-base-patch16-224', is ImageNet-pretrained and predicts object
    # classes rather than emotions; a ViT fine-tuned for facial expression
    # recognition is assumed here instead (any such checkpoint would work).
    vit_processor = ViTImageProcessor.from_pretrained('trpakov/vit-face-expression')
    vit_model = ViTForImageClassification.from_pretrained('trpakov/vit-face-expression')
    
    # Load the SpeechBrain model for audio emotion recognition; its model card
    # documents loading through foreign_class with the bundled custom interface
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio",
    )
    
    return vit_processor, vit_model, audio_classifier

# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze single frame using ViT model"""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]

def process_video(video_path, processor, model, audio_classifier):
    """Process video and return combined results"""
    # Extract the audio track (the SpeechBrain model expects 16 kHz input)
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        video.close()
        raise ValueError("The uploaded video has no audio track.")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path, fps=16000, logger=None)
    video.close()
    
    # Analyze audio; classify_file returns (out_prob, score, index, text_lab)
    out_prob, score, index, text_lab = audio_classifier.classify_file(audio_path)
    audio_emotion = text_lab[0]
    
    # Analyze video frames
    cap = cv2.VideoCapture(video_path)
    emotions = []
    
    # Process every 5th frame to reduce computation
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
            
        if frame_count % 5 == 0:  # Sample every 5th frame
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
            
        frame_count += 1
    
    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file
    
    # Take the most common visual emotion across the sampled frames
    if not emotions:
        raise ValueError("No frames could be read from the video.")
    visual_emotion = max(set(emotions), key=emotions.count)
    
    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }

# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")

st.title("πŸŽ₯ Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])

if uploaded_file:
    # Display video preview
    st.video(uploaded_file)
    
    # Save to temporary file
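    # delete=False keeps the file on disk after the handle closes; it is
    # removed explicitly once processing below finishes.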
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name
    
    # Load models
    vit_processor, vit_model, audio_classifier = load_models()
    
    # Process the video, surfacing failures (e.g. a missing audio track)
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        except Exception as e:
            st.error(f"Analysis failed: {e}")
            st.stop()
        finally:
            os.unlink(video_path)  # Clean up temporary video file
    
    # Display results
    col1, col2 = st.columns(2)
    
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])
    
    # Show emotion timeline. st.line_chart needs numeric data, so map each
    # label to an integer code and plot the codes over the sampled frames.
    st.subheader("📈 Emotion Timeline")
    labels = sorted(set(results['frame_emotions']))
    codes = [labels.index(e) for e in results['frame_emotions']]
    st.line_chart(pd.DataFrame({"emotion": codes}), use_container_width=True)
    st.caption("Codes: " + ", ".join(f"{i} = {lab}" for i, lab in enumerate(labels)))
    
    st.success("Analysis complete!")

# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) πŸ€— & [Streamlit](https://streamlit.io/) 🎈")