import streamlit as st
import cv2
import numpy as np
import moviepy.editor as mp
from transformers import (
    ViTImageProcessor, 
    ViTForImageClassification,
    pipeline
)
import torch
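
# Usage (assumptions: this script is saved as app.py and ffmpeg is available on
# the system path for audio extraction/decoding):
#   streamlit run app.py
# Python dependencies: streamlit, opencv-python, numpy, moviepy, transformers, torch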

# 1. Load Models
@st.cache_resource
def load_models():
    # Visual model: a ViT image classifier.
    # NOTE: the base 'google/vit-base-patch16-224' checkpoint predicts ImageNet object
    # classes rather than emotions, so a publicly available face-expression fine-tune
    # ('trpakov/vit-face-expression') is substituted here; it works best on
    # face-centered frames, since no face detection/cropping is performed below.
    vit_processor = ViTImageProcessor.from_pretrained('trpakov/vit-face-expression')
    vit_model = ViTForImageClassification.from_pretrained('trpakov/vit-face-expression')
    
    # Audio model.
    # NOTE: the SpeechBrain checkpoint 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP'
    # is not in transformers format and cannot be loaded through pipeline(); a
    # transformers-format IEMOCAP emotion model (SUPERB ER task) is substituted instead.
    audio_analyzer = pipeline(
        "audio-classification",
        model="superb/wav2vec2-base-superb-er"
    )
    return vit_processor, vit_model, audio_analyzer

# 2. Processing Functions
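# analyze_frame: classify one RGB frame with the ViT model and return the label
# whose logit is highest.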
def analyze_frame(frame, processor, model):
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]
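
# process_video: extract the audio track and classify its overall emotion, then
# classify sampled video frames and report the most frequent (majority-vote)
# frame-level label.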

def process_video(video_path, processor, model, audio_analyzer):
    # Extract the audio track to a temporary WAV file
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        raise ValueError("The uploaded video has no audio track.")
    audio_path = "temp_audio.wav"
    video.audio.write_audiofile(audio_path)
    video.close()
    
    # Analyze audio
    audio_result = audio_analyzer(audio_path)
    audio_emotion = max(audio_result, key=lambda x: x['score'])['label']
    
    # Analyze video frames, sampling at a fixed stride so ViT inference stays
    # tractable on longer clips (classifying every frame would be very slow)
    cap = cv2.VideoCapture(video_path)
    emotions = []
    frame_stride = 30  # ~1 analyzed frame per second for 30 fps video
    frame_idx = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % frame_stride == 0:
            # OpenCV returns BGR; the ViT processor expects RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame, processor, model))
        frame_idx += 1

    cap.release()
    return {
        'audio': audio_emotion,
        'visual': max(set(emotions), key=emotions.count)
    }

# 3. Streamlit UI
st.title("Video Sentiment Analyzer πŸŽ₯")
st.markdown("""
Analyze emotions from:
- Facial expressions (ViT model)
- Audio tone (wav2vec2 model)
""")

uploaded_file = st.file_uploader("Upload video (max 200MB)", type=["mp4", "avi"])

if uploaded_file:
    # Save the upload to a temp file on disk (OpenCV and moviepy need a file path)
    with open("temp_video.mp4", "wb") as f:
        f.write(uploaded_file.getbuffer())
    
    # Load models
    vit_processor, vit_model, audio_analyzer = load_models()
    
    # Process video
    with st.spinner("Analyzing video..."):
        result = process_video(
            "temp_video.mp4",
            vit_processor,
            vit_model,
            audio_analyzer
        )
    
    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Emotion", result['audio'])
    
    with col2:
        st.subheader("πŸ‘οΈ Visual Analysis")
        st.metric("Dominant Emotion", result['visual'])
    
    st.success("Analysis complete!")