# app.py
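# A Streamlit app for multimodal video sentiment analysis: facial emotion from
# sampled frames (ViT) plus speech emotion from the audio track (SpeechBrain).
# Assumed setup (hypothetical pins, untested): speechbrain < 1.0 for the
# speechbrain.pretrained import path, moviepy 1.x for moviepy.editor, e.g.
#   pip install streamlit opencv-python-headless moviepy==1.0.3 transformers torch speechbrain==0.5.16 pandas
# Run with:
#   streamlit run app.py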
import streamlit as st
import cv2
import numpy as np
import pandas as pd
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained.interfaces import foreign_class
import tempfile
import os
# 1. Load models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    # Load ViT model for frame classification.
    # NOTE: this base checkpoint is trained on ImageNet classes, not facial
    # emotions; a ViT checkpoint fine-tuned on a facial-expression dataset
    # should be substituted for meaningful emotion labels.
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
    # Load the SpeechBrain model for audio emotion recognition. This model
    # ships a custom interface, so it is loaded via foreign_class rather
    # than EncoderClassifier (per the model card).
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio"
    )
    return vit_processor, vit_model, audio_classifier
# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame with the ViT model and return the top label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]
def process_video(video_path, processor, model, audio_classifier):
    """Process a video and return combined audio/visual results."""
    # Extract the audio track from the video
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        video.close()
        raise ValueError("The uploaded video has no audio track.")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()
    # Analyze audio; classify_file returns (out_prob, score, index, text_lab),
    # so index 3 holds the predicted label strings
    audio_prediction = audio_classifier.classify_file(audio_path)
    audio_emotion = audio_prediction[3][0]
    # Analyze video frames, sampling every 5th frame to reduce computation
    cap = cv2.VideoCapture(video_path)
    emotions = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:
            # OpenCV reads BGR; the ViT processor expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file
    if not emotions:
        raise ValueError("No frames could be read from the video.")
    # Most common emotion across the sampled frames
    visual_emotion = max(set(emotions), key=emotions.count)
    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")
st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")
uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])
if uploaded_file:
    # Display video preview
    st.video(uploaded_file)
    # Save the upload to a temporary file so OpenCV/MoviePy can read it from disk
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name
    # Load models
    vit_processor, vit_model, audio_classifier = load_models()
    # Process video; show an error instead of crashing if analysis fails
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        except Exception as e:
            st.error(f"Analysis failed: {e}")
            st.stop()
        finally:
            os.unlink(video_path)  # Clean up temporary video file
    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])
    # Show the emotion timeline; st.line_chart cannot plot string labels,
    # so encode the per-frame emotions as category codes
    st.subheader("📊 Emotion Timeline")
    emotion_series = pd.Series(results['frame_emotions'], dtype="category")
    st.line_chart(emotion_series.cat.codes, use_container_width=True)
    st.caption("Codes: " + ", ".join(f"{code} = {label}" for code, label in enumerate(emotion_series.cat.categories)))
    st.success("Analysis complete!")
# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) π€ & [Streamlit](https://streamlit.io/) π") |