# app.py
import streamlit as st
import cv2
import numpy as np
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained.interfaces import foreign_class
import tempfile
import os


# 1. Load Models with caching
@st.cache_resource
def load_models():
    # Load ViT image classifier for the visual branch
    # (google/vit-base-patch16-224 is an ImageNet checkpoint; an emotion-fine-tuned
    # ViT checkpoint is needed to get actual facial-emotion labels)
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

    # Load SpeechBrain model for audio emotion recognition; this model ships a
    # custom classification interface, so it is loaded with foreign_class
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio"
    )
    return vit_processor, vit_model, audio_classifier


# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame with the ViT model and return the predicted label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]


def process_video(video_path, processor, model, audio_classifier):
    """Process a video and return combined audio/visual results."""
    # Extract the audio track to a temporary WAV file
    video = mp.VideoFileClip(video_path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()

    # Analyze audio: classify_file returns (out_prob, score, index, text_lab)
    audio_prediction = audio_classifier.classify_file(audio_path)
    audio_emotion = audio_prediction[3][0]

    # Analyze video frames
    cap = cv2.VideoCapture(video_path)
    emotions = []

    # Process every 5th frame to reduce computation
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:  # Sample every 5th frame
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file

    # Get most common visual emotion
    visual_emotion = max(set(emotions), key=emotions.count)

    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }
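
# A minimal usage sketch of the pieces above, outside the Streamlit UI, assuming a
# local clip named "sample.mp4" (hypothetical path; the first call to load_models
# downloads the checkpoints):
#
#   processor, model, classifier = load_models()
#   results = process_video("sample.mp4", processor, model, classifier)
#   print(results["audio_emotion"])    # single predicted label for the audio track
#   print(results["visual_emotion"])   # most frequent label across sampled frames
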
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")

st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader(
    "Upload a video file (max 30 seconds)",
    type=["mp4", "mov", "avi"]
)

if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save to temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up temporary video file

    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show emotion timeline
    st.subheader("📈 Emotion Timeline")
    st.line_chart(
        data={"Frame Emotions": results['frame_emotions']},
        use_container_width=True
    )

    st.success("Analysis complete!")

# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🎈")
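
# ---------------------------------------------------------------------------
# Running the app (a sketch, with assumed package names): launch it with
# `streamlit run app.py`. The imports above roughly correspond to
#
#   pip install streamlit opencv-python numpy "moviepy<2" transformers torch speechbrain
#
# where "moviepy<2" is assumed because the moviepy.editor module was removed in
# MoviePy 2.x. The first run also downloads the ViT and SpeechBrain checkpoints,
# so it needs network access.
# ---------------------------------------------------------------------------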