# app.py
import streamlit as st
import cv2
import pandas as pd
import moviepy.editor as mp
import torch
from transformers import ViTImageProcessor, ViTForImageClassification
# foreign_class loads models that ship a custom interface; in SpeechBrain < 1.0
# the import path was speechbrain.pretrained.interfaces instead
from speechbrain.inference.interfaces import foreign_class
import tempfile
import os
# 1. Load models, cached so they are not re-downloaded on every Streamlit rerun
@st.cache_resource
def load_models():
    # ViT backbone for frame classification.
    # NOTE: this checkpoint is fine-tuned on ImageNet-1k, so id2label contains
    # object classes, not emotions; swap in a ViT checkpoint fine-tuned for
    # facial expressions to get labels like "happy" or "angry".
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
    # SpeechBrain model for audio emotion recognition. This model ships its own
    # interface class, so it is loaded via foreign_class rather than EncoderClassifier.
    audio_classifier = foreign_class(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        pymodule_file="custom_interface.py",
        classname="CustomEncoderWav2vec2Classifier",
        savedir="pretrained_models/emotion-audio",
    )
    return vit_processor, vit_model, audio_classifier
# 2. Video processing functions
def analyze_frame(frame, processor, model):
    """Analyze a single RGB frame with the ViT model and return the top label."""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]
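
# Optional throughput sketch (an assumption, not part of the original flow): the
# image processor also accepts a list of frames, so sampled frames could be
# classified in one batched forward pass instead of one call per frame:
#
#   inputs = processor(images=frames, return_tensors="pt")  # frames: list of RGB arrays
#   with torch.no_grad():
#       logits = model(**inputs).logits
#   labels = [model.config.id2label[i.item()] for i in logits.argmax(-1)]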
def process_video(video_path, processor, model, audio_classifier):
    """Process the video and return combined audio/visual results."""
    # Extract the audio track to a temporary WAV file
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        video.close()
        raise ValueError("The uploaded video has no audio track.")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)
    video.close()

    # Analyze the audio; classify_file returns (out_prob, score, index, text_lab)
    out_prob, score, index, text_lab = audio_classifier.classify_file(audio_path)
    audio_emotion = text_lab[0]

    # Analyze video frames, sampling every 5th frame to reduce computation
    cap = cv2.VideoCapture(video_path)
    emotions = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 5 == 0:
            # OpenCV decodes frames as BGR; the ViT processor expects RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))
        frame_count += 1
    cap.release()
    os.unlink(audio_path)  # Clean up the temporary audio file

    # Most common visual emotion across the sampled frames
    visual_emotion = max(set(emotions), key=emotions.count)
    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions,
    }
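
# Local smoke-test sketch (assumptions: a short "sample.mp4" with an audio track
# sits next to this file; it is left commented out because importing this module
# outside Streamlit would still execute the UI code below):
#
#   vit_processor, vit_model, audio_classifier = load_models()
#   print(process_video("sample.mp4", vit_processor, vit_model, audio_classifier))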
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")
st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])
if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save the upload to a temporary file so OpenCV/MoviePy can read it from disk
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models (cached across reruns)
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up the temporary video file

    # Display results
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show the emotion timeline; st.line_chart needs numeric data, so map the
    # string labels to category codes and print a code-to-label legend below
    st.subheader("📈 Emotion Timeline")
    frame_series = pd.Series(results['frame_emotions'], dtype="category")
    st.line_chart(frame_series.cat.codes, use_container_width=True)
    st.caption(", ".join(f"{code} = {label}" for code, label in enumerate(frame_series.cat.categories)))

    st.success("Analysis complete!")
# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🎈")