# NOTE: removed non-code residue scraped from the Hugging Face Spaces page
# header ("Spaces: Sleeping") — it was not part of the program.
import os
from collections import Counter

import cv2
import moviepy.editor as mp
import numpy as np
import streamlit as st
import torch
from transformers import (
    ViTImageProcessor,
    ViTForImageClassification,
    pipeline,
)
# 1. Load Models
@st.cache_resource  # cache across Streamlit reruns so the models are loaded only once
def load_models():
    """Load the visual and audio emotion models.

    Returns:
        tuple: (vit_processor, vit_model, audio_analyzer) where the first two
        are the ViT image processor/classifier and the last is a Hugging Face
        audio-classification pipeline.
    """
    # Visual model — NOTE(review): this is the generic ImageNet-1k ViT
    # checkpoint, not an emotion-specific one; confirm it is intended.
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
    # Audio emotion-recognition model (IEMOCAP-trained wav2vec2)
    audio_analyzer = pipeline(
        "audio-classification",
        model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
    )
    return vit_processor, vit_model, audio_analyzer
# 2. Processing Functions
def analyze_frame(frame, processor, model):
    """Classify one RGB frame and return the predicted emotion label."""
    encoded = processor(images=frame, return_tensors="pt")
    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    predicted_idx = logits.argmax(-1).item()
    return model.config.id2label[predicted_idx]
def process_video(video_path, processor, model, audio_analyzer, frame_step=1):
    """Analyze a video's audio track and frames for emotion.

    Args:
        video_path: path to the video file on disk.
        processor: ViT image processor.
        model: ViT classification model.
        audio_analyzer: Hugging Face audio-classification pipeline.
        frame_step: analyze every Nth frame (default 1 = every frame,
            matching the original behavior; raise it to speed up long videos).

    Returns:
        dict: {'audio': top audio emotion label,
               'visual': most frequent frame label, or None if no frames
               could be decoded}.
    """
    # Extract the audio track to a temporary WAV for the audio pipeline.
    audio_path = "temp_audio.wav"
    video = mp.VideoFileClip(video_path)
    try:
        video.audio.write_audiofile(audio_path)
    finally:
        video.close()  # release ffmpeg reader handles (was leaked before)

    try:
        audio_result = audio_analyzer(audio_path)
        audio_emotion = max(audio_result, key=lambda x: x['score'])['label']
    finally:
        # Remove the temp WAV even if the analyzer raises (was leaked before).
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Analyze video frames.
    cap = cv2.VideoCapture(video_path)
    emotions = []
    try:
        idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if idx % frame_step == 0:
                # OpenCV decodes BGR; the ViT processor expects RGB.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                emotions.append(analyze_frame(frame, processor, model))
            idx += 1
    finally:
        cap.release()  # release even if frame analysis raises

    # Counter avoids the O(n^2) set/count scan and the ValueError the old
    # max(set(...)) raised when no frames were decoded.
    counts = Counter(emotions)
    return {
        'audio': audio_emotion,
        'visual': counts.most_common(1)[0][0] if counts else None,
    }
# 3. Streamlit UI
# (emoji below were mojibake in the scraped source — restored to real glyphs)
st.title("Video Sentiment Analyzer 🎥")
st.markdown("""
Analyze emotions from:
- Facial expressions (ViT model)
- Audio tone (wav2vec2 model)
""")
uploaded_file = st.file_uploader("Upload video (max 200MB)", type=["mp4", "avi"])
if uploaded_file:
    # Persist the upload to disk so OpenCV/moviepy can open it by path.
    with open("temp_video.mp4", "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Load models (cached by Streamlit if load_models is decorated).
    vit_processor, vit_model, audio_analyzer = load_models()
    # Process video
    with st.spinner("Analyzing video..."):
        result = process_video(
            "temp_video.mp4",
            vit_processor,
            vit_model,
            audio_analyzer
        )
    # Display results side by side.
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Emotion", result['audio'])
    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", result['visual'])
    st.success("Analysis complete!")