Update app.py
app.py CHANGED
@@ -1,96 +1,130 @@
# app.py
import streamlit as st
import cv2
import numpy as np
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained import EncoderClassifier
import tempfile
import os

# 1. Load Models with caching
@st.cache_resource
def load_models():
    # Load ViT model for facial emotion detection
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

    # Load SpeechBrain model for audio emotion recognition
    audio_classifier = EncoderClassifier.from_hparams(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        savedir="pretrained_models/emotion-audio"
    )

    return vit_processor, vit_model, audio_classifier

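# Note: @st.cache_resource keeps the objects returned by load_models() in memory
# across Streamlit reruns, so the ViT and SpeechBrain checkpoints are loaded once
# per process rather than on every interaction. On the first run,
# EncoderClassifier.from_hparams() downloads the model files from the Hugging Face
# Hub into the local savedir given above.
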
# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze single frame using ViT model"""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]

def process_video(video_path, processor, model, audio_classifier):
    """Process video and return combined results"""
    # Extract audio from video
    video = mp.VideoFileClip(video_path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)

    # Analyze audio
    audio_signal = audio_classifier.load_audio(audio_path)
    audio_prediction = audio_classifier.classify_batch(audio_signal)
    audio_emotion = audio_prediction[3][0]

    # Analyze video frames
    cap = cv2.VideoCapture(video_path)
    emotions = []

    # Process every 5th frame to reduce computation
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % 5 == 0:  # Sample every 5th frame
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))

        frame_count += 1

    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file

    # Get most common visual emotion
    visual_emotion = max(set(emotions), key=emotions.count)

    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }

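# Note: classify_batch() on a SpeechBrain classifier returns a tuple of
# (output probabilities, best score, predicted index, text labels), which is why
# process_video() reads audio_prediction[3][0] to get the predicted emotion label
# as a string.
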
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")

st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])

if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save to temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up temporary video file

    # Display results
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])

    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show emotion timeline
    st.subheader("📊 Emotion Timeline")
    st.line_chart(
        data={"Frame Emotions": results['frame_emotions']},
        use_container_width=True
    )

    st.success("Analysis complete!")

# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🚀")
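The commit does not show the Space's requirements.txt. Based only on the imports above, a minimal sketch of the dependencies could look like the following (package pins and the choice of the headless OpenCV build are assumptions, not part of the commit; note that import moviepy.editor requires a moviepy 1.x release):

    streamlit
    opencv-python-headless
    numpy
    moviepy
    transformers
    torch
    speechbrain

With those installed, the app starts locally with: streamlit run app.py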