Update app.py
app.py CHANGED
@@ -1,96 +1,130 @@
# app.py
import streamlit as st
import cv2
import numpy as np
import moviepy.editor as mp
from transformers import ViTImageProcessor, ViTForImageClassification
import torch
from speechbrain.pretrained import EncoderClassifier
import tempfile
import os

# 1. Load Models with caching
@st.cache_resource
def load_models():
    # Load ViT model for facial emotion detection
    vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
    vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

    # Load SpeechBrain model for audio emotion recognition
    audio_classifier = EncoderClassifier.from_hparams(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        savedir="pretrained_models/emotion-audio"
    )

    return vit_processor, vit_model, audio_classifier

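# Note: @st.cache_resource keeps the objects returned by load_models() in memory
# across Streamlit reruns, so the ViT and SpeechBrain checkpoints are loaded once
# per process rather than on every interaction. On the first run,
# EncoderClassifier.from_hparams() downloads the model files from the Hugging Face
# Hub into the local savedir given above.
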
# 2. Video Processing Functions
def analyze_frame(frame, processor, model):
    """Analyze single frame using ViT model"""
    inputs = processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return model.config.id2label[outputs.logits.argmax(-1).item()]

def process_video(video_path, processor, model, audio_classifier):
    """Process video and return combined results"""
    # Extract audio from video
    video = mp.VideoFileClip(video_path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
        audio_path = tmp_audio.name
    video.audio.write_audiofile(audio_path)

    # Analyze audio
    audio_signal = audio_classifier.load_audio(audio_path)
    audio_prediction = audio_classifier.classify_batch(audio_signal)
    audio_emotion = audio_prediction[3][0]

    # Analyze video frames
    cap = cv2.VideoCapture(video_path)
    emotions = []

    # Process every 5th frame to reduce computation
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % 5 == 0:  # Sample every 5th frame
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            emotions.append(analyze_frame(frame_rgb, processor, model))

        frame_count += 1

    cap.release()
    os.unlink(audio_path)  # Clean up temporary audio file

    # Get most common visual emotion
    visual_emotion = max(set(emotions), key=emotions.count)

    return {
        'audio_emotion': audio_emotion,
        'visual_emotion': visual_emotion,
        'frame_emotions': emotions
    }

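# Note: classify_batch() on a SpeechBrain classifier returns a tuple of
# (output probabilities, best score, predicted index, text labels), which is why
# process_video() reads audio_prediction[3][0] to get the predicted emotion label
# as a string.
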
# 3. Streamlit UI
st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")

st.title("🎥 Video Sentiment Analysis")
st.markdown("""
Analyze emotions from:
- **Facial Expressions** using ViT (Vision Transformer)
- **Speech Tone** using wav2vec2
""")

uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])

if uploaded_file:
    # Display video preview
    st.video(uploaded_file)

    # Save to temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(uploaded_file.getbuffer())
        video_path = tmp_video.name

    # Load models
    vit_processor, vit_model, audio_classifier = load_models()

    # Process video
    with st.spinner("Analyzing video content..."):
        try:
            results = process_video(video_path, vit_processor, vit_model, audio_classifier)
        finally:
            os.unlink(video_path)  # Clean up temporary video file

    # Display results
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("🎧 Audio Analysis")
        st.metric("Dominant Emotion", results['audio_emotion'])

    with col2:
        st.subheader("👁️ Visual Analysis")
        st.metric("Dominant Emotion", results['visual_emotion'])

    # Show emotion timeline
    st.subheader("📊 Emotion Timeline")
    st.line_chart(
        data={"Frame Emotions": results['frame_emotions']},
        use_container_width=True
    )

    st.success("Analysis complete!")

# Footer
st.markdown("---")
st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🚀")
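The commit does not show the Space's requirements.txt. Based only on the imports above, a minimal sketch of the dependencies could look like the following (package pins and the choice of the headless OpenCV build are assumptions, not part of the commit; note that import moviepy.editor requires a moviepy 1.x release):

    streamlit
    opencv-python-headless
    numpy
    moviepy
    transformers
    torch
    speechbrain

With those installed, the app starts locally with: streamlit run app.py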