Sagnik1750 committed (verified)
Commit f49a563
1 Parent(s): 1c501c3

Update app.py

Files changed (1)
  1. app.py +76 -42
app.py CHANGED
@@ -1,96 +1,130 @@
 
  import streamlit as st
  import cv2
  import numpy as np
  import moviepy.editor as mp
- from transformers import (
-     ViTImageProcessor,
-     ViTForImageClassification,
-     pipeline
- )
  import torch

- # 1. Load Models
  @st.cache_resource
  def load_models():
-     # Visual model
      vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
      vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

-     # Audio model
-     audio_analyzer = pipeline(
-         "audio-classification",
-         model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
      )
-     return vit_processor, vit_model, audio_analyzer

- # 2. Processing Functions
  def analyze_frame(frame, processor, model):
      inputs = processor(images=frame, return_tensors="pt")
      with torch.no_grad():
          outputs = model(**inputs)
      return model.config.id2label[outputs.logits.argmax(-1).item()]

- def process_video(video_path, processor, model, audio_analyzer):
-     # Extract audio
      video = mp.VideoFileClip(video_path)
-     audio_path = "temp_audio.wav"
-     video.audio.write_audiofile(audio_path)

      # Analyze audio
-     audio_result = audio_analyzer(audio_path)
-     audio_emotion = max(audio_result, key=lambda x: x['score'])['label']

      # Analyze video frames
      cap = cv2.VideoCapture(video_path)
      emotions = []

      while cap.isOpened():
          ret, frame = cap.read()
-         if not ret: break
-         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-         emotions.append(analyze_frame(frame, processor, model))

      cap.release()

      return {
-         'audio': audio_emotion,
-         'visual': max(set(emotions), key=emotions.count)
      }

  # 3. Streamlit UI
- st.title("Video Sentiment Analyzer 🎥")
  st.markdown("""
  Analyze emotions from:
- - Facial expressions (ViT model)
- - Audio tone (wav2vec2 model)
  """)

- uploaded_file = st.file_uploader("Upload video (max 200MB)", type=["mp4", "avi"])

  if uploaded_file:
-     # Save to temp file
-     with open("temp_video.mp4", "wb") as f:
-         f.write(uploaded_file.getbuffer())

      # Load models
-     vit_processor, vit_model, audio_analyzer = load_models()

      # Process video
-     with st.spinner("Analyzing video..."):
-         result = process_video(
-             "temp_video.mp4",
-             vit_processor,
-             vit_model,
-             audio_analyzer
-         )

      # Display results
      col1, col2 = st.columns(2)
      with col1:
          st.subheader("🎧 Audio Analysis")
-         st.metric("Emotion", result['audio'])

      with col2:
          st.subheader("👁️ Visual Analysis")
-         st.metric("Dominant Emotion", result['visual'])

-     st.success("Analysis complete!")

+ # app.py
  import streamlit as st
  import cv2
  import numpy as np
  import moviepy.editor as mp
+ from transformers import ViTImageProcessor, ViTForImageClassification
  import torch
+ from speechbrain.pretrained import EncoderClassifier
+ import tempfile
+ import os

+ # 1. Load Models with caching
  @st.cache_resource
  def load_models():
+     # Load ViT model for facial emotion detection
      vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
      vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

+     # Load SpeechBrain model for audio emotion recognition
+     audio_classifier = EncoderClassifier.from_hparams(
+         source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
+         savedir="pretrained_models/emotion-audio"
      )
+
+     return vit_processor, vit_model, audio_classifier

+ # 2. Video Processing Functions
  def analyze_frame(frame, processor, model):
+     """Analyze single frame using ViT model"""
      inputs = processor(images=frame, return_tensors="pt")
      with torch.no_grad():
          outputs = model(**inputs)
      return model.config.id2label[outputs.logits.argmax(-1).item()]

+ def process_video(video_path, processor, model, audio_classifier):
+     """Process video and return combined results"""
+     # Extract audio from video
      video = mp.VideoFileClip(video_path)
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
+         audio_path = tmp_audio.name
+     video.audio.write_audiofile(audio_path)

      # Analyze audio
+     audio_signal = audio_classifier.load_audio(audio_path)
+     audio_prediction = audio_classifier.classify_batch(audio_signal)
+     audio_emotion = audio_prediction[3][0]

      # Analyze video frames
      cap = cv2.VideoCapture(video_path)
      emotions = []

+     # Process every 5th frame to reduce computation
+     frame_count = 0
      while cap.isOpened():
          ret, frame = cap.read()
+         if not ret:
+             break
+
+         if frame_count % 5 == 0:  # Sample every 5th frame
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             emotions.append(analyze_frame(frame_rgb, processor, model))
+
+         frame_count += 1

      cap.release()
+     os.unlink(audio_path)  # Clean up temporary audio file
+
+     # Get most common visual emotion
+     visual_emotion = max(set(emotions), key=emotions.count)
+
      return {
+         'audio_emotion': audio_emotion,
+         'visual_emotion': visual_emotion,
+         'frame_emotions': emotions
      }

  # 3. Streamlit UI
+ st.set_page_config(page_title="Video Sentiment Analyzer", layout="wide")
+
+ st.title("🎥 Video Sentiment Analysis")
  st.markdown("""
  Analyze emotions from:
+ - **Facial Expressions** using ViT (Vision Transformer)
+ - **Speech Tone** using wav2vec2
  """)

+ uploaded_file = st.file_uploader("Upload a video file (max 30 seconds)", type=["mp4", "mov", "avi"])

  if uploaded_file:
+     # Display video preview
+     st.video(uploaded_file)
+
+     # Save to temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
+         tmp_video.write(uploaded_file.getbuffer())
+         video_path = tmp_video.name

      # Load models
+     vit_processor, vit_model, audio_classifier = load_models()

      # Process video
+     with st.spinner("Analyzing video content..."):
+         try:
+             results = process_video(video_path, vit_processor, vit_model, audio_classifier)
+         finally:
+             os.unlink(video_path)  # Clean up temporary video file

      # Display results
      col1, col2 = st.columns(2)
+
      with col1:
          st.subheader("🎧 Audio Analysis")
+         st.metric("Dominant Emotion", results['audio_emotion'])

      with col2:
          st.subheader("👁️ Visual Analysis")
+         st.metric("Dominant Emotion", results['visual_emotion'])
+
+     # Show emotion timeline
+     st.subheader("📈 Emotion Timeline")
+     st.line_chart(
+         data={"Frame Emotions": results['frame_emotions']},
+         use_container_width=True
+     )

+     st.success("Analysis complete!")
+
+ # Footer
+ st.markdown("---")
+ st.markdown("Built with [Hugging Face](https://huggingface.co/) 🤗 & [Streamlit](https://streamlit.io/) 🎈")
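For readers unfamiliar with the SpeechBrain inference API that the new app.py relies on, below is a minimal standalone sketch (not part of this commit) of the audio path. It assumes the same model source and savedir as app.py and uses only the calls the commit itself makes; "sample.wav" is a placeholder filename. classify_batch returns a tuple (out_prob, score, index, text_lab), which is why app.py reads audio_prediction[3][0] to get the predicted emotion label.

    # Standalone sketch; assumes the EncoderClassifier interface used in app.py
    # loads this model as written, and "sample.wav" is a placeholder audio file.
    from speechbrain.pretrained import EncoderClassifier

    classifier = EncoderClassifier.from_hparams(
        source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
        savedir="pretrained_models/emotion-audio",
    )
    signal = classifier.load_audio("sample.wav")
    out_prob, score, index, text_lab = classifier.classify_batch(signal)
    print(text_lab[0])  # predicted label, e.g. "neu", "hap", "ang", or "sad"

With the dependencies installed (streamlit, opencv-python, moviepy, transformers, torch, speechbrain), the app itself is launched with `streamlit run app.py`.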