Sagnik1750 committed on
Commit
69ca061
·
verified ·
1 Parent(s): 096df1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -99
app.py CHANGED
@@ -1,111 +1,164 @@
1
- import gradio as gr
2
- import cv2
3
  import torch
 
 
 
4
  import numpy as np
5
- import mediapipe as mp
6
  import matplotlib.pyplot as plt
7
- import seaborn as sns
8
- from facenet_pytorch import MTCNN
9
- from transformers import AutoFeatureExtractor, AutoModelForImageClassification
10
- from PIL import Image
11
- import os
12
- from collections import Counter
13
-
14
- # Load models
15
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
16
- mtcnn = MTCNN(device=device)
17
- model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression").to(device)
18
- extractor = AutoFeatureExtractor.from_pretrained("trpakov/vit-face-expression")
19
-
20
- # Emotion labels
21
- affectnet_labels = {
22
- 0: "neutral", 1: "happy", 2: "sad", 3: "surprise", 4: "fear",
23
- 5: "disgust", 6: "anger", 7: "contempt"
 
 
24
  }
25
 
26
- def detect_emotions(frame):
27
- """Detects facial emotions in a given frame."""
28
- img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
29
- faces, _ = mtcnn.detect(img)
30
- if faces is None or len(faces) == 0:
31
- return "No Face Detected"
32
-
33
- face = img.crop(faces[0])
34
- inputs = extractor(images=face, return_tensors="pt").to(device)
35
- outputs = model(**inputs)
36
- probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
37
-
38
- return model.config.id2label[torch.argmax(probs).item()]
39
-
40
- def process_video(input_path):
41
- """Processes video, overlays emotions, and creates a summary chart."""
42
- cap = cv2.VideoCapture(input_path)
43
- fps = int(cap.get(cv2.CAP_PROP_FPS))
44
- frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
45
- out = cv2.VideoWriter("output_video.mp4", cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
46
-
47
- emotion_counts = []
48
-
49
  while cap.isOpened():
50
  ret, frame = cap.read()
51
  if not ret:
52
  break
53
-
54
- emotion = detect_emotions(frame)
55
- emotion_counts.append(emotion)
56
-
57
- # Overlay emotion
58
- overlay = frame.copy()
59
- cv2.rectangle(overlay, (10, 10), (350, 80), (255, 255, 255), -1)
60
- cv2.putText(overlay, f'Emotion: {emotion}', (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
61
- cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)
62
-
63
- out.write(frame)
64
 
65
  cap.release()
66
- out.release()
67
- cv2.destroyAllWindows()
68
-
69
- # Find major emotion
70
- emotion_counter = Counter(emotion_counts)
71
- major_emotion = emotion_counter.most_common(1)[0][0] if emotion_counter else "No Face Detected"
72
-
73
- # Generate emotion distribution pie chart
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  plt.figure(figsize=(5, 5))
75
- labels, sizes = zip(*emotion_counter.items())
76
- plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
77
- plt.title("Emotion Distribution")
78
- plt.savefig("emotion_distribution.jpg")
79
-
80
- return "output_video.mp4", plt, major_emotion
81
-
82
- # Gradio Web Interface
83
- with gr.Blocks(css="""
84
- .gradio-container { max-width: 750px !important; margin: auto; background-color: #f8f9fa; padding: 20px; border-radius: 15px; }
85
- .gradio-container h1 { font-size: 22px; text-align: center; color: #333; }
86
- .gradio-container .gr-button { background-color: #007bff; color: white; border-radius: 10px; padding: 8px 15px; }
87
- .gradio-container .gr-textbox { font-size: 16px; font-weight: bold; color: #007bff; }
88
- .gradio-container .gr-file { border-radius: 10px; padding: 5px; }
89
- @media screen and (max-width: 768px) {
90
- .gradio-container { width: 100%; padding: 10px; }
91
- .gradio-container h1 { font-size: 18px; }
92
- }
93
- """) as demo:
94
- gr.Markdown("# 🎭 Emotion Analysis from Video 🎥")
95
- gr.Markdown("Upload a video, and the AI will detect emotions in each frame, providing a processed video, an emotion distribution chart, and the major detected emotion.")
96
-
97
- with gr.Row():
98
- video_input = gr.File(label="📤 Upload Video (MP4, MOV, AVI)")
99
-
100
- with gr.Row():
101
- process_button = gr.Button("🚀 Analyze")
102
-
103
- with gr.Row():
104
- video_output = gr.File(label="📥 Processed Video")
105
- emotion_chart = gr.Plot(label="📊 Emotion Distribution Chart")
106
-
107
- major_emotion_output = gr.Textbox(label="🔥 Major Emotion Detected", interactive=False)
108
-
109
- process_button.click(fn=process_video, inputs=video_input, outputs=[video_output, emotion_chart, major_emotion_output])
110
-
111
- demo.launch()
 
 
 
 
1
  import torch
2
+ import torchaudio
3
+ import cv2
4
+ import librosa
5
  import numpy as np
6
+ import gradio as gr
7
  import matplotlib.pyplot as plt
8
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSequenceClassification
9
+ from deepface import DeepFace
10
+ from moviepy.editor import VideoFileClip
11
+
12
# --- Pretrained models (both are placed on the CPU) ---

# Speech-to-text: Wav2Vec2 CTC checkpoint together with its processor.
asr_model_name = "facebook/wav2vec2-large-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
asr_model = asr_model.to("cpu")

# Text emotion classifier applied to the transcribed speech.
emotion_model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
emotion_model = emotion_model.to("cpu")

# Emotion categories keyed by integer id (0..7).
# NOTE(review): this ordering is hand-written and is not read from either
# model's config - confirm before using it to decode model outputs.
emotion_labels = dict(
    enumerate(
        (
            "Neutral",
            "Happy",
            "Sad",
            "Surprise",
            "Fear",
            "Disgust",
            "Anger",
            "Contempt",
        )
    )
)
28
 
29
+ # --- Extract Audio from Video ---
30
def extract_audio(video_path, audio_output_path="temp_audio.wav"):
    """Write the audio track of a video to a 16-bit PCM WAV file.

    Args:
        video_path: Path of the video file to read.
        audio_output_path: Destination path of the WAV file.

    Returns:
        ``audio_output_path``, so the call can be chained.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        # Fail with a clear message instead of an AttributeError on None.
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_output_path, codec="pcm_s16le")
    finally:
        # Always release the resources held by the clip, even on failure.
        video.close()
    return audio_output_path
34
+
35
+ # --- Extract Frames for Facial & Posture Analysis ---
36
def extract_frames(video_path, interval=10):
    """Return every ``interval``-th frame of a video, converted to RGB.

    Frames 0, ``interval``, ``2 * interval``, ... are kept. Only the kept
    frames are colour-converted and stored, so memory use grows with the
    number of sampled frames rather than with the length of the video.

    Args:
        video_path: Path of the video file to read.
        interval: Sampling step in frames; must be at least 1.

    Returns:
        A list of RGB frames (arrays as returned by ``cv2.cvtColor``).

    Raises:
        ValueError: If ``interval`` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        frame_index = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % interval == 0:
                # OpenCV decodes to BGR; convert only the frames we keep.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_index += 1
    finally:
        # Release the capture even if decoding or conversion raises.
        cap.release()

    return frames
49
+
50
+ # --- Normalize Emotion Percentages to 100% ---
51
def normalize_emotion_percentages(emotion_counts):
    """Convert raw emotion counts into one-decimal percentages.

    Each count is expressed as a percentage of the total and rounded to one
    decimal place. Because rounding can make the total drift from 100, the
    difference is added to the largest entry, which is then rounded again so
    it stays a clean one-decimal value (no ``33.400000000000006`` artefacts).

    Args:
        emotion_counts: Mapping of emotion label to a non-negative count.

    Returns:
        A new dict with the same keys mapping to percentages. If the counts
        sum to zero (or the mapping is empty), every key maps to ``0``.
    """
    print("Raw emotion counts:", emotion_counts)  # Debugging
    total = sum(emotion_counts.values())
    if total <= 0:
        return {k: 0 for k in emotion_counts}

    normalized_counts = {
        k: round((v / total) * 100, 1) for k, v in emotion_counts.items()
    }

    # Push the rounding drift onto the dominant emotion.
    total_after = sum(normalized_counts.values())
    if total_after != 100:
        max_emotion = max(normalized_counts, key=normalized_counts.get)
        normalized_counts[max_emotion] = round(
            normalized_counts[max_emotion] + (100 - total_after), 1
        )

    print("Normalized emotion counts:", normalized_counts)  # Debugging
    return normalized_counts
68
+
69
+ # --- Facial Emotion Analysis ---
70
def analyze_facial_emotion(frames):
    """Estimate the distribution of facial emotions over a set of frames.

    Each frame is analysed with DeepFace and its dominant emotion is counted
    against the categories in ``emotion_labels``. Frames that fail analysis
    are skipped (best effort) and reported on stdout.

    Args:
        frames: Iterable of image arrays. They are assumed to be RGB, as
            produced by ``extract_frames`` - confirm for other callers.

    Returns:
        The result of ``normalize_emotion_percentages`` applied to the
        per-category counts.
    """
    # DeepFace names this class "angry"; the label table here says "Anger".
    # Without the alias every angry frame would be silently dropped.
    label_aliases = {"Angry": "Anger"}
    emotion_counts = {key: 0 for key in emotion_labels.values()}

    for frame in frames:
        try:
            # DeepFace documents array input as BGR, so undo the RGB conversion.
            bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            result = DeepFace.analyze(
                bgr_frame, actions=["emotion"], enforce_detection=False
            )
            # Newer DeepFace versions return a list (one entry per face),
            # older ones a single dict; accept both.
            faces = result if isinstance(result, list) else [result]
            if not faces:
                continue
            detected_emotion = faces[0]["dominant_emotion"].capitalize()
        except Exception as exc:
            # Deliberate best effort: one bad frame must not abort the run.
            print("Skipping frame, emotion analysis failed:", exc)  # Debugging
            continue

        detected_emotion = label_aliases.get(detected_emotion, detected_emotion)
        print("Detected emotion:", detected_emotion)  # Debugging
        if detected_emotion in emotion_counts:
            emotion_counts[detected_emotion] += 1

    return normalize_emotion_percentages(emotion_counts)
84
+
85
+ # --- Speech-to-Text ---
86
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the Wav2Vec2 CTC model.

    The audio is loaded at 16 kHz, passed through ``asr_processor`` and
    ``asr_model``, and the highest-scoring token at each step is decoded.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription of the audio as a string.
    """
    target_rate = 16000
    waveform, _ = librosa.load(audio_path, sr=target_rate)
    features = asr_processor(
        waveform, return_tensors="pt", sampling_rate=target_rate
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = asr_model(features.input_values).logits

    token_ids = logits.argmax(dim=-1)
    transcripts = asr_processor.batch_decode(token_ids)
    return transcripts[0]
95
+
96
+ # --- Sentiment Analysis from Text ---
97
def analyze_audio_emotion(text):
    """Classify the emotion expressed in a piece of (transcribed) text.

    The predicted class id is decoded with the classifier's own
    ``config.id2label`` mapping. The module-level ``emotion_labels`` table
    describes facial-emotion categories and does not correspond to this
    model's output classes, so it must not be used here.

    Args:
        text: Text to classify.

    Returns:
        A ``(predicted_emotion, probabilities)`` tuple: the capitalised label
        of the top class, and the softmax probabilities over the model's
        classes as a list of floats.
    """
    inputs = emotion_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits

    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    predicted_id = torch.argmax(logits, dim=-1).item()
    # Capitalised to match the label style used elsewhere in the app.
    predicted_emotion = emotion_model.config.id2label[predicted_id].capitalize()

    return predicted_emotion, probabilities
106
+
107
+ # --- Full Analysis Pipeline ---
108
def analyze_video(video_path):
    """Run the full speech + facial emotion analysis for one video.

    Steps: extract and transcribe the audio track, classify the emotion of
    the transcription, sample frames, compute the facial emotion
    distribution, and render that distribution as a pie chart.

    Args:
        video_path: Path of the uploaded video file.

    Returns:
        A 5-tuple matching the Gradio outputs:
        ``(transcription, audio_emotion, final_emotion, label_scores,
        chart_path)`` where ``label_scores`` maps each facial emotion to a
        fraction in ``[0, 1]`` (the scale ``gr.Label`` expects) and
        ``chart_path`` is the path of the saved pie chart image.
    """
    import os
    import tempfile

    # Use a per-call temporary WAV file so overlapping requests cannot
    # overwrite each other's audio, and remove it once transcribed.
    audio_fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(audio_fd)
    try:
        extract_audio(video_path, audio_path)
        transcription = transcribe_audio(audio_path)
    finally:
        try:
            os.remove(audio_path)
        except OSError:
            pass  # Nothing useful to do if the temp file is already gone.

    # Sentiment of the spoken text (per-class probabilities are not shown).
    audio_emotion, _ = analyze_audio_emotion(transcription)

    # Facial emotion distribution, as percentages per category.
    frames = extract_frames(video_path)
    facial_emotions = analyze_facial_emotion(frames)

    final_emotion = max(facial_emotions, key=facial_emotions.get) if facial_emotions else "Neutral"

    # Draw only the emotions that actually occurred: zero-sized wedges just
    # stack unreadable labels, and an all-zero input cannot be drawn as a pie.
    detected = {k: v for k, v in facial_emotions.items() if v > 0}
    fig, ax = plt.subplots(figsize=(5, 5))
    try:
        if detected:
            ax.pie(
                list(detected.values()),
                labels=list(detected.keys()),
                autopct="%1.1f%%",
                colors=plt.cm.Paired.colors,
            )
        else:
            ax.text(0.5, 0.5, "No emotions detected", ha="center", va="center")
            ax.axis("off")
        ax.set_title("Facial Emotion Distribution")

        # Unique file per call; it is left on disk for Gradio to read.
        chart_fd, chart_path = tempfile.mkstemp(suffix=".png")
        os.close(chart_fd)
        fig.savefig(chart_path)
    finally:
        # Figures created via pyplot stay registered until closed.
        plt.close(fig)

    # gr.Label renders confidences given as fractions, not percentages.
    label_scores = {k: v / 100 for k, v in facial_emotions.items()}

    return (
        transcription,
        audio_emotion,
        final_emotion,
        label_scores,
        chart_path,
    )
138
+
139
+ # --- Gradio UI ---
140
# Custom stylesheet for the page, assembled line by line.
theme_css = "\n".join(
    [
        "",
        "body { font-family: Arial, sans-serif; background: #f4f4f4; }",
        ".gradio-container { max-width: 800px; margin: auto; padding: 20px; background: white; border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }",
        ".gr-box { border-radius: 10px; padding: 15px; background: #fff; }",
        "h1 { color: #333; text-align: center; }",
        "",
    ]
)

# Output widgets, in the same order as the tuple returned by analyze_video.
output_components = [
    gr.Textbox(label="Transcribed Speech"),
    gr.Textbox(label="Predicted Audio Emotion"),
    gr.Textbox(label="Major Detected Emotion (Face + Posture)"),
    gr.Label(label="Facial Emotion Distribution"),
    gr.Image(label="Facial Emotion Pie Chart"),
]

# Text shown under the page title.
interface_description = (
    "📌 Upload a video and get analyzed emotions from "
    "**facial expressions, posture, and voice** in one step.\n\n"
    "🚀 Features:\n"
    "- Facial Emotion Analysis\n"
    "- Audio-Based Sentiment Detection\n"
    "- Real-Time Processing\n"
    "- Visual Pie Chart Representation"
)

# Single-function interface: one video in, five results out.
interface = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=output_components,
    title="🎭 Multi-Modal Emotion Analysis",
    description=interface_description,
    theme="compact",
    css=theme_css,
)

interface.launch()
164
+