import gradio as gr
import torch
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import librosa
from collections import Counter
# Initialize model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()  # inference only; disables dropout
# Define emotion labels
EMOTION_LABELS = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise",
}
def preprocess_audio(audio, orig_sr=16000):
    """Enhanced audio preprocessing; returns a float32 mono signal at 16 kHz"""
    try:
        audio = np.asarray(audio)
        # Scale integer PCM (Gradio's microphone delivers int16) to [-1, 1]
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        else:
            audio = audio.astype(np.float32)
        # Convert to mono if stereo
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.T)
        # Resample to 16 kHz if needed
        if orig_sr != 16000:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=16000)
        # Apply preprocessing steps
        # 1. Pre-emphasis to boost high frequencies
        audio = librosa.effects.preemphasis(audio)
        # 2. Normalize peak amplitude
        audio = librosa.util.normalize(audio)
        # 3. Voice activity detection: keep only non-silent intervals
        intervals = librosa.effects.split(audio, top_db=20)
        if len(intervals) > 0:
            audio = np.concatenate([audio[start:end] for start, end in intervals])
        # 4. Ensure minimum length (1 second at 16 kHz)
        if len(audio) < 16000:
            audio = np.pad(audio, (0, 16000 - len(audio)))
        # 5. Take the center 3 seconds if too long
        if len(audio) > 48000:  # 3 seconds at 16 kHz
            center = len(audio) // 2
            audio = audio[center - 24000:center + 24000]
        return audio
    except Exception as e:
        print(f"Preprocessing error: {e}")
        return None
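# A minimal, commented-out sanity check for preprocess_audio (it assumes the
# orig_sr keyword used above): feed a 0.5 s, 440 Hz tone through the pipeline
# and confirm it is padded up to the 1-second minimum.
#
#   t = np.linspace(0, 0.5, 8000, endpoint=False)
#   tone = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
#   out = preprocess_audio(tone, orig_sr=16000)
#   assert out is not None and len(out) == 16000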
def get_emotion_history():
    """Get emotion detection history (stored on the function object)"""
    if not hasattr(get_emotion_history, "history"):
        get_emotion_history.history = []
    return get_emotion_history.history
def process_audio(audio):
    """Process an audio chunk and return the detected emotions"""
    if audio is None:
        return "No audio input detected"
    try:
        # Gradio streams numpy audio as a (sample_rate, data) tuple
        if isinstance(audio, tuple):
            sample_rate, audio = audio
        else:
            sample_rate = 16000
        # Preprocess audio
        processed_audio = preprocess_audio(audio, orig_sr=sample_rate)
        if processed_audio is None:
            return "Audio preprocessing failed"
        if np.max(np.abs(processed_audio)) < 0.01:
            return "Audio too quiet"
        # Prepare input for the model
        inputs = feature_extractor(
            processed_audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )
        # Move to device and ensure float32
        inputs = {k: v.to(device, dtype=torch.float32) for k, v in inputs.items()}
        # Get prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        # Get the top 2 predictions
        top2_probs, top2_ids = torch.topk(probs, 2)
        # Convert to percentages
        top2_probs = [p * 100 for p in top2_probs.cpu().numpy()]
        top2_emotions = [EMOTION_LABELS[idx.item()] for idx in top2_ids]
        # Update the rolling history (last 5 predictions)
        history = get_emotion_history()
        history.append(top2_emotions[0])
        if len(history) > 5:
            history.pop(0)
        # Report the most common emotion in the recent history
        if len(history) >= 3:
            most_common = Counter(history).most_common(1)[0][0]
        else:
            most_common = top2_emotions[0]
        result = f"Primary: {top2_emotions[0]} ({top2_probs[0]:.1f}%)\n"
        result += f"Secondary: {top2_emotions[1]} ({top2_probs[1]:.1f}%)\n"
        result += f"Trending: {most_common}"
        return result
    except Exception as e:
        print(f"Error in processing: {e}")
        return "Processing error. Please try again."
# Create Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak into your microphone",
            show_label=True
        )
    ],
    outputs=gr.Textbox(
        label="Detected Emotions",
        lines=3
    ),
    title="Enhanced Live Emotion Detection",
    description="Speak naturally into your microphone. Shows primary and secondary emotions with confidence levels.",
    live=True,
    allow_flagging="never"
)
# Launch with a small queue for better real-time performance
if __name__ == "__main__":
    demo.queue(max_size=1).launch(share=True)