import gradio as gr
import torch
import numpy as np
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import librosa
from collections import Counter
# Initialize model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()  # inference only; disables dropout
# Define emotion labels
EMOTION_LABELS = {
    0: "angry",
    1: "disgust",
    2: "fear",
    3: "happy",
    4: "neutral",
    5: "sad",
    6: "surprise",
}
def preprocess_audio(audio, orig_sr=16000):
    """Enhanced audio preprocessing; returns a float32 mono signal at 16 kHz"""
    try:
        audio = np.asarray(audio)
        # Scale integer PCM (Gradio's microphone delivers int16) to [-1, 1]
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        else:
            audio = audio.astype(np.float32)
        # Convert to mono if stereo
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.T)
        # Resample to 16 kHz if needed
        if orig_sr != 16000:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=16000)
        # Apply preprocessing steps
        # 1. Pre-emphasis to boost high frequencies
        audio = librosa.effects.preemphasis(audio)
        # 2. Normalize peak amplitude
        audio = librosa.util.normalize(audio)
        # 3. Voice activity detection: keep only non-silent intervals
        intervals = librosa.effects.split(audio, top_db=20)
        if len(intervals) > 0:
            audio = np.concatenate([audio[start:end] for start, end in intervals])
        # 4. Ensure minimum length (1 second at 16 kHz)
        if len(audio) < 16000:
            audio = np.pad(audio, (0, 16000 - len(audio)))
        # 5. Take the center 3 seconds if too long
        if len(audio) > 48000:  # 3 seconds at 16 kHz
            center = len(audio) // 2
            audio = audio[center - 24000:center + 24000]
        return audio
    except Exception as e:
        print(f"Preprocessing error: {e}")
        return None
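# A minimal, commented-out sanity check for preprocess_audio (it assumes the
# orig_sr keyword used above): feed a 0.5 s, 440 Hz tone through the pipeline
# and confirm it is padded up to the 1-second minimum.
#
#   t = np.linspace(0, 0.5, 8000, endpoint=False)
#   tone = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
#   out = preprocess_audio(tone, orig_sr=16000)
#   assert out is not None and len(out) == 16000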
def get_emotion_history():
    """Get emotion detection history (stored on the function object)"""
    if not hasattr(get_emotion_history, "history"):
        get_emotion_history.history = []
    return get_emotion_history.history
def process_audio(audio):
    """Process an audio chunk and return the detected emotions"""
    if audio is None:
        return "No audio input detected"
    try:
        # Gradio streams numpy audio as a (sample_rate, data) tuple
        if isinstance(audio, tuple):
            sample_rate, audio = audio
        else:
            sample_rate = 16000
        # Preprocess audio
        processed_audio = preprocess_audio(audio, orig_sr=sample_rate)
        if processed_audio is None:
            return "Audio preprocessing failed"
        if np.max(np.abs(processed_audio)) < 0.01:
            return "Audio too quiet"
        # Prepare input for the model
        inputs = feature_extractor(
            processed_audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )
        # Move to device and ensure float32
        inputs = {k: v.to(device, dtype=torch.float32) for k, v in inputs.items()}
        # Get prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        # Get the top 2 predictions
        top2_probs, top2_ids = torch.topk(probs, 2)
        # Convert to percentages
        top2_probs = [p * 100 for p in top2_probs.cpu().numpy()]
        top2_emotions = [EMOTION_LABELS[idx.item()] for idx in top2_ids]
        # Update the rolling history (last 5 predictions)
        history = get_emotion_history()
        history.append(top2_emotions[0])
        if len(history) > 5:
            history.pop(0)
        # Report the most common emotion in the recent history
        if len(history) >= 3:
            most_common = Counter(history).most_common(1)[0][0]
        else:
            most_common = top2_emotions[0]
        result = f"Primary: {top2_emotions[0]} ({top2_probs[0]:.1f}%)\n"
        result += f"Secondary: {top2_emotions[1]} ({top2_probs[1]:.1f}%)\n"
        result += f"Trending: {most_common}"
        return result
    except Exception as e:
        print(f"Error in processing: {e}")
        return "Processing error. Please try again."
# Create Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(
            sources=["microphone"],
            type="numpy",
            streaming=True,
            label="Speak into your microphone",
            show_label=True
        )
    ],
    outputs=gr.Textbox(
        label="Detected Emotions",
        lines=3
    ),
    title="Enhanced Live Emotion Detection",
    description="Speak naturally into your microphone. Shows primary and secondary emotions with confidence levels.",
    live=True,
    allow_flagging="never"
)
# Launch with a small queue for better real-time performance
if __name__ == "__main__":
    demo.queue(max_size=1).launch(share=True)