Spaces:

dj-dawgs-ipd
/

IPD-Audio-Model

Sleeping

File size: 5,886 Bytes

import functools

import gradio as gr
import joblib
import librosa
import numpy as np
import parselmouth
import torch
import torch.nn as nn
from parselmouth.praat import call
from sklearn.preprocessing import StandardScaler
from transformers import HubertForSequenceClassification


class HuBERTHateSpeechClassifier(nn.Module):
    """Small feed-forward classifier over a fixed-length feature vector.

    A pretrained HuBERT sequence-classification model is instantiated and
    stored as ``self.hubert``, but ``forward`` never calls it: the output
    comes solely from the MLP head in ``self.classifier``
    (``input_dim -> 128 -> 64 -> num_classes`` with ReLU and dropout).

    Args:
        input_dim: Width of the feature vector fed to the first linear layer.
        num_classes: Number of logits produced by the last linear layer.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Registered as a submodule, so its parameters are part of this
        # module's state_dict even though forward() does not use it.
        backbone = HubertForSequenceClassification.from_pretrained(
            "facebook/hubert-base-ls960"
        )
        self.hubert = backbone

        hidden_widths = (128, 64)
        drop_rate = 0.3

        stack = []
        width_in = input_dim
        for width_out in hidden_widths:
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(drop_rate))
            width_in = width_out
        stack.append(nn.Linear(width_in, num_classes))

        self.classifier = nn.Sequential(*stack)

    def forward(self, x):
        """Return the classifier-head logits for feature batch ``x``."""
        logits = self.classifier(x)
        return logits
        

class AudioFeatureExtractor:
    """Turn an audio file into a scaled 13-value acoustic feature vector.

    Feature order: pitch mean/std, spectral-centroid mean/std,
    zero-crossing-rate mean/std, RMS mean/std, spectral-rolloff mean/std,
    speaking rate, and harmonics-to-noise ratio (HNR) mean/std.

    Args:
        scaler_path: Path to a joblib-serialized object exposing
            ``transform``; it is applied to the raw feature vector.
    """

    # Length of the feature vector (and of the all-zero fallback vector).
    NUM_FEATURES = 13

    def __init__(self, scaler_path='scaler.joblib'):
        self.scaler = joblib.load(scaler_path)

    def safe_mean(self, arr):
        """Return the mean of the finite values in ``arr``, or 0.0 if none."""
        try:
            arr = np.array(arr).flatten()
            arr = arr[np.isfinite(arr)]
            return float(np.mean(arr)) if len(arr) > 0 else 0.0
        except Exception:
            # Best effort: non-numeric input yields the neutral value.
            return 0.0

    def safe_std(self, arr):
        """Return the std of the finite values in ``arr``, or 0.0 if < 2."""
        try:
            arr = np.array(arr).flatten()
            arr = arr[np.isfinite(arr)]
            return float(np.std(arr)) if len(arr) > 1 else 0.0
        except Exception:
            # Best effort: non-numeric input yields the neutral value.
            return 0.0

    def _mean_std(self, values):
        """Return ``(mean, std)`` of ``values`` using the safe helpers."""
        return self.safe_mean(values), self.safe_std(values)

    def _hnr_stats(self, audio_path):
        """Return ``(mean, std)`` of the Praat harmonicity track, or zeros on failure."""
        try:
            sound = parselmouth.Sound(audio_path)
            pitch = call(sound, "To Pitch", 0.0, 75, 600)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            # Sample the harmonicity track at every pitch-frame time stamp.
            hnr_values = [
                call(harmonicity, "Get value at time", time, "Linear")
                for time in pitch.ts()
            ]
        except Exception as e:
            print(f"Error calculating HNR: {e}")
            return 0.0, 0.0
        # The safe helpers drop NaN (undefined) samples before aggregating.
        return self._mean_std(hnr_values)

    def extract_features(self, audio_path):
        """Extract and scale the feature vector for ``audio_path``.

        At most the first 5 seconds are loaded through librosa. Any failure
        while loading or extracting is printed and an all-zero (unscaled)
        vector of length ``NUM_FEATURES`` is returned instead of raising.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A 1-D numpy array of ``NUM_FEATURES`` values: the scaler's output
            on success, zeros on failure or empty audio.
        """
        try:
            y, sr = librosa.load(audio_path, duration=5)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return np.zeros(self.NUM_FEATURES)

        if len(y) == 0:
            return np.zeros(self.NUM_FEATURES)

        try:
            pitches, _ = librosa.piptrack(y=y, sr=sr)
            # Only strictly positive bins are treated as pitch estimates.
            pitch_mean, pitch_std = self._mean_std(pitches[pitches > 0])

            # Each *_std is a real standard deviation; previously these four
            # features repeated the mean because np.mean was used for both.
            # NOTE(review): the scaler/model must have been fit on features
            # computed the same way - confirm against the training pipeline.
            spectral_centroid_mean, spectral_centroid_std = self._mean_std(
                librosa.feature.spectral_centroid(y=y, sr=sr)
            )
            zcr_mean, zcr_std = self._mean_std(
                librosa.feature.zero_crossing_rate(y)
            )
            rms_mean, rms_std = self._mean_std(librosa.feature.rms(y=y))
            spectral_rolloff_mean, spectral_rolloff_std = self._mean_std(
                librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
            )

            # Speaking-rate proxy: non-silent intervals (top_db=20) per second.
            duration = librosa.get_duration(y=y, sr=sr)
            voiced_intervals = librosa.effects.split(y, top_db=20)
            speaking_rate = len(voiced_intervals) / duration if duration > 0 else 0

            hnr_mean, hnr_std = self._hnr_stats(audio_path)

            feature_vector = np.array([
                pitch_mean, pitch_std,
                spectral_centroid_mean, spectral_centroid_std,
                zcr_mean, zcr_std,
                rms_mean, rms_std,
                spectral_rolloff_mean, spectral_rolloff_std,
                speaking_rate,
                hnr_mean, hnr_std,
            ])

            return self.scaler.transform(feature_vector.reshape(1, -1))[0]

        except Exception as e:
            print(f"Error extracting features: {e}")
            return np.zeros(self.NUM_FEATURES)

        
# Minimum hate-class probability required to report "Hate Speech".
_HATE_SPEECH_THRESHOLD = 0.6


@functools.lru_cache(maxsize=1)
def _load_inference_components():
    """Build the classifier and feature extractor once and reuse them."""
    state_dict = torch.load("hate_speech_hubert_audio_classifier.pth", map_location=torch.device('cpu'))
    model = HuBERTHateSpeechClassifier(13, 2)
    model.load_state_dict(state_dict)
    # torch.no_grad() does not disable dropout; only eval mode does. Without
    # this call the Dropout layers stay active and predictions are random.
    model.eval()
    return model, AudioFeatureExtractor()


def predict_hate_speech(audio_path):
    """Classify an audio file as hate speech or non-hate speech.

    The model and feature extractor are loaded on the first call and cached
    for subsequent calls.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        A dict with keys ``"Classification"`` (``"Hate Speech"`` or
        ``"Non-Hate Speech"``) and ``"Confidence"`` (the softmax probability
        of the reported class).
    """
    model, feature_extractor = _load_inference_components()
    features = feature_extractor.extract_features(audio_path)

    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        probabilities = torch.softmax(model(input_tensor), dim=1)[0]

    non_hate_probability = probabilities[0].item()
    hate_probability = probabilities[1].item()

    # With two classes, "class 1 is the argmax and its probability exceeds
    # the threshold" is equivalent to this single comparison.
    if hate_probability > _HATE_SPEECH_THRESHOLD:
        return {
            "Classification": "Hate Speech",
            "Confidence": hate_probability
        }

    # Always report the probability of the label actually shown. A
    # below-threshold hate prediction previously leaked the hate probability
    # as the "Non-Hate Speech" confidence.
    return {
        "Classification": "Non-Hate Speech",
        "Confidence": non_hate_probability
    }

# Gradio UI: a single audio-file input wired to predict_hate_speech, with the
# returned result rendered in a text box.
_audio_input = gr.Audio(type="filepath", label="Upload Audio")
_analysis_output = gr.Textbox(label="Hate Speech Analysis")
_example_clips = [["hate_video_3_3_snippet2.wav"]]

iface = gr.Interface(
    fn=predict_hate_speech,
    inputs=_audio_input,
    outputs=_analysis_output,
    title="Hate Speech Audio Classifier",
    description="Upload an audio file to detect potential hate speech content.",
    examples=_example_clips,
    allow_flagging="manual",
)

# Start the web app only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()