"""Gradio demo that classifies an uploaded audio clip as hate / non-hate speech.

The pipeline is: extract 13 hand-crafted acoustic features from the clip,
scale them with a persisted scaler, and feed them to a small MLP head whose
weights are loaded from a checkpoint on disk.
"""
import functools

import gradio as gr
import joblib
import librosa
import numpy as np
import parselmouth
import torch
import torch.nn as nn
from parselmouth.praat import call
from sklearn.preprocessing import StandardScaler
from transformers import HubertForSequenceClassification

# Checkpoint holding the trained classifier weights.
MODEL_PATH = "hate_speech_hubert_audio_classifier.pth"
# Length of the feature vector produced by AudioFeatureExtractor.
NUM_FEATURES = 13
# Output classes; index 1 is reported as "Hate Speech" by predict_hate_speech.
NUM_CLASSES = 2
# P(hate) must exceed this value for a clip to be labelled "Hate Speech".
HATE_THRESHOLD = 0.6


class HuBERTHateSpeechClassifier(nn.Module):
    """MLP classifier over a fixed-size acoustic feature vector.

    Args:
        input_dim: Number of input features per example.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # NOTE(review): the HuBERT backbone is instantiated but never used in
        # forward(). It is kept because load_state_dict() is strict by default
        # and the checkpoint presumably contains "hubert.*" keys -- confirm
        # against the training code before removing it.
        self.hubert = HubertForSequenceClassification.from_pretrained(
            "facebook/hubert-base-ls960"
        )
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        )

    def forward(self, x):
        """Return raw class logits for a batch of feature vectors."""
        return self.classifier(x)


class AudioFeatureExtractor:
    """Compute the scaled 13-dimensional acoustic feature vector for a clip.

    Args:
        scaler_path: Path to a joblib-serialised scaler exposing ``transform``.
    """

    def __init__(self, scaler_path='scaler.joblib'):
        self.scaler = joblib.load(scaler_path)

    def safe_mean(self, arr):
        """Return the mean of the finite values in *arr*, or 0.0 if none."""
        try:
            arr = np.array(arr).flatten()
            arr = arr[np.isfinite(arr)]
            return float(np.mean(arr)) if len(arr) > 0 else 0.0
        except Exception:
            return 0.0

    def safe_std(self, arr):
        """Return the std of the finite values in *arr*, or 0.0 if fewer than 2."""
        try:
            arr = np.array(arr).flatten()
            arr = arr[np.isfinite(arr)]
            return float(np.std(arr)) if len(arr) > 1 else 0.0
        except Exception:
            return 0.0

    def extract_features(self, audio_path):
        """Extract and scale the acoustic features of the clip at *audio_path*.

        At most the first 5 seconds of audio are analysed by librosa. The
        features, in order, are: pitch mean/std, spectral centroid mean/"std",
        zero-crossing-rate mean/"std", RMS mean/"std", spectral roll-off
        mean/"std", speaking rate, and HNR mean/std.

        Returns:
            A 1-D array of 13 scaled features. If the file cannot be loaded,
            is empty, or any step fails, an all-zero (unscaled) vector is
            returned instead and the error is printed.
        """
        try:
            y, sr = librosa.load(audio_path, duration=5)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return np.zeros(NUM_FEATURES)

        if len(y) == 0:
            return np.zeros(NUM_FEATURES)

        try:
            pitches, _ = librosa.piptrack(y=y, sr=sr)
            # Bins with no detected pitch are 0; drop them from the statistics.
            pitches = pitches[pitches > 0]
            pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
            pitch_std = np.std(pitches) if len(pitches) > 0 else 0

            # NOTE(review): the four "*_std" features below are computed with
            # np.mean, not np.std. This looks like a copy-paste slip, but the
            # persisted scaler and model were presumably fitted on exactly
            # these values, so it is deliberately left unchanged here. Fix it
            # together with the training pipeline, then refit scaler + model.
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_centroid_mean = np.mean(spectral_centroid)
            spectral_centroid_std = np.mean(spectral_centroid)

            zcr = librosa.feature.zero_crossing_rate(y)
            zcr_mean = np.mean(zcr)
            zcr_std = np.mean(zcr)

            rms = librosa.feature.rms(y=y)
            rms_mean = np.mean(rms)
            rms_std = np.mean(rms)

            spectral_rolloff = librosa.feature.spectral_rolloff(
                y=y, sr=sr, roll_percent=0.85
            )
            spectral_rolloff_mean = np.mean(spectral_rolloff)
            spectral_rolloff_std = np.mean(spectral_rolloff)

            # Speaking rate: number of non-silent intervals per second.
            duration = librosa.get_duration(y=y, sr=sr)
            voiced_frames = librosa.effects.split(y, top_db=20)
            speaking_rate = len(voiced_frames) / duration if duration > 0 else 0

            # Harmonics-to-noise ratio via Praat. A failure here is tolerated
            # and only zeroes the two HNR features.
            try:
                sound = parselmouth.Sound(audio_path)
                pitch = call(sound, "To Pitch", 0.0, 75, 600)
                harmonicity = call(
                    sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0
                )
                hnr_values = []
                for t in pitch.ts():
                    harmonicity_value = call(
                        harmonicity, "Get value at time", t, "Linear"
                    )
                    if not np.isnan(harmonicity_value):
                        hnr_values.append(harmonicity_value)
                hnr_mean = (
                    sum(hnr_values) / len(hnr_values)
                    if len(hnr_values) > 0
                    else 0
                )
                hnr_std = np.std(hnr_values) if len(hnr_values) > 1 else 0
            except Exception as e:
                print(f"Error calculating HNR: {e}")
                hnr_mean = 0
                hnr_std = 0

            feature_vector = np.array([
                pitch_mean,
                pitch_std,
                spectral_centroid_mean,
                spectral_centroid_std,
                zcr_mean,
                zcr_std,
                rms_mean,
                rms_std,
                spectral_rolloff_mean,
                spectral_rolloff_std,
                speaking_rate,
                hnr_mean,
                hnr_std,
            ])

            # The scaler expects a 2-D (n_samples, n_features) array.
            scaled_features = self.scaler.transform(
                feature_vector.reshape(1, -1)
            )[0]
            return scaled_features
        except Exception as e:
            print(f"Error extracting features: {e}")
            return np.zeros(NUM_FEATURES)


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the classifier once and return it in inference mode."""
    state_dict = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
    model = HuBERTHateSpeechClassifier(NUM_FEATURES, NUM_CLASSES)
    model.load_state_dict(state_dict)
    # Without eval() the Dropout layers stay active and every prediction is
    # randomly perturbed; torch.no_grad() alone does not disable dropout.
    model.eval()
    return model


@functools.lru_cache(maxsize=1)
def _load_feature_extractor():
    """Create the feature extractor (and load its scaler) once."""
    return AudioFeatureExtractor()


def predict_hate_speech(audio_path):
    """Classify the audio clip at *audio_path*.

    The model and feature extractor are loaded on first use and then reused
    for subsequent calls.

    Args:
        audio_path: Filesystem path of the clip to analyse.

    Returns:
        A dict with keys ``"Classification"`` (``"Hate Speech"`` or
        ``"Non-Hate Speech"``) and ``"Confidence"`` (the model's probability
        for the reported label). A clip is labelled hate speech only when
        P(hate) exceeds ``HATE_THRESHOLD``.
    """
    model = _load_model()
    feature_extractor = _load_feature_extractor()

    features = feature_extractor.extract_features(audio_path)
    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_tensor)
        probabilities = torch.softmax(outputs, dim=1)[0]

    non_hate_probability = probabilities[0].item()
    hate_probability = probabilities[1].item()

    if hate_probability > HATE_THRESHOLD:
        return {
            "Classification": "Hate Speech",
            "Confidence": hate_probability,
        }

    # Always report the probability of the label actually shown. Previously a
    # clip with P(hate) in (0.5, 0.6] was labelled "Non-Hate Speech" but
    # displayed P(hate) as its confidence.
    return {
        "Classification": "Non-Hate Speech",
        "Confidence": non_hate_probability,
    }


iface = gr.Interface(
    fn=predict_hate_speech,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Hate Speech Analysis"),
    title="Hate Speech Audio Classifier",
    description="Upload an audio file to detect potential hate speech content.",
    examples=[
        ["hate_video_3_3_snippet2.wav"]
    ],
    allow_flagging="manual"
)


if __name__ == "__main__":
    iface.launch()