# Hugging Face Space (status: Running)
import functools

import gradio as gr
import joblib
import librosa
import numpy as np
import parselmouth
import torch
import torch.nn as nn
from parselmouth.praat import call
from sklearn.preprocessing import StandardScaler
from transformers import HubertForSequenceClassification
class HuBERTHateSpeechClassifier(nn.Module):
    """Feed-forward classifier over a fixed-length audio feature vector.

    A pretrained HuBERT model is instantiated and kept on ``self.hubert``,
    but ``forward`` only runs the small MLP head stored in
    ``self.classifier``.
    """

    def __init__(self, input_dim, num_classes):
        """Create the HuBERT submodule and the MLP head.

        Args:
            input_dim: Size of each input feature vector.
            num_classes: Number of values produced by the final linear layer.
        """
        super().__init__()
        # Assigned before the head so the submodule order (hubert, then
        # classifier) stays the same.
        self.hubert = HubertForSequenceClassification.from_pretrained(
            "facebook/hubert-base-ls960"
        )

        # Two hidden stages of Linear -> ReLU -> Dropout(0.3), followed by
        # the output projection.
        layers = []
        in_features = input_dim
        for hidden_units in (128, 64):
            layers.extend(
                [nn.Linear(in_features, hidden_units), nn.ReLU(), nn.Dropout(0.3)]
            )
            in_features = hidden_units
        layers.append(nn.Linear(in_features, num_classes))
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP head and return its raw output."""
        head_output = self.classifier(x)
        return head_output
class AudioFeatureExtractor:
    """Turn an audio file into the scaled 13-value feature vector.

    Feature order: pitch mean/std, spectral-centroid mean/std,
    zero-crossing-rate mean/std, RMS mean/std, spectral-rolloff mean/std,
    speaking rate, HNR mean/std. The vector is passed through the scaler
    loaded in ``__init__`` before it is returned.
    """

    # Length of the feature vector built by ``extract_features``.
    NUM_FEATURES = 13

    def __init__(self, scaler_path='scaler.joblib'):
        """Load the fitted scaler from ``scaler_path``.

        NOTE(review): ``joblib.load`` unpickles the file, so only point this
        at scaler files you trust.
        """
        self.scaler = joblib.load(scaler_path)

    def safe_mean(self, arr):
        """Return the mean of the finite entries of ``arr`` as a float.

        Returns 0.0 when no finite entry remains or when ``arr`` cannot be
        processed numerically.
        """
        try:
            arr = np.array(arr).flatten()
            arr = arr[np.isfinite(arr)]
            return float(np.mean(arr)) if len(arr) > 0 else 0.0
        except Exception:
            # Deliberate best effort: a bad statistic becomes 0.0 rather
            # than aborting the whole extraction.
            return 0.0

    def safe_std(self, arr):
        """Return the standard deviation of the finite entries of ``arr``.

        Returns 0.0 when fewer than two finite entries remain or when
        ``arr`` cannot be processed numerically.
        """
        try:
            arr = np.array(arr).flatten()
            arr = arr[np.isfinite(arr)]
            return float(np.std(arr)) if len(arr) > 1 else 0.0
        except Exception:
            return 0.0

    def _mean_std(self, values):
        """Return ``(safe_mean(values), safe_std(values))``."""
        return self.safe_mean(values), self.safe_std(values)

    def _hnr_stats(self, audio_path):
        """Return (mean, std) of the harmonics-to-noise ratio via Praat.

        Falls back to ``(0.0, 0.0)`` when Praat cannot process the file.
        """
        try:
            # NOTE(review): Praat reads the file from disk, while the other
            # features use the signal librosa loaded with ``duration=5``;
            # confirm the difference in analysed span is intended.
            sound = parselmouth.Sound(audio_path)
            pitch = call(sound, "To Pitch", 0.0, 75, 600)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            hnr_values = [
                call(harmonicity, "Get value at time", time, "Linear")
                for time in pitch.ts()
            ]
        except Exception as e:
            print(f"Error calculating HNR: {e}")
            return 0.0, 0.0
        # NaN samples are dropped by the safe_* helpers.
        return self._mean_std(hnr_values)

    def extract_features(self, audio_path):
        """Compute the scaled feature vector for the audio at ``audio_path``.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A 1-D array of ``NUM_FEATURES`` scaled features. An all-zero
            (unscaled) array is returned when the file cannot be loaded, is
            empty, or feature extraction fails.
        """
        try:
            y, sr = librosa.load(audio_path, duration=5)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return np.zeros(self.NUM_FEATURES)
        if len(y) == 0:
            return np.zeros(self.NUM_FEATURES)

        try:
            pitches, _ = librosa.piptrack(y=y, sr=sr)
            pitch_mean, pitch_std = self._mean_std(pitches[pitches > 0])

            # NOTE(review): the four *_std values below used to be computed
            # with np.mean (duplicating the mean). They are now real standard
            # deviations; if scaler.joblib or the model checkpoint were
            # produced with the old values they must be regenerated.
            spectral_centroid_mean, spectral_centroid_std = self._mean_std(
                librosa.feature.spectral_centroid(y=y, sr=sr)
            )
            zcr_mean, zcr_std = self._mean_std(
                librosa.feature.zero_crossing_rate(y)
            )
            rms_mean, rms_std = self._mean_std(librosa.feature.rms(y=y))
            spectral_rolloff_mean, spectral_rolloff_std = self._mean_std(
                librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
            )

            # Speaking rate is approximated as non-silent intervals per second.
            duration = librosa.get_duration(y=y, sr=sr)
            nonsilent_intervals = librosa.effects.split(y, top_db=20)
            speaking_rate = (
                len(nonsilent_intervals) / duration if duration > 0 else 0
            )

            hnr_mean, hnr_std = self._hnr_stats(audio_path)

            feature_vector = np.array([
                pitch_mean, pitch_std,
                spectral_centroid_mean, spectral_centroid_std,
                zcr_mean, zcr_std,
                rms_mean, rms_std,
                spectral_rolloff_mean, spectral_rolloff_std,
                speaking_rate,
                hnr_mean, hnr_std,
            ])
            # The scaler expects a 2-D (samples, features) array; take the
            # single transformed row back out.
            return self.scaler.transform(feature_vector.reshape(1, -1))[0]
        except Exception as e:
            print(f"Error extracting features: {e}")
            return np.zeros(self.NUM_FEATURES)
# Minimum hate-class probability required to report "Hate Speech".
_HATE_SPEECH_THRESHOLD = 0.65


@functools.lru_cache(maxsize=1)
def _load_inference_resources():
    """Build the classifier and feature extractor once and reuse them.

    Returns:
        A ``(model, feature_extractor)`` tuple; the model is in eval mode.
    """
    # NOTE(review): torch.load unpickles the checkpoint; consider
    # ``weights_only=True`` if the installed torch version supports it.
    state_dict = torch.load(
        "hate_speech_hubert_audio_classifier.pth",
        map_location=torch.device('cpu'),
    )
    model = HuBERTHateSpeechClassifier(13, 2)
    model.load_state_dict(state_dict)
    # Without eval() the Dropout layers stay active and the same audio can
    # yield different predictions from call to call.
    model.eval()
    return model, AudioFeatureExtractor()


def predict_hate_speech(audio_path):
    """Classify the audio at ``audio_path`` as hate or non-hate speech.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A dict with keys ``"Classification"`` (``"Hate Speech"`` or
        ``"Non-Hate Speech"``) and ``"Confidence"`` (the model probability of
        the reported class). "Hate Speech" is reported only when the
        hate-class probability exceeds 0.65.
    """
    model, feature_extractor = _load_inference_resources()
    features = feature_extractor.extract_features(audio_path)
    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        probabilities = torch.softmax(model(input_tensor), dim=1)

    # Index 1 is the class this function reports as hate speech.
    hate_probability = probabilities[0][1].item()
    if hate_probability > _HATE_SPEECH_THRESHOLD:
        return {
            "Classification": "Hate Speech",
            "Confidence": hate_probability,
        }
    # Report the probability of the label actually shown, including when the
    # model leaned towards hate speech but stayed below the threshold.
    return {
        "Classification": "Non-Hate Speech",
        "Confidence": probabilities[0][0].item(),
    }
# Gradio front end: one audio upload (handed to the callback as a file path)
# and one text box showing the value returned by predict_hate_speech.
iface = gr.Interface(
    title="Hate Speech Audio Classifier",
    description="Upload an audio file to detect potential hate speech content.",
    fn=predict_hate_speech,
    inputs=gr.Audio(label="Upload Audio", type="filepath"),
    outputs=gr.Textbox(label="Hate Speech Analysis"),
    # Sample clip offered in the UI.
    examples=[["hate_video_3_3_snippet2.wav"]],
    allow_flagging="manual",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()