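"""Gradio app for speaker identification and gender classification.

Loads pre-trained LSTM speaker and gender models with their label encoders,
extracts MFCC, chroma, and spectral-contrast features from an uploaded audio
clip, and reports the top-3 speaker predictions and the predicted gender.
"""
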
import gradio as gr
import numpy as np
import librosa
import pickle
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder  # the pickled encoders below are LabelEncoder objects

# Paths to the trained model and label-encoder artifacts.
lstm_speaker_model = '/content/lstm_speaker_model.h5'
lstm_gender_model = '/content/lstm_gender_model.h5'
lstm_speaker_label = '/content/lstm_speaker_label.pkl'
lstm_gender_label = '/content/lstm_gender_label.pkl'
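# The /content paths assume the Colab filesystem; point them at local copies
# of the .h5 models and .pkl encoders when running elsewhere.

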
def extract_features(audio_data, max_len=34):
    """Extract MFCC, chroma, and spectral-contrast features from an audio file."""
    # sr=None preserves the file's native sample rate.
    audio, sr = librosa.load(audio_data, sr=None)

    # 13 MFCC coefficients, averaged over time.
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)

    # 12 chroma bins, averaged over time.
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)

    # 7 spectral-contrast values (6 bands + 1), averaged over time.
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    spectral_contrast_mean = np.mean(spectral_contrast, axis=1)

    # The slices are defensive upper bounds; with librosa defaults this
    # yields 13 + 12 + 7 = 32 values.
    features = np.hstack([mfccs_mean[:13], chroma_mean[:13], spectral_contrast_mean[:8]])

    # Zero-pad (or truncate) to the fixed length the models were trained on.
    if features.shape[0] < max_len:
        padding = np.zeros((max_len - features.shape[0],))
        features = np.concatenate((features, padding))
    elif features.shape[0] > max_len:
        features = features[:max_len]

    return features


def preprocess_audio_for_model(audio_data, max_len=34):
    """Preprocess an audio file for model prediction."""
    features = extract_features(audio_data, max_len=max_len)
    # Reshape to (batch, timesteps, features) = (1, 1, max_len) as the LSTMs expect.
    features = features.reshape(1, 1, features.shape[0])
    return features
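# Quick shape check ('sample.wav' is a hypothetical local clip):
# preprocess_audio_for_model('sample.wav').shape -> (1, 1, 34)

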
def load_trained_model(model_path=lstm_speaker_model):
    """Load the trained speaker model."""
    return tf.keras.models.load_model(model_path)


def load_gender_model(model_path=lstm_gender_model):
    """Load the trained gender model."""
    return tf.keras.models.load_model(model_path)


def load_label_encoder(label_encoder_path=lstm_speaker_label):
    """Load the label encoder for speaker labels."""
    with open(label_encoder_path, 'rb') as f:
        return pickle.load(f)


def load_gender_label_encoder(label_encoder_path=lstm_gender_label):
    """Load the label encoder for gender labels."""
    with open(label_encoder_path, 'rb') as f:
        return pickle.load(f)


def predict_top_3_speakers_and_gender(audio_data, speaker_model, gender_model, speaker_encoder, gender_encoder, max_len=34):
    """Predict the top 3 speakers and the gender from an uploaded audio file."""
    features = preprocess_audio_for_model(audio_data, max_len=max_len)

    # Probability for every known speaker.
    speaker_pred = speaker_model.predict(features)

    # Indices of the three highest-probability speakers, best first.
    top_3_speakers_idx = np.argsort(speaker_pred[0])[::-1][:3]
    top_3_speakers_probs = speaker_pred[0][top_3_speakers_idx] * 100
    top_3_speakers = speaker_encoder.inverse_transform(top_3_speakers_idx)

    # Gender is the single highest-probability class.
    gender_pred = gender_model.predict(features)
    predicted_gender = gender_encoder.inverse_transform([np.argmax(gender_pred)])[0]

    return top_3_speakers, top_3_speakers_probs, predicted_gender


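# Direct-call sketch ('sample.wav' is a hypothetical local clip; the model and
# encoder objects are created just below):
# speakers, probs, gender = predict_top_3_speakers_and_gender(
#     'sample.wav', speaker_model, gender_model, speaker_encoder, gender_encoder)

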
# Load the models and encoders once at startup instead of on every request.
speaker_model = load_trained_model(lstm_speaker_model)
gender_model = load_gender_model(lstm_gender_model)
speaker_encoder = load_label_encoder(lstm_speaker_label)
gender_encoder = load_gender_label_encoder(lstm_gender_label)


def gradio_interface(audio):
    """Gradio callback: format the top-3 speakers and predicted gender as text."""
    top_3_speakers, top_3_speakers_probs, predicted_gender = predict_top_3_speakers_and_gender(
        audio, speaker_model, gender_model, speaker_encoder, gender_encoder
    )

    result = "The top 3 predicted speakers are:\n"
    for speaker, prob in zip(top_3_speakers, top_3_speakers_probs):
        result += f"{speaker}: {prob:.2f}%\n"

    result += f"\nThe predicted gender is: {predicted_gender}"
    return result


demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    live=False,
    title="Speaker and Gender Prediction",
    description="Upload or record an audio file to predict the top 3 speakers and gender.",
    allow_flagging="never",
    theme="compact",
    css="""
    body {
        margin: 0;
        padding: 0;
        background-color: #f1f1f1;
        font-family: 'Roboto', sans-serif;
    }

    .gradio-container {
        background-color: #ffffff;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
    }

    h1, p {
        color: #333;
    }
    """
)


if __name__ == "__main__":
    demo.launch()
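# Tip: demo.launch(share=True) serves a temporary public link, which is handy in Colab.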