import gradio as gr
import numpy as np
import librosa
import pickle
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
# Paths to your models and label encoders
lstm_speaker_model = '/content/lstm_speaker_model.h5'
lstm_gender_model = '/content/lstm_gender_model.h5'
lstm_speaker_label = '/content/lstm_speaker_label.pkl'
lstm_gender_label = '/content/lstm_gender_label.pkl'
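# Note: these are Colab-style paths; the trained .h5 models and pickled label
# encoders are assumed to already exist at these locations. Adjust the paths if
# running outside /content.
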
# ------------------- Feature Extraction -------------------
def extract_features(audio_data, max_len=34):
    """Extract MFCC, chroma, and spectral-contrast features from an audio file."""
    audio, sr = librosa.load(audio_data, sr=None)
    # Extract MFCC features (13 coefficients)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)
    # Spectral features: chroma
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)
    # Spectral features: spectral contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    spectral_contrast_mean = np.mean(spectral_contrast, axis=1)
    # Combine only a subset of features (to match the model's expected input size)
    features = np.hstack([mfccs_mean[:13], chroma_mean[:13], spectral_contrast_mean[:8]])
    # Pad or truncate to a fixed length (max_len)
    if features.shape[0] < max_len:
        padding = np.zeros((max_len - features.shape[0],))
        features = np.concatenate((features, padding))
    elif features.shape[0] > max_len:
        features = features[:max_len]
    return features
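# Note: assuming librosa's defaults (12 chroma bins, 7 spectral-contrast bands),
# extract_features stacks 13 + 12 + 7 = 32 values and zero-pads them to max_len (34).
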
def preprocess_audio_for_model(audio_data, max_len=34):
    """Preprocess an audio file for model prediction."""
    features = extract_features(audio_data, max_len=max_len)
    features = features.reshape(1, 1, features.shape[0])  # Shape for LSTM: (samples, timesteps, features)
    return features
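# Example (hypothetical file path): preprocess_audio_for_model('/content/sample.wav')
# returns an array of shape (1, 1, 34), ready to feed to the LSTM models.
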
# ------------------- Load Pre-trained Models and Label Encoders -------------------
def load_trained_model(model_path='/content/lstm_speaker_model.h5'):
    """Load the trained speaker model."""
    return tf.keras.models.load_model(model_path)

def load_gender_model(model_path='/content/lstm_gender_model.h5'):
    """Load the trained gender model."""
    return tf.keras.models.load_model(model_path)

def load_label_encoder(label_encoder_path='/content/lstm_speaker_label.pkl'):
    """Load the label encoder for speaker labels."""
    with open(label_encoder_path, 'rb') as f:
        label_encoder = pickle.load(f)
    return label_encoder

def load_gender_label_encoder(label_encoder_path='/content/lstm_gender_label.pkl'):
    """Load the label encoder for gender labels."""
    with open(label_encoder_path, 'rb') as f:
        label_encoder = pickle.load(f)
    return label_encoder
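# Note: the pickled encoders are assumed to have been created with a scikit-learn
# version compatible with the one installed here; otherwise unpickling may fail or warn.
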
# ------------------- Predict Top 3 Speakers and Gender -------------------
def predict_top_3_speakers_and_gender(audio_data, speaker_model, gender_model, speaker_encoder, gender_encoder, max_len=34):
    """Predict the top 3 speakers and the gender from an uploaded audio file."""
    features = preprocess_audio_for_model(audio_data, max_len=max_len)
    # Predict the speaker probabilities
    speaker_pred = speaker_model.predict(features)
    # Get the top 3 speakers by probability
    top_3_speakers_idx = np.argsort(speaker_pred[0])[::-1][:3]
    top_3_speakers_probs = speaker_pred[0][top_3_speakers_idx] * 100  # Convert to percentages
    top_3_speakers = speaker_encoder.inverse_transform(top_3_speakers_idx)
    # Predict the gender from the same LSTM-shaped features
    gender_pred = gender_model.predict(features)
    predicted_gender = gender_encoder.inverse_transform([np.argmax(gender_pred)])[0]
    return top_3_speakers, top_3_speakers_probs, predicted_gender
# ------------------- Gradio Interface -------------------
def gradio_interface(audio):
    # Load the trained models and label encoders
    speaker_model = load_trained_model(lstm_speaker_model)          # Speaker model
    gender_model = load_gender_model(lstm_gender_model)             # Gender model
    speaker_encoder = load_label_encoder(lstm_speaker_label)        # Speaker label encoder
    gender_encoder = load_gender_label_encoder(lstm_gender_label)   # Gender label encoder
    # Predict the top 3 speakers and gender from the uploaded audio file
    top_3_speakers, top_3_speakers_probs, predicted_gender = predict_top_3_speakers_and_gender(
        audio, speaker_model, gender_model, speaker_encoder, gender_encoder
    )
    # Return the results as a formatted string for the Gradio output
    result = "The top 3 predicted speakers are:\n"
    for speaker, prob in zip(top_3_speakers, top_3_speakers_probs):
        result += f"{speaker}: {prob:.2f}%\n"
    result += f"\nThe predicted gender is: {predicted_gender}"
    return result
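# Note: gradio_interface reloads the models and encoders on every call; loading them
# once at module scope would reduce per-request latency.
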
# Gradio interface creation
demo = gr.Interface(
    fn=gradio_interface,               # The function that predicts speaker and gender
    inputs=gr.Audio(type="filepath"),  # Audio input (file upload or recording)
    outputs="text",                    # Output the prediction result as text
    live=False,                        # Disable live feedback
    title="Speaker and Gender Prediction",
    description="Upload or record an audio file to predict the top 3 speakers and gender.",
    allow_flagging="never",            # Disable flagging
    theme="compact",                   # Set the theme
    css="""
    body {
        margin: 0;
        padding: 0;
        background-color: #f1f1f1;
        font-family: 'Roboto', sans-serif;
    }
    .gradio-container {
        background-color: #ffffff;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
    }
    h1, p {
        color: #333;
    }
    """
)
# Launch Gradio app
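# When running in Colab, demo.launch(share=True) can be used instead to expose a
# temporary public URL.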
demo.launch()