import gradio as gr
import numpy as np
import librosa
import pickle
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder  # scikit-learn must be installed so pickle can restore the encoders

# Paths to the saved models and label encoders
lstm_speaker_model_path = '/content/lstm_speaker_model.h5'
lstm_gender_model_path = '/content/lstm_gender_model.h5'
lstm_speaker_label_path = '/content/lstm_speaker_label.pkl'
lstm_gender_label_path = '/content/lstm_gender_label.pkl'
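
# The /content/... paths above assume a Colab session where these files were
# uploaded or copied in. For a local run, point them elsewhere, e.g.
# (hypothetical location):
#   lstm_speaker_model_path = './models/lstm_speaker_model.h5'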

# ------------------- Feature Extraction -------------------
def extract_features(audio_data, max_len=34):
    """Extract MFCC, chroma, and spectral-contrast features from an audio file."""
    audio, sr = librosa.load(audio_data, sr=None)

    # MFCCs (13 coefficients), averaged over time
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)

    # Chroma features (12 pitch classes with librosa's defaults), averaged over time
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)

    # Spectral contrast (7 bands with librosa's defaults), averaged over time
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    spectral_contrast_mean = np.mean(spectral_contrast, axis=1)

    # Concatenate the means; the slices are upper bounds that keep the vector
    # within the model's expected input size
    features = np.hstack([mfccs_mean[:13], chroma_mean[:13], spectral_contrast_mean[:8]])

    # Padding or truncating to fixed length (max_len)
    if features.shape[0] < max_len:
        padding = np.zeros((max_len - features.shape[0],))
        features = np.concatenate((features, padding))
    elif features.shape[0] > max_len:
        features = features[:max_len]

    return features
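
# Quick shape check (a hedged sketch; 'clip.wav' is a hypothetical file):
# with librosa's defaults the concatenation is 13 MFCC + 12 chroma +
# 7 spectral-contrast means = 32 values, zero-padded up to max_len=34.
#   feats = extract_features('clip.wav')
#   assert feats.shape == (34,)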

def preprocess_audio_for_model(audio_data, max_len=34):
    """Preprocess audio file for model prediction."""
    features = extract_features(audio_data, max_len=max_len)
    features = features.reshape(1, 1, features.shape[0])  # Shape for LSTM: (samples, timesteps, features)
    return features
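
# The LSTM therefore sees one timestep holding the whole feature vector:
#   x = preprocess_audio_for_model('clip.wav')  # hypothetical file
#   assert x.shape == (1, 1, 34)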

# ------------------- Load Pre-trained Models and Label Encoders -------------------
def load_trained_model(model_path):
    """Load a trained Keras model (speaker or gender) from disk."""
    return tf.keras.models.load_model(model_path)

def load_label_encoder(label_encoder_path):
    """Load a pickled label encoder (speaker or gender)."""
    with open(label_encoder_path, 'rb') as f:
        return pickle.load(f)

# ------------------- Predict Top 3 Speakers and Gender -------------------
def predict_top_3_speakers_and_gender(audio_data, speaker_model, gender_model, speaker_encoder, gender_encoder, max_len=34):
    """Predict the top 3 speakers and gender from an uploaded audio file."""
    features = preprocess_audio_for_model(audio_data, max_len=max_len)

    # Predict the speaker probabilities
    speaker_pred = speaker_model.predict(features)

    # Get top 3 speakers
    top_3_speakers_idx = np.argsort(speaker_pred[0])[::-1][:3]
    top_3_speakers_probs = speaker_pred[0][top_3_speakers_idx] * 100  # Convert to percentages
    top_3_speakers = speaker_encoder.inverse_transform(top_3_speakers_idx)

    # Predict the gender from the same (1, 1, max_len) feature tensor
    gender_pred = gender_model.predict(features)
    # argmax assumes the gender model ends in a softmax with one unit per class
    predicted_gender = gender_encoder.inverse_transform([np.argmax(gender_pred)])[0]

    return top_3_speakers, top_3_speakers_probs, predicted_gender
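
# Hedged usage sketch (commented out; assumes the models and encoders have
# been loaded -- see the startup block that follows):
#   speakers, probs, gender = predict_top_3_speakers_and_gender(
#       'clip.wav', speaker_model, gender_model, speaker_encoder, gender_encoder)
#   for s, p in zip(speakers, probs):
#       print(f"{s}: {p:.2f}%")
#   print(f"Predicted gender: {gender}")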

# ------------------- Gradio Interface -------------------
# Load the models and label encoders once at startup rather than on every request
speaker_model = load_trained_model(lstm_speaker_model_path)    # Speaker model
gender_model = load_trained_model(lstm_gender_model_path)      # Gender model
speaker_encoder = load_label_encoder(lstm_speaker_label_path)  # Speaker label encoder
gender_encoder = load_label_encoder(lstm_gender_label_path)    # Gender label encoder

def gradio_interface(audio):
    """Gradio callback: predict the top 3 speakers and the gender of an uploaded clip."""
    if audio is None:  # Gradio passes None when no audio was provided
        return "Please upload or record an audio file."

    # Predict the top 3 speakers and gender from the uploaded audio file
    top_3_speakers, top_3_speakers_probs, predicted_gender = predict_top_3_speakers_and_gender(
        audio, speaker_model, gender_model, speaker_encoder, gender_encoder
    )

    # Return results as a formatted string for Gradio output
    result = f"The top 3 predicted speakers are:\n"
    for speaker, prob in zip(top_3_speakers, top_3_speakers_probs):
        result += f"{speaker}: {prob:.2f}%\n"

    result += f"\nThe predicted gender is: {predicted_gender}"

    return result

# Gradio interface creation
demo = gr.Interface(
    fn=gradio_interface,  # The function to predict speaker and gender
    inputs=gr.Audio(type="filepath"),  # Audio input (file upload)
    outputs="text",  # Output the prediction result as text
    live=False,  # Disable live feedback
    title="Speaker and Gender Prediction",
    description="Upload or record an audio file to predict the top 3 speakers and gender.",
    allow_flagging="never",  # Disable flagging
    theme="compact",  # Set the theme
    css="""
    body {
        margin: 0;
        padding: 0;
        background-color: #f1f1f1;
        font-family: 'Roboto', sans-serif;
    }

    .gradio-container {
        background-color: #ffffff;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0px 4px 6px rgba(0, 0, 0, 0.1);
    }

    h1, p {
        color: #333;
    }
    """
)

# Launch the Gradio app (in Colab, demo.launch(share=True) also prints a public URL)
demo.launch()