|
import gradio as gr |
|
import torch |
|
import librosa |
|
import numpy as np |
|
from torchvision import models |
|
from scipy.ndimage import zoom |
|
from sklearn.decomposition import PCA |
|
import joblib |
|
|
|
|
|
# Load the trained classifier head. NOTE(review): torch.load unpickles
# arbitrary Python objects — only load trusted checkpoint files; presumably
# 'ann_model.pth' stores the full nn.Module (not just a state_dict) — confirm.
ann_model = torch.load('ann_model.pth')

# PCA transform fitted on flattened VGG16 feature vectors (see
# preprocess_single_audio_vgg16). joblib.load also unpickles — trusted files only.
pca = joblib.load('pca.pkl')




# ImageNet-pretrained VGG16, convolutional layers only (`.features`); used as a
# fixed feature extractor — the classification is done by ann_model downstream.
vgg16 = models.vgg16(pretrained=True).features
|
|
|
|
|
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    """Convert one audio file into a PCA-reduced VGG16 feature tensor.

    Pipeline: waveform -> log-mel spectrogram -> z-score normalization ->
    resize to 128x128 -> replicate to 3 channels -> VGG16 conv features ->
    flatten -> PCA.

    Args:
        audio_file: Path to an audio file readable by librosa.
        vgg16_model: Torchvision VGG16 `.features` module used as extractor.
        pca_instance: Fitted sklearn PCA that reduces the flattened features.

    Returns:
        torch.FloatTensor of shape (1, n_pca_components) for the downstream
        ANN classifier.
    """
    # Keep the file's native sample rate (sr=None); 128 mel bands match the
    # 128x128 target below.
    y, sr = librosa.load(audio_file, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Z-score normalize. Guard against zero variance (e.g. a silent clip),
    # which would otherwise divide by zero and fill the input with NaNs.
    std = np.std(log_mel_spec)
    if std == 0:
        std = 1.0
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / std

    # Resize to the fixed spatial size the extractor was used with in training.
    target_shape = (128, 128)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0],
         target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest',
    )

    # VGG16 expects 3 input channels: replicate the single-channel spectrogram.
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # HWC -> CHW, add batch dimension, cast to float32.
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Inference only: freeze dropout/batchnorm behavior and skip grad tracking.
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the conv feature map into one row vector for PCA.
    # (.detach() is unnecessary: tensors created under no_grad carry no graph.)
    features_np = features.squeeze().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)

    features_pca = pca_instance.transform(features_flattened)

    return torch.from_numpy(features_pca).float()
|
|
|
def predict(audio_file):
    """Classify the language spoken in an uploaded audio file.

    Args:
        audio_file: Value delivered by the Gradio file input — a tempfile-like
            object exposing `.name`, or a plain path string depending on the
            Gradio version.

    Returns:
        The predicted language label (a key of `language_mapping`).
    """
    # Accept both file-like objects and plain path strings: newer Gradio
    # versions may pass the filepath directly, where `.name` would fail.
    path = getattr(audio_file, 'name', audio_file)
    preprocessed_features = preprocess_single_audio_vgg16(path, vgg16, pca)

    # Inference only: no gradient bookkeeping needed.
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        # argmax over class logits -> predicted class index.
        _, predicted_class = torch.max(output, 1)

    # NOTE(review): `language_mapping` (label -> class index) is not defined
    # anywhere in this file; it must be defined/imported before predict() runs
    # or this line raises NameError — confirm against the training script.
    index_to_label = {v: k for k, v in language_mapping.items()}

    return index_to_label[predicted_class.item()]
|
|
|
# Wire the classifier into a minimal Gradio UI: file upload in, text label out.
iface = gr.Interface(fn=predict, inputs="file", outputs="text")

# Start the local web server (blocking call).
iface.launch()
|
|