import gradio as gr
import torch
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
from sklearn.decomposition import PCA
import joblib
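
# NOTE: when this runs as a Hugging Face Space, the third-party dependencies
# above (torch, torchvision, librosa, scipy, scikit-learn, joblib) are
# typically declared in a requirements.txt next to this app.py.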
# Load the trained ANN classifier and the fitted PCA instance.
# map_location='cpu' keeps the load working on CPU-only hardware.
ann_model = torch.load('ann_model.pth', map_location=torch.device('cpu'))
pca = joblib.load('pca.pkl')
# Use the convolutional layers of a pretrained VGG16 as a fixed feature extractor
vgg16 = models.vgg16(pretrained=True).features
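
# NOTE (assumption): `language_mapping` is used by predict() below but was never
# defined in the original file, which would raise a NameError at inference time.
# The labels here are placeholders; replace them with the exact label-to-index
# mapping used when the ANN was trained.
language_mapping = {'english': 0, 'hindi': 1, 'tamil': 2}  # placeholder classes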
# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    # Load and preprocess the audio file
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sampling rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize the mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest',
    )

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Add a batch dimension and move channels first: (1, 3, 128, 128)
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features using the frozen VGG16 backbone
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the feature map into a single row vector
    features_np = features.squeeze().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)

    # Apply the PCA transformation fitted at training time
    features_pca = pca_instance.transform(features_flattened)

    # Convert back to a PyTorch tensor for the classifier
    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor
def predict(audio_file):
    # Depending on the Gradio version, the file input arrives either as a
    # tempfile-like object (with .name) or as a plain filepath string.
    audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

    # Preprocess the audio file
    preprocessed_features = preprocess_single_audio_vgg16(audio_path, vgg16, pca)

    # Make a prediction
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]
    return predicted_label
iface = gr.Interface(fn=predict, inputs="file", outputs="text")
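# launch() serves the demo locally (by default at http://127.0.0.1:7860);
# on Hugging Face Spaces the interface is hosted automatically.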
iface.launch()