import gradio as gr
import torch
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
from sklearn.decomposition import PCA
import joblib
# Load the trained model and PCA instance
ann_model = torch.load('ann_model.pth')
pca = joblib.load('pca.pkl')
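# NOTE (assumption): `predict` below uses `language_mapping`, a dict mapping
# language names to class indices created during training. It is not defined in
# this file and is assumed to be loaded alongside the model, e.g.:
# language_mapping = joblib.load('language_mapping.pkl')  # hypothetical filename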
# Load VGG16 model
vgg16 = models.vgg16(pretrained=True).features
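# Note: `pretrained=True` is deprecated in torchvision >= 0.13; the equivalent call
# there is models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features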
# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    # Load and preprocess the audio file
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sampling rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize the Mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest'
    )

    # Stack the resized Mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert to a tensor suitable for VGG16: channel-first order plus a batch dimension
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features using VGG16
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Convert the features to a numpy array and flatten them
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)

    # Apply the fitted PCA transformation
    features_pca = pca_instance.transform(features_flattened)

    # Convert to a PyTorch tensor for the ANN classifier
    features_tensor = torch.from_numpy(features_pca).float()

    return features_tensor
def predict(audio_file):
    # Preprocess the audio file
    preprocessed_features = preprocess_single_audio_vgg16(audio_file.name, vgg16, pca)

    # Make a prediction
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]

    return predicted_label
iface = gr.Interface(fn=predict, inputs="file", outputs="text")
iface.launch()
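# Minimal local test sketch (assumes a sample clip 'test_audio.wav' exists and that
# `language_mapping` is defined as noted above); run this instead of launching the UI:
#
#   class _UploadedFile:          # mimics the file object Gradio passes to `predict`
#       name = 'test_audio.wav'   # hypothetical path
#   print(predict(_UploadedFile()))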