File size: 3,871 Bytes
6abefd9 d30bab3 6abefd9 8b10513 c0f1a73 6e4bc1e 5bba7e5 6abefd9 8b10513 d303dbe 3c699d2 6e4bc1e 0b54da7 6e4bc1e 0b54da7 6e4bc1e 0b54da7 6e4bc1e 0b54da7 6e4bc1e 0b54da7 51d0bd5 6e4bc1e c0f1a73 0b54da7 c0f1a73 0b54da7 6e4bc1e c0f1a73 3c699d2 5bba7e5 8b10513 5bba7e5 04813c5 6abefd9 f5f3175 6abefd9 d8ba3b8 5bba7e5 8b10513 ab3968e 5bba7e5 04813c5 ab3968e 5bba7e5 ab3968e a29c937 04b2aaa 8b10513 5bba7e5 6abefd9 8b10513 3c699d2 6abefd9 3c699d2 6abefd9 3c699d2 6abefd9 5bba7e5 3c699d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
import pickle
from joblib import load
import soundfile as sf
# Assuming you already have the 'ann_model' trained and 'pca' instance from the previous code
# Maps each language name to its class index. predict_language() inverts this
# mapping to turn the predicted class index back into a language name.
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2,'hindi':3}
class ANNModel(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    The network is ``input_size -> hidden_size1 -> hidden_size2 ->
    num_classes`` and ends with a softmax over dimension 1, so ``forward``
    returns one row of class probabilities per input row.
    """

    def __init__(self, input_size=300, hidden_size1=256, hidden_size2=64, num_classes=4):
        super().__init__()
        # Attribute names (fc1/fc2/fc3) determine the state_dict keys, so they
        # are kept stable for any saved checkpoint.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return softmax class probabilities for a ``(batch, input_size)`` tensor."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)
# Create an instance of the classifier with its default layer sizes
# (300 -> 256 -> 64 -> 4).
# NOTE(review): no trained weights are loaded anywhere in the visible code
# (no load_state_dict call), so ann_model runs with its random initial
# weights unless a checkpoint is loaded elsewhere -- confirm.
ann_model = ANNModel()
# Load the fitted PCA instance with joblib (joblib.load unpickles the file, so
# it must come from a trusted source).
# NOTE(review): the file is named 'pca_400' while ANNModel defaults to
# input_size=300 -- confirm the PCA output dimension matches the model input.
pca = load('pca_400.pkl')
# Keep only the `.features` sub-module of the pretrained VGG16; it is used as a
# fixed feature extractor in preprocess_single_audio_vgg16.
# NOTE(review): `pretrained=True` is reported as deprecated in newer
# torchvision releases in favour of the `weights=` argument -- verify against
# the installed version.
vgg16 = models.vgg16(pretrained=True).features
# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
    """Convert a waveform into PCA-reduced VGG16 features.

    Pipeline: 128-band mel spectrogram -> dB scale -> standardisation ->
    resize to 224x224 -> replicate into 3 channels -> ``vgg16_model`` ->
    flatten -> ``pca_instance.transform``.

    Args:
        audio_data: Floating-point audio samples passed to librosa as ``y``.
        sr: Sample rate of ``audio_data``.
        vgg16_model: Callable applied to a ``(1, 3, 224, 224)`` float tensor.
        pca_instance: Object whose ``transform`` accepts a ``(1, n)`` array
            (presumably a fitted scikit-learn PCA -- verify against caller).

    Returns:
        A float tensor built from the output of ``pca_instance.transform``.
    """
    mel_spec = librosa.feature.melspectrogram(y=audio_data, sr=sr, n_mels=128)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Standardise to zero mean / unit variance. A constant spectrogram (e.g.
    # silent audio) has zero spread; dividing by it would fill the array with
    # NaN, so fall back to 1.0 and let the result be all zeros instead.
    spread = np.std(log_mel_spec)
    if spread == 0:
        spread = 1.0
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / spread

    # Resize the spectrogram to the 224x224 input size used for VGG16.
    target_shape = (224, 224)
    zoom_factors = (
        target_shape[0] / norm_mel_spec.shape[0],
        target_shape[1] / norm_mel_spec.shape[1],
    )
    resized_mel_spec = zoom(norm_mel_spec, zoom_factors, mode='nearest')

    # Replicate the single-channel image into 3 identical channels, then move
    # channels first and add the batch dimension: (1, 3, H, W).
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)
    mel_spec_tensor = (
        torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()
    )

    # Feature extraction only; no gradients are needed.
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten everything into a single row for the PCA transform.
    features_flattened = features.squeeze().numpy().flatten().reshape(1, -1)
    features_pca = pca_instance.transform(features_flattened)

    return torch.from_numpy(features_pca).float()
def predict_language(audio_input):
    """Predict the language spoken in an audio clip.

    Args:
        audio_input: Either a path to an audio file (``str``), which is loaded
            with librosa at 22050 Hz, or a ``(sample_rate, samples)`` tuple
            where ``samples`` is a NumPy array.

    Returns:
        The key of ``language_mapping`` whose class index received the
        highest score from ``ann_model``.

    Raises:
        ValueError: If ``audio_input`` is ``None`` (no audio supplied).
    """
    if audio_input is None:
        raise ValueError("No audio provided.")

    if isinstance(audio_input, str):
        # File path: let librosa decode and resample it.
        audio, sr = librosa.load(audio_input, sr=22050)
    else:
        # (sample_rate, samples) tuple. This branch must be exclusive with the
        # path branch above; unpacking a path string here would fail.
        sr, audio = audio_input
        audio = np.asarray(audio)
        if np.issubdtype(audio.dtype, np.integer):
            # Integer PCM: scale to roughly [-1, 1] so both input paths feed
            # the same amplitude range that librosa.load produces.
            full_scale = float(np.iinfo(audio.dtype).max)
            audio = audio.astype(np.float32) / full_scale
        else:
            audio = audio.astype(np.float32)
        if audio.ndim > 1:
            # Down-mix to mono; assumes a (samples, channels) layout as
            # delivered by the Gradio audio input -- TODO confirm.
            audio = audio.mean(axis=1)
        # NOTE(review): sr is passed through unchanged here, while the path
        # branch resamples to 22050 Hz -- confirm which rate training used.

    # Extract VGG16 + PCA features for this single clip.
    preprocessed_features = preprocess_single_audio_vgg16(audio, sr, vgg16, pca)

    # Score the clip; inference only, so no gradients are needed.
    with torch.no_grad():
        output = ann_model(preprocessed_features)
    predicted_index = int(torch.argmax(output, dim=1).item())

    # Invert language_mapping (name -> index) to look up the label by index.
    index_to_language = {index: name for name, index in language_mapping.items()}
    return index_to_language[predicted_index]
# Gradio UI: one audio input wired to predict_language, with the predicted
# language name shown as text.
# NOTE(review): no `iface.launch()` call is visible in this part of the file
# -- confirm the interface is launched elsewhere.
iface = gr.Interface(fn=predict_language, inputs="audio", outputs="text")