import gradio as gr
import torch
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import joblib

# Load the trained classifier and the fitted PCA instance.
# map_location keeps this working on CPU-only hosts even if the model was saved on GPU.
ann_model = torch.load('ann_model.pth', map_location='cpu')
pca = joblib.load('pca.pkl')

# Load the pretrained VGG16 convolutional stack as a fixed feature extractor
# (`pretrained=True` is deprecated; the weights enum is the current torchvision API)
vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features
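
# Class-index mapping used by `predict` below. NOTE: these entries are placeholder
# assumptions for illustration; replace them with the exact label-to-index mapping
# used when `ann_model` was trained.
language_mapping = {'english': 0, 'hindi': 1, 'spanish': 2}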

# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    # Load and preprocess the audio file
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sampling rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Convert power to decibels (log scale)
    # Standardize; the small epsilon guards against division by zero on silent clips
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / (np.std(log_mel_spec) + 1e-8)

    # Resize the mel spectrogram to the target shape (128, 128) via interpolation
    target_shape = (128, 128)
    zoom_factors = (target_shape[0] / norm_mel_spec.shape[0],
                    target_shape[1] / norm_mel_spec.shape[1])
    resized_mel_spec = zoom(norm_mel_spec, zoom_factors, mode='nearest')

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert to a (1, 3, 128, 128) float tensor: channels-first plus a batch dimension
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features using VGG16
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the pooled feature map into a single row vector for PCA
    features_flattened = features.squeeze().numpy().reshape(1, -1)

    # Project onto the PCA components fitted at training time
    # (the flattened length must match what the PCA was fitted on)
    features_pca = pca_instance.transform(features_flattened)

    # Convert to PyTorch tensor
    features_tensor = torch.from_numpy(features_pca).float()

    return features_tensor
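
# Quick local sanity check (a sketch; assumes a sample file 'sample.wav' exists and
# that the PCA was fitted on features of the same flattened length):
#   feats = preprocess_single_audio_vgg16('sample.wav', vgg16, pca)
#   print(feats.shape)  # torch.Size([1, pca.n_components_])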

def predict(audio_file):
    # Gradio may pass a tempfile-like object (older versions) or a plain filepath
    # string (newer versions); handle both.
    audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

    # Preprocess the audio file into a PCA-reduced feature vector
    preprocessed_features = preprocess_single_audio_vgg16(audio_path, vgg16, pca)

    # Run the classifier on the feature vector
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    index_to_label = {v: k for k, v in language_mapping.items()}
    return index_to_label[predicted_class.item()]

# Build and launch the Gradio UI: upload an audio file, get back the predicted language.
# (launch() serves locally by default; pass share=True for a temporary public link.)
iface = gr.Interface(fn=predict, inputs="file", outputs="text")
iface.launch()