import gradio as gr
import torch
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import joblib

# Load the trained classifier and the fitted PCA instance.
# map_location keeps this working on CPU-only hosts even if the model was saved on GPU.
ann_model = torch.load('ann_model.pth', map_location='cpu')
pca = joblib.load('pca.pkl')

# Load the pretrained VGG16 convolutional stack as a fixed feature extractor
# (`pretrained=True` is deprecated; the weights enum is the current torchvision API)
vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features
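
# Class-index mapping used by `predict` below. NOTE: these entries are placeholder
# assumptions for illustration; replace them with the exact label-to-index mapping
# used when `ann_model` was trained.
language_mapping = {'english': 0, 'hindi': 1, 'spanish': 2}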

# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    # Load and preprocess the audio file
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sampling rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Convert power to decibels (log scale)
    # Standardize; the small epsilon guards against division by zero on silent clips
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / (np.std(log_mel_spec) + 1e-8)

    # Resize the mel spectrogram to the target shape (128, 128) via interpolation
    target_shape = (128, 128)
    zoom_factors = (target_shape[0] / norm_mel_spec.shape[0],
                    target_shape[1] / norm_mel_spec.shape[1])
    resized_mel_spec = zoom(norm_mel_spec, zoom_factors, mode='nearest')

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert to a (1, 3, 128, 128) float tensor: channels-first plus a batch dimension
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features using VGG16
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the pooled feature map into a single row vector for PCA
    features_flattened = features.squeeze().numpy().reshape(1, -1)

    # Project onto the PCA components fitted at training time
    # (the flattened length must match what the PCA was fitted on)
    features_pca = pca_instance.transform(features_flattened)

    # Convert to PyTorch tensor
    features_tensor = torch.from_numpy(features_pca).float()

    return features_tensor
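
# Quick local sanity check (a sketch; assumes a sample file 'sample.wav' exists and
# that the PCA was fitted on features of the same flattened length):
#   feats = preprocess_single_audio_vgg16('sample.wav', vgg16, pca)
#   print(feats.shape)  # torch.Size([1, pca.n_components_])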

def predict(audio_file):
    # Gradio may pass a tempfile-like object (older versions) or a plain filepath
    # string (newer versions); handle both.
    audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

    # Preprocess the audio file into a PCA-reduced feature vector
    preprocessed_features = preprocess_single_audio_vgg16(audio_path, vgg16, pca)

    # Run the classifier on the feature vector
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    index_to_label = {v: k for k, v in language_mapping.items()}
    return index_to_label[predicted_class.item()]

# Build and launch the Gradio UI: upload an audio file, get back the predicted language.
# (launch() serves locally by default; pass share=True for a temporary public link.)
iface = gr.Interface(fn=predict, inputs="file", outputs="text")
iface.launch()