import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load

# Assumes the ANN classifier and PCA instance were trained/fitted beforehand and saved as 'ann_model.pth' and 'pca.pkl'
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3, 'kannada': 4, 'telugu': 5}

class ANNModel(nn.Module):
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 128)  # input size matches the number of PCA components
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 6)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

# Create an instance of your model
ann_model = ANNModel()

# Load the trained weights (map_location keeps this working on CPU-only machines)
ann_model.load_state_dict(torch.load('ann_model.pth', map_location='cpu'))

# Load the PCA instance
pca = load('pca.pkl')
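
# Note: 'pca.pkl' is assumed to be a scikit-learn PCA fitted with n_components=300
# (matching the ANN's input size) during training and saved with joblib, e.g.:
#     from sklearn.decomposition import PCA
#     from joblib import dump
#     pca = PCA(n_components=300).fit(train_features)  # train_features: flattened VGG16 features
#     dump(pca, 'pca.pkl')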

# Pre-trained VGG16 convolutional layers, used as a fixed feature extractor
vgg16 = models.vgg16(pretrained=True).features
# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
    # audio_data: raw waveform (1-D numpy array), sr: its sample rate
    y = audio_data
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(norm_mel_spec, (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]), mode='nearest')

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert the preprocessed audio data into a format suitable for the VGG16 model
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()  # Add batch dimension and change channel order

    # Extract features using VGG16
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Convert the features to numpy array and flatten them
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)
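    # For a (128, 128) input, VGG16's conv stack produces a (512, 4, 4) feature map,
    # i.e. 8192 values after flattening; PCA then reduces this to 300 components.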

    # Apply PCA transformation
    features_pca = pca_instance.transform(features_flattened)

    # Convert to PyTorch tensor
    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor

def predict_language(audio_input):
    # Gradio may pass either a file path or a (sample_rate, samples) tuple
    if isinstance(audio_input, str):
        # Load the audio file from disk
        audio, sr = librosa.load(audio_input, sr=None)
    else:
        # Unpack the sample rate and convert the raw samples to float32
        sr, audio = audio_input
        audio = audio.astype(np.float32)

    # Preprocess the single audio file using VGG16 for feature extraction
    preprocessed_features = preprocess_single_audio_vgg16(audio, sr, vgg16, pca)

    # Make predictions using the trained model
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map predicted class index to actual label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]

    return predicted_label

iface = gr.Interface(fn=predict_language, inputs="audio", outputs="text")

iface.launch()
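
# Optional quick test without the UI (hypothetical path; replace with a real clip):
#     print(predict_language("sample.wav"))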