import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load


# Assumes the trained 'ann_model' weights and the fitted 'pca' instance from the training script are available on disk.
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3, 'kannada': 4, 'telugu': 5}

class ANNModel(nn.Module):
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 128)  # input: 300 PCA components
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 6)  # output: one logit per language

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

# Create an instance of your model
ann_model = ANNModel()

# Load the trained model weights (map to CPU in case they were saved on a GPU)
ann_model.load_state_dict(torch.load('ann_model.pth', map_location=torch.device('cpu')))

# Load the PCA instance
pca = load('pca.pkl')
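# Note: 'pca.pkl' is assumed to hold a fitted scikit-learn PCA that reduces the flattened
# VGG16 feature map to 300 components, matching the fc1 input size of ANNModel above.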

vgg16 = models.vgg16(pretrained=True).features  # use only the convolutional feature extractor
# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sampling rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(norm_mel_spec, (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]), mode='nearest')

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert the preprocessed audio data into a format suitable for the VGG16 model
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()  # Add batch dimension and change channel order

    # Extract features using VGG16
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Convert the features to numpy array and flatten them
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)

    # Apply PCA transformation
    features_pca = pca_instance.transform(features_flattened)

    # Convert to PyTorch tensor
    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor

def predict_language(audio_file_path):
    # Preprocess the audio file and extract PCA-reduced VGG16 features
    preprocessed_features = preprocess_single_audio_vgg16(audio_file_path, vgg16, pca)

    # Make predictions using the trained model
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map predicted class index to actual label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]

    return predicted_label

# An Audio input hands the function a file path that librosa can load directly
iface = gr.Interface(fn=predict_language, inputs=gr.Audio(type="filepath"), outputs="text")
iface.launch()
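
# Minimal sketch of calling the predictor directly, without the Gradio UI.
# 'sample_clip.wav' is a hypothetical local audio file used only for illustration.
# if __name__ == "__main__":
#     print(predict_language("sample_clip.wav"))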