import gradio as gr
import torch
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
from sklearn.decomposition import PCA
import joblib
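
# NOTE: when this runs as a Hugging Face Space, the third-party dependencies
# above (torch, torchvision, librosa, scipy, scikit-learn, joblib) are
# typically declared in a requirements.txt next to this app.py.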
# Load the trained ANN classifier and the fitted PCA instance.
# map_location='cpu' keeps the load working on CPU-only hardware.
ann_model = torch.load('ann_model.pth', map_location=torch.device('cpu'))
pca = joblib.load('pca.pkl')
# Use the convolutional layers of a pretrained VGG16 as a fixed feature extractor
vgg16 = models.vgg16(pretrained=True).features
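
# NOTE (assumption): `language_mapping` is used by predict() below but was never
# defined in the original file, which would raise a NameError at inference time.
# The labels here are placeholders; replace them with the exact label-to-index
# mapping used when the ANN was trained.
language_mapping = {'english': 0, 'hindi': 1, 'tamil': 2}  # placeholder classes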
# Function to load and preprocess a single audio file
def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
    # Load and preprocess the audio file
    y, sr = librosa.load(audio_file, sr=None)  # Load audio at its native sampling rate
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize the mel spectrogram to the target shape (128, 128) using zoom
    target_shape = (128, 128)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest',
    )

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Add a batch dimension and move channels first: (1, 3, 128, 128)
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract features using the frozen VGG16 backbone
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the feature map into a single row vector
    features_np = features.squeeze().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)

    # Apply the PCA transformation fitted at training time
    features_pca = pca_instance.transform(features_flattened)

    # Convert back to a PyTorch tensor for the classifier
    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor
def predict(audio_file):
    # Depending on the Gradio version, the file input arrives either as a
    # tempfile-like object (with .name) or as a plain filepath string.
    audio_path = audio_file.name if hasattr(audio_file, 'name') else audio_file

    # Preprocess the audio file
    preprocessed_features = preprocess_single_audio_vgg16(audio_path, vgg16, pca)

    # Make a prediction
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map the predicted class index back to its language label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]
    return predicted_label
iface = gr.Interface(fn=predict, inputs="file", outputs="text")
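# launch() serves the demo locally (by default at http://127.0.0.1:7860);
# on Hugging Face Spaces the interface is hosted automatically.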
iface.launch()