Spaces:

saronium
/

Indian-language-identification-from-audio

Sleeping

saronium committed on Apr 2, 2024

Commit

f5f3175

verified ·

1 Parent(s): 9bccc69

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,9 +16,9 @@ language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2,'hindi':3}
 class ANNModel(nn.Module):
     def __init__(self):
         super(ANNModel, self).__init__()
-        self.fc1 = nn.Linear(300, 128)
         self.relu1 = nn.ReLU()
-        self.fc2 = nn.Linear(128, 64)
         self.relu2 = nn.ReLU()
         self.fc3 = nn.Linear(64, 4)
@@ -34,23 +34,23 @@ class ANNModel(nn.Module):
 ann_model = ANNModel()
 # Load the trained model
-ann_model.load_state_dict(torch.load('ann_model1.pth'))
 # Load the PCA instance
-pca = load('pca1.pkl')
 vgg16 = models.vgg16(pretrained=True).features
 # Function to load and preprocess a single audio file
 def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
     # Your existing preprocessing code goes here
     y= audio_data
-    sr = sr# Load audio
     mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
     log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
     norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize
     # Resize mel spectrogram to the target shape (128, 128) using zoom
-    target_shape = (128, 128)
     resized_mel_spec = zoom(norm_mel_spec, (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]), mode='nearest')
     # Stack the resized mel spectrogram along the third axis to create 3 channels

 class ANNModel(nn.Module):
     def __init__(self):
         super(ANNModel, self).__init__()
+        self.fc1 = nn.Linear(300, 256)
         self.relu1 = nn.ReLU()
+        self.fc2 = nn.Linear(256, 64)
         self.relu2 = nn.ReLU()
         self.fc3 = nn.Linear(64, 4)
 ann_model = ANNModel()
 # Load the trained model
+ann_model.load_state_dict(torch.load('ann_model_256_01_94.pth'))
 # Load the PCA instance
+pca = load('pca_256_01_94.pkl')
 vgg16 = models.vgg16(pretrained=True).features
 # Function to load and preprocess a single audio file
 def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
     # Your existing preprocessing code goes here
     y= audio_data
+    sr = 22050# Load audio
     mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
     log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
     norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize
     # Resize mel spectrogram to target_shape using zoom
+    target_shape = (224, 224)
     resized_mel_spec = zoom(norm_mel_spec, (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]), mode='nearest')
     # Stack the resized mel spectrogram along the third axis to create 3 channels