Kabatubare committed
Commit 797b845 · verified · 1 Parent(s): 323b26a

Update app.py

Files changed (1): app.py +10 -4
app.py CHANGED
@@ -3,15 +3,15 @@ import librosa
 import numpy as np
 import torch
 import logging
-import soundfile as sf
 from transformers import AutoModelForAudioClassification
+import soundfile as sf
 
 logging.basicConfig(level=logging.INFO)
 
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 
-def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512):
+def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512, target_length=512):
     y, sr = librosa.load(audio_path, sr=sr)
     y_augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
     y_augmented = librosa.effects.time_stretch(y_augmented, rate=1.2)
@@ -25,7 +25,14 @@ def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512):
     features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
     features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
 
-    features_tensor = torch.tensor(features).float().unsqueeze(0)
+    # Reshape the features to match the model's expected input
+    if features.shape[1] > target_length:
+        features = features[:, :target_length]
+    else:
+        padding = target_length - features.shape[1]
+        features = np.pad(features, ((0, 0), (0, padding)), 'constant')
+
+    features_tensor = torch.tensor(features).float().unsqueeze(0)  # Add batch dimension
     return features_tensor
 
 def predict_voice(audio_file_path):
@@ -55,4 +62,3 @@ iface = gr.Interface(
 )
 
 iface.launch()
-
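For context, a minimal sketch of the updated function as a whole. The hunks elide the feature-extraction lines (old lines 18-24), so the librosa.feature.* calls below, and the use of y_augmented as their input, are assumptions inferred from the variable names mfcc, chroma, mel, contrast, and tonnetz; the signature, augmentation, normalization, reshaping, and return are taken from the diff itself.

import librosa
import numpy as np
import torch

def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048,
                                 hop_length=512, target_length=512):
    # Load at the target sample rate, then augment (verbatim from the diff).
    y, sr = librosa.load(audio_path, sr=sr)
    y_augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
    y_augmented = librosa.effects.time_stretch(y_augmented, rate=1.2)

    # Assumed reconstruction of the elided lines: the diff only shows that
    # five arrays named mfcc, chroma, mel, contrast, tonnetz are concatenated.
    mfcc = librosa.feature.mfcc(y=y_augmented, sr=sr, n_mfcc=n_mfcc,
                                n_fft=n_fft, hop_length=hop_length)
    chroma = librosa.feature.chroma_stft(y=y_augmented, sr=sr,
                                         n_fft=n_fft, hop_length=hop_length)
    mel = librosa.feature.melspectrogram(y=y_augmented, sr=sr,
                                         n_fft=n_fft, hop_length=hop_length)
    contrast = librosa.feature.spectral_contrast(y=y_augmented, sr=sr,
                                                 n_fft=n_fft, hop_length=hop_length)
    tonnetz = librosa.feature.tonnetz(y=y_augmented, sr=sr,
                                      hop_length=hop_length)

    # Stack along the feature axis and standardize each row (verbatim from the
    # diff; note a zero-variance row would divide by zero here).
    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)

    # New in this commit: force a fixed time axis by truncating long clips
    # and zero-padding short ones.
    if features.shape[1] > target_length:
        features = features[:, :target_length]
    else:
        padding = target_length - features.shape[1]
        features = np.pad(features, ((0, 0), (0, padding)), 'constant')

    features_tensor = torch.tensor(features).float().unsqueeze(0)  # Add batch dimension
    return features_tensor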
 
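The point of the new truncate-or-pad step is a fixed output shape. Under the defaults in the sketch above (40 MFCCs, 12 chroma bins, 128 mel bands, 7 contrast bands, 6 tonnetz dimensions; all of these except n_mfcc=40 are assumptions), the tensor is always (1, 193, 512) regardless of clip duration:

# "sample.wav" is a placeholder path, not a file from this repo.
t = augment_and_extract_features("sample.wav")
print(t.shape)  # torch.Size([1, 193, 512]) under the row-count assumptions above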
34
+
35
+ features_tensor = torch.tensor(features).float().unsqueeze(0) # Add batch dimension
36
  return features_tensor
37
 
38
  def predict_voice(audio_file_path):
 
62
  )
63
 
64
  iface.launch()
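predict_voice appears in the hunks only as a signature, so the body below is not the author's code; it is a hypothetical sketch of how the fixed-shape tensor would typically be consumed, assuming the checkpoint in model_path accepts the feature tensor positionally and carries an id2label mapping in its config.

def predict_voice(audio_file_path):
    # Sketch only: the actual body is outside the diff hunks.
    features_tensor = augment_and_extract_features(audio_file_path)
    model.eval()
    with torch.no_grad():
        logits = model(features_tensor).logits  # assumes the model takes the tensor directly
    probs = torch.softmax(logits, dim=-1)
    predicted_id = int(probs.argmax(dim=-1))
    label = model.config.id2label[predicted_id]
    return f"{label} ({probs.max().item():.1%} confidence)"

In the diff's tail this function is presumably the fn wired into gr.Interface, which iface.launch() then serves.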