Kabatubare committed
Commit 09e98e6 · verified · 1 Parent(s): 411539a

Update app.py

Files changed (1)
  1. app.py +65 -71
app.py CHANGED
@@ -3,96 +3,90 @@ import librosa
  import numpy as np
  import torch
  import logging
- from transformers import AutoModelForAudioClassification
- from torch.nn.functional import interpolate
+ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 
- # Set up logging to help diagnose issues and track progress
+ # Initialize logging
  logging.basicConfig(level=logging.INFO)
 
- # Load the pretrained model for audio classification
+ # Load the model and feature extractor
  model_path = "./"
  try:
-     model = AutoModelForAudioClassification.from_pretrained(model_path)
-     logging.info("Model loaded successfully.")
+     model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
+     feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
+     logging.info("Model and feature extractor loaded successfully.")
  except Exception as e:
-     logging.error(f"Failed to load model: {e}")
+     logging.error(f"Model loading failed: {e}")
+     raise e
 
- # Function to preprocess audio file
- def preprocess_audio(audio_path, target_sr=16000):
+ def load_audio(audio_path, sr=16000):
+     """
+     Load an audio file and resample it to the target sample rate.
+     """
      try:
-         y, sr = librosa.load(audio_path, sr=target_sr)
+         audio, _ = librosa.load(audio_path, sr=sr)
          logging.info("Audio file loaded and resampled.")
-         return y, sr
+         return audio
      except Exception as e:
-         logging.error(f"Error in audio preprocessing: {e}")
-         return None, None
+         logging.error(f"Failed to load audio: {e}")
+         raise e
 
- # Function to extract features from audio
- def extract_features(y, sr, n_mfcc=40, n_fft=2048, hop_length=512):
+ def preprocess_audio(audio):
+     """
+     Preprocess the audio to the format expected by the Wav2Vec2 model.
+     """
      try:
-         mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
-         logging.info("MFCC features extracted.")
-         return mfcc
+         input_values = feature_extractor(audio, return_tensors="pt", padding="longest", sampling_rate=16000).input_values
+         logging.info("Audio file preprocessed.")
+         return input_values
      except Exception as e:
-         logging.error(f"Error extracting MFCC features: {e}")
-         return None
+         logging.error(f"Audio preprocessing failed: {e}")
+         raise e
 
- # Function to normalize and pad features
- def normalize_and_pad_features(mfcc, target_size=512):
+ def predict(input_values):
+     """
+     Make a prediction with the Wav2Vec2 model.
+     """
      try:
-         # Normalize features
-         mfcc_normalized = (mfcc - np.mean(mfcc, axis=1, keepdims=True)) / np.std(mfcc, axis=1, keepdims=True)
-         logging.info("Features normalized.")
-
-         # Pad features
-         if mfcc_normalized.shape[1] < target_size:
-             padding = target_size - mfcc_normalized.shape[1]
-             mfcc_padded = np.pad(mfcc_normalized, ((0, 0), (0, padding)), 'constant')
-             logging.info("Features padded.")
-         else:
-             mfcc_padded = mfcc_normalized[:, :target_size]
-         return mfcc_padded
+         with torch.no_grad():
+             logits = model(input_values).logits
+         predicted_id = torch.argmax(logits, dim=-1).item()  # .item() yields a plain int usable as a list index
+         logging.info(f"Prediction made with id {predicted_id}")
+         return predicted_id
      except Exception as e:
-         logging.error(f"Error in normalization and padding: {e}")
-         return None
+         logging.error(f"Prediction failed: {e}")
+         raise e
 
- # Prediction function
- def predict_voice(audio_file_path):
+ def get_label(prediction_id):
+     """
+     Convert the prediction ID to a meaningful label.
+     """
+     # Example of converting a predicted id to a label
+     # This should be adapted to your specific model's labels
+     labels = ["label1", "label2"]  # Dummy label list for demonstration
      try:
-         # Preprocess and extract features
-         y, sr = preprocess_audio(audio_file_path)
-         if y is None or sr is None:
-             return "Error in audio preprocessing."
-         mfcc = extract_features(y, sr)
-         if mfcc is None:
-             return "Error extracting features."
-         features = normalize_and_pad_features(mfcc)
-         if features is None:
-             return "Error in feature normalization and padding."
-
-         # Convert to tensor and add batch dimension
-         features_tensor = torch.tensor(features).float().unsqueeze(0)
-
-         # Ensure the input tensor matches the model's expected dimensions
-         if features_tensor.dim() == 2:
-             features_tensor = features_tensor.unsqueeze(0)  # Add a channel dimension
+         label = labels[prediction_id]
+         logging.info(f"Label obtained: {label}")
+         return label
+     except Exception as e:
+         logging.error(f"Failed to get label: {e}")
+         raise e
 
-         # Predict
-         with torch.no_grad():
-             outputs = model(features_tensor)
-             logits = outputs.logits
-             predicted_index = logits.argmax().item()
-             label = model.config.id2label[predicted_index]
-             confidence = torch.softmax(logits, dim=1).max().item() * 100
-             return f"Classified as '{label}' with {confidence:.2f}% confidence."
+ def main(audio_file_path):
+     """
+     Load audio, preprocess, predict, and return the label.
+     """
+     try:
+         audio = load_audio(audio_file_path)
+         input_values = preprocess_audio(audio)
+         prediction_id = predict(input_values)
+         label = get_label(prediction_id)
+         return label
      except Exception as e:
-         logging.error(f"Prediction error: {e}")
-         return "Error during prediction."
+         logging.error(f"Error in processing: {e}")
+         return str(e)
 
- # Gradio interface
- iface = gr.Interface(fn=predict_voice, inputs=gr.inputs.Audio(type="filepath"), outputs="text",
-                      title="Audio Classification", description="Classify audio files with a pretrained model.")
+ # Set up Gradio interface
+ iface = gr.Interface(fn=main, inputs=gr.inputs.Audio(type="filepath"), outputs="text", title="Audio Classification")
 
- # Launch the Gradio app
- if __name__ == "__main__":
-     iface.launch()
+ # Launch the interface
+ iface.launch()
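
Note on the new get_label: it ships with a dummy two-entry label list, and the diff's own comment says to adapt it. The version being removed read label names from model.config.id2label, which still works after this change. A minimal sketch of that variant, reusing the model loaded at the top of app.py and assuming the checkpoint in ./ defines id2label in its config (the {0: "real", 1: "fake"} mapping in the comment is only illustrative):

def get_label(prediction_id):
    """
    Look up the label name in the model config instead of a hard-coded list.
    """
    # model.config.id2label maps integer class ids to label strings,
    # e.g. {0: "real", 1: "fake"} for a two-class audio classifier (illustrative).
    return model.config.id2label[prediction_id]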
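
Because main is a plain function, the rewritten pipeline can be smoke-tested without launching the Gradio UI. A quick sketch; "sample.wav" is a placeholder path, not a file in this repo:

# Local smoke test of the load -> preprocess -> predict -> label pipeline.
# "sample.wav" is a placeholder; point it at any real audio file.
if __name__ == "__main__":
    print(main("sample.wav"))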
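
One caveat on the interface line: gr.inputs.Audio is the legacy Gradio 2/3 namespace and was removed in Gradio 4, where components are top-level classes. If the Space runs a current Gradio, an equivalent construction would be:

import gradio as gr

# Gradio 4.x: gr.inputs / gr.outputs no longer exist; use top-level components.
iface = gr.Interface(
    fn=main,
    inputs=gr.Audio(type="filepath"),  # hands fn a temporary file path
    outputs="text",
    title="Audio Classification",
)

iface.launch()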