voice_clone_detection

Runtime error

App Files Files Community

Kabatubare committed on Mar 13, 2024

Commit

637d0ca

verified ·

1 Parent(s): 15eca51

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -55

app.py CHANGED Viewed

@@ -1,77 +1,80 @@
 import librosa
 import numpy as np
 import torch
 import logging
-from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
-import gradio as gr
 logging.basicConfig(level=logging.INFO)
-# Directory that both the Wav2Vec2 model and its processor are loaded from
-model_path = "./wav2vec2-sequence-classification"
-try:
-    model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
-    processor = Wav2Vec2Processor.from_pretrained(model_path)
-    logging.info("Model and processor loaded successfully.")
-except Exception as e:
-    # Log the failure, then re-raise so a broken model directory stops the app
-    # at start-up instead of being silently ignored
-    logging.error(f"Loading model and processor failed: {e}")
-    raise e
-def preprocess_audio(file_path):
     """
-    Load an audio file with librosa and return it at a 16 kHz sample rate.
-
-    Args:
-        file_path: Path of the audio file, passed straight to ``librosa.load``.
-
-    Returns:
-        A ``(audio, sr)`` tuple: the loaded waveform and its sample rate,
-        which is always 16000 by the time the function returns.
     """
-    # Load the audio file using librosa; sr=None keeps the file's native rate
-    audio, sr = librosa.load(file_path, sr=None)
-    # Resample the audio to 16 kHz (if not already at this sample rate)
-    if sr != 16000:
-        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-        sr = 16000
-    return audio, sr
-def audio_to_features(audio, sr):
-    """
-    Convert audio waveform to model features.
-
-    Args:
-        audio: Waveform to encode.
-        sr: Sample rate of ``audio``; forwarded to the processor as
-            ``sampling_rate``.
-
-    Returns:
-        The ``input_values`` entry of the processor output, requested as
-        PyTorch tensors via ``return_tensors="pt"``.
-    """
-    # Use the processor to prepare the features for the model
-    return processor(audio, sampling_rate=sr, return_tensors="pt", padding=True, truncation=True).input_values
-def classify_audio(file_path):
     """
-    Classify the content of the audio file.
-
-    Args:
-        file_path: Path of the audio file to classify.
-
-    Returns:
-        A string with one "label: probability" line per entry of the
-        hard-coded label list, or a "Classification error: ..." message if
-        any step raises.
     """
     try:
-        audio, sr = preprocess_audio(file_path)
-        input_values = audio_to_features(audio, sr)
-        # Inference
         with torch.no_grad():
-            logits = model(input_values).logits
-        # Post-processing: Convert logits to softmax to get probabilities
-        probabilities = torch.softmax(logits, dim=1).detach().numpy()
-        # Assuming you have a binary classification model for simplicity
-        # Modify this part based on your actual number of classes and labels
-        labels = ['Class 0', 'Class 1']  # Example labels
-        # zip() stops at the shorter sequence, so any class beyond the two
-        # labels above is left out of the output. Only the first batch row
-        # (probabilities[0]) is reported.
-        predictions = dict(zip(labels, probabilities[0]))
-        # Format the prediction output
-        prediction_output = "\n".join([f"{label}: {prob:.4f}" for label, prob in predictions.items()])
-        return prediction_output
     except Exception as e:
-        # Errors are logged and returned as text so the UI shows a message
-        # instead of failing
-        logging.error(f"Error during classification: {e}")
-        return f"Classification error: {e}"
-# Gradio interface: one uploaded audio file in, plain text out
 iface = gr.Interface(
-    fn=classify_audio,
-    # type="filepath" hands classify_audio a path string, which is what it
-    # forwards to preprocess_audio
-    inputs=gr.inputs.Audio(source="upload", type="filepath"),
-    outputs="text",
-    title="Audio Classification with Wav2Vec2",
-    description="Upload an audio file to classify its content using a Wav2Vec2 model."
 )
-# Launch the interface only when this file is executed as a script
-if __name__ == "__main__":
-    iface.launch()

+import gradio as gr
 import librosa
 import numpy as np
 import torch
+import torch.nn.functional as F
 import logging
+from transformers import AutoModelForAudioClassification
+# Emit INFO-level (and higher) log records for debugging and information
 logging.basicConfig(level=logging.INFO)
+# Model loading from the specified local path. "./" is resolved against the
+# process's working directory, so the model files are expected to sit in the
+# directory the app is started from.
+local_model_path = "./"
+model = AutoModelForAudioClassification.from_pretrained(local_model_path)
+def custom_feature_extraction(audio_file_path, sr=16000, n_mels=128, n_fft=2048, hop_length=512, target_length=1024):
     """
+    Turn an audio file into a fixed-width, dB-scaled Mel spectrogram tensor.
+
+    Args:
+        audio_file_path: Path to the audio file to load.
+        sr: Sampling rate the audio is loaded at.
+        n_mels: Number of Mel bands in the spectrogram.
+        n_fft: FFT window length.
+        hop_length: Number of samples between successive frames.
+        target_length: Number of time frames the result is truncated or
+            padded to.
+
+    Returns:
+        A float tensor with a leading batch dimension of 1, followed by the
+        Mel-band axis and a time axis of exactly ``target_length`` frames.
     """
+    samples, rate = librosa.load(audio_file_path, sr=sr)
+    mel_power = librosa.feature.melspectrogram(
+        y=samples, sr=rate, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
+    )
+    # dB values are taken relative to the loudest bin (ref=np.max)
+    mel_db = torch.tensor(librosa.power_to_db(mel_power, ref=np.max)).float()
+    # Force the time axis (dim 1) to exactly target_length frames
+    missing_frames = target_length - mel_db.shape[1]
+    if missing_frames < 0:
+        mel_db = mel_db[:, :target_length]  # too long: keep the leading frames
+    elif missing_frames > 0:
+        # too short: right-pad with the constant 0
+        # NOTE(review): 0 dB is the peak on this scale, not silence - confirm
+        # this matches how the model was trained
+        mel_db = F.pad(mel_db, (0, missing_frames), "constant", 0)
+    # Leading batch dimension, as expected by the model call
+    return mel_db.unsqueeze(0)
+def predict_voice(audio_file_path):
     """
+    Classify an audio file and describe the outcome in one sentence.
+
+    Args:
+        audio_file_path: Path to the audio file for prediction. May be None
+            or empty when the form is submitted without an upload.
+
+    Returns:
+        A string containing the predicted class and confidence level, or an
+        "Error during processing: ..." message if no file was given or any
+        step failed. The function never raises.
     """
+    # Gradio passes None when nothing was uploaded; report that directly
+    # instead of letting the audio loader fail with an unrelated error
+    if not audio_file_path:
+        result = "Error during processing: no audio file was provided."
+        logging.error(result)
+        return result
     try:
+        features = custom_feature_extraction(audio_file_path)
         with torch.no_grad():
+            outputs = model(features)
+        # One softmax over the class axis of the single batch row gives both
+        # the winning class and its probability, so the two cannot disagree
+        probabilities = torch.softmax(outputs.logits, dim=-1)[0]
+        confidence, predicted_index = probabilities.max(dim=-1)
+        label = model.config.id2label[predicted_index.item()]
+        result = f"The voice is classified as '{label}' with a confidence of {confidence.item() * 100:.2f}%."
+        logging.info("Prediction successful.")
     except Exception as e:
+        result = f"Error during processing: {e}"
+        # logging.exception records the traceback along with the message
+        logging.exception(result)
+    return result
+# Setting up the Gradio interface: one uploaded audio file in, one text
+# prediction out
 iface = gr.Interface(
+    fn=predict_voice,
+    # type="filepath" hands predict_voice a path string rather than raw samples
+    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
+    outputs=gr.Textbox(label="Prediction"),
+    title="Voice Authenticity Detection",
+    description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results."
 )
+# Launch only when executed as a script, so importing this module (for tests
+# or reuse) does not start a web server
+if __name__ == "__main__":
+    iface.launch()