voice_clone_detection

Runtime error

App Files Files Community

Kabatubare committed on Mar 13, 2024

Commit

d75aa1b

verified ·

1 Parent(s): 14693fc

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -11

app.py CHANGED Viewed

@@ -2,33 +2,48 @@ import gradio as gr
 import librosa
 import numpy as np
 import torch
-import torch.nn.functional as F
 import logging
 from transformers import AutoModelForAudioClassification
-# Configure logging
 logging.basicConfig(level=logging.INFO)
-# Model loading
 local_model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(local_model_path)
-def custom_feature_extraction(audio_file_path, sr=16000, n_mels=128, n_fft=2048, hop_length=512, target_length=1024):
     waveform, sample_rate = librosa.load(audio_file_path, sr=sr)
     S = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
     S_DB = librosa.power_to_db(S, ref=np.max)
     S_DB_tensor = torch.tensor(S_DB).float().unsqueeze(0)  # Add batch dimension
-    # Resizing the tensor to match the model's expected input size
-    S_DB_tensor_resized = F.interpolate(S_DB_tensor, size=(n_mels, target_length), mode='nearest')
-    return S_DB_tensor_resized
 def predict_voice(audio_file_path):
     try:
         features = custom_feature_extraction(audio_file_path)
         with torch.no_grad():
-            outputs = model(features)
         logits = outputs.logits
         predicted_index = logits.argmax()
@@ -36,13 +51,14 @@ def predict_voice(audio_file_path):
         confidence = torch.softmax(logits, dim=1).max().item() * 100
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
-        logging.info(f"Prediction: {result}")
     except Exception as e:
         result = f"Error during processing: {e}"
         logging.error(result)
     return result
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
@@ -51,4 +67,5 @@ iface = gr.Interface(
     description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results."
 )
-iface.launch()

 import librosa
 import numpy as np
 import torch
 import logging
 from transformers import AutoModelForAudioClassification
+# Configure logging for debugging and information
 logging.basicConfig(level=logging.INFO)
+# Model loading from the specified local path
 local_model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(local_model_path)
+def custom_feature_extraction(audio_file_path, sr=16000, n_mels=128, n_fft=2048, hop_length=512):
+    """
+    Custom feature extraction using Mel spectrogram, tailored for models trained on datasets like AudioSet.
+    Args:
+        audio_file_path: Path to the audio file for prediction.
+        sr: Target sampling rate for the audio file.
+        n_mels: Number of Mel bands to generate.
+        n_fft: Length of the FFT window.
+        hop_length: Number of samples between successive frames.
+    Returns:
+        A tensor representation of the Mel spectrogram features.
+    """
     waveform, sample_rate = librosa.load(audio_file_path, sr=sr)
     S = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
     S_DB = librosa.power_to_db(S, ref=np.max)
     S_DB_tensor = torch.tensor(S_DB).float().unsqueeze(0)  # Add batch dimension
+    return S_DB_tensor
 def predict_voice(audio_file_path):
+    """
+    Predicts the audio class using a pre-trained model and custom feature extraction.
+    Args:
+        audio_file_path: Path to the audio file for prediction.
+    Returns:
+        A string containing the predicted class and confidence level.
+    """
     try:
         features = custom_feature_extraction(audio_file_path)
         with torch.no_grad():
+            # Adjust the model prediction line if necessary to match your model's expected input
+            outputs = model(inputs=features)
         logits = outputs.logits
         predicted_index = logits.argmax()
         confidence = torch.softmax(logits, dim=1).max().item() * 100
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
+        logging.info("Prediction successful.")
     except Exception as e:
         result = f"Error during processing: {e}"
         logging.error(result)
     return result
+# Setting up the Gradio interface
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
     description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results."
 )
+# Launching the interface
+iface.launch()