voice_clone_detection

Runtime error

App Files Files Community

Kabatubare committed on Mar 14, 2024

Commit

af80923

verified ·

1 Parent(s): 14ac9f5

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -23

app.py CHANGED Viewed

@@ -2,43 +2,35 @@ import numpy as np
 import torch
 import librosa
 import gradio as gr
-from transformers import AutoModelForAudioClassification
 import logging
 logging.basicConfig(level=logging.INFO)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 def preprocess_audio(audio_path, sr=16000):
-    # Load the audio file. Note: Adjusting the sample rate (sr) to match the model's expected input
-    audio, sr = librosa.load(audio_path, sr=sr)
-    # Trim silence from the beginning and the end
     audio, _ = librosa.effects.trim(audio)
-    return audio, sr
 def extract_features(audio, sr=16000):
-    # Compute the Mel spectrogram
-    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
-    # Convert to dB scale
-    S_DB = librosa.power_to_db(S, ref=np.max)
-    # Normally, further feature extraction steps would be here. For this model, we will directly use S_DB.
-    return S_DB
 def predict_voice(audio_file_path):
     try:
-        audio, sr = preprocess_audio(audio_file_path)
-        S_DB = extract_features(audio, sr)
-        # Convert S_DB to tensor and add required batch dimension
-        S_DB_tensor = torch.tensor(S_DB).unsqueeze(0)
         with torch.no_grad():
-            outputs = model(S_DB_tensor)
             logits = outputs.logits
-            predicted_index = logits.argmax()
-            label = model.config.id2label[predicted_index.item()]
-            confidence = torch.softmax(logits, dim=1).max().item() * 100
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
         logging.info("Prediction successful.")
@@ -54,6 +46,4 @@ iface = gr.Interface(
     outputs=gr.Text(label="Prediction"),
     title="Voice Authenticity Detection",
     description="This system uses advanced audio processing to detect whether a voice is real or AI-generated. Upload an audio file to see the results."
-)
-iface.launch()

 import torch
 import librosa
 import gradio as gr
+from transformers import AutoModelForAudioClassification, Wav2Vec2Processor
 import logging
 logging.basicConfig(level=logging.INFO)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
+processor = Wav2Vec2Processor.from_pretrained(model_path)
 def preprocess_audio(audio_path, sr=16000):
+    audio, _ = librosa.load(audio_path, sr=sr)
     audio, _ = librosa.effects.trim(audio)
+    return audio
 def extract_features(audio, sr=16000):
+    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+    return inputs
 def predict_voice(audio_file_path):
     try:
+        audio = preprocess_audio(audio_file_path)
+        features = extract_features(audio)
         with torch.no_grad():
+            outputs = model(**features)
             logits = outputs.logits
+            predicted_index = logits.argmax(dim=-1)
+            label = processor.decode(predicted_index)
+            confidence = torch.softmax(logits, dim=-1).max().item() * 100
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
         logging.info("Prediction successful.")
     outputs=gr.Text(label="Prediction"),
     title="Voice Authenticity Detection",
     description="This system uses advanced audio processing to detect whether a voice is real or AI-generated. Upload an audio file to see the results."
+).launch()