Update app.py
app.py CHANGED
@@ -1,23 +1,25 @@
 import gradio as gr
 from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 import torch
-from torch.nn.functional import softmax
 import librosa
-import
+import numpy as np
 
 local_model_path = "./"
 extractor = AutoFeatureExtractor.from_pretrained(local_model_path)
 model = AutoModelForAudioClassification.from_pretrained(local_model_path)
 
 def preprocess_audio(audio_file_path, target_sample_rate=16000):
+    # Load the audio file, ensuring mono conversion
     waveform, _ = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)
+    # Normalizing waveform to be between -1 and 1
+    waveform = librosa.util.normalize(waveform)
     return waveform, target_sample_rate
 
 def predict_voice(audio_file_path):
     try:
-        # In Hugging Face Spaces, uploaded files are temporarily stored in a way that's accessible
-        # to the app, so there's no need for a strict path check here.
         waveform, sample_rate = preprocess_audio(audio_file_path)
+        # Ensure waveform is a float32 array
+        waveform = waveform.astype(np.float32)
         inputs = extractor(waveform, return_tensors="pt", sampling_rate=sample_rate)
 
         with torch.no_grad():
@@ -26,11 +28,12 @@ def predict_voice(audio_file_path):
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
-        confidence = softmax(logits, dim=1).max().item() * 100
+        confidence = torch.softmax(logits, dim=1).max().item() * 100
 
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
     except Exception as e:
-
+        # Improved error handling for debugging
+        result = f"Error during processing: {e}"
 
     return result
 
@@ -43,3 +46,4 @@ iface = gr.Interface(
 )
 
 iface.launch()
+
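For reference, the confidence computation this commit switches to (dropping the removed `from torch.nn.functional import softmax` in favor of `torch.softmax`) can be checked in isolation. This is a minimal sketch, not part of the commit: the logits below are made-up values for a hypothetical two-class classifier, used only to illustrate the arithmetic.

import torch

# Made-up logits for one clip over two hypothetical classes (e.g. real vs. fake).
logits = torch.tensor([[2.0, 0.5]])

predicted_index = logits.argmax()  # index of the highest logit
# Softmax turns logits into probabilities; the max is the top-class probability.
confidence = torch.softmax(logits, dim=1).max().item() * 100

print(predicted_index.item(), f"{confidence:.2f}%")  # -> 0 81.76%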