Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -3,80 +3,96 @@ import librosa
  import numpy as np
  import torch
  import logging
- from transformers import
- from pydub import AudioSegment
- import os
- import tempfile
- import soundfile as sf

- #
  logging.basicConfig(level=logging.INFO)

- # Load model
  model_path = "./"
-
-

-
-
-
-
-
-
-

- def predict_audio_class(audio_file_path):
-     """
-     Predict the class of the input audio file using Wav2Vec 2.0 model.
-     """
      try:
-         # Preprocess
-
-
-
-
-
          # Predict
          with torch.no_grad():
-             outputs = model(
-
          logits = outputs.logits
-         predicted_index = logits.argmax(
-         confidence = torch.softmax(logits, dim=1).max().item() * 100
          label = model.config.id2label[predicted_index]
-
-         return f"
      except Exception as e:
-         logging.error(f"
-         return "
-
- def save_temp_audio(file):
-     """
-     Saves a temporary audio file, returns the path.
-     """
-     temp_dir = tempfile.gettempdir()
-     temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir, suffix=".wav")
-     temp_file_path = temp_file.name
-     # Convert to WAV for consistency
-     AudioSegment.from_file(file).export(temp_file_path, format="wav")
-     return temp_file_path
-
- def handle_audio_input(file_info):
-     """
-     Handles the input audio file for prediction.
-     """
-     audio_file_path = save_temp_audio(file_info)
-     prediction = predict_audio_class(audio_file_path)
-     os.unlink(audio_file_path)  # Clean up temp file
-     return prediction

- #
- iface = gr.Interface(
-
-     inputs=gr.inputs.Audio(source="upload", type="file", label="Upload Audio"),
-     outputs="text",
-     title="Audio Class Prediction",
-     description="Predicts the class of uploaded audio files using a fine-tuned Wav2Vec 2.0 model."
- )

-

  import numpy as np
  import torch
  import logging
+ from transformers import AutoModelForAudioClassification
+ from torch.nn.functional import interpolate

+ # Set up logging to help diagnose issues and track progress
  logging.basicConfig(level=logging.INFO)

+ # Load the pretrained model for audio classification
  model_path = "./"
+ try:
+     model = AutoModelForAudioClassification.from_pretrained(model_path)
+     logging.info("Model loaded successfully.")
+ except Exception as e:
+     logging.error(f"Failed to load model: {e}")

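One caveat on the load block above, not part of the commit: if from_pretrained raises, the except branch only logs, so `model` is never bound and the first prediction later fails with a NameError instead of a clear message. A minimal hedged variant that fails fast instead:

    # Sketch only (a suggestion, not the committed code): re-raise after
    # logging so the Space fails at startup with a clear traceback rather
    # than later with an undefined `model`.
    try:
        model = AutoModelForAudioClassification.from_pretrained(model_path)
        logging.info("Model loaded successfully.")
    except Exception as e:
        logging.error(f"Failed to load model: {e}")
        raise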
+ # Function to preprocess audio file
+ def preprocess_audio(audio_path, target_sr=16000):
+     try:
+         y, sr = librosa.load(audio_path, sr=target_sr)
+         logging.info("Audio file loaded and resampled.")
+         return y, sr
+     except Exception as e:
+         logging.error(f"Error in audio preprocessing: {e}")
+         return None, None
+
+ # Function to extract features from audio
+ def extract_features(y, sr, n_mfcc=40, n_fft=2048, hop_length=512):
+     try:
+         mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+         logging.info("MFCC features extracted.")
+         return mfcc
+     except Exception as e:
+         logging.error(f"Error extracting MFCC features: {e}")
+         return None
+
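For orientation, a hedged sketch of how the two helpers above compose; "sample.wav" is a placeholder path, not a file in this repo. With hop_length=512 and librosa's default centered framing, the MFCC matrix has 1 + len(y) // 512 time frames:

    # Placeholder usage of the committed helpers on an arbitrary local file.
    y, sr = preprocess_audio("sample.wav")   # mono waveform resampled to 16 kHz
    mfcc = extract_features(y, sr)           # ndarray of shape (40, n_frames)
    print(mfcc.shape)                        # n_frames == 1 + len(y) // 512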
+ # Function to normalize and pad features
+ def normalize_and_pad_features(mfcc, target_size=512):
+     try:
+         # Normalize features
+         mfcc_normalized = (mfcc - np.mean(mfcc, axis=1, keepdims=True)) / np.std(mfcc, axis=1, keepdims=True)
+         logging.info("Features normalized.")
+
+         # Pad features
+         if mfcc_normalized.shape[1] < target_size:
+             padding = target_size - mfcc_normalized.shape[1]
+             mfcc_padded = np.pad(mfcc_normalized, ((0, 0), (0, padding)), 'constant')
+             logging.info("Features padded.")
+         else:
+             mfcc_padded = mfcc_normalized[:, :target_size]
+         return mfcc_padded
+     except Exception as e:
+         logging.error(f"Error in normalization and padding: {e}")
+         return None

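The helper above always returns a fixed (n_mfcc, 512) matrix: clips shorter than 512 frames are zero-padded along the time axis, longer ones truncated. A small self-contained check (toy data, not from the Space):

    # Toy inputs purely to illustrate the fixed output shape.
    short = np.random.randn(40, 100)   # under 512 frames -> zero-padded
    long_ = np.random.randn(40, 900)   # over 512 frames  -> truncated
    assert normalize_and_pad_features(short).shape == (40, 512)
    assert normalize_and_pad_features(long_).shape == (40, 512)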
+ # Prediction function
+ def predict_voice(audio_file_path):
      try:
+         # Preprocess and extract features
+         y, sr = preprocess_audio(audio_file_path)
+         if y is None or sr is None:
+             return "Error in audio preprocessing."
+         mfcc = extract_features(y, sr)
+         if mfcc is None:
+             return "Error extracting features."
+         features = normalize_and_pad_features(mfcc)
+         if features is None:
+             return "Error in feature normalization and padding."
+
+         # Convert to tensor and add batch dimension
+         features_tensor = torch.tensor(features).float().unsqueeze(0)
+
+         # Ensure the input tensor matches the model's expected dimensions
+         if features_tensor.dim() == 2:
+             features_tensor = features_tensor.unsqueeze(0)  # Add a channel dimension
+
          # Predict
          with torch.no_grad():
+             outputs = model(features_tensor)
          logits = outputs.logits
+         predicted_index = logits.argmax().item()
          label = model.config.id2label[predicted_index]
+         confidence = torch.softmax(logits, dim=1).max().item() * 100
+         return f"Classified as '{label}' with {confidence:.2f}% confidence."
      except Exception as e:
+         logging.error(f"Prediction error: {e}")
+         return "Error during prediction."

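A caveat on predict_voice: checkpoints loaded through AutoModelForAudioClassification (e.g. Wav2Vec 2.0, as the removed docstring suggests) normally expect raw waveform input_values of shape (batch, samples) produced by the checkpoint's paired feature extractor, not a (1, 1, 40, 512) MFCC tensor, so the model(features_tensor) call is a plausible source of this Space's runtime error unless the checkpoint was specifically trained on MFCC input. A hedged alternative, assuming a waveform-based checkpoint that ships a preprocessor config:

    # Assumption-laden sketch, not the committed code: "sample.wav" is a
    # placeholder, and the checkpoint is assumed to take raw waveforms.
    from transformers import AutoFeatureExtractor

    extractor = AutoFeatureExtractor.from_pretrained(model_path)
    y, _ = librosa.load("sample.wav", sr=extractor.sampling_rate)
    inputs = extractor(y, sampling_rate=extractor.sampling_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    print(model.config.id2label[logits.argmax(-1).item()])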
+ # Gradio interface
+ iface = gr.Interface(fn=predict_voice, inputs=gr.inputs.Audio(type="filepath"), outputs="text",
+                      title="Audio Classification", description="Classify audio files with a pretrained model.")

+ # Launch the Gradio app
+ if __name__ == "__main__":
+     iface.launch()
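Finally, gr.inputs.Audio is the legacy Gradio API: the gr.inputs namespace was deprecated in Gradio 3 and removed in Gradio 4, which is another plausible cause of the "Runtime error" badge at the top of this page. The same interface against the current API, as a hedged sketch (assumes gradio>=3):

    # Same interface on the modern Gradio API; behavior is unchanged.
    import gradio as gr

    iface = gr.Interface(
        fn=predict_voice,
        inputs=gr.Audio(type="filepath"),
        outputs="text",
        title="Audio Classification",
        description="Classify audio files with a pretrained model.",
    )

    if __name__ == "__main__":
        iface.launch()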