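# Gradio Space: detects whether an uploaded voice recording is real or AI-generated.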
import gradio as gr
import librosa
import numpy as np
import torch
import logging
from transformers import AutoModelForAudioClassification

logging.basicConfig(level=logging.INFO)
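
# Load the fine-tuned audio-classification model from the Space's root directory.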
model_path = "./"
model = AutoModelForAudioClassification.from_pretrained(model_path)


def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512, target_length=512):
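    """Load an audio clip, apply fixed augmentation, and return a normalized
    feature tensor of shape (1, n_features, target_length)."""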
    y, sr = librosa.load(audio_path, sr=sr)
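    # Augment the signal: raise the pitch by 4 semitones, then speed it up by 20%.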
    y_pitch_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
    y_time_stretched = librosa.effects.time_stretch(y_pitch_shifted, rate=1.2)
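    # Extract complementary spectral features from the augmented signal.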
    mfcc = librosa.feature.mfcc(y=y_time_stretched, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    chroma = librosa.feature.chroma_stft(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mel = librosa.feature.melspectrogram(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    contrast = librosa.feature.spectral_contrast(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_time_stretched), sr=sr)
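    # Stack all feature matrices along the feature axis (rows = feature bins, columns = frames).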
    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
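    # Normalize each feature row to zero mean and unit variance; the small epsilon
    # guards against division by zero on constant (e.g., silent) rows.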
    features_normalized = (features - np.mean(features, axis=1, keepdims=True)) / (np.std(features, axis=1, keepdims=True) + 1e-8)
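    # Truncate or zero-pad along time so every clip yields a fixed-length tensor.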
    if features_normalized.shape[1] > target_length:
        features_normalized = features_normalized[:, :target_length]
    else:
        padding = target_length - features_normalized.shape[1]
        features_normalized = np.pad(features_normalized, ((0, 0), (0, padding)), 'constant')
    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Add batch dimension
    return features_tensor
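

# Classify an uploaded clip and report the predicted label with its confidence.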
def predict_voice(audio_file_path):
    try:
        features_tensor = augment_and_extract_features(audio_file_path)
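        # Run inference without gradient tracking and read off the top class.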
        with torch.no_grad():
            outputs = model(features_tensor)
            logits = outputs.logits
        predicted_index = logits.argmax(dim=-1).item()
        label = model.config.id2label[predicted_index]
        confidence = torch.softmax(logits, dim=-1).max().item() * 100
result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%." | |
logging.info("Prediction successful.") | |
except Exception as e: | |
result = f"Error during processing: {e}" | |
logging.error(result) | |
return result | |
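

# Wire the classifier into a simple Gradio interface: audio-file input, text output.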
iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=gr.Textbox(label="Prediction"),
    title="Voice Authenticity Detection",
    description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results.",
)

iface.launch()