import gradio as gr
import librosa
import numpy as np
import torch
import logging
from transformers import AutoModelForAudioClassification
logging.basicConfig(level=logging.INFO)
# Load the fine-tuned audio-classification checkpoint from the Space's root
# directory (config.json and the weight files are expected alongside this script).
model_path = "./"
model = AutoModelForAudioClassification.from_pretrained(model_path)
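# Optional sanity check (an addition, not part of the original app): log the
# checkpoint's label mapping so the id2label lookup below is easy to verify.
logging.info("Label mapping: %s", model.config.id2label)
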
def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512, target_length=512):
    # Load the audio and resample it to the target rate.
    y, sr = librosa.load(audio_path, sr=sr)
    # Apply a fixed pitch shift and time stretch. Despite the "augment" name,
    # these run at inference time too, so the model is assumed to have been
    # trained on inputs transformed the same way.
    y_pitch_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
    y_time_stretched = librosa.effects.time_stretch(y_pitch_shifted, rate=1.2)
    # Extract a stack of spectral features; tonnetz is computed on the
    # harmonic component of the signal.
    mfcc = librosa.feature.mfcc(y=y_time_stretched, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    chroma = librosa.feature.chroma_stft(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mel = librosa.feature.melspectrogram(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    contrast = librosa.feature.spectral_contrast(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_time_stretched), sr=sr)
    # Stack the features row-wise and normalize each row to zero mean and unit
    # variance; the small epsilon guards against division by zero on constant
    # (e.g. silent) feature rows.
    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
    features_normalized = (features - np.mean(features, axis=1, keepdims=True)) / (np.std(features, axis=1, keepdims=True) + 1e-8)
    # Truncate or zero-pad along the time axis to a fixed length.
    if features_normalized.shape[1] > target_length:
        features_normalized = features_normalized[:, :target_length]
    else:
        padding = target_length - features_normalized.shape[1]
        features_normalized = np.pad(features_normalized, ((0, 0), (0, padding)), 'constant')
    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Add batch dimension
    return features_tensor
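
# Note on shape (derived from librosa defaults, not stated in the original):
# the stack has 40 MFCC + 12 chroma + 128 mel + 7 spectral-contrast +
# 6 tonnetz rows, so the returned tensor is (1, 193, 512); the checkpoint is
# assumed to expect that input shape.
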
def predict_voice(audio_file_path):
    try:
        features_tensor = augment_and_extract_features(audio_file_path)
        # Run the classifier without tracking gradients.
        with torch.no_grad():
            outputs = model(features_tensor)
        logits = outputs.logits
        # Map the highest-scoring class index to its human-readable label.
        predicted_index = logits.argmax()
        label = model.config.id2label[predicted_index.item()]
        confidence = torch.softmax(logits, dim=1).max().item() * 100
        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
        logging.info("Prediction successful.")
    except Exception as e:
        result = f"Error during processing: {e}"
        logging.error(result)
    return result
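
# The function can also be called directly, outside Gradio, with a path to a
# local file (hypothetical example):
#   print(predict_voice("sample.wav"))
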
iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=gr.Textbox(label="Prediction"),
    title="Voice Authenticity Detection",
    description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results."
)
iface.launch()
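# Inside a Hugging Face Space the default launch() is sufficient; when running
# locally, iface.launch(share=True) would also create a temporary public link.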