import gradio as gr
import librosa
import librosa.display
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import AutoModelForAudioClassification, ASTFeatureExtractor
import random
# Load the fine-tuned audio classification model and its feature extractor
# from the current directory (the Space repository root).
model = AutoModelForAudioClassification.from_pretrained("./")
feature_extractor = ASTFeatureExtractor.from_pretrained("./")
def plot_waveform(waveform, sr):
    plt.figure(figsize=(12, 3))  # Larger figure size
    plt.title('Waveform')
    plt.ylabel('Amplitude')
    plt.plot(np.linspace(0, len(waveform) / sr, len(waveform)), waveform)
    plt.xlabel('Time (s)')
    return plt.gcf()
def plot_spectrogram(waveform, sr):
    S = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128)
    S_DB = librosa.power_to_db(S, ref=np.max)
    plt.figure(figsize=(12, 4))  # Larger figure size
    librosa.display.specshow(S_DB, sr=sr, x_axis='time', y_axis='mel')
    plt.title('Mel Spectrogram')
    plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    return plt.gcf()
def custom_feature_extraction(audio, sr=16000, target_length=1024):
    features = feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding="max_length", max_length=target_length)
    return features.input_values
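# Note: with the default AST configuration this yields input_values of shape
# (batch, 1024, 128), i.e. 1024 log-mel frames of 128 mel bins; the exact
# values come from the feature-extractor config shipped with the checkpoint.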
def apply_time_shift(waveform, max_shift_fraction=0.1):
    shift = random.randint(-int(max_shift_fraction * len(waveform)), int(max_shift_fraction * len(waveform)))
    return np.roll(waveform, shift)
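# For reference, the shift is circular: np.roll([1, 2, 3, 4, 5], 2) gives
# [4, 5, 1, 2, 3], so samples pushed past the end wrap around to the start.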
def predict_voice(audio_file_path):
    try:
        waveform, sample_rate = librosa.load(audio_file_path, sr=feature_extractor.sampling_rate, mono=True)
        # Test-time augmentation: run the model on the original waveform and a
        # time-shifted copy, then average the logits.
        augmented_waveform = apply_time_shift(waveform)
        original_features = custom_feature_extraction(waveform, sr=sample_rate)
        augmented_features = custom_feature_extraction(augmented_waveform, sr=sample_rate)
        with torch.no_grad():
            outputs_original = model(original_features)
            outputs_augmented = model(augmented_features)
        logits = (outputs_original.logits + outputs_augmented.logits) / 2
        predicted_index = logits.argmax(dim=-1)
        label = model.config.id2label[predicted_index.item()]
        confidence = torch.softmax(logits, dim=1).max().item() * 100
        prediction_text = (f"The model predicts the voice as '{label}'. "
                           f"Confidence level: {confidence:.2f}%")
        waveform_plot = plot_waveform(waveform, sample_rate)
        spectrogram_plot = plot_spectrogram(waveform, sample_rate)
        return prediction_text, waveform_plot, spectrogram_plot
    except Exception as e:
        return f"Error during processing: {e}", None, None
# Custom CSS to adjust the layout and component sizes
css = """
.gradio-container {
    max-width: 960px; /* Adjust the maximum width as needed */
}
.input-container {
    width: 25%; /* Smaller input area */
}
.output-container {
    width: 74%; /* Larger output area */
}
"""

# Define the Gradio app layout
iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=[
        gr.Textbox(label="Analysis"),
        gr.Plot(label="Waveform"),
        gr.Plot(label="Spectrogram")
    ],
    title="Voice Clone Detection",
    description=(
        "This tool determines whether a voice is real or an AI-generated clone. "
        "Audio files judged to be authentic, human-produced speech are classified as 'Bonafide'; "
        "those judged to be synthetically generated are labeled 'Spoof'. "
        "Upload an audio file for analysis."
    ),
    css=css
)

iface.launch()