voice_clone_detection

Runtime error

App Files Files Community

voice_clone_detection / app.py

Kabatubare

Update app.py

53b1abc verified over 1 year ago

raw

history blame

2.87 kB

	import gradio as gr
	import librosa
	import numpy as np
	import torch
	import torch.nn.functional as F
	import logging
	from transformers import AutoModelForAudioClassification
	import random

	# Configure logging at INFO level so the "Prediction successful." and
	# error records written by predict_voice are emitted.
	logging.basicConfig(level=logging.INFO)

	# The classifier is loaded once at import time from the directory named
	# by model_path ("./", i.e. the current working directory).
	model_path = "./"
	model = AutoModelForAudioClassification.from_pretrained(model_path)

	def custom_feature_extraction(waveform, sr, n_mels=128, n_fft=2048, hop_length=512, target_length=1024):
	    """Build a fixed-width feature tensor from an audio waveform.

	    Three per-frame descriptors are stacked along the first axis:
	    the dB-scaled mel spectrogram, the mean over frequency bins of the
	    ``librosa.piptrack`` pitch matrix, and the spectral centroid. The frame
	    axis is then cut or zero-padded on the right to ``target_length``.

	    Args:
	        waveform: Audio samples as accepted by librosa (``y=``).
	        sr: Sampling rate of ``waveform``.
	        n_mels: Number of mel bands in the spectrogram.
	        n_fft: FFT window size shared by all three descriptors.
	        hop_length: Hop size shared by all three descriptors.
	        target_length: Exact number of frames in the returned tensor.

	    Returns:
	        A float tensor with a leading batch axis of size 1; for a mono
	        (1-D) waveform the shape is ``(1, n_mels + 2, target_length)``.
	    """
	    # Every descriptor is computed on the same framing so the frame axes line up.
	    framing = dict(y=waveform, sr=sr, n_fft=n_fft, hop_length=hop_length)

	    mel_power = librosa.feature.melspectrogram(n_mels=n_mels, **framing)
	    mel_db = librosa.power_to_db(mel_power, ref=np.max)

	    pitch_bins, _ = librosa.piptrack(**framing)
	    mean_pitch = pitch_bins.mean(axis=0, keepdims=True)

	    centroid = librosa.feature.spectral_centroid(**framing)

	    stacked = torch.tensor(np.concatenate([mel_db, mean_pitch, centroid], axis=0)).float()

	    # Positive shortfall -> pad on the right; negative -> too long, truncate.
	    shortfall = target_length - stacked.shape[1]
	    if shortfall < 0:
	        stacked = stacked[:, :target_length]
	    elif shortfall > 0:
	        stacked = F.pad(stacked, (0, shortfall), "constant", 0)

	    return stacked.unsqueeze(0)

	def apply_time_shift(waveform, max_shift_fraction=0.1):
	    """Return ``waveform`` circularly shifted by a random number of samples.

	    The offset is drawn uniformly (via the ``random`` module) from
	    ``[-limit, limit]`` where ``limit = int(max_shift_fraction * waveform.size)``.
	    Samples rolled off one end re-enter at the other (``np.roll``).

	    Args:
	        waveform: NumPy array of samples.
	        max_shift_fraction: Largest shift, as a fraction of the sample count.

	    Returns:
	        A new array with the same shape and contents as ``waveform``, rotated.
	    """
	    limit = int(max_shift_fraction * waveform.size)
	    return np.roll(waveform, random.randint(-limit, limit))

	def predict_voice(audio_file_path):
	    """Classify the voice in an audio file and describe the result as text.

	    The file is loaded at its native sampling rate, a randomly time-shifted
	    copy is made, and both versions are run through the module-level
	    ``model``. The two sets of logits are averaged; the label is looked up in
	    ``model.config.id2label`` and the confidence is the largest softmax
	    probability of the averaged logits, as a percentage.

	    Args:
	        audio_file_path: Path of the audio file to analyse. ``None`` or an
	            empty string (no file supplied) yields an error message.

	    Returns:
	        A sentence with the predicted label and confidence, or a string
	        starting with ``"Error during processing: "`` if anything failed.
	        This function never raises; failures are logged instead.
	    """
	    # The UI can submit without an upload; answer that case explicitly
	    # instead of letting the audio loader fail with an opaque message.
	    if not audio_file_path:
	        result = "Error during processing: no audio file was provided."
	        logging.error(result)
	        return result

	    try:
	        # sr=None keeps the file's own sampling rate instead of resampling.
	        waveform, sample_rate = librosa.load(audio_file_path, sr=None)
	        augmented_waveform = apply_time_shift(waveform)

	        original_features = custom_feature_extraction(waveform, sample_rate)
	        augmented_features = custom_feature_extraction(augmented_waveform, sample_rate)

	        # Inference only: no gradients needed.
	        with torch.no_grad():
	            outputs_original = model(original_features)
	            outputs_augmented = model(augmented_features)

	        # Average the original and shifted predictions before deciding.
	        logits = (outputs_original.logits + outputs_augmented.logits) / 2
	        predicted_index = logits.argmax()
	        label = model.config.id2label[predicted_index.item()]
	        confidence = torch.softmax(logits, dim=1).max().item() * 100

	        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
	        logging.info("Prediction successful.")
	    except Exception as e:
	        # Top-level UI boundary: report the failure to the user as text and
	        # keep the traceback in the log for diagnosis.
	        result = f"Error during processing: {e}"
	        logging.exception(result)

	    return result

	# Web UI: one audio upload (handed to predict_voice as a file path) and one
	# text box for the verdict. Components are taken from the top-level gradio
	# namespace; the older gr.inputs / gr.outputs namespaces are deprecated and
	# no longer exist in current Gradio releases.
	iface = gr.Interface(
	    fn=predict_voice,
	    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
	    outputs=gr.Textbox(label="Prediction"),
	    title="Voice Authenticity Detection",
	    description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results.",
	)

	iface.launch()