import io
import os
import tempfile

import gradio as gr
import numpy as np
import torch
import torchaudio
from matplotlib.figure import Figure
from PIL import Image
from pydub import AudioSegment
from speechbrain.inference.speaker import SpeakerRecognition

# Log installed packages at startup; useful when debugging dependency issues
# on hosted platforms. Note that pkg_resources is deprecated in recent
# setuptools releases (importlib.metadata is the modern replacement).
import pkg_resources

installed_packages = [f"{dist.key}=={dist.version}" for dist in pkg_resources.working_set]
for package in installed_packages:
    print(package)


class SpeakerVerification:
    """Speaker verification built on the SpeechBrain ECAPA-TDNN model."""

    def __init__(self):
        # Download (or load from the local cache) the pretrained ECAPA-TDNN
        # speaker verification model trained on VoxCeleb.
        self.verification = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )
        # Cosine-similarity decision threshold used by the confidence metrics.
        self.threshold = 0.25

    def convert_audio(self, audio_path: str) -> str:
        """Convert an audio file to mono 16 kHz WAV.

        WAV inputs are returned unchanged; other formats are converted to a
        temporary WAV file whose path is returned (the caller deletes it).
        """
        try:
            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path

            audio = AudioSegment.from_file(audio_path)
            audio = audio.set_channels(1)
            audio = audio.set_frame_rate(16000)
            # Close the temporary file handle before exporting so this also
            # works on Windows, where an open file cannot be rewritten.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
                temp_wav_path = temp_wav.name
            audio.export(temp_wav_path, format='wav')
            return temp_wav_path

        except Exception as e:
            print(f"Error converting audio: {e}")
            raise

    def score_to_probability(self, score: float) -> float:
        """Map a cosine-similarity score to a pseudo-probability using a
        logistic function centered on the decision threshold."""
        scale = 10  # steepness of the logistic curve
        centered_score = (score - self.threshold) * scale
        probability = 1 / (1 + np.exp(-centered_score))
        # The logistic already lies in (0, 1); the clamp is purely defensive.
        probability = max(0.0, min(1.0, probability))
        return probability
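
    # For example, with threshold 0.25 and scale 10, score_to_probability
    # maps a score of 0.25 to p = 0.50, a score of 0.45 to p ≈ 0.88, and a
    # score of 0.05 to p ≈ 0.12.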

    def calculate_confidence_metrics(self, score_value: float) -> dict:
        """Calculate various confidence metrics for a verification score."""
        try:
            # Distance of the score from the decision threshold.
            threshold_distance = abs(score_value - self.threshold)

            # Logistic certainty: 0.5 at the threshold, approaching 1.0 as
            # the score moves away from it in either direction.
            certainty = 1 - (1 / (1 + np.exp(5 * threshold_distance)))

            # Scores within 0.1 of the threshold are treated as ambiguous.
            ambiguous_region = 0.1
            if threshold_distance < ambiguous_region:
                decision_strength = "Low"
            elif threshold_distance < ambiguous_region * 2:
                decision_strength = "Medium"
            else:
                decision_strength = "High"

            if certainty < 0.6:
                confidence_level = "Low"
            elif certainty < 0.8:
                confidence_level = "Medium"
            else:
                confidence_level = "High"

            return {
                "certainty_score": certainty,
                "threshold_distance": threshold_distance,
                "decision_strength": decision_strength,
                "confidence_level": confidence_level
            }
        except Exception as e:
            print(f"Error calculating confidence metrics: {e}")
            return {}
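
    # For example, calculate_confidence_metrics(0.30) gives threshold
    # distance 0.05 and certainty ≈ 0.56 ("Low" confidence, "Low" strength),
    # while calculate_confidence_metrics(0.60) gives distance 0.35 and
    # certainty ≈ 0.85 ("High" confidence, "High" strength).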

    def verify_speaker(self, audio_path1: str, audio_path2: str) -> tuple:
        """Compare two audio files; return (probability, decision, raw
        cosine score, confidence metrics)."""
        try:
            wav_path1 = self.convert_audio(audio_path1)
            wav_path2 = self.convert_audio(audio_path2)

            # verify_files returns the cosine similarity between the two
            # speaker embeddings and a boolean same-speaker prediction.
            score, prediction = self.verification.verify_files(wav_path1, wav_path2)

            # Remove any temporary WAV files created during conversion.
            if wav_path1 != audio_path1:
                os.unlink(wav_path1)
            if wav_path2 != audio_path2:
                os.unlink(wav_path2)

            score_value = score.item()
            probability = self.score_to_probability(score_value)
            decision = "Same speaker" if prediction.item() else "Different speakers"

            confidence_metrics = self.calculate_confidence_metrics(score_value)

            return probability, decision, score_value, confidence_metrics

        except Exception as e:
            print(f"Error in speaker verification: {e}")
            return 0.0, f"Error: {e}", 0.0, {}

    def get_embeddings(self, audio_path: str):
        """Extract the speaker embedding for a single audio file."""
        wav_path = self.convert_audio(audio_path)
        signal, fs = torchaudio.load(wav_path)

        # Downmix multi-channel audio to mono.
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)

        # WAV inputs bypass convert_audio's resampling, so resample here if
        # needed; the ECAPA model expects 16 kHz audio.
        if fs != 16000:
            signal = torchaudio.functional.resample(signal, fs, 16000)

        embeddings = self.verification.encode_batch(signal)

        if wav_path != audio_path:
            os.unlink(wav_path)
        return embeddings.squeeze()

    def plot_embeddings_comparison(self, emb1, emb2):
        """Plot both embedding vectors dimension-by-dimension and return the
        plot as a PIL image."""
        # Using Figure directly (rather than pyplot) avoids global figure
        # state, so nothing needs to be closed afterwards.
        fig = Figure(figsize=(10, 4))
        ax = fig.add_subplot(111)

        # detach() guards against tensors that still carry gradient history.
        emb1_np = emb1.detach().cpu().numpy()
        emb2_np = emb2.detach().cpu().numpy()

        x = range(len(emb1_np))
        ax.plot(x, emb1_np, label='Speaker 1', alpha=0.7)
        ax.plot(x, emb2_np, label='Speaker 2', alpha=0.7)

        ax.set_title('Speaker Embeddings Comparison')
        ax.set_xlabel('Embedding Dimension')
        ax.set_ylabel('Value')
        ax.legend()

        # Render to an in-memory PNG and hand it to PIL for Gradio.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
        buf.seek(0)
        return Image.open(buf)
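
# A minimal sketch of using SpeakerVerification programmatically, without the
# Gradio UI (the file names here are hypothetical placeholders):
#
#   verifier = SpeakerVerification()
#   prob, decision, score, metrics = verifier.verify_speaker("a.wav", "b.wav")
#   print(f"{decision} (score={score:.3f}, p={prob:.2f})")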

def create_gradio_interface():
    speaker_verifier = SpeakerVerification()

    def process_audio(audio1, audio2):
        try:
            if audio1 is None or audio2 is None:
                return "Error: Please provide both audio samples", None

            probability, decision, score, confidence_metrics = speaker_verifier.verify_speaker(audio1, audio2)
            # Surface verification errors directly instead of failing below
            # with a KeyError on the empty metrics dict.
            if not confidence_metrics:
                return decision, None

            emb1 = speaker_verifier.get_embeddings(audio1)
            emb2 = speaker_verifier.get_embeddings(audio2)

            embeddings_plot = speaker_verifier.plot_embeddings_comparison(emb1, emb2)

            result_text = (
                f"Cosine similarity (threshold={speaker_verifier.threshold}): {score:.3f}\n"
                f"Same-speaker probability: {probability:.2f}\n"
                f"Decision: {decision}\n"
                f"Certainty Score: {confidence_metrics['certainty_score']:.2f}\n"
                f"Threshold Distance: {confidence_metrics['threshold_distance']:.3f}\n"
                f"Decision Strength: {confidence_metrics['decision_strength']}\n"
                f"Confidence Level: {confidence_metrics['confidence_level']}"
            )

            return result_text, embeddings_plot

        except Exception as e:
            return f"Error processing audio: {e}", None

    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(label="Audio Sample 1", type="filepath"),
            gr.Audio(label="Audio Sample 2", type="filepath")
        ],
        outputs=[
            gr.Textbox(label="Result"),
            gr.Image(label="Embeddings Comparison", type="pil"),
        ],
        title="Speaker Verification System",
        description="Upload two audio samples to check if they're from the same speaker."
    )

    return interface


app = create_gradio_interface()

if __name__ == "__main__":
    app.launch()