Spaces:

dhs-st
/

iden

Running

App Files Files Community

iden / app.py

dhs-st

Create app.py

ea0886a verified 5 months ago

raw

history blame

4.94 kB

	import torch
	import torchaudio
	from speechbrain.inference.speaker import SpeakerRecognition
	import gradio as gr
	from pydub import AudioSegment
	import os
	import tempfile
	import numpy as np
	import matplotlib.pyplot as plt
	from matplotlib.figure import Figure
	import io
	from PIL import Image

	class SpeakerVerification:
	    """Compare two audio recordings with the speechbrain/spkrec-ecapa-voxceleb model.

	    The class wraps a pretrained ``SpeakerRecognition`` instance and adds
	    audio-format conversion, a score-to-probability mapping and a plot of
	    the two speaker embeddings.
	    """

	    # Sample rate fed to the encoder in get_embeddings. The model card for
	    # spkrec-ecapa-voxceleb states 16 kHz single-channel training data.
	    MODEL_SAMPLE_RATE = 16000

	    def __init__(self):
	        """Load the pretrained model (downloaded into ``pretrained_models/``)."""
	        self.verification = SpeakerRecognition.from_hparams(
	            source="speechbrain/spkrec-ecapa-voxceleb",
	            savedir="pretrained_models/spkrec-ecapa-voxceleb",
	        )
	        # Score at which score_to_probability yields exactly 0.5.
	        self.threshold = 0.25

	    @staticmethod
	    def _remove_temp_file(path: str) -> None:
	        """Delete a temporary file, reporting (not raising) on failure."""
	        try:
	            os.unlink(path)
	        except OSError as e:
	            print(f"Warning: could not remove temporary file {path}: {str(e)}")

	    def convert_audio(self, audio_path: str) -> str:
	        """Return the path of a WAV rendition of ``audio_path``.

	        A path whose extension is already ``.wav`` (case-insensitive) is
	        returned unchanged. Any other file is decoded with pydub and
	        exported to a new temporary ``.wav`` file; the caller owns that
	        file and must delete it.

	        Raises:
	            Exception: any decoding/export error is printed and re-raised.
	        """
	        try:
	            if os.path.splitext(audio_path)[1].lower() == '.wav':
	                return audio_path

	            audio = AudioSegment.from_file(audio_path)
	            # Only reserve a unique name here; the handle is closed before
	            # pydub writes to the path.
	            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
	                temp_wav_path = temp_wav.name
	            try:
	                audio.export(temp_wav_path, format='wav')
	            except Exception:
	                # Do not leave a half-written temp file behind.
	                self._remove_temp_file(temp_wav_path)
	                raise
	            return temp_wav_path

	        except Exception as e:
	            print(f"Error converting audio: {str(e)}")
	            raise

	    def score_to_probability(self, score: float) -> float:
	        """Map a verification score to a value in [0, 1] with a logistic curve.

	        The curve is centred on ``self.threshold`` (probability 0.5) and
	        uses a fixed steepness of 10.
	        """
	        scale = 10
	        centered_score = (score - self.threshold) * scale
	        probability = 1 / (1 + np.exp(-centered_score))
	        # np.exp yields a NumPy scalar; hand back a plain Python float.
	        return float(max(0.0, min(1.0, probability)))

	    def verify_speaker(self, audio_path1: str, audio_path2: str) -> tuple[float, str, float]:
	        """Decide whether two recordings come from the same speaker.

	        Returns:
	            ``(probability, decision, score)`` where ``score`` is the raw
	            model score, ``probability`` is ``score_to_probability(score)``
	            and ``decision`` is "Same speaker" or "Different speakers".
	            On any failure the error is printed and
	            ``(0.0, "Error: <message>", 0.0)`` is returned, so the result
	            always has three fields.
	        """
	        temp_paths = []
	        try:
	            wav_paths = []
	            for original_path in (audio_path1, audio_path2):
	                wav_path = self.convert_audio(original_path)
	                if wav_path != original_path:
	                    # convert_audio created a temp file that we must remove.
	                    temp_paths.append(wav_path)
	                wav_paths.append(wav_path)

	            score, prediction = self.verification.verify_files(*wav_paths)

	            score_value = score.item()
	            probability = self.score_to_probability(score_value)
	            decision = "Same speaker" if prediction.item() else "Different speakers"

	            return probability, decision, score_value

	        except Exception as e:
	            print(f"Error in speaker verification: {str(e)}")
	            return 0.0, f"Error: {str(e)}", 0.0

	        finally:
	            # Runs on success and on failure, so converted files never leak.
	            for temp_path in temp_paths:
	                self._remove_temp_file(temp_path)

	    def get_embeddings(self, audio_path: str) -> torch.Tensor:
	        """Return the model's embedding of ``audio_path`` with size-1 dims removed.

	        The audio is down-mixed to one channel and resampled to
	        ``MODEL_SAMPLE_RATE`` before encoding.
	        """
	        wav_path = self.convert_audio(audio_path)
	        try:
	            signal, fs = torchaudio.load(wav_path)
	        finally:
	            # The samples are in memory now (or loading failed); either way
	            # a converted temp file is no longer needed.
	            if wav_path != audio_path:
	                self._remove_temp_file(wav_path)

	        if signal.shape[0] > 1:
	            # Average the channels into a single one.
	            signal = torch.mean(signal, dim=0, keepdim=True)

	        if fs != self.MODEL_SAMPLE_RATE:
	            signal = torchaudio.functional.resample(signal, fs, self.MODEL_SAMPLE_RATE)

	        embeddings = self.verification.encode_batch(signal)
	        return embeddings.squeeze()

	    def plot_embeddings_comparison(self, emb1, emb2):
	        """Render both embeddings as overlaid line plots and return a PIL image."""
	        # The Figure is built directly rather than through pyplot, so there
	        # is no pyplot-managed state to close afterwards.
	        fig = Figure(figsize=(10, 4))
	        ax = fig.add_subplot(111)

	        emb1_np = emb1.detach().cpu().numpy()
	        emb2_np = emb2.detach().cpu().numpy()

	        x = range(len(emb1_np))
	        ax.plot(x, emb1_np, label='Speaker 1', alpha=0.7)
	        ax.plot(x, emb2_np, label='Speaker 2', alpha=0.7)

	        ax.set_title('Speaker Embeddings Comparison')
	        ax.set_xlabel('Embedding Dimension')
	        ax.set_ylabel('Value')
	        ax.legend()

	        # Serialise to an in-memory PNG and reopen it as a PIL image.
	        buf = io.BytesIO()
	        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
	        buf.seek(0)

	        return Image.open(buf)

	def create_gradio_interface():
	    """Build the Gradio UI for comparing two audio samples.

	    A single ``SpeakerVerification`` instance is created here and shared
	    by every request handled through the returned interface.

	    Returns:
	        A ``gr.Interface`` with two file-path audio inputs, a text result
	        and an image showing both speaker embeddings.
	    """
	    speaker_verifier = SpeakerVerification()

	    def process_audio(audio1, audio2):
	        """Return ``(result_text, embeddings_image)`` for the two uploads.

	        On any problem the text carries an error message and the image
	        is ``None``.
	        """
	        try:
	            if audio1 is None or audio2 is None:
	                return "Error: Please provide both audio samples", None

	            probability, decision, *extra = speaker_verifier.verify_speaker(audio1, audio2)
	            # verify_speaker reports failure through an "Error: ..." decision
	            # string, possibly without a score field. Show that message
	            # instead of failing on unpacking or going on to plot.
	            if not extra or decision.startswith("Error"):
	                return decision, None
	            score = extra[0]

	            emb1 = speaker_verifier.get_embeddings(audio1)
	            emb2 = speaker_verifier.get_embeddings(audio2)

	            embeddings_plot = speaker_verifier.plot_embeddings_comparison(emb1, emb2)

	            # The verifier's score is a similarity (higher means more
	            # alike), so it is labelled as one.
	            result_text = f"Probability: {probability:.2%}\nCosine similarity: {score}\nDecision: {decision}"

	            return result_text, embeddings_plot

	        except Exception as e:
	            return f"Error processing audio: {str(e)}", None

	    interface = gr.Interface(
	        fn=process_audio,
	        inputs=[
	            gr.Audio(label="Audio Sample 1", type="filepath"),
	            gr.Audio(label="Audio Sample 2", type="filepath"),
	        ],
	        outputs=[
	            gr.Textbox(label="Result"),
	            gr.Image(label="Embeddings Comparison", type="pil"),
	        ],
	        title="Speaker Verification System",
	        description="Upload two audio samples to check if they're from the same speaker.",
	    )

	    return interface

	# Built at import time so the interface is available as the module-level
	# name ``app`` even when this file is imported rather than executed.
	app = create_gradio_interface()

	# Start the Gradio server only when the file is run as a script.
	if __name__ == "__main__":
	    app.launch()