import io
import os
import tempfile

import gradio as gr
import numpy as np
import torch
import torchaudio
from matplotlib.figure import Figure
from PIL import Image
from pydub import AudioSegment
from speechbrain.inference.speaker import SpeakerRecognition

# Log installed packages at startup; useful when debugging dependency issues
# on hosted platforms. Note that pkg_resources is deprecated in recent
# setuptools releases (importlib.metadata is the modern replacement).
import pkg_resources

installed_packages = [f"{dist.key}=={dist.version}" for dist in pkg_resources.working_set]
for package in installed_packages:
    print(package)


class SpeakerVerification:
    """Speaker verification built on the SpeechBrain ECAPA-TDNN model."""

    def __init__(self):
        # Download (or load from the local cache) the pretrained ECAPA-TDNN
        # speaker verification model trained on VoxCeleb.
        self.verification = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )
        # Cosine-similarity decision threshold used by the confidence metrics.
        self.threshold = 0.25

    def convert_audio(self, audio_path: str) -> str:
        """Convert an audio file to mono 16 kHz WAV.

        WAV inputs are returned unchanged; other formats are converted to a
        temporary WAV file whose path is returned (the caller deletes it).
        """
        try:
            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path

            audio = AudioSegment.from_file(audio_path)
            audio = audio.set_channels(1)
            audio = audio.set_frame_rate(16000)
            # Close the temporary file handle before exporting so this also
            # works on Windows, where an open file cannot be rewritten.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
                temp_wav_path = temp_wav.name
            audio.export(temp_wav_path, format='wav')
            return temp_wav_path

        except Exception as e:
            print(f"Error converting audio: {e}")
            raise

    def score_to_probability(self, score: float) -> float:
        """Map a cosine-similarity score to a pseudo-probability using a
        logistic function centered on the decision threshold."""
        scale = 10  # steepness of the logistic curve
        centered_score = (score - self.threshold) * scale
        probability = 1 / (1 + np.exp(-centered_score))
        # The logistic already lies in (0, 1); the clamp is purely defensive.
        probability = max(0.0, min(1.0, probability))
        return probability
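
    # For example, with threshold 0.25 and scale 10, score_to_probability
    # maps a score of 0.25 to p = 0.50, a score of 0.45 to p ≈ 0.88, and a
    # score of 0.05 to p ≈ 0.12.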

    def calculate_confidence_metrics(self, score_value: float) -> dict:
        """Calculate various confidence metrics for a verification score."""
        try:
            # Distance of the score from the decision threshold.
            threshold_distance = abs(score_value - self.threshold)

            # Logistic certainty: 0.5 at the threshold, approaching 1.0 as
            # the score moves away from it in either direction.
            certainty = 1 - (1 / (1 + np.exp(5 * threshold_distance)))

            # Scores within 0.1 of the threshold are treated as ambiguous.
            ambiguous_region = 0.1
            if threshold_distance < ambiguous_region:
                decision_strength = "Low"
            elif threshold_distance < ambiguous_region * 2:
                decision_strength = "Medium"
            else:
                decision_strength = "High"

            if certainty < 0.6:
                confidence_level = "Low"
            elif certainty < 0.8:
                confidence_level = "Medium"
            else:
                confidence_level = "High"

            return {
                "certainty_score": certainty,
                "threshold_distance": threshold_distance,
                "decision_strength": decision_strength,
                "confidence_level": confidence_level
            }
        except Exception as e:
            print(f"Error calculating confidence metrics: {e}")
            return {}
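
    # For example, calculate_confidence_metrics(0.30) gives threshold
    # distance 0.05 and certainty ≈ 0.56 ("Low" confidence, "Low" strength),
    # while calculate_confidence_metrics(0.60) gives distance 0.35 and
    # certainty ≈ 0.85 ("High" confidence, "High" strength).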

    def verify_speaker(self, audio_path1: str, audio_path2: str) -> tuple:
        """Compare two audio files; return (probability, decision, raw
        cosine score, confidence metrics)."""
        try:
            wav_path1 = self.convert_audio(audio_path1)
            wav_path2 = self.convert_audio(audio_path2)

            # verify_files returns the cosine similarity between the two
            # speaker embeddings and a boolean same-speaker prediction.
            score, prediction = self.verification.verify_files(wav_path1, wav_path2)

            # Remove any temporary WAV files created during conversion.
            if wav_path1 != audio_path1:
                os.unlink(wav_path1)
            if wav_path2 != audio_path2:
                os.unlink(wav_path2)

            score_value = score.item()
            probability = self.score_to_probability(score_value)
            decision = "Same speaker" if prediction.item() else "Different speakers"

            confidence_metrics = self.calculate_confidence_metrics(score_value)

            return probability, decision, score_value, confidence_metrics

        except Exception as e:
            print(f"Error in speaker verification: {e}")
            return 0.0, f"Error: {e}", 0.0, {}

    def get_embeddings(self, audio_path: str):
        """Extract the speaker embedding for a single audio file."""
        wav_path = self.convert_audio(audio_path)
        signal, fs = torchaudio.load(wav_path)

        # Downmix multi-channel audio to mono.
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)

        # WAV inputs bypass convert_audio's resampling, so resample here if
        # needed; the ECAPA model expects 16 kHz audio.
        if fs != 16000:
            signal = torchaudio.functional.resample(signal, fs, 16000)

        embeddings = self.verification.encode_batch(signal)

        if wav_path != audio_path:
            os.unlink(wav_path)
        return embeddings.squeeze()

    def plot_embeddings_comparison(self, emb1, emb2):
        """Plot both embedding vectors dimension-by-dimension and return the
        plot as a PIL image."""
        # Using Figure directly (rather than pyplot) avoids global figure
        # state, so nothing needs to be closed afterwards.
        fig = Figure(figsize=(10, 4))
        ax = fig.add_subplot(111)

        # detach() guards against tensors that still carry gradient history.
        emb1_np = emb1.detach().cpu().numpy()
        emb2_np = emb2.detach().cpu().numpy()

        x = range(len(emb1_np))
        ax.plot(x, emb1_np, label='Speaker 1', alpha=0.7)
        ax.plot(x, emb2_np, label='Speaker 2', alpha=0.7)

        ax.set_title('Speaker Embeddings Comparison')
        ax.set_xlabel('Embedding Dimension')
        ax.set_ylabel('Value')
        ax.legend()

        # Render to an in-memory PNG and hand it to PIL for Gradio.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
        buf.seek(0)
        return Image.open(buf)
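
# A minimal sketch of using SpeakerVerification programmatically, without the
# Gradio UI (the file names here are hypothetical placeholders):
#
#   verifier = SpeakerVerification()
#   prob, decision, score, metrics = verifier.verify_speaker("a.wav", "b.wav")
#   print(f"{decision} (score={score:.3f}, p={prob:.2f})")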

def create_gradio_interface():
    speaker_verifier = SpeakerVerification()

    def process_audio(audio1, audio2):
        try:
            if audio1 is None or audio2 is None:
                return "Error: Please provide both audio samples", None

            probability, decision, score, confidence_metrics = speaker_verifier.verify_speaker(audio1, audio2)
            # Surface verification errors directly instead of failing below
            # with a KeyError on the empty metrics dict.
            if not confidence_metrics:
                return decision, None

            emb1 = speaker_verifier.get_embeddings(audio1)
            emb2 = speaker_verifier.get_embeddings(audio2)

            embeddings_plot = speaker_verifier.plot_embeddings_comparison(emb1, emb2)

            result_text = (
                f"Cosine similarity (threshold={speaker_verifier.threshold}): {score:.3f}\n"
                f"Same-speaker probability: {probability:.2f}\n"
                f"Decision: {decision}\n"
                f"Certainty Score: {confidence_metrics['certainty_score']:.2f}\n"
                f"Threshold Distance: {confidence_metrics['threshold_distance']:.3f}\n"
                f"Decision Strength: {confidence_metrics['decision_strength']}\n"
                f"Confidence Level: {confidence_metrics['confidence_level']}"
            )

            return result_text, embeddings_plot

        except Exception as e:
            return f"Error processing audio: {e}", None

    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(label="Audio Sample 1", type="filepath"),
            gr.Audio(label="Audio Sample 2", type="filepath")
        ],
        outputs=[
            gr.Textbox(label="Result"),
            gr.Image(label="Embeddings Comparison", type="pil"),
        ],
        title="Speaker Verification System",
        description="Upload two audio samples to check if they're from the same speaker."
    )

    return interface


app = create_gradio_interface()

if __name__ == "__main__":
    app.launch()