dhs-st committed on
Commit
ea0886a
·
verified ·
1 Parent(s): fa376a3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ from speechbrain.inference.speaker import SpeakerRecognition
4
+ import gradio as gr
5
+ from pydub import AudioSegment
6
+ import os
7
+ import tempfile
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.figure import Figure
11
+ import io
12
+ from PIL import Image
13
+
14
class SpeakerVerification:
    """Speaker verification built on a pretrained SpeechBrain ECAPA model.

    Wraps a ``SpeakerRecognition`` instance and adds audio-format conversion,
    a score-to-probability mapping and an embedding comparison plot.
    """

    # Sample rate expected for raw tensors handed to ``encode_batch``
    # (per the speechbrain/spkrec-ecapa-voxceleb model card: 16 kHz mono).
    SAMPLE_RATE = 16000

    def __init__(self):
        """Load the pretrained model and set the decision threshold."""
        self.verification = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )
        # Score at which score_to_probability() yields exactly 0.5.
        self.threshold = 0.25

    @staticmethod
    def _remove_temp(wav_path, original_path):
        """Delete *wav_path* if it is a temporary conversion of *original_path*."""
        if wav_path is None or wav_path == original_path:
            return
        try:
            os.unlink(wav_path)
        except OSError as e:
            # Best-effort cleanup: a leftover temp file must not fail the request.
            print(f"Warning: could not remove temporary file {wav_path}: {str(e)}")

    def convert_audio(self, audio_path: str) -> str:
        """Return a path to a WAV version of *audio_path*.

        Files whose extension is already ``.wav`` (case-insensitive) are
        returned unchanged. Anything else is decoded with pydub and exported
        to a new temporary ``.wav`` file, which the caller must delete.

        Raises:
            Exception: whatever pydub or the filesystem raised; the error is
                printed and re-raised.
        """
        try:
            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path

            audio = AudioSegment.from_file(audio_path)
            # Reserve a unique name, then close the handle before exporting so
            # the export can reopen the path on every platform.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
                temp_wav_path = temp_wav.name
            try:
                audio.export(temp_wav_path, format='wav')
            except Exception:
                # Do not leave an orphaned temp file behind on a failed export.
                self._remove_temp(temp_wav_path, audio_path)
                raise
            return temp_wav_path

        except Exception as e:
            print(f"Error converting audio: {str(e)}")
            raise

    def score_to_probability(self, score: float) -> float:
        """Map a raw verification score to a value in [0, 1].

        Applies a logistic function centred on ``self.threshold`` with a
        fixed steepness of 10, so a score equal to the threshold maps to 0.5.
        """
        scale = 10
        centered_score = (score - self.threshold) * scale
        probability = 1 / (1 + np.exp(-centered_score))
        probability = max(0.0, min(1.0, probability))
        return probability

    def verify_speaker(self, audio_path1: str, audio_path2: str) -> tuple[float, str, float]:
        """Compare two audio files and decide whether the speaker matches.

        Returns:
            ``(probability, decision, score)``. On failure the error is
            printed and ``(0.0, "Error: <message>", 0.0)`` is returned, so the
            result always has three elements and callers can unpack it
            unconditionally.
        """
        wav_path1 = None
        wav_path2 = None
        try:
            wav_path1 = self.convert_audio(audio_path1)
            wav_path2 = self.convert_audio(audio_path2)

            score, prediction = self.verification.verify_files(wav_path1, wav_path2)

            score_value = score.item()
            probability = self.score_to_probability(score_value)
            decision = "Same speaker" if prediction.item() else "Different speakers"

            return probability, decision, score_value

        except Exception as e:
            print(f"Error in speaker verification: {str(e)}")
            return 0.0, f"Error: {str(e)}", 0.0

        finally:
            # Remove converted temp files on success and failure alike.
            self._remove_temp(wav_path1, audio_path1)
            self._remove_temp(wav_path2, audio_path2)

    def get_embeddings(self, audio_path: str):
        """Return the speaker embedding of *audio_path* as a squeezed tensor.

        The audio is down-mixed to mono and resampled to ``SAMPLE_RATE``
        before encoding, because ``encode_batch`` receives the raw tensor.
        """
        wav_path = self.convert_audio(audio_path)
        try:
            signal, fs = torchaudio.load(wav_path)
        finally:
            # The samples are in memory now (or loading failed); either way
            # the temporary file is no longer needed.
            self._remove_temp(wav_path, audio_path)

        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)

        if fs != self.SAMPLE_RATE:
            signal = torchaudio.functional.resample(signal, fs, self.SAMPLE_RATE)

        embeddings = self.verification.encode_batch(signal)
        return embeddings.squeeze()

    def plot_embeddings_comparison(self, emb1, emb2):
        """Plot two embedding vectors on shared axes and return a PIL image."""
        fig = Figure(figsize=(10, 4))
        ax = fig.add_subplot(111)

        # detach() keeps the conversion valid even if a tensor tracks gradients.
        emb1_np = emb1.detach().cpu().numpy()
        emb2_np = emb2.detach().cpu().numpy()

        x = range(len(emb1_np))
        ax.plot(x, emb1_np, label='Speaker 1', alpha=0.7)
        ax.plot(x, emb2_np, label='Speaker 2', alpha=0.7)

        ax.set_title('Speaker Embeddings Comparison')
        ax.set_xlabel('Embedding Dimension')
        ax.set_ylabel('Value')
        ax.legend()

        # Render to an in-memory PNG rather than a file on disk.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
        buf.seek(0)

        image = Image.open(buf)
        plt.close(fig)
        return image
103
+
104
def create_gradio_interface():
    """Build the Gradio interface for comparing two audio samples.

    Instantiates one ``SpeakerVerification`` (loading the model once) and
    wires it to a two-audio-input form that outputs a text summary and an
    embedding comparison image.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    speaker_verifier = SpeakerVerification()

    def process_audio(audio1, audio2):
        """Verify the two uploaded files; return (result text, plot or None)."""
        try:
            if audio1 is None or audio2 is None:
                return "Error: Please provide both audio samples", None

            result = speaker_verifier.verify_speaker(audio1, audio2)
            # verify_speaker reports failures in-band through an "Error: ..."
            # decision string. Surface that message directly instead of
            # unpacking blindly or going on to compute embeddings.
            if len(result) < 3 or str(result[1]).startswith("Error:"):
                return result[1], None
            probability, decision, score = result

            emb1 = speaker_verifier.get_embeddings(audio1)
            emb2 = speaker_verifier.get_embeddings(audio2)

            embeddings_plot = speaker_verifier.plot_embeddings_comparison(emb1, emb2)

            # The verification score is a cosine similarity (higher means more
            # alike), so it must not be labelled as a distance.
            result_text = f"Probability: {probability:.2%}\nCosine similarity: {score}\nDecision: {decision}"

            return result_text, embeddings_plot

        except Exception as e:
            return f"Error processing audio: {str(e)}", None

    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(label="Audio Sample 1", type="filepath"),
            gr.Audio(label="Audio Sample 2", type="filepath")
        ],
        outputs=[
            gr.Textbox(label="Result"),
            gr.Image(label="Embeddings Comparison", type="pil"),
        ],
        title="Speaker Verification System",
        description="Upload two audio samples to check if they're from the same speaker."
    )

    return interface
140
+
141
# Module-level handle to the assembled interface; it is built on import.
app = create_gradio_interface()

if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script.
    app.launch()