Spaces:

dhs-st
/

iden

Running

App Files Files Community

dhs-st committed on Jan 20

Commit

ea0886a

verified ·

1 Parent(s): fa376a3

Create app.py

Browse files

Files changed (1) hide show

app.py +144 -0

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import torch
+import torchaudio
+from speechbrain.inference.speaker import SpeakerRecognition
+import gradio as gr
+from pydub import AudioSegment
+import os
+import tempfile
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure
+import io
+from PIL import Image
class SpeakerVerification:
    """Compare two audio recordings with SpeechBrain's pretrained ECAPA model.

    The class wraps a ``SpeakerRecognition`` instance and adds audio format
    conversion, a score-to-probability mapping and an embedding plot.
    """

    # Sample rate (Hz) the pretrained spkrec-ecapa-voxceleb model expects
    # according to its model card; raw waveforms are resampled to it.
    MODEL_SAMPLE_RATE = 16000

    def __init__(self):
        """Load the pretrained model and set the decision threshold."""
        self.verification = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )
        # Score at which score_to_probability() yields exactly 0.5.
        self.threshold = 0.25

    @staticmethod
    def _discard(path: str) -> None:
        """Delete a temporary file, ignoring the case where it is already gone."""
        try:
            os.unlink(path)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the real result.
            pass

    def convert_audio(self, audio_path: str) -> str:
        """Return a path to a WAV version of ``audio_path``.

        Files whose extension is already ``.wav`` are returned unchanged.
        Anything else is decoded with pydub and written to a new temporary
        ``.wav`` file that the caller is responsible for deleting.

        Raises:
            Exception: whatever pydub raises while decoding or exporting; the
                error is printed and re-raised.
        """
        try:
            file_ext = os.path.splitext(audio_path)[1].lower()
            if file_ext == '.wav':
                return audio_path
            audio = AudioSegment.from_file(audio_path)
            # Reserve a path and close our handle before exporting, so the
            # export does not write to a file that is still open elsewhere.
            fd, temp_wav_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            try:
                audio.export(temp_wav_path, format='wav')
            except Exception:
                # Do not leave a half-written temp file behind.
                self._discard(temp_wav_path)
                raise
            return temp_wav_path
        except Exception as e:
            print(f"Error converting audio: {str(e)}")
            raise

    def score_to_probability(self, score: float) -> float:
        """Map a similarity score to (0, 1) with a logistic curve.

        The curve is centred on ``self.threshold`` (which maps to 0.5) and
        uses a fixed steepness of 10.
        """
        scale = 10
        centered_score = (score - self.threshold) * scale
        probability = 1 / (1 + np.exp(-centered_score))
        # Clamp defensively and hand back a plain Python float.
        return float(max(0.0, min(1.0, probability)))

    def verify_speaker(self, audio_path1: str, audio_path2: str) -> tuple[float, str, float]:
        """Decide whether two recordings come from the same speaker.

        Returns:
            ``(probability, decision, score)``. On failure the error is
            printed and ``(0.0, "Error: <message>", 0.0)`` is returned, so
            the result always unpacks into three values.
        """
        temp_paths = []
        try:
            wav_paths = []
            for source_path in (audio_path1, audio_path2):
                wav_path = self.convert_audio(source_path)
                if wav_path != source_path:
                    # Only files we created ourselves get deleted afterwards.
                    temp_paths.append(wav_path)
                wav_paths.append(wav_path)
            score, prediction = self.verification.verify_files(*wav_paths)
            score_value = score.item()
            probability = self.score_to_probability(score_value)
            decision = "Same speaker" if prediction.item() else "Different speakers"
            return probability, decision, score_value
        except Exception as e:
            print(f"Error in speaker verification: {str(e)}")
            return 0.0, f"Error: {str(e)}", 0.0
        finally:
            # Runs on success and failure alike, so temp files never leak.
            for temp_path in temp_paths:
                self._discard(temp_path)

    def get_embeddings(self, audio_path: str):
        """Return the squeezed speaker embedding for one recording.

        The waveform is down-mixed to a single channel and resampled to
        ``MODEL_SAMPLE_RATE`` before it is passed to the encoder.
        """
        wav_path = self.convert_audio(audio_path)
        try:
            signal, fs = torchaudio.load(wav_path)
        finally:
            # The waveform is in memory now (or loading failed); either way
            # the temporary copy is no longer needed.
            if wav_path != audio_path:
                self._discard(wav_path)
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        if fs != self.MODEL_SAMPLE_RATE:
            signal = torchaudio.functional.resample(signal, fs, self.MODEL_SAMPLE_RATE)
        with torch.no_grad():
            embeddings = self.verification.encode_batch(signal)
        return embeddings.squeeze()

    def plot_embeddings_comparison(self, emb1, emb2):
        """Render both embeddings as overlaid line plots and return a PIL image."""
        fig = Figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        emb1_np = emb1.detach().cpu().numpy()
        emb2_np = emb2.detach().cpu().numpy()
        x = range(len(emb1_np))
        ax.plot(x, emb1_np, label='Speaker 1', alpha=0.7)
        ax.plot(x, emb2_np, label='Speaker 2', alpha=0.7)
        ax.set_title('Speaker Embeddings Comparison')
        ax.set_xlabel('Embedding Dimension')
        ax.set_ylabel('Value')
        ax.legend()
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
        buf.seek(0)
        image = Image.open(buf)
        # Decode now so the image no longer depends on lazily reading `buf`.
        image.load()
        # The Figure was built directly rather than through pyplot, so there
        # is no pyplot-managed figure to close here.
        return image
def create_gradio_interface():
    """Build the Gradio UI that compares two uploaded audio samples.

    A single ``SpeakerVerification`` instance is created here and shared by
    every request handled by the returned interface.

    Returns:
        The configured ``gr.Interface``; the caller launches it.
    """
    speaker_verifier = SpeakerVerification()

    def process_audio(audio1, audio2):
        """Return ``(result_text, embeddings_plot)`` for two audio file paths.

        On any failure the text carries the error message and the plot is
        ``None``.
        """
        try:
            if audio1 is None or audio2 is None:
                return "Error: Please provide both audio samples", None
            outcome = speaker_verifier.verify_speaker(audio1, audio2)
            # verify_speaker reports failure through its message element
            # ("Error: ...") and may then return fewer than three values.
            # Show that message rather than failing on the unpack below.
            probability, decision, *rest = outcome
            if not rest or str(decision).startswith("Error:"):
                return decision, None
            score = rest[0]
            emb1 = speaker_verifier.get_embeddings(audio1)
            emb2 = speaker_verifier.get_embeddings(audio2)
            embeddings_plot = speaker_verifier.plot_embeddings_comparison(emb1, emb2)
            # The score is a similarity (higher means more alike), which is
            # how score_to_probability treats it, so label it accordingly.
            result_text = f"Probability: {probability:.2%}\nCosine similarity: {score}\nDecision: {decision}"
            return result_text, embeddings_plot
        except Exception as e:
            return f"Error processing audio: {str(e)}", None

    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(label="Audio Sample 1", type="filepath"),
            gr.Audio(label="Audio Sample 2", type="filepath")
        ],
        outputs=[
            gr.Textbox(label="Result"),
            gr.Image(label="Embeddings Comparison", type="pil"),
        ],
        title="Speaker Verification System",
        description="Upload two audio samples to check if they're from the same speaker."
    )
    return interface
# Build the interface when the module is loaded, so `app` exists as a
# module-level name whether this file is imported or executed.
app = create_gradio_interface()

# Start the web server only when the file is run directly as a script.
if __name__ == "__main__":
    app.launch()