import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel
# Run inference on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Shared prefix prepended to both result templates below.
# NOTE(review): as written this holds only a newline -- presumably a slot for
# shared styling/markup; confirm against the deployed version.
STYLE = """
"""
# Result template returned when the two samples are judged to be the same speaker.
# The single `{:.1f}` placeholder is filled via `str.format` with the similarity
# expressed as a percentage.
OUTPUT_OK = (
STYLE
+ """
The provided samples are
Same Speakers!!!
similarity score: {:.1f}%
(Similarity score must be atleast 80% to be considered as same speaker)
"""
)
# Result template returned when the two samples are judged to be different speakers.
# Takes the same single `{:.1f}` placeholder as OUTPUT_OK.
OUTPUT_FAIL = (
STYLE
+ """
The provided samples are from
Different Speakers!!!
similarity score: {:.1f}%
(Similarity score must be atleast 80% to be considered as same speaker)
"""
)
# Minimum similarity (on the 0-1 scale computed in `compare_samples`) required for
# a "same speaker" verdict. The "80%" quoted in the templates above is hard-coded
# text and must be edited by hand if this value changes.
THRESHOLD = 0.80
# Identifier of the pretrained speaker-verification model handed to `from_pretrained`.
model_name = "nvidia/speakerverification_en_titanet_large"
# Load the model once at import time and move it to the selected device.
model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)
def compare_samples(path1, path2):
    """Decide whether two audio recordings were spoken by the same person.

    Args:
        path1: Path to the first recording; falsy when nothing was provided.
        path2: Path to the second recording; falsy when nothing was provided.

    Returns:
        A plain error message when either path is missing. Otherwise
        ``OUTPUT_OK`` or ``OUTPUT_FAIL`` formatted with the similarity score
        as a percentage, depending on whether the score reaches ``THRESHOLD``.
    """
    if not (path1 and path2):
        return 'ERROR: Please record audio for *both* speakers!'

    embs1 = model.get_embedding(path1).squeeze()
    embs2 = model.get_embedding(path2).squeeze()

    # Cosine similarity is scale-invariant, so a separate length-normalisation
    # pass followed by a second division by the norms is unnecessary. The
    # library call also clamps the denominator, so an all-zero embedding gives
    # 0 instead of NaN. `.item()` converts the 0-dim tensor to a Python float
    # before it is compared and formatted.
    cosine = torch.nn.functional.cosine_similarity(embs1, embs2, dim=0).item()

    # Rescale from [-1, 1] to [0, 1] so the score can be reported as a percentage.
    similarity_score = (cosine + 1) / 2

    # Decision
    template = OUTPUT_OK if similarity_score >= THRESHOLD else OUTPUT_FAIL
    return template.format(similarity_score * 100)
# Each tab takes one audio component per speaker; only the label differs
# between the two components of a tab.
_SPEAKER_LABELS = ("Speaker #1", "Speaker #2")

# Components for the tab that records from the microphone.
inputs = [
    gr.Audio(sources=["microphone"], type="filepath", label=speaker_label)
    for speaker_label in _SPEAKER_LABELS
]

# Components for the tab that accepts uploaded files.
upload_inputs = [
    gr.Audio(sources=["upload"], type="filepath", label=speaker_label)
    for speaker_label in _SPEAKER_LABELS
]
# Introductory text handed to each interface through its `description=` argument.
description = (
    "This demonstration will analyze two recordings of speech and ascertain whether they have been spoken by the same individual.\n"
    "You can attempt this exercise using your own voice."
)

# Footer text handed to each interface through its `article=` argument.
# The closing fragment used to be a quoted string broken across two physical
# lines, which Python rejects as an unterminated literal. It is spelled as an
# explicit "\n" escape so the value keeps the same content.
# NOTE(review): the link labels carry no URLs or markup here -- presumably the
# anchors were lost somewhere upstream; confirm against the deployed version.
article = (
    ""
    "🎙️ Learn more about TitaNet model | "
    "📚 TitaNet paper | "
    "🧑💻 Repository"
    "\n"
)
# Bundled sample clips. The file names suggest two clips for each of two
# speaker ids (id10270 and id10271) -- verify against the data directory.
_ID10270_CLIP_1 = "data/id10270_5r0dWxy17C8-00001.wav"
_ID10270_CLIP_2 = "data/id10270_5r0dWxy17C8-00002.wav"
_ID10271_CLIP_1 = "data/id10271_1gtz-CUIygI-00001.wav"
_ID10271_CLIP_2 = "data/id10271_1gtz-CUIygI-00002.wav"

# Example input pairs offered in the UI: the first two rows pair clips that
# share an id, the last two rows pair clips with different ids.
examples = [
    [_ID10270_CLIP_1, _ID10270_CLIP_2],
    [_ID10271_CLIP_1, _ID10271_CLIP_2],
    [_ID10270_CLIP_1, _ID10271_CLIP_1],
    [_ID10270_CLIP_2, _ID10271_CLIP_2],
]
def _build_interface(audio_inputs):
    """Create one speaker-verification ``gr.Interface`` around ``compare_samples``.

    The microphone and upload tabs are configured identically apart from their
    input components, so both are produced by this single builder.

    Args:
        audio_inputs: The input components to use for this interface.

    Returns:
        The configured ``gr.Interface``.
    """
    return gr.Interface(
        fn=compare_samples,
        inputs=audio_inputs,
        outputs=gr.HTML(label=""),
        title="Speaker Verification with TitaNet Embeddings",
        description=description,
        article=article,
        theme="huggingface",
        # The flagging option is documented as a string mode; "never" is the
        # string spelling of the boolean False that was passed before.
        allow_flagging="never",
        live=False,
        examples=examples,
    )


microphone_interface = _build_interface(inputs)
upload_interface = _build_interface(upload_inputs)
# One tab per input method, listed in the same order as their titles.
_tab_interfaces = [microphone_interface, upload_interface]
_tab_titles = ["Microphone", "Upload File"]
demo = gr.TabbedInterface(_tab_interfaces, _tab_titles)

# Turn on request queuing with the configured limits, then start the app.
demo.queue(max_size=5, default_concurrency_limit=4)
demo.launch()