"""Gradio demo that checks whether two speech recordings share a speaker.

Two audio clips are passed to a pretrained NeMo speaker-label model
(``nvidia/speakerverification_en_titanet_large``); the model's
``verify_speakers`` verdict selects which result snippet is rendered.
"""

from typing import Optional

import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): STYLE and the two OUTPUT_* templates are rendered through an
# HTML output component but contain no markup here -- it looks like the tags
# were stripped at some point. Confirm against the deployed version before
# relying on the exact text below.
STYLE = """ """

# Snippet shown when the model judges both clips to be the same speaker.
OUTPUT_OK = (
    STYLE
    + """

The provided samples are

Same Speakers!!!

"""
)

# Snippet shown when the model judges the clips to be different speakers.
OUTPUT_FAIL = (
    STYLE
    + """

The provided samples are from

Different Speakers!!!

"""
)

# Decision threshold forwarded to ``verify_speakers``.
THRESHOLD = 0.80

model_name = "nvidia/speakerverification_en_titanet_large"
# Loaded once at import time so every request reuses the same model instance.
model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)


def compare_samples(path1: Optional[str], path2: Optional[str]) -> str:
    """Report whether two recordings were spoken by the same person.

    Args:
        path1: File path of the first recording; falsy when nothing was
            recorded (both inputs are declared optional).
        path2: File path of the second recording; falsy when nothing was
            recorded.

    Returns:
        ``OUTPUT_OK`` when ``model.verify_speakers`` returns a truthy value,
        ``OUTPUT_FAIL`` otherwise, or a plain error message when either
        recording is missing.
    """
    # Guard clause: both inputs are optional, so either may be empty.
    if not (path1 and path2):
        return 'ERROR: Please record audio for *both* speakers!'

    same_speaker = model.verify_speakers(path1, path2, THRESHOLD)
    return OUTPUT_OK if same_speaker else OUTPUT_FAIL


# NOTE(review): ``gr.inputs`` / ``gr.outputs``, ``optional=``, ``layout=``,
# ``theme="huggingface"`` and ``launch(enable_queue=...)`` appear to be the
# legacy Gradio API -- confirm the pinned Gradio version before upgrading.
inputs = [
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
]
output = gr.outputs.HTML(label="")

description = (
    "This demonstration will analyze two recordings of speech and ascertain whether they have been spoken by the same individual.\n"
    "You can attempt this exercise using your own voice."
)

# The leading/trailing fragments contain line breaks; they are written as
# escapes because a raw newline inside a quoted literal is a SyntaxError.
# NOTE(review): the link labels carry no URLs here -- presumably anchor tags
# were stripped; verify against the deployed version.
article = (
    "\n\n"
    "🎙️ Learn more about TitaNet model | "
    "📚 TitaNet paper | "
    "🧑‍💻 Repository"
    "\n\n"
)

# Each row pre-fills both audio inputs with a pair of sample files.
examples = [
    ["data/id10270_5r0dWxy17C8-00001.wav", "data/id10270_5r0dWxy17C8-00002.wav"],
    ["data/id10271_1gtz-CUIygI-00001.wav", "data/id10271_1gtz-CUIygI-00002.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav", "data/id10271_1gtz-CUIygI-00001.wav"],
    ["data/id10270_5r0dWxy17C8-00002.wav", "data/id10271_1gtz-CUIygI-00002.wav"],
]

interface = gr.Interface(
    fn=compare_samples,
    inputs=inputs,
    outputs=output,
    title="Speaker Verification with TitaNet Embeddings",
    description=description,
    article=article,
    layout="horizontal",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)
interface.launch(enable_queue=True)