"""Gradio demo: offline speaker diarization of an audio file with NVIDIA NeMo MSDD."""
import nemo
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
import gradio as gr
import pandas as pd
import torch

# Run on GPU when available; NeMo models accept a plain device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)

_COLUMNS = ["start_time", "end_time", "speaker"]


def run_diarization(path1):
    """Diarize the audio file at ``path1``.

    Parameters
    ----------
    path1 : str or None
        Filesystem path to the audio, as provided by a Gradio audio input.
        ``None`` when the (optional) mic/upload component has no recording.

    Returns
    -------
    pandas.DataFrame
        One row per RTTM segment with columns ``start_time``, ``end_time``
        (both floats, seconds) and ``speaker`` (label string). Empty when
        no audio was supplied.
    """
    if path1 is None:
        # Optional inputs hand the callback None when nothing was recorded
        # or uploaded; return an empty table instead of crashing.
        return pd.DataFrame(columns=_COLUMNS)

    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()

    # RTTM fields (space-separated): type file chan onset duration ortho stype speaker ...
    rows = []
    for line in rttm.splitlines():
        fields = line.split()
        if len(fields) < 8:
            continue  # skip blank or malformed lines defensively
        start_time = float(fields[3])
        duration = float(fields[4])
        # Store start as float too, so both time columns share one dtype.
        rows.append((start_time, start_time + duration, fields[7]))

    # Build the frame once from collected rows (cheaper and clearer than
    # appending via df.loc inside the loop).
    return pd.DataFrame(rows, columns=_COLUMNS)


# NOTE(review): the original HTML markup (anchor tags / link URLs) is garbled
# in this copy of the file — only the link texts survive. Restore the real
# hrefs from the upstream source; the plain-text fallback below keeps the
# demo functional in the meantime.
article = (
    "<div style='text-align: center'>"
    "🎙️ Learn more about MSDD model | "
    "📚 MSDD paper | "
    "🧑💻 Repository"
    "</div>"
)

examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]


def _build_interface(audio_source, audio_label):
    """Build one demo tab; the two tabs differ only in audio source/label."""
    return gr.Interface(
        fn=run_diarization,
        inputs=[
            gr.Audio(
                source=audio_source,
                type="filepath",
                optional=True,
                label=audio_label,
            )
        ],
        outputs=[gr.components.Dataframe()],
        title="Offline Speaker Diarization with NeMo",
        description="This demonstration will perform offline speaker diarization on an audio file using nemo",
        article=article,
        layout="horizontal",
        theme="huggingface",
        allow_flagging=False,
        live=False,
        examples=examples,
    )


microphone_interface = _build_interface("microphone", "Mic Audio")
upload_interface = _build_interface("upload", "Upload File")

demo = gr.TabbedInterface(
    [microphone_interface, upload_interface], ["Microphone", "Upload File"]
)
demo.launch(enable_queue=True)