"""Speaker identification against enrolled voice samples.

Importing this module loads the pretrained
``nvidia/speakerverification_en_titanet_large`` model and computes one
reference embedding for every non-hidden file in ``SoundScribe/voices``.
``verify_speaker`` compares a new recording against those references.
"""
import os

import torch

import SoundScribe.SpeakerID.nemo.collections.asr as nemo_asr

speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large")

# Every entry of the voices directory; dot-files (e.g. ".DS_Store") are
# excluded from enrollment below.
speakers = os.listdir("SoundScribe/voices")
identified_speakers = [name for name in speakers if not name.startswith(".")]

# One reference embedding per enrolled file, index-aligned with
# ``identified_speakers``.
embeddings = [
    speaker_model.get_embedding("./SoundScribe/voices/" + audio_file).squeeze()
    for audio_file in identified_speakers
]


def verify_speaker(file, threshold=0.7):
    """Return the enrolled speaker that best matches the recording ``file``.

    The embedding of ``file`` is compared with every reference embedding
    using cosine similarity rescaled from [-1, 1] to [0, 1]. Each score is
    printed as ``"<filename> - <score>"``.

    Args:
        file: Audio path handed to ``speaker_model.get_embedding``.
        threshold: A rescaled score must be strictly greater than this value
            to count as a match. Defaults to 0.7.

    Returns:
        The enrolled filename up to its first ``"."`` for the highest-scoring
        match (the later entry wins on an exact tie), or
        ``"Unidentified User"`` when no score exceeds ``threshold``.
    """
    probe = speaker_model.get_embedding(file).squeeze()

    # The probe does not change across references, so normalise it once
    # instead of once per loop iteration.
    X = probe / torch.linalg.norm(probe)
    x_dot_x = torch.dot(X, X)

    best_score = None
    probably_speaker = ""
    for reference, speaker in zip(embeddings, identified_speakers):
        Y = reference / torch.linalg.norm(reference)
        # Cosine similarity, then mapped from [-1, 1] onto [0, 1].
        similarity_score = torch.dot(X, Y) / ((x_dot_x * torch.dot(Y, Y)) ** 0.5)
        similarity_score = (similarity_score + 1) / 2
        print(f"{speaker} - {similarity_score}")

        # Track the running best rather than re-scanning a list with max()
        # every iteration. ">=" keeps the later speaker on an exact tie.
        if similarity_score > threshold and (
            best_score is None or similarity_score >= best_score
        ):
            best_score = similarity_score
            # NOTE: keeps only the text before the FIRST dot, so a file
            # named "john.doe.wav" is reported as "john".
            probably_speaker = speaker.split(".")[0]

    return probably_speaker or "Unidentified User"


def find_user(audio):
    """Identify the speaker in ``audio``, print the result, and return it.

    Args:
        audio: Audio path forwarded unchanged to ``verify_speaker``.

    Returns:
        Whatever ``verify_speaker`` returns for ``audio``.
    """
    speaker = verify_speaker(audio)
    print(speaker)
    return speaker