import gdown
import random
import numpy as np

from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity


class speaker_recognition:
    def __init__(self):
        # Fixed seeds so MFCC sampling and prediction are reproducible across runs.
        np.random.seed(123)
        random.seed(123)
        self.speakers = {}   # speaker id -> embedding returned by run_transform
        self.weights = ""    # path to pretrained weights; downloaded if left empty
        self.by_name = True
        self.SAMPLE_RATE = SAMPLE_RATE
        self.NUM_FRAMES = NUM_FRAMES
        self.spin_up()

    def spin_up(self):
        # Download the pretrained Deep Speaker weights if no local path was set.
        if self.weights == "":
            output = "weights.h5"
            gdown.download(
                "https://drive.google.com/uc?id=1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP",
                output,
                quiet=False,
            )
            self.weights = output
        self.model = DeepSpeakerModel()
        self.model.m.load_weights(self.weights, by_name=self.by_name)

    def create_speaker(self, data, id=""):
        # Register an embedding under the given id, or auto-number it if none is given.
        id = id if id != "" else f"{len(self.speakers)}"
        self.speakers[id] = data
        return id

    def check_speakers(self, data, id="", threshold=0.5):
        # Compare the embedding against every enrolled speaker and keep the
        # best cosine similarity above the threshold.
        best_id = ""
        best_score = 0.0
        for speaker, embedding in self.speakers.items():
            score = float(batch_cosine_similarity(embedding, data)[0])
            if score > threshold and score > best_score:
                best_score = score
                best_id = speaker
        if best_id == "":
            # No match: enrol the embedding as a new speaker.
            id = self.create_speaker(data, id)
            return f"created new speaker : {id}"
        return (best_id, best_score)

    def run_transform(self, audio, pcm=False):
        # Convert an audio file into a Deep Speaker embedding.
        # (The pcm flag is currently unused.)
        data = sample_from_mfcc(read_mfcc(audio, self.SAMPLE_RATE), self.NUM_FRAMES)
        data = self.model.m.predict(np.expand_dims(data, axis=0))
        return data
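

# Example usage: a minimal sketch of the enrol-then-identify flow. The audio
# file paths and the "speaker_a" id below are hypothetical placeholders; the
# clips are assumed to be in a format read_mfcc can load at SAMPLE_RATE.
if __name__ == "__main__":
    sr = speaker_recognition()

    # Enrol a known speaker from a reference clip (placeholder path).
    reference_embedding = sr.run_transform("speaker_a_reference.wav")
    sr.create_speaker(reference_embedding, id="speaker_a")

    # Identify an unknown clip (placeholder path). check_speakers returns
    # (speaker_id, score) on a match above the threshold, or a
    # "created new speaker" message when no enrolled speaker matches.
    probe_embedding = sr.run_transform("unknown_clip.wav")
    print(sr.check_speakers(probe_embedding))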