TomCallan's picture
Upload 14 files
aed64b5
import gdown
import random
import numpy as np
from audio import read_mfcc
from batcher import sample_from_mfcc
from constants import SAMPLE_RATE, NUM_FRAMES
from conv_models import DeepSpeakerModel
from test import batch_cosine_similarity
class speaker_recognition:
def __init__(self):
np.random.seed(123)
random.seed(123)
self.speakers = {}
self.weights = ""
self.by_name = True
self.SAMPLE_RATE = SAMPLE_RATE
self.NUM_FRAMES = NUM_FRAMES
self.spin_up()
def spin_up(self):
if self.weights == "":
output = "weights.h5"
gdown.download("https://drive.google.com/uc?id=1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP", output, quiet=False)
self.weights = "weights.h5"
self.model = DeepSpeakerModel()
self.model.m.load_weights(self.weights, by_name=True)
def create_speaker(self, data, id=""):
id = id if id != "" else f"{len(self.speakers)}"
self.speakers[id] = data
return id
def check_speakers(self, data, id="", threshold = 0.5):
us = ""
n = 0
for speaker in self.speakers:
k = batch_cosine_similarity(self.speakers[speaker], data)
if k > threshold:
if k > n:
n = k
us = speaker
else:pass
if n == 0:
id = self.create_speaker(data, id)
return f"created new speaker : {id}"
return (us, k[0])
def run_transform(self, audio, pcm = False):
data = sample_from_mfcc(read_mfcc(audio, self.SAMPLE_RATE), self.NUM_FRAMES)
data = self.model.m.predict(np.expand_dims(data, axis=0))
return data