|
import gdown |
|
import random |
|
|
|
import numpy as np |
|
|
|
from audio import read_mfcc |
|
from batcher import sample_from_mfcc |
|
from constants import SAMPLE_RATE, NUM_FRAMES |
|
from conv_models import DeepSpeakerModel |
|
from test import batch_cosine_similarity |
|
|
|
class speaker_recognition:
    """Enroll speakers and match new audio against them.

    Each enrolled speaker is stored in ``self.speakers`` as ``id -> data``,
    where ``data`` is whatever the caller passes to ``create_speaker`` /
    ``check_speakers`` (normally the output of ``run_transform``).
    Matching is done with ``batch_cosine_similarity``.
    """

    def __init__(self):
        """Seed the RNGs, initialise state, and load the model."""
        # Fixed seeds; presumably keeps the frame sampling done by
        # sample_from_mfcc repeatable between runs -- TODO confirm.
        np.random.seed(123)
        random.seed(123)

        self.speakers = {}    # speaker id -> enrolled data
        self.weights = ""     # path to the weights file; "" means "fetch it"
        self.by_name = True   # forwarded to load_weights(by_name=...)

        self.SAMPLE_RATE = SAMPLE_RATE
        self.NUM_FRAMES = NUM_FRAMES

        self.spin_up()

    def spin_up(self):
        """Make sure a weights file is available, then build and load the model.

        When ``self.weights`` is empty the weights are fetched to
        ``weights.h5`` in the current working directory. An already present
        ``weights.h5`` is reused instead of being downloaded again on every
        instantiation.

        Raises:
            RuntimeError: if the download reports failure by returning None.
        """
        import os  # local import: only needed for the cache check below

        if self.weights == "":
            output = "weights.h5"
            # NOTE(review): assumes an existing weights.h5 is a complete file
            # from a previous successful download -- delete it to force a
            # fresh fetch.
            if not os.path.exists(output):
                downloaded = gdown.download(
                    "https://drive.google.com/uc?id=1F9NvdrarWZNktdX9KlRYWWHDwRkip_aP",
                    output,
                    quiet=False,
                )
                # Some gdown versions signal failure by returning None rather
                # than raising; fail here with a clear message instead of
                # letting load_weights fail on a missing file.
                if downloaded is None:
                    raise RuntimeError(
                        f"could not download model weights to {output!r}"
                    )
            self.weights = output

        self.model = DeepSpeakerModel()
        self.model.m.load_weights(self.weights, by_name=self.by_name)

    def create_speaker(self, data, id=""):
        """Store ``data`` under ``id`` and return the id that was used.

        Args:
            data: the value to enroll for this speaker.
            id: explicit speaker id. An explicit id that already exists is
                overwritten. When empty, a numeric string id is generated.

        Returns:
            The id under which ``data`` was stored.
        """
        if id == "":
            # Start from the current count, but skip any id already taken
            # (e.g. a caller enrolled "1" explicitly) so an auto-generated
            # id never overwrites an existing speaker.
            index = len(self.speakers)
            while str(index) in self.speakers:
                index += 1
            id = str(index)
        self.speakers[id] = data
        return id

    def check_speakers(self, data, id="", threshold=0.5):
        """Find the enrolled speaker that best matches ``data``.

        Args:
            data: value to compare against every enrolled speaker.
            id: id to use if a new speaker has to be created ("" = auto).
            threshold: a score must be strictly greater than this to count
                as a match.

        Returns:
            ``(speaker_id, score)`` for the highest-scoring speaker above
            ``threshold``; otherwise ``data`` is enrolled as a new speaker
            and the string ``"created new speaker : <id>"`` is returned.
        """
        best_id = None
        best_score = None
        for speaker, reference in self.speakers.items():
            # Assumes batch_cosine_similarity returns one score per batch row
            # and that the batch holds a single row -- TODO confirm.
            score = batch_cosine_similarity(reference, data)[0]
            if score > threshold and (best_score is None or score > best_score):
                best_id = speaker
                best_score = score

        if best_id is None:
            id = self.create_speaker(data, id)
            return f"created new speaker : {id}"

        # Report the winning speaker's own score, not the score of whichever
        # speaker happened to be compared last.
        return (best_id, best_score)

    def run_transform(self, audio, pcm=False):
        """Run the model on ``audio`` and return its prediction.

        The audio is read with ``read_mfcc`` at ``self.SAMPLE_RATE``, cut to
        ``self.NUM_FRAMES`` with ``sample_from_mfcc``, given a leading batch
        axis and passed to ``self.model.m.predict``.

        Args:
            audio: passed straight to ``read_mfcc``.
            pcm: currently unused; kept for interface compatibility.

        Returns:
            The output of ``self.model.m.predict`` for the single-item batch.
        """
        mfcc = sample_from_mfcc(read_mfcc(audio, self.SAMPLE_RATE), self.NUM_FRAMES)
        return self.model.m.predict(np.expand_dims(mfcc, axis=0))