#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import onnxruntime
import warnings
warnings.filterwarnings("ignore")
import os
# os.environ["CUDA_DEVICE_ORDER"] = '0'
import sys
import math

from iman import Audio
import numpy as np
from tensorflow import keras
from tensorflow.compat.v1.keras.backend import set_session
from tqdm import tqdm
from .thread_returning import ThreadReturning

import shutil
import time
import random

from skimage.util import view_as_windows as vaw

from .viterbi import viterbi_decoding
from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
from .features import media2feats
from .export_funcs import seg2csv, seg2textgrid


def _energy_activity(loge, ratio=0.4):  # previous ratio: 0.9
    """Energy-based speech activity detection, smoothed with Viterbi decoding."""
    threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
    raw_activity = (loge > threshold)
    return viterbi_decoding(pred2logemission(raw_activity),
                            log_trans_exp(50, cost0=-5))  # exp(150, cost0=-5)


def filter_sig(isig, wav, sr=16000):
    """Split a waveform into (speech, non-speech) samples according to a segment list.

    isig items are [label, start_sec, stop_sec, duration]; a fifth field may be
    present after merging in filter_output. Times are converted to sample indices.
    """
    if (sr != 16000):
        wav = Audio.Resample(wav, 16000, sr)
    try:
        # segments with 4 fields: [label, start, stop, duration]
        w = []
        wn = []
        wn.append(wav[0: int(isig[0][1] * sr)])
        for i, [_, a, b, _] in enumerate(isig):
            w.append(wav[int(a * sr): int(b * sr)])
            try:
                wn.append(wav[int(isig[i][2] * sr): int(isig[i + 1][1] * sr)])
            except:
                wn.append(wav[int(isig[i][2] * sr): len(wav)])
        return (np.concatenate(w), np.concatenate(wn))
    except:
        # segments with 5 fields: [label, start, stop, duration, silence_after]
        w = []
        wn = []
        wn.append(wav[0: int(isig[0][1] * sr)])
        for i, [_, a, b, _, _] in enumerate(isig):
            w.append(wav[int(a * sr): int(b * sr)])
            try:
                wn.append(wav[int(isig[i][2] * sr): int(isig[i + 1][1] * sr)])
            except:
                wn.append(wav[int(isig[i][2] * sr): len(wav)])
        return (np.concatenate(w), np.concatenate(wn))
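
# Illustrative sketch (not part of the original pipeline): how filter_sig splits a
# waveform according to a segment list. The synthetic waveform and the hand-written
# segments below are assumptions made purely for this example.
def _demo_filter_sig():
    sr = 16000
    wav = np.random.randn(sr * 4).astype(np.float32)  # 4 s of fake audio
    isig = [['speech', 0.5, 1.5, 1.0],                 # speech chunk from 0.5 s to 1.5 s
            ['speech', 2.0, 3.0, 1.0]]                 # speech chunk from 2.0 s to 3.0 s
    speech, non_speech = filter_sig(isig, wav, sr=sr)
    # speech holds the concatenated speech samples, non_speech the remaining audio
    return speech.shape, non_speech.shape
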
def filter_output(isig, max_silence=1, ignore_small_speech_segments=0.5,
                  max_speech_len=15, split_speech_bigger_than=20):
    """Merge, filter and split the speech segments returned by the segmenter.

    Adjacent segments separated by less than max_silence seconds are merged as long
    as the merged chunk stays under max_speech_len, segments shorter than
    ignore_small_speech_segments are dropped, and segments longer than
    split_speech_bigger_than are split into equal parts.
    Returns -1 when no segment survives.
    """
    if (len(isig) == 0):
        return -1

    # _dels = []
    # for i, [_, _, _, _d] in enumerate(isig):
    #     if (_d <= ignore_small_speech_segments):
    #         _dels.append(i)
    # _dels.reverse()
    # for i in _dels:
    #     del isig[i]
    # if (len(isig) == 0):
    #     return -1

    # append the silence duration between each chunk and the next one
    for i in range(len(isig) - 1):
        t = isig[i + 1][1] - isig[i][2]  # silence between two chunks
        isig[i].append(t)
    isig[-1].append(-1)

    if (len(isig) > 0):
        # progressively merge chunks separated by silences of increasing length
        rang = np.arange(0.01, max_silence + 0.1, 0.1)
        for di in rang:
            for i, [_, _, _, _, _t] in enumerate(isig):
                if (_t == -1):
                    break
                if (_t <= di):
                    try:
                        if (isig[i + 1][2] - isig[i][1] <= max_speech_len):
                            isig[i] = [isig[i][0], isig[i][1], isig[i + 1][2],
                                       isig[i + 1][2] - isig[i][1], isig[i + 1][4]]
                            del isig[i + 1]
                    except:
                        pass

    # drop segments that are still too short
    _dels = []
    for i, [_, _, _, _d, _] in enumerate(isig):
        if (_d <= ignore_small_speech_segments):
            _dels.append(i)
    _dels.reverse()
    for i in _dels:
        del isig[i]
    if (len(isig) == 0):
        return -1

    # split segments that are too long into equal parts
    isign = []
    for i, [_, _, _, _d, _] in enumerate(isig):
        if (_d > split_speech_bigger_than):
            _gc = math.ceil(_d / split_speech_bigger_than)
            m = _d / _gc
            print('Bigger-->' + str(_d) + '-->' + str(m))
            for jj in range(_gc):
                fas = 0
                if (jj == _gc - 1):
                    fas = isig[i][4]
                isign.append([isig[i][0], isig[i][1] + m * jj, isig[i][1] + (m * (jj + 1)), m, fas])
        else:
            isign.append(isig[i])

    for i, (a, b, c, d, e) in enumerate(isign):
        if (e == -1):
            break
        _addlen = min(e, 1) / 2  # at most half a second is added to the end of the segment
        isign[i] = [a, b, c + _addlen, d + _addlen, e - _addlen]

    return (isign)


def filter_output_1(vad, max_silence=1, ignore_small_speech_segments=0.5,
                    max_speech_len=15, split_speech_bigger_than=20):
    """Alternative chunking of raw vad segments into speech chunks of bounded length."""
    isig = []
    i = 0
    while (i < len(vad)):
        st = vad[i][1]
        inn = i
        # extend the current chunk while it stays below max_speech_len
        while (i < len(vad)):
            if (vad[i][2] - st > max_speech_len):
                if (i > inn and i > 0):
                    i = i - 1
                break
            i = i + 1
        if (i >= len(vad)):
            i = len(vad) - 1
        en = (vad[i][2])
        fa = en - st
        if (fa > ignore_small_speech_segments):
            if (fa > split_speech_bigger_than):
                _gc = math.ceil(fa / split_speech_bigger_than)
                m = fa / _gc
                print('Bigger-->' + str(fa) + '-->' + str(m))
                for jj in range(_gc):
                    isig.append(('speech', st + (m * jj), st + (m * (jj + 1)), m))
            else:
                isig.append(('speech', st, en, fa))
        i = i + 1

    isign = []
    for i, (a, b, c, d) in enumerate(isig):
        if (i == len(isig) - 1):
            isign.append(isig[i])
            break
        _addlen = min(isig[i + 1][1] - c, 1) / 2  # at most half a second is added to the end of the segment
        isign.append([a, b, c + _addlen, d + _addlen])
    return (isign)


def get_path_3d(data, batch_size):
    """Split a (n, w, h) patch array into a list of batches of size batch_size."""
    total_batches = data.shape[0] // batch_size
    last_batch_size = data.shape[0] % batch_size
    if total_batches == 0:
        # fewer samples than batch_size: return a single (smaller) batch
        return [data]
    if last_batch_size != 0:
        batches = np.split(data[:total_batches * batch_size], total_batches)
        last_batch = np.expand_dims(data[total_batches * batch_size:], axis=0).squeeze()
        batches.append(last_batch)
    else:
        batches = np.split(data, total_batches)
    return batches


def _get_patches(mspec, w, step):
    """Cut the mel spectrogram into overlapping, mean/variance normalised patches of w frames."""
    h = mspec.shape[1]
    data = vaw(mspec, (w, h), step=step)
    data.shape = (len(data), w * h)
    data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
    # pad by repeating the first and last patches so predictions stay aligned with frames
    lfill = [data[0, :].reshape(1, h * w)] * (w // (2 * step))
    rfill = [data[-1, :].reshape(1, h * w)] * (w // (2 * step) - 1 + len(mspec) % 2)
    data = np.vstack(lfill + [data] + rfill)
    finite = np.all(np.isfinite(data), axis=1)
    data.shape = (len(data), w, h)
    return data, finite


def _binidx2seglist(binidx):
    """
    ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
    Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]

    #TODO: is there a pandas alternative?
    """
    curlabel = None
    bseg = -1
    ret = []
    for i, e in enumerate(binidx):
        if e != curlabel:
            if curlabel is not None:
                ret.append((curlabel, bseg, i))
            curlabel = e
            bseg = i
    ret.append((curlabel, bseg, i + 1))
    return ret
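
# Illustrative sketch (assumption, not original code): how a mel spectrogram is cut
# into 68-frame patches with _get_patches and grouped into fixed-size batches with
# get_path_3d before being fed to the ONNX model. The random array stands in for a
# real (n_frames, 24) mel spectrogram.
def _demo_patching(batch_size=32):
    mspec = np.random.rand(500, 24).astype(np.float32)  # fake mel spectrogram, 500 frames
    patches, finite = _get_patches(mspec, 68, 2)         # one 68-frame patch every 2 frames
    # each patch is mean/variance normalised; `finite` flags patches free of NaN/inf
    batches = get_path_3d(patches, batch_size)           # list of arrays, each at most (batch_size, 68, 24)
    return patches.shape, len(batches)
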
class DnnSegmenter:
    """
    DnnSegmenter is an abstract class for performing DNN-based segmentation,
    using serialized neural network models (loaded here with ONNX Runtime) and
    24 mel spectrogram features obtained with the SIDEKIT framework.

    Child classes MUST define the following class attributes:
    * nmel: the number of mel bands to be used (max: 24)
    * viterbi_arg: the argument to be used with viterbi post-processing
    * model_fname: the filename of the serialized model to be used;
        the model should be stored in the current directory
    * inlabel: only segments with label name inlabel will be analyzed,
        other labels will stay unchanged
    * outlabels: the labels associated with the output of the neural network models
    """
    def __init__(self, batch_size, vad_type, model_path, EP_list):
        # load the DNN model (ONNX Runtime session), except for plain energy-based vad
        if (vad_type != 'vad'):
            self.session = onnxruntime.InferenceSession(model_path, providers=EP_list)
            # self.nn = keras.models.load_model(model_path, compile=False)
            print('model loaded from--> ' + model_path)
            # self.nn.summary()
        self.batch_size = batch_size

    def __call__(self, mspec, lseg, difflen=0):
        """
        *** input
        * mspec: mel spectrogram
        * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
        * difflen: 0 if the original length of the mel spectrogram is >= 68;
                otherwise it is set to 68 - length(mspec)
        *** output
        a list of adjacent tuples (label, start, stop)
        """
        if self.nmel < 24:
            mspec = mspec[:, :self.nmel].copy()

        patches, finite = _get_patches(mspec, 68, 2)
        if difflen > 0:
            patches = patches[:-int(difflen / 2), :, :]
            finite = finite[:-int(difflen / 2)]

        assert len(finite) == len(patches), (len(patches), len(finite))

        batch = []
        for lab, start, stop in lseg:
            if lab == self.inlabel:
                batch.append(patches[start:stop, :])

        if len(batch) > 0:
            batch = np.concatenate(batch)
            batches = get_path_3d(batch, self.batch_size)
            # rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
            input_name = self.session.get_inputs()[0].name
            rawpred = []
            for batch in tqdm(batches):
                rawpred.append(self.session.run(None, {input_name: batch})[0])
            rawpred = np.concatenate(rawpred)

        ret = []
        for lab, start, stop in lseg:
            if lab != self.inlabel:
                ret.append((lab, start, stop))
                continue
            l = stop - start
            r = rawpred[:l]
            rawpred = rawpred[l:]
            r[finite[start:stop] == False, :] = 0.5
            pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
            for lab2, start2, stop2 in _binidx2seglist(pred):
                ret.append((self.outlabels[int(lab2)], start2 + start, stop2 + start))
        return ret


class SpeechMusic(DnnSegmenter):
    # Voice activity detection: requires energy-based activity detection
    outlabels = ('speech', 'music')
    inlabel = 'energy'
    nmel = 21
    viterbi_arg = 150


class SpeechMusicNoise(DnnSegmenter):
    # Voice activity detection: requires energy-based activity detection
    outlabels = ('speech', 'music', 'noise')
    inlabel = 'energy'
    nmel = 21
    viterbi_arg = 80


class Gender(DnnSegmenter):
    # Gender segmentation: requires voice activity detection
    outlabels = ('female', 'male')
    inlabel = 'speech'
    nmel = 24
    viterbi_arg = 80
class Segmenter:
    def __init__(self, vad_type='sad', vad_engine='smn', detect_gender=False, sr=16000,
                 batch_size=32, complete_output=False,
                 model_path="c:\\keras_speech_music_noise_cnn.onnx",
                 gender_path="c:\\keras_male_female_cnn.onnx",
                 ffmpeg_path='c:\\ffmpeg.exe', device='cuda'):
        """
        Load neural network models.

        Input:

        'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
                'sm' was used in the results presented in the ICASSP 2017 paper
                        and in the MIREX 2018 challenge submission
                'smn' has been implemented more recently and has not been evaluated in papers

        'detect_gender': if False, speech excerpts are returned labelled as 'speech'
                if True, speech excerpts are split into 'male' and 'female' segments
        """
        self.complete_output = complete_output
        self.sample_rate = sr
        self.ffmpeg_path = ffmpeg_path

        if (device != 'cuda'):
            # hide GPUs from TensorFlow when CUDA is not requested
            os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
            EP_list = ['CPUExecutionProvider']
        else:
            EP_list = ['CUDAExecutionProvider']

        import tensorflow as tf
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True
        config.log_device_placement = True
        sess = tf.compat.v1.Session(config=config)
        set_session(sess)
        # self.graph = KB.get_session().graph  # To prevent the issue of keras with tensorflow backend for async tasks

        # select speech/music or speech/music/noise voice activity detection engine
        assert vad_engine in ['sm', 'smn']
        if vad_engine == 'sm':
            self.vad = SpeechMusic(batch_size, vad_type, model_path, EP_list)
        elif vad_engine == 'smn':
            self.vad = SpeechMusicNoise(batch_size, vad_type, model_path, EP_list)

        # load gender detection NN if required
        assert detect_gender in [True, False]
        self.detect_gender = detect_gender
        if detect_gender:
            self.gender = Gender(batch_size, vad_type, gender_path, EP_list)

        self.vad_type = vad_type
        self.model_path = model_path
        self.gender_path = gender_path

    def segment_feats(self, mspec, loge, difflen, start_sec):
        """
        Do the segmentation.
        Requires input corresponding to a wav file sampled at 16000 Hz with a single channel.
        """
        # perform energy-based activity detection
        lseg = []
        vadseg = []
        for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
            if lab == 0:
                lab = 'noEnergy'
            else:
                lab = 'energy'
                vadseg.append(('speech', start, stop))
            lseg.append((lab, start, stop))

        if (self.vad_type == 'vad'):
            # energy-only vad: frame indices are converted to seconds (1 frame = 0.02 s)
            return [(lab, start_sec + start * .02, start_sec + stop * .02, stop - start)
                    for lab, start, stop in vadseg]

        # perform voice activity detection
        lseg = self.vad(mspec, lseg, difflen)

        # perform gender segmentation on speech segments
        if self.detect_gender:
            lseg = self.gender(mspec, lseg, difflen)

        if (self.complete_output):
            return [(lab, start_sec + start * .02, start_sec + stop * .02, (stop - start) * .02)
                    for lab, start, stop in lseg]
        else:
            return [[lab, start_sec + start * .02, start_sec + stop * .02, (stop - start) * .02]
                    for lab, start, stop in lseg
                    if (lab == 'male' or lab == "female" or lab == "speech")]

    def __call__(self, medianame, input_type='file', start_sec=None, stop_sec=None):
        """
        Return the segmentation of a given media file.
                * convert the file to wav 16k mono with ffmpeg
                * call the NN segmentation procedures
        * medianame: path to the media to be processed (including remote url);
                may include any format supported by ffmpeg
        * input_type: forwarded to media2feats ('file' by default)
        * start_sec (seconds): sound stream before start_sec won't be processed
        * stop_sec (seconds): sound stream after stop_sec won't be processed
        """
        mspec, loge, difflen, me = media2feats(medianame, input_type, self.sample_rate,
                                               ffmpeg_path=self.ffmpeg_path)
        if start_sec is None:
            start_sec = 0
        # do segmentation
        return self.segment_feats(mspec, loge, difflen, start_sec), me

    def batch_process(self, linput, loutput, verbose=False, skipifexist=False,
                      nbtry=1, trydelay=2., output_format='csv'):

        if verbose:
            print('batch_processing %d files' % len(linput))

        if output_format == 'csv':
            fexport = seg2csv
        elif output_format == 'textgrid':
            fexport = seg2textgrid
        else:
            raise NotImplementedError()

        t_batch_start = time.time()

        lmsg = []
        fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
        i = 0
        for feats, msg in fg:
            lmsg += msg
            i += len(msg)
            if verbose:
                print('%d/%d' % (i, len(linput)), msg)
            if feats is None:
                break
            mspec, loge, difflen = feats
            # if verbose == True:
            #     print(i, linput[i], loutput[i])
            b = time.time()
            lseg = self.segment_feats(mspec, loge, difflen, 0)
            fexport(lseg, loutput[len(lmsg) - 1])
            lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() - b))

        t_batch_dur = time.time() - t_batch_start
        nb_processed = len([e for e in lmsg if e[1] == 0])
        if nb_processed > 0:
            avg = t_batch_dur / nb_processed
        else:
            avg = -1
        return t_batch_dur, nb_processed, avg, lmsg
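
# Illustrative usage sketch (assumption): the model and ffmpeg paths below are
# placeholders and must be adapted to your installation; 'example.wav' is a
# hypothetical input file. Only the Segmenter / filter_output API defined above
# is exercised here.
def _demo_segment_file(media='example.wav'):
    seg = Segmenter(vad_type='sad', vad_engine='smn', detect_gender=False,
                    model_path='keras_speech_music_noise_cnn.onnx',
                    gender_path='keras_male_female_cnn.onnx',
                    ffmpeg_path='ffmpeg', device='cpu')
    segments, me = seg(media, input_type='file')   # [label, start_sec, stop_sec, duration] items
    merged = filter_output(segments, max_silence=1, ignore_small_speech_segments=0.5,
                           max_speech_len=15, split_speech_bigger_than=20)
    return segments, merged
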
def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
    """
    To be used when processing batches.
    If the resulting file exists, it is skipped.
    In case of remote files, access is tried nbtry times.
    """
    ret = None
    msg = []
    while ret is None and len(lin) > 0:
        src = lin.pop(0)
        dst = lout.pop(0)
        # print('popping', src)

        # if the destination file already exists: skip
        if skipifexist and os.path.exists(dst):
            msg.append((dst, 1, 'already exists'))
            continue

        # create storing directory if required
        dname = os.path.dirname(dst)
        if not os.path.isdir(dname):
            os.makedirs(dname)

        itry = 0
        while ret is None and itry < nbtry:
            try:
                # aligned with the media2feats signature used in Segmenter.__call__;
                # keep only (mspec, loge, difflen), as expected by batch_process
                ret = media2feats(src, 'file', sampling_rate)[:3]
            except:
                itry += 1
                errmsg = sys.exc_info()[0]
                if itry != nbtry:
                    time.sleep(random.random() * trydelay)
        if ret is None:
            msg.append((dst, 2, 'error: ' + str(errmsg)))
        else:
            msg.append((dst, 0, 'ok'))
    return ret, msg


def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
    """Yield (features, messages) pairs while the next file is decoded in a background thread."""
    # print('init feat gen', len(ilist))
    thread = ThreadReturning(target=medialist2feats,
                             args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
    thread.start()
    while True:
        ret, msg = thread.join()
        # print('join done', len(ilist))
        # print('new list', ilist)
        # ilist = ilist[len(msg):]
        # olist = olist[len(msg):]
        if len(ilist) == 0:
            break
        thread = ThreadReturning(target=medialist2feats,
                                 args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
        thread.start()
        yield ret, msg
    yield ret, msg
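
# Optional entry point (an assumption, not part of the original module): runs the usage
# sketch above. Because this module uses relative imports, it must be run as part of its
# package, e.g. `python -m <your_package>.segmenter some_audio.wav` from the package's
# parent directory; the package name depends on how the code is installed.
if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else 'example.wav'  # placeholder default
    segs, merged = _demo_segment_file(target)
    print(segs)
    print(merged)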