#!/usr/bin/env python
# encoding: utf-8
# The MIT License
# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import onnxruntime
import warnings
warnings.filterwarnings("ignore")
import os
# os.environ["CUDA_DEVICE_ORDER"]= '0'
import sys
import math
from iman import Audio
import numpy as np
from tensorflow import keras
from tensorflow.compat.v1.keras.backend import set_session
from tqdm import tqdm
from .thread_returning import ThreadReturning
import shutil
import time
import random
from skimage.util import view_as_windows as vaw
from .viterbi import viterbi_decoding
from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
from .features import media2feats
from .export_funcs import seg2csv, seg2textgrid
def _energy_activity(loge, ratio=0.4):
    # Frames whose log energy exceeds the mean finite log energy plus
    # log(ratio) are marked active; Viterbi decoding then smooths the binary
    # activity curve (an alternative tuning is log_trans_exp(150, cost0=-5)).
    threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
    raw_activity = (loge > threshold)
    return viterbi_decoding(pred2logemission(raw_activity),
                            log_trans_exp(50, cost0=-5))
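# Worked example (hypothetical numbers): for loge = [-1.0, -0.5, -3.0, -0.4]
# the finite mean is -1.225, so with ratio=0.4 the threshold is
# -1.225 + log(0.4) ≈ -2.14 and only the -3.0 frame starts inactive
# before Viterbi smoothing.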
def filter_sig(isig, wav, sr=16000):
    # Split wav into speech and non-speech samples according to isig, whose
    # entries are [label, start, stop, ...] lists with times in seconds.
    # Returns (speech_samples, nonspeech_samples).
    if sr != 16000:
        # assuming Audio.Resample(wav, target_sr, source_sr): after this call
        # the signal is at 16 kHz, so sample indices must use that rate
        wav = Audio.Resample(wav, 16000, sr)
        sr = 16000
    w = []
    wn = [wav[0: int(isig[0][1] * sr)]]
    for i, seg in enumerate(isig):  # entries may have 4 or 5 fields; only [1] and [2] are used
        w.append(wav[int(seg[1] * sr): int(seg[2] * sr)])
        if i + 1 < len(isig):
            wn.append(wav[int(seg[2] * sr): int(isig[i + 1][1] * sr)])
        else:
            wn.append(wav[int(seg[2] * sr): len(wav)])
    return (np.concatenate(w), np.concatenate(wn))
def filter_output(isig, max_silence=1, ignore_small_speech_segments=0.5,
                  max_speech_len=15, split_speech_bigger_than=20):
    if len(isig) == 0:
        return -1
    # Record the silence gap after each segment (-1 marks the last one).
    for i in range(len(isig) - 1):
        t = isig[i + 1][1] - isig[i][2]  # silence between two consecutive chunks
        isig[i].append(t)
    isig[-1].append(-1)
    if len(isig) > 0:
        # Gradually raise the tolerated silence gap and merge neighbouring
        # segments whenever the merged segment stays under max_speech_len.
        rang = np.arange(0.01, max_silence + 0.1, 0.1)
        for di in rang:
            for i, [_, _, _, _, _t] in enumerate(isig):
                if _t == -1:
                    break
                if _t <= di:
                    try:
                        if isig[i + 1][2] - isig[i][1] <= max_speech_len:
                            isig[i] = [isig[i][0], isig[i][1], isig[i + 1][2],
                                       isig[i + 1][2] - isig[i][1], isig[i + 1][4]]
                            del isig[i + 1]
                    except IndexError:
                        pass
    # Drop segments shorter than ignore_small_speech_segments (seconds).
    _dels = []
    for i, [_, _, _, _d, _] in enumerate(isig):
        if _d <= ignore_small_speech_segments:
            _dels.append(i)
    _dels.reverse()
    for i in _dels:
        del isig[i]
    if len(isig) == 0:
        return -1
    # Split segments longer than split_speech_bigger_than into equal parts,
    # keeping the trailing-silence field only on the last part.
    isign = []
    for i, [_, _, _, _d, _] in enumerate(isig):
        if _d > split_speech_bigger_than:
            _gc = math.ceil(_d / split_speech_bigger_than)
            m = _d / _gc
            print('Bigger-->' + str(_d) + '-->' + str(m))
            for jj in range(_gc):
                fas = 0
                if jj == _gc - 1:
                    fas = isig[i][4]
                isign.append([isig[i][0], isig[i][1] + m * jj,
                              isig[i][1] + m * (jj + 1), m, fas])
        else:
            isign.append(isig[i])
    for i, (a, b, c, d, e) in enumerate(isign):
        if e == -1:
            break
        _addlen = min(e, 1) / 2  # extend the segment end by at most half a second
        isign[i] = [a, b, c + _addlen, d + _addlen, e - _addlen]
    return isign
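# Sketch of the expected input format for filter_output (hypothetical numbers):
# each entry is a mutable [label, start, stop, duration] list in seconds, e.g.
#   isig = [['speech', 0.0, 4.0, 4.0], ['speech', 4.3, 6.0, 1.7]]
#   merged = filter_output(isig, max_silence=1)
# With only a 0.3 s gap the two chunks merge into a single entry spanning
# 0.0-6.0 s (plus the trailing-silence field appended by the function).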
def filter_output_1(vad, max_silence=1, ignore_small_speech_segments=0.5,
                    max_speech_len=15, split_speech_bigger_than=20):
    # Single-pass variant of filter_output: walk the raw VAD list, merging
    # chunks separated by at most max_silence seconds, then drop short
    # segments and split overly long ones into equal parts.
    isig = []
    i = 0
while (i <len(vad)):
ml=0
inn = i
st = (vad[i][1])
while ( (i<len(vad)-1 )and ( ( (vad[i+1][1]) - (vad[i][2]) ) <= max_silence)):
ml = (vad[i][2]) - st
if (ml > max_speech_len):
if (i>inn and i>0):
i=i-1
break
i=i+1
en = (vad[i][2])
fa = en-st
if (fa > ignore_small_speech_segments):
if (fa>split_speech_bigger_than):
_gc = math.ceil(fa/split_speech_bigger_than)
m = fa/_gc
print('Bigger-->' + str(fa) + '-->' + str(m))
for jj in range(_gc):
isig.append(('speech' , st + (m*jj) , st+ (m*(jj+1)) , m))
else:
isig.append(('speech', st , en,fa))
i=i+1
    isign = []
    for i, (a, b, c, d) in enumerate(isig):
        if i == len(isig) - 1:
            isign.append(isig[i])
            break
        _addlen = min(isig[i + 1][1] - c, 1) / 2  # extend the segment end by at most half a second
        isign.append([a, b, c + _addlen, d + _addlen])
    return isign
def get_path_3d(data, batch_size):
    # Split an (N, w, h) array into a list of batch_size-sized batches;
    # a smaller final batch holds the remainder.
    total_batches = data.shape[0] // batch_size
    last_batch_size = data.shape[0] % batch_size
    if last_batch_size != 0:
        batches = np.split(data[:total_batches * batch_size], total_batches)
        last_batch = np.expand_dims(data[total_batches * batch_size:], axis=0).squeeze()
        batches.append(last_batch)
    else:
        batches = np.split(data, total_batches)
    return batches
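# For example, an input of shape (70, 68, 21) with batch_size=32 yields three
# batches of shapes (32, 68, 21), (32, 68, 21) and (6, 68, 21).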
def _get_patches(mspec, w, step):
    # Cut the mel spectrogram into overlapping (w, h) windows, normalise each
    # patch to zero mean / unit variance, and pad both ends by repeating the
    # first and last patches so the output stays aligned with the input frames.
    h = mspec.shape[1]
    data = vaw(mspec, (w, h), step=step)
    data.shape = (len(data), w * h)
    data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
    lfill = [data[0, :].reshape(1, h * w)] * (w // (2 * step))
    rfill = [data[-1, :].reshape(1, h * w)] * (w // (2 * step) - 1 + len(mspec) % 2)
    data = np.vstack(lfill + [data] + rfill)
    finite = np.all(np.isfinite(data), axis=1)
    data.shape = (len(data), w, h)
    return data, finite
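# With w=68 and step=2 (the values used below), an (N, 24) mel spectrogram
# yields one patch of shape (68, 24) for every two input frames, thanks to
# the left/right padding with copies of the first and last patches.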
def _binidx2seglist(binidx):
"""
ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
#TODO: is there a pandas alternative??
"""
curlabel = None
bseg = -1
ret = []
for i, e in enumerate(binidx):
if e != curlabel:
if curlabel is not None:
ret.append((curlabel, bseg, i))
curlabel = e
bseg = i
ret.append((curlabel, bseg, i + 1))
return ret
class DnnSegmenter:
"""
DnnSegmenter is an abstract class allowing to perform Dnn-based
segmentation using Keras serialized models using 24 mel spectrogram
features obtained with SIDEKIT framework.
Child classes MUST define the following class attributes:
* nmel: the number of mel bands to used (max: 24)
* viterbi_arg: the argument to be used with viterbi post-processing
* model_fname: the filename of the serialized keras model to be used
the model should be stored in the current directory
* inlabel: only segments with label name inlabel will be analyzed.
other labels will stay unchanged
* outlabels: the labels associated the output of neural network models
"""
    def __init__(self, batch_size, vad_type, model_path, EP_list):
        # load the DNN model (ONNX Runtime; the Keras loader is kept for reference)
        if vad_type != 'vad':
            self.session = onnxruntime.InferenceSession(model_path, providers=EP_list)
            # self.nn = keras.models.load_model(model_path, compile=False)
            print('model loaded from --> ' + model_path)
            # self.nn.summary()
        self.batch_size = batch_size
def __call__(self, mspec, lseg, difflen = 0):
"""
*** input
* mspec: mel spectrogram
* lseg: list of tuples (label, start, stop) corresponding to previous segmentations
* difflen: 0 if the original length of the mel spectrogram is >= 68
otherwise it is set to 68 - length(mspec)
*** output
a list of adjacent tuples (label, start, stop)
"""
if self.nmel < 24:
mspec = mspec[:, :self.nmel].copy()
patches, finite = _get_patches(mspec, 68, 2)
if difflen > 0:
patches = patches[:-int(difflen / 2), :, :]
finite = finite[:-int(difflen / 2)]
assert len(finite) == len(patches), (len(patches), len(finite))
batch = []
for lab, start, stop in lseg:
if lab == self.inlabel:
batch.append(patches[start:stop, :])
if len(batch) > 0:
batch = np.concatenate(batch)
            batches = get_path_3d(batch, self.batch_size)
#rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
input_name = self.session.get_inputs()[0].name
rawpred=[]
for batch in tqdm(batches):
rawpred.append(self.session.run(None, {input_name: batch})[0])
rawpred = np.concatenate(rawpred)
ret = []
for lab, start, stop in lseg:
if lab != self.inlabel:
ret.append((lab, start, stop))
continue
l = stop - start
r = rawpred[:l]
rawpred = rawpred[l:]
            r[~finite[start:stop], :] = 0.5  # neutral posterior for non-finite patches
pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
for lab2, start2, stop2 in _binidx2seglist(pred):
ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
return ret
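    # Sketch: given lseg = [('energy', 0, 100), ('noEnergy', 100, 120)], only
    # the 100 patches of the 'energy' segment are run through the model; the
    # framewise posteriors are Viterbi-decoded and mapped to outlabels, while
    # ('noEnergy', 100, 120) is passed through unchanged.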
class SpeechMusic(DnnSegmenter):
# Voice activity detection: requires energetic activity detection
outlabels = ('speech', 'music')
inlabel = 'energy'
nmel = 21
viterbi_arg = 150
class SpeechMusicNoise(DnnSegmenter):
# Voice activity detection: requires energetic activity detection
outlabels = ('speech', 'music', 'noise')
inlabel = 'energy'
nmel = 21
viterbi_arg = 80
class Gender(DnnSegmenter):
# Gender Segmentation, requires voice activity detection
outlabels = ('female', 'male')
inlabel = 'speech'
nmel = 24
viterbi_arg = 80
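# A new segmenter only needs the four class attributes, e.g. (hypothetical):
# class MySegmenter(DnnSegmenter):
#     outlabels = ('a', 'b')
#     inlabel = 'speech'
#     nmel = 24
#     viterbi_arg = 80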
class Segmenter:
def __init__(self, vad_type = 'sad' , vad_engine='smn', detect_gender=False, sr=16000, batch_size=32 , complete_output=False,model_path="c:\\keras_speech_music_noise_cnn.onnx",gender_path="c:\\keras_male_female_cnn.onnx" , ffmpeg_path='c:\\ffmpeg.exe',device='cuda'):
"""
Load neural network models
Input:
'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
'sm' was used in the results presented in ICASSP 2017 paper
and in MIREX 2018 challenge submission
'smn' has been implemented more recently and has not been evaluated in papers
'detect_gender': if False, speech excerpts are return labelled as 'speech'
if True, speech excerpts are splitted into 'male' and 'female' segments
"""
self.complete_output = complete_output
self.sample_rate = sr
self.ffmpeg_path=ffmpeg_path
        if device != 'cuda':
            # CUDA_VISIBLE_DEVICES (not CUDA_DEVICE_ORDER) controls which GPUs
            # are visible; '-1' hides them all and forces CPU execution
            os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
            EP_list = ['CPUExecutionProvider']
        else:
            EP_list = ['CUDAExecutionProvider']
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.compat.v1.Session(config=config)
set_session(sess)
# self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks
# select speech/music or speech/music/noise voice activity detection engine
assert vad_engine in ['sm', 'smn']
        if vad_engine == 'sm':
            self.vad = SpeechMusic(batch_size, vad_type, model_path, EP_list)
        elif vad_engine == 'smn':
            self.vad = SpeechMusicNoise(batch_size, vad_type, model_path, EP_list)
# load gender detection NN if required
assert detect_gender in [True, False]
self.detect_gender = detect_gender
if detect_gender:
self.gender = Gender(batch_size , vad_type ,gender_path,EP_list)
self.vad_type = vad_type
self.model_path = model_path
self.gender_path = gender_path
def segment_feats(self, mspec, loge, difflen, start_sec):
"""
do segmentation
require input corresponding to wav file sampled at 16000Hz
with a single channel
"""
# perform energy-based activity detection
lseg = []
vadseg=[]
for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
if lab == 0:
lab = 'noEnergy'
else:
lab = 'energy'
vadseg.append(('speech', start, stop))
lseg.append((lab, start, stop))
        if self.vad_type == 'vad':
            return [(lab, start_sec + start * .02, start_sec + stop * .02, (stop - start) * .02) for lab, start, stop in vadseg]
# perform voice activity detection
lseg = self.vad(mspec, lseg, difflen)
# perform gender segmentation on speech segments
if self.detect_gender:
lseg = self.gender(mspec, lseg, difflen)
if (self.complete_output):
return [(lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02) for lab, start, stop in lseg ]
else:
return [[lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02] for lab, start, stop in lseg if (lab=='male' or lab=="female" or lab=="speech")]
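    # Note on units: segment boundaries above are in model frames; one frame
    # step corresponds to 0.02 s (2 feature frames at the usual 10 ms hop),
    # hence the "* .02" conversions in the return statements.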
def __call__(self, medianame, input_type='file',start_sec=None, stop_sec=None):
"""
Return segmentation of a given file
* convert file to wav 16k mono with ffmpeg
* call NN segmentation procedures
* media_name: path to the media to be processed (including remote url)
may include any format supported by ffmpeg
* tmpdir: allow to define a custom path for storing temporary files
fast read/write HD are a good choice
* start_sec (seconds): sound stream before start_sec won't be processed
* stop_sec (seconds): sound stream after stop_sec won't be processed
"""
mspec, loge, difflen , me = media2feats(medianame, input_type ,self.sample_rate,ffmpeg_path=self.ffmpeg_path)
if start_sec is None:
start_sec = 0
# do segmentation
return self.segment_feats(mspec, loge, difflen, start_sec),me
def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
if verbose:
print('batch_processing %d files' % len(linput))
if output_format == 'csv':
fexport = seg2csv
elif output_format == 'textgrid':
fexport = seg2textgrid
else:
raise NotImplementedError()
t_batch_start = time.time()
lmsg = []
fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
i = 0
for feats, msg in fg:
lmsg += msg
i += len(msg)
if verbose:
print('%d/%d' % (i, len(linput)), msg)
if feats is None:
break
            mspec, loge, difflen, _me = feats  # media2feats also returns the raw samples
#if verbose == True:
# print(i, linput[i], loutput[i])
b = time.time()
lseg = self.segment_feats(mspec, loge, difflen, 0)
fexport(lseg, loutput[len(lmsg) -1])
lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))
t_batch_dur = time.time() - t_batch_start
nb_processed = len([e for e in lmsg if e[1] == 0])
if nb_processed > 0:
avg = t_batch_dur / nb_processed
else:
avg = -1
return t_batch_dur, nb_processed, avg, lmsg
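    # Usage sketch (hypothetical paths):
    #   dur, n_ok, avg, msgs = seg.batch_process(['a.wav', 'b.wav'],
    #                                            ['a.csv', 'b.csv'])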
def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
    """
    To be used when processing batches
    if the resulting file already exists, it is skipped
    in case of remote files, access is tried nbtry times
    """
ret = None
msg = []
while ret is None and len(lin) > 0:
src = lin.pop(0)
dst = lout.pop(0)
# print('popping', src)
        # if the destination file already exists: skip
if skipifexist and os.path.exists(dst):
msg.append((dst, 1, 'already exists'))
continue
# create storing directory if required
dname = os.path.dirname(dst)
if not os.path.isdir(dname):
os.makedirs(dname)
        itry = 0
        errmsg = ''
        while ret is None and itry < nbtry:
            try:
                # assumed to mirror the media2feats call in Segmenter.__call__
                # above (the original referenced undefined names)
                ret = media2feats(src, 'file', sampling_rate)
            except:
                itry += 1
                errmsg = sys.exc_info()[0]
                if itry != nbtry:
                    time.sleep(random.random() * trydelay)
if ret is None:
msg.append((dst, 2, 'error: ' + str(errmsg)))
else:
msg.append((dst, 0, 'ok'))
return ret, msg
def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
    # Features for the next file are computed in a background thread so that
    # feature extraction overlaps with the segmentation of the previous file.
    thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
    thread.start()
while True:
ret, msg = thread.join()
# print('join done', len(ilist))
# print('new list', ilist)
#ilist = ilist[len(msg):]
#olist = olist[len(msg):]
if len(ilist) == 0:
break
thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
thread.start()
yield ret, msg
yield ret, msg
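# Minimal usage sketch (paths are placeholders; the ONNX models and an ffmpeg
# binary must be supplied by the user):
if __name__ == '__main__':
    seg = Segmenter(vad_engine='smn', detect_gender=False,
                    model_path='keras_speech_music_noise_cnn.onnx',
                    gender_path='keras_male_female_cnn.onnx',
                    ffmpeg_path='ffmpeg', device='cpu')
    segments, media = seg('example.wav')
    for label, start, stop, dur in segments:
        print(label, round(start, 2), round(stop, 2))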