imansarraf committed on
Commit
ab53da2
1 Parent(s): d1a38da

Upload 9 files

sad_tf/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .segmenter import Segmenter,filter_output,filter_sig
2
+ from .export_funcs import seg2aud,seg2json,seg2Gender_Info,seg2Info
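A minimal end-to-end sketch of the public API exported above (the .segmenter module it imports from is part of this upload but not shown on this page); the model, ffmpeg, and audio paths are placeholders, not files shipped with this commit:

from sad_tf import Segmenter, filter_output, filter_sig, seg2aud, seg2json

seg = Segmenter(vad_engine='smn', detect_gender=False,
                model_path='/models/keras_speech_music_noise_cnn.hdf5',
                ffmpeg_path='/usr/bin/ffmpeg', device='cpu')

segments, signal = seg('example.wav')                  # list of [label, start, stop, duration]
segments = filter_output(segments, max_silence=1)      # merge/split; -1 means nothing was kept
if segments != -1:
    speech, non_speech = filter_sig(segments, signal)  # concatenated speech / non-speech samples
    seg2aud(segments, 'example_labels.txt')            # tab-separated label track
    print(seg2json([segments]))                        # one JSON entry per channel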
sad_tf/export_funcs.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import pandas as pd
27
+ from pytextgrid.PraatTextGrid import PraatTextGrid, Interval, Tier
28
+ import os
29
+ import json
30
+
31
+ def seg2csv(lseg, fout=None):
32
+ df = pd.DataFrame.from_records(lseg, columns=['labels', 'start', 'stop'])
33
+ df.to_csv(fout, sep='\t', index=False)
34
+
35
+ def seg2textgrid1(lseg, fout=None):
36
+ tier = Tier(name='inaSpeechSegmenter')
37
+ for label, start, stop,_ in lseg:
38
+ if (label=='noEnergy'):
39
+ label=''
40
+ tier.append(Interval(start, stop, label))
41
+ ptg = PraatTextGrid(xmin=lseg[0][1], xmax=lseg[-1][2])
42
+ ptg.append(tier)
43
+ ptg.save(fout)
44
+
45
+
46
+ def seg2json(lseg) :
47
+ try:
48
+ return(seg2json5(lseg))
49
+ except:
50
+ return(seg2json4(lseg))
51
+
52
+
53
+
54
+
55
+ def seg2Info(lseg):
56
+
57
+
58
+ x=[]
59
+ nch=0
60
+ for segs in lseg:
61
+ f=0
62
+ nch = nch+1
63
+ data_list=[]
64
+ if (segs!=-1):
65
+ for y in segs:
66
+ if (y[0]!='noEnergy'):
67
+ f = f + y[2] - y[1]
68
+
69
+
70
+ data = {
71
+ 'channel' : nch,
72
+ 'speech': f
73
+ }
74
+ x.append(data)
75
+ return(json.dumps(x))
76
+
77
+
78
+ def seg2Gender_Info(lseg):
79
+
80
+
81
+ x=[]
82
+ nch=0
83
+ for segs in lseg:
84
+ f=0
85
+ m=0
86
+ nch = nch+1
87
+ data_list=[]
88
+ if (segs!=-1):
89
+ for y in segs:
90
+ if (y[0]!='noEnergy'):
91
+ if (y[0] == "female"):
92
+ f = f + y[2] - y[1]
93
+ elif(y[0] == "male"):
94
+ m = m + y[2] - y[1]
95
+
96
+
97
+ data = {
98
+ 'channel' : nch,
99
+ 'male': m,
100
+ 'female': f
101
+ }
102
+ x.append(data)
103
+ return(json.dumps(x))
104
+
105
+ def seg2json5(lseg):
106
+
107
+
108
+ x=[]
109
+ nch=0
110
+ for segs in lseg:
111
+ nch = nch+1
112
+ data_list=[]
113
+ if (segs!=-1):
114
+ for label, start, stop ,_,_ in segs:
115
+ if (label!='noEnergy'):
116
+ data = {
117
+ 'startTime': start,
118
+ 'endTime': stop,
119
+ 'gender': label[0]
120
+ }
121
+ data_list.append(data)
122
+ data = {
123
+ 'channel' : nch,
124
+ 'segments' : data_list
125
+ }
126
+ x.append(data)
127
+ return(json.dumps(x))
128
+
129
+ def seg2json4(lseg):
130
+
131
+ x=[]
132
+ nch=0
133
+ for segs in lseg:
134
+ nch = nch+1
135
+ data_list=[]
136
+ if (segs!=-1):
137
+ for label, start, stop ,_ in segs:
138
+ if (label!='noEnergy'):
139
+ data = {
140
+ 'startTime': start,
141
+ 'endTime': stop,
142
+ 'gender': label[0]
143
+ }
144
+ data_list.append(data)
145
+ data = {
146
+ 'channel' : nch,
147
+ 'segments' : data_list
148
+ }
149
+ x.append(data)
150
+ return(json.dumps(x))
151
+
152
+
153
+
154
+
155
+ def seg2aud(lseg , fout=None) :
156
+ try:
157
+ seg2aud5(lseg , fout)
158
+ except:
159
+ seg2aud4(lseg , fout)
160
+
161
+ def seg2aud5(lseg , fout=None):
162
+ if (lseg==-1):
163
+ return
164
+ with open(fout , 'w') as fid:
165
+ for label, start, stop ,_,_ in lseg:
166
+ if (label!='noEnergy'):
167
+ fid.write('%s\t%s\t%s\n' %(start , stop , label))
168
+
169
+ def seg2aud4(lseg , fout=None):
170
+ if (lseg==-1):
171
+ return
172
+ with open(fout , 'w') as fid:
173
+ for label, start, stop ,_ in lseg:
174
+ if (label!='noEnergy'):
175
+ fid.write('%s\t%s\t%s\n' %(start , stop , label))
176
+
177
+ def seg2textgrid(data , fout=None):
178
+ ghabli=False
179
+ kh=[]
180
+ if (True):
181
+ kh.append('File type = "ooTextFile"\n')
182
+ kh.append('Object class = "TextGrid"\n')
183
+ kh.append('\n')
184
+ kh.append('xmin = 0 \n')
185
+ kh.append('xmax = %s \n' %(data[-1][2]))
186
+ kh.append('tiers? <exists> \n')
187
+ kh.append('size = 1 \n')
188
+ kh.append('item []: \n')
189
+ kh.append(' item [1]:\n')
190
+ kh.append(' class = "IntervalTier" \n')
191
+ kh.append(' name = "sen" \n')
192
+ kh.append(' xmin = 0 \n')
193
+ kh.append(' xmax = %s \n' %(data[-1][2]))
194
+ kh.append(' intervals: size = %s \n' %(0))
195
+ x=1
196
+
197
+ if (float(data[0][1])>0):
198
+ kh.append(' intervals [%s]:\n' %(x))
199
+ kh.append(' xmin = 0\n')
200
+ kh.append(' xmax = %s \n' %(data[0][1]))
201
+ kh.append(' text = "" \n')
202
+ x=x+1
203
+
204
+
205
+ for i in range(len(data)):
206
+ kh.append(' intervals [%s]:\n' %(x))
207
+ if (ghabli):
208
+ kh.append(' xmin = %s \n' %(data[i-1][2]))
209
+ else:
210
+ kh.append(' xmin = %s \n' %(data[i][1]))
211
+ kh.append(' xmax = %s \n' %(data[i][2]))
212
+ kh.append(' text = "%s" \n' %(data[i][0].strip()))
213
+ x=x+1
214
+
215
+ if (i+1 >= len(data)):
216
+ break
217
+
218
+ if (data[i][2] != data[i+1][1]):
219
+
220
+
221
+ if (float(data[i+1][1]) - float(data[i][2]) > 0.5):
222
+ kh.append(' intervals [%s]:\n' %(x))
223
+
224
+ kh.append(' xmin = %s \n' %(data[i][2]))
225
+ kh.append(' xmax = %s \n' %(data[i+1][1]))
226
+ kh.append(' text = "" \n')
227
+ x=x+1
228
+ ghabli=False
229
+ else:
230
+ ghabli=True
231
+
232
+
233
+ kh[13] = (' intervals: size = %s \n' %(kh[-4].strip().split(' ')[1].replace('[','').replace(']','').replace(':','')))
234
+
235
+
236
+ with open(fout, mode='w') as fid:
237
+ for line in kh:
238
+ fid.write(line)
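A small sketch of the segment format these export helpers consume (times in seconds; the values are illustrative). seg2json and seg2Gender_Info take one segment list per channel and fall back from the 5-field to the 4-field layout automatically:

from sad_tf.export_funcs import seg2json, seg2Gender_Info, seg2aud

# [label, start, stop, duration, trailing silence] as produced by the segmenter's filter_output
lseg = [['male',   0.0, 3.2, 3.2, 0.2],
        ['female', 3.4, 7.9, 4.5, -1]]

print(seg2json([lseg]))          # [{"channel": 1, "segments": [...]}]
print(seg2Gender_Info([lseg]))   # per-channel totals of male and female speech
seg2aud(lseg, 'labels.txt')      # tab-separated start/stop/label rows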
sad_tf/features.py ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import os
27
+ import numpy as np
28
+ from iman import Audio
29
+
30
+ #os.environ['SIDEKIT'] = 'theano=false,libsvm=false,cuda=false'
31
+ #from sidekit.frontend.io import read_wav
32
+ #from sidekit.frontend.features import mfcc
33
+ from .sidekit_mfcc import mfcc
34
+
35
+
36
+ def _wav2feats(wavname,input_type='file',sr=16000,ffmpeg_path='c:\\ffmpeg.exe'):
37
+ """
38
+ Extract features for wav 16k mono
39
+ """
40
+
41
+ if (input_type == 'file'):
42
+ sig = Audio.Read(wavname , sr,mono = True, ffmpeg_path=ffmpeg_path)
43
+ else:
44
+ sig = wavname
45
+
46
+ read_framerate=sr
47
+
48
+
49
+ _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True,fs=sr, maxfreq=int(sr/2))
50
+
51
+ # Management of short duration segments
52
+ difflen = 0
53
+ if len(loge) < 68:
54
+ difflen = 68 - len(loge)
55
+ mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))
56
+
57
+ return mspec, loge, difflen,sig
58
+
59
+
60
+ def media2feats(medianame,input_type='file', sr=16000,ffmpeg_path='c:\\ffmpeg.exe'):
61
+
62
+ return _wav2feats(medianame, input_type , sr,ffmpeg_path=ffmpeg_path)
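A short sketch of this feature front-end; the audio and ffmpeg paths are placeholders. Clips shorter than 68 frames are padded and difflen records how many frames were added:

from sad_tf.features import media2feats

mspec, loge, difflen, sig = media2feats('example.wav', input_type='file',
                                        sr=16000, ffmpeg_path='/usr/bin/ffmpeg')
print(mspec.shape)   # (n_frames, 24) mel spectrogram
print(len(loge))     # one log-energy value per frame
print(difflen)       # 0 unless the clip was shorter than 68 frames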
sad_tf/segmenter_for_tf2_16.py ADDED
@@ -0,0 +1,555 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import warnings
27
+ warnings.filterwarnings("ignore")
28
+ import os
29
+ # os.environ["CUDA_DEVICE_ORDER"]= '0'
30
+ import sys
31
+ import math
32
+ from iman import Audio
33
+ import numpy as np
34
+ from tensorflow import keras
35
+ # from tensorflow.compat.v1.keras.backend import set_session
36
+ import tensorflow.python.keras.backend as K
37
+ from .thread_returning import ThreadReturning
38
+
39
+ import shutil
40
+ import time
41
+ import random
42
+
43
+ from skimage.util import view_as_windows as vaw
44
+
45
+
46
+ from .viterbi import viterbi_decoding
47
+ from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
48
+
49
+ from .features import media2feats
50
+ from .export_funcs import seg2csv, seg2textgrid
51
+
52
+
53
+
54
+ def _energy_activity(loge, ratio=0.4): ##########0.9
55
+
56
+ threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
57
+ raw_activity = (loge > threshold)
58
+ return viterbi_decoding(pred2logemission(raw_activity),
59
+ log_trans_exp(50, cost0=-5))
60
+
61
+ #exp(150, cost0=-5)
62
+
63
+ def filter_sig(isig , wav , sr=16000):
64
+
65
+ if (sr!=16000):
66
+ wav = Audio.Resample(wav , 16000, sr)
67
+
68
+
69
+ try:
70
+ w=[]
71
+ wn=[]
72
+ wn.append(wav[0 : int(isig[0][1]*sr)])
73
+ for i , xxx in enumerate(isig):
74
+ a=xxx[1]
75
+ b=xxx[2]
76
+ w.append(wav[int(a*sr) : int(b*sr)])
77
+ try:
78
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
79
+ except:
80
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
81
+
82
+ return (np.concatenate(w),np.concatenate(wn))
83
+ except:
84
+ w=[]
85
+ wn=[]
86
+ wn.append(wav[0 : int(isig[0][1]*sr)])
87
+ for i , [_,a,b,_,_] in enumerate(isig):
88
+ w.append(wav[int(a*sr) : int(b*sr)])
89
+ try:
90
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
91
+ except:
92
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
93
+
94
+ return (np.concatenate(w),np.concatenate(wn))
95
+
96
+ def filter_output(isig , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
97
+
98
+ if (len(isig)==0):
99
+ return -1
100
+
101
+ # _dels=[]
102
+ # for i , [_,_,_,_d] in enumerate(isig):
103
+ # if (_d<=ignore_small_speech_segments) :
104
+ # _dels.append(i)
105
+ # _dels.reverse()
106
+ # for i in _dels:
107
+ # del isig[i]
108
+
109
+ # if (len(isig)==0):
110
+ # return -1
111
+
112
+ isig = [list(x) for x in isig]
113
+
114
+ for i in range(len(isig)-1):
115
+ t = isig[i+1][1] - isig[i][2] # silence between two chunks
116
+ isig[i].append(t)
117
+ isig[-1].append(-1)
118
+
119
+
120
+ if (len(isig)>0):
121
+
122
+ rang = np.arange(0.01,max_silence+0.1,0.1)
123
+ for di in rang:
124
+ for i , xxx in enumerate(isig):
125
+
126
+ _t = xxx[-1]
127
+ if (_t==-1):
128
+ break
129
+ if (_t <=di):
130
+ try:
131
+ if (isig[i+1][2] - isig[i][1] <= max_speech_len):
132
+ isig[i] = [isig[i][0] , isig[i][1] , isig[i+1][2] , isig[i+1][2] - isig[i][1] , isig[i+1][4] ]
133
+ del isig[i+1]
134
+ except:
135
+ pass
136
+ _dels=[]
137
+ for i , xxxx in enumerate(isig):
138
+ _d = xxxx[3]
139
+ if (_d<=ignore_small_speech_segments) :
140
+ _dels.append(i)
141
+ _dels.reverse()
142
+
143
+ for i in _dels:
144
+ del isig[i]
145
+
146
+ if (len(isig)==0):
147
+ return -1
148
+
149
+
150
+ isign=[]
151
+ for i , xxxxx in enumerate(isig):
152
+ _d = xxxxx[3]
153
+ if (_d> split_speech_bigger_than ) :
154
+
155
+ _gc = math.ceil(_d/split_speech_bigger_than)
156
+ m = _d/_gc
157
+ print('Bigger-->' + str(_d) + '-->' + str(m))
158
+ for jj in range(_gc):
159
+ fas=0
160
+ if (jj== _gc-1):
161
+ fas= isig[i][4]
162
+ isign.append( [isig[i][0] ,isig[i][1] + m*jj ,isig[i][1] + (m*(jj+1)), m, fas ] )
163
+ else:
164
+ isign.append(isig[i])
165
+ for i,(a,b,c,d,e) in enumerate(isign):
166
+ if (e==-1):
167
+ break
168
+ _addlen = min(e , 1) / 2 # at most half a second is added to the end of the segment
169
+ isign[i] = [a,b,c+_addlen,d+_addlen,e-_addlen]
170
+
171
+ return(isign)
172
+
173
+
174
+ def filter_output_1(vad , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
175
+
176
+ isig = []
177
+ i=0
178
+ while (i <len(vad)):
179
+
180
+ ml=0
181
+ inn = i
182
+ st = (vad[i][1])
183
+
184
+ while ( (i<len(vad)-1 )and ( ( (vad[i+1][1]) - (vad[i][2]) ) <= max_silence)):
185
+ ml = (vad[i][2]) - st
186
+ if (ml > max_speech_len):
187
+ if (i>inn and i>0):
188
+ i=i-1
189
+ break
190
+ i=i+1
191
+ en = (vad[i][2])
192
+ fa = en-st
193
+ if (fa > ignore_small_speech_segments):
194
+ if (fa>split_speech_bigger_than):
195
+ _gc = math.ceil(fa/split_speech_bigger_than)
196
+ m = fa/_gc
197
+ print('Bigger-->' + str(fa) + '-->' + str(m))
198
+ for jj in range(_gc):
199
+ isig.append(('speech' , st + (m*jj) , st+ (m*(jj+1)) , m))
200
+ else:
201
+ isig.append(('speech', st , en,fa))
202
+ i=i+1
203
+ isign=[]
204
+ for i,(a,b,c,d) in enumerate(isig):
205
+ if (i == len(isig)-1):
206
+ isign.append(isig[i])
207
+ break
208
+ _addlen = min(isig[i+1][1]-c , 1) / 2 # at most half a second is added to the end of the segment
209
+ isign.append([a,b,c+_addlen ,d+_addlen])
210
+
211
+ return(isign)
212
+
213
+
214
+ def _get_patches(mspec, w, step):
215
+ h = mspec.shape[1]
216
+ data = vaw(mspec, (w,h), step=step)
217
+ data.shape = (len(data), w*h)
218
+ data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
219
+ lfill = [data[0,:].reshape(1, h*w)] * (w // (2 * step))
220
+ rfill = [data[-1,:].reshape(1, h*w)] * (w // (2* step) - 1 + len(mspec) % 2)
221
+ data = np.vstack(lfill + [data] + rfill )
222
+ finite = np.all(np.isfinite(data), axis=1)
223
+ data.shape = (len(data), w, h)
224
+ return data, finite
225
+
226
+
227
+ def _binidx2seglist(binidx):
228
+ """
229
+ ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
230
+ Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
231
+
232
+ #TODO: is there a pandas alternative??
233
+ """
234
+ curlabel = None
235
+ bseg = -1
236
+ ret = []
237
+ for i, e in enumerate(binidx):
238
+ if e != curlabel:
239
+ if curlabel is not None:
240
+ ret.append((curlabel, bseg, i))
241
+ curlabel = e
242
+ bseg = i
243
+ ret.append((curlabel, bseg, i + 1))
244
+ return ret
245
+
246
+
247
+ class DnnSegmenter:
248
+ """
249
+ DnnSegmenter is an abstract class for performing DNN-based
250
+ segmentation with serialized Keras models, using 24 mel spectrogram
251
+ features obtained with the SIDEKIT framework.
252
+
253
+ Child classes MUST define the following class attributes:
254
+ * nmel: the number of mel bands to use (max: 24)
255
+ * viterbi_arg: the argument to be used with viterbi post-processing
256
+ * model_fname: the filename of the serialized keras model to be used
257
+ the model should be stored in the current directory
258
+ * inlabel: only segments with label name inlabel will be analyzed.
259
+ other labels will stay unchanged
260
+ * outlabels: the labels associated with the output of the neural network models
261
+ """
262
+ def __init__(self, batch_size, vad_type,model_path):
263
+ # load the DNN model
264
+ if (vad_type!='vad'):
265
+ self.nn = keras.models.load_model(model_path, compile=False)
266
+ print('model loaded from --> ' + model_path)
267
+ # self.nn.summary()
268
+ self.batch_size = batch_size
269
+
270
+ def __call__(self, mspec, lseg, difflen = 0):
271
+ """
272
+ *** input
273
+ * mspec: mel spectrogram
274
+ * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
275
+ * difflen: 0 if the original length of the mel spectrogram is >= 68
276
+ otherwise it is set to 68 - length(mspec)
277
+ *** output
278
+ a list of adjacent tuples (label, start, stop)
279
+ """
280
+ if self.nmel < 24:
281
+ mspec = mspec[:, :self.nmel].copy()
282
+
283
+ patches, finite = _get_patches(mspec, 68, 2)
284
+ if difflen > 0:
285
+ patches = patches[:-int(difflen / 2), :, :]
286
+ finite = finite[:-int(difflen / 2)]
287
+
288
+ assert len(finite) == len(patches), (len(patches), len(finite))
289
+
290
+ batch = []
291
+ for lab, start, stop in lseg:
292
+ if lab == self.inlabel:
293
+ batch.append(patches[start:stop, :])
294
+
295
+ if len(batch) > 0:
296
+ batch = np.concatenate(batch)
297
+ rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
298
+
299
+ ret = []
300
+ for lab, start, stop in lseg:
301
+ if lab != self.inlabel:
302
+ ret.append((lab, start, stop))
303
+ continue
304
+
305
+ l = stop - start
306
+ r = rawpred[:l]
307
+ rawpred = rawpred[l:]
308
+ r[finite[start:stop] == False, :] = 0.5
309
+ pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
310
+ for lab2, start2, stop2 in _binidx2seglist(pred):
311
+ ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
312
+ return ret
313
+
314
+
315
+ class SpeechMusic(DnnSegmenter):
316
+ # Voice activity detection: requires energetic activity detection
317
+ outlabels = ('speech', 'music')
318
+ inlabel = 'energy'
319
+ nmel = 21
320
+ viterbi_arg = 150
321
+
322
+
323
+ class SpeechMusicNoise(DnnSegmenter):
324
+ # Voice activity detection: requires energetic activity detection
325
+ outlabels = ('speech', 'music', 'noise')
326
+ inlabel = 'energy'
327
+ nmel = 21
328
+ viterbi_arg = 80
329
+
330
+
331
+ class Gender(DnnSegmenter):
332
+ # Gender Segmentation, requires voice activity detection
333
+ outlabels = ('female', 'male')
334
+ inlabel = 'speech'
335
+ nmel = 24
336
+ viterbi_arg = 80
337
+
338
+
339
+
340
+ class Segmenter:
341
+
342
+
343
+ def __init__(self, vad_type = 'sad' , vad_engine='smn', detect_gender=False, sr=16000, batch_size=32 , complete_output=False,model_path="c:\\keras_speech_music_noise_cnn.hdf5",gender_path="c:\\keras_male_female_cnn.hdf5" , ffmpeg_path='c:\\ffmpeg.exe',device='cuda' ,input_type="file"):
344
+ """
345
+ Load neural network models
346
+
347
+ Input:
348
+
349
+ 'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
350
+ 'sm' was used in the results presented in ICASSP 2017 paper
351
+ and in MIREX 2018 challenge submission
352
+ 'smn' has been implemented more recently and has not been evaluated in papers
353
+
354
+ 'detect_gender': if False, speech excerpts are returned labelled as 'speech'
355
+ if True, speech excerpts are split into 'male' and 'female' segments
356
+ """
357
+
358
+ if (device != 'cuda'):
359
+ os.environ["CUDA_DEVICE_ORDER"]= '-1'
360
+ else:
361
+ pass
362
+
363
+
364
+ import tensorflow as tf
365
+
366
+ config = tf.compat.v1.ConfigProto()
367
+ config.gpu_options.allow_growth = True
368
+ config.log_device_placement = True
369
+ sess = tf.compat.v1.Session(config=config)
370
+ sess = K.get_session()
371
+
372
+
373
+ self.complete_output = complete_output
374
+ self.sample_rate = sr
375
+ self.ffmpeg_path=ffmpeg_path
376
+ self.input_type = input_type
377
+ self.device = device
378
+
379
+
380
+
381
+ # self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks
382
+
383
+
384
+ # select speech/music or speech/music/noise voice activity detection engine
385
+ assert vad_engine in ['sm', 'smn']
386
+ if vad_engine == 'sm':
387
+ self.vad = SpeechMusic(batch_size)
388
+ elif vad_engine == 'smn':
389
+ self.vad = SpeechMusicNoise(batch_size , vad_type,model_path)
390
+
391
+ # load gender detection NN if required
392
+ assert detect_gender in [True, False]
393
+ self.detect_gender = detect_gender
394
+ if detect_gender:
395
+ self.gender = Gender(batch_size , vad_type ,gender_path)
396
+ self.vad_type = vad_type
397
+ self.model_path = model_path
398
+ self.gender_path = gender_path
399
+
400
+ def segment_feats(self, mspec, loge, difflen, start_sec):
401
+ """
402
+ perform segmentation
403
+ requires input corresponding to a wav file sampled at 16000 Hz
404
+ with a single channel
405
+ """
406
+
407
+
408
+
409
+
410
+ # perform energy-based activity detection
411
+ lseg = []
412
+ vadseg=[]
413
+ for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
414
+ if lab == 0:
415
+ lab = 'noEnergy'
416
+ else:
417
+ lab = 'energy'
418
+ vadseg.append(('speech', start, stop))
419
+ lseg.append((lab, start, stop))
420
+ if (self.vad_type == 'vad'):
421
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , stop-start) for lab, start, stop in vadseg]
422
+ # perform voice activity detection
423
+ lseg = self.vad(mspec, lseg, difflen)
424
+
425
+
426
+
427
+
428
+ # perform gender segmentation on speech segments
429
+ if self.detect_gender:
430
+ lseg = self.gender(mspec, lseg, difflen)
431
+ if (self.complete_output):
432
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02) for lab, start, stop in lseg ]
433
+ else:
434
+ return [[lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02] for lab, start, stop in lseg if (lab=='male' or lab=="female" or lab=="speech")]
435
+
436
+
437
+ def __call__(self, medianame,start_sec=None, stop_sec=None):
438
+ """
439
+ Return segmentation of a given file
440
+ * convert file to wav 16k mono with ffmpeg
441
+ * call NN segmentation procedures
442
+ * media_name: path to the media to be processed (including remote url)
443
+ may include any format supported by ffmpeg
444
+ * tmpdir: allows defining a custom path for storing temporary files
445
+ fast read/write disks are a good choice
446
+ * start_sec (seconds): sound stream before start_sec won't be processed
447
+ * stop_sec (seconds): sound stream after stop_sec won't be processed
448
+ """
449
+
450
+
451
+ mspec, loge, difflen , me = media2feats(medianame, self.input_type ,self.sample_rate,ffmpeg_path=self.ffmpeg_path)
452
+
453
+ if start_sec is None:
454
+ start_sec = 0
455
+ # do segmentation
456
+ return self.segment_feats(mspec, loge, difflen, start_sec),me
457
+
458
+
459
+ def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
460
+
461
+ if verbose:
462
+ print('batch_processing %d files' % len(linput))
463
+
464
+ if output_format == 'csv':
465
+ fexport = seg2csv
466
+ elif output_format == 'textgrid':
467
+ fexport = seg2textgrid
468
+ else:
469
+ raise NotImplementedError()
470
+
471
+ t_batch_start = time.time()
472
+
473
+ lmsg = []
474
+ fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
475
+ i = 0
476
+ for feats, msg in fg:
477
+ lmsg += msg
478
+ i += len(msg)
479
+ if verbose:
480
+ print('%d/%d' % (i, len(linput)), msg)
481
+ if feats is None:
482
+ break
483
+ mspec, loge, difflen = feats
484
+ #if verbose == True:
485
+ # print(i, linput[i], loutput[i])
486
+ b = time.time()
487
+ lseg = self.segment_feats(mspec, loge, difflen, 0)
488
+ fexport(lseg, loutput[len(lmsg) -1])
489
+ lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))
490
+
491
+ t_batch_dur = time.time() - t_batch_start
492
+ nb_processed = len([e for e in lmsg if e[1] == 0])
493
+ if nb_processed > 0:
494
+ avg = t_batch_dur / nb_processed
495
+ else:
496
+ avg = -1
497
+ return t_batch_dur, nb_processed, avg, lmsg
498
+
499
+
500
+ def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
501
+ """
502
+ To be used when processing batches
503
+ if resulting file exists, it is skipped
504
+ in case of remote files, access is tried nbtry times
505
+ """
506
+ ret = None
507
+ msg = []
508
+ while ret is None and len(lin) > 0:
509
+ src = lin.pop(0)
510
+ dst = lout.pop(0)
511
+ # print('popping', src)
512
+
513
+ # if the file exists: skip it
514
+ if skipifexist and os.path.exists(dst):
515
+ msg.append((dst, 1, 'already exists'))
516
+ continue
517
+
518
+ # create storing directory if required
519
+ dname = os.path.dirname(dst)
520
+ if not os.path.isdir(dname):
521
+ os.makedirs(dname)
522
+
523
+ itry = 0
524
+ while ret is None and itry < nbtry:
525
+ try:
526
+ ret = media2feats(src)
527
+ except:
528
+ itry += 1
529
+ errmsg = sys.exc_info()[0]
530
+ if itry != nbtry:
531
+ time.sleep(random.random() * trydelay)
532
+ if ret is None:
533
+ msg.append((dst, 2, 'error: ' + str(errmsg)))
534
+ else:
535
+ msg.append((dst, 0, 'ok'))
536
+
537
+ return ret, msg
538
+
539
+
540
+ def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
541
+ # print('init feat gen', len(ilist))
542
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
543
+ thread.start()
544
+ while True:
545
+ ret, msg = thread.join()
546
+ # print('join done', len(ilist))
547
+ # print('new list', ilist)
548
+ #ilist = ilist[len(msg):]
549
+ #olist = olist[len(msg):]
550
+ if len(ilist) == 0:
551
+ break
552
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
553
+ thread.start()
554
+ yield ret, msg
555
+ yield ret, msg
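A worked example of filter_output's merge/drop/split behaviour, using the package-level import from __init__.py (the numbers are illustrative): gaps of at most max_silence seconds are merged while the result stays under max_speech_len, segments shorter than ignore_small_speech_segments are dropped, and segments longer than split_speech_bigger_than are cut into roughly equal chunks:

from sad_tf import filter_output

segs = [['speech',  0.0,  4.0,  4.0],   # [label, start, stop, duration]
        ['speech',  4.3,  6.0,  1.7],   # 0.3 s gap -> merged with the previous segment
        ['speech',  9.0,  9.2,  0.2],   # below ignore_small_speech_segments -> dropped
        ['speech', 12.0, 40.0, 28.0]]   # above split_speech_bigger_than -> split in two

out = filter_output(segs, max_silence=1, ignore_small_speech_segments=0.5,
                    max_speech_len=15, split_speech_bigger_than=20)
for label, start, stop, dur, gap in out:
    print(label, start, stop, dur)      # roughly: 0-6.5, 12-26 and 26-40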
sad_tf/segmentero.py ADDED
@@ -0,0 +1,570 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+ import onnxruntime
26
+ import warnings
27
+ warnings.filterwarnings("ignore")
28
+ import os
29
+ # os.environ["CUDA_DEVICE_ORDER"]= '0'
30
+ import sys
31
+ import math
32
+ from iman import Audio
33
+ import numpy as np
34
+ from tensorflow import keras
35
+ from tensorflow.compat.v1.keras.backend import set_session
36
+ from tqdm import tqdm
37
+ from .thread_returning import ThreadReturning
38
+
39
+ import shutil
40
+ import time
41
+ import random
42
+
43
+ from skimage.util import view_as_windows as vaw
44
+
45
+
46
+ from .viterbi import viterbi_decoding
47
+ from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
48
+
49
+ from .features import media2feats
50
+ from .export_funcs import seg2csv, seg2textgrid
51
+
52
+
53
+
54
+ def _energy_activity(loge, ratio=0.4): ##########0.9
55
+
56
+ threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
57
+ raw_activity = (loge > threshold)
58
+ return viterbi_decoding(pred2logemission(raw_activity),
59
+ log_trans_exp(50, cost0=-5))
60
+
61
+ #exp(150, cost0=-5)
62
+
63
+ def filter_sig(isig , wav , sr=16000):
64
+
65
+ if (sr!=16000):
66
+ wav = Audio.Resample(wav , 16000, sr)
67
+
68
+
69
+ try:
70
+ w=[]
71
+ wn=[]
72
+ wn.append(wav[0 : int(isig[0][1]*sr)])
73
+ for i , [_,a,b,_] in enumerate(isig):
74
+
75
+ w.append(wav[int(a*sr) : int(b*sr)])
76
+ try:
77
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
78
+ except:
79
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
80
+
81
+ return (np.concatenate(w),np.concatenate(wn))
82
+ except:
83
+ w=[]
84
+ wn=[]
85
+ wn.append(wav[0 : int(isig[0][1]*sr)])
86
+ for i , [_,a,b,_,_] in enumerate(isig):
87
+ w.append(wav[int(a*sr) : int(b*sr)])
88
+ try:
89
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
90
+ except:
91
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
92
+
93
+ return (np.concatenate(w),np.concatenate(wn))
94
+
95
+ def filter_output(isig , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
96
+
97
+ if (len(isig)==0):
98
+ return -1
99
+
100
+ # _dels=[]
101
+ # for i , [_,_,_,_d] in enumerate(isig):
102
+ # if (_d<=ignore_small_speech_segments) :
103
+ # _dels.append(i)
104
+ # _dels.reverse()
105
+ # for i in _dels:
106
+ # del isig[i]
107
+
108
+ # if (len(isig)==0):
109
+ # return -1
110
+
111
+
112
+ for i in range(len(isig)-1):
113
+ t = isig[i+1][1] - isig[i][2] # silence between two chunks
114
+ isig[i].append(t)
115
+ isig[-1].append(-1)
116
+
117
+
118
+ if (len(isig)>0):
119
+
120
+ rang = np.arange(0.01,max_silence+0.1,0.1)
121
+ for di in rang:
122
+ for i , [_,_,_,_,_t] in enumerate(isig):
123
+ if (_t==-1):
124
+ break
125
+ if (_t <=di):
126
+ try:
127
+ if (isig[i+1][2] - isig[i][1] <= max_speech_len):
128
+ isig[i] = [isig[i][0] , isig[i][1] , isig[i+1][2] , isig[i+1][2] - isig[i][1] , isig[i+1][4] ]
129
+ del isig[i+1]
130
+ except:
131
+ pass
132
+ _dels=[]
133
+ for i , [_,_,_,_d,_] in enumerate(isig):
134
+ if (_d<=ignore_small_speech_segments) :
135
+ _dels.append(i)
136
+ _dels.reverse()
137
+
138
+ for i in _dels:
139
+ del isig[i]
140
+
141
+ if (len(isig)==0):
142
+ return -1
143
+
144
+
145
+ isign=[]
146
+ for i , [_,_,_,_d,_] in enumerate(isig):
147
+ if (_d> split_speech_bigger_than ) :
148
+
149
+ _gc = math.ceil(_d/split_speech_bigger_than)
150
+ m = _d/_gc
151
+ print('Bigger-->' + str(_d) + '-->' + str(m))
152
+ for jj in range(_gc):
153
+ fas=0
154
+ if (jj== _gc-1):
155
+ fas= isig[i][4]
156
+ isign.append( [isig[i][0] ,isig[i][1] + m*jj ,isig[i][1] + (m*(jj+1)), m, fas ] )
157
+ else:
158
+ isign.append(isig[i])
159
+ for i,(a,b,c,d,e) in enumerate(isign):
160
+ if (e==-1):
161
+ break
162
+ _addlen = min(e , 1) / 2 # at most half a second is added to the end of the segment
163
+ isign[i] = [a,b,c+_addlen,d+_addlen,e-_addlen]
164
+
165
+ return(isign)
166
+
167
+
168
+ def filter_output_1(vad , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
169
+
170
+ isig = []
171
+ i=0
172
+ while (i <len(vad)):
173
+
174
+ ml=0
175
+ inn = i
176
+ st = (vad[i][1])
177
+
178
+ while ( (i<len(vad)-1 )and ( ( (vad[i+1][1]) - (vad[i][2]) ) <= max_silence)):
179
+ ml = (vad[i][2]) - st
180
+ if (ml > max_speech_len):
181
+ if (i>inn and i>0):
182
+ i=i-1
183
+ break
184
+ i=i+1
185
+ en = (vad[i][2])
186
+ fa = en-st
187
+ if (fa > ignore_small_speech_segments):
188
+ if (fa>split_speech_bigger_than):
189
+ _gc = math.ceil(fa/split_speech_bigger_than)
190
+ m = fa/_gc
191
+ print('Bigger-->' + str(fa) + '-->' + str(m))
192
+ for jj in range(_gc):
193
+ isig.append(('speech' , st + (m*jj) , st+ (m*(jj+1)) , m))
194
+ else:
195
+ isig.append(('speech', st , en,fa))
196
+ i=i+1
197
+ isign=[]
198
+ for i,(a,b,c,d) in enumerate(isig):
199
+ if (i == len(isig)-1):
200
+ isign.append(isig[i])
201
+ break
202
+ _addlen = min(isig[i+1][1]-c , 1) / 2 # at most half a second is added to the end of the segment
203
+ isign.append([a,b,c+_addlen ,d+_addlen])
204
+
205
+ return(isign)
206
+
207
+
208
+ def get_path_3d(data,batch_size):
209
+ total_batches = data.shape[0] // batch_size
210
+ last_batch_size = data.shape[0] % batch_size
211
+ if last_batch_size != 0:
212
+ batches = np.split(data[:total_batches * batch_size], total_batches)
213
+ last_batch = np.expand_dims(data[total_batches * batch_size:], axis=0).squeeze()
214
+ batches.append(last_batch)
215
+ else:
216
+ batches = np.split(data, total_batches)
217
+ return batches
218
+
219
+
220
+ def _get_patches(mspec, w, step):
221
+ h = mspec.shape[1]
222
+ data = vaw(mspec, (w,h), step=step)
223
+ data.shape = (len(data), w*h)
224
+ data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
225
+ lfill = [data[0,:].reshape(1, h*w)] * (w // (2 * step))
226
+ rfill = [data[-1,:].reshape(1, h*w)] * (w // (2* step) - 1 + len(mspec) % 2)
227
+ data = np.vstack(lfill + [data] + rfill )
228
+ finite = np.all(np.isfinite(data), axis=1)
229
+ data.shape = (len(data), w, h)
230
+ return data, finite
231
+
232
+
233
+ def _binidx2seglist(binidx):
234
+ """
235
+ ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
236
+ Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
237
+
238
+ #TODO: is there a pandas alternative??
239
+ """
240
+ curlabel = None
241
+ bseg = -1
242
+ ret = []
243
+ for i, e in enumerate(binidx):
244
+ if e != curlabel:
245
+ if curlabel is not None:
246
+ ret.append((curlabel, bseg, i))
247
+ curlabel = e
248
+ bseg = i
249
+ ret.append((curlabel, bseg, i + 1))
250
+ return ret
251
+
252
+
253
+ class DnnSegmenter:
254
+ """
255
+ DnnSegmenter is an abstract class for performing DNN-based
256
+ segmentation with serialized Keras models, using 24 mel spectrogram
257
+ features obtained with the SIDEKIT framework.
258
+
259
+ Child classes MUST define the following class attributes:
260
+ * nmel: the number of mel bands to use (max: 24)
261
+ * viterbi_arg: the argument to be used with viterbi post-processing
262
+ * model_fname: the filename of the serialized keras model to be used
263
+ the model should be stored in the current directory
264
+ * inlabel: only segments with label name inlabel will be analyzed.
265
+ other labels will stay unchanged
266
+ * outlabels: the labels associated with the output of the neural network models
267
+ """
268
+ def __init__(self, batch_size, vad_type,model_path,EP_list):
269
+ # load the DNN model
270
+ if (vad_type!='vad'):
271
+ self.session = onnxruntime.InferenceSession(model_path,providers=EP_list)
272
+ #self.nn = keras.models.load_model(model_path, compile=False)
273
+ print('model loaded from --> ' + model_path)
274
+ # self.nn.summary()
275
+ self.batch_size = batch_size
276
+
277
+ def __call__(self, mspec, lseg, difflen = 0):
278
+ """
279
+ *** input
280
+ * mspec: mel spectrogram
281
+ * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
282
+ * difflen: 0 if the original length of the mel spectrogram is >= 68
283
+ otherwise it is set to 68 - length(mspec)
284
+ *** output
285
+ a list of adjacent tuples (label, start, stop)
286
+ """
287
+ if self.nmel < 24:
288
+ mspec = mspec[:, :self.nmel].copy()
289
+
290
+ patches, finite = _get_patches(mspec, 68, 2)
291
+ if difflen > 0:
292
+ patches = patches[:-int(difflen / 2), :, :]
293
+ finite = finite[:-int(difflen / 2)]
294
+
295
+ assert len(finite) == len(patches), (len(patches), len(finite))
296
+
297
+ batch = []
298
+ for lab, start, stop in lseg:
299
+ if lab == self.inlabel:
300
+ batch.append(patches[start:stop, :])
301
+
302
+ if len(batch) > 0:
303
+
304
+ batch = np.concatenate(batch)
305
+ batches = get_path_3d(batch , self.batch_size,)
306
+
307
+
308
+ #rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
309
+ input_name = self.session.get_inputs()[0].name
310
+ rawpred=[]
311
+ for batch in tqdm(batches):
312
+ rawpred.append(self.session.run(None, {input_name: batch})[0])
313
+
314
+ rawpred = np.concatenate(rawpred)
315
+
316
+
317
+ ret = []
318
+ for lab, start, stop in lseg:
319
+ if lab != self.inlabel:
320
+ ret.append((lab, start, stop))
321
+ continue
322
+
323
+ l = stop - start
324
+ r = rawpred[:l]
325
+ rawpred = rawpred[l:]
326
+ r[finite[start:stop] == False, :] = 0.5
327
+ pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
328
+ for lab2, start2, stop2 in _binidx2seglist(pred):
329
+ ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
330
+ return ret
331
+
332
+
333
+ class SpeechMusic(DnnSegmenter):
334
+ # Voice activity detection: requires energetic activity detection
335
+ outlabels = ('speech', 'music')
336
+ inlabel = 'energy'
337
+ nmel = 21
338
+ viterbi_arg = 150
339
+
340
+
341
+ class SpeechMusicNoise(DnnSegmenter):
342
+ # Voice activity detection: requires energetic activity detection
343
+ outlabels = ('speech', 'music', 'noise')
344
+ inlabel = 'energy'
345
+ nmel = 21
346
+ viterbi_arg = 80
347
+
348
+
349
+ class Gender(DnnSegmenter):
350
+ # Gender Segmentation, requires voice activity detection
351
+ outlabels = ('female', 'male')
352
+ inlabel = 'speech'
353
+ nmel = 24
354
+ viterbi_arg = 80
355
+
356
+
357
+
358
+ class Segmenter:
359
+
360
+
361
+ def __init__(self, vad_type = 'sad' , vad_engine='smn', detect_gender=False, sr=16000, batch_size=32 , complete_output=False,model_path="c:\\keras_speech_music_noise_cnn.onnx",gender_path="c:\\keras_male_female_cnn.onnx" , ffmpeg_path='c:\\ffmpeg.exe',device='cuda'):
362
+ """
363
+ Load neural network models
364
+
365
+ Input:
366
+
367
+ 'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
368
+ 'sm' was used in the results presented in ICASSP 2017 paper
369
+ and in MIREX 2018 challenge submission
370
+ 'smn' has been implemented more recently and has not been evaluated in papers
371
+
372
+ 'detect_gender': if False, speech excerpts are returned labelled as 'speech'
373
+ if True, speech excerpts are split into 'male' and 'female' segments
374
+ """
375
+ self.complete_output = complete_output
376
+ self.sample_rate = sr
377
+ self.ffmpeg_path=ffmpeg_path
378
+
379
+
380
+ if (device != 'cuda'):
381
+ os.environ["CUDA_DEVICE_ORDER"]= '-1'
382
+ EP_list=[ 'CPUExecutionProvider']
383
+ else:
384
+ EP_list=['CUDAExecutionProvider']
385
+
386
+ import tensorflow as tf
387
+
388
+ config = tf.compat.v1.ConfigProto()
389
+ config.gpu_options.allow_growth = True
390
+ config.log_device_placement = True
391
+ sess = tf.compat.v1.Session(config=config)
392
+ set_session(sess)
393
+
394
+
395
+
396
+ # self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks
397
+
398
+
399
+ # select speech/music or speech/music/noise voice activity detection engine
400
+ assert vad_engine in ['sm', 'smn']
401
+ if vad_engine == 'sm':
402
+ self.vad = SpeechMusic(batch_size)
403
+ elif vad_engine == 'smn':
404
+ self.vad = SpeechMusicNoise(batch_size , vad_type,model_path,EP_list)
405
+
406
+ # load gender detection NN if required
407
+ assert detect_gender in [True, False]
408
+ self.detect_gender = detect_gender
409
+ if detect_gender:
410
+ self.gender = Gender(batch_size , vad_type ,gender_path,EP_list)
411
+ self.vad_type = vad_type
412
+ self.model_path = model_path
413
+ self.gender_path = gender_path
414
+
415
+ def segment_feats(self, mspec, loge, difflen, start_sec):
416
+ """
417
+ perform segmentation
418
+ requires input corresponding to a wav file sampled at 16000 Hz
419
+ with a single channel
420
+ """
421
+
422
+
423
+
424
+
425
+ # perform energy-based activity detection
426
+ lseg = []
427
+ vadseg=[]
428
+ for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
429
+ if lab == 0:
430
+ lab = 'noEnergy'
431
+ else:
432
+ lab = 'energy'
433
+ vadseg.append(('speech', start, stop))
434
+ lseg.append((lab, start, stop))
435
+ if (self.vad_type == 'vad'):
436
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , stop-start) for lab, start, stop in vadseg]
437
+ # perform voice activity detection
438
+ lseg = self.vad(mspec, lseg, difflen)
439
+
440
+
441
+
442
+
443
+ # perform gender segmentation on speech segments
444
+ if self.detect_gender:
445
+ lseg = self.gender(mspec, lseg, difflen)
446
+ if (self.complete_output):
447
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02) for lab, start, stop in lseg ]
448
+ else:
449
+ return [[lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02] for lab, start, stop in lseg if (lab=='male' or lab=="female" or lab=="speech")]
450
+
451
+
452
+ def __call__(self, medianame, input_type='file',start_sec=None, stop_sec=None):
453
+ """
454
+ Return segmentation of a given file
455
+ * convert file to wav 16k mono with ffmpeg
456
+ * call NN segmentation procedures
457
+ * media_name: path to the media to be processed (including remote url)
458
+ may include any format supported by ffmpeg
459
+ * tmpdir: allows defining a custom path for storing temporary files
460
+ fast read/write disks are a good choice
461
+ * start_sec (seconds): sound stream before start_sec won't be processed
462
+ * stop_sec (seconds): sound stream after stop_sec won't be processed
463
+ """
464
+
465
+
466
+ mspec, loge, difflen , me = media2feats(medianame, input_type ,self.sample_rate,ffmpeg_path=self.ffmpeg_path)
467
+
468
+ if start_sec is None:
469
+ start_sec = 0
470
+ # do segmentation
471
+ return self.segment_feats(mspec, loge, difflen, start_sec),me
472
+
473
+
474
+ def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
475
+
476
+ if verbose:
477
+ print('batch_processing %d files' % len(linput))
478
+
479
+ if output_format == 'csv':
480
+ fexport = seg2csv
481
+ elif output_format == 'textgrid':
482
+ fexport = seg2textgrid
483
+ else:
484
+ raise NotImplementedError()
485
+
486
+ t_batch_start = time.time()
487
+
488
+ lmsg = []
489
+ fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
490
+ i = 0
491
+ for feats, msg in fg:
492
+ lmsg += msg
493
+ i += len(msg)
494
+ if verbose:
495
+ print('%d/%d' % (i, len(linput)), msg)
496
+ if feats is None:
497
+ break
498
+ mspec, loge, difflen = feats
499
+ #if verbose == True:
500
+ # print(i, linput[i], loutput[i])
501
+ b = time.time()
502
+ lseg = self.segment_feats(mspec, loge, difflen, 0)
503
+ fexport(lseg, loutput[len(lmsg) -1])
504
+ lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))
505
+
506
+ t_batch_dur = time.time() - t_batch_start
507
+ nb_processed = len([e for e in lmsg if e[1] == 0])
508
+ if nb_processed > 0:
509
+ avg = t_batch_dur / nb_processed
510
+ else:
511
+ avg = -1
512
+ return t_batch_dur, nb_processed, avg, lmsg
513
+
514
+
515
+ def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
516
+ """
517
+ To be used when processing batches
518
+ if resulting file exists, it is skipped
519
+ in case of remote files, access is tried nbtry times
520
+ """
521
+ ret = None
522
+ msg = []
523
+ while ret is None and len(lin) > 0:
524
+ src = lin.pop(0)
525
+ dst = lout.pop(0)
526
+ # print('popping', src)
527
+
528
+ # if the file exists: skip it
529
+ if skipifexist and os.path.exists(dst):
530
+ msg.append((dst, 1, 'already exists'))
531
+ continue
532
+
533
+ # create storing directory if required
534
+ dname = os.path.dirname(dst)
535
+ if not os.path.isdir(dname):
536
+ os.makedirs(dname)
537
+
538
+ itry = 0
539
+ while ret is None and itry < nbtry:
540
+ try:
541
+ ret = media2feats(src)
542
+ except:
543
+ itry += 1
544
+ errmsg = sys.exc_info()[0]
545
+ if itry != nbtry:
546
+ time.sleep(random.random() * trydelay)
547
+ if ret is None:
548
+ msg.append((dst, 2, 'error: ' + str(errmsg)))
549
+ else:
550
+ msg.append((dst, 0, 'ok'))
551
+
552
+ return ret, msg
553
+
554
+
555
+ def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
556
+ # print('init feat gen', len(ilist))
557
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
558
+ thread.start()
559
+ while True:
560
+ ret, msg = thread.join()
561
+ # print('join done', len(ilist))
562
+ # print('new list', ilist)
563
+ #ilist = ilist[len(msg):]
564
+ #olist = olist[len(msg):]
565
+ if len(ilist) == 0:
566
+ break
567
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
568
+ thread.start()
569
+ yield ret, msg
570
+ yield ret, msg
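The ONNX variant above swaps the Keras predict() call for an onnxruntime session and slices patches into fixed-size batches with get_path_3d. A small sketch of that helper, assuming the module's dependencies are installed (shapes are illustrative):

import numpy as np
from sad_tf.segmentero import get_path_3d

patches = np.random.rand(70, 68, 21).astype(np.float32)   # (n_patches, frames, mel bands)
batches = get_path_3d(patches, 32)
print([b.shape for b in batches])    # [(32, 68, 21), (32, 68, 21), (6, 68, 21)]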
sad_tf/sidekit_mfcc.py ADDED
@@ -0,0 +1,379 @@
1
+
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # This file is part of SIDEKIT.
5
+ #
6
+ # The following code has been copy-pasted from SIDEKIT source files:
7
+ # frontend/features.py frontend/io.py frontend/vad.py
8
+ #
9
+ # SIDEKIT is a python package for speaker verification.
10
+ # Home page: http://www-lium.univ-lemans.fr/sidekit/
11
+ #
12
+ # SIDEKIT is a python package for speaker verification.
13
+ # Home page: http://www-lium.univ-lemans.fr/sidekit/
14
+ #
15
+ # SIDEKIT is free software: you can redistribute it and/or modify
16
+ # it under the terms of the GNU Lesser General Public License as
17
+ # published by the Free Software Foundation, either version 3 of the License,
18
+ # or (at your option) any later version.
19
+ #
20
+ # SIDEKIT is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ # GNU Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public License
26
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
27
+
28
+ """
29
+ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
30
+
31
+ :mod:`frontend` provides methods to process an audio signal in order to extract
32
+ useful parameters for speaker verification.
33
+ """
34
+
35
+
36
+ import numpy
37
+ import soundfile
38
+ import scipy
39
+ from scipy.fftpack.realtransforms import dct
40
+
41
+
42
+ __author__ = "Anthony Larcher and Sylvain Meignier"
43
+ __copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
44
+ __license__ = "LGPL"
45
+ __maintainer__ = "Anthony Larcher"
46
+ __email__ = "[email protected]"
47
+ __status__ = "Production"
48
+ __docformat__ = 'reStructuredText'
49
+
50
+
51
+
52
+
53
+ wav_flag = "float32" # Could be "int16"
54
+ PARAM_TYPE = numpy.float32
55
+
56
+
57
+ def read_wav(input_file_name):
58
+ """
59
+ :param input_file_name:
60
+ :return:
61
+ """
62
+ #with wave.open(input_file_name, "r") as wfh:
63
+ # (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams()
64
+ # raw = wfh.readframes(nframes * nchannels)
65
+ # out = struct.unpack_from("%dh" % nframes * nchannels, raw)
66
+ # sig = numpy.reshape(numpy.array(out), (-1, nchannels)).squeeze()
67
+ # return sig.astype(numpy.float32), framerate, sampwidth
68
+ nfo = soundfile.info(input_file_name)
69
+ sig, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
70
+ sig = numpy.reshape(numpy.array(sig), (-1, nfo.channels)).squeeze()
71
+ sig = sig.astype(numpy.float32)
72
+ return sig, sample_rate, 4
73
+
74
+
75
+
76
+
77
+ def hz2mel(f, htk=True):
78
+ """Convert an array of frequency in Hz into mel.
79
+
80
+ :param f: frequency to convert
81
+
82
+ :return: the equivalence on the mel scale.
83
+ """
84
+ if htk:
85
+ return 2595 * numpy.log10(1 + f / 700.)
86
+ else:
87
+ f = numpy.array(f)
88
+
89
+ # Mel fn to match Slaney's Auditory Toolbox mfcc.m
90
+ # Mel fn to match Slaney's Auditory Toolbox mfcc.m
91
+ f_0 = 0.
92
+ f_sp = 200. / 3.
93
+ brkfrq = 1000.
94
+ brkpt = (brkfrq - f_0) / f_sp
95
+ logstep = numpy.exp(numpy.log(6.4) / 27)
96
+
97
+ linpts = f < brkfrq
98
+
99
+ z = numpy.zeros_like(f)
100
+ # fill in parts separately
101
+ z[linpts] = (f[linpts] - f_0) / f_sp
102
+ z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)
103
+
104
+ if z.shape == (1,):
105
+ return z[0]
106
+ else:
107
+ return z
108
+
109
+ def mel2hz(z, htk=True):
110
+ """Convert an array of mel values in Hz.
111
+
112
+ :param m: ndarray of frequencies to convert in Hz.
113
+
114
+ :return: the equivalent values in Hertz.
115
+ """
116
+ if htk:
117
+ return 700. * (10**(z / 2595.) - 1)
118
+ else:
119
+ z = numpy.array(z, dtype=float)
120
+ f_0 = 0
121
+ f_sp = 200. / 3.
122
+ brkfrq = 1000.
123
+ brkpt = (brkfrq - f_0) / f_sp
124
+ logstep = numpy.exp(numpy.log(6.4) / 27)
125
+
126
+ linpts = (z < brkpt)
127
+
128
+ f = numpy.zeros_like(z)
129
+
130
+ # fill in parts separately
131
+ f[linpts] = f_0 + f_sp * z[linpts]
132
+ f[~linpts] = brkfrq * numpy.exp(numpy.log(logstep) * (z[~linpts] - brkpt))
133
+
134
+ if f.shape == (1,):
135
+ return f[0]
136
+ else:
137
+ return f
138
+
139
+
140
+
141
+ def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
142
+ """Compute triangular filterbank for cepstral coefficient computation.
143
+
144
+ :param fs: sampling frequency of the original signal.
145
+ :param nfft: number of points for the Fourier Transform
146
+ :param lowfreq: lower limit of the frequency band filtered
147
+ :param maxfreq: higher limit of the frequency band filtered
148
+ :param nlinfilt: number of linear filters to use in low frequencies
149
+ :param nlogfilt: number of log-linear filters to use in high frequencies
150
+ :param midfreq: frequency boundary between linear and log-linear filters
151
+
152
+ :return: the filter bank and the central frequencies of each filter
153
+ """
154
+ # Total number of filters
155
+ nfilt = nlinfilt + nlogfilt
156
+
157
+ # ------------------------
158
+ # Compute the filter bank
159
+ # ------------------------
160
+ # Compute start/middle/end points of the triangular filters in spectral
161
+ # domain
162
+ frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
163
+ if nlogfilt == 0:
164
+ linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
165
+ frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
166
+ elif nlinfilt == 0:
167
+ low_mel = hz2mel(lowfreq)
168
+ max_mel = hz2mel(maxfreq)
169
+ mels = numpy.zeros(nlogfilt + 2)
170
+ # mels[nlinfilt:]
171
+ melsc = (max_mel - low_mel) / (nfilt + 1)
172
+ mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
173
+ # Back to the frequency domain
174
+ frequences = mel2hz(mels)
175
+ else:
176
+ # Compute linear filters on [0;1000Hz]
177
+ linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
178
+ frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
179
+ # Compute log-linear filters on [1000;maxfreq]
180
+ low_mel = hz2mel(min([1000, maxfreq]))
181
+ max_mel = hz2mel(maxfreq)
182
+ mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
183
+ melsc = (max_mel - low_mel) / (nlogfilt + 1)
184
+
185
+ # Verify that mel2hz(melsc)>linsc
186
+ while mel2hz(melsc) < linsc:
187
+ # in this case, we add a linear filter
188
+ nlinfilt += 1
189
+ nlogfilt -= 1
190
+ frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
191
+ low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
192
+ max_mel = hz2mel(maxfreq)
193
+ mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
194
+ melsc = (max_mel - low_mel) / (nlogfilt + 1)
195
+
196
+ mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
197
+ # Back to the frequency domain
198
+ frequences[nlinfilt:] = mel2hz(mels)
199
+
200
+ heights = 2. / (frequences[2:] - frequences[0:-2])
201
+
202
+ # Compute filterbank coeff (in fft domain, in bins)
203
+ fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
204
+ # FFT bins (in Hz)
205
+ n_frequences = numpy.arange(nfft) / (1. * nfft) * fs
206
+
207
+ for i in range(nfilt):
208
+ low = frequences[i]
209
+ cen = frequences[i + 1]
210
+ hi = frequences[i + 2]
211
+ # numpy.int is gone from recent NumPy releases; use the explicit int32 dtype directly
+ lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=numpy.int32)
+ left_slope = heights[i] / (cen - low)
+ rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1, min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=numpy.int32)
220
+ right_slope = heights[i] / (hi - cen)
221
+ fbank[i][lid] = left_slope * (n_frequences[lid] - low)
222
+ fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])
223
+
224
+ return fbank, frequences
225
+
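As an illustration only, building a 24-filter mel bank for 16 kHz audio and a 512-point FFT (parameter values chosen to mirror the mfcc defaults below) could look like this:

import numpy

fbank, freqs = trfbank(fs=16000, nfft=512, lowfreq=100, maxfreq=8000,
                       nlinfilt=0, nlogfilt=24)
print(fbank.shape)   # (24, 257): one triangular filter per row, one FFT bin per column
print(freqs[:3])     # lowest filter edge/centre frequencies of the bank, in Hz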
226
+
227
+ def power_spectrum(input_sig,
228
+ fs=8000,
229
+ win_time=0.025,
230
+ shift=0.01,
231
+ prefac=0.97):
232
+ """
233
+ Compute the power spectrum of the signal.
234
+ :param input_sig: input signal as a 1-d ndarray
+ :param fs: sampling frequency in Hz
+ :param win_time: analysis window length in seconds
+ :param shift: shift between two analysis windows in seconds
+ :param prefac: pre-emphasis coefficient
+ :return: the power spectrum of each frame and the per-frame log-energy
240
+ """
241
+ window_length = int(round(win_time * fs))
242
+ overlap = window_length - int(shift * fs)
243
+ framed = framing(input_sig, window_length, win_shift=window_length-overlap).copy()
244
+ # Pre-emphasis filtering is applied after framing to be consistent with stream processing
245
+ framed = pre_emphasis(framed, prefac)
246
+ l = framed.shape[0]
247
+ n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
248
+ # Windowing has been changed to Hanning, whose sidelobes are less noisy than Hamming's
249
+ # ham = numpy.hamming(window_length)
250
+ window = numpy.hanning(window_length)
251
+
252
+ spec = numpy.ones((l, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
253
+ log_energy = numpy.log((framed**2).sum(axis=1))
254
+ dec = 500000
255
+ start = 0
256
+ stop = min(dec, l)
257
+ while start < l:
258
+ ahan = framed[start:stop, :] * window
259
+ mag = numpy.fft.rfft(ahan, n_fft, axis=-1)
260
+ spec[start:stop, :] = mag.real**2 + mag.imag**2
261
+ start = stop
262
+ stop = min(stop + dec, l)
263
+
264
+ return spec, log_energy
265
+
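A hedged sketch of calling power_spectrum on a made-up one-second 8 kHz signal:

import numpy

sig = numpy.random.randn(8000).astype(numpy.float32)   # stand-in signal, not real audio
spec, log_energy = power_spectrum(sig, fs=8000, win_time=0.025, shift=0.01)
# 25 ms windows give 200 samples per frame, padded to a 256-point FFT -> 129 bins
print(spec.shape, log_energy.shape)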
266
+
267
+ def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
268
+ """
269
+ :param sig: input signal, can be mono or multi dimensional
270
+ :param win_size: size of the window in samples
+ :param win_shift: shift of the sliding window in samples
272
+ :param context: tuple of left and right context
273
+ :param pad: can be zeros or edge
274
+ """
275
+ dsize = sig.dtype.itemsize
276
+ if sig.ndim == 1:
277
+ sig = sig[:, numpy.newaxis]
278
+ # Manage padding
279
+ c = (context, ) + (sig.ndim - 1) * ((0, 0), )
280
+ _win_size = win_size + sum(context)
281
+ shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
282
+ strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1]))
283
+ if pad == 'zeros':
284
+ return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'constant', constant_values=(0,)),
285
+ shape=shape,
286
+ strides=strides).squeeze()
287
+ elif pad == 'edge':
288
+ return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'edge'),
289
+ shape=shape,
290
+ strides=strides).squeeze()
291
+
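The striding logic is easiest to see on a toy signal; this sketch frames ten samples into windows of four with a shift of two:

import numpy

sig = numpy.arange(10, dtype=float)
frames = framing(sig, win_size=4, win_shift=2)
print(frames.shape)   # (4, 4)
print(frames[0])      # [0. 1. 2. 3.]
print(frames[1])      # [2. 3. 4. 5.] -- consecutive windows overlap by two samples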
292
+
293
+ def pre_emphasis(input_sig, pre):
294
+ """Pre-emphasis of an audio signal.
295
+ :param input_sig: the input signal vector to pre-emphasize
296
+ :param pre: value that defines the pre-emphasis filter.
297
+ """
298
+ if input_sig.ndim == 1:
299
+ return (input_sig - numpy.c_[input_sig[numpy.newaxis, :][..., :1],
300
+ input_sig[numpy.newaxis, :][..., :-1]].squeeze() * pre)
301
+ else:
302
+ return input_sig - numpy.c_[input_sig[..., :1], input_sig[..., :-1]] * pre
303
+
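For example (note that the first sample is differenced against itself, so it is attenuated rather than left untouched):

import numpy

x = numpy.array([1.0, 2.0, 3.0, 4.0])
print(pre_emphasis(x, 0.97))   # [0.03 1.03 1.06 1.09]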
304
+
305
+ def mfcc(input_sig,
306
+ lowfreq=100, maxfreq=8000,
307
+ nlinfilt=0, nlogfilt=24,
308
+ nwin=0.025,
309
+ fs=16000,
310
+ nceps=13,
311
+ shift=0.01,
312
+ get_spec=False,
313
+ get_mspec=False,
314
+ prefac=0.97):
315
+ """Compute Mel Frequency Cepstral Coefficients.
316
+
317
+ :param input_sig: input signal from which the coefficients are computed.
318
+ Input audio is expected to be raw 16-bit PCM
319
+ :param lowfreq: lower limit of the frequency band filtered.
320
+ Default is 100Hz.
321
+ :param maxfreq: higher limit of the frequency band filtered.
322
+ Default is 8000Hz.
323
+ :param nlinfilt: number of linear filters to use in low frequencies.
324
+ Default is 0.
325
+ :param nlogfilt: number of log-linear filters to use in high frequencies.
326
+ Default is 24.
327
+ :param nwin: length of the sliding window in seconds
328
+ Default is 0.025.
329
+ :param fs: sampling frequency of the original signal. Default is 16000Hz.
330
+ :param nceps: number of cepstral coefficients to extract.
331
+ Default is 13.
332
+ :param shift: shift between two analyses. Default is 0.01 (10ms).
333
+ :param get_spec: boolean, if true returns the spectrogram
334
+ :param get_mspec: boolean, if true returns the output of the filter banks
335
+ :param prefac: pre-emphasis filter value
336
+
337
+ :return: the cepstral coefficients in an ndarray as well as
338
+ the Log-spectrum in the mel-domain in a ndarray.
339
+
340
+ .. note:: MFCC are computed as follows:
341
+
342
+ - Pre-processing in time-domain (pre-emphasizing)
343
+ - Compute the spectrum amplitude by windowing with a Hanning window
+ - Filter the signal in the spectral domain with a triangular filter-bank, whose filters are approximately
+ linearly spaced on the mel scale, and have equal bandwidth in the mel scale
+ - Compute the DCT of the log-spectrum
347
+ - Log-energy is returned as first coefficient of the feature vector.
348
+
349
+ For more details, refer to [Davis80]_.
350
+ """
351
+ # Compute power spectrum
352
+ spec, log_energy = power_spectrum(input_sig,
353
+ fs,
354
+ win_time=nwin,
355
+ shift=shift,
356
+ prefac=prefac)
357
+ # Filter the spectrum through the triangle filter-bank
358
+ n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
359
+ fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
360
+
361
+ mspec = numpy.log(numpy.dot(spec, fbank.T)) # TODO: compare natural log and log10
362
+ # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
363
+ # The C0 term is removed as it is the constant term
364
+ # ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]
365
+ # Returned list layout: [cepstra (not computed here, None), log_energy, spec or None, mspec or None]
+ lst = list()
+ lst.append(None)
+ lst.append(log_energy)
368
+ if get_spec:
369
+ lst.append(spec)
370
+ else:
371
+ lst.append(None)
372
+ del spec
373
+ if get_mspec:
374
+ lst.append(mspec)
375
+ else:
376
+ lst.append(None)
377
+ del mspec
378
+
379
+ return lst
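To tie the above together, a hedged end-to-end sketch; the two seconds of noise stand in for real 16 kHz audio:

import numpy

sig = numpy.random.randn(2 * 16000).astype(numpy.float32)
# cepstra are not computed by this implementation, so the first slot is None
_, log_energy, _, mspec = mfcc(sig, fs=16000, get_mspec=True)
print(mspec.shape)        # (n_frames, 24) log mel-band energies
print(log_energy.shape)   # (n_frames,)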
sad_tf/thread_returning.py ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Tue Mar 27 15:18:49 2018
5
+
6
+ @author: elechapt
7
+ """
8
+
9
+ from threading import Thread
10
+
11
+ class ThreadReturning(Thread):
12
+ """
13
+ Allow us to get the results from a thread
14
+ """
15
+ def __init__(self, *args, **kwargs):
16
+ Thread.__init__(self, *args, **kwargs)
17
+ self._return = None
18
+
19
+ def run(self):
20
+ if self._target is not None:
21
+ self._return = self._target(*self._args, **self._kwargs)
22
+
23
+ def join(self, timeout=None):
+ # forward the optional timeout to Thread.join instead of silently dropping it
+ Thread.join(self, timeout)
+ return self._return
26
+
27
+
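A short usage sketch; slow_square is a hypothetical worker standing in for real work such as feature extraction:

import time

def slow_square(x):
    time.sleep(0.1)   # pretend work
    return x * x

t = ThreadReturning(target=slow_square, args=(7,))
t.start()
print(t.join())   # 49 -- join() hands back the target's return value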
sad_tf/viterbi.py ADDED
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License (MIT)
5
+
6
+ # Copyright (c) 2014-2016 CNRS
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ # SOFTWARE.
25
+
26
+ # AUTHORS
27
+ # Hervé BREDIN - http://herve.niderb.fr
28
+
29
+ from __future__ import unicode_literals
30
+
31
+ import six.moves
32
+ import numpy as np
33
+ import itertools
34
+
35
+ VITERBI_CONSTRAINT_NONE = 0
36
+ VITERBI_CONSTRAINT_FORBIDDEN = 1
37
+ VITERBI_CONSTRAINT_MANDATORY = 2
38
+
39
+
40
+ LOG_ZERO = np.log(1e-200)
41
+
42
+ # handling 'consecutive' constraints is achieved by duplicating states
43
+ # the following functions are here to help in this process
44
+
45
+
46
+ # create new transition prob. matrix accounting for duplicated states.
47
+ def _update_transition(transition, consecutive):
48
+
49
+ # initialize with LOG_ZERO everywhere
50
+ # except on the +1 diagonal np.log(1)
51
+ new_n_states = np.sum(consecutive)
52
+ new_transition = LOG_ZERO * np.ones((new_n_states, new_n_states))
53
+ for i in range(1, new_n_states):
54
+ new_transition[i - 1, i] = np.log(1)
55
+
56
+ n_states = len(consecutive)
57
+ boundary = np.hstack(([0], np.cumsum(consecutive)))
58
+ start = boundary[:-1]
59
+ end = boundary[1:] - 1
60
+
61
+ for i, j in itertools.product(six.moves.range(n_states), repeat=2):
62
+ new_transition[end[i], start[j]] = transition[i, j]
63
+
64
+ return new_transition
65
+
66
+
67
+ # create new initial prob. matrix accounting for duplicated states.
68
+ def _update_initial(initial, consecutive):
69
+
70
+ new_n_states = np.sum(consecutive)
71
+ new_initial = LOG_ZERO * np.ones((new_n_states, ))
72
+
73
+ n_states = len(consecutive)
74
+ boundary = np.hstack(([0], np.cumsum(consecutive)))
75
+ start = boundary[:-1]
76
+
77
+ for i in range(n_states):
78
+ new_initial[start[i]] = initial[i]
79
+
80
+ return new_initial
81
+
82
+
83
+ # create new emission prob. matrix accounting for duplicated states.
84
+ def _update_emission(emission, consecutive):
85
+
86
+ # newer NumPy requires an explicit sequence rather than a generator here
+ return np.vstack([
+ np.tile(e, (c, 1)) # duplicate emission probabilities c times
+ for e, c in six.moves.zip(emission.T, consecutive)
+ ]).T
90
+
91
+
92
+ # create new constraint matrix accounting for duplicated states
93
+ def _update_constraint(constraint, consecutive):
94
+
95
+ # newer NumPy requires an explicit sequence rather than a generator here
+ return np.vstack([
+ np.tile(e, (c, 1)) # duplicate constraint probabilities c times
+ for e, c in six.moves.zip(constraint.T, consecutive)
+ ]).T
99
+
100
+
101
+ # convert sequence of duplicated states back to sequence of original states.
102
+ def _update_states(states, consecutive):
103
+
104
+ boundary = np.hstack(([0], np.cumsum(consecutive)))
105
+ start = boundary[:-1]
106
+ end = boundary[1:]
107
+
108
+ new_states = np.empty(states.shape)
109
+
110
+ for i, (s, e) in enumerate(six.moves.zip(start, end)):
111
+ new_states[np.where((s <= states) & (states < e))] = i
112
+
113
+ return new_states
114
+
115
+
116
+ def viterbi_decoding(emission, transition,
117
+ initial=None, consecutive=None, constraint=None):
118
+ """(Constrained) Viterbi decoding
119
+
120
+ Parameters
121
+ ----------
122
+ emission : array of shape (n_samples, n_states)
123
+ E[t, i] is the emission log-probability of sample t at state i.
124
+ transition : array of shape (n_states, n_states)
125
+ T[i, j] is the transition log-probability from state i to state j.
126
+ initial : optional, array of shape (n_states, )
127
+ I[i] is the initial log-probability of state i.
128
+ Defaults to equal log-probabilities.
129
+ consecutive : optional, int or int array of shape (n_states, )
130
+ C[i] is the minimum-consecutive-states constraint for state i.
131
+ C[i] = 1 is equivalent to no constraint (default).
132
+ constraint : optional, array of shape (n_samples, n_states)
133
+ K[t, i] = 1 forbids state i at time t.
134
+ K[t, i] = 2 forces state i at time t.
135
+ Use K[t, i] = 0 for no constraint (default).
136
+
137
+ Returns
138
+ -------
139
+ states : array of shape (n_samples, )
140
+ Most probable state sequence
141
+
142
+ """
143
+
144
+ # ~~ INITIALIZATION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145
+
146
+ T, k = emission.shape # number of observations x number of states
147
+
148
+ # no minimum-consecutive-states constraints
149
+ if consecutive is None:
150
+ consecutive = np.ones((k, ), dtype=int)
151
+
152
+ # same value for all states
153
+ elif isinstance(consecutive, int):
154
+ consecutive = consecutive * np.ones((k, ), dtype=int)
155
+
156
+ # (potentially) different values per state
157
+ else:
158
+ consecutive = np.array(consecutive, dtype=int).reshape((k, ))
159
+
160
+ # at least one sample
161
+ consecutive = np.maximum(1, consecutive)
162
+
163
+ # balance initial probabilities when they are not provided
164
+ if initial is None:
165
+ initial = np.log(np.ones((k, )) / k)
166
+
167
+ # no constraint?
168
+ if constraint is None:
169
+ constraint = VITERBI_CONSTRAINT_NONE * np.ones((T, k))
170
+
171
+ # artificially create new states to account for 'consecutive' constraints
172
+ emission = _update_emission(emission, consecutive)
173
+ transition = _update_transition(transition, consecutive)
174
+ initial = _update_initial(initial, consecutive)
175
+ constraint = _update_constraint(constraint, consecutive)
176
+ T, K = emission.shape # number of observations x number of new states
177
+ states = np.arange(K) # states 0 to K-1
178
+
179
+ # set emission probability to zero for forbidden states
180
+ emission[
181
+ np.where(constraint == VITERBI_CONSTRAINT_FORBIDDEN)] = LOG_ZERO
182
+
183
+ # set emission probability to zero for all states but the mandatory one
184
+ for t, k in six.moves.zip(
185
+ *np.where(constraint == VITERBI_CONSTRAINT_MANDATORY)
186
+ ):
187
+ emission[t, states != k] = LOG_ZERO
188
+
189
+ # ~~ FORWARD PASS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
190
+
191
+ V = np.empty((T, K)) # V[t, k] is the probability of the
192
+ V[0, :] = emission[0, :] + initial # most probable state sequence for the
193
+ # first t observations that has k as
194
+ # its final state.
195
+
196
+ P = np.empty((T, K), dtype=int) # P[t, k] remembers which state was used
197
+ P[0, :] = states # to get from time t-1 to time t at
198
+ # state k
199
+
200
+ for t in range(1, T):
201
+
202
+ # tmp[k, k'] is the probability of the most probable path
203
+ # leading to state k at time t - 1, plus the probability of
204
+ # transitioning from state k to state k' (at time t)
205
+ tmp = (V[t - 1, :] + transition.T).T
206
+
207
+ # optimal path to state k at t comes from state P[t, k] at t - 1
208
+ # (find among all possible states at this time t)
209
+ P[t, :] = np.argmax(tmp, axis=0)
210
+
211
+ # update V for time t
212
+ V[t, :] = emission[t, :] + tmp[P[t, :], states]
213
+
214
+ # ~~ BACK-TRACKING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
215
+ X = np.empty((T,), dtype=int)
216
+ X[-1] = np.argmax(V[-1, :])
217
+ for t in range(1, T):
218
+ X[-(t + 1)] = P[-t, X[-t]]
219
+
220
+ # ~~ CONVERT BACK TO ORIGINAL STATES
221
+
222
+ return _update_states(X, consecutive)
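A toy decoding, for illustration only: two states, four samples, mildly sticky transitions, and emissions that briefly favour state 1 in the middle of the sequence:

import numpy as np

emission = np.log(np.array([[0.9, 0.1],
                            [0.2, 0.8],
                            [0.3, 0.7],
                            [0.8, 0.2]]))
transition = np.log(np.array([[0.7, 0.3],
                              [0.3, 0.7]]))
print(viterbi_decoding(emission, transition))   # expected: [0. 1. 1. 0.]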
sad_tf/viterbi_utils.py ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import numpy as np
27
+
28
+
29
+ def pred2logemission(pred, eps=1e-10):
30
+ pred = np.array(pred)
31
+ ret = np.ones((len(pred), 2)) * eps
32
+ ret[pred == 0, 0] = 1 - eps
33
+ ret[pred == 1, 1] = 1 - eps
34
+ return np.log(ret)
35
+
36
+ def log_trans_exp(exp, cost0=0, cost1=0):
37
+ # transition cost is assumed to be 10**-exp
38
+ cost = -exp * np.log(10)
39
+ ret = np.ones((2,2)) * cost
40
+ ret[0,0]= cost0
41
+ ret[1,1]= cost1
42
+ return ret
43
+
44
+ def diag_trans_exp(exp, dim):
45
+ cost = -exp * np.log(10)
46
+ ret = np.ones((dim, dim)) * cost
47
+ for i in range(dim):
48
+ ret[i, i] = 0
49
+ return ret
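A hedged sketch of how these helpers might be combined with viterbi_decoding (assumed to be importable from the neighbouring viterbi module) to smooth a noisy binary frame-level prediction:

import numpy as np

raw_pred = [0, 0, 0, 1, 0, 0, 1, 1, 1, 1]     # isolated spike at index 3
emission = pred2logemission(raw_pred)          # (10, 2) log-emission matrix
transition = log_trans_exp(6)                  # off-diagonal transition cost of 10**-6
smoothed = viterbi_decoding(emission, transition)
print(smoothed)   # expected: [0. 0. 0. 0. 0. 0. 1. 1. 1. 1.] -- the spike is smoothed away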