imansarraf committed on
Commit
ab53da2
1 Parent(s): d1a38da

Upload 9 files

sad_tf/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .segmenter import Segmenter,filter_output,filter_sig
2
+ from .export_funcs import seg2aud,seg2json,seg2Gender_Info,seg2Info
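A minimal end-to-end sketch of the public API exported above (the .segmenter module it imports from is part of this upload but not shown on this page); the model, ffmpeg, and audio paths are placeholders, not files shipped with this commit:

from sad_tf import Segmenter, filter_output, filter_sig, seg2aud, seg2json

seg = Segmenter(vad_engine='smn', detect_gender=False,
                model_path='/models/keras_speech_music_noise_cnn.hdf5',
                ffmpeg_path='/usr/bin/ffmpeg', device='cpu')

segments, signal = seg('example.wav')                  # list of [label, start, stop, duration]
segments = filter_output(segments, max_silence=1)      # merge/split; -1 means nothing was kept
if segments != -1:
    speech, non_speech = filter_sig(segments, signal)  # concatenated speech / non-speech samples
    seg2aud(segments, 'example_labels.txt')            # tab-separated label track
    print(seg2json([segments]))                        # one JSON entry per channel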
sad_tf/export_funcs.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import pandas as pd
27
+ from pytextgrid.PraatTextGrid import PraatTextGrid, Interval, Tier
28
+ import os
29
+ import json
30
+
31
+ def seg2csv(lseg, fout=None):
32
+ df = pd.DataFrame.from_records(lseg, columns=['labels', 'start', 'stop'])
33
+ df.to_csv(fout, sep='\t', index=False)
34
+
35
+ def seg2textgrid1(lseg, fout=None):
36
+ tier = Tier(name='inaSpeechSegmenter')
37
+ for label, start, stop,_ in lseg:
38
+ if (label=='noEnergy'):
39
+ label=''
40
+ tier.append(Interval(start, stop, label))
41
+ ptg = PraatTextGrid(xmin=lseg[0][1], xmax=lseg[-1][2])
42
+ ptg.append(tier)
43
+ ptg.save(fout)
44
+
45
+
46
+ def seg2json(lseg) :
47
+ try:
48
+ return(seg2json5(lseg))
49
+ except:
50
+ return(seg2json4(lseg))
51
+
52
+
53
+
54
+
55
+ def seg2Info(lseg):
56
+
57
+
58
+ x=[]
59
+ nch=0
60
+ for segs in lseg:
61
+ f=0
62
+ nch = nch+1
63
+ data_list=[]
64
+ if (segs!=-1):
65
+ for y in segs:
66
+ if (y[0]!='noEnergy'):
67
+ f = f + y[2] - y[1]
68
+
69
+
70
+ data = {
71
+ 'channel' : nch,
72
+ 'speech': f
73
+ }
74
+ x.append(data)
75
+ return(json.dumps(x))
76
+
77
+
78
+ def seg2Gender_Info(lseg):
79
+
80
+
81
+ x=[]
82
+ nch=0
83
+ for segs in lseg:
84
+ f=0
85
+ m=0
86
+ nch = nch+1
87
+ data_list=[]
88
+ if (segs!=-1):
89
+ for y in segs:
90
+ if (y[0]!='noEnergy'):
91
+ if (y[0] == "female"):
92
+ f = f + y[2] - y[1]
93
+ elif(y[0] == "male"):
94
+ m = m + y[2] - y[1]
95
+
96
+
97
+ data = {
98
+ 'channel' : nch,
99
+ 'male': m,
100
+ 'female': f
101
+ }
102
+ x.append(data)
103
+ return(json.dumps(x))
104
+
105
+ def seg2json5(lseg):
106
+
107
+
108
+ x=[]
109
+ nch=0
110
+ for segs in lseg:
111
+ nch = nch+1
112
+ data_list=[]
113
+ if (segs!=-1):
114
+ for label, start, stop ,_,_ in segs:
115
+ if (label!='noEnergy'):
116
+ data = {
117
+ 'startTime': start,
118
+ 'endTime': stop,
119
+ 'gender': label[0]
120
+ }
121
+ data_list.append(data)
122
+ data = {
123
+ 'channel' : nch,
124
+ 'segments' : data_list
125
+ }
126
+ x.append(data)
127
+ return(json.dumps(x))
128
+
129
+ def seg2json4(lseg):
130
+
131
+ x=[]
132
+ nch=0
133
+ for segs in lseg:
134
+ nch = nch+1
135
+ data_list=[]
136
+ if (segs!=-1):
137
+ for label, start, stop ,_ in segs:
138
+ if (label!='noEnergy'):
139
+ data = {
140
+ 'startTime': start,
141
+ 'endTime': stop,
142
+ 'gender': label[0]
143
+ }
144
+ data_list.append(data)
145
+ data = {
146
+ 'channel' : nch,
147
+ 'segments' : data_list
148
+ }
149
+ x.append(data)
150
+ return(json.dumps(x))
151
+
152
+
153
+
154
+
155
+ def seg2aud(lseg , fout=None) :
156
+ try:
157
+ seg2aud5(lseg , fout)
158
+ except:
159
+ seg2aud4(lseg , fout)
160
+
161
+ def seg2aud5(lseg , fout=None):
162
+ if (lseg==-1):
163
+ return
164
+ with open(fout , 'w') as fid:
165
+ for label, start, stop ,_,_ in lseg:
166
+ if (label!='noEnergy'):
167
+ fid.write('%s\t%s\t%s\n' %(start , stop , label))
168
+
169
+ def seg2aud4(lseg , fout=None):
170
+ if (lseg==-1):
171
+ return
172
+ with open(fout , 'w') as fid:
173
+ for label, start, stop ,_ in lseg:
174
+ if (label!='noEnergy'):
175
+ fid.write('%s\t%s\t%s\n' %(start , stop , label))
176
+
177
+ def seg2textgrid(data , fout=None):
178
+ ghabli=False
179
+ kh=[]
180
+ if (True):
181
+ kh.append('File type = "ooTextFile"\n')
182
+ kh.append('Object class = "TextGrid"\n')
183
+ kh.append('\n')
184
+ kh.append('xmin = 0 \n')
185
+ kh.append('xmax = %s \n' %(data[-1][2]))
186
+ kh.append('tiers? <exists> \n')
187
+ kh.append('size = 1 \n')
188
+ kh.append('item []: \n')
189
+ kh.append(' item [1]:\n')
190
+ kh.append(' class = "IntervalTier" \n')
191
+ kh.append(' name = "sen" \n')
192
+ kh.append(' xmin = 0 \n')
193
+ kh.append(' xmax = %s \n' %(data[-1][2]))
194
+ kh.append(' intervals: size = %s \n' %(0))
195
+ x=1
196
+
197
+ if (float(data[0][1])>0):
198
+ kh.append(' intervals [%s]:\n' %(x))
199
+ kh.append(' xmin = 0\n')
200
+ kh.append(' xmax = %s \n' %(data[0][1]))
201
+ kh.append(' text = "" \n')
202
+ x=x+1
203
+
204
+
205
+ for i in range(len(data)):
206
+ kh.append(' intervals [%s]:\n' %(x))
207
+ if (ghabli):
208
+ kh.append(' xmin = %s \n' %(data[i-1][2]))
209
+ else:
210
+ kh.append(' xmin = %s \n' %(data[i][1]))
211
+ kh.append(' xmax = %s \n' %(data[i][2]))
212
+ kh.append(' text = "%s" \n' %(data[i][0].strip()))
213
+ x=x+1
214
+
215
+ if (i+1 >= len(data)):
216
+ break
217
+
218
+ if (data[i][2] != data[i+1][1]):
219
+
220
+
221
+ if (float(data[i+1][1]) - float(data[i][2]) > 0.5):
222
+ kh.append(' intervals [%s]:\n' %(x))
223
+
224
+ kh.append(' xmin = %s \n' %(data[i][2]))
225
+ kh.append(' xmax = %s \n' %(data[i+1][1]))
226
+ kh.append(' text = "" \n')
227
+ x=x+1
228
+ ghabli=False
229
+ else:
230
+ ghabli=True
231
+
232
+
233
+ kh[13] = (' intervals: size = %s \n' %(kh[-4].strip().split(' ')[1].replace('[','').replace(']','').replace(':','')))
234
+
235
+
236
+ with open(fout, mode='w') as fid:
237
+ for line in kh:
238
+ fid.write(line)
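A small sketch of the segment format these export helpers consume (times in seconds; the values are illustrative). seg2json and seg2Gender_Info take one segment list per channel and fall back from the 5-field to the 4-field layout automatically:

from sad_tf.export_funcs import seg2json, seg2Gender_Info, seg2aud

# [label, start, stop, duration, trailing silence] as produced by the segmenter's filter_output
lseg = [['male',   0.0, 3.2, 3.2, 0.2],
        ['female', 3.4, 7.9, 4.5, -1]]

print(seg2json([lseg]))          # [{"channel": 1, "segments": [...]}]
print(seg2Gender_Info([lseg]))   # per-channel totals of male and female speech
seg2aud(lseg, 'labels.txt')      # tab-separated start/stop/label rows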
sad_tf/features.py ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import os
27
+ import numpy as np
28
+ from iman import Audio
29
+
30
+ #os.environ['SIDEKIT'] = 'theano=false,libsvm=false,cuda=false'
31
+ #from sidekit.frontend.io import read_wav
32
+ #from sidekit.frontend.features import mfcc
33
+ from .sidekit_mfcc import mfcc
34
+
35
+
36
+ def _wav2feats(wavname,input_type='file',sr=16000,ffmpeg_path='c:\\ffmpeg.exe'):
37
+ """
38
+ Extract features for wav 16k mono
39
+ """
40
+
41
+ if (input_type == 'file'):
42
+ sig = Audio.Read(wavname , sr,mono = True, ffmpeg_path=ffmpeg_path)
43
+ else:
44
+ sig = wavname
45
+
46
+ read_framerate=sr
47
+
48
+
49
+ _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True,fs=sr, maxfreq=int(sr/2))
50
+
51
+ # Management of short duration segments
52
+ difflen = 0
53
+ if len(loge) < 68:
54
+ difflen = 68 - len(loge)
55
+ mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))
56
+
57
+ return mspec, loge, difflen,sig
58
+
59
+
60
+ def media2feats(medianame,input_type='file', sr=16000,ffmpeg_path='c:\\ffmpeg.exe'):
61
+
62
+ return _wav2feats(medianame, input_type , sr,ffmpeg_path=ffmpeg_path)
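A short sketch of this feature front-end; the audio and ffmpeg paths are placeholders. Clips shorter than 68 frames are padded and difflen records how many frames were added:

from sad_tf.features import media2feats

mspec, loge, difflen, sig = media2feats('example.wav', input_type='file',
                                        sr=16000, ffmpeg_path='/usr/bin/ffmpeg')
print(mspec.shape)   # (n_frames, 24) mel spectrogram
print(len(loge))     # one log-energy value per frame
print(difflen)       # 0 unless the clip was shorter than 68 frames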
sad_tf/segmenter_for_tf2_16.py ADDED
@@ -0,0 +1,555 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import warnings
27
+ warnings.filterwarnings("ignore")
28
+ import os
29
+ # os.environ["CUDA_DEVICE_ORDER"]= '0'
30
+ import sys
31
+ import math
32
+ from iman import Audio
33
+ import numpy as np
34
+ from tensorflow import keras
35
+ # from tensorflow.compat.v1.keras.backend import set_session
36
+ import tensorflow.python.keras.backend as K
37
+ from .thread_returning import ThreadReturning
38
+
39
+ import shutil
40
+ import time
41
+ import random
42
+
43
+ from skimage.util import view_as_windows as vaw
44
+
45
+
46
+ from .viterbi import viterbi_decoding
47
+ from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
48
+
49
+ from .features import media2feats
50
+ from .export_funcs import seg2csv, seg2textgrid
51
+
52
+
53
+
54
+ def _energy_activity(loge, ratio=0.4): ##########0.9
55
+
56
+ threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
57
+ raw_activity = (loge > threshold)
58
+ return viterbi_decoding(pred2logemission(raw_activity),
59
+ log_trans_exp(50, cost0=-5))
60
+
61
+ #exp(150, cost0=-5)
62
+
63
+ def filter_sig(isig , wav , sr=16000):
64
+
65
+ if (sr!=16000):
66
+ wav = Audio.Resample(wav , 16000, sr)
67
+
68
+
69
+ try:
70
+ w=[]
71
+ wn=[]
72
+ wn.append(wav[0 : int(isig[0][1]*sr)])
73
+ for i , xxx in enumerate(isig):
74
+ a=xxx[1]
75
+ b=xxx[2]
76
+ w.append(wav[int(a*sr) : int(b*sr)])
77
+ try:
78
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
79
+ except:
80
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
81
+
82
+ return (np.concatenate(w),np.concatenate(wn))
83
+ except:
84
+ w=[]
85
+ wn=[]
86
+ wn.append(wav[0 : int(isig[0][1]*sr)])
87
+ for i , [_,a,b,_,_] in enumerate(isig):
88
+ w.append(wav[int(a*sr) : int(b*sr)])
89
+ try:
90
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
91
+ except:
92
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
93
+
94
+ return (np.concatenate(w),np.concatenate(wn))
95
+
96
+ def filter_output(isig , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
97
+
98
+ if (len(isig)==0):
99
+ return -1
100
+
101
+ # _dels=[]
102
+ # for i , [_,_,_,_d] in enumerate(isig):
103
+ # if (_d<=ignore_small_speech_segments) :
104
+ # _dels.append(i)
105
+ # _dels.reverse()
106
+ # for i in _dels:
107
+ # del isig[i]
108
+
109
+ # if (len(isig)==0):
110
+ # return -1
111
+
112
+ isig = [list(x) for x in isig]
113
+
114
+ for i in range(len(isig)-1):
115
+ t = isig[i+1][1] - isig[i][2] # silence between two chunks
116
+ isig[i].append(t)
117
+ isig[-1].append(-1)
118
+
119
+
120
+ if (len(isig)>0):
121
+
122
+ rang = np.arange(0.01,max_silence+0.1,0.1)
123
+ for di in rang:
124
+ for i , xxx in enumerate(isig):
125
+
126
+ _t = xxx[-1]
127
+ if (_t==-1):
128
+ break
129
+ if (_t <=di):
130
+ try:
131
+ if (isig[i+1][2] - isig[i][1] <= max_speech_len):
132
+ isig[i] = [isig[i][0] , isig[i][1] , isig[i+1][2] , isig[i+1][2] - isig[i][1] , isig[i+1][4] ]
133
+ del isig[i+1]
134
+ except:
135
+ pass
136
+ _dels=[]
137
+ for i , xxxx in enumerate(isig):
138
+ _d = xxxx[3]
139
+ if (_d<=ignore_small_speech_segments) :
140
+ _dels.append(i)
141
+ _dels.reverse()
142
+
143
+ for i in _dels:
144
+ del isig[i]
145
+
146
+ if (len(isig)==0):
147
+ return -1
148
+
149
+
150
+ isign=[]
151
+ for i , xxxxx in enumerate(isig):
152
+ _d = xxxxx[3]
153
+ if (_d> split_speech_bigger_than ) :
154
+
155
+ _gc = math.ceil(_d/split_speech_bigger_than)
156
+ m = _d/_gc
157
+ print('Bigger-->' + str(_d) + '-->' + str(m))
158
+ for jj in range(_gc):
159
+ fas=0
160
+ if (jj== _gc-1):
161
+ fas= isig[i][4]
162
+ isign.append( [isig[i][0] ,isig[i][1] + m*jj ,isig[i][1] + (m*(jj+1)), m, fas ] )
163
+ else:
164
+ isign.append(isig[i])
165
+ for i,(a,b,c,d,e) in enumerate(isign):
166
+ if (e==-1):
167
+ break
168
+ _addlen = min(e , 1) / 2 # at most half a second is added to the end of the segment
169
+ isign[i] = [a,b,c+_addlen,d+_addlen,e-_addlen]
170
+
171
+ return(isign)
172
+
173
+
174
+ def filter_output_1(vad , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
175
+
176
+ isig = []
177
+ i=0
178
+ while (i <len(vad)):
179
+
180
+ ml=0
181
+ inn = i
182
+ st = (vad[i][1])
183
+
184
+ while ( (i<len(vad)-1 )and ( ( (vad[i+1][1]) - (vad[i][2]) ) <= max_silence)):
185
+ ml = (vad[i][2]) - st
186
+ if (ml > max_speech_len):
187
+ if (i>inn and i>0):
188
+ i=i-1
189
+ break
190
+ i=i+1
191
+ en = (vad[i][2])
192
+ fa = en-st
193
+ if (fa > ignore_small_speech_segments):
194
+ if (fa>split_speech_bigger_than):
195
+ _gc = math.ceil(fa/split_speech_bigger_than)
196
+ m = fa/_gc
197
+ print('Bigger-->' + str(fa) + '-->' + str(m))
198
+ for jj in range(_gc):
199
+ isig.append(('speech' , st + (m*jj) , st+ (m*(jj+1)) , m))
200
+ else:
201
+ isig.append(('speech', st , en,fa))
202
+ i=i+1
203
+ isign=[]
204
+ for i,(a,b,c,d) in enumerate(isig):
205
+ if (i == len(isig)-1):
206
+ isign.append(isig[i])
207
+ break
208
+ _addlen = min(isig[i+1][1]-c , 1) / 2 # at most half a second is added to the end of the segment
209
+ isign.append([a,b,c+_addlen ,d+_addlen])
210
+
211
+ return(isign)
212
+
213
+
214
+ def _get_patches(mspec, w, step):
215
+ h = mspec.shape[1]
216
+ data = vaw(mspec, (w,h), step=step)
217
+ data.shape = (len(data), w*h)
218
+ data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
219
+ lfill = [data[0,:].reshape(1, h*w)] * (w // (2 * step))
220
+ rfill = [data[-1,:].reshape(1, h*w)] * (w // (2* step) - 1 + len(mspec) % 2)
221
+ data = np.vstack(lfill + [data] + rfill )
222
+ finite = np.all(np.isfinite(data), axis=1)
223
+ data.shape = (len(data), w, h)
224
+ return data, finite
225
+
226
+
227
+ def _binidx2seglist(binidx):
228
+ """
229
+ ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
230
+ Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
231
+
232
+ #TODO: is there a pandas alternative??
233
+ """
234
+ curlabel = None
235
+ bseg = -1
236
+ ret = []
237
+ for i, e in enumerate(binidx):
238
+ if e != curlabel:
239
+ if curlabel is not None:
240
+ ret.append((curlabel, bseg, i))
241
+ curlabel = e
242
+ bseg = i
243
+ ret.append((curlabel, bseg, i + 1))
244
+ return ret
245
+
246
+
247
+ class DnnSegmenter:
248
+ """
249
+ DnnSegmenter is an abstract class for performing DNN-based
250
+ segmentation with serialized Keras models, using 24 mel spectrogram
251
+ features obtained with the SIDEKIT framework.
252
+
253
+ Child classes MUST define the following class attributes:
254
+ * nmel: the number of mel bands to use (max: 24)
255
+ * viterbi_arg: the argument to be used with viterbi post-processing
256
+ * model_fname: the filename of the serialized keras model to be used
257
+ the model should be stored in the current directory
258
+ * inlabel: only segments with label name inlabel will be analyzed.
259
+ other labels will stay unchanged
260
+ * outlabels: the labels associated with the output of the neural network models
261
+ """
262
+ def __init__(self, batch_size, vad_type,model_path):
263
+ # load the DNN model
264
+ if (vad_type!='vad'):
265
+ self.nn = keras.models.load_model(model_path, compile=False)
266
+ print('model loaded from --> ' + model_path)
267
+ # self.nn.summary()
268
+ self.batch_size = batch_size
269
+
270
+ def __call__(self, mspec, lseg, difflen = 0):
271
+ """
272
+ *** input
273
+ * mspec: mel spectrogram
274
+ * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
275
+ * difflen: 0 if the original length of the mel spectrogram is >= 68
276
+ otherwise it is set to 68 - length(mspec)
277
+ *** output
278
+ a list of adjacent tuples (label, start, stop)
279
+ """
280
+ if self.nmel < 24:
281
+ mspec = mspec[:, :self.nmel].copy()
282
+
283
+ patches, finite = _get_patches(mspec, 68, 2)
284
+ if difflen > 0:
285
+ patches = patches[:-int(difflen / 2), :, :]
286
+ finite = finite[:-int(difflen / 2)]
287
+
288
+ assert len(finite) == len(patches), (len(patches), len(finite))
289
+
290
+ batch = []
291
+ for lab, start, stop in lseg:
292
+ if lab == self.inlabel:
293
+ batch.append(patches[start:stop, :])
294
+
295
+ if len(batch) > 0:
296
+ batch = np.concatenate(batch)
297
+ rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
298
+
299
+ ret = []
300
+ for lab, start, stop in lseg:
301
+ if lab != self.inlabel:
302
+ ret.append((lab, start, stop))
303
+ continue
304
+
305
+ l = stop - start
306
+ r = rawpred[:l]
307
+ rawpred = rawpred[l:]
308
+ r[finite[start:stop] == False, :] = 0.5
309
+ pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
310
+ for lab2, start2, stop2 in _binidx2seglist(pred):
311
+ ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
312
+ return ret
313
+
314
+
315
+ class SpeechMusic(DnnSegmenter):
316
+ # Voice activity detection: requires energetic activity detection
317
+ outlabels = ('speech', 'music')
318
+ inlabel = 'energy'
319
+ nmel = 21
320
+ viterbi_arg = 150
321
+
322
+
323
+ class SpeechMusicNoise(DnnSegmenter):
324
+ # Voice activity detection: requires energetic activity detection
325
+ outlabels = ('speech', 'music', 'noise')
326
+ inlabel = 'energy'
327
+ nmel = 21
328
+ viterbi_arg = 80
329
+
330
+
331
+ class Gender(DnnSegmenter):
332
+ # Gender Segmentation, requires voice activity detection
333
+ outlabels = ('female', 'male')
334
+ inlabel = 'speech'
335
+ nmel = 24
336
+ viterbi_arg = 80
337
+
338
+
339
+
340
+ class Segmenter:
341
+
342
+
343
+ def __init__(self, vad_type = 'sad' , vad_engine='smn', detect_gender=False, sr=16000, batch_size=32 , complete_output=False,model_path="c:\\keras_speech_music_noise_cnn.hdf5",gender_path="c:\\keras_male_female_cnn.hdf5" , ffmpeg_path='c:\\ffmpeg.exe',device='cuda' ,input_type="file"):
344
+ """
345
+ Load neural network models
346
+
347
+ Input:
348
+
349
+ 'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
350
+ 'sm' was used in the results presented in ICASSP 2017 paper
351
+ and in MIREX 2018 challenge submission
352
+ 'smn' has been implemented more recently and has not been evaluated in papers
353
+
354
+ 'detect_gender': if False, speech excerpts are returned labelled as 'speech'
355
+ if True, speech excerpts are split into 'male' and 'female' segments
356
+ """
357
+
358
+ if (device != 'cuda'):
359
+ os.environ["CUDA_DEVICE_ORDER"]= '-1'
360
+ else:
361
+ pass
362
+
363
+
364
+ import tensorflow as tf
365
+
366
+ config = tf.compat.v1.ConfigProto()
367
+ config.gpu_options.allow_growth = True
368
+ config.log_device_placement = True
369
+ sess = tf.compat.v1.Session(config=config)
370
+ sess = K.get_session()
371
+
372
+
373
+ self.complete_output = complete_output
374
+ self.sample_rate = sr
375
+ self.ffmpeg_path=ffmpeg_path
376
+ self.input_type = input_type
377
+ self.device = device
378
+
379
+
380
+
381
+ # self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks
382
+
383
+
384
+ # select speech/music or speech/music/noise voice activity detection engine
385
+ assert vad_engine in ['sm', 'smn']
386
+ if vad_engine == 'sm':
387
+ self.vad = SpeechMusic(batch_size)
388
+ elif vad_engine == 'smn':
389
+ self.vad = SpeechMusicNoise(batch_size , vad_type,model_path)
390
+
391
+ # load gender detection NN if required
392
+ assert detect_gender in [True, False]
393
+ self.detect_gender = detect_gender
394
+ if detect_gender:
395
+ self.gender = Gender(batch_size , vad_type ,gender_path)
396
+ self.vad_type = vad_type
397
+ self.model_path = model_path
398
+ self.gender_path = gender_path
399
+
400
+ def segment_feats(self, mspec, loge, difflen, start_sec):
401
+ """
402
+ perform segmentation
403
+ requires input corresponding to a wav file sampled at 16000 Hz
404
+ with a single channel
405
+ """
406
+
407
+
408
+
409
+
410
+ # perform energy-based activity detection
411
+ lseg = []
412
+ vadseg=[]
413
+ for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
414
+ if lab == 0:
415
+ lab = 'noEnergy'
416
+ else:
417
+ lab = 'energy'
418
+ vadseg.append(('speech', start, stop))
419
+ lseg.append((lab, start, stop))
420
+ if (self.vad_type == 'vad'):
421
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , stop-start) for lab, start, stop in vadseg]
422
+ # perform voice activity detection
423
+ lseg = self.vad(mspec, lseg, difflen)
424
+
425
+
426
+
427
+
428
+ # perform gender segmentation on speech segments
429
+ if self.detect_gender:
430
+ lseg = self.gender(mspec, lseg, difflen)
431
+ if (self.complete_output):
432
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02) for lab, start, stop in lseg ]
433
+ else:
434
+ return [[lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02] for lab, start, stop in lseg if (lab=='male' or lab=="female" or lab=="speech")]
435
+
436
+
437
+ def __call__(self, medianame,start_sec=None, stop_sec=None):
438
+ """
439
+ Return segmentation of a given file
440
+ * convert file to wav 16k mono with ffmpeg
441
+ * call NN segmentation procedures
442
+ * media_name: path to the media to be processed (including remote url)
443
+ may include any format supported by ffmpeg
444
+ * tmpdir: allows defining a custom path for storing temporary files
445
+ fast read/write disks are a good choice
446
+ * start_sec (seconds): sound stream before start_sec won't be processed
447
+ * stop_sec (seconds): sound stream after stop_sec won't be processed
448
+ """
449
+
450
+
451
+ mspec, loge, difflen , me = media2feats(medianame, self.input_type ,self.sample_rate,ffmpeg_path=self.ffmpeg_path)
452
+
453
+ if start_sec is None:
454
+ start_sec = 0
455
+ # do segmentation
456
+ return self.segment_feats(mspec, loge, difflen, start_sec),me
457
+
458
+
459
+ def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
460
+
461
+ if verbose:
462
+ print('batch_processing %d files' % len(linput))
463
+
464
+ if output_format == 'csv':
465
+ fexport = seg2csv
466
+ elif output_format == 'textgrid':
467
+ fexport = seg2textgrid
468
+ else:
469
+ raise NotImplementedError()
470
+
471
+ t_batch_start = time.time()
472
+
473
+ lmsg = []
474
+ fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
475
+ i = 0
476
+ for feats, msg in fg:
477
+ lmsg += msg
478
+ i += len(msg)
479
+ if verbose:
480
+ print('%d/%d' % (i, len(linput)), msg)
481
+ if feats is None:
482
+ break
483
+ mspec, loge, difflen = feats
484
+ #if verbose == True:
485
+ # print(i, linput[i], loutput[i])
486
+ b = time.time()
487
+ lseg = self.segment_feats(mspec, loge, difflen, 0)
488
+ fexport(lseg, loutput[len(lmsg) -1])
489
+ lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))
490
+
491
+ t_batch_dur = time.time() - t_batch_start
492
+ nb_processed = len([e for e in lmsg if e[1] == 0])
493
+ if nb_processed > 0:
494
+ avg = t_batch_dur / nb_processed
495
+ else:
496
+ avg = -1
497
+ return t_batch_dur, nb_processed, avg, lmsg
498
+
499
+
500
+ def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
501
+ """
502
+ To be used when processing batches
503
+ if resulting file exists, it is skipped
504
+ in case of remote files, access is tried nbtry times
505
+ """
506
+ ret = None
507
+ msg = []
508
+ while ret is None and len(lin) > 0:
509
+ src = lin.pop(0)
510
+ dst = lout.pop(0)
511
+ # print('popping', src)
512
+
513
+ # if the file exists: skip it
514
+ if skipifexist and os.path.exists(dst):
515
+ msg.append((dst, 1, 'already exists'))
516
+ continue
517
+
518
+ # create storing directory if required
519
+ dname = os.path.dirname(dst)
520
+ if not os.path.isdir(dname):
521
+ os.makedirs(dname)
522
+
523
+ itry = 0
524
+ while ret is None and itry < nbtry:
525
+ try:
526
+ ret = media2feats(src)
527
+ except:
528
+ itry += 1
529
+ errmsg = sys.exc_info()[0]
530
+ if itry != nbtry:
531
+ time.sleep(random.random() * trydelay)
532
+ if ret is None:
533
+ msg.append((dst, 2, 'error: ' + str(errmsg)))
534
+ else:
535
+ msg.append((dst, 0, 'ok'))
536
+
537
+ return ret, msg
538
+
539
+
540
+ def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
541
+ # print('init feat gen', len(ilist))
542
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
543
+ thread.start()
544
+ while True:
545
+ ret, msg = thread.join()
546
+ # print('join done', len(ilist))
547
+ # print('new list', ilist)
548
+ #ilist = ilist[len(msg):]
549
+ #olist = olist[len(msg):]
550
+ if len(ilist) == 0:
551
+ break
552
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
553
+ thread.start()
554
+ yield ret, msg
555
+ yield ret, msg
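A worked example of filter_output's merge/drop/split behaviour, using the package-level import from __init__.py (the numbers are illustrative): gaps of at most max_silence seconds are merged while the result stays under max_speech_len, segments shorter than ignore_small_speech_segments are dropped, and segments longer than split_speech_bigger_than are cut into roughly equal chunks:

from sad_tf import filter_output

segs = [['speech',  0.0,  4.0,  4.0],   # [label, start, stop, duration]
        ['speech',  4.3,  6.0,  1.7],   # 0.3 s gap -> merged with the previous segment
        ['speech',  9.0,  9.2,  0.2],   # below ignore_small_speech_segments -> dropped
        ['speech', 12.0, 40.0, 28.0]]   # above split_speech_bigger_than -> split in two

out = filter_output(segs, max_silence=1, ignore_small_speech_segments=0.5,
                    max_speech_len=15, split_speech_bigger_than=20)
for label, start, stop, dur, gap in out:
    print(label, start, stop, dur)      # roughly: 0-6.5, 12-26 and 26-40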
sad_tf/segmentero.py ADDED
@@ -0,0 +1,570 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+ import onnxruntime
26
+ import warnings
27
+ warnings.filterwarnings("ignore")
28
+ import os
29
+ # os.environ["CUDA_DEVICE_ORDER"]= '0'
30
+ import sys
31
+ import math
32
+ from iman import Audio
33
+ import numpy as np
34
+ from tensorflow import keras
35
+ from tensorflow.compat.v1.keras.backend import set_session
36
+ from tqdm import tqdm
37
+ from .thread_returning import ThreadReturning
38
+
39
+ import shutil
40
+ import time
41
+ import random
42
+
43
+ from skimage.util import view_as_windows as vaw
44
+
45
+
46
+ from .viterbi import viterbi_decoding
47
+ from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
48
+
49
+ from .features import media2feats
50
+ from .export_funcs import seg2csv, seg2textgrid
51
+
52
+
53
+
54
+ def _energy_activity(loge, ratio=0.4): ##########0.9
55
+
56
+ threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
57
+ raw_activity = (loge > threshold)
58
+ return viterbi_decoding(pred2logemission(raw_activity),
59
+ log_trans_exp(50, cost0=-5))
60
+
61
+ #exp(150, cost0=-5)
62
+
63
+ def filter_sig(isig , wav , sr=16000):
64
+
65
+ if (sr!=16000):
66
+ wav = Audio.Resample(wav , 16000, sr)
67
+
68
+
69
+ try:
70
+ w=[]
71
+ wn=[]
72
+ wn.append(wav[0 : int(isig[0][1]*sr)])
73
+ for i , [_,a,b,_] in enumerate(isig):
74
+
75
+ w.append(wav[int(a*sr) : int(b*sr)])
76
+ try:
77
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
78
+ except:
79
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
80
+
81
+ return (np.concatenate(w),np.concatenate(wn))
82
+ except:
83
+ w=[]
84
+ wn=[]
85
+ wn.append(wav[0 : int(isig[0][1]*sr)])
86
+ for i , [_,a,b,_,_] in enumerate(isig):
87
+ w.append(wav[int(a*sr) : int(b*sr)])
88
+ try:
89
+ wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
90
+ except:
91
+ wn.append(wav[int(isig[i][2]*sr) : len(wav)])
92
+
93
+ return (np.concatenate(w),np.concatenate(wn))
94
+
95
+ def filter_output(isig , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
96
+
97
+ if (len(isig)==0):
98
+ return -1
99
+
100
+ # _dels=[]
101
+ # for i , [_,_,_,_d] in enumerate(isig):
102
+ # if (_d<=ignore_small_speech_segments) :
103
+ # _dels.append(i)
104
+ # _dels.reverse()
105
+ # for i in _dels:
106
+ # del isig[i]
107
+
108
+ # if (len(isig)==0):
109
+ # return -1
110
+
111
+
112
+ for i in range(len(isig)-1):
113
+ t = isig[i+1][1] - isig[i][2] # silence between two chunks
114
+ isig[i].append(t)
115
+ isig[-1].append(-1)
116
+
117
+
118
+ if (len(isig)>0):
119
+
120
+ rang = np.arange(0.01,max_silence+0.1,0.1)
121
+ for di in rang:
122
+ for i , [_,_,_,_,_t] in enumerate(isig):
123
+ if (_t==-1):
124
+ break
125
+ if (_t <=di):
126
+ try:
127
+ if (isig[i+1][2] - isig[i][1] <= max_speech_len):
128
+ isig[i] = [isig[i][0] , isig[i][1] , isig[i+1][2] , isig[i+1][2] - isig[i][1] , isig[i+1][4] ]
129
+ del isig[i+1]
130
+ except:
131
+ pass
132
+ _dels=[]
133
+ for i , [_,_,_,_d,_] in enumerate(isig):
134
+ if (_d<=ignore_small_speech_segments) :
135
+ _dels.append(i)
136
+ _dels.reverse()
137
+
138
+ for i in _dels:
139
+ del isig[i]
140
+
141
+ if (len(isig)==0):
142
+ return -1
143
+
144
+
145
+ isign=[]
146
+ for i , [_,_,_,_d,_] in enumerate(isig):
147
+ if (_d> split_speech_bigger_than ) :
148
+
149
+ _gc = math.ceil(_d/split_speech_bigger_than)
150
+ m = _d/_gc
151
+ print('Bigger-->' + str(_d) + '-->' + str(m))
152
+ for jj in range(_gc):
153
+ fas=0
154
+ if (jj== _gc-1):
155
+ fas= isig[i][4]
156
+ isign.append( [isig[i][0] ,isig[i][1] + m*jj ,isig[i][1] + (m*(jj+1)), m, fas ] )
157
+ else:
158
+ isign.append(isig[i])
159
+ for i,(a,b,c,d,e) in enumerate(isign):
160
+ if (e==-1):
161
+ break
162
+ _addlen = min(e , 1) / 2 # at most half a second is added to the end of the segment
163
+ isign[i] = [a,b,c+_addlen,d+_addlen,e-_addlen]
164
+
165
+ return(isign)
166
+
167
+
168
+ def filter_output_1(vad , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
169
+
170
+ isig = []
171
+ i=0
172
+ while (i <len(vad)):
173
+
174
+ ml=0
175
+ inn = i
176
+ st = (vad[i][1])
177
+
178
+ while ( (i<len(vad)-1 )and ( ( (vad[i+1][1]) - (vad[i][2]) ) <= max_silence)):
179
+ ml = (vad[i][2]) - st
180
+ if (ml > max_speech_len):
181
+ if (i>inn and i>0):
182
+ i=i-1
183
+ break
184
+ i=i+1
185
+ en = (vad[i][2])
186
+ fa = en-st
187
+ if (fa > ignore_small_speech_segments):
188
+ if (fa>split_speech_bigger_than):
189
+ _gc = math.ceil(fa/split_speech_bigger_than)
190
+ m = fa/_gc
191
+ print('Bigger-->' + str(fa) + '-->' + str(m))
192
+ for jj in range(_gc):
193
+ isig.append(('speech' , st + (m*jj) , st+ (m*(jj+1)) , m))
194
+ else:
195
+ isig.append(('speech', st , en,fa))
196
+ i=i+1
197
+ isign=[]
198
+ for i,(a,b,c,d) in enumerate(isig):
199
+ if (i == len(isig)-1):
200
+ isign.append(isig[i])
201
+ break
202
+ _addlen = min(isig[i+1][1]-c , 1) / 2 # at most half a second is added to the end of the segment
203
+ isign.append([a,b,c+_addlen ,d+_addlen])
204
+
205
+ return(isign)
206
+
207
+
208
+ def get_path_3d(data,batch_size):
209
+ total_batches = data.shape[0] // batch_size
210
+ last_batch_size = data.shape[0] % batch_size
211
+ if last_batch_size != 0:
212
+ batches = np.split(data[:total_batches * batch_size], total_batches)
213
+ last_batch = np.expand_dims(data[total_batches * batch_size:], axis=0).squeeze()
214
+ batches.append(last_batch)
215
+ else:
216
+ batches = np.split(data, total_batches)
217
+ return batches
218
+
219
+
220
+ def _get_patches(mspec, w, step):
221
+ h = mspec.shape[1]
222
+ data = vaw(mspec, (w,h), step=step)
223
+ data.shape = (len(data), w*h)
224
+ data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
225
+ lfill = [data[0,:].reshape(1, h*w)] * (w // (2 * step))
226
+ rfill = [data[-1,:].reshape(1, h*w)] * (w // (2* step) - 1 + len(mspec) % 2)
227
+ data = np.vstack(lfill + [data] + rfill )
228
+ finite = np.all(np.isfinite(data), axis=1)
229
+ data.shape = (len(data), w, h)
230
+ return data, finite
231
+
232
+
233
+ def _binidx2seglist(binidx):
234
+ """
235
+ ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
236
+ Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
237
+
238
+ #TODO: is there a pandas alternative??
239
+ """
240
+ curlabel = None
241
+ bseg = -1
242
+ ret = []
243
+ for i, e in enumerate(binidx):
244
+ if e != curlabel:
245
+ if curlabel is not None:
246
+ ret.append((curlabel, bseg, i))
247
+ curlabel = e
248
+ bseg = i
249
+ ret.append((curlabel, bseg, i + 1))
250
+ return ret
251
+
252
+
253
+ class DnnSegmenter:
254
+ """
255
+ DnnSegmenter is an abstract class for performing DNN-based
256
+ segmentation with serialized Keras models, using 24 mel spectrogram
257
+ features obtained with the SIDEKIT framework.
258
+
259
+ Child classes MUST define the following class attributes:
260
+ * nmel: the number of mel bands to use (max: 24)
261
+ * viterbi_arg: the argument to be used with viterbi post-processing
262
+ * model_fname: the filename of the serialized keras model to be used
263
+ the model should be stored in the current directory
264
+ * inlabel: only segments with label name inlabel will be analyzed.
265
+ other labels will stay unchanged
266
+ * outlabels: the labels associated with the output of the neural network models
267
+ """
268
+ def __init__(self, batch_size, vad_type,model_path,EP_list):
269
+ # load the DNN model
270
+ if (vad_type!='vad'):
271
+ self.session = onnxruntime.InferenceSession(model_path,providers=EP_list)
272
+ #self.nn = keras.models.load_model(model_path, compile=False)
273
+ print('model loaded from --> ' + model_path)
274
+ # self.nn.summary()
275
+ self.batch_size = batch_size
276
+
277
+ def __call__(self, mspec, lseg, difflen = 0):
278
+ """
279
+ *** input
280
+ * mspec: mel spectrogram
281
+ * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
282
+ * difflen: 0 if the original length of the mel spectrogram is >= 68
283
+ otherwise it is set to 68 - length(mspec)
284
+ *** output
285
+ a list of adjacent tuples (label, start, stop)
286
+ """
287
+ if self.nmel < 24:
288
+ mspec = mspec[:, :self.nmel].copy()
289
+
290
+ patches, finite = _get_patches(mspec, 68, 2)
291
+ if difflen > 0:
292
+ patches = patches[:-int(difflen / 2), :, :]
293
+ finite = finite[:-int(difflen / 2)]
294
+
295
+ assert len(finite) == len(patches), (len(patches), len(finite))
296
+
297
+ batch = []
298
+ for lab, start, stop in lseg:
299
+ if lab == self.inlabel:
300
+ batch.append(patches[start:stop, :])
301
+
302
+ if len(batch) > 0:
303
+
304
+ batch = np.concatenate(batch)
305
+ batches = get_path_3d(batch , self.batch_size,)
306
+
307
+
308
+ #rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
309
+ input_name = self.session.get_inputs()[0].name
310
+ rawpred=[]
311
+ for batch in tqdm(batches):
312
+ rawpred.append(self.session.run(None, {input_name: batch})[0])
313
+
314
+ rawpred = np.concatenate(rawpred)
315
+
316
+
317
+ ret = []
318
+ for lab, start, stop in lseg:
319
+ if lab != self.inlabel:
320
+ ret.append((lab, start, stop))
321
+ continue
322
+
323
+ l = stop - start
324
+ r = rawpred[:l]
325
+ rawpred = rawpred[l:]
326
+ r[finite[start:stop] == False, :] = 0.5
327
+ pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
328
+ for lab2, start2, stop2 in _binidx2seglist(pred):
329
+ ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
330
+ return ret
331
+
332
+
333
+ class SpeechMusic(DnnSegmenter):
334
+ # Voice activity detection: requires energetic activity detection
335
+ outlabels = ('speech', 'music')
336
+ inlabel = 'energy'
337
+ nmel = 21
338
+ viterbi_arg = 150
339
+
340
+
341
+ class SpeechMusicNoise(DnnSegmenter):
342
+ # Voice activity detection: requires energetic activity detection
343
+ outlabels = ('speech', 'music', 'noise')
344
+ inlabel = 'energy'
345
+ nmel = 21
346
+ viterbi_arg = 80
347
+
348
+
349
+ class Gender(DnnSegmenter):
350
+ # Gender Segmentation, requires voice activity detection
351
+ outlabels = ('female', 'male')
352
+ inlabel = 'speech'
353
+ nmel = 24
354
+ viterbi_arg = 80
355
+
356
+
357
+
358
+ class Segmenter:
359
+
360
+
361
+ def __init__(self, vad_type = 'sad' , vad_engine='smn', detect_gender=False, sr=16000, batch_size=32 , complete_output=False,model_path="c:\\keras_speech_music_noise_cnn.onnx",gender_path="c:\\keras_male_female_cnn.onnx" , ffmpeg_path='c:\\ffmpeg.exe',device='cuda'):
362
+ """
363
+ Load neural network models
364
+
365
+ Input:
366
+
367
+ 'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
368
+ 'sm' was used in the results presented in ICASSP 2017 paper
369
+ and in MIREX 2018 challenge submission
370
+ 'smn' has been implemented more recently and has not been evaluated in papers
371
+
372
+ 'detect_gender': if False, speech excerpts are returned labelled as 'speech'
373
+ if True, speech excerpts are split into 'male' and 'female' segments
374
+ """
375
+ self.complete_output = complete_output
376
+ self.sample_rate = sr
377
+ self.ffmpeg_path=ffmpeg_path
378
+
379
+
380
+ if (device != 'cuda'):
381
+ os.environ["CUDA_DEVICE_ORDER"]= '-1'
382
+ EP_list=[ 'CPUExecutionProvider']
383
+ else:
384
+ EP_list=['CUDAExecutionProvider']
385
+
386
+ import tensorflow as tf
387
+
388
+ config = tf.compat.v1.ConfigProto()
389
+ config.gpu_options.allow_growth = True
390
+ config.log_device_placement = True
391
+ sess = tf.compat.v1.Session(config=config)
392
+ set_session(sess)
393
+
394
+
395
+
396
+ # self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks
397
+
398
+
399
+ # select speech/music or speech/music/noise voice activity detection engine
400
+ assert vad_engine in ['sm', 'smn']
401
+ if vad_engine == 'sm':
402
+ self.vad = SpeechMusic(batch_size)
403
+ elif vad_engine == 'smn':
404
+ self.vad = SpeechMusicNoise(batch_size , vad_type,model_path,EP_list)
405
+
406
+ # load gender detection NN if required
407
+ assert detect_gender in [True, False]
408
+ self.detect_gender = detect_gender
409
+ if detect_gender:
410
+ self.gender = Gender(batch_size , vad_type ,gender_path,EP_list)
411
+ self.vad_type = vad_type
412
+ self.model_path = model_path
413
+ self.gender_path = gender_path
414
+
415
+ def segment_feats(self, mspec, loge, difflen, start_sec):
416
+ """
417
+ perform segmentation
418
+ requires input corresponding to a wav file sampled at 16000 Hz
419
+ with a single channel
420
+ """
421
+
422
+
423
+
424
+
425
+ # perform energy-based activity detection
426
+ lseg = []
427
+ vadseg=[]
428
+ for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
429
+ if lab == 0:
430
+ lab = 'noEnergy'
431
+ else:
432
+ lab = 'energy'
433
+ vadseg.append(('speech', start, stop))
434
+ lseg.append((lab, start, stop))
435
+ if (self.vad_type == 'vad'):
436
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , stop-start) for lab, start, stop in vadseg]
437
+ # perform voice activity detection
438
+ lseg = self.vad(mspec, lseg, difflen)
439
+
440
+
441
+
442
+
443
+ # perform gender segmentation on speech segments
444
+ if self.detect_gender:
445
+ lseg = self.gender(mspec, lseg, difflen)
446
+ if (self.complete_output):
447
+ return [(lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02) for lab, start, stop in lseg ]
448
+ else:
449
+ return [[lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02] for lab, start, stop in lseg if (lab=='male' or lab=="female" or lab=="speech")]
450
+
451
+
452
+ def __call__(self, medianame, input_type='file',start_sec=None, stop_sec=None):
453
+ """
454
+ Return segmentation of a given file
455
+ * convert file to wav 16k mono with ffmpeg
456
+ * call NN segmentation procedures
457
+ * media_name: path to the media to be processed (including remote url)
458
+ may include any format supported by ffmpeg
459
+ * tmpdir: allows defining a custom path for storing temporary files
460
+ fast read/write disks are a good choice
461
+ * start_sec (seconds): sound stream before start_sec won't be processed
462
+ * stop_sec (seconds): sound stream after stop_sec won't be processed
463
+ """
464
+
465
+
466
+ mspec, loge, difflen , me = media2feats(medianame, input_type ,self.sample_rate,ffmpeg_path=self.ffmpeg_path)
467
+
468
+ if start_sec is None:
469
+ start_sec = 0
470
+ # do segmentation
471
+ return self.segment_feats(mspec, loge, difflen, start_sec),me
472
+
473
+
474
+ def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
475
+
476
+ if verbose:
477
+ print('batch_processing %d files' % len(linput))
478
+
479
+ if output_format == 'csv':
480
+ fexport = seg2csv
481
+ elif output_format == 'textgrid':
482
+ fexport = seg2textgrid
483
+ else:
484
+ raise NotImplementedError()
485
+
486
+ t_batch_start = time.time()
487
+
488
+ lmsg = []
489
+ fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
490
+ i = 0
491
+ for feats, msg in fg:
492
+ lmsg += msg
493
+ i += len(msg)
494
+ if verbose:
495
+ print('%d/%d' % (i, len(linput)), msg)
496
+ if feats is None:
497
+ break
498
+ mspec, loge, difflen = feats
499
+ #if verbose == True:
500
+ # print(i, linput[i], loutput[i])
501
+ b = time.time()
502
+ lseg = self.segment_feats(mspec, loge, difflen, 0)
503
+ fexport(lseg, loutput[len(lmsg) -1])
504
+ lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))
505
+
506
+ t_batch_dur = time.time() - t_batch_start
507
+ nb_processed = len([e for e in lmsg if e[1] == 0])
508
+ if nb_processed > 0:
509
+ avg = t_batch_dur / nb_processed
510
+ else:
511
+ avg = -1
512
+ return t_batch_dur, nb_processed, avg, lmsg
513
+
514
+
515
+ def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
516
+ """
517
+ To be used when processing batches
518
+ if resulting file exists, it is skipped
519
+ in case of remote files, access is tried nbtry times
520
+ """
521
+ ret = None
522
+ msg = []
523
+ while ret is None and len(lin) > 0:
524
+ src = lin.pop(0)
525
+ dst = lout.pop(0)
526
+ # print('popping', src)
527
+
528
+ # if the file exists: skip it
529
+ if skipifexist and os.path.exists(dst):
530
+ msg.append((dst, 1, 'already exists'))
531
+ continue
532
+
533
+ # create storing directory if required
534
+ dname = os.path.dirname(dst)
535
+ if not os.path.isdir(dname):
536
+ os.makedirs(dname)
537
+
538
+ itry = 0
539
+ while ret is None and itry < nbtry:
540
+ try:
541
+ ret = media2feats(src)
542
+ except:
543
+ itry += 1
544
+ errmsg = sys.exc_info()[0]
545
+ if itry != nbtry:
546
+ time.sleep(random.random() * trydelay)
547
+ if ret is None:
548
+ msg.append((dst, 2, 'error: ' + str(errmsg)))
549
+ else:
550
+ msg.append((dst, 0, 'ok'))
551
+
552
+ return ret, msg
553
+
554
+
555
+ def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
556
+ # print('init feat gen', len(ilist))
557
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
558
+ thread.start()
559
+ while True:
560
+ ret, msg = thread.join()
561
+ # print('join done', len(ilist))
562
+ # print('new list', ilist)
563
+ #ilist = ilist[len(msg):]
564
+ #olist = olist[len(msg):]
565
+ if len(ilist) == 0:
566
+ break
567
+ thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay,sampling_rate])
568
+ thread.start()
569
+ yield ret, msg
570
+ yield ret, msg
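The ONNX variant above swaps the Keras predict() call for an onnxruntime session and slices patches into fixed-size batches with get_path_3d. A small sketch of that helper, assuming the module's dependencies are installed (shapes are illustrative):

import numpy as np
from sad_tf.segmentero import get_path_3d

patches = np.random.rand(70, 68, 21).astype(np.float32)   # (n_patches, frames, mel bands)
batches = get_path_3d(patches, 32)
print([b.shape for b in batches])    # [(32, 68, 21), (32, 68, 21), (6, 68, 21)]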
sad_tf/sidekit_mfcc.py ADDED
@@ -0,0 +1,379 @@
1
+
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # This file is part of SIDEKIT.
5
+ #
6
+ # The following code has been copy-pasted from SIDEKIT source files:
7
+ # frontend/features.py frontend/io.py frontend/vad.py
8
+ #
9
+ # SIDEKIT is a python package for speaker verification.
10
+ # Home page: http://www-lium.univ-lemans.fr/sidekit/
11
+ #
12
+ # SIDEKIT is a python package for speaker verification.
13
+ # Home page: http://www-lium.univ-lemans.fr/sidekit/
14
+ #
15
+ # SIDEKIT is free software: you can redistribute it and/or modify
16
+ # it under the terms of the GNU Lesser General Public License as
17
+ # published by the Free Software Foundation, either version 3 of the License,
18
+ # or (at your option) any later version.
19
+ #
20
+ # SIDEKIT is distributed in the hope that it will be useful,
21
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ # GNU Lesser General Public License for more details.
24
+ #
25
+ # You should have received a copy of the GNU Lesser General Public License
26
+ # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
27
+
28
+ """
29
+ Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
30
+
31
+ :mod:`frontend` provides methods to process an audio signal in order to extract
32
+ useful parameters for speaker verification.
33
+ """
34
+
35
+
36
+ import numpy
37
+ import soundfile
38
+ import scipy
39
+ from scipy.fftpack.realtransforms import dct
40
+
41
+
42
+ __author__ = "Anthony Larcher and Sylvain Meignier"
43
+ __copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
44
+ __license__ = "LGPL"
45
+ __maintainer__ = "Anthony Larcher"
46
+ __email__ = "[email protected]"
47
+ __status__ = "Production"
48
+ __docformat__ = 'reStructuredText'
49
+
50
+
51
+
52
+
53
+ wav_flag = "float32" # Could be "int16"
54
+ PARAM_TYPE = numpy.float32
55
+
56
+
57
+ def read_wav(input_file_name):
58
+ """
59
+ :param input_file_name:
60
+ :return:
61
+ """
62
+ #with wave.open(input_file_name, "r") as wfh:
63
+ # (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams()
64
+ # raw = wfh.readframes(nframes * nchannels)
65
+ # out = struct.unpack_from("%dh" % nframes * nchannels, raw)
66
+ # sig = numpy.reshape(numpy.array(out), (-1, nchannels)).squeeze()
67
+ # return sig.astype(numpy.float32), framerate, sampwidth
68
+ nfo = soundfile.info(input_file_name)
69
+ sig, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
70
+ sig = numpy.reshape(numpy.array(sig), (-1, nfo.channels)).squeeze()
71
+ sig = sig.astype(numpy.float32)
72
+ return sig, sample_rate, 4
73
+
74
+
75
+
76
+
77
+ def hz2mel(f, htk=True):
78
+ """Convert an array of frequency in Hz into mel.
79
+
80
+ :param f: frequency to convert
81
+
82
+ :return: the equivalence on the mel scale.
83
+ """
84
+ if htk:
85
+ return 2595 * numpy.log10(1 + f / 700.)
86
+ else:
87
+ f = numpy.array(f)
88
+
89
+ # Mel fn to match Slaney's Auditory Toolbox mfcc.m
90
+ # Mel fn to match Slaney's Auditory Toolbox mfcc.m
91
+ f_0 = 0.
92
+ f_sp = 200. / 3.
93
+ brkfrq = 1000.
94
+ brkpt = (brkfrq - f_0) / f_sp
95
+ logstep = numpy.exp(numpy.log(6.4) / 27)
96
+
97
+ linpts = f < brkfrq
98
+
99
+ z = numpy.zeros_like(f)
100
+ # fill in parts separately
101
+ z[linpts] = (f[linpts] - f_0) / f_sp
102
+ z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)
103
+
104
+ if z.shape == (1,):
105
+ return z[0]
106
+ else:
107
+ return z
108
+
109
+ def mel2hz(z, htk=True):
110
+ """Convert an array of mel values in Hz.
111
+
112
+ :param m: ndarray of frequencies to convert in Hz.
113
+
114
+ :return: the equivalent values in Hertz.
115
+ """
116
+ if htk:
117
+ return 700. * (10**(z / 2595.) - 1)
118
+ else:
119
+ z = numpy.array(z, dtype=float)
120
+ f_0 = 0
121
+ f_sp = 200. / 3.
122
+ brkfrq = 1000.
123
+ brkpt = (brkfrq - f_0) / f_sp
124
+ logstep = numpy.exp(numpy.log(6.4) / 27)
125
+
126
+ linpts = (z < brkpt)
127
+
128
+ f = numpy.zeros_like(z)
129
+
130
+ # fill in parts separately
131
+ f[linpts] = f_0 + f_sp * z[linpts]
132
+ f[~linpts] = brkfrq * numpy.exp(numpy.log(logstep) * (z[~linpts] - brkpt))
133
+
134
+ if f.shape == (1,):
135
+ return f[0]
136
+ else:
137
+ return f
138
+
139
+
140
+
141
+ def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
142
+ """Compute triangular filterbank for cepstral coefficient computation.
143
+
144
+ :param fs: sampling frequency of the original signal.
145
+ :param nfft: number of points for the Fourier Transform
146
+ :param lowfreq: lower limit of the frequency band filtered
147
+ :param maxfreq: higher limit of the frequency band filtered
148
+ :param nlinfilt: number of linear filters to use in low frequencies
149
+ :param nlogfilt: number of log-linear filters to use in high frequencies
150
+ :param midfreq: frequency boundary between linear and log-linear filters
151
+
152
+ :return: the filter bank and the central frequencies of each filter
153
+ """
154
+ # Total number of filters
155
+ nfilt = nlinfilt + nlogfilt
156
+
157
+ # ------------------------
158
+ # Compute the filter bank
159
+ # ------------------------
160
+ # Compute start/middle/end points of the triangular filters in spectral
161
+ # domain
162
+ frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
163
+ if nlogfilt == 0:
164
+ linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
165
+ frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
166
+ elif nlinfilt == 0:
167
+ low_mel = hz2mel(lowfreq)
168
+ max_mel = hz2mel(maxfreq)
169
+ mels = numpy.zeros(nlogfilt + 2)
170
+ # mels[nlinfilt:]
171
+ melsc = (max_mel - low_mel) / (nfilt + 1)
172
+ mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
173
+ # Back to the frequency domain
174
+ frequences = mel2hz(mels)
175
+ else:
176
+ # Compute linear filters on [0;1000Hz]
177
+ linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
178
+ frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
179
+ # Compute log-linear filters on [1000;maxfreq]
180
+ low_mel = hz2mel(min([1000, maxfreq]))
181
+ max_mel = hz2mel(maxfreq)
182
+ mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
183
+ melsc = (max_mel - low_mel) / (nlogfilt + 1)
184
+
185
+ # Verify that mel2hz(melsc)>linsc
186
+ while mel2hz(melsc) < linsc:
187
+ # in this case, we add a linear filter
188
+ nlinfilt += 1
189
+ nlogfilt -= 1
190
+ frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
191
+ low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
192
+ max_mel = hz2mel(maxfreq)
193
+ mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
194
+ melsc = (max_mel - low_mel) / (nlogfilt + 1)
195
+
196
+ mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
197
+ # Back to the frequency domain
198
+ frequences[nlinfilt:] = mel2hz(mels)
199
+
200
+ heights = 2. / (frequences[2:] - frequences[0:-2])
201
+
202
+ # Compute filterbank coeff (in fft domain, in bins)
203
+ fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
204
+ # FFT bins (in Hz)
205
+ n_frequences = numpy.arange(nfft) / (1. * nfft) * fs
206
+
207
+ for i in range(nfilt):
208
+ low = frequences[i]
209
+ cen = frequences[i + 1]
210
+ hi = frequences[i + 2]
211
+ # numpy.int is gone from recent NumPy releases; use the explicit int32 dtype directly
+ lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=numpy.int32)
+ left_slope = heights[i] / (cen - low)
+ rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1, min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=numpy.int32)
220
+ right_slope = heights[i] / (hi - cen)
221
+ fbank[i][lid] = left_slope * (n_frequences[lid] - low)
222
+ fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])
223
+
224
+ return fbank, frequences
225
+
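As an illustration only, building a 24-filter mel bank for 16 kHz audio and a 512-point FFT (parameter values chosen to mirror the mfcc defaults below) could look like this:

import numpy

fbank, freqs = trfbank(fs=16000, nfft=512, lowfreq=100, maxfreq=8000,
                       nlinfilt=0, nlogfilt=24)
print(fbank.shape)   # (24, 257): one triangular filter per row, one FFT bin per column
print(freqs[:3])     # lowest filter edge/centre frequencies of the bank, in Hz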
226
+
227
+ def power_spectrum(input_sig,
228
+ fs=8000,
229
+ win_time=0.025,
230
+ shift=0.01,
231
+ prefac=0.97):
232
+ """
233
+ Compute the power spectrum of the signal.
234
+ :param input_sig: input signal as a 1-d ndarray
+ :param fs: sampling frequency in Hz
+ :param win_time: analysis window length in seconds
+ :param shift: shift between two analysis windows in seconds
+ :param prefac: pre-emphasis coefficient
+ :return: the power spectrum of each frame and the per-frame log-energy
240
+ """
241
+ window_length = int(round(win_time * fs))
242
+ overlap = window_length - int(shift * fs)
243
+ framed = framing(input_sig, window_length, win_shift=window_length-overlap).copy()
244
+ # Pre-emphasis filtering is applied after framing to be consistent with stream processing
245
+ framed = pre_emphasis(framed, prefac)
246
+ l = framed.shape[0]
247
+ n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
248
+ # Windowing has been changed to Hanning, whose sidelobes are less noisy than Hamming's
249
+ # ham = numpy.hamming(window_length)
250
+ window = numpy.hanning(window_length)
251
+
252
+ spec = numpy.ones((l, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
253
+ log_energy = numpy.log((framed**2).sum(axis=1))
254
+ dec = 500000
255
+ start = 0
256
+ stop = min(dec, l)
257
+ while start < l:
258
+ ahan = framed[start:stop, :] * window
259
+ mag = numpy.fft.rfft(ahan, n_fft, axis=-1)
260
+ spec[start:stop, :] = mag.real**2 + mag.imag**2
261
+ start = stop
262
+ stop = min(stop + dec, l)
263
+
264
+ return spec, log_energy
265
+
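A hedged sketch of calling power_spectrum on a made-up one-second 8 kHz signal:

import numpy

sig = numpy.random.randn(8000).astype(numpy.float32)   # stand-in signal, not real audio
spec, log_energy = power_spectrum(sig, fs=8000, win_time=0.025, shift=0.01)
# 25 ms windows give 200 samples per frame, padded to a 256-point FFT -> 129 bins
print(spec.shape, log_energy.shape)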
266
+
267
+ def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
268
+ """
269
+ :param sig: input signal, can be mono or multi dimensional
270
+ :param win_size: size of the window in samples
+ :param win_shift: shift of the sliding window in samples
272
+ :param context: tuple of left and right context
273
+ :param pad: can be zeros or edge
274
+ """
275
+ dsize = sig.dtype.itemsize
276
+ if sig.ndim == 1:
277
+ sig = sig[:, numpy.newaxis]
278
+ # Manage padding
279
+ c = (context, ) + (sig.ndim - 1) * ((0, 0), )
280
+ _win_size = win_size + sum(context)
281
+ shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
282
+ strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1]))
283
+ if pad == 'zeros':
284
+ return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'constant', constant_values=(0,)),
285
+ shape=shape,
286
+ strides=strides).squeeze()
287
+ elif pad == 'edge':
288
+ return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'edge'),
289
+ shape=shape,
290
+ strides=strides).squeeze()
291
+
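The striding logic is easiest to see on a toy signal; this sketch frames ten samples into windows of four with a shift of two:

import numpy

sig = numpy.arange(10, dtype=float)
frames = framing(sig, win_size=4, win_shift=2)
print(frames.shape)   # (4, 4)
print(frames[0])      # [0. 1. 2. 3.]
print(frames[1])      # [2. 3. 4. 5.] -- consecutive windows overlap by two samples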
292
+
293
+ def pre_emphasis(input_sig, pre):
294
+ """Pre-emphasis of an audio signal.
295
+ :param input_sig: the input signal vector to pre-emphasize
296
+ :param pre: value that defines the pre-emphasis filter.
297
+ """
298
+ if input_sig.ndim == 1:
299
+ return (input_sig - numpy.c_[input_sig[numpy.newaxis, :][..., :1],
300
+ input_sig[numpy.newaxis, :][..., :-1]].squeeze() * pre)
301
+ else:
302
+ return input_sig - numpy.c_[input_sig[..., :1], input_sig[..., :-1]] * pre
303
+
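For example (note that the first sample is differenced against itself, so it is attenuated rather than left untouched):

import numpy

x = numpy.array([1.0, 2.0, 3.0, 4.0])
print(pre_emphasis(x, 0.97))   # [0.03 1.03 1.06 1.09]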
304
+
305
+ def mfcc(input_sig,
306
+ lowfreq=100, maxfreq=8000,
307
+ nlinfilt=0, nlogfilt=24,
308
+ nwin=0.025,
309
+ fs=16000,
310
+ nceps=13,
311
+ shift=0.01,
312
+ get_spec=False,
313
+ get_mspec=False,
314
+ prefac=0.97):
315
+ """Compute Mel Frequency Cepstral Coefficients.
316
+
317
+ :param input_sig: input signal from which the coefficients are computed.
318
+ Input audio is expected to be raw 16-bit PCM
319
+ :param lowfreq: lower limit of the frequency band filtered.
320
+ Default is 100Hz.
321
+ :param maxfreq: higher limit of the frequency band filtered.
322
+ Default is 8000Hz.
323
+ :param nlinfilt: number of linear filters to use in low frequencies.
324
+ Default is 0.
325
+ :param nlogfilt: number of log-linear filters to use in high frequencies.
326
+ Default is 24.
327
+ :param nwin: length of the sliding window in seconds
328
+ Default is 0.025.
329
+ :param fs: sampling frequency of the original signal. Default is 16000Hz.
330
+ :param nceps: number of cepstral coefficients to extract.
331
+ Default is 13.
332
+ :param shift: shift between two analyses. Default is 0.01 (10ms).
333
+ :param get_spec: boolean, if true returns the spectrogram
334
+ :param get_mspec: boolean, if true returns the output of the filter banks
335
+ :param prefac: pre-emphasis filter value
336
+
337
+ :return: the cepstral coefficients in an ndarray as well as
338
+ the Log-spectrum in the mel-domain in a ndarray.
339
+
340
+ .. note:: MFCC are computed as follows:
341
+
342
+ - Pre-processing in time-domain (pre-emphasizing)
343
+ - Compute the spectrum amplitude by windowing with a Hanning window
+ - Filter the signal in the spectral domain with a triangular filter-bank, whose filters are approximately
+ linearly spaced on the mel scale, and have equal bandwidth in the mel scale
+ - Compute the DCT of the log-spectrum
347
+ - Log-energy is returned as first coefficient of the feature vector.
348
+
349
+ For more details, refer to [Davis80]_.
350
+ """
351
+ # Compute power spectrum
352
+ spec, log_energy = power_spectrum(input_sig,
353
+ fs,
354
+ win_time=nwin,
355
+ shift=shift,
356
+ prefac=prefac)
357
+ # Filter the spectrum through the triangle filter-bank
358
+ n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
359
+ fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
360
+
361
+ mspec = numpy.log(numpy.dot(spec, fbank.T)) # TODO: compare natural log and log10
362
+ # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
363
+ # The C0 term is removed as it is the constant term
364
+ # ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]
365
+ # Returned list layout: [cepstra (not computed here, None), log_energy, spec or None, mspec or None]
+ lst = list()
+ lst.append(None)
+ lst.append(log_energy)
368
+ if get_spec:
369
+ lst.append(spec)
370
+ else:
371
+ lst.append(None)
372
+ del spec
373
+ if get_mspec:
374
+ lst.append(mspec)
375
+ else:
376
+ lst.append(None)
377
+ del mspec
378
+
379
+ return lst
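To tie the above together, a hedged end-to-end sketch; the two seconds of noise stand in for real 16 kHz audio:

import numpy

sig = numpy.random.randn(2 * 16000).astype(numpy.float32)
# cepstra are not computed by this implementation, so the first slot is None
_, log_energy, _, mspec = mfcc(sig, fs=16000, get_mspec=True)
print(mspec.shape)        # (n_frames, 24) log mel-band energies
print(log_energy.shape)   # (n_frames,)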
sad_tf/thread_returning.py ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Tue Mar 27 15:18:49 2018
5
+
6
+ @author: elechapt
7
+ """
8
+
9
+ from threading import Thread
10
+
11
+ class ThreadReturning(Thread):
12
+ """
13
+ Allow us to get the results from a thread
14
+ """
15
+ def __init__(self, *args, **kwargs):
16
+ Thread.__init__(self, *args, **kwargs)
17
+ self._return = None
18
+
19
+ def run(self):
20
+ if self._target is not None:
21
+ self._return = self._target(*self._args, **self._kwargs)
22
+
23
+ def join(self, timeout=None):
+ # forward the optional timeout to Thread.join instead of silently dropping it
+ Thread.join(self, timeout)
+ return self._return
26
+
27
+
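A short usage sketch; slow_square is a hypothetical worker standing in for real work such as feature extraction:

import time

def slow_square(x):
    time.sleep(0.1)   # pretend work
    return x * x

t = ThreadReturning(target=slow_square, args=(7,))
t.start()
print(t.join())   # 49 -- join() hands back the target's return value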
sad_tf/viterbi.py ADDED
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License (MIT)
5
+
6
+ # Copyright (c) 2014-2016 CNRS
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ # SOFTWARE.
25
+
26
+ # AUTHORS
27
+ # Hervé BREDIN - http://herve.niderb.fr
28
+
29
+ from __future__ import unicode_literals
30
+
31
+ import six.moves
32
+ import numpy as np
33
+ import itertools
34
+
35
+ VITERBI_CONSTRAINT_NONE = 0
36
+ VITERBI_CONSTRAINT_FORBIDDEN = 1
37
+ VITERBI_CONSTRAINT_MANDATORY = 2
38
+
39
+
40
+ LOG_ZERO = np.log(1e-200)
41
+
42
+ # handling 'consecutive' constraints is achieved by duplicating states
43
+ # the following functions are here to help in this process
44
+
45
+
46
+ # create new transition prob. matrix accounting for duplicated states.
47
+ def _update_transition(transition, consecutive):
48
+
49
+ # initialize with LOG_ZERO everywhere
50
+ # except on the +1 diagonal np.log(1)
51
+ new_n_states = np.sum(consecutive)
52
+ new_transition = LOG_ZERO * np.ones((new_n_states, new_n_states))
53
+ for i in range(1, new_n_states):
54
+ new_transition[i - 1, i] = np.log(1)
55
+
56
+ n_states = len(consecutive)
57
+ boundary = np.hstack(([0], np.cumsum(consecutive)))
58
+ start = boundary[:-1]
59
+ end = boundary[1:] - 1
60
+
61
+ for i, j in itertools.product(six.moves.range(n_states), repeat=2):
62
+ new_transition[end[i], start[j]] = transition[i, j]
63
+
64
+ return new_transition
65
+
66
+
67
+ # create new initial prob. matrix accounting for duplicated states.
68
+ def _update_initial(initial, consecutive):
69
+
70
+ new_n_states = np.sum(consecutive)
71
+ new_initial = LOG_ZERO * np.ones((new_n_states, ))
72
+
73
+ n_states = len(consecutive)
74
+ boundary = np.hstack(([0], np.cumsum(consecutive)))
75
+ start = boundary[:-1]
76
+
77
+ for i in range(n_states):
78
+ new_initial[start[i]] = initial[i]
79
+
80
+ return new_initial
81
+
82
+
83
+ # create new emission prob. matrix accounting for duplicated states.
84
+ def _update_emission(emission, consecutive):
85
+
86
+ # newer NumPy requires an explicit sequence rather than a generator here
+ return np.vstack([
+ np.tile(e, (c, 1)) # duplicate emission probabilities c times
+ for e, c in six.moves.zip(emission.T, consecutive)
+ ]).T
90
+
91
+
92
+ # create new constraint matrix accounting for duplicated states
93
+ def _update_constraint(constraint, consecutive):
94
+
95
+ # newer NumPy requires an explicit sequence rather than a generator here
+ return np.vstack([
+ np.tile(e, (c, 1)) # duplicate constraint probabilities c times
+ for e, c in six.moves.zip(constraint.T, consecutive)
+ ]).T
99
+
100
+
101
+ # convert sequence of duplicated states back to sequence of original states.
102
+ def _update_states(states, consecutive):
103
+
104
+ boundary = np.hstack(([0], np.cumsum(consecutive)))
105
+ start = boundary[:-1]
106
+ end = boundary[1:]
107
+
108
+ new_states = np.empty(states.shape)
109
+
110
+ for i, (s, e) in enumerate(six.moves.zip(start, end)):
111
+ new_states[np.where((s <= states) & (states < e))] = i
112
+
113
+ return new_states
114
+
115
+
116
+ def viterbi_decoding(emission, transition,
117
+ initial=None, consecutive=None, constraint=None):
118
+ """(Constrained) Viterbi decoding
119
+
120
+ Parameters
121
+ ----------
122
+ emission : array of shape (n_samples, n_states)
123
+ E[t, i] is the emission log-probability of sample t at state i.
124
+ transition : array of shape (n_states, n_states)
125
+ T[i, j] is the transition log-probability from state i to state j.
126
+ initial : optional, array of shape (n_states, )
127
+ I[i] is the initial log-probability of state i.
128
+ Defaults to equal log-probabilities.
129
+ consecutive : optional, int or int array of shape (n_states, )
130
+ C[i] is the minimum-consecutive-states constraint for state i.
131
+ C[i] = 1 is equivalent to no constraint (default).
132
+ constraint : optional, array of shape (n_samples, n_states)
133
+ K[t, i] = 1 forbids state i at time t.
134
+ K[t, i] = 2 forces state i at time t.
135
+ Use K[t, i] = 0 for no constraint (default).
136
+
137
+ Returns
138
+ -------
139
+ states : array of shape (n_samples, )
140
+ Most probable state sequence
141
+
142
+ """
143
+
144
+ # ~~ INITIALIZATION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145
+
146
+ T, k = emission.shape # number of observations x number of states
147
+
148
+ # no minimum-consecutive-states constraints
149
+ if consecutive is None:
150
+ consecutive = np.ones((k, ), dtype=int)
151
+
152
+ # same value for all states
153
+ elif isinstance(consecutive, int):
154
+ consecutive = consecutive * np.ones((k, ), dtype=int)
155
+
156
+ # (potentially) different values per state
157
+ else:
158
+ consecutive = np.array(consecutive, dtype=int).reshape((k, ))
159
+
160
+ # at least one sample
161
+ consecutive = np.maximum(1, consecutive)
162
+
163
+ # balance initial probabilities when they are not provided
164
+ if initial is None:
165
+ initial = np.log(np.ones((k, )) / k)
166
+
167
+ # no constraint?
168
+ if constraint is None:
169
+ constraint = VITERBI_CONSTRAINT_NONE * np.ones((T, k))
170
+
171
+ # artificially create new states to account for 'consecutive' constraints
172
+ emission = _update_emission(emission, consecutive)
173
+ transition = _update_transition(transition, consecutive)
174
+ initial = _update_initial(initial, consecutive)
175
+ constraint = _update_constraint(constraint, consecutive)
176
+ T, K = emission.shape # number of observations x number of new states
177
+ states = np.arange(K) # states 0 to K-1
178
+
179
+ # set emission probability to zero for forbidden states
180
+ emission[
181
+ np.where(constraint == VITERBI_CONSTRAINT_FORBIDDEN)] = LOG_ZERO
182
+
183
+ # set emission probability to zero for all states but the mandatory one
184
+ for t, k in six.moves.zip(
185
+ *np.where(constraint == VITERBI_CONSTRAINT_MANDATORY)
186
+ ):
187
+ emission[t, states != k] = LOG_ZERO
188
+
189
+ # ~~ FORWARD PASS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
190
+
191
+ V = np.empty((T, K)) # V[t, k] is the probability of the
192
+ V[0, :] = emission[0, :] + initial # most probable state sequence for the
193
+ # first t observations that has k as
194
+ # its final state.
195
+
196
+ P = np.empty((T, K), dtype=int) # P[t, k] remembers which state was used
197
+ P[0, :] = states # to get from time t-1 to time t at
198
+ # state k
199
+
200
+ for t in range(1, T):
201
+
202
+ # tmp[k, k'] is the probability of the most probable path
203
+ # leading to state k at time t - 1, plus the probability of
204
+ # transitioning from state k to state k' (at time t)
205
+ tmp = (V[t - 1, :] + transition.T).T
206
+
207
+ # optimal path to state k at t comes from state P[t, k] at t - 1
208
+ # (find among all possible states at this time t)
209
+ P[t, :] = np.argmax(tmp, axis=0)
210
+
211
+ # update V for time t
212
+ V[t, :] = emission[t, :] + tmp[P[t, :], states]
213
+
214
+ # ~~ BACK-TRACKING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
215
+ X = np.empty((T,), dtype=int)
216
+ X[-1] = np.argmax(V[-1, :])
217
+ for t in range(1, T):
218
+ X[-(t + 1)] = P[-t, X[-t]]
219
+
220
+ # ~~ CONVERT BACK TO ORIGINAL STATES
221
+
222
+ return _update_states(X, consecutive)
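A toy decoding, for illustration only: two states, four samples, mildly sticky transitions, and emissions that briefly favour state 1 in the middle of the sequence:

import numpy as np

emission = np.log(np.array([[0.9, 0.1],
                            [0.2, 0.8],
                            [0.3, 0.7],
                            [0.8, 0.2]]))
transition = np.log(np.array([[0.7, 0.3],
                              [0.3, 0.7]]))
print(viterbi_decoding(emission, transition))   # expected: [0. 1. 1. 0.]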
sad_tf/viterbi_utils.py ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ # The MIT License
5
+
6
+ # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
+
8
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ # of this software and associated documentation files (the "Software"), to deal
10
+ # in the Software without restriction, including without limitation the rights
11
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ # copies of the Software, and to permit persons to whom the Software is
13
+ # furnished to do so, subject to the following conditions:
14
+
15
+ # The above copyright notice and this permission notice shall be included in
16
+ # all copies or substantial portions of the Software.
17
+
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
+ # THE SOFTWARE.
25
+
26
+ import numpy as np
27
+
28
+
29
+ def pred2logemission(pred, eps=1e-10):
30
+ pred = np.array(pred)
31
+ ret = np.ones((len(pred), 2)) * eps
32
+ ret[pred == 0, 0] = 1 - eps
33
+ ret[pred == 1, 1] = 1 - eps
34
+ return np.log(ret)
35
+
36
+ def log_trans_exp(exp, cost0=0, cost1=0):
37
+ # transition cost is assumed to be 10**-exp
38
+ cost = -exp * np.log(10)
39
+ ret = np.ones((2,2)) * cost
40
+ ret[0,0]= cost0
41
+ ret[1,1]= cost1
42
+ return ret
43
+
44
+ def diag_trans_exp(exp, dim):
45
+ cost = -exp * np.log(10)
46
+ ret = np.ones((dim, dim)) * cost
47
+ for i in range(dim):
48
+ ret[i, i] = 0
49
+ return ret
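A hedged sketch of how these helpers might be combined with viterbi_decoding (assumed to be importable from the neighbouring viterbi module) to smooth a noisy binary frame-level prediction:

import numpy as np

raw_pred = [0, 0, 0, 1, 0, 0, 1, 1, 1, 1]     # isolated spike at index 3
emission = pred2logemission(raw_pred)          # (10, 2) log-emission matrix
transition = log_trans_exp(6)                  # off-diagonal transition cost of 10**-6
smoothed = viterbi_decoding(emission, transition)
print(smoothed)   # expected: [0. 0. 0. 0. 0. 0. 1. 1. 1. 1.] -- the spike is smoothed away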