imansarraf committed • Commit ab53da2 • Parent(s): d1a38da
Upload 9 files
Browse files
- sad_tf/__init__.py +2 -0
- sad_tf/export_funcs.py +238 -0
- sad_tf/features.py +62 -0
- sad_tf/segmenter_for_tf2_16.py +555 -0
- sad_tf/segmentero.py +570 -0
- sad_tf/sidekit_mfcc.py +379 -0
- sad_tf/thread_returning.py +27 -0
- sad_tf/viterbi.py +222 -0
- sad_tf/viterbi_utils.py +49 -0
sad_tf/__init__.py ADDED
@@ -0,0 +1,2 @@
from .segmenter import Segmenter, filter_output, filter_sig
from .export_funcs import seg2aud, seg2json, seg2Gender_Info, seg2Info
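
For orientation, these two lines are the package's whole public surface. A minimal usage sketch, assuming a sad_tf/segmenter.py module (not part of this upload) that exposes the same Segmenter API as the segmenter variants below, local model copies instead of the c:\ defaults, and an ffmpeg binary at the given path:

from sad_tf import Segmenter, filter_output, seg2aud

seg = Segmenter(detect_gender=True,
                model_path='keras_speech_music_noise_cnn.hdf5',  # assumed local copy
                gender_path='keras_male_female_cnn.hdf5',        # assumed local copy
                ffmpeg_path='/usr/bin/ffmpeg')
segments, wav = seg('sample.wav')   # segments: [label, start_s, stop_s, dur_s] per entry
segments = filter_output(segments)  # merge near-adjacent speech, drop tiny chunks; -1 if empty
seg2aud(segments, 'sample.aud')     # tab-separated start/stop/label lines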
sad_tf/export_funcs.py ADDED
@@ -0,0 +1,238 @@
#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import pandas as pd
from pytextgrid.PraatTextGrid import PraatTextGrid, Interval, Tier
import os
import json

def seg2csv(lseg, fout=None):
    df = pd.DataFrame.from_records(lseg, columns=['labels', 'start', 'stop'])
    df.to_csv(fout, sep='\t', index=False)

def seg2textgrid1(lseg, fout=None):
    tier = Tier(name='inaSpeechSegmenter')
    for label, start, stop, _ in lseg:
        if (label == 'noEnergy'):
            label = ''
        tier.append(Interval(start, stop, label))
    ptg = PraatTextGrid(xmin=lseg[0][1], xmax=lseg[-1][2])
    ptg.append(tier)
    ptg.save(fout)


def seg2json(lseg):
    # segments carry either 5 fields or 4 fields; try the 5-field layout first
    try:
        return(seg2json5(lseg))
    except:
        return(seg2json4(lseg))


def seg2Info(lseg):
    x = []
    nch = 0
    for segs in lseg:
        f = 0
        nch = nch + 1
        data_list = []
        if (segs != -1):
            for y in segs:
                if (y[0] != 'noEnergy'):
                    f = f + y[2] - y[1]

        data = {
            'channel': nch,
            'speech': f
        }
        x.append(data)
    return(json.dumps(x))


def seg2Gender_Info(lseg):
    x = []
    nch = 0
    for segs in lseg:
        f = 0
        m = 0
        nch = nch + 1
        data_list = []
        if (segs != -1):
            for y in segs:
                if (y[0] != 'noEnergy'):
                    if (y[0] == "female"):
                        f = f + y[2] - y[1]
                    elif (y[0] == "male"):
                        m = m + y[2] - y[1]

        data = {
            'channel': nch,
            'male': m,
            'female': f
        }
        x.append(data)
    return(json.dumps(x))

def seg2json5(lseg):
    x = []
    nch = 0
    for segs in lseg:
        nch = nch + 1
        data_list = []
        if (segs != -1):
            for label, start, stop, _, _ in segs:
                if (label != 'noEnergy'):
                    data = {
                        'startTime': start,
                        'endTime': stop,
                        'gender': label[0]
                    }
                    data_list.append(data)
        data = {
            'channel': nch,
            'segments': data_list
        }
        x.append(data)
    return(json.dumps(x))

def seg2json4(lseg):
    x = []
    nch = 0
    for segs in lseg:
        nch = nch + 1
        data_list = []
        if (segs != -1):
            for label, start, stop, _ in segs:
                if (label != 'noEnergy'):
                    data = {
                        'startTime': start,
                        'endTime': stop,
                        'gender': label[0]
                    }
                    data_list.append(data)
        data = {
            'channel': nch,
            'segments': data_list
        }
        x.append(data)
    return(json.dumps(x))


def seg2aud(lseg, fout=None):
    try:
        seg2aud5(lseg, fout)
    except:
        seg2aud4(lseg, fout)

def seg2aud5(lseg, fout=None):
    if (lseg == -1):
        return
    with open(fout, 'w') as fid:
        for label, start, stop, _, _ in lseg:
            if (label != 'noEnergy'):
                fid.write('%s\t%s\t%s\n' % (start, stop, label))

def seg2aud4(lseg, fout=None):
    if (lseg == -1):
        return
    with open(fout, 'w') as fid:
        for label, start, stop, _ in lseg:
            if (label != 'noEnergy'):
                fid.write('%s\t%s\t%s\n' % (start, stop, label))

def seg2textgrid(data, fout=None):
    # ghabli ("previous" in Persian) marks whether the gap after the previous
    # interval was small enough to be absorbed into the next one
    ghabli = False
    kh = []
    kh.append('File type = "ooTextFile"\n')
    kh.append('Object class = "TextGrid"\n')
    kh.append('\n')
    kh.append('xmin = 0 \n')
    kh.append('xmax = %s \n' % (data[-1][2]))
    kh.append('tiers? <exists> \n')
    kh.append('size = 1 \n')
    kh.append('item []: \n')
    kh.append('    item [1]:\n')
    kh.append('        class = "IntervalTier" \n')
    kh.append('        name = "sen" \n')
    kh.append('        xmin = 0 \n')
    kh.append('        xmax = %s \n' % (data[-1][2]))
    kh.append('        intervals: size = %s \n' % (0))
    x = 1

    if (float(data[0][1]) > 0):
        kh.append('        intervals [%s]:\n' % (x))
        kh.append('            xmin = 0\n')
        kh.append('            xmax = %s \n' % (data[0][1]))
        kh.append('            text = "" \n')
        x = x + 1

    for i in range(len(data)):
        kh.append('        intervals [%s]:\n' % (x))
        if (ghabli):
            kh.append('            xmin = %s \n' % (data[i-1][2]))
        else:
            kh.append('            xmin = %s \n' % (data[i][1]))
        kh.append('            xmax = %s \n' % (data[i][2]))
        kh.append('            text = "%s" \n' % (data[i][0].strip()))
        x = x + 1

        if (i+1 >= len(data)):
            break

        if (data[i][2] != data[i+1][1]):
            if (float(data[i+1][1]) - float(data[i][2]) > 0.5):
                kh.append('        intervals [%s]:\n' % (x))
                kh.append('            xmin = %s \n' % (data[i][2]))
                kh.append('            xmax = %s \n' % (data[i+1][1]))
                kh.append('            text = "" \n')
                x = x + 1
                ghabli = False
            else:
                ghabli = True

    # patch the interval count placeholder (kh[13]) with the index of the last interval written
    kh[13] = ('        intervals: size = %s \n' % (kh[-4].strip().split(' ')[1].replace('[', '').replace(']', '').replace(':', '')))

    with open(fout, mode='w') as fid:
        for line in kh:
            fid.write(line)
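
To make the expected inputs concrete: every exporter takes one entry per audio channel, each entry being either -1 (empty channel) or a list of (label, start, stop, duration[, gap]) segments. A short sketch with made-up values:

ch = [('male', 0.0, 3.2, 3.2), ('noEnergy', 3.2, 4.0, 0.8), ('female', 4.0, 7.5, 3.5)]

print(seg2Gender_Info([ch]))
# [{"channel": 1, "male": 3.2, "female": 3.5}]   (noEnergy time is excluded)

print(seg2json([ch]))  # the 5-field parser fails on 4-field segments, so it falls back to seg2json4
# [{"channel": 1, "segments": [{"startTime": 0.0, "endTime": 3.2, "gender": "m"},
#                              {"startTime": 4.0, "endTime": 7.5, "gender": "f"}]}]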
sad_tf/features.py ADDED
@@ -0,0 +1,62 @@
#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import os
import numpy as np
from iman import Audio

#os.environ['SIDEKIT'] = 'theano=false,libsvm=false,cuda=false'
#from sidekit.frontend.io import read_wav
#from sidekit.frontend.features import mfcc
from .sidekit_mfcc import mfcc


def _wav2feats(wavname, input_type='file', sr=16000, ffmpeg_path='c:\\ffmpeg.exe'):
    """
    Extract features for wav 16k mono
    """
    if (input_type == 'file'):
        sig = Audio.Read(wavname, sr, mono=True, ffmpeg_path=ffmpeg_path)
    else:
        sig = wavname

    read_framerate = sr

    _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True, fs=sr, maxfreq=int(sr/2))

    # Management of short duration segments
    difflen = 0
    if len(loge) < 68:
        difflen = 68 - len(loge)
        mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))

    return mspec, loge, difflen, sig


def media2feats(medianame, input_type='file', sr=16000, ffmpeg_path='c:\\ffmpeg.exe'):
    return _wav2feats(medianame, input_type, sr, ffmpeg_path=ffmpeg_path)
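
A hedged sketch of calling this module directly on raw samples: any input_type other than 'file' makes _wav2feats treat its first argument as an already-decoded 16 kHz mono signal, so the external iman.Audio decoder (and ffmpeg) is only needed on the 'file' path:

import numpy as np
from sad_tf.features import media2feats

sig = np.random.randn(16000 * 3).astype(np.float32)   # 3 s of fake 16 kHz mono audio
mspec, loge, difflen, sig_out = media2feats(sig, input_type='data', sr=16000)
print(mspec.shape, difflen)  # (n_frames, 24) mel bands; difflen > 0 only below 68 frames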
sad_tf/segmenter_for_tf2_16.py ADDED
@@ -0,0 +1,555 @@
#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import warnings
warnings.filterwarnings("ignore")
import os
# os.environ["CUDA_DEVICE_ORDER"]= '0'
import sys
import math
from iman import Audio
import numpy as np
from tensorflow import keras
# from tensorflow.compat.v1.keras.backend import set_session
import tensorflow.python.keras.backend as K
from .thread_returning import ThreadReturning

import shutil
import time
import random

from skimage.util import view_as_windows as vaw

from .viterbi import viterbi_decoding
from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp

from .features import media2feats
from .export_funcs import seg2csv, seg2textgrid


def _energy_activity(loge, ratio=0.4):  ##########0.9
    threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
    raw_activity = (loge > threshold)
    return viterbi_decoding(pred2logemission(raw_activity),
                            log_trans_exp(50, cost0=-5))

#exp(150, cost0=-5)

def filter_sig(isig, wav, sr=16000):
    if (sr != 16000):
        wav = Audio.Resample(wav, 16000, sr)

    try:
        w = []
        wn = []
        wn.append(wav[0 : int(isig[0][1]*sr)])
        for i, xxx in enumerate(isig):
            a = xxx[1]
            b = xxx[2]
            w.append(wav[int(a*sr) : int(b*sr)])
            try:
                wn.append(wav[int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
            except:
                wn.append(wav[int(isig[i][2]*sr) : len(wav)])

        return (np.concatenate(w), np.concatenate(wn))
    except:
        w = []
        wn = []
        wn.append(wav[0 : int(isig[0][1]*sr)])
        for i, [_, a, b, _, _] in enumerate(isig):
            w.append(wav[int(a*sr) : int(b*sr)])
            try:
                wn.append(wav[int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
            except:
                wn.append(wav[int(isig[i][2]*sr) : len(wav)])

        return (np.concatenate(w), np.concatenate(wn))

def filter_output(isig, max_silence=1, ignore_small_speech_segments=0.5, max_speech_len=15, split_speech_bigger_than=20):

    if (len(isig) == 0):
        return -1

    # _dels=[]
    # for i , [_,_,_,_d] in enumerate(isig):
    #     if (_d<=ignore_small_speech_segments) :
    #         _dels.append(i)
    # _dels.reverse()
    # for i in _dels:
    #     del isig[i]

    # if (len(isig)==0):
    #     return -1

    isig = [list(x) for x in isig]

    for i in range(len(isig)-1):
        t = isig[i+1][1] - isig[i][2]  # silence between two chunks
        isig[i].append(t)
    isig[-1].append(-1)

    if (len(isig) > 0):

        rang = np.arange(0.01, max_silence+0.1, 0.1)
        for di in rang:
            for i, xxx in enumerate(isig):
                _t = xxx[-1]
                if (_t == -1):
                    break
                if (_t <= di):
                    try:
                        if (isig[i+1][2] - isig[i][1] <= max_speech_len):
                            isig[i] = [isig[i][0], isig[i][1], isig[i+1][2], isig[i+1][2] - isig[i][1], isig[i+1][4]]
                            del isig[i+1]
                    except:
                        pass
        _dels = []
        for i, xxxx in enumerate(isig):
            _d = xxxx[3]
            if (_d <= ignore_small_speech_segments):
                _dels.append(i)
        _dels.reverse()

        for i in _dels:
            del isig[i]

        if (len(isig) == 0):
            return -1

    isign = []
    for i, xxxxx in enumerate(isig):
        _d = xxxxx[3]
        if (_d > split_speech_bigger_than):
            _gc = math.ceil(_d/split_speech_bigger_than)
            m = _d/_gc
            print('Bigger-->' + str(_d) + '-->' + str(m))
            for jj in range(_gc):
                fas = 0
                if (jj == _gc-1):
                    fas = isig[i][4]
                isign.append([isig[i][0], isig[i][1] + m*jj, isig[i][1] + (m*(jj+1)), m, fas])
        else:
            isign.append(isig[i])
    for i, (a, b, c, d, e) in enumerate(isign):
        if (e == -1):
            break
        _addlen = min(e, 1) / 2  # at most half a second is appended to the end of the segment
        isign[i] = [a, b, c+_addlen, d+_addlen, e-_addlen]

    return(isign)


def filter_output_1(vad, max_silence=1, ignore_small_speech_segments=0.5, max_speech_len=15, split_speech_bigger_than=20):

    isig = []
    i = 0
    while (i < len(vad)):

        ml = 0
        inn = i
        st = (vad[i][1])

        while ((i < len(vad)-1) and (((vad[i+1][1]) - (vad[i][2])) <= max_silence)):
            ml = (vad[i][2]) - st
            if (ml > max_speech_len):
                if (i > inn and i > 0):
                    i = i-1
                break
            i = i+1
        en = (vad[i][2])
        fa = en-st
        if (fa > ignore_small_speech_segments):
            if (fa > split_speech_bigger_than):
                _gc = math.ceil(fa/split_speech_bigger_than)
                m = fa/_gc
                print('Bigger-->' + str(fa) + '-->' + str(m))
                for jj in range(_gc):
                    isig.append(('speech', st + (m*jj), st + (m*(jj+1)), m))
            else:
                isig.append(('speech', st, en, fa))
        i = i+1
    isign = []
    for i, (a, b, c, d) in enumerate(isig):
        if (i == len(isig)-1):
            isign.append(isig[i])
            break
        _addlen = min(isig[i+1][1]-c, 1) / 2  # at most half a second is appended to the end of the segment
        isign.append([a, b, c+_addlen, d+_addlen])

    return(isign)


def _get_patches(mspec, w, step):
    h = mspec.shape[1]
    data = vaw(mspec, (w, h), step=step)
    data.shape = (len(data), w*h)
    data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
    lfill = [data[0, :].reshape(1, h*w)] * (w // (2 * step))
    rfill = [data[-1, :].reshape(1, h*w)] * (w // (2 * step) - 1 + len(mspec) % 2)
    data = np.vstack(lfill + [data] + rfill)
    finite = np.all(np.isfinite(data), axis=1)
    data.shape = (len(data), w, h)
    return data, finite


def _binidx2seglist(binidx):
    """
    ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
    Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]

    #TODO: is there a pandas alternative??
    """
    curlabel = None
    bseg = -1
    ret = []
    for i, e in enumerate(binidx):
        if e != curlabel:
            if curlabel is not None:
                ret.append((curlabel, bseg, i))
            curlabel = e
            bseg = i
    ret.append((curlabel, bseg, i + 1))
    return ret


class DnnSegmenter:
    """
    DnnSegmenter is an abstract class allowing to perform DNN-based
    segmentation with serialized Keras models, using 24 mel spectrogram
    features obtained with the SIDEKIT framework.

    Child classes MUST define the following class attributes:
    * nmel: the number of mel bands to use (max: 24)
    * viterbi_arg: the argument to be used with viterbi post-processing
    * model_fname: the filename of the serialized keras model to be used
        the model should be stored in the current directory
    * inlabel: only segments with label name inlabel will be analyzed.
        other labels will stay unchanged
    * outlabels: the labels associated with the outputs of the neural network models
    """
    def __init__(self, batch_size, vad_type, model_path):
        # load the DNN model
        if (vad_type != 'vad'):
            self.nn = keras.models.load_model(model_path, compile=False)
            print('model loaded from --> ' + model_path)
        # self.nn.summary()
        self.batch_size = batch_size

    def __call__(self, mspec, lseg, difflen=0):
        """
        *** input
        * mspec: mel spectrogram
        * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
        * difflen: 0 if the original length of the mel spectrogram is >= 68
            otherwise it is set to 68 - length(mspec)
        *** output
        a list of adjacent tuples (label, start, stop)
        """
        if self.nmel < 24:
            mspec = mspec[:, :self.nmel].copy()

        patches, finite = _get_patches(mspec, 68, 2)
        if difflen > 0:
            patches = patches[:-int(difflen / 2), :, :]
            finite = finite[:-int(difflen / 2)]

        assert len(finite) == len(patches), (len(patches), len(finite))

        batch = []
        for lab, start, stop in lseg:
            if lab == self.inlabel:
                batch.append(patches[start:stop, :])

        if len(batch) > 0:
            batch = np.concatenate(batch)
            rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)

        ret = []
        for lab, start, stop in lseg:
            if lab != self.inlabel:
                ret.append((lab, start, stop))
                continue

            l = stop - start
            r = rawpred[:l]
            rawpred = rawpred[l:]
            r[finite[start:stop] == False, :] = 0.5
            pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
            for lab2, start2, stop2 in _binidx2seglist(pred):
                ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
        return ret


class SpeechMusic(DnnSegmenter):
    # Voice activity detection: requires energetic activity detection
    outlabels = ('speech', 'music')
    inlabel = 'energy'
    nmel = 21
    viterbi_arg = 150


class SpeechMusicNoise(DnnSegmenter):
    # Voice activity detection: requires energetic activity detection
    outlabels = ('speech', 'music', 'noise')
    inlabel = 'energy'
    nmel = 21
    viterbi_arg = 80


class Gender(DnnSegmenter):
    # Gender Segmentation, requires voice activity detection
    outlabels = ('female', 'male')
    inlabel = 'speech'
    nmel = 24
    viterbi_arg = 80


class Segmenter:

    def __init__(self, vad_type='sad', vad_engine='smn', detect_gender=False, sr=16000, batch_size=32, complete_output=False, model_path="c:\\keras_speech_music_noise_cnn.hdf5", gender_path="c:\\keras_male_female_cnn.hdf5", ffmpeg_path='c:\\ffmpeg.exe', device='cuda', input_type="file"):
        """
        Load neural network models

        Input:

        'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
            'sm' was used in the results presented in the ICASSP 2017 paper
            and in the MIREX 2018 challenge submission
            'smn' has been implemented more recently and has not been evaluated in papers

        'detect_gender': if False, speech excerpts are returned labelled as 'speech'
            if True, speech excerpts are split into 'male' and 'female' segments
        """
        if (device != 'cuda'):
            os.environ["CUDA_DEVICE_ORDER"] = '-1'
        else:
            pass

        import tensorflow as tf

        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True
        config.log_device_placement = True
        sess = tf.compat.v1.Session(config=config)
        sess = K.get_session()

        self.complete_output = complete_output
        self.sample_rate = sr
        self.ffmpeg_path = ffmpeg_path
        self.input_type = input_type
        self.device = device

        # self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks

        # select speech/music or speech/music/noise voice activity detection engine
        assert vad_engine in ['sm', 'smn']
        if vad_engine == 'sm':
            self.vad = SpeechMusic(batch_size)
        elif vad_engine == 'smn':
            self.vad = SpeechMusicNoise(batch_size, vad_type, model_path)

        # load gender detection NN if required
        assert detect_gender in [True, False]
        self.detect_gender = detect_gender
        if detect_gender:
            self.gender = Gender(batch_size, vad_type, gender_path)
        self.vad_type = vad_type
        self.model_path = model_path
        self.gender_path = gender_path

    def segment_feats(self, mspec, loge, difflen, start_sec):
        """
        do segmentation
        require input corresponding to wav file sampled at 16000Hz
        with a single channel
        """
        # perform energy-based activity detection
        lseg = []
        vadseg = []
        for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
            if lab == 0:
                lab = 'noEnergy'
            else:
                lab = 'energy'
                vadseg.append(('speech', start, stop))
            lseg.append((lab, start, stop))
        if (self.vad_type == 'vad'):
            return [(lab, start_sec + start * .02, start_sec + stop * .02, stop-start) for lab, start, stop in vadseg]
        # perform voice activity detection
        lseg = self.vad(mspec, lseg, difflen)

        # perform gender segmentation on speech segments
        if self.detect_gender:
            lseg = self.gender(mspec, lseg, difflen)
        if (self.complete_output):
            return [(lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02) for lab, start, stop in lseg]
        else:
            return [[lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02] for lab, start, stop in lseg if (lab == 'male' or lab == "female" or lab == "speech")]

    def __call__(self, medianame, start_sec=None, stop_sec=None):
        """
        Return segmentation of a given file
        * convert file to wav 16k mono with ffmpeg
        * call NN segmentation procedures
        * media_name: path to the media to be processed (including remote url)
            may include any format supported by ffmpeg
        * tmpdir: allow to define a custom path for storing temporary files
            fast read/write HD are a good choice
        * start_sec (seconds): sound stream before start_sec won't be processed
        * stop_sec (seconds): sound stream after stop_sec won't be processed
        """
        mspec, loge, difflen, me = media2feats(medianame, self.input_type, self.sample_rate, ffmpeg_path=self.ffmpeg_path)

        if start_sec is None:
            start_sec = 0
        # do segmentation
        return self.segment_feats(mspec, loge, difflen, start_sec), me

    def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):

        if verbose:
            print('batch_processing %d files' % len(linput))

        if output_format == 'csv':
            fexport = seg2csv
        elif output_format == 'textgrid':
            fexport = seg2textgrid
        else:
            raise NotImplementedError()

        t_batch_start = time.time()

        lmsg = []
        fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
        i = 0
        for feats, msg in fg:
            lmsg += msg
            i += len(msg)
            if verbose:
                print('%d/%d' % (i, len(linput)), msg)
            if feats is None:
                break
            mspec, loge, difflen, _ = feats
            #if verbose == True:
            #    print(i, linput[i], loutput[i])
            b = time.time()
            lseg = self.segment_feats(mspec, loge, difflen, 0)
            fexport(lseg, loutput[len(lmsg) - 1])
            lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() - b))

        t_batch_dur = time.time() - t_batch_start
        nb_processed = len([e for e in lmsg if e[1] == 0])
        if nb_processed > 0:
            avg = t_batch_dur / nb_processed
        else:
            avg = -1
        return t_batch_dur, nb_processed, avg, lmsg


def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
    """
    To be used when processing batches
    if resulting file exists, it is skipped
    in case of remote files, access is tried nbtry times
    """
    ret = None
    msg = []
    while ret is None and len(lin) > 0:
        src = lin.pop(0)
        dst = lout.pop(0)
        # print('popping', src)

        # if file exists: skip
        if skipifexist and os.path.exists(dst):
            msg.append((dst, 1, 'already exists'))
            continue

        # create storing directory if required
        dname = os.path.dirname(dst)
        if not os.path.isdir(dname):
            os.makedirs(dname)

        itry = 0
        while ret is None and itry < nbtry:
            try:
                ret = media2feats(src, 'file', sampling_rate)
            except:
                itry += 1
                errmsg = sys.exc_info()[0]
                if itry != nbtry:
                    time.sleep(random.random() * trydelay)
        if ret is None:
            msg.append((dst, 2, 'error: ' + str(errmsg)))
        else:
            msg.append((dst, 0, 'ok'))

    return ret, msg


def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
    # print('init feat gen', len(ilist))
    thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
    thread.start()
    while True:
        ret, msg = thread.join()
        # print('join done', len(ilist))
        # print('new list', ilist)
        #ilist = ilist[len(msg):]
        #olist = olist[len(msg):]
        if len(ilist) == 0:
            break
        thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
        thread.start()
        yield ret, msg
    yield ret, msg
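
End to end, a hedged usage sketch for this Keras variant (file names and model paths are assumptions; device='cpu' skips the CUDA setup):

from sad_tf.segmenter_for_tf2_16 import Segmenter, filter_output, filter_sig

seg = Segmenter(vad_engine='smn', detect_gender=False, device='cpu',
                model_path='keras_speech_music_noise_cnn.hdf5',  # assumed local model
                ffmpeg_path='/usr/bin/ffmpeg')
segments, wav = seg('interview.mp3')       # frame indices * 0.02 -> seconds
segments = filter_output(segments, max_silence=1, ignore_small_speech_segments=0.5,
                         max_speech_len=15, split_speech_bigger_than=20)
if segments != -1:
    speech, background = filter_sig(segments, wav)  # concatenated speech / non-speech samples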
sad_tf/segmentero.py
ADDED
@@ -0,0 +1,570 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# encoding: utf-8
|
3 |
+
|
4 |
+
# The MIT License
|
5 |
+
|
6 |
+
# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
|
7 |
+
|
8 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
9 |
+
# of this software and associated documentation files (the "Software"), to deal
|
10 |
+
# in the Software without restriction, including without limitation the rights
|
11 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
12 |
+
# copies of the Software, and to permit persons to whom the Software is
|
13 |
+
# furnished to do so, subject to the following conditions:
|
14 |
+
|
15 |
+
# The above copyright notice and this permission notice shall be included in
|
16 |
+
# all copies or substantial portions of the Software.
|
17 |
+
|
18 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
19 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
20 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
21 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
22 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
23 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
24 |
+
# THE SOFTWARE.
|
25 |
+
import onnxruntime
|
26 |
+
import warnings
|
27 |
+
warnings.filterwarnings("ignore")
|
28 |
+
import os
|
29 |
+
# os.environ["CUDA_DEVICE_ORDER"]= '0'
|
30 |
+
import sys
|
31 |
+
import math
|
32 |
+
from iman import Audio
|
33 |
+
import numpy as np
|
34 |
+
from tensorflow import keras
|
35 |
+
from tensorflow.compat.v1.keras.backend import set_session
|
36 |
+
from tqdm import tqdm
|
37 |
+
from .thread_returning import ThreadReturning
|
38 |
+
|
39 |
+
import shutil
|
40 |
+
import time
|
41 |
+
import random
|
42 |
+
|
43 |
+
from skimage.util import view_as_windows as vaw
|
44 |
+
|
45 |
+
|
46 |
+
from .viterbi import viterbi_decoding
|
47 |
+
from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
|
48 |
+
|
49 |
+
from .features import media2feats
|
50 |
+
from .export_funcs import seg2csv, seg2textgrid
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
def _energy_activity(loge, ratio=0.4): ##########0.9
|
55 |
+
|
56 |
+
threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
|
57 |
+
raw_activity = (loge > threshold)
|
58 |
+
return viterbi_decoding(pred2logemission(raw_activity),
|
59 |
+
log_trans_exp(50, cost0=-5))
|
60 |
+
|
61 |
+
#exp(150, cost0=-5)
|
62 |
+
|
63 |
+
def filter_sig(isig , wav , sr=16000):
|
64 |
+
|
65 |
+
if (sr!=16000):
|
66 |
+
wav = Audio.Resample(wav , 16000, sr)
|
67 |
+
|
68 |
+
|
69 |
+
try:
|
70 |
+
w=[]
|
71 |
+
wn=[]
|
72 |
+
wn.append(wav[0 : int(isig[0][1]*sr)])
|
73 |
+
for i , [_,a,b,_] in enumerate(isig):
|
74 |
+
|
75 |
+
w.append(wav[int(a*sr) : int(b*sr)])
|
76 |
+
try:
|
77 |
+
wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
|
78 |
+
except:
|
79 |
+
wn.append(wav[int(isig[i][2]*sr) : len(wav)])
|
80 |
+
|
81 |
+
return (np.concatenate(w),np.concatenate(wn))
|
82 |
+
except:
|
83 |
+
w=[]
|
84 |
+
wn=[]
|
85 |
+
wn.append(wav[0 : int(isig[0][1]*sr)])
|
86 |
+
for i , [_,a,b,_,_] in enumerate(isig):
|
87 |
+
w.append(wav[int(a*sr) : int(b*sr)])
|
88 |
+
try:
|
89 |
+
wn.append(wav[ int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
|
90 |
+
except:
|
91 |
+
wn.append(wav[int(isig[i][2]*sr) : len(wav)])
|
92 |
+
|
93 |
+
return (np.concatenate(w),np.concatenate(wn))
|
94 |
+
|
95 |
+
def filter_output(isig , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
|
96 |
+
|
97 |
+
if (len(isig)==0):
|
98 |
+
return -1
|
99 |
+
|
100 |
+
# _dels=[]
|
101 |
+
# for i , [_,_,_,_d] in enumerate(isig):
|
102 |
+
# if (_d<=ignore_small_speech_segments) :
|
103 |
+
# _dels.append(i)
|
104 |
+
# _dels.reverse()
|
105 |
+
# for i in _dels:
|
106 |
+
# del isig[i]
|
107 |
+
|
108 |
+
# if (len(isig)==0):
|
109 |
+
# return -1
|
110 |
+
|
111 |
+
|
112 |
+
for i in range(len(isig)-1):
|
113 |
+
t = isig[i+1][1] - isig[i][2] # silence between towo chunk
|
114 |
+
isig[i].append(t)
|
115 |
+
isig[-1].append(-1)
|
116 |
+
|
117 |
+
|
118 |
+
if (len(isig)>0):
|
119 |
+
|
120 |
+
rang = np.arange(0.01,max_silence+0.1,0.1)
|
121 |
+
for di in rang:
|
122 |
+
for i , [_,_,_,_,_t] in enumerate(isig):
|
123 |
+
if (_t==-1):
|
124 |
+
break
|
125 |
+
if (_t <=di):
|
126 |
+
try:
|
127 |
+
if (isig[i+1][2] - isig[i][1] <= max_speech_len):
|
128 |
+
isig[i] = [isig[i][0] , isig[i][1] , isig[i+1][2] , isig[i+1][2] - isig[i][1] , isig[i+1][4] ]
|
129 |
+
del isig[i+1]
|
130 |
+
except:
|
131 |
+
pass
|
132 |
+
_dels=[]
|
133 |
+
for i , [_,_,_,_d,_] in enumerate(isig):
|
134 |
+
if (_d<=ignore_small_speech_segments) :
|
135 |
+
_dels.append(i)
|
136 |
+
_dels.reverse()
|
137 |
+
|
138 |
+
for i in _dels:
|
139 |
+
del isig[i]
|
140 |
+
|
141 |
+
if (len(isig)==0):
|
142 |
+
return -1
|
143 |
+
|
144 |
+
|
145 |
+
isign=[]
|
146 |
+
for i , [_,_,_,_d,_] in enumerate(isig):
|
147 |
+
if (_d> split_speech_bigger_than ) :
|
148 |
+
|
149 |
+
_gc = math.ceil(_d/split_speech_bigger_than)
|
150 |
+
m = _d/_gc
|
151 |
+
print('Bigger-->' + str(_d) + '-->' + str(m))
|
152 |
+
for jj in range(_gc):
|
153 |
+
fas=0
|
154 |
+
if (jj== _gc-1):
|
155 |
+
fas= isig[i][4]
|
156 |
+
isign.append( [isig[i][0] ,isig[i][1] + m*jj ,isig[i][1] + (m*(jj+1)), m, fas ] )
|
157 |
+
else:
|
158 |
+
isign.append(isig[i])
|
159 |
+
for i,(a,b,c,d,e) in enumerate(isign):
|
160 |
+
if (e==-1):
|
161 |
+
break
|
162 |
+
_addlen = min(e , 1) / 2 #حداکثر نیم ثانیه به انتهای سگمنت افزوده میشود
|
163 |
+
isign[i] = [a,b,c+_addlen,d+_addlen,e-_addlen]
|
164 |
+
|
165 |
+
return(isign)
|
166 |
+
|
167 |
+
|
168 |
+
def filter_output_1(vad , max_silence=1 ,ignore_small_speech_segments=0.5 , max_speech_len=15,split_speech_bigger_than=20):
|
169 |
+
|
170 |
+
isig = []
|
171 |
+
i=0
|
172 |
+
while (i <len(vad)):
|
173 |
+
|
174 |
+
ml=0
|
175 |
+
inn = i
|
176 |
+
st = (vad[i][1])
|
177 |
+
|
178 |
+
while ( (i<len(vad)-1 )and ( ( (vad[i+1][1]) - (vad[i][2]) ) <= max_silence)):
|
179 |
+
ml = (vad[i][2]) - st
|
180 |
+
if (ml > max_speech_len):
|
181 |
+
if (i>inn and i>0):
|
182 |
+
i=i-1
|
183 |
+
break
|
184 |
+
i=i+1
|
185 |
+
en = (vad[i][2])
|
186 |
+
fa = en-st
|
187 |
+
if (fa > ignore_small_speech_segments):
|
188 |
+
if (fa>split_speech_bigger_than):
|
189 |
+
_gc = math.ceil(fa/split_speech_bigger_than)
|
190 |
+
m = fa/_gc
|
191 |
+
print('Bigger-->' + str(fa) + '-->' + str(m))
|
192 |
+
for jj in range(_gc):
|
193 |
+
isig.append(('speech' , st + (m*jj) , st+ (m*(jj+1)) , m))
|
194 |
+
else:
|
195 |
+
isig.append(('speech', st , en,fa))
|
196 |
+
i=i+1
|
197 |
+
isign=[]
|
198 |
+
for i,(a,b,c,d) in enumerate(isig):
|
199 |
+
if (i == len(isig)-1):
|
200 |
+
isign.append(isig[i])
|
201 |
+
break
|
202 |
+
_addlen = min(isig[i+1][1]-c , 1) / 2 #حداکثر نیم ثانیه به انتهای سگمنت افزوده میشود
|
203 |
+
isign.append([a,b,c+_addlen ,d+_addlen])
|
204 |
+
|
205 |
+
return(isign)
|
206 |
+
|
207 |
+
|
208 |
+
def get_path_3d(data,batch_size):
|
209 |
+
total_batches = data.shape[0] // batch_size
|
210 |
+
last_batch_size = data.shape[0] % batch_size
|
211 |
+
if last_batch_size != 0:
|
212 |
+
batches = np.split(data[:total_batches * batch_size], total_batches)
|
213 |
+
last_batch = np.expand_dims(data[total_batches * batch_size:], axis=0).squeeze()
|
214 |
+
batches.append(last_batch)
|
215 |
+
else:
|
216 |
+
batches = np.split(data, total_batches)
|
217 |
+
return batches
|
218 |
+
|
219 |
+
|
220 |
+
def _get_patches(mspec, w, step):
|
221 |
+
h = mspec.shape[1]
|
222 |
+
data = vaw(mspec, (w,h), step=step)
|
223 |
+
data.shape = (len(data), w*h)
|
224 |
+
data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
|
225 |
+
lfill = [data[0,:].reshape(1, h*w)] * (w // (2 * step))
|
226 |
+
rfill = [data[-1,:].reshape(1, h*w)] * (w // (2* step) - 1 + len(mspec) % 2)
|
227 |
+
data = np.vstack(lfill + [data] + rfill )
|
228 |
+
finite = np.all(np.isfinite(data), axis=1)
|
229 |
+
data.shape = (len(data), w, h)
|
230 |
+
return data, finite
|
231 |
+
|
232 |
+
|
233 |
+
def _binidx2seglist(binidx):
|
234 |
+
"""
|
235 |
+
ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
|
236 |
+
Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
|
237 |
+
|
238 |
+
#TODO: is there a pandas alternative??
|
239 |
+
"""
|
240 |
+
curlabel = None
|
241 |
+
bseg = -1
|
242 |
+
ret = []
|
243 |
+
for i, e in enumerate(binidx):
|
244 |
+
if e != curlabel:
|
245 |
+
if curlabel is not None:
|
246 |
+
ret.append((curlabel, bseg, i))
|
247 |
+
curlabel = e
|
248 |
+
bseg = i
|
249 |
+
ret.append((curlabel, bseg, i + 1))
|
250 |
+
return ret
|
251 |
+
|
252 |
+
|
253 |
+
class DnnSegmenter:
|
254 |
+
"""
|
255 |
+
DnnSegmenter is an abstract class allowing to perform Dnn-based
|
256 |
+
segmentation using Keras serialized models using 24 mel spectrogram
|
257 |
+
features obtained with SIDEKIT framework.
|
258 |
+
|
259 |
+
Child classes MUST define the following class attributes:
|
260 |
+
* nmel: the number of mel bands to used (max: 24)
|
261 |
+
* viterbi_arg: the argument to be used with viterbi post-processing
|
262 |
+
* model_fname: the filename of the serialized keras model to be used
|
263 |
+
the model should be stored in the current directory
|
264 |
+
* inlabel: only segments with label name inlabel will be analyzed.
|
265 |
+
other labels will stay unchanged
|
266 |
+
* outlabels: the labels associated the output of neural network models
|
267 |
+
"""
|
268 |
+
def __init__(self, batch_size, vad_type,model_path,EP_list):
|
269 |
+
# load the DNN model
|
270 |
+
if (vad_type!='vad'):
|
271 |
+
self.session = onnxruntime.InferenceSession(model_path,providers=EP_list)
|
272 |
+
#self.nn = keras.models.load_model(model_path, compile=False)
|
273 |
+
print('model Loded from--> ' + model_path)
|
274 |
+
# self.nn.summary()
|
275 |
+
self.batch_size = batch_size
|
276 |
+
|
277 |
+
def __call__(self, mspec, lseg, difflen = 0):
|
278 |
+
"""
|
279 |
+
*** input
|
280 |
+
* mspec: mel spectrogram
|
281 |
+
* lseg: list of tuples (label, start, stop) corresponding to previous segmentations
|
282 |
+
* difflen: 0 if the original length of the mel spectrogram is >= 68
|
283 |
+
otherwise it is set to 68 - length(mspec)
|
284 |
+
*** output
|
285 |
+
a list of adjacent tuples (label, start, stop)
|
286 |
+
"""
|
287 |
+
if self.nmel < 24:
|
288 |
+
mspec = mspec[:, :self.nmel].copy()
|
289 |
+
|
290 |
+
patches, finite = _get_patches(mspec, 68, 2)
|
291 |
+
if difflen > 0:
|
292 |
+
patches = patches[:-int(difflen / 2), :, :]
|
293 |
+
finite = finite[:-int(difflen / 2)]
|
294 |
+
|
295 |
+
assert len(finite) == len(patches), (len(patches), len(finite))
|
296 |
+
|
297 |
+
batch = []
|
298 |
+
for lab, start, stop in lseg:
|
299 |
+
if lab == self.inlabel:
|
300 |
+
batch.append(patches[start:stop, :])
|
301 |
+
|
302 |
+
if len(batch) > 0:
|
303 |
+
|
304 |
+
batch = np.concatenate(batch)
|
305 |
+
batches = get_path_3d(batch , self.batch_size,)
|
306 |
+
|
307 |
+
|
308 |
+
#rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
|
309 |
+
input_name = self.session.get_inputs()[0].name
|
310 |
+
rawpred=[]
|
311 |
+
for batch in tqdm(batches):
|
312 |
+
rawpred.append(self.session.run(None, {input_name: batch})[0])
|
313 |
+
|
314 |
+
rawpred = np.concatenate(rawpred)
|
315 |
+
|
316 |
+
|
317 |
+
ret = []
|
318 |
+
for lab, start, stop in lseg:
|
319 |
+
if lab != self.inlabel:
|
320 |
+
ret.append((lab, start, stop))
|
321 |
+
continue
|
322 |
+
|
323 |
+
l = stop - start
|
324 |
+
r = rawpred[:l]
|
325 |
+
rawpred = rawpred[l:]
|
326 |
+
r[finite[start:stop] == False, :] = 0.5
|
327 |
+
pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
|
328 |
+
for lab2, start2, stop2 in _binidx2seglist(pred):
|
329 |
+
ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
|
330 |
+
return ret
|
331 |
+
|
332 |
+
|
333 |
+
class SpeechMusic(DnnSegmenter):
|
334 |
+
# Voice activity detection: requires energetic activity detection
|
335 |
+
outlabels = ('speech', 'music')
|
336 |
+
inlabel = 'energy'
|
337 |
+
nmel = 21
|
338 |
+
viterbi_arg = 150
|
339 |
+
|
340 |
+
|
341 |
+
class SpeechMusicNoise(DnnSegmenter):
|
342 |
+
# Voice activity detection: requires energetic activity detection
|
343 |
+
outlabels = ('speech', 'music', 'noise')
|
344 |
+
inlabel = 'energy'
|
345 |
+
nmel = 21
|
346 |
+
viterbi_arg = 80
|
347 |
+
|
348 |
+
|
349 |
+
class Gender(DnnSegmenter):
|
350 |
+
# Gender Segmentation, requires voice activity detection
|
351 |
+
outlabels = ('female', 'male')
|
352 |
+
inlabel = 'speech'
|
353 |
+
nmel = 24
|
354 |
+
viterbi_arg = 80
|
355 |
+
|
356 |
+
|
357 |
+
|
358 |
+
class Segmenter:
|
359 |
+
|
360 |
+
|
361 |
+
def __init__(self, vad_type = 'sad' , vad_engine='smn', detect_gender=False, sr=16000, batch_size=32 , complete_output=False,model_path="c:\\keras_speech_music_noise_cnn.onnx",gender_path="c:\\keras_male_female_cnn.onnx" , ffmpeg_path='c:\\ffmpeg.exe',device='cuda'):
|
362 |
+
"""
|
363 |
+
Load neural network models
|
364 |
+
|
365 |
+
Input:
|
366 |
+
|
367 |
+
'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
|
368 |
+
'sm' was used in the results presented in ICASSP 2017 paper
|
369 |
+
and in MIREX 2018 challenge submission
|
370 |
+
'smn' has been implemented more recently and has not been evaluated in papers
|
371 |
+
|
372 |
+
'detect_gender': if False, speech excerpts are return labelled as 'speech'
|
373 |
+
if True, speech excerpts are splitted into 'male' and 'female' segments
|
374 |
+
"""
|
375 |
+
self.complete_output = complete_output
|
376 |
+
self.sample_rate = sr
|
377 |
+
self.ffmpeg_path=ffmpeg_path
|
378 |
+
|
379 |
+
|
380 |
+
if (device != 'cuda'):
|
381 |
+
os.environ["CUDA_DEVICE_ORDER"]= '-1'
|
382 |
+
EP_list=[ 'CPUExecutionProvider']
|
383 |
+
else:
|
384 |
+
EP_list=['CUDAExecutionProvider']
|
385 |
+
|
386 |
+
import tensorflow as tf
|
387 |
+
|
388 |
+
config = tf.compat.v1.ConfigProto()
|
389 |
+
config.gpu_options.allow_growth = True
|
390 |
+
config.log_device_placement = True
|
391 |
+
sess = tf.compat.v1.Session(config=config)
|
392 |
+
set_session(sess)
|
393 |
+
|
394 |
+
|
395 |
+
|
396 |
+
# self.graph = KB.get_session().graph # To prevent the issue of keras with tensorflow backend for async tasks
|
397 |
+
|
398 |
+
|
399 |
+
# select speech/music or speech/music/noise voice activity detection engine
|
400 |
+
assert vad_engine in ['sm', 'smn']
|
401 |
+
if vad_engine == 'sm':
|
402 |
+
self.vad = SpeechMusic(batch_size)
|
403 |
+
elif vad_engine == 'smn':
|
404 |
+
self.vad = SpeechMusicNoise(batch_size , vad_type,model_path,EP_list)
|
405 |
+
|
406 |
+
# load gender detection NN if required
|
407 |
+
assert detect_gender in [True, False]
|
408 |
+
self.detect_gender = detect_gender
|
409 |
+
if detect_gender:
|
410 |
+
self.gender = Gender(batch_size , vad_type ,gender_path,EP_list)
|
411 |
+
self.vad_type = vad_type
|
412 |
+
self.model_path = model_path
|
413 |
+
self.gender_path = gender_path
|
414 |
+
|
415 |
+
def segment_feats(self, mspec, loge, difflen, start_sec):
|
416 |
+
"""
|
417 |
+
do segmentation
|
418 |
+
require input corresponding to wav file sampled at 16000Hz
|
419 |
+
with a single channel
|
420 |
+
"""
|
421 |
+
|
422 |
+
|
423 |
+
|
424 |
+
|
425 |
+
# perform energy-based activity detection
|
426 |
+
lseg = []
|
427 |
+
vadseg=[]
|
428 |
+
for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
|
429 |
+
if lab == 0:
|
430 |
+
lab = 'noEnergy'
|
431 |
+
else:
|
432 |
+
lab = 'energy'
|
433 |
+
vadseg.append(('speech', start, stop))
|
434 |
+
lseg.append((lab, start, stop))
|
435 |
+
if (self.vad_type == 'vad'):
|
436 |
+
return [(lab, start_sec + start * .02, start_sec + stop * .02 , stop-start) for lab, start, stop in vadseg]
|
437 |
+
# perform voice activity detection
|
438 |
+
lseg = self.vad(mspec, lseg, difflen)
|
439 |
+
|
440 |
+
|
441 |
+
|
442 |
+
|
443 |
+
# perform gender segmentation on speech segments
|
444 |
+
if self.detect_gender:
|
445 |
+
lseg = self.gender(mspec, lseg, difflen)
|
446 |
+
if (self.complete_output):
|
447 |
+
return [(lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02) for lab, start, stop in lseg ]
|
448 |
+
else:
|
449 |
+
return [[lab, start_sec + start * .02, start_sec + stop * .02 , (stop-start) * .02] for lab, start, stop in lseg if (lab=='male' or lab=="female" or lab=="speech")]
|
450 |
+
|
451 |
+
|
452 |
+
def __call__(self, medianame, input_type='file',start_sec=None, stop_sec=None):
|
453 |
+
"""
|
454 |
+
Return segmentation of a given file
|
455 |
+
* convert file to wav 16k mono with ffmpeg
|
456 |
+
* call NN segmentation procedures
|
457 |
+
* media_name: path to the media to be processed (including remote url)
|
458 |
+
may include any format supported by ffmpeg
|
459 |
+
* tmpdir: allow to define a custom path for storing temporary files
|
460 |
+
fast read/write HD are a good choice
|
461 |
+
* start_sec (seconds): sound stream before start_sec won't be processed
|
462 |
+
* stop_sec (seconds): sound stream after stop_sec won't be processed
|
463 |
+
"""
|
464 |
+
|
465 |
+
|
466 |
+
mspec, loge, difflen , me = media2feats(medianame, input_type ,self.sample_rate,ffmpeg_path=self.ffmpeg_path)
|
467 |
+
|
468 |
+
if start_sec is None:
|
469 |
+
start_sec = 0
|
470 |
+
# do segmentation
|
471 |
+
return self.segment_feats(mspec, loge, difflen, start_sec),me
|
472 |
+
|
473 |
+
|
474 |
+
def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
|
475 |
+
|
476 |
+
if verbose:
|
477 |
+
print('batch_processing %d files' % len(linput))
|
478 |
+
|
479 |
+
if output_format == 'csv':
|
480 |
+
fexport = seg2csv
|
481 |
+
elif output_format == 'textgrid':
|
482 |
+
fexport = seg2textgrid
|
483 |
+
else:
|
484 |
+
raise NotImplementedError()
|
485 |
+
|
486 |
+
t_batch_start = time.time()
|
487 |
+
|
488 |
+
lmsg = []
|
489 |
+
fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
|
490 |
+
i = 0
|
491 |
+
for feats, msg in fg:
|
492 |
+
lmsg += msg
|
493 |
+
i += len(msg)
|
494 |
+
if verbose:
|
495 |
+
print('%d/%d' % (i, len(linput)), msg)
|
496 |
+
if feats is None:
|
497 |
+
break
|
498 |
+
mspec, loge, difflen = feats
|
499 |
+
#if verbose == True:
|
500 |
+
# print(i, linput[i], loutput[i])
|
501 |
+
b = time.time()
|
502 |
+
lseg = self.segment_feats(mspec, loge, difflen, 0)
|
503 |
+
fexport(lseg, loutput[len(lmsg) -1])
|
504 |
+
lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() -b))
|
505 |
+
|
506 |
+
t_batch_dur = time.time() - t_batch_start
|
507 |
+
nb_processed = len([e for e in lmsg if e[1] == 0])
|
508 |
+
if nb_processed > 0:
|
509 |
+
avg = t_batch_dur / nb_processed
|
510 |
+
else:
|
511 |
+
avg = -1
|
512 |
+
return t_batch_dur, nb_processed, avg, lmsg
|
513 |
+
|
514 |
+
|
515 |
+
def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
    """
    To be used when processing batches
    if the resulting file exists, it is skipped
    in case of remote files, access is tried nbtry times
    """
    ret = None
    msg = []
    errmsg = None
    while ret is None and len(lin) > 0:
        src = lin.pop(0)
        dst = lout.pop(0)
        # print('popping', src)

        # if file exists: skip
        if skipifexist and os.path.exists(dst):
            msg.append((dst, 1, 'already exists'))
            continue

        # create storing directory if required
        dname = os.path.dirname(dst)
        if not os.path.isdir(dname):
            os.makedirs(dname)

        itry = 0
        while ret is None and itry < nbtry:
            try:
                # call matching the media2feats signature used in Segmenter.__call__
                ret = media2feats(src, 'file', sampling_rate)
            except Exception:
                itry += 1
                errmsg = sys.exc_info()[0]
                if itry != nbtry:
                    time.sleep(random.random() * trydelay)
        if ret is None:
            msg.append((dst, 2, 'error: ' + str(errmsg)))
        else:
            msg.append((dst, 0, 'ok'))

    return ret, msg
def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
    # prefetch pattern: extract the features of the next file in a background
    # thread while the caller is busy segmenting the current one
    thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
    thread.start()
    while True:
        ret, msg = thread.join()
        if len(ilist) == 0:
            break
        thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
        thread.start()
        yield ret, msg
    # yield the result of the last worker once the input list is exhausted
    yield ret, msg
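featGenerator thus overlaps I/O with computation: while the caller segments file N, a background thread is already extracting the features of file N+1. A sketch of consuming it directly, assuming the module is importable as sad_tf.segmenter (as __init__.py expects) and illustrative file names:

    from sad_tf.segmenter import featGenerator

    for feats, msg in featGenerator(['a.wav', 'b.wav'], ['out/a.csv', 'out/b.csv']):
        if feats is None:        # every remaining file failed to decode
            break
        mspec, loge, difflen = feats[:3]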
sad_tf/sidekit_mfcc.py ADDED @@ -0,0 +1,379 @@
# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# The following code has been copy-pasted from SIDEKIT source files:
# frontend/features.py frontend/io.py frontend/vad.py
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.

"""
Copyright 2014-2021 Anthony Larcher and Sylvain Meignier

:mod:`frontend` provides methods to process an audio signal in order to extract
useful parameters for speaker verification.
"""


import numpy
import soundfile
import scipy
from scipy.fftpack.realtransforms import dct


__author__ = "Anthony Larcher and Sylvain Meignier"
__copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
__license__ = "LGPL"
__maintainer__ = "Anthony Larcher"
__email__ = "[email protected]"
__status__ = "Production"
__docformat__ = 'reStructuredText'


wav_flag = "float32"  # Could be "int16"
PARAM_TYPE = numpy.float32


def read_wav(input_file_name):
    """
    :param input_file_name: path to the wav file to read
    :return: the signal as float32, its sample rate, and the sample width
    """
    # with wave.open(input_file_name, "r") as wfh:
    #     (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams()
    #     raw = wfh.readframes(nframes * nchannels)
    #     out = struct.unpack_from("%dh" % nframes * nchannels, raw)
    #     sig = numpy.reshape(numpy.array(out), (-1, nchannels)).squeeze()
    #     return sig.astype(numpy.float32), framerate, sampwidth
    nfo = soundfile.info(input_file_name)
    sig, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
    sig = numpy.reshape(numpy.array(sig), (-1, nfo.channels)).squeeze()
    sig = sig.astype(numpy.float32)
    return sig, sample_rate, 4


def hz2mel(f, htk=True):
    """Convert an array of frequencies in Hz into mel.

    :param f: frequency to convert

    :return: the equivalent value on the mel scale.
    """
    if htk:
        return 2595 * numpy.log10(1 + f / 700.)
    else:
        f = numpy.array(f)

        # Mel fn to match Slaney's Auditory Toolbox mfcc.m
        f_0 = 0.
        f_sp = 200. / 3.
        brkfrq = 1000.
        brkpt = (brkfrq - f_0) / f_sp
        logstep = numpy.exp(numpy.log(6.4) / 27)

        linpts = f < brkfrq

        z = numpy.zeros_like(f)
        # fill in parts separately
        z[linpts] = (f[linpts] - f_0) / f_sp
        z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)

        if z.shape == (1,):
            return z[0]
        else:
            return z


def mel2hz(z, htk=True):
    """Convert an array of mel values into Hz.

    :param z: ndarray of mel values to convert to Hz.

    :return: the equivalent values in Hertz.
    """
    if htk:
        return 700. * (10**(z / 2595.) - 1)
    else:
        z = numpy.array(z, dtype=float)
        f_0 = 0
        f_sp = 200. / 3.
        brkfrq = 1000.
        brkpt = (brkfrq - f_0) / f_sp
        logstep = numpy.exp(numpy.log(6.4) / 27)

        linpts = (z < brkpt)

        f = numpy.zeros_like(z)

        # fill in parts separately
        f[linpts] = f_0 + f_sp * z[linpts]
        f[~linpts] = brkfrq * numpy.exp(numpy.log(logstep) * (z[~linpts] - brkpt))

        if f.shape == (1,):
            return f[0]
        else:
            return f
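A quick numerical sanity check of the two conversions above (HTK formula; printed values are approximate):

    from sad_tf.sidekit_mfcc import hz2mel, mel2hz

    m = hz2mel(1000.)        # ~1000.0 mel: the HTK scale pins 1000 Hz near 1000 mel
    f = mel2hz(m)            # ~1000.0 Hz after the round trip
    print(round(m, 2), round(f, 2))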
def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
    """Compute triangular filterbank for cepstral coefficient computation.

    :param fs: sampling frequency of the original signal.
    :param nfft: number of points for the Fourier Transform
    :param lowfreq: lower limit of the frequency band filtered
    :param maxfreq: higher limit of the frequency band filtered
    :param nlinfilt: number of linear filters to use in low frequencies
    :param nlogfilt: number of log-linear filters to use in high frequencies
    :param midfreq: frequency boundary between linear and log-linear filters

    :return: the filter bank and the central frequencies of each filter
    """
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    # ------------------------
    # Compute the filter bank
    # ------------------------
    # Compute start/middle/end points of the triangular filters in spectral
    # domain
    frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
    if nlogfilt == 0:
        linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
    elif nlinfilt == 0:
        low_mel = hz2mel(lowfreq)
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2)
        melsc = (max_mel - low_mel) / (nfilt + 1)
        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences = mel2hz(mels)
    else:
        # Compute linear filters on [0;1000Hz]
        linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
        frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
        # Compute log-linear filters on [1000;maxfreq]
        low_mel = hz2mel(min([1000, maxfreq]))
        max_mel = hz2mel(maxfreq)
        mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
        melsc = (max_mel - low_mel) / (nlogfilt + 1)

        # Verify that mel2hz(melsc) > linsc
        while mel2hz(melsc) < linsc:
            # in this case, we add a linear filter
            nlinfilt += 1
            nlogfilt -= 1
            frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
            low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
            max_mel = hz2mel(maxfreq)
            mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
            melsc = (max_mel - low_mel) / (nlogfilt + 1)

        mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
        # Back to the frequency domain
        frequences[nlinfilt:] = mel2hz(mels)

    heights = 2. / (frequences[2:] - frequences[0:-2])

    # Compute filterbank coeff (in fft domain, in bins)
    fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
    # FFT bins (in Hz)
    n_frequences = numpy.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = frequences[i]
        cen = frequences[i + 1]
        hi = frequences[i + 2]
        # plain int replaces the deprecated numpy.int (removed in NumPy 1.24)
        lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=int)
        left_slope = heights[i] / (cen - low)
        rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1, min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=int)
        right_slope = heights[i] / (hi - cen)
        fbank[i][lid] = left_slope * (n_frequences[lid] - low)
        fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])

    return fbank, frequences


def power_spectrum(input_sig,
                   fs=8000,
                   win_time=0.025,
                   shift=0.01,
                   prefac=0.97):
    """
    Compute the power spectrum of the signal.
    :param input_sig: input signal
    :param fs: sampling frequency
    :param win_time: analysis window length in seconds
    :param shift: window shift in seconds
    :param prefac: pre-emphasis coefficient
    :return: the power spectrum and the per-frame log-energy
    """
    window_length = int(round(win_time * fs))
    overlap = window_length - int(shift * fs)
    framed = framing(input_sig, window_length, win_shift=window_length - overlap).copy()
    # Pre-emphasis filtering is applied after framing to be consistent with stream processing
    framed = pre_emphasis(framed, prefac)
    l = framed.shape[0]
    n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
    # Windowing has been changed to hanning which is supposed to have less noisy sidelobes
    # ham = numpy.hamming(window_length)
    window = numpy.hanning(window_length)

    spec = numpy.ones((l, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
    log_energy = numpy.log((framed**2).sum(axis=1))
    dec = 500000
    start = 0
    stop = min(dec, l)
    while start < l:
        ahan = framed[start:stop, :] * window
        mag = numpy.fft.rfft(ahan, n_fft, axis=-1)
        spec[start:stop, :] = mag.real**2 + mag.imag**2
        start = stop
        stop = min(stop + dec, l)

    return spec, log_energy


def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
    """
    :param sig: input signal, can be mono or multi dimensional
    :param win_size: size of the window in terms of samples
    :param win_shift: shift of the sliding window in terms of samples
    :param context: tuple of left and right context
    :param pad: can be zeros or edge
    """
    dsize = sig.dtype.itemsize
    if sig.ndim == 1:
        sig = sig[:, numpy.newaxis]
    # Manage padding
    c = (context, ) + (sig.ndim - 1) * ((0, 0), )
    _win_size = win_size + sum(context)
    shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
    strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1]))
    if pad == 'zeros':
        return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'constant', constant_values=(0,)),
                                                  shape=shape,
                                                  strides=strides).squeeze()
    elif pad == 'edge':
        return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'edge'),
                                                  shape=shape,
                                                  strides=strides).squeeze()
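framing builds the overlapping analysis windows with numpy stride tricks, so no data is copied until the caller invokes .copy(); a shape sketch (sizes illustrative):

    import numpy
    from sad_tf.sidekit_mfcc import framing

    sig = numpy.zeros(16000, dtype=numpy.float32)   # 1 s at 16 kHz
    frames = framing(sig, win_size=400, win_shift=160)
    print(frames.shape)   # (98, 400): int((16000 - 400) / 160) + 1 windows of 25 ms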
def pre_emphasis(input_sig, pre):
    """Pre-emphasis of an audio signal.
    :param input_sig: the input vector of signal to pre emphasize
    :param pre: value that defines the pre-emphasis filter.
    """
    if input_sig.ndim == 1:
        return (input_sig - numpy.c_[input_sig[numpy.newaxis, :][..., :1],
                                     input_sig[numpy.newaxis, :][..., :-1]].squeeze() * pre)
    else:
        return input_sig - numpy.c_[input_sig[..., :1], input_sig[..., :-1]] * pre


def mfcc(input_sig,
         lowfreq=100, maxfreq=8000,
         nlinfilt=0, nlogfilt=24,
         nwin=0.025,
         fs=16000,
         nceps=13,
         shift=0.01,
         get_spec=False,
         get_mspec=False,
         prefac=0.97):
    """Compute Mel Frequency Cepstral Coefficients.

    :param input_sig: input signal from which the coefficients are computed.
            Input audio is supposed to be RAW PCM 16bits
    :param lowfreq: lower limit of the frequency band filtered.
            Default is 100Hz.
    :param maxfreq: higher limit of the frequency band filtered.
            Default is 8000Hz.
    :param nlinfilt: number of linear filters to use in low frequencies.
            Default is 0.
    :param nlogfilt: number of log-linear filters to use in high frequencies.
            Default is 24.
    :param nwin: length of the sliding window in seconds.
            Default is 0.025.
    :param fs: sampling frequency of the original signal. Default is 16000Hz.
    :param nceps: number of cepstral coefficients to extract.
            Default is 13.
    :param shift: shift between two analyses. Default is 0.01 (10ms).
    :param get_spec: boolean, if true returns the spectrogram
    :param get_mspec: boolean, if true returns the output of the filter banks
    :param prefac: pre-emphasis filter value

    :return: the cepstral coefficients in a ndarray as well as
            the Log-spectrum in the mel-domain in a ndarray.

    .. note:: MFCC are computed as follows:

            - Pre-processing in time-domain (pre-emphasizing)
            - Compute the spectrum amplitude by windowing with a Hanning window
            - Filter the signal in the spectral domain with a triangular filter-bank,
              whose filters are approximately linearly spaced on the mel scale,
              and have equal bandwidth in the mel scale
            - Compute the DCT of the log-spectrum
            - Log-energy is returned as first coefficient of the feature vector.

    For more details, refer to [Davis80]_.
    """
    # Compute power spectrum
    spec, log_energy = power_spectrum(input_sig,
                                      fs,
                                      win_time=nwin,
                                      shift=shift,
                                      prefac=prefac)
    # Filter the spectrum through the triangle filter-bank
    n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
    fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]

    mspec = numpy.log(numpy.dot(spec, fbank.T))  # to be tested with log10 and log
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    # The C0 term is removed as it is the constant term
    # ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]
    lst = list()
    lst.append(None)
    lst.append(log_energy)
    if get_spec:
        lst.append(spec)
    else:
        lst.append(None)
        del spec
    if get_mspec:
        lst.append(mspec)
    else:
        lst.append(None)
        del mspec

    return lst
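A sketch of pulling the log-mel filterbank features this module computes; the file name is an assumption, and `mfcc` returns `[None, log_energy, spec, mspec]` with unrequested slots set to None:

    from sad_tf.sidekit_mfcc import read_wav, mfcc

    sig, sr, _ = read_wav('example.wav')       # expects a 16 kHz mono wav here
    feats = mfcc(sig, fs=sr, get_mspec=True)
    log_energy, mspec = feats[1], feats[3]
    print(mspec.shape)                         # (n_frames, nlogfilt), i.e. 24 bands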
sad_tf/thread_returning.py ADDED @@ -0,0 +1,27 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 27 15:18:49 2018

@author: elechapt
"""

from threading import Thread


class ThreadReturning(Thread):
    """
    Allow us to get the results from a thread
    """
    def __init__(self, *args, **kwargs):
        Thread.__init__(self, *args, **kwargs)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self):
        Thread.join(self)
        return self._return
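A minimal demonstration: unlike a plain threading.Thread, join() hands back the target's return value (the worker function below is illustrative):

    def square(x):
        return x * x

    t = ThreadReturning(target=square, args=[7])
    t.start()
    print(t.join())   # 49, instead of the None a standard Thread.join() returns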
sad_tf/viterbi.py ADDED @@ -0,0 +1,222 @@
#!/usr/bin/env python
# encoding: utf-8

# The MIT License (MIT)

# Copyright (c) 2014-2016 CNRS

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# AUTHORS
# Hervé BREDIN - http://herve.niderb.fr

from __future__ import unicode_literals

import six.moves
import numpy as np
import itertools

VITERBI_CONSTRAINT_NONE = 0
VITERBI_CONSTRAINT_FORBIDDEN = 1
VITERBI_CONSTRAINT_MANDATORY = 2


LOG_ZERO = np.log(1e-200)

# handling 'consecutive' constraints is achieved by duplicating states
# the following functions are here to help in this process


# create new transition prob. matrix accounting for duplicated states.
def _update_transition(transition, consecutive):

    # initialize with LOG_ZERO everywhere
    # except on the +1 diagonal np.log(1)
    new_n_states = np.sum(consecutive)
    new_transition = LOG_ZERO * np.ones((new_n_states, new_n_states))
    for i in range(1, new_n_states):
        new_transition[i - 1, i] = np.log(1)

    n_states = len(consecutive)
    boundary = np.hstack(([0], np.cumsum(consecutive)))
    start = boundary[:-1]
    end = boundary[1:] - 1

    for i, j in itertools.product(six.moves.range(n_states), repeat=2):
        new_transition[end[i], start[j]] = transition[i, j]

    return new_transition


# create new initial prob. matrix accounting for duplicated states.
def _update_initial(initial, consecutive):

    new_n_states = np.sum(consecutive)
    new_initial = LOG_ZERO * np.ones((new_n_states, ))

    n_states = len(consecutive)
    boundary = np.hstack(([0], np.cumsum(consecutive)))
    start = boundary[:-1]

    for i in range(n_states):
        new_initial[start[i]] = initial[i]

    return new_initial


# create new emission prob. matrix accounting for duplicated states.
def _update_emission(emission, consecutive):

    # a list comprehension is used because recent numpy versions reject
    # generators in np.vstack
    return np.vstack(
        [np.tile(e, (c, 1))  # duplicate emission probabilities c times
         for e, c in six.moves.zip(emission.T, consecutive)]
    ).T


# create new constraint matrix accounting for duplicated states
def _update_constraint(constraint, consecutive):

    return np.vstack(
        [np.tile(e, (c, 1))  # duplicate constraint probabilities c times
         for e, c in six.moves.zip(constraint.T, consecutive)]
    ).T


# convert sequence of duplicated states back to sequence of original states.
def _update_states(states, consecutive):

    boundary = np.hstack(([0], np.cumsum(consecutive)))
    start = boundary[:-1]
    end = boundary[1:]

    new_states = np.empty(states.shape)

    for i, (s, e) in enumerate(six.moves.zip(start, end)):
        new_states[np.where((s <= states) & (states < e))] = i

    return new_states


def viterbi_decoding(emission, transition,
                     initial=None, consecutive=None, constraint=None):
    """(Constrained) Viterbi decoding

    Parameters
    ----------
    emission : array of shape (n_samples, n_states)
        E[t, i] is the emission log-probability of sample t at state i.
    transition : array of shape (n_states, n_states)
        T[i, j] is the transition log-probability from state i to state j.
    initial : optional, array of shape (n_states, )
        I[i] is the initial log-probability of state i.
        Defaults to equal log-probabilities.
    consecutive : optional, int or int array of shape (n_states, )
        C[i] is the minimum-consecutive-states constraint for state i.
        C[i] = 1 is equivalent to no constraint (default).
    constraint : optional, array of shape (n_samples, n_states)
        K[t, i] = 1 forbids state i at time t.
        K[t, i] = 2 forces state i at time t.
        Use K[t, i] = 0 for no constraint (default).

    Returns
    -------
    states : array of shape (n_samples, )
        Most probable state sequence

    """

    # ~~ INITIALIZATION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    T, k = emission.shape  # number of observations x number of states

    # no minimum-consecutive-states constraints
    if consecutive is None:
        consecutive = np.ones((k, ), dtype=int)

    # same value for all states
    elif isinstance(consecutive, int):
        consecutive = consecutive * np.ones((k, ), dtype=int)

    # (potentially) different values per state
    else:
        consecutive = np.array(consecutive, dtype=int).reshape((k, ))

    # at least one sample
    consecutive = np.maximum(1, consecutive)

    # balance initial probabilities when they are not provided
    if initial is None:
        initial = np.log(np.ones((k, )) / k)

    # no constraint?
    if constraint is None:
        constraint = VITERBI_CONSTRAINT_NONE * np.ones((T, k))

    # artificially create new states to account for 'consecutive' constraints
    emission = _update_emission(emission, consecutive)
    transition = _update_transition(transition, consecutive)
    initial = _update_initial(initial, consecutive)
    constraint = _update_constraint(constraint, consecutive)
    T, K = emission.shape  # number of observations x number of new states
    states = np.arange(K)  # states 0 to K-1

    # set emission probability to zero for forbidden states
    emission[
        np.where(constraint == VITERBI_CONSTRAINT_FORBIDDEN)] = LOG_ZERO

    # set emission probability to zero for all states but the mandatory one
    for t, k in six.moves.zip(
        *np.where(constraint == VITERBI_CONSTRAINT_MANDATORY)
    ):
        emission[t, states != k] = LOG_ZERO

    # ~~ FORWARD PASS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    V = np.empty((T, K))                # V[t, k] is the probability of the
    V[0, :] = emission[0, :] + initial  # most probable state sequence for the
                                        # first t observations that has k as
                                        # its final state.

    P = np.empty((T, K), dtype=int)  # P[t, k] remembers which state was used
    P[0, :] = states                 # to get from time t-1 to time t at
                                     # state k

    for t in range(1, T):

        # tmp[k, k'] is the probability of the most probable path
        # leading to state k at time t - 1, plus the probability of
        # transitioning from state k to state k' (at time t)
        tmp = (V[t - 1, :] + transition.T).T

        # optimal path to state k at t comes from state P[t, k] at t - 1
        # (find among all possible states at this time t)
        P[t, :] = np.argmax(tmp, axis=0)

        # update V for time t
        V[t, :] = emission[t, :] + tmp[P[t, :], states]

    # ~~ BACK-TRACKING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    X = np.empty((T,), dtype=int)
    X[-1] = np.argmax(V[-1, :])
    for t in range(1, T):
        X[-(t + 1)] = P[-t, X[-t]]

    # ~~ CONVERT BACK TO ORIGINAL STATES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    return _update_states(X, consecutive)
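A small worked example of the `consecutive` constraint (all numbers illustrative): without it, a one-frame blip of state 1 survives the decoding; with `consecutive=2`, state 1 must be held at least two frames, so the blip becomes more expensive than staying in state 0 and is flattened.

    import numpy as np
    from sad_tf.viterbi import viterbi_decoding

    emission = np.log(np.array([[.9, .1],
                                [.9, .1],
                                [.2, .8],
                                [.9, .1]]))
    transition = np.log(.5 * np.ones((2, 2)))      # uniform transitions

    print(viterbi_decoding(emission, transition))                 # [0. 0. 1. 0.]
    print(viterbi_decoding(emission, transition, consecutive=2))  # [0. 0. 0. 0.]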
sad_tf/viterbi_utils.py ADDED @@ -0,0 +1,49 @@
#!/usr/bin/env python
# encoding: utf-8

# The MIT License

# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import numpy as np


def pred2logemission(pred, eps=1e-10):
    pred = np.array(pred)
    ret = np.ones((len(pred), 2)) * eps
    ret[pred == 0, 0] = 1 - eps
    ret[pred == 1, 1] = 1 - eps
    return np.log(ret)


def log_trans_exp(exp, cost0=0, cost1=0):
    # transition cost is assumed to be 10**-exp
    cost = -exp * np.log(10)
    ret = np.ones((2, 2)) * cost
    ret[0, 0] = cost0
    ret[1, 1] = cost1
    return ret


def diag_trans_exp(exp, dim):
    cost = -exp * np.log(10)
    ret = np.ones((dim, dim)) * cost
    for i in range(dim):
        ret[i, i] = 0
    return ret
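Together with viterbi.py, these helpers smooth a frame-level binary decision; a sketch with an illustrative prediction vector and an illustrative switching penalty of 10**-6:

    from sad_tf.viterbi import viterbi_decoding
    from sad_tf.viterbi_utils import pred2logemission, log_trans_exp

    pred = [0, 0, 1, 0, 1, 1, 1, 0, 0]          # raw per-frame 0/1 decisions
    smoothed = viterbi_decoding(pred2logemission(pred), log_trans_exp(6))
    print(smoothed)                             # the 0-1-0 flicker around frame 2 is flattened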