catiR
committed on
Commit
·
f484865
1
Parent(s):
cfb1726
normalise
Browse files
- scripts/clusterprosody.py +13 -8
- scripts/runSQ.py +1 -1
scripts/clusterprosody.py
CHANGED
@@ -6,6 +6,7 @@ import soundfile as sf
|
|
6 |
from collections import defaultdict
|
7 |
from dtw import dtw
|
8 |
from sklearn_extra.cluster import KMedoids
|
|
|
9 |
from copy import deepcopy
|
10 |
import os, librosa, json
|
11 |
|
@@ -55,16 +56,16 @@ def get_pitches(start_time, end_time, fpath):
|
|
55 |
|
56 |
|
57 |
# find the mean of all pitches in the whole sentence
|
58 |
-
mean = np.mean([line[1] for line in lines if line[2]
|
59 |
# find the std of all pitches in the whole sentence
|
60 |
-
std = np.std([line[1] for line in lines if line[2]
|
61 |
|
62 |
|
63 |
for line in lines:
|
64 |
time, pitch, is_pitch = line
|
65 |
|
66 |
if start_time <= time <= end_time:
|
67 |
-
if is_pitch:
|
68 |
pitches.append(z_score(pitch, mean, std))
|
69 |
else:
|
70 |
#pitches.append(z_score(fifth_percentile, mean, std))
|
@@ -83,17 +84,21 @@ def get_pitches(start_time, end_time, fpath):
|
|
83 |
# TODO: implement that. ?
|
84 |
# not sure librosa provides hamming window in rms function directly
|
85 |
# TODO handle audio that not originally .wav
|
86 |
-
def get_rmse(start_time, end_time, wpath):
|
87 |
"""
|
88 |
Returns an array of RMSE values for a given speech.
|
89 |
"""
|
90 |
|
91 |
audio, sr = librosa.load(wpath, sr=16000)
|
92 |
-
|
93 |
-
|
94 |
-
rmse =
|
|
|
|
|
|
|
|
|
95 |
#idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
|
96 |
-
return
|
97 |
|
98 |
|
99 |
# may be unnecessary depending how rmse and pitch window/hop are calculated already
|
|
|
6 |
from collections import defaultdict
|
7 |
from dtw import dtw
|
8 |
from sklearn_extra.cluster import KMedoids
|
9 |
+
from scipy import stats
|
10 |
from copy import deepcopy
|
11 |
import os, librosa, json
|
12 |
|
|
|
56 |
|
57 |
|
58 |
# find the mean of all pitches in the whole sentence
|
59 |
+
mean = np.mean([line[1] for line in lines if line[2] == 1])
|
60 |
# find the std of all pitches in the whole sentence
|
61 |
+
std = np.std([line[1] for line in lines if line[2] == 1])
|
62 |
|
63 |
|
64 |
for line in lines:
|
65 |
time, pitch, is_pitch = line
|
66 |
|
67 |
if start_time <= time <= end_time:
|
68 |
+
if is_pitch == 1:
|
69 |
pitches.append(z_score(pitch, mean, std))
|
70 |
else:
|
71 |
#pitches.append(z_score(fifth_percentile, mean, std))
|
|
|
84 |
# TODO: implement that. ?
|
85 |
# not sure librosa provides hamming window in rms function directly
|
86 |
# TODO handle audio that not originally .wav
|
87 |
+
def get_rmse(start_time, end_time, wpath, znorm = True):
|
88 |
"""
|
89 |
Returns an array of RMSE values for a given speech.
|
90 |
"""
|
91 |
|
92 |
audio, sr = librosa.load(wpath, sr=16000)
|
93 |
+
hop = 80
|
94 |
+
#segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
|
95 |
+
rmse = librosa.feature.rms(y=audio,frame_length=480,hop_length=hop)
|
96 |
+
rmse = rmse[0]
|
97 |
+
if znorm:
|
98 |
+
rmse = stats.zscore(rmse)
|
99 |
+
segment = rmse[int(np.floor(start_time * sr/hop)):int(np.ceil(end_time * sr/hop))]
|
100 |
#idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
|
101 |
+
return segment#[idx]
|
102 |
|
103 |
|
104 |
# may be unnecessary depending how rmse and pitch window/hop are calculated already
|
scripts/runSQ.py
CHANGED
@@ -286,7 +286,7 @@ def localtest():
|
|
286 |
|
287 |
|
288 |
|
289 |
-
|
290 |
# torch matplotlib librosa sklearn_extra pydub
|
291 |
# env pclustr
|
292 |
|
|
|
286 |
|
287 |
|
288 |
|
289 |
+
localtest()
|
290 |
# torch matplotlib librosa sklearn_extra pydub
|
291 |
# env pclustr
|
292 |
|