catiR
committed on
Commit
·
f484865
1
Parent(s):
cfb1726
normalise
Browse files
- scripts/clusterprosody.py +13 -8
- scripts/runSQ.py +1 -1
scripts/clusterprosody.py
CHANGED
|
@@ -6,6 +6,7 @@ import soundfile as sf
|
|
| 6 |
from collections import defaultdict
|
| 7 |
from dtw import dtw
|
| 8 |
from sklearn_extra.cluster import KMedoids
|
|
|
|
| 9 |
from copy import deepcopy
|
| 10 |
import os, librosa, json
|
| 11 |
|
|
@@ -55,16 +56,16 @@ def get_pitches(start_time, end_time, fpath):
|
|
| 55 |
|
| 56 |
|
| 57 |
# find the mean of all pitches in the whole sentence
|
| 58 |
-
mean = np.mean([line[1] for line in lines if line[2]
|
| 59 |
# find the std of all pitches in the whole sentence
|
| 60 |
-
std = np.std([line[1] for line in lines if line[2]
|
| 61 |
|
| 62 |
|
| 63 |
for line in lines:
|
| 64 |
time, pitch, is_pitch = line
|
| 65 |
|
| 66 |
if start_time <= time <= end_time:
|
| 67 |
-
if is_pitch:
|
| 68 |
pitches.append(z_score(pitch, mean, std))
|
| 69 |
else:
|
| 70 |
#pitches.append(z_score(fifth_percentile, mean, std))
|
|
@@ -83,17 +84,21 @@ def get_pitches(start_time, end_time, fpath):
|
|
| 83 |
# TODO: implement that. ?
|
| 84 |
# not sure librosa provides hamming window in rms function directly
|
| 85 |
# TODO handle audio that not originally .wav
|
| 86 |
-
def get_rmse(start_time, end_time, wpath):
|
| 87 |
"""
|
| 88 |
Returns an array of RMSE values for a given speech.
|
| 89 |
"""
|
| 90 |
|
| 91 |
audio, sr = librosa.load(wpath, sr=16000)
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
rmse =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
#idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
|
| 96 |
-
return
|
| 97 |
|
| 98 |
|
| 99 |
# may be unnecessary depending how rmse and pitch window/hop are calculated already
|
|
|
|
| 6 |
from collections import defaultdict
|
| 7 |
from dtw import dtw
|
| 8 |
from sklearn_extra.cluster import KMedoids
|
| 9 |
+
from scipy import stats
|
| 10 |
from copy import deepcopy
|
| 11 |
import os, librosa, json
|
| 12 |
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
# find the mean of all pitches in the whole sentence
|
| 59 |
+
mean = np.mean([line[1] for line in lines if line[2] == 1])
|
| 60 |
# find the std of all pitches in the whole sentence
|
| 61 |
+
std = np.std([line[1] for line in lines if line[2] == 1])
|
| 62 |
|
| 63 |
|
| 64 |
for line in lines:
|
| 65 |
time, pitch, is_pitch = line
|
| 66 |
|
| 67 |
if start_time <= time <= end_time:
|
| 68 |
+
if is_pitch == 1:
|
| 69 |
pitches.append(z_score(pitch, mean, std))
|
| 70 |
else:
|
| 71 |
#pitches.append(z_score(fifth_percentile, mean, std))
|
|
|
|
| 84 |
# TODO: implement that. ?
|
| 85 |
# not sure librosa provides hamming window in rms function directly
|
| 86 |
# TODO handle audio that not originally .wav
|
| 87 |
+
def get_rmse(start_time, end_time, wpath, znorm = True):
|
| 88 |
"""
|
| 89 |
Returns an array of RMSE values for a given speech.
|
| 90 |
"""
|
| 91 |
|
| 92 |
audio, sr = librosa.load(wpath, sr=16000)
|
| 93 |
+
hop = 80
|
| 94 |
+
#segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
|
| 95 |
+
rmse = librosa.feature.rms(y=audio,frame_length=480,hop_length=hop)
|
| 96 |
+
rmse = rmse[0]
|
| 97 |
+
if znorm:
|
| 98 |
+
rmse = stats.zscore(rmse)
|
| 99 |
+
segment = rmse[int(np.floor(start_time * sr/hop)):int(np.ceil(end_time * sr/hop))]
|
| 100 |
#idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
|
| 101 |
+
return segment#[idx]
|
| 102 |
|
| 103 |
|
| 104 |
# may be unnecessary depending how rmse and pitch window/hop are calculated already
|
scripts/runSQ.py
CHANGED
|
@@ -286,7 +286,7 @@ def localtest():
|
|
| 286 |
|
| 287 |
|
| 288 |
|
| 289 |
-
|
| 290 |
# torch matplotlib librosa sklearn_extra pydub
|
| 291 |
# env pclustr
|
| 292 |
|
|
|
|
| 286 |
|
| 287 |
|
| 288 |
|
| 289 |
+
localtest()
|
| 290 |
# torch matplotlib librosa sklearn_extra pydub
|
| 291 |
# env pclustr
|
| 292 |
|