catiR committed on
Commit
f484865
·
1 Parent(s): cfb1726
Files changed (2) hide show
  1. scripts/clusterprosody.py +13 -8
  2. scripts/runSQ.py +1 -1
scripts/clusterprosody.py CHANGED
@@ -6,6 +6,7 @@ import soundfile as sf
6
  from collections import defaultdict
7
  from dtw import dtw
8
  from sklearn_extra.cluster import KMedoids
 
9
  from copy import deepcopy
10
  import os, librosa, json
11
 
@@ -55,16 +56,16 @@ def get_pitches(start_time, end_time, fpath):
55
 
56
 
57
  # find the mean of all pitches in the whole sentence
58
- mean = np.mean([line[1] for line in lines if line[2] != -1])
59
  # find the std of all pitches in the whole sentence
60
- std = np.std([line[1] for line in lines if line[2] != -1])
61
 
62
 
63
  for line in lines:
64
  time, pitch, is_pitch = line
65
 
66
  if start_time <= time <= end_time:
67
- if is_pitch:
68
  pitches.append(z_score(pitch, mean, std))
69
  else:
70
  #pitches.append(z_score(fifth_percentile, mean, std))
@@ -83,17 +84,21 @@ def get_pitches(start_time, end_time, fpath):
83
  # TODO: implement that. ?
84
  # not sure librosa provides hamming window in rms function directly
85
  # TODO handle audio that not originally .wav
86
- def get_rmse(start_time, end_time, wpath):
87
  """
88
  Returns an array of RMSE values for a given speech.
89
  """
90
 
91
  audio, sr = librosa.load(wpath, sr=16000)
92
- segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
93
- rmse = librosa.feature.rms(y=segment,frame_length=480,hop_length=80)#librosa.feature.rms(y=segment)
94
- rmse = rmse[0]
 
 
 
 
95
  #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
96
- return rmse#[idx]
97
 
98
 
99
  # may be unnecessary depending how rmse and pitch window/hop are calculated already
 
6
  from collections import defaultdict
7
  from dtw import dtw
8
  from sklearn_extra.cluster import KMedoids
9
+ from scipy import stats
10
  from copy import deepcopy
11
  import os, librosa, json
12
 
 
56
 
57
 
58
  # find the mean of all pitches in the whole sentence
59
+ mean = np.mean([line[1] for line in lines if line[2] == 1])
60
  # find the std of all pitches in the whole sentence
61
+ std = np.std([line[1] for line in lines if line[2] == 1])
62
 
63
 
64
  for line in lines:
65
  time, pitch, is_pitch = line
66
 
67
  if start_time <= time <= end_time:
68
+ if is_pitch == 1:
69
  pitches.append(z_score(pitch, mean, std))
70
  else:
71
  #pitches.append(z_score(fifth_percentile, mean, std))
 
84
  # TODO: implement that. ?
85
  # not sure librosa provides hamming window in rms function directly
86
  # TODO handle audio that not originally .wav
87
+ def get_rmse(start_time, end_time, wpath, znorm = True):
88
  """
89
  Returns an array of RMSE values for a given speech.
90
  """
91
 
92
  audio, sr = librosa.load(wpath, sr=16000)
93
+ hop = 80
94
+ #segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
95
+ rmse = librosa.feature.rms(y=audio,frame_length=480,hop_length=hop)
96
+ rmse = rmse[0]
97
+ if znorm:
98
+ rmse = stats.zscore(rmse)
99
+ segment = rmse[int(np.floor(start_time * sr/hop)):int(np.ceil(end_time * sr/hop))]
100
  #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
101
+ return segment#[idx]
102
 
103
 
104
  # may be unnecessary depending how rmse and pitch window/hop are calculated already
scripts/runSQ.py CHANGED
@@ -286,7 +286,7 @@ def localtest():
286
 
287
 
288
 
289
- #localtest()
290
  # torch matplotlib librosa sklearn_extra pydub
291
  # env pclustr
292
 
 
286
 
287
 
288
 
289
+ localtest()
290
  # torch matplotlib librosa sklearn_extra pydub
291
  # env pclustr
292