import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
from collections import defaultdict
from dtw import dtw
from sklearn_extra.cluster import KMedoids
from copy import deepcopy
import os
import librosa
import json
# Based on the original implementation at
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
# by Magnús Freyr Morthens 2023, supported by Rannís NSN.

# Inputs this will need:
# - the whole sentence text as (index, word) pairs
# - the indices of the units the user wants
# - the human meta db of all human recordings
# - the TTS dir, plus the human wav + alignment + f0 dirs
# - the list of TTS voices
# - an actual wav file for each human recording, probably
# - params like: use f0, use rmse, (use dur), [.....]
# .. check what I wrote elsewhere about this.
# (see the hedged sketch just below for one way these inputs might be bundled)
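# A hedged sketch of how those inputs might be bundled into a single object;
# every name here is a placeholder for illustration, not a decided interface.
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class PipelineConfig:
    sentence: List[Tuple[int, str]]       # (index, word) pairs for the whole sentence
    unit_indices: List[Tuple[int, int]]   # (first_word_idx, last_word_idx) per requested unit
    human_meta_db: str                    # path to the meta db of human recordings
    human_wav_dir: str
    human_align_dir: str
    human_f0_dir: str
    tts_dir: str
    tts_voices: List[str] = field(default_factory=list)
    use_f0: bool = True
    use_rmse: bool = True
    use_dur: bool = False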
def z_score(x, mean, std):
    return (x - mean) / std
# TODO ADJUST
# new input will be one Meta db
# output should probably be the same, e.g.
# {'013823-0457777': [('hvaða', 0.89, 1.35),
#                     ('sjúkdómar', 1.35, 2.17),
#                     ('geta', 2.17, 2.4),
#                     ('fylgt', 2.4, 2.83),
#                     ('óbeinum', 2.83, 3.29),
#                     ('reykingum', 3.29, 3.9)],
#  '014226-0508808': [('hvaða', 1.03, 1.45),
#                     ('sjúkdómar', 1.45, 2.28),
#                     ('geta', 2.41, 2.7),
#                     ('fylgt', 2.7, 3.09),
#                     ('óbeinum', 3.09, 3.74),
#                     ('reykingum', 3.74, 4.42)],
#  '013726-0843679': [('hvaða', 0.87, 1.14),
#                     ('sjúkdómar', 1.14, 1.75),
#                     ('geta', 1.75, 1.96),
#                     ('fylgt', 1.96, 2.27),
#                     ('óbeinum', 2.27, 2.73),
#                     ('reykingum', 2.73, 3.27)] }
def get_word_aligns(sentences, directory):
    """
    Returns a dictionary of word alignments for the given sentences.
    """
    word_aligns = defaultdict(list)
    for sentence in sentences:
        print(sentence)
        slist = sentence.split(" ")
        for filename in os.listdir(directory):
            fpath = os.path.join(directory, filename)
            with open(fpath) as f:
                lines = f.read().splitlines()[1:]
            lines = [line.split(",") for line in lines]  # columns: start, end, word
            # keep this recording if its alignment starts with exactly the sentence's words
            if len(lines) >= len(slist) and all(lines[i][2] == w for i, w in enumerate(slist)):
                id = filename.replace(".csv", "")
                word_al = [(lines[j][2], float(lines[j][0]), float(lines[j][1])) for j in range(len(slist))]
                # word_aligns[id].append(word_al) # If one speaker has multiple sentences
                word_aligns[id] = word_al
            if len(word_aligns) >= 10 * len(sentences): break
    return word_aligns
# TODO ADJUST
# or, to be honest, it is possibly fine as is.
# The question is what file format it is reading:
# either adjust my f0 file format or adjust this a little.
def get_pitches(start_time, end_time, id, path):
    """
    Returns an array of z-scored pitch values for a given segment of speech.
    """
    f = os.path.join(path, id + ".f0")
    with open(f) as f:
        lines = f.read().splitlines()[7:]
    lines = [[float(x) for x in line.split()] for line in lines]  # split lines into floats
    pitches = []
    # stats over all voiced frames (pitch == -1 marks unvoiced) in the whole sentence
    voiced = [line[2] for line in lines if line[2] != -1]
    mean = np.mean(voiced)
    std = np.std(voiced)
    fifth_percentile = np.percentile(voiced, 5)
    ninetyfifth_percentile = np.percentile(voiced, 95)
    for line in lines:
        time, is_pitch, pitch = line
        if start_time <= time <= end_time:
            if is_pitch:
                # clamp outliers to the 5th/95th percentile before z-scoring
                if fifth_percentile <= pitch <= ninetyfifth_percentile:
                    pitches.append(z_score(pitch, mean, std))
                elif pitch < fifth_percentile:
                    pitches.append(z_score(fifth_percentile, mean, std))
                else:
                    pitches.append(z_score(ninetyfifth_percentile, mean, std))
            else:
                # unvoiced frames get the floor value
                pitches.append(z_score(fifth_percentile, mean, std))
    return pitches
# TODO adjust
# probably mainly for the assumption about the file path,
# but also: actually understand what it is doing.
def get_rmse(start_time, end_time, id, path, pitch_len):
    """
    Returns an array of RMSE values for a given segment of speech,
    resampled to the same number of frames as the pitch track.
    """
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    rmse = librosa.feature.rms(y=segment)
    rmse = rmse[0]
    # resample the RMSE track to pitch_len frames so the features line up
    idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
    return rmse[idx]
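# get_data below also calls get_spectral_centroids, which is not defined in this
# file. A minimal sketch of what it presumably looks like, mirroring get_rmse and
# assuming librosa.feature.spectral_centroid; treat this as a placeholder, not the
# original implementation.
def get_spectral_centroids(start_time, end_time, id, path, pitch_len):
    """
    Returns spectral-centroid values for a given segment of speech,
    resampled to pitch_len frames (sketch).
    """
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    centroids = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
    # resample to pitch_len frames, same as get_rmse
    idx = np.round(np.linspace(0, len(centroids) - 1, pitch_len)).astype(int)
    return centroids[idx]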
temp_start_end_word_pairs = [
    [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
    [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
]
# TODO !!!!!!!!!!!!!########
# Make it take any list of (first_word, last_word) or (word) units
# and do the extraction for those units.
# Make it work if the sentence contains the same word twice.
# That PROBABLY means the sentence has to be displayed to the user
# with the words numbered, and the user has to input word indices
# (see the index-based sketch after get_data below).
def get_data(word_aligns, start_end_word_pairs):
    """
    Returns a dictionary of pitch, RMSE, and spectral-centroid values
    for the given sentence/word combinations.
    """
    data = defaultdict(list)
    f0_dir = "aligned-reaper/samromur-queries/f0/"
    wav_dir = "aligned-reaper/samromur-queries/wav/"
    for id, word_al in word_aligns.items():
        for sent in start_end_word_pairs:
            for word_combs in sent:
                start, end = word_combs[0], word_combs[-1]
                # only use this recording if it contains both the first and last word of the unit
                if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
                    start_time = [al[1] for al in word_al if al[0] == start][0]
                    end_time = [al[2] for al in word_al if al[0] == end][0]
                    pitches = get_pitches(start_time, end_time, id, f0_dir)
                    rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
                    spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
                    pitches_cpy = np.array(deepcopy(pitches))
                    rmses_cpy = np.array(deepcopy(rmses))
                    # one feature vector [pitch, rmse, centroid] per frame
                    d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
                    words = "-".join(word_combs)
                    data[f"{words}-{id}"] = d
    return data
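# Re the TODO above about duplicate words: selecting by word index instead of
# word text sidesteps the ambiguity. A hypothetical helper (not wired into
# get_data yet), assuming a unit is given as (first_word_idx, last_word_idx)
# into the aligned word list:
def get_unit_times_by_index(word_al, first_idx, last_idx):
    """
    word_al: list of (word, start, end) tuples in sentence order.
    Returns (start_time, end_time) for the unit spanning first_idx..last_idx.
    """
    start_time = word_al[first_idx][1]
    end_time = word_al[last_idx][2]
    return start_time, end_time

# e.g. get_unit_times_by_index(word_aligns['013823-0457777'], 0, 1)
# -> the times for 'hvaða sjúkdómar' even if either word also occurs later on.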
# output -
# {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
#  'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
#  'hvaða-sjúkdómar-013726-0843679': [[],[]] }
# i.e. a flat dict whose keys are unique speaker-and-unit tokens; each entry is a
# list of length n_timepoints, with one feature vector per timepoint (for me up to 2 feats, not 3).
# up to here was forming the data
# -----------------------------------------------------
# from here down is probably clustering it
# TODO I have no idea how necessary this will be at all
def dtw_distance(x, y):
    """
    Returns the normalized DTW distance between two feature sequences.
    """
    alignment = dtw(x, y, keep_internals=True)
    return alignment.normalizedDistance
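# A quick sanity check of dtw_distance on two tiny toy sequences (illustrative
# only; the real inputs are the per-frame [pitch, rmse, centroid] vectors built
# in get_data, and the values below are made up):
_toy_a = [[0.0, 0.1], [0.5, 0.2], [1.0, 0.3]]
_toy_b = [[0.1, 0.1], [0.6, 0.2], [0.9, 0.25], [1.1, 0.3]]
_toy_dist = dtw_distance(_toy_a, _toy_b)  # small non-negative float; 0.0 only for identical sequences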
# TODO not sure yet, but this looks pretty good.
# HOWEVER, consider excluding the 0.0 self-comparisons,
# or check whether something later takes care of them
# (a hedged filter sketch follows the format example below).
# DTW distance between every pair of speakers for the same unit
dtw_dists = defaultdict(list)
for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]
    id1, id2 = d[-2], d[-1]
    for key2, value2 in data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        if words1 == words2:
            dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
# dtw_dists ends up as a dict from units to lists of (speaker-pair, distance) tuples, e.g.
# {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
#                      ('013823-0457777_013698-0441666', 0.5999433281203399),
#                      ('013823-0457777_014675-0563760', 0.4695447105594414),
#                      ('014226-0508808_013823-0457777', 0.44080874425223393),
#                      ('014226-0508808_014226-0508808', 0.0),
#                      ('014226-0508808_013726-0843679', 0.5599404672667414),
#                      ('014226-0508808_013681-0442313', 0.6871330752342419)] }
# note that the 0.0 self-comparisons are currently still present here.
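# If those self-comparisons turn out to be unwanted, one hedged option is to
# filter them into a separate dict (not used downstream yet); this assumes the
# "idA-idB_idC-idD" key format built above:
dtw_dists_no_self = {
    words: [(ids, dist) for ids, dist in pairs if ids.split("_")[0] != ids.split("_")[1]]
    for words, pairs in dtw_dists.items()
}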
# TODO
# a) do I need this?
# b) make n_clusters a param with default 3
def kmedoids_clustering(X):
    kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X)
    y_km = kmedoids.labels_
    return y_km, kmedoids
# TODO !!!!!!!!!!!! #########
# THIS IS PROBABLY THE MAIN THING.
# It could use some restructuring; e.g. something earlier could already
# produce the ids_dist2 format.
# Also triple-check what kind of distance matrix is supposed to go into X,
# and what it currently is. I think it may be right, and self-organising,
# which would explain why it keeps the 0s and has symmetric duplicates of everything.
# HOWEVER, the hard-coded 10 should possibly be replaced with an n_speakers param
# (a precomputed-distance sketch follows the clustering loop below).
# Btw, since clustering strictly operates on X, once any duration feature is
# reduced to pairwise distances it no longer matters that duration and
# pitch/energy had different dimensionality.
# In fact, should I run DTW on the 3 feats (pitch/energy/dur) separately and
# cluster on a 3-dim distance matrix? Or can you not give it distances in a
# multi-dimensional space, because distance doesn't work that way - in which
# case I could still average the 3 distances into one, although..
kmedoids_cluster_dists = defaultdict(list)
for words, datas in dtw_dists.items():
    ids_dist = {d[0]: d[1] for d in datas}
    # collect, per speaker, that speaker's distances to everyone else (in order)
    ids_dist2 = defaultdict(list)
    for d in datas:
        id1, id2 = d[0].split("_")
        ids_dist2[id1].append(d[1])
    # reshape the flat distance list into one row of 10 distances per speaker
    # (10 = number of speakers; see the TODO above about an n_speakers param)
    X = [d[1] for d in datas]
    X = [X[i:i+10] for i in range(0, len(X), 10)]
    X = np.array(X)
    y_km, kmedoids = kmedoids_clustering(X)
    plot_clusters(X, y_km, words)  # plot_clusters is defined at the bottom of this file
    c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
    # sort the rows by cluster label and map each row back to its speaker id
    result = zip(X, kmedoids.labels_)
    sortedR = sorted(result, key=lambda x: x[1])
    for dp in sortedR:
        arr, label = dp
        ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
        if ids is None:
            print("ID is none")
            continue
        kmedoids_cluster_dists[words].append((label, ids, arr))
# TODO probably remember to make this RETURN kmedoids_cluster_dists ..
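# Re the distance-matrix question above: if X really is a full pairwise DTW
# distance matrix, sklearn_extra's KMedoids can be told to treat it as
# precomputed rather than as raw feature rows. A hedged sketch of that variant
# (an assumption about the intent, not the current code path):
def kmedoids_on_precomputed(dist_matrix, n_clusters=3):
    """dist_matrix: square symmetric (n_speakers x n_speakers) array of DTW distances."""
    km = KMedoids(n_clusters=n_clusters, metric="precomputed", random_state=0).fit(dist_matrix)
    return km.labels_, km

# e.g. labels, km = kmedoids_on_precomputed(X)  # X as built in the loop above (10 x 10 per unit)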
# ###############
# TTS and misc ------------------
#
# TODO rename this get_audio_part
# Also maybe take the tmp wav-making out of reaper and put it somewhere general,
# so everything gets a wav.
# TODO do NOT specify the SR,
# and CHECK that everything that depends on this is ok with an arbitrary SR.
def get_audio(start_time, end_time, id, path):
    """
    Returns the audio segment between start_time and end_time for a given recording.
    """
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    return segment
# See near the end of the notebook for a very nice way to grab timespans of TTS
# audio (or just the start/end timestamps to mark them) from the alignment json,
# based on word position index -
# so probably really do show the user the sentence with each word numbered.
# TODO the speech_marks.json is NOT EXACTLY what you get from Tiro,
# but I don't remember how it differs, so.
alfur_sents = speech_marks_data["Alfur"] | |
with open("speech_marks.json") as f: | |
speech_marks_data = json.load(f) | |
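# A hedged sketch of pulling (start, end) times for a word-index span out of one
# of those entries. The field names ("time", "value") follow the common
# speech-marks convention (millisecond offsets per word), but the actual
# Tiro/Alfur format may differ, so treat this purely as a placeholder:
def get_tts_word_span(marks, first_idx, last_idx):
    """marks: list of per-word dicts in sentence order, with 'time' in milliseconds."""
    start_s = marks[first_idx]["time"] / 1000.0
    # if there is a following word, use its onset as the end; otherwise leave it open
    end_s = marks[last_idx + 1]["time"] / 1000.0 if last_idx + 1 < len(marks) else None
    return start_s, end_s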
# TODO there IS something for making tts_data,
# but I'm probably pretty much on my own for that.
# TODO this next part is very helpful,
# but mind any dictionaries I adjusted earlier.
# DTW distance from each human speaker to the TTS rendering of the same unit
speaker_to_tts_dtw_dists = defaultdict(list)
for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]
    id1, id2 = d[-2], d[-1]
    for key2, value2 in tts_data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        if words1 == words2:
            speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
# TODO I think this is also great,
# but figure out how it works,
# because of the dict format and so on;
# make it work keyed by word index instead of word text, ***********
# and for 1-word or 3+-word units...
# for each cluster of human speakers, collect the TTS-to-speaker distances
tts_dist_to_cluster = defaultdict(list)
for words1, datas1 in kmedoids_cluster_dists.items():
    for d1 in datas1:
        cluster, sp_id1, arr = d1
        for words2, datas2 in speaker_to_tts_dtw_dists.items():
            for d2 in datas2:
                ids, dist = d2
                sp_id2, tts_alfur = ids.split("_")
                if sp_id1 == sp_id2 and words1 == words2:
                    tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)

# mean TTS distance per (unit, cluster)
tts_mean_dist_to_cluster = {
    key: np.mean(value) for key, value in tts_dist_to_cluster.items()
}
# THEN there is -
# "Plot pitch, rmse, and spectral centroid for each word combination for each speaker"
# - this is one person-token per graph and has a word-division line; not sure it works for >2 words.
#   It might be good to do this for TTS at least.
# "Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)"
# - multiple speakers (one cluster) per graph; this will be good to show, with TTS on top
#   (a hedged per-cluster overlay sketch is at the very end of this file).
#   I may want to recentre it around the word boundary, at least if there are only 2 words;
#   well, I could just pick: it will be centred around the 1st word boundary, and good luck if there are more.
# - the same as above, but RMSE.
# Go all the way to the bottom (of the notebook) to see graphs with a TTS line added onto one cluster.
# PLOTTING IS GOING TO BE A WHOLE NIGHTMARE
# that is just too bad
def plot_clusters(X, y, word):
    u_labels = np.unique(y)
    # plot the results
    for i in u_labels:
        plt.scatter(X[y == i, 0], X[y == i, 1], label=i)
    plt.title(word)
    plt.legend()
    plt.show()
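# Re the plotting notes above: a hedged sketch of the "one cluster per graph,
# TTS on top" pitch plot. It assumes a dict of per-speaker pitch contours for one
# unit plus one TTS contour (all placeholder names); the real wiring from
# get_data / kmedoids_cluster_dists still has to be decided.
def plot_cluster_pitch_overlay(speaker_pitches, tts_pitch, title):
    """
    speaker_pitches: {speaker_id: list of z-scored pitch values} for one cluster.
    tts_pitch: list of z-scored pitch values for the TTS voice, same unit.
    """
    for spk, contour in speaker_pitches.items():
        plt.plot(contour, color="grey", alpha=0.5, label=spk)
    plt.plot(tts_pitch, color="red", linewidth=2, label="TTS")
    plt.title(title)
    plt.xlabel("frame")
    plt.ylabel("pitch (z-score)")
    plt.legend()
    plt.show()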