import json
import os
from collections import defaultdict
from copy import deepcopy

import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from dtw import dtw
from sklearn_extra.cluster import KMedoids

# Based on the original implementation by Magnús Freyr Morthens (2023),
# supported by Rannís NSN:
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73

# Will need:
#   - the whole sentence text (index, word) pairs
#   - the indices of units the user wants
#   - human meta db of all human recordings
#   - tts dir, human wav + align + f0 dirs
#   - list of tts voices
#   - an actual wav file for each human rec, probably
#   - params like: use f0, use rmse, (use dur), [.....]
# .. check what i wrote anywhere abt this.


def z_score(x, mean, std):
    """Standardize x against the given mean and standard deviation."""
    return (x - mean) / std


# TODO ADJUST
# New input will be one Meta db.
# Output should probably stay the same shape, e.g.
# {'013823-0457777': [('hvaða', 0.89, 1.35),
#                     ('sjúkdómar', 1.35, 2.17),
#                     ('geta', 2.17, 2.4),
#                     ('fylgt', 2.4, 2.83),
#                     ('óbeinum', 2.83, 3.29),
#                     ('reykingum', 3.29, 3.9)],
#  '014226-0508808': [('hvaða', 1.03, 1.45), ...],
#  '013726-0843679': [('hvaða', 0.87, 1.14), ...]}


def get_word_aligns(sentences, directory):
    """
    Return word alignments for each sentence found in the alignment CSVs.

    Scans every file in `directory` (one CSV per recording: a header line,
    then `start,end,word` rows) for recordings whose leading word column
    matches the words of a sentence, and maps the recording id (filename
    without ".csv") to a list of (word, start_time, end_time) tuples.

    Parameters
    ----------
    sentences : iterable of str
        Space-separated sentences to look for.
    directory : str
        Directory containing per-recording alignment CSV files.

    Returns
    -------
    defaultdict(list)
        Mapping recording id -> [(word, start, end), ...].
    """
    word_aligns = defaultdict(list)
    for sentence in sentences:
        print(sentence)
        words = sentence.split(" ")
        for filename in os.listdir(directory):
            csv_path = os.path.join(directory, filename)
            # NOTE: original did `with open(f) as f`, shadowing the path
            # variable with the file handle; separate names used here.
            with open(csv_path) as csv_file:
                # Skip the header line, then split "start,end,word" rows.
                rows = [line.split(",") for line in csv_file.read().splitlines()[1:]]
            # Match when the file's leading words equal the sentence words.
            # (The original also checked rows[0][2] == words[0] first; that
            # check is subsumed by the all(...) below.)
            if len(rows) >= len(words) and all(rows[i][2] == w for i, w in enumerate(words)):
                rec_id = filename.replace(".csv", "")
                word_aligns[rec_id] = [
                    (rows[j][2], float(rows[j][0]), float(rows[j][1]))
                    for j in range(len(words))
                ]
                # word_aligns[rec_id].append(word_al) would be needed instead
                # if one speaker could have multiple sentences.
            if len(word_aligns) >= 10 * len(sentences):
                # Cap collection at roughly 10 recordings per sentence.
                break
    return word_aligns


# TODO ADJUST
# or tbqh it is possibly fine as is.
# Well — what file format is it reading: either adjust my f0 file format
# or adjust this a little.
def get_pitches(start_time, end_time, id, path):
    """
    Return z-scored pitch values between start_time and end_time.

    Reads `<path>/<id>.f0` (7 header lines, then "time is_pitch pitch"
    rows; pitch == -1 marks unvoiced frames). Pitch is z-scored against
    the whole-sentence mean/std of voiced frames and clamped to the
    [5th, 95th] percentile band. Unvoiced frames inside the window
    contribute the (z-scored) 5th percentile as a floor value.

    Parameters
    ----------
    start_time, end_time : float
        Window in seconds, inclusive at both ends.
    id : str
        Recording id (shadows the `id` builtin; name kept for API compat).
    path : str
        Directory containing the .f0 files.

    Returns
    -------
    list of float
        One z-score per frame in the window.
    """
    f0_path = os.path.join(path, id + ".f0")
    with open(f0_path) as f0_file:
        lines = [[float(x) for x in line.split()]
                 for line in f0_file.read().splitlines()[7:]]

    # Whole-sentence statistics over voiced frames only. Hoisted: the
    # original rebuilt this list four times (mean, std, p5, p95).
    voiced = [line[2] for line in lines if line[2] != -1]
    mean = np.mean(voiced)
    std = np.std(voiced)
    fifth_percentile = np.percentile(voiced, 5)
    ninetyfifth_percentile = np.percentile(voiced, 95)

    pitches = []
    for time, is_pitch, pitch in lines:
        if start_time <= time <= end_time:
            if is_pitch:
                # Clamp outliers into [p5, p95] before z-scoring; identical
                # to the original's three-way branch.
                clamped = min(max(pitch, fifth_percentile), ninetyfifth_percentile)
                pitches.append(z_score(clamped, mean, std))
            else:
                # Unvoiced frame: use the 5th percentile as a floor value.
                pitches.append(z_score(fifth_percentile, mean, std))
    return pitches
# TODO adjust
# probably mainly for the assumption about filepath,
# but also then, comprehend it.
def get_rmse(start_time, end_time, id, path, pitch_len):
    """
    Return RMS energy values for a speech segment, resampled to pitch_len.

    Loads `<path>/<id>.wav` at 16 kHz, slices [start_time, end_time) in
    seconds, computes frame-wise RMS with librosa, then picks `pitch_len`
    evenly spaced frames so the energy track lines up with the pitch track.

    Returns
    -------
    np.ndarray of shape (pitch_len,)
    """
    wav_path = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(wav_path, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    # Pass y= explicitly: positional use of the audio argument was
    # deprecated and removed in librosa >= 0.10.
    rmse = librosa.feature.rms(y=segment)[0]
    idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
    return rmse[idx]


tEMP_start_end_word_pairs = [
    [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
    [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
]


# TODO !!!!!!!!!!!!!########
# Make it take any list of (first_word, last_word) or (word) units and do
# the thing for those units. Make it work if the sentence has two of the
# same word — PROBABLY this means the sentence must be displayed to the
# user with the words numbered, and the user inputs word indices.
def get_data(word_aligns, start_end_word_pairs):
    """
    Return per-frame feature vectors for each word-combination unit.

    For every recording in `word_aligns` and every (first_word, ...,
    last_word) unit, extracts pitch, RMSE, and spectral-centroid tracks
    over the unit's time span and zips them into per-frame
    [pitch, rmse, centroid] triples.

    Returns
    -------
    defaultdict(list)
        Flat dict keyed "word1-word2-...-<recording id>" ->
        list of [pitch, rmse, centroid] per frame.
    """
    data = defaultdict(list)
    f0_dir = "aligned-reaper/samromur-queries/f0/"
    wav_dir = "aligned-reaper/samromur-queries/wav/"
    for id, word_al in word_aligns.items():
        for sent in start_end_word_pairs:
            for word_combs in sent:
                start, end = word_combs[0], word_combs[-1]
                # Only proceed when both boundary words are aligned.
                if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
                    start_time = [al[1] for al in word_al if al[0] == start][0]
                    end_time = [al[2] for al in word_al if al[0] == end][0]
                    pitches = get_pitches(start_time, end_time, id, f0_dir)
                    rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
                    # NOTE(review): get_spectral_centroids is not defined in
                    # this file chunk — presumably defined elsewhere; verify.
                    spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
                    pitches_cpy = np.array(deepcopy(pitches))
                    rmses_cpy = np.array(deepcopy(rmses))
                    d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
                    words = "-".join(word_combs)
                    data[f"{words}-{id}"] = d
    return data


# Output example -
# {'hvaða-sjúkdómar-013823-0457777': [[-1.99, 0.0027, -0.43], ...],
#  'geta-fylgt-013823-0457777': [[x, x, x], [x, x, x]],
#  'hvaða-sjúkdómar-013726-0843679': [[], []]}
# i.e. a flat dict whose keys are unique speaker&unit tokens; each entry is
# a list of length n_timepoints, each timepoint holding dim_feats values
# (for me up to 2, not 3).

# Up to here was forming the data.
# -----------------------------------------------------
# From here down is probably clustering it.
# TODO i have no idea how necessary this will be at all.
def dtw_distance(x, y):
    """Return the normalized DTW distance between two feature sequences."""
    alignment = dtw(x, y, keep_internals=True)
    return alignment.normalizedDistance


# TODO idk but it looks p good.
# HOWEVER consider excluding the 0 self-comparisons,
# or see if there is something later that takes care of them.
# NOTE(review): `data` is not assigned at module level in this chunk — it is
# presumably the result of get_data(...) in the original notebook; verify.
dtw_dists = defaultdict(list)
for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]
    id1, id2 = d[-2], d[-1]
    for key2, value2 in data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        # Compare only tokens of the same word unit.
        if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
            dtw_dists[f"{'-'.join(words1)}"].append(
                (f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2))
            )

# dtw_dists ends up as a dict from units to lists of tuples:
# {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
#                      ('013823-0457777_013698-0441666', 0.599...), ...]}
# Note the 0 self-comparisons are currently present here.


# TODO
# a) do i need this?
def kmedoids_clustering(X, n_clusters=3):
    """
    Fit KMedoids on distance-matrix rows X.

    Parameters
    ----------
    X : array-like
        Rows of pairwise DTW distances.
    n_clusters : int, default 3
        Number of clusters (parameterized per the original TODO; default
        keeps the previous hard-coded behavior).

    Returns
    -------
    (labels, fitted KMedoids estimator)
    """
    kmedoids = KMedoids(n_clusters=n_clusters, random_state=0).fit(X)
    y_km = kmedoids.labels_
    return y_km, kmedoids


# TODO !!!!!!!!!!!! #########
# THIS IS LIKE THE MAIN THING, probably — can use some restructuring:
# e.g. produce the ids_dist2 format already earlier. Also triple-check what
# kind of distance matrix is supposed to go into X and what it currently is
# (it keeps the 0s and symmetric doubles of everything, so it may be a full
# self-organising matrix). HOWEVER the 10 should possibly be replaced with
# an n_speakers param ?!?!??
# btw since i guess clustering strictly operates on X, once durations are
# reduced to pairwise distances it no longer matters that duration and
# pitch/energy had different dimensionality. In fact: should I DTW the
# three feature streams (pitch/energy/duration) separately and cluster on a
# 3-dim distance matrix? Or can you not give it distances in multi-dim
# space — in which case I could still average the 3 distances into one x.

kmedoids_cluster_dists = defaultdict(list)
for words, datas in dtw_dists.items():
    ids_dist = {d[0]: d[1] for d in datas}  # NOTE(review): currently unused
    # Group distances by the left-hand recording id of each "idA_idB" pair.
    ids_dist2 = defaultdict(list)
    for d in datas:
        id1, id2 = d[0].split("_")
        ids_dist2[id1].append(d[1])
    # Reshape the flat distance list into rows of 10 (one row per speaker?)
    # TODO: replace the hard-coded 10 with an n_speakers param.
    X = [d[1] for d in datas]
    X = [X[i:i + 10] for i in range(0, len(X), 10)]
    X = np.array(X)
    y_km, kmedoids = kmedoids_clustering(X)
    # NOTE(review): plot_clusters is defined further down in this file, so
    # running the file top-to-bottom raises NameError here — confirm the
    # intended execution order (notebook cells).
    plot_clusters(X, y_km, words)
    c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]  # unused
    # Sort datapoints by cluster label, then map each row back to its id.
    result = zip(X, kmedoids.labels_)
    sortedR = sorted(result, key=lambda x: x[1])
    for dp in sortedR:
        arr, label = dp
        ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
        if ids is None:
            print("ID is none")
            continue
        kmedoids_cluster_dists[words].append((label, ids, arr))
# TODO probably remember to make it RETURN kmedoids_cluster_dists ..


# ###############
# TTS and misc ------------------
#
# TODO rename this get_audio_part.
# Also maybe take that tmp wav-making out of reaper and put it somewhere
# general, so everything gets a wav.
# TODO do NOT hard-code the sample rate, and CHECK that everything depending
# on this is OK with arbitrary SR.
def get_audio(start_time, end_time, id, path):
    """
    Return the audio samples of `<path>/<id>.wav` between start_time and
    end_time (seconds), loaded at 16 kHz.

    (Original docstring wrongly claimed this returns RMSE values.)
    """
    wav_path = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(wav_path, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    return segment


# See near the end of the notebook for a v nice way to grab timespans of TTS
# audio (or just the start/end timestamps to mark them) from the alignment
# JSON based on word position index — so probably really do show the user
# the sentence with each word numbered.
# TODO the speech_marks.json is NOT EXACTLY what you get from tiro,
# but idr how different, so.
# BUGFIX: speech_marks_data was dereferenced ("Alfur") before it was loaded
# from disk; the load must come first.
with open("speech_marks.json") as f:
    speech_marks_data = json.load(f)
alfur_sents = speech_marks_data["Alfur"]

# TODO there IS sth for making tts_data, but I'm probably pretty much on my
# own for that.
# NOTE(review): tts_data is not defined in this chunk — verify where it is
# built before this loop runs.

# TODO this one is v v helpful, but mind if dictionaries were adjusted
# earlier.
speaker_to_tts_dtw_dists = defaultdict(list)
for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]
    id1, id2 = d[-2], d[-1]
    for key2, value2 in tts_data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        # Compare human token vs TTS token only for the same word unit.
        if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
            speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append(
                (f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2))
            )

# TODO i think this is also gr8, but figure out how it's doing — dict format
# and stuff, working keyed by word index instead of word text ***********,
# and for 1-word or 3+-word units...
# For every human speaker token, attach that speaker's DTW distance to the
# TTS rendering of the same unit under the key "words-cluster", so each
# cluster accumulates the TTS distances of its members.
tts_dist_to_cluster = defaultdict(list)
for words1, datas1 in kmedoids_cluster_dists.items():
    for cluster, sp_id1, arr in datas1:
        for words2, datas2 in speaker_to_tts_dtw_dists.items():
            if words1 != words2:
                continue
            for ids, dist in datas2:
                sp_id2, tts_alfur = ids.split("_")
                if sp_id1 == sp_id2:
                    tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)

# Average TTS distance per (unit, cluster) key.
tts_mean_dist_to_cluster = {k: np.mean(v) for k, v in tts_dist_to_cluster.items()}

# THEN there is:
# - "Plot pitch, rmse, and spectral centroid for each word combination for
#   each speaker" — one person-token per graph, with a word division line;
#   idk if it works for >2 words. Might be good to do for TTS at least.
# - "Plot pitch values for each word combination for each speaker in each
#   cluster (with word boundaries)" — multiple speakers (one cluster) per
#   graph; good to show with TTS on top. May want to recentre around the
#   word boundary, at least if only 2 words — or just centre around the 1st
#   boundary and good luck if more.
# - The same as above, but RMSE.
# Go all the way to the bottom of the notebook to see graphs with a TTS
# voice added onto one cluster.
# PLOTTING IS GOING TO BE A WHOLE NIGHTMare — that is just too bad.


def plot_clusters(X, y, word):
    """
    Scatter-plot the first two columns of X, one color per cluster label.

    Parameters
    ----------
    X : np.ndarray
        Feature/distance rows, indexed by boolean masks on the labels.
    y : array-like
        Cluster label per row of X.
    word : str
        Plot title (the word unit being clustered).
    """
    for label in np.unique(y):
        members = y == label
        plt.scatter(X[members, 0], X[members, 1], label=label)
    plt.title(word)
    plt.legend()
    plt.show()