import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
from collections import defaultdict
from dtw import dtw
from sklearn_extra.cluster import KMedoids
from copy import deepcopy
import os
import librosa
import json
# Based on the original implementation at
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
# by Magnús Freyr Morthens 2023, supported by Rannís NSN.

# Inputs this will need:
# - the whole sentence text as (index, word) pairs
# - the indices of the units the user wants
# - the human meta db of all human recordings
# - the TTS dir, plus the human wav + alignment + f0 dirs
# - the list of TTS voices
# - an actual wav file for each human recording, probably
# - params like: use f0, use rmse, (use dur), [.....]
# .. check what I wrote elsewhere about this.
# (see the hedged sketch just below for one way these inputs might be bundled)
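# A hedged sketch of how those inputs might be bundled into a single object;
# every name here is a placeholder for illustration, not a decided interface.
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class PipelineConfig:
    sentence: List[Tuple[int, str]]       # (index, word) pairs for the whole sentence
    unit_indices: List[Tuple[int, int]]   # (first_word_idx, last_word_idx) per requested unit
    human_meta_db: str                    # path to the meta db of human recordings
    human_wav_dir: str
    human_align_dir: str
    human_f0_dir: str
    tts_dir: str
    tts_voices: List[str] = field(default_factory=list)
    use_f0: bool = True
    use_rmse: bool = True
    use_dur: bool = False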
def z_score(x, mean, std):
    return (x - mean) / std
# TODO ADJUST
# new input will be one Meta db
# output should probably be the same, e.g.
# {'013823-0457777': [('hvaða', 0.89, 1.35),
#                     ('sjúkdómar', 1.35, 2.17),
#                     ('geta', 2.17, 2.4),
#                     ('fylgt', 2.4, 2.83),
#                     ('óbeinum', 2.83, 3.29),
#                     ('reykingum', 3.29, 3.9)],
#  '014226-0508808': [('hvaða', 1.03, 1.45),
#                     ('sjúkdómar', 1.45, 2.28),
#                     ('geta', 2.41, 2.7),
#                     ('fylgt', 2.7, 3.09),
#                     ('óbeinum', 3.09, 3.74),
#                     ('reykingum', 3.74, 4.42)],
#  '013726-0843679': [('hvaða', 0.87, 1.14),
#                     ('sjúkdómar', 1.14, 1.75),
#                     ('geta', 1.75, 1.96),
#                     ('fylgt', 1.96, 2.27),
#                     ('óbeinum', 2.27, 2.73),
#                     ('reykingum', 2.73, 3.27)] }
def get_word_aligns(sentences, directory):
    """
    Returns a dictionary of word alignments for the given sentences.
    """
    word_aligns = defaultdict(list)
    for sentence in sentences:
        print(sentence)
        slist = sentence.split(" ")
        for filename in os.listdir(directory):
            fpath = os.path.join(directory, filename)
            with open(fpath) as f:
                lines = f.read().splitlines()[1:]
            lines = [line.split(",") for line in lines]  # columns: start, end, word
            # keep this recording if its alignment starts with exactly the sentence's words
            if len(lines) >= len(slist) and all(lines[i][2] == w for i, w in enumerate(slist)):
                id = filename.replace(".csv", "")
                word_al = [(lines[j][2], float(lines[j][0]), float(lines[j][1])) for j in range(len(slist))]
                # word_aligns[id].append(word_al) # If one speaker has multiple sentences
                word_aligns[id] = word_al
            if len(word_aligns) >= 10 * len(sentences): break
    return word_aligns
# TODO ADJUST
# or, to be honest, it is possibly fine as is.
# The question is what file format it is reading:
# either adjust my f0 file format or adjust this a little.
def get_pitches(start_time, end_time, id, path):
    """
    Returns an array of z-scored pitch values for a given segment of speech.
    """
    f = os.path.join(path, id + ".f0")
    with open(f) as f:
        lines = f.read().splitlines()[7:]
    lines = [[float(x) for x in line.split()] for line in lines]  # split lines into floats
    pitches = []
    # stats over all voiced frames (pitch == -1 marks unvoiced) in the whole sentence
    voiced = [line[2] for line in lines if line[2] != -1]
    mean = np.mean(voiced)
    std = np.std(voiced)
    fifth_percentile = np.percentile(voiced, 5)
    ninetyfifth_percentile = np.percentile(voiced, 95)
    for line in lines:
        time, is_pitch, pitch = line
        if start_time <= time <= end_time:
            if is_pitch:
                # clamp outliers to the 5th/95th percentile before z-scoring
                if fifth_percentile <= pitch <= ninetyfifth_percentile:
                    pitches.append(z_score(pitch, mean, std))
                elif pitch < fifth_percentile:
                    pitches.append(z_score(fifth_percentile, mean, std))
                else:
                    pitches.append(z_score(ninetyfifth_percentile, mean, std))
            else:
                # unvoiced frames get the floor value
                pitches.append(z_score(fifth_percentile, mean, std))
    return pitches
# TODO adjust
# probably mainly for the assumption about the file path,
# but also: actually understand what it is doing.
def get_rmse(start_time, end_time, id, path, pitch_len):
    """
    Returns an array of RMSE values for a given segment of speech,
    resampled to the same number of frames as the pitch track.
    """
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    rmse = librosa.feature.rms(y=segment)
    rmse = rmse[0]
    # resample the RMSE track to pitch_len frames so the features line up
    idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
    return rmse[idx]
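# get_data below also calls get_spectral_centroids, which is not defined in this
# file. A minimal sketch of what it presumably looks like, mirroring get_rmse and
# assuming librosa.feature.spectral_centroid; treat this as a placeholder, not the
# original implementation.
def get_spectral_centroids(start_time, end_time, id, path, pitch_len):
    """
    Returns spectral-centroid values for a given segment of speech,
    resampled to pitch_len frames (sketch).
    """
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    centroids = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
    # resample to pitch_len frames, same as get_rmse
    idx = np.round(np.linspace(0, len(centroids) - 1, pitch_len)).astype(int)
    return centroids[idx]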
temp_start_end_word_pairs = [
    [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
    [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
]
# TODO !!!!!!!!!!!!!########
# Make it take any list of (first_word, last_word) or (word) units
# and do the extraction for those units.
# Make it work if the sentence contains the same word twice.
# That PROBABLY means the sentence has to be displayed to the user
# with the words numbered, and the user has to input word indices
# (see the index-based sketch after get_data below).
def get_data(word_aligns, start_end_word_pairs):
    """
    Returns a dictionary of pitch, RMSE, and spectral-centroid values
    for the given sentence/word combinations.
    """
    data = defaultdict(list)
    f0_dir = "aligned-reaper/samromur-queries/f0/"
    wav_dir = "aligned-reaper/samromur-queries/wav/"
    for id, word_al in word_aligns.items():
        for sent in start_end_word_pairs:
            for word_combs in sent:
                start, end = word_combs[0], word_combs[-1]
                # only use this recording if it contains both the first and last word of the unit
                if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
                    start_time = [al[1] for al in word_al if al[0] == start][0]
                    end_time = [al[2] for al in word_al if al[0] == end][0]
                    pitches = get_pitches(start_time, end_time, id, f0_dir)
                    rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
                    spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
                    pitches_cpy = np.array(deepcopy(pitches))
                    rmses_cpy = np.array(deepcopy(rmses))
                    # one feature vector [pitch, rmse, centroid] per frame
                    d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
                    words = "-".join(word_combs)
                    data[f"{words}-{id}"] = d
    return data
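# Re the TODO above about duplicate words: selecting by word index instead of
# word text sidesteps the ambiguity. A hypothetical helper (not wired into
# get_data yet), assuming a unit is given as (first_word_idx, last_word_idx)
# into the aligned word list:
def get_unit_times_by_index(word_al, first_idx, last_idx):
    """
    word_al: list of (word, start, end) tuples in sentence order.
    Returns (start_time, end_time) for the unit spanning first_idx..last_idx.
    """
    start_time = word_al[first_idx][1]
    end_time = word_al[last_idx][2]
    return start_time, end_time

# e.g. get_unit_times_by_index(word_aligns['013823-0457777'], 0, 1)
# -> the times for 'hvaða sjúkdómar' even if either word also occurs later on.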
# output -
# {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
#                                     [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
#  'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
#  'hvaða-sjúkdómar-013726-0843679': [[],[]] }
# i.e. a flat dict whose keys are unique speaker-and-unit tokens; each entry is a
# list of length n_timepoints, with one feature vector per timepoint (for me up to 2 feats, not 3).
# up to here was forming the data
# -----------------------------------------------------
# from here down is probably clustering it
# TODO I have no idea how necessary this will be at all
def dtw_distance(x, y):
    """
    Returns the normalized DTW distance between two feature sequences.
    """
    alignment = dtw(x, y, keep_internals=True)
    return alignment.normalizedDistance
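# A quick sanity check of dtw_distance on two tiny toy sequences (illustrative
# only; the real inputs are the per-frame [pitch, rmse, centroid] vectors built
# in get_data, and the values below are made up):
_toy_a = [[0.0, 0.1], [0.5, 0.2], [1.0, 0.3]]
_toy_b = [[0.1, 0.1], [0.6, 0.2], [0.9, 0.25], [1.1, 0.3]]
_toy_dist = dtw_distance(_toy_a, _toy_b)  # small non-negative float; 0.0 only for identical sequences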
# TODO not sure yet, but this looks pretty good.
# HOWEVER, consider excluding the 0.0 self-comparisons,
# or check whether something later takes care of them
# (a hedged filter sketch follows the format example below).
# DTW distance between every pair of speakers for the same unit
dtw_dists = defaultdict(list)
for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]
    id1, id2 = d[-2], d[-1]
    for key2, value2 in data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        if words1 == words2:
            dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
# dtw_dists ends up as a dict from units to lists of (speaker-pair, distance) tuples, e.g.
# {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
#                      ('013823-0457777_013698-0441666', 0.5999433281203399),
#                      ('013823-0457777_014675-0563760', 0.4695447105594414),
#                      ('014226-0508808_013823-0457777', 0.44080874425223393),
#                      ('014226-0508808_014226-0508808', 0.0),
#                      ('014226-0508808_013726-0843679', 0.5599404672667414),
#                      ('014226-0508808_013681-0442313', 0.6871330752342419)] }
# note that the 0.0 self-comparisons are currently still present here.
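# If those self-comparisons turn out to be unwanted, one hedged option is to
# filter them into a separate dict (not used downstream yet); this assumes the
# "idA-idB_idC-idD" key format built above:
dtw_dists_no_self = {
    words: [(ids, dist) for ids, dist in pairs if ids.split("_")[0] != ids.split("_")[1]]
    for words, pairs in dtw_dists.items()
}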
# TODO
# a) do I need this?
# b) make n_clusters a param with default 3
def kmedoids_clustering(X):
    kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X)
    y_km = kmedoids.labels_
    return y_km, kmedoids
# TODO !!!!!!!!!!!! #########
# THIS IS PROBABLY THE MAIN THING.
# It could use some restructuring; e.g. something earlier could already
# produce the ids_dist2 format.
# Also triple-check what kind of distance matrix is supposed to go into X,
# and what it currently is. I think it may be right, and self-organising,
# which would explain why it keeps the 0s and has symmetric duplicates of everything.
# HOWEVER, the hard-coded 10 should possibly be replaced with an n_speakers param
# (a precomputed-distance sketch follows the clustering loop below).
# Btw, since clustering strictly operates on X, once any duration feature is
# reduced to pairwise distances it no longer matters that duration and
# pitch/energy had different dimensionality.
# In fact, should I run DTW on the 3 feats (pitch/energy/dur) separately and
# cluster on a 3-dim distance matrix? Or can you not give it distances in a
# multi-dimensional space, because distance doesn't work that way - in which
# case I could still average the 3 distances into one, although..
kmedoids_cluster_dists = defaultdict(list)
for words, datas in dtw_dists.items():
    ids_dist = {d[0]: d[1] for d in datas}
    # collect, per speaker, that speaker's distances to everyone else (in order)
    ids_dist2 = defaultdict(list)
    for d in datas:
        id1, id2 = d[0].split("_")
        ids_dist2[id1].append(d[1])
    # reshape the flat distance list into one row of 10 distances per speaker
    # (10 = number of speakers; see the TODO above about an n_speakers param)
    X = [d[1] for d in datas]
    X = [X[i:i+10] for i in range(0, len(X), 10)]
    X = np.array(X)
    y_km, kmedoids = kmedoids_clustering(X)
    plot_clusters(X, y_km, words)  # plot_clusters is defined at the bottom of this file
    c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
    # sort the rows by cluster label and map each row back to its speaker id
    result = zip(X, kmedoids.labels_)
    sortedR = sorted(result, key=lambda x: x[1])
    for dp in sortedR:
        arr, label = dp
        ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
        if ids is None:
            print("ID is none")
            continue
        kmedoids_cluster_dists[words].append((label, ids, arr))
# TODO probably remember to make this RETURN kmedoids_cluster_dists ..
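# Re the distance-matrix question above: if X really is a full pairwise DTW
# distance matrix, sklearn_extra's KMedoids can be told to treat it as
# precomputed rather than as raw feature rows. A hedged sketch of that variant
# (an assumption about the intent, not the current code path):
def kmedoids_on_precomputed(dist_matrix, n_clusters=3):
    """dist_matrix: square symmetric (n_speakers x n_speakers) array of DTW distances."""
    km = KMedoids(n_clusters=n_clusters, metric="precomputed", random_state=0).fit(dist_matrix)
    return km.labels_, km

# e.g. labels, km = kmedoids_on_precomputed(X)  # X as built in the loop above (10 x 10 per unit)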
# ###############
# TTS and misc ------------------
#
# TODO rename this get_audio_part
# Also maybe take the tmp wav-making out of reaper and put it somewhere general,
# so everything gets a wav.
# TODO do NOT specify the SR,
# and CHECK that everything that depends on this is ok with an arbitrary SR.
def get_audio(start_time, end_time, id, path):
    """
    Returns the audio segment between start_time and end_time for a given recording.
    """
    f = os.path.join(path, id + ".wav")
    audio, sr = librosa.load(f, sr=16000)
    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
    return segment
# See near the end of the notebook for a very nice way to grab timespans of TTS
# audio (or just the start/end timestamps to mark them) from the alignment json,
# based on word position index -
# so probably really do show the user the sentence with each word numbered.
# TODO the speech_marks.json is NOT EXACTLY what you get from Tiro,
# but I don't remember how it differs, so.
alfur_sents = speech_marks_data["Alfur"] | |
with open("speech_marks.json") as f: | |
speech_marks_data = json.load(f) | |
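# A hedged sketch of pulling (start, end) times for a word-index span out of one
# of those entries. The field names ("time", "value") follow the common
# speech-marks convention (millisecond offsets per word), but the actual
# Tiro/Alfur format may differ, so treat this purely as a placeholder:
def get_tts_word_span(marks, first_idx, last_idx):
    """marks: list of per-word dicts in sentence order, with 'time' in milliseconds."""
    start_s = marks[first_idx]["time"] / 1000.0
    # if there is a following word, use its onset as the end; otherwise leave it open
    end_s = marks[last_idx + 1]["time"] / 1000.0 if last_idx + 1 < len(marks) else None
    return start_s, end_s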
# TODO there IS something for making tts_data,
# but I'm probably pretty much on my own for that.
# TODO this next part is very helpful,
# but mind any dictionaries I adjusted earlier.
# DTW distance from each human speaker to the TTS rendering of the same unit
speaker_to_tts_dtw_dists = defaultdict(list)
for key1, value1 in data.items():
    d = key1.split("-")
    words1 = d[:-2]
    id1, id2 = d[-2], d[-1]
    for key2, value2 in tts_data.items():
        d = key2.split("-")
        words2 = d[:-2]
        id3, id4 = d[-2], d[-1]
        if words1 == words2:
            speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
# TODO I think this is also great,
# but figure out how it works,
# because of the dict format and so on;
# make it work keyed by word index instead of word text, ***********
# and for 1-word or 3+-word units...
# for each cluster of human speakers, collect the TTS-to-speaker distances
tts_dist_to_cluster = defaultdict(list)
for words1, datas1 in kmedoids_cluster_dists.items():
    for d1 in datas1:
        cluster, sp_id1, arr = d1
        for words2, datas2 in speaker_to_tts_dtw_dists.items():
            for d2 in datas2:
                ids, dist = d2
                sp_id2, tts_alfur = ids.split("_")
                if sp_id1 == sp_id2 and words1 == words2:
                    tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)

# mean TTS distance per (unit, cluster)
tts_mean_dist_to_cluster = {
    key: np.mean(value) for key, value in tts_dist_to_cluster.items()
}
# THEN there is -
# "Plot pitch, rmse, and spectral centroid for each word combination for each speaker"
# - this is one person-token per graph and has a word-division line; not sure it works for >2 words.
#   It might be good to do this for TTS at least.
# "Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)"
# - multiple speakers (one cluster) per graph; this will be good to show, with TTS on top
#   (a hedged per-cluster overlay sketch is at the very end of this file).
#   I may want to recentre it around the word boundary, at least if there are only 2 words;
#   well, I could just pick: it will be centred around the 1st word boundary, and good luck if there are more.
# - the same as above, but RMSE.
# Go all the way to the bottom (of the notebook) to see graphs with a TTS line added onto one cluster.
# PLOTTING IS GOING TO BE A WHOLE NIGHTMARE
# that is just too bad
def plot_clusters(X, y, word):
    u_labels = np.unique(y)
    # plot the results
    for i in u_labels:
        plt.scatter(X[y == i, 0], X[y == i, 1], label=i)
    plt.title(word)
    plt.legend()
    plt.show()
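# Re the plotting notes above: a hedged sketch of the "one cluster per graph,
# TTS on top" pitch plot. It assumes a dict of per-speaker pitch contours for one
# unit plus one TTS contour (all placeholder names); the real wiring from
# get_data / kmedoids_cluster_dists still has to be decided.
def plot_cluster_pitch_overlay(speaker_pitches, tts_pitch, title):
    """
    speaker_pitches: {speaker_id: list of z-scored pitch values} for one cluster.
    tts_pitch: list of z-scored pitch values for the TTS voice, same unit.
    """
    for spk, contour in speaker_pitches.items():
        plt.plot(contour, color="grey", alpha=0.5, label=spk)
    plt.plot(tts_pitch, color="red", linewidth=2, label="TTS")
    plt.title(title)
    plt.xlabel("frame")
    plt.ylabel("pitch (z-score)")
    plt.legend()
    plt.show()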