import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import subprocess
import matplotlib.pyplot as plt


def readwav(wav_path):
    # read a wav file as mono float32, resampled to 16 kHz
    wav, sr = sf.read(wav_path, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav


def normalise_transcript(xcp):
    # lowercase the transcript and collapse runs of spaces
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp


def get_pitch_tracks(wav_path):
    # run REAPER on the wav file and return [time, f0] pairs for voiced frames
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, "-f", "/dev/stdout", "-a"],
        capture_output=True).stdout
    f0_data = f0_data.decode()
    #print('FILE PATH:', wav_path)
    #print('DECODE-PITCH:', f0_data)
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    #print(f0_data)
    f0_data = [l.split(' ') for l in f0_data[:-1]]  # the last line is other info
    # keep only frames whose voicing flag is '1'
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']
    return f0_data


# the transcript could come from a corpus with the wav file,
# be input by the user,
# or come from a previous speech recognition pass
def align_and_graph(wav_path, transcript, aligner_function):
    plt.close('all')

    # fetch data
    speech = readwav(wav_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    # set up the graph shape
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]
    f0_data = get_pitch_tracks(wav_path)
    if f0_data:
        f_max = max(f0 for t, f0 in f0_data) + 50
    else:
        f_max = 400

    fig, axes1 = plt.subplots(figsize=(15, 5))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)

    # draw word boundaries, with each word label centred above its interval
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='black')
        plt.text((s + e) / 2 - (len(w) * .01), f_max + 15, w, fontsize=15)

    # draw phone/char boundaries, with labels below the x-axis
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue',
                   linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue',
                   linestyle=(0, (10, 4)))
        plt.text((s + e) / 2 - (len(p) * .01), -30, p, fontsize=15, color='teal')

    # plot the pitch track
    f0c = "blue"
    axes1.scatter([t for t, f0 in f0_data], [f0 for t, f0 in f0_data], color=f0c)

    # compute RMS energy at librosa's native sample rate
    w, sr = librosa.load(wav_path)
    fr_l = 2048  # librosa default frame length
    h_l = 512    # librosa default hop length
    rmse = librosa.feature.rms(y=w, frame_length=fr_l, hop_length=h_l)
    rmse = rmse[0]

    # show rms energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")
    #plt.title(f'Recording {file_id} (L1 {language_dict[file_id]})', fontsize=15)
    #plt.show()

    return fig


# uppboðssøla bussleiðini viðmerkingar upprunaligur
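
# --- Usage sketch (not part of the original script) ---
# align_and_graph expects aligner_function to return two lists of
# (label, start_seconds, end_seconds) tuples: word-level and
# phone/char-level alignments. The dummy aligner below just spreads
# words and characters evenly across the recording, for illustration
# only; a real caller would pass an actual forced aligner. The file
# name 'sample.wav' is a placeholder, and REAPER must be built at
# REAPER/build/reaper for get_pitch_tracks to work.
if __name__ == '__main__':
    def dummy_aligner(speech, transcript):
        # evenly spaced word intervals over the 16 kHz waveform
        dur = speech.shape[0] / 16000
        words = transcript.split(' ')
        step = dur / len(words)
        w_align = [(w, i * step, (i + 1) * step) for i, w in enumerate(words)]
        # treat each character as a segment, evenly spaced within its word
        seg_align = []
        for w, s, e in w_align:
            c_step = (e - s) / len(w)
            seg_align += [(c, s + j * c_step, s + (j + 1) * c_step)
                          for j, c in enumerate(w)]
        return w_align, seg_align

    fig = align_and_graph('sample.wav', 'uppboðssøla bussleiðini', dummy_aligner)
    fig.savefig('alignment.png')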