# prosalign / graph.py
import subprocess

import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import matplotlib.pyplot as plt

import ctcalign


def readwav(wav_path):
    # load audio as float32, downmix to mono, and resample to 16 kHz
    wav, sr = sf.read(wav_path, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)  # downmix stereo to mono
    if sr != 16000:
        # resample to the 16 kHz rate the aligner works at
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
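
# e.g. a 2.0 s stereo file recorded at 44.1 kHz comes back from readwav as a
# mono float32 array of int(2.0 * 16000) = 32000 samples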


def normalise_transcript(xcp):
    # lowercase and collapse runs of spaces to single spaces
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp
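
# e.g. normalise_transcript("Góðan  DAGIN") -> "góðan dagin" (hypothetical input)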


def get_pitch_tracks(wav_path):
    # run REAPER for pitch tracking and capture its ASCII output from stdout
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, "-a"],
        capture_output=True, text=True,
    ).stdout
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data]
    # keep only voiced frames (voicing flag '1') as [time, f0] pairs
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']
    return f0_data
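
# Note (hedged): the parsing above assumes REAPER's ASCII output format, i.e.
# an EST header terminated by 'EST_Header_End' followed by one
# 'time voiced_flag f0' line per analysis frame, e.g. '0.105 1 121.3' for a
# voiced frame at 0.105 s with f0 = 121.3 Hz.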


# transcript could be from a corpus with the wav file,
# input by the user,
# or from a previous speech recognition process
def align_and_graph(wav_path, transcript, lang_aligner):
    # fetch data
    speech = readwav(wav_path)
    # CTC-align the normalised transcript against the audio; both returned
    # alignments are lists of (label, start_time, end_time) triples
    w_align, seg_align = ctcalign.align(speech, normalise_transcript(transcript), lang_aligner)
    # set up the graph shape
    rec_start = w_align[0][1]  # start time of the first aligned word
    rec_end = w_align[-1][2]   # end time of the last aligned word
    f0_data = get_pitch_tracks(wav_path)
    if f0_data:
        # leave 50 Hz of headroom above the highest observed f0
        f_max = max([f0 for t, f0 in f0_data]) + 50
    else:
        f_max = 400  # fallback y-axis ceiling when no voiced frames were found
fig, axes1 = plt.subplots(figsize=(15,5))
plt.xlim([rec_start, rec_end])
axes1.set_ylim([0.0, f_max])
axes1.get_xaxis().set_visible(False)
    # draw word boundaries, with each word label centred above the plot
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='black')
        plt.text((s + e) / 2 - (len(w) * .01), f_max + 15, w, fontsize=15)
    # draw phone/char boundaries, labelled below the x-axis
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.text((s + e) / 2 - (len(p) * .01), -30, p, fontsize=15, color='teal')
    # scatter the voiced pitch measurements over the alignment
    f0c = "blue"
    axes1.scatter([t for t, f0 in f0_data], [f0 for t, f0 in f0_data], color=f0c)
    # compute RMS energy with librosa's default analysis windows
    w, sr = librosa.load(wav_path)
    fr_l = 2048  # librosa default frame_length
    h_l = 512    # librosa default hop_length
    rmse = librosa.feature.rms(y=w, frame_length=fr_l, hop_length=h_l)
    rmse = rmse[0]
    # show rms energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    # convert RMS frame indices to seconds: frame i starts at i * hop_length / sr
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)
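    # e.g. librosa.load resamples to its default 22050 Hz here, so RMS frame
    # 100 is plotted at 100 * 512 / 22050 ≈ 2.32 s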
    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")
    return fig
# sample Faroese words: uppboðssøla ("auction sale"), bussleiðini ("the bus
# route"), viðmerkingar ("remarks"), upprunaligur ("original")
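
# Minimal usage sketch (hedged): assumes an aligner object for the target
# language has already been built via ctcalign by the app importing this
# module; the wav path and transcript below are placeholders.
#
#   fig = align_and_graph("recording.wav", "uppboðssøla bussleiðini", aligner)
#   fig.savefig("recording_prosody.png", bbox_inches="tight")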