File size: 3,090 Bytes
459923a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424bfb6
459923a
 
 
424bfb6
 
459923a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import subprocess
import matplotlib.pyplot as plt



def readwav(wav_path):
    """Read a wav file as a mono float32 signal resampled to 16 kHz.

    Args:
        wav_path: path to the audio file readable by soundfile.

    Returns:
        1-D numpy array of float32 samples at 16 kHz.
    """
    audio, rate = sf.read(wav_path, dtype=np.float32)
    # Stereo input: average the two channels down to mono.
    if audio.ndim == 2:
        audio = audio.mean(1)
    # Resample anything that is not already 16 kHz.
    if rate != 16000:
        target_len = int(audio.shape[0] / rate * 16000)
        audio = signal.resample(audio, target_len)
    return audio


def normalise_transcript(xcp):
    """Lowercase the transcript and collapse runs of spaces to one space.

    Leading/trailing single spaces are kept; only repeated spaces shrink.
    """
    collapsed = xcp.lower()
    # Repeatedly fold double spaces until none remain, so any run of
    # N spaces ends up as a single space.
    while '  ' in collapsed:
        collapsed = collapsed.replace('  ', ' ')
    return collapsed



def get_pitch_tracks(wav_path):
  """Run the REAPER pitch tracker on a wav file and return its F0 track.

  Args:
      wav_path: path to a wav file REAPER can read.

  Returns:
      List of [time_seconds, f0_hz] pairs for voiced frames only.

  Raises:
      subprocess.CalledProcessError: if the reaper binary exits non-zero.
  """
  # capture_output + text are required: without them .stdout is None
  # (and would be bytes), so the string parsing below could never work.
  result = subprocess.run(
      ["REAPER/build/reaper", "-i", wav_path, '-a'],
      capture_output=True, text=True, check=True,
  )
  # REAPER's ASCII output is "time voiced_flag f0" per line, after an
  # EST header terminated by the literal line 'EST_Header_End'.
  f0_lines = result.stdout.split('EST_Header_End\n')[1].splitlines()
  f0_data = []
  for line in f0_lines:
      t, voiced, f0 = line.split(' ')
      # voiced flag '1' marks frames with a valid pitch estimate.
      if voiced == '1':
          f0_data.append([float(t), float(f0)])
  return f0_data






# transcript could be from a corpus with the wav file,
# input by the user,
# or from a previous speech recognition process
# transcript could be from a corpus with the wav file,
# input by the user,
# or from a previous speech recognition process
def align_and_graph(wav_path, transcript, aligner_function):
    """Align a transcript to a recording and plot pitch, energy, and boundaries.

    Args:
        wav_path: path to the wav file to analyse.
        transcript: text of the recording (corpus, user, or ASR output).
        aligner_function: callable taking (speech_samples, normalised_text)
            and returning (word_alignment, segment_alignment); each is
            assumed to be a list of (label, start_sec, end_sec) —
            TODO confirm against the aligner implementation.

    Returns:
        The matplotlib Figure containing the annotated plot.
    """

    # fetch data
    speech = readwav(wav_path)
    w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))

    
    # set up the graph shape: x-axis spans the aligned speech only
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]
    
    f0_data = get_pitch_tracks(wav_path)
    if f0_data:
        # headroom above the highest observed F0 for the word labels
        f_max = max([f0 for t,f0 in f0_data]) + 50
    else:
        # no voiced frames detected: fall back to a fixed y-range
        f_max = 400


    fig, axes1 = plt.subplots(figsize=(15,5))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)
    
    # draw word boundaries (solid black lines, word label centred above)
    for w,s,e in w_align:
        plt.vlines(s,0,f_max,linewidth=0.5,color='black')
        plt.vlines(e,0,f_max,linewidth=0.5,color='black')
        # small horizontal nudge proportional to label length to centre text
        plt.text( (s+e)/2 - (len(w)*.01), f_max+15, w, fontsize=15)
        
    # draw phone/char boundaries (dashed blue lines, label below the axis)
    for p,s,e in seg_align:
        plt.vlines(s,0,f_max,linewidth=0.3,color='cadetblue',linestyle=(0,(10,4)))
        plt.vlines(e,0,f_max,linewidth=0.3,color='cadetblue',linestyle=(0,(10,4)))
        plt.text( (s+e)/2 - (len(p)*.01), -30, p, fontsize=15, color='teal')
    
    
    # scatter the voiced-frame pitch track on the left (F0) axis
    f0c = "blue"
    axes1.scatter([t for t,f0 in f0_data], [f0 for t,f0 in f0_data], color=f0c)

    
    
    # NOTE(review): librosa.load resamples to its own default rate (22050 Hz),
    # not the 16 kHz used by readwav above — the returned sr is used for the
    # x-positions below, so the curve still lines up in seconds.
    w, sr = librosa.load(wav_path)
    fr_l = 2048 # librosa default
    h_l = 512 # default
    rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)
    rmse = rmse[0]


    # show rms energy on a second y-axis sharing the same time axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    # convert frame indices to seconds via the hop length
    rms_xval = [(h_l*i)/sr for i in range(len(rmse))]
    axes2.plot(rms_xval,rmse,color='peachpuff',linewidth=3.5)

    
    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14,color="coral")
    #plt.title(f'Recording {file_id} (L1 {language_dict[file_id]})', fontsize=15)
    #plt.show()
    
    return fig
    
    #plt.close('all')
    
    #plt.close('all')


# uppboðssøla bussleiðini viðmerkingar upprunaligur