import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import subprocess
import matplotlib.pyplot as plt



def readwav(wav_path):
    """Load a wav file as mono float32, resampled to 16 kHz if needed."""
    wav, sr = sf.read(wav_path, dtype=np.float32)
    if wav.ndim == 2:  # downmix stereo to mono
        wav = wav.mean(1)
    if sr != 16000:  # resample so downstream processing always sees 16 kHz audio
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
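
# e.g. readwav('speech.wav') returns a 1-D float32 array at 16 kHz,
# whatever the sample rate or channel count of the file on disk.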


def normalise_transcript(xcp):
    """Lowercase the transcript and collapse repeated spaces."""
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp
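
# e.g. normalise_transcript('Tað  er  GOTT') == 'tað er gott'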



def get_pitch_tracks(wav_path):
    """Run the REAPER pitch tracker and return [time, f0] pairs for voiced frames."""
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, "-f", "/dev/stdout", "-a"],
        capture_output=True).stdout.decode()
    # the track data follows the EST header
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data[:-1]]  # the last line is other info
    # keep only frames REAPER marks as voiced (voicing flag == '1')
    return [[float(t), float(f)] for t, v, f in f0_data if v == '1']
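
# The parsing above assumes REAPER's ASCII output format: an EST-style header
# terminated by 'EST_Header_End', then one 'time voicing_flag f0' line per
# frame, roughly like this (values are illustrative):
#
#   EST_File Track
#   ...
#   EST_Header_End
#   0.005 0 -1.000000
#   0.010 1 112.345678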


# The transcript may come from a corpus alongside the wav file,
# be typed in by the user,
# or come from an earlier speech recognition pass.
def align_and_graph(wav_path, transcript, aligner_function):
    """Plot word and segment alignments over the pitch track and RMS energy."""
    plt.close('all')

    # fetch data
    speech = readwav(wav_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    
    # set up the graph shape
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]

    f0_data = get_pitch_tracks(wav_path)
    if f0_data:
        f_max = max(f0 for t, f0 in f0_data) + 50
    else:
        f_max = 400  # fallback y-limit when no voiced frames were found

    fig, axes1 = plt.subplots(figsize=(15, 5))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)
    
    # draw word boundaries
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='black')
        plt.text((s+e)/2 - len(w)*.01, f_max+15, w, fontsize=15)

    # draw phone/char boundaries
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.text((s+e)/2 - len(p)*.01, -30, p, fontsize=15, color='teal')
    
    
    # plot the pitch track
    axes1.scatter([t for t, f0 in f0_data], [f0 for t, f0 in f0_data], color='blue')

    # compute RMS energy; librosa.load resamples to its default 22,050 Hz,
    # and the returned sr keeps the frame times below consistent with that
    w, sr = librosa.load(wav_path)
    fr_l = 2048  # librosa's default frame length
    h_l = 512  # librosa's default hop length
    rmse = librosa.feature.rms(y=w, frame_length=fr_l, hop_length=h_l)[0]


    # show RMS energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l*i)/sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")

    return fig
    


# sample Faroese test words: uppboðssøla ('auction'), bussleiðini
# ('the bus route'), viðmerkingar ('comments'), upprunaligur ('original')
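
# Example usage, as a sketch: `ctc_align` below is a hypothetical aligner;
# any aligner_function works if it takes (waveform, normalised transcript)
# and returns (word_align, seg_align), each a list of
# (label, start_seconds, end_seconds) tuples.
#
#   from my_aligner import ctc_align  # hypothetical module
#   fig = align_and_graph('speech.wav', 'uppboðssøla bussleiðini', ctc_align)
#   fig.savefig('speech_alignment.png')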