import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import subprocess
import matplotlib.pyplot as plt



def readwav(wav_path):
    """Read a wav file as mono float32 at 16 kHz (resampling if needed)."""
    wav, sr = sf.read(wav_path, dtype=np.float32)
    if wav.ndim == 2:
        wav = wav.mean(axis=1)  # mix stereo down to mono
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav


def normalise_transcript(xcp):
    """Lowercase a transcript and collapse runs of spaces to single spaces."""
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp



def get_pitch_tracks(wav_path):
    """Run the REAPER pitch tracker and return [time, f0] pairs for voiced frames."""
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, "-f", "/dev/stdout", "-a"],
        capture_output=True).stdout
    # REAPER's ascii output is an EST track: a header ending in
    # 'EST_Header_End', then one 'time voiced f0' line per frame.
    f0_data = f0_data.decode().split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data[:-1]]  # the last line is other info
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']  # voiced frames only
    return f0_data
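

# If the REAPER binary is not built or available, a similar list of
# [time, f0] pairs for voiced frames could be produced with librosa's
# pyin pitch tracker instead. A minimal sketch, assuming librosa >= 0.8;
# get_pitch_tracks_pyin and the 60-400 Hz range are not part of the
# original code:
def get_pitch_tracks_pyin(wav_path):
    y, sr = librosa.load(wav_path, sr=16000)
    # f0 is NaN wherever pyin judges the frame unvoiced
    f0, voiced, _ = librosa.pyin(y, fmin=60, fmax=400, sr=sr)
    times = librosa.times_like(f0, sr=sr)
    return [[float(t), float(f)] for t, f, v in zip(times, f0, voiced) if v]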


# The transcript could come from a corpus alongside the wav file,
# be typed in by the user,
# or come from an earlier speech recognition pass.
def align_and_graph(wav_path, transcript, aligner_function):
    """Align a transcript against a recording and plot the word and
    phone/char boundaries together with pitch and RMS energy."""

    plt.close('all')

    # fetch the audio and its word- and segment-level alignments
    speech = readwav(wav_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    
    # set up the graph shape
    rec_start = w_align[0][1]   # start of the first aligned word
    rec_end = w_align[-1][2]    # end of the last aligned word

    f0_data = get_pitch_tracks(wav_path)
    if f0_data:
        f_max = max(f0 for t, f0 in f0_data) + 50
    else:
        f_max = 400  # fallback y-limit when no voiced frames are found


    fig, axes1 = plt.subplots(figsize=(15, 5))
    axes1.set_xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)

    # draw word boundaries as solid lines, with the word above the plot
    for w, s, e in w_align:
        axes1.vlines(s, 0, f_max, linewidth=0.5, color='black')
        axes1.vlines(e, 0, f_max, linewidth=0.5, color='black')
        axes1.text((s + e) / 2, f_max + 15, w, fontsize=15, ha='center')

    # draw phone/char boundaries as dashed lines, labelled below the plot
    for p, s, e in seg_align:
        axes1.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        axes1.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        axes1.text((s + e) / 2, -30, p, fontsize=15, color='teal', ha='center')

    # plot the pitch track for voiced frames
    axes1.scatter([t for t, f0 in f0_data], [f0 for t, f0 in f0_data], color="blue")

    
    
    # compute RMS energy with librosa's default frame/hop sizes
    w, sr = librosa.load(wav_path)
    fr_l = 2048  # librosa default frame_length
    h_l = 512    # librosa default hop_length
    rmse = librosa.feature.rms(y=w, frame_length=fr_l, hop_length=h_l)[0]

    # show RMS energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    
    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")
    # plt.show()  # uncomment to display the figure interactively

    return fig
    


# Sample Faroese test words: uppboðssøla ('auction sale'), bussleiðini
# ('the bus route'), viðmerkingar ('remarks'), upprunaligur ('original').
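

# A minimal usage sketch, not part of the original code. `my_aligner` is a
# hypothetical stand-in for any aligner_function: it must accept (16 kHz mono
# audio, normalised transcript) and return (word_alignments, seg_alignments),
# each a list of (label, start_seconds, end_seconds) tuples. 'sample.wav' is
# an illustrative path, and get_pitch_tracks assumes the REAPER binary is built.
if __name__ == '__main__':
    def my_aligner(speech, transcript):
        # A real forced aligner would go here; this dummy just spreads the
        # words evenly over the first two seconds, and each word's characters
        # evenly over that word's span.
        words = transcript.split()
        step = 2.0 / len(words)
        w_align = [(w, i * step, (i + 1) * step) for i, w in enumerate(words)]
        seg_align = [(c, s + j * (e - s) / len(w), s + (j + 1) * (e - s) / len(w))
                     for w, s, e in w_align for j, c in enumerate(w)]
        return w_align, seg_align

    fig = align_and_graph('sample.wav', 'uppboðssøla bussleiðini', my_aligner)
    fig.savefig('alignment.png')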