File size: 11,772 Bytes
937e691
 
 
 
 
 
 
 
 
796185d
 
937e691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32e8892
 
 
796185d
 
32e8892
937e691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3eb37e3
 
937e691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0dbf99
 
 
 
 
 
 
937e691
d0dbf99
937e691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3eb37e3
937e691
 
 
 
 
 
 
 
 
 
 
 
 
 
c5c0498
937e691
 
 
 
c5c0498
 
 
937e691
c5c0498
 
 
 
 
 
 
937e691
 
c5c0498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
937e691
 
c5c0498
 
 
937e691
 
 
c5c0498
937e691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7768e3f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
import streamlit as st
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass
import string
import IPython

st.image('abby_cadabby_.jpeg')

# Part A: Report the library versions and the device used for inference
st.write(torch.__version__)
st.write(torchaudio.__version__)
device = 'cpu'
st.write(device)

# Part B: Load the audio file (loaded once and reused for inference below)
SPEECH_FILE = 'abby_cadabby.wav'
waveform, sample_rate = torchaudio.load(SPEECH_FILE)
st.write(SPEECH_FILE)

# Part C: torchaudio.pipelines | bundle.get_model | bundle.get_labels()
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = bundle.get_model().to(device)
labels = bundle.get_labels()

# The bundle publishes the sample rate it expects (bundle.sample_rate), and
# display_segment later converts sample offsets to seconds with that same
# rate, so bring the audio to it before running the model.
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
    sample_rate = bundle.sample_rate
waveform = waveform.to(device)

# Inference mode: no autograd bookkeeping is needed for a forward pass
with torch.inference_mode():
    # Pass the waveform through the model
    emissions, _ = model(waveform)
    emissions = torch.log_softmax(emissions, dim=-1)

    # Keep the emissions of the first row of the batch only
    # NOTE(review): a multi-channel file is fed as one batch row per channel,
    # so only the first channel is aligned - confirm that is intended.
    emission = emissions[0].cpu().detach()

# Print the labels
st.write('Labels are: ', labels)
st.write('Length of labels are: ', len(labels))

# Part D: Frame-wise class probability plot
def plot():
    """Draw the module-level ``emission`` matrix as a heat map.

    Returns:
        The matplotlib figure holding the image and its colour bar.
    """
    figure, axes = plt.subplots()
    # Transposed so that the first axis of ``emission`` runs horizontally.
    heatmap = axes.imshow(emission.T)
    axes.set(
        title="Frame-wise class probability",
        xlabel="Time",
        ylabel="Labels",
    )
    figure.colorbar(heatmap, ax=axes, shrink=0.6, location="bottom")
    figure.tight_layout()
    return figure


st.pyplot(plot())

# Part E: Remove punctuation add | after each word. Also, convert into all UPPERCASE
def remove_punctuation(input_string):
    """Normalise free text into an upper-case, ``|``-delimited transcript.

    Every character in ``string.punctuation`` is removed, the text is split
    on whitespace, each word is upper-cased, and the words are joined with a
    single ``|`` between them.

    Args:
        input_string: Text to normalise.

    Returns:
        The normalised transcript, e.g. ``"Oh hi!"`` -> ``"OH|HI"``. An input
        with no words yields ``""``.
    """
    # One translation table removes all punctuation in a single pass.
    translator = str.maketrans('', '', string.punctuation)

    cleaned = (word.translate(translator).upper() for word in input_string.split())

    # Words made only of punctuation become empty after translation; dropping
    # them keeps the delimiter count at exactly one between real words.
    return '|'.join(word for word in cleaned if word)

# Test the function 
transcript = " Oh hi! It's me, Abby Cadabby. Do you want to watch me practice my magic? I am going to turn this"

clean_transcript = remove_punctuation(transcript)
st.write(clean_transcript)

# Part F: Populate Trellis
# Align against the cleaned transcript itself rather than a hand-copied
# literal, so the aligned text cannot drift away from `transcript`.
updated_clean_UPPERCASE_transcript = clean_transcript

# Map every label character to its index in the model's label set.
dictionary = {c: i for i, c in enumerate(labels)}

# One token id per transcript character; a character missing from `labels`
# raises KeyError here.
tokens = [dictionary[c] for c in updated_clean_UPPERCASE_transcript]
st.write(list(zip(updated_clean_UPPERCASE_transcript, tokens)))

def get_trellis(emission, tokens, blank_id=0):
    """Build the forced-alignment trellis for ``tokens`` over ``emission``.

    ``trellis[t, j]`` accumulates the best score of being at token ``j`` at
    frame ``t``. A step either stays on the same token (scored with the
    ``blank_id`` column) or advances to the next token (scored with that
    token's column).

    Args:
        emission: 2-D tensor indexed ``[frame, label]``. Scores are added, so
            log-probabilities are expected (that is what this file passes).
        tokens: Sequence of label indices for the transcript; must be
            non-empty.
        blank_id: Column of ``emission`` used to score staying in place.

    Returns:
        Tensor of shape ``(num_frames, num_tokens)``.
    """
    num_frame = emission.size(0)
    num_tokens = len(tokens)

    trellis = torch.zeros((num_frame, num_tokens))
    # Staying on the first token from frame 0 onwards.
    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
    # No token other than the first can be occupied at frame 0.
    trellis[0, 1:] = -float("inf")

    if num_tokens == 1:
        # With a single token there is nothing to advance to. The slice below
        # would start at index 0 (-num_tokens + 1 == 0) and overwrite the
        # whole column with +inf, so return the cumulative column as is.
        return trellis

    # Mark the first token's column over the last (num_tokens - 1) frames.
    trellis[-num_tokens + 1:, 0] = float("inf")

    for t in range(num_frame - 1):
        trellis[t + 1, 1:] = torch.maximum(
            # Score for staying at the same token
            trellis[t, 1:] + emission[t, blank_id],
            # Score for changing to the next token
            trellis[t, :-1] + emission[t, tokens[1:]],
        )
    return trellis

# Build the trellis for the transcript tokens (default blank_id) and show the
# raw tensor in the app.
trellis = get_trellis(emission, tokens)
st.write('Trellis =', trellis)

# Part G: Labels and Time -Inf | +Inf 
def n_inf_to_p_inf():
    """Plot the module-level ``trellis`` with "- Inf" / "+ Inf" labels.

    Returns:
        The matplotlib figure containing the annotated heat map.
    """
    dim0 = trellis.size(0)
    dim1 = trellis.size(1)

    figure, axes = plt.subplots()
    heatmap = axes.imshow(trellis.T, origin="lower")

    # Label anchors are derived from the trellis dimensions so they scale
    # with the size of the plot.
    minus_inf_anchor = (dim1 / 5, dim1 / 1.5)
    plus_inf_anchor = (dim0 - dim1 / 2.4, dim1 / 3)
    axes.annotate("- Inf", minus_inf_anchor)
    axes.annotate("+ Inf", plus_inf_anchor)

    figure.colorbar(heatmap, ax=axes, shrink=0.25, location="bottom")
    figure.tight_layout()
    return figure


st.pyplot(n_inf_to_p_inf())

# Part H: Backtrack Trellis Emissions Tensor and Tokens
@dataclass
class Point:
    """One step of the alignment path built by ``backtrack``."""
    token_index: int  # second-axis index into the trellis (position in the token list)
    time_index: int  # first-axis index into the trellis / emission
    score: float  # exp() of the emission entry chosen for this step

def backtrack(trellis, emission, tokens, blank_id=0):
    """Trace the alignment path backwards through ``trellis``.

    Starts at the last row / last column of ``trellis`` and walks towards
    row 0. At every step it compares the score of having stayed on the same
    token with the score of having advanced from the previous token, and
    follows the larger one. Once column 0 is reached the remaining rows are
    filled with column-0 points.

    Args:
        trellis: 2-D tensor indexed ``[time, token]``.
        emission: 2-D tensor indexed ``[time, label]``; entries are
            exponentiated to produce each point's ``score``.
        tokens: Label index for each trellis column.
        blank_id: Column of ``emission`` used to score a "stay" step.

    Returns:
        List of ``Point`` ordered by increasing ``time_index``.

    Raises:
        AssertionError: If row 0 is reached before column 0.
    """
    # Start from the final row and final column of the trellis.
    t, j = trellis.size(0) - 1, trellis.size(1) - 1

    path = [Point(j, t, emission[t, blank_id].exp().item())]
    while j > 0:
        # Should not happen but just in case
        assert t > 0

        # 1. Figure out if the current position was stay or change
        # Frame-wise score of stay vs change (both read at the previous row;
        # tokens[j] is read before j is possibly decremented below)
        p_stay = emission[t - 1, blank_id]
        p_change = emission[t - 1, tokens[j]]

        # Context-aware score for stay vs change
        stayed = trellis[t - 1, j] + p_stay
        changed = trellis[t - 1, j - 1] + p_change

        # Update position: always one row back, one column back on a change
        t -= 1
        if changed > stayed:
            j -= 1

        # Store the path with frame-wise probability
        prob = (p_change if changed > stayed else p_stay).exp().item()
        path.append(Point(j, t, prob))

    # Now j == 0, which means the first token has been reached.
    # Fill up the rest for the sake of visualization
    while t > 0:
        prob = emission[t - 1, blank_id].exp().item()
        path.append(Point(j, t - 1, prob))
        t -= 1
    # Points were collected from the end backwards; return them in time order.
    return path[::-1]

# Recover the alignment path and list every point in the app
# (two st.write calls per point).
path = backtrack(trellis, emission, tokens)
for p in path:
    st.write('Token index, Time index and Score:')
    st.write(p)

# Part I: Trellis with Path Visualization
def plot_trellis_with_path(trellis, path):
    """Plot ``trellis`` with the cells visited by ``path`` marked.

    Args:
        trellis: 2-D tensor indexed ``[time, token]``; it is cloned, not
            modified.
        path: Iterable of points exposing ``time_index`` and ``token_index``.

    Returns:
        The matplotlib figure.
    """
    # To plot trellis with path, we take advantage of 'nan' value: the visited
    # cells are overwritten with NaN on a copy so they stand out in the image.
    trellis_with_path = trellis.clone()
    for p in path:
        trellis_with_path[p.time_index, p.token_index] = float("nan")

    fig, ax = plt.subplots()
    ax.imshow(trellis_with_path.T, origin="lower")
    ax.set_title("The path found by backtracking")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    fig.tight_layout()
    return fig


# The figure only exists inside the function, so it must be built by calling
# the function; there is no module-level `fig` to hand to Streamlit.
st.pyplot(plot_trellis_with_path(trellis, path))

# Part J: Merge Repeats | Segments
# Merge the labels
@dataclass
class Segment:
    """A labelled half-open span ``[start, end)`` with a score."""

    label: str
    start: int
    end: int
    score: float

    @property
    def length(self):
        """Width of the span, ``end - start``."""
        return self.end - self.start

    def __repr__(self):
        # Tab-separated label, two-decimal score, then the right-aligned span.
        span = f"[{self.start:5d}, {self.end:5d})"
        return f"{self.label}\t({self.score:4.2f}) : {span}"
    
def merge_repeats(path):
    """Collapse runs of path points that share a token into segments.

    Consecutive points with the same ``token_index`` become one ``Segment``.
    Its label is the character at that index of the module-level
    ``updated_clean_UPPERCASE_transcript``, its span runs from the first
    point's ``time_index`` to one past the last point's, and its score is
    the mean of the points' scores.

    Args:
        path: List of points exposing ``token_index``, ``time_index`` and
            ``score``.

    Returns:
        List of ``Segment`` in path order; empty for an empty path.
    """
    merged = []
    total = len(path)
    run_start = 0
    while run_start < total:
        token = path[run_start].token_index

        # Advance to the first point that belongs to a different token.
        run_stop = run_start
        while run_stop < total and path[run_stop].token_index == token:
            run_stop += 1

        run = path[run_start:run_stop]
        mean_score = sum(point.score for point in run) / len(run)
        merged.append(
            Segment(
                updated_clean_UPPERCASE_transcript[token],
                run[0].time_index,
                run[-1].time_index + 1,
                mean_score,
            )
        )
        run_start = run_stop
    return merged

# Merge the per-point path into one segment per run of equal token_index and
# list each segment in the app.
segments = merge_repeats(path)
for seg in segments:
    st.write('Segments:')
    st.write(seg)

# Part K: Trellis with Segments Visualization
def plot_alignments(trellis, segments, word_segments, waveform=np.random.randn(1024), sample_rate=44100):
    """Plot segment alignments over the trellis and over a spectrogram.

    Args:
        trellis: 2-D tensor indexed ``[time, token]``; cloned, not modified.
        segments: Per-token segments exposing ``label``, ``start``, ``end``
            and ``score``. Segments labelled ``"|"`` are not drawn.
        word_segments: Word-level segments exposing ``start``, ``end`` and
            ``score``; may be empty.
        waveform: Audio samples. A multi-dimensional input is reduced to its
            first row. The default is random noise generated once, when the
            function is defined.
        sample_rate: Sample rate of ``waveform``.

    Returns:
        The matplotlib figure with two stacked axes.
    """
    # len(waveform) must be the sample count for `ratio` below; for a
    # (channels, samples) tensor it would be the channel count instead.
    if getattr(waveform, "ndim", 1) > 1:
        waveform = waveform[0]

    trellis_with_path = trellis.clone()
    for i, seg in enumerate(segments):
        if seg.label != "|":
            trellis_with_path[seg.start : seg.end, i] = float("nan")

    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(20, 18))

    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
    ax1.set_facecolor("lightgray")
    ax1.set_xticks([])
    ax1.set_yticks([])

    for word in word_segments:
        ax1.axvspan(word.start - 0.5, word.end - 0.5, edgecolor="white", facecolor="none")

    for i, seg in enumerate(segments):
        if seg.label != "|":
            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

    # The original waveform
    NFFT = 1024  # Adjust NFFT to be less than the length of the waveform
    # Seconds of audio per trellis row
    ratio = len(waveform) / sample_rate / trellis.size(0)

    # Add a small offset to the waveform to avoid log of zero or negative numbers
    waveform = waveform + 1e-10

    ax2.specgram(waveform, Fs=sample_rate, NFFT=NFFT)
    for word in word_segments:
        x0 = ratio * word.start
        x1 = ratio * word.end
        ax2.axvspan(x0, x1, facecolor="none", edgecolor="white", hatch="/")
        ax2.annotate(f"{word.score:.2f}", (x0, sample_rate * 0.51), annotation_clip=False)

    for seg in segments:
        if seg.label != "|":
            ax2.annotate(seg.label, (seg.start * ratio, sample_rate * 0.55), annotation_clip=False)
    ax2.set_xlabel("time [second]")
    ax2.set_yticks([])
    fig.tight_layout()
    return fig


# No module-level `fig` exists, so the figure is built by calling the function.
# Word-level segments are only computed further down (Part L), so this
# segment-level view passes an empty word list.
st.pyplot(plot_alignments(trellis, segments, [], waveform[0], sample_rate))

# Part L: Merge words | Segments
# Merge words
def merge_words(segments, separator="|"):
    """Group runs of segments between separators into word segments.

    Segments whose ``label`` equals ``separator`` end the current word and
    are dropped. Each word's label is the concatenation of its segments'
    labels, its span runs from the first segment's ``start`` to the last
    segment's ``end``, and its score is the length-weighted mean of the
    segment scores.

    Args:
        segments: Sequence of segments exposing ``label``, ``start``,
            ``end``, ``score`` and ``length``.
        separator: Label that delimits words.

    Returns:
        List of word-level ``Segment`` objects.
    """
    merged = []
    total = len(segments)
    word_start = 0
    cursor = 0
    while word_start < total:
        at_boundary = cursor >= total or segments[cursor].label == separator
        if not at_boundary:
            cursor += 1
            continue

        # Adjacent separators produce an empty run, which is skipped.
        if word_start != cursor:
            chunk = segments[word_start:cursor]
            text = "".join([piece.label for piece in chunk])
            weighted_total = sum(piece.score * piece.length for piece in chunk)
            total_length = sum(piece.length for piece in chunk)
            merged.append(
                Segment(
                    text,
                    segments[word_start].start,
                    segments[cursor - 1].end,
                    weighted_total / total_length,
                )
            )
        word_start = cursor + 1
        cursor = word_start
    return merged


# Merge the per-token segments into word-level segments (default "|"
# separator) and list each word in the app.
word_segments = merge_words(segments)
for word in word_segments:
    st.write('Word Segments:')
    st.write(word)

# Part M: Alignment Visualizations
def plot_alignments(trellis, segments, word_segments, waveform=np.random.randn(1024), sample_rate=44100):
    """Plot word and token alignments over the trellis and a spectrogram.

    Args:
        trellis: 2-D tensor indexed ``[time, token]``; cloned, not modified.
        segments: Per-token segments exposing ``label``, ``start``, ``end``
            and ``score``. Segments labelled ``"|"`` are not drawn.
        word_segments: Word-level segments exposing ``start``, ``end`` and
            ``score``.
        waveform: Audio samples. A multi-dimensional input is reduced to its
            first row. The default is random noise generated once, when the
            function is defined.
        sample_rate: Sample rate of ``waveform``.

    Returns:
        The matplotlib figure with two stacked axes.
    """
    # len(waveform) must be the sample count for `ratio` below; for a
    # (channels, samples) tensor it would be the channel count instead.
    if getattr(waveform, "ndim", 1) > 1:
        waveform = waveform[0]

    trellis_with_path = trellis.clone()
    for i, seg in enumerate(segments):
        if seg.label != "|":
            trellis_with_path[seg.start : seg.end, i] = float("nan")

    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(20, 18))

    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
    ax1.set_facecolor("lightgray")
    ax1.set_xticks([])
    ax1.set_yticks([])

    for word in word_segments:
        ax1.axvspan(word.start - 0.5, word.end - 0.5, edgecolor="white", facecolor="none")

    for i, seg in enumerate(segments):
        if seg.label != "|":
            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

    # The original waveform
    NFFT = 1024
    # Seconds of audio per trellis row
    ratio = len(waveform) / sample_rate / trellis.size(0)

    ax2.specgram(waveform, Fs=sample_rate, NFFT=NFFT)
    for word in word_segments:
        x0 = ratio * word.start
        x1 = ratio * word.end
        ax2.axvspan(x0, x1, facecolor="none", edgecolor="white", hatch="/")
        ax2.annotate(f"{word.score:.2f}", (x0, sample_rate * 0.51), annotation_clip=False)

    for seg in segments:
        if seg.label != "|":
            ax2.annotate(seg.label, (seg.start * ratio, sample_rate * 0.55), annotation_clip=False)
    ax2.set_xlabel("time [second]")
    ax2.set_yticks([])
    fig.tight_layout()
    return fig


# Build the figure once and pass every positional argument: leaving out
# `segments` would shift the word segments, waveform and sample rate into the
# wrong parameters. The first channel is passed as the 1-D signal.
st.pyplot(plot_alignments(trellis, segments, word_segments, waveform[0], sample_rate))

# Part N: Display Segment
def display_segment(i):
    """Print the time span of word ``i`` and return its audio clip.

    Reads the module-level ``waveform``, ``trellis``, ``word_segments`` and
    ``bundle``.

    Args:
        i: Index into ``word_segments``.

    Returns:
        An ``IPython.display.Audio`` object for the slice of ``waveform``
        covering the word, at ``bundle.sample_rate``.
    """
    # Number of waveform samples per trellis row.
    samples_per_row = waveform.size(1) / trellis.size(0)
    chosen = word_segments[i]
    first_sample = int(samples_per_row * chosen.start)
    last_sample = int(samples_per_row * chosen.end)

    begin_sec = first_sample / bundle.sample_rate
    end_sec = last_sample / bundle.sample_rate
    print(f"{chosen.label} ({chosen.score:.2f}): {begin_sec:.3f} - {end_sec:.3f} sec")

    clip = waveform[:, first_sample:last_sample]
    return IPython.display.Audio(clip.numpy(), rate=bundle.sample_rate)

# Part O: Audio generation for each segment
st.write('Abby Cadabby Transcript:')
# Show the transcript text under its heading rather than the literal word.
st.write(transcript)
# st.audio renders an audio player directly from the file path.
st.audio(SPEECH_FILE)

st.image('Abby_and_Prince.jpg')