import streamlit as st
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass
import string
st.image('abby_cadabby_.jpeg')
# Part A: Import torch and torchaudio
st.write(torch.__version__)
st.write(torchaudio.__version__)
device = 'cpu'
st.write(device)
# Part B: Load the audio file
SPEECH_FILE = 'abby_cadabby.wav'
waveform, sample_rate = torchaudio.load(SPEECH_FILE)
st.write(SPEECH_FILE)
# Part C: torchaudio.pipelines | bundle.get_model | bundle.get_labels()
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = bundle.get_model().to(device)
labels = bundle.get_labels()
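# Note (assumption): this wav2vec2 bundle expects audio at bundle.sample_rate (16 kHz).
# A minimal guard, in case abby_cadabby.wav was recorded at a different rate:
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
    sample_rate = bundle.sample_rate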
# Inference mode
with torch.inference_mode():
    waveform = waveform.to(device)
    # Pass the waveform through the model
    emissions, _ = model(waveform)
    emissions = torch.log_softmax(emissions, dim=-1)

# Get the emissions for the first example
emission = emissions[0].cpu().detach()
# Print the labels
st.write('Labels are: ', labels)
st.write('Number of labels: ', len(labels))
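# Illustrative check: emission is a (num_frames, num_labels) matrix of
# log-probabilities, one row per ~20 ms frame of audio.
st.write('Emission shape (frames, labels):', tuple(emission.shape))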
# Part D: Frame-wise class probability plot
def plot():
    fig, ax = plt.subplots()
    img = ax.imshow(emission.T)
    ax.set_title("Frame-wise class probability")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    fig.colorbar(img, ax=ax, shrink=0.6, location="bottom")
    fig.tight_layout()
    return fig
st.pyplot(plot())
# Part E: Remove punctuation, wrap each word with '|', and convert to UPPERCASE
def remove_punctuation(input_string):
    # Make a translator object to remove all punctuation
    translator = str.maketrans('', '', string.punctuation)
    # Split the input string into words
    words = input_string.split()
    # Remove punctuation from each word, convert to uppercase, and wrap with '|'
    clean_words = ['|' + word.translate(translator).upper() + '|' for word in words]
    clean_transcript = ''.join(clean_words).strip('|')
    return clean_transcript
# Test the function
transcript = " Oh hi! It's me, Abby Cadabby. Do you want to watch me practice my magic? I am going to turn this"
clean_transcript = remove_punctuation(transcript)
st.write(clean_transcript)
# Part F: Populate Trellis
updated_clean_UPPERCASE_transcript = "OH||HI||ITS||ME||ABBY||CADABBY||DO||YOU||WANT||TO||WATCH||ME||PRACTICE||MY||MAGIC||I||AM||GOING||TO||TURN||THIS"
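# Sanity check (illustrative): the hard-coded transcript should equal the
# output of remove_punctuation above.
assert clean_transcript == updated_clean_UPPERCASE_transcript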
dictionary = {c: i for i, c in enumerate(labels)}
tokens = [dictionary[c] for c in updated_clean_UPPERCASE_transcript]
st.write(list(zip(updated_clean_UPPERCASE_transcript, tokens)))
def get_trellis(emission, tokens, blank_id=0):
    num_frame = emission.size(0)
    num_tokens = len(tokens)

    trellis = torch.zeros((num_frame, num_tokens))
    trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0)
    trellis[0, 1:] = -float("inf")
    trellis[-num_tokens + 1:, 0] = float("inf")

    for t in range(num_frame - 1):
        trellis[t + 1, 1:] = torch.maximum(
            # Score for staying at the same token
            trellis[t, 1:] + emission[t, blank_id],
            # Score for changing to the next token
            trellis[t, :-1] + emission[t, tokens[1:]],
        )
    return trellis
trellis = get_trellis(emission, tokens)
st.write('Trellis =', trellis)
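# Illustrative check: the trellis has one row per emission frame and one
# column per transcript token.
st.write('Trellis shape (frames, tokens):', tuple(trellis.shape))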
# Part G: Labels and Time -Inf | +Inf
def n_inf_to_p_inf():
    fig, ax = plt.subplots()
    img = ax.imshow(trellis.T, origin="lower")
    ax.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5))
    # Shift the "+ Inf" annotation to the right by increasing the denominator
    ax.annotate("+ Inf", (trellis.size(0) - trellis.size(1) / 2.4, trellis.size(1) / 3))
    fig.colorbar(img, ax=ax, shrink=0.25, location="bottom")
    fig.tight_layout()
    return fig
st.pyplot(n_inf_to_p_inf())
# Part H: Backtrack Trellis Emissions Tensor and Tokens
@dataclass
class Point:
    token_index: int
    time_index: int
    score: float
def backtrack(trellis, emission, tokens, blank_id=0):
    t, j = trellis.size(0) - 1, trellis.size(1) - 1

    path = [Point(j, t, emission[t, blank_id].exp().item())]
    while j > 0:
        # Should not happen but just in case
        assert t > 0

        # 1. Figure out if the current position was stay or change
        # Frame-wise score of stay vs change
        p_stay = emission[t - 1, blank_id]
        p_change = emission[t - 1, tokens[j]]

        # Context-aware score for stay vs change
        stayed = trellis[t - 1, j] + p_stay
        changed = trellis[t - 1, j - 1] + p_change

        # Update position
        t -= 1
        if changed > stayed:
            j -= 1

        # Store the path with frame-wise probability
        prob = (p_change if changed > stayed else p_stay).exp().item()
        path.append(Point(j, t, prob))

    # Now j == 0, which means it reached the SOS.
    # Fill up the rest for the sake of visualization.
    while t > 0:
        prob = emission[t - 1, blank_id].exp().item()
        path.append(Point(j, t - 1, prob))
        t -= 1

    return path[::-1]
path = backtrack(trellis, emission, tokens)
st.write('Token index, Time index and Score:')
for p in path:
    st.write(p)
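# Illustrative aggregate (not in the original tutorial): the mean frame-wise
# probability along the path gives a rough overall alignment confidence.
st.write('Mean path score:', sum(p.score for p in path) / len(path))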
# Part I: Trellis with Path Visualization
def plot_trellis_with_path(trellis, path):
    # To plot trellis with path, we take advantage of 'nan' value
    trellis_with_path = trellis.clone()
    for _, p in enumerate(path):
        trellis_with_path[p.time_index, p.token_index] = float("nan")
    fig, ax = plt.subplots()
    ax.imshow(trellis_with_path.T, origin="lower")
    ax.set_title("The path found by backtracking")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    fig.tight_layout()
    return fig

st.pyplot(plot_trellis_with_path(trellis, path))
# Part J: Merge Repeats | Segments
# Merge the labels
@dataclass
class Segment:
    label: str
    start: int
    end: int
    score: float

    def __repr__(self):
        return f"{self.label}\t({self.score:4.2f}) : [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        return self.end - self.start
def merge_repeats(path):
    i1, i2 = 0, 0
    segments = []
    while i1 < len(path):
        while i2 < len(path) and path[i1].token_index == path[i2].token_index:
            i2 += 1
        score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
        segments.append(
            Segment(
                updated_clean_UPPERCASE_transcript[path[i1].token_index],
                path[i1].time_index,
                path[i2 - 1].time_index + 1,
                score,
            )
        )
        i1 = i2
    return segments
segments = merge_repeats(path)
st.write('Segments:')
for seg in segments:
    st.write(seg)
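# Illustrative: '|' segments mark word boundaries, so the remaining segments
# count the aligned characters.
st.write('Aligned characters:', sum(1 for s in segments if s.label != '|'))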
# Part K: Trellis with Segments Visualization
def plot_alignments(trellis, segments, word_segments, waveform=np.random.randn(1024), sample_rate=44100):
    trellis_with_path = trellis.clone()
    for i, seg in enumerate(segments):
        if seg.label != "|":
            trellis_with_path[seg.start : seg.end, i] = float("nan")

    fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(20, 18))

    ax1.imshow(trellis_with_path.T, origin="lower", aspect="auto")
    ax1.set_facecolor("lightgray")
    ax1.set_xticks([])
    ax1.set_yticks([])

    for word in word_segments:
        ax1.axvspan(word.start - 0.5, word.end - 0.5, edgecolor="white", facecolor="none")

    for i, seg in enumerate(segments):
        if seg.label != "|":
            ax1.annotate(seg.label, (seg.start, i - 0.7), size="small")
            ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), size="small")

    # The original waveform
    NFFT = 1024  # NFFT must be less than the length of the waveform
    ratio = len(waveform) / sample_rate / trellis.size(0)
    # Add a small offset to the waveform to avoid log of zero or negative numbers
    waveform = waveform + 1e-10
    ax2.specgram(waveform, Fs=sample_rate, NFFT=NFFT)
    for word in word_segments:
        x0 = ratio * word.start
        x1 = ratio * word.end
        ax2.axvspan(x0, x1, facecolor="none", edgecolor="white", hatch="/")
        ax2.annotate(f"{word.score:.2f}", (x0, sample_rate * 0.51), annotation_clip=False)

    for seg in segments:
        if seg.label != "|":
            ax2.annotate(seg.label, (seg.start * ratio, sample_rate * 0.55), annotation_clip=False)
    ax2.set_xlabel("time [second]")
    ax2.set_yticks([])
    fig.tight_layout()
    return fig
# word_segments is not available yet; the figure is rendered in Part M below.
# Part L: Merge words | Segments
# Merge words
def merge_words(segments, separator="|"):
    words = []
    i1, i2 = 0, 0
    while i1 < len(segments):
        if i2 >= len(segments) or segments[i2].label == separator:
            if i1 != i2:
                segs = segments[i1:i2]
                word = "".join([seg.label for seg in segs])
                score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs)
                words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score))
            i1 = i2 + 1
            i2 = i1
        else:
            i2 += 1
    return words
word_segments = merge_words(segments)
st.write('Word Segments:')
for word in word_segments:
    st.write(word)
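# Illustrative: convert word-segment frame indices to seconds, using the same
# waveform/trellis ratio as display_segment in Part N below.
frames_to_samples = waveform.size(1) / trellis.size(0)
for w in word_segments:
    st.write(f"{w.label}: {w.start * frames_to_samples / bundle.sample_rate:.3f}s - "
             f"{w.end * frames_to_samples / bundle.sample_rate:.3f}s ({w.score:.2f})")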
# Part M: Alignment Visualizations
# plot_alignments is defined in Part K. Pass the mono channel as a 1-D numpy
# array so len(waveform) and ax2.specgram receive a sample count rather than a
# channel count.
st.pyplot(plot_alignments(trellis, segments, word_segments, waveform[0].numpy(), sample_rate))
# Part N: Display Segment
def display_segment(i):
    ratio = waveform.size(1) / trellis.size(0)
    word = word_segments[i]
    x0 = int(ratio * word.start)
    x1 = int(ratio * word.end)
    st.write(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
    segment = waveform[:, x0:x1]
    # st.audio renders the clip in the Streamlit app; IPython.display.Audio
    # only renders inside notebooks.
    st.audio(segment[0].numpy(), sample_rate=bundle.sample_rate)
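# Illustrative usage, assuming at least one word was aligned: play the clip
# for the first word segment.
if word_segments:
    display_segment(0)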
# Part O: Audio generation for each segment
st.write('Abby Cadabby Transcript:')
st.write(clean_transcript)
st.audio(SPEECH_FILE)
st.image('Abby_and_Prince.jpg')