### Dependencies and inputs

In [None]:
!pip -q install pydub
from google.colab import output
from base64 import b64decode, b64encode
from io import BytesIO
import numpy as np
from pydub import AudioSegment
from IPython.display import HTML, display
import torch
import matplotlib.pyplot as plt
import moviepy.editor as mpe
from matplotlib.animation import FuncAnimation, FFMpegWriter
import matplotlib
# Select the 'Agg' backend: figures in this notebook are only written to a
# video file (see make_animation) and are never shown in a window.
matplotlib.use('Agg')

# Restrict PyTorch to a single intra-op CPU thread.
torch.set_num_threads(1)

# Fetch the Silero VAD model through torch.hub.  The second element of the
# returned pair is discarded here.  force_reload=True requests a fresh
# download rather than reusing a cached copy of the repository.
model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',
 model='silero_vad',
 force_reload=True)

def int2float(sound):
    """Convert integer PCM samples to float32 scaled by 1/32768.

    Args:
        sound: NumPy array of integer samples (16-bit PCM range expected).

    Returns:
        A new float32 array with every sample multiplied by ``1/32768`` and
        all length-1 axes removed.  An empty input yields an empty output.
    """
    # Cast before doing any arithmetic.  Taking np.abs() of the raw integer
    # data wraps for the most negative int16 value (-32768), and reducing an
    # empty array with max() raises, so no peak check is done on the integers.
    scaled = sound.astype('float32')
    # Scaling is unconditional: silence (all zeros) stays all zeros anyway.
    scaled *= 1 / 32768
    return scaled.squeeze()

# Markup that record() passes to display(HTML(...)).
# NOTE(review): the literal is empty in this copy of the file, yet record()
# evaluates a JavaScript expression named `data` right after displaying it.
# Presumably the recorder markup/script that defines `data` belongs here;
# confirm against the original notebook.
AUDIO_HTML = """

"""

def record(sec=10):
    """Capture audio in the browser and return it as a 16 kHz mono tensor.

    Displays ``AUDIO_HTML``, reads the JavaScript value ``data`` from the
    front end, decodes its base64 payload, and saves the decoded audio to
    ``test.mp3`` before downmixing and resampling.

    Args:
        sec: Accepted for interface compatibility; not used in this body.

    Returns:
        A 1-D ``torch`` tensor of float samples produced by ``int2float``.
    """
    display(HTML(AUDIO_HTML))
    data_uri = output.eval_js("data")

    # The second comma-separated field of the value holds the base64 payload.
    encoded = data_uri.split(',')[1]
    segment = AudioSegment.from_file(BytesIO(b64decode(encoded)))

    # The MP3 is exported before downmixing/resampling, so it keeps the
    # channel layout and frame rate of the decoded recording.
    segment.export('test.mp3', format='mp3')

    mono_16k = segment.set_channels(1).set_frame_rate(16000)
    samples = np.array(mono_16k.get_array_of_samples())
    return torch.tensor(int2float(samples))

def make_animation(probs, audio_duration, interval=40):
    """Render speech probabilities as a video with a sweeping time cursor.

    Plots ``probs`` as a filled area against time, animates a vertical line
    moving across it, and writes the result to ``animation.mp4``.

    Args:
        probs: Sequence of per-window speech probabilities.  Window ``k`` is
            placed at ``k / 16000 * 512`` seconds on the x axis.
        audio_duration: Audio length in seconds; sets the x-axis range and
            the number of frames written.
        interval: Milliseconds between frames.  The video is encoded at
            ``1000 / interval`` frames per second.
    """
    fig = plt.figure(figsize=(16, 9))
    ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))
    cursor, = ax.plot([], [], lw=2, color='#990000')
    times = [k / 16000 * 512 for k in range(len(probs))]
    plt.xlabel('Time, seconds', fontsize=16)
    plt.ylabel('Speech Probability', fontsize=16)

    # The probability area never changes, so draw it once here rather than
    # inside init(), which the animation machinery may call more than once.
    ax.fill_between(times, probs, color='#064273')

    # The cursor spans the full height of the axes.
    cursor_y = [0, 1.02]

    def init():
        cursor.set_data([], [])
        return cursor,

    def animate(i):
        # NOTE(review): the 0.04 s offset equals one frame at the default
        # interval of 40 ms and is not rescaled for other intervals; confirm
        # whether that is intended.
        t = i * interval / 1000 - 0.04
        # Line2D.set_data expects sequences for both coordinates; passing a
        # bare scalar for x is rejected by newer Matplotlib versions.
        cursor.set_data([t, t], cursor_y)
        return cursor,

    # save_count is used as a frame count, so it must be an integer; the
    # plain division yields a float.
    n_frames = int(audio_duration / (interval / 1000))
    anim = FuncAnimation(fig, animate, init_func=init, interval=interval,
                         save_count=n_frames)

    writervideo = FFMpegWriter(fps=1000 / interval)
    try:
        anim.save(r"animation.mp4", writer=writervideo)
    finally:
        # Release figure resources even when encoding fails.
        plt.close('all')

def combine_audio(vidname, audname, outname, fps=25):
    """Mux an audio file onto a video file and write the result.

    Args:
        vidname: Path of the input video.
        audname: Path of the audio file to attach to the video.
        outname: Path of the output video file.
        fps: Frame rate passed to ``write_videofile``.
    """
    video = mpe.VideoFileClip(vidname, verbose=False)
    try:
        audio = mpe.AudioFileClip(audname)
        try:
            merged = video.set_audio(audio)
            merged.write_videofile(outname, fps=fps, verbose=False)
        finally:
            # Close the audio reader whether or not writing succeeded.
            audio.close()
    finally:
        # Likewise release the video reader.
        video.close()

def record_make_animation():
    """Record audio, score it with the VAD model, and show an annotated video.

    Steps:
      1. ``record()`` captures audio and returns a 16 kHz tensor.
      2. The tensor is scored in consecutive 512-sample windows with
         ``model``; a trailing partial window is skipped.
      3. ``make_animation`` renders the probabilities to ``animation.mp4``.
      4. ``combine_audio`` muxes ``test.mp3`` onto it as ``merged.mp4``.
      5. The merged video is embedded in the notebook output as a data URL.
    """
    tensor = record()

    print('Calculating probabilities...')
    window_size_samples = 512
    speech_probs = []
    # Stop at the last start index that still leaves a full window.
    last_start = len(tensor) - window_size_samples
    for start in range(0, last_start + 1, window_size_samples):
        chunk = tensor[start:start + window_size_samples]
        speech_probs.append(model(chunk, 16000).item())
    # Reset the model's state once the whole recording has been scored.
    model.reset_states()

    print('Making animation...')
    make_animation(speech_probs, len(tensor) / 16000)

    print('Merging your voice with animation...')
    combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')
    print('Done!')

    # Read through a context manager so the file handle is always closed.
    with open('merged.mp4', 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    # The template needs a %s placeholder for the data URL; formatting a
    # string that has none raises TypeError.
    display(HTML("""
    <video width=800 controls>
        <source src="%s" type="video/mp4">
    </video>
    """ % data_url))

## Record example

In [None]:
# Run the full demo: record audio, compute speech probabilities, render the
# animation, merge it with the recording, and display the resulting video.
record_make_animation()