import base64
import io
import os
import re
from io import BytesIO

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from pydub import AudioSegment
from scipy.io.wavfile import write

from myrecorder import recorder
from src.denoise import denoise
# Sample rate (Hz) of the audio handed to `denoise` and of the denoised playback.
SR = 16000

# Pixel height of the bordered input/output containers created in `main`.
# `main` reads this name at module level; without it the page fails with a
# NameError. The value is a layout choice (heading + player + spectrogram)
# and can be tuned freely.
CONTAINER_HEIGHT = 400
def np_audio_to_bytesio(np_audio, np_audio_sr):
    """Encode a NumPy sample array as WAV and return the encoded bytes.

    Args:
        np_audio: Sample array in a dtype accepted by
            ``scipy.io.wavfile.write``.
        np_audio_sr: Sample rate in Hz written into the WAV header.

    Returns:
        bytes: The complete WAV file content. Despite the function name the
        result is a ``bytes`` object, not a ``BytesIO``.
    """
    buffer = io.BytesIO()
    write(buffer, np_audio_sr, np_audio)
    # getvalue() returns the whole buffer regardless of where the writer left
    # the stream position, so the result does not depend on the writer
    # rewinding the stream for us.
    return buffer.getvalue()
def autoplay_audio(audio: bytes):
    """Embed WAV audio in the page as an auto-playing HTML ``<audio>`` element.

    Args:
        audio: Raw WAV file content. ``base64.b64encode`` requires a
            bytes-like object, so passing a ``str`` raises ``TypeError``.
    """
    audio_base64 = base64.b64encode(audio).decode('utf-8')
    # The element is closed explicitly so that nothing rendered after it can
    # be parsed as the audio element's fallback content.
    audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}"></audio>'
    st.markdown(audio_tag, unsafe_allow_html=True)
# "<lang>_<snr>..." - a language prefix, an underscore, then a signed integer.
_NOISY_SPEECH_NAME = re.compile(r'([^_]+)_([+-]?\d+)')


def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
    """Index the bundled noisy speech samples by language and SNR.

    Entries of ``root`` are expected to be named ``<lang>_<snr>...`` (for
    example ``en_10dB.wav``), where ``<lang>`` is ``en`` or ``ja`` in any
    letter case and ``<snr>`` is an integer of any width, optionally signed.
    Entries that do not follow this pattern, or that use another language
    prefix, are ignored instead of aborting the whole listing.

    Args:
        root: Directory to scan. The default is resolved once, when the
            module is imported.

    Returns:
        dict: ``{'EN': {...}, 'JA': {...}}`` where each inner dict maps a
        label such as ``'10dB'`` to the sample path, ordered from the highest
        SNR to the lowest.

    Raises:
        OSError: If ``root`` cannot be listed.
    """
    paths_by_lang = {'EN': {}, 'JA': {}}
    for name in os.listdir(root):
        parsed = _NOISY_SPEECH_NAME.match(name)
        if parsed is None:
            continue  # stray file such as .DS_Store
        lang = parsed.group(1).upper()
        if lang not in paths_by_lang:
            continue
        # Parse every digit of the SNR; slicing a fixed two characters broke
        # on single-digit values ("5d") and truncated three-digit ones.
        paths_by_lang[lang][int(parsed.group(2))] = os.path.join(root, name)

    return {
        lang: {f'{snr}dB': by_snr[snr] for snr in sorted(by_snr, reverse=True)}
        for lang, by_snr in paths_by_lang.items()
    }
def load_wav(wav_path):
    """Load an audio file and return it at two sample rates.

    ``librosa.load`` is called without an explicit ``sr``, so the first array
    is at librosa's default loading rate (22050 Hz per the librosa
    documentation); the second is that array resampled to ``SR``.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        tuple: ``(waveform at the loading rate, waveform at SR)``.
    """
    loaded, loaded_sr = librosa.load(wav_path)
    return loaded, librosa.resample(loaded, orig_sr=loaded_sr, target_sr=SR)
def wav_to_spec(wav, sr):
    """Compute a dB-scaled magnitude spectrogram of ``wav`` at 22050 Hz.

    The spectrogram is later drawn by ``export_spec_to_buffer``, which calls
    ``librosa.display.specshow`` without a sample rate and therefore labels
    the axes for 22050 Hz audio. Any input at another rate is resampled first
    so the plotted time and frequency axes stay correct, not only 16 kHz
    input.

    Args:
        wav: 1-D waveform accepted by ``librosa.resample`` / ``librosa.stft``.
        sr: Sample rate of ``wav`` in Hz.

    Returns:
        The STFT magnitude converted to dB relative to its maximum.
    """
    display_sr = 22050
    if sr != display_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=display_sr)
    magnitude = np.abs(librosa.stft(wav))
    return librosa.amplitude_to_db(magnitude, ref=np.max)
def export_spec_to_buffer(spec):
    """Render a spectrogram as a JPEG held in memory.

    Args:
        spec: 2-D spectrogram, as produced by ``wav_to_spec``.

    Returns:
        BytesIO: Buffer containing the JPEG image, rewound to position 0 so
        it can be read directly.
    """
    label_sizes = {
        'axes.labelsize': 15,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
    }
    img_buffer = BytesIO()
    # rc_context keeps the font-size overrides local to this plot instead of
    # permanently changing matplotlib's global settings.
    with plt.rc_context(label_sizes):
        # The size is given to the figure itself. Setting
        # rcParams['figure.figsize'] after the figure already exists has no
        # effect on it, which left the first image rendered in a process at
        # matplotlib's default size.
        fig = plt.figure(figsize=(16, 3.6))
        try:
            # specshow draws on the current axes of the figure created above.
            librosa.display.specshow(spec, y_axis='linear', x_axis='time')
            fig.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)
    img_buffer.seek(0)
    return img_buffer
def process_recorded_wav_bytes(wav_bytes, sr):
    """Decode recorded WAV bytes into mono float32 arrays at two sample rates.

    The recording is converted to 16-bit mono, then resampled once to ``sr``
    and once to ``SR``. Samples are cast to float32 without any scaling, so
    they keep the 16-bit integer value range.

    Args:
        wav_bytes: Content of a WAV file.
        sr: Sample rate in Hz of the first returned array.

    Returns:
        tuple: ``(samples at sr, samples at SR)`` as float32 NumPy arrays.
    """
    segment = AudioSegment.from_file(file=BytesIO(wav_bytes), format='wav')
    mono_16bit = segment.set_sample_width(2).set_channels(1)

    def _samples_at(rate):
        resampled = mono_16bit.set_frame_rate(rate)
        return np.array(resampled.get_array_of_samples(), dtype=np.float32)

    return _samples_at(sr), _samples_at(SR)
# Small vertical spacer shown above the logo in the page header.
_LOGO_SPACER_HTML = """
<div style="display: flex; justify-content: left;">
<b><span style="text-align: center; color: #101414; font-size: 10px"><br></span></b>
</div>
"""

# Styles for the round "info" button and the tooltip that appears on hover.
_TOOLTIP_CSS = """
<style>
.tooltip {
position: relative;
display: inline-block;
cursor: pointer;
background-color: rgba(0, 76, 153, 1); /* Blue button color */
padding: 10px;
border-radius: 50%;
font-size: 16px;
font-weight: bold;
width: 40px;
height: 40px;
text-align: center;
line-height: 20px;
color: white; /* Text color */
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
}
.tooltip .tooltiptext {
visibility: hidden;
width: 300px; /* Adjust width for readability */
background-color: #333; /* Dark background for contrast */
color: #fff;
text-align: left; /* Align text to the left */
border-radius: 8px;
padding: 15px; /* Add padding for spacing */
position: absolute;
z-index: 1;
top: 150%; /* Position below the button */
left: 50%;
transform: translateX(-50%);
opacity: 0;
transition: opacity 0.3s;
font-size: 14px;
line-height: 1.8; /* Adjust line height for readability */
white-space: normal; /* Allow wrapping of text */
}
.tooltip:hover .tooltiptext {
visibility: visible;
opacity: 1;
}
</style>
"""

# Markup of the "info" button together with its usage instructions.
_TOOLTIP_HTML = """
<div class="tooltip">
ℹ
<span class="tooltiptext">
<strong>Steps:</strong><br>
1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
3) Both the original audio and denoised audio will be available for playback.<br><br>
<strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
</span>
</div>
"""


def _render_header():
    """Draw the top row of the page: logo, title image and help tooltip."""
    logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")
    with logo_space:
        st.write(_LOGO_SPACER_HTML, unsafe_allow_html=True)
        st.image('logo.png', width=48)
    with title_space:
        st.image('title.png', width=640)
    with tooltip_space:
        st.markdown(_TOOLTIP_CSS, unsafe_allow_html=True)
        st.markdown(_TOOLTIP_HTML, unsafe_allow_html=True)


def _render_heading(title):
    """Write the large grey centred heading shown at the top of a container."""
    st.write(
        f"""
        <div style="display: flex; justify-content: center;">
        <b><span style="text-align: center; color: #808080; font-size: 51.5px">{title}</span></b>
        </div>
        """,
        unsafe_allow_html=True
    )


def _render_denoised(noisy_wav):
    """Denoise ``noisy_wav`` and show the result's player and spectrogram."""
    with st.spinner("Denoising..."):
        denoised_wav = denoise(noisy_wav)
    st.audio(denoised_wav, sample_rate=SR, format="audio/wav")
    denoised_spec = wav_to_spec(denoised_wav, sr=SR)
    st.image(image=export_spec_to_buffer(denoised_spec))


def _render_samples_tab():
    """Draw the tab that denoises one of the bundled noisy samples."""
    noisy_speech_files = load_noisy_speech()
    input_space, output_space = st.columns([1, 1], gap="medium")
    _, _, _, compute_space = st.columns([0.7, 1, 1, 1], gap="small")
    with compute_space:
        compute = st.button('Denoise', key='denoise_tab1')

    # Stays empty when no sample can be selected, so the output container
    # below always has a defined array to test.
    noisy_wav = np.array([])
    with input_space.container(height=CONTAINER_HEIGHT, border=True):
        lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
        with lang_select_space:
            language_select = st.selectbox("Language", list(noisy_speech_files))
        snr_select = None
        with snr_select_space:
            if language_select:
                snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select]))
        if snr_select is None:
            # A language without any sample yields no selection; report it
            # instead of failing on the path lookup.
            st.warning("No sample is available for the selected language.")
        else:
            audio_path = noisy_speech_files[language_select][snr_select]
            noisy_wav_22k, noisy_wav = load_wav(audio_path)
            noisy_spec = wav_to_spec(noisy_wav_22k, sr=22050)
            noisy_spec_buff = export_spec_to_buffer(noisy_spec)
            # st.audio expects a MIME type for `format`, not a file extension.
            st.audio(audio_path, format="audio/wav")
            st.image(image=noisy_spec_buff)

    with output_space.container(height=CONTAINER_HEIGHT, border=True):
        _render_heading("Output")
        if noisy_wav.any() and compute:
            _render_denoised(noisy_wav)


def _render_recording_tab():
    """Draw the tab that records the user's speech and denoises it."""
    input_space, output_space = st.columns([1, 1], gap="medium")
    _, record_space, _, compute_space = st.columns([0.7, 1, 1, 1], gap="small")
    with record_space:
        record = recorder(
            start_prompt="Start Recording",
            stop_prompt="Stop Recording",
            just_once=False,
            use_container_width=False,
            format="wav",
            callback=None,
            args=(),
            kwargs={},
            key="tab2_recorder"
        )
    with compute_space:
        compute = st.button('Denoise', key='denoise_tab2')

    # Stays empty until a recording is available.
    noisy_wav = np.array([])
    with input_space.container(height=CONTAINER_HEIGHT, border=True):
        _render_heading("Input")
        if record:
            wav_bytes_record = record['bytes']
            noisy_wav_22k, noisy_wav = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
            noisy_spec = wav_to_spec(noisy_wav_22k, sr=22050)
            noisy_spec_buff = export_spec_to_buffer(noisy_spec)
            # st.audio expects a MIME type for `format`, not a file extension.
            st.audio(wav_bytes_record, format="audio/wav")
            st.image(image=noisy_spec_buff)

    with output_space.container(height=CONTAINER_HEIGHT, border=True):
        _render_heading("Output")
        if noisy_wav.any() and compute:
            _render_denoised(noisy_wav)


def main():
    """Build the speech-denoising page.

    The page has a header row (logo, title, help tooltip) and two tabs: one
    that denoises a bundled noisy sample chosen by language and SNR, and one
    that denoises speech recorded in the browser. In both tabs the noisy
    input is shown with a player and a spectrogram, and pressing "Denoise"
    shows the same for the denoised result.
    """
    st.set_page_config(
        page_title="speech-denoising-app",
        layout="wide"
    )
    _render_header()
    tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])
    with tab1:
        _render_samples_tab()
    with tab2:
        _render_recording_tab()
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()