|
import io |
|
import os |
|
import base64 |
|
import librosa |
|
import numpy as np |
|
from io import BytesIO |
|
import streamlit as st |
|
from pydub import AudioSegment |
|
import matplotlib.pyplot as plt |
|
from scipy.io.wavfile import write |
|
from src.denoise import denoise |
|
from myrecorder import recorder |
|
|
|
|
|
# Sample rate (Hz) that input audio is resampled to before it is handed to
# `denoise`, and the rate at which the denoised result is played back.
SR = 16000

# Value passed as `height=` to the bordered input/output containers on both
# tabs, so the panels line up side by side.
CONTAINER_HEIGHT = 340
|
|
|
|
|
def np_audio_to_bytesio(np_audio, np_audio_sr):
    """Encode a NumPy audio array as WAV and return the encoded data.

    Despite the function name, the return value is a plain ``bytes`` object,
    not a ``BytesIO`` instance.

    Args:
        np_audio: Sample array, passed straight to ``scipy.io.wavfile.write``.
        np_audio_sr: Sample rate written into the WAV header.

    Returns:
        The complete WAV file content as ``bytes``.
    """
    buffer = io.BytesIO()
    write(buffer, np_audio_sr, np_audio)
    # getvalue() returns the whole buffer regardless of where `write` left the
    # stream position, whereas read() only returns what follows that position.
    return buffer.getvalue()
|
|
|
|
|
def autoplay_audio(audio: bytes) -> None:
    """Embed audio in the page as an auto-playing HTML ``<audio>`` element.

    Args:
        audio: Raw audio data. It must be a bytes-like object because it is
            fed to ``base64.b64encode``; a ``str`` would raise ``TypeError``.
            The data is labelled as ``audio/wav`` in the generated data URI.
    """
    audio_base64 = base64.b64encode(audio).decode('utf-8')
    audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}">'
    # Raw HTML is required here; Streamlit escapes markup unless told otherwise.
    st.markdown(audio_tag, unsafe_allow_html=True)
|
|
|
|
|
def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
    """Index the noisy speech sample files by language and SNR.

    File names are expected to look like ``<lang>_<snr>...`` where ``<lang>``
    is ``en`` or ``ja`` (case-insensitive) and the second underscore-separated
    field starts with an integer SNR, e.g. ``en_10dB.wav`` or ``ja_-5dB.wav``.
    Directory entries that do not follow this pattern (hidden files, other
    languages, ...) are ignored instead of raising.

    Args:
        root: Directory to scan. Note that the default is evaluated once, when
            the function is defined, from the working directory at that time.

    Returns:
        ``{'EN': {...}, 'JA': {...}}``. Each inner dict maps a label such as
        ``'10dB'`` to the file's path and is ordered from the highest SNR to
        the lowest. If two files share a language and SNR, whichever
        ``os.listdir`` yields last wins.
    """
    import re  # function-scope import: only this parser needs it

    # Whole leading integer of the SNR field, so '0dB', '-10dB' and '100dB'
    # are read correctly (a fixed two-character slice mis-parses all three).
    name_pattern = re.compile(r'([^_]+)_([+-]?\d+)')

    paths_by_lang = {'EN': {}, 'JA': {}}
    for name in os.listdir(root):
        match = name_pattern.match(name)
        if match is None:
            continue
        lang = match.group(1).upper()
        if lang not in paths_by_lang:
            continue
        paths_by_lang[lang][int(match.group(2))] = os.path.join(root, name)

    return {
        lang: {f'{snr}dB': by_snr[snr] for snr in sorted(by_snr, reverse=True)}
        for lang, by_snr in paths_by_lang.items()
    }
|
|
|
|
|
def load_wav(wav_path):
    """Load an audio file and return it at two sample rates.

    Args:
        wav_path: Path of the audio file to read.

    Returns:
        A ``(wav_22k, wav_16k)`` tuple: the signal exactly as
        ``librosa.load`` returns it with default settings (named "22k" in this
        file, matching librosa's documented default rate of 22050 Hz), and the
        same signal resampled to ``SR``.
    """
    signal, loaded_sr = librosa.load(wav_path)
    return signal, librosa.resample(signal, orig_sr=loaded_sr, target_sr=SR)
|
|
|
|
|
def wav_to_spec(wav, sr):
    """Compute a dB-scaled magnitude spectrogram of an audio signal.

    The signal is first brought to 22050 Hz, so every spectrogram produced
    here shares the same rate. Previously only 16000 Hz input was resampled;
    any other rate was passed through unchanged.

    Args:
        wav: Audio samples, as accepted by ``librosa.resample``/``librosa.stft``.
        sr: Sample rate of ``wav`` in Hz.

    Returns:
        The STFT magnitude converted to decibels relative to its maximum
        (``ref=np.max``).
    """
    spec_sr = 22050
    if sr != spec_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=spec_sr)
    magnitude = np.abs(librosa.stft(wav))
    return librosa.amplitude_to_db(magnitude, ref=np.max)
|
|
|
|
|
def export_spec_to_buffer(spec):
    """Render a spectrogram as a JPEG image held in memory.

    Args:
        spec: Spectrogram array to draw with ``librosa.display.specshow``
            (for example the output of ``wav_to_spec``).

    Returns:
        A ``BytesIO`` holding the JPEG data, rewound to position 0 so it can
        be read directly.
    """
    # Import the submodule explicitly: a bare `import librosa` does not
    # guarantee that `librosa.display` is loaded on every librosa release.
    import librosa.display

    label_sizes = {
        'axes.labelsize': 15,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
    }
    img_buffer = BytesIO()

    # rc_context keeps the font-size overrides local to this render instead of
    # permanently mutating matplotlib's global rcParams.
    with plt.rc_context(label_sizes):
        # Size the figure at creation time. Setting rcParams['figure.figsize']
        # after a figure already exists (as plt.clf() causes on the first
        # call) leaves that figure at the default size.
        fig, ax = plt.subplots(figsize=(16, 3.6))
        try:
            librosa.display.specshow(spec, y_axis='linear', x_axis='time', ax=ax)
            fig.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even if drawing or encoding fails.
            plt.close(fig)

    img_buffer.seek(0)
    return img_buffer
|
|
|
|
|
def process_recorded_wav_bytes(wav_bytes, sr):
    """Decode recorded WAV bytes into mono float32 sample arrays at two rates.

    Args:
        wav_bytes: WAV-encoded audio data.
        sr: Frame rate (Hz) for the first returned array.

    Returns:
        A tuple ``(audio_at_sr, audio_at_SR)`` of float32 NumPy arrays. Both
        come from the same recording after it has been converted to a 2-byte
        sample width and a single channel; the second is at the module-level
        rate ``SR``.
    """
    def _to_float32(segment):
        # NOTE(review): only the dtype changes here; sample values are not
        # rescaled (presumably they stay in the 16-bit integer range, unlike
        # librosa-loaded audio) - confirm this is what `denoise` expects.
        return np.array(segment.get_array_of_samples(), dtype=np.float32)

    recording = AudioSegment.from_file(file=BytesIO(wav_bytes), format='wav')
    recording = recording.set_sample_width(2).set_channels(1)

    at_requested_rate = recording.set_frame_rate(sr)
    at_model_rate = recording.set_frame_rate(SR)
    return _to_float32(at_requested_rate), _to_float32(at_model_rate)
|
|
|
|
|
def _render_header():
    """Draw the top row of the page: logo, title image and the help tooltip."""
    logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")

    with logo_space:
        # A div whose only content is a <br>; presumably a small vertical
        # spacer that nudges the logo down.
        st.write(
            """
            <div style="display: flex; justify-content: left;">
                <b><span style="text-align: center; color: #101414; font-size: 10px"><br></span></b>
            </div>
            """,
            unsafe_allow_html=True
        )
        st.image('logo.png', width=48)

    with title_space:
        st.image('title.png', width=640)

    with tooltip_space:
        # CSS for the round info button and its hover text.
        st.markdown(
            """
            <style>
            .tooltip {
                position: relative;
                display: inline-block;
                cursor: pointer;
                background-color: rgba(0, 76, 153, 1); /* Blue button color */
                padding: 10px;
                border-radius: 50%;
                font-size: 16px;
                font-weight: bold;
                width: 40px;
                height: 40px;
                text-align: center;
                line-height: 20px;
                color: white; /* Text color */
                box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
            }

            .tooltip .tooltiptext {
                visibility: hidden;
                width: 300px; /* Adjust width for readability */
                background-color: #333; /* Dark background for contrast */
                color: #fff;
                text-align: left; /* Align text to the left */
                border-radius: 8px;
                padding: 15px; /* Add padding for spacing */
                position: absolute;
                z-index: 1;
                top: 150%; /* Position below the button */
                left: 50%;
                transform: translateX(-50%);
                opacity: 0;
                transition: opacity 0.3s;
                font-size: 14px;
                line-height: 1.8; /* Adjust line height for readability */
                white-space: normal; /* Allow wrapping of text */
            }

            .tooltip:hover .tooltiptext {
                visibility: visible;
                opacity: 1;
            }
            </style>
            """,
            unsafe_allow_html=True,
        )

        # The info button itself, with the usage instructions as hover text.
        st.markdown(
            """
            <div class="tooltip">
                ℹ
                <span class="tooltiptext">
                    <strong>Steps:</strong><br>
                    1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
                    2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
                    3) Both the original audio and denoised audio will be available for playback.<br><br>
                    <strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
                </span>
            </div>
            """,
            unsafe_allow_html=True,
        )


def _render_panel_title(title):
    """Write the large grey, centred heading shown at the top of a panel."""
    st.write(
        f"""
        <div style="display: flex; justify-content: center;">
            <b><span style="text-align: center; color: #808080; font-size: 51.5px">{title}</span></b>
        </div>
        """,
        unsafe_allow_html=True
    )


def _render_denoised_output(output_space, noisy_wav, compute_clicked):
    """Fill the output panel; denoise and show the result when requested."""
    with output_space.container(height=CONTAINER_HEIGHT, border=True):
        _render_panel_title("Output")
        # Nothing to do for an empty (or all-zero) signal, or before the user
        # has pressed the Denoise button on this run.
        if noisy_wav.any() and compute_clicked:
            with st.spinner("Denoising..."):
                denoised_wav = denoise(noisy_wav)
                st.audio(denoised_wav, sample_rate=SR, format="audio/wav")
                denoised_spec = wav_to_spec(denoised_wav, sr=SR)
                denoised_spec_buff = export_spec_to_buffer(denoised_spec)
                st.image(image=denoised_spec_buff)


def _render_samples_tab():
    """Tab 1: pick one of the bundled noisy samples and denoise it."""
    noisy_speech_files = load_noisy_speech()

    input_space, output_space = st.columns([1, 1], gap="medium")
    _, _, _, compute_space = st.columns([0.7, 1, 1, 1], gap="small")

    with compute_space:
        compute = st.button('Denoise', key='denoise_tab1')

    # Stays empty when no sample can be selected, which disables denoising.
    noisy_wav = np.array([])
    with input_space.container(height=CONTAINER_HEIGHT, border=True):
        lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
        with lang_select_space:
            language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
        snr_select = None
        with snr_select_space:
            if language_select:
                snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))

        if snr_select is None:
            # No file for this language (or no language at all): report it
            # rather than failing on the dictionary lookup below.
            st.warning("No noisy speech samples are available for this selection.")
        else:
            audio_path = noisy_speech_files[language_select][snr_select]
            noisy_wav_22k, noisy_wav = load_wav(audio_path)
            noisy_spec = wav_to_spec(noisy_wav_22k, sr=22050)
            noisy_spec_buff = export_spec_to_buffer(noisy_spec)

            st.audio(audio_path, format="wav")
            st.image(image=noisy_spec_buff)

    _render_denoised_output(output_space, noisy_wav, compute)


def _render_recording_tab():
    """Tab 2: record speech in the browser and denoise the recording."""
    input_space, output_space = st.columns([1, 1], gap="medium")
    _, record_space, _, compute_space = st.columns([0.7, 1, 1, 1], gap="small")

    with record_space:
        record = recorder(
            start_prompt="Start Recording",
            stop_prompt="Stop Recording",
            just_once=False,
            use_container_width=False,
            format="wav",
            callback=None,
            args=(),
            kwargs={},
            key="tab2_recorder"
        )

    with compute_space:
        compute = st.button('Denoise', key='denoise_tab2')

    # Stays empty until a recording exists, which disables denoising.
    noisy_wav = np.array([])
    with input_space.container(height=CONTAINER_HEIGHT, border=True):
        _render_panel_title("Input")

        if record:
            wav_bytes_record = record['bytes']
            noisy_wav_22k, noisy_wav = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
            noisy_spec = wav_to_spec(noisy_wav_22k, sr=22050)
            noisy_spec_buff = export_spec_to_buffer(noisy_spec)

            st.audio(wav_bytes_record, format="wav")
            st.image(image=noisy_spec_buff)

    _render_denoised_output(output_space, noisy_wav, compute)


def main():
    """Build the Streamlit page: header row plus the two denoising tabs.

    The first tab denoises bundled sample files; the second denoises speech
    recorded through the ``recorder`` component.
    """
    st.set_page_config(
        page_title="speech-denoising-app",
        layout="wide"
    )

    _render_header()

    tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])

    with tab1:
        _render_samples_tab()

    with tab2:
        _render_recording_tab()
|
|
|
|
|
# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()