|
import io
|
|
import os
|
|
import base64
|
|
import librosa
|
|
import numpy as np
|
|
from io import BytesIO
|
|
import streamlit as st
|
|
from pydub import AudioSegment
|
|
import matplotlib.pyplot as plt
|
|
from scipy.io.wavfile import write
|
|
from src.denoise import denoise
|
|
from myrecorder import recorder
|
|
|
|
|
|
# Sample rate (Hz) that audio is resampled to before it is handed to denoise(),
# and the rate declared when the denoised result is played back.
SR = 16000
|
|
# Height passed to every bordered input/output st.container in the page, so the
# panels line up.
CONTAINER_HEIGHT = 340
|
|
|
|
|
|
def np_audio_to_bytesio(np_audio, np_audio_sr):
    """Encode a NumPy audio array as WAV and return the encoded bytes.

    Despite the name, the return value is a ``bytes`` object, not a
    ``BytesIO`` instance.

    Args:
        np_audio: Sample array accepted by ``scipy.io.wavfile.write``.
        np_audio_sr: Sample rate (Hz) written into the WAV header.

    Returns:
        bytes: The complete WAV file content.
    """
    byte_io = io.BytesIO()
    write(byte_io, np_audio_sr, np_audio)
    # getvalue() returns the whole buffer no matter where the writer left the
    # stream position; read() would only return what follows that position.
    return byte_io.getvalue()
|
|
|
|
|
|
def autoplay_audio(audio: bytes):
    """Inject an auto-playing ``<audio>`` element into the Streamlit page.

    The audio is embedded inline as a base64 ``data:audio/wav`` URI.

    Args:
        audio: Raw bytes of the audio file. ``base64.b64encode`` only accepts
            bytes-like objects, so passing a ``str`` raises ``TypeError``.
    """
    audio_base64 = base64.b64encode(audio).decode('utf-8')
    # Close the element explicitly so that markup rendered afterwards cannot
    # be absorbed as the audio element's fallback content.
    audio_tag = (
        f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}">'
        '</audio>'
    )
    st.markdown(audio_tag, unsafe_allow_html=True)
|
|
|
|
|
|
def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
    """Index the sample recordings in *root* by language and SNR level.

    File names are expected to look like ``<lang>_<snr>...`` (for example
    ``en_10dB.wav``), where ``<lang>`` is ``en`` or ``ja`` in any letter case
    and ``<snr>`` starts with an integer. Entries that do not follow this
    pattern (hidden files, READMEs, other languages) are skipped instead of
    aborting the whole listing.

    Args:
        root: Directory containing the sample files. The default is resolved
            from the working directory at import time.

    Returns:
        dict: ``{'EN': {...}, 'JA': {...}}`` where each inner dict maps a
        label such as ``'10dB'`` to the file path, ordered from the highest
        SNR to the lowest.

    Raises:
        OSError: If *root* cannot be listed (propagated from ``os.listdir``).
    """
    import re  # local import keeps this function self-contained

    leading_int = re.compile(r'-?\d+')
    paths_by_lang = {'EN': {}, 'JA': {}}

    for name in os.listdir(root):
        lang_token, _, remainder = name.partition('_')
        lang = lang_token.upper()
        snr_match = leading_int.match(remainder)
        if lang not in paths_by_lang or snr_match is None:
            # Not one of our samples; ignore it rather than crash.
            continue
        paths_by_lang[lang][int(snr_match.group())] = os.path.join(root, name)

    # Rebuild each language's mapping with "<snr>dB" labels, highest SNR first
    # (dicts preserve insertion order, which drives the select box order).
    return {
        lang: {f'{snr}dB': paths[snr] for snr in sorted(paths, reverse=True)}
        for lang, paths in paths_by_lang.items()
    }
|
|
|
|
|
|
def load_wav(wav_path):
    """Load an audio file and also produce a copy resampled to ``SR``.

    Args:
        wav_path: Path of the audio file, passed straight to ``librosa.load``
            with its default sample-rate handling.

    Returns:
        tuple: ``(loaded, resampled)`` where ``loaded`` is the signal exactly
        as returned by ``librosa.load`` and ``resampled`` is that signal
        converted to ``SR`` Hz.
    """
    loaded, loaded_sr = librosa.load(wav_path)
    return loaded, librosa.resample(loaded, orig_sr=loaded_sr, target_sr=SR)
|
|
|
|
|
|
def wav_to_spec(wav, sr):
    """Compute a dB-scaled magnitude spectrogram of *wav*.

    The signal is first brought to 22050 Hz so that every spectrogram shares
    the same time/frequency grid. Previously only 16000 Hz input was
    resampled, so any other rate produced a spectrogram on a different grid.

    Args:
        wav: 1-D audio signal.
        sr: Sample rate of *wav* in Hz.

    Returns:
        The STFT magnitude converted to decibels relative to its own maximum.
    """
    # NOTE(review): 22050 matches the default ``sr`` that
    # librosa.display.specshow assumes when none is given - confirm if the
    # plotting call ever starts passing an explicit rate.
    display_sr = 22050
    if sr != display_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=display_sr)
    magnitude = np.abs(librosa.stft(wav))
    return librosa.amplitude_to_db(magnitude, ref=np.max)
|
|
|
|
|
|
def export_spec_to_buffer(spec):
    """Render a spectrogram to an in-memory JPEG image.

    Args:
        spec: 2-D spectrogram array understood by
            ``librosa.display.specshow``.

    Returns:
        BytesIO: Buffer holding the JPEG data, rewound to position 0.
    """
    # Import the submodule explicitly: a bare ``import librosa`` does not
    # guarantee that ``librosa.display`` is loaded on every librosa release.
    import librosa.display

    label_sizes = {
        'axes.labelsize': 15,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
    }
    # Scope the font sizes to this figure instead of mutating global rcParams,
    # and create the figure with an explicit size. The previous code called
    # plt.clf() before setting 'figure.figsize', so the very first image was
    # drawn at matplotlib's default size.
    with plt.rc_context(label_sizes):
        fig, ax = plt.subplots(figsize=(16, 3.6))
        try:
            librosa.display.specshow(spec, y_axis='linear', x_axis='time', ax=ax)
            img_buffer = BytesIO()
            fig.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even if drawing or encoding fails.
            plt.close(fig)

    img_buffer.seek(0)
    return img_buffer
|
|
|
|
|
|
def process_recorded_wav_bytes(wav_bytes, sr):
    """Decode recorded WAV bytes into two mono float32 sample arrays.

    The recording is converted to a sample width of 2 bytes and a single
    channel, then resampled once to *sr* and once to ``SR``.

    Args:
        wav_bytes: Content of a WAV file.
        sr: Frame rate (Hz) for the first returned array.

    Returns:
        tuple: ``(samples_at_sr, samples_at_SR)`` as ``np.float32`` arrays.
    """
    # NOTE(review): the integer samples are cast to float32 without any
    # rescaling, while load_wav() returns whatever scale librosa.load uses -
    # confirm denoise() is happy with both.
    segment = (
        AudioSegment.from_file(file=BytesIO(wav_bytes), format='wav')
        .set_sample_width(2)
        .set_channels(1)
    )

    def samples_at(rate):
        resampled = segment.set_frame_rate(rate)
        return np.array(resampled.get_array_of_samples(), dtype=np.float32)

    return samples_at(sr), samples_at(SR)
|
|
|
|
|
|
def main():
    """Build the speech-denoising Streamlit page.

    Configures the page, draws the header row, then renders two tabs: one for
    denoising the bundled sample recordings and one for denoising audio
    recorded in the browser.
    """
    st.set_page_config(
        page_title="speech-denoising-app",
        layout="wide"
    )

    _render_header()

    tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])

    with tab1:
        _render_samples_tab()

    with tab2:
        _render_recording_tab()


def _render_header():
    """Draw the top row: company logo, title image and the help tooltip."""
    logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")

    with logo_space:
        st.write(
            """
            <div style="display: flex; justify-content: left;">
            <b><span style="text-align: center; color: #101414; font-size: 10px">FPT Corporation</span></b>
            </div>
            """,
            unsafe_allow_html=True
        )
        st.image('logo.png', width=48)

    with title_space:
        st.image('title.png', width=640)

    with tooltip_space:
        # Style sheet for the hover tooltip rendered just below.
        st.markdown(
            """
            <style>
            .tooltip {
            position: relative;
            display: inline-block;
            cursor: pointer;
            background-color: rgba(0, 76, 153, 1); /* Blue button color */
            padding: 10px;
            border-radius: 50%;
            font-size: 16px;
            font-weight: bold;
            width: 40px;
            height: 40px;
            text-align: center;
            line-height: 20px;
            color: white; /* Text color */
            box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
            }

            .tooltip .tooltiptext {
            visibility: hidden;
            width: 300px; /* Adjust width for readability */
            background-color: #333; /* Dark background for contrast */
            color: #fff;
            text-align: left; /* Align text to the left */
            border-radius: 8px;
            padding: 15px; /* Add padding for spacing */
            position: absolute;
            z-index: 1;
            top: 150%; /* Position below the button */
            left: 50%;
            transform: translateX(-50%);
            opacity: 0;
            transition: opacity 0.3s;
            font-size: 14px;
            line-height: 1.8; /* Adjust line height for readability */
            white-space: normal; /* Allow wrapping of text */
            }

            .tooltip:hover .tooltiptext {
            visibility: visible;
            opacity: 1;
            }
            </style>
            """,
            unsafe_allow_html=True,
        )

        st.markdown(
            """
            <div class="tooltip">
            ℹ
            <span class="tooltiptext">
            <strong>Steps:</strong><br>
            1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
            2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
            3) Both the original audio and denoised audio will be available for playback.<br><br>
            <strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
            </span>
            </div>
            """,
            unsafe_allow_html=True,
        )


def _render_panel_heading(label):
    """Write the large grey centred heading used at the top of a panel."""
    st.write(
        f"""
        <div style="display: flex; justify-content: center;">
        <b><span style="text-align: center; color: #808080; font-size: 51.5px">{label}</span></b>
        </div>
        """,
        unsafe_allow_html=True
    )


def _render_denoised_result(noisy_wav):
    """Denoise *noisy_wav* and show the result's audio player and spectrogram."""
    with st.spinner("Denoising..."):
        denoised_wav = denoise(noisy_wav)
    st.audio(denoised_wav, sample_rate=SR, format="audio/wav")
    denoised_spec = wav_to_spec(denoised_wav, sr=SR)
    st.image(image=export_spec_to_buffer(denoised_spec))


def _render_samples_tab():
    """Render the tab that denoises one of the bundled sample recordings."""
    noisy_speech_files = load_noisy_speech()

    input_space_tab1, output_space_tab1 = st.columns([1, 1], gap="medium")
    _, _, _, compute_space_tab1 = st.columns([0.7, 1, 1, 1], gap="small")

    with compute_space_tab1:
        compute_tab1 = st.button('Denoise', key='denoise_tab1')

    # Stays empty when no sample could be selected, which keeps the output
    # panel idle instead of failing on an undefined name.
    noisy_wav_tab1 = np.array([])
    with input_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
        lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
        with lang_select_space:
            language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
        snr_select = None
        with snr_select_space:
            if language_select:
                snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))

        if snr_select is None:
            # A select box with no options yields None; looking that up used
            # to raise KeyError.
            st.warning("No sample audio is available for this selection.")
        else:
            audio_path_tab1 = noisy_speech_files[language_select][snr_select]
            noisy_wav_22k_tab1, noisy_wav_tab1 = load_wav(audio_path_tab1)
            noisy_spec_tab1 = wav_to_spec(noisy_wav_22k_tab1, sr=22050)
            noisy_spec_buff_tab1 = export_spec_to_buffer(noisy_spec_tab1)

            # ``format`` is a MIME type, matching the denoised-audio player.
            st.audio(audio_path_tab1, format="audio/wav")
            st.image(image=noisy_spec_buff_tab1)

    with output_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
        _render_panel_heading("Output")
        if noisy_wav_tab1.any() and compute_tab1:
            _render_denoised_result(noisy_wav_tab1)


def _render_recording_tab():
    """Render the tab that records speech in the browser and denoises it."""
    input_space_tab2, output_space_tab2 = st.columns([1, 1], gap="medium")
    _, record_space, _, compute_space_tab2 = st.columns([0.7, 1, 1, 1], gap="small")

    with record_space:
        record = recorder(
            start_prompt="Start Recording",
            stop_prompt="Stop Recording",
            just_once=False,
            use_container_width=False,
            format="wav",
            callback=None,
            args=(),
            kwargs={},
            key="tab2_recorder"
        )

    with compute_space_tab2:
        compute_tab2 = st.button('Denoise', key='denoise_tab2')

    # Empty until a recording exists, so the output panel has nothing to do.
    noisy_wav_tab2 = np.array([])
    with input_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
        _render_panel_heading("Input")

        if record:
            wav_bytes_record = record['bytes']
            noisy_wav_22k_tab2, noisy_wav_tab2 = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
            noisy_spec_tab2 = wav_to_spec(noisy_wav_22k_tab2, sr=22050)
            noisy_spec_buff_tab2 = export_spec_to_buffer(noisy_spec_tab2)

            # ``format`` is a MIME type, matching the denoised-audio player.
            st.audio(wav_bytes_record, format="audio/wav")
            st.image(image=noisy_spec_buff_tab2)

    with output_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
        _render_panel_heading("Output")
        if noisy_wav_tab2.any() and compute_tab2:
            _render_denoised_result(noisy_wav_tab2)
|
|
|
|
|
|
# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()