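"""Streamlit demo app for speech denoising.

Record speech in the browser or pick a bundled noisy sample (English/Japanese
at several SNR levels), inspect its log-spectrogram, and listen to the output
of ``src.denoise.denoise`` next to it.
"""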
import base64
import io
import os
from io import BytesIO

import librosa
import librosa.display  # needed explicitly for librosa.display.specshow
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from pydub import AudioSegment
from scipy.io.wavfile import write

from src.denoise import denoise
from myrecorder import recorder


SR = 16000               # sample rate (Hz) fed to the denoiser and used for playback
CONTAINER_HEIGHT = 380   # pixel height of the input/output containers


def np_audio_to_bytesio(np_audio, np_audio_sr):
    """Encode a NumPy waveform as WAV bytes."""
    byte_io = io.BytesIO()
    write(byte_io, np_audio_sr, np_audio)  # scipy.io.wavfile.write
    byte_io.seek(0)                        # rewind, otherwise read() returns b''
    return byte_io.read()


def autoplay_audio(audio: bytes):
    """Embed WAV bytes in an auto-playing HTML <audio> element."""
    audio_base64 = base64.b64encode(audio).decode('utf-8')
    audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}"></audio>'
    st.markdown(audio_tag, unsafe_allow_html=True)


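# NOTE: files in noisy_speech/ are assumed to be named '<lang>_<snr>dB*.wav'
# (e.g. 'en_10dB_sample.wav'); the language code and the two-digit SNR below
# are parsed from that pattern.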
def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
    """Index bundled noisy samples as {'EN': {'10dB': path, ...}, 'JA': {...}}."""
    noisy_speech_paths = {'EN': {}, 'JA': {}}
    for name in os.listdir(root):
        splt = name.split('_')
        lang, snr = splt[0].upper(), int(splt[1][:2])
        noisy_speech_paths[lang][snr] = os.path.join(root, name)

    # Order each language's entries from highest to lowest SNR and label them in dB.
    for lang in ('EN', 'JA'):
        by_snr = noisy_speech_paths[lang]
        noisy_speech_paths[lang] = {
            f'{snr}dB': by_snr[snr] for snr in sorted(by_snr, reverse=True)
        }

    return noisy_speech_paths


def load_wav(wav_path):
    """Load a file at librosa's default 22.05 kHz and also return a 16 kHz copy."""
    wav_22k, sr = librosa.load(wav_path)                           # sr defaults to 22050
    wav_16k = librosa.resample(wav_22k, orig_sr=sr, target_sr=SR)  # 16 kHz for the denoiser
    return wav_22k, wav_16k


def wav_to_spec(wav, sr):
    """Compute a log-magnitude spectrogram at 22.05 kHz for display."""
    if sr != 22050:
        # Resample so all spectrograms share the same frequency axis.
        wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
    spec = np.abs(librosa.stft(wav))
    spec = librosa.amplitude_to_db(spec, ref=np.max)
    return spec


def export_spec_to_buffer(spec):
    """Render a spectrogram with librosa/matplotlib and return it as a JPEG buffer."""
    plt.rcParams['figure.figsize'] = (16, 4.5)
    plt.rc('axes', labelsize=15)
    plt.rc('xtick', labelsize=15)
    plt.rc('ytick', labelsize=15)
    librosa.display.specshow(spec, y_axis='log', x_axis='time')
    img_buffer = BytesIO()
    plt.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
    plt.close()         # free the figure so plots do not accumulate across reruns
    img_buffer.seek(0)  # rewind so st.image reads from the start
    return img_buffer


def process_recorded_wav_bytes(wav_bytes, sr):
    """Decode recorded WAV bytes into mono float32 arrays at `sr` and at SR (16 kHz)."""
    audio = AudioSegment.from_file(file=BytesIO(wav_bytes), format='wav')
    audio = audio.set_sample_width(2)  # 16-bit samples
    audio = audio.set_channels(1)      # mono
    audio_22k = audio.set_frame_rate(sr)
    audio_16k = audio.set_frame_rate(SR)
    # Scale int16 samples to [-1, 1] so both input paths match librosa.load's range
    # (assumed to be what the denoiser expects).
    audio_22k = np.array(audio_22k.get_array_of_samples(), dtype=np.float32) / 32768.0
    audio_16k = np.array(audio_16k.get_array_of_samples(), dtype=np.float32) / 32768.0
    return audio_22k, audio_16k


def main():
    st.set_page_config(
        page_title="speech-denoising-app",
        layout="wide"
    )

    logo_space, title_space, _ = st.columns([1, 5, 1], gap="small")

    with logo_space:
        st.write(
            """
            <div style="display: flex; justify-content: left;">
                <b><span style="text-align: center; color: #101414; font-size: 14px">FPT Corporation</span></b>
            </div>
            """,
            unsafe_allow_html=True
        )
        st.image('aic-logo.png')

    with title_space:
        st.image('logo.png')

    noisy_speech_files = load_noisy_speech()

    # Two side-by-side panels (noisy input / denoised output) plus a control row.
    input_space, output_space = st.columns([1, 1], gap="medium")
    _, record_space, _, compute_space = st.columns([0.7, 1, 1, 1], gap="small")

    with record_space:
        record = recorder(
            start_prompt="Start Recording",
            stop_prompt="Stop Recording",
            just_once=False,
            use_container_width=False,
            format="wav",
            callback=None,
            args=(),
            kwargs={},
            key=None
        )

    with compute_space:
        compute = st.button('Denoise')

    with input_space.container(height=CONTAINER_HEIGHT, border=True):
        lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
        with lang_select_space:
            language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
        with snr_select_space:
            if language_select:
                snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))

        if record:
            # A fresh recording takes priority over the bundled samples.
            wav_bytes_record = record['bytes']
            noisy_wav_22k, noisy_wav = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
            noisy_spec = wav_to_spec(noisy_wav_22k, sr=22050)
            noisy_spec_buff = export_spec_to_buffer(noisy_spec)

            st.audio(wav_bytes_record, format="audio/wav")
            st.image(image=noisy_spec_buff)

        elif language_select and snr_select:
            audio_path = noisy_speech_files[language_select][snr_select]
            noisy_wav_22k, noisy_wav = load_wav(audio_path)
            noisy_spec = wav_to_spec(noisy_wav_22k, sr=22050)
            noisy_spec_buff = export_spec_to_buffer(noisy_spec)

            st.audio(audio_path, format="audio/wav")
            st.image(image=noisy_spec_buff)

    with output_space.container(height=CONTAINER_HEIGHT, border=True):
        st.write(
            """
            <div style="display: flex; justify-content: center;">
                <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
            </div>
            """,
            unsafe_allow_html=True
        )
        if compute and noisy_wav.any():
            denoised_wav = denoise(noisy_wav)
            st.audio(denoised_wav, sample_rate=SR, format="audio/wav")
            denoised_spec = wav_to_spec(denoised_wav, sr=SR)
            denoised_spec_buff = export_spec_to_buffer(denoised_spec)
            st.image(image=denoised_spec_buff)


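# Run through Streamlit, e.g.: streamlit run <this_file>.py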
if __name__ == '__main__':
    main()