Spaces:

fsoft-ai-center
/

Speech-Enhancement

Sleeping

App Files Files Community

fsoft-ai-center committed on Feb 10

Commit

ea6e252

verified ·

1 Parent(s): 3bbc43f

Update app.py

Browse files

Files changed (1) hide show

app.py +280 -280

app.py CHANGED Viewed

@@ -1,281 +1,281 @@
-import io
-import os
-import base64
-import librosa
-import numpy as np
-from io import BytesIO
-import streamlit as st
-from pydub import AudioSegment
-import matplotlib.pyplot as plt
-from scipy.io.wavfile import write
-from src.denoise import denoise
-from myrecorder import recorder
-SR = 16000
-CONTAINER_HEIGHT = 340
-def np_audio_to_bytesio(np_audio, np_audio_sr):
-    _bytes = bytes()
-    byte_io = io.BytesIO(_bytes)
-    write(byte_io, np_audio_sr, np_audio)
-    bytes_audio = byte_io.read()
-    return bytes_audio
-def autoplay_audio(audio: str):
-    audio_base64 = base64.b64encode(audio).decode('utf-8')
-    audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}">'
-    st.markdown(audio_tag, unsafe_allow_html=True)
-def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
-    noisy_speech_paths = {'EN':{}, 'JA': {}}
-    noisy_speech_names = os.listdir(root)
-    for name in noisy_speech_names:
-        splt = name.split('_')
-        lang, snr = splt[0].upper(), int(splt[1][:2])
-        noisy_speech_paths[lang][snr] = os.path.join(root, name)
-    en_keys = list(noisy_speech_paths['EN'].keys())
-    en_keys.sort()
-    en_keys.reverse()
-    noisy_speech_paths['EN'] = {f'{key}dB': noisy_speech_paths['EN'][key] for key in en_keys}
-    ja_keys = list(noisy_speech_paths['JA'].keys())
-    ja_keys.sort()
-    ja_keys.reverse()
-    noisy_speech_paths['JA'] = {f'{key}dB': noisy_speech_paths['JA'][key] for key in ja_keys}
-    return noisy_speech_paths
-def load_wav(wav_path):
-    wav_22k, sr = librosa.load(wav_path)
-    wav_16k = librosa.resample(wav_22k, orig_sr=sr, target_sr=SR)
-    return wav_22k, wav_16k
-def wav_to_spec(wav, sr):
-    if sr == 16000:
-        wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
-    spec = np.abs(librosa.stft(wav))
-    spec = librosa.amplitude_to_db(spec, ref=np.max)
-    return spec
-def export_spec_to_buffer(spec):
-    plt.clf()
-    plt.rcParams['figure.figsize'] = (16, 3.6)
-    plt.rc('axes', labelsize=15)
-    plt.rc('xtick', labelsize=15)
-    plt.rc('ytick', labelsize=15)
-    librosa.display.specshow(spec, y_axis='log', x_axis='time')
-    img_buffer = BytesIO()
-    img_buffer.truncate(0)  # Remove all contents
-    img_buffer.seek(0)  # Reset the pointer to the start
-    plt.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
-    plt.close('all')
-    return img_buffer
-def process_recorded_wav_bytes(wav_bytes, sr):
-    file = BytesIO(wav_bytes)
-    audio = AudioSegment.from_file(file=file, format='wav')
-    audio = audio.set_sample_width(2)
-    audio = audio.set_channels(1)
-    audio_22k = audio.set_frame_rate(sr)
-    audio_16k = audio.set_frame_rate(SR)
-    audio_22k = np.array(audio_22k.get_array_of_samples(), dtype=np.float32)
-    audio_16k = np.array(audio_16k.get_array_of_samples(), dtype=np.float32)
-    return audio_22k, audio_16k
-def main():
-    st.set_page_config(
-        page_title="speech-denoising-app",
-        layout="wide"
-    )
-    logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")
-    with logo_space:
-        st.image('logo.png', width=48)
-    with title_space:
-        st.image('title.png', width=640)
-    with tooltip_space:
-        st.markdown(
-            """
-            <style>
-            .tooltip {
-                position: relative;
-                display: inline-block;
-                cursor: pointer;
-                background-color: rgba(0, 76, 153, 1); /* Blue button color */
-                padding: 10px;
-                border-radius: 50%;
-                font-size: 16px;
-                font-weight: bold;
-                width: 40px;
-                height: 40px;
-                text-align: center;
-                line-height: 20px;
-                color: white; /* Text color */
-                box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
-            }
-            .tooltip .tooltiptext {
-                visibility: hidden;
-                width: 300px; /* Adjust width for readability */
-                background-color: #333; /* Dark background for contrast */
-                color: #fff;
-                text-align: left; /* Align text to the left */
-                border-radius: 8px;
-                padding: 15px; /* Add padding for spacing */
-                position: absolute;
-                z-index: 1;
-                top: 150%; /* Position below the button */
-                left: 50%;
-                transform: translateX(-50%);
-                opacity: 0;
-                transition: opacity 0.3s;
-                font-size: 14px;
-                line-height: 1.8; /* Adjust line height for readability */
-                white-space: normal; /* Allow wrapping of text */
-            }
-            .tooltip:hover .tooltiptext {
-                visibility: visible;
-                opacity: 1;
-            }
-            </style>
-            """,
-            unsafe_allow_html=True,
-        )
-        st.markdown(
-            """
-            <div class="tooltip">
-                ℹ
-                <span class="tooltiptext">
-                <strong>Steps:</strong><br>
-                1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
-                2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
-                3) Both the original audio and denoised audio will be available for playback.<br><br>
-                <strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
-                </span>
-            </div>
-            """,
-            unsafe_allow_html=True,
-        )
-    tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])
-    with tab1:
-        noisy_speech_files = load_noisy_speech()
-        input_space_tab1, output_space_tab1 = st.columns([1, 1], gap="medium")
-        _, _, _, compute_space_tab1= st.columns([0.7, 1, 1, 1], gap="small")
-        with compute_space_tab1:
-            compute_tab1 = st.button('Denoise', key='denoise_tab1')
-        with input_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
-            lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
-            with lang_select_space:
-                language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
-            with snr_select_space:
-                if language_select:
-                    snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))
-            audio_path_tab1 = noisy_speech_files[language_select][snr_select]
-            noisy_wav_22k_tab1, noisy_wav_tab1 = load_wav(audio_path_tab1)
-            noisy_spec_tab1 = wav_to_spec(noisy_wav_22k_tab1, sr=22050)
-            noisy_spec_buff_tab1 = export_spec_to_buffer(noisy_spec_tab1)
-            st.audio(audio_path_tab1, format="wav")
-            st.image(image=noisy_spec_buff_tab1)
-        with output_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
-            st.write(
-                """
-                <div style="display: flex; justify-content: center;">
-                    <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
-                </div>
-                """,
-                unsafe_allow_html=True
-            )
-            if noisy_wav_tab1.any() and compute_tab1:
-                with st.spinner("Denoising..."):
-                    denoised_wav_tab1 = denoise(noisy_wav_tab1)
-                st.audio(denoised_wav_tab1, sample_rate=SR, format="audio/wav")
-                denoised_spec_tab1 = wav_to_spec(denoised_wav_tab1, sr=SR)
-                denoised_spec_buff_tab1 = export_spec_to_buffer(denoised_spec_tab1)
-                st.image(image=denoised_spec_buff_tab1)
-    with tab2:
-        input_space_tab2, output_space_tab2 = st.columns([1, 1], gap="medium")
-        _, record_space, _, compute_space_tab2 = st.columns([0.7, 1, 1, 1], gap="small")
-        with record_space:
-            record = recorder(
-                start_prompt="Start Recording",
-                stop_prompt="Stop Recording",
-                just_once=False,
-                use_container_width=False,
-                format="wav",
-                callback=None,
-                args=(),
-                kwargs={},
-                key="tab2_recorder"
-            )
-        with compute_space_tab2:
-            compute_tab2 = st.button('Denoise', key='denoise_tab2')
-        noisy_wav_tab2 = np.array([])
-        with input_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
-            st.write(
-                """
-                <div style="display: flex; justify-content: center;">
-                    <b><span style="text-align: center; color: #808080; font-size: 51.5px">Input</span></b>
-                </div>
-                """,
-                unsafe_allow_html=True
-            )
-            if record:
-                wav_bytes_record = record['bytes']
-                sr = record['sample_rate']
-                noisy_wav_22k_tab2, noisy_wav_tab2 = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
-                noisy_spec_tab2 = wav_to_spec(noisy_wav_22k_tab2, sr=22050)
-                noisy_spec_buff_tab2 = export_spec_to_buffer(noisy_spec_tab2)
-                st.audio(wav_bytes_record, format="wav")
-                st.image(image=noisy_spec_buff_tab2)
-        with output_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
-            st.write(
-                """
-                <div style="display: flex; justify-content: center;">
-                    <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
-                </div>
-                """,
-                unsafe_allow_html=True
-            )
-            if noisy_wav_tab2.any() and compute_tab2:
-                with st.spinner("Denoising..."):
-                    denoised_wav_tab2 = denoise(noisy_wav_tab2)
-                st.audio(denoised_wav_tab2, sample_rate=SR, format="audio/wav")
-                denoised_spec_tab2 = wav_to_spec(denoised_wav_tab2, sr=SR)
-                denoised_spec_buff_tab2 = export_spec_to_buffer(denoised_spec_tab2)
-                st.image(image=denoised_spec_buff_tab2)
-                record = None
-if __name__ == '__main__':
     main()

+import io
+import os
+import base64
+import librosa
+import numpy as np
+from io import BytesIO
+import streamlit as st
+from pydub import AudioSegment
+import matplotlib.pyplot as plt
+from scipy.io.wavfile import write
+from src.denoise import denoise
+from myrecorder import recorder
+SR = 16000
+CONTAINER_HEIGHT = 345
+def np_audio_to_bytesio(np_audio, np_audio_sr):
+    _bytes = bytes()
+    byte_io = io.BytesIO(_bytes)
+    write(byte_io, np_audio_sr, np_audio)
+    bytes_audio = byte_io.read()
+    return bytes_audio
+def autoplay_audio(audio: str):
+    audio_base64 = base64.b64encode(audio).decode('utf-8')
+    audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}">'
+    st.markdown(audio_tag, unsafe_allow_html=True)
+def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
+    noisy_speech_paths = {'EN':{}, 'JA': {}}
+    noisy_speech_names = os.listdir(root)
+    for name in noisy_speech_names:
+        splt = name.split('_')
+        lang, snr = splt[0].upper(), int(splt[1][:2])
+        noisy_speech_paths[lang][snr] = os.path.join(root, name)
+    en_keys = list(noisy_speech_paths['EN'].keys())
+    en_keys.sort()
+    en_keys.reverse()
+    noisy_speech_paths['EN'] = {f'{key}dB': noisy_speech_paths['EN'][key] for key in en_keys}
+    ja_keys = list(noisy_speech_paths['JA'].keys())
+    ja_keys.sort()
+    ja_keys.reverse()
+    noisy_speech_paths['JA'] = {f'{key}dB': noisy_speech_paths['JA'][key] for key in ja_keys}
+    return noisy_speech_paths
+def load_wav(wav_path):
+    wav_22k, sr = librosa.load(wav_path)
+    wav_16k = librosa.resample(wav_22k, orig_sr=sr, target_sr=SR)
+    return wav_22k, wav_16k
+def wav_to_spec(wav, sr):
+    if sr == 16000:
+        wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
+    spec = np.abs(librosa.stft(wav))
+    spec = librosa.amplitude_to_db(spec, ref=np.max)
+    return spec
+def export_spec_to_buffer(spec):
+    plt.clf()
+    plt.rcParams['figure.figsize'] = (16, 3.6)
+    plt.rc('axes', labelsize=15)
+    plt.rc('xtick', labelsize=15)
+    plt.rc('ytick', labelsize=15)
+    librosa.display.specshow(spec, y_axis='log', x_axis='time')
+    img_buffer = BytesIO()
+    img_buffer.truncate(0)  # Remove all contents
+    img_buffer.seek(0)  # Reset the pointer to the start
+    plt.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
+    plt.close('all')
+    return img_buffer
+def process_recorded_wav_bytes(wav_bytes, sr):
+    file = BytesIO(wav_bytes)
+    audio = AudioSegment.from_file(file=file, format='wav')
+    audio = audio.set_sample_width(2)
+    audio = audio.set_channels(1)
+    audio_22k = audio.set_frame_rate(sr)
+    audio_16k = audio.set_frame_rate(SR)
+    audio_22k = np.array(audio_22k.get_array_of_samples(), dtype=np.float32)
+    audio_16k = np.array(audio_16k.get_array_of_samples(), dtype=np.float32)
+    return audio_22k, audio_16k
+def main():
+    st.set_page_config(
+        page_title="speech-denoising-app",
+        layout="wide"
+    )
+    logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")
+    with logo_space:
+        st.image('logo.png', width=48)
+    with title_space:
+        st.image('title.png', width=640)
+    with tooltip_space:
+        st.markdown(
+            """
+            <style>
+            .tooltip {
+                position: relative;
+                display: inline-block;
+                cursor: pointer;
+                background-color: rgba(0, 76, 153, 1); /* Blue button color */
+                padding: 10px;
+                border-radius: 50%;
+                font-size: 16px;
+                font-weight: bold;
+                width: 40px;
+                height: 40px;
+                text-align: center;
+                line-height: 20px;
+                color: white; /* Text color */
+                box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
+            }
+            .tooltip .tooltiptext {
+                visibility: hidden;
+                width: 300px; /* Adjust width for readability */
+                background-color: #333; /* Dark background for contrast */
+                color: #fff;
+                text-align: left; /* Align text to the left */
+                border-radius: 8px;
+                padding: 15px; /* Add padding for spacing */
+                position: absolute;
+                z-index: 1;
+                top: 150%; /* Position below the button */
+                left: 50%;
+                transform: translateX(-50%);
+                opacity: 0;
+                transition: opacity 0.3s;
+                font-size: 14px;
+                line-height: 1.8; /* Adjust line height for readability */
+                white-space: normal; /* Allow wrapping of text */
+            }
+            .tooltip:hover .tooltiptext {
+                visibility: visible;
+                opacity: 1;
+            }
+            </style>
+            """,
+            unsafe_allow_html=True,
+        )
+        st.markdown(
+            """
+            <div class="tooltip">
+                ℹ
+                <span class="tooltiptext">
+                <strong>Steps:</strong><br>
+                1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
+                2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
+                3) Both the original audio and denoised audio will be available for playback.<br><br>
+                <strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
+                </span>
+            </div>
+            """,
+            unsafe_allow_html=True,
+        )
+    tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])
+    with tab1:
+        noisy_speech_files = load_noisy_speech()
+        input_space_tab1, output_space_tab1 = st.columns([1, 1], gap="medium")
+        _, _, _, compute_space_tab1= st.columns([0.7, 1, 1, 1], gap="small")
+        with compute_space_tab1:
+            compute_tab1 = st.button('Denoise', key='denoise_tab1')
+        with input_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
+            lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
+            with lang_select_space:
+                language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
+            with snr_select_space:
+                if language_select:
+                    snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))
+            audio_path_tab1 = noisy_speech_files[language_select][snr_select]
+            noisy_wav_22k_tab1, noisy_wav_tab1 = load_wav(audio_path_tab1)
+            noisy_spec_tab1 = wav_to_spec(noisy_wav_22k_tab1, sr=22050)
+            noisy_spec_buff_tab1 = export_spec_to_buffer(noisy_spec_tab1)
+            st.audio(audio_path_tab1, format="wav")
+            st.image(image=noisy_spec_buff_tab1)
+        with output_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
+            st.write(
+                """
+                <div style="display: flex; justify-content: center;">
+                    <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
+                </div>
+                """,
+                unsafe_allow_html=True
+            )
+            if noisy_wav_tab1.any() and compute_tab1:
+                with st.spinner("Denoising..."):
+                    denoised_wav_tab1 = denoise(noisy_wav_tab1)
+                st.audio(denoised_wav_tab1, sample_rate=SR, format="audio/wav")
+                denoised_spec_tab1 = wav_to_spec(denoised_wav_tab1, sr=SR)
+                denoised_spec_buff_tab1 = export_spec_to_buffer(denoised_spec_tab1)
+                st.image(image=denoised_spec_buff_tab1)
+    with tab2:
+        input_space_tab2, output_space_tab2 = st.columns([1, 1], gap="medium")
+        _, record_space, _, compute_space_tab2 = st.columns([0.7, 1, 1, 1], gap="small")
+        with record_space:
+            record = recorder(
+                start_prompt="Start Recording",
+                stop_prompt="Stop Recording",
+                just_once=False,
+                use_container_width=False,
+                format="wav",
+                callback=None,
+                args=(),
+                kwargs={},
+                key="tab2_recorder"
+            )
+        with compute_space_tab2:
+            compute_tab2 = st.button('Denoise', key='denoise_tab2')
+        noisy_wav_tab2 = np.array([])
+        with input_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
+            st.write(
+                """
+                <div style="display: flex; justify-content: center;">
+                    <b><span style="text-align: center; color: #808080; font-size: 51.5px">Input</span></b>
+                </div>
+                """,
+                unsafe_allow_html=True
+            )
+            if record:
+                wav_bytes_record = record['bytes']
+                sr = record['sample_rate']
+                noisy_wav_22k_tab2, noisy_wav_tab2 = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
+                noisy_spec_tab2 = wav_to_spec(noisy_wav_22k_tab2, sr=22050)
+                noisy_spec_buff_tab2 = export_spec_to_buffer(noisy_spec_tab2)
+                st.audio(wav_bytes_record, format="wav")
+                st.image(image=noisy_spec_buff_tab2)
+        with output_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
+            st.write(
+                """
+                <div style="display: flex; justify-content: center;">
+                    <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
+                </div>
+                """,
+                unsafe_allow_html=True
+            )
+            if noisy_wav_tab2.any() and compute_tab2:
+                with st.spinner("Denoising..."):
+                    denoised_wav_tab2 = denoise(noisy_wav_tab2)
+                st.audio(denoised_wav_tab2, sample_rate=SR, format="audio/wav")
+                denoised_spec_tab2 = wav_to_spec(denoised_wav_tab2, sr=SR)
+                denoised_spec_buff_tab2 = export_spec_to_buffer(denoised_spec_tab2)
+                st.image(image=denoised_spec_buff_tab2)
+                record = None
+if __name__ == '__main__':
     main()