fsoft-ai-center committed on
Commit
ea6e252
·
verified ·
1 Parent(s): 3bbc43f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +280 -280
app.py CHANGED
@@ -1,281 +1,281 @@
1
- import io
2
- import os
3
- import base64
4
- import librosa
5
- import numpy as np
6
- from io import BytesIO
7
- import streamlit as st
8
- from pydub import AudioSegment
9
- import matplotlib.pyplot as plt
10
- from scipy.io.wavfile import write
11
- from src.denoise import denoise
12
- from myrecorder import recorder
13
-
14
-
15
- SR = 16000
16
- CONTAINER_HEIGHT = 340
17
-
18
-
19
- def np_audio_to_bytesio(np_audio, np_audio_sr):
20
- _bytes = bytes()
21
- byte_io = io.BytesIO(_bytes)
22
- write(byte_io, np_audio_sr, np_audio)
23
- bytes_audio = byte_io.read()
24
- return bytes_audio
25
-
26
-
27
- def autoplay_audio(audio: str):
28
- audio_base64 = base64.b64encode(audio).decode('utf-8')
29
- audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}">'
30
- st.markdown(audio_tag, unsafe_allow_html=True)
31
-
32
-
33
- def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
34
- noisy_speech_paths = {'EN':{}, 'JA': {}}
35
- noisy_speech_names = os.listdir(root)
36
- for name in noisy_speech_names:
37
- splt = name.split('_')
38
- lang, snr = splt[0].upper(), int(splt[1][:2])
39
- noisy_speech_paths[lang][snr] = os.path.join(root, name)
40
-
41
- en_keys = list(noisy_speech_paths['EN'].keys())
42
- en_keys.sort()
43
- en_keys.reverse()
44
- noisy_speech_paths['EN'] = {f'{key}dB': noisy_speech_paths['EN'][key] for key in en_keys}
45
-
46
- ja_keys = list(noisy_speech_paths['JA'].keys())
47
- ja_keys.sort()
48
- ja_keys.reverse()
49
- noisy_speech_paths['JA'] = {f'{key}dB': noisy_speech_paths['JA'][key] for key in ja_keys}
50
-
51
- return noisy_speech_paths
52
-
53
-
54
- def load_wav(wav_path):
55
- wav_22k, sr = librosa.load(wav_path)
56
- wav_16k = librosa.resample(wav_22k, orig_sr=sr, target_sr=SR)
57
- return wav_22k, wav_16k
58
-
59
-
60
- def wav_to_spec(wav, sr):
61
- if sr == 16000:
62
- wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
63
- spec = np.abs(librosa.stft(wav))
64
- spec = librosa.amplitude_to_db(spec, ref=np.max)
65
- return spec
66
-
67
-
68
- def export_spec_to_buffer(spec):
69
- plt.clf()
70
- plt.rcParams['figure.figsize'] = (16, 3.6)
71
- plt.rc('axes', labelsize=15)
72
- plt.rc('xtick', labelsize=15)
73
- plt.rc('ytick', labelsize=15)
74
- librosa.display.specshow(spec, y_axis='log', x_axis='time')
75
- img_buffer = BytesIO()
76
- img_buffer.truncate(0) # Remove all contents
77
- img_buffer.seek(0) # Reset the pointer to the start
78
- plt.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
79
- plt.close('all')
80
- return img_buffer
81
-
82
-
83
- def process_recorded_wav_bytes(wav_bytes, sr):
84
- file = BytesIO(wav_bytes)
85
- audio = AudioSegment.from_file(file=file, format='wav')
86
- audio = audio.set_sample_width(2)
87
- audio = audio.set_channels(1)
88
- audio_22k = audio.set_frame_rate(sr)
89
- audio_16k = audio.set_frame_rate(SR)
90
- audio_22k = np.array(audio_22k.get_array_of_samples(), dtype=np.float32)
91
- audio_16k = np.array(audio_16k.get_array_of_samples(), dtype=np.float32)
92
- return audio_22k, audio_16k
93
-
94
-
95
- def main():
96
-
97
- st.set_page_config(
98
- page_title="speech-denoising-app",
99
- layout="wide"
100
- )
101
-
102
- logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")
103
-
104
- with logo_space:
105
- st.image('logo.png', width=48)
106
-
107
- with title_space:
108
- st.image('title.png', width=640)
109
-
110
- with tooltip_space:
111
- st.markdown(
112
- """
113
- <style>
114
- .tooltip {
115
- position: relative;
116
- display: inline-block;
117
- cursor: pointer;
118
- background-color: rgba(0, 76, 153, 1); /* Blue button color */
119
- padding: 10px;
120
- border-radius: 50%;
121
- font-size: 16px;
122
- font-weight: bold;
123
- width: 40px;
124
- height: 40px;
125
- text-align: center;
126
- line-height: 20px;
127
- color: white; /* Text color */
128
- box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
129
- }
130
-
131
- .tooltip .tooltiptext {
132
- visibility: hidden;
133
- width: 300px; /* Adjust width for readability */
134
- background-color: #333; /* Dark background for contrast */
135
- color: #fff;
136
- text-align: left; /* Align text to the left */
137
- border-radius: 8px;
138
- padding: 15px; /* Add padding for spacing */
139
- position: absolute;
140
- z-index: 1;
141
- top: 150%; /* Position below the button */
142
- left: 50%;
143
- transform: translateX(-50%);
144
- opacity: 0;
145
- transition: opacity 0.3s;
146
- font-size: 14px;
147
- line-height: 1.8; /* Adjust line height for readability */
148
- white-space: normal; /* Allow wrapping of text */
149
- }
150
-
151
- .tooltip:hover .tooltiptext {
152
- visibility: visible;
153
- opacity: 1;
154
- }
155
- </style>
156
- """,
157
- unsafe_allow_html=True,
158
- )
159
-
160
- st.markdown(
161
- """
162
- <div class="tooltip">
163
-
164
- <span class="tooltiptext">
165
- <strong>Steps:</strong><br>
166
- 1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
167
- 2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
168
- 3) Both the original audio and denoised audio will be available for playback.<br><br>
169
- <strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
170
- </span>
171
- </div>
172
- """,
173
- unsafe_allow_html=True,
174
- )
175
-
176
- tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])
177
-
178
- with tab1:
179
- noisy_speech_files = load_noisy_speech()
180
-
181
- input_space_tab1, output_space_tab1 = st.columns([1, 1], gap="medium")
182
- _, _, _, compute_space_tab1= st.columns([0.7, 1, 1, 1], gap="small")
183
-
184
- with compute_space_tab1:
185
- compute_tab1 = st.button('Denoise', key='denoise_tab1')
186
-
187
- with input_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
188
- lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
189
- with lang_select_space:
190
- language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
191
- with snr_select_space:
192
- if language_select:
193
- snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))
194
-
195
- audio_path_tab1 = noisy_speech_files[language_select][snr_select]
196
- noisy_wav_22k_tab1, noisy_wav_tab1 = load_wav(audio_path_tab1)
197
- noisy_spec_tab1 = wav_to_spec(noisy_wav_22k_tab1, sr=22050)
198
- noisy_spec_buff_tab1 = export_spec_to_buffer(noisy_spec_tab1)
199
-
200
- st.audio(audio_path_tab1, format="wav")
201
- st.image(image=noisy_spec_buff_tab1)
202
-
203
- with output_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
204
- st.write(
205
- """
206
- <div style="display: flex; justify-content: center;">
207
- <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
208
- </div>
209
- """,
210
- unsafe_allow_html=True
211
- )
212
- if noisy_wav_tab1.any() and compute_tab1:
213
- with st.spinner("Denoising..."):
214
- denoised_wav_tab1 = denoise(noisy_wav_tab1)
215
- st.audio(denoised_wav_tab1, sample_rate=SR, format="audio/wav")
216
- denoised_spec_tab1 = wav_to_spec(denoised_wav_tab1, sr=SR)
217
- denoised_spec_buff_tab1 = export_spec_to_buffer(denoised_spec_tab1)
218
- st.image(image=denoised_spec_buff_tab1)
219
-
220
- with tab2:
221
- input_space_tab2, output_space_tab2 = st.columns([1, 1], gap="medium")
222
- _, record_space, _, compute_space_tab2 = st.columns([0.7, 1, 1, 1], gap="small")
223
-
224
- with record_space:
225
- record = recorder(
226
- start_prompt="Start Recording",
227
- stop_prompt="Stop Recording",
228
- just_once=False,
229
- use_container_width=False,
230
- format="wav",
231
- callback=None,
232
- args=(),
233
- kwargs={},
234
- key="tab2_recorder"
235
- )
236
-
237
- with compute_space_tab2:
238
- compute_tab2 = st.button('Denoise', key='denoise_tab2')
239
-
240
- noisy_wav_tab2 = np.array([])
241
- with input_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
242
- st.write(
243
- """
244
- <div style="display: flex; justify-content: center;">
245
- <b><span style="text-align: center; color: #808080; font-size: 51.5px">Input</span></b>
246
- </div>
247
- """,
248
- unsafe_allow_html=True
249
- )
250
-
251
- if record:
252
- wav_bytes_record = record['bytes']
253
- sr = record['sample_rate']
254
- noisy_wav_22k_tab2, noisy_wav_tab2 = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
255
- noisy_spec_tab2 = wav_to_spec(noisy_wav_22k_tab2, sr=22050)
256
- noisy_spec_buff_tab2 = export_spec_to_buffer(noisy_spec_tab2)
257
-
258
- st.audio(wav_bytes_record, format="wav")
259
- st.image(image=noisy_spec_buff_tab2)
260
-
261
- with output_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
262
- st.write(
263
- """
264
- <div style="display: flex; justify-content: center;">
265
- <b><span style="text-align: center; color: #808080; font-size: 51.5px">Output</span></b>
266
- </div>
267
- """,
268
- unsafe_allow_html=True
269
- )
270
- if noisy_wav_tab2.any() and compute_tab2:
271
- with st.spinner("Denoising..."):
272
- denoised_wav_tab2 = denoise(noisy_wav_tab2)
273
- st.audio(denoised_wav_tab2, sample_rate=SR, format="audio/wav")
274
- denoised_spec_tab2 = wav_to_spec(denoised_wav_tab2, sr=SR)
275
- denoised_spec_buff_tab2 = export_spec_to_buffer(denoised_spec_tab2)
276
- st.image(image=denoised_spec_buff_tab2)
277
- record = None
278
-
279
-
280
- if __name__ == '__main__':
281
  main()
 
1
+ import io
2
+ import os
3
+ import base64
4
+ import librosa
5
+ import numpy as np
6
+ from io import BytesIO
7
+ import streamlit as st
8
+ from pydub import AudioSegment
9
+ import matplotlib.pyplot as plt
10
+ from scipy.io.wavfile import write
11
+ from src.denoise import denoise
12
+ from myrecorder import recorder
13
+
14
+
15
# Sample rate in Hz that audio is resampled to before it is passed to
# `denoise`, and that the denoised result is played back at.
SR = 16_000
# `height` value given to every bordered input/output Streamlit container.
CONTAINER_HEIGHT = 345
17
+
18
+
19
def np_audio_to_bytesio(np_audio, np_audio_sr):
    """Encode a NumPy audio array as WAV and return the encoded data.

    Despite the function's name, the value returned is a ``bytes`` object,
    not a ``BytesIO`` stream.

    Args:
        np_audio: Sample array handed to ``scipy.io.wavfile.write``.
        np_audio_sr: Sample rate written into the WAV header.

    Returns:
        The bytes read back from the in-memory WAV stream.
    """
    wav_stream = io.BytesIO()
    write(wav_stream, np_audio_sr, np_audio)
    # NOTE(review): this relies on scipy leaving the stream positioned at the
    # start after writing to a file-like object -- confirm for the pinned
    # scipy version.
    return wav_stream.read()
25
+
26
+
27
def autoplay_audio(audio: bytes):
    """Embed *audio* in the page as an auto-playing HTML ``<audio>`` element.

    Args:
        audio: Encoded audio as a bytes-like object. ``base64.b64encode``
            rejects ``str``, so the annotation is ``bytes``. The data URI
            labels the payload as ``audio/wav``.
    """
    audio_base64 = base64.b64encode(audio).decode('utf-8')
    audio_tag = f'<audio autoplay="true" src="data:audio/wav;base64,{audio_base64}">'
    # Raw HTML is needed for the autoplay attribute, hence unsafe_allow_html.
    st.markdown(audio_tag, unsafe_allow_html=True)
31
+
32
+
33
def load_noisy_speech(root=os.path.join(os.getcwd(), 'noisy_speech')):
    """Index the noisy sample files in *root* by language and SNR.

    File names are expected to look like ``<lang>_<snr>...`` (for example
    ``en_15dB.wav``): the text before the first underscore is the language,
    and the integer that starts the next field is the SNR.

    Args:
        root: Directory to scan. The default is computed once, when the
            function is defined, from the working directory at that moment.

    Returns:
        ``{'EN': {...}, 'JA': {...}}`` where each inner dict maps a label such
        as ``'15dB'`` to the sample's path, ordered from highest to lowest
        SNR. Entries that do not follow the naming scheme, or whose language
        is neither EN nor JA, are ignored instead of raising.

    Raises:
        OSError: If *root* cannot be listed.
    """
    import re  # local import: `re` is not among the file's top-level imports

    by_language = {'EN': {}, 'JA': {}}
    for name in os.listdir(root):
        fields = name.split('_')
        if len(fields) < 2:
            # e.g. hidden or unrelated files without a "<lang>_" prefix
            continue
        language = fields[0].upper()
        # Take the whole leading integer, so "5dB" and "100dB" parse
        # correctly rather than only looking at the first two characters.
        snr_match = re.match(r'[-+]?\d+', fields[1])
        if language not in by_language or snr_match is None:
            continue
        by_language[language][int(snr_match.group())] = os.path.join(root, name)

    # Dicts keep insertion order, so building them from the sorted SNRs gives
    # the highest-to-lowest ordering that the UI select box displays.
    return {
        language: {f'{snr}dB': paths[snr] for snr in sorted(paths, reverse=True)}
        for language, paths in by_language.items()
    }
52
+
53
+
54
def load_wav(wav_path):
    """Load an audio file and return it at two sample rates.

    Args:
        wav_path: Path of the audio file to read with ``librosa.load``.

    Returns:
        A ``(wav, wav_at_SR)`` tuple: the signal exactly as ``librosa.load``
        returns it with its default settings (22050 Hz according to the
        librosa documentation), and the same signal resampled to ``SR``.
    """
    loaded_wav, loaded_sr = librosa.load(wav_path)
    return loaded_wav, librosa.resample(loaded_wav, orig_sr=loaded_sr, target_sr=SR)
58
+
59
+
60
def wav_to_spec(wav, sr):
    """Compute a dB-scaled magnitude spectrogram of *wav* at 22050 Hz.

    Args:
        wav: Audio samples.
        sr: Sample rate of *wav* in Hz.

    Returns:
        The STFT magnitude converted with ``librosa.amplitude_to_db`` using
        the spectrogram's own maximum as the reference.
    """
    # The spectrogram is later drawn by `specshow` without an explicit rate,
    # so every input is brought to one common rate first. Previously only
    # 16000 Hz input was resampled and any other rate was silently treated
    # as if it were already 22050 Hz.
    plot_sr = 22050
    if sr != plot_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=plot_sr)
    magnitude = np.abs(librosa.stft(wav))
    return librosa.amplitude_to_db(magnitude, ref=np.max)
66
+
67
+
68
def export_spec_to_buffer(spec):
    """Render a spectrogram to an in-memory JPEG image.

    Args:
        spec: Spectrogram array to draw with ``librosa.display.specshow``
            (log-frequency y axis, time x axis).

    Returns:
        A ``BytesIO`` holding the JPEG data, rewound to the start so that
        consumers calling ``read()`` receive the whole image.
    """
    # NOTE(review): older librosa releases reportedly do not load the
    # `display` submodule on a plain `import librosa`; importing it
    # explicitly is harmless on newer ones -- confirm against the pinned
    # version.
    import librosa.display

    rc_overrides = {
        'figure.figsize': (16, 3.6),
        'axes.labelsize': 15,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
    }
    img_buffer = BytesIO()
    # The figure is created inside the rc context so the size and font
    # settings apply to every render. Previously `plt.clf()` created the
    # figure before `figure.figsize` was set, so the first image had the
    # default size. The context also restores the global rcParams afterwards.
    with plt.rc_context(rc_overrides):
        fig, ax = plt.subplots()
        try:
            librosa.display.specshow(spec, y_axis='log', x_axis='time', ax=ax)
            fig.savefig(img_buffer, format='JPEG', bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)
    img_buffer.seek(0)
    return img_buffer
81
+
82
+
83
def process_recorded_wav_bytes(wav_bytes, sr):
    """Decode recorded WAV bytes into mono float32 arrays at two sample rates.

    Args:
        wav_bytes: Raw contents of a WAV file.
        sr: Frame rate for the first returned array.

    Returns:
        A ``(audio_at_sr, audio_at_SR)`` tuple of 1-D ``float32`` arrays, the
        first at *sr* and the second at the module-level ``SR``.

    NOTE(review): the samples are 16-bit integer values cast to float32 and
    are not scaled to [-1, 1], whereas ``load_wav`` goes through librosa --
    confirm that ``denoise`` copes with both ranges.
    """
    mono_16bit = (
        AudioSegment.from_file(file=BytesIO(wav_bytes), format='wav')
        .set_sample_width(2)   # 2 bytes per sample
        .set_channels(1)       # single channel
    )

    def samples_as_float32(segment):
        return np.array(segment.get_array_of_samples(), dtype=np.float32)

    at_requested_rate = samples_as_float32(mono_16bit.set_frame_rate(sr))
    at_model_rate = samples_as_float32(mono_16bit.set_frame_rate(SR))
    return at_requested_rate, at_model_rate
93
+
94
+
95
def _render_panel_heading(label):
    """Draw the large grey, centred heading shown at the top of a panel."""
    st.write(
        f"""
        <div style="display: flex; justify-content: center;">
        <b><span style="text-align: center; color: #808080; font-size: 51.5px">{label}</span></b>
        </div>
        """,
        unsafe_allow_html=True
    )


def _render_denoised_output(noisy_wav):
    """Denoise *noisy_wav*, then show its audio player and spectrogram."""
    # NOTE(review): indentation was lost in the reviewed source; only the
    # `denoise` call is assumed to sit inside the spinner -- confirm.
    with st.spinner("Denoising..."):
        denoised_wav = denoise(noisy_wav)
    st.audio(denoised_wav, sample_rate=SR, format="audio/wav")
    denoised_spec = wav_to_spec(denoised_wav, sr=SR)
    st.image(image=export_spec_to_buffer(denoised_spec))


def main():
    """Build the Streamlit page of the speech-denoising demo.

    The page has a header row (logo, title, hover tooltip with usage steps)
    and two tabs: one that denoises a bundled sample chosen by language and
    SNR, and one that denoises a recording made in the browser. Each tab
    shows an input panel and an output panel with an audio player and a
    spectrogram image, plus a "Denoise" button that triggers processing.

    NOTE(review): leading whitespace inside the HTML/CSS string literals
    could not be recovered from the reviewed source; their lines are
    reproduced verbatim with uniform indentation -- confirm rendering.
    """
    st.set_page_config(
        page_title="speech-denoising-app",
        layout="wide"
    )

    logo_space, title_space, _, tooltip_space = st.columns([2.03, 5, 1, 0.75], gap="small")

    with logo_space:
        st.image('logo.png', width=48)

    with title_space:
        st.image('title.png', width=640)

    with tooltip_space:
        # Styles for the circular help button and its hover text.
        st.markdown(
            """
            <style>
            .tooltip {
            position: relative;
            display: inline-block;
            cursor: pointer;
            background-color: rgba(0, 76, 153, 1); /* Blue button color */
            padding: 10px;
            border-radius: 50%;
            font-size: 16px;
            font-weight: bold;
            width: 40px;
            height: 40px;
            text-align: center;
            line-height: 20px;
            color: white; /* Text color */
            box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
            }

            .tooltip .tooltiptext {
            visibility: hidden;
            width: 300px; /* Adjust width for readability */
            background-color: #333; /* Dark background for contrast */
            color: #fff;
            text-align: left; /* Align text to the left */
            border-radius: 8px;
            padding: 15px; /* Add padding for spacing */
            position: absolute;
            z-index: 1;
            top: 150%; /* Position below the button */
            left: 50%;
            transform: translateX(-50%);
            opacity: 0;
            transition: opacity 0.3s;
            font-size: 14px;
            line-height: 1.8; /* Adjust line height for readability */
            white-space: normal; /* Allow wrapping of text */
            }

            .tooltip:hover .tooltiptext {
            visibility: visible;
            opacity: 1;
            }
            </style>
            """,
            unsafe_allow_html=True,
        )

        st.markdown(
            """
            <div class="tooltip">

            <span class="tooltiptext">
            <strong>Steps:</strong><br>
            1) Denoise your own speech: Click <em>Start recording</em>, then <em>Stop recording</em> when you are finished.<br>
            2) Click <em>"Denoise"</em> and wait for a few seconds.<br>
            3) Both the original audio and denoised audio will be available for playback.<br><br>
            <strong>Note:</strong> Playing "noise" on your device while recording your speech to emulate speaking in a noisy environment will not work as intended. To do this emulation more realistically, play the noise on a different device (such as your phone) while recording your speech.
            </span>
            </div>
            """,
            unsafe_allow_html=True,
        )

    tab1, tab2 = st.tabs(["📂Denoise our samples speech", "🎙️Denoise your own speech"])

    with tab1:
        noisy_speech_files = load_noisy_speech()

        input_space_tab1, output_space_tab1 = st.columns([1, 1], gap="medium")
        _, _, _, compute_space_tab1 = st.columns([0.7, 1, 1, 1], gap="small")

        with compute_space_tab1:
            compute_tab1 = st.button('Denoise', key='denoise_tab1')

        # Stays empty when no sample can be selected, so the output panel
        # simply shows nothing instead of failing on an unbound or None
        # selection.
        noisy_wav_tab1 = np.array([])
        with input_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
            lang_select_space, snr_select_space = st.columns([1, 1], gap="small")
            with lang_select_space:
                language_select = st.selectbox("Language", list(noisy_speech_files.keys()))
            snr_select = None
            with snr_select_space:
                if language_select:
                    snr_select = st.selectbox("SNR Level", list(noisy_speech_files[language_select].keys()))

            if snr_select is None:
                st.warning("No sample is available for this selection.")
            else:
                audio_path_tab1 = noisy_speech_files[language_select][snr_select]
                noisy_wav_22k_tab1, noisy_wav_tab1 = load_wav(audio_path_tab1)
                noisy_spec_tab1 = wav_to_spec(noisy_wav_22k_tab1, sr=22050)
                noisy_spec_buff_tab1 = export_spec_to_buffer(noisy_spec_tab1)

                # `format` is a MIME type, matching the denoised player.
                st.audio(audio_path_tab1, format="audio/wav")
                st.image(image=noisy_spec_buff_tab1)

        with output_space_tab1.container(height=CONTAINER_HEIGHT, border=True):
            _render_panel_heading("Output")
            if noisy_wav_tab1.any() and compute_tab1:
                _render_denoised_output(noisy_wav_tab1)

    with tab2:
        input_space_tab2, output_space_tab2 = st.columns([1, 1], gap="medium")
        _, record_space, _, compute_space_tab2 = st.columns([0.7, 1, 1, 1], gap="small")

        with record_space:
            record = recorder(
                start_prompt="Start Recording",
                stop_prompt="Stop Recording",
                just_once=False,
                use_container_width=False,
                format="wav",
                callback=None,
                args=(),
                kwargs={},
                key="tab2_recorder"
            )

        with compute_space_tab2:
            compute_tab2 = st.button('Denoise', key='denoise_tab2')

        # Empty until a recording exists; `.any()` below is then False.
        noisy_wav_tab2 = np.array([])
        with input_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
            _render_panel_heading("Input")

            if record:
                wav_bytes_record = record['bytes']
                noisy_wav_22k_tab2, noisy_wav_tab2 = process_recorded_wav_bytes(wav_bytes_record, sr=22050)
                noisy_spec_tab2 = wav_to_spec(noisy_wav_22k_tab2, sr=22050)
                noisy_spec_buff_tab2 = export_spec_to_buffer(noisy_spec_tab2)

                st.audio(wav_bytes_record, format="audio/wav")
                st.image(image=noisy_spec_buff_tab2)

        with output_space_tab2.container(height=CONTAINER_HEIGHT, border=True):
            _render_panel_heading("Output")
            if noisy_wav_tab2.any() and compute_tab2:
                _render_denoised_output(noisy_wav_tab2)
278
+
279
+
280
# Script entry point: build the page only when this file is run directly.
if __name__ == "__main__":
    main()