Spaces:
Build error
Build error
Mod: Rework UI, remove tmp files and clear cache after 10min.
Browse files
app.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
#!/usr/bin/env python
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
|
|
|
|
4 |
from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
|
5 |
from typing import Any, Optional, Union
|
6 |
|
@@ -16,14 +18,18 @@ from conette.utils.collections import dict_list_to_list_dict
|
|
16 |
|
17 |
|
18 |
ALLOW_REP_MODES = ("stopwords", "all", "none")
|
|
|
19 |
MAX_BEAM_SIZE = 20
|
20 |
MAX_PRED_SIZE = 30
|
21 |
-
MAX_BATCH_SIZE =
|
22 |
RECORD_AUDIO_FNAME = "microphone_conette_record.wav"
|
23 |
DEFAULT_THRESHOLD = 0.3
|
24 |
THRESHOLD_PRECISION = 100
|
25 |
MIN_AUDIO_DURATION_SEC = 0.3
|
26 |
MAX_AUDIO_DURATION_SEC = 60
|
|
|
|
|
|
|
27 |
|
28 |
|
29 |
@st.cache_resource
|
@@ -46,7 +52,7 @@ def format_tags(tags: Optional[list[str]]) -> str:
|
|
46 |
|
47 |
|
48 |
def get_result_hash(audio_fname: str, generate_kwds: dict[str, Any]) -> str:
    """Return the lookup key identifying one (audio file, options) request.

    The key is the audio file name, a dash, then the ``str()`` rendering of
    the generation options. Because the rendering follows the dictionary's
    insertion order, two dictionaries holding the same options in a
    different order yield different keys.

    Args:
        audio_fname: Name of the audio file.
        generate_kwds: Generation options for that file.

    Returns:
        The key as a plain string.
    """
    return "{}-{}".format(audio_fname, generate_kwds)
|
50 |
|
51 |
|
52 |
def get_results(
|
@@ -64,7 +70,7 @@ def get_results(
|
|
64 |
# Save audio to be processed
|
65 |
tmp_files: dict[str, _TemporaryFileWrapper] = {}
|
66 |
for result_hash, (audio_fname, audio) in audio_to_predict.items():
|
67 |
-
tmp_file = NamedTemporaryFile(delete=False)
|
68 |
tmp_file.write(audio)
|
69 |
tmp_file.close()
|
70 |
|
@@ -109,6 +115,9 @@ def get_results(
|
|
109 |
output_i = st.session_state[result_hash]
|
110 |
outputs[audio_fname] = output_i
|
111 |
|
|
|
|
|
|
|
112 |
return outputs
|
113 |
|
114 |
|
@@ -145,20 +154,39 @@ def show_results(outputs: dict[str, Union[dict[str, Any], str]]) -> None:
|
|
145 |
else:
|
146 |
header = f'##### Result for "{audio_fname}"'
|
147 |
|
148 |
-
|
149 |
header,
|
150 |
-
f'
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
]
|
153 |
if len(mult_cands) > 0:
|
154 |
msg = f"- **Other descriptions:**"
|
155 |
-
|
156 |
|
157 |
for cand_i, prob_i in zip(mult_cands, mult_probs):
|
158 |
msg = f' - "{cand_i}" ({prob_i*100:.1f}%)'
|
159 |
-
|
|
|
|
|
|
|
160 |
|
161 |
-
|
|
|
162 |
st.divider()
|
163 |
|
164 |
|
@@ -167,19 +195,28 @@ def main() -> None:
|
|
167 |
|
168 |
st.header("Describe audio content with CoNeTTE")
|
169 |
st.markdown(
|
170 |
-
"This interface allows you to generate a short description of the sound events of any recording. You can try it from your microphone or upload a file below."
|
171 |
)
|
172 |
-
|
173 |
-
|
174 |
-
audio_files: Optional[list[UploadedFile]] = st.file_uploader(
|
175 |
-
"**Or upload audio files here:**",
|
176 |
-
type=["wav", "flac", "mp3", "ogg", "avi"],
|
177 |
-
accept_multiple_files=True,
|
178 |
-
help="Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum.",
|
179 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
-
with st.expander("Model
|
182 |
-
|
|
|
|
|
|
|
|
|
|
|
183 |
allow_rep_mode = st.selectbox("Allow repetition of words", ALLOW_REP_MODES, 0)
|
184 |
beam_size: int = st.select_slider( # type: ignore
|
185 |
"Beam size",
|
@@ -231,6 +268,15 @@ def main() -> None:
|
|
231 |
st.header("Results:")
|
232 |
show_results(outputs)
|
233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
|
235 |
if __name__ == "__main__":
|
236 |
main()
|
|
|
1 |
#!/usr/bin/env python
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
+
import os
|
5 |
+
import time
|
6 |
from tempfile import NamedTemporaryFile, _TemporaryFileWrapper
|
7 |
from typing import Any, Optional, Union
|
8 |
|
|
|
18 |
|
19 |
|
20 |
# --- Model option defaults and limits ---------------------------------------
# Choices offered by the "Allow repetition of words" select box.
ALLOW_REP_MODES = ("stopwords", "all", "none")
# Task preselected in the "Task embedding input" select box when the model
# exposes it; otherwise the first task is used.
DEFAULT_TASK = "audiocaps"
# NOTE(review): presumably upper bounds for the beam size, prediction length
# and batch size controls -- their uses are outside this view, confirm.
MAX_BEAM_SIZE = 20
MAX_PRED_SIZE = 30
MAX_BATCH_SIZE = 16
# NOTE(review): threshold default and its precision; usage not visible here.
DEFAULT_THRESHOLD = 0.3
THRESHOLD_PRECISION = 100

# --- Audio input -------------------------------------------------------------
# NOTE(review): presumably the name given to the microphone recording -- confirm.
RECORD_AUDIO_FNAME = "microphone_conette_record.wav"
# Duration range, in seconds, announced to the user in the file uploader label.
MIN_AUDIO_DURATION_SEC = 0.3
MAX_AUDIO_DURATION_SEC = 60

# --- Caching and temporary files ---------------------------------------------
# Prefix of every result key built by get_result_hash(); the cache cleanup
# relies on it to recognise result entries in st.session_state.
HASH_PREFIX = "hash_"
# File name prefix of the temporary audio files written before prediction.
TMP_FILE_PREFIX = "audio_tmp_file_"
# Delay in seconds (10 minutes) after which cached results are removed.
SECOND_BEFORE_CLEAR_CACHE = 10 * 60
|
33 |
|
34 |
|
35 |
@st.cache_resource
|
|
|
52 |
|
53 |
|
54 |
def get_result_hash(audio_fname: str, generate_kwds: dict[str, Any]) -> str:
    """Return the lookup key identifying one (audio file, options) request.

    The key is ``HASH_PREFIX``, the audio file name, a dash, then the
    ``str()`` rendering of the generation options. The shared prefix lets
    result entries be told apart from other keys. Because the rendering
    follows the dictionary's insertion order, the same options given in a
    different order yield a different key.

    Args:
        audio_fname: Name of the audio file.
        generate_kwds: Generation options for that file.

    Returns:
        The prefixed key as a plain string.
    """
    return "{}{}-{}".format(HASH_PREFIX, audio_fname, generate_kwds)
|
56 |
|
57 |
|
58 |
def get_results(
|
|
|
70 |
# Save audio to be processed
|
71 |
tmp_files: dict[str, _TemporaryFileWrapper] = {}
|
72 |
for result_hash, (audio_fname, audio) in audio_to_predict.items():
|
73 |
+
tmp_file = NamedTemporaryFile(delete=False, prefix=TMP_FILE_PREFIX)
|
74 |
tmp_file.write(audio)
|
75 |
tmp_file.close()
|
76 |
|
|
|
115 |
output_i = st.session_state[result_hash]
|
116 |
outputs[audio_fname] = output_i
|
117 |
|
118 |
+
for tmp_file in tmp_files.values():
|
119 |
+
os.remove(tmp_file.name)
|
120 |
+
|
121 |
return outputs
|
122 |
|
123 |
|
|
|
154 |
else:
|
155 |
header = f'##### Result for "{audio_fname}"'
|
156 |
|
157 |
+
lines = [
|
158 |
header,
|
159 |
+
f'<center><p class="space"><p class="big-font">"{cand}"</p></p></center>',
|
160 |
+
]
|
161 |
+
|
162 |
+
st.markdown("""
|
163 |
+
<style>
|
164 |
+
.big-font {
|
165 |
+
font-size:22px !important;
|
166 |
+
background-color: rgba(0, 255, 0, 0.1);
|
167 |
+
padding: 10px;
|
168 |
+
}
|
169 |
+
</style>
|
170 |
+
""", unsafe_allow_html=True)
|
171 |
+
content = "<br>".join(lines)
|
172 |
+
st.markdown(content, unsafe_allow_html=True)
|
173 |
+
|
174 |
+
lines = [
|
175 |
+
f"- **Probability**: {prob*100:.1f}%",
|
176 |
]
|
177 |
if len(mult_cands) > 0:
|
178 |
msg = f"- **Other descriptions:**"
|
179 |
+
lines.append(msg)
|
180 |
|
181 |
for cand_i, prob_i in zip(mult_cands, mult_probs):
|
182 |
msg = f' - "{cand_i}" ({prob_i*100:.1f}%)'
|
183 |
+
lines.append(msg)
|
184 |
+
|
185 |
+
msg = f"- **Tags:** {tags}"
|
186 |
+
lines.append(msg)
|
187 |
|
188 |
+
content = "\n".join(lines)
|
189 |
+
st.markdown(content, unsafe_allow_html=False)
|
190 |
st.divider()
|
191 |
|
192 |
|
|
|
195 |
|
196 |
st.header("Describe audio content with CoNeTTE")
|
197 |
st.markdown(
|
198 |
+
"This interface allows you to generate a short description of the sound events of any recording using an Audio Captioning system. You can try it from your microphone or upload a file below."
|
199 |
)
|
200 |
+
st.markdown(
|
201 |
+
"Use '**Start Recording**' and '**Stop**' to record an audio from your microphone."
|
|
|
|
|
|
|
|
|
|
|
202 |
)
|
203 |
+
record_data = st_audiorec()
|
204 |
+
|
205 |
+
with st.expander("Or upload audio files here:"):
|
206 |
+
audio_files: Optional[list[UploadedFile]] = st.file_uploader(
|
207 |
+
f"Audio files are automatically resampled to 32 kHz.\nTheir duration must be in range [{MIN_AUDIO_DURATION_SEC}, {MAX_AUDIO_DURATION_SEC}] seconds.",
|
208 |
+
type=["wav", "flac", "mp3", "ogg", "avi"],
|
209 |
+
accept_multiple_files=True,
|
210 |
+
help="Recommanded audio: lasting from **1 to 30s**, sampled at **32 kHz** minimum.",
|
211 |
+
)
|
212 |
|
213 |
+
with st.expander("Model options"):
|
214 |
+
if DEFAULT_TASK in model.tasks:
|
215 |
+
default_task_idx = list(model.tasks).index(DEFAULT_TASK)
|
216 |
+
else:
|
217 |
+
default_task_idx = 0
|
218 |
+
|
219 |
+
task = st.selectbox("Task embedding input", model.tasks, default_task_idx)
|
220 |
allow_rep_mode = st.selectbox("Allow repetition of words", ALLOW_REP_MODES, 0)
|
221 |
beam_size: int = st.select_slider( # type: ignore
|
222 |
"Beam size",
|
|
|
268 |
st.header("Results:")
|
269 |
show_results(outputs)
|
270 |
|
271 |
+
current = time.perf_counter()
|
272 |
+
last_generation = st.session_state.get("last_generation", current)
|
273 |
+
if current > last_generation + SECOND_BEFORE_CLEAR_CACHE:
|
274 |
+
print(f"Removing result cache...")
|
275 |
+
for key in st.session_state.keys():
|
276 |
+
if isinstance(key, str) and key.startswith(HASH_PREFIX):
|
277 |
+
del st.session_state[key]
|
278 |
+
st.session_state["last_generation"] = current
|
279 |
+
|
280 |
|
281 |
if __name__ == "__main__":
|
282 |
main()
|