Spaces:
Running
Running
alessandro trinca tornidor
committed on
Commit
·
d009a59
1
Parent(s):
d3be968
feat: initial support for split and reproduce single recorded words
Browse files- aip_trainer/lambdas/lambdaSpeechToScore.py +68 -19
- app.py +53 -17
aip_trainer/lambdas/lambdaSpeechToScore.py
CHANGED
@@ -48,7 +48,7 @@ def lambda_handler(event, context):
|
|
48 |
return output
|
49 |
|
50 |
|
51 |
-
def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
52 |
from soundfile import LibsndfileError
|
53 |
app_logger.info(f"real_text:{real_text} ...")
|
54 |
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
@@ -67,37 +67,37 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
67 |
app_logger.debug(f"random_file_name:{random_file_name} ...")
|
68 |
if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
|
69 |
app_logger.debug("writing streaming data to file on disk...")
|
70 |
-
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=
|
71 |
f1.write(file_bytes_or_audiotmpfile)
|
72 |
duration = time.time() - start0
|
73 |
app_logger.info(f'Saved binary data in file in {duration}s.')
|
74 |
random_file_name = f1.name
|
75 |
|
76 |
start = time.time()
|
77 |
-
app_logger.info(f'Loading
|
78 |
try:
|
79 |
-
signal,
|
80 |
except LibsndfileError as sfe:
|
81 |
# https://github.com/beetbox/audioread/issues/144
|
82 |
# deprecation warnings => pip install standard-aifc standard-sunau
|
83 |
app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
|
84 |
try:
|
85 |
-
signal,
|
86 |
except ModuleNotFoundError as mnfe:
|
87 |
app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
|
88 |
raise mnfe
|
89 |
|
90 |
duration = time.time() - start
|
91 |
-
app_logger.info(f'Read
|
92 |
|
93 |
-
|
94 |
|
95 |
duration = time.time() - start
|
96 |
-
app_logger.info(f'Loaded
|
97 |
|
98 |
language_trainer_sst_lambda = trainer_SST_lambda[language]
|
99 |
app_logger.info('language_trainer_sst_lambda: preparing...')
|
100 |
-
result = language_trainer_sst_lambda.processAudioForGivenText(
|
101 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
102 |
|
103 |
start = time.time()
|
@@ -140,15 +140,17 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
140 |
pronunciation_accuracy = float(result['pronunciation_accuracy'])
|
141 |
ipa_transcript = result['recording_ipa']
|
142 |
|
143 |
-
return {
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
|
|
152 |
|
153 |
|
154 |
def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
@@ -158,7 +160,53 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
|
|
158 |
pronunciation_accuracy = output['pronunciation_accuracy']
|
159 |
ipa_transcript = output['ipa_transcript']
|
160 |
real_transcripts_ipa = output['real_transcripts_ipa']
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
|
164 |
# From Librosa
|
@@ -284,3 +332,4 @@ def buf_to_float(x, n_bytes=2, dtype=np.float32):
|
|
284 |
|
285 |
# Rescale and format the data buffer
|
286 |
return scale * np.frombuffer(x, fmt).astype(dtype)
|
|
|
|
48 |
return output
|
49 |
|
50 |
|
51 |
+
def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True, extension: str = ".ogg"):
|
52 |
from soundfile import LibsndfileError
|
53 |
app_logger.info(f"real_text:{real_text} ...")
|
54 |
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
|
|
67 |
app_logger.debug(f"random_file_name:{random_file_name} ...")
|
68 |
if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
|
69 |
app_logger.debug("writing streaming data to file on disk...")
|
70 |
+
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
|
71 |
f1.write(file_bytes_or_audiotmpfile)
|
72 |
duration = time.time() - start0
|
73 |
app_logger.info(f'Saved binary data in file in {duration}s.')
|
74 |
random_file_name = f1.name
|
75 |
|
76 |
start = time.time()
|
77 |
+
app_logger.info(f'Loading {extension} file file {random_file_name} ...')
|
78 |
try:
|
79 |
+
signal, samplerate = soundfile_load(random_file_name)
|
80 |
except LibsndfileError as sfe:
|
81 |
# https://github.com/beetbox/audioread/issues/144
|
82 |
# deprecation warnings => pip install standard-aifc standard-sunau
|
83 |
app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
|
84 |
try:
|
85 |
+
signal, samplerate = audioread_load(random_file_name)
|
86 |
except ModuleNotFoundError as mnfe:
|
87 |
app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
|
88 |
raise mnfe
|
89 |
|
90 |
duration = time.time() - start
|
91 |
+
app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
|
92 |
|
93 |
+
signal_transformed = transform(torch.Tensor(signal)).unsqueeze(0)
|
94 |
|
95 |
duration = time.time() - start
|
96 |
+
app_logger.info(f'Loaded {extension} file {random_file_name} in {duration}s.')
|
97 |
|
98 |
language_trainer_sst_lambda = trainer_SST_lambda[language]
|
99 |
app_logger.info('language_trainer_sst_lambda: preparing...')
|
100 |
+
result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
|
101 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
102 |
|
103 |
start = time.time()
|
|
|
140 |
pronunciation_accuracy = float(result['pronunciation_accuracy'])
|
141 |
ipa_transcript = result['recording_ipa']
|
142 |
|
143 |
+
return {
|
144 |
+
'real_transcript': result['recording_transcript'],
|
145 |
+
'ipa_transcript': ipa_transcript,
|
146 |
+
'pronunciation_accuracy': float(f"{pronunciation_accuracy:.2f}"),
|
147 |
+
'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
|
148 |
+
'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
|
149 |
+
'pair_accuracy_category': pair_accuracy_category,
|
150 |
+
'start_time': result['start_time'],
|
151 |
+
'end_time': result['end_time'],
|
152 |
+
'is_letter_correct_all_words': is_letter_correct_all_words
|
153 |
+
}
|
154 |
|
155 |
|
156 |
def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
|
|
160 |
pronunciation_accuracy = output['pronunciation_accuracy']
|
161 |
ipa_transcript = output['ipa_transcript']
|
162 |
real_transcripts_ipa = output['real_transcripts_ipa']
|
163 |
+
end_time = [float(x) for x in output['end_time'].split(" ")]
|
164 |
+
start_time = [float(x) for x in output['start_time'].split(" ")]
|
165 |
+
num_words = len(end_time)
|
166 |
+
app_logger.debug(f"start splitting recorded audio into {num_words} words...")
|
167 |
+
|
168 |
+
audio_files = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
|
169 |
+
output = {'audio_files': audio_files, **output}
|
170 |
+
return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, json.dumps(output)
|
171 |
+
|
172 |
+
|
173 |
+
def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
    """Persist an audio signal to disk.

    Args:
        audiofile: destination file path.
        data: audio samples to write.
        samplerate: sample rate of *data* in Hz.
    """
    # Lazy import, matching this module's convention of importing soundfile
    # inside the functions that need it.
    from soundfile import write

    write(audiofile, data, samplerate)
|
176 |
+
|
177 |
+
|
178 |
+
def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> str:
    """Return the audio-file path of the selected recognized word.

    Args:
        idx_recorded_word: index of the word within the recognized words.
        raw_json_output: JSON string containing an "audio_files" list of paths.

    Returns:
        The path stored at position *idx_recorded_word* of "audio_files".
    """
    parsed = json.loads(raw_json_output)
    return parsed["audio_files"][idx_recorded_word]
|
182 |
+
|
183 |
+
|
184 |
+
def get_audio_splitted(audiotmpfile: str | Path, text_raw_json_output_hidden: str) -> list[str]:
    """Split a recorded audio file into per-word clips using stored word timings.

    Args:
        audiotmpfile: path of the recorded audio file to split.
        text_raw_json_output_hidden: raw JSON produced by the scoring step,
            containing "start_time" and "end_time" entries.

    Returns:
        List of paths of the written per-word audio files.
    """
    input_json = json.loads(text_raw_json_output_hidden)
    # "start_time"/"end_time" are space-separated strings in the scoring output
    # (see get_speech_to_score_tuple, which also splits them); parse them into
    # float lists here — passing the raw strings would make the splitter
    # iterate character by character.
    start_time = [float(x) for x in input_json["start_time"].split(" ")]
    end_time = [float(x) for x in input_json["end_time"].split(" ")]
    return get_splitted_audio_file(audiotmpfile, start_time, end_time)
|
189 |
+
|
190 |
+
|
191 |
+
def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> list[str]:
    """Cut a recording into one audio file per word and write each file to disk.

    Args:
        audiotmpfile: path of the recorded audio file to slice.
        start_time: per-word start offsets, in seconds.
        end_time: per-word end offsets, in seconds.
        signal: optional in-memory signal; when given, it replaces the file as
            the read source. NOTE(review): sf.SoundFile(signal, ...) normally
            expects a file/file-like argument — confirm this branch works.
        samplerate: sample rate used together with *signal*.

    Returns:
        Paths (as strings) of the per-word audio files that were written.
    """
    import soundfile as sf

    written_files = []
    for word_idx, (t_start, t_end) in enumerate(zip(start_time, end_time)):
        if signal is not None:
            # Rebind the source so both the loader and the output-name helper
            # below operate on the in-memory signal wrapper.
            audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
        clip, samplerate = soundfile_load(audiotmpfile, offset=t_start, duration=t_end - t_start)
        clip_path = get_file_with_custom_suffix(audiotmpfile, f"_part{word_idx}_start{t_start}_end{t_end}")
        soundfile_write(audiofile=clip_path, data=clip, samplerate=samplerate)
        app_logger.info(f"audio file {clip_path} written...")
        written_files.append(str(clip_path))
    return written_files
|
203 |
+
|
204 |
+
|
205 |
+
def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> list[str]:
|
206 |
+
pathname = Path(basefile)
|
207 |
+
dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
|
208 |
+
output_file = dirname / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
|
209 |
+
return output_file
|
210 |
|
211 |
|
212 |
# From Librosa
|
|
|
332 |
|
333 |
# Rescale and format the data buffer
|
334 |
return scale * np.frombuffer(x, fmt).astype(dtype)
|
335 |
+
|
app.py
CHANGED
@@ -72,6 +72,20 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
72 |
show_download_button=True,
|
73 |
elem_id="audio-student-recording-stt-id-element",
|
74 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
with gr.Column(scale=4, min_width=320):
|
76 |
text_transcribed_hidden = gr.Textbox(
|
77 |
placeholder=None, label="Transcribed text", visible=False
|
@@ -110,23 +124,38 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
110 |
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
|
111 |
number_score_en = gr.Number(label="Global score EN", value=0, interactive=False, elem_id="number-score-en-id-element")
|
112 |
with gr.Row():
|
113 |
-
|
114 |
-
with gr.
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
)
|
127 |
|
128 |
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
|
129 |
-
_transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang)
|
130 |
output = {
|
131 |
text_transcribed_hidden: _transcribed_text,
|
132 |
text_letter_correctness: _letter_correctness,
|
@@ -134,6 +163,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
134 |
text_recording_ipa: _recording_ipa,
|
135 |
text_ideal_ipa: _ideal_ipa,
|
136 |
text_raw_json_output_hidden: _res,
|
|
|
137 |
}
|
138 |
match lang:
|
139 |
case "de":
|
@@ -151,7 +181,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
151 |
case _:
|
152 |
raise NotImplementedError(f"Language {lang} not supported")
|
153 |
|
154 |
-
|
155 |
get_updated_score_by_language,
|
156 |
inputs=[text_student_transcription, audio_student_recording_stt, radio_language, number_score_de, number_score_en],
|
157 |
outputs=[
|
@@ -161,7 +191,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
161 |
text_recording_ipa,
|
162 |
text_ideal_ipa,
|
163 |
text_raw_json_output_hidden,
|
164 |
-
number_score_de,
|
|
|
|
|
165 |
],
|
166 |
)
|
167 |
btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
|
@@ -198,4 +230,8 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
198 |
|
199 |
|
200 |
if __name__ == "__main__":
|
201 |
-
|
|
|
|
|
|
|
|
|
|
72 |
show_download_button=True,
|
73 |
elem_id="audio-student-recording-stt-id-element",
|
74 |
)
|
75 |
+
with gr.Row():
|
76 |
+
with gr.Accordion("Click here to expand the table examples", open=True, elem_id="accordion-examples-id-element"):
|
77 |
+
examples_text = gr.Examples(
|
78 |
+
examples=[
|
79 |
+
["Hallo, wie geht es dir?", "de", 1],
|
80 |
+
["Hi there, how are you?", "en", 1],
|
81 |
+
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
|
82 |
+
["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
|
83 |
+
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
|
84 |
+
["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
|
85 |
+
],
|
86 |
+
inputs=[text_student_transcription, radio_language, radio_difficulty],
|
87 |
+
elem_id="examples-text-id-element",
|
88 |
+
)
|
89 |
with gr.Column(scale=4, min_width=320):
|
90 |
text_transcribed_hidden = gr.Textbox(
|
91 |
placeholder=None, label="Transcribed text", visible=False
|
|
|
124 |
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
|
125 |
number_score_en = gr.Number(label="Global score EN", value=0, interactive=False, elem_id="number-score-en-id-element")
|
126 |
with gr.Row():
|
127 |
+
btn_recognize_speech_accuracy = gr.Button(value="Recognize speech accuracy", elem_id="btn-recognize-speech-accuracy-id-element")
|
128 |
+
with gr.Row():
|
129 |
+
with gr.Column(scale=1, min_width=50):
|
130 |
+
num_tot_recognized_words = gr.Number(label="Total recognized words", visible=True, minimum=0)
|
131 |
+
with gr.Column(scale=1, min_width=50):
|
132 |
+
num_selected_recognized_words = gr.Number(label="Recognized word index", visible=True, value=0, minimum=0)
|
133 |
+
with gr.Column(scale=2, min_width=100):
|
134 |
+
audio_splitted_student_recording_stt = gr.Audio(
|
135 |
+
label="Splitted Speech-toText audio output",
|
136 |
+
type="filepath",
|
137 |
+
show_download_button=True,
|
138 |
+
elem_id="audio-splitted-student-recording-stt-id-element",
|
139 |
+
)
|
140 |
+
with gr.Row():
|
141 |
+
btn_select_recognized_word = gr.Button(value="Select recognized word", elem_id="btn-select-recognized-word-id-element")
|
142 |
+
# slider_select_student_recorded_stt = gr.Slider(
|
143 |
+
# label="Splitted Speech-toText audio output",
|
144 |
+
# elem_id="slider-split-audio-student-recording-stt-id-element"
|
145 |
+
# )
|
146 |
+
# slider_select_student_recorded_stt.change(
|
147 |
+
# lambdaSpeechToScore.get_selected_word,
|
148 |
+
# inputs=[slider_select_student_recorded_stt, text_raw_json_output_hidden],
|
149 |
+
# outputs=[audio_splitted_student_recording_stt]
|
150 |
+
# )
|
151 |
+
btn_select_recognized_word.click(
|
152 |
+
lambdaSpeechToScore.get_selected_word,
|
153 |
+
inputs=[num_selected_recognized_words, text_raw_json_output_hidden],
|
154 |
+
outputs=[audio_splitted_student_recording_stt],
|
155 |
)
|
156 |
|
157 |
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
|
158 |
+
_transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_words, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
|
159 |
output = {
|
160 |
text_transcribed_hidden: _transcribed_text,
|
161 |
text_letter_correctness: _letter_correctness,
|
|
|
163 |
text_recording_ipa: _recording_ipa,
|
164 |
text_ideal_ipa: _ideal_ipa,
|
165 |
text_raw_json_output_hidden: _res,
|
166 |
+
num_tot_recognized_words: _num_tot_recognized_words,
|
167 |
}
|
168 |
match lang:
|
169 |
case "de":
|
|
|
181 |
case _:
|
182 |
raise NotImplementedError(f"Language {lang} not supported")
|
183 |
|
184 |
+
btn_recognize_speech_accuracy.click(
|
185 |
get_updated_score_by_language,
|
186 |
inputs=[text_student_transcription, audio_student_recording_stt, radio_language, number_score_de, number_score_en],
|
187 |
outputs=[
|
|
|
191 |
text_recording_ipa,
|
192 |
text_ideal_ipa,
|
193 |
text_raw_json_output_hidden,
|
194 |
+
number_score_de,
|
195 |
+
number_score_en,
|
196 |
+
num_tot_recognized_words
|
197 |
],
|
198 |
)
|
199 |
btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
|
|
|
230 |
|
231 |
|
232 |
if __name__ == "__main__":
    try:
        gradio_app.launch()
    except Exception as e:
        # Log the failure so it reaches the application log before propagating.
        app_logger.error(f"Error: {e}")
        # Bare `raise` re-raises with the original traceback intact
        # (`raise e` would re-anchor the traceback at this line).
        raise
|