alessandro trinca tornidor committed on
Commit
d009a59
·
1 Parent(s): d3be968

feat: initial support for split and reproduce single recorded words

Browse files
Files changed (2) hide show
  1. aip_trainer/lambdas/lambdaSpeechToScore.py +68 -19
  2. app.py +53 -17
aip_trainer/lambdas/lambdaSpeechToScore.py CHANGED
@@ -48,7 +48,7 @@ def lambda_handler(event, context):
48
  return output
49
 
50
 
51
- def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
52
  from soundfile import LibsndfileError
53
  app_logger.info(f"real_text:{real_text} ...")
54
  app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
@@ -67,37 +67,37 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
67
  app_logger.debug(f"random_file_name:{random_file_name} ...")
68
  if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
69
  app_logger.debug("writing streaming data to file on disk...")
70
- with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=".ogg", delete=False) as f1:
71
  f1.write(file_bytes_or_audiotmpfile)
72
  duration = time.time() - start0
73
  app_logger.info(f'Saved binary data in file in {duration}s.')
74
  random_file_name = f1.name
75
 
76
  start = time.time()
77
- app_logger.info(f'Loading .ogg file file {random_file_name} ...')
78
  try:
79
- signal, _ = soundfile_load(random_file_name)
80
  except LibsndfileError as sfe:
81
  # https://github.com/beetbox/audioread/issues/144
82
  # deprecation warnings => pip install standard-aifc standard-sunau
83
  app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
84
  try:
85
- signal, _ = audioread_load(random_file_name)
86
  except ModuleNotFoundError as mnfe:
87
  app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
88
  raise mnfe
89
 
90
  duration = time.time() - start
91
- app_logger.info(f'Read .ogg file {random_file_name} in {duration}s.')
92
 
93
- signal = transform(torch.Tensor(signal)).unsqueeze(0)
94
 
95
  duration = time.time() - start
96
- app_logger.info(f'Loaded .ogg file {random_file_name} in {duration}s.')
97
 
98
  language_trainer_sst_lambda = trainer_SST_lambda[language]
99
  app_logger.info('language_trainer_sst_lambda: preparing...')
100
- result = language_trainer_sst_lambda.processAudioForGivenText(signal, real_text)
101
  app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
102
 
103
  start = time.time()
@@ -140,15 +140,17 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
140
  pronunciation_accuracy = float(result['pronunciation_accuracy'])
141
  ipa_transcript = result['recording_ipa']
142
 
143
- return {'real_transcript': result['recording_transcript'],
144
- 'ipa_transcript': ipa_transcript,
145
- 'pronunciation_accuracy': float(f"{pronunciation_accuracy:.2f}"),
146
- 'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
147
- 'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
148
- 'pair_accuracy_category': pair_accuracy_category,
149
- 'start_time': result['start_time'],
150
- 'end_time': result['end_time'],
151
- 'is_letter_correct_all_words': is_letter_correct_all_words}
 
 
152
 
153
 
154
  def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
@@ -158,7 +160,53 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
158
  pronunciation_accuracy = output['pronunciation_accuracy']
159
  ipa_transcript = output['ipa_transcript']
160
  real_transcripts_ipa = output['real_transcripts_ipa']
161
- return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, json.dumps(output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
 
164
  # From Librosa
@@ -284,3 +332,4 @@ def buf_to_float(x, n_bytes=2, dtype=np.float32):
284
 
285
  # Rescale and format the data buffer
286
  return scale * np.frombuffer(x, fmt).astype(dtype)
 
 
48
  return output
49
 
50
 
51
+ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True, extension: str = ".ogg"):
52
  from soundfile import LibsndfileError
53
  app_logger.info(f"real_text:{real_text} ...")
54
  app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
 
67
  app_logger.debug(f"random_file_name:{random_file_name} ...")
68
  if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
69
  app_logger.debug("writing streaming data to file on disk...")
70
+ with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
71
  f1.write(file_bytes_or_audiotmpfile)
72
  duration = time.time() - start0
73
  app_logger.info(f'Saved binary data in file in {duration}s.')
74
  random_file_name = f1.name
75
 
76
  start = time.time()
77
+ app_logger.info(f'Loading {extension} file file {random_file_name} ...')
78
  try:
79
+ signal, samplerate = soundfile_load(random_file_name)
80
  except LibsndfileError as sfe:
81
  # https://github.com/beetbox/audioread/issues/144
82
  # deprecation warnings => pip install standard-aifc standard-sunau
83
  app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
84
  try:
85
+ signal, samplerate = audioread_load(random_file_name)
86
  except ModuleNotFoundError as mnfe:
87
  app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
88
  raise mnfe
89
 
90
  duration = time.time() - start
91
+ app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
92
 
93
+ signal_transformed = transform(torch.Tensor(signal)).unsqueeze(0)
94
 
95
  duration = time.time() - start
96
+ app_logger.info(f'Loaded {extension} file {random_file_name} in {duration}s.')
97
 
98
  language_trainer_sst_lambda = trainer_SST_lambda[language]
99
  app_logger.info('language_trainer_sst_lambda: preparing...')
100
+ result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
101
  app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
102
 
103
  start = time.time()
 
140
  pronunciation_accuracy = float(result['pronunciation_accuracy'])
141
  ipa_transcript = result['recording_ipa']
142
 
143
+ return {
144
+ 'real_transcript': result['recording_transcript'],
145
+ 'ipa_transcript': ipa_transcript,
146
+ 'pronunciation_accuracy': float(f"{pronunciation_accuracy:.2f}"),
147
+ 'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
148
+ 'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
149
+ 'pair_accuracy_category': pair_accuracy_category,
150
+ 'start_time': result['start_time'],
151
+ 'end_time': result['end_time'],
152
+ 'is_letter_correct_all_words': is_letter_correct_all_words
153
+ }
154
 
155
 
156
  def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
 
160
  pronunciation_accuracy = output['pronunciation_accuracy']
161
  ipa_transcript = output['ipa_transcript']
162
  real_transcripts_ipa = output['real_transcripts_ipa']
163
+ end_time = [float(x) for x in output['end_time'].split(" ")]
164
+ start_time = [float(x) for x in output['start_time'].split(" ")]
165
+ num_words = len(end_time)
166
+ app_logger.debug(f"start splitting recorded audio into {num_words} words...")
167
+
168
+ audio_files = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
169
+ output = {'audio_files': audio_files, **output}
170
+ return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, json.dumps(output)
171
+
172
+
173
def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
    """Persist an audio signal to disk.

    Args:
        audiofile: Destination path of the audio file.
        data: Audio samples to write.
        samplerate: Sample rate in Hz.
    """
    # Imported lazily so the module still loads without python-soundfile installed.
    from soundfile import write as _sf_write
    _sf_write(audiofile, data, samplerate)
176
+
177
+
178
def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> str:
    """Return the path of the per-word audio file at the given index.

    Args:
        idx_recorded_word: Index of the recognized word; floats (as delivered
            by a Gradio ``Number`` component) are accepted and truncated.
        raw_json_output: JSON string containing an ``audio_files`` list.

    Returns:
        The audio file path stored at ``audio_files[idx_recorded_word]``.

    Raises:
        KeyError: If ``audio_files`` is missing from the JSON payload.
        IndexError: If the index is out of range.
    """
    json_output = json.loads(raw_json_output)
    list_audio_files = json_output["audio_files"]
    # gr.Number yields a float unless precision=0; list indices must be int.
    return list_audio_files[int(idx_recorded_word)]
182
+
183
+
184
def get_audio_splitted(audiotmpfile: str | Path, text_raw_json_output_hidden: str) -> list[str]:
    """Split a recorded audio file into one file per recognized word.

    Args:
        audiotmpfile: Path of the full recording on disk.
        text_raw_json_output_hidden: Raw JSON produced by the scoring step;
            its ``start_time``/``end_time`` entries are space-separated
            second offsets, one per recognized word.

    Returns:
        The list of per-word audio file paths.
    """
    input_json = json.loads(text_raw_json_output_hidden)
    # start_time/end_time are stored in the JSON payload as space-separated
    # strings (they come straight from the scoring result); parse them into
    # floats before slicing, otherwise zip() would iterate over characters.
    start_time = [float(x) for x in input_json["start_time"].split(" ")]
    end_time = [float(x) for x in input_json["end_time"].split(" ")]
    return get_splitted_audio_file(audiotmpfile, start_time, end_time)
189
+
190
+
191
def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray | None = None, samplerate: int | None = None) -> list[str]:
    """Cut an audio recording into one file per recognized word.

    Args:
        audiotmpfile: Source audio file on disk.
        start_time: Start offset in seconds of each word.
        end_time: End offset in seconds of each word.
        signal: Optional in-memory signal used instead of *audiotmpfile*.
        samplerate: Sample rate of *signal*; required when *signal* is given.

    Returns:
        Paths (as strings) of the per-word audio files written to disk.
    """
    import soundfile as sf
    audio_files = []
    for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
        if signal is not None:
            # NOTE(review): sf.SoundFile expects a file/file-like object, not
            # a bare numpy array — confirm this branch is actually exercised.
            audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
        # Load only the [start, end) slice for the nth word; use a distinct
        # local for the loaded rate instead of shadowing the parameter.
        signal_nth, samplerate_nth = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
        audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
        soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate_nth)
        app_logger.info(f"audio file {audiofile} written...")
        audio_files.append(str(audiofile))
    return audio_files
203
+
204
+
205
+ def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> list[str]:
206
+ pathname = Path(basefile)
207
+ dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
208
+ output_file = dirname / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
209
+ return output_file
210
 
211
 
212
  # From Librosa
 
332
 
333
  # Rescale and format the data buffer
334
  return scale * np.frombuffer(x, fmt).astype(dtype)
335
+
app.py CHANGED
@@ -72,6 +72,20 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
72
  show_download_button=True,
73
  elem_id="audio-student-recording-stt-id-element",
74
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  with gr.Column(scale=4, min_width=320):
76
  text_transcribed_hidden = gr.Textbox(
77
  placeholder=None, label="Transcribed text", visible=False
@@ -110,23 +124,38 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
110
  with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
111
  number_score_en = gr.Number(label="Global score EN", value=0, interactive=False, elem_id="number-score-en-id-element")
112
  with gr.Row():
113
- btn = gr.Button(value="Recognize speech accuracy", elem_id="btn-recognize-speech-accuracy-id-element")
114
- with gr.Accordion("Click here to expand the table examples", open=True, elem_id="accordion-examples-id-element"):
115
- examples_text = gr.Examples(
116
- examples=[
117
- ["Hallo, wie geht es dir?", "de", 1],
118
- ["Hi there, how are you?", "en", 1],
119
- ["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
120
- ["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
121
- ["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
122
- ["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
123
- ],
124
- inputs=[text_student_transcription, radio_language, radio_difficulty],
125
- elem_id="examples-text-id-element",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  )
127
 
128
  def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
129
- _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang)
130
  output = {
131
  text_transcribed_hidden: _transcribed_text,
132
  text_letter_correctness: _letter_correctness,
@@ -134,6 +163,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
134
  text_recording_ipa: _recording_ipa,
135
  text_ideal_ipa: _ideal_ipa,
136
  text_raw_json_output_hidden: _res,
 
137
  }
138
  match lang:
139
  case "de":
@@ -151,7 +181,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
151
  case _:
152
  raise NotImplementedError(f"Language {lang} not supported")
153
 
154
- btn.click(
155
  get_updated_score_by_language,
156
  inputs=[text_student_transcription, audio_student_recording_stt, radio_language, number_score_de, number_score_en],
157
  outputs=[
@@ -161,7 +191,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
161
  text_recording_ipa,
162
  text_ideal_ipa,
163
  text_raw_json_output_hidden,
164
- number_score_de, number_score_en
 
 
165
  ],
166
  )
167
  btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
@@ -198,4 +230,8 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
198
 
199
 
200
  if __name__ == "__main__":
201
- gradio_app.launch()
 
 
 
 
 
72
  show_download_button=True,
73
  elem_id="audio-student-recording-stt-id-element",
74
  )
75
+ with gr.Row():
76
+ with gr.Accordion("Click here to expand the table examples", open=True, elem_id="accordion-examples-id-element"):
77
+ examples_text = gr.Examples(
78
+ examples=[
79
+ ["Hallo, wie geht es dir?", "de", 1],
80
+ ["Hi there, how are you?", "en", 1],
81
+ ["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
82
+ ["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
83
+ ["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
84
+ ["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
85
+ ],
86
+ inputs=[text_student_transcription, radio_language, radio_difficulty],
87
+ elem_id="examples-text-id-element",
88
+ )
89
  with gr.Column(scale=4, min_width=320):
90
  text_transcribed_hidden = gr.Textbox(
91
  placeholder=None, label="Transcribed text", visible=False
 
124
  with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
125
  number_score_en = gr.Number(label="Global score EN", value=0, interactive=False, elem_id="number-score-en-id-element")
126
  with gr.Row():
127
+ btn_recognize_speech_accuracy = gr.Button(value="Recognize speech accuracy", elem_id="btn-recognize-speech-accuracy-id-element")
128
+ with gr.Row():
129
+ with gr.Column(scale=1, min_width=50):
130
+ num_tot_recognized_words = gr.Number(label="Total recognized words", visible=True, minimum=0)
131
+ with gr.Column(scale=1, min_width=50):
132
+ num_selected_recognized_words = gr.Number(label="Recognized word index", visible=True, value=0, minimum=0)
133
+ with gr.Column(scale=2, min_width=100):
134
+ audio_splitted_student_recording_stt = gr.Audio(
135
+ label="Splitted Speech-toText audio output",
136
+ type="filepath",
137
+ show_download_button=True,
138
+ elem_id="audio-splitted-student-recording-stt-id-element",
139
+ )
140
+ with gr.Row():
141
+ btn_select_recognized_word = gr.Button(value="Select recognized word", elem_id="btn-select-recognized-word-id-element")
142
+ # slider_select_student_recorded_stt = gr.Slider(
143
+ # label="Splitted Speech-toText audio output",
144
+ # elem_id="slider-split-audio-student-recording-stt-id-element"
145
+ # )
146
+ # slider_select_student_recorded_stt.change(
147
+ # lambdaSpeechToScore.get_selected_word,
148
+ # inputs=[slider_select_student_recorded_stt, text_raw_json_output_hidden],
149
+ # outputs=[audio_splitted_student_recording_stt]
150
+ # )
151
+ btn_select_recognized_word.click(
152
+ lambdaSpeechToScore.get_selected_word,
153
+ inputs=[num_selected_recognized_words, text_raw_json_output_hidden],
154
+ outputs=[audio_splitted_student_recording_stt],
155
  )
156
 
157
  def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
158
+ _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_words, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
159
  output = {
160
  text_transcribed_hidden: _transcribed_text,
161
  text_letter_correctness: _letter_correctness,
 
163
  text_recording_ipa: _recording_ipa,
164
  text_ideal_ipa: _ideal_ipa,
165
  text_raw_json_output_hidden: _res,
166
+ num_tot_recognized_words: _num_tot_recognized_words,
167
  }
168
  match lang:
169
  case "de":
 
181
  case _:
182
  raise NotImplementedError(f"Language {lang} not supported")
183
 
184
+ btn_recognize_speech_accuracy.click(
185
  get_updated_score_by_language,
186
  inputs=[text_student_transcription, audio_student_recording_stt, radio_language, number_score_de, number_score_en],
187
  outputs=[
 
191
  text_recording_ipa,
192
  text_ideal_ipa,
193
  text_raw_json_output_hidden,
194
+ number_score_de,
195
+ number_score_en,
196
+ num_tot_recognized_words
197
  ],
198
  )
199
  btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
 
230
 
231
 
232
if __name__ == "__main__":
    try:
        gradio_app.launch()
    except Exception:
        # logger.exception records the message AND the full traceback;
        # a bare `raise` re-raises without resetting the exception context
        # (unlike `raise e`).
        app_logger.exception("Error launching the Gradio app")
        raise