Spaces:
Running
Running
alessandro trinca tornidor
committed on
Commit
·
d009a59
1
Parent(s):
d3be968
feat: initial support for split and reproduce single recorded words
Browse files- aip_trainer/lambdas/lambdaSpeechToScore.py +68 -19
- app.py +53 -17
aip_trainer/lambdas/lambdaSpeechToScore.py
CHANGED
@@ -48,7 +48,7 @@ def lambda_handler(event, context):
|
|
48 |
return output
|
49 |
|
50 |
|
51 |
-
def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
52 |
from soundfile import LibsndfileError
|
53 |
app_logger.info(f"real_text:{real_text} ...")
|
54 |
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
@@ -67,37 +67,37 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
67 |
app_logger.debug(f"random_file_name:{random_file_name} ...")
|
68 |
if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
|
69 |
app_logger.debug("writing streaming data to file on disk...")
|
70 |
-
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=
|
71 |
f1.write(file_bytes_or_audiotmpfile)
|
72 |
duration = time.time() - start0
|
73 |
app_logger.info(f'Saved binary data in file in {duration}s.')
|
74 |
random_file_name = f1.name
|
75 |
|
76 |
start = time.time()
|
77 |
-
app_logger.info(f'Loading
|
78 |
try:
|
79 |
-
signal,
|
80 |
except LibsndfileError as sfe:
|
81 |
# https://github.com/beetbox/audioread/issues/144
|
82 |
# deprecation warnings => pip install standard-aifc standard-sunau
|
83 |
app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
|
84 |
try:
|
85 |
-
signal,
|
86 |
except ModuleNotFoundError as mnfe:
|
87 |
app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
|
88 |
raise mnfe
|
89 |
|
90 |
duration = time.time() - start
|
91 |
-
app_logger.info(f'Read
|
92 |
|
93 |
-
|
94 |
|
95 |
duration = time.time() - start
|
96 |
-
app_logger.info(f'Loaded
|
97 |
|
98 |
language_trainer_sst_lambda = trainer_SST_lambda[language]
|
99 |
app_logger.info('language_trainer_sst_lambda: preparing...')
|
100 |
-
result = language_trainer_sst_lambda.processAudioForGivenText(
|
101 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
102 |
|
103 |
start = time.time()
|
@@ -140,15 +140,17 @@ def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | d
|
|
140 |
pronunciation_accuracy = float(result['pronunciation_accuracy'])
|
141 |
ipa_transcript = result['recording_ipa']
|
142 |
|
143 |
-
return {
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
|
|
152 |
|
153 |
|
154 |
def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
@@ -158,7 +160,53 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
|
|
158 |
pronunciation_accuracy = output['pronunciation_accuracy']
|
159 |
ipa_transcript = output['ipa_transcript']
|
160 |
real_transcripts_ipa = output['real_transcripts_ipa']
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
|
164 |
# From Librosa
|
@@ -284,3 +332,4 @@ def buf_to_float(x, n_bytes=2, dtype=np.float32):
|
|
284 |
|
285 |
# Rescale and format the data buffer
|
286 |
return scale * np.frombuffer(x, fmt).astype(dtype)
|
|
|
|
48 |
return output
|
49 |
|
50 |
|
51 |
+
def get_speech_to_score_dict(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True, extension: str = ".ogg"):
|
52 |
from soundfile import LibsndfileError
|
53 |
app_logger.info(f"real_text:{real_text} ...")
|
54 |
app_logger.debug(f"file_bytes:{file_bytes_or_audiotmpfile} ...")
|
|
|
67 |
app_logger.debug(f"random_file_name:{random_file_name} ...")
|
68 |
if isinstance(file_bytes_or_audiotmpfile, (bytes, bytearray)):
|
69 |
app_logger.debug("writing streaming data to file on disk...")
|
70 |
+
with tempfile.NamedTemporaryFile(prefix="temp_sound_speech_score_", suffix=extension, delete=False) as f1:
|
71 |
f1.write(file_bytes_or_audiotmpfile)
|
72 |
duration = time.time() - start0
|
73 |
app_logger.info(f'Saved binary data in file in {duration}s.')
|
74 |
random_file_name = f1.name
|
75 |
|
76 |
start = time.time()
|
77 |
+
app_logger.info(f'Loading {extension} file file {random_file_name} ...')
|
78 |
try:
|
79 |
+
signal, samplerate = soundfile_load(random_file_name)
|
80 |
except LibsndfileError as sfe:
|
81 |
# https://github.com/beetbox/audioread/issues/144
|
82 |
# deprecation warnings => pip install standard-aifc standard-sunau
|
83 |
app_logger.error(f"Error reading file {random_file_name}: {sfe}, re-try with audioread...")
|
84 |
try:
|
85 |
+
signal, samplerate = audioread_load(random_file_name)
|
86 |
except ModuleNotFoundError as mnfe:
|
87 |
app_logger.error(f"Error reading file {random_file_name}: {mnfe}, try read https://github.com/beetbox/audioread/issues/144")
|
88 |
raise mnfe
|
89 |
|
90 |
duration = time.time() - start
|
91 |
+
app_logger.info(f'Read {extension} file {random_file_name} in {duration}s.')
|
92 |
|
93 |
+
signal_transformed = transform(torch.Tensor(signal)).unsqueeze(0)
|
94 |
|
95 |
duration = time.time() - start
|
96 |
+
app_logger.info(f'Loaded {extension} file {random_file_name} in {duration}s.')
|
97 |
|
98 |
language_trainer_sst_lambda = trainer_SST_lambda[language]
|
99 |
app_logger.info('language_trainer_sst_lambda: preparing...')
|
100 |
+
result = language_trainer_sst_lambda.processAudioForGivenText(signal_transformed, real_text)
|
101 |
app_logger.info(f'language_trainer_sst_lambda: result: {result}...')
|
102 |
|
103 |
start = time.time()
|
|
|
140 |
pronunciation_accuracy = float(result['pronunciation_accuracy'])
|
141 |
ipa_transcript = result['recording_ipa']
|
142 |
|
143 |
+
return {
|
144 |
+
'real_transcript': result['recording_transcript'],
|
145 |
+
'ipa_transcript': ipa_transcript,
|
146 |
+
'pronunciation_accuracy': float(f"{pronunciation_accuracy:.2f}"),
|
147 |
+
'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
|
148 |
+
'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
|
149 |
+
'pair_accuracy_category': pair_accuracy_category,
|
150 |
+
'start_time': result['start_time'],
|
151 |
+
'end_time': result['end_time'],
|
152 |
+
'is_letter_correct_all_words': is_letter_correct_all_words
|
153 |
+
}
|
154 |
|
155 |
|
156 |
def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str | dict, language: str = "en", remove_random_file: bool = True):
|
|
|
160 |
pronunciation_accuracy = output['pronunciation_accuracy']
|
161 |
ipa_transcript = output['ipa_transcript']
|
162 |
real_transcripts_ipa = output['real_transcripts_ipa']
|
163 |
+
end_time = [float(x) for x in output['end_time'].split(" ")]
|
164 |
+
start_time = [float(x) for x in output['start_time'].split(" ")]
|
165 |
+
num_words = len(end_time)
|
166 |
+
app_logger.debug(f"start splitting recorded audio into {num_words} words...")
|
167 |
+
|
168 |
+
audio_files = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
|
169 |
+
output = {'audio_files': audio_files, **output}
|
170 |
+
return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, json.dumps(output)
|
171 |
+
|
172 |
+
|
173 |
+
def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
    """Persist an audio signal to disk.

    Args:
        audiofile: destination file path.
        data: audio samples to write.
        samplerate: sample rate of *data* in Hz.
    """
    # Lazy import, matching this module's convention of importing soundfile
    # inside the functions that need it.
    from soundfile import write

    write(audiofile, data, samplerate)
|
176 |
+
|
177 |
+
|
178 |
+
def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> str:
    """Return the audio-file path of the selected recognized word.

    Args:
        idx_recorded_word: index of the word within the recognized words.
        raw_json_output: JSON string containing an "audio_files" list of paths.

    Returns:
        The path stored at position *idx_recorded_word* of "audio_files".
    """
    parsed = json.loads(raw_json_output)
    return parsed["audio_files"][idx_recorded_word]
|
182 |
+
|
183 |
+
|
184 |
+
def get_audio_splitted(audiotmpfile: str | Path, text_raw_json_output_hidden: str) -> list[str]:
    """Split a recorded audio file into per-word clips using stored word timings.

    Args:
        audiotmpfile: path of the recorded audio file to split.
        text_raw_json_output_hidden: raw JSON produced by the scoring step,
            containing "start_time" and "end_time" entries.

    Returns:
        List of paths of the written per-word audio files.
    """
    input_json = json.loads(text_raw_json_output_hidden)
    # "start_time"/"end_time" are space-separated strings in the scoring output
    # (see get_speech_to_score_tuple, which also splits them); parse them into
    # float lists here — passing the raw strings would make the splitter
    # iterate character by character.
    start_time = [float(x) for x in input_json["start_time"].split(" ")]
    end_time = [float(x) for x in input_json["end_time"].split(" ")]
    return get_splitted_audio_file(audiotmpfile, start_time, end_time)
|
189 |
+
|
190 |
+
|
191 |
+
def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> list[str]:
    """Cut a recording into one audio file per word and write each file to disk.

    Args:
        audiotmpfile: path of the recorded audio file to slice.
        start_time: per-word start offsets, in seconds.
        end_time: per-word end offsets, in seconds.
        signal: optional in-memory signal; when given, it replaces the file as
            the read source. NOTE(review): sf.SoundFile(signal, ...) normally
            expects a file/file-like argument — confirm this branch works.
        samplerate: sample rate used together with *signal*.

    Returns:
        Paths (as strings) of the per-word audio files that were written.
    """
    import soundfile as sf

    written_files = []
    for word_idx, (t_start, t_end) in enumerate(zip(start_time, end_time)):
        if signal is not None:
            # Rebind the source so both the loader and the output-name helper
            # below operate on the in-memory signal wrapper.
            audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
        clip, samplerate = soundfile_load(audiotmpfile, offset=t_start, duration=t_end - t_start)
        clip_path = get_file_with_custom_suffix(audiotmpfile, f"_part{word_idx}_start{t_start}_end{t_end}")
        soundfile_write(audiofile=clip_path, data=clip, samplerate=samplerate)
        app_logger.info(f"audio file {clip_path} written...")
        written_files.append(str(clip_path))
    return written_files
|
203 |
+
|
204 |
+
|
205 |
+
def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> list[str]:
|
206 |
+
pathname = Path(basefile)
|
207 |
+
dirname, filename_no_ext, filename_ext = pathname.parent, pathname.stem, pathname.suffix
|
208 |
+
output_file = dirname / f"{filename_no_ext}_{custom_suffix}.{filename_ext}"
|
209 |
+
return output_file
|
210 |
|
211 |
|
212 |
# From Librosa
|
|
|
332 |
|
333 |
# Rescale and format the data buffer
|
334 |
return scale * np.frombuffer(x, fmt).astype(dtype)
|
335 |
+
|
app.py
CHANGED
@@ -72,6 +72,20 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
72 |
show_download_button=True,
|
73 |
elem_id="audio-student-recording-stt-id-element",
|
74 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
with gr.Column(scale=4, min_width=320):
|
76 |
text_transcribed_hidden = gr.Textbox(
|
77 |
placeholder=None, label="Transcribed text", visible=False
|
@@ -110,23 +124,38 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
110 |
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
|
111 |
number_score_en = gr.Number(label="Global score EN", value=0, interactive=False, elem_id="number-score-en-id-element")
|
112 |
with gr.Row():
|
113 |
-
|
114 |
-
with gr.
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
)
|
127 |
|
128 |
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
|
129 |
-
_transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang)
|
130 |
output = {
|
131 |
text_transcribed_hidden: _transcribed_text,
|
132 |
text_letter_correctness: _letter_correctness,
|
@@ -134,6 +163,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
134 |
text_recording_ipa: _recording_ipa,
|
135 |
text_ideal_ipa: _ideal_ipa,
|
136 |
text_raw_json_output_hidden: _res,
|
|
|
137 |
}
|
138 |
match lang:
|
139 |
case "de":
|
@@ -151,7 +181,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
151 |
case _:
|
152 |
raise NotImplementedError(f"Language {lang} not supported")
|
153 |
|
154 |
-
|
155 |
get_updated_score_by_language,
|
156 |
inputs=[text_student_transcription, audio_student_recording_stt, radio_language, number_score_de, number_score_en],
|
157 |
outputs=[
|
@@ -161,7 +191,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
161 |
text_recording_ipa,
|
162 |
text_ideal_ipa,
|
163 |
text_raw_json_output_hidden,
|
164 |
-
number_score_de,
|
|
|
|
|
165 |
],
|
166 |
)
|
167 |
btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
|
@@ -198,4 +230,8 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
|
|
198 |
|
199 |
|
200 |
if __name__ == "__main__":
|
201 |
-
|
|
|
|
|
|
|
|
|
|
72 |
show_download_button=True,
|
73 |
elem_id="audio-student-recording-stt-id-element",
|
74 |
)
|
75 |
+
with gr.Row():
|
76 |
+
with gr.Accordion("Click here to expand the table examples", open=True, elem_id="accordion-examples-id-element"):
|
77 |
+
examples_text = gr.Examples(
|
78 |
+
examples=[
|
79 |
+
["Hallo, wie geht es dir?", "de", 1],
|
80 |
+
["Hi there, how are you?", "en", 1],
|
81 |
+
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
|
82 |
+
["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
|
83 |
+
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
|
84 |
+
["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
|
85 |
+
],
|
86 |
+
inputs=[text_student_transcription, radio_language, radio_difficulty],
|
87 |
+
elem_id="examples-text-id-element",
|
88 |
+
)
|
89 |
with gr.Column(scale=4, min_width=320):
|
90 |
text_transcribed_hidden = gr.Textbox(
|
91 |
placeholder=None, label="Transcribed text", visible=False
|
|
|
124 |
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
|
125 |
number_score_en = gr.Number(label="Global score EN", value=0, interactive=False, elem_id="number-score-en-id-element")
|
126 |
with gr.Row():
|
127 |
+
btn_recognize_speech_accuracy = gr.Button(value="Recognize speech accuracy", elem_id="btn-recognize-speech-accuracy-id-element")
|
128 |
+
with gr.Row():
|
129 |
+
with gr.Column(scale=1, min_width=50):
|
130 |
+
num_tot_recognized_words = gr.Number(label="Total recognized words", visible=True, minimum=0)
|
131 |
+
with gr.Column(scale=1, min_width=50):
|
132 |
+
num_selected_recognized_words = gr.Number(label="Recognized word index", visible=True, value=0, minimum=0)
|
133 |
+
with gr.Column(scale=2, min_width=100):
|
134 |
+
audio_splitted_student_recording_stt = gr.Audio(
|
135 |
+
label="Splitted Speech-toText audio output",
|
136 |
+
type="filepath",
|
137 |
+
show_download_button=True,
|
138 |
+
elem_id="audio-splitted-student-recording-stt-id-element",
|
139 |
+
)
|
140 |
+
with gr.Row():
|
141 |
+
btn_select_recognized_word = gr.Button(value="Select recognized word", elem_id="btn-select-recognized-word-id-element")
|
142 |
+
# slider_select_student_recorded_stt = gr.Slider(
|
143 |
+
# label="Splitted Speech-toText audio output",
|
144 |
+
# elem_id="slider-split-audio-student-recording-stt-id-element"
|
145 |
+
# )
|
146 |
+
# slider_select_student_recorded_stt.change(
|
147 |
+
# lambdaSpeechToScore.get_selected_word,
|
148 |
+
# inputs=[slider_select_student_recorded_stt, text_raw_json_output_hidden],
|
149 |
+
# outputs=[audio_splitted_student_recording_stt]
|
150 |
+
# )
|
151 |
+
btn_select_recognized_word.click(
|
152 |
+
lambdaSpeechToScore.get_selected_word,
|
153 |
+
inputs=[num_selected_recognized_words, text_raw_json_output_hidden],
|
154 |
+
outputs=[audio_splitted_student_recording_stt],
|
155 |
)
|
156 |
|
157 |
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
|
158 |
+
_transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_words, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
|
159 |
output = {
|
160 |
text_transcribed_hidden: _transcribed_text,
|
161 |
text_letter_correctness: _letter_correctness,
|
|
|
163 |
text_recording_ipa: _recording_ipa,
|
164 |
text_ideal_ipa: _ideal_ipa,
|
165 |
text_raw_json_output_hidden: _res,
|
166 |
+
num_tot_recognized_words: _num_tot_recognized_words,
|
167 |
}
|
168 |
match lang:
|
169 |
case "de":
|
|
|
181 |
case _:
|
182 |
raise NotImplementedError(f"Language {lang} not supported")
|
183 |
|
184 |
+
btn_recognize_speech_accuracy.click(
|
185 |
get_updated_score_by_language,
|
186 |
inputs=[text_student_transcription, audio_student_recording_stt, radio_language, number_score_de, number_score_en],
|
187 |
outputs=[
|
|
|
191 |
text_recording_ipa,
|
192 |
text_ideal_ipa,
|
193 |
text_raw_json_output_hidden,
|
194 |
+
number_score_de,
|
195 |
+
number_score_en,
|
196 |
+
num_tot_recognized_words
|
197 |
],
|
198 |
)
|
199 |
btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
|
|
|
230 |
|
231 |
|
232 |
if __name__ == "__main__":
    try:
        gradio_app.launch()
    except Exception as e:
        # Log the failure so it reaches the application log before propagating.
        app_logger.error(f"Error: {e}")
        # Bare `raise` re-raises with the original traceback intact
        # (`raise e` would re-anchor the traceback at this line).
        raise
|