thanhtvt committed
Commit e9812a3
Parent: 5e958ff

add new decoders

Files changed (4)
  1. app.py +190 -12
  2. decode.py +44 -0
  3. model.py +28 -8
  4. requirements.txt +1 -2
app.py CHANGED
@@ -1,9 +1,12 @@
+import base64
 import gradio as gr
 import librosa
 import logging
 import os
 import soundfile as sf
-import tensorflow as tf
+import subprocess
+import tempfile
+import urllib.request
 
 from datetime import datetime
 from time import time
@@ -13,7 +16,7 @@ from model import UETASRModel
 
 
 def get_duration(filename: str) -> float:
-    return librosa.get_duration(filename=filename)
+    return librosa.get_duration(path=filename)
 
 
 def convert_to_wav(in_filename: str) -> str:
@@ -24,6 +27,39 @@ def convert_to_wav(in_filename: str) -> str:
     return out_filename
 
 
+def convert_to_wav1(in_filename: str) -> str:
+    """Convert the input audio file to a wave file"""
+    out_filename = in_filename + ".wav"
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+    _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
+    _ = os.system(
+        f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 '{out_filename}.flac'"
+    )
+
+    with open(out_filename + ".flac", "rb") as f:
+        s = "\n" + out_filename + "\n"
+        s += base64.b64encode(f.read()).decode()
+        logging.info(s)
+
+    return out_filename
+
+
+def convert_to_wav2(in_filename: str) -> str:
+    """Convert the input audio file to a wave file"""
+    out_filename = in_filename + ".wav"
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+
+    sp_args = ["ffmpeg", "-hide_banner", "-i", in_filename, "-ar", "16000", out_filename]
+    sp_args.insert(2, "-y") if os.path.exists(out_filename) else None
+    # Create a subprocess to run the ffmpeg command.
+    _ = subprocess.Popen(
+        sp_args,
+        stdin=subprocess.PIPE,
+    )
+
+    return out_filename
+
+
 def build_html_output(s: str, style: str = "result_item_success"):
     return f"""
     <div class='result'>
@@ -34,7 +70,34 @@ def build_html_output(s: str, style: str = "result_item_success"):
     """
 
 
-def process_uploaded_file(in_filename: str):
+def process_url(
+    url: str,
+    decoding_method: str,
+    beam_size: int,
+    max_symbols_per_step: int,
+    max_out_seq_len_ratio: float,
+):
+    logging.info(f"Processing URL: {url}")
+    with tempfile.NamedTemporaryFile() as f:
+        try:
+            urllib.request.urlretrieve(url, f.name)
+            return process(in_filename=f.name,
+                           decoding_method=decoding_method,
+                           beam_size=beam_size,
+                           max_symbols_per_step=max_symbols_per_step,
+                           max_out_seq_len_ratio=max_out_seq_len_ratio)
+        except Exception as e:
+            logging.info(str(e))
+            return "", build_html_output(str(e), "result_item_error")
+
+
+def process_uploaded_file(
+    in_filename: str,
+    decoding_method: str,
+    beam_size: int,
+    max_symbols_per_step: int,
+    max_out_seq_len_ratio: float,
+):
     if in_filename is None or in_filename == "":
         return "", build_html_output(
             "Please first upload a file and then click "
@@ -44,13 +107,23 @@ def process_uploaded_file(in_filename: str):
 
     logging.info(f"Processing uploaded file: {in_filename}")
     try:
-        return process(in_filename=in_filename)
+        return process(in_filename=in_filename,
+                       decoding_method=decoding_method,
+                       beam_size=beam_size,
+                       max_symbols_per_step=max_symbols_per_step,
+                       max_out_seq_len_ratio=max_out_seq_len_ratio)
     except Exception as e:
-        logging.error(str(e))
+        logging.info(str(e))
         return "", build_html_output(str(e), "result_item_error")
 
 
-def process_microphone(in_filename: str):
+def process_microphone(
+    in_filename: str,
+    decoding_method: str,
+    beam_size: int,
+    max_symbols_per_step: int,
+    max_out_seq_len_ratio: float,
+):
     if in_filename is None or in_filename == "":
         return "", build_html_output(
             "Please first upload a file and then click "
@@ -60,13 +133,23 @@ def process_microphone(in_filename: str):
 
     logging.info(f"Processing microphone: {in_filename}")
    try:
-        return process(in_filename=in_filename)
+        return process(in_filename=in_filename,
+                       decoding_method=decoding_method,
+                       beam_size=beam_size,
+                       max_symbols_per_step=max_symbols_per_step,
+                       max_out_seq_len_ratio=max_out_seq_len_ratio)
     except Exception as e:
-        logging.error(str(e))
+        logging.info(str(e))
         return "", build_html_output(str(e), "result_item_error")
 
 
-def process(in_filename: str):
+def process(
+    in_filename: str,
+    decoding_method: str,
+    beam_size: int,
+    max_symbols_per_step: int,
+    max_out_seq_len_ratio: float,
+):
     logging.info(f"in_filename: {in_filename}")
 
     filename = convert_to_wav(in_filename)
@@ -79,7 +162,11 @@ def process(in_filename: str):
 
     start = time()
 
-    recognizer = UETASRModel(repo_id)
+    recognizer = UETASRModel(repo_id,
+                             decoding_method,
+                             beam_size,
+                             max_symbols_per_step,
+                             max_out_seq_len_ratio)
     text = recognizer.predict(filename)
 
     date_time = now.strftime("%d/%m/%Y, %H:%M:%S.%f")
@@ -130,6 +217,61 @@ demo = gr.Blocks(css=css)
 with demo:
     gr.Markdown(title)
 
+    decode_method_radio = gr.Radio(
+        label="Decoding method",
+        choices=["greedy_search", "beam_search", "alsd_search"],
+        value="greedy_search",
+        interactive=True,
+    )
+
+    with gr.Column(visible=False) as beam_col:
+        beam_size = gr.Slider(
+            label="Beam size",
+            minimum=1,
+            maximum=10,
+            step=1,
+            value=5,
+            interactive=True,
+        )
+
+    def enable_beam_col(decoding_method):
+        if decoding_method != "greedy_search":
+            return gr.update(visible=True)
+        else:
+            return gr.update(visible=False)
+
+    decode_method_radio.change(enable_beam_col, decode_method_radio, beam_col)
+
+    max_symbols_per_step_slider = gr.Slider(
+        label="Maximum symbols per step",
+        minimum=1,
+        maximum=15,
+        step=1,
+        value=5,
+        interactive=True,
+        visible=True,
+    )
+
+    max_out_seq_len_slider = gr.Slider(
+        label="Maximum output sequence length ratio",
+        minimum=0,
+        maximum=1,
+        step=0.01,
+        value=0.6,
+        interactive=True,
+        visible=False,
+    )
+
+    def switch_slider(decoding_method):
+        if decoding_method == "alsd_search":
+            return gr.update(visible=False), gr.update(visible=True)
+        else:
+            return gr.update(visible=True), gr.update(visible=False)
+
+    decode_method_radio.change(switch_slider,
+                               decode_method_radio,
+                               [max_symbols_per_step_slider, max_out_seq_len_slider])
+
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):
             uploaded_file = gr.Audio(
@@ -166,17 +308,53 @@ with demo:
                 fn=process_microphone,
             )
 
+        with gr.TabItem("From URL"):
+            url_textbox = gr.Textbox(
+                max_lines=1,
+                placeholder="URL to an audio file",
+                label="URL",
+                interactive=True,
+            )
+
+            url_button = gr.Button("Submit for recognition")
+            url_output = gr.Textbox(label="Recognized speech from URL")
+            url_html_info = gr.HTML(label="Info")
+
         upload_button.click(
             process_uploaded_file,
-            inputs=uploaded_file,
+            inputs=[
+                uploaded_file,
+                decode_method_radio,
+                beam_size,
+                max_symbols_per_step_slider,
+                max_out_seq_len_slider,
+            ],
             outputs=[uploaded_output, uploaded_html_info],
         )
 
         record_button.click(
             process_microphone,
-            inputs=microphone,
+            inputs=[
+                microphone,
+                decode_method_radio,
+                beam_size,
+                max_symbols_per_step_slider,
+                max_out_seq_len_slider,
+            ],
             outputs=[recorded_output, recorded_html_info],
         )
+
+        url_button.click(
+            process_url,
+            inputs=[
+                url_textbox,
+                decode_method_radio,
+                beam_size,
+                max_symbols_per_step_slider,
+                max_out_seq_len_slider,
+            ],
+            outputs=[url_output, url_html_info],
+        )
     gr.Markdown(description)
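A note on the new conversion helpers: convert_to_wav2 starts ffmpeg with subprocess.Popen and returns without waiting, so a caller can race ahead of the encoder and read a .wav that is not fully written yet. A minimal blocking sketch, not part of this commit (the function name is hypothetical):

    import os
    import subprocess

    def convert_to_wav_blocking(in_filename: str) -> str:
        out_filename = in_filename + ".wav"
        args = ["ffmpeg", "-hide_banner", "-i", in_filename, "-ar", "16000", out_filename]
        if os.path.exists(out_filename):
            args.insert(1, "-y")  # overwrite an existing output file
        # subprocess.run blocks until ffmpeg exits; check=True raises on failure
        subprocess.run(args, check=True)
        return out_filename

Passing an argument list to subprocess.run also sidesteps the shell-quoting pitfalls of the os.system calls in convert_to_wav1.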
decode.py ADDED
@@ -0,0 +1,44 @@
+import logging
+import tensorflow as tf
+from functools import lru_cache
+from uetasr.searchers import GreedyRNNT, BeamRNNT, ALSDBeamRNNT
+
+
+@lru_cache(maxsize=5)
+def get_searcher(
+    searcher_type: str,
+    decoder: tf.keras.Model,
+    jointer: tf.keras.Model,
+    text_decoder: tf.keras.layers.experimental.preprocessing.PreprocessingLayer,
+    beam_size: int,
+    max_symbols_per_step: int,
+    max_output_seq_length_ratio: float,
+):
+    common_kwargs = {
+        "decoder": decoder,
+        "jointer": jointer,
+        "text_decoder": text_decoder,
+        "return_scores": False,
+    }
+    if searcher_type == "greedy_search":
+        searcher = GreedyRNNT(
+            max_symbols_per_step=max_symbols_per_step,
+            **common_kwargs,
+        )
+    elif searcher_type == "beam_search":
+        searcher = BeamRNNT(
+            max_symbols_per_step=max_symbols_per_step,
+            beam=beam_size,
+            alpha=0.0,
+            **common_kwargs,
+        )
+    elif searcher_type == "alsd_search":
+        searcher = ALSDBeamRNNT(
+            fraction=max_output_seq_length_ratio,
+            beam_size=beam_size,
+            **common_kwargs,
+        )
+    else:
+        logging.info(f"Unknown searcher type: {searcher_type}")
+
+    return searcher
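Since get_searcher is wrapped in functools.lru_cache, repeated requests with the same decoding settings reuse an existing searcher rather than rebuilding one. A usage sketch mirroring the call site in model.py, assuming decoder, jointer, and text_encoder have been pulled from the hyperpyyaml config as _get_conformer_pre_trained_model does:

    # Components are assumed to come from the model config, as in model.py:
    #   decoder      = config["decoder_model"]
    #   jointer      = config["jointer_model"]
    #   text_encoder = config["text_encoder"]
    searcher = get_searcher(
        "beam_search",  # or "greedy_search" / "alsd_search"
        decoder,
        jointer,
        text_encoder,
        beam_size=5,
        max_symbols_per_step=5,
        max_output_seq_length_ratio=0.6,
    )

One caveat: an unknown searcher_type only logs a message and leaves searcher unbound, so the final return searcher raises UnboundLocalError; raising ValueError in that branch would fail more clearly.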
model.py CHANGED
@@ -5,6 +5,8 @@ from huggingface_hub import hf_hub_download
 from hyperpyyaml import load_hyperpyyaml
 from typing import Union
 
+from decode import get_searcher
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 
 
@@ -69,17 +71,20 @@ def _get_conformer_pre_trained_model(repo_id: str, checkpoint_dir: str = "checkp
         local_dir=os.path.dirname(__file__),  # noqa
         local_dir_use_symlinks=True,
     )
-    print(config_path)
+
     with open(config_path, "r") as f:
         config = load_hyperpyyaml(f)
 
     encoder_model = config["encoder_model"]
-    searcher = config["decoder"]
+    text_encoder = config["text_encoder"]
+    jointer = config["jointer_model"]
+    decoder = config["decoder_model"]
+    # searcher = config["decoder"]
     model = config["model"]
     audio_encoder = config["audio_encoder"]
     model.load_weights(os.path.join(checkpoint_dir, "avg_top5_27-32.ckpt")).expect_partial()
 
-    return audio_encoder, encoder_model, searcher, model
+    return audio_encoder, encoder_model, jointer, decoder, text_encoder, model
 
 
 def read_audio(in_filename: str):
@@ -90,16 +95,30 @@ def read_audio(in_filename: str):
 
 
 class UETASRModel:
-    def __init__(self, repo_id: str):
-        self.featurizer, self.encoder_model, self.searcher, self.model = _get_conformer_pre_trained_model(repo_id)
+    def __init__(
+        self,
+        repo_id: str,
+        decoding_method: str,
+        beam_size: int,
+        max_symbols_per_step: int,
+        max_output_seq_length_ratio: float,
+    ):
+        self.featurizer, self.encoder_model, jointer, decoder, text_encoder, self.model = _get_conformer_pre_trained_model(repo_id)
+        self.searcher = get_searcher(
+            decoding_method,
+            decoder,
+            jointer,
+            text_encoder,
+            beam_size,
+            max_symbols_per_step,
+            max_output_seq_length_ratio,
+        )
 
     def predict(self, in_filename: str):
         inputs = read_audio(in_filename)
         features = self.featurizer(inputs)
         features = self.model.cmvn(features) if self.model.use_cmvn else features
 
-        batch_size = tf.shape(features)[0]
-        dim = tf.shape(features)[-1]
         mask = tf.sequence_mask([tf.shape(features)[1]], maxlen=tf.shape(features)[1])
         mask = tf.expand_dims(mask, axis=1)
         encoder_outputs, encoder_masks = self.encoder_model(
@@ -111,7 +130,8 @@ class UETASRModel:
             axis=1
         )
 
-        outputs = self.searcher(encoder_outputs, features_length)
+        outputs = self.searcher.infer(encoder_outputs, features_length)
+        outputs = tf.squeeze(outputs)
         outputs = tf.compat.as_str_any(outputs.numpy())
 
         return outputs
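End to end, the new constructor threads the UI's decoding options straight into the searcher. A minimal usage sketch with the default values exposed in app.py (repo_id is assumed to be resolved the same way app.py does):

    # Hypothetical standalone use; app.py passes these same five arguments.
    recognizer = UETASRModel(
        repo_id,          # Hugging Face repo id, as resolved in app.py
        "greedy_search",  # decoding_method: greedy_search / beam_search / alsd_search
        5,                # beam_size (unused by greedy_search)
        5,                # max_symbols_per_step
        0.6,              # max_output_seq_length_ratio (used by alsd_search)
    )
    text = recognizer.predict("speech.wav")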
requirements.txt CHANGED
@@ -1,3 +1,2 @@
-uetasr @ git+https://github.com/thanhtvt/uetasr@v0.1.0-beta
-librosa
+uetasr @ git+https://github.com/thanhtvt/uetasr@v0.2.0
 requests==2.28.2