thanhtvt
commited on
Commit
•
a0dfd75
1
Parent(s):
edc41a0
demo of v0.1.0-beta release
Browse files- .gitignore +4 -0
- app.py +188 -0
- examples.py +5 -0
- model.py +117 -0
- requirements.txt +3 -0
- test_wavs/2022_1004_00001300_00002239.wav +0 -0
- test_wavs/2022_1004_00087158_00087929.wav +0 -0
- test_wavs/2022_1008_00110083_00110571.wav +0 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
checkpoints/
|
3 |
+
vocabs/
|
4 |
+
*.yaml
|
app.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import librosa
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
import soundfile as sf
|
6 |
+
import tensorflow as tf
|
7 |
+
|
8 |
+
from datetime import datetime
|
9 |
+
from time import time
|
10 |
+
|
11 |
+
from examples import examples
|
12 |
+
from model import UETASRModel
|
13 |
+
|
14 |
+
|
15 |
+
def get_duration(filename: str) -> float:
    """Return the duration of an audio file in seconds.

    librosa >= 0.10 renamed the ``filename`` keyword of
    ``librosa.get_duration`` to ``path`` and removed the old spelling, so
    try the new keyword first and fall back for older installations
    (requirements.txt does not pin a librosa version).
    """
    try:
        return librosa.get_duration(path=filename)
    except TypeError:
        # Older librosa (< 0.10) only knows the `filename` keyword.
        return librosa.get_duration(filename=filename)
|
17 |
+
|
18 |
+
|
19 |
+
def convert_to_wav(in_filename: str) -> str:
    """Re-encode an audio file as a 16 kHz mono WAV next to the input.

    The output file shares the input's base name with a ``.wav`` suffix
    (an input that is already ``.wav`` is rewritten in place).  Returns
    the path of the written file.
    """
    base, _ext = os.path.splitext(in_filename)
    out_filename = base + ".wav"
    logging.info(f"Converting {in_filename} to {out_filename}")
    # librosa resamples to the model's expected 16 kHz rate while loading.
    samples, rate = librosa.load(in_filename, sr=16000)
    sf.write(out_filename, samples, rate)
    return out_filename
|
25 |
+
|
26 |
+
|
27 |
+
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap *s* in the demo's result-panel HTML.

    ``style`` selects a CSS class defined in the page-level ``css`` string:
    ``result_item_success`` (green panel) or ``result_item_error`` (red
    panel).  Returns the HTML snippet for a ``gr.HTML`` component.
    """
    return f"""
    <div class='result'>
        <div class='result_item {style}'>
        {s}
        </div>
    </div>
    """
|
35 |
+
|
36 |
+
|
37 |
+
def process_uploaded_file(in_filename: str):
    """Recognize speech from a file uploaded through the UI.

    Returns a ``(transcript, info_html)`` pair.  On a missing file or a
    recognition failure the transcript is empty and the info panel shows
    the error in the red style.
    """
    if not in_filename:
        # Nothing was uploaded yet; tell the user what to do.
        return "", build_html_output(
            "Please first upload a file and then click "
            'the button "submit for recognition"',
            "result_item_error",
        )

    logging.info(f"Processing uploaded file: {in_filename}")
    try:
        return process(in_filename=in_filename)
    except Exception as exc:
        logging.error(str(exc))
        return "", build_html_output(str(exc), "result_item_error")
|
51 |
+
|
52 |
+
|
53 |
+
def process_microphone(in_filename: str):
    """Recognize speech from a microphone recording.

    Returns a ``(transcript, info_html)`` pair.  On a missing recording or
    a recognition failure the transcript is empty and the info panel shows
    the error in the red style.
    """
    if in_filename is None or in_filename == "":
        # BUG FIX: this handler serves the microphone tab, but the message
        # previously told the user to "upload a file".
        return "", build_html_output(
            "Please first record your speech and then click "
            'the button "submit for recognition"',
            "result_item_error",
        )

    logging.info(f"Processing microphone: {in_filename}")
    try:
        return process(in_filename=in_filename)
    except Exception as e:
        logging.error(str(e))
        return "", build_html_output(str(e), "result_item_error")
|
67 |
+
|
68 |
+
|
69 |
+
def process(in_filename: str):
    """Run ASR on *in_filename* and return ``(transcript, info_html)``.

    The input is first converted to a 16 kHz mono WAV, then transcribed by
    the UETASR conformer model.  Wall-clock timing and the real-time
    factor (RTF = processing time / audio duration) are reported in the
    returned HTML panel.
    """
    logging.info(f"in_filename: {in_filename}")

    filename = convert_to_wav(in_filename)

    start_dt = datetime.now()
    logging.info(f"Started at {start_dt.strftime('%d/%m/%Y, %H:%M:%S.%f')}")

    repo_id = "thanhtvt/uetasr-conformer_30.3m"

    start = time()

    # Construction is cheap after the first call — the download and weight
    # loading behind UETASRModel are cached at module level.
    recognizer = UETASRModel(repo_id)
    text = recognizer.predict(filename)

    # BUG FIX: the finish timestamp previously reformatted the *start*
    # datetime, so "Finished at ..." always logged the start time.
    end_dt = datetime.now()
    end = time()

    duration = get_duration(filename)
    rtf = (end - start) / duration

    date_time = end_dt.strftime("%d/%m/%Y, %H:%M:%S.%f")
    # BUG FIX: dropped the stray " s" that followed the timestamp.
    logging.info(f"Finished at {date_time}. Elapsed: {end - start: .3f} s")

    info = f"""
    Wave duration : {duration: .3f} s <br/>
    Processing time: {end - start: .3f} s <br/>
    RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
    """
    if rtf > 1:
        # First request pays the model download/initialization cost.
        info += (
            "<br/>We are loading the model for the first run. "
            "Please run again to measure the real RTF.<br/>"
        )

    logging.info(info)

    return text, build_html_output(info)
|
107 |
+
|
108 |
+
|
109 |
+
title = "Vietnamese Automatic Speech Recognition with UETASR"
description = """
This space shows how to use UETASR for Vietnamese Automatic Speech Recognition.

It is running on CPU provided by Hugging Face 🤗

See more information by visiting the [Github repository](https://github.com/thanhtvt/uetasr/)
"""

# css style is copied from
# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

demo = gr.Blocks(css=css)


with demo:
    gr.Markdown(title)

    with gr.Tabs():
        with gr.TabItem("Upload from disk"):
            uploaded_file = gr.Audio(
                source="upload",  # Choose between "microphone", "upload"
                type="filepath",
                label="Upload from disk",
            )
            upload_button = gr.Button("Submit for recognition")
            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
            uploaded_html_info = gr.HTML(label="Info")

            gr.Examples(
                examples=examples,
                inputs=uploaded_file,
                outputs=[uploaded_output, uploaded_html_info],
                fn=process_uploaded_file,
            )

        with gr.TabItem("Record from microphone"):
            microphone = gr.Audio(
                source="microphone",
                type="filepath",
                label="Record from microphone",
            )

            record_button = gr.Button("Submit for recognition")
            recorded_output = gr.Textbox(label="Recognized speech from recordings")
            recorded_html_info = gr.HTML(label="Info")

            gr.Examples(
                examples=examples,
                inputs=microphone,
                # BUG FIX: the microphone tab's examples previously wrote
                # their results into the *upload* tab's output widgets.
                outputs=[recorded_output, recorded_html_info],
                fn=process_microphone,
            )

    upload_button.click(
        process_uploaded_file,
        inputs=uploaded_file,
        outputs=[uploaded_output, uploaded_html_info],
    )

    record_button.click(
        process_microphone,
        inputs=microphone,
        outputs=[recorded_output, recorded_html_info],
    )
    gr.Markdown(description)


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)

    demo.launch(share=True)
|
examples.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Sample Vietnamese speech clips shipped with the demo, shown as one-click
# examples in both the upload and microphone tabs.
_TEST_WAV_DIR = "./test_wavs"

examples = [
    f"{_TEST_WAV_DIR}/{stem}.wav"
    for stem in (
        "2022_1004_00001300_00002239",
        "2022_1004_00087158_00087929",
        "2022_1008_00110083_00110571",
    )
]
|
model.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import tensorflow as tf
|
3 |
+
from functools import lru_cache
|
4 |
+
from huggingface_hub import hf_hub_download
|
5 |
+
from hyperpyyaml import load_hyperpyyaml
|
6 |
+
from typing import Union
|
7 |
+
|
8 |
+
# Hide all CUDA devices so TensorFlow runs on CPU (the hosting Space has no
# GPU; this also avoids GPU probing at import time).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
9 |
+
|
10 |
+
|
11 |
+
def _get_checkpoint_filename(
    repo_id: str,
    filename: str,
    local_dir: str = None,
    local_dir_use_symlinks: Union[bool, str] = "auto",
    subfolder: str = "checkpoints"
) -> str:
    """Fetch one checkpoint file from the Hub and return its local path.

    Thin wrapper over ``hf_hub_download`` that defaults the subfolder to
    the repo's ``checkpoints`` directory.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
        local_dir=local_dir,
        local_dir_use_symlinks=local_dir_use_symlinks,
    )
|
26 |
+
|
27 |
+
|
28 |
+
def _get_bpe_model_filename(
    repo_id: str,
    filename: str,
    local_dir: str = None,
    local_dir_use_symlinks: Union[bool, str] = "auto",
    subfolder: str = "vocabs"
) -> str:
    """Fetch one BPE/vocabulary file from the Hub and return its local path.

    Thin wrapper over ``hf_hub_download`` that defaults the subfolder to
    the repo's ``vocabs`` directory.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
        local_dir=local_dir,
        local_dir_use_symlinks=local_dir_use_symlinks,
    )
|
43 |
+
|
44 |
+
|
45 |
+
@lru_cache(maxsize=1)
def _get_conformer_pre_trained_model(repo_id: str, checkpoint_dir: str = "checkpoints"):
    """Download and build the pre-trained UETASR conformer from the Hub.

    Cached with ``maxsize=1`` so the expensive downloads and weight loading
    run once per process; every ``UETASRModel`` construction afterwards
    reuses the same objects.

    Returns the tuple ``(audio_encoder, encoder_model, searcher, model)``
    instantiated from the repo's ``config.yaml``.
    """
    # A TF checkpoint is split across two files: the index and a data shard.
    for postfix in ["index", "data-00000-of-00001"]:
        tmp = _get_checkpoint_filename(
            repo_id=repo_id,
            filename="avg_top5_27-32.ckpt.{}".format(postfix),
            subfolder=checkpoint_dir,
            local_dir=os.path.dirname(__file__),  # noqa
            local_dir_use_symlinks=True,
        )
        print(tmp)

    # BPE tokenizer assets: the SentencePiece model plus its vocab listing.
    for postfix in ["model", "vocab"]:
        tmp = _get_bpe_model_filename(
            repo_id=repo_id,
            filename="subword_vietnamese_500.{}".format(postfix),
            local_dir=os.path.dirname(__file__),  # noqa
            local_dir_use_symlinks=True,
        )
        print(tmp)

    config_path = hf_hub_download(
        repo_id=repo_id,
        filename="config.yaml",
        local_dir=os.path.dirname(__file__),  # noqa
        local_dir_use_symlinks=True,
    )
    print(config_path)
    # hyperpyyaml instantiates the Python objects referenced by config.yaml.
    with open(config_path, "r") as f:
        config = load_hyperpyyaml(f)

    encoder_model = config["encoder_model"]
    searcher = config["decoder"]
    model = config["model"]
    audio_encoder = config["audio_encoder"]
    # NOTE(review): this path is relative to the CWD while the checkpoint was
    # materialized next to this module — presumably the app runs from the
    # module directory; confirm before changing the working directory.
    model.load_weights(os.path.join(checkpoint_dir, "avg_top5_27-32.ckpt")).expect_partial()

    return audio_encoder, encoder_model, searcher, model
|
83 |
+
|
84 |
+
|
85 |
+
def read_audio(in_filename: str):
    """Decode a WAV file into a float tensor of shape (1, n_samples).

    Assumes a mono recording: ``tf.squeeze(..., axis=-1)`` raises for
    multi-channel input.
    """
    raw = tf.io.read_file(in_filename)
    # decode_wav yields (samples, sample_rate); samples are (n_samples, n_channels).
    waveform, _sample_rate = tf.audio.decode_wav(raw)
    mono = tf.squeeze(waveform, axis=-1)
    # Add a leading batch dimension for the model.
    return tf.expand_dims(mono, axis=0)
|
90 |
+
|
91 |
+
|
92 |
+
class UETASRModel:
    """Inference wrapper around the pre-trained UETASR conformer.

    Holds the feature extractor, acoustic encoder, search decoder and the
    full model downloaded from *repo_id* (shared process-wide through the
    ``lru_cache`` on ``_get_conformer_pre_trained_model``).
    """

    def __init__(self, repo_id: str):
        # The repo's "audio_encoder" object acts as the featurizer
        # (waveform -> acoustic features).
        self.featurizer, self.encoder_model, self.searcher, self.model = _get_conformer_pre_trained_model(repo_id)

    def predict(self, in_filename: str):
        """Transcribe the WAV file at *in_filename* and return the text."""
        inputs = read_audio(in_filename)
        features = self.featurizer(inputs)
        # Apply cepstral mean/variance normalization only when the model
        # carries CMVN statistics.
        features = self.model.cmvn(features) if self.model.use_cmvn else features

        batch_size = tf.shape(features)[0]  # NOTE(review): unused, as is `dim`
        dim = tf.shape(features)[-1]
        # Single-utterance batch: every frame is valid, so the mask is
        # all-True over the time axis.
        mask = tf.sequence_mask([tf.shape(features)[1]], maxlen=tf.shape(features)[1])
        mask = tf.expand_dims(mask, axis=1)
        encoder_outputs, encoder_masks = self.encoder_model(
            features, mask, training=False)

        # Count the valid encoder frames from the (possibly subsampled)
        # output mask — this is the length the searcher needs.
        encoder_mask = tf.squeeze(encoder_masks, axis=1)
        features_length = tf.math.reduce_sum(
            tf.cast(encoder_mask, tf.int32),
            axis=1
        )

        outputs = self.searcher(encoder_outputs, features_length)
        # The searcher returns a string tensor; convert to a Python str.
        outputs = tf.compat.as_str_any(outputs.numpy())

        return outputs
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
uetasr @ git+https://github.com/thanhtvt/[email protected]
|
2 |
+
librosa
|
3 |
+
requests==2.28.2
|
test_wavs/2022_1004_00001300_00002239.wav
ADDED
Binary file (301 kB). View file
|
|
test_wavs/2022_1004_00087158_00087929.wav
ADDED
Binary file (247 kB). View file
|
|
test_wavs/2022_1008_00110083_00110571.wav
ADDED
Binary file (156 kB). View file
|
|