Spaces:

Yehor
/

evaluate-asr-outputs

Running

App Files Files Community

Yehor committed on Feb 25

Commit

5545d25

1 Parent(s): 21e62d9

Init

Browse files

Files changed (8) hide show

.dockerignore +2 -0
.gitignore +5 -0
Dockerfile +59 -0
README.md +30 -6
app.py +164 -0
evaluation_results.jsonl +0 -0
requirements-dev.txt +1 -0
requirements.txt +5 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .ruff_cache/
2	+ .venv/

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.idea/
+.venv/
+.ruff_cache/
+flagged/

Dockerfile ADDED Viewed

	@@ -0,0 +1,59 @@

# Image for the "Evaluate ASR outputs" Gradio app.
# Builds a pinned CPython with pyenv under an unprivileged user (uid 1000),
# installs the Python requirements and starts app.py.
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04

# Keep apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies \
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    # gradio dependencies \
    ffmpeg \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Everything below runs as a non-root user.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

# Install pyenv. -f makes curl fail on HTTP errors instead of piping an error
# page into bash; -sSL keeps the output quiet and follows redirects.
RUN curl -fsSL https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}

# Build the requested interpreter and make it the default `python`.
ARG PYTHON_VERSION=3.10.12
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install --no-cache-dir packaging ninja

# Copy the requirements first so the dependency layer is cached independently
# of the application sources.
COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt

COPY --chown=1000 . ${HOME}/app

ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces

# Port the README's `docker run -p 8888:7860` example maps to.
EXPOSE 7860

CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,10 +1,34 @@
 ---
-title: Evaluate Asr Outputs
-emoji: 📉
-colorFrom: blue
-colorTo: purple
 sdk: docker
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+license: apache-2.0
+title: Evaluate ASR outputs
 sdk: docker
+emoji: 👀
+colorFrom: green
+colorTo: gray
+short_description: 'Calculate WER/CER values from JSONL files made by ASR models'
 ---
+## Install
+```shell
+uv venv --python 3.10
+source .venv/bin/activate
+uv pip install -r requirements.txt
+# in development mode
+uv pip install -r requirements-dev.txt
+```
+## Build image
+```shell
+docker build -t evaluate-asr-outputs .
+```
+## Run
+```shell
+docker run -it --rm -p 8888:7860 evaluate-asr-outputs
+```

app.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import sys
+from importlib.metadata import version
+import evaluate
+import polars as pl
+import gradio as gr
# Metric implementations, loaded once when the module is imported.
wer = evaluate.load("wer")
cer = evaluate.load("cer")

# Application settings.
concurrency_limit = 5
title = "Evaluate ASR Outputs"

# Author contact table (Markdown), generated with
# https://www.tablesgenerator.com/markdown_tables
authors_table = """
## Authors
Follow them on social networks and **contact** if you need any help or have any questions:
| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram                                                                  |
| https://x.com/yehor_smoliakov at X                                                              |
| https://github.com/egorsmkv at GitHub                                                           |
| https://huggingface.co/Yehor at Hugging Face                                                    |
| or use [email protected]                                                                       |
""".strip()

# Rows offered by the examples widget: [path to a JSONL file, checkbox state].
examples = [["evaluation_results.jsonl", True]]

# Markdown shown above the input widgets.
description_head = "\n".join(
    [
        f"# {title}",
        "## Overview",
        "Upload a JSONL file generated by the ASR model.",
    ]
)

# Markdown shown below the widgets; it is just the (already stripped) authors table.
description_foot = authors_table

# Placeholder text of the metrics box before the first calculation.
metrics_value = "Metrics will appear here."

# Interpreter and library versions displayed at the bottom of the page.
tech_env = f"#### Environment\n- Python: {sys.version}".strip()
tech_libraries = "\n".join(
    ["#### Libraries"]
    + [f"- {pkg}: {version(pkg)}" for pkg in ("evaluate", "gradio", "jiwer", "polars")]
).strip()
# Single-pass translation table: the typographic apostrophe becomes an ASCII
# one, and the listed punctuation marks are deleted.
_CLEAN_TABLE = str.maketrans({"’": "'", **dict.fromkeys(",.?!–«»")})


def clean_value(x):
    """Normalize a transcript for metric calculation.

    Replaces the typographic apostrophe with ``'``, removes the characters
    ``, . ? ! – « »``, lowercases the text, and collapses whitespace.

    Whitespace is normalized *after* the punctuation is removed, so deleting a
    standalone mark (for example the dash in ``"a – b"``) does not leave a
    double or trailing space behind that would count as character errors.

    Args:
        x: The text to normalize.

    Returns:
        The normalized text.
    """
    lowered = x.translate(_CLEAN_TABLE).lower()
    # split()/join trims both ends and squeezes inner runs of whitespace.
    return " ".join(lowered.split())
# Fields every record of the uploaded JSONL file must provide.
_REQUIRED_COLUMNS = ("inference_total", "duration", "reference", "prediction")


def _format_metrics_report(wer_value, cer_value, inference_seconds, duration_seconds):
    """Render the computed metrics as the text shown in the "Metrics" box."""
    rtf = inference_seconds / duration_seconds
    lines = [
        "Metrics using `evaluate` library:",
        "",
        f"- WER: {wer_value} metric, {round(wer_value*100, 4)}%",
        f"- CER: {cer_value} metric, {round(cer_value*100, 4)}%",
        "",
        # Rounded so float artefacts such as 87.66000000000001 are not shown.
        f"- Accuracy on words: {round(100 - 100 * wer_value, 4)}%",
        f"- Accuracy on chars: {round(100 - 100 * cer_value, 4)}%",
        "",
        f"- Inference time: {round(inference_seconds, 4)} seconds, {round(inference_seconds/60, 4)} mins, {round(inference_seconds/60/60, 4)} hours",
        f"- Audio duration: {round(duration_seconds, 4)} seconds, {round(duration_seconds/60/60, 4)} hours",
        "",
        f"- RTF: {round(rtf, 4)}",
    ]
    return "\n".join(lines)


def inference(file_name, clear_punctuation, progress=gr.Progress()):
    """Compute WER, CER and RTF for an uploaded JSONL file of ASR results.

    Args:
        file_name: Path of the uploaded JSONL file. Each record must contain
            the fields ``inference_total``, ``duration``, ``reference`` and
            ``prediction``.
        clear_punctuation: When true, texts are passed through ``clean_value``
            before scoring.
        progress: Gradio progress tracker.

    Returns:
        A multi-line string with the metrics report.

    Raises:
        gr.Error: If no file was given, a required field is missing, or the
            total audio duration is zero.
    """
    if not file_name:
        raise gr.Error("Please upload your JSONL file.")

    progress(0, desc="Calculating...")

    df = pl.read_ndjson(file_name)

    missing = [name for name in _REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        raise gr.Error(f"The file is missing required fields: {', '.join(missing)}")

    inference_seconds = df["inference_total"].sum()
    duration_seconds = df["duration"].sum()
    if not duration_seconds:
        # RTF divides by the total duration; report this instead of crashing.
        raise gr.Error("Total audio duration is zero, so RTF cannot be calculated.")

    references = df["reference"]
    predictions = df["prediction"]
    if clear_punctuation:
        # Normalize both sides: cleaning only the predictions would turn every
        # reference that contains punctuation, capitals or a typographic
        # apostrophe into a guaranteed mismatch.
        references = references.map_elements(clean_value, return_dtype=pl.String)
        predictions = predictions.map_elements(clean_value, return_dtype=pl.String)

    # Hand plain Python lists to the metric implementations.
    references = references.to_list()
    predictions = predictions.to_list()

    # Evaluate
    wer_value = round(wer.compute(predictions=predictions, references=references), 4)
    cer_value = round(cer.compute(predictions=predictions, references=references), 4)

    return _format_metrics_report(
        wer_value, cer_value, inference_seconds, duration_seconds
    )
# Build the Gradio page: header, input column + metrics box, the button that
# triggers `inference`, an example picker, and the footer sections.
with gr.Blocks(
    title=title,
    analytics_enabled=False,
    theme=gr.themes.Base(),
) as demo:
    gr.Markdown(description_head)
    gr.Markdown("## Usage")

    with gr.Row():
        # Left: the inputs. Right: the read-out of computed metrics.
        with gr.Column():
            jsonl_file = gr.File(label="A JSONL file")
            clear_punctuation = gr.Checkbox(
                label="Clear punctuation, some chars and convert to lowercase",
            )
        metrics = gr.Textbox(
            label="Metrics",
            placeholder=metrics_value,
            show_copy_button=True,
        )

    calculate_button = gr.Button("Calculate")
    calculate_button.click(
        fn=inference,
        inputs=[jsonl_file, clear_punctuation],
        outputs=metrics,
        concurrency_limit=concurrency_limit,
    )

    with gr.Row():
        gr.Examples(
            label="Choose an example",
            inputs=[jsonl_file, clear_punctuation],
            examples=examples,
        )

    # Footer: authors table followed by the environment/library versions.
    for footer_section in (
        description_foot,
        "### Gradio app uses:",
        tech_env,
        tech_libraries,
    ):
        gr.Markdown(footer_section)

# Start the server only when executed as a script.
if __name__ == "__main__":
    demo.queue()
    demo.launch()

evaluation_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements-dev.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ruff

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio==5.18.0
+evaluate==0.4.3
+jiwer==3.1.0
+polars==1.23.0