Yehor committed on
Commit
63747e9
·
1 Parent(s): d4be073
.dockerignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .ruff_cache/
2
+ .venv/
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .idea/
2
+ .venv/
3
+ .ruff_cache/
4
+ __pycache__/
5
+
6
+ flagged/
Dockerfile ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.13.2-bookworm
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+
5
+ RUN apt-get update && \
6
+ apt-get upgrade -y && \
7
+ apt-get install -y --no-install-recommends \
8
+ git \
9
+ git-lfs \
10
+ wget \
11
+ curl \
12
+ ca-certificates \
13
+ # python build dependencies \
14
+ build-essential \
15
+ libssl-dev \
16
+ zlib1g-dev \
17
+ libbz2-dev \
18
+ libreadline-dev \
19
+ libsqlite3-dev \
20
+ libncursesw5-dev \
21
+ xz-utils \
22
+ tk-dev \
23
+ libxml2-dev \
24
+ libxmlsec1-dev \
25
+ libffi-dev \
26
+ liblzma-dev \
27
+ # gradio dependencies \
28
+ ffmpeg \
29
+ && apt-get clean \
30
+ && rm -rf /var/lib/apt/lists/*
31
+
32
+ RUN python -m ensurepip --upgrade && python -m pip install --upgrade pip
33
+
34
+ RUN useradd -m -u 1001 hf-space
35
+ USER hf-space
36
+
37
+ ENV HOME=/home/hf-space \
38
+ PATH=/home/hf-space/.local/bin:${PATH} \
39
+ PYTHONPATH=/home/hf-space/app \
40
+ PYTHONUNBUFFERED=1 \
41
+ GRADIO_ALLOW_FLAGGING=never \
42
+ GRADIO_NUM_PORTS=1 \
43
+ GRADIO_SERVER_NAME=0.0.0.0 \
44
+ GRADIO_THEME=huggingface \
45
+ SYSTEM=spaces \
46
+ HF_HOME=/home/hf-space/app/hf-home
47
+
48
+ COPY --chown=hf-space:hf-space . ${HOME}/app
49
+
50
+ WORKDIR ${HOME}/app
51
+
52
+ RUN mkdir ${HF_HOME} && chmod a+rwx ${HF_HOME}
53
+
54
+ RUN pip install --no-cache-dir -r /home/hf-space/app/requirements.txt
55
+
56
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,44 @@
1
  ---
2
- title: See Asr Outputs
3
- emoji: 🔥
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.32.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: apache-2.0
3
+ title: See ASR outputs
4
+ sdk: docker
5
+ emoji: 👀
6
+ colorFrom: green
7
+ colorTo: gray
8
+ short_description: 'See generated JSONL files made by ASR models as a dataframe'
 
9
  ---
10
 
11
+ ## Install
12
+
13
+ ```shell
14
+ uv venv --python 3.13.2
15
+
16
+ source .venv/bin/activate
17
+
18
+ uv pip install -r requirements.txt
19
+
20
+ # in development mode
21
+ uv pip install -r requirements-dev.txt
22
+ ```
23
+
24
+ ## Development
25
+
26
+ Run app:
27
+
28
+ ```shell
29
+ gradio app.py
30
+ ```
31
+
32
+ ## Production
33
+
34
+ ### Build image
35
+
36
+ ```shell
37
+ docker build -t see-asr-outputs .
38
+ ```
39
+
40
+ ### Run
41
+
42
+ ```shell
43
+ docker run -it --rm -p 8888:7860 see-asr-outputs
44
+ ```
app.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys

from importlib.metadata import version

import evaluate
import polars as pl
import polars_distance as pld
import gradio as gr

# Load evaluators once at import time (downloads/caches the metric code via HF evaluate).
wer = evaluate.load("wer")
cer = evaluate.load("cer")

# Config
title = "See ASR Outputs"

# https://www.tablesgenerator.com/markdown_tables
authors_table = """
## Authors

Follow them on social networks and **contact** if you need any help or have any questions:

| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram |
| https://x.com/yehor_smoliakov at X |
| https://github.com/egorsmkv at GitHub |
| https://huggingface.co/Yehor at Hugging Face |
| or use [email protected] |
""".strip()

# Each row maps to the UI inputs: [jsonl_file, batch_mode, calculate_distance, calculate_metrics].
examples = [
    ["evaluation_results.jsonl", False, True, False],
    ["evaluation_results_batch.jsonl", True, False, False],
]

description_head = f"""
# {title}

## Overview

See generated JSONL files made by ASR models as a dataframe. Also, this app calculates WER and CER metrics for each row.
""".strip()

description_foot = f"""
{authors_table}
""".strip()

# NOTE(review): `metrics_value` is not referenced anywhere in this file — possibly
# a leftover placeholder; confirm before removing.
metrics_value = """
Metrics will appear here.
""".strip()

# Shown at the bottom of the page: runtime environment info.
tech_env = f"""
#### Environment

- Python: {sys.version}
""".strip()

# Shown at the bottom of the page: pinned library versions, read from installed metadata.
tech_libraries = f"""
#### Libraries

- gradio: {version("gradio")}
- jiwer: {version("jiwer")}
- evaluate: {version("evaluate")}
- pandas: {version("pandas")}
- polars: {version("polars")}
- polars-distance: {version("polars_distance")}
""".strip()
69
+
70
+
71
def compute_wer(prediction, reference):
    """Return the word error rate for one prediction/reference pair, rounded to 4 places."""
    score = wer.compute(predictions=[prediction], references=[reference])
    return round(score, 4)
73
+
74
+
75
def compute_cer(prediction, reference):
    """Return the character error rate for one prediction/reference pair, rounded to 4 places."""
    score = cer.compute(predictions=[prediction], references=[reference])
    return round(score, 4)
77
+
78
+
79
def process_file(file_name, _batch_mode, _calculate_distance, _calculate_metrics):
    """Read an ASR-evaluation JSONL file and return a Polars DataFrame for display.

    Args:
        file_name: Path to the uploaded JSONL (newline-delimited JSON) file.
        _batch_mode: If True, expect the batch schema (list-valued columns:
            ``filenames``/``durations``/``references``/``predictions``) and
            flatten it to one row per utterance.
        _calculate_distance: If True, add a Levenshtein ``distance`` column
            (via polars-distance) as the first column.
        _calculate_metrics: If True, add per-row ``wer``/``cer`` columns.

    Returns:
        A Polars DataFrame with ``prediction``/``reference`` columns plus any
        requested metric/distance columns.

    Raises:
        gr.Error: If no file was provided or required columns are missing.
    """
    if not file_name:
        # Bug fix: the message previously said "paste your JSON file", but the
        # input is an uploaded JSONL file (consistent with the messages below).
        raise gr.Error("Please upload your JSONL file.")

    df = pl.read_ndjson(file_name)

    # Per-utterance schema: one row per audio file.
    required_columns = [
        "filename",
        "inference_start",
        "inference_end",
        "inference_total",
        "duration",
        "reference",
        "prediction",
    ]
    # Batch schema: one row per batch, with list-valued columns.
    required_columns_batch = [
        "inference_start",
        "inference_end",
        "inference_total",
        "filenames",
        "durations",
        "references",
        "predictions",
    ]

    required = required_columns_batch if _batch_mode else required_columns
    if not all(col in df.columns for col in required):
        raise gr.Error(
            f"Please provide a JSONL file with the following columns: {required}"
        )

    # Timing columns and file names are not shown in the table.
    drop_columns = ["inference_total", "inference_start", "inference_end"]
    drop_columns.append("filenames" if _batch_mode else "filename")
    df = df.drop(drop_columns)

    if _batch_mode:
        # Flatten list-valued batch rows into one (prediction, reference) pair per row.
        predictions = []
        references = []
        for row in df.iter_rows(named=True):
            for idx, prediction in enumerate(row["predictions"]):
                predictions.append(prediction)
                references.append(row["references"][idx])

        df = pl.DataFrame(
            {
                "prediction": predictions,
                "reference": references,
            }
        )

    if _calculate_metrics:
        # Pandas is needed for applying Python functions row-wise.
        df_pd = df.to_pandas()

        df_pd["wer"] = df_pd.apply(
            lambda row: compute_wer(row["prediction"], row["reference"]),
            axis=1,
        )
        df_pd["cer"] = df_pd.apply(
            lambda row: compute_cer(row["prediction"], row["reference"]),
            axis=1,
        )

        fields = [
            "wer",
            "cer",
            "prediction",
            "reference",
        ]

        df = pl.DataFrame(df_pd)
    else:
        fields = [
            "prediction",
            "reference",
        ]

    df = df.select(fields)

    if _calculate_distance:
        df = df.with_columns(
            pld.col("prediction").dist_str.levenshtein("reference").alias("distance")
        )

        # Put the distance column in the first position.
        fields = [
            "distance",
            *fields,
        ]

        df = df.select(fields)

    return df
185
+
186
+
187
# Gradio UI: a Blocks layout with a results table, a file upload plus three
# option checkboxes, example inputs, and static footer markdown.
demo = gr.Blocks(
    title=title,
    analytics_enabled=False,
    theme=gr.themes.Base(),
)

with demo:
    gr.Markdown(description_head)

    gr.Markdown("## Usage")

    # Output table; pinned first column keeps row labels visible while scrolling.
    with gr.Row():
        df = gr.DataFrame(
            label="Dataframe",
            show_search="search",
            show_row_numbers=True,
            pinned_columns=1,
        )

    with gr.Row():
        with gr.Column():
            jsonl_file = gr.File(label="A JSONL file")

            # Options map 1:1 onto process_file's parameters (in order).
            batch_mode = gr.Checkbox(
                label="Use batch mode",
            )

            calculate_distance = gr.Checkbox(
                label="Calculate Levenshtein distance",
                value=False,
            )

            calculate_metrics = gr.Checkbox(
                label="Calculate WER/CER metrics",
                value=False,
            )

    gr.Button("Show").click(
        process_file,
        inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
        outputs=df,
    )

    # Pre-filled example rows (see `examples` above) for quick testing.
    with gr.Row():
        gr.Examples(
            label="Choose an example",
            inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
            examples=examples,
        )

    gr.Markdown(description_foot)

    gr.Markdown("### Gradio app uses:")
    gr.Markdown(tech_env)
    gr.Markdown(tech_libraries)

if __name__ == "__main__":
    demo.queue()
    demo.launch()
evaluation_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_results_batch.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
justfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ check:
2
+ ruff check
3
+
4
+ fmt: check
5
+ ruff format
requirements-dev.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ruff
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==5.23.0
2
+
3
+ polars==1.27.0
4
+ polars-distance==0.5.2
5
+ evaluate==0.4.3
6
+ jiwer==3.1.0
ruff.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [lint]
2
+ ignore = ["F403"]