Yehor committed on
Commit
5545d25
·
1 Parent(s): 21e62d9
Files changed (8) hide show
  1. .dockerignore +2 -0
  2. .gitignore +5 -0
  3. Dockerfile +59 -0
  4. README.md +30 -6
  5. app.py +164 -0
  6. evaluation_results.jsonl +0 -0
  7. requirements-dev.txt +1 -0
  8. requirements.txt +5 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .ruff_cache/
2
+ .venv/
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .idea/
2
+ .venv/
3
+ .ruff_cache/
4
+
5
+ flagged/
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CUDA 11.7.1 + cuDNN 8 development image on Ubuntu 22.04.
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04

# Keep apt from prompting for input during the build.
ENV DEBIAN_FRONTEND=noninteractive

# System packages: VCS/download tools, the libraries pyenv needs to compile
# CPython from source, and ffmpeg for Gradio. The apt lists are removed in the
# same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies \
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    # gradio dependencies \
    ffmpeg \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*


# Run everything below as a non-root user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

# Install pyenv. `-f` makes curl exit non-zero on an HTTP error instead of
# piping an error page into bash; `-sS` hides the progress meter but keeps
# error messages; `-L` follows redirects.
RUN curl -fsSL https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
ARG PYTHON_VERSION=3.10.12
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install --no-cache-dir packaging ninja

# Copy and install the requirements before the application sources so the
# dependency layer is reused when only the application code changes.
COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt

COPY --chown=1000 . ${HOME}/app
ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,34 @@
1
  ---
2
- title: Evaluate Asr Outputs
3
- emoji: 📉
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: docker
7
- pinned: false
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: apache-2.0
3
+ title: Evaluate ASR outputs
 
 
4
  sdk: docker
5
+ emoji: 👀
6
+ colorFrom: green
7
+ colorTo: gray
8
+ short_description: 'Calculate WER/CER values from JSONL files made by ASR models'
9
  ---
10
 
11
+ ## Install
12
+
13
+ ```shell
14
+ uv venv --python 3.10
15
+
16
+ source .venv/bin/activate
17
+
18
+ uv pip install -r requirements.txt
19
+
20
+ # in development mode
21
+ uv pip install -r requirements-dev.txt
22
+ ```
23
+
24
+ ## Build image
25
+
26
+ ```shell
27
+ docker build -t evaluate-asr-outputs .
28
+ ```
29
+
30
+ ## Run
31
+
32
+ ```shell
33
+ docker run -it --rm -p 8888:7860 evaluate-asr-outputs
34
+ ```
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ from importlib.metadata import version
4
+
5
+ import evaluate
6
+ import polars as pl
7
+ import gradio as gr
8
+
9
# Metric evaluators from the `evaluate` library, loaded once at import time.
wer = evaluate.load("wer")
cer = evaluate.load("cer")

# Passed as `concurrency_limit` to the Calculate button's click handler.
concurrency_limit = 5

title = "Evaluate ASR Outputs"

# Markdown table built with https://www.tablesgenerator.com/markdown_tables
authors_table = """
## Authors

Follow them on social networks and **contact** if you need any help or have any questions:

| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram |
| https://x.com/yehor_smoliakov at X |
| https://github.com/egorsmkv at GitHub |
| https://huggingface.co/Yehor at Hugging Face |
| or use [email protected] |
""".strip()

# Rows offered by the examples widget: [path to a JSONL file, checkbox state].
examples = [["evaluation_results.jsonl", True]]

# Markdown rendered at the top of the page.
description_head = "\n".join(
    (
        f"# {title}",
        "",
        "## Overview",
        "",
        "Upload a JSONL file generated by the ASR model.",
    )
)

# Markdown rendered below the examples; it is just the authors table.
description_foot = authors_table

# Placeholder text for the metrics textbox before any calculation has run.
metrics_value = "Metrics will appear here."

# Runtime details displayed at the bottom of the page.
tech_env = "\n".join(
    (
        "#### Environment",
        "",
        f"- Python: {sys.version}",
    )
).strip()

tech_libraries = "\n".join(
    [
        "#### Libraries",
        "",
        *(
            f"- {package}: {version(package)}"
            for package in ("evaluate", "gradio", "jiwer", "polars")
        ),
    ]
)
67
+
68
def clean_value(x):
    """Normalize a transcript string before it is scored.

    The typographic apostrophe is replaced with a plain one, surrounding
    whitespace is stripped, the text is lowercased, and the characters
    ``, . ? ! – « »`` are deleted.

    Stripping happens *before* the deletion, so whitespace that was next to a
    removed character is kept (for example ``"end ."`` becomes ``"end "``).
    """
    # One deletion table replaces the chain of single-character replacements.
    deletions = str.maketrans("", "", ",.?!–«»")
    normalized = x.replace("’", "'").strip().lower()
    return normalized.translate(deletions)
70
+
71
def inference(file_name, clear_punctuation, progress=gr.Progress()):
    """Build a WER / CER / RTF report for a JSONL file of ASR results.

    Args:
        file_name: The uploaded JSONL file as delivered by the ``gr.File``
            input; it is handed directly to ``pl.read_ndjson``.
        clear_punctuation: When truthy, every prediction is passed through
            ``clean_value`` before scoring.
        progress: Gradio progress tracker.

    Returns:
        The report as a single newline-joined string.

    Raises:
        gr.Error: If no file was provided, if the file lacks one of the
            ``inference_total``, ``duration``, ``reference`` or ``prediction``
            fields, or if the total audio duration is zero.
    """
    if not file_name:
        raise gr.Error("Please upload your JSONL file.")

    progress(0, desc="Calculating...")

    df = pl.read_ndjson(file_name)

    # Fail with a readable message instead of an opaque column-lookup error.
    required_columns = ("inference_total", "duration", "reference", "prediction")
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise gr.Error(
            f"The JSONL file is missing required fields: {', '.join(missing_columns)}"
        )

    inference_seconds = df["inference_total"].sum()
    duration_seconds = df["duration"].sum()

    # RTF divides by the total audio duration, so zero would otherwise
    # surface as a bare ZeroDivisionError.
    if not duration_seconds:
        raise gr.Error("Total audio duration is zero, so RTF cannot be calculated.")

    # Real-time factor: processing time per second of audio.
    rtf = inference_seconds / duration_seconds

    # NOTE(review): references are never passed through `clean_value`, so with
    # the checkbox enabled they are presumably expected to be normalized
    # already; confirm against the data that produces these files.
    references = df["reference"]

    if clear_punctuation:
        # An explicit return dtype spares polars from inferring it.
        predictions = df["prediction"].map_elements(clean_value, return_dtype=pl.String)
    else:
        predictions = df["prediction"]

    # Evaluate
    wer_value = round(wer.compute(predictions=predictions, references=references), 4)
    cer_value = round(cer.compute(predictions=predictions, references=references), 4)

    # Rounded so float artefacts such as 87.66000000000001 do not appear.
    word_accuracy = round(100 - 100 * wer_value, 4)
    char_accuracy = round(100 - 100 * cer_value, 4)

    results = [
        "Metrics using `evaluate` library:",
        "",
        f"- WER: {wer_value} metric, {round(wer_value * 100, 4)}%",
        f"- CER: {cer_value} metric, {round(cer_value * 100, 4)}%",
        "",
        f"- Accuracy on words: {word_accuracy}%",
        f"- Accuracy on chars: {char_accuracy}%",
        "",
        (
            f"- Inference time: {round(inference_seconds, 4)} seconds, "
            f"{round(inference_seconds / 60, 4)} mins, "
            f"{round(inference_seconds / 60 / 60, 4)} hours"
        ),
        (
            f"- Audio duration: {round(duration_seconds, 4)} seconds, "
            f"{round(duration_seconds / 60, 4)} mins, "
            f"{round(duration_seconds / 60 / 60, 4)} hours"
        ),
        "",
        f"- RTF: {round(rtf, 4)}",
    ]

    return "\n".join(results)
120
+
121
+
122
# Page layout: header, the upload/options column next to the metrics box,
# the Calculate button, examples, then author and environment details.
# NOTE(review): the nesting below was reconstructed from a flattened diff in
# which indentation was lost; confirm that `metrics` sits beside the column
# inside the first row.
with gr.Blocks(
    title=title,
    analytics_enabled=False,
    theme=gr.themes.Base(),
) as demo:
    gr.Markdown(description_head)

    gr.Markdown("## Usage")

    with gr.Row():
        with gr.Column():
            jsonl_file = gr.File(label="A JSONL file")
            clear_punctuation = gr.Checkbox(
                label="Clear punctuation, some chars and convert to lowercase",
            )

        metrics = gr.Textbox(
            label="Metrics",
            placeholder=metrics_value,
            show_copy_button=True,
        )

    # Clicking the button runs `inference` on the two inputs and writes the
    # returned report into the metrics textbox.
    calculate_button = gr.Button("Calculate")
    calculate_button.click(
        inference,
        concurrency_limit=concurrency_limit,
        inputs=[jsonl_file, clear_punctuation],
        outputs=metrics,
    )

    with gr.Row():
        gr.Examples(
            label="Choose an example",
            inputs=[jsonl_file, clear_punctuation],
            examples=examples,
        )

    gr.Markdown(description_foot)

    gr.Markdown("### Gradio app uses:")
    gr.Markdown(tech_env)
    gr.Markdown(tech_libraries)

if __name__ == "__main__":
    demo.queue()
    demo.launch()
evaluation_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
requirements-dev.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ruff
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==5.18.0
2
+
3
+ evaluate==0.4.3
4
+ jiwer==3.1.0
5
+ polars==1.23.0