Yehor committed on
Commit
63747e9
·
1 Parent(s): d4be073
.dockerignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .ruff_cache/
2
+ .venv/
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .idea/
2
+ .venv/
3
+ .ruff_cache/
4
+ __pycache__/
5
+
6
+ flagged/
Dockerfile ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.13.2-bookworm
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+
5
+ RUN apt-get update && \
6
+ apt-get upgrade -y && \
7
+ apt-get install -y --no-install-recommends \
8
+ git \
9
+ git-lfs \
10
+ wget \
11
+ curl \
12
+ ca-certificates \
13
+ # python build dependencies \
14
+ build-essential \
15
+ libssl-dev \
16
+ zlib1g-dev \
17
+ libbz2-dev \
18
+ libreadline-dev \
19
+ libsqlite3-dev \
20
+ libncursesw5-dev \
21
+ xz-utils \
22
+ tk-dev \
23
+ libxml2-dev \
24
+ libxmlsec1-dev \
25
+ libffi-dev \
26
+ liblzma-dev \
27
+ # gradio dependencies \
28
+ ffmpeg \
29
+ && apt-get clean \
30
+ && rm -rf /var/lib/apt/lists/*
31
+
32
+ RUN python -m ensurepip --upgrade && python -m pip install --upgrade pip
33
+
34
+ RUN useradd -m -u 1001 hf-space
35
+ USER hf-space
36
+
37
+ ENV HOME=/home/hf-space \
38
+ PATH=/home/hf-space/.local/bin:${PATH} \
39
+ PYTHONPATH=/home/hf-space/app \
40
+ PYTHONUNBUFFERED=1 \
41
+ GRADIO_ALLOW_FLAGGING=never \
42
+ GRADIO_NUM_PORTS=1 \
43
+ GRADIO_SERVER_NAME=0.0.0.0 \
44
+ GRADIO_THEME=huggingface \
45
+ SYSTEM=spaces \
46
+ HF_HOME=/home/hf-space/app/hf-home
47
+
48
+ COPY --chown=hf-space:hf-space . ${HOME}/app
49
+
50
+ WORKDIR ${HOME}/app
51
+
52
+ RUN mkdir ${HF_HOME} && chmod a+rwx ${HF_HOME}
53
+
54
+ RUN pip install --no-cache-dir -r /home/hf-space/app/requirements.txt
55
+
56
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,44 @@
1
  ---
2
- title: See Asr Outputs
3
- emoji: 🔥
4
- colorFrom: blue
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.32.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: apache-2.0
3
+ title: See ASR outputs
4
+ sdk: docker
5
+ emoji: 👀
6
+ colorFrom: green
7
+ colorTo: gray
8
+ short_description: 'See generated JSONL files made by ASR models as a dataframe'
 
9
  ---
10
 
11
+ ## Install
12
+
13
+ ```shell
14
+ uv venv --python 3.13.2
15
+
16
+ source .venv/bin/activate
17
+
18
+ uv pip install -r requirements.txt
19
+
20
+ # in development mode
21
+ uv pip install -r requirements-dev.txt
22
+ ```
23
+
24
+ ## Development
25
+
26
+ Run app:
27
+
28
+ ```shell
29
+ gradio app.py
30
+ ```
31
+
32
+ ## Production
33
+
34
+ ### Build image
35
+
36
+ ```shell
37
+ docker build -t see-asr-outputs .
38
+ ```
39
+
40
+ ### Run
41
+
42
+ ```shell
43
+ docker run -it --rm -p 8888:7860 see-asr-outputs
44
+ ```
app.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys

from importlib.metadata import version

import evaluate
import polars as pl
import polars_distance as pld
import gradio as gr

# Load evaluators once at import time (downloads/caches the metric code via HF evaluate).
wer = evaluate.load("wer")
cer = evaluate.load("cer")

# Config
title = "See ASR Outputs"

# https://www.tablesgenerator.com/markdown_tables
authors_table = """
## Authors

Follow them on social networks and **contact** if you need any help or have any questions:

| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram |
| https://x.com/yehor_smoliakov at X |
| https://github.com/egorsmkv at GitHub |
| https://huggingface.co/Yehor at Hugging Face |
| or use [email protected] |
""".strip()

# Each row maps to the UI inputs: [jsonl_file, batch_mode, calculate_distance, calculate_metrics].
examples = [
    ["evaluation_results.jsonl", False, True, False],
    ["evaluation_results_batch.jsonl", True, False, False],
]

description_head = f"""
# {title}

## Overview

See generated JSONL files made by ASR models as a dataframe. Also, this app calculates WER and CER metrics for each row.
""".strip()

description_foot = f"""
{authors_table}
""".strip()

# NOTE(review): `metrics_value` is not referenced anywhere in this file — possibly
# a leftover placeholder; confirm before removing.
metrics_value = """
Metrics will appear here.
""".strip()

# Shown at the bottom of the page: runtime environment info.
tech_env = f"""
#### Environment

- Python: {sys.version}
""".strip()

# Shown at the bottom of the page: pinned library versions, read from installed metadata.
tech_libraries = f"""
#### Libraries

- gradio: {version("gradio")}
- jiwer: {version("jiwer")}
- evaluate: {version("evaluate")}
- pandas: {version("pandas")}
- polars: {version("polars")}
- polars-distance: {version("polars_distance")}
""".strip()
69
+
70
+
71
def compute_wer(prediction, reference):
    """Return the word error rate for one prediction/reference pair, rounded to 4 places."""
    score = wer.compute(predictions=[prediction], references=[reference])
    return round(score, 4)
73
+
74
+
75
def compute_cer(prediction, reference):
    """Return the character error rate for one prediction/reference pair, rounded to 4 places."""
    score = cer.compute(predictions=[prediction], references=[reference])
    return round(score, 4)
77
+
78
+
79
def process_file(file_name, _batch_mode, _calculate_distance, _calculate_metrics):
    """Read an ASR-evaluation JSONL file and return a Polars DataFrame for display.

    Args:
        file_name: Path to the uploaded JSONL (newline-delimited JSON) file.
        _batch_mode: If True, expect the batch schema (list-valued columns:
            ``filenames``/``durations``/``references``/``predictions``) and
            flatten it to one row per utterance.
        _calculate_distance: If True, add a Levenshtein ``distance`` column
            (via polars-distance) as the first column.
        _calculate_metrics: If True, add per-row ``wer``/``cer`` columns.

    Returns:
        A Polars DataFrame with ``prediction``/``reference`` columns plus any
        requested metric/distance columns.

    Raises:
        gr.Error: If no file was provided or required columns are missing.
    """
    if not file_name:
        # Bug fix: the message previously said "paste your JSON file", but the
        # input is an uploaded JSONL file (consistent with the messages below).
        raise gr.Error("Please upload your JSONL file.")

    df = pl.read_ndjson(file_name)

    # Per-utterance schema: one row per audio file.
    required_columns = [
        "filename",
        "inference_start",
        "inference_end",
        "inference_total",
        "duration",
        "reference",
        "prediction",
    ]
    # Batch schema: one row per batch, with list-valued columns.
    required_columns_batch = [
        "inference_start",
        "inference_end",
        "inference_total",
        "filenames",
        "durations",
        "references",
        "predictions",
    ]

    required = required_columns_batch if _batch_mode else required_columns
    if not all(col in df.columns for col in required):
        raise gr.Error(
            f"Please provide a JSONL file with the following columns: {required}"
        )

    # Timing columns and file names are not shown in the table.
    drop_columns = ["inference_total", "inference_start", "inference_end"]
    drop_columns.append("filenames" if _batch_mode else "filename")
    df = df.drop(drop_columns)

    if _batch_mode:
        # Flatten list-valued batch rows into one (prediction, reference) pair per row.
        predictions = []
        references = []
        for row in df.iter_rows(named=True):
            for idx, prediction in enumerate(row["predictions"]):
                predictions.append(prediction)
                references.append(row["references"][idx])

        df = pl.DataFrame(
            {
                "prediction": predictions,
                "reference": references,
            }
        )

    if _calculate_metrics:
        # Pandas is needed for applying Python functions row-wise.
        df_pd = df.to_pandas()

        df_pd["wer"] = df_pd.apply(
            lambda row: compute_wer(row["prediction"], row["reference"]),
            axis=1,
        )
        df_pd["cer"] = df_pd.apply(
            lambda row: compute_cer(row["prediction"], row["reference"]),
            axis=1,
        )

        fields = [
            "wer",
            "cer",
            "prediction",
            "reference",
        ]

        df = pl.DataFrame(df_pd)
    else:
        fields = [
            "prediction",
            "reference",
        ]

    df = df.select(fields)

    if _calculate_distance:
        df = df.with_columns(
            pld.col("prediction").dist_str.levenshtein("reference").alias("distance")
        )

        # Put the distance column in the first position.
        fields = [
            "distance",
            *fields,
        ]

        df = df.select(fields)

    return df
185
+
186
+
187
# Gradio UI: a Blocks layout with a results table, a file upload plus three
# option checkboxes, example inputs, and static footer markdown.
demo = gr.Blocks(
    title=title,
    analytics_enabled=False,
    theme=gr.themes.Base(),
)

with demo:
    gr.Markdown(description_head)

    gr.Markdown("## Usage")

    # Output table; pinned first column keeps row labels visible while scrolling.
    with gr.Row():
        df = gr.DataFrame(
            label="Dataframe",
            show_search="search",
            show_row_numbers=True,
            pinned_columns=1,
        )

    with gr.Row():
        with gr.Column():
            jsonl_file = gr.File(label="A JSONL file")

            # Options map 1:1 onto process_file's parameters (in order).
            batch_mode = gr.Checkbox(
                label="Use batch mode",
            )

            calculate_distance = gr.Checkbox(
                label="Calculate Levenshtein distance",
                value=False,
            )

            calculate_metrics = gr.Checkbox(
                label="Calculate WER/CER metrics",
                value=False,
            )

    gr.Button("Show").click(
        process_file,
        inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
        outputs=df,
    )

    # Pre-filled example rows (see `examples` above) for quick testing.
    with gr.Row():
        gr.Examples(
            label="Choose an example",
            inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
            examples=examples,
        )

    gr.Markdown(description_foot)

    gr.Markdown("### Gradio app uses:")
    gr.Markdown(tech_env)
    gr.Markdown(tech_libraries)

if __name__ == "__main__":
    demo.queue()
    demo.launch()
evaluation_results.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_results_batch.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
justfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ check:
2
+ ruff check
3
+
4
+ fmt: check
5
+ ruff format
requirements-dev.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ruff
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==5.23.0
2
+
3
+ polars==1.27.0
4
+ polars-distance==0.5.2
5
+ evaluate==0.4.3
6
+ jiwer==3.1.0
ruff.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [lint]
2
+ ignore = ["F403"]