Yehor committed
Commit ff99877 · Parent: 07b19d7

Add Levenshtein distance for faster debugging

Files changed (4)
  1. .gitignore +1 -0
  2. README.md +2 -0
  3. app.py +68 -69
  4. requirements.txt +2 -1
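The diffs below drop the per-row batch WER/CER helpers and add an optional Levenshtein-distance column computed natively in polars through the polars-distance plugin, so a rough error signal no longer needs the pandas `apply` round-trip. A minimal sketch of the idea, reusing the exact `dist_str.levenshtein` expression the commit adds to app.py (the sample strings here are invented):

```python
import polars as pl
import polars_distance as pld

# Invented sample pairs; the real app loads them from a JSONL file.
df = pl.DataFrame(
    {
        "prediction": ["the cat sat", "hello word"],
        "reference": ["the cat sat down", "hello world"],
    }
)

# The same expression the commit adds: an edit-distance column computed
# inside the polars engine, with no per-row Python callback.
df = df.with_columns(
    pld.col("prediction").dist_str.levenshtein("reference").alias("distance")
)
print(df)
```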
.gitignore CHANGED
@@ -1,5 +1,6 @@
 .idea/
 .venv/
 .ruff_cache/
+__pycache__/
 
 flagged/
README.md CHANGED
@@ -23,6 +23,8 @@ uv pip install -r requirements-dev.txt
 
 ## Development
 
+Run app:
+
 ```shell
 gradio app.py
 ```
app.py CHANGED
@@ -4,6 +4,7 @@ from importlib.metadata import version
 
 import evaluate
 import polars as pl
+import polars_distance as pld
 import gradio as gr
 
 # Load evaluators
@@ -29,8 +30,8 @@ Follow them on social networks and **contact** if you need any help or have any
 """.strip()
 
 examples = [
-    ["evaluation_results.jsonl", False, True],
-    ["evaluation_results_batch.jsonl", True, True],
+    ["evaluation_results.jsonl", False, True, False],
+    ["evaluation_results_batch.jsonl", True, False, False],
 ]
 
 description_head = f"""
@@ -63,6 +64,7 @@ tech_libraries = f"""
 - evaluate: {version("evaluate")}
 - pandas: {version("pandas")}
 - polars: {version("polars")}
+- polars_distance: {version("polars_distance")}
 """.strip()
 
 
@@ -74,15 +76,7 @@ def compute_cer(prediction, reference):
     return round(cer.compute(predictions=[prediction], references=[reference]), 4)
 
 
-def compute_batch_wer(predictions, references):
-    return round(wer.compute(predictions=predictions, references=references), 4)
-
-
-def compute_batch_cer(predictions, references):
-    return round(cer.compute(predictions=predictions, references=references), 4)
-
-
-def inference(file_name, _batch_mode, _calculate_metrics):
+def process_file(file_name, _batch_mode, _calculate_distance, _calculate_metrics):
     if not file_name:
         raise gr.Error("Please paste your JSON file.")
 
@@ -120,67 +114,67 @@
 
     # exclude inference_start, inference_end
     if _batch_mode:
-        df = df.drop(["inference_start", "inference_end", "filenames"])
+        df = df.drop(
+            ["inference_total", "inference_start", "inference_end", "filenames"]
+        )
     else:
-        df = df.drop(["inference_start", "inference_end", "filename"])
-
-    # round "inference_total" field to 2 decimal places
-    df = df.with_columns(pl.col("inference_total").round(2).alias("elapsed"))
-    df = df.drop(["inference_total"])
-
-    df_pd = df.to_pandas()
-
-    # reassign columns
-    if _batch_mode:
-        if _calculate_metrics:
-            df_pd["wer"] = df_pd.apply(
-                lambda row: compute_batch_wer(row["predictions"], row["references"]), axis=1,
-            )
-            df_pd["cer"] = df_pd.apply(
-                lambda row: compute_batch_cer(row["predictions"], row["references"]), axis=1,
-            )
-
-            fields = [
-                "elapsed",
-                "durations",
-                "wer",
-                "cer",
-                "predictions",
-                "references",
-            ]
-        else:
-            fields = [
-                "elapsed",
-                "durations",
-                "predictions",
-                "references",
-            ]
-    else:
-        if _calculate_metrics:
-            df_pd["wer"] = df_pd.apply(
-                lambda row: compute_wer(row["prediction"], row["reference"]), axis=1,
-            )
-            df_pd["cer"] = df_pd.apply(
-                lambda row: compute_cer(row["prediction"], row["reference"]), axis=1,
-            )
-
-            fields = [
-                "elapsed",
-                "duration",
-                "wer",
-                "cer",
-                "prediction",
-                "reference",
-            ]
-        else:
-            fields = [
-                "elapsed",
-                "duration",
-                "prediction",
-                "reference",
-            ]
-
-    df = pl.DataFrame(df_pd)
+        df = df.drop(
+            ["inference_total", "inference_start", "inference_end", "filename"]
+        )
+
+    if _batch_mode:
+        predictions = []
+        references = []
+        for row in df.iter_rows(named=True):
+            for idx, prediction in enumerate(row["predictions"]):
+                reference = row["references"][idx]
+
+                predictions.append(prediction)
+                references.append(reference)
+
+        df = pl.DataFrame(
+            {
+                "prediction": predictions,
+                "reference": references,
+            }
+        )
+
+    if _calculate_metrics:
+        # Pandas is needed for applying functions
+        df_pd = df.to_pandas()
+
+        df_pd["wer"] = df_pd.apply(
+            lambda row: compute_wer(row["prediction"], row["reference"]),
+            axis=1,
+        )
+        df_pd["cer"] = df_pd.apply(
+            lambda row: compute_cer(row["prediction"], row["reference"]),
+            axis=1,
+        )
+
+        fields = [
+            "wer",
+            "cer",
+            "prediction",
+            "reference",
+        ]
+
+        df = pl.DataFrame(df_pd)
+    elif _calculate_distance:
+        df = df.with_columns(
+            pld.col("prediction").dist_str.levenshtein("reference").alias("distance")
+        )
+
+        fields = [
+            "distance",
+            "prediction",
+            "reference",
+        ]
+    else:
+        fields = [
+            "prediction",
+            "reference",
+        ]
 
     return df.select(fields)
 
@@ -212,21 +206,26 @@ with demo:
             label="Use batch mode",
         )
 
+        calculate_distance = gr.Checkbox(
+            label="Calculate Levenshtein distance",
+            value=False,
+        )
+
        calculate_metrics = gr.Checkbox(
             label="Calculate WER/CER metrics",
             value=False,
         )
 
     gr.Button("Show").click(
-        inference,
-        inputs=[jsonl_file, batch_mode, calculate_metrics],
+        process_file,
+        inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
         outputs=df,
     )
 
     with gr.Row():
         gr.Examples(
             label="Choose an example",
-            inputs=[jsonl_file, batch_mode, calculate_metrics],
+            inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
             examples=examples,
         )
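Batch mode in the new `process_file` flattens the parallel `predictions`/`references` lists with an explicit Python loop over `iter_rows`. Assuming the two lists always have equal lengths per row, the same flattening could presumably stay inside polars via `explode`; a hedged alternative sketch, not what the commit ships:

```python
import polars as pl

# Invented batch-mode rows: each row carries parallel lists.
df = pl.DataFrame(
    {
        "predictions": [["a b", "c d"], ["e f"]],
        "references": [["a b c", "c d"], ["e f g"]],
    }
)

# Exploding both list columns in parallel yields one row per pair,
# matching the loop in process_file when the lists line up.
flat = df.explode("predictions", "references").rename(
    {"predictions": "prediction", "references": "reference"}
)
print(flat)
```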
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
 gradio==5.23.0
 
-polars==1.26.0
+polars==1.25.2
+polars-distance==0.5.2
 evaluate==0.4.3
 jiwer==3.1.0
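For orientation, the columns that `process_file` drops and selects imply single-mode JSONL records shaped roughly like the one below; the schema itself is not part of this commit, and every value here is invented:

```python
import polars as pl

# Hypothetical record, inferred only from the column names in app.py
# ("inference_total", "inference_start", "inference_end", "filename",
# "prediction", "reference"); all values are made up.
record = {
    "filename": "audio_0001.wav",
    "prediction": "the cat sat",
    "reference": "the cat sat down",
    "inference_start": 0.0,
    "inference_end": 1.23,
    "inference_total": 1.23,
}

df = pl.DataFrame([record])
# Mirrors the single-mode drop in process_file:
print(df.drop(["inference_total", "inference_start", "inference_end", "filename"]))
```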