huckiyang committed
Commit 88c90d9 · 1 Parent(s): 9f029d4

finalize gui

Files changed (2)
  1. README.md +7 -7
  2. app.py +170 -379
README.md CHANGED
@@ -32,16 +32,16 @@ The leaderboard shows WER metrics for multiple speech recognition sources as col
  The leaderboard displays three baseline approaches:
 
  1. **No LM Baseline**: Uses the 1-best ASR output without any correction (input1)
- 2. **N-best LM Ranking**: Ranks the N-best hypotheses using a simple language model approach and chooses the best one
- 3. **N-best Correction**: Uses a voting-based method to correct the transcript by combining information from all N-best hypotheses
+ 2. **N-gram Ranking**: Ranks the N-best hypotheses using a simple n-gram statistics approach and chooses the best one
+ 3. **Subwords Voting Correction**: Uses a voting-based method to correct the transcript by combining information from all N-best hypotheses
 
  ## Metrics
 
  The leaderboard displays as rows:
  - **Number of Examples**: Count of examples in the test set for each source
  - **Word Error Rate (No LM)**: WER between reference and 1-best ASR output
- - **Word Error Rate (N-best LM Ranking)**: WER between reference and LM-ranked best hypothesis
- - **Word Error Rate (N-best Correction)**: WER between reference and the corrected N-best hypothesis
+ - **Word Error Rate (N-gram Ranking)**: WER between reference and n-gram ranked best hypothesis
+ - **Word Error Rate (Subwords Voting Correction)**: WER between reference and the voting-corrected N-best hypothesis
 
  Lower WER values indicate better transcription accuracy.
 
@@ -56,15 +56,15 @@ Each cell shows the corresponding metric for that specific data source. The OVER
 
  ## Technical Details
 
- ### N-best LM Ranking
+ ### N-gram Ranking
  This method scores each hypothesis in the N-best list using:
- - N-gram statistics (bigrams)
+ - N-gram statistics (4-grams)
  - Text length
  - N-gram variety
 
  The hypothesis with the highest score is selected.
 
- ### N-best Correction
+ ### Subwords Voting Correction
  This method uses a simple voting mechanism:
  - Groups hypotheses of the same length
  - For each word position, chooses the most common word across all hypotheses
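For illustration, here is a minimal, self-contained sketch of the two renamed baselines described in the README above. The toy hypotheses are made up; the scoring formula and the voting rule mirror `score_hypothesis` and `correct_hypotheses` in app.py below.

```python
from collections import Counter

# Toy N-best list (made up for illustration only)
hyps = ["the cat sat on the mat",
        "the cat sat on a mat",
        "the bat sat on the mat"]

# N-gram Ranking: score = word count + (unique 4-grams / total 4-grams) * 5
def ngram_score(text, n=4):
    words = text.split()
    if len(words) < n:
        return len(words)
    grams = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
    return len(words) + len(set(grams)) / max(1, len(grams)) * 5

best = max(hyps, key=ngram_score)

# Subwords Voting Correction: keep hypotheses of the most common word count,
# then take the majority word at each position
common_len = Counter(len(h.split()) for h in hyps).most_common(1)[0][0]
same_len = [h.split() for h in hyps if len(h.split()) == common_len]
voted = ' '.join(Counter(col).most_common(1)[0][0] for col in zip(*same_len))

print(best)   # -> "the cat sat on the mat"
print(voted)  # -> "the cat sat on the mat"
```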
app.py CHANGED
@@ -1,514 +1,305 @@
1
  import gradio as gr
2
  import pandas as pd
3
  from datasets import load_dataset
4
- import jiwer
5
  import numpy as np
6
  from functools import lru_cache
7
- import traceback
8
  import re
9
- import string
10
  from collections import Counter
 
11
 
12
  # Cache the dataset loading to avoid reloading on refresh
13
  @lru_cache(maxsize=1)
14
  def load_data():
15
  try:
16
- # Load only the test dataset by specifying the split
17
  dataset = load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction", split="test")
18
  return dataset
19
- except Exception as e:
20
- print(f"Error loading dataset: {str(e)}")
21
- # Try loading with explicit file path if the default loading fails
22
- try:
23
- dataset = load_dataset("parquet",
24
- data_files="https://huggingface.co/datasets/GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction/resolve/main/data/test-00000-of-00001.parquet")
25
- return dataset
26
- except Exception as e2:
27
- print(f"Error loading with explicit path: {str(e2)}")
28
- raise
29
 
30
  # Preprocess text for better WER calculation
31
  def preprocess_text(text):
32
  if not text or not isinstance(text, str):
33
  return ""
34
- # Convert to lowercase
35
  text = text.lower()
36
- # Remove punctuation
37
  text = re.sub(r'[^\w\s]', '', text)
38
- # Remove extra whitespace
39
  text = re.sub(r'\s+', ' ', text).strip()
40
  return text
41
 
42
- # Simple language model scoring - count n-grams
43
  def score_hypothesis(hypothesis, n=4):
44
- """Score a hypothesis using simple n-gram statistics"""
45
  if not hypothesis:
46
  return 0
47
 
48
  words = hypothesis.split()
49
  if len(words) < n:
50
- return len(words) # Just return word count for very short texts
51
 
52
- # Count n-grams
53
  ngrams = []
54
  for i in range(len(words) - n + 1):
55
  ngram = ' '.join(words[i:i+n])
56
  ngrams.append(ngram)
57
 
58
- # More unique n-grams might indicate better fluency
59
  unique_ngrams = len(set(ngrams))
60
  total_ngrams = len(ngrams)
61
-
62
- # Score is a combination of length and n-gram variety
63
  score = len(words) + unique_ngrams/max(1, total_ngrams) * 5
64
  return score
65
 
66
- # N-best LM ranking approach
67
  def get_best_hypothesis_lm(hypotheses):
68
- """Choose the best hypothesis using a simple language model approach"""
69
  if not hypotheses:
70
  return ""
71
 
72
- # Convert to list if it's not already
73
  if isinstance(hypotheses, str):
74
  return hypotheses
75
 
76
- # Ensure we have a list of strings
77
- hypothesis_list = []
78
- for h in hypotheses:
79
- if isinstance(h, str):
80
- hypothesis_list.append(preprocess_text(h))
81
 
82
  if not hypothesis_list:
83
  return ""
84
 
85
- # Score each hypothesis and choose the best one
86
  scores = [(score_hypothesis(h), h) for h in hypothesis_list]
87
  best_hypothesis = max(scores, key=lambda x: x[0])[1]
88
  return best_hypothesis
89
 
90
- # N-best correction approach
91
  def correct_hypotheses(hypotheses):
92
- """Simple n-best correction by voting on words"""
93
  if not hypotheses:
94
  return ""
95
 
96
- # Convert to list if it's not already
97
  if isinstance(hypotheses, str):
98
  return hypotheses
99
 
100
- # Ensure we have a list of strings
101
- hypothesis_list = []
102
- for h in hypotheses:
103
- if isinstance(h, str):
104
- hypothesis_list.append(preprocess_text(h))
105
 
106
  if not hypothesis_list:
107
  return ""
108
 
109
- # Split hypotheses into words
110
  word_lists = [h.split() for h in hypothesis_list]
111
-
112
- # Find the most common length
113
  lengths = [len(words) for words in word_lists]
 
114
  if not lengths:
115
  return ""
116
 
117
  most_common_length = Counter(lengths).most_common(1)[0][0]
118
-
119
- # Only consider hypotheses with the most common length
120
  filtered_word_lists = [words for words in word_lists if len(words) == most_common_length]
121
 
122
  if not filtered_word_lists:
123
- # Fall back to the longest hypothesis if filtering removed everything
124
  return max(hypothesis_list, key=len)
125
 
126
- # Vote on each word position
127
  corrected_words = []
128
  for i in range(most_common_length):
129
  position_words = [words[i] for words in filtered_word_lists]
130
  most_common_word = Counter(position_words).most_common(1)[0][0]
131
  corrected_words.append(most_common_word)
132
 
133
- # Join the corrected words
134
  return ' '.join(corrected_words)
135
 
136
- # Fix the Levenshtein distance calculation to avoid dependence on jiwer internals
137
  def calculate_simple_wer(reference, hypothesis):
138
- """Calculate WER using a simple word-based approach"""
139
  if not reference or not hypothesis:
140
- return 1.0 # Maximum error if either is empty
141
-
142
- # Split into words
143
  ref_words = reference.split()
144
  hyp_words = hypothesis.split()
145
 
146
- # Use editdistance package instead of jiwer internals
147
- try:
148
- import editdistance
149
- distance = editdistance.eval(ref_words, hyp_words)
150
- except ImportError:
151
- # Fallback to simple jiwer calculation
152
- try:
153
- # Try using the standard jiwer implementation
154
- wer_value = jiwer.wer(reference, hypothesis)
155
- return wer_value
156
- except Exception:
157
- # If all else fails, return 1.0 (maximum error)
158
- print("Error calculating WER - fallback to maximum error")
159
- return 1.0
160
 
161
- # WER calculation
162
  if len(ref_words) == 0:
163
  return 1.0
164
  return float(distance) / float(len(ref_words))
165
 
166
  # Calculate WER for a group of examples with multiple methods
167
- def calculate_wer_methods(examples):
168
- if not examples:
169
- return 0.0, 0.0, 0.0
170
 
171
- try:
172
- # Check if examples is a Dataset or a list
173
- is_dataset = hasattr(examples, 'features')
174
-
175
- # Get the first example for inspection
176
- if is_dataset and len(examples) > 0:
177
- example = examples[0]
178
- elif not is_dataset and len(examples) > 0:
179
- example = examples[0]
180
- else:
181
- print("No examples found")
182
- return np.nan, np.nan, np.nan
183
-
184
- print("\n===== EXAMPLE DATA INSPECTION =====")
185
- print(f"Keys in example: {example.keys()}")
186
-
187
- # Try different possible field names
188
- possible_reference_fields = ["transcription", "reference", "ground_truth", "target"]
189
- possible_hypothesis_fields = ["input1", "hypothesis", "asr_output", "source_text"]
190
-
191
- for field in possible_reference_fields:
192
- if field in example:
193
- print(f"Reference field '{field}' found with value: {str(example[field])[:100]}...")
194
-
195
- for field in possible_hypothesis_fields:
196
- if field in example:
197
- print(f"Hypothesis field '{field}' found with value: {str(example[field])[:100]}...")
198
-
199
- # Process each example in the dataset
200
- wer_values_no_lm = []
201
- wer_values_lm_ranking = []
202
- wer_values_n_best_correction = []
203
-
204
- valid_count = 0
205
- skipped_count = 0
206
 
207
- # Determine how to iterate based on type
208
- items_to_process = examples
209
- if is_dataset:
210
- # Limit to first 200 examples for efficiency
211
- items_to_process = examples.select(range(min(200, len(examples))))
212
- else:
213
- items_to_process = examples[:200] # First 200 examples
214
 
215
- for i, ex in enumerate(items_to_process):
216
- try:
217
- # Get reference transcription
218
- transcription = ex.get("transcription")
219
- if not transcription or not isinstance(transcription, str):
220
- skipped_count += 1
221
- continue
222
-
223
- # Process the reference
224
- reference = preprocess_text(transcription)
225
- if not reference:
226
- skipped_count += 1
227
- continue
228
-
229
- # Get 1-best hypothesis for baseline
230
- input1 = ex.get("input1")
231
- if input1 is None and "hypothesis" in ex and ex["hypothesis"]:
232
- if isinstance(ex["hypothesis"], list) and len(ex["hypothesis"]) > 0:
233
- input1 = ex["hypothesis"][0]
234
- elif isinstance(ex["hypothesis"], str):
235
- input1 = ex["hypothesis"]
236
-
237
- # Get n-best hypotheses for other methods
238
- n_best_hypotheses = ex.get("hypothesis", [])
239
-
240
- # Process and evaluate all methods
241
-
242
- # Method 1: No LM (1-best ASR output)
243
- if input1 and isinstance(input1, str):
244
- no_lm_hyp = preprocess_text(input1)
245
- if no_lm_hyp:
246
- wer_no_lm = calculate_simple_wer(reference, no_lm_hyp)
247
- wer_values_no_lm.append(wer_no_lm)
248
-
249
- # Method 2: LM ranking (best of n-best)
250
- if n_best_hypotheses:
251
- lm_best_hyp = get_best_hypothesis_lm(n_best_hypotheses)
252
- if lm_best_hyp:
253
- wer_lm = calculate_simple_wer(reference, lm_best_hyp)
254
- wer_values_lm_ranking.append(wer_lm)
255
-
256
- # Method 3: N-best correction (voting among n-best)
257
- if n_best_hypotheses:
258
- corrected_hyp = correct_hypotheses(n_best_hypotheses)
259
- if corrected_hyp:
260
- wer_corrected = calculate_simple_wer(reference, corrected_hyp)
261
- wer_values_n_best_correction.append(wer_corrected)
262
-
263
- # Count as valid if at least one method worked
264
- if (wer_values_no_lm and i == len(wer_values_no_lm) - 1) or \
265
- (wer_values_lm_ranking and i == len(wer_values_lm_ranking) - 1) or \
266
- (wer_values_n_best_correction and i == len(wer_values_n_best_correction) - 1):
267
- valid_count += 1
268
- else:
269
- skipped_count += 1
270
-
271
- # Print debug info for a few examples
272
- if i < 2:
273
- print(f"\nExample {i} inspection:")
274
- print(f" Reference: '{reference}'")
275
-
276
- if input1 and isinstance(input1, str):
277
- no_lm_hyp = preprocess_text(input1)
278
- print(f" No LM (1-best): '{no_lm_hyp}'")
279
- if no_lm_hyp:
280
- wer = calculate_simple_wer(reference, no_lm_hyp)
281
- print(f" No LM WER: {wer:.4f}")
282
-
283
- if n_best_hypotheses:
284
- print(f" N-best count: {len(n_best_hypotheses) if isinstance(n_best_hypotheses, list) else 'not a list'}")
285
- lm_best_hyp = get_best_hypothesis_lm(n_best_hypotheses)
286
- print(f" LM ranking best: '{lm_best_hyp}'")
287
- if lm_best_hyp:
288
- wer = calculate_simple_wer(reference, lm_best_hyp)
289
- print(f" LM ranking WER: {wer:.4f}")
290
-
291
- corrected_hyp = correct_hypotheses(n_best_hypotheses)
292
- print(f" N-best correction: '{corrected_hyp}'")
293
- if corrected_hyp:
294
- wer = calculate_simple_wer(reference, corrected_hyp)
295
- print(f" N-best correction WER: {wer:.4f}")
296
-
297
- except Exception as ex_error:
298
- print(f"Error processing example {i}: {str(ex_error)}")
299
- skipped_count += 1
300
- continue
301
 
302
- # Calculate average WER for each method
303
- print(f"\nProcessing summary: Valid pairs: {valid_count}, Skipped: {skipped_count}")
304
 
305
- no_lm_wer = np.mean(wer_values_no_lm) if wer_values_no_lm else np.nan
306
- lm_ranking_wer = np.mean(wer_values_lm_ranking) if wer_values_lm_ranking else np.nan
307
- n_best_correction_wer = np.mean(wer_values_n_best_correction) if wer_values_n_best_correction else np.nan
 
 
 
308
 
309
- print(f"Calculated WERs:")
310
- print(f" No LM: {len(wer_values_no_lm)} pairs, avg WER: {no_lm_wer:.4f}")
311
- print(f" LM Ranking: {len(wer_values_lm_ranking)} pairs, avg WER: {lm_ranking_wer:.4f}")
312
- print(f" N-best Correction: {len(wer_values_n_best_correction)} pairs, avg WER: {n_best_correction_wer:.4f}")
 
 
313
 
314
- return no_lm_wer, lm_ranking_wer, n_best_correction_wer
 
 
 
 
 
315
 
316
- except Exception as e:
317
- print(f"Error in calculate_wer: {str(e)}")
318
- print(traceback.format_exc())
319
- return np.nan, np.nan, np.nan
 
 
320
 
321
- # Get WER metrics by source
322
  def get_wer_metrics(dataset):
323
- try:
324
- # Print dataset info
325
- print(f"\n===== DATASET INFO =====")
326
- print(f"Dataset size: {len(dataset)}")
327
- print(f"Dataset features: {dataset.features}")
328
-
329
- # Group examples by source
330
- examples_by_source = {}
331
-
332
- # Process all examples
333
- for i, ex in enumerate(dataset):
334
- try:
335
- source = ex.get("source", "unknown")
336
- # Skip all_et05_real as requested
337
- if source == "all_et05_real":
338
- continue
339
-
340
- if source not in examples_by_source:
341
- examples_by_source[source] = []
342
- examples_by_source[source].append(ex)
343
- except Exception as e:
344
- print(f"Error processing example {i}: {str(e)}")
345
- continue
346
-
347
- # Get all unique sources
348
- all_sources = sorted(examples_by_source.keys())
349
- print(f"Found sources: {all_sources}")
350
-
351
- # Calculate metrics for each source
352
- source_results = {}
353
- for source in all_sources:
354
- try:
355
- examples = examples_by_source.get(source, [])
356
- count = len(examples)
357
-
358
- if count > 0:
359
- print(f"\nCalculating WER for source {source} with {count} examples")
360
- no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(examples)
361
- else:
362
- no_lm_wer, lm_ranking_wer, n_best_wer = np.nan, np.nan, np.nan
363
-
364
- source_results[source] = {
365
- "Count": count,
366
- "No LM Baseline": no_lm_wer,
367
- "N-best LM Ranking": lm_ranking_wer,
368
- "N-best Correction": n_best_wer
369
- }
370
- except Exception as e:
371
- print(f"Error processing source {source}: {str(e)}")
372
- source_results[source] = {
373
- "Count": 0,
374
- "No LM Baseline": np.nan,
375
- "N-best LM Ranking": np.nan,
376
- "N-best Correction": np.nan
377
- }
378
-
379
- # Calculate overall metrics with a sample but excluding all_et05_real
380
- try:
381
- # Create a filtered dataset without all_et05_real
382
- filtered_dataset = [ex for ex in dataset if ex.get("source") != "all_et05_real"]
383
- total_count = len(filtered_dataset)
384
- print(f"\nCalculating overall WER with a sample of examples (excluding all_et05_real)")
385
-
386
- # Sample for calculation
387
- sample_size = min(500, total_count)
388
- sample_dataset = filtered_dataset[:sample_size]
389
- no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(sample_dataset)
390
 
391
- source_results["OVERALL"] = {
392
- "Count": total_count,
393
- "No LM Baseline": no_lm_wer,
394
- "N-best LM Ranking": lm_ranking_wer,
395
- "N-best Correction": n_best_wer
396
- }
397
- except Exception as e:
398
- print(f"Error calculating overall metrics: {str(e)}")
399
- print(traceback.format_exc())
400
- source_results["OVERALL"] = {
401
- "Count": len(filtered_dataset),
402
- "No LM Baseline": np.nan,
403
- "N-best LM Ranking": np.nan,
404
- "N-best Correction": np.nan
405
- }
406
-
407
- # Create flat DataFrame with labels in the first column
408
- rows = []
409
-
410
- # First add row for number of examples
411
- example_row = {"Metric": "Number of Examples"}
412
- for source in all_sources + ["OVERALL"]:
413
- example_row[source] = source_results[source]["Count"]
414
- rows.append(example_row)
415
-
416
- # Then add rows for each WER method
417
- no_lm_row = {"Metric": "Word Error Rate (No LM)"}
418
- lm_ranking_row = {"Metric": "Word Error Rate (N-best LM Ranking)"}
419
- n_best_row = {"Metric": "Word Error Rate (N-best Correction)"}
420
-
421
- for source in all_sources + ["OVERALL"]:
422
- no_lm_row[source] = source_results[source]["No LM Baseline"]
423
- lm_ranking_row[source] = source_results[source]["N-best LM Ranking"]
424
- n_best_row[source] = source_results[source]["N-best Correction"]
425
-
426
- rows.append(no_lm_row)
427
- rows.append(lm_ranking_row)
428
- rows.append(n_best_row)
429
 
430
- # Create DataFrame from rows
431
- result_df = pd.DataFrame(rows)
 
 
432
 
433
- return result_df
 
 
 
 
434
 
435
- except Exception as e:
436
- print(f"Error in get_wer_metrics: {str(e)}")
437
- print(traceback.format_exc())
438
- return pd.DataFrame([{"Error": str(e)}])
 
 
439
 
440
  # Format the dataframe for display
441
  def format_dataframe(df):
442
- try:
443
- # Use vectorized operations instead of apply
444
- df = df.copy()
445
-
446
- # Find the rows containing WER values
447
- wer_row_indices = []
448
- for i, metric in enumerate(df["Metric"]):
449
- if "WER" in metric or "Error Rate" in metric:
450
- wer_row_indices.append(i)
451
-
452
- # Format WER values
453
- for idx in wer_row_indices:
454
- for col in df.columns:
455
- if col != "Metric": # Skip the metric column
456
- value = df.loc[idx, col]
457
- if pd.notna(value):
458
- df.loc[idx, col] = f"{value:.4f}"
459
- else:
460
- df.loc[idx, col] = "N/A"
461
-
462
- return df
463
 
464
- except Exception as e:
465
- print(f"Error in format_dataframe: {str(e)}")
466
- print(traceback.format_exc())
467
- return pd.DataFrame([{"Error": str(e)}])
 
 
468
 
469
  # Main function to create the leaderboard
470
  def create_leaderboard():
471
- try:
472
- dataset = load_data()
473
- metrics_df = get_wer_metrics(dataset)
474
- return format_dataframe(metrics_df)
475
- except Exception as e:
476
- error_msg = f"Error creating leaderboard: {str(e)}\n{traceback.format_exc()}"
477
- print(error_msg)
478
- return pd.DataFrame([{"Error": error_msg}])
479
 
480
  # Create the Gradio interface
481
- with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
482
  gr.Markdown("# ASR Text Correction Baseline WER Leaderboard (Test Data)")
483
  gr.Markdown("Word Error Rate (WER) metrics for different speech sources with multiple correction approaches")
484
 
485
  with gr.Row():
486
  refresh_btn = gr.Button("Refresh Leaderboard")
487
 
488
- with gr.Row():
489
- error_output = gr.Textbox(label="Debug Information", visible=True, lines=10)
490
-
491
  with gr.Row():
492
  try:
493
  initial_df = create_leaderboard()
494
  leaderboard = gr.DataFrame(initial_df)
495
- except Exception as e:
496
- error_msg = f"Error initializing leaderboard: {str(e)}\n{traceback.format_exc()}"
497
- print(error_msg)
498
- error_output.update(value=error_msg)
499
- leaderboard = gr.DataFrame(pd.DataFrame([{"Error": error_msg}]))
500
 
501
  def refresh_and_report():
502
- try:
503
- df = create_leaderboard()
504
- debug_info = "Leaderboard refreshed successfully. Check console for detailed debug information."
505
- return df, debug_info
506
- except Exception as e:
507
- error_msg = f"Error refreshing leaderboard: {str(e)}\n{traceback.format_exc()}"
508
- print(error_msg)
509
- return pd.DataFrame([{"Error": error_msg}]), error_msg
510
-
511
- refresh_btn.click(refresh_and_report, outputs=[leaderboard, error_output])
512
 
513
  if __name__ == "__main__":
514
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
  from datasets import load_dataset
 
4
  import numpy as np
5
  from functools import lru_cache
 
6
  import re
 
7
  from collections import Counter
8
+ import editdistance
9
 
10
  # Cache the dataset loading to avoid reloading on refresh
11
  @lru_cache(maxsize=1)
12
  def load_data():
13
  try:
 
14
  dataset = load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction", split="test")
15
  return dataset
16
+ except Exception:
17
+ # Fallback to explicit file path if default loading fails
18
+ return load_dataset("parquet",
19
+ data_files="https://huggingface.co/datasets/GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction/resolve/main/data/test-00000-of-00001.parquet")
 
 
20
 
21
  # Preprocess text for better WER calculation
22
  def preprocess_text(text):
23
  if not text or not isinstance(text, str):
24
  return ""
 
25
  text = text.lower()
 
26
  text = re.sub(r'[^\w\s]', '', text)
 
27
  text = re.sub(r'\s+', ' ', text).strip()
28
  return text
29
 
30
+ # N-gram scoring for hypothesis ranking
31
  def score_hypothesis(hypothesis, n=4):
 
32
  if not hypothesis:
33
  return 0
34
 
35
  words = hypothesis.split()
36
  if len(words) < n:
37
+ return len(words)
38
 
 
39
  ngrams = []
40
  for i in range(len(words) - n + 1):
41
  ngram = ' '.join(words[i:i+n])
42
  ngrams.append(ngram)
43
 
 
44
  unique_ngrams = len(set(ngrams))
45
  total_ngrams = len(ngrams)
 
 
46
  score = len(words) + unique_ngrams/max(1, total_ngrams) * 5
47
  return score
48
 
49
+ # N-gram ranking approach
50
  def get_best_hypothesis_lm(hypotheses):
 
51
  if not hypotheses:
52
  return ""
53
 
 
54
  if isinstance(hypotheses, str):
55
  return hypotheses
56
 
57
+ hypothesis_list = [preprocess_text(h) for h in hypotheses if isinstance(h, str)]
 
 
 
 
58
 
59
  if not hypothesis_list:
60
  return ""
61
 
 
62
  scores = [(score_hypothesis(h), h) for h in hypothesis_list]
63
  best_hypothesis = max(scores, key=lambda x: x[0])[1]
64
  return best_hypothesis
65
 
66
+ # Subwords voting correction approach
67
  def correct_hypotheses(hypotheses):
 
68
  if not hypotheses:
69
  return ""
70
 
 
71
  if isinstance(hypotheses, str):
72
  return hypotheses
73
 
74
+ hypothesis_list = [preprocess_text(h) for h in hypotheses if isinstance(h, str)]
 
 
 
 
75
 
76
  if not hypothesis_list:
77
  return ""
78
 
 
79
  word_lists = [h.split() for h in hypothesis_list]
 
 
80
  lengths = [len(words) for words in word_lists]
81
+
82
  if not lengths:
83
  return ""
84
 
85
  most_common_length = Counter(lengths).most_common(1)[0][0]
 
 
86
  filtered_word_lists = [words for words in word_lists if len(words) == most_common_length]
87
 
88
  if not filtered_word_lists:
 
89
  return max(hypothesis_list, key=len)
90
 
 
91
  corrected_words = []
92
  for i in range(most_common_length):
93
  position_words = [words[i] for words in filtered_word_lists]
94
  most_common_word = Counter(position_words).most_common(1)[0][0]
95
  corrected_words.append(most_common_word)
96
 
 
97
  return ' '.join(corrected_words)
98
 
99
+ # Calculate WER
100
  def calculate_simple_wer(reference, hypothesis):
 
101
  if not reference or not hypothesis:
102
+ return 1.0
103
+
 
104
  ref_words = reference.split()
105
  hyp_words = hypothesis.split()
106
 
107
+ distance = editdistance.eval(ref_words, hyp_words)
 
 
 
108
 
 
109
  if len(ref_words) == 0:
110
  return 1.0
111
  return float(distance) / float(len(ref_words))
112
 
113
  # Calculate WER for a group of examples with multiple methods
114
+ def calculate_wer_methods(examples, max_samples=200):
115
+ if not examples or len(examples) == 0:
116
+ return np.nan, np.nan, np.nan
117
 
118
+ # Limit sample size for efficiency
119
+ if hasattr(examples, 'select'):
120
+ items_to_process = examples.select(range(min(max_samples, len(examples))))
121
+ else:
122
+ items_to_process = examples[:max_samples]
123
+
124
+ wer_values_no_lm = []
125
+ wer_values_lm_ranking = []
126
+ wer_values_n_best_correction = []
127
+
128
+ for ex in items_to_process:
129
+ # Get reference transcription
130
+ transcription = ex.get("transcription")
131
+ if not transcription or not isinstance(transcription, str):
132
+ continue
 
 
 
 
133
 
134
+ reference = preprocess_text(transcription)
135
+ if not reference:
136
+ continue
 
 
 
 
137
 
138
+ # Get 1-best hypothesis for baseline
139
+ input1 = ex.get("input1")
140
+ if input1 is None and "hypothesis" in ex and ex["hypothesis"]:
141
+ if isinstance(ex["hypothesis"], list) and len(ex["hypothesis"]) > 0:
142
+ input1 = ex["hypothesis"][0]
143
+ elif isinstance(ex["hypothesis"], str):
144
+ input1 = ex["hypothesis"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
+ # Get n-best hypotheses for other methods
147
+ n_best_hypotheses = ex.get("hypothesis", [])
148
 
149
+ # Method 1: No LM (1-best ASR output)
150
+ if input1 and isinstance(input1, str):
151
+ no_lm_hyp = preprocess_text(input1)
152
+ if no_lm_hyp:
153
+ wer_no_lm = calculate_simple_wer(reference, no_lm_hyp)
154
+ wer_values_no_lm.append(wer_no_lm)
155
 
156
+ # Method 2: N-gram ranking
157
+ if n_best_hypotheses:
158
+ lm_best_hyp = get_best_hypothesis_lm(n_best_hypotheses)
159
+ if lm_best_hyp:
160
+ wer_lm = calculate_simple_wer(reference, lm_best_hyp)
161
+ wer_values_lm_ranking.append(wer_lm)
162
 
163
+ # Method 3: Subwords voting correction
164
+ if n_best_hypotheses:
165
+ corrected_hyp = correct_hypotheses(n_best_hypotheses)
166
+ if corrected_hyp:
167
+ wer_corrected = calculate_simple_wer(reference, corrected_hyp)
168
+ wer_values_n_best_correction.append(wer_corrected)
169
 
170
+ # Calculate average WER for each method
171
+ no_lm_wer = np.mean(wer_values_no_lm) if wer_values_no_lm else np.nan
172
+ lm_ranking_wer = np.mean(wer_values_lm_ranking) if wer_values_lm_ranking else np.nan
173
+ n_best_correction_wer = np.mean(wer_values_n_best_correction) if wer_values_n_best_correction else np.nan
174
+
175
+ return no_lm_wer, lm_ranking_wer, n_best_correction_wer
176
 
177
+ # Get WER metrics by source
178
  def get_wer_metrics(dataset):
179
+ # Group examples by source
180
+ examples_by_source = {}
181
+
182
+ for ex in dataset:
183
+ source = ex.get("source", "unknown")
184
+ # Skip all_et05_real as requested
185
+ if source == "all_et05_real":
186
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
+ if source not in examples_by_source:
189
+ examples_by_source[source] = []
190
+ examples_by_source[source].append(ex)
191
+
192
+ # Get all unique sources
193
+ all_sources = sorted(examples_by_source.keys())
194
+
195
+ # Calculate metrics for each source
196
+ source_results = {}
197
+ for source in all_sources:
198
+ examples = examples_by_source.get(source, [])
199
+ count = len(examples)
 
 
 
 
 
200
 
201
+ if count > 0:
202
+ no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(examples)
203
+ else:
204
+ no_lm_wer, lm_ranking_wer, n_best_wer = np.nan, np.nan, np.nan
205
 
206
+ source_results[source] = {
207
+ "Count": count,
208
+ "No LM Baseline": no_lm_wer,
209
+ "N-best LM Ranking": lm_ranking_wer,
210
+ "N-best Correction": n_best_wer
211
+ }
212
+
213
+ # Calculate overall metrics
214
+ filtered_dataset = [ex for ex in dataset if ex.get("source") != "all_et05_real"]
215
+ total_count = len(filtered_dataset)
216
+
217
+ sample_size = min(500, total_count)
218
+ sample_dataset = filtered_dataset[:sample_size]
219
+ no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(sample_dataset)
220
+
221
+ source_results["OVERALL"] = {
222
+ "Count": total_count,
223
+ "No LM Baseline": no_lm_wer,
224
+ "N-best LM Ranking": lm_ranking_wer,
225
+ "N-best Correction": n_best_wer
226
+ }
227
+
228
+ # Create flat DataFrame with labels in the first column
229
+ rows = []
230
+
231
+ # First add row for number of examples
232
+ example_row = {"Metric": "Number of Examples"}
233
+ for source in all_sources + ["OVERALL"]:
234
+ example_row[source] = source_results[source]["Count"]
235
+ rows.append(example_row)
236
+
237
+ # Then add rows for each WER method
238
+ no_lm_row = {"Metric": "Word Error Rate (No LM)"}
239
+ lm_ranking_row = {"Metric": "Word Error Rate (N-gram Ranking)"}
240
+ n_best_row = {"Metric": "Word Error Rate (Subwords Voting Correction)"}
241
 
242
+ for source in all_sources + ["OVERALL"]:
243
+ no_lm_row[source] = source_results[source]["No LM Baseline"]
244
+ lm_ranking_row[source] = source_results[source]["N-best LM Ranking"]
245
+ n_best_row[source] = source_results[source]["N-best Correction"]
246
+
247
+ rows.append(no_lm_row)
248
+ rows.append(lm_ranking_row)
249
+ rows.append(n_best_row)
250
+
251
+ # Create DataFrame from rows
252
+ result_df = pd.DataFrame(rows)
253
+
254
+ return result_df
255
 
256
  # Format the dataframe for display
257
  def format_dataframe(df):
258
+ df = df.copy()
259
+
260
+ # Find the rows containing WER values
261
+ wer_row_indices = []
262
+ for i, metric in enumerate(df["Metric"]):
263
+ if "WER" in metric or "Error Rate" in metric:
264
+ wer_row_indices.append(i)
 
 
 
 
265
 
266
+ # Format WER values
267
+ for idx in wer_row_indices:
268
+ for col in df.columns:
269
+ if col != "Metric":
270
+ value = df.loc[idx, col]
271
+ if pd.notna(value):
272
+ df.loc[idx, col] = f"{value:.4f}"
273
+ else:
274
+ df.loc[idx, col] = "N/A"
275
+
276
+ return df
277
 
278
  # Main function to create the leaderboard
279
  def create_leaderboard():
280
+ dataset = load_data()
281
+ metrics_df = get_wer_metrics(dataset)
282
+ return format_dataframe(metrics_df)
 
 
 
 
 
283
 
284
  # Create the Gradio interface
285
+ with gr.Blocks(title="ASR Text Correction Leaderboard") as demo:
286
  gr.Markdown("# ASR Text Correction Baseline WER Leaderboard (Test Data)")
287
  gr.Markdown("Word Error Rate (WER) metrics for different speech sources with multiple correction approaches")
288
 
289
  with gr.Row():
290
  refresh_btn = gr.Button("Refresh Leaderboard")
291
 
 
 
 
292
  with gr.Row():
293
  try:
294
  initial_df = create_leaderboard()
295
  leaderboard = gr.DataFrame(initial_df)
296
+ except Exception:
297
+ leaderboard = gr.DataFrame(pd.DataFrame([{"Error": "Error initializing leaderboard"}]))
 
 
 
298
 
299
  def refresh_and_report():
300
+ return create_leaderboard()
301
+
302
+ refresh_btn.click(refresh_and_report, outputs=[leaderboard])
 
 
303
 
304
  if __name__ == "__main__":
305
  demo.launch()
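For a quick sanity check of the WER arithmetic the new code relies on, the standalone sketch below mirrors `calculate_simple_wer` rather than importing app.py (importing the module would build the Gradio UI and load the dataset); the example sentences are made up.

```python
import editdistance  # same dependency the new app.py uses

def simple_wer(reference: str, hypothesis: str) -> float:
    # Word-level edit distance divided by reference length, as in calculate_simple_wer
    if not reference or not hypothesis:
        return 1.0
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    if not ref_words:
        return 1.0
    return editdistance.eval(ref_words, hyp_words) / len(ref_words)

# One substitution ("a" for "the") against a six-word reference -> WER of about 0.167
print(simple_wer("the cat sat on the mat", "the cat sat on a mat"))
```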