finalize gui
README.md CHANGED

@@ -32,16 +32,16 @@ The leaderboard shows WER metrics for multiple speech recognition sources as columns
 The leaderboard displays three baseline approaches:
 
 1. **No LM Baseline**: Uses the 1-best ASR output without any correction (input1)
-2. **N-
-3. **
+2. **N-gram Ranking**: Ranks the N-best hypotheses using a simple n-gram statistics approach and chooses the best one
+3. **Subwords Voting Correction**: Uses a voting-based method to correct the transcript by combining information from all N-best hypotheses
 
 ## Metrics
 
 The leaderboard displays as rows:
 - **Number of Examples**: Count of examples in the test set for each source
 - **Word Error Rate (No LM)**: WER between reference and 1-best ASR output
-- **Word Error Rate (N-
-- **Word Error Rate (
+- **Word Error Rate (N-gram Ranking)**: WER between reference and n-gram ranked best hypothesis
+- **Word Error Rate (Subwords Voting Correction)**: WER between reference and the voting-corrected N-best hypothesis
 
 Lower WER values indicate better transcription accuracy.
 
@@ -56,15 +56,15 @@ Each cell shows the corresponding metric for that specific data source. The OVERALL
 
 ## Technical Details
 
-### N-
+### N-gram Ranking
 This method scores each hypothesis in the N-best list using:
-- N-gram statistics (
+- N-gram statistics (4-grams)
 - Text length
 - N-gram variety
 
 The hypothesis with the highest score is selected.
 
-### 
+### Subwords Voting Correction
 This method uses a simple voting mechanism:
 - Groups hypotheses of the same length
 - For each word position, chooses the most common word across all hypotheses
app.py CHANGED

@@ -1,514 +1,305 @@
 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
-import jiwer
 import numpy as np
 from functools import lru_cache
-import traceback
 import re
-import string
 from collections import Counter
+import editdistance
 
 # Cache the dataset loading to avoid reloading on refresh
 @lru_cache(maxsize=1)
 def load_data():
     try:
-        # Load only the test dataset by specifying the split
         dataset = load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction", split="test")
         return dataset
-    except Exception
-
-
-
-        dataset = load_dataset("parquet",
-            data_files="https://huggingface.co/datasets/GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction/resolve/main/data/test-00000-of-00001.parquet")
-        return dataset
-    except Exception as e2:
-        print(f"Error loading with explicit path: {str(e2)}")
-        raise
+    except Exception:
+        # Fallback to explicit file path if default loading fails
+        return load_dataset("parquet",
+            data_files="https://huggingface.co/datasets/GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction/resolve/main/data/test-00000-of-00001.parquet")
 
 # Preprocess text for better WER calculation
 def preprocess_text(text):
     if not text or not isinstance(text, str):
         return ""
-    # Convert to lowercase
     text = text.lower()
-    # Remove punctuation
     text = re.sub(r'[^\w\s]', '', text)
-    # Remove extra whitespace
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-#
+# N-gram scoring for hypothesis ranking
def score_hypothesis(hypothesis, n=4):
-    """Score a hypothesis using simple n-gram statistics"""
     if not hypothesis:
         return 0
 
     words = hypothesis.split()
     if len(words) < n:
         return len(words)
 
-    # Count n-grams
     ngrams = []
     for i in range(len(words) - n + 1):
         ngram = ' '.join(words[i:i+n])
         ngrams.append(ngram)
 
-    # More unique n-grams might indicate better fluency
     unique_ngrams = len(set(ngrams))
     total_ngrams = len(ngrams)
-
-    # Score is a combination of length and n-gram variety
     score = len(words) + unique_ngrams/max(1, total_ngrams) * 5
     return score
 
-# N-
+# N-gram ranking approach
 def get_best_hypothesis_lm(hypotheses):
-    """Choose the best hypothesis using a simple language model approach"""
     if not hypotheses:
         return ""
 
-    # Convert to list if it's not already
     if isinstance(hypotheses, str):
         return hypotheses
 
-
-    hypothesis_list = []
-    for h in hypotheses:
-        if isinstance(h, str):
-            hypothesis_list.append(preprocess_text(h))
+    hypothesis_list = [preprocess_text(h) for h in hypotheses if isinstance(h, str)]
 
     if not hypothesis_list:
         return ""
 
-    # Score each hypothesis and choose the best one
     scores = [(score_hypothesis(h), h) for h in hypothesis_list]
     best_hypothesis = max(scores, key=lambda x: x[0])[1]
     return best_hypothesis
 
-#
+# Subwords voting correction approach
 def correct_hypotheses(hypotheses):
-    """Simple n-best correction by voting on words"""
     if not hypotheses:
         return ""
 
-    # Convert to list if it's not already
     if isinstance(hypotheses, str):
         return hypotheses
 
-
-    hypothesis_list = []
-    for h in hypotheses:
-        if isinstance(h, str):
-            hypothesis_list.append(preprocess_text(h))
+    hypothesis_list = [preprocess_text(h) for h in hypotheses if isinstance(h, str)]
 
     if not hypothesis_list:
         return ""
 
-    # Split hypotheses into words
     word_lists = [h.split() for h in hypothesis_list]
-
-    # Find the most common length
     lengths = [len(words) for words in word_lists]
+
     if not lengths:
         return ""
 
     most_common_length = Counter(lengths).most_common(1)[0][0]
-
-    # Only consider hypotheses with the most common length
     filtered_word_lists = [words for words in word_lists if len(words) == most_common_length]
 
     if not filtered_word_lists:
-        # Fall back to the longest hypothesis if filtering removed everything
         return max(hypothesis_list, key=len)
 
-    # Vote on each word position
     corrected_words = []
     for i in range(most_common_length):
         position_words = [words[i] for words in filtered_word_lists]
         most_common_word = Counter(position_words).most_common(1)[0][0]
         corrected_words.append(most_common_word)
 
-    # Join the corrected words
     return ' '.join(corrected_words)
 
-#
+# Calculate WER
 def calculate_simple_wer(reference, hypothesis):
-    """Calculate WER using a simple word-based approach"""
     if not reference or not hypothesis:
         return 1.0
 
-    # Split into words
     ref_words = reference.split()
     hyp_words = hypothesis.split()
 
-
-    try:
-        import editdistance
-        distance = editdistance.eval(ref_words, hyp_words)
-    except ImportError:
-        # Fallback to simple jiwer calculation
-        try:
-            # Try using the standard jiwer implementation
-            wer_value = jiwer.wer(reference, hypothesis)
-            return wer_value
-        except Exception:
-            # If all else fails, return 1.0 (maximum error)
-            print("Error calculating WER - fallback to maximum error")
-            return 1.0
+    distance = editdistance.eval(ref_words, hyp_words)
 
-    # WER calculation
     if len(ref_words) == 0:
         return 1.0
     return float(distance) / float(len(ref_words))
 
 # Calculate WER for a group of examples with multiple methods
-def calculate_wer_methods(examples):
-    if not examples:
-        return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Try different possible field names
-    possible_reference_fields = ["transcription", "reference", "ground_truth", "target"]
-    possible_hypothesis_fields = ["input1", "hypothesis", "asr_output", "source_text"]
-
-    for field in possible_reference_fields:
-        if field in example:
-            print(f"Reference field '{field}' found with value: {str(example[field])[:100]}...")
-
-    for field in possible_hypothesis_fields:
-        if field in example:
-            print(f"Hypothesis field '{field}' found with value: {str(example[field])[:100]}...")
-
-    # Process each example in the dataset
-    wer_values_no_lm = []
-    wer_values_lm_ranking = []
-    wer_values_n_best_correction = []
-
-    valid_count = 0
-    skipped_count = 0
-
-
-
-
-        # Limit to first 200 examples for efficiency
-        items_to_process = examples.select(range(min(200, len(examples))))
-    else:
-        items_to_process = examples[:200]  # First 200 examples
-
-
-
-
-
-
-
-
-
-            # Process the reference
-            reference = preprocess_text(transcription)
-            if not reference:
-                skipped_count += 1
-                continue
-
-            # Get 1-best hypothesis for baseline
-            input1 = ex.get("input1")
-            if input1 is None and "hypothesis" in ex and ex["hypothesis"]:
-                if isinstance(ex["hypothesis"], list) and len(ex["hypothesis"]) > 0:
-                    input1 = ex["hypothesis"][0]
-                elif isinstance(ex["hypothesis"], str):
-                    input1 = ex["hypothesis"]
-
-            # Get n-best hypotheses for other methods
-            n_best_hypotheses = ex.get("hypothesis", [])
-
-            # Process and evaluate all methods
-
-            # Method 1: No LM (1-best ASR output)
-            if input1 and isinstance(input1, str):
-                no_lm_hyp = preprocess_text(input1)
-                if no_lm_hyp:
-                    wer_no_lm = calculate_simple_wer(reference, no_lm_hyp)
-                    wer_values_no_lm.append(wer_no_lm)
-
-            # Method 2: LM ranking (best of n-best)
-            if n_best_hypotheses:
-                lm_best_hyp = get_best_hypothesis_lm(n_best_hypotheses)
-                if lm_best_hyp:
-                    wer_lm = calculate_simple_wer(reference, lm_best_hyp)
-                    wer_values_lm_ranking.append(wer_lm)
-
-            # Method 3: N-best correction (voting among n-best)
-            if n_best_hypotheses:
-                corrected_hyp = correct_hypotheses(n_best_hypotheses)
-                if corrected_hyp:
-                    wer_corrected = calculate_simple_wer(reference, corrected_hyp)
-                    wer_values_n_best_correction.append(wer_corrected)
-
-            # Count as valid if at least one method worked
-            if (wer_values_no_lm and i == len(wer_values_no_lm) - 1) or \
-               (wer_values_lm_ranking and i == len(wer_values_lm_ranking) - 1) or \
-               (wer_values_n_best_correction and i == len(wer_values_n_best_correction) - 1):
-                valid_count += 1
-            else:
-                skipped_count += 1
-
-            # Print debug info for a few examples
-            if i < 2:
-                print(f"\nExample {i} inspection:")
-                print(f"  Reference: '{reference}'")
-
-                if input1 and isinstance(input1, str):
-                    no_lm_hyp = preprocess_text(input1)
-                    print(f"  No LM (1-best): '{no_lm_hyp}'")
-                    if no_lm_hyp:
-                        wer = calculate_simple_wer(reference, no_lm_hyp)
-                        print(f"  No LM WER: {wer:.4f}")
-
-                if n_best_hypotheses:
-                    print(f"  N-best count: {len(n_best_hypotheses) if isinstance(n_best_hypotheses, list) else 'not a list'}")
-                    lm_best_hyp = get_best_hypothesis_lm(n_best_hypotheses)
-                    print(f"  LM ranking best: '{lm_best_hyp}'")
-                    if lm_best_hyp:
-                        wer = calculate_simple_wer(reference, lm_best_hyp)
-                        print(f"  LM ranking WER: {wer:.4f}")
-
-                    corrected_hyp = correct_hypotheses(n_best_hypotheses)
-                    print(f"  N-best correction: '{corrected_hyp}'")
-                    if corrected_hyp:
-                        wer = calculate_simple_wer(reference, corrected_hyp)
-                        print(f"  N-best correction WER: {wer:.4f}")
-
-        except Exception as ex_error:
-            print(f"Error processing example {i}: {str(ex_error)}")
-            skipped_count += 1
-            continue
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def calculate_wer_methods(examples, max_samples=200):
+    if not examples or len(examples) == 0:
+        return np.nan, np.nan, np.nan
+
+    # Limit sample size for efficiency
+    if hasattr(examples, 'select'):
+        items_to_process = examples.select(range(min(max_samples, len(examples))))
+    else:
+        items_to_process = examples[:max_samples]
+
+    wer_values_no_lm = []
+    wer_values_lm_ranking = []
+    wer_values_n_best_correction = []
+
+    for ex in items_to_process:
+        # Get reference transcription
+        transcription = ex.get("transcription")
+        if not transcription or not isinstance(transcription, str):
+            continue
+
+        reference = preprocess_text(transcription)
+        if not reference:
+            continue
+
+        # Get 1-best hypothesis for baseline
+        input1 = ex.get("input1")
+        if input1 is None and "hypothesis" in ex and ex["hypothesis"]:
+            if isinstance(ex["hypothesis"], list) and len(ex["hypothesis"]) > 0:
+                input1 = ex["hypothesis"][0]
+            elif isinstance(ex["hypothesis"], str):
+                input1 = ex["hypothesis"]
+
+        # Get n-best hypotheses for other methods
+        n_best_hypotheses = ex.get("hypothesis", [])
+
+        # Method 1: No LM (1-best ASR output)
+        if input1 and isinstance(input1, str):
+            no_lm_hyp = preprocess_text(input1)
+            if no_lm_hyp:
+                wer_no_lm = calculate_simple_wer(reference, no_lm_hyp)
+                wer_values_no_lm.append(wer_no_lm)
+
+        # Method 2: N-gram ranking
+        if n_best_hypotheses:
+            lm_best_hyp = get_best_hypothesis_lm(n_best_hypotheses)
+            if lm_best_hyp:
+                wer_lm = calculate_simple_wer(reference, lm_best_hyp)
+                wer_values_lm_ranking.append(wer_lm)
+
+        # Method 3: Subwords voting correction
+        if n_best_hypotheses:
+            corrected_hyp = correct_hypotheses(n_best_hypotheses)
+            if corrected_hyp:
+                wer_corrected = calculate_simple_wer(reference, corrected_hyp)
+                wer_values_n_best_correction.append(wer_corrected)
+
+    # Calculate average WER for each method
+    no_lm_wer = np.mean(wer_values_no_lm) if wer_values_no_lm else np.nan
+    lm_ranking_wer = np.mean(wer_values_lm_ranking) if wer_values_lm_ranking else np.nan
+    n_best_correction_wer = np.mean(wer_values_n_best_correction) if wer_values_n_best_correction else np.nan
+
+    return no_lm_wer, lm_ranking_wer, n_best_correction_wer
 
 # Get WER metrics by source
 def get_wer_metrics(dataset):
-
-
-
-
-
-
-
-
-
-    # Process all examples
-    for i, ex in enumerate(dataset):
-        try:
-            source = ex.get("source", "unknown")
-            # Skip all_et05_real as requested
-            if source == "all_et05_real":
-                continue
-
-            if source not in examples_by_source:
-                examples_by_source[source] = []
-            examples_by_source[source].append(ex)
-        except Exception as e:
-            print(f"Error processing example {i}: {str(e)}")
-            continue
-
-    # Get all unique sources
-    all_sources = sorted(examples_by_source.keys())
-    print(f"Found sources: {all_sources}")
-
-    # Calculate metrics for each source
-    source_results = {}
-    for source in all_sources:
-        try:
-            examples = examples_by_source.get(source, [])
-            count = len(examples)
-
-            if count > 0:
-                print(f"\nCalculating WER for source {source} with {count} examples")
-                no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(examples)
-            else:
-                no_lm_wer, lm_ranking_wer, n_best_wer = np.nan, np.nan, np.nan
-
-            source_results[source] = {
-                "Count": count,
-                "No LM Baseline": no_lm_wer,
-                "N-best LM Ranking": lm_ranking_wer,
-                "N-best Correction": n_best_wer
-            }
-        except Exception as e:
-            print(f"Error processing source {source}: {str(e)}")
-            source_results[source] = {
-                "Count": 0,
-                "No LM Baseline": np.nan,
-                "N-best LM Ranking": np.nan,
-                "N-best Correction": np.nan
-            }
-
-    # Calculate overall metrics with a sample but excluding all_et05_real
-    try:
-        # Create a filtered dataset without all_et05_real
-        filtered_dataset = [ex for ex in dataset if ex.get("source") != "all_et05_real"]
-        total_count = len(filtered_dataset)
-        print(f"\nCalculating overall WER with a sample of examples (excluding all_et05_real)")
-
-        # Sample for calculation
-        sample_size = min(500, total_count)
-        sample_dataset = filtered_dataset[:sample_size]
-        no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(sample_dataset)
-
-
-
-
-
-
-
-
-
-
-
-
-
-            "N-best LM Ranking": np.nan,
-            "N-best Correction": np.nan
-        }
-
-    # Create flat DataFrame with labels in the first column
-    rows = []
-
-    # First add row for number of examples
-    example_row = {"Metric": "Number of Examples"}
-    for source in all_sources + ["OVERALL"]:
-        example_row[source] = source_results[source]["Count"]
-    rows.append(example_row)
-
-    # Then add rows for each WER method
-    no_lm_row = {"Metric": "Word Error Rate (No LM)"}
-    lm_ranking_row = {"Metric": "Word Error Rate (N-best LM Ranking)"}
-    n_best_row = {"Metric": "Word Error Rate (N-best Correction)"}
-
-    for source in all_sources + ["OVERALL"]:
-        no_lm_row[source] = source_results[source]["No LM Baseline"]
-        lm_ranking_row[source] = source_results[source]["N-best LM Ranking"]
-        n_best_row[source] = source_results[source]["N-best Correction"]
-
-    rows.append(no_lm_row)
-    rows.append(lm_ranking_row)
-    rows.append(n_best_row)
-
-
-
-
-
-
-
-
-
-
+    # Group examples by source
+    examples_by_source = {}
+
+    for ex in dataset:
+        source = ex.get("source", "unknown")
+        # Skip all_et05_real as requested
+        if source == "all_et05_real":
+            continue
+
+        if source not in examples_by_source:
+            examples_by_source[source] = []
+        examples_by_source[source].append(ex)
+
+    # Get all unique sources
+    all_sources = sorted(examples_by_source.keys())
+
+    # Calculate metrics for each source
+    source_results = {}
+    for source in all_sources:
+        examples = examples_by_source.get(source, [])
+        count = len(examples)
+
+        if count > 0:
+            no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(examples)
+        else:
+            no_lm_wer, lm_ranking_wer, n_best_wer = np.nan, np.nan, np.nan
+
+        source_results[source] = {
+            "Count": count,
+            "No LM Baseline": no_lm_wer,
+            "N-best LM Ranking": lm_ranking_wer,
+            "N-best Correction": n_best_wer
+        }
+
+    # Calculate overall metrics
+    filtered_dataset = [ex for ex in dataset if ex.get("source") != "all_et05_real"]
+    total_count = len(filtered_dataset)
+
+    sample_size = min(500, total_count)
+    sample_dataset = filtered_dataset[:sample_size]
+    no_lm_wer, lm_ranking_wer, n_best_wer = calculate_wer_methods(sample_dataset)
+
+    source_results["OVERALL"] = {
+        "Count": total_count,
+        "No LM Baseline": no_lm_wer,
+        "N-best LM Ranking": lm_ranking_wer,
+        "N-best Correction": n_best_wer
+    }
+
+    # Create flat DataFrame with labels in the first column
+    rows = []
+
+    # First add row for number of examples
+    example_row = {"Metric": "Number of Examples"}
+    for source in all_sources + ["OVERALL"]:
+        example_row[source] = source_results[source]["Count"]
+    rows.append(example_row)
+
+    # Then add rows for each WER method
+    no_lm_row = {"Metric": "Word Error Rate (No LM)"}
+    lm_ranking_row = {"Metric": "Word Error Rate (N-gram Ranking)"}
+    n_best_row = {"Metric": "Word Error Rate (Subwords Voting Correction)"}
+
+    for source in all_sources + ["OVERALL"]:
+        no_lm_row[source] = source_results[source]["No LM Baseline"]
+        lm_ranking_row[source] = source_results[source]["N-best LM Ranking"]
+        n_best_row[source] = source_results[source]["N-best Correction"]
+
+    rows.append(no_lm_row)
+    rows.append(lm_ranking_row)
+    rows.append(n_best_row)
+
+    # Create DataFrame from rows
+    result_df = pd.DataFrame(rows)
+
+    return result_df
 
 # Format the dataframe for display
 def format_dataframe(df):
-
-
-
-
-
-
-
-        if "WER" in metric or "Error Rate" in metric:
-            wer_row_indices.append(i)
-
-    # Format WER values
-    for idx in wer_row_indices:
-        for col in df.columns:
-            if col != "Metric": # Skip the metric column
-                value = df.loc[idx, col]
-                if pd.notna(value):
-                    df.loc[idx, col] = f"{value:.4f}"
-                else:
-                    df.loc[idx, col] = "N/A"
-
-    return df
-
-
-
-
+    df = df.copy()
+
+    # Find the rows containing WER values
+    wer_row_indices = []
+    for i, metric in enumerate(df["Metric"]):
+        if "WER" in metric or "Error Rate" in metric:
+            wer_row_indices.append(i)
+
+    # Format WER values
+    for idx in wer_row_indices:
+        for col in df.columns:
+            if col != "Metric":
+                value = df.loc[idx, col]
+                if pd.notna(value):
+                    df.loc[idx, col] = f"{value:.4f}"
+                else:
+                    df.loc[idx, col] = "N/A"
+
+    return df
 
 # Main function to create the leaderboard
 def create_leaderboard():
-
-
-
-        return format_dataframe(metrics_df)
-    except Exception as e:
-        error_msg = f"Error creating leaderboard: {str(e)}\n{traceback.format_exc()}"
-        print(error_msg)
-        return pd.DataFrame([{"Error": error_msg}])
+    dataset = load_data()
+    metrics_df = get_wer_metrics(dataset)
+    return format_dataframe(metrics_df)
 
 # Create the Gradio interface
-with gr.Blocks(title="ASR Text Correction
+with gr.Blocks(title="ASR Text Correction Leaderboard") as demo:
     gr.Markdown("# ASR Text Correction Baseline WER Leaderboard (Test Data)")
     gr.Markdown("Word Error Rate (WER) metrics for different speech sources with multiple correction approaches")
 
     with gr.Row():
         refresh_btn = gr.Button("Refresh Leaderboard")
 
-    with gr.Row():
-        error_output = gr.Textbox(label="Debug Information", visible=True, lines=10)
-
     with gr.Row():
         try:
             initial_df = create_leaderboard()
             leaderboard = gr.DataFrame(initial_df)
-        except Exception
-
-            print(error_msg)
-            error_output.update(value=error_msg)
-            leaderboard = gr.DataFrame(pd.DataFrame([{"Error": error_msg}]))
+        except Exception:
+            leaderboard = gr.DataFrame(pd.DataFrame([{"Error": "Error initializing leaderboard"}]))
 
     def refresh_and_report():
-
-
-
-            return df, debug_info
-        except Exception as e:
-            error_msg = f"Error refreshing leaderboard: {str(e)}\n{traceback.format_exc()}"
-            print(error_msg)
-            return pd.DataFrame([{"Error": error_msg}]), error_msg
-
-    refresh_btn.click(refresh_and_report, outputs=[leaderboard, error_output])
+        return create_leaderboard()
+
+    refresh_btn.click(refresh_and_report, outputs=[leaderboard])
 
 if __name__ == "__main__":
     demo.launch()
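A quick, hypothetical sanity check of the three baselines against the helpers defined in `app.py` (toy N-best list; note that importing `app` builds the Gradio interface and triggers the dataset load at import time):

```python
# Toy N-best list (hypothetical data); the helpers are defined in app.py above.
from app import preprocess_text, get_best_hypothesis_lm, correct_hypotheses, calculate_simple_wer

nbest = [
    "The cat sad on the mat.",  # treated as the 1-best (input1) hypothesis
    "The cat sat on the mat.",
    "The cat sat on a mat.",
]
reference = preprocess_text("the cat sat on the mat")

print(calculate_simple_wer(reference, preprocess_text(nbest[0])))      # No LM baseline
print(calculate_simple_wer(reference, get_best_hypothesis_lm(nbest)))  # N-gram ranking
print(calculate_simple_wer(reference, correct_hypotheses(nbest)))      # Subwords voting correction
```

In this toy case the n-gram scores tie, so the ranking method falls back to the first hypothesis (WER 0.1667), while the position-wise voting recovers the reference exactly (WER 0.0).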