Spaces:

GenSEC-LLM
/

Post-ASR-LLM-Transcription-Correction

Running

App Files Files Community

huckiyang committed on Mar 14

Commit

c7f8633

1 Parent(s): 92a4ace

Optimize the data loading

Browse files

Files changed (2) hide show

README.md +30 -0
app.py +46 -30

README.md CHANGED Viewed

@@ -11,4 +11,34 @@ license: mit
 short_description: Generative Error Correction (GER) Task Baseline, WER
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Generative Error Correction (GER) Task Baseline, WER
 ---
+# Post-ASR Text Correction WER Leaderboard
+This application displays a baseline Word Error Rate (WER) leaderboard for the test data in the [GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction](https://huggingface.co/datasets/GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction) dataset.
+## Dataset Sources
+The leaderboard shows WER metrics for multiple speech recognition sources as columns:
+- CHiME4
+- CORAAL
+- CommonVoice
+- LRS2
+- LibriSpeech (Clean and Other)
+- SwitchBoard
+- Tedlium-3
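+- OVERALL (all included sources combined; the count covers every included example, while the WER is computed on a sample of up to 500 examples)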
+## Metrics
+The leaderboard displays the following metrics as rows:
+- **Count**: Number of examples in the test set for each source
+- **No LM Baseline**: Word Error Rate between the reference transcription and 1-best ASR output without language model correction
+## Baseline Calculation
+Word Error Rate is calculated between:
+- Reference transcription ("transcription" field)
+- 1-best ASR output (the "input1" field, or the first item of "hypothesis" when "input1" is unavailable)
+Lower WER values indicate better transcription accuracy.
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -194,6 +194,10 @@ def get_wer_metrics(dataset):
         for i, ex in enumerate(dataset):
             try:
                 source = ex.get("source", "unknown")
                 if source not in examples_by_source:
                     examples_by_source[source] = []
                 examples_by_source[source].append(ex)
@@ -206,7 +210,7 @@ def get_wer_metrics(dataset):
         print(f"Found sources: {all_sources}")
         # Calculate metrics for each source
-        results = []
         for source in all_sources:
             try:
                 examples = examples_by_source.get(source, [])
@@ -218,43 +222,50 @@ def get_wer_metrics(dataset):
                 else:
                     wer = np.nan
-                results.append({
-                    "Source": source,
                     "Count": count,
-                    "WER": wer
-                })
             except Exception as e:
                 print(f"Error processing source {source}: {str(e)}")
-                results.append({
-                    "Source": source,
                     "Count": 0,
-                    "WER": np.nan
-                })
-        # Calculate overall metrics with a sample
         try:
-            total_count = len(dataset)
-            print(f"\nCalculating overall WER with a sample of examples")
             # Sample for calculation
             sample_size = min(500, total_count)
-            sample_dataset = dataset.select(range(sample_size))
             overall_wer = calculate_wer(sample_dataset)
-            results.append({
-                "Source": "OVERALL",
                 "Count": total_count,
-                "WER": overall_wer
-            })
         except Exception as e:
             print(f"Error calculating overall metrics: {str(e)}")
             print(traceback.format_exc())
-            results.append({
-                "Source": "OVERALL",
-                "Count": len(dataset),
-                "WER": np.nan
-            })
-        return pd.DataFrame(results)
     except Exception as e:
         print(f"Error in get_wer_metrics: {str(e)}")
@@ -267,12 +278,17 @@ def format_dataframe(df):
         # Use vectorized operations instead of apply
         df = df.copy()
-        if "WER" in df.columns:
-            # Convert to string type first to avoid warning
-            df["WER"] = df["WER"].astype(object)
-            mask = df["WER"].notna()
-            df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
-            df.loc[~mask, "WER"] = "N/A"
         return df
@@ -295,7 +311,7 @@ def create_leaderboard():
 # Create the Gradio interface
 with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
     gr.Markdown("# ASR Text Correction Baseline WER Leaderboard (Test Data)")
-    gr.Markdown("Word Error Rate (WER) metrics for test data in GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction dataset")
     with gr.Row():
         refresh_btn = gr.Button("Refresh Leaderboard")

         for i, ex in enumerate(dataset):
             try:
                 source = ex.get("source", "unknown")
+                # Skip all_et05_real as requested
+                if source == "all_et05_real":
+                    continue
                 if source not in examples_by_source:
                     examples_by_source[source] = []
                 examples_by_source[source].append(ex)
         print(f"Found sources: {all_sources}")
         # Calculate metrics for each source
+        source_results = {}
         for source in all_sources:
             try:
                 examples = examples_by_source.get(source, [])
                 else:
                     wer = np.nan
+                source_results[source] = {
                     "Count": count,
+                    "No LM Baseline": wer
+                }
             except Exception as e:
                 print(f"Error processing source {source}: {str(e)}")
+                source_results[source] = {
                     "Count": 0,
+                    "No LM Baseline": np.nan
+                }
+        # Calculate overall metrics with a sample but excluding all_et05_real
         try:
+            # Create a filtered dataset without all_et05_real
+            filtered_dataset = [ex for ex in dataset if ex.get("source") != "all_et05_real"]
+            total_count = len(filtered_dataset)
+            print(f"\nCalculating overall WER with a sample of examples (excluding all_et05_real)")
             # Sample for calculation
             sample_size = min(500, total_count)
+            sample_dataset = filtered_dataset[:sample_size]
             overall_wer = calculate_wer(sample_dataset)
+            source_results["OVERALL"] = {
                 "Count": total_count,
+                "No LM Baseline": overall_wer
+            }
         except Exception as e:
             print(f"Error calculating overall metrics: {str(e)}")
             print(traceback.format_exc())
+            source_results["OVERALL"] = {
+                "Count": len(filtered_dataset),
+                "No LM Baseline": np.nan
+            }
+        # Create a transposed DataFrame with metrics as rows and sources as columns
+        metrics = ["Count", "No LM Baseline"]
+        result_df = pd.DataFrame(index=metrics, columns=all_sources + ["OVERALL"])
+        for source in all_sources + ["OVERALL"]:
+            for metric in metrics:
+                result_df.loc[metric, source] = source_results[source][metric]
+        return result_df
     except Exception as e:
         print(f"Error in get_wer_metrics: {str(e)}")
         # Use vectorized operations instead of apply
         df = df.copy()
+        # Format WER values
+        if "No LM Baseline" in df.index:
+            # Convert to object type first to avoid warnings
+            df.loc["No LM Baseline"] = df.loc["No LM Baseline"].astype(object)
+            for col in df.columns:
+                value = df.loc["No LM Baseline", col]
+                if pd.notna(value):
+                    df.loc["No LM Baseline", col] = f"{value:.4f}"
+                else:
+                    df.loc["No LM Baseline", col] = "N/A"
         return df
 # Create the Gradio interface
 with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
     gr.Markdown("# ASR Text Correction Baseline WER Leaderboard (Test Data)")
+    gr.Markdown("Word Error Rate (WER) metrics for different speech sources with No Language Model baseline")
     with gr.Row():
         refresh_btn = gr.Button("Refresh Leaderboard")