huckiyang committed
Commit 3c6aeb7 · 1 parent: fbba242

Optimize the data loading

Files changed (1): app.py (+48, -34)
app.py CHANGED
@@ -3,57 +3,64 @@ import pandas as pd
 from datasets import load_dataset
 import jiwer
 import numpy as np
+from functools import lru_cache
 
-# Load the dataset
+# Cache the dataset loading to avoid reloading on refresh
+@lru_cache(maxsize=1)
 def load_data():
-    dataset = load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction")
-    return dataset
+    return load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction")
 
 # Calculate WER for a group of examples
 def calculate_wer(examples):
     if not examples:
         return 0.0
 
-    valid_pairs = []
-    for ex in examples:
-        # Get transcription and input1 fields
-        transcription = ex.get("transcription")
-        input1 = ex.get("input1")
-
-        # Only include examples where both fields exist and are not empty
-        if transcription and input1:
-            valid_pairs.append((transcription.strip(), input1.strip()))
-
-    # If no valid pairs were found, return NaN
+    # Filter valid examples in a single pass
+    valid_pairs = [(ex.get("transcription", "").strip(), ex.get("input1", "").strip())
+                   for ex in examples
+                   if ex.get("transcription") and ex.get("input1")]
+
     if not valid_pairs:
         return np.nan
 
-    # Separate references and hypotheses
-    references = [pair[0] for pair in valid_pairs]
-    hypotheses = [pair[1] for pair in valid_pairs]
+    # Unzip the pairs in one operation
+    references, hypotheses = zip(*valid_pairs) if valid_pairs else ([], [])
 
     # Calculate WER
-    wer = jiwer.wer(references, hypotheses)
-    return wer
+    return jiwer.wer(references, hypotheses)
 
 # Get WER metrics by source and split
 def get_wer_metrics(dataset):
-    results = []
-
-    # Get unique sources
-    train_sources = set([ex["source"] for ex in dataset["train"]])
-    test_sources = set([ex["source"] for ex in dataset["test"]])
-    all_sources = sorted(list(train_sources.union(test_sources)))
+    # Pre-process the data to avoid repeated filtering
+    train_by_source = {}
+    test_by_source = {}
+
+    # Group examples by source in a single pass for each split
+    for ex in dataset["train"]:
+        source = ex["source"]
+        if source not in train_by_source:
+            train_by_source[source] = []
+        train_by_source[source].append(ex)
 
-    # Calculate WER for each source in train split
+    for ex in dataset["test"]:
+        source = ex["source"]
+        if source not in test_by_source:
+            test_by_source[source] = []
+        test_by_source[source].append(ex)
+
+    # Get all unique sources
+    all_sources = sorted(set(train_by_source.keys()) | set(test_by_source.keys()))
+
+    # Calculate metrics for each source
+    results = []
     for source in all_sources:
-        train_examples = [ex for ex in dataset["train"] if ex["source"] == source]
-        train_count = len(train_examples)
-        train_wer = calculate_wer(train_examples) if train_count > 0 else np.nan
+        train_examples = train_by_source.get(source, [])
+        test_examples = test_by_source.get(source, [])
 
-        test_examples = [ex for ex in dataset["test"] if ex["source"] == source]
+        train_count = len(train_examples)
         test_count = len(test_examples)
+
+        train_wer = calculate_wer(train_examples) if train_count > 0 else np.nan
         test_wer = calculate_wer(test_examples) if test_count > 0 else np.nan
@@ -64,7 +71,7 @@ def get_wer_metrics(dataset):
             "Test WER": test_wer
         })
 
-    # Add overall metrics
+    # Calculate overall metrics once
     train_wer = calculate_wer(dataset["train"])
     test_wer = calculate_wer(dataset["test"])
 
@@ -80,8 +87,16 @@ def get_wer_metrics(dataset):
 
 # Format the dataframe for display
 def format_dataframe(df):
-    df["Train WER"] = df["Train WER"].apply(lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A")
-    df["Test WER"] = df["Test WER"].apply(lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A")
+    # Use vectorized operations instead of apply
+    df = df.copy()
+    mask = df["Train WER"].notna()
+    df.loc[mask, "Train WER"] = df.loc[mask, "Train WER"].map(lambda x: f"{x:.4f}")
+    df.loc[~mask, "Train WER"] = "N/A"
+
+    mask = df["Test WER"].notna()
+    df.loc[mask, "Test WER"] = df.loc[mask, "Test WER"].map(lambda x: f"{x:.4f}")
+    df.loc[~mask, "Test WER"] = "N/A"
+
     return df
 
 # Main function to create the leaderboard
@@ -89,8 +104,7 @@ def create_leaderboard():
     try:
         dataset = load_data()
         metrics_df = get_wer_metrics(dataset)
-        formatted_df = format_dataframe(metrics_df)
-        return formatted_df
+        return format_dataframe(metrics_df)
     except Exception as e:
         return pd.DataFrame({"Error": [str(e)]})
 
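
A note on the loading change, for reviewers: functools.lru_cache on a zero-argument function memoizes its single return value, so the dataset is fetched once per process and every later call to load_data() returns the cached DatasetDict. The grouping loops could equally be written with collections.defaultdict; a minimal sketch of the same single-pass pattern (an equivalent alternative, not what this commit ships):

    from collections import defaultdict

    def group_by_source(split):
        # One pass over the split: source name -> list of examples
        groups = defaultdict(list)
        for ex in split:
            groups[ex["source"]].append(ex)
        return dict(groups)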
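
Also worth noting when reading the WER columns: jiwer.wer called with parallel lists computes a corpus-level rate (total edit operations over total reference words), not a mean of per-utterance WERs. A toy example with made-up strings:

    import jiwer

    refs = ["the cat sat", "hello world"]  # reference transcriptions
    hyps = ["the cat sat", "hello word"]   # ASR hypotheses
    # One substitution over five reference words -> 0.2
    print(jiwer.wer(refs, hyps))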