Spaces:

GenSEC-LLM
/

Post-ASR-LLM-Transcription-Correction

Running

App Files Files Community

huckiyang committed on Mar 14

Commit

4e73867

1 Parent(s): 7ec068d

optz the data loading

Browse files

Files changed (1) hide show

app.py +70 -64

app.py CHANGED Viewed

@@ -9,7 +9,20 @@ import traceback
 # Cache the dataset loading to avoid reloading on refresh
 @lru_cache(maxsize=1)
 def load_data():
-    return load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction")
 # Calculate WER for a group of examples
 def calculate_wer(examples):
@@ -21,11 +34,15 @@ def calculate_wer(examples):
         valid_pairs = []
         for ex in examples:
             try:
                 transcription = ex.get("transcription", "")
                 input1 = ex.get("input1", "")
-                # Only add valid pairs
-                if transcription and input1:
                     # Limit text length to avoid potential issues
                     transcription = transcription.strip()[:1000]  # Limit to 1000 chars
                     input1 = input1.strip()[:1000]
@@ -36,100 +53,93 @@ def calculate_wer(examples):
                 continue
         if not valid_pairs:
             return np.nan
         # Unzip the pairs in one operation
         references, hypotheses = zip(*valid_pairs) if valid_pairs else ([], [])
         # Calculate WER
-        return jiwer.wer(references, hypotheses)
     except Exception as e:
         print(f"Error in calculate_wer: {str(e)}")
         print(traceback.format_exc())
         return np.nan
-# Get WER metrics by source and split
 def get_wer_metrics(dataset):
     try:
-        # Pre-process the data to avoid repeated filtering
-        train_by_source = {}
-        test_by_source = {}
-        # Group examples by source in a single pass for each split
-        for ex in dataset["train"]:
             try:
                 source = ex.get("source", "unknown")
-                if source not in train_by_source:
-                    train_by_source[source] = []
-                train_by_source[source].append(ex)
             except Exception as e:
-                print(f"Error processing train example: {str(e)}")
-                continue
-        for ex in dataset["test"]:
-            try:
-                source = ex.get("source", "unknown")
-                if source not in test_by_source:
-                    test_by_source[source] = []
-                test_by_source[source].append(ex)
-            except Exception as e:
-                print(f"Error processing test example: {str(e)}")
                 continue
         # Get all unique sources
-        all_sources = sorted(set(train_by_source.keys()) | set(test_by_source.keys()))
         # Calculate metrics for each source
         results = []
         for source in all_sources:
             try:
-                train_examples = train_by_source.get(source, [])
-                test_examples = test_by_source.get(source, [])
-                train_count = len(train_examples)
-                test_count = len(test_examples)
-                train_wer = calculate_wer(train_examples) if train_count > 0 else np.nan
-                test_wer = calculate_wer(test_examples) if test_count > 0 else np.nan
                 results.append({
                     "Source": source,
-                    "Train Count": train_count,
-                    "Train WER": train_wer,
-                    "Test Count": test_count,
-                    "Test WER": test_wer
                 })
             except Exception as e:
                 print(f"Error processing source {source}: {str(e)}")
                 results.append({
                     "Source": source,
-                    "Train Count": 0,
-                    "Train WER": np.nan,
-                    "Test Count": 0,
-                    "Test WER": np.nan
                 })
         # Calculate overall metrics once
         try:
-            train_wer = calculate_wer(dataset["train"])
-            test_wer = calculate_wer(dataset["test"])
             results.append({
                 "Source": "OVERALL",
-                "Train Count": len(dataset["train"]),
-                "Train WER": train_wer,
-                "Test Count": len(dataset["test"]),
-                "Test WER": test_wer
             })
         except Exception as e:
             print(f"Error calculating overall metrics: {str(e)}")
             results.append({
                 "Source": "OVERALL",
-                "Train Count": len(dataset["train"]),
-                "Train WER": np.nan,
-                "Test Count": len(dataset["test"]),
-                "Test WER": np.nan
             })
         return pd.DataFrame(results)
@@ -145,15 +155,10 @@ def format_dataframe(df):
         # Use vectorized operations instead of apply
         df = df.copy()
-        if "Train WER" in df.columns:
-            mask = df["Train WER"].notna()
-            df.loc[mask, "Train WER"] = df.loc[mask, "Train WER"].map(lambda x: f"{x:.4f}")
-            df.loc[~mask, "Train WER"] = "N/A"
-        if "Test WER" in df.columns:
-            mask = df["Test WER"].notna()
-            df.loc[mask, "Test WER"] = df.loc[mask, "Test WER"].map(lambda x: f"{x:.4f}")
-            df.loc[~mask, "Test WER"] = "N/A"
         return df
@@ -174,15 +179,15 @@ def create_leaderboard():
         return pd.DataFrame([{"Error": error_msg}])
 # Create the Gradio interface
-with gr.Blocks(title="ASR Text Correction Leaderboard") as demo:
-    gr.Markdown("# ASR Text Correction Baseline WER Leaderboard")
-    gr.Markdown("Word Error Rate (WER) metrics for GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction dataset")
     with gr.Row():
         refresh_btn = gr.Button("Refresh Leaderboard")
     with gr.Row():
-        error_output = gr.Textbox(label="Errors (if any)")
     with gr.Row():
         try:
@@ -197,7 +202,8 @@ with gr.Blocks(title="ASR Text Correction Leaderboard") as demo:
     def refresh_and_report():
         try:
             df = create_leaderboard()
-            return df, ""
         except Exception as e:
             error_msg = f"Error refreshing leaderboard: {str(e)}\n{traceback.format_exc()}"
             print(error_msg)

 # Cache the dataset loading to avoid reloading on refresh
 @lru_cache(maxsize=1)
 def load_data():
+    try:
+        # Load only the test dataset by specifying the split
+        dataset = load_dataset("GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction", split="test")
+        return dataset
+    except Exception as e:
+        print(f"Error loading dataset: {str(e)}")
+        # Try loading with explicit file path if the default loading fails
+        try:
+            dataset = load_dataset("parquet",
+                                  data_files="https://huggingface.co/datasets/GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction/resolve/main/data/test-00000-of-00001.parquet")
+            return dataset
+        except Exception as e2:
+            print(f"Error loading with explicit path: {str(e2)}")
+            raise
 # Calculate WER for a group of examples
 def calculate_wer(examples):
         valid_pairs = []
         for ex in examples:
             try:
+                # Print a sample example to debug
+                if len(valid_pairs) == 0:
+                    print(f"Sample example keys: {ex.keys()}")
                 transcription = ex.get("transcription", "")
                 input1 = ex.get("input1", "")
+                # Only add valid pairs with non-empty strings
+                if transcription and input1 and isinstance(transcription, str) and isinstance(input1, str):
                     # Limit text length to avoid potential issues
                     transcription = transcription.strip()[:1000]  # Limit to 1000 chars
                     input1 = input1.strip()[:1000]
                 continue
         if not valid_pairs:
+            print("No valid pairs found for WER calculation")
             return np.nan
+        # Print sample pairs for debugging
+        print(f"Sample pair for WER calculation: {valid_pairs[0]}")
+        print(f"Total valid pairs: {len(valid_pairs)}")
         # Unzip the pairs in one operation
         references, hypotheses = zip(*valid_pairs) if valid_pairs else ([], [])
         # Calculate WER
+        try:
+            wer = jiwer.wer(references, hypotheses)
+            print(f"Calculated WER: {wer}")
+            return wer
+        except Exception as wer_error:
+            print(f"Error calculating WER: {str(wer_error)}")
+            return np.nan
     except Exception as e:
         print(f"Error in calculate_wer: {str(e)}")
         print(traceback.format_exc())
         return np.nan
+# Get WER metrics by source
 def get_wer_metrics(dataset):
     try:
+        # Group examples by source
+        examples_by_source = {}
+        # Process all examples
+        for ex in dataset:
             try:
                 source = ex.get("source", "unknown")
+                if source not in examples_by_source:
+                    examples_by_source[source] = []
+                examples_by_source[source].append(ex)
             except Exception as e:
+                print(f"Error processing example: {str(e)}")
                 continue
         # Get all unique sources
+        all_sources = sorted(examples_by_source.keys())
         # Calculate metrics for each source
         results = []
         for source in all_sources:
             try:
+                examples = examples_by_source.get(source, [])
+                count = len(examples)
+                if count > 0:
+                    print(f"Calculating WER for source {source} with {count} examples")
+                    wer = calculate_wer(examples)
+                else:
+                    wer = np.nan
                 results.append({
                     "Source": source,
+                    "Count": count,
+                    "WER": wer
                 })
             except Exception as e:
                 print(f"Error processing source {source}: {str(e)}")
                 results.append({
                     "Source": source,
+                    "Count": 0,
+                    "WER": np.nan
                 })
         # Calculate overall metrics once
         try:
+            total_count = len(dataset)
+            print(f"Calculating overall WER for {total_count} examples")
+            overall_wer = calculate_wer(dataset)
             results.append({
                 "Source": "OVERALL",
+                "Count": total_count,
+                "WER": overall_wer
             })
         except Exception as e:
             print(f"Error calculating overall metrics: {str(e)}")
             results.append({
                 "Source": "OVERALL",
+                "Count": len(dataset),
+                "WER": np.nan
             })
         return pd.DataFrame(results)
         # Use vectorized operations instead of apply
         df = df.copy()
+        if "WER" in df.columns:
+            mask = df["WER"].notna()
+            df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
+            df.loc[~mask, "WER"] = "N/A"
         return df
         return pd.DataFrame([{"Error": error_msg}])
 # Create the Gradio interface
+with gr.Blocks(title="ASR Text Correction Test Leaderboard") as demo:
+    gr.Markdown("# ASR Text Correction Baseline WER Leaderboard (Test Data)")
+    gr.Markdown("Word Error Rate (WER) metrics for test data in GenSEC-LLM/SLT-Task1-Post-ASR-Text-Correction dataset")
     with gr.Row():
         refresh_btn = gr.Button("Refresh Leaderboard")
     with gr.Row():
+        error_output = gr.Textbox(label="Debug Information", visible=True)
     with gr.Row():
         try:
     def refresh_and_report():
         try:
             df = create_leaderboard()
+            debug_info = "Leaderboard refreshed successfully."
+            return df, debug_info
         except Exception as e:
             error_msg = f"Error refreshing leaderboard: {str(e)}\n{traceback.format_exc()}"
             print(error_msg)