Spaces:

GenSEC-LLM
/

Post-ASR-LLM-Transcription-Correction

Running

App Files Files Community

huckiyang committed on Mar 14

Commit

92a4ace

1 Parent(s): d9795b9

Optimize the data loading

Browse files

Files changed (2) hide show

app.py +49 -12
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -37,7 +37,7 @@ def preprocess_text(text):
     text = re.sub(r'\s+', ' ', text).strip()
     return text
-# Simple WER calculation
 def calculate_simple_wer(reference, hypothesis):
     """Calculate WER using a simple word-based approach"""
     if not reference or not hypothesis:
@@ -47,10 +47,20 @@ def calculate_simple_wer(reference, hypothesis):
     ref_words = reference.split()
     hyp_words = hypothesis.split()
-    # Levenshtein distance at the word level
-    # This is a simple implementation and may not be as accurate as jiwer
-    from jiwer.measures import _levenshtein_distance
-    distance = _levenshtein_distance(ref_words, hyp_words)
     # WER calculation
     if len(ref_words) == 0:
@@ -92,6 +102,8 @@ def calculate_wer(examples):
         # Process each example in the dataset
         wer_values = []
         # Determine how to iterate based on type
         items_to_process = examples
@@ -101,7 +113,7 @@ def calculate_wer(examples):
         else:
             items_to_process = examples[:200]  # First 200 examples
-        for ex in items_to_process:
             try:
                 # Try to get transcription and input1
                 transcription = ex.get("transcription")
@@ -114,23 +126,46 @@ def calculate_wer(examples):
                     elif isinstance(ex["hypothesis"], str):
                         input1 = ex["hypothesis"]
                 # Skip if either field is missing
-                if not transcription or not input1:
                     continue
-                # Clean the text
                 reference = preprocess_text(transcription)
                 hypothesis = preprocess_text(input1)
                 # Calculate WER for this pair
-                if reference and hypothesis:
-                    pair_wer = calculate_simple_wer(reference, hypothesis)
-                    wer_values.append(pair_wer)
             except Exception as ex_error:
-                print(f"Error processing example: {str(ex_error)}")
                 continue
         # Calculate average WER
         if not wer_values:
             print("No valid pairs found for WER calculation")
             return np.nan
@@ -233,6 +268,8 @@ def format_dataframe(df):
         df = df.copy()
         if "WER" in df.columns:
             mask = df["WER"].notna()
             df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
             df.loc[~mask, "WER"] = "N/A"

     text = re.sub(r'\s+', ' ', text).strip()
     return text
+# Fix the Levenshtein distance calculation to avoid dependence on jiwer internals
 def calculate_simple_wer(reference, hypothesis):
     """Calculate WER using a simple word-based approach"""
     if not reference or not hypothesis:
     ref_words = reference.split()
     hyp_words = hypothesis.split()
+    # Use editdistance package instead of jiwer internals
+    try:
+        import editdistance
+        distance = editdistance.eval(ref_words, hyp_words)
+    except ImportError:
+        # Fall back to the standard jiwer WER calculation
+        try:
+            # Try using the standard jiwer implementation
+            wer_value = jiwer.wer(reference, hypothesis)
+            return wer_value
+        except Exception:
+            # If all else fails, return 1.0 (maximum error)
+            print("Error calculating WER - fallback to maximum error")
+            return 1.0
     # WER calculation
     if len(ref_words) == 0:
         # Process each example in the dataset
         wer_values = []
+        valid_count = 0
+        skipped_count = 0
         # Determine how to iterate based on type
         items_to_process = examples
         else:
             items_to_process = examples[:200]  # First 200 examples
+        for i, ex in enumerate(items_to_process):
             try:
                 # Try to get transcription and input1
                 transcription = ex.get("transcription")
                     elif isinstance(ex["hypothesis"], str):
                         input1 = ex["hypothesis"]
+                # Print debug info for a few examples
+                if i < 3:
+                    print(f"\nExample {i} inspection:")
+                    print(f"  transcription: {transcription}")
+                    print(f"  input1: {input1}")
+                    print(f"  type checks: transcription={type(transcription)}, input1={type(input1)}")
                 # Skip if either field is missing
+                if transcription is None or input1 is None:
+                    skipped_count += 1
+                    if i < 3:
+                        print(f"  SKIPPED: Missing field (transcription={transcription is None}, input1={input1 is None})")
                     continue
+                # Skip if either field is empty after preprocessing
                 reference = preprocess_text(transcription)
                 hypothesis = preprocess_text(input1)
+                if not reference or not hypothesis:
+                    skipped_count += 1
+                    if i < 3:
+                        print(f"  SKIPPED: Empty after preprocessing (reference='{reference}', hypothesis='{hypothesis}')")
+                    continue
                 # Calculate WER for this pair
+                pair_wer = calculate_simple_wer(reference, hypothesis)
+                wer_values.append(pair_wer)
+                valid_count += 1
+                if i < 3:
+                    print(f"  VALID PAIR: reference='{reference}', hypothesis='{hypothesis}', WER={pair_wer:.4f}")
             except Exception as ex_error:
+                print(f"Error processing example {i}: {str(ex_error)}")
+                skipped_count += 1
                 continue
         # Calculate average WER
+        print(f"\nProcessing summary: Valid pairs: {valid_count}, Skipped: {skipped_count}")
         if not wer_values:
             print("No valid pairs found for WER calculation")
             return np.nan
         df = df.copy()
         if "WER" in df.columns:
+            # Cast to object dtype first so the formatted strings can be assigned without a dtype warning
+            df["WER"] = df["WER"].astype(object)
             mask = df["WER"].notna()
             df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
             df.loc[~mask, "WER"] = "N/A"

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ gradio>=3.50.2
 pandas>=2.0.0
 datasets>=2.14.0
 jiwer>=3.0.0
-numpy>=1.24.0

 pandas>=2.0.0
 datasets>=2.14.0
 jiwer>=3.0.0
+numpy>=1.24.0
+editdistance>=0.6.2