huckiyang committed on
Commit
92a4ace
·
1 Parent(s): d9795b9

optimize the data loading

Browse files
Files changed (2) hide show
  1. app.py +49 -12
  2. requirements.txt +2 -1
app.py CHANGED
@@ -37,7 +37,7 @@ def preprocess_text(text):
37
  text = re.sub(r'\s+', ' ', text).strip()
38
  return text
39
 
40
- # Simple WER calculation
41
  def calculate_simple_wer(reference, hypothesis):
42
  """Calculate WER using a simple word-based approach"""
43
  if not reference or not hypothesis:
@@ -47,10 +47,20 @@ def calculate_simple_wer(reference, hypothesis):
47
  ref_words = reference.split()
48
  hyp_words = hypothesis.split()
49
 
50
- # Levenshtein distance at the word level
51
- # This is a simple implementation and may not be as accurate as jiwer
52
- from jiwer.measures import _levenshtein_distance
53
- distance = _levenshtein_distance(ref_words, hyp_words)
 
 
 
 
 
 
 
 
 
 
54
 
55
  # WER calculation
56
  if len(ref_words) == 0:
@@ -92,6 +102,8 @@ def calculate_wer(examples):
92
 
93
  # Process each example in the dataset
94
  wer_values = []
 
 
95
 
96
  # Determine how to iterate based on type
97
  items_to_process = examples
@@ -101,7 +113,7 @@ def calculate_wer(examples):
101
  else:
102
  items_to_process = examples[:200] # First 200 examples
103
 
104
- for ex in items_to_process:
105
  try:
106
  # Try to get transcription and input1
107
  transcription = ex.get("transcription")
@@ -114,23 +126,46 @@ def calculate_wer(examples):
114
  elif isinstance(ex["hypothesis"], str):
115
  input1 = ex["hypothesis"]
116
 
 
 
 
 
 
 
 
117
  # Skip if either field is missing
118
- if not transcription or not input1:
 
 
 
119
  continue
120
 
121
- # Clean the text
122
  reference = preprocess_text(transcription)
123
  hypothesis = preprocess_text(input1)
124
 
 
 
 
 
 
 
125
  # Calculate WER for this pair
126
- if reference and hypothesis:
127
- pair_wer = calculate_simple_wer(reference, hypothesis)
128
- wer_values.append(pair_wer)
 
 
 
 
129
  except Exception as ex_error:
130
- print(f"Error processing example: {str(ex_error)}")
 
131
  continue
132
 
133
  # Calculate average WER
 
 
134
  if not wer_values:
135
  print("No valid pairs found for WER calculation")
136
  return np.nan
@@ -233,6 +268,8 @@ def format_dataframe(df):
233
  df = df.copy()
234
 
235
  if "WER" in df.columns:
 
 
236
  mask = df["WER"].notna()
237
  df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
238
  df.loc[~mask, "WER"] = "N/A"
 
37
  text = re.sub(r'\s+', ' ', text).strip()
38
  return text
39
 
40
+ # Fix the Levenshtein distance calculation to avoid dependence on jiwer internals
41
  def calculate_simple_wer(reference, hypothesis):
42
  """Calculate WER using a simple word-based approach"""
43
  if not reference or not hypothesis:
 
47
  ref_words = reference.split()
48
  hyp_words = hypothesis.split()
49
 
50
+ # Use editdistance package instead of jiwer internals
51
+ try:
52
+ import editdistance
53
+ distance = editdistance.eval(ref_words, hyp_words)
54
+ except ImportError:
55
+ # Fall back to the standard jiwer calculation
56
+ try:
57
+ # Try using the standard jiwer implementation
58
+ wer_value = jiwer.wer(reference, hypothesis)
59
+ return wer_value
60
+ except Exception:
61
+ # If all else fails, return 1.0 (maximum error)
62
+ print("Error calculating WER - fallback to maximum error")
63
+ return 1.0
64
 
65
  # WER calculation
66
  if len(ref_words) == 0:
 
102
 
103
  # Process each example in the dataset
104
  wer_values = []
105
+ valid_count = 0
106
+ skipped_count = 0
107
 
108
  # Determine how to iterate based on type
109
  items_to_process = examples
 
113
  else:
114
  items_to_process = examples[:200] # First 200 examples
115
 
116
+ for i, ex in enumerate(items_to_process):
117
  try:
118
  # Try to get transcription and input1
119
  transcription = ex.get("transcription")
 
126
  elif isinstance(ex["hypothesis"], str):
127
  input1 = ex["hypothesis"]
128
 
129
+ # Print debug info for a few examples
130
+ if i < 3:
131
+ print(f"\nExample {i} inspection:")
132
+ print(f" transcription: {transcription}")
133
+ print(f" input1: {input1}")
134
+ print(f" type checks: transcription={type(transcription)}, input1={type(input1)}")
135
+
136
  # Skip if either field is missing
137
+ if transcription is None or input1 is None:
138
+ skipped_count += 1
139
+ if i < 3:
140
+ print(f" SKIPPED: Missing field (transcription={transcription is None}, input1={input1 is None})")
141
  continue
142
 
143
+ # Clean the text, then skip the pair if either field is empty after preprocessing
144
  reference = preprocess_text(transcription)
145
  hypothesis = preprocess_text(input1)
146
 
147
+ if not reference or not hypothesis:
148
+ skipped_count += 1
149
+ if i < 3:
150
+ print(f" SKIPPED: Empty after preprocessing (reference='{reference}', hypothesis='{hypothesis}')")
151
+ continue
152
+
153
  # Calculate WER for this pair
154
+ pair_wer = calculate_simple_wer(reference, hypothesis)
155
+ wer_values.append(pair_wer)
156
+ valid_count += 1
157
+
158
+ if i < 3:
159
+ print(f" VALID PAIR: reference='{reference}', hypothesis='{hypothesis}', WER={pair_wer:.4f}")
160
+
161
  except Exception as ex_error:
162
+ print(f"Error processing example {i}: {str(ex_error)}")
163
+ skipped_count += 1
164
  continue
165
 
166
  # Calculate average WER
167
+ print(f"\nProcessing summary: Valid pairs: {valid_count}, Skipped: {skipped_count}")
168
+
169
  if not wer_values:
170
  print("No valid pairs found for WER calculation")
171
  return np.nan
 
268
  df = df.copy()
269
 
270
  if "WER" in df.columns:
271
+ # Cast to object dtype first so assigning the formatted strings below does not trigger a dtype warning
272
+ df["WER"] = df["WER"].astype(object)
273
  mask = df["WER"].notna()
274
  df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
275
  df.loc[~mask, "WER"] = "N/A"
requirements.txt CHANGED
@@ -2,4 +2,5 @@ gradio>=3.50.2
2
  pandas>=2.0.0
3
  datasets>=2.14.0
4
  jiwer>=3.0.0
5
- numpy>=1.24.0
 
 
2
  pandas>=2.0.0
3
  datasets>=2.14.0
4
  jiwer>=3.0.0
5
+ numpy>=1.24.0
6
+ editdistance>=0.6.2