Optimize the data loading
Browse files- app.py +49 -12
- requirements.txt +2 -1
app.py
CHANGED
@@ -37,7 +37,7 @@ def preprocess_text(text):
|
|
37 |
text = re.sub(r'\s+', ' ', text).strip()
|
38 |
return text
|
39 |
|
40 |
-
#
|
41 |
def calculate_simple_wer(reference, hypothesis):
|
42 |
"""Calculate WER using a simple word-based approach"""
|
43 |
if not reference or not hypothesis:
|
@@ -47,10 +47,20 @@ def calculate_simple_wer(reference, hypothesis):
|
|
47 |
ref_words = reference.split()
|
48 |
hyp_words = hypothesis.split()
|
49 |
|
50 |
-
#
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
# WER calculation
|
56 |
if len(ref_words) == 0:
|
@@ -92,6 +102,8 @@ def calculate_wer(examples):
|
|
92 |
|
93 |
# Process each example in the dataset
|
94 |
wer_values = []
|
|
|
|
|
95 |
|
96 |
# Determine how to iterate based on type
|
97 |
items_to_process = examples
|
@@ -101,7 +113,7 @@ def calculate_wer(examples):
|
|
101 |
else:
|
102 |
items_to_process = examples[:200] # First 200 examples
|
103 |
|
104 |
-
for ex in items_to_process:
|
105 |
try:
|
106 |
# Try to get transcription and input1
|
107 |
transcription = ex.get("transcription")
|
@@ -114,23 +126,46 @@ def calculate_wer(examples):
|
|
114 |
elif isinstance(ex["hypothesis"], str):
|
115 |
input1 = ex["hypothesis"]
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
# Skip if either field is missing
|
118 |
-
if
|
|
|
|
|
|
|
119 |
continue
|
120 |
|
121 |
-
#
|
122 |
reference = preprocess_text(transcription)
|
123 |
hypothesis = preprocess_text(input1)
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
# Calculate WER for this pair
|
126 |
-
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
129 |
except Exception as ex_error:
|
130 |
-
print(f"Error processing example: {str(ex_error)}")
|
|
|
131 |
continue
|
132 |
|
133 |
# Calculate average WER
|
|
|
|
|
134 |
if not wer_values:
|
135 |
print("No valid pairs found for WER calculation")
|
136 |
return np.nan
|
@@ -233,6 +268,8 @@ def format_dataframe(df):
|
|
233 |
df = df.copy()
|
234 |
|
235 |
if "WER" in df.columns:
|
|
|
|
|
236 |
mask = df["WER"].notna()
|
237 |
df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
|
238 |
df.loc[~mask, "WER"] = "N/A"
|
|
|
37 |
text = re.sub(r'\s+', ' ', text).strip()
|
38 |
return text
|
39 |
|
40 |
+
# Fix the Levenshtein distance calculation to avoid dependence on jiwer internals
|
41 |
def calculate_simple_wer(reference, hypothesis):
|
42 |
"""Calculate WER using a simple word-based approach"""
|
43 |
if not reference or not hypothesis:
|
|
|
47 |
ref_words = reference.split()
|
48 |
hyp_words = hypothesis.split()
|
49 |
|
50 |
+
# Use editdistance package instead of jiwer internals
|
51 |
+
try:
|
52 |
+
import editdistance
|
53 |
+
distance = editdistance.eval(ref_words, hyp_words)
|
54 |
+
except ImportError:
|
55 |
+
# Fallback to simple jiwer calculation
|
56 |
+
try:
|
57 |
+
# Try using the standard jiwer implementation
|
58 |
+
wer_value = jiwer.wer(reference, hypothesis)
|
59 |
+
return wer_value
|
60 |
+
except Exception:
|
61 |
+
# If all else fails, return 1.0 (maximum error)
|
62 |
+
print("Error calculating WER - fallback to maximum error")
|
63 |
+
return 1.0
|
64 |
|
65 |
# WER calculation
|
66 |
if len(ref_words) == 0:
|
|
|
102 |
|
103 |
# Process each example in the dataset
|
104 |
wer_values = []
|
105 |
+
valid_count = 0
|
106 |
+
skipped_count = 0
|
107 |
|
108 |
# Determine how to iterate based on type
|
109 |
items_to_process = examples
|
|
|
113 |
else:
|
114 |
items_to_process = examples[:200] # First 200 examples
|
115 |
|
116 |
+
for i, ex in enumerate(items_to_process):
|
117 |
try:
|
118 |
# Try to get transcription and input1
|
119 |
transcription = ex.get("transcription")
|
|
|
126 |
elif isinstance(ex["hypothesis"], str):
|
127 |
input1 = ex["hypothesis"]
|
128 |
|
129 |
+
# Print debug info for a few examples
|
130 |
+
if i < 3:
|
131 |
+
print(f"\nExample {i} inspection:")
|
132 |
+
print(f" transcription: {transcription}")
|
133 |
+
print(f" input1: {input1}")
|
134 |
+
print(f" type checks: transcription={type(transcription)}, input1={type(input1)}")
|
135 |
+
|
136 |
# Skip if either field is missing
|
137 |
+
if transcription is None or input1 is None:
|
138 |
+
skipped_count += 1
|
139 |
+
if i < 3:
|
140 |
+
print(f" SKIPPED: Missing field (transcription={transcription is None}, input1={input1 is None})")
|
141 |
continue
|
142 |
|
143 |
+
# Skip if either field is empty after preprocessing
|
144 |
reference = preprocess_text(transcription)
|
145 |
hypothesis = preprocess_text(input1)
|
146 |
|
147 |
+
if not reference or not hypothesis:
|
148 |
+
skipped_count += 1
|
149 |
+
if i < 3:
|
150 |
+
print(f" SKIPPED: Empty after preprocessing (reference='{reference}', hypothesis='{hypothesis}')")
|
151 |
+
continue
|
152 |
+
|
153 |
# Calculate WER for this pair
|
154 |
+
pair_wer = calculate_simple_wer(reference, hypothesis)
|
155 |
+
wer_values.append(pair_wer)
|
156 |
+
valid_count += 1
|
157 |
+
|
158 |
+
if i < 3:
|
159 |
+
print(f" VALID PAIR: reference='{reference}', hypothesis='{hypothesis}', WER={pair_wer:.4f}")
|
160 |
+
|
161 |
except Exception as ex_error:
|
162 |
+
print(f"Error processing example {i}: {str(ex_error)}")
|
163 |
+
skipped_count += 1
|
164 |
continue
|
165 |
|
166 |
# Calculate average WER
|
167 |
+
print(f"\nProcessing summary: Valid pairs: {valid_count}, Skipped: {skipped_count}")
|
168 |
+
|
169 |
if not wer_values:
|
170 |
print("No valid pairs found for WER calculation")
|
171 |
return np.nan
|
|
|
268 |
df = df.copy()
|
269 |
|
270 |
if "WER" in df.columns:
|
271 |
+
# Convert to string type first to avoid warning
|
272 |
+
df["WER"] = df["WER"].astype(object)
|
273 |
mask = df["WER"].notna()
|
274 |
df.loc[mask, "WER"] = df.loc[mask, "WER"].map(lambda x: f"{x:.4f}")
|
275 |
df.loc[~mask, "WER"] = "N/A"
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ gradio>=3.50.2
|
|
2 |
pandas>=2.0.0
|
3 |
datasets>=2.14.0
|
4 |
jiwer>=3.0.0
|
5 |
-
numpy>=1.24.0
|
|
|
|
2 |
pandas>=2.0.0
|
3 |
datasets>=2.14.0
|
4 |
jiwer>=3.0.0
|
5 |
+
numpy>=1.24.0
|
6 |
+
editdistance>=0.6.2
|