Optimize the data code
Browse files
README.md
CHANGED
@@ -42,3 +42,12 @@ Word Error Rate is calculated between:
|
|
42 |
Lower WER values indicate better transcription accuracy.
|
43 |
|
44 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
Lower WER values indicate better transcription accuracy.
|
43 |
|
44 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
45 |
+
|
46 |
+
## Table Structure
|
47 |
+
|
48 |
+
The leaderboard is displayed as a table with:
|
49 |
+
|
50 |
+
- **Rows**: "Number of Examples" and "Word Error Rate (WER)"
|
51 |
+
- **Columns**: Different data sources (CHiME4, CORAAL, CommonVoice, etc.) and OVERALL
|
52 |
+
|
53 |
+
Each cell shows the corresponding metric for that specific data source. The OVERALL column shows aggregate metrics across all sources.
|
app.py
CHANGED
@@ -259,12 +259,18 @@ def get_wer_metrics(dataset):
|
|
259 |
|
260 |
# Create a transposed DataFrame with metrics as rows and sources as columns
|
261 |
metrics = ["Count", "No LM Baseline"]
|
262 |
-
result_df = pd.DataFrame(index=metrics, columns=all_sources + ["OVERALL"])
|
|
|
|
|
|
|
263 |
|
264 |
for source in all_sources + ["OVERALL"]:
|
265 |
for metric in metrics:
|
266 |
result_df.loc[metric, source] = source_results[source][metric]
|
267 |
|
|
|
|
|
|
|
268 |
return result_df
|
269 |
|
270 |
except Exception as e:
|
@@ -278,17 +284,23 @@ def format_dataframe(df):
|
|
278 |
# Use vectorized operations instead of apply
|
279 |
df = df.copy()
|
280 |
|
281 |
-
#
|
282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# Convert to object type first to avoid warnings
|
284 |
-
df.loc[
|
285 |
|
286 |
for col in df.columns:
|
287 |
-
value = df.loc[
|
288 |
if pd.notna(value):
|
289 |
-
df.loc[
|
290 |
else:
|
291 |
-
df.loc[
|
292 |
|
293 |
return df
|
294 |
|
|
|
259 |
|
260 |
# Create a transposed DataFrame with metrics as rows and sources as columns
|
261 |
metrics = ["Count", "No LM Baseline"]
|
262 |
+
result_df = pd.DataFrame(index=metrics, columns=["Metric"] + all_sources + ["OVERALL"])
|
263 |
+
|
264 |
+
# Add descriptive column
|
265 |
+
result_df["Metric"] = ["Number of Examples", "Word Error Rate (WER)"]
|
266 |
|
267 |
for source in all_sources + ["OVERALL"]:
|
268 |
for metric in metrics:
|
269 |
result_df.loc[metric, source] = source_results[source][metric]
|
270 |
|
271 |
+
# Set Metric as index for better display
|
272 |
+
result_df = result_df.set_index("Metric")
|
273 |
+
|
274 |
return result_df
|
275 |
|
276 |
except Exception as e:
|
|
|
284 |
# Use vectorized operations instead of apply
|
285 |
df = df.copy()
|
286 |
|
287 |
+
# Find the row holding WER values; the index now uses descriptive labels (e.g. "Word Error Rate (WER)")
|
288 |
+
wer_row_index = None
|
289 |
+
for idx in df.index:
|
290 |
+
if "WER" in idx or "Error Rate" in idx:
|
291 |
+
wer_row_index = idx
|
292 |
+
break
|
293 |
+
|
294 |
+
if wer_row_index:
|
295 |
# Convert to object type first to avoid warnings
|
296 |
+
df.loc[wer_row_index] = df.loc[wer_row_index].astype(object)
|
297 |
|
298 |
for col in df.columns:
|
299 |
+
value = df.loc[wer_row_index, col]
|
300 |
if pd.notna(value):
|
301 |
+
df.loc[wer_row_index, col] = f"{value:.4f}"
|
302 |
else:
|
303 |
+
df.loc[wer_row_index, col] = "N/A"
|
304 |
|
305 |
return df
|
306 |
|