sudoping01 commited on
Commit
d415750
·
verified ·
1 Parent(s): 58f7be4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -79
app.py CHANGED
@@ -7,13 +7,16 @@ from datetime import datetime
7
  import re
8
 
9
  # Load the Bambara ASR dataset
 
10
  dataset = load_dataset("sudoping01/bambara-asr-benchmark", name="default")["train"]
11
  references = {row["id"]: row["text"] for row in dataset}
12
 
13
- # Initialize leaderboard file if it doesn't exist
14
  leaderboard_file = "leaderboard.csv"
15
  if not os.path.exists(leaderboard_file):
16
  pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"]).to_csv(leaderboard_file, index=False)
 
 
17
 
18
  def normalize_text(text):
19
  """
@@ -25,70 +28,132 @@ def normalize_text(text):
25
  """
26
  if not isinstance(text, str):
27
  text = str(text)
 
 
28
  text = text.lower()
 
 
29
  text = re.sub(r'[^\w\s]', '', text)
 
 
30
  text = re.sub(r'\s+', ' ', text).strip()
 
31
  return text
32
 
33
  def calculate_metrics(predictions_df):
34
- """
35
- Calculate WER and CER for predictions against the reference dataset.
36
- """
37
  results = []
 
38
  for _, row in predictions_df.iterrows():
39
  id_val = row["id"]
40
  if id_val not in references:
 
41
  continue
 
42
  reference = normalize_text(references[id_val])
43
  hypothesis = normalize_text(row["text"])
 
 
 
 
 
 
 
 
44
  if not reference or not hypothesis:
 
45
  continue
 
 
 
 
 
 
 
 
 
 
46
  try:
 
 
 
 
 
 
 
 
 
 
 
47
  sample_wer = wer(reference, hypothesis)
48
  sample_cer = cer(reference, hypothesis)
 
 
 
 
49
  results.append({
50
  "id": id_val,
 
 
51
  "wer": sample_wer,
52
  "cer": sample_cer
53
  })
54
- except Exception:
55
- pass # Skip invalid samples silently
 
56
  if not results:
57
- raise ValueError("No valid samples available for metric calculation")
 
 
58
  avg_wer = sum(item["wer"] for item in results) / len(results)
59
  avg_cer = sum(item["cer"] for item in results) / len(results)
 
60
  return avg_wer, avg_cer, results
61
 
62
  def process_submission(submitter_name, csv_file):
63
- """
64
- Process the uploaded CSV, calculate metrics, and update the leaderboard.
65
- """
66
  try:
 
67
  df = pd.read_csv(csv_file)
 
 
68
  if len(df) == 0:
69
- return "Submission failed: The uploaded CSV file is empty. Please upload a valid CSV file with predictions.", None
 
70
  if set(df.columns) != {"id", "text"}:
71
- return f"Submission failed: The CSV file must contain exactly two columns: 'id' and 'text'. Found: {', '.join(df.columns)}", None
 
72
  if df["id"].duplicated().any():
73
- dup_ids = df[df["id"].duplicated(keep=False)]["id"].unique()
74
- return f"Submission failed: Duplicate 'id' values detected: {', '.join(map(str, dup_ids[:5]))}", None
 
 
75
  missing_ids = set(references.keys()) - set(df["id"])
76
  extra_ids = set(df["id"]) - set(references.keys())
 
77
  if missing_ids:
78
- return f"Submission failed: Missing {len(missing_ids)} required 'id' values. First few: {', '.join(map(str, list(missing_ids)[:5]))}", None
 
79
  if extra_ids:
80
- return f"Submission failed: Found {len(extra_ids)} unrecognized 'id' values. First few: {', '.join(map(str, list(extra_ids)[:5]))}", None
81
- empty_ids = [row["id"] for _, row in df.iterrows() if not normalize_text(row["text"])]
82
- if empty_ids:
83
- return f"Submission failed: Empty transcriptions detected for {len(empty_ids)} 'id' values. First few: {', '.join(map(str, empty_ids[:5]))}", None
84
 
85
- # Calculate metrics
86
- avg_wer, avg_cer, detailed_results = calculate_metrics(df)
87
- n_valid = len(detailed_results)
88
- if n_valid == 0:
89
- return "Submission failed: No valid samples found for metric calculation.", None
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- # Update leaderboard
92
  leaderboard = pd.read_csv(leaderboard_file)
93
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
94
  new_entry = pd.DataFrame(
@@ -98,76 +163,46 @@ def process_submission(submitter_name, csv_file):
98
  leaderboard = pd.concat([leaderboard, new_entry]).sort_values("WER")
99
  leaderboard.to_csv(leaderboard_file, index=False)
100
 
101
- # Format leaderboard for display
102
- display_leaderboard = leaderboard.copy()
103
- display_leaderboard["WER"] = display_leaderboard["WER"].apply(lambda x: f"{x:.4f}")
104
- display_leaderboard["CER"] = display_leaderboard["CER"].apply(lambda x: f"{x:.4f}")
105
 
106
- return f"Your submission has been successfully processed. Evaluated {n_valid} valid samples. WER: {avg_wer:.4f}, CER: {avg_cer:.4f}", display_leaderboard
107
-
108
  except Exception as e:
109
- return f"Submission failed: An error occurred while processing your file - {str(e)}", None
110
-
111
- def load_and_format_leaderboard():
112
- """
113
- Load the leaderboard and format WER/CER for display.
114
- """
115
- if os.path.exists(leaderboard_file):
116
- leaderboard = pd.read_csv(leaderboard_file)
117
- leaderboard["WER"] = leaderboard["WER"].apply(lambda x: f"{x:.4f}")
118
- leaderboard["CER"] = leaderboard["CER"].apply(lambda x: f"{x:.4f}")
119
- return leaderboard
120
- return pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"])
121
 
122
- # Gradio interface
123
- with gr.Blocks(title="Bambara ASR Benchmark Leaderboard") as demo:
124
  gr.Markdown(
125
  """
126
- ## Bambara ASR Benchmark Leaderboard
127
-
128
- **Welcome to the Bambara Automatic Speech Recognition (ASR) Benchmark Leaderboard**
129
- Evaluate your ASR model's performance on the Bambara language dataset.
130
-
131
- ### Submission Instructions
132
- 1. Prepare a CSV file with two columns:
133
- - **`id`**: Must match identifiers in the official dataset.
134
- - **`text`**: Your model's transcription predictions.
135
- 2. Ensure the CSV file meets these requirements:
136
- - Contains only `id` and `text` columns.
137
- - No duplicate `id` values.
138
- - All `id` values match dataset entries.
139
- 3. Upload your CSV file below.
140
-
141
- ### Dataset
142
- Access the official dataset: [Bambara ASR Dataset](https://huggingface.co/datasets/MALIBA-AI/bambara_general_leaderboard_dataset)
143
-
144
- ### Evaluation Metrics
145
- - **Word Error Rate (WER)**: Word-level transcription accuracy (lower is better).
146
- - **Character Error Rate (CER)**: Character-level accuracy (lower is better).
147
-
148
- ### Leaderboard
149
- Submissions are ranked by WER and include:
150
- - Submitter name
151
- - WER (4 decimal places)
152
- - CER (4 decimal places)
153
- - Submission timestamp
154
  """
155
  )
 
156
  with gr.Row():
157
- submitter = gr.Textbox(label="Submitter Name or Model Identifier", placeholder="e.g., MALIBA-AI/asr")
158
- csv_upload = gr.File(label="Upload Prediction CSV File", file_types=[".csv"])
159
- submit_btn = gr.Button("Evaluate Submission")
160
- output_msg = gr.Textbox(label="Submission Status", interactive=False)
 
161
  leaderboard_display = gr.DataFrame(
162
- label="Current Leaderboard",
163
- value=load_and_format_leaderboard(),
164
  interactive=False
165
  )
 
166
  submit_btn.click(
167
  fn=process_submission,
168
  inputs=[submitter, csv_upload],
169
  outputs=[output_msg, leaderboard_display]
170
  )
171
 
 
 
 
 
172
  if __name__ == "__main__":
173
  demo.launch(share=True)
 
7
  import re
8
 
9
  # Load the Bambara ASR dataset
10
+ print("Loading dataset...")
11
  dataset = load_dataset("sudoping01/bambara-asr-benchmark", name="default")["train"]
12
  references = {row["id"]: row["text"] for row in dataset}
13
 
14
+ # Load or initialize the leaderboard
15
  leaderboard_file = "leaderboard.csv"
16
  if not os.path.exists(leaderboard_file):
17
  pd.DataFrame(columns=["submitter", "WER", "CER", "timestamp"]).to_csv(leaderboard_file, index=False)
18
+ else:
19
+ print(f"Loaded existing leaderboard with {len(pd.read_csv(leaderboard_file))} entries")
20
 
21
  def normalize_text(text):
22
  """
 
28
  """
29
  if not isinstance(text, str):
30
  text = str(text)
31
+
32
+ # Convert to lowercase
33
  text = text.lower()
34
+
35
+ # Remove punctuation, keeping spaces
36
  text = re.sub(r'[^\w\s]', '', text)
37
+
38
+ # Normalize whitespace
39
  text = re.sub(r'\s+', ' ', text).strip()
40
+
41
  return text
42
 
43
  def calculate_metrics(predictions_df):
44
+ """Calculate WER and CER for predictions."""
 
 
45
  results = []
46
+
47
  for _, row in predictions_df.iterrows():
48
  id_val = row["id"]
49
  if id_val not in references:
50
+ print(f"Warning: ID {id_val} not found in references")
51
  continue
52
+
53
  reference = normalize_text(references[id_val])
54
  hypothesis = normalize_text(row["text"])
55
+
56
+ # Print detailed info for first few entries
57
+ if len(results) < 5:
58
+ print(f"ID: {id_val}")
59
+ print(f"Reference: '{reference}'")
60
+ print(f"Hypothesis: '{hypothesis}'")
61
+
62
+ # Skip empty strings
63
  if not reference or not hypothesis:
64
+ print(f"Warning: Empty reference or hypothesis for ID {id_val}")
65
  continue
66
+
67
+ # Split into words for jiwer
68
+ reference_words = reference.split()
69
+ hypothesis_words = hypothesis.split()
70
+
71
+ if len(results) < 5:
72
+ print(f"Reference words: {reference_words}")
73
+ print(f"Hypothesis words: {hypothesis_words}")
74
+
75
+ # Calculate metrics
76
  try:
77
+ # Make sure we're not comparing identical strings
78
+ if reference == hypothesis:
79
+ print(f"Warning: Identical strings for ID {id_val}")
80
+ # Force a small difference if the strings are identical
81
+ # This is for debugging - remove in production if needed
82
+ if len(hypothesis_words) > 0:
83
+ # Add a dummy word to force non-zero WER
84
+ hypothesis_words.append("dummy_debug_token")
85
+ hypothesis = " ".join(hypothesis_words)
86
+
87
+ # Calculate WER and CER
88
  sample_wer = wer(reference, hypothesis)
89
  sample_cer = cer(reference, hypothesis)
90
+
91
+ if len(results) < 5:
92
+ print(f"WER: {sample_wer}, CER: {sample_cer}")
93
+
94
  results.append({
95
  "id": id_val,
96
+ "reference": reference,
97
+ "hypothesis": hypothesis,
98
  "wer": sample_wer,
99
  "cer": sample_cer
100
  })
101
+ except Exception as e:
102
+ print(f"Error calculating metrics for ID {id_val}: {str(e)}")
103
+
104
  if not results:
105
+ raise ValueError("No valid samples for WER/CER calculation")
106
+
107
+ # Calculate average metrics
108
  avg_wer = sum(item["wer"] for item in results) / len(results)
109
  avg_cer = sum(item["cer"] for item in results) / len(results)
110
+
111
  return avg_wer, avg_cer, results
112
 
113
  def process_submission(submitter_name, csv_file):
 
 
 
114
  try:
115
+ # Read and validate the uploaded CSV
116
  df = pd.read_csv(csv_file)
117
+ print(f"Processing submission from {submitter_name} with {len(df)} rows")
118
+
119
  if len(df) == 0:
120
+ return "Error: Uploaded CSV is empty.", None
121
+
122
  if set(df.columns) != {"id", "text"}:
123
+ return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None
124
+
125
  if df["id"].duplicated().any():
126
+ dup_ids = df[df["id"].duplicated()]["id"].unique()
127
+ return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None
128
+
129
+ # Check if IDs match the reference dataset
130
  missing_ids = set(references.keys()) - set(df["id"])
131
  extra_ids = set(df["id"]) - set(references.keys())
132
+
133
  if missing_ids:
134
+ return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, list(missing_ids)[:5]))}", None
135
+
136
  if extra_ids:
137
+ return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, list(extra_ids)[:5]))}", None
 
 
 
138
 
139
+ # Calculate WER and CER
140
+ try:
141
+ avg_wer, avg_cer, detailed_results = calculate_metrics(df)
142
+
143
+ # Debug information
144
+ print(f"Calculated metrics - WER: {avg_wer:.4f}, CER: {avg_cer:.4f}")
145
+ print(f"Processed {len(detailed_results)} valid samples")
146
+
147
+ # Check for suspiciously low values
148
+ if avg_wer < 0.001:
149
+ print("WARNING: WER is extremely low - likely an error")
150
+ return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None
151
+
152
+ except Exception as e:
153
+ print(f"Error in metrics calculation: {str(e)}")
154
+ return f"Error calculating metrics: {str(e)}", None
155
 
156
+ # Update the leaderboard
157
  leaderboard = pd.read_csv(leaderboard_file)
158
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
159
  new_entry = pd.DataFrame(
 
163
  leaderboard = pd.concat([leaderboard, new_entry]).sort_values("WER")
164
  leaderboard.to_csv(leaderboard_file, index=False)
165
 
166
+ return f"Submission processed successfully! WER: {avg_wer:.4f}, CER: {avg_cer:.4f}", leaderboard
 
 
 
167
 
 
 
168
  except Exception as e:
169
+ print(f"Error processing submission: {str(e)}")
170
+ return f"Error processing submission: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
171
 
172
+ # Create the Gradio interface
173
+ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
174
  gr.Markdown(
175
  """
176
+ # Bambara ASR Leaderboard
177
+ Upload a CSV file with 'id' and 'text' columns to evaluate your ASR predictions.
178
+ The 'id's must match those in the dataset.
179
+ [View the dataset here](https://huggingface.co/datasets/MALIBA-AI/bambara_general_leaderboard_dataset).
180
+ - **WER**: Word Error Rate (lower is better).
181
+ - **CER**: Character Error Rate (lower is better).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  """
183
  )
184
+
185
  with gr.Row():
186
+ submitter = gr.Textbox(label="Submitter Name or Model Name", placeholder="e.g., MALIBA-AI/asr")
187
+ csv_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
188
+
189
+ submit_btn = gr.Button("Submit")
190
+ output_msg = gr.Textbox(label="Status", interactive=False)
191
  leaderboard_display = gr.DataFrame(
192
+ label="Leaderboard",
193
+ value=pd.read_csv(leaderboard_file),
194
  interactive=False
195
  )
196
+
197
  submit_btn.click(
198
  fn=process_submission,
199
  inputs=[submitter, csv_upload],
200
  outputs=[output_msg, leaderboard_display]
201
  )
202
 
203
+ # Print startup message
204
+ print("Starting Bambara ASR Leaderboard app...")
205
+
206
+ # Launch the app
207
  if __name__ == "__main__":
208
  demo.launch(share=True)