File size: 10,598 Bytes
9ba8fab
 
 
3769468
9ba8fab
 
3769468
6960dc6
da12542
 
 
 
 
f23d956
 
 
 
 
9ba8fab
 
 
fdebe26
ddc83ff
5f3b2ed
 
 
e8b48ca
5f3b2ed
3769468
 
1c9f0b6
3769468
 
d415750
3769468
f81f1e2
 
3769468
 
 
d4aa692
 
f81f1e2
 
d415750
3769468
 
 
 
d415750
3769468
 
d415750
3769468
 
d415750
 
d4aa692
f81f1e2
 
3769468
 
 
ddc83ff
2e23fb2
 
f81f1e2
 
 
 
d4aa692
3769468
d415750
 
f81f1e2
 
3769468
 
 
e8b48ca
 
d415750
d4aa692
d415750
d4aa692
 
 
f81f1e2
1c9f0b6
d4aa692
 
f81f1e2
d4aa692
9ba8fab
32130a5
 
 
 
1c9f0b6
 
5815dce
1c9f0b6
5815dce
32130a5
1c9f0b6
 
 
 
 
5815dce
1c9f0b6
 
 
 
 
32130a5
 
1c9f0b6
5815dce
5f3b2ed
f23d956
9ba8fab
f23d956
 
 
e8b48ca
f23d956
5815dce
f23d956
5815dce
f23d956
5815dce
 
1c9f0b6
5815dce
e8b48ca
1c9f0b6
2e23fb2
f23d956
 
9ba8fab
d415750
3769468
d4aa692
d415750
29c8f24
d4aa692
d415750
9ba8fab
d415750
d4aa692
ddc83ff
3769468
 
d415750
3769468
d4aa692
d415750
3769468
d4aa692
f81f1e2
d415750
f81f1e2
d415750
2e23fb2
d4aa692
 
d415750
 
d4aa692
9ba8fab
 
 
5f3b2ed
1c9f0b6
5f3b2ed
 
9ba8fab
f23d956
 
9ba8fab
 
32130a5
5f3b2ed
 
 
1c9f0b6
5815dce
32130a5
3769468
9ba8fab
d4aa692
c726970
d415750
9ba8fab
 
1c9f0b6
f81f1e2
33f8987
5f3b2ed
9ba8fab
 
d415750
33f8987
 
f23d956
 
 
 
 
 
1c9f0b6
e8b48ca
1c9f0b6
33f8987
 
5f3b2ed
 
 
 
 
 
 
33f8987
1c9f0b6
33f8987
5f3b2ed
 
 
 
 
 
 
33f8987
 
 
 
 
1c9f0b6
 
 
33f8987
 
d4aa692
33f8987
 
 
 
 
 
 
 
 
 
 
f23d956
33f8987
 
 
 
 
 
1c9f0b6
33f8987
 
 
 
 
f23d956
33f8987
 
9ba8fab
3769468
1c9f0b6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import gradio as gr
import pandas as pd
from datasets import load_dataset
from jiwer import wer, cer
import os
from datetime import datetime
import re

from huggingface_hub import login

# Authenticate with the Hugging Face Hub only when a token is configured.
# login() without a token falls back to an interactive prompt, which cannot be
# answered in a headless deployment, so the call is skipped in that case.
token = os.environ.get("HG_TOKEN")
if token:
    login(token)
else:
    print("Warning: HG_TOKEN is not set; skipping Hugging Face Hub login.")

# Load the reference transcripts once at startup, keyed by sample id.
try:
    dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
    references = {row["id"]: row["text"] for row in dataset}
except Exception as e:
    # Keep the app bootable when the dataset cannot be loaded, but report the
    # cause instead of discarding it; with no references nothing can be scored.
    print(f"Warning: could not load the reference dataset: {e}")
    references = {}

# CSV file that persists leaderboard entries between runs.
leaderboard_file = "leaderboard.csv"

if os.path.exists(leaderboard_file):
    leaderboard_df = pd.read_csv(leaderboard_file)
    # A file without the combined score gets the column backfilled
    # (70% WER + 30% CER) and is written back in place.
    if "Combined_Score" not in leaderboard_df.columns:
        leaderboard_df["Combined_Score"] = (
            leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
        )
        leaderboard_df.to_csv(leaderboard_file, index=False)
else:
    # No file yet: create an empty leaderboard that holds only the header row.
    pd.DataFrame(
        columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
    ).to_csv(leaderboard_file, index=False)

def normalize_text(text):
    """Normalize text for WER/CER calculation.

    Args:
        text: Value to normalize. Non-string values are converted with ``str()``.

    Returns:
        The text in Unicode NFC form, lower-cased, with every character that
        is neither a word character nor whitespace removed, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace trimmed.
    """
    import unicodedata

    if not isinstance(text, str):
        text = str(text)

    # Compose "base letter + combining mark" pairs before stripping: the
    # punctuation regex below removes stand-alone combining marks, so without
    # this step a decomposed accented letter would lose its accent while the
    # precomposed form of the very same letter would keep it.
    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def calculate_metrics(predictions_df):
    """Calculate WER and CER for predictions.

    Every row whose ``id`` is a key of the module-level ``references`` mapping
    is scored against its reference after both texts pass through
    ``normalize_text``. Rows with an unknown id, or whose reference is empty
    after normalization, are skipped.

    Args:
        predictions_df: DataFrame with ``id`` and ``text`` columns.

    Returns:
        A tuple ``(avg_wer, avg_cer, weighted_wer, weighted_cer, results)``:
        the unweighted per-sample means, the means weighted by reference word
        count (WER) and reference character count (CER), and the list of
        per-sample result dicts.

    Raises:
        ValueError: If no sample could be scored.
    """
    results = []

    for _, row in predictions_df.iterrows():
        id_val = row["id"]
        if id_val not in references:
            continue

        reference = normalize_text(references[id_val])
        hypothesis = normalize_text(row["text"])

        if not reference:
            # Error rates are undefined when there is nothing to compare to.
            continue

        ref_word_count = len(reference.split())
        ref_char_count = len(reference)

        if not hypothesis:
            # An empty prediction misses every reference word and character.
            # Score it as 100% error instead of dropping the sample, otherwise
            # blank predictions would improve the averages.
            sample_wer = 1.0
            sample_cer = 1.0
        else:
            try:
                # Per-sample rates are capped at 2.0, which limits how much a
                # single very long hypothesis can move the averages.
                sample_wer = min(wer(reference, hypothesis), 2.0)
                sample_cer = min(cer(reference, hypothesis), 2.0)
            except Exception as exc:
                # Best effort: keep scoring the remaining samples, but report
                # the failure rather than hiding it.
                print(f"Warning: could not score sample {id_val!r}: {exc}")
                continue

        results.append({
            "id": id_val,
            "reference": reference,
            "hypothesis": hypothesis,
            "ref_word_count": ref_word_count,
            "ref_char_count": ref_char_count,
            "wer": sample_wer,
            "cer": sample_cer
        })

    if not results:
        raise ValueError("No valid samples for WER/CER calculation")

    avg_wer = sum(item["wer"] for item in results) / len(results)
    avg_cer = sum(item["cer"] for item in results) / len(results)

    # Calculate weighted average metrics based on reference length. Every
    # scored sample has a non-empty reference, so both totals are positive.
    total_ref_words = sum(item["ref_word_count"] for item in results)
    total_ref_chars = sum(item["ref_char_count"] for item in results)
    weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
    weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars

    return avg_wer, avg_cer, weighted_wer, weighted_cer, results

def format_as_percentage(value):
    """Render a fraction as a percentage string with two decimals, e.g. 0.5 -> '50.00%'."""
    scaled = value * 100
    return "{:.2f}%".format(scaled)

def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
    """Format leaderboard for display with ranking and percentages.

    Args:
        df: Leaderboard frame with ``Model_Name``, ``WER``, ``CER``,
            ``Combined_Score`` and ``timestamp`` columns. It is not modified.
        sort_by: Column to rank by, ascending (lower is better).

    Returns:
        A new DataFrame with a 1-based ``Rank`` column and the metric columns
        replaced by ``"<name> (%)"`` string columns holding the value times
        100 with two decimals. Columns come out in the same order for empty
        and populated leaderboards; unrecognised columns are kept at the end.
    """
    display_columns = ["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"]

    if len(df) == 0:
        return pd.DataFrame(columns=display_columns)

    # sort_values returns a new frame, so the caller's data stays untouched.
    display_df = df.sort_values(sort_by)
    display_df.insert(0, "Rank", range(1, len(display_df) + 1))

    for col in ("WER", "CER", "Combined_Score"):
        if col in display_df.columns:
            display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
            display_df = display_df.drop(col, axis=1)

    # Use the same column layout as the empty frame above, so the table
    # headers do not change order once the first entry is added.
    ordered = [col for col in display_columns if col in display_df.columns]
    extras = [col for col in display_df.columns if col not in display_columns]
    return display_df[ordered + extras]

def update_ranking(method):
    """Re-rank the stored leaderboard by the metric chosen in the UI.

    "WER Only" and "CER Only" rank by that single metric; any other value
    ranks by the combined score. If anything fails, an empty table with the
    display headers is returned instead.
    """
    try:
        board = pd.read_csv(leaderboard_file)

        # Derive the combined score (70% WER + 30% CER) when the file lacks it.
        if "Combined_Score" not in board.columns:
            board["Combined_Score"] = board["WER"] * 0.7 + board["CER"] * 0.3

        sort_column = (
            "WER" if method == "WER Only"
            else "CER" if method == "CER Only"
            else "Combined_Score"
        )
        return prepare_leaderboard_for_display(board, sort_column)
    except Exception:
        empty_headers = ["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"]
        return pd.DataFrame(columns=empty_headers)

def process_submission(model_name, csv_file):
    """Validate an uploaded predictions CSV, score it, and add it to the leaderboard.

    Args:
        model_name: Name to record for the submission; must not be blank.
        csv_file: The uploaded CSV, in any form ``pd.read_csv`` accepts. It
            must contain exactly the columns ``id`` and ``text``, with one
            row per id of the reference dataset and no duplicates.

    Returns:
        A tuple ``(message, leaderboard)``. On success ``leaderboard`` is the
        display-ready leaderboard including the new entry; on any failure it
        is ``None`` and ``message`` starts with ``"Error"``.
    """
    try:
        # Reject unusable form input before touching the file or the data.
        if model_name is None or not str(model_name).strip():
            return "Error: Please provide a model name.", None
        model_name = str(model_name).strip()

        if csv_file is None:
            return "Error: Please upload a CSV file.", None

        # Without references every submitted id would be reported as "extra",
        # which hides the real problem.
        if not references:
            return "Error: Reference dataset is not available, so submissions cannot be evaluated.", None

        df = pd.read_csv(csv_file)

        if len(df) == 0:
            return "Error: Uploaded CSV is empty.", None

        if set(df.columns) != {"id", "text"}:
            return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None

        if df["id"].duplicated().any():
            dup_ids = df[df["id"].duplicated()]["id"].unique()
            return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None

        missing_ids = set(references.keys()) - set(df["id"])
        extra_ids = set(df["id"]) - set(references.keys())

        # Sorting makes the reported sample of IDs stable between runs.
        if missing_ids:
            return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(sorted(map(str, missing_ids))[:5])}", None

        if extra_ids:
            return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(sorted(map(str, extra_ids))[:5])}", None

        try:
            avg_wer, avg_cer, weighted_wer, weighted_cer, detailed_results = calculate_metrics(df)

            # A near-zero average WER is treated as suspicious and rejected.
            if avg_wer < 0.001:
                return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None

        except Exception as e:
            return f"Error calculating metrics: {str(e)}", None

        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Calculate combined score (70% WER, 30% CER)
        combined_score = avg_wer * 0.7 + avg_cer * 0.3

        new_entry = pd.DataFrame(
            [[model_name, avg_wer, avg_cer, combined_score, timestamp]],
            columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
        )

        # ignore_index avoids duplicate row labels in the combined frame.
        updated_leaderboard = pd.concat([leaderboard, new_entry], ignore_index=True).sort_values("Combined_Score")
        updated_leaderboard.to_csv(leaderboard_file, index=False)

        display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)

        return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard

    except Exception as e:
        return f"Error processing submission: {str(e)}", None

# Build the Gradio UI: one tab showing the current rankings and one tab for
# submitting new results. The emoji in the title and tab labels were garbled
# by a wrong text decoding and are restored here.
with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    gr.Markdown(
        """
        # 🇲🇱 Bambara ASR Leaderboard
        
        This leaderboard ranks and evaluates speech recognition models for the Bambara language.
        Models are ranked based on a combined score of WER and CER metrics.
        """
    )
    
    with gr.Tabs() as tabs:
        with gr.TabItem("🏅 Current Rankings"):
            # Load the stored leaderboard once while the UI is being built;
            # fall back to an empty table if it cannot be read.
            try:
                current_leaderboard = pd.read_csv(leaderboard_file)
                
                if "Combined_Score" not in current_leaderboard.columns:
                    current_leaderboard["Combined_Score"] = current_leaderboard["WER"] * 0.7 + current_leaderboard["CER"] * 0.3
                
                display_leaderboard = prepare_leaderboard_for_display(current_leaderboard)
            except Exception:
                display_leaderboard = pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
            
            gr.Markdown("### Current ASR Model Rankings")
            
            ranking_method = gr.Radio(
                ["Combined Score (WER 70%, CER 30%)", "WER Only", "CER Only"], 
                label="Ranking Method",
                value="Combined Score (WER 70%, CER 30%)"
            )
            
            leaderboard_view = gr.DataFrame(
                value=display_leaderboard,
                interactive=False,
                label="Models are ranked by selected metric - lower is better"
            )
            
            # Re-sort the table whenever a different ranking method is chosen.
            ranking_method.change(
                fn=update_ranking,
                inputs=[ranking_method],
                outputs=[leaderboard_view]
            )
            
            gr.Markdown(
                """
                ## Metrics Explanation
                - **WER (%)**: Word Error Rate (lower is better) - measures word-level accuracy
                - **CER (%)**: Character Error Rate (lower is better) - measures character-level accuracy
                - **Combined Score (%)**: Weighted average of WER (70%) and CER (30%) - provides a balanced evaluation
                """
            )
        
        with gr.TabItem("📊 Submit New Results"):
            gr.Markdown(
                """
                ### Submit a new model for evaluation
                
                Upload a CSV file with 'id' and 'text' columns to evaluate your ASR predictions.
                The 'id's must match those in the reference dataset.
                """
            )
            
            with gr.Row():
                model_name_input = gr.Textbox(label="Model Name", placeholder="e.g., MALIBA-AI/asr")
                csv_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
                
            submit_btn = gr.Button("Submit")
            output_msg = gr.Textbox(label="Status", interactive=False)
            leaderboard_display = gr.DataFrame(
                label="Updated Leaderboard",
                value=display_leaderboard,
                interactive=False
            )
            
            # Score the upload; the status message and the refreshed table are
            # written to the two components of this tab.
            submit_btn.click(
                fn=process_submission,
                inputs=[model_name_input, csv_upload],
                outputs=[output_msg, leaderboard_display]
            )

# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()