Spaces:

MALIBA-AI
/

bambara-asr-leaderboard

Running

File size: 7,868 Bytes

9ba8fab
 
 
3769468
9ba8fab
 
3769468
6960dc6
ddc83ff
 
9ba8fab
 
 
ddc83ff
9ba8fab
 
 
ddc83ff
 
3769468
 
 
 
 
 
 
 
 
 
 
d415750
ddc83ff
3769468
ddc83ff
 
3769468
ddc83ff
 
3769468
ddc83ff
3769468
 
 
ddc83ff
3769468
d415750
3769468
 
 
ddc83ff
3769468
d415750
3769468
 
d415750
ddc83ff
 
 
 
 
 
 
3769468
ddc83ff
3769468
d415750
ddc83ff
d415750
 
 
ddc83ff
 
 
d415750
ddc83ff
3769468
ddc83ff
 
 
 
 
 
 
 
 
 
 
3769468
 
ddc83ff
 
 
d415750
3769468
 
d415750
 
3769468
 
 
d415750
 
 
3769468
d415750
 
ddc83ff
3769468
 
ddc83ff
3769468
9ba8fab
 
 
ddc83ff
9ba8fab
ddc83ff
d415750
3769468
d415750
 
29c8f24
d415750
 
9ba8fab
d415750
 
ddc83ff
 
3769468
 
d415750
3769468
d415750
 
3769468
d415750
6960dc6
ddc83ff
d415750
 
 
ddc83ff
d415750
 
 
ddc83ff
 
 
d415750
 
 
ddc83ff
d415750
9ba8fab
ddc83ff
9ba8fab
 
 
 
 
 
 
 
 
d415750
3769468
9ba8fab
ddc83ff
d415750
c726970
ddc83ff
d415750
9ba8fab
 
d415750
 
 
 
 
 
9ba8fab
 
d415750
9ba8fab
d415750
 
 
 
 
9ba8fab
d415750
 
9ba8fab
 
d415750
9ba8fab
 
 
 
 
 
ddc83ff
d415750
 
ddc83ff
3769468

import gradio as gr
import pandas as pd
from datasets import load_dataset
from jiwer import wer, cer
import os
from datetime import datetime
import re

# Fetch the benchmark split and index the reference transcripts by sample id
print("Loading dataset...")
dataset = load_dataset("sudoping01/bambara-asr-benchmark", name="default")["train"]
references = dict((row["id"], row["text"]) for row in dataset)

# Reuse the leaderboard CSV when it is already on disk; otherwise create an
# empty one that only carries the column headers
leaderboard_file = "leaderboard.csv"
if os.path.exists(leaderboard_file):
    _existing_entry_count = len(pd.read_csv(leaderboard_file))
    print(f"Loaded existing leaderboard with {_existing_entry_count} entries")
else:
    _leaderboard_columns = ["submitter", "WER", "CER", "timestamp"]
    pd.DataFrame(columns=_leaderboard_columns).to_csv(leaderboard_file, index=False)

def normalize_text(text):
    """
    Prepare a transcript for WER/CER scoring.

    Non-string input is first converted with ``str()``. The text is then
    lowercased, every character that is neither a word character (letters,
    digits, underscore) nor whitespace is dropped, runs of whitespace are
    collapsed to a single space, and surrounding whitespace is trimmed.

    Returns the normalized string.
    """
    as_string = text if isinstance(text, str) else str(text)

    # Case-fold first so the comparison is case-insensitive
    lowered = as_string.lower()

    # Strip punctuation but leave whitespace in place as the word separator
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)

    # Squeeze whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def calculate_metrics(predictions_df):
    """Calculate WER and CER for predictions.

    Each row of ``predictions_df`` is matched to its reference transcript by
    ``id``. Reference and prediction are both passed through
    ``normalize_text`` and scored with jiwer's ``wer`` and ``cer``. The
    returned averages are unweighted means of the per-sample scores.

    Args:
        predictions_df: DataFrame with an ``id`` column and a ``text`` column
            holding the predicted transcript for that id.

    Returns:
        A tuple ``(avg_wer, avg_cer, results)`` where ``results`` is a list of
        dicts with the keys ``id``, ``reference``, ``hypothesis``, ``wer`` and
        ``cer``, one per scored sample.

    Raises:
        ValueError: If no sample could be scored.
    """
    results = []

    for _, row in predictions_df.iterrows():
        id_val = row["id"]
        if id_val not in references:
            print(f"Warning: ID {id_val} not found in references")
            continue

        reference = normalize_text(references[id_val])

        # pandas reads a blank CSV cell as NaN; treat that as an empty
        # prediction instead of letting it be stringified to the word "nan".
        raw_prediction = row["text"]
        hypothesis = "" if pd.isna(raw_prediction) else normalize_text(raw_prediction)

        # Only the first few scored samples are logged in detail
        verbose = len(results) < 5
        if verbose:
            print(f"ID: {id_val}")
            print(f"Reference: '{reference}'")
            print(f"Hypothesis: '{hypothesis}'")

        # Error rates are undefined against an empty reference, so skip it
        if not reference:
            print(f"Warning: Empty reference for ID {id_val}")
            continue

        if verbose:
            print(f"Reference words: {reference.split()}")
            print(f"Hypothesis words: {hypothesis.split()}")

        if hypothesis:
            try:
                # Identical strings legitimately score 0.0; they are scored
                # as-is so correct predictions are not penalised.
                sample_wer = wer(reference, hypothesis)
                sample_cer = cer(reference, hypothesis)
            except Exception as e:
                print(f"Error calculating metrics for ID {id_val}: {str(e)}")
                continue
        else:
            # An empty prediction deletes every reference token, which is an
            # error rate of 1.0 by definition. Counting it (rather than
            # skipping the sample) stops blank predictions from improving
            # the average.
            print(f"Warning: Empty hypothesis for ID {id_val}")
            sample_wer = 1.0
            sample_cer = 1.0

        if verbose:
            print(f"WER: {sample_wer}, CER: {sample_cer}")

        results.append({
            "id": id_val,
            "reference": reference,
            "hypothesis": hypothesis,
            "wer": sample_wer,
            "cer": sample_cer
        })

    if not results:
        raise ValueError("No valid samples for WER/CER calculation")

    # Unweighted mean over the scored samples
    avg_wer = sum(item["wer"] for item in results) / len(results)
    avg_cer = sum(item["cer"] for item in results) / len(results)

    return avg_wer, avg_cer, results

def process_submission(submitter_name, csv_file):
    """Validate an uploaded predictions CSV, score it and record the result.

    Args:
        submitter_name: Name shown on the leaderboard for this submission.
        csv_file: Uploaded CSV (anything ``pandas.read_csv`` accepts) that
            must contain exactly the columns ``id`` and ``text``, with one
            row for every id in the reference dataset and no others.

    Returns:
        A tuple ``(message, leaderboard)``. On success ``leaderboard`` is the
        updated leaderboard DataFrame sorted by WER; on any failure it is
        ``None`` and ``message`` describes the problem. Exceptions are caught
        and reported through ``message`` rather than raised.
    """
    try:
        # Reject obviously incomplete form input before touching the file
        if csv_file is None:
            return "Error: Please upload a CSV file.", None

        submitter_name = str(submitter_name or "").strip()
        if not submitter_name:
            return "Error: Please provide a submitter or model name.", None

        # Read and validate the uploaded CSV
        df = pd.read_csv(csv_file)
        print(f"Processing submission from {submitter_name} with {len(df)} rows")

        if len(df) == 0:
            return "Error: Uploaded CSV is empty.", None

        if set(df.columns) != {"id", "text"}:
            return f"Error: CSV must contain exactly 'id' and 'text' columns. Found: {', '.join(df.columns)}", None

        if df["id"].duplicated().any():
            dup_ids = df[df["id"].duplicated()]["id"].unique()
            return f"Error: Duplicate IDs found: {', '.join(map(str, dup_ids[:5]))}", None

        # Check if IDs match the reference dataset
        submitted_ids = set(df["id"])
        reference_ids = set(references.keys())
        missing_ids = reference_ids - submitted_ids
        extra_ids = submitted_ids - reference_ids

        # Sets are unordered; sort the sample shown to the user so the same
        # submission always yields the same message.
        if missing_ids:
            sample = sorted(missing_ids, key=str)[:5]
            return f"Error: Missing {len(missing_ids)} IDs in submission. First few missing: {', '.join(map(str, sample))}", None

        if extra_ids:
            sample = sorted(extra_ids, key=str)[:5]
            return f"Error: Found {len(extra_ids)} extra IDs not in reference dataset. First few extra: {', '.join(map(str, sample))}", None

        # Calculate WER and CER
        try:
            avg_wer, avg_cer, detailed_results = calculate_metrics(df)
        except Exception as e:
            print(f"Error in metrics calculation: {str(e)}")
            return f"Error calculating metrics: {str(e)}", None

        # Debug information
        print(f"Calculated metrics - WER: {avg_wer:.4f}, CER: {avg_cer:.4f}")
        print(f"Processed {len(detailed_results)} valid samples")

        # NOTE(review): near-zero WER is rejected as suspicious, which also
        # rejects a genuinely perfect submission - confirm this is intended.
        if avg_wer < 0.001:
            print("WARNING: WER is extremely low - likely an error")
            return "Error: WER calculation yielded suspicious results (near-zero). Please check your submission CSV.", None

        # Update the leaderboard
        leaderboard = pd.read_csv(leaderboard_file)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        new_entry = pd.DataFrame(
            [[submitter_name, avg_wer, avg_cer, timestamp]],
            columns=["submitter", "WER", "CER", "timestamp"]
        )
        # Rebuild a clean 0..n-1 index so the returned frame has no duplicate
        # or shuffled row labels after concatenation and sorting.
        leaderboard = (
            pd.concat([leaderboard, new_entry], ignore_index=True)
            .sort_values("WER")
            .reset_index(drop=True)
        )
        leaderboard.to_csv(leaderboard_file, index=False)

        return f"Submission processed successfully! WER: {avg_wer:.4f}, CER: {avg_cer:.4f}", leaderboard

    except Exception as e:
        # Last-resort boundary: report the failure in the UI instead of raising
        print(f"Error processing submission: {str(e)}")
        return f"Error processing submission: {str(e)}", None

# Create the Gradio interface
# Layout, top to bottom: intro text, a row holding the submitter-name box and
# the CSV upload, the submit button, a read-only status box, and the
# leaderboard table. Components appear in the order they are created here.
with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
    # NOTE(review): the dataset linked below is a different repository from
    # the one passed to load_dataset() earlier in this file - confirm which
    # one submitters should use for the ids.
    gr.Markdown(
        """
        # Bambara ASR Leaderboard
        Upload a CSV file with 'id' and 'text' columns to evaluate your ASR predictions.
        The 'id's must match those in the dataset.
        [View the dataset here](https://huggingface.co/datasets/MALIBA-AI/bambara_general_leaderboard_dataset).
        - **WER**: Word Error Rate (lower is better).
        - **CER**: Character Error Rate (lower is better).
        """
    )
    
    # Submission inputs, placed side by side
    with gr.Row():
        submitter = gr.Textbox(label="Submitter Name or Model Name", placeholder="e.g., MALIBA-AI/asr")
        csv_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
        
    submit_btn = gr.Button("Submit")
    output_msg = gr.Textbox(label="Status", interactive=False)
    # Initial table contents are read from the leaderboard CSV once, when the
    # interface is built
    leaderboard_display = gr.DataFrame(
        label="Leaderboard",
        value=pd.read_csv(leaderboard_file),
        interactive=False
    )
    
    # process_submission returns (status message, leaderboard), which map
    # positionally onto the two output components
    submit_btn.click(
        fn=process_submission,
        inputs=[submitter, csv_upload],
        outputs=[output_msg, leaderboard_display]
    )

# Print startup message
print("Starting Bambara ASR Leaderboard app...")

# Launch the app
# Only start the server when run as a script; share=True asks Gradio for a
# public share link in addition to the local server.
if __name__ == "__main__":
    demo.launch(share=True)