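"""Gradio application for the Birder leaderboard.

Loads the per-dataset results_*.csv files and plots accuracy against
parameter count, peak GPU memory or inference rate for each benchmark.
"""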
from pathlib import Path

import altair as alt
import polars as pl

import gradio as gr

DATASETS = []
BENCHMARKS = {
    # Name: (device, AMP, compile, single thread)
    "Parameters": (None, None, None, None),
    "GPU Memory": (None, None, None, None),
    "CPU rate": ("cpu", False, False, False),
    "CPU rate single core": ("cpu", False, False, True),
    "CPU rate with compile": ("cpu", False, True, False),
    "CPU rate AMP with compile": ("cpu", True, True, False),
    "CUDA rate": ("cuda", False, False, False),
    "CUDA rate with compile": ("cuda", False, True, False),
    "CUDA rate AMP with compile": ("cuda", True, True, False),
}


def plot_acc_param(param_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
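    """Scatter plot of accuracy vs. parameter count with model-name labels and a Pareto frontier line."""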
    df = param_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Parameters (M)",
        "Pareto frontier (p)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Parameters (M)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Parameters (M)", y="Pareto frontier (p)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )

    chart = base + text + frontier
    return chart.properties(title="Accuray vs Parameter Count", width=width, height=height).configure_scale(zero=False)


def plot_acc_memory(memory_compare_results_df: pl.DataFrame, width: int = 900, height: int = 640) -> alt.LayerChart:
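    """Scatter plot of accuracy vs. peak GPU memory with model-name labels and a Pareto frontier line."""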
    if len(memory_compare_results_df) > 0:
        batch_size = memory_compare_results_df["batch_size"][0]
        amp = memory_compare_results_df["amp"][0]
    else:
        batch_size = ""
        amp = ""

    df = memory_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Peak GPU memory (MB)",
        "Parameters (M)",
        "Pareto frontier (mem)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Peak GPU memory (MB)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Peak GPU memory (MB)",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Peak GPU memory (MB)", y="Pareto frontier (mem)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )

    chart = base + text + frontier
    return chart.properties(
        title=f"Accuray vs GPU Memory (batch size={batch_size}, amp={amp})", width=width, height=height
    ).configure_scale(zero=False)


def plot_acc_rate(rate_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
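    """Scatter plot of accuracy vs. inference time (ms / sample) with model-name labels and a Pareto frontier line."""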
    if len(rate_compare_results_df) > 0:
        device = rate_compare_results_df["device"][0]
        compiled = rate_compare_results_df["compile"][0]
        batch_size = rate_compare_results_df["batch_size"][0]
        amp = rate_compare_results_df["amp"][0]
        single_thread = rate_compare_results_df["single_thread"][0]
    else:
        device = ""
        compiled = ""
        batch_size = ""
        amp = ""
        single_thread = False

    df = rate_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "ms / sample",
        "Parameters (M)",
        "Pareto frontier (ms)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="ms / sample",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "ms / sample",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="ms / sample", y="Pareto frontier (ms)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )

    chart = base + text + frontier

    if single_thread is True:
        single_thread_title = " Single Core"
    else:
        single_thread_title = ""

    return chart.properties(
        title=(
            f"Accuracy vs {device.upper()}{single_thread_title} Rate (compile={compiled}, "
            f"batch size={batch_size}, amp={amp})"
        ),
        width=width,
        height=height,
    ).configure_scale(zero=False)


def update_data(
    dataset: str, benchmark: str, intermediate: bool, mim: bool, dist: bool, log_x: bool, search_bar: str
) -> tuple[alt.LayerChart, pl.DataFrame]:
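    """Build the chart and table for the selected dataset and benchmark.

    Applies the intermediate/MIM/distilled filters and the model-name regex,
    computes the Pareto frontier for the chosen metric and clamps the x-axis
    domain before returning the Altair chart and the rounded results table.
    """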
    compare_results_df = pl.read_csv(f"results_{dataset}.csv")
    if intermediate is False:
        compare_results_df = compare_results_df.filter(pl.col("Intermediate") == intermediate)
    if mim is False:
        compare_results_df = compare_results_df.filter(pl.col("MIM") == mim)
    if dist is False:
        compare_results_df = compare_results_df.filter(pl.col("Distilled") == dist)

    x_scale_type = "log" if log_x is True else "linear"

    # Filter models
    compare_results_df = compare_results_df.filter(pl.col("Model name").str.contains(search_bar))

    # Parameter count
    if benchmark == "Parameters":
        param_compare_results_df = compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
            "Parameters (M)", descending=False
        )
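        # Rows are sorted by parameter count, so the running max of accuracy traces the Pareto frontier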
        param_compare_results_df = param_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (p)")
        )
        param_compare_results_df = param_compare_results_df.drop(
            "Samples / sec", "device", "ms / sample", "Peak GPU memory (MB)"
        )
        chart = plot_acc_param(param_compare_results_df)

        x_max = param_compare_results_df["Parameters (M)"].quantile(0.9)
        x_min = param_compare_results_df["Parameters (M)"].quantile(0.1)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = param_compare_results_df

    # Peak memory
    elif benchmark == "GPU Memory":
        memory_compare_results_df = compare_results_df.drop_nulls(subset=["Peak GPU memory (MB)"])
        memory_compare_results_df = memory_compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
            "Peak GPU memory (MB)", descending=False
        )
        memory_compare_results_df = memory_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (mem)")
        )
        memory_compare_results_df = memory_compare_results_df.drop("Samples / sec", "device", "ms / sample")
        chart = plot_acc_memory(memory_compare_results_df)
        x_max = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.9)
        x_min = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.1)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = memory_compare_results_df

    # Rate
    else:
        (device, amp_enabled, compiled, single_thread) = BENCHMARKS[benchmark]
        df = compare_results_df.drop_nulls(subset=["ms / sample"])
        df = df.filter(device=device, amp=amp_enabled, compile=compiled, single_thread=single_thread)
        device_compare_results_df = df.unique(subset=["Model name", "Resolution"]).sort("ms / sample", descending=False)
        device_compare_results_df = device_compare_results_df.drop("Peak GPU memory (MB)")
        device_compare_results_df = device_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (ms)")
        )
        chart = plot_acc_rate(device_compare_results_df)

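        # Clamp the x-axis to the 95th percentile (plus a small margin) so slow outliers do not compress the plot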
        x_max = device_compare_results_df["ms / sample"].quantile(0.95)
        x_min = device_compare_results_df["ms / sample"].min()
        if x_max is not None and x_min is not None:
            x_max = x_max * 1.04
            x_min = x_min * 0.96

        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = device_compare_results_df

    output_df = output_df.select(
        [
            pl.col(col).round(4) if output_df.schema[col] in [pl.Float32, pl.Float64] else col
            for col in output_df.columns
        ]
    )

    return (chart, output_df.drop("Mistakes", "Samples", "torch_version"))


def app() -> None:
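    """Assemble the Gradio Blocks UI and wire every control to update_data."""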
    with gr.Blocks(title="Birder Leaderboard", analytics_enabled=False) as leaderboard:
        gr.HTML("<center><h1>The Birder Leaderboard</h1></center>")
        with gr.Row():
            with gr.Column():
                pass

            with gr.Column():
                gr.Markdown(
                    """
                    Leaderboard of all the pre-trained Birder models across multiple datasets.

                    ### Benchmark Setup

                    * GPU: A5000 ADA Generation
                    * CPU: AMD Ryzen Threadripper PRO 7975WX
                    * PyTorch version: 2.5.1+cu124

                    ### Dataset Information

                    | Name                | Training samples | Validation samples | Classes     |
                    |---------------------|------------------|--------------------|-------------|
                    | arabian-peninsula   | 583,868          | 21,634             | 735         |
                    | eu-common           | 569,784          | 19,869             | 707         |
                    | il-all              | 462,346          | 18,614             | 550         |
                    | il-common           | 330,880          | 15,828             | 371         |
                    """
                )

            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass

            with gr.Column():
                dataset_dropdown = gr.Dropdown(
                    choices=DATASETS,
                    label="Select Dataset",
                    value=DATASETS[0] if DATASETS else None,
                )
                benchmark_dropdown = gr.Dropdown(
                    choices=list(BENCHMARKS.keys()),
                    label="Select Benchmark",
                    value=next(iter(BENCHMARKS.keys())) if BENCHMARKS else None,
                    filterable=False,
                )

            with gr.Column():
                intermediate = gr.Checkbox(
                    label="Intermediate",
                    value=True,
                    info="Show models that underwent intermediate training (extra data)",
                )
                mim = gr.Checkbox(label="MIM", value=True, info="Show models with Masked Image Modeling pre-training")
                dist = gr.Checkbox(label="Distilled", value=True, info="Show distilled models")
                log_x = gr.Checkbox(label="Log scale X-axis", value=False)

            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass

            with gr.Column(scale=2):
                search_bar = gr.Textbox(label="Model Filter", placeholder="e.g. convnext, efficient|mobile")

            with gr.Column():
                pass

        plot = gr.Plot(container=False)
        table = gr.Dataframe(show_search="search")

        inputs = [dataset_dropdown, benchmark_dropdown, intermediate, mim, dist, log_x, search_bar]
        outputs = [plot, table]
        leaderboard.load(update_data, inputs=inputs, outputs=outputs)

        dataset_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        benchmark_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        intermediate.change(update_data, inputs=inputs, outputs=outputs)
        mim.change(update_data, inputs=inputs, outputs=outputs)
        dist.change(update_data, inputs=inputs, outputs=outputs)
        log_x.change(update_data, inputs=inputs, outputs=outputs)
        search_bar.change(update_data, inputs=inputs, outputs=outputs)

    leaderboard.launch()


# Discover the available result files and launch the app
if __name__ == "__main__":
    file_info = []
    for p in Path(".").glob("results_*.csv"):
        file_info.append((p.stat().st_size, p.stem.removeprefix("results_")))

    DATASETS = [dataset_name for _, dataset_name in sorted(file_info, key=lambda x: x[1], reverse=True)]

    app()