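"""Gradio application for the Birder leaderboard.

Loads the per-dataset results_*.csv files and plots accuracy against
parameter count, peak GPU memory or inference rate for each benchmark.
"""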
from pathlib import Path

import altair as alt
import polars as pl

import gradio as gr

DATASETS = []
BENCHMARKS = {
    # Name: (device, AMP, compile, single thread)
    "Parameters": (None, None, None, None),
    "GPU Memory": (None, None, None, None),
    "CPU rate": ("cpu", False, False, False),
    "CPU rate single core": ("cpu", False, False, True),
    "CPU rate with compile": ("cpu", False, True, False),
    "CPU rate AMP with compile": ("cpu", True, True, False),
    "CUDA rate": ("cuda", False, False, False),
    "CUDA rate with compile": ("cuda", False, True, False),
    "CUDA rate AMP with compile": ("cuda", True, True, False),
}


def plot_acc_param(param_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
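    """Scatter plot of accuracy vs. parameter count with model-name labels and a Pareto frontier line."""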
    df = param_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Parameters (M)",
        "Pareto frontier (p)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Parameters (M)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Parameters (M)", y="Pareto frontier (p)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )

    chart = base + text + frontier
    return chart.properties(title="Accuray vs Parameter Count", width=width, height=height).configure_scale(zero=False)


def plot_acc_memory(memory_compare_results_df: pl.DataFrame, width: int = 900, height: int = 640) -> alt.LayerChart:
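    """Scatter plot of accuracy vs. peak GPU memory with model-name labels and a Pareto frontier line."""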
    if len(memory_compare_results_df) > 0:
        batch_size = memory_compare_results_df["batch_size"][0]
        amp = memory_compare_results_df["amp"][0]
    else:
        batch_size = ""
        amp = ""

    df = memory_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "Peak GPU memory (MB)",
        "Parameters (M)",
        "Pareto frontier (mem)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="Peak GPU memory (MB)",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "Peak GPU memory (MB)",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="Peak GPU memory (MB)", y="Pareto frontier (mem)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )

    chart = base + text + frontier
    return chart.properties(
        title=f"Accuray vs GPU Memory (batch size={batch_size}, amp={amp})", width=width, height=height
    ).configure_scale(zero=False)


def plot_acc_rate(rate_compare_results_df: pl.DataFrame, width: int = 1000, height: int = 680) -> alt.LayerChart:
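    """Scatter plot of accuracy vs. inference time (ms / sample) with model-name labels and a Pareto frontier line."""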
    if len(rate_compare_results_df) > 0:
        device = rate_compare_results_df["device"][0]
        compiled = rate_compare_results_df["compile"][0]
        batch_size = rate_compare_results_df["batch_size"][0]
        amp = rate_compare_results_df["amp"][0]
        single_thread = rate_compare_results_df["single_thread"][0]
    else:
        device = ""
        compiled = ""
        batch_size = ""
        amp = ""
        single_thread = False

    df = rate_compare_results_df.select(
        "Model name",
        "Model type",
        "Accuracy",
        "Top-3 accuracy",
        "Resolution",
        "ms / sample",
        "Parameters (M)",
        "Pareto frontier (ms)",
        "Intermediate",
        "MIM",
        "Distilled",
    )
    base = df.plot.point(
        x="ms / sample",
        y="Accuracy",
        color="Model type",
        shape="Resolution:N",
        tooltip=[
            "ms / sample",
            "Parameters (M)",
            "Accuracy",
            "Top-3 accuracy",
            "Model name",
            "Model type",
            "Resolution",
            "Intermediate",
            "MIM",
            "Distilled",
        ],
    )
    text = base.mark_text(align="center", baseline="middle", dy=-10).encode(text="Model name")
    frontier = df.plot.line(x="ms / sample", y="Pareto frontier (ms)").mark_line(
        interpolate="step-after", color="red", strokeWidth=0.3, strokeDash=(2, 2)
    )

    chart = base + text + frontier

    if single_thread is True:
        single_thread_title = " Single Core"
    else:
        single_thread_title = ""

    return chart.properties(
        title=(
            f"Accuracy vs {device.upper()}{single_thread_title} Rate (compile={compiled}, "
            f"batch size={batch_size}, amp={amp})"
        ),
        width=width,
        height=height,
    ).configure_scale(zero=False)


def update_data(
    dataset: str, benchmark: str, intermediate: bool, mim: bool, dist: bool, log_x: bool, search_bar: str
) -> tuple[alt.LayerChart, pl.DataFrame]:
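    """Build the chart and table for the selected dataset and benchmark.

    Applies the intermediate/MIM/distilled filters and the model-name regex,
    computes the Pareto frontier for the chosen metric and clamps the x-axis
    domain before returning the Altair chart and the rounded results table.
    """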
    compare_results_df = pl.read_csv(f"results_{dataset}.csv")
    if intermediate is False:
        compare_results_df = compare_results_df.filter(pl.col("Intermediate") == intermediate)
    if mim is False:
        compare_results_df = compare_results_df.filter(pl.col("MIM") == mim)
    if dist is False:
        compare_results_df = compare_results_df.filter(pl.col("Distilled") == dist)

    x_scale_type = "log" if log_x is True else "linear"

    # Filter models
    compare_results_df = compare_results_df.filter(pl.col("Model name").str.contains(search_bar))

    # Parameter count
    if benchmark == "Parameters":
        param_compare_results_df = compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
            "Parameters (M)", descending=False
        )
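        # Rows are sorted by parameter count, so the running max of accuracy traces the Pareto frontier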
        param_compare_results_df = param_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (p)")
        )
        param_compare_results_df = param_compare_results_df.drop(
            "Samples / sec", "device", "ms / sample", "Peak GPU memory (MB)"
        )
        chart = plot_acc_param(param_compare_results_df)

        x_max = param_compare_results_df["Parameters (M)"].quantile(0.9)
        x_min = param_compare_results_df["Parameters (M)"].quantile(0.1)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = param_compare_results_df

    # Peak memory
    elif benchmark == "GPU Memory":
        memory_compare_results_df = compare_results_df.drop_nulls(subset=["Peak GPU memory (MB)"])
        memory_compare_results_df = memory_compare_results_df.unique(subset=["Model name", "Resolution"]).sort(
            "Peak GPU memory (MB)", descending=False
        )
        memory_compare_results_df = memory_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (mem)")
        )
        memory_compare_results_df = memory_compare_results_df.drop("Samples / sec", "device", "ms / sample")
        chart = plot_acc_memory(memory_compare_results_df)
        x_max = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.9)
        x_min = memory_compare_results_df["Peak GPU memory (MB)"].quantile(0.1)
        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = memory_compare_results_df

    # Rate
    else:
        (device, amp_enabled, compiled, single_thread) = BENCHMARKS[benchmark]
        df = compare_results_df.drop_nulls(subset=["ms / sample"])
        df = df.filter(device=device, amp=amp_enabled, compile=compiled, single_thread=single_thread)
        device_compare_results_df = df.unique(subset=["Model name", "Resolution"]).sort("ms / sample", descending=False)
        device_compare_results_df = device_compare_results_df.drop("Peak GPU memory (MB)")
        device_compare_results_df = device_compare_results_df.with_columns(
            pl.col("Accuracy").cum_max().alias("Pareto frontier (ms)")
        )
        chart = plot_acc_rate(device_compare_results_df)

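        # Clamp the x-axis to the 95th percentile (plus a small margin) so slow outliers do not compress the plot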
        x_max = device_compare_results_df["ms / sample"].quantile(0.95)
        x_min = device_compare_results_df["ms / sample"].min()
        if x_max is not None and x_min is not None:
            x_max = x_max * 1.04
            x_min = x_min * 0.96

        chart.layer[0].encoding.x.scale = alt.Scale(domain=[x_min, x_max], type=x_scale_type)
        output_df = device_compare_results_df

    output_df = output_df.select(
        [
            pl.col(col).round(4) if output_df.schema[col] in [pl.Float32, pl.Float64] else col
            for col in output_df.columns
        ]
    )

    return (chart, output_df.drop("Mistakes", "Samples", "torch_version"))


def app() -> None:
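    """Assemble the Gradio Blocks UI and wire every control to update_data."""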
    with gr.Blocks(title="Birder Leaderboard", analytics_enabled=False) as leaderboard:
        gr.HTML("<center><h1>The Birder Leaderboard</h1></center>")
        with gr.Row():
            with gr.Column():
                pass

            with gr.Column():
                gr.Markdown(
                    """
                    Leaderboard of all the pre-trained Birder models across multiple datasets.

                    ### Benchmark Setup

                    * GPU: A5000 ADA Generation
                    * CPU: AMD Ryzen Threadripper PRO 7975WX
                    * PyTorch version: 2.5.1+cu124

                    ### Dataset Information

                    | Name                | Training samples | Validation samples | Classes     |
                    |---------------------|------------------|--------------------|-------------|
                    | arabian-peninsula   | 583,868          | 21,634             | 735         |
                    | eu-common           | 569,784          | 19,869             | 707         |
                    | il-all              | 462,346          | 18,614             | 550         |
                    | il-common           | 330,880          | 15,828             | 371         |
                    """
                )

            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass

            with gr.Column():
                dataset_dropdown = gr.Dropdown(
                    choices=DATASETS,
                    label="Select Dataset",
                    value=DATASETS[0] if DATASETS else None,
                )
                benchmark_dropdown = gr.Dropdown(
                    choices=list(BENCHMARKS.keys()),
                    label="Select Benchmark",
                    value=next(iter(BENCHMARKS.keys())) if BENCHMARKS else None,
                    filterable=False,
                )

            with gr.Column():
                intermediate = gr.Checkbox(
                    label="Intermediate",
                    value=True,
                    info="Show models that underwent intermediate training (extra data)",
                )
                mim = gr.Checkbox(label="MIM", value=True, info="Show models with Masked Image Modeling pre-training")
                dist = gr.Checkbox(label="Distilled", value=True, info="Show distilled models")
                log_x = gr.Checkbox(label="Log scale X-axis", value=False)

            with gr.Column():
                pass

        with gr.Row():
            with gr.Column():
                pass

            with gr.Column(scale=2):
                search_bar = gr.Textbox(label="Model Filter", placeholder="e.g. convnext, efficient|mobile")

            with gr.Column():
                pass

        plot = gr.Plot(container=False)
        table = gr.Dataframe(show_search="search")

        inputs = [dataset_dropdown, benchmark_dropdown, intermediate, mim, dist, log_x, search_bar]
        outputs = [plot, table]
        leaderboard.load(update_data, inputs=inputs, outputs=outputs)

        dataset_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        benchmark_dropdown.change(update_data, inputs=inputs, outputs=outputs)
        intermediate.change(update_data, inputs=inputs, outputs=outputs)
        mim.change(update_data, inputs=inputs, outputs=outputs)
        dist.change(update_data, inputs=inputs, outputs=outputs)
        log_x.change(update_data, inputs=inputs, outputs=outputs)
        search_bar.change(update_data, inputs=inputs, outputs=outputs)

    leaderboard.launch()


# Discover the available result files and launch the app
if __name__ == "__main__":
    file_info = []
    for p in Path(".").glob("results_*.csv"):
        file_info.append((p.stat().st_size, p.stem.removeprefix("results_")))

    DATASETS = [dataset_name for _, dataset_name in sorted(file_info, key=lambda x: x[1], reverse=True)]

    app()